summaryrefslogtreecommitdiff
path: root/collaborativeagents/training/grpo_verl
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/training/grpo_verl')
-rw-r--r--collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/config.yaml626
-rw-r--r--collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/hydra.yaml214
-rw-r--r--collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/overrides.yaml47
-rw-r--r--collaborativeagents/training/grpo_verl/run_grpo.sbatch111
-rw-r--r--collaborativeagents/training/grpo_verl/run_verl_grpo.sh63
-rw-r--r--collaborativeagents/training/grpo_verl/verl_reward_functions.py160
6 files changed, 1221 insertions, 0 deletions
diff --git a/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/config.yaml b/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/config.yaml
new file mode 100644
index 0000000..7af183b
--- /dev/null
+++ b/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/config.yaml
@@ -0,0 +1,626 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: bfloat16
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 8
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 4
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_torch_compile: true
+ kl_loss_coef: 0.003
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ step_start: 0
+ step_end: null
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 4
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ step_start: 0
+ step_end: null
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: bfloat16
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 0.9
+ top_k: -1
+ top_p: 0.9
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ ignore_eos: false
+ enforce_eager: false
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 4
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 8
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ update_weights_bucket_megabytes: 512
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: /work/nvme/bfqt/yurenh2/sft_checkpoints/checkpoint-200
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
+ val_files: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 2048
+ max_response_length: 1024
+ train_batch_size: 64
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ step_start: 0
+ step_end: null
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ rollout_rs_threshold_lower: null
+ rollout_token_veto_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/verl_reward_functions.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 1
+ total_training_steps: null
+ project_name: collaborative-agent-reflection-grpo
+ experiment_name: llama3.1-8b-grpo
+ logger:
+ - console
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 50
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: false
+ val_only: false
+ test_freq: 100
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /scratch/bfqt/yurenh2/grpo_outputs
+ max_actor_ckpt_to_keep: null
+ max_critic_ckpt_to_keep: null
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
diff --git a/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/hydra.yaml b/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/hydra.yaml
new file mode 100644
index 0000000..8e4c4ec
--- /dev/null
+++ b/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/hydra.yaml
@@ -0,0 +1,214 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
+ - data.val_files=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
+ - data.train_batch_size=64
+ - data.max_prompt_length=2048
+ - data.max_response_length=1024
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - data.prompt_key=prompt
+ - data.reward_fn_key=data_source
+ - actor_rollout_ref.model.path=/work/nvme/bfqt/yurenh2/sft_checkpoints/checkpoint-200
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=8
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.003
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.5
+ - actor_rollout_ref.rollout.n=8
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
+ - actor_rollout_ref.ref.fsdp_config.model_dtype=bfloat16
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.rollout.temperature=0.9
+ - actor_rollout_ref.rollout.top_p=0.9
+ - custom_reward_function.path=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/verl_reward_functions.py
+ - custom_reward_function.name=compute_score
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.val_before_train=False
+ - trainer.logger=["console"]
+ - trainer.project_name=collaborative-agent-reflection-grpo
+ - trainer.experiment_name=llama3.1-8b-grpo
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=50
+ - trainer.test_freq=100
+ - trainer.total_epochs=1
+ - trainer.default_local_dir=/scratch/bfqt/yurenh2/grpo_outputs
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.003,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4,actor_rollout_ref.actor.ppo_mini_batch_size=8,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=/work/nvme/bfqt/yurenh2/sft_checkpoints/checkpoint-200,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.model_dtype=bfloat16,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4,actor_rollout_ref.rollout.gpu_memory_utilization=0.5,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4,actor_rollout_ref.rollout.n=8,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.temperature=0.9,actor_rollout_ref.rollout.tensor_model_parallel_size=1,actor_rollout_ref.rollout.top_p=0.9,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.name=compute_score,custom_reward_function.path=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/verl_reward_functions.py,data.filter_overlong_prompts=True,data.max_prompt_length=2048,data.max_response_length=1024,data.prompt_key=prompt,data.reward_fn_key=data_source,data.train_batch_size=64,data.train_files=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet,data.truncation=error,data.val_files=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet,trainer.critic_warmup=0,
trainer.default_local_dir=/scratch/bfqt/yurenh2/grpo_outputs,trainer.experiment_name=llama3.1-8b-grpo,trainer.logger=["console"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=collaborative-agent-reflection-grpo,trainer.save_freq=50,trainer.test_freq=100,trainer.total_epochs=1,trainer.val_before_train=False
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/overrides.yaml b/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/overrides.yaml
new file mode 100644
index 0000000..8c6f184
--- /dev/null
+++ b/collaborativeagents/training/grpo_verl/outputs/2026-01-11/03-50-42/.hydra/overrides.yaml
@@ -0,0 +1,47 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
+- data.val_files=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
+- data.train_batch_size=64
+- data.max_prompt_length=2048
+- data.max_response_length=1024
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- data.prompt_key=prompt
+- data.reward_fn_key=data_source
+- actor_rollout_ref.model.path=/work/nvme/bfqt/yurenh2/sft_checkpoints/checkpoint-200
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=8
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.003
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16
+- actor_rollout_ref.actor.fsdp_config.param_offload=False
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.5
+- actor_rollout_ref.rollout.n=8
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
+- actor_rollout_ref.ref.fsdp_config.model_dtype=bfloat16
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.rollout.temperature=0.9
+- actor_rollout_ref.rollout.top_p=0.9
+- custom_reward_function.path=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/verl_reward_functions.py
+- custom_reward_function.name=compute_score
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.val_before_train=False
+- trainer.logger=["console"]
+- trainer.project_name=collaborative-agent-reflection-grpo
+- trainer.experiment_name=llama3.1-8b-grpo
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=50
+- trainer.test_freq=100
+- trainer.total_epochs=1
+- trainer.default_local_dir=/scratch/bfqt/yurenh2/grpo_outputs
diff --git a/collaborativeagents/training/grpo_verl/run_grpo.sbatch b/collaborativeagents/training/grpo_verl/run_grpo.sbatch
new file mode 100644
index 0000000..e22b221
--- /dev/null
+++ b/collaborativeagents/training/grpo_verl/run_grpo.sbatch
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH --job-name=grpo_train
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=256G
+#SBATCH --time=12:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/grpo_%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/grpo_%j.err
+
+echo "=== GRPO Training with VERL ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model:$PYTHONPATH"
+export WANDB_MODE=offline
+
+# Paths
+TRAIN_DATA="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet"
+MODEL_PATH="/work/nvme/bfqt/yurenh2/sft_checkpoints/checkpoint-200"
+REWARD_FN="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/verl_reward_functions.py"
+OUTPUT_DIR="/scratch/bfqt/yurenh2/grpo_outputs"
+
+mkdir -p $OUTPUT_DIR
+
+# Start 70B judge model for reward evaluation on GPUs 0,1
+echo "Starting 70B judge model on GPUs 0,1..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model meta-llama/Llama-3.1-70B-Instruct \
+ --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.85 \
+ --max-model-len 4096 --dtype bfloat16 --download-dir $HF_HOME &
+
+# Wait for judge model
+echo "Waiting for judge model..."
+for i in {1..200}; do
+ if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+ echo "Judge model ready after $((i*5)) seconds"
+ break
+ fi
+ sleep 5
+done
+
+echo ""
+echo "Starting GRPO training..."
+echo "Model: $MODEL_PATH"
+echo "Data: $TRAIN_DATA"
+echo "Output: $OUTPUT_DIR"
+
+# GRPO training with VERL
+CUDA_VISIBLE_DEVICES=2,3 python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files="$TRAIN_DATA" \
+ data.val_files="$TRAIN_DATA" \
+ data.train_batch_size=64 \
+ data.max_prompt_length=2048 \
+ data.max_response_length=1024 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.prompt_key=prompt \
+ data.reward_fn_key=data_source \
+ actor_rollout_ref.model.path=$MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=8 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.003 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+ actor_rollout_ref.rollout.n=8 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.ref.fsdp_config.model_dtype=bfloat16 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.rollout.temperature=0.9 \
+ actor_rollout_ref.rollout.top_p=0.9 \
+ custom_reward_function.path=$REWARD_FN \
+ custom_reward_function.name=compute_score \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.val_before_train=False \
+ trainer.logger='["console"]' \
+ trainer.project_name='collaborative-agent-reflection-grpo' \
+ trainer.experiment_name='llama3.1-8b-grpo' \
+ trainer.n_gpus_per_node=2 \
+ trainer.nnodes=1 \
+ trainer.save_freq=50 \
+ trainer.test_freq=100 \
+ trainer.total_epochs=1 \
+ trainer.default_local_dir=$OUTPUT_DIR
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+
+echo ""
+echo "GRPO Training complete!"
+date
diff --git a/collaborativeagents/training/grpo_verl/run_verl_grpo.sh b/collaborativeagents/training/grpo_verl/run_verl_grpo.sh
new file mode 100644
index 0000000..ede35ab
--- /dev/null
+++ b/collaborativeagents/training/grpo_verl/run_verl_grpo.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+export PYTHONPATH="/shared/storage-01/users/mehri2/verl:$PYTHONPATH"
+set -x
+HYDRA_FULL_ERROR=1
+
+train_data="/shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet"
+model_path="/shared/storage-01/users/mehri2/LLaMA-Factory/saves/llama-3.1-8b-instruct/full/sft_session_level_reflection/checkpoint-628"
+reward_fn_path="/shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/verl_reward_functions.py"
+
+max_prompt_length=2048
+max_response_length=1024
+train_batch_size=8
+n_generations=8
+# Effective batch size is 64
+
+python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files="$train_data" \
+ data.val_files="$train_data" \
+ data.train_batch_size=$train_batch_size \
+ data.max_prompt_length=$max_prompt_length \
+ data.max_response_length=$max_response_length \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.prompt_key=prompt \
+ data.reward_fn_key=data_source \
+ actor_rollout_ref.model.path=$model_path \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=8 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.003 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.rollout.n=$n_generations \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.ref.fsdp_config.model_dtype=bfloat16 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.rollout.temperature=0.9 \
+ actor_rollout_ref.rollout.top_p=0.9 \
+ custom_reward_function.path=$reward_fn_path \
+ custom_reward_function.name=compute_score \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.val_before_train=False \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='collaborative-agent-reflection-grpo' \
+ trainer.experiment_name='llama3.1-8b-verl-grpo-v3' \
+ trainer.n_gpus_per_node=4 \
+ trainer.nnodes=1 \
+ trainer.save_freq=50 \
+ trainer.test_freq=100 \
+ trainer.total_epochs=1 \
+ trainer.default_local_dir=/shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/results/v3 $@ \ No newline at end of file
diff --git a/collaborativeagents/training/grpo_verl/verl_reward_functions.py b/collaborativeagents/training/grpo_verl/verl_reward_functions.py
new file mode 100644
index 0000000..fa38b8f
--- /dev/null
+++ b/collaborativeagents/training/grpo_verl/verl_reward_functions.py
@@ -0,0 +1,160 @@
+"""
+Custom reward functions for VERL GRPO training
+Compatible with VERL's reward function signature
+"""
+
+import openai
+from json_repair import repair_json
+import concurrent.futures
+import time
+
+# Initialize judge client
+# NOTE(review): created at import time; assumes a vLLM OpenAI-compatible
+# server is already listening on localhost:8004 (started by run_grpo.sbatch)
+# -- confirm before reusing this module elsewhere.
+client = openai.OpenAI(base_url="http://localhost:8004/v1", api_key="EMPTY")
+
+# Global tracker for reflection scores
+# "scores" accumulates every reflection score seen by this process;
+# "batch_count" is declared but never incremented anywhere in this file.
+reflection_scores_tracker = {
+    "scores": [],
+    "batch_count": 0
+}
+
+
+def extract_json_answer(text: str) -> str:
+ """Extract agent_notes from JSON response"""
+ try:
+ answer = repair_json(text, return_objects=True)
+ answer = answer["agent_notes"]
+ except Exception as e:
+ print(f"Error extracting JSON answer: {e}")
+ return ""
+ return answer
+
+
+def ask_judge(prompt, system_prompt=None, max_retries=3):
+    """Ask the judge model for evaluation.
+
+    Sends ``prompt`` (optionally preceded by ``system_prompt``) to the judge
+    endpoint behind the module-level ``client`` and returns the stripped
+    reply text. Retries up to ``max_retries`` times, sleeping 1s between
+    attempts; the exception from the final failed attempt is re-raised.
+
+    NOTE(review): if called with max_retries <= 0 the loop body never runs
+    and the function implicitly returns None.
+    """
+    if system_prompt:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": prompt}
+        ]
+    else:
+        messages = [{"role": "user", "content": prompt}]
+
+    for attempt in range(max_retries):
+        try:
+            chat_completion = client.chat.completions.create(
+                model="meta-llama/Llama-3.1-70B-Instruct",
+                messages=messages,
+                max_tokens=2048
+            )
+            return chat_completion.choices[0].message.content.strip()
+        except Exception as e:
+            # Transient server errors are common while vLLM warms up; back
+            # off briefly, but surface the error once retries are exhausted.
+            if attempt < max_retries - 1:
+                time.sleep(1)
+            else:
+                raise e
+
+
+def evaluate_reflection(completion, gold_response, responses_that_enforce_preferences):
+ """Evaluate a single reflection completion"""
+
+ completion_text = extract_json_answer(completion)
+ if completion_text == "":
+ print(f"Poorly formatted completion: {completion}")
+ return 0
+
+ user_messages_where_they_enforce_preferences = ""
+ for i, response in enumerate(responses_that_enforce_preferences):
+ user_messages_where_they_enforce_preferences += f"User message #{i+1}: {response}\n"
+
+ reflection_evaluation_prompt = f"""You are an expert evaluator analyzing a conversational agent's reflection of a conversation, where they analyze the conversation to identify the user's preferences and create actionable notes to help them satisfy these preferences in future conversations.
+
+Throughout the conversation, the user explicitly enforces their preferences whenever necessary. The agent analyzes the conversation to identify the user's preferences and create actionable notes to help them satisfy these preferences in future conversations.
+
+# Your Task:
+Evaluate whether the agent's reflection succesfully captures the user's preferences and provides actionable notes to help them satisfy these preferences in future conversations.
+
+# Agent's Reflection:
+{completion_text}
+
+# User Messages Where They Enforce Their Preferences:
+{user_messages_where_they_enforce_preferences}
+
+# Gold Reflection:
+Here is a gold reflection for the same conversation. Use this as a reference to evaluate the agent's reflection.
+{gold_response}
+
+# Evaluation Criteria:
+Assess the reflection on four dimensions:
+- **Coverage (Completeness):** Does the agent's reflection capture all of the user's preferences?
+- **Actionability (Quality):** Does the agent's reflection provide actionable notes and details that help the agent satisfy these preferences in future conversations?
+- **Accuracy (No Hallucination):** Are all points grounded in actual user statements? Does the reflection avoid inventing preferences or misrepresenting user statements?
+- **Clarity:** Is the reflection well-organized and clearly formatted? Does the reflection avoid redundancy, with each preference stated once without repetitive or overlapping notes?
+
+You will output a score from 0-3, where:
+- 0: Does not effectively capture user preferences: gaps in converage, or significant hallucinations
+- 1: Captures some preferences with limited actionable notes, may hallucinate some preferences
+- 2: Captures most preferences with actionable notes, may have some slight hallucinations
+- 3: Comprehensively captures all preferences with highly actionable notes and no hallucinations
+
+# Output Format:
+{{
+ "reasoning": # Brief explanation of your decision
+ "reflection_score": # 0-3
+}}
+
+Output a properly formatted JSON response, as specified by the Output Format."""
+
+ reflection_score_response = ask_judge(reflection_evaluation_prompt)
+ reflection_score = repair_json(reflection_score_response, return_objects=True)["reflection_score"]
+
+ print(f"Reflection Score: {reflection_score}")
+ return reflection_score
+
+
+def soft_format_reward(solution_str):
+ """Check if the completion has JSON format with required fields"""
+ reward = 0.0
+ try:
+ parsed_json = repair_json(solution_str, return_objects=True)
+ if "agent_notes" in parsed_json and "user_preferences_reasoning" in parsed_json:
+ reward = 0.5
+ except Exception:
+ pass
+
+ print(f"Soft Format Reward: {reward}")
+ return reward
+
+
+# VERL reward function signature: (data_source, solution_str, ground_truth, extra_info)
+def compute_score(data_source, solution_str, ground_truth, extra_info=None):
+ """
+ Main reward function for VERL (named 'compute_score' for default VERL compatibility).
+ This matches the signature expected by VERL's reward managers.
+
+ Args:
+ data_source: Source identifier for the data
+ solution_str: The model's generated completion
+ ground_truth: Not used directly, passed in extra_info
+ extra_info: Dictionary containing 'gold_response' and 'responses_that_enforce_preferences'
+
+ Returns:
+ float: Combined reward score
+ """
+ if extra_info is None:
+ print("Warning: extra_info is None")
+ return 0.0
+
+ gold_response = extra_info.get('gold_response', '')
+ responses_that_enforce_preferences = extra_info.get('responses_that_enforce_preferences', [])
+
+ # Soft format reward
+ format_reward = soft_format_reward(solution_str)
+ # Reflection quality reward
+ reflection_score = evaluate_reflection(solution_str, gold_response, responses_that_enforce_preferences)
+
+ total_reward = format_reward + reflection_score
+
+ reflection_scores_tracker["scores"].append(reflection_score)
+ print(f"Total Reward: {total_reward} (Format: {format_reward}, Reflection: {reflection_score})")
+
+ return total_reward \ No newline at end of file