---
# verl GRPO training configuration (resolved Hydra/OmegaConf dump).
# NOTE(review): the original file had all newlines collapsed to spaces and was
# unparseable; structure below is reconstructed from the verl ppo_trainer
# schema. All scalar values and ${...} interpolations are preserved verbatim.

actor_rollout_ref:
  # Actor (policy) worker: FSDP training engine + PPO/GRPO loss settings.
  actor:
    optim:
      _target_: verl.workers.config.FSDPOptimizerConfig
      optimizer: AdamW
      optimizer_impl: torch.optim
      lr: 1.0e-06
      lr_warmup_steps_ratio: 0.0
      total_training_steps: -1
      weight_decay: 0.01
      lr_warmup_steps: -1
      betas:
        - 0.9
        - 0.999
      clip_grad: 1.0
      min_lr_ratio: 0.0
      num_cycles: 0.5
      lr_scheduler_type: constant
      warmup_style: null
      override_optimizer_config: null
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      offload_policy: false
      reshard_after_forward: true
      fsdp_size: -1
      forward_prefetch: false
      model_dtype: bfloat16
      use_orig_params: false
      seed: 42
      full_determinism: false
      ulysses_sequence_parallel_size: 1
      entropy_from_logits_with_chunking: false
      use_torch_compile: true
      entropy_checkpointing: false
      forward_only: false
      strategy: fsdp
      dtype: bfloat16
    _target_: verl.workers.config.FSDPActorConfig
    rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
    strategy: fsdp
    ppo_mini_batch_size: 8
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 4
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    tau_pos: 1.0
    tau_neg: 1.05
    freeze_vision_tower: false
    policy_loss:
      _target_: verl.workers.config.PolicyLossConfig
      loss_mode: vanilla
      clip_cov_ratio: 0.0002
      clip_cov_lb: 1.0
      clip_cov_ub: 5.0
      kl_cov_ratio: 0.0002
      ppo_kl_coef: 0.1
      clip_ratio_c: 3.0
    # NOTE(review): placement of the next two keys at actor level is inferred
    # (critic references actor_rollout_ref.actor.loss_agg_mode) — confirm
    # against verl.workers.config.FSDPActorConfig.
    loss_agg_mode: token-mean
    loss_scale_factor: null
    entropy_coeff: 0
    calculate_entropy: false
    use_kl_loss: true
    use_torch_compile: true
    kl_loss_coef: 0.003
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    data_loader_seed: 42
    checkpoint:
      _target_: verl.trainer.config.CheckpointConfig
      save_contents:
        - model
        - optimizer
        - extra
      load_contents: ${.save_contents}
      async_save: false
    use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
    profiler:
      _target_: verl.utils.profiler.ProfilerConfig
      tool: ${oc.select:global_profiler.tool,null}
      enable: false
      all_ranks: false
      ranks: []
      save_path: ${oc.select:global_profiler.save_path,null}
      tool_config:
        nsys:
          _target_: verl.utils.profiler.config.NsightToolConfig
          discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
        npu:
          _target_: verl.utils.profiler.config.NPUToolConfig
          contents: []
          level: level0
          analysis: true
          discrete: false
        torch:
          _target_: verl.utils.profiler.config.TorchProfilerToolConfig
          step_start: 0
          step_end: null
        torch_memory:
          _target_: verl.utils.profiler.config.TorchMemoryToolConfig
          trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
          stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
    router_replay:
      _target_: verl.workers.config.RouterReplayConfig
      mode: disabled
      record_file: null
      replay_file: null
    grad_clip: 1.0
    ulysses_sequence_parallel_size: 1
    entropy_from_logits_with_chunking: false
    entropy_checkpointing: false
    use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}

  # Reference policy worker: forward-only copy of the actor for KL terms.
  ref:
    rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
    strategy: ${actor_rollout_ref.actor.strategy}
    use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 4
    log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
    log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
    profiler:
      _target_: verl.utils.profiler.ProfilerConfig
      tool: ${oc.select:global_profiler.tool,null}
      enable: false
      all_ranks: false
      ranks: []
      save_path: ${oc.select:global_profiler.save_path,null}
      tool_config:
        nsys:
          _target_: verl.utils.profiler.config.NsightToolConfig
          discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
        npu:
          _target_: verl.utils.profiler.config.NPUToolConfig
          contents: []
          level: level0
          analysis: true
          discrete: false
        torch:
          _target_: verl.utils.profiler.config.TorchProfilerToolConfig
          step_start: 0
          step_end: null
        torch_memory:
          _target_: verl.utils.profiler.config.TorchMemoryToolConfig
          trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
          stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
    router_replay:
      _target_: verl.workers.config.RouterReplayConfig
      mode: disabled
      record_file: null
      replay_file: null
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      wrap_policy:
        min_num_params: 0
      param_offload: true
      optimizer_offload: false
      offload_policy: false
      reshard_after_forward: true
      fsdp_size: -1
      forward_prefetch: false
      model_dtype: bfloat16
      use_orig_params: false
      seed: 42
      full_determinism: false
      ulysses_sequence_parallel_size: 1
      entropy_from_logits_with_chunking: false
      use_torch_compile: true
      entropy_checkpointing: false
      forward_only: true
      strategy: fsdp
      dtype: bfloat16
    _target_: verl.workers.config.FSDPActorConfig
    ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
    entropy_from_logits_with_chunking: false
    entropy_checkpointing: false

  # Rollout worker: vLLM async generation settings.
  rollout:
    _target_: verl.workers.config.RolloutConfig
    name: vllm
    mode: async
    temperature: 0.9
    top_k: -1
    top_p: 0.9
    prompt_length: ${oc.select:data.max_prompt_length,512}
    response_length: ${oc.select:data.max_response_length,512}
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    ignore_eos: false
    enforce_eager: false
    cudagraph_capture_sizes: null
    free_cache_engine: true
    tensor_model_parallel_size: 1
    data_parallel_size: 1
    expert_parallel_size: 1
    pipeline_model_parallel_size: 1
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    enable_chunked_prefill: true
    enable_prefix_caching: true
    logprobs_mode: processed_logprobs
    load_format: dummy
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 4
    log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
    log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
    disable_log_stats: true
    do_sample: true
    # 'n' is quoted: bare n is a YAML 1.1 boolean.
    'n': 8
    over_sample_rate: 0
    multi_stage_wake_up: false
    engine_kwargs:
      vllm: {}
      sglang: {}
    val_kwargs:
      _target_: verl.workers.config.SamplingConfig
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
    multi_turn:
      _target_: verl.workers.config.MultiTurnConfig
      enable: false
      max_assistant_turns: null
      tool_config_path: null
      max_user_turns: null
      max_parallel_calls: 1
      max_tool_response_length: 256
      tool_response_truncate_side: middle
      interaction_config_path: null
      use_inference_chat_template: false
      tokenization_sanity_check_mode: strict
      format: hermes
    num_repeat_rollouts: null
    calculate_log_probs: false
    agent:
      _target_: verl.workers.config.AgentLoopConfig
      num_workers: 8
      default_agent_loop: single_turn_agent
      agent_loop_config_path: null
      custom_async_server:
        _target_: verl.workers.config.CustomAsyncServerConfig
        path: null
        name: null
    update_weights_bucket_megabytes: 512
    trace:
      _target_: verl.workers.config.TraceConfig
      backend: null
      token2text: false
    # NOTE(review): this key's level (rollout vs. trace) is inferred from the
    # flattened dump — confirm against verl.workers.config.RolloutConfig.
    max_samples_per_step_per_worker: null
    skip_rollout: false
    skip_dump_dir: /tmp/rollout_dump
    skip_tokenizer_init: true
    enable_rollout_routing_replay: false
    profiler:
      _target_: verl.utils.profiler.ProfilerConfig
      tool: ${oc.select:global_profiler.tool,null}
      enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
      all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
      ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
      save_path: ${oc.select:global_profiler.save_path,null}
      tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
    prometheus:
      _target_: verl.workers.config.PrometheusConfig
      enable: false
      port: 9090
      file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
    served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
    quantization: null
    quantization_config_file: null
    layered_summon: false

  # Shared HF model definition for actor/ref/rollout.
  model:
    _target_: verl.workers.config.HFModelConfig
    path: /work/nvme/bfqt/yurenh2/sft_checkpoints/checkpoint-200
    hf_config_path: null
    tokenizer_path: null
    use_shm: false
    trust_remote_code: false
    custom_chat_template: null
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    enable_activation_offload: false
    use_remove_padding: true
    lora_rank: 0
    lora_alpha: 16
    target_modules: all-linear
    exclude_modules: null
    lora_adapter_path: null
    use_liger: false
    use_fused_kernels: false
    fused_kernel_options:
      impl_backend: torch
    tiled_mlp:
      enabled: false
      num_shards: 4
  hybrid_engine: true
  nccl_timeout: 600

# Dataset: prompt/response lengths, batching, and preprocessing.
data:
  tokenizer: null
  use_shm: false
  train_files: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
  val_files: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/data/session_level_reflection_grpo_train.parquet
  train_max_samples: -1
  val_max_samples: -1
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 2048
  max_response_length: 1024
  train_batch_size: 64
  val_batch_size: null
  tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, null}
  return_raw_input_ids: false
  return_raw_chat: true
  return_full_prompt: false
  shuffle: true
  seed: null
  dataloader_num_workers: 8
  image_patch_size: 14
  validation_shuffle: false
  filter_overlong_prompts: true
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  video_key: videos
  trust_remote_code: false
  custom_cls:
    path: null
    name: null
  return_multi_modal_inputs: true
  sampler:
    class_path: null
    class_name: null
  datagen:
    path: null
    name: null
  apply_chat_template_kwargs: {}

# Reward-manager registration (custom manager loaded by name).
reward_manager:
  _target_: verl.trainer.config.config.RewardManagerConfig
  source: register
  name: ${oc.select:reward_model.reward_manager,naive}
  module:
    _target_: verl.trainer.config.config.ModuleConfig
    path: null
    name: custom_reward_manager

# Critic worker (unused by GRPO's adv_estimator but kept in the resolved config).
critic:
  optim:
    _target_: verl.workers.config.FSDPOptimizerConfig
    optimizer: AdamW
    optimizer_impl: torch.optim
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    total_training_steps: -1
    weight_decay: 0.01
    lr_warmup_steps: -1
    betas:
      - 0.9
      - 0.999
    clip_grad: 1.0
    min_lr_ratio: 0.0
    num_cycles: 0.5
    lr_scheduler_type: constant
    warmup_style: null
    override_optimizer_config: null
  model:
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      offload_policy: false
      reshard_after_forward: true
      fsdp_size: -1
      forward_prefetch: false
      model_dtype: fp32
      use_orig_params: false
      seed: 42
      full_determinism: false
      ulysses_sequence_parallel_size: 1
      entropy_from_logits_with_chunking: false
      use_torch_compile: true
      entropy_checkpointing: false
      forward_only: false
      strategy: fsdp
      dtype: bfloat16
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
    override_config: {}
    external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
    trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
    _target_: verl.workers.config.FSDPCriticModelCfg
    use_shm: false
    enable_gradient_checkpointing: true
    enable_activation_offload: false
    use_remove_padding: false
    lora_rank: 0
    lora_alpha: 16
    target_modules: all-linear
    tiled_mlp:
      enabled: false
      num_shards: 4
  _target_: verl.workers.config.FSDPCriticConfig
  rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
  strategy: fsdp
  enable: null
  ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
  use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
  ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
  shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
  data_loader_seed: 42
  cliprange_value: 0.5
  loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
  checkpoint:
    _target_: verl.trainer.config.CheckpointConfig
    save_contents:
      - model
      - optimizer
      - extra
    load_contents: ${.save_contents}
    async_save: false
  profiler:
    _target_: verl.utils.profiler.ProfilerConfig
    tool: ${oc.select:global_profiler.tool,null}
    enable: false
    all_ranks: false
    ranks: []
    save_path: ${oc.select:global_profiler.save_path,null}
    tool_config:
      nsys:
        _target_: verl.utils.profiler.config.NsightToolConfig
        discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
      npu:
        _target_: verl.utils.profiler.config.NPUToolConfig
        contents: []
        level: level0
        analysis: true
        discrete: false
      torch:
        _target_: verl.utils.profiler.config.TorchProfilerToolConfig
        step_start: 0
        step_end: null
      torch_memory:
        _target_: verl.utils.profiler.config.TorchMemoryToolConfig
        trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
        stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
  forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
  forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
  ulysses_sequence_parallel_size: 1
  grad_clip: 1.0

# Model-based reward worker (disabled; custom_reward_function is used instead).
reward_model:
  enable: false
  enable_resource_pool: false
  n_gpus_per_node: 8
  nnodes: 0
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    trust_remote_code: false
    override_config: {}
    use_shm: false
    use_remove_padding: false
    use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      wrap_policy:
        min_num_params: 0
      param_offload: false
      reshard_after_forward: true
      fsdp_size: -1
      forward_prefetch: false
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  reward_manager: naive
  reward_loop_source: register
  reward_loop_module_path: null
  reward_loop_class_name: null
  launch_reward_fn_async: false
  sandbox_fusion:
    url: null
    max_concurrent: 64
    memory_limit_mb: 1024
  profiler:
    _target_: verl.utils.profiler.ProfilerConfig
    tool: ${oc.select:global_profiler.tool,null}
    enable: false
    all_ranks: false
    ranks: []
    save_path: ${oc.select:global_profiler.save_path,null}
    tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
  ulysses_sequence_parallel_size: 1
  use_reward_loop: true
  num_workers: 1
  rollout:
    _target_: verl.workers.config.RolloutConfig
    # ??? is OmegaConf's mandatory-value marker: must be set before use.
    name: ???
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    enforce_eager: true
    cudagraph_capture_sizes: null
    free_cache_engine: true
    data_parallel_size: 1
    expert_parallel_size: 1
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    load_format: auto
    engine_kwargs: {}
    limit_images: null
    enable_chunked_prefill: true
    enable_prefix_caching: true
    disable_log_stats: true
    skip_tokenizer_init: false
    prompt_length: 2048
    response_length: 2048

# Algorithm: GRPO advantage estimation, no KL-in-reward.
algorithm:
  rollout_correction:
    rollout_is: null
    rollout_is_threshold: 2.0
    rollout_rs: null
    rollout_rs_threshold: null
    rollout_rs_threshold_lower: null
    rollout_token_veto_threshold: null
    bypass_mode: false
    loss_type: ppo_clip
    rollout_is_batch_normalize: false
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo
  norm_adv_by_std_in_grpo: true
  use_kl_in_reward: false
  kl_penalty: kl
  kl_ctrl:
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  use_pf_ppo: false
  pf_ppo:
    reweight_method: pow
    weight_pow: 2.0

# Python reward function applied to rollouts.
custom_reward_function:
  path: /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/grpo_verl/verl_reward_functions.py
  name: compute_score

# Trainer loop: logging, checkpointing, cluster shape.
trainer:
  balance_batch: true
  total_epochs: 1
  total_training_steps: null
  project_name: collaborative-agent-reflection-grpo
  experiment_name: llama3.1-8b-grpo
  logger:
    - console
  log_val_generations: 0
  rollout_data_dir: null
  validation_data_dir: null
  nnodes: 1
  n_gpus_per_node: 2
  save_freq: 50
  esi_redundant_time: 0
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  val_only: false
  test_freq: 100
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: /scratch/bfqt/yurenh2/grpo_outputs
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  ray_wait_register_center_timeout: 300
  device: cuda
  use_legacy_worker_impl: auto

# Global profiler defaults referenced by per-worker profiler sections.
global_profiler:
  _target_: verl.utils.profiler.ProfilerConfig
  tool: null
  steps: null
  profile_continuous_steps: false
  save_path: outputs/profile
  global_tool_config:
    nsys:
      _target_: verl.utils.profiler.config.NsightToolConfig
      discrete: false
      controller_nsight_options:
        trace: cuda,nvtx,cublas,ucx
        cuda-memory-usage: 'true'
        cuda-graph-trace: graph
      worker_nsight_options:
        trace: cuda,nvtx,cublas,ucx
        cuda-memory-usage: 'true'
        cuda-graph-trace: graph
        capture-range: cudaProfilerApi
        capture-range-end: null
        kill: none
    torch_memory:
      trace_alloc_max_entries: 100000
      stack_depth: 32
      context: all
      stacks: all
      kw_args: {}

transfer_queue:
  enable: false

# Ray initialization passthrough.
ray_kwargs:
  ray_init:
    num_cpus: null
  timeline_json_file: null