| field     | value                                    | date                      |
|-----------|------------------------------------------|---------------------------|
| author    | YurenHao0426 <blackhao0426@gmail.com>    | 2026-01-27 12:15:45 -0600 |
| committer | YurenHao0426 <blackhao0426@gmail.com>    | 2026-01-27 12:15:45 -0600 |
| commit    | 680513b7771a29f27cbbb3ffb009a69a913de6f9 |                           |
| tree      | a0d60aef9ade1b2953b915f535b990c0de95e493 |                           |
| parent    | c06ec2f3b80f8968f09eb801b69237495b055ec1 |                           |
local reward model
Diffstat (limited to 'collaborativeagents')

| mode       | file                                                      | changes |
|------------|-----------------------------------------------------------|---------|
| -rw-r--r-- | collaborativeagents/.gitignore                            | 4       |
| -rw-r--r-- | collaborativeagents/adapters/personalized_llm_adapter.py | 13      |
| -rw-r--r-- | collaborativeagents/scripts/run_experiments.py            | 13      |

3 files changed, 23 insertions(+), 7 deletions(-)
```diff
diff --git a/collaborativeagents/.gitignore b/collaborativeagents/.gitignore
index 3d9130f..b69c8e0 100644
--- a/collaborativeagents/.gitignore
+++ b/collaborativeagents/.gitignore
@@ -3,4 +3,6 @@ __pycache__/
 cache/
 *.out
 temp*.json
-scripts/runs
\ No newline at end of file
+scripts/runs
+--only-binary=:all:/
+*.whl
\ No newline at end of file
diff --git a/collaborativeagents/adapters/personalized_llm_adapter.py b/collaborativeagents/adapters/personalized_llm_adapter.py
index c2d4727..b476272 100644
--- a/collaborativeagents/adapters/personalized_llm_adapter.py
+++ b/collaborativeagents/adapters/personalized_llm_adapter.py
@@ -58,9 +58,12 @@ class AdapterConfig:
     # Best-of-N sampling: generate N responses and pick best (for RAG methods)
     best_of_n: int = 1
 
-    # Reward mode: "keyword" (legacy heuristic) or "llm" (GPT-5-nano judge)
+    # Reward mode: "keyword" (legacy heuristic), "llm" (GPT-4o-mini), or "llm_local" (local vLLM)
     reward_mode: str = "keyword"
 
+    # vLLM URL for local reward model (only used when reward_mode="llm_local")
+    reward_vllm_url: str = "http://localhost:8005/v1"
+
     # Reward mapping for user behavior
     preference_enforcement_reward: float = -0.8  # Negative reward when user enforces
     disappointment_expression_reward: float = -0.4  # Milder negative for disappointment
@@ -116,6 +119,7 @@ class PersonalizedLLMAdapter:
             reranker_type=self.config.reranker_type,
             best_of_n=self.config.best_of_n,
             reward_mode=self.config.reward_mode,
+            reward_vllm_url=self.config.reward_vllm_url,
         )
         self._initialized = True
         print("[Adapter] Initialization complete.")
@@ -423,6 +427,7 @@ def create_baseline_adapter(
     use_vllm: bool = False,
     use_shared_models: bool = False,
     reward_mode: str = "keyword",
+    reward_vllm_url: str = "http://localhost:8005/v1",
 ) -> PersonalizedLLMAdapter:
     """
     Create an adapter configured for a specific baseline.
@@ -438,7 +443,8 @@ def create_baseline_adapter(
         - "rag_vector": Full personalization (Extractor + RAG + User Vector)
         device_assignment: GPU assignment dict
         use_vllm: If True, use vLLM HTTP API for LLM inference (much faster)
-        reward_mode: Global reward mode ("keyword" or "llm") applied to all methods
+        reward_mode: Global reward mode ("keyword", "llm", or "llm_local")
+        reward_vllm_url: vLLM URL for local reward model (when reward_mode="llm_local")
         use_shared_models: If True, share embedding/reranker models across parallel workers.
                            ESSENTIAL for parallel profile processing to avoid OOM.
@@ -592,8 +598,9 @@ def create_baseline_adapter(
     if device_assignment:
         config.device_assignment = device_assignment
 
-    # Apply global reward_mode to all methods (overrides per-method defaults)
+    # Apply global reward settings to all methods (overrides per-method defaults)
     config.reward_mode = reward_mode
+    config.reward_vllm_url = reward_vllm_url
 
     return PersonalizedLLMAdapter(config)
diff --git a/collaborativeagents/scripts/run_experiments.py b/collaborativeagents/scripts/run_experiments.py
index 0ba0ba0..e04680c 100644
--- a/collaborativeagents/scripts/run_experiments.py
+++ b/collaborativeagents/scripts/run_experiments.py
@@ -89,10 +89,13 @@ class ExperimentConfig:
     use_openai_user: bool = False
     openai_user_model: str = "gpt-5"  # Model name for OpenAI user agent
 
-    # Reward mode: "keyword" (implicit user signals) or "llm" (GPT-5-nano judge)
+    # Reward mode: "keyword" (implicit user signals), "llm" (GPT-4o-mini), or "llm_local" (local vLLM)
     # This is a global option applied to ALL methods that use RL updates
     reward_mode: str = "keyword"
 
+    # vLLM URL for local reward model (only used when reward_mode="llm_local")
+    reward_vllm_url: str = "http://localhost:8005/v1"
+
     # Parallel/Batch processing
     parallel_profiles: int = 50  # Number of profiles to process in parallel
     use_batch_processing: bool = True  # Use turn-synchronous batch processing for vanilla/all_memory
@@ -248,6 +251,7 @@ class ExperimentRunner:
             use_vllm=self.config.use_vllm,
             use_shared_models=use_shared_models,
             reward_mode=self.config.reward_mode,
+            reward_vllm_url=self.config.reward_vllm_url,
         )
         # Profile will be passed to start_session() when the conversation begins
         return adapter
@@ -1264,8 +1268,10 @@ def main():
                         help="Use OpenAI API (GPT-5) for user simulation instead of vLLM")
     parser.add_argument("--openai-user-model", type=str, default="gpt-5",
                         help="OpenAI model name for user simulator (default: gpt-5)")
-    parser.add_argument("--reward-mode", type=str, default="keyword", choices=["keyword", "llm"],
-                        help="Reward mode for RL updates: 'keyword' (user signals) or 'llm' (GPT-5-nano judge)")
+    parser.add_argument("--reward-mode", type=str, default="keyword", choices=["keyword", "llm", "llm_local"],
+                        help="Reward mode: 'keyword' (user signals), 'llm' (GPT-4o-mini), or 'llm_local' (local vLLM)")
+    parser.add_argument("--reward-vllm-url", type=str, default="http://localhost:8005/v1",
+                        help="vLLM server URL for local reward model (when --reward-mode=llm_local)")
     parser.add_argument("--parallel-profiles", type=int, default=50,
                         help="Number of profiles to process in parallel (requires --use-vllm)")
@@ -1302,6 +1308,7 @@ def main():
         use_openai_user=args.use_openai_user,
         openai_user_model=args.openai_user_model,
         reward_mode=args.reward_mode,
+        reward_vllm_url=args.reward_vllm_url,
         parallel_profiles=args.parallel_profiles,
         use_batch_processing=args.use_batch_processing,
         batch_size_conversations=args.batch_size,
```
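This commit only threads `reward_vllm_url` through the configs and CLI; the `llm_local` judge itself is not part of the diff. As a minimal sketch of how a reward query against an OpenAI-compatible vLLM endpoint at that URL typically looks (the function name, model name, prompt, and score parsing below are illustrative assumptions, not this repository's implementation):

```python
# Hypothetical sketch of an "llm_local" reward query -- not this repo's
# actual judge code, which is not shown in the diff above.
from openai import OpenAI

def score_response(user_msg: str, assistant_msg: str,
                   base_url: str = "http://localhost:8005/v1") -> float:
    """Ask a locally served vLLM judge for a scalar reward in [-1, 1]."""
    # vLLM's OpenAI-compatible server ignores the API key,
    # but the openai client requires a non-empty value.
    client = OpenAI(base_url=base_url, api_key="EMPTY")
    completion = client.chat.completions.create(
        model="judge-model",  # placeholder: must match the model vLLM serves
        messages=[
            {"role": "system",
             "content": ("Rate how well the assistant respected the user's "
                         "preferences. Reply with one number in [-1, 1].")},
            {"role": "user",
             "content": f"User: {user_msg}\nAssistant: {assistant_msg}"},
        ],
        temperature=0.0,
        max_tokens=8,
    )
    text = (completion.choices[0].message.content or "").strip()
    try:
        return float(text)
    except ValueError:
        return 0.0  # neutral fallback when the judge's output is not a number
```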

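For reference, with a judge model served locally (for example via `vllm serve <model> --port 8005`; the serving command is an assumption, not part of this commit), the new options would be exercised as `python collaborativeagents/scripts/run_experiments.py --reward-mode llm_local --reward-vllm-url http://localhost:8005/v1`, where the flag names and default URL come from the diff above.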