| field     | value                                    | date                      |
|-----------|------------------------------------------|---------------------------|
| author    | YurenHao0426 <blackhao0426@gmail.com>    | 2026-01-27 12:15:45 -0600 |
| committer | YurenHao0426 <blackhao0426@gmail.com>    | 2026-01-27 12:15:45 -0600 |
| commit    | 680513b7771a29f27cbbb3ffb009a69a913de6f9 |                           |
| tree      | a0d60aef9ade1b2953b915f535b990c0de95e493 |                           |
| parent    | c06ec2f3b80f8968f09eb801b69237495b055ec1 |                           |
local reward model
Diffstat (limited to 'collaborativeagents')

| mode       | file                                                      | changes |
|------------|-----------------------------------------------------------|---------|
| -rw-r--r-- | collaborativeagents/.gitignore                            | 4       |
| -rw-r--r-- | collaborativeagents/adapters/personalized_llm_adapter.py | 13      |
| -rw-r--r-- | collaborativeagents/scripts/run_experiments.py            | 13      |

3 files changed, 23 insertions(+), 7 deletions(-)
```diff
diff --git a/collaborativeagents/.gitignore b/collaborativeagents/.gitignore
index 3d9130f..b69c8e0 100644
--- a/collaborativeagents/.gitignore
+++ b/collaborativeagents/.gitignore
@@ -3,4 +3,6 @@ __pycache__/
 cache/
 *.out
 temp*.json
-scripts/runs
\ No newline at end of file
+scripts/runs
+--only-binary=:all:/
+*.whl
\ No newline at end of file
diff --git a/collaborativeagents/adapters/personalized_llm_adapter.py b/collaborativeagents/adapters/personalized_llm_adapter.py
index c2d4727..b476272 100644
--- a/collaborativeagents/adapters/personalized_llm_adapter.py
+++ b/collaborativeagents/adapters/personalized_llm_adapter.py
@@ -58,9 +58,12 @@ class AdapterConfig:
     # Best-of-N sampling: generate N responses and pick best (for RAG methods)
     best_of_n: int = 1
 
-    # Reward mode: "keyword" (legacy heuristic) or "llm" (GPT-5-nano judge)
+    # Reward mode: "keyword" (legacy heuristic), "llm" (GPT-4o-mini), or "llm_local" (local vLLM)
     reward_mode: str = "keyword"
 
+    # vLLM URL for local reward model (only used when reward_mode="llm_local")
+    reward_vllm_url: str = "http://localhost:8005/v1"
+
     # Reward mapping for user behavior
     preference_enforcement_reward: float = -0.8  # Negative reward when user enforces
     disappointment_expression_reward: float = -0.4  # Milder negative for disappointment
@@ -116,6 +119,7 @@ class PersonalizedLLMAdapter:
             reranker_type=self.config.reranker_type,
             best_of_n=self.config.best_of_n,
             reward_mode=self.config.reward_mode,
+            reward_vllm_url=self.config.reward_vllm_url,
         )
         self._initialized = True
         print("[Adapter] Initialization complete.")
@@ -423,6 +427,7 @@ def create_baseline_adapter(
     use_vllm: bool = False,
     use_shared_models: bool = False,
     reward_mode: str = "keyword",
+    reward_vllm_url: str = "http://localhost:8005/v1",
 ) -> PersonalizedLLMAdapter:
     """
     Create an adapter configured for a specific baseline.
@@ -438,7 +443,8 @@ def create_baseline_adapter(
         - "rag_vector": Full personalization (Extractor + RAG + User Vector)
         device_assignment: GPU assignment dict
         use_vllm: If True, use vLLM HTTP API for LLM inference (much faster)
-        reward_mode: Global reward mode ("keyword" or "llm") applied to all methods
+        reward_mode: Global reward mode ("keyword", "llm", or "llm_local")
+        reward_vllm_url: vLLM URL for local reward model (when reward_mode="llm_local")
         use_shared_models: If True, share embedding/reranker models across parallel workers.
                            ESSENTIAL for parallel profile processing to avoid OOM.
@@ -592,8 +598,9 @@ def create_baseline_adapter(
     if device_assignment:
         config.device_assignment = device_assignment
 
-    # Apply global reward_mode to all methods (overrides per-method defaults)
+    # Apply global reward settings to all methods (overrides per-method defaults)
     config.reward_mode = reward_mode
+    config.reward_vllm_url = reward_vllm_url
 
     return PersonalizedLLMAdapter(config)
diff --git a/collaborativeagents/scripts/run_experiments.py b/collaborativeagents/scripts/run_experiments.py
index 0ba0ba0..e04680c 100644
--- a/collaborativeagents/scripts/run_experiments.py
+++ b/collaborativeagents/scripts/run_experiments.py
@@ -89,10 +89,13 @@ class ExperimentConfig:
     use_openai_user: bool = False
     openai_user_model: str = "gpt-5"  # Model name for OpenAI user agent
 
-    # Reward mode: "keyword" (implicit user signals) or "llm" (GPT-5-nano judge)
+    # Reward mode: "keyword" (implicit user signals), "llm" (GPT-4o-mini), or "llm_local" (local vLLM)
     # This is a global option applied to ALL methods that use RL updates
     reward_mode: str = "keyword"
 
+    # vLLM URL for local reward model (only used when reward_mode="llm_local")
+    reward_vllm_url: str = "http://localhost:8005/v1"
+
     # Parallel/Batch processing
     parallel_profiles: int = 50  # Number of profiles to process in parallel
     use_batch_processing: bool = True  # Use turn-synchronous batch processing for vanilla/all_memory
@@ -248,6 +251,7 @@ class ExperimentRunner:
             use_vllm=self.config.use_vllm,
             use_shared_models=use_shared_models,
             reward_mode=self.config.reward_mode,
+            reward_vllm_url=self.config.reward_vllm_url,
         )
         # Profile will be passed to start_session() when the conversation begins
         return adapter
@@ -1264,8 +1268,10 @@ def main():
                         help="Use OpenAI API (GPT-5) for user simulation instead of vLLM")
     parser.add_argument("--openai-user-model", type=str, default="gpt-5",
                         help="OpenAI model name for user simulator (default: gpt-5)")
-    parser.add_argument("--reward-mode", type=str, default="keyword", choices=["keyword", "llm"],
-                        help="Reward mode for RL updates: 'keyword' (user signals) or 'llm' (GPT-5-nano judge)")
+    parser.add_argument("--reward-mode", type=str, default="keyword", choices=["keyword", "llm", "llm_local"],
+                        help="Reward mode: 'keyword' (user signals), 'llm' (GPT-4o-mini), or 'llm_local' (local vLLM)")
+    parser.add_argument("--reward-vllm-url", type=str, default="http://localhost:8005/v1",
+                        help="vLLM server URL for local reward model (when --reward-mode=llm_local)")
     parser.add_argument("--parallel-profiles", type=int, default=50,
                         help="Number of profiles to process in parallel (requires --use-vllm)")
@@ -1302,6 +1308,7 @@ def main():
         use_openai_user=args.use_openai_user,
         openai_user_model=args.openai_user_model,
         reward_mode=args.reward_mode,
+        reward_vllm_url=args.reward_vllm_url,
         parallel_profiles=args.parallel_profiles,
         use_batch_processing=args.use_batch_processing,
         batch_size_conversations=args.batch_size,
```
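This commit only threads `reward_vllm_url` through the configs and CLI; the `llm_local` judge itself is not part of the diff. As a minimal sketch of how a reward query against an OpenAI-compatible vLLM endpoint at that URL typically looks (the function name, model name, prompt, and score parsing below are illustrative assumptions, not this repository's implementation):

```python
# Hypothetical sketch of an "llm_local" reward query -- not this repo's
# actual judge code, which is not shown in the diff above.
from openai import OpenAI

def score_response(user_msg: str, assistant_msg: str,
                   base_url: str = "http://localhost:8005/v1") -> float:
    """Ask a locally served vLLM judge for a scalar reward in [-1, 1]."""
    # vLLM's OpenAI-compatible server ignores the API key,
    # but the openai client requires a non-empty value.
    client = OpenAI(base_url=base_url, api_key="EMPTY")
    completion = client.chat.completions.create(
        model="judge-model",  # placeholder: must match the model vLLM serves
        messages=[
            {"role": "system",
             "content": ("Rate how well the assistant respected the user's "
                         "preferences. Reply with one number in [-1, 1].")},
            {"role": "user",
             "content": f"User: {user_msg}\nAssistant: {assistant_msg}"},
        ],
        temperature=0.0,
        max_tokens=8,
    )
    text = (completion.choices[0].message.content or "").strip()
    try:
        return float(text)
    except ValueError:
        return 0.0  # neutral fallback when the judge's output is not a number
```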

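For reference, with a judge model served locally (for example via `vllm serve <model> --port 8005`; the serving command is an assumption, not part of this commit), the new options would be exercised as `python collaborativeagents/scripts/run_experiments.py --reward-mode llm_local --reward-vllm-url http://localhost:8005/v1`, where the flag names and default URL come from the diff above.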