summaryrefslogtreecommitdiff
path: root/collaborativeagents/adapters/reflection_grpo_adapter.py
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/adapters/reflection_grpo_adapter.py')
-rw-r--r--collaborativeagents/adapters/reflection_grpo_adapter.py321
1 files changed, 321 insertions, 0 deletions
diff --git a/collaborativeagents/adapters/reflection_grpo_adapter.py b/collaborativeagents/adapters/reflection_grpo_adapter.py
new file mode 100644
index 0000000..09c5b26
--- /dev/null
+++ b/collaborativeagents/adapters/reflection_grpo_adapter.py
@@ -0,0 +1,321 @@
+"""
+Reflection + GRPO Adapter - Local transformers-based implementation.
+
+This implements the "Reflection + GRPO" baseline from the MULTISESSIONCOLLAB paper:
+- Uses a GRPO-trained model for session-level reflection
+- The model is trained to generate higher-quality reflections that capture user preferences
+- Training uses rewards from LLM judge evaluating reflection quality
+
+Key difference from vanilla reflection:
+- Uses GRPO-trained model for reflection generation (better preference capture)
+- Produces more actionable and comprehensive agent notes
+"""
+
+import sys
+from pathlib import Path
+from typing import Optional, List, Dict, Any
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from json_repair import repair_json
+
# Model paths - Use GRPO-trained model if available, fallback to base
# NOTE(review): these are absolute paths into a specific cluster filesystem —
# presumably the training environment for this project; confirm/override via
# the `model_name` constructor argument before running elsewhere.
GRPO_MODEL_PATH = "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/outputs/grpo_reflection/final"
SFT_MODEL_PATH = "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training/outputs/sft_reflection"
DEFAULT_MODEL_PATH = "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+
def get_best_available_model():
    """Get the best available model path (GRPO > SFT > base).

    A candidate directory counts as "available" only when it exists and
    contains a ``config.json`` (i.e. a completed checkpoint save).
    Falls back to ``DEFAULT_MODEL_PATH`` when no trained checkpoint is found.
    """
    grpo_dir = Path(GRPO_MODEL_PATH)
    if grpo_dir.exists() and (grpo_dir / "config.json").exists():
        print(f"[ReflectionGRPOAdapter] Using GRPO-trained model: {grpo_dir}")
        return str(grpo_dir)

    sft_dir = Path(SFT_MODEL_PATH)
    if sft_dir.exists() and (sft_dir / "config.json").exists():
        print(f"[ReflectionGRPOAdapter] Using SFT model (GRPO not found): {sft_dir}")
        return str(sft_dir)

    # Neither trained checkpoint is present — warn and fall back to the base model.
    print(f"[ReflectionGRPOAdapter] WARNING: No trained model found, using base model")
    print(f"[ReflectionGRPOAdapter] To train: run collaborativeagents/slurm/run_sft_training.sh")
    print(f"[ReflectionGRPOAdapter] then collaborativeagents/slurm/run_grpo_training.sh")
    return DEFAULT_MODEL_PATH
+
# GRPO-enhanced system prompt with proper scaffolding.
# Runtime system prompt for response generation; {agent_notes} is filled with
# the per-user notes accumulated by end_session().
REFLECTIVE_AGENT_SYSTEM_PROMPT = """You are a collaborative AI agent helping users solve writing, question answering, math, and coding problems.

# Notes
Remember, you have been taking notes throughout past conversations about user preferences. Use these notes to guide your response:
{agent_notes}

# Conversation Guidelines:
- If the user's message is unclear, lacks details, or is ambiguous (e.g. length of an essay, format requirements, specific constraints), do not make assumptions. Ask for clarification and ensure you have enough information before providing an answer.
- Your goal is to help the user solve their problem. Adhere to their preferences and do your best to help them solve their problem."""

# Free-text variant of the reflection prompt (no JSON scaffold).
# NOTE(review): this constant appears unused within this module — only the
# _GRPO variant below is referenced; confirm external usage before removing.
UPDATE_AGENT_NOTES_PROMPT = """You are a collaborative AI agent learning to better help a user with problem-solving tasks across multi-session interactions. After each conversation, you analyze what happened and update your notes about the user's preferences for how you should behave so that future interactions can be more successful.

# Current Notes About User Preferences
The user has specific preferences about how they want you to interact with them. They explicitly enforce these preferences throughout the conversation as necessary. Here are your current notes about the user's preferences from previous conversations:
{agent_notes}

# Conversation to Analyze
{conversation_str}

# Notes Updating Task
Analyze the conversation above to identify the user's preferences and how you can best satisfy them. Your goal is to create actionable notes that help you satisfy these preferences for future conversations. Keep your notes concise and actionable, without adding unnecessary details. Consider:
- When did the user explicitly ask you to adjust your response? What specifically did they want changed?
- What specific actions, formats, or approaches satisfy each preference? What should you keep in mind for future conversations?
As new situations arise, you may refine, combine, or split preferences to better reflect the user's needs. When updating the notes, do not lose any useful information from past interactions.
Make sure to add information about the user preferences that you are sure about, and do not hallucinate preferences.

Provide your updated notes as a clear, structured response. List each preference with actionable guidance."""

# GRPO-trained reflection prompt - produces higher quality reflections.
# Same task as above but requests a JSON object ({{...}} escapes the braces so
# str.format only substitutes {agent_notes}/{conversation_str}); the output is
# parsed in end_session() via json_repair.
UPDATE_AGENT_NOTES_PROMPT_GRPO = """You are a collaborative AI agent learning to better help a user with problem-solving tasks across multi-session interactions. After each conversation, you analyze what happened and update your notes about the user's preferences for how you should behave so that future interactions can be more successful.

# Current Notes About User Preferences
The user has specific preferences about how they want you to interact with them. They explicitly enforce these preferences throughout the conversation as necessary. Here are your current notes about the user's preferences from previous conversations:
{agent_notes}

# Conversation to Analyze
{conversation_str}

# Notes Updating Task
Analyze the conversation above to identify the user's preferences and how you can best satisfy them. Your goal is to create actionable notes that help you satisfy these preferences for future conversations. Keep your notes concise and actionable, without adding unnecessary details. Consider:
- When did the user explicitly ask you to adjust your response? What specifically did they want changed?
- What specific actions, formats, or approaches satisfy each preference? What should you keep in mind for future conversations?
As new situations arise, you may refine, combine, or split preferences to better reflect the user's needs. When updating the notes, do not lose any useful information from past interactions.
Make sure to add information about the user preferences that you are sure about, and do not hallucinate preferences.

# Output Format:
{{
    "user_preferences_reasoning": str, # Reasoning about the user preferences and how to satisfy them
    "agent_notes": str, # Updated notes. Provide a description of the user preferences, how to satisfy them, and any additional notes. This will be provided to you in future conversations with this user. Ensure that you provide a structured response that is clear and easy to understand.
}}
For each response, output a valid JSON object using the exact format above, do not include any text before or after the JSON object."""
+
+
class ReflectionGRPOAdapter:
    """
    Adapter for the Reflection + GRPO baseline from MULTISESSIONCOLLAB.

    Uses GRPO-trained model for:
    - Higher quality session-level reflections that better capture user preferences
    - The model was trained with rewards from LLM judge evaluating reflection quality

    Key difference from vanilla ReflectionAdapter:
    - Uses GRPO-trained model (if available) for reflection generation
    - Removes the faulty preprocessing step that was causing issues
    - Produces more comprehensive and actionable agent notes
    """

    def __init__(
        self,
        model_name: str = None,  # Auto-detect best available model
        device_assignment: dict = None,
        api_base: str = None,  # Ignored, kept for compatibility
        api_key: str = None,  # Ignored, kept for compatibility
    ):
        """Create the adapter without loading the model (lazy initialization).

        Args:
            model_name: Explicit model path; when falsy, the best available
                checkpoint is auto-detected (GRPO > SFT > base).
            device_assignment: Opaque device hint stored for callers;
                NOTE(review): not consumed here — ``device_map="auto"`` is used
                when loading; confirm whether it should influence placement.
            api_base / api_key: Ignored; kept for interface compatibility with
                API-backed adapters.
        """
        # Auto-detect best model (GRPO > SFT > base)
        self.model_path = model_name if model_name else get_best_available_model()
        self.device_assignment = device_assignment

        # Per-user memory storage
        self._user_notes: Dict[str, str] = {}
        self._current_user_id: Optional[str] = None
        self._conversation_history: List[Dict[str, str]] = []

        # Model components (loaded lazily on first use)
        self._model = None
        self._tokenizer = None
        self._initialized = False

    def initialize(self):
        """Initialize the adapter (loads model and tokenizer). Idempotent."""
        if self._initialized:
            return

        print(f"[ReflectionGRPOAdapter] Loading model from {self.model_path}...")
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self._model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        # Llama-style tokenizers ship without a pad token; generation needs one.
        if self._tokenizer.pad_token_id is None:
            self._tokenizer.pad_token = self._tokenizer.eos_token

        self._initialized = True
        print("[ReflectionGRPOAdapter] Initialized")

    def _generate(self, messages: List[Dict[str, str]], max_new_tokens: int = 1024) -> str:
        """Generate a chat completion with the local model.

        Args:
            messages: Chat messages in ``{"role": ..., "content": ...}`` form.
            max_new_tokens: Generation budget for the reply.

        Returns:
            The decoded assistant text (prompt stripped, whitespace trimmed).
        """
        if not self._initialized:
            self.initialize()

        # Render the chat into the model's prompt format.
        prompt = self._tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Truncate long histories to the model context budget.
        inputs = self._tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=8192
        ).to(self._model.device)

        with torch.no_grad():
            outputs = self._model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=self._tokenizer.eos_token_id,
                pad_token_id=self._tokenizer.pad_token_id,
            )

        # Extract only the newly generated tokens (generate() echoes the prompt).
        input_len = inputs["input_ids"].shape[1]
        gen_ids = outputs[0][input_len:]
        response = self._tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

        return response

    def start_session(self, user_id: str, user_profile: dict = None):
        """Start a new session for a user, clearing the per-session history.

        ``user_profile`` is accepted for interface compatibility but unused here.
        """
        if not self._initialized:
            self.initialize()

        self._current_user_id = user_id
        self._conversation_history = []

    def generate_response(
        self,
        query: str,
        conversation_history: List[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Generate a response using the GRPO-trained reflection agent.

        Note: GRPO training improves the REFLECTION quality (in end_session),
        not the runtime response generation. The improvement comes from better
        agent_notes that are generated after each session.

        Args:
            query: The user's latest message; appended to the internal history.
            conversation_history: Accepted for interface compatibility but NOT
                used — the adapter tracks its own per-session history.
                NOTE(review): confirm callers do not expect this to seed context.

        Returns:
            Dict with ``response`` (assistant text), ``reasoning`` (always empty
            here), and ``debug`` (the notes used for this turn).
        """
        if not self._initialized:
            self.initialize()

        # Add user query to the adapter-managed history.
        self._conversation_history.append({"role": "user", "content": query})

        # Get current notes for this user (these are higher quality due to GRPO training)
        agent_notes = self._user_notes.get(self._current_user_id, "No notes yet about this user.")

        # Build system prompt with notes
        system_prompt = REFLECTIVE_AGENT_SYSTEM_PROMPT.format(agent_notes=agent_notes)

        # Build messages for generation
        messages = [{"role": "system", "content": system_prompt}]
        messages.extend(self._conversation_history)

        # Generate response
        response_text = self._generate(messages)

        self._conversation_history.append({"role": "assistant", "content": response_text})

        return {
            "response": response_text,
            "reasoning": "",
            "debug": {"agent_notes": agent_notes}
        }

    def end_session(self, task_success: bool = False) -> Dict[str, Any]:
        """
        End session and update agent notes via GRPO-trained reflection.

        This is the KEY DIFFERENCE from vanilla reflection:
        - The GRPO-trained model generates higher quality reflections
        - Reflections better capture user preferences without hallucination
        - Notes are more actionable and comprehensive

        The improvement comes from GRPO training with rewards that evaluate:
        - Coverage: Does reflection capture all enforced preferences?
        - Actionability: Are notes useful for future interactions?
        - Accuracy: No hallucinated preferences?
        - Clarity: Well-organized and non-redundant?

        Returns:
            ``{}`` when no session is active; otherwise a summary dict whose
            ``notes_updated`` flag reflects whether the notes were ACTUALLY
            updated (False on empty history or reflection failure — previously
            this was hard-coded to True).
        """
        if not self._current_user_id:
            return {}

        # Get current notes
        current_notes = self._user_notes.get(self._current_user_id, "No notes yet.")
        notes_updated = False

        # Update notes via GRPO-trained session-level reflection
        if len(self._conversation_history) > 0:
            try:
                # Build conversation string
                conv_str = ""
                for msg in self._conversation_history:
                    role = "User" if msg["role"] == "user" else "Assistant"
                    conv_str += f"{role}: {msg['content']}\n\n"

                # Generate reflection using GRPO-trained model
                reflection_prompt = UPDATE_AGENT_NOTES_PROMPT_GRPO.format(
                    agent_notes=current_notes,
                    conversation_str=conv_str
                )

                messages = [{"role": "user", "content": reflection_prompt}]
                raw_output = self._generate(messages, max_new_tokens=512)

                # Parse JSON output (GRPO-trained model outputs structured JSON).
                # Fall back to the raw text when the output is not valid JSON.
                try:
                    parsed = repair_json(raw_output, return_objects=True)
                    if isinstance(parsed, dict) and "agent_notes" in parsed:
                        updated_notes = parsed["agent_notes"]
                    else:
                        updated_notes = raw_output
                except Exception:  # narrow from bare except: don't swallow KeyboardInterrupt/SystemExit
                    updated_notes = raw_output

                if updated_notes:
                    # The model may emit a non-string JSON value; notes are stored as str.
                    if not isinstance(updated_notes, str):
                        updated_notes = str(updated_notes)
                    self._user_notes[self._current_user_id] = updated_notes
                    notes_updated = True
                    print(f"[ReflectionGRPOAdapter] Updated notes for {self._current_user_id}")

            except Exception as e:
                print(f"[ReflectionGRPOAdapter] Failed to update notes: {e}")

        return {
            "turns": len(self._conversation_history),
            "task_success": task_success,
            "notes_updated": notes_updated,
        }

    def reset_user(self, user_id: str):
        """Reset all memory (accumulated notes) for a user."""
        if user_id in self._user_notes:
            del self._user_notes[user_id]

    def __call__(
        self,
        messages: List[Dict[str, str]],
        user_profile: dict = None,
        **kwargs
    ) -> str:
        """Callable interface for ConversationGenerator compatibility.

        Extracts the last user message and delegates to generate_response();
        returns a fixed fallback when no user message is present.
        """
        if not messages:
            return "How can I help you?"

        last_user_msg = None
        for msg in reversed(messages):
            if msg["role"] == "user":
                last_user_msg = msg["content"]
                break

        if last_user_msg is None:
            return "How can I help you?"

        result = self.generate_response(last_user_msg, messages)
        return result["response"]