summaryrefslogtreecommitdiff
path: root/collaborativeagents/adapters/reflection_grpo_adapter.py
blob: 3c10942adf9a9f0595c4c563e4302bad4d94c239 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
"""
Reflection + GRPO Adapter - Local transformers-based implementation.

This implements the "Reflection + GRPO" baseline from the MULTISESSIONCOLLAB paper:
- Uses a GRPO-trained model for session-level reflection
- The model is trained to generate higher-quality reflections that capture user preferences
- Training uses rewards from LLM judge evaluating reflection quality

Key difference from vanilla reflection:
- Uses GRPO-trained model for reflection generation (better preference capture)
- Produces more actionable and comprehensive agent notes
"""

import sys
from pathlib import Path
from typing import Optional, List, Dict, Any
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from json_repair import repair_json

# Model paths - computed relative to project root.
# This file lives at <root>/collaborativeagents/adapters/, hence three .parent hops.
_PROJECT_ROOT = Path(__file__).parent.parent.parent
# Preferred checkpoint: final output of the GRPO reflection training run.
GRPO_MODEL_PATH = str(_PROJECT_ROOT / "collaborativeagents/training/outputs/grpo_reflection/final")
# Fallback checkpoint: SFT-only model from the same training pipeline.
SFT_MODEL_PATH = str(_PROJECT_ROOT / "collaborativeagents/training/outputs/sft_reflection")
# Last resort: untrained base model (used when no trained checkpoint exists).
DEFAULT_MODEL_PATH = str(_PROJECT_ROOT / "models/llama-3.1-8b-instruct")

def get_best_available_model():
    """Return the path of the strongest checkpoint on disk (GRPO > SFT > base)."""
    candidates = [
        (Path(GRPO_MODEL_PATH), "Using GRPO-trained model"),
        (Path(SFT_MODEL_PATH), "Using SFT model (GRPO not found)"),
    ]
    for candidate, label in candidates:
        # A usable checkpoint directory must contain a config.json.
        if candidate.exists() and (candidate / "config.json").exists():
            print(f"[ReflectionGRPOAdapter] {label}: {candidate}")
            return str(candidate)

    # No trained checkpoint found; fall back to the base model and tell the
    # operator how to produce one.
    print(f"[ReflectionGRPOAdapter] WARNING: No trained model found, using base model")
    print(f"[ReflectionGRPOAdapter] To train: run collaborativeagents/slurm/run_sft_training.sh")
    print(f"[ReflectionGRPOAdapter]          then collaborativeagents/slurm/run_grpo_training.sh")
    return DEFAULT_MODEL_PATH

# GRPO-enhanced system prompt with proper scaffolding.
# Used at inference time by generate_response(); {agent_notes} is filled with
# the per-user notes accumulated by end_session() reflections.
REFLECTIVE_AGENT_SYSTEM_PROMPT = """You are a collaborative AI agent helping users solve writing, question answering, math, and coding problems.

# Notes
Remember, you have been taking notes throughout past conversations about user preferences. Use these notes to guide your response:
{agent_notes}

# Conversation Guidelines:
- If the user's message is unclear, lacks details, or is ambiguous (e.g. length of an essay, format requirements, specific constraints), do not make assumptions. Ask for clarification and ensure you have enough information before providing an answer.
- Your goal is to help the user solve their problem. Adhere to their preferences and do your best to help them solve their problem."""

# Vanilla reflection prompt: free-form notes, no structured output.
# NOTE(review): this constant is not referenced anywhere in this file — it
# appears to be kept for parity with the vanilla ReflectionAdapter; confirm
# before removing.
UPDATE_AGENT_NOTES_PROMPT = """You are a collaborative AI agent learning to better help a user with problem-solving tasks across multi-session interactions. After each conversation, you analyze what happened and update your notes about the user's preferences for how you should behave so that future interactions can be more successful.

# Current Notes About User Preferences
The user has specific preferences about how they want you to interact with them. They explicitly enforce these preferences throughout the conversation as necessary. Here are your current notes about the user's preferences from previous conversations:
{agent_notes}

# Conversation to Analyze
{conversation_str}

# Notes Updating Task
Analyze the conversation above to identify the user's preferences and how you can best satisfy them. Your goal is to create actionable notes that help you satisfy these preferences for future conversations. Keep your notes concise and actionable, without adding unnecessary details. Consider:
- When did the user explicitly ask you to adjust your response? What specifically did they want changed?
- What specific actions, formats, or approaches satisfy each preference? What should you keep in mind for future conversations?
As new situations arise, you may refine, combine, or split preferences to better reflect the user's needs. When updating the notes, do not lose any useful information from past interactions.
Make sure to add information about the user preferences that you are sure about, and do not hallucinate preferences.

Provide your updated notes as a clear, structured response. List each preference with actionable guidance."""

# GRPO-trained reflection prompt - produces higher quality reflections.
# Same task as UPDATE_AGENT_NOTES_PROMPT but requires a JSON object with
# "user_preferences_reasoning" and "agent_notes" keys; end_session() parses
# this with json_repair. The doubled braces {{ }} escape str.format so only
# {agent_notes} and {conversation_str} are substituted.
UPDATE_AGENT_NOTES_PROMPT_GRPO = """You are a collaborative AI agent learning to better help a user with problem-solving tasks across multi-session interactions. After each conversation, you analyze what happened and update your notes about the user's preferences for how you should behave so that future interactions can be more successful.

# Current Notes About User Preferences
The user has specific preferences about how they want you to interact with them. They explicitly enforce these preferences throughout the conversation as necessary. Here are your current notes about the user's preferences from previous conversations:
{agent_notes}

# Conversation to Analyze
{conversation_str}

# Notes Updating Task
Analyze the conversation above to identify the user's preferences and how you can best satisfy them. Your goal is to create actionable notes that help you satisfy these preferences for future conversations. Keep your notes concise and actionable, without adding unnecessary details. Consider:
- When did the user explicitly ask you to adjust your response? What specifically did they want changed?
- What specific actions, formats, or approaches satisfy each preference? What should you keep in mind for future conversations?
As new situations arise, you may refine, combine, or split preferences to better reflect the user's needs. When updating the notes, do not lose any useful information from past interactions.
Make sure to add information about the user preferences that you are sure about, and do not hallucinate preferences.

# Output Format:
{{
   "user_preferences_reasoning": str, # Reasoning about the user preferences and how to satisfy them
   "agent_notes": str, # Updated notes. Provide a description of the user preferences, how to satisfy them, and any additional notes. This will be provided to you in future conversations with this user. Ensure that you provide a structured response that is clear and easy to understand.
}}
For each response, output a valid JSON object using the exact format above, do not include any text before or after the JSON object."""


class ReflectionGRPOAdapter:
    """
    Adapter for the Reflection + GRPO baseline from MULTISESSIONCOLLAB.

    Uses GRPO-trained model for:
    - Higher quality session-level reflections that better capture user preferences
    - The model was trained with rewards from LLM judge evaluating reflection quality

    Key difference from vanilla ReflectionAdapter:
    - Uses GRPO-trained model (if available) for reflection generation
    - Removes the faulty preprocessing step that was causing issues
    - Produces more comprehensive and actionable agent notes
    """

    def __init__(
        self,
        model_name: str = None,  # Auto-detect best available model
        device_assignment: dict = None,
        api_base: str = None,  # Ignored, kept for compatibility
        api_key: str = None,   # Ignored, kept for compatibility
    ):
        """Create the adapter; the model itself is loaded lazily on first use.

        Args:
            model_name: Explicit model path; when None, the best available
                checkpoint is auto-detected (GRPO > SFT > base).
            device_assignment: Opaque device-placement hint kept for caller
                compatibility; not consumed here (device_map="auto" is used).
            api_base: Ignored, kept for interface compatibility.
            api_key: Ignored, kept for interface compatibility.
        """
        # Auto-detect best model (GRPO > SFT > base)
        self.model_path = model_name if model_name else get_best_available_model()
        self.device_assignment = device_assignment

        # Per-user memory storage: user_id -> latest reflected agent notes
        self._user_notes: Dict[str, str] = {}
        self._current_user_id: Optional[str] = None
        self._conversation_history: List[Dict[str, str]] = []

        # Model components (loaded lazily)
        self._model = None
        self._tokenizer = None
        self._initialized = False

    def initialize(self):
        """Initialize the adapter (loads model and tokenizer); idempotent."""
        if self._initialized:
            return

        print(f"[ReflectionGRPOAdapter] Loading model from {self.model_path}...")
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self._model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        # Some checkpoints (e.g. Llama) ship without a pad token; reuse EOS so
        # generate() does not warn/fail on padding.
        if self._tokenizer.pad_token_id is None:
            self._tokenizer.pad_token = self._tokenizer.eos_token

        self._initialized = True
        print("[ReflectionGRPOAdapter] Initialized")

    def _generate(self, messages: List[Dict[str, str]], max_new_tokens: int = 1024) -> str:
        """Generate a completion for a chat-format message list.

        Args:
            messages: Chat messages ({"role": ..., "content": ...} dicts).
            max_new_tokens: Generation budget for the response.

        Returns:
            The decoded, stripped text of only the newly generated tokens.
        """
        if not self._initialized:
            self.initialize()

        # Apply chat template to turn messages into a single prompt string.
        prompt = self._tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Truncate long conversations to the model context budget.
        inputs = self._tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=8192
        ).to(self._model.device)

        with torch.no_grad():
            outputs = self._model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=self._tokenizer.eos_token_id,
                pad_token_id=self._tokenizer.pad_token_id,
            )

        # Extract only the generated part (generate() returns prompt + output).
        input_len = inputs["input_ids"].shape[1]
        gen_ids = outputs[0][input_len:]
        response = self._tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

        return response

    def start_session(self, user_id: str, user_profile: dict = None):
        """Start a new session for a user, clearing the in-session history.

        Args:
            user_id: Key under which this user's notes are stored.
            user_profile: Accepted for interface compatibility; not used here.
        """
        if not self._initialized:
            self.initialize()

        self._current_user_id = user_id
        self._conversation_history = []

    def generate_response(
        self,
        query: str,
        conversation_history: List[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Generate a response using the GRPO-trained reflection agent.

        Note: GRPO training improves the REFLECTION quality (in end_session),
        not the runtime response generation. The improvement comes from better
        agent_notes that are generated after each session.

        Args:
            query: The latest user message.
            conversation_history: Accepted for interface compatibility but not
                consumed; the adapter's internal per-session history is used.

        Returns:
            Dict with "response" (generated text), "reasoning" (always empty
            here), and "debug" (the agent notes that were injected).
        """
        if not self._initialized:
            self.initialize()

        # Add user query to history
        self._conversation_history.append({"role": "user", "content": query})

        # Get current notes for this user (these are higher quality due to GRPO training)
        agent_notes = self._user_notes.get(self._current_user_id, "No notes yet about this user.")

        # Build system prompt with notes
        system_prompt = REFLECTIVE_AGENT_SYSTEM_PROMPT.format(agent_notes=agent_notes)

        # Build messages for generation
        messages = [{"role": "system", "content": system_prompt}]
        messages.extend(self._conversation_history)

        # Generate response
        response_text = self._generate(messages)

        self._conversation_history.append({"role": "assistant", "content": response_text})

        return {
            "response": response_text,
            "reasoning": "",
            "debug": {"agent_notes": agent_notes}
        }

    def end_session(self, task_success: bool = False) -> Dict[str, Any]:
        """
        End session and update agent notes via GRPO-trained reflection.

        This is the KEY DIFFERENCE from vanilla reflection:
        - The GRPO-trained model generates higher quality reflections
        - Reflections better capture user preferences without hallucination
        - Notes are more actionable and comprehensive

        The improvement comes from GRPO training with rewards that evaluate:
        - Coverage: Does reflection capture all enforced preferences?
        - Actionability: Are notes useful for future interactions?
        - Accuracy: No hallucinated preferences?
        - Clarity: Well-organized and non-redundant?

        Returns:
            Empty dict if no session is active; otherwise a summary dict with
            "turns", "task_success", and "notes_updated" (True only if the
            reflection actually produced and stored new notes).
        """
        if not self._current_user_id:
            return {}

        # Get current notes
        current_notes = self._user_notes.get(self._current_user_id, "No notes yet.")

        # Track whether reflection actually stored new notes; previously this
        # was hard-coded True even when reflection was skipped or failed.
        notes_updated = False

        # Update notes via GRPO-trained session-level reflection
        if len(self._conversation_history) > 0:
            try:
                # Build conversation string
                conv_str = ""
                for msg in self._conversation_history:
                    role = "User" if msg["role"] == "user" else "Assistant"
                    conv_str += f"{role}: {msg['content']}\n\n"

                # Generate reflection using GRPO-trained model
                reflection_prompt = UPDATE_AGENT_NOTES_PROMPT_GRPO.format(
                    agent_notes=current_notes,
                    conversation_str=conv_str
                )

                messages = [{"role": "user", "content": reflection_prompt}]
                raw_output = self._generate(messages, max_new_tokens=512)

                # Parse JSON output (GRPO-trained model outputs structured JSON).
                # Fall back to the raw text when parsing fails or the expected
                # key is missing — best-effort by design.
                try:
                    parsed = repair_json(raw_output, return_objects=True)
                    if isinstance(parsed, dict) and "agent_notes" in parsed:
                        updated_notes = parsed["agent_notes"]
                    else:
                        updated_notes = raw_output
                except Exception:  # narrow from bare except: keep KeyboardInterrupt etc. alive
                    updated_notes = raw_output

                if updated_notes:
                    self._user_notes[self._current_user_id] = updated_notes
                    notes_updated = True
                    print(f"[ReflectionGRPOAdapter] Updated notes for {self._current_user_id}")

            except Exception as e:
                # Reflection is best-effort; a failure must not break session teardown.
                print(f"[ReflectionGRPOAdapter] Failed to update notes: {e}")

        return {
            "turns": len(self._conversation_history),
            "task_success": task_success,
            "notes_updated": notes_updated,
        }

    def reset_user(self, user_id: str):
        """Reset all memory (stored notes) for a user; no-op if unknown."""
        self._user_notes.pop(user_id, None)

    def __call__(
        self,
        messages: List[Dict[str, str]],
        user_profile: dict = None,
        **kwargs
    ) -> str:
        """Callable interface for ConversationGenerator compatibility.

        Extracts the last user message from `messages` and delegates to
        generate_response(). Returns a canned greeting when there is no
        user message to answer.
        """
        if not messages:
            return "How can I help you?"

        # Find the most recent user turn.
        last_user_msg = None
        for msg in reversed(messages):
            if msg["role"] == "user":
                last_user_msg = msg["content"]
                break

        if last_user_msg is None:
            return "How can I help you?"

        result = self.generate_response(last_user_msg, messages)
        return result["response"]