summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/scripts')
-rwxr-xr-xcollaborativeagents/scripts/benchmark_inference.py429
l---------collaborativeagents/scripts/configs/local_models.yaml1
-rw-r--r--collaborativeagents/scripts/conflict_scenario_generator.py637
-rw-r--r--collaborativeagents/scripts/contextual_test_small.sbatch80
-rw-r--r--collaborativeagents/scripts/controlled_test.sbatch173
-rw-r--r--collaborativeagents/scripts/exp_all_memory.sbatch59
-rw-r--r--collaborativeagents/scripts/exp_contextual.sbatch66
-rw-r--r--collaborativeagents/scripts/exp_rag.sbatch59
-rw-r--r--collaborativeagents/scripts/exp_rag_vector.sbatch59
-rw-r--r--collaborativeagents/scripts/exp_reflection.sbatch66
-rw-r--r--collaborativeagents/scripts/exp_reflection_grpo.sbatch59
-rw-r--r--collaborativeagents/scripts/exp_template.sbatch59
-rw-r--r--collaborativeagents/scripts/exp_vanilla.sbatch59
-rw-r--r--collaborativeagents/scripts/extend_profiles.py195
-rw-r--r--collaborativeagents/scripts/full_experiment_batch.sbatch132
-rw-r--r--collaborativeagents/scripts/full_experiment_sequential.sbatch131
-rw-r--r--collaborativeagents/scripts/fullscale_method.sbatch92
-rw-r--r--collaborativeagents/scripts/fullscale_vanilla.sbatch43
-rw-r--r--collaborativeagents/scripts/generate_complex_profiles.py719
-rw-r--r--collaborativeagents/scripts/generate_profiles_v2.py475
-rw-r--r--collaborativeagents/scripts/generate_training_data.sh22
-rw-r--r--collaborativeagents/scripts/preflight_test.py311
-rw-r--r--collaborativeagents/scripts/quick_rag_debug.sbatch78
-rw-r--r--collaborativeagents/scripts/quick_test_a100.sbatch136
-rwxr-xr-xcollaborativeagents/scripts/quick_test_batch.sh137
-rw-r--r--collaborativeagents/scripts/quick_test_h200.sbatch137
-rw-r--r--collaborativeagents/scripts/rag_debug_interactive.sbatch87
-rw-r--r--collaborativeagents/scripts/rag_test_v4.sbatch92
-rw-r--r--collaborativeagents/scripts/rag_test_v5.sbatch96
-rw-r--r--collaborativeagents/scripts/run.py504
-rw-r--r--collaborativeagents/scripts/run.sh98
-rw-r--r--collaborativeagents/scripts/run_baseline_comparison.py608
-rw-r--r--collaborativeagents/scripts/run_debug.sh24
-rw-r--r--collaborativeagents/scripts/run_experiments.py1328
-rw-r--r--collaborativeagents/scripts/run_fp8.sh65
-rwxr-xr-xcollaborativeagents/scripts/run_preflight_test.sh89
-rw-r--r--collaborativeagents/scripts/scale_test_batch1.sbatch121
-rw-r--r--collaborativeagents/scripts/scale_test_batch2.sbatch126
-rw-r--r--collaborativeagents/scripts/scale_test_ctx_refl.sbatch114
-rw-r--r--collaborativeagents/scripts/smallscale_test.sbatch87
-rw-r--r--collaborativeagents/scripts/test_70b_pilot.py281
-rw-r--r--collaborativeagents/scripts/test_all_a100x8.sbatch124
-rw-r--r--collaborativeagents/scripts/test_all_h200.sbatch126
-rw-r--r--collaborativeagents/scripts/test_all_methods.sbatch91
-rw-r--r--collaborativeagents/scripts/test_batch_50.py98
-rwxr-xr-xcollaborativeagents/scripts/test_batch_50.sh107
-rwxr-xr-xcollaborativeagents/scripts/test_batch_vs_parallel.sh151
-rw-r--r--collaborativeagents/scripts/test_extractor.py46
-rw-r--r--collaborativeagents/scripts/test_multiturn.py248
-rwxr-xr-xcollaborativeagents/scripts/test_parallel_a100.sh172
-rwxr-xr-xcollaborativeagents/scripts/test_parallel_quick.sh158
-rw-r--r--collaborativeagents/scripts/test_parallel_speed.sbatch126
-rwxr-xr-xcollaborativeagents/scripts/test_parallel_speed_a100.sbatch126
-rwxr-xr-xcollaborativeagents/scripts/test_parallel_vllm.sh205
-rw-r--r--collaborativeagents/scripts/test_rag_empty.sbatch143
-rw-r--r--collaborativeagents/scripts/test_rag_empty_v2.sbatch124
-rw-r--r--collaborativeagents/scripts/test_rag_empty_v3.sbatch110
-rw-r--r--collaborativeagents/scripts/test_rag_fix.sbatch123
-rw-r--r--collaborativeagents/scripts/test_real_speed.sbatch87
-rwxr-xr-xcollaborativeagents/scripts/test_vllm_adapter.sh74
-rwxr-xr-xcollaborativeagents/scripts/test_vllm_interactive.sh212
-rw-r--r--collaborativeagents/scripts/test_vllm_speed.sbatch130
-rw-r--r--collaborativeagents/scripts/test_vllm_speed_a100.sbatch126
-rw-r--r--collaborativeagents/scripts/visualize.py492
64 files changed, 11533 insertions, 0 deletions
diff --git a/collaborativeagents/scripts/benchmark_inference.py b/collaborativeagents/scripts/benchmark_inference.py
new file mode 100755
index 0000000..6a2ee13
--- /dev/null
+++ b/collaborativeagents/scripts/benchmark_inference.py
@@ -0,0 +1,429 @@
+#!/usr/bin/env python3
+"""
+Benchmark inference speed: Transformers vs vLLM.
+
+This script helps diagnose the 100x slowdown issue by comparing:
+1. Raw transformers inference (current implementation)
+2. vLLM server inference (target implementation)
+
+Usage:
+ # First, start vLLM server:
+ # CUDA_VISIBLE_DEVICES=0 vllm serve /path/to/model --port 8003
+
+ # Then run benchmark:
+ python benchmark_inference.py --mode both --n 20
+ python benchmark_inference.py --mode vllm --url http://localhost:8003/v1 --n 50
+ python benchmark_inference.py --mode transformers --model /path/to/model --n 10
+"""
+
+import argparse
+import json
+import time
+import sys
+from pathlib import Path
+from typing import List, Dict, Any
+from dataclasses import dataclass
+
+# Add paths
+sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))
+
+
@dataclass
class BenchmarkResult:
    """Aggregate timing statistics for a single benchmark run of one backend."""
    mode: str  # backend label: "transformers", "vllm", or "vllm_concurrent"
    n_requests: int  # number of requests attempted (including failed ones)
    total_time_s: float  # wall-clock duration of the whole run
    avg_latency_ms: float  # mean per-request latency over successful requests
    min_latency_ms: float  # fastest successful request
    max_latency_ms: float  # slowest successful request
    throughput_req_per_s: float  # successful requests / total wall-clock time
    throughput_conv_per_hr: float  # Estimated conversations per hour
    errors: int  # count of requests that raised an exception
+
+
def benchmark_transformers(
    model_path: str,
    n_requests: int = 10,
    device: str = "cuda:0",
) -> BenchmarkResult:
    """Benchmark raw HuggingFace transformers inference.

    Loads the causal LM onto ``device`` and times ``n_requests`` sequential
    ``generate()`` calls on a fixed chat prompt (one typical user-simulator
    turn), then returns aggregate latency/throughput statistics.

    Args:
        model_path: Local path (or hub id) of the model to load.
        n_requests: Number of sequential generation requests to time.
        device: ``device_map`` target for the model, e.g. ``"cuda:0"``.

    Returns:
        BenchmarkResult summarizing latency and throughput; all-zero stats
        (except ``errors``) if every request failed.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f"Loading model from {model_path}...")
    load_start = time.time()

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )
    # generate() needs a pad token; fall back to EOS when the tokenizer
    # doesn't define one (common for Llama-family models).
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    load_time = time.time() - load_start
    print(f"Model loaded in {load_time:.1f}s")

    # Test prompt (simulating a typical user simulator turn)
    test_messages = [
        {"role": "system", "content": "You are a user simulator. Output JSON with reasoning, draft_answer, should_terminate, and response fields."},
        {"role": "user", "content": "The agent said: 'Hello, how can I help you today?' Respond as the user."},
    ]

    prompt = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)

    latencies = []
    errors = 0

    print(f"Running {n_requests} inference requests...")
    start_time = time.time()

    for i in range(n_requests):
        try:
            req_start = time.time()

            # FIX: apply_chat_template already embedded BOS/special tokens in
            # the prompt string; tokenizing with the default
            # add_special_tokens=True would prepend a second BOS and distort
            # the benchmarked input.
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=4096,
                add_special_tokens=False,
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                )

            # Decode the completion; the result is discarded but decoding is
            # deliberately inside the timed span since the real pipeline pays
            # this cost too.
            input_len = inputs["input_ids"].shape[1]
            gen_ids = outputs[0][input_len:]
            tokenizer.decode(gen_ids, skip_special_tokens=True)

            latency_ms = (time.time() - req_start) * 1000
            latencies.append(latency_ms)

            if (i + 1) % 5 == 0:
                print(f"  Completed {i + 1}/{n_requests}, last latency: {latency_ms:.0f}ms")

        except Exception as e:
            errors += 1
            print(f"  Error on request {i + 1}: {e}")

    total_time = time.time() - start_time

    if not latencies:
        # Every request failed: report zeros instead of dividing by zero.
        return BenchmarkResult(
            mode="transformers",
            n_requests=n_requests,
            total_time_s=total_time,
            avg_latency_ms=0,
            min_latency_ms=0,
            max_latency_ms=0,
            throughput_req_per_s=0,
            throughput_conv_per_hr=0,
            errors=errors,
        )

    avg_latency = sum(latencies) / len(latencies)
    # Estimate: ~10 turns per conversation, so conv/hr = (req/s) * 3600 / 10
    throughput = len(latencies) / total_time
    conv_per_hr = throughput * 3600 / 10

    return BenchmarkResult(
        mode="transformers",
        n_requests=n_requests,
        total_time_s=total_time,
        avg_latency_ms=avg_latency,
        min_latency_ms=min(latencies),
        max_latency_ms=max(latencies),
        throughput_req_per_s=throughput,
        throughput_conv_per_hr=conv_per_hr,
        errors=errors,
    )
+
+
def benchmark_vllm(
    base_url: str = "http://localhost:8003/v1",
    n_requests: int = 10,
    concurrent: bool = False,
    n_workers: int = 4,
) -> BenchmarkResult:
    """Benchmark inference against a running vLLM OpenAI-compatible server.

    Fires ``n_requests`` chat completions — sequentially, or fanned out over
    a thread pool of ``n_workers`` when ``concurrent`` is set — and folds the
    client-reported latencies into a BenchmarkResult.
    """
    from utils.vllm_client import VLLMClient

    client = VLLMClient(base_url=base_url)
    mode_label = "vllm" + ("_concurrent" if concurrent else "")

    # Abort immediately if the server is unreachable: every request counts
    # as an error and all timing stats are zero.
    if not client.health_check():
        print(f"ERROR: vLLM server at {base_url} is not responding")
        return BenchmarkResult(
            mode="vllm",
            n_requests=n_requests,
            total_time_s=0,
            avg_latency_ms=0,
            min_latency_ms=0,
            max_latency_ms=0,
            throughput_req_per_s=0,
            throughput_conv_per_hr=0,
            errors=n_requests,
        )

    print(f"vLLM server healthy: {client.get_model_info()}")

    # Fixed prompt simulating one user-simulator turn.
    test_messages = [
        {"role": "system", "content": "You are a user simulator. Output JSON with reasoning, draft_answer, should_terminate, and response fields."},
        {"role": "user", "content": "The agent said: 'Hello, how can I help you today?' Respond as the user."},
    ]

    latencies = []
    errors = 0

    print(f"Running {n_requests} inference requests (concurrent={concurrent})...")
    started = time.time()

    if concurrent:
        from concurrent.futures import ThreadPoolExecutor, as_completed

        with ThreadPoolExecutor(max_workers=n_workers) as pool:
            pending = [
                pool.submit(client.chat, test_messages, 256, 0.7)
                for _ in range(n_requests)
            ]
            for done_count, fut in enumerate(as_completed(pending), start=1):
                try:
                    latencies.append(fut.result()["latency_ms"])
                    if done_count % 10 == 0:
                        print(f"  Completed {done_count}/{n_requests}")
                except Exception as e:
                    errors += 1
                    print(f"  Error: {e}")
    else:
        for req_no in range(1, n_requests + 1):
            try:
                reply = client.chat(test_messages, 256, 0.7)
                latencies.append(reply["latency_ms"])

                if req_no % 5 == 0:
                    print(f"  Completed {req_no}/{n_requests}, last latency: {reply['latency_ms']:.0f}ms")

            except Exception as e:
                errors += 1
                print(f"  Error on request {req_no}: {e}")

    elapsed = time.time() - started

    if not latencies:
        # All requests failed after a healthy check: zeroed stats, real error count.
        return BenchmarkResult(
            mode=mode_label,
            n_requests=n_requests,
            total_time_s=elapsed,
            avg_latency_ms=0,
            min_latency_ms=0,
            max_latency_ms=0,
            throughput_req_per_s=0,
            throughput_conv_per_hr=0,
            errors=errors,
        )

    requests_per_second = len(latencies) / elapsed

    return BenchmarkResult(
        mode=mode_label,
        n_requests=n_requests,
        total_time_s=elapsed,
        avg_latency_ms=sum(latencies) / len(latencies),
        min_latency_ms=min(latencies),
        max_latency_ms=max(latencies),
        throughput_req_per_s=requests_per_second,
        # ~10 turns per conversation -> estimated conversations per hour.
        throughput_conv_per_hr=requests_per_second * 3600 / 10,
        errors=errors,
    )
+
+
def benchmark_full_conversation(
    vllm_url_70b: str,
    vllm_url_8b: str,
    n_conversations: int = 5,
    max_turns: int = 10,
) -> Dict[str, Any]:
    """
    Run end-to-end multi-turn conversations between a vLLM-backed user
    simulator (70B server) and agent (8B server), mirroring the real
    experiment loop, and report conversation/turn throughput.
    """
    from utils.vllm_client import VLLMClient, VLLMUserSimulator, VLLMAgentAdapter

    user_client = VLLMClient(base_url=vllm_url_70b)
    agent_client = VLLMClient(base_url=vllm_url_8b)

    # Bail out early if either backend is unreachable.
    if not user_client.health_check():
        print(f"ERROR: 70B server at {vllm_url_70b} not responding")
        return {"error": "70B server not available"}

    if not agent_client.health_check():
        print(f"ERROR: 8B server at {vllm_url_8b} not responding")
        return {"error": "8B server not available"}

    print(f"Running {n_conversations} full conversations (max {max_turns} turns each)...")

    per_conv_seconds = []
    turns_completed = 0

    bench_start = time.time()

    for conv_no in range(1, n_conversations + 1):
        conv_t0 = time.time()

        # Fresh simulator and agent per conversation, matching the experiment loop.
        simulator = VLLMUserSimulator(
            problem="What is 2 + 2? Explain your reasoning step by step.",
            user_persona="A student learning math",
            user_preferences="- I prefer step-by-step explanations\n- Always show your work",
            vllm_client=user_client,
        )
        tutor = VLLMAgentAdapter(
            vllm_client=agent_client,
            system_prompt="You are a helpful math tutor. Explain concepts clearly."
        )

        transcript = [{"role": "assistant", "content": "How can I help you today?"}]

        for _ in range(max_turns):
            # User turn: simulator may signal end-of-conversation.
            user_reply = simulator.generate_user_response(transcript)
            if user_reply is None:
                break

            transcript.append({"role": "user", "content": user_reply["response"]})

            if user_reply.get("should_terminate", False):
                break

            # Agent turn; only fully completed user+agent exchanges count.
            agent_reply = tutor.generate_response(user_reply["response"])
            transcript.append({"role": "assistant", "content": agent_reply["response"]})

            turns_completed += 1

        conv_elapsed = time.time() - conv_t0
        per_conv_seconds.append(conv_elapsed)
        print(f"  Conversation {conv_no}/{n_conversations}: {len(transcript)} messages, {conv_elapsed:.1f}s")

    wall_time = time.time() - bench_start

    return {
        "n_conversations": n_conversations,
        "total_turns": turns_completed,
        "total_time_s": wall_time,
        "avg_conv_time_s": sum(per_conv_seconds) / len(per_conv_seconds) if per_conv_seconds else 0,
        "throughput_conv_per_hr": n_conversations / wall_time * 3600,
        "throughput_turns_per_hr": turns_completed / wall_time * 3600,
    }
+
+
def print_results(results: List[BenchmarkResult]):
    """Render benchmark results as a fixed-width table, then print a
    vLLM-vs-transformers speedup and a comparison against the paper's
    2000 conv/hr target."""
    banner = "=" * 80
    rule = "-" * 80

    print("\n" + banner)
    print("BENCHMARK RESULTS")
    print(banner)

    print(f"\n{'Mode':<20} {'Requests':<10} {'Avg Latency':<12} {'Throughput':<15} {'Conv/hr':<12} {'Errors':<8}")
    print(rule)

    for row in results:
        print(f"{row.mode:<20} {row.n_requests:<10} {row.avg_latency_ms:>8.0f}ms {row.throughput_req_per_s:>10.2f}/s {row.throughput_conv_per_hr:>8.0f} {row.errors:<8}")

    print(rule)

    # Speedup: first transformers row vs first vllm row with nonzero throughput.
    if len(results) >= 2:
        tf_row = None
        vllm_row = None
        for row in results:
            if tf_row is None and row.mode == "transformers":
                tf_row = row
            if vllm_row is None and "vllm" in row.mode and row.throughput_req_per_s > 0:
                vllm_row = row

        if tf_row and vllm_row and tf_row.throughput_req_per_s > 0:
            speedup = vllm_row.throughput_req_per_s / tf_row.throughput_req_per_s
            print(f"\nvLLM speedup over transformers: {speedup:.1f}x")

    # Target comparison
    target_conv_per_hr = 2000
    for row in results:
        if row.throughput_conv_per_hr > 0:
            ratio = row.throughput_conv_per_hr / target_conv_per_hr
            marker = "✓" if ratio >= 0.5 else "✗"
            print(f"{marker} {row.mode}: {row.throughput_conv_per_hr:.0f} conv/hr ({ratio:.1%} of paper's 2000 conv/hr)")
+
+
def main():
    """CLI entry point: parse arguments and dispatch to the chosen benchmark."""
    parser = argparse.ArgumentParser(description="Benchmark inference speed")
    parser.add_argument("--mode", choices=["transformers", "vllm", "both", "conversation"], default="vllm")
    parser.add_argument(
        "--model",
        type=str,
        default="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct",
        help="Model path for transformers benchmark",
    )
    parser.add_argument("--url", type=str, default="http://localhost:8003/v1",
                        help="vLLM server URL")
    parser.add_argument("--url-70b", type=str, default="http://localhost:8004/v1",
                        help="vLLM server URL for 70B model (user simulator)")
    parser.add_argument("--url-8b", type=str, default="http://localhost:8003/v1",
                        help="vLLM server URL for 8B model (agent)")
    parser.add_argument("-n", type=int, default=20, help="Number of requests")
    parser.add_argument("--concurrent", action="store_true", help="Run vLLM benchmark with concurrent requests")
    parser.add_argument("--device", type=str, default="cuda:0", help="Device for transformers")

    args = parser.parse_args()

    if args.mode == "conversation":
        # End-to-end multi-turn benchmark against both servers; -n is the
        # number of conversations here, not individual requests.
        conv_stats = benchmark_full_conversation(
            args.url_70b,
            args.url_8b,
            n_conversations=args.n,
        )
        wide = "=" * 80
        print("\n" + wide)
        print("FULL CONVERSATION BENCHMARK")
        print(wide)
        print(json.dumps(conv_stats, indent=2))

        if "throughput_conv_per_hr" in conv_stats:
            target = 2000
            actual = conv_stats["throughput_conv_per_hr"]
            print(f"\nTarget: {target} conv/hr (paper)")
            print(f"Actual: {actual:.0f} conv/hr ({actual/target:.1%} of target)")
        return

    collected = []
    narrow = "=" * 40

    if args.mode in ("transformers", "both"):
        print("\n" + narrow)
        print("TRANSFORMERS BENCHMARK")
        print(narrow)
        collected.append(benchmark_transformers(args.model, args.n, args.device))

    if args.mode in ("vllm", "both"):
        print("\n" + narrow)
        print("vLLM BENCHMARK (sequential)")
        print(narrow)
        collected.append(benchmark_vllm(args.url, args.n, concurrent=False))

        if args.concurrent:
            print("\n" + narrow)
            print("vLLM BENCHMARK (concurrent)")
            print(narrow)
            collected.append(benchmark_vllm(args.url, args.n, concurrent=True, n_workers=4))

    print_results(collected)
+
+
# Allow the file to be used both as a script and as an importable module.
if __name__ == "__main__":
    main()
diff --git a/collaborativeagents/scripts/configs/local_models.yaml b/collaborativeagents/scripts/configs/local_models.yaml
new file mode 120000
index 0000000..b6f8fad
--- /dev/null
+++ b/collaborativeagents/scripts/configs/local_models.yaml
@@ -0,0 +1 @@
+/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/configs/local_models.yaml \ No newline at end of file
diff --git a/collaborativeagents/scripts/conflict_scenario_generator.py b/collaborativeagents/scripts/conflict_scenario_generator.py
new file mode 100644
index 0000000..9d00de8
--- /dev/null
+++ b/collaborativeagents/scripts/conflict_scenario_generator.py
@@ -0,0 +1,637 @@
+"""
+Conflict Scenario Generator
+
+Generates queries that deliberately trigger preference conflicts.
+The key insight: RAG naturally resolves conflicts by retrieving ONLY
+the relevant preference, while context-based methods see ALL preferences
+and get confused.
+
+Design principles:
+1. Every test query should trigger 2+ conflicting preferences
+2. Only ONE preference is correct given the full context
+3. RAG retrieves the correct one (high similarity to query)
+4. Context methods see both and often pick wrong one or try to satisfy both
+"""
+
+import json
+import random
+from dataclasses import dataclass, field
+from typing import Optional
+from pathlib import Path
+
+
+# ============================================================================
+# Conflict Templates
+# ============================================================================
+
@dataclass
class ConflictScenario:
    """A scenario that triggers a preference conflict.

    Each scenario pits two or more stored preferences against one another;
    exactly one (``correct_pref_id``) should win given the query's context
    cues, which is what separates RAG from context-stuffing methods here.
    """
    scenario_id: str
    conflict_group: str
    query: str
    context_cues: list  # What makes the correct preference clear
    triggered_prefs: list  # Preference IDs that could apply
    correct_pref_id: str  # The one that SHOULD apply
    wrong_pref_ids: list  # The ones that should NOT apply
    why_correct: str  # Explanation for ground truth
    expected_rag_behavior: str  # What RAG should do
    expected_context_failure: str  # How context methods fail
+
+
+# Core conflict scenarios - each designed to fail context methods
+CONFLICT_TEMPLATES = {
+ # =========================================================================
+ # FORMAT CONFLICTS
+ # =========================================================================
+ "format_bullets_vs_numbered": [
+ {
+ "query": "What are the steps to deploy a Docker container? Also list the common mistakes to avoid.",
+ "context_cues": ["steps to deploy = procedure", "list mistakes = enumeration"],
+ "correct_for": "both apply to different parts",
+ "why_context_fails": "Context sees both prefs, might use one format for everything",
+ "why_rag_wins": "RAG retrieves procedure-pref for deploy part, list-pref for mistakes part"
+ },
+ {
+ "query": "Walk me through setting up CI/CD - what tools should I consider?",
+ "context_cues": ["walk through = sequential", "consider = options"],
+ "correct_for": "numbered for walkthrough, bullets for tools",
+ "why_context_fails": "Mixes formats inconsistently",
+ "why_rag_wins": "Retrieves appropriate format preference per section"
+ },
+ {
+ "query": "How do I configure nginx? Give me the key parameters.",
+ "context_cues": ["how do I = procedure", "key parameters = list"],
+ "correct_for": "numbered steps + bulleted parameters",
+ "why_context_fails": "Context methods apply one format to all",
+ "why_rag_wins": "Separate retrieval for procedure vs enumeration context"
+ }
+ ],
+
+ "format_answer_first_vs_buildup": [
+ {
+ "query": "What's the time complexity of quicksort and why?",
+ "context_cues": ["what's = direct question", "why = needs explanation"],
+ "correct_for": "answer first (O(n log n)), then explain why",
+ "why_context_fails": "Either gives answer without why, or long buildup first",
+ "why_rag_wins": "Retrieves 'answer first' for 'what's', builds explanation for 'why'"
+ },
+ {
+ "query": "Explain how neural networks learn - what's backpropagation?",
+ "context_cues": ["explain how = learning", "what's = definition needed"],
+ "correct_for": "build up intuition for 'how', then define backprop",
+ "why_context_fails": "Starts with backprop definition (answer first) losing context",
+ "why_rag_wins": "Identifies learning intent first, answer-seeking second"
+ }
+ ],
+
+ # =========================================================================
+ # VERBOSITY CONFLICTS
+ # =========================================================================
+ "verbosity_concise_vs_detailed": [
+ {
+ "query": "Quick question - how does the GIL work in Python?",
+ "context_cues": ["quick question = brevity cue", "GIL = complex topic"],
+ "correct_for": "concise (user said quick)",
+ "why_context_fails": "Sees 'complex topic' pref, gives long explanation",
+ "why_rag_wins": "Explicit brevity cue has higher retrieval score"
+ },
+ {
+ "query": "Briefly explain the proof of the halting problem.",
+ "context_cues": ["briefly = brevity", "proof = normally detailed"],
+ "correct_for": "concise - user explicitly asked for brief",
+ "why_context_fails": "Proof preference triggers long format",
+ "why_rag_wins": "'Briefly' in query matches concise preference strongly"
+ },
+ {
+ "query": "TL;DR on microservices vs monolith for a startup?",
+ "context_cues": ["TL;DR = max brevity", "comparison = could be detailed"],
+ "correct_for": "ultra-concise comparison",
+ "why_context_fails": "Comparison pref might trigger table/detailed analysis",
+ "why_rag_wins": "TL;DR keyword retrieves brevity preference"
+ },
+ {
+ "query": "In detail, what's 2+2?",
+ "context_cues": ["in detail = verbosity cue", "2+2 = trivial"],
+ "correct_for": "brief (topic too simple for detail)",
+ "why_context_fails": "Might over-explain simple arithmetic",
+ "why_rag_wins": "Query simplicity context overrides detail cue"
+ }
+ ],
+
+ # =========================================================================
+ # CODE STYLE CONFLICTS
+ # =========================================================================
+ "code_naming_convention": [
+ {
+ "query": "Write a function to parse JSON, show it in Python and JavaScript.",
+ "context_cues": ["Python = snake_case", "JavaScript = camelCase"],
+ "correct_for": "snake_case for Python version, camelCase for JS version",
+ "why_context_fails": "Picks one convention for both, or inconsistent",
+ "why_rag_wins": "Language detection triggers correct convention per block"
+ },
+ {
+ "query": "Convert this Python script to TypeScript: def get_user_data(): ...",
+ "context_cues": ["Python source = snake_case", "TypeScript target = camelCase"],
+ "correct_for": "convert snake_case to camelCase in TypeScript output",
+ "why_context_fails": "Might keep snake_case in TypeScript",
+ "why_rag_wins": "Output language triggers appropriate convention"
+ },
+ {
+ "query": "Write SQL to join users and orders, then show Python code to run it.",
+ "context_cues": ["SQL = UPPERCASE keywords", "Python = snake_case"],
+ "correct_for": "SQL: SELECT, FROM; Python: result_set, fetch_data",
+ "why_context_fails": "Style bleeds across languages",
+ "why_rag_wins": "Separate retrieval for each language context"
+ }
+ ],
+
+ "code_comment_style": [
+ {
+ "query": "Here's a 5-line utility function, explain what each part does.",
+ "context_cues": ["5-line = short", "explain each part = inline comments"],
+ "correct_for": "inline comments for each line",
+ "why_context_fails": "Might use docstring style for short code",
+ "why_rag_wins": "Short code + explanation request = inline comments"
+ },
+ {
+ "query": "Write a complete data processing class with documentation.",
+ "context_cues": ["complete class = production code", "documentation = docstrings"],
+ "correct_for": "docstrings at class/method level, minimal inline",
+ "why_context_fails": "Over-comments with inline explanations",
+ "why_rag_wins": "Class + documentation context triggers docstring pref"
+ }
+ ],
+
+ "code_review_scope": [
+ {
+ "query": "Review this code for bugs, I need to ship it today.",
+ "context_cues": ["review = code review", "ship today = urgent, bugs only"],
+ "correct_for": "bugs only, skip style",
+ "why_context_fails": "Still comments on style issues",
+ "why_rag_wins": "Urgency cue + 'bugs' retrieves bugs-only preference"
+ },
+ {
+ "query": "Look at my code and help me improve it for the codebase.",
+ "context_cues": ["improve = refactor scope", "for codebase = style matters"],
+ "correct_for": "both logic and style suggestions",
+ "why_context_fails": "Might only focus on bugs",
+ "why_rag_wins": "'Improve' and 'codebase' retrieve full-review pref"
+ }
+ ],
+
+ # =========================================================================
+ # INTERACTION CONFLICTS
+ # =========================================================================
+ "interaction_autonomy": [
+ {
+ "query": "Refactor the authentication module.",
+ "context_cues": ["refactor = significant change", "no specific instruction"],
+ "correct_for": "confirm approach first",
+ "why_context_fails": "Might just start refactoring without plan",
+ "why_rag_wins": "Ambiguous scope triggers confirmation pref"
+ },
+ {
+ "query": "Change the variable name from 'x' to 'count' in line 5.",
+ "context_cues": ["specific instruction", "single change"],
+ "correct_for": "execute directly, no confirmation needed",
+ "why_context_fails": "Might still ask for confirmation",
+ "why_rag_wins": "Specific instruction retrieves execute-directly pref"
+ },
+ {
+ "query": "Update the database schema to add user preferences - it's complex.",
+ "context_cues": ["update schema = significant", "complex = acknowledged"],
+ "correct_for": "definitely confirm - user said it's complex",
+ "why_context_fails": "Might dive in because 'update' sounds actionable",
+ "why_rag_wins": "'Complex' keyword strongly triggers confirmation"
+ }
+ ],
+
+ "interaction_guidance": [
+ {
+ "query": "Should I use Redis or Memcached for caching?",
+ "context_cues": ["should I = asking for recommendation", "or = comparison"],
+ "correct_for": "give recommendation with rationale",
+ "why_context_fails": "Gives neutral pros/cons without recommendation",
+ "why_rag_wins": "'Should I' retrieves recommendation preference"
+ },
+ {
+ "query": "Compare React, Vue, and Angular for my project.",
+ "context_cues": ["compare = explicit comparison", "my project = context needed"],
+ "correct_for": "table format with tradeoffs",
+ "why_context_fails": "Might just recommend one or give long prose",
+ "why_rag_wins": "'Compare' retrieves comparison-table preference"
+ }
+ ],
+
+ # =========================================================================
+ # MATH/EXPLANATION CONFLICTS
+ # =========================================================================
+ "math_detail_level": [
+ {
+ "query": "What's the derivative of x^2? I'm preparing for an exam.",
+ "context_cues": ["what's = direct ask", "exam prep = practice context"],
+ "correct_for": "show steps + give practice problem",
+ "why_context_fails": "Just gives answer (2x) without exam context",
+ "why_rag_wins": "'Exam' retrieves practice-problem preference"
+ },
+ {
+ "query": "Verify my answer: integral of sin(x) = -cos(x) + C. Is this right?",
+ "context_cues": ["verify = checking work", "is this right = confirmation"],
+ "correct_for": "check step by step, confirm or point out issue",
+ "why_context_fails": "Might re-derive from scratch",
+ "why_rag_wins": "'Verify' retrieves check-their-work preference"
+ }
+ ],
+
+ "math_approach": [
+ {
+ "query": "What's the probability of rolling two sixes?",
+ "context_cues": ["probability = statistics", "rolling dice = intuitive example"],
+ "correct_for": "intuition first (1 in 36), then formula",
+ "why_context_fails": "Starts with P(A∩B) = P(A)P(B) formula",
+ "why_rag_wins": "Statistics topic retrieves intuition-first preference"
+ },
+ {
+ "query": "Prove that the sum of angles in a triangle is 180°.",
+ "context_cues": ["prove = formal proof", "geometry = visual possible"],
+ "correct_for": "structured proof format per preference",
+ "why_context_fails": "Might give intuitive explanation instead of proof",
+ "why_rag_wins": "'Prove' retrieves proof-format preference"
+ }
+ ],
+
+ # =========================================================================
+ # DOMAIN CONFLICTS
+ # =========================================================================
+ "domain_example_position": [
+ {
+ "query": "How do I use the requests library in Python?",
+ "context_cues": ["how do I use = practical/API", "library = code example helpful"],
+ "correct_for": "minimal example first, then explain parameters",
+ "why_context_fails": "Explains parameters first, example last",
+ "why_rag_wins": "API/library context retrieves example-first preference"
+ },
+ {
+ "query": "What is dynamic programming?",
+ "context_cues": ["what is = concept/theory", "definition needed"],
+ "correct_for": "definition first, then example, then edge cases",
+ "why_context_fails": "Might lead with example (Fibonacci)",
+ "why_rag_wins": "Theory context retrieves definition-first preference"
+ }
+ ],
+
+ # =========================================================================
+ # OUTPUT ARTIFACT CONFLICTS
+ # =========================================================================
+ "output_code_presentation": [
+ {
+ "query": "Give me a sorting function I can use, I'm in a hurry.",
+ "context_cues": ["give me = copyable", "in a hurry = no explanation"],
+ "correct_for": "single code block, no prose",
+ "why_context_fails": "Adds explanatory prose between code",
+ "why_rag_wins": "'Give me' + 'hurry' retrieves copy-paste preference"
+ },
+ {
+ "query": "Teach me how to implement quicksort step by step.",
+ "context_cues": ["teach me = learning", "step by step = chunked"],
+ "correct_for": "code in small chunks with explanation between",
+ "why_context_fails": "Gives full implementation at once",
+ "why_rag_wins": "'Teach' + 'step by step' retrieves chunked preference"
+ }
+ ],
+
+ # =========================================================================
+ # CORRECTION STYLE CONFLICTS
+ # =========================================================================
+ "correction_severity": [
+ {
+ "query": "I'm using a hashmap to store my data, is this right?",
+ "context_cues": ["hashmap = might mean dict/map", "is this right = validation"],
+ "correct_for": "gentle inline (hashmap is fine, also called dict)",
+ "why_context_fails": "Might pedantically correct terminology",
+ "why_rag_wins": "Minor terminology + validation retrieves gentle-correction pref"
+ },
+ {
+ "query": "I think recursion is just loops with extra steps, right?",
+ "context_cues": ["fundamental misconception", "asking for validation"],
+ "correct_for": "directly address misconception before proceeding",
+ "why_context_fails": "Might gloss over and just show recursion",
+ "why_rag_wins": "Fundamental error retrieves explicit-correction preference"
+ }
+ ],
+
+ # =========================================================================
+ # MULTI-DOMAIN CONFLICTS (hardest!)
+ # =========================================================================
+ "multi_domain_complex": [
+ {
+ "query": "Quick question - walk me through implementing a binary tree in Python with proper documentation.",
+ "context_cues": ["quick = brief", "walk through = detailed", "documentation = thorough"],
+ "correct_for": "quick wins (explicit), but include docstrings (documentation ask)",
+ "why_context_fails": "Confused by conflicting signals, inconsistent response",
+ "why_rag_wins": "Explicit brevity cue retrieved, documentation pref adds docstrings"
+ },
+ {
+ "query": "I'm debugging my ML model and it's not converging. This is frustrating! Compare Adam vs SGD for me.",
+ "context_cues": ["debugging = focus on issue", "frustrating = emotional", "compare = table"],
+ "correct_for": "acknowledge frustration, then comparison table for optimizers",
+ "why_context_fails": "Might skip emotional acknowledgment or wrong format",
+ "why_rag_wins": "Frustration pref + comparison pref both retrieved, applied in order"
+ },
+ {
+ "query": "Review this Python code and convert it to JavaScript. Focus on bugs first.",
+ "context_cues": ["review = bugs per 'focus' cue", "convert = language change"],
+ "correct_for": "Python review (bugs only) + JS conversion (camelCase)",
+ "why_context_fails": "Applies wrong scope or wrong naming convention",
+ "why_rag_wins": "Multiple relevant prefs retrieved per task segment"
+ }
+ ]
+}
+
+
+# ============================================================================
+# Scenario Generator
+# ============================================================================
+
+class ConflictScenarioGenerator:
+ """Generates conflict scenarios from templates and user profiles."""
+
+ def __init__(self, profile: dict = None, seed: int = 42):
+ self.profile = profile
+ self.preferences = {p['pref_id']: p for p in profile['preferences']} if profile else {}
+ self.random = random.Random(seed)
+
+ def generate_for_profile(self, preferences: list, domain: str = None) -> dict:
+ """Generate a single conflict scenario for given preferences and domain."""
+ # Find conflict groups in these preferences
+ conflict_groups = {}
+ for pref in preferences:
+ cg = pref.get('conflict_group')
+ if cg:
+ if cg not in conflict_groups:
+ conflict_groups[cg] = []
+ conflict_groups[cg].append(pref)
+
+ # Find a conflict group with at least 2 preferences
+ for cg, prefs in conflict_groups.items():
+ if len(prefs) >= 2 and cg in CONFLICT_TEMPLATES:
+ templates = CONFLICT_TEMPLATES[cg]
+ template = self.random.choice(templates)
+ return {
+ "query": template['query'],
+ "conflict_group": cg,
+ "preferences": prefs,
+ "expected_preference": prefs[0]['pref_id'], # First one as expected
+ }
+ return None
+
+ def generate_scenarios(self, num_per_conflict_type: int = 3) -> list:
+ """Generate conflict scenarios based on profile's preferences."""
+ scenarios = []
+
+ for conflict_group, templates in CONFLICT_TEMPLATES.items():
+ # Check if this conflict group exists in user's preferences
+ relevant_prefs = [
+ p for p in self.profile['preferences']
+ if p.get('conflict_group') == conflict_group
+ ]
+
+ if len(relevant_prefs) < 2:
+ continue # Need at least 2 prefs to have a conflict
+
+ # Generate scenarios from templates
+ selected_templates = self.random.sample(
+ templates,
+ min(num_per_conflict_type, len(templates))
+ )
+
+ for i, template in enumerate(selected_templates):
+ scenario = self._create_scenario(
+ conflict_group, template, relevant_prefs, i
+ )
+ if scenario:
+ scenarios.append(scenario)
+
+ return scenarios
+
+ def _create_scenario(
+ self,
+ conflict_group: str,
+ template: dict,
+ relevant_prefs: list,
+ index: int
+ ) -> ConflictScenario:
+ """Create a scenario from a template."""
+ # Determine which preference is correct
+ # Based on context cues in the query
+ query = template['query']
+ correct_pref = self._determine_correct_preference(query, relevant_prefs)
+ wrong_prefs = [p for p in relevant_prefs if p['pref_id'] != correct_pref['pref_id']]
+
+ return ConflictScenario(
+ scenario_id=f"{conflict_group}_{index:03d}",
+ conflict_group=conflict_group,
+ query=query,
+ context_cues=template.get('context_cues', []),
+ triggered_prefs=[p['pref_id'] for p in relevant_prefs],
+ correct_pref_id=correct_pref['pref_id'],
+ wrong_pref_ids=[p['pref_id'] for p in wrong_prefs],
+ why_correct=template.get('correct_for', ''),
+ expected_rag_behavior=template.get('why_rag_wins', ''),
+ expected_context_failure=template.get('why_context_fails', '')
+ )
+
+ def _determine_correct_preference(self, query: str, prefs: list) -> dict:
+ """
+ Determine which preference is correct for a query.
+ Uses keyword matching on priority_context.
+ """
+ query_lower = query.lower()
+ scores = []
+
+ for pref in prefs:
+ score = 0
+ for keyword in pref.get('priority_context', []):
+ if keyword.lower() in query_lower:
+ score += 1
+ # Bonus for condition match
+ if pref.get('condition', '').lower() in query_lower:
+ score += 2
+ scores.append((pref, score))
+
+ # Return highest scoring preference
+ scores.sort(key=lambda x: x[1], reverse=True)
+ return scores[0][0] if scores else prefs[0]
+
+
+def generate_conflict_enriched_dataset(
+ profiles_path: str,
+ output_path: str,
+ scenarios_per_conflict: int = 3,
+ seed: int = 42
+):
+ """
+ Generate a dataset where every query triggers at least one conflict.
+ """
+ profiles = []
+ with open(profiles_path) as f:
+ for line in f:
+ profiles.append(json.loads(line))
+
+ all_scenarios = []
+ conflict_coverage = {}
+
+ for profile in profiles:
+ generator = ConflictScenarioGenerator(profile, seed)
+ scenarios = generator.generate_scenarios(scenarios_per_conflict)
+
+ for scenario in scenarios:
+ scenario_dict = {
+ 'user_id': profile['user_id'],
+ 'scenario_id': scenario.scenario_id,
+ 'conflict_group': scenario.conflict_group,
+ 'query': scenario.query,
+ 'context_cues': scenario.context_cues,
+ 'triggered_prefs': scenario.triggered_prefs,
+ 'correct_pref_id': scenario.correct_pref_id,
+ 'wrong_pref_ids': scenario.wrong_pref_ids,
+ 'why_correct': scenario.why_correct,
+ 'expected_rag_behavior': scenario.expected_rag_behavior,
+ 'expected_context_failure': scenario.expected_context_failure
+ }
+ all_scenarios.append(scenario_dict)
+
+ # Track coverage
+ cg = scenario.conflict_group
+ conflict_coverage[cg] = conflict_coverage.get(cg, 0) + 1
+
+ # Save
+ Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+ with open(output_path, 'w') as f:
+ for scenario in all_scenarios:
+ f.write(json.dumps(scenario) + '\n')
+
+ print(f"Generated {len(all_scenarios)} conflict scenarios")
+ print(f"Coverage by conflict type:")
+ for cg, count in sorted(conflict_coverage.items()):
+ print(f" {cg}: {count}")
+
+ return all_scenarios
+
+
+def create_evaluation_harness(scenarios: list) -> dict:
+ """
+ Create an evaluation harness that programmatically checks
+ if the correct preference was applied.
+ """
+ harness = {
+ "total_scenarios": len(scenarios),
+ "by_conflict_type": {},
+ "evaluation_functions": {}
+ }
+
+ # Group by conflict type
+ for scenario in scenarios:
+ cg = scenario['conflict_group']
+ if cg not in harness['by_conflict_type']:
+ harness['by_conflict_type'][cg] = []
+ harness['by_conflict_type'][cg].append(scenario)
+
+ # Add evaluation functions for each conflict type
+ harness['evaluation_functions'] = {
+ "format_structure": check_format_structure,
+ "verbosity": check_verbosity,
+ "naming_convention": check_naming_convention,
+ "answer_position": check_answer_position,
+ # ... more evaluators
+ }
+
+ return harness
+
+
+# ============================================================================
+# Evaluation Functions (check if correct preference was applied)
+# ============================================================================
+
+def check_format_structure(response: str, correct_pref: dict) -> bool:
+ """Check if response uses correct format (bullets vs numbered)."""
+ has_bullets = bool(any(c in response for c in ['•', '-', '*']))
+ has_numbers = bool(any(f"{i}." in response or f"{i})" in response for i in range(1, 10)))
+
+ if 'bullet' in correct_pref.get('action', '').lower():
+ return has_bullets and not has_numbers
+ elif 'numbered' in correct_pref.get('action', '').lower():
+ return has_numbers
+ return True # Can't determine
+
+
+def check_verbosity(response: str, correct_pref: dict) -> bool:
+ """Check if response matches verbosity preference."""
+ word_count = len(response.split())
+
+ if 'concise' in correct_pref.get('action', '').lower() or \
+ '3 sentences' in correct_pref.get('action', '').lower():
+ return word_count < 100 # Rough threshold
+ elif 'detailed' in correct_pref.get('action', '').lower():
+ return word_count > 150
+ return True
+
+
+def check_naming_convention(response: str, correct_pref: dict) -> bool:
+ """Check if code uses correct naming convention."""
+ import re
+
+ # Look for function/variable definitions
+ if 'snake_case' in correct_pref.get('action', '').lower():
+ # Should have underscores, no camelCase
+ has_snake = bool(re.search(r'[a-z]+_[a-z]+', response))
+ has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response))
+ return has_snake and not has_camel
+
+ elif 'camelCase' in correct_pref.get('action', '').lower():
+ has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response))
+ return has_camel
+
+ return True
+
+
+def check_answer_position(response: str, correct_pref: dict) -> bool:
+ """Check if answer comes first or explanation builds up."""
+ # Simplified: check if response starts with answer-like content
+ first_sentence = response.split('.')[0] if '.' in response else response[:100]
+
+ if 'answer first' in correct_pref.get('action', '').lower():
+ # First sentence should be direct
+ direct_indicators = ['is', 'are', 'the answer', 'yes', 'no', 'it\'s']
+ return any(ind in first_sentence.lower() for ind in direct_indicators)
+
+ elif 'build up' in correct_pref.get('action', '').lower():
+ # First sentence should be explanatory
+ buildup_indicators = ['let\'s', 'first', 'to understand', 'consider']
+ return any(ind in first_sentence.lower() for ind in buildup_indicators)
+
+ return True
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--profiles", default="collaborativeagents/data/complex_profiles/profiles.jsonl")
+ parser.add_argument("--output", default="collaborativeagents/data/conflict_scenarios.jsonl")
+ parser.add_argument("--scenarios_per_conflict", type=int, default=3)
+ parser.add_argument("--seed", type=int, default=42)
+
+ args = parser.parse_args()
+
+ scenarios = generate_conflict_enriched_dataset(
+ args.profiles,
+ args.output,
+ args.scenarios_per_conflict,
+ args.seed
+ )
diff --git a/collaborativeagents/scripts/contextual_test_small.sbatch b/collaborativeagents/scripts/contextual_test_small.sbatch
new file mode 100644
index 0000000..83c20ef
--- /dev/null
+++ b/collaborativeagents/scripts/contextual_test_small.sbatch
@@ -0,0 +1,80 @@
+#!/bin/bash
+#SBATCH --job-name=ctx_test
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=16
+#SBATCH --gres=gpu:4
+#SBATCH --mem=100G
+#SBATCH --time=00:20:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctx_test-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctx_test-%j.err
+
+# Small-scale contextual test: 1 profile, 15 sessions
+# Testing fix: token estimation ratio changed from 4:1 to 2.5:1
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== Contextual Test (Token Fix) ==="
+echo "Fix: token estimation 4:1 -> 2.5:1"
+echo "1 profile, 15 sessions"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Start vLLM servers
+# User simulator: GPUs 0,1
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $USER_MODEL \
+    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+# Agent: GPUs 2,3
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $AGENT_MODEL \
+    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+    --max-model-len 16384 --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+# Poll /health; the 70B user simulator can take many minutes to load.
+# NOTE(review): both loops fall through silently after the timeout even if a
+# server never came up — consider failing the job here instead.
+for i in {1..200}; do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "User simulator ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+for i in {1..60}; do
+    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+        echo "Agent ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+sleep 5
+
+# Timestamped results dir keeps reruns from clobbering each other.
+OUTPUT_DIR="../results/contextual_test_$(date +%Y%m%d_%H%M%S)"
+
+echo ""
+echo "============================================"
+echo "Testing: contextual (with token fix)"
+echo "============================================"
+date
+
+python scripts/run_experiments.py --methods contextual \
+    --datasets math-hard --n-profiles 1 --n-sessions 15 --max-turns 15 \
+    --use-vllm --no-batch-processing --parallel-profiles 1 \
+    --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+echo ""
+echo "=== Done ==="
+date
+
+# Best-effort cleanup of the backgrounded vLLM servers.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/controlled_test.sbatch b/collaborativeagents/scripts/controlled_test.sbatch
new file mode 100644
index 0000000..607b93b
--- /dev/null
+++ b/collaborativeagents/scripts/controlled_test.sbatch
@@ -0,0 +1,173 @@
+#!/bin/bash
+#SBATCH --job-name=ctrl_test
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=00:45:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctrl_test-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctrl_test-%j.err
+
+# Controlled Test: Same user profile, same questions, 3 methods
+# Tests:
+# 1. Stronger user enforcement prompts
+# 2. Memory retrieval debug output
+# 3. Comparison across vanilla/rag/rag_vector
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+# Use first profile only for controlled comparison
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+MEMORY_STORE="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store"
+
+echo "=== Controlled Comparison Test ==="
+echo "Same user profile (1st), same 15 questions, 3 methods"
+echo "Testing: stronger enforcement + retrieval debug"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Start vLLM servers
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $USER_MODEL \
+    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+# NOTE(review): agent uses 0.45 GPU-mem utilization here vs 0.90 in
+# contextual_test_small.sbatch — confirm this difference is intentional.
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $AGENT_MODEL \
+    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
+    --max-model-len 16384 --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+# NOTE(review): both loops fall through silently on timeout; consider failing fast.
+for i in {1..200}; do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "User simulator ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+for i in {1..60}; do
+    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+        echo "Agent ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+sleep 5
+
+OUTPUT_DIR="../results/controlled_test_$(date +%Y%m%d_%H%M%S)"
+
+# Run each method with SAME user (1 profile, 15 sessions)
+for METHOD in vanilla rag rag_vector; do
+    echo ""
+    echo "============================================"
+    echo "Testing: $METHOD"
+    echo "============================================"
+
+    # Clear memory store before each method (fresh start)
+    > ${MEMORY_STORE}/memory_cards.jsonl
+    rm -f ${MEMORY_STORE}/memory_embeddings.npy
+    echo "Memory store cleared"
+
+    date
+    python scripts/run_experiments.py --methods $METHOD \
+        --datasets math-hard --n-profiles 1 --n-sessions 15 --max-turns 15 \
+        --use-vllm --no-batch-processing --parallel-profiles 1 \
+        --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+    echo "Method $METHOD completed"
+
+    # Show memory count for rag methods
+    if [ "$METHOD" != "vanilla" ]; then
+        echo "Final memory cards: $(wc -l < ${MEMORY_STORE}/memory_cards.jsonl)"
+    fi
+done
+
+echo ""
+echo "=== Done ==="
+date
+
+# Generate comparison summary
+python3 << 'EOF'
+import json
+import os
+from pathlib import Path
+
+# Pick the most recent controlled_test_* results directory.
+output_base = sorted(Path("../results").glob("controlled_test_*"))[-1]
+print(f"\n=== Comparison Summary ===\n")
+print(f"Results dir: {output_base}")
+
+methods = ["vanilla", "rag", "rag_vector"]
+results = {}
+
+for subdir in output_base.iterdir():
+    if subdir.is_dir():
+        for method in methods:
+            result_file = subdir / method / "results.json"
+            if result_file.exists():
+                with open(result_file) as f:
+                    results[method] = json.load(f)
+        # NOTE(review): this stops after the FIRST subdirectory — if methods
+        # write results into different subdirs, later ones are never read.
+        # Confirm the on-disk layout before relying on this summary.
+        break
+
+if results:
+    print(f"\n{'Metric':<25} {'vanilla':<12} {'rag':<12} {'rag_vector':<12}")
+    print("-" * 60)
+
+    # NOTE(review): avg_turns / avg_enf computed in this first loop are unused
+    # (recomputed in the dedicated loops below).
+    for method in methods:
+        if method not in results:
+            continue
+        data = results[method]
+        task_succ = sum(r['metrics']['task_success'] for r in data) / len(data)
+        avg_turns = sum(r['metrics']['total_turns'] for r in data) / len(data)
+        avg_enf = sum(r['metrics']['enforcement_count'] for r in data) / len(data)
+
+        if method == methods[0]:
+            print(f"{'Task Success':<25} {task_succ:<12.1%} ", end="")
+        else:
+            print(f"{task_succ:<12.1%} ", end="")
+    print()
+
+    for method in methods:
+        if method not in results:
+            continue
+        data = results[method]
+        avg_turns = sum(r['metrics']['total_turns'] for r in data) / len(data)
+        if method == methods[0]:
+            print(f"{'Avg Turns':<25} {avg_turns:<12.1f} ", end="")
+        else:
+            print(f"{avg_turns:<12.1f} ", end="")
+    print()
+
+    for method in methods:
+        if method not in results:
+            continue
+        data = results[method]
+        avg_enf = sum(r['metrics']['enforcement_count'] for r in data) / len(data)
+        if method == methods[0]:
+            print(f"{'Avg Enforcement':<25} {avg_enf:<12.1f} ", end="")
+        else:
+            print(f"{avg_enf:<12.1f} ", end="")
+    print()
+
+    # Session-by-session comparison
+    print(f"\n=== Session-by-Session Turns ===")
+    print(f"{'Session':<10} {'vanilla':<12} {'rag':<12} {'rag_vector':<12}")
+    print("-" * 50)
+    for i in range(min(15, len(results.get('vanilla', [])))):
+        print(f"{i+1:<10} ", end="")
+        for method in methods:
+            if method in results and i < len(results[method]):
+                turns = results[method][i]['metrics']['total_turns']
+                print(f"{turns:<12} ", end="")
+        print()
+EOF
+
+# Best-effort cleanup of the backgrounded vLLM servers.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/exp_all_memory.sbatch b/collaborativeagents/scripts/exp_all_memory.sbatch
new file mode 100644
index 0000000..c6310ee
--- /dev/null
+++ b/collaborativeagents/scripts/exp_all_memory.sbatch
@@ -0,0 +1,59 @@
+#!/bin/bash
+#SBATCH --job-name=exp_all_memory
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=256G
+#SBATCH --time=24:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_all_memory-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_all_memory-%j.err
+
+# NOTE(review): with `set -e`, a failing experiment aborts the script before the
+# final pkill cleanup below — consider `trap 'pkill -f vllm.entrypoints' EXIT`.
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== all_memory (PersonalizedLLMAdapter with local transformers) ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Kill any stale vLLM servers left over from a previous job on this node.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# GPU 0,1: vLLM server for user simulator (port 8004)
+# PersonalizedLLMAdapter uses local transformers for embedding/reranker/chat/extractor
+echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.90 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+echo "Waiting for vLLM server..."
+# NOTE(review): falls through silently after ~600s even if the server never
+# became healthy — the experiment would then fail on its first request.
+for i in $(seq 1 200); do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "Server ready after $((i*3))s"; break
+    fi
+    sleep 3
+done
+
+# GPU 2,3: PersonalizedLLMAdapter's transformers models
+# (embedding ~8B, reranker ~8B, chat ~1.5B, extractor ~0.6B)
+CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py \
+    --methods all_memory \
+    --datasets math-hard,math-500,bigcodebench \
+    --n-profiles 200 --n-sessions 30 --max-turns 15 \
+    --use-vllm --parallel-profiles 10 \
+    --output-dir ../results/full_h200 \
+    --profile-path "$PROFILE_PATH"
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/exp_contextual.sbatch b/collaborativeagents/scripts/exp_contextual.sbatch
new file mode 100644
index 0000000..2c06bb8
--- /dev/null
+++ b/collaborativeagents/scripts/exp_contextual.sbatch
@@ -0,0 +1,66 @@
+#!/bin/bash
+#SBATCH --job-name=exp_contextual
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=256G
+#SBATCH --time=24:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_contextual-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_contextual-%j.err
+
+# NOTE(review): with `set -e`, a failing experiment aborts the script before the
+# final pkill cleanup below — consider `trap 'pkill -f vllm.entrypoints' EXIT`.
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== contextual (vLLM-based) ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Kill any stale vLLM servers left over from a previous job on this node.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# GPU 0,1: vLLM server for user simulator (port 8004)
+echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.90 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+# GPU 2,3: vLLM server for agent (port 8003)
+echo "Starting agent vLLM server on GPU 2,3 (port 8003)..."
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.90 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+# NOTE(review): falls through silently after ~600s even if a server never
+# became healthy — consider failing the job here instead.
+for i in $(seq 1 200); do
+    user_ready=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
+    agent_ready=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
+    if [ "$user_ready" = "1" ] && [ "$agent_ready" = "1" ]; then
+        echo "Both servers ready after $((i*3))s"; break
+    fi
+    sleep 3
+done
+
+# Run experiment (uses vLLM HTTP API, no local GPU needed)
+python scripts/run_experiments.py \
+    --methods contextual \
+    --datasets math-hard,math-500,bigcodebench \
+    --n-profiles 200 --n-sessions 30 --max-turns 15 \
+    --use-vllm --parallel-profiles 50 \
+    --output-dir ../results/full_h200 \
+    --profile-path "$PROFILE_PATH"
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/exp_rag.sbatch b/collaborativeagents/scripts/exp_rag.sbatch
new file mode 100644
index 0000000..7dcad65
--- /dev/null
+++ b/collaborativeagents/scripts/exp_rag.sbatch
@@ -0,0 +1,59 @@
+#!/bin/bash
+#SBATCH --job-name=exp_rag
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=256G
+#SBATCH --time=24:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag-%j.err
+
+# NOTE(review): with `set -e`, a failing experiment aborts the script before the
+# final pkill cleanup below — consider `trap 'pkill -f vllm.entrypoints' EXIT`.
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== rag (PersonalizedLLMAdapter with local transformers) ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Kill any stale vLLM servers left over from a previous job on this node.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# GPU 0,1: vLLM server for user simulator (port 8004)
+# PersonalizedLLMAdapter uses local transformers for embedding/reranker/chat/extractor
+echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.90 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+echo "Waiting for vLLM server..."
+# NOTE(review): falls through silently after ~600s even if the server never
+# became healthy.
+for i in $(seq 1 200); do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "Server ready after $((i*3))s"; break
+    fi
+    sleep 3
+done
+
+# GPU 2,3: PersonalizedLLMAdapter's transformers models
+# (embedding ~8B, reranker ~8B, chat ~1.5B, extractor ~0.6B)
+CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py \
+    --methods rag \
+    --datasets math-hard,math-500,bigcodebench \
+    --n-profiles 200 --n-sessions 30 --max-turns 15 \
+    --use-vllm --parallel-profiles 10 \
+    --output-dir ../results/full_h200 \
+    --profile-path "$PROFILE_PATH"
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/exp_rag_vector.sbatch b/collaborativeagents/scripts/exp_rag_vector.sbatch
new file mode 100644
index 0000000..f63bd26
--- /dev/null
+++ b/collaborativeagents/scripts/exp_rag_vector.sbatch
@@ -0,0 +1,59 @@
+#!/bin/bash
+#SBATCH --job-name=exp_rag_vector
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=256G
+#SBATCH --time=24:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag_vector-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag_vector-%j.err
+
+# NOTE(review): with `set -e`, a failing experiment aborts the script before the
+# final pkill cleanup below — consider `trap 'pkill -f vllm.entrypoints' EXIT`.
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== rag_vector (PersonalizedLLMAdapter with local transformers) ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Kill any stale vLLM servers left over from a previous job on this node.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# GPU 0,1: vLLM server for user simulator (port 8004)
+# PersonalizedLLMAdapter uses local transformers for embedding/reranker/chat/extractor
+echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.90 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+echo "Waiting for vLLM server..."
+# NOTE(review): falls through silently after ~600s even if the server never
+# became healthy.
+for i in $(seq 1 200); do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "Server ready after $((i*3))s"; break
+    fi
+    sleep 3
+done
+
+# GPU 2,3: PersonalizedLLMAdapter's transformers models
+# (embedding ~8B, reranker ~8B, chat ~1.5B, extractor ~0.6B)
+CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py \
+    --methods rag_vector \
+    --datasets math-hard,math-500,bigcodebench \
+    --n-profiles 200 --n-sessions 30 --max-turns 15 \
+    --use-vllm --parallel-profiles 10 \
+    --output-dir ../results/full_h200 \
+    --profile-path "$PROFILE_PATH"
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/exp_reflection.sbatch b/collaborativeagents/scripts/exp_reflection.sbatch
new file mode 100644
index 0000000..2c94495
--- /dev/null
+++ b/collaborativeagents/scripts/exp_reflection.sbatch
@@ -0,0 +1,66 @@
#!/bin/bash
#SBATCH --job-name=exp_reflection
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH --time=24:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection-%j.err

# Run the "reflection" method: two 8B vLLM servers (user simulator on :8004,
# agent on :8003), then drive the experiment over their OpenAI-compatible APIs.

set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

echo "=== reflection (vLLM-based) ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Clear out any vLLM servers left behind by a previous job on this node.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# GPU 0,1: vLLM server for user simulator (port 8004)
echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

# GPU 2,3: vLLM server for agent (port 8003)
echo "Starting agent vLLM server on GPU 2,3 (port 8003)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

echo "Waiting for vLLM servers..."
for i in $(seq 1 200); do
  user_ready=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
  agent_ready=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
  if [ "$user_ready" = "1" ] && [ "$agent_ready" = "1" ]; then
    echo "Both servers ready after $((i*3))s"; break
  fi
  sleep 3
done

# FIX: fail fast if the wait loop timed out. Previously the script fell
# through and ran the (up to 24h) experiment against servers that never
# came up, wasting the whole allocation.
if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then
  echo "ERROR: user simulator server (port 8004) not healthy"; pkill -f "vllm.entrypoints" 2>/dev/null || true; exit 1
fi
if ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
  echo "ERROR: agent server (port 8003) not healthy"; pkill -f "vllm.entrypoints" 2>/dev/null || true; exit 1
fi

# Run experiment (uses vLLM HTTP API, no local GPU needed)
python scripts/run_experiments.py \
    --methods reflection \
    --datasets math-hard,math-500,bigcodebench \
    --n-profiles 200 --n-sessions 30 --max-turns 15 \
    --use-vllm --parallel-profiles 50 \
    --output-dir ../results/full_h200 \
    --profile-path "$PROFILE_PATH"

pkill -f "vllm.entrypoints" 2>/dev/null || true
echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/exp_reflection_grpo.sbatch b/collaborativeagents/scripts/exp_reflection_grpo.sbatch
new file mode 100644
index 0000000..10b5a4f
--- /dev/null
+++ b/collaborativeagents/scripts/exp_reflection_grpo.sbatch
@@ -0,0 +1,59 @@
#!/bin/bash
#SBATCH --job-name=exp_reflection_grpo
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH --time=24:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection_grpo-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection_grpo-%j.err

# Run the "reflection_grpo" method sequentially against two 8B vLLM servers
# (user simulator on :8004 / GPU 0,1; agent on :8003 / GPU 2,3).

set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

echo "=== reflection_grpo (SEQUENTIAL) ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Clear out any vLLM servers left behind by a previous job on this node.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

echo "Waiting for servers..."
for i in $(seq 1 200); do
  if curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "Servers ready after $((i*3))s"; break
  fi
  sleep 3
done

# FIX: fail fast if the wait loop timed out instead of running the
# experiment for hours against servers that never came up.
if ! curl -s http://localhost:8004/health > /dev/null 2>&1 || ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
  echo "ERROR: vLLM servers not healthy after timeout"
  pkill -f "vllm.entrypoints" 2>/dev/null || true
  exit 1
fi

python scripts/run_experiments.py \
    --methods reflection_grpo \
    --datasets math-hard,math-500,bigcodebench \
    --n-profiles 200 --n-sessions 30 --max-turns 15 \
    --use-vllm --parallel-profiles 10 \
    --output-dir ../results/full_h200 \
    --profile-path "$PROFILE_PATH"

pkill -f "vllm.entrypoints" 2>/dev/null || true
echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/exp_template.sbatch b/collaborativeagents/scripts/exp_template.sbatch
new file mode 100644
index 0000000..8f6ba04
--- /dev/null
+++ b/collaborativeagents/scripts/exp_template.sbatch
@@ -0,0 +1,59 @@
#!/bin/bash
#SBATCH --job-name=exp_METHOD
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH --time=24:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_METHOD-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_METHOD-%j.err

# Template for the per-method exp_*.sbatch scripts: replace every METHOD
# placeholder with the method name before submitting.

set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

echo "=== METHOD (SEQUENTIAL) ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Clear out any vLLM servers left behind by a previous job on this node.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

echo "Waiting for servers..."
for i in $(seq 1 200); do
  if curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "Servers ready after $((i*3))s"; break
  fi
  sleep 3
done

# FIX: fail fast if the wait loop timed out instead of running the
# experiment against servers that never came up.
if ! curl -s http://localhost:8004/health > /dev/null 2>&1 || ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
  echo "ERROR: vLLM servers not healthy after timeout"
  pkill -f "vllm.entrypoints" 2>/dev/null || true
  exit 1
fi

python scripts/run_experiments.py \
    --methods METHOD \
    --datasets math-hard,math-500,bigcodebench \
    --n-profiles 200 --n-sessions 30 --max-turns 15 \
    --use-vllm --parallel-profiles 10 \
    --output-dir ../results/full_h200 \
    --profile-path "$PROFILE_PATH"

pkill -f "vllm.entrypoints" 2>/dev/null || true
echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/exp_vanilla.sbatch b/collaborativeagents/scripts/exp_vanilla.sbatch
new file mode 100644
index 0000000..445f771
--- /dev/null
+++ b/collaborativeagents/scripts/exp_vanilla.sbatch
@@ -0,0 +1,59 @@
#!/bin/bash
#SBATCH --job-name=exp_vanilla
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH --time=12:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_vanilla-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_vanilla-%j.err

# Run the stateless "vanilla" baseline with batch processing against two 8B
# vLLM servers (user simulator on :8004, agent on :8003).

set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

echo "=== VANILLA (BATCH) ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Clear out any vLLM servers left behind by a previous job on this node.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

echo "Waiting for servers..."
for i in $(seq 1 200); do
  if curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "Servers ready after $((i*3))s"; break
  fi
  sleep 3
done

# FIX: fail fast if the wait loop timed out instead of running the
# experiment against servers that never came up.
if ! curl -s http://localhost:8004/health > /dev/null 2>&1 || ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
  echo "ERROR: vLLM servers not healthy after timeout"
  pkill -f "vllm.entrypoints" 2>/dev/null || true
  exit 1
fi

python scripts/run_experiments.py \
    --methods vanilla \
    --datasets math-hard,math-500,bigcodebench \
    --n-profiles 200 --n-sessions 30 --max-turns 15 \
    --use-vllm --batch-size 50 --parallel-profiles 50 \
    --output-dir ../results/full_h200 \
    --profile-path "$PROFILE_PATH"

pkill -f "vllm.entrypoints" 2>/dev/null || true
echo "Done: $(date)"
diff --git a/collaborativeagents/scripts/extend_profiles.py b/collaborativeagents/scripts/extend_profiles.py
new file mode 100644
index 0000000..d780697
--- /dev/null
+++ b/collaborativeagents/scripts/extend_profiles.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""
+Generate additional profiles by remixing preferences from existing profiles.
+This creates diverse profile combinations without requiring LLM calls.
+"""
+
import argparse
import hashlib
import json
import random
from pathlib import Path
from typing import Dict, List, Optional
+
+
def load_profiles(path: Path) -> List[Dict]:
    """Load user profiles from a JSONL file.

    Args:
        path: Path to a JSONL file with one JSON-encoded profile per line.

    Returns:
        The profile dicts, in file order.
    """
    profiles = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline) instead of
            # crashing with json.JSONDecodeError.
            if not line:
                continue
            profiles.append(json.loads(line))
    return profiles
+
+
def extract_all_preferences(profiles: List[Dict]) -> Dict[str, List[Dict]]:
    """Collect the unique preferences across *profiles*, grouped by category.

    The category is the prefix of ``pref_id`` before the first underscore
    (e.g. ``"rf_001"`` -> ``"rf"``); ids without an underscore land in
    ``"other"``. Preferences are deduplicated by ``pref_id``; entries that
    lack a ``pref_id`` all share the id ``"unknown"``, so only the first
    such entry is kept.
    """
    grouped: Dict[str, List[Dict]] = {}
    seen_ids: set = set()

    for profile in profiles:
        for pref in profile.get("preferences", []):
            pid = pref.get("pref_id", "unknown")
            if pid in seen_ids:
                continue
            seen_ids.add(pid)
            category = pid.split("_")[0] if "_" in pid else "other"
            grouped.setdefault(category, []).append(pref)

    return grouped
+
+
def extract_personas(profiles: List[Dict]) -> List[str]:
    """Return the distinct, non-empty persona strings in first-seen order."""
    # A dict doubles as an ordered set (insertion order is guaranteed).
    unique: Dict[str, None] = {}
    for profile in profiles:
        text = profile.get("persona", "")
        if text:
            unique.setdefault(text, None)
    return list(unique)
+
+
def generate_new_profile(
    user_id: str,
    preference_pool: Dict[str, List[Dict]],
    personas: List[str],
    target_prefs: int = 43,
    rng: Optional[random.Random] = None
) -> Dict:
    """Generate a new profile by sampling from the preference pool.

    Args:
        user_id: Identifier assigned to the generated profile.
        preference_pool: Preferences grouped by category prefix, as
            returned by ``extract_all_preferences``.
        personas: Persona strings to pick from (empty persona if none).
        target_prefs: Number of preferences the profile should end up with.
        rng: Source of randomness; a fresh unseeded ``random.Random`` is
            used when omitted.

    Returns:
        A profile dict with ``user_id``, ``persona``, ``preferences``,
        ``conflict_groups`` and ``meta`` keys.
    """
    if rng is None:
        rng = random.Random()

    selected_prefs: List[Dict] = []

    # FIX: an empty pool used to raise ZeroDivisionError below; now it just
    # yields a profile with no preferences.
    if preference_pool:
        # Sample from each category to maintain diversity.
        prefs_per_cat = max(1, target_prefs // len(preference_pool))
        for cat, prefs in preference_pool.items():
            # Jitter the per-category count slightly for variety.
            n_sample = min(len(prefs), prefs_per_cat + rng.randint(-1, 2))
            n_sample = max(1, n_sample)
            selected_prefs.extend(rng.sample(prefs, min(n_sample, len(prefs))))

    # Top up / trim to hit the target count exactly (dedup is by dict
    # equality, matching how the remaining candidates are filtered).
    all_prefs = [p for prefs in preference_pool.values() for p in prefs]

    while len(selected_prefs) < target_prefs:
        remaining = [p for p in all_prefs if p not in selected_prefs]
        if not remaining:
            break
        selected_prefs.append(rng.choice(remaining))

    while len(selected_prefs) > target_prefs:
        selected_prefs.pop(rng.randint(0, len(selected_prefs) - 1))

    # Build conflict groups from the selected preferences.
    conflict_groups: Dict[str, List[str]] = {}
    for pref in selected_prefs:
        cg = pref.get("conflict_group")
        if cg:
            conflict_groups.setdefault(cg, []).append(pref["pref_id"])

    return {
        "user_id": user_id,
        # FIX: empty `personas` used to raise IndexError from rng.choice.
        "persona": rng.choice(personas) if personas else "",
        "preferences": selected_prefs,
        "conflict_groups": conflict_groups,
        "meta": {
            "total_preferences": len(selected_prefs),
            "total_conflict_groups": len(conflict_groups),
            "generator": "extend_profiles.py"
        }
    }
+
+
def main():
    """CLI entry point: remix preferences from an existing profile set into
    new synthetic profiles and write them to a JSONL file.

    Reads profiles from ``--input``, pools their unique preferences and
    personas, samples ``--num-new`` fresh combinations deterministically
    from ``--seed``, and writes the result (optionally merged with the
    originals via ``--merge``) to ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Generate additional profiles by remixing existing ones"
    )
    parser.add_argument("--input", type=str, required=True,
                        help="Path to existing profiles JSONL")
    parser.add_argument("--output", type=str, required=True,
                        help="Path for output profiles JSONL")
    parser.add_argument("--num-new", type=int, default=100,
                        help="Number of new profiles to generate")
    parser.add_argument("--seed", type=int, default=142,
                        help="Random seed (use different from original)")
    parser.add_argument("--target-prefs", type=int, default=43,
                        help="Target number of preferences per profile")
    parser.add_argument("--merge", action="store_true",
                        help="Merge with existing profiles in output")

    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    print(f"Loading profiles from: {input_path}")
    profiles = load_profiles(input_path)
    print(f"  Loaded {len(profiles)} profiles")

    # Pool of deduplicated preferences (grouped by category) and of unique
    # personas that the new profiles will be sampled from.
    pref_pool = extract_all_preferences(profiles)
    personas = extract_personas(profiles)

    print(f"\nPreference pool:")
    for cat, prefs in pref_pool.items():
        print(f"  {cat}: {len(prefs)} preferences")
    print(f"  Total unique preferences: {sum(len(p) for p in pref_pool.values())}")
    print(f"  Unique personas: {len(personas)}")

    # Generate new profiles
    rng = random.Random(args.seed)
    new_profiles = []

    print(f"\nGenerating {args.num_new} new profiles...")
    for i in range(args.num_new):
        # Deterministic, collision-resistant user id derived from seed + index.
        user_id = f"user_{hashlib.md5(f'{args.seed}_{i}'.encode()).hexdigest()[:8]}"
        profile = generate_new_profile(
            user_id=user_id,
            preference_pool=pref_pool,
            personas=personas,
            target_prefs=args.target_prefs,
            rng=rng
        )
        new_profiles.append(profile)

        if (i + 1) % 20 == 0:
            print(f"  Generated {i + 1}/{args.num_new}")

    # Optionally merge with original
    if args.merge:
        output_profiles = profiles + new_profiles
        print(f"\nMerging: {len(profiles)} original + {len(new_profiles)} new = {len(output_profiles)}")
    else:
        output_profiles = new_profiles

    # Save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for profile in output_profiles:
            f.write(json.dumps(profile) + '\n')

    print(f"\nSaved {len(output_profiles)} profiles to: {output_path}")

    # Summary stats
    # NOTE(review): assumes output_profiles is non-empty — min()/max() and the
    # division below would raise if --num-new is 0 without --merge; confirm
    # whether that input should be rejected earlier.
    pref_counts = [p["meta"]["total_preferences"] for p in output_profiles]
    print(f"\nProfile statistics:")
    print(f"  Min preferences: {min(pref_counts)}")
    print(f"  Max preferences: {max(pref_counts)}")
    print(f"  Avg preferences: {sum(pref_counts)/len(pref_counts):.1f}")


if __name__ == "__main__":
    main()
diff --git a/collaborativeagents/scripts/full_experiment_batch.sbatch b/collaborativeagents/scripts/full_experiment_batch.sbatch
new file mode 100644
index 0000000..1dc4da1
--- /dev/null
+++ b/collaborativeagents/scripts/full_experiment_batch.sbatch
@@ -0,0 +1,132 @@
#!/bin/bash
#SBATCH --job-name=batch_exp
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH --time=24:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/batch_exp-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/batch_exp-%j.err

# Full experiment: Batch-processable methods (vanilla, all_memory)
# 200 profiles × 30 sessions = 6,000 sessions per method
# Using turn-synchronous batch processing (paper's approach)

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

echo "============================================"
echo "Full Experiment: Batch Methods"
echo "============================================"
echo "Methods: vanilla, all_memory"
echo "Profiles: 200"
echo "Sessions/profile: 30"
echo "Total: 6,000 sessions per method"
echo ""
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start vLLM servers with optimized settings
echo "Starting 8B user simulator (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 8192 \
    --disable-log-requests \
    --dtype bfloat16 \
    --max-num-seqs 256 &
SERVER_USER_PID=$!

echo "Starting 8B agent (GPU 2-3, TP=2)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 8192 \
    --disable-log-requests \
    --dtype bfloat16 \
    --max-num-seqs 256 &
SERVER_AGENT_PID=$!

echo "Waiting for servers (may take 5-10 min for CUDA graph compilation)..."
for i in $(seq 1 200); do
  READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
  READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
  if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
    echo "Both servers ready after $((i*3))s"
    break
  fi
  if [ $((i % 20)) -eq 0 ]; then
    echo "  Still waiting... ($((i*3))s)"
  fi
  sleep 3
done

# Fail fast if either server never came up.
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
  echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
  echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "Both servers healthy"
echo ""

# Run batch experiment (only vanilla is truly stateless)
for METHOD in vanilla; do
  echo "============================================"
  echo "Running: $METHOD (BATCH processing)"
  echo "============================================"
  START=$(date +%s)

  python scripts/run_experiments.py \
      --methods $METHOD \
      --datasets math-hard,math-500,bigcodebench \
      --n-profiles 200 \
      --n-sessions 30 \
      --max-turns 15 \
      --use-vllm \
      --batch-size 50 \
      --parallel-profiles 50 \
      --output-dir ../results/full_experiment_h200 \
      --profile-path "$PROFILE_PATH"

  END=$(date +%s)
  ELAPSED=$((END-START))
  echo ""
  echo "$METHOD completed in ${ELAPSED}s"
  # FIX: guard the division like full_experiment_sequential.sbatch does —
  # ELAPSED=0 (same-second timestamps) would abort the job under set -e.
  if [ $ELAPSED -gt 0 ]; then
    THROUGHPUT=$((6000 * 3600 / ELAPSED))
    echo "Throughput: ${THROUGHPUT} sessions/hr"
  fi
  echo ""
done

# Cleanup
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
echo "============================================"
echo "BATCH EXPERIMENT COMPLETE"
echo "============================================"
date
diff --git a/collaborativeagents/scripts/full_experiment_sequential.sbatch b/collaborativeagents/scripts/full_experiment_sequential.sbatch
new file mode 100644
index 0000000..2f3bd4b
--- /dev/null
+++ b/collaborativeagents/scripts/full_experiment_sequential.sbatch
@@ -0,0 +1,131 @@
#!/bin/bash
#SBATCH --job-name=seq_exp
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH --time=48:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/seq_exp-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/seq_exp-%j.err

# Full experiment: Sequential methods (all_memory, rag, rag_vector, contextual, reflection)
# These methods require state tracking between sessions
# 200 profiles × 30 sessions = 6,000 sessions per method

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

echo "============================================"
echo "Full Experiment: Sequential Methods"
echo "============================================"
# FIX: the announced method list omitted all_memory even though the loop
# below runs it — keep the log consistent with what actually executes.
echo "Methods: all_memory, rag, rag_vector, contextual, reflection"
echo "Profiles: 200"
echo "Sessions/profile: 30"
echo "Total: 6,000 sessions per method"
echo ""
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start vLLM servers
echo "Starting 8B user simulator (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 8192 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

echo "Starting 8B agent (GPU 2-3, TP=2)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 8192 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

echo "Waiting for servers..."
for i in $(seq 1 120); do
  READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
  READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
  if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
    echo "Both servers ready after $((i*3))s"
    break
  fi
  if [ $((i % 20)) -eq 0 ]; then
    echo "  Still waiting... ($((i*3))s)"
  fi
  sleep 3
done

# Fail fast if either server never came up.
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
  echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
  echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "Both servers healthy"
echo ""

# Run sequential experiments (all stateful methods)
for METHOD in all_memory rag rag_vector contextual reflection; do
  echo "============================================"
  echo "Running: $METHOD (SEQUENTIAL processing)"
  echo "============================================"
  START=$(date +%s)

  python scripts/run_experiments.py \
      --methods $METHOD \
      --datasets math-hard,math-500,bigcodebench \
      --n-profiles 200 \
      --n-sessions 30 \
      --max-turns 15 \
      --use-vllm \
      --parallel-profiles 10 \
      --output-dir ../results/full_experiment_h200 \
      --profile-path "$PROFILE_PATH"

  END=$(date +%s)
  ELAPSED=$((END-START))
  echo ""
  echo "$METHOD completed in ${ELAPSED}s"
  if [ $ELAPSED -gt 0 ]; then
    THROUGHPUT=$((6000 * 3600 / ELAPSED))
    echo "Throughput: ${THROUGHPUT} sessions/hr"
  fi
  echo ""
done

# Cleanup
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
echo "============================================"
echo "SEQUENTIAL EXPERIMENT COMPLETE"
echo "============================================"
date
diff --git a/collaborativeagents/scripts/fullscale_method.sbatch b/collaborativeagents/scripts/fullscale_method.sbatch
new file mode 100644
index 0000000..6847f4e
--- /dev/null
+++ b/collaborativeagents/scripts/fullscale_method.sbatch
@@ -0,0 +1,92 @@
#!/bin/bash
#SBATCH --job-name=fs_%x
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=30:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_%x-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_%x-%j.err

# Usage: sbatch --job-name=vanilla fullscale_method.sbatch vanilla
# NOTE(review): "%x" is not substituted inside --job-name itself; the intent
# is that callers override the job name on the sbatch command line as shown.
METHOD=$1
# FIX: previously an empty $1 silently produced a mangled run_experiments.py
# invocation after the servers had already been started.
if [ -z "$METHOD" ]; then
    echo "Usage: sbatch --job-name=<method> fullscale_method.sbatch <method>" >&2
    exit 1
fi

set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

# FIX: tear the background vLLM servers down however this job exits
# (success, failure under set -e, or scancel); the old trailing pkill was
# skipped whenever the experiment itself failed.
trap 'pkill -f "vllm.entrypoints" 2>/dev/null || true' EXIT

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
# Full-precision 70B for user simulator (H200 143GB/GPU can handle it with TP=2)
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

# vLLM memory and parallel workers for methods needing preference extractor
# These methods need GPU memory for embedding/reranker/extractor models on GPUs 2,3
if [[ "$METHOD" == "all_memory" || "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then
    AGENT_MEM=0.40        # Leave 60% free for embedding/reranker/extractor
    PARALLEL_PROFILES=30  # With CUDA_VISIBLE_DEVICES=2,3, extractor uses correct GPUs
else
    AGENT_MEM=0.90
    PARALLEL_PROFILES=50
fi

echo "=== Starting vLLM servers ==="
echo "Method: $METHOD"
echo "User simulator: $USER_MODEL (70B full-precision)"
echo "Agent: $AGENT_MODEL (8B)"
echo "Agent memory: $AGENT_MEM"
date

# User simulator on GPUs 0,1 (70B full-precision, ~70GB/GPU with TP=2)
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $USER_MODEL \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &

# Agent on GPUs 2,3
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $AGENT_MODEL \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization $AGENT_MEM \
    --max-model-len 16384 --dtype bfloat16 &

# Wait for 70B model to load (takes 9-12 minutes)
echo "Waiting for vLLM servers to be ready (this may take 10-15 minutes for 70B)..."
for i in {1..200}; do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "User simulator (8004) ready after $((i*5)) seconds"
        break
    fi
    sleep 5
done
for i in {1..60}; do
    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
        echo "Agent (8003) ready after $((i*5)) seconds"
        break
    fi
    sleep 5
done

# FIX: fail fast when either wait loop timed out — the old script announced
# "Both vLLM servers ready" and ran the experiment regardless.
if ! curl -s http://localhost:8004/health > /dev/null 2>&1 || ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "ERROR: vLLM servers not healthy after timeout" >&2
    exit 1
fi
echo "Both vLLM servers ready"
sleep 10

# Batch processing only for vanilla
if [[ "$METHOD" == "vanilla" ]]; then
    EXTRA_ARGS="--use-batch-processing --batch-size 100"
else
    EXTRA_ARGS="--no-batch-processing"
fi

echo "Parallel profiles: $PARALLEL_PROFILES"

# Run experiment with CUDA_VISIBLE_DEVICES=2,3 so preference extractor/embedding/reranker
# use GPUs 2,3 (which have more headroom) instead of GPUs 0,1 (saturated by 70B model)
CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py --methods $METHOD \
    --datasets math-hard --n-profiles 200 --n-sessions 30 --max-turns 15 \
    --use-vllm $EXTRA_ARGS --parallel-profiles $PARALLEL_PROFILES \
    --output-dir ../results/fullscale --profile-path $PROFILE_PATH

# Server cleanup is handled by the EXIT trap above.
diff --git a/collaborativeagents/scripts/fullscale_vanilla.sbatch b/collaborativeagents/scripts/fullscale_vanilla.sbatch
new file mode 100644
index 0000000..798dc5f
--- /dev/null
+++ b/collaborativeagents/scripts/fullscale_vanilla.sbatch
@@ -0,0 +1,43 @@
#!/bin/bash
#SBATCH --job-name=fs_vanilla
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=8:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_vanilla-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_vanilla-%j.err

# Full-scale "vanilla" baseline run: AWQ-INT4 quantized 70B user simulator on
# GPUs 0,1 (port 8004) and the local 8B agent on GPUs 2,3 (port 8003), then
# 200 profiles x 30 sessions with batch processing.
# NOTE(review): unlike the other sbatch scripts here, this one has no
# "#SBATCH --account=bfqt-delta-gpu" line and no health check after the wait
# loop below — confirm whether both omissions are intentional.

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/user_profiles.jsonl"

# Start vLLM servers
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 8192 --dtype float16 --download-dir $HF_HOME &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 8192 --dtype bfloat16 &

# Poll both health endpoints for up to 5 minutes (60 x 5s); falls through
# after the timeout whether or not the servers came up.
for i in {1..60}; do
  curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1 && break
  sleep 5
done
sleep 30

python scripts/run_experiments.py --methods vanilla \
    --datasets math-hard --n-profiles 200 --n-sessions 30 --max-turns 15 \
    --use-vllm --use-batch-processing --batch-size 100 --parallel-profiles 50 \
    --output-dir ../results/fullscale --profile-path $PROFILE_PATH

# Best-effort teardown of the background vLLM servers.
pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/generate_complex_profiles.py b/collaborativeagents/scripts/generate_complex_profiles.py
new file mode 100644
index 0000000..3838413
--- /dev/null
+++ b/collaborativeagents/scripts/generate_complex_profiles.py
@@ -0,0 +1,719 @@
+"""
+Generate complex user profiles with conditional preferences using LLM.
+
+This script generates user profiles with ~40 situation-dependent preferences
+designed to stress-test retrieval-based personalization systems.
+"""
+
+import json
+import random
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass, field, asdict
+import hashlib
+
+# Will use litellm for generation
+try:
+ import litellm
+except ImportError:
+ litellm = None
+
+
+# ============================================================================
+# Schema Definitions
+# ============================================================================
+
+@dataclass
+class ConditionalPreference:
+    """A preference that applies under specific conditions.
+
+    Mirrors the JSON schema requested from the LLM in
+    LLM_GENERATION_PROMPT: a trigger `condition`, the preferred `action`,
+    an optional `conflict_group` label linking preferences that may clash,
+    and `priority_context` keywords used for simple substring triggering.
+    """
+    pref_id: str
+    condition: str  # When this preference applies
+    action: str  # What the user prefers
+    conflict_group: Optional[str] = None  # Which preferences this might conflict with
+    priority_context: list = field(default_factory=list)  # Keywords that trigger this pref
+
+    def to_natural_language(self) -> str:
+        """Convert to natural language statement."""
+        # Rendered as "When <condition>, <action>." for prompt injection.
+        return f"When {self.condition}, {self.action}."
+
+    def to_memory_card_format(self) -> dict:
+        """Convert to format compatible with personalization system's MemoryCard."""
+        # Profile-sourced preferences are treated as ground truth, hence
+        # confidence 1.0 and source "user_profile".
+        return {
+            "condition": self.condition,
+            "action": self.action,
+            "confidence": 1.0,
+            "source": "user_profile",
+            "pref_id": self.pref_id,
+            "conflict_group": self.conflict_group,
+            "priority_context": self.priority_context
+        }
+
+
+@dataclass
+class ConflictGroup:
+    """Defines a group of preferences that may conflict.
+
+    Built in generate_full_profile from the `conflict_group` labels the
+    LLM assigns to individual preferences; `resolution_rule` is taken
+    from CONFLICT_RESOLUTION_RULES when the group is known there.
+    """
+    group_id: str
+    description: str
+    resolution_rule: str  # How to programmatically resolve
+    member_pref_ids: list = field(default_factory=list)
+
+
+@dataclass
+class UserProfile:
+    """A complex user profile with conditional preferences."""
+    user_id: str
+    persona: str  # High-level description
+    preferences: list  # List of ConditionalPreference
+    conflict_groups: dict = field(default_factory=dict)  # group_id -> ConflictGroup
+
+    def get_preferences_by_category(self) -> dict:
+        """Group preferences by their category (derived from pref_id prefix).
+
+        Returns:
+            dict mapping category prefix (text before the first '_') to a
+            list of ConditionalPreference objects.
+        """
+        categories = {}
+        for pref in self.preferences:
+            cat = pref.pref_id.split('_')[0]
+            if cat not in categories:
+                categories[cat] = []
+            categories[cat].append(pref)
+        return categories
+
+    def get_conflicting_preferences(self, query: str) -> dict:
+        """Find preferences that might conflict for a given query.
+
+        Returns:
+            dict mapping conflict_group id -> list of triggered
+            ConditionalPreference objects, restricted to groups where more
+            than one member was triggered (i.e. an actual conflict).
+        """
+        # Simple keyword matching - in practice, use embeddings
+        triggered = []
+        query_lower = query.lower()
+        for pref in self.preferences:
+            for keyword in pref.priority_context:
+                if keyword.lower() in query_lower:
+                    triggered.append(pref)
+                    break
+
+        # Group by conflict group
+        conflicts = {}
+        for pref in triggered:
+            if pref.conflict_group:
+                if pref.conflict_group not in conflicts:
+                    conflicts[pref.conflict_group] = []
+                conflicts[pref.conflict_group].append(pref)
+
+        # Return groups with more than one triggered preference
+        return {k: v for k, v in conflicts.items() if len(v) > 1}
+
+    def to_dict(self) -> dict:
+        """Serialize to a JSON-friendly dict (used for JSONL output)."""
+        return {
+            "user_id": self.user_id,
+            "persona": self.persona,
+            "preferences": [asdict(p) for p in self.preferences],
+            "conflict_groups": {k: asdict(v) for k, v in self.conflict_groups.items()},
+            "meta": {
+                "total_preferences": len(self.preferences),
+                "total_conflict_groups": len(self.conflict_groups)
+            }
+        }
+
+
+# ============================================================================
+# Preference Templates for LLM Generation
+# ============================================================================
+
+# Per-category generation spec: how many preferences to request from the
+# LLM, example conflict pairs, and the category-specific prompt fragment
+# spliced into LLM_GENERATION_PROMPT. Totals ~43 preferences per profile.
+PREFERENCE_CATEGORIES = {
+    "response_format": {
+        "description": "How responses should be structured",
+        "num_preferences": 4,
+        "example_conflicts": ["bullets vs numbered", "answer-first vs build-up"],
+        "generation_prompt": """Generate {n} preferences about response formatting.
+Include conflicting pairs like:
+- When to use bullet points vs numbered lists
+- When to give answer first vs build up to it
+Each preference must have a specific condition (when it applies) and action (what to do)."""
+    },
+    "verbosity": {
+        "description": "How detailed responses should be",
+        "num_preferences": 5,
+        "example_conflicts": ["concise vs detailed", "explain why vs just answer"],
+        "generation_prompt": """Generate {n} preferences about response verbosity.
+Include conflicting pairs like:
+- Brief responses vs detailed explanations
+- When to explain reasoning vs just give answer
+Conditions should include cue phrases like 'quick question', 'briefly', etc."""
+    },
+    "code_style": {
+        "description": "Programming and code preferences",
+        "num_preferences": 8,
+        "example_conflicts": ["naming conventions by language", "comment styles", "review focus"],
+        "generation_prompt": """Generate {n} preferences about code style.
+Include:
+- Language-specific naming conventions (Python snake_case, JS camelCase, etc.)
+- Comment styles for different code lengths
+- Code review focus (bugs only vs style too)
+- Error handling preferences"""
+    },
+    "math_style": {
+        "description": "Mathematical explanation preferences",
+        "num_preferences": 6,
+        "example_conflicts": ["step-by-step vs intuition", "formal vs informal"],
+        "generation_prompt": """Generate {n} preferences about mathematical explanations.
+Include:
+- When to show detailed steps vs high-level approach
+- Intuition-first vs formula-first for statistics
+- How to structure proofs
+- Verification requests"""
+    },
+    "interaction_pattern": {
+        "description": "How to interact with user",
+        "num_preferences": 6,
+        "example_conflicts": ["confirm vs execute", "recommend vs list options"],
+        "generation_prompt": """Generate {n} preferences about interaction patterns.
+Include:
+- When to confirm before acting vs execute directly
+- When to recommend vs present options
+- How to handle user emotions (frustration, gratitude)"""
+    },
+    "domain_specific": {
+        "description": "Preferences for specific technical domains",
+        "num_preferences": 6,
+        "example_conflicts": ["example-first vs definition-first"],
+        "generation_prompt": """Generate {n} domain-specific preferences for:
+- Machine learning explanations
+- System design discussions
+- API/library usage
+- Data structures (include complexity)"""
+    },
+    "error_correction": {
+        "description": "How to handle user mistakes",
+        "num_preferences": 4,
+        "example_conflicts": ["gentle vs direct correction"],
+        "generation_prompt": """Generate {n} preferences about error correction.
+Include:
+- Minor terminology errors vs fundamental misconceptions
+- Code bugs
+- Correcting own previous responses"""
+    },
+    "output_artifacts": {
+        "description": "How to present code and commands",
+        "num_preferences": 4,
+        "example_conflicts": ["single block vs chunked"],
+        "generation_prompt": """Generate {n} preferences about output artifacts.
+Include:
+- Copyable code blocks vs explained chunks
+- Command presentation
+- Language specification in code fences"""
+    }
+}
+
+
+# Prompt template for generating one category's preferences. Placeholders
+# ({num_prefs}, {category_name}, ...) are filled by
+# generate_preferences_with_llm; literal JSON braces are doubled ({{ }}).
+LLM_GENERATION_PROMPT = """You are generating user preferences for a personalization benchmark.
+
+## Task
+Generate {num_prefs} conditional preferences for the category: {category_name}
+Description: {category_description}
+
+## Requirements
+1. Each preference must have:
+   - A specific CONDITION (when it applies, including trigger phrases/situations)
+   - An ACTION (what the user prefers to happen)
+   - A CONFLICT_GROUP (if this preference might conflict with another)
+   - PRIORITY_CONTEXT (list of keywords that trigger this preference)
+
+2. Include at least one pair of CONFLICTING preferences that could both be triggered
+   by different aspects of the same query. The conflict should be resolvable by
+   looking at the specific context.
+
+3. Conditions should be:
+   - Specific and observable (not vague like "when appropriate")
+   - Include trigger phrases users might say
+   - Cover different situations within this category
+
+4. Example conflicts for this category: {example_conflicts}
+
+## Additional Context (if any)
+{extra_context}
+
+## Output Format
+Return a JSON array of preferences:
+```json
+[
+  {{
+    "pref_id": "{category_prefix}_001",
+    "condition": "specific situation or trigger phrase",
+    "action": "what the user prefers",
+    "conflict_group": "group_name or null",
+    "priority_context": ["keyword1", "keyword2"]
+  }},
+  ...
+]
+```
+
+Generate exactly {num_prefs} preferences."""
+
+
+# Prompt template for generating a short persona consistent with the
+# already-generated preferences; {preference_summary} is filled by
+# generate_persona_with_llm. Expects plain text back, not JSON.
+PERSONA_GENERATION_PROMPT = """Generate a realistic user persona for a software developer/researcher.
+
+## Requirements
+1. The persona should feel like a real person with:
+   - A professional background (role, experience level, domain)
+   - Communication style tendencies
+   - Learning preferences
+   - Work context (startup vs enterprise, solo vs team)
+
+2. The persona should naturally motivate the preferences that will be assigned.
+
+3. Keep it to 2-3 sentences.
+
+## Preference Summary
+This user will have preferences in these areas:
+{preference_summary}
+
+## Examples of good personas:
+- "A senior backend engineer at a fintech startup who values efficiency and directness. Prefers practical solutions over theoretical discussions, and likes to understand the 'why' behind recommendations."
+- "A PhD student in machine learning who is meticulous about mathematical rigor. Appreciates step-by-step derivations and often cross-references multiple sources before accepting an explanation."
+- "A junior developer transitioning from frontend to full-stack. Learns best through examples and appreciates patient, incremental explanations without condescension."
+
+## Output
+Return only the persona text (2-3 sentences), no JSON or formatting."""
+
+
+# ============================================================================
+# Conflict Resolution Logic
+# ============================================================================
+
+# Hand-written, keyword-based adjudication rules per conflict group.
+# "signals" maps a signal name to query keywords hinting at it; the
+# "resolution" string is human-readable documentation of the intended
+# rule (it is also reused as the ConflictGroup description/resolution_rule
+# in generate_full_profile). Consumed by resolve_conflict().
+CONFLICT_RESOLUTION_RULES = {
+    "format_structure": {
+        "signals": {
+            "bullets": ["options", "alternatives", "list", "multiple", "comparison", "pros and cons"],
+            "numbered": ["steps", "procedure", "how to", "setup", "install", "first", "then", "sequence"]
+        },
+        "resolution": "sequential_process -> numbered; parallel_items -> bullets"
+    },
+    "answer_position": {
+        "signals": {
+            "answer_first": ["what is", "what's", "tell me", "give me", "?"],
+            "build_up": ["explain", "why", "how does", "teach", "help me understand"]
+        },
+        "resolution": "direct_question -> answer_first; learning_intent -> build_up"
+    },
+    "response_length": {
+        "signals": {
+            "concise": ["quick", "brief", "short", "tldr", "in a nutshell", "one line"],
+            "detailed": ["explain", "elaborate", "in detail", "thoroughly", "complex", "proof"]
+        },
+        "resolution": "explicit_brevity_cue -> concise (overrides topic complexity)"
+    },
+    "naming_convention": {
+        "signals": {
+            "snake_case": ["python", ".py", "def ", "import "],
+            "camelCase": ["javascript", "typescript", ".js", ".ts", "const ", "let ", "function "],
+            "UPPER_keywords": ["sql", "SELECT", "FROM", "WHERE", "database"]
+        },
+        "resolution": "determined by programming language detection"
+    },
+    "autonomy": {
+        "signals": {
+            "confirm": ["should I", "would you like", "complex", "multiple parts", "project"],
+            "execute": ["do this", "make this", "just", "please", "now"]
+        },
+        "resolution": "ambiguous_task -> confirm; clear_instruction -> execute"
+    },
+    "code_presentation": {
+        "signals": {
+            "single_block": ["copy", "paste", "use this", "give me the code", "full code"],
+            "chunked": ["teach", "explain", "understand", "walk through", "learn"]
+        },
+        "resolution": "copy_intent -> single_block; learning_intent -> chunked"
+    }
+}
+
+
+def resolve_conflict(conflict_group: str, query: str, candidates: list) -> Optional[str]:
+    """
+    Programmatically resolve which preference wins in a conflict.
+
+    Scoring: for each rule keyword found in the query, a candidate earns
+    +1 when the signal-category name fuzzily matches one of its
+    priority_context entries (substring in either direction), and a bonus
+    +1 when the matched keyword itself appears in that context entry.
+
+    Args:
+        conflict_group: The conflict group ID
+        query: The user query
+        candidates: List of ConditionalPreference objects in this conflict
+
+    Returns:
+        pref_id of the winning preference, or None if cannot resolve
+        (unknown group, or no candidate scored above zero)
+    """
+    if conflict_group not in CONFLICT_RESOLUTION_RULES:
+        return None
+
+    rules = CONFLICT_RESOLUTION_RULES[conflict_group]
+    query_lower = query.lower()
+
+    # Score each candidate based on signal matches
+    scores = {}
+    for pref in candidates:
+        scores[pref.pref_id] = 0
+
+        # Check each signal category
+        for signal_category, keywords in rules["signals"].items():
+            for keyword in keywords:
+                if keyword.lower() in query_lower:
+                    # Check if this signal category matches this preference
+                    for ctx in pref.priority_context:
+                        if ctx.lower() in signal_category.lower() or signal_category.lower() in ctx.lower():
+                            scores[pref.pref_id] += 1
+                            # Also check if keyword is in priority context
+                            # (extra point, only awarded when the category
+                            # already matched this ctx entry)
+                            if keyword.lower() in ctx.lower():
+                                scores[pref.pref_id] += 1
+
+    # Return highest scoring preference
+    # (ties break by insertion order of candidates, i.e. first wins)
+    if scores:
+        winner = max(scores, key=scores.get)
+        if scores[winner] > 0:
+            return winner
+
+    return None
+
+
+def create_conflict_test_case(conflict_group: str, preferences: list) -> Optional[dict]:
+    """
+    Create a test case that triggers a specific conflict.
+
+    Uses a fixed, hand-authored query per known conflict group; the winner
+    is then computed with resolve_conflict() against the triggered subset
+    of `preferences`.
+
+    Returns a dict with:
+    - query: A query that triggers multiple preferences
+    - triggered_prefs: List of preference IDs triggered
+    - correct_pref: The preference that should win
+    - resolution_reason: Why this preference wins
+    Returns None for conflict groups without a scripted test case.
+    """
+    if conflict_group not in CONFLICT_RESOLUTION_RULES:
+        return None
+
+    rules = CONFLICT_RESOLUTION_RULES[conflict_group]
+
+    # Create queries that trigger conflicts
+    # (each query is deliberately ambiguous between two signal categories)
+    test_cases = {
+        "format_structure": {
+            "query": "How do I set up a Python virtual environment? List the main options.",
+            "ambiguity": "Both 'set up' (procedure->numbered) and 'list options' (parallel->bullets)",
+            "resolution": "Primary intent is setup procedure -> numbered steps"
+        },
+        "response_length": {
+            "query": "Quick question - how does backpropagation work?",
+            "ambiguity": "'Quick question' (concise) vs 'how does X work' (complex topic)",
+            "resolution": "Explicit brevity cue 'quick question' overrides topic complexity"
+        },
+        "answer_position": {
+            "query": "What is gradient descent and why is it used?",
+            "ambiguity": "'What is' (answer first) vs 'why' (build up explanation)",
+            "resolution": "Combined question: give brief answer, then explain why"
+        },
+        "naming_convention": {
+            "query": "Write a function to parse JSON in both Python and JavaScript",
+            "ambiguity": "Two languages with different conventions",
+            "resolution": "Use appropriate convention for each: snake_case for Python, camelCase for JS"
+        },
+        "autonomy": {
+            "query": "Refactor this authentication module to use JWT",
+            "ambiguity": "'Refactor' is complex, but instruction is specific",
+            "resolution": "Should confirm approach before major refactor"
+        },
+        "code_presentation": {
+            "query": "I want to understand how this sorting algorithm works, give me the code",
+            "ambiguity": "'understand' (chunked) vs 'give me the code' (single block)",
+            "resolution": "Learning intent detected -> chunked with explanations"
+        }
+    }
+
+    if conflict_group in test_cases:
+        tc = test_cases[conflict_group]
+        # Find which preferences are triggered
+        triggered = [p for p in preferences if p.conflict_group == conflict_group]
+        winner = resolve_conflict(conflict_group, tc["query"], triggered)
+
+        return {
+            "conflict_group": conflict_group,
+            "query": tc["query"],
+            "ambiguity": tc["ambiguity"],
+            "triggered_pref_ids": [p.pref_id for p in triggered],
+            "correct_pref_id": winner,
+            "resolution_reason": tc["resolution"]
+        }
+
+    return None
+
+
+# ============================================================================
+# LLM-based Profile Generation
+# ============================================================================
+
+def generate_preferences_with_llm(
+ category: str,
+ model: str = "gpt-4o-mini",
+ extra_context: str = ""
+) -> list:
+ """Generate preferences for a category using LLM."""
+ if litellm is None:
+ raise ImportError("litellm required for LLM generation")
+
+ cat_info = PREFERENCE_CATEGORIES[category]
+ prompt = LLM_GENERATION_PROMPT.format(
+ num_prefs=cat_info["num_preferences"],
+ category_name=category,
+ category_description=cat_info["description"],
+ example_conflicts=", ".join(cat_info["example_conflicts"]),
+ category_prefix=category[:2],
+ extra_context=extra_context or "None"
+ )
+
+ response = litellm.completion(
+ model=model,
+ messages=[{"role": "user", "content": prompt}],
+ response_format={"type": "json_object"}
+ )
+
+ content = response.choices[0].message.content
+ # Extract JSON from response
+ try:
+ data = json.loads(content)
+ if isinstance(data, dict) and "preferences" in data:
+ data = data["preferences"]
+ return [ConditionalPreference(**p) for p in data]
+ except json.JSONDecodeError:
+ # Try to extract JSON array from markdown code block
+ import re
+ match = re.search(r'\[[\s\S]*\]', content)
+ if match:
+ data = json.loads(match.group())
+ return [ConditionalPreference(**p) for p in data]
+ raise
+
+
+def generate_persona_with_llm(
+    preferences: list,
+    model: str = "gpt-4o-mini"
+) -> str:
+    """Generate a persona that matches the preferences.
+
+    Args:
+        preferences: ConditionalPreference objects already generated.
+        model: litellm model identifier.
+
+    Returns:
+        Persona text (stripped) produced by the LLM.
+
+    Raises:
+        ImportError: If litellm is not installed.
+    """
+    if litellm is None:
+        raise ImportError("litellm required for LLM generation")
+
+    # Summarize preferences by category
+    # (actions truncated to 50 chars; at most 3 shown per category)
+    by_cat = {}
+    for p in preferences:
+        cat = p.pref_id.split('_')[0]
+        if cat not in by_cat:
+            by_cat[cat] = []
+        by_cat[cat].append(p.action[:50] + "...")
+
+    summary = "\n".join([f"- {cat}: {', '.join(actions[:3])}" for cat, actions in by_cat.items()])
+
+    prompt = PERSONA_GENERATION_PROMPT.format(preference_summary=summary)
+
+    response = litellm.completion(
+        model=model,
+        messages=[{"role": "user", "content": prompt}]
+    )
+
+    return response.choices[0].message.content.strip()
+
+
+def generate_full_profile(
+    user_id: str,
+    model: str = "gpt-4o-mini",
+    categories: Optional[list] = None
+) -> UserProfile:
+    """Generate a complete user profile with all preferences.
+
+    Args:
+        user_id: Identifier stored on the resulting profile.
+        model: litellm model identifier for all generation calls.
+        categories: Subset of PREFERENCE_CATEGORIES keys; None means all.
+
+    Returns:
+        UserProfile with preferences, persona, and derived conflict groups.
+    """
+    if categories is None:
+        categories = list(PREFERENCE_CATEGORIES.keys())
+
+    all_preferences = []
+    for cat in categories:
+        prefs = generate_preferences_with_llm(cat, model)
+        all_preferences.extend(prefs)
+
+    persona = generate_persona_with_llm(all_preferences, model)
+
+    # Build conflict groups
+    # (description/resolution_rule come from CONFLICT_RESOLUTION_RULES
+    # when the LLM-assigned group name is known there, else empty strings)
+    conflict_groups = {}
+    for pref in all_preferences:
+        if pref.conflict_group:
+            if pref.conflict_group not in conflict_groups:
+                conflict_groups[pref.conflict_group] = ConflictGroup(
+                    group_id=pref.conflict_group,
+                    description=CONFLICT_RESOLUTION_RULES.get(pref.conflict_group, {}).get("resolution", ""),
+                    resolution_rule=CONFLICT_RESOLUTION_RULES.get(pref.conflict_group, {}).get("resolution", ""),
+                    member_pref_ids=[]
+                )
+            conflict_groups[pref.conflict_group].member_pref_ids.append(pref.pref_id)
+
+    return UserProfile(
+        user_id=user_id,
+        persona=persona,
+        preferences=all_preferences,
+        conflict_groups=conflict_groups
+    )
+
+
+# ============================================================================
+# Dataset Loading and Challenging Question Selection
+# ============================================================================
+
+# Candidate benchmark datasets for challenging-question selection. Each
+# entry names a Hugging Face source, an optional per-example filter, and
+# whether step-by-step prompting should be appended. NOTE(review): not
+# consumed anywhere in this script as far as visible — presumably read by
+# the experiment runner; verify before relying on the "source" ids.
+CHALLENGING_DATASETS = {
+    # Existing datasets with difficulty filtering
+    "math-hard": {
+        "source": "lighteval/MATH-Hard",
+        "filter": lambda x: x.get("level") in ["Level 4", "Level 5"],
+        "encourage_step_by_step": True
+    },
+    "humaneval-hard": {
+        "source": "openai_humaneval",
+        "filter": lambda x: len(x.get("prompt", "")) > 200,  # Longer problems
+        "encourage_step_by_step": True
+    },
+
+    # New challenging datasets to add
+    "gpqa": {
+        "source": "Idavidrein/gpqa",
+        "description": "PhD-level science questions",
+        "filter": lambda x: x.get("difficulty") == "hard",
+        "encourage_step_by_step": True
+    },
+    "theoremqa": {
+        "source": "wenhu/TheoremQA",
+        "description": "Theorem-based math requiring multi-step proofs",
+        "filter": None,
+        "encourage_step_by_step": True
+    },
+    "livecodebench": {
+        "source": "livecodebench/livecodebench",
+        "description": "Recent competitive programming problems",
+        "filter": lambda x: x.get("difficulty") in ["medium", "hard"],
+        "encourage_step_by_step": True
+    },
+    "aime": {
+        "source": "AI-MO/aimo-progress-prize",
+        "description": "American Invitational Mathematics Examination",
+        "filter": None,
+        "encourage_step_by_step": True
+    },
+    "scicode": {
+        "source": "scicode-bench/SciCode",
+        "description": "Scientific computing problems",
+        "filter": None,
+        "encourage_step_by_step": True
+    }
+}
+
+
+# Prompt suffixes encouraging explicit reasoning, keyed by task family.
+STEP_BY_STEP_PROMPT_ADDITIONS = {
+    "math": """
+When solving this problem:
+1. First identify what type of problem this is
+2. State the key concepts/theorems needed
+3. Work through the solution step by step
+4. Verify your answer
+Take your time and show your reasoning at each step.""",
+
+    "code": """
+When solving this problem:
+1. First understand the requirements and edge cases
+2. Outline your approach before writing code
+3. Implement step by step, explaining your logic
+4. Consider time/space complexity
+5. Test with example inputs
+Show your reasoning throughout.""",
+
+    "reasoning": """
+When solving this problem:
+1. Carefully read and identify the key information
+2. State any assumptions you're making
+3. Work through the logic step by step
+4. Check for any flaws in your reasoning
+5. State your conclusion clearly
+Take your time and explain your thought process."""
+}
+
+
+# ============================================================================
+# Batch Generation Script
+# ============================================================================
+
+def generate_profiles_batch(
+ num_profiles: int,
+ output_path: Path,
+ model: str = "gpt-4o-mini",
+ seed: int = 42
+) -> list:
+ """Generate multiple user profiles."""
+ random.seed(seed)
+ profiles = []
+
+ for i in range(num_profiles):
+ user_id = f"user_{hashlib.md5(f'{seed}_{i}'.encode()).hexdigest()[:8]}"
+
+ # Optionally vary which categories are emphasized
+ # Some users might have stronger code preferences, others math, etc.
+ category_weights = {cat: random.random() for cat in PREFERENCE_CATEGORIES}
+
+ try:
+ profile = generate_full_profile(user_id, model)
+ profiles.append(profile)
+ print(f"Generated profile {i+1}/{num_profiles}: {user_id}")
+ except Exception as e:
+ print(f"Error generating profile {i+1}: {e}")
+ continue
+
+ # Save profiles
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(output_path, 'w') as f:
+ for profile in profiles:
+ f.write(json.dumps(profile.to_dict()) + '\n')
+
+ print(f"Saved {len(profiles)} profiles to {output_path}")
+ return profiles
+
+
+def generate_conflict_test_suite(profiles: list, output_path: Path):
+ """Generate test cases for conflict resolution evaluation."""
+ test_cases = []
+
+ for profile in profiles:
+ for conflict_group in profile.conflict_groups:
+ tc = create_conflict_test_case(
+ conflict_group,
+ profile.preferences
+ )
+ if tc:
+ tc["user_id"] = profile.user_id
+ test_cases.append(tc)
+
+ with open(output_path, 'w') as f:
+ json.dump(test_cases, f, indent=2)
+
+ print(f"Generated {len(test_cases)} conflict test cases")
+ return test_cases
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
+if __name__ == "__main__":
+    import argparse
+
+    # CLI: batch-generate profiles and, optionally, conflict test cases.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num_profiles", type=int, default=10)
+    parser.add_argument("--output_dir", type=str, default="collaborativeagents/data/complex_profiles")
+    parser.add_argument("--model", type=str, default="gpt-4o-mini")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--generate_conflicts", action="store_true")
+
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+
+    # Generate profiles (written to <output_dir>/profiles.jsonl)
+    profiles = generate_profiles_batch(
+        num_profiles=args.num_profiles,
+        output_path=output_dir / "profiles.jsonl",
+        model=args.model,
+        seed=args.seed
+    )
+
+    # Generate conflict test cases (written to <output_dir>/conflict_tests.json)
+    if args.generate_conflicts:
+        generate_conflict_test_suite(
+            profiles,
+            output_path=output_dir / "conflict_tests.json"
+        )
diff --git a/collaborativeagents/scripts/generate_profiles_v2.py b/collaborativeagents/scripts/generate_profiles_v2.py
new file mode 100644
index 0000000..c431302
--- /dev/null
+++ b/collaborativeagents/scripts/generate_profiles_v2.py
@@ -0,0 +1,475 @@
+"""
+Generate 100 complex user profiles with ~40 conditional preferences using LLM.
+
+Key differences from original CollaborativeAgents:
+1. 40 conditional preferences (vs their 3 flat preferences)
+2. Preferences have explicit conditions for when they apply
+3. Conflict groups marked for testing conflict resolution
+4. LLM-based batch generation with quality control
+"""
+
+import json
+import random
+import hashlib
+from pathlib import Path
+from dataclasses import dataclass, field, asdict
+from typing import Optional, List, Dict, Any
+import argparse
+
+try:
+ import litellm
+except ImportError:
+ litellm = None
+
+
+# =============================================================================
+# Preference Category Definitions
+# =============================================================================
+
+# v2 per-category generation spec: preference count, documented conflict
+# pairs (the "conflicts" tuples are informational — only "num_preferences"
+# and "prompt" are read by generate_preferences_for_category), and the
+# category prompt spliced into LLM_PREFERENCE_GENERATION_PROMPT.
+PREFERENCE_CATEGORIES = {
+    "response_format": {
+        "num_preferences": 4,
+        "conflicts": [("rf_bullets", "rf_numbered"), ("rf_answer_first", "rf_build_up")],
+        "prompt": """Generate 4 preferences about response FORMAT:
+1. When to use bullet points vs numbered lists
+2. When to lead with the answer vs build up to it
+
+Each must have:
+- A SPECIFIC condition (trigger phrase or situation)
+- A clear action (what to do)
+- Conflict group (format_structure or answer_position)
+- Priority keywords that trigger this preference
+
+Make conditions mutually exclusive within each conflict group."""
+    },
+
+    "verbosity": {
+        "num_preferences": 5,
+        "conflicts": [("vb_concise", "vb_detailed"), ("vb_explain_why", "vb_just_answer")],
+        "prompt": """Generate 5 preferences about VERBOSITY/LENGTH:
+1. When to be concise (user says "quick", "briefly", "TL;DR")
+2. When to be detailed (complex topics, "explain", "in depth")
+3. When to explain reasoning vs just give answer
+
+Include explicit trigger phrases in conditions.
+Conflict groups: response_length, explanation_depth"""
+    },
+
+    "code_style": {
+        "num_preferences": 8,
+        "conflicts": [
+            ("cs_snake", "cs_camel", "cs_sql_upper"),  # By language
+            ("cs_inline_comments", "cs_docstrings"),  # Comment style
+            ("cs_bugs_only", "cs_full_review")  # Review scope
+        ],
+        "prompt": """Generate 8 preferences about CODE STYLE:
+1-3. Naming conventions BY LANGUAGE (Python=snake_case, JS=camelCase, SQL=UPPERCASE)
+4-5. Comment styles for short snippets vs production code
+6-7. Code review scope (bugs only vs style too)
+8. Error handling preference
+
+Conflict groups: naming_convention, comment_style, review_scope"""
+    },
+
+    "math_style": {
+        "num_preferences": 6,
+        "conflicts": [("ms_show_steps", "ms_high_level"), ("ms_intuition", "ms_formula")],
+        "prompt": """Generate 6 preferences about MATHEMATICAL explanations:
+1-2. When to show detailed steps vs high-level approach
+3-4. When to lead with intuition vs formula (statistics vs pure math)
+5. How to structure proofs
+6. Practice problems when studying for exams
+
+Conflict groups: math_detail, math_approach"""
+    },
+
+    "interaction_pattern": {
+        "num_preferences": 6,
+        "conflicts": [("ip_confirm", "ip_execute"), ("ip_recommend", "ip_compare")],
+        "prompt": """Generate 6 preferences about INTERACTION patterns:
+1-2. When to confirm before acting vs execute directly
+3-4. When to recommend vs present options/comparison
+5. How to handle user frustration
+6. How to handle user thanks/satisfaction
+
+Conflict groups: autonomy, guidance_style"""
+    },
+
+    "domain_specific": {
+        "num_preferences": 6,
+        "conflicts": [("ds_example_first", "ds_definition_first")],
+        "prompt": """Generate 6 DOMAIN-SPECIFIC preferences:
+1. ML explanations (include math formulation)
+2. System design (components list before interactions)
+3. API/library usage (example first)
+4. Theoretical concepts (definition first)
+5. Data structures (include complexity)
+6. Documentation style
+
+Conflict group: example_position"""
+    },
+
+    "error_correction": {
+        "num_preferences": 4,
+        "conflicts": [("ec_gentle", "ec_direct")],
+        "prompt": """Generate 4 preferences about ERROR CORRECTION:
+1. Minor terminology errors (correct gently inline)
+2. Fundamental misconceptions (address directly)
+3. Code bugs
+4. Agent's own mistakes
+
+Conflict group: correction_style"""
+    },
+
+    "output_artifacts": {
+        "num_preferences": 4,
+        "conflicts": [("oa_single_block", "oa_chunked")],
+        "prompt": """Generate 4 preferences about OUTPUT format:
+1. Copyable code (single block)
+2. Teaching code (chunked with explanations)
+3. Terminal commands (bash blocks with expected output)
+4. Always specify language in code fences
+
+Conflict group: code_presentation"""
+    }
+}
+
+
+# v2 prompt template for one category's preferences; placeholders are
+# filled by ProfileGenerator.generate_preferences_for_category. Literal
+# JSON braces are doubled ({{ }}) to survive str.format.
+LLM_PREFERENCE_GENERATION_PROMPT = """You are generating CONDITIONAL user preferences for a personalization benchmark.
+
+# Category: {category_name}
+# Number of preferences to generate: {num_preferences}
+
+{category_prompt}
+
+# Output Requirements
+Generate exactly {num_preferences} preferences in this JSON format:
+```json
+{{
+  "preferences": [
+    {{
+      "pref_id": "{prefix}_001",
+      "condition": "When X happens / When user says Y / For Z type of content",
+      "action": "Do A, B, C (be specific)",
+      "conflict_group": "group_name_or_null",
+      "priority_context": ["keyword1", "keyword2", "phrase1"]
+    }},
+    ...
+  ]
+}}
+```
+
+# Critical Rules:
+1. Conditions must be SPECIFIC and OBSERVABLE (include trigger phrases)
+2. Within a conflict group, conditions must be MUTUALLY EXCLUSIVE
+3. Priority_context keywords should appear in queries that trigger this preference
+4. Actions must be concrete and verifiable
+
+Generate preferences that will:
+- Create interesting conflicts (RAG should resolve correctly, context methods fail)
+- Be testable (we can verify if an agent followed them)
+- Be realistic (based on actual user behavior)
+
+Output ONLY the JSON, no other text."""
+
+
+# v2 persona prompt; {categories} is a comma-joined category list and the
+# model is expected to return plain text (parsed by generate_persona).
+PERSONA_GENERATION_PROMPT = """Generate a realistic user persona (2-3 sentences) that would naturally have these preference categories:
+{categories}
+
+The persona should be a software developer, researcher, or technical professional. Include:
+- Professional background (role, experience level, domain)
+- Communication style tendencies
+- Work context
+
+Output ONLY the persona text, no JSON or formatting."""
+
+
+# =============================================================================
+# Profile Generator
+# =============================================================================
+
+class ProfileGenerator:
+ """Generate complex user profiles with conditional preferences."""
+
+    def __init__(self, model: str = "meta-llama/Llama-3.1-70B-Instruct", seed: int = 42):
+        """Create a generator bound to one LLM and a seeded private RNG.
+
+        Raises:
+            ImportError: If litellm is not installed.
+        """
+        self.model = model
+        # Private Random instance so generation is reproducible without
+        # touching the module-global RNG state.
+        self.random = random.Random(seed)
+
+        if litellm is None:
+            raise ImportError("litellm required for profile generation")
+
+    def _call_llm(self, prompt: str, json_mode: bool = True) -> str:
+        """Call LLM with prompt.
+
+        Args:
+            prompt: Single user message sent to the model.
+            json_mode: When True, request structured JSON output via
+                litellm's `response_format={"type": "json_object"}`.
+
+        Returns:
+            Raw message content from the first completion choice.
+        """
+        kwargs = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0.7,
+            "max_tokens": 4096,
+        }
+
+        if json_mode:
+            kwargs["response_format"] = {"type": "json_object"}
+
+        response = litellm.completion(**kwargs)
+        return response.choices[0].message.content
+
+ def _parse_json(self, text: str) -> dict:
+ """Parse JSON from response."""
+ import re
+
+ try:
+ return json.loads(text)
+ except json.JSONDecodeError:
+ pass
+
+ # Try markdown code block
+ match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
+ if match:
+ try:
+ return json.loads(match.group(1))
+ except:
+ pass
+
+ # Try to find JSON object
+ match = re.search(r'\{[\s\S]*\}', text)
+ if match:
+ try:
+ return json.loads(match.group())
+ except:
+ pass
+
+ raise ValueError(f"Failed to parse JSON from: {text[:500]}")
+
+ def generate_preferences_for_category(
+ self,
+ category: str,
+ prefix: str
+ ) -> List[Dict]:
+ """Generate preferences for a single category."""
+ cat_info = PREFERENCE_CATEGORIES[category]
+
+ prompt = LLM_PREFERENCE_GENERATION_PROMPT.format(
+ category_name=category,
+ num_preferences=cat_info["num_preferences"],
+ category_prompt=cat_info["prompt"],
+ prefix=prefix
+ )
+
+ response = self._call_llm(prompt, json_mode=True)
+ data = self._parse_json(response)
+
+ prefs = data.get("preferences", data)
+ if isinstance(prefs, dict):
+ prefs = list(prefs.values())
+
+ # Validate and fix pref_ids
+ for i, pref in enumerate(prefs):
+ if "pref_id" not in pref:
+ pref["pref_id"] = f"{prefix}_{i+1:03d}"
+
+ return prefs
+
+ def generate_persona(self, categories: List[str]) -> str:
+ """Generate a persona for the given preference categories."""
+ prompt = PERSONA_GENERATION_PROMPT.format(
+ categories=", ".join(categories)
+ )
+
+ return self._call_llm(prompt, json_mode=False).strip()
+
+ def generate_profile(self, user_id: str) -> Dict:
+ """Generate a complete user profile with ~40 preferences."""
+ all_preferences = []
+ category_prefixes = {
+ "response_format": "rf",
+ "verbosity": "vb",
+ "code_style": "cs",
+ "math_style": "ms",
+ "interaction_pattern": "ip",
+ "domain_specific": "ds",
+ "error_correction": "ec",
+ "output_artifacts": "oa"
+ }
+
+ print(f" Generating preferences for {user_id}...")
+ for category, prefix in category_prefixes.items():
+ try:
+ prefs = self.generate_preferences_for_category(category, prefix)
+ all_preferences.extend(prefs)
+ print(f" {category}: {len(prefs)} preferences")
+ except Exception as e:
+ print(f" ERROR in {category}: {e}")
+
+ # Generate persona
+ print(f" Generating persona...")
+ persona = self.generate_persona(list(category_prefixes.keys()))
+
+ # Build conflict groups mapping
+ conflict_groups = {}
+ for pref in all_preferences:
+ cg = pref.get("conflict_group")
+ if cg:
+ if cg not in conflict_groups:
+ conflict_groups[cg] = []
+ conflict_groups[cg].append(pref["pref_id"])
+
+ return {
+ "user_id": user_id,
+ "persona": persona,
+ "preferences": all_preferences,
+ "conflict_groups": conflict_groups,
+ "meta": {
+ "total_preferences": len(all_preferences),
+ "total_conflict_groups": len(conflict_groups),
+ "generator": "generate_profiles_v2.py"
+ }
+ }
+
+
def generate_profiles_batch(
    num_profiles: int,
    output_path: Path,
    model: str = "meta-llama/Llama-3.1-70B-Instruct",
    seed: int = 42
):
    """Generate multiple LLM-backed profiles and write them out as JSONL.

    Profiles that fail to generate are skipped; the returned list contains
    only the successes.
    """
    gen = ProfileGenerator(model=model, seed=seed)
    profiles = []

    for idx in range(num_profiles):
        # Deterministic-but-opaque user id derived from (seed, index).
        uid = f"user_{hashlib.md5(f'{seed}_{idx}'.encode()).hexdigest()[:8]}"
        print(f"\n[{idx+1}/{num_profiles}] Generating profile: {uid}")

        try:
            prof = gen.generate_profile(uid)
            profiles.append(prof)
            print(f"  Generated {prof['meta']['total_preferences']} preferences")
        except Exception as e:
            print(f"  ERROR: {e}")
            continue

    # Persist as one JSON object per line.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as fh:
        fh.writelines(json.dumps(prof) + '\n' for prof in profiles)

    print(f"\n{'='*60}")
    print(f"Generated {len(profiles)} profiles")
    print(f"Saved to: {output_path}")

    return profiles
+
+
+# =============================================================================
+# Fallback: Generate from Schema (No LLM Required)
+# =============================================================================
+
def generate_profiles_from_schema(
    num_profiles: int,
    schema_path: Path,
    output_path: Path,
    seed: int = 42
) -> List[Dict]:
    """
    Generate profiles from the predefined schema (no LLM calls).
    Useful for testing or when API is unavailable.

    Args:
        num_profiles: Number of profiles to generate.
        schema_path: JSON file with a top-level ``preference_categories``
            list, each entry holding a ``preferences`` list.
        output_path: JSONL destination (one profile per line); parent
            directories are created as needed.
        seed: RNG seed; output is deterministic for a fixed seed.

    Returns:
        The list of generated profile dicts (also written to output_path).
    """
    with open(schema_path) as f:
        schema = json.load(f)

    # Use a private RNG rather than seeding the global `random` module so
    # this call does not perturb unrelated randomness elsewhere in the
    # process.  Same algorithm/seed, so generated output is unchanged.
    rng = random.Random(seed)
    profiles = []

    # Extract all preferences from schema
    all_prefs = []
    for cat in schema["preference_categories"]:
        all_prefs.extend(cat["preferences"])

    # Sample personas
    sample_personas = [
        "A senior backend engineer who values efficiency and directness. Prefers practical solutions over theoretical discussions.",
        "A PhD student in ML who is meticulous about mathematical rigor. Appreciates step-by-step derivations.",
        "A junior developer learning full-stack. Prefers patient, incremental explanations with examples.",
        "A DevOps engineer focused on automation. Wants concise, actionable answers with commands to run.",
        "A data scientist who thinks visually. Prefers intuition before formulas and lots of examples.",
        "A tech lead reviewing code from their team. Focuses on maintainability and best practices.",
        "A researcher prototyping quickly. Wants working code fast, willing to refactor later.",
        "A student preparing for technical interviews. Needs step-by-step problem solving practice.",
    ]

    for i in range(num_profiles):
        user_id = f"user_{hashlib.md5(f'{seed}_{i}'.encode()).hexdigest()[:8]}"

        # Select random subset of preferences (35-45, capped by availability)
        num_prefs = rng.randint(35, 45)
        selected_prefs = rng.sample(all_prefs, min(num_prefs, len(all_prefs)))

        # Map conflict-group name -> list of member pref_ids.
        conflict_groups = {}
        for pref in selected_prefs:
            cg = pref.get("conflict_group")
            if cg:
                conflict_groups.setdefault(cg, []).append(pref["pref_id"])

        profile = {
            "user_id": user_id,
            "persona": rng.choice(sample_personas),
            "preferences": selected_prefs,
            "conflict_groups": conflict_groups,
            "meta": {
                "total_preferences": len(selected_prefs),
                "total_conflict_groups": len(conflict_groups),
                "generator": "schema_based"
            }
        }
        profiles.append(profile)

    # Save as JSONL, one profile per line.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for profile in profiles:
            f.write(json.dumps(profile) + '\n')

    print(f"Generated {len(profiles)} profiles from schema")
    return profiles
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Generate complex user profiles with conditional preferences"
+ )
+ parser.add_argument("--num_profiles", type=int, default=100,
+ help="Number of profiles to generate")
+ parser.add_argument("--output", type=str,
+ default="collaborativeagents/data/complex_profiles_v2/profiles.jsonl")
+ parser.add_argument("--model", type=str,
+ default="meta-llama/Llama-3.1-70B-Instruct",
+ help="LLM model for generation")
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--from_schema", type=str, default=None,
+ help="Generate from schema file instead of LLM")
+
+ args = parser.parse_args()
+ output_path = Path(args.output)
+
+ if args.from_schema:
+ generate_profiles_from_schema(
+ num_profiles=args.num_profiles,
+ schema_path=Path(args.from_schema),
+ output_path=output_path,
+ seed=args.seed
+ )
+ else:
+ generate_profiles_batch(
+ num_profiles=args.num_profiles,
+ output_path=output_path,
+ model=args.model,
+ seed=args.seed
+ )
diff --git a/collaborativeagents/scripts/generate_training_data.sh b/collaborativeagents/scripts/generate_training_data.sh
new file mode 100644
index 0000000..bdd5fba
--- /dev/null
+++ b/collaborativeagents/scripts/generate_training_data.sh
@@ -0,0 +1,22 @@
#!/bin/bash
# Generate multi-turn training data for each dataset via run.py.
#
# Prerequisite: an OpenAI-compatible server on localhost:8004, e.g.:
# python -m sglang.launch_server --model-path meta-llama/Llama-3.3-70B-Instruct --port 8004 --tp-size 4 --context-length 16384

BATCH_SIZE=100

# Loop over eval sizes and datasets
for EVAL_SIZE in 20; do
    for DATASET in math-hard math-500 logiqa mmlu medqa; do
        # Convert dataset name for file paths (replace - with _)
        DATASET_FILE=$(echo ${DATASET} | tr '-' '_')

        echo "Generating training data for dataset: ${DATASET} with eval_size ${EVAL_SIZE}"

        # All three roles (user simulator, collaborator agent, judge) share
        # the same local Llama-3.3-70B endpoint; stdout/stderr are appended
        # to a per-dataset .out log next to the generated .jsonl.
        python3 run.py --experiment_type training_data_with_user_profiles_with_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
            --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
            --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
            --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
            --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/training/training_data/${DATASET_FILE}_llama70b_user_llama70b_agent_training_data_with_reflection_eval_size_${EVAL_SIZE}.jsonl \
            >> /shared/storage-01/users/mehri2/mem/collaborativeagents/training/training_data/${DATASET_FILE}_llama70b_user_llama70b_agent_training_data_with_reflection_eval_size_${EVAL_SIZE}.out 2>&1

    done
done
diff --git a/collaborativeagents/scripts/preflight_test.py b/collaborativeagents/scripts/preflight_test.py
new file mode 100644
index 0000000..2411f1f
--- /dev/null
+++ b/collaborativeagents/scripts/preflight_test.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python
+"""
+Pre-flight tests before running full experiments.
+
+Tests:
+1. Timeout handling (infinite timeout)
+2. Large batch stress test (batch=100)
+3. Context length handling (auto-reduce max_tokens)
+4. Error recovery (partial failures)
+5. Sequential profile processing (for RAG/reflection methods)
+6. Memory usage estimation
+"""
+
+import sys
+import os
+import time
+import json
+import asyncio
+
+sys.path.insert(0, '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents')
+
+from agents.batch_vllm_agent import BatchVLLMClient, BatchConversationGenerator
+
+
def test_1_timeout_handling(user_url: str):
    """Test 1: Infinite timeout configuration."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 1: Timeout Handling (Infinite Timeout)")
    print(banner)

    # Client configured with no request timeout at all.
    vllm_client = BatchVLLMClient(
        vllm_url=user_url,
        max_tokens=256,
        temperature=0.7,
        timeout=None,  # Infinite timeout
        max_concurrent=50,
    )

    print("✓ Client created with timeout=None (infinite)")
    print(f"  Model: {vllm_client.model_name}")
    print(f"  Max concurrent: {vllm_client.max_concurrent}")

    # One trivial request verifies the client can round-trip at all.
    prompts = [[{"role": "user", "content": "Say 'hello' and nothing else."}]]

    t0 = time.time()
    replies = vllm_client.batch_completion(prompts)
    took = time.time() - t0

    if not replies[0]:
        print("✗ Single request failed")
        return False

    print(f"✓ Single request succeeded in {took:.1f}s")
    print(f"  Response: {replies[0][:50]}...")
    return True
+
+
def test_2_large_batch(user_url: str, batch_size: int = 100):
    """Test 2: Large batch stress test."""
    banner = "=" * 60
    print("\n" + banner)
    print(f"TEST 2: Large Batch Stress Test (batch={batch_size})")
    print(banner)

    stress_client = BatchVLLMClient(
        vllm_url=user_url,
        max_tokens=128,  # Small to speed up test
        temperature=0.7,
        timeout=None,
        max_concurrent=100,
    )

    # One trivial numbered prompt per slot in the batch.
    batch = [
        [{"role": "user", "content": f"Count from 1 to 5. Request #{n+1}"}]
        for n in range(batch_size)
    ]

    print(f"Sending {batch_size} concurrent requests...")
    t0 = time.time()
    replies = stress_client.batch_completion(batch)
    took = time.time() - t0

    ok_count = sum(r is not None for r in replies)

    print("\nResults:")
    print(f"  Successes: {ok_count}/{batch_size}")
    print(f"  Time: {took:.1f}s")
    print(f"  Throughput: {ok_count * 3600 / took:.0f} requests/hr")

    # Pass when at least 90% of the batch came back.
    passed = ok_count >= batch_size * 0.9
    if passed:
        print("✓ Batch test PASSED (>90% success)")
    else:
        print("✗ Batch test FAILED (<90% success)")
    return passed
+
+
def test_3_context_length_handling(user_url: str):
    """Test 3: Context length error handling."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 3: Context Length Handling")
    print(banner)

    ctx_client = BatchVLLMClient(
        vllm_url=user_url,
        max_tokens=512,  # Request large output
        temperature=0.7,
        timeout=None,
        max_concurrent=10,
    )

    # One near-limit prompt (~2000 tokens) plus one trivial prompt.
    filler = "This is a test. " * 500
    batch = [
        [{"role": "user", "content": f"Summarize: {filler}"}],  # Will hit limit
        [{"role": "user", "content": "Say hello."}],  # Should succeed
    ]

    print("Testing with 1 long + 1 short request...")
    replies = ctx_client.batch_completion(batch)

    # The long request may fail or get reduced max_tokens; the short one
    # must not be affected by its neighbor.
    if replies[1] is None:
        print("✗ Short request should not have failed")
        return False

    print("✓ Short request succeeded despite long request")
    print(f"  Long request result: {'OK' if replies[0] else 'Handled gracefully'}")
    return True
+
+
def test_4_error_recovery(user_url: str, agent_url: str):
    """Test 4: Error recovery in batch processing."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 4: Error Recovery (Partial Failures)")
    print(banner)

    convo_gen = BatchConversationGenerator(
        user_vllm_url=user_url,
        agent_vllm_url=agent_url,
        max_turns=3,
        user_max_tokens=256,
        agent_max_tokens=256,
    )

    # Mix of valid and problematic samples
    batch = [
        {"problem": "What is 2+2?", "solution": "4"},
        {"problem": "What is 3+3?", "solution": "6"},
        {"problem": "What is 4+4?", "solution": "8"},
    ]

    print("Testing batch generation with 3 samples, 3 turns...")
    t0 = time.time()
    outcomes = convo_gen.generate_batch(
        samples=batch,
        user_persona="A student.",
        user_preferences=None,
    )
    took = time.time() - t0

    ok_count = sum(r is not None for r in outcomes)
    print("\nResults:")
    print(f"  Successes: {ok_count}/{len(batch)}")
    print(f"  Time: {took:.1f}s")

    # Two out of three conversations is enough to call recovery working.
    passed = ok_count >= 2
    print("✓ Error recovery PASSED" if passed else "✗ Error recovery FAILED")
    return passed
+
+
def test_5_sequential_profile(user_url: str, agent_url: str):
    """Test 5: Sequential profile processing (simulating RAG/reflection)."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 5: Sequential Profile Processing (RAG/Reflection Simulation)")
    print(banner)

    # Simulate 3 profiles with 2 sequential sessions each — this mirrors
    # how RAG/reflection methods run: sequential within a profile.
    convo_gen = BatchConversationGenerator(
        user_vllm_url=user_url,
        agent_vllm_url=agent_url,
        max_turns=2,
        user_max_tokens=256,
        agent_max_tokens=256,
    )

    n_profiles = 3
    sessions_per_profile = 2
    total_time = 0
    total_sessions = 0

    for p in range(n_profiles):
        t0 = time.time()

        # Sessions for one profile run back-to-back, never in parallel.
        for s in range(sessions_per_profile):
            sample = {
                "problem": f"Profile {p+1}, Session {s+1}: What is {p+s}+1?",
                "solution": str(p + s + 1),
            }

            outcome = convo_gen.generate_batch(
                samples=[sample],
                user_persona=f"User profile {p+1}",
                user_preferences="Be concise.",
            )

            if outcome[0]:
                total_sessions += 1

        took = time.time() - t0
        total_time += took
        print(f"  Profile {p+1}: {took:.1f}s for {sessions_per_profile} sessions")

    print("\nResults:")
    print(f"  Total sessions: {total_sessions}/{n_profiles * sessions_per_profile}")
    print(f"  Total time: {total_time:.1f}s")
    print(f"  Throughput: {total_sessions * 3600 / total_time:.0f} sessions/hr")

    # 80% session completion counts as a pass.
    passed = total_sessions >= n_profiles * sessions_per_profile * 0.8
    print("✓ Sequential profile test PASSED" if passed else "✗ Sequential profile test FAILED")
    return passed
+
+
def test_6_memory_estimation():
    """Test 6: Memory usage estimation."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 6: Memory Usage Estimation")
    print(banner)

    try:
        import subprocess
        query = subprocess.run(
            ['nvidia-smi', '--query-gpu=index,memory.used,memory.total', '--format=csv,noheader,nounits'],
            capture_output=True, text=True
        )

        print("GPU Memory Usage:")
        for row in query.stdout.strip().split('\n'):
            fields = row.split(', ')
            if len(fields) != 3:
                # Skip blank/malformed lines from nvidia-smi output.
                continue
            idx, used, total = fields
            pct = float(used) / float(total) * 100
            print(f"  GPU {idx}: {used}/{total} MiB ({pct:.1f}%)")

        print("✓ Memory estimation completed")
        return True
    except Exception as e:
        # Covers nvidia-smi being absent as well as parse failures.
        print(f"✗ Could not get memory info: {e}")
        return False
+
+
def run_all_tests(user_url: str, agent_url: str):
    """Run all pre-flight tests."""
    banner = "=" * 60
    print("\n" + banner)
    print("PRE-FLIGHT TESTS FOR FULL EXPERIMENTS")
    print(banner)
    print(f"User URL: {user_url}")
    print(f"Agent URL: {agent_url}")
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Dict literals evaluate values in order, so the tests still run 1..6.
    results = {
        'timeout': test_1_timeout_handling(user_url),
        'large_batch': test_2_large_batch(user_url, batch_size=50),
        'context_length': test_3_context_length_handling(user_url),
        'error_recovery': test_4_error_recovery(user_url, agent_url),
        'sequential_profile': test_5_sequential_profile(user_url, agent_url),
        'memory': test_6_memory_estimation(),
    }

    # Summary
    print("\n" + banner)
    print("PRE-FLIGHT TEST SUMMARY")
    print(banner)

    all_passed = all(results.values())
    for test_name, passed in results.items():
        print(f"  {test_name}: {'✓ PASSED' if passed else '✗ FAILED'}")

    print()
    if all_passed:
        print("✓ ALL TESTS PASSED - Ready for full experiments!")
    else:
        print("✗ SOME TESTS FAILED - Review before proceeding")

    return all_passed
+
+
+if __name__ == "__main__":
+ user_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8004/v1"
+ agent_url = sys.argv[2] if len(sys.argv) > 2 else "http://localhost:8003/v1"
+
+ success = run_all_tests(user_url, agent_url)
+ sys.exit(0 if success else 1)
diff --git a/collaborativeagents/scripts/quick_rag_debug.sbatch b/collaborativeagents/scripts/quick_rag_debug.sbatch
new file mode 100644
index 0000000..efc4c31
--- /dev/null
+++ b/collaborativeagents/scripts/quick_rag_debug.sbatch
@@ -0,0 +1,78 @@
#!/bin/bash
#SBATCH --job-name=rag_debug
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=00:40:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.err

# Quick debug test: 2 profiles, 5 sessions - should complete in ~20 min

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

echo "=== Quick RAG Debug Test ==="
echo "2 profiles, 5 sessions - checking if extraction/storage works"
date

# Clear empty store
# (truncate the memory-card JSONL and delete cached embeddings so the RAG
# run starts with an empty memory)
> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

# Start vLLM servers
# 70B user simulator on GPUs 0-1 (port 8004) and 8B agent on GPUs 2-3
# (port 8003), both TP=2, launched in the background.
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $USER_MODEL \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $AGENT_MODEL \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
    --max-model-len 16384 --dtype bfloat16 &

# Poll the /health endpoints every 5s: up to ~16 min for the 70B user
# model, up to ~5 min for the 8B agent.
echo "Waiting for vLLM servers..."
for i in {1..200}; do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "User simulator ready after $((i*5))s"
        break
    fi
    sleep 5
done
for i in {1..60}; do
    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
        echo "Agent ready after $((i*5))s"
        break
    fi
    sleep 5
done
sleep 5

OUTPUT_DIR="../results/rag_debug_$(date +%Y%m%d_%H%M%S)"

# Only test RAG to see debug output
echo "============================================"
echo "Testing RAG with debug output"
echo "============================================"

python scripts/run_experiments.py --methods rag \
    --datasets math-hard --n-profiles 2 --n-sessions 5 --max-turns 10 \
    --use-vllm --no-batch-processing --parallel-profiles 2 \
    --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH

# Line count of the card store shows whether extraction actually wrote cards.
echo "Memory cards in file: $(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl)"
echo "=== Done ==="
date

# Best-effort cleanup of the background vLLM servers.
pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/quick_test_a100.sbatch b/collaborativeagents/scripts/quick_test_a100.sbatch
new file mode 100644
index 0000000..0d823f5
--- /dev/null
+++ b/collaborativeagents/scripts/quick_test_a100.sbatch
@@ -0,0 +1,136 @@
#!/bin/bash
#SBATCH --job-name=quick_batch_a100
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=01:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_a100-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_a100-%j.err

# Quick test: 10 profiles × 5 sessions = 50 sessions on A100
# Tests batch (vanilla) processing while H200 queue is busy

# Abort the job on the first failing foreground command.
set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

echo "============================================"
echo "Quick Test: Batch Processing on A100"
echo "============================================"
echo "Profiles: 10"
echo "Sessions/profile: 5"
echo "Total: 50 sessions"
echo ""
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start vLLM servers
# Same 8B checkpoint serves both roles: user simulator on GPUs 0-1,
# agent on GPUs 2-3, each with TP=2, launched in the background.
echo "Starting 8B user simulator (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

echo "Starting 8B agent (GPU 2-3, TP=2)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

# Poll both /health endpoints every 3s (up to ~5 minutes total).
echo "Waiting for servers..."
for i in $(seq 1 100); do
    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((i*3))s"
        break
    fi
    if [ $((i % 20)) -eq 0 ]; then
        echo "  Still waiting... ($((i*3))s)"
    fi
    sleep 3
done

# Final health check: bail out (and reap the servers) if either never came up.
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "Both servers healthy"
echo ""

# Run quick test with vanilla (batch)
echo "============================================"
echo "Test: BATCH processing (vanilla method)"
echo "============================================"
START=$(date +%s)

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

python scripts/run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles 10 \
    --n-sessions 5 \
    --use-vllm \
    --batch-size 50 \
    --parallel-profiles 10 \
    --output-dir ../results/quick_test_batch_a100 \
    --profile-path "$PROFILE_PATH"

END=$(date +%s)
ELAPSED=$((END-START))
echo ""
echo "Vanilla (batch) completed in ${ELAPSED}s"

# Cleanup
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
echo "============================================"
echo "QUICK TEST RESULTS (A100)"
echo "============================================"
echo ""
echo "Vanilla (BATCH): ${ELAPSED}s for 50 sessions"
echo ""

# Guard against division by zero when the run finished within one second.
if [ $ELAPSED -gt 0 ]; then
    THROUGHPUT=$((50 * 3600 / ELAPSED))
    echo "Throughput: ${THROUGHPUT} sessions/hr"
fi

echo ""
echo "Results saved to: ../results/quick_test_batch_a100/"
echo ""
date
diff --git a/collaborativeagents/scripts/quick_test_batch.sh b/collaborativeagents/scripts/quick_test_batch.sh
new file mode 100755
index 0000000..4be6573
--- /dev/null
+++ b/collaborativeagents/scripts/quick_test_batch.sh
@@ -0,0 +1,137 @@
#!/bin/bash
# Quick test: 10 profiles × 5 sessions = 50 sessions
# Tests both batch (vanilla) and sequential (rag) processing
# NOTE(review): the sequential (RAG) leg is currently skipped below, so
# ELAPSED_SEQ stays 0 and no quick_test_sequential results are produced.

# Abort on the first failing foreground command.
set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

echo "============================================"
echo "Quick Test: Batch Processing Verification"
echo "============================================"
echo "Profiles: 10"
echo "Sessions/profile: 5"
echo "Total: 50 sessions"
echo ""
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start vLLM servers
# Same 8B checkpoint serves both roles: user simulator on GPUs 0-1,
# agent on GPUs 2-3, each with TP=2, launched in the background.
echo "Starting 8B user simulator (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

echo "Starting 8B agent (GPU 2-3, TP=2)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

# Poll both /health endpoints every 3s (up to ~5 minutes total).
echo "Waiting for servers..."
for i in $(seq 1 100); do
    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((i*3))s"
        break
    fi
    if [ $((i % 20)) -eq 0 ]; then
        echo "  Still waiting... ($((i*3))s)"
    fi
    sleep 3
done

# Final health check: bail out (and reap the servers) if either never came up.
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "✓ Both servers healthy"
echo ""

# Run quick test with vanilla (batch) and rag (sequential)
echo "============================================"
echo "Test 1: BATCH processing (vanilla method)"
echo "============================================"
START=$(date +%s)

# Use absolute path for profile (your 100 profiles with ~40 preferences each)
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_100.jsonl"

python scripts/run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles 10 \
    --n-sessions 5 \
    --use-vllm \
    --batch-size 50 \
    --parallel-profiles 10 \
    --output-dir ../results/quick_test_batch \
    --profile-path "$PROFILE_PATH"

END=$(date +%s)
ELAPSED_BATCH=$((END-START))
echo ""
echo "Vanilla (batch) completed in ${ELAPSED_BATCH}s"

ELAPSED_SEQ=0
# Skip sequential test for now - just validate batch processing works
echo ""
echo "Skipping Test 2 (sequential) for quick validation..."

# Cleanup
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
echo "============================================"
echo "QUICK TEST RESULTS"
echo "============================================"
echo ""
echo "Vanilla (BATCH): ${ELAPSED_BATCH}s for 50 sessions"
echo "RAG (SEQUENTIAL): ${ELAPSED_SEQ}s for 50 sessions"
echo ""

# Guard against division by zero; the SEQ branch is dormant while the
# sequential test above is skipped (ELAPSED_SEQ=0).
if [ $ELAPSED_BATCH -gt 0 ]; then
    THROUGHPUT_BATCH=$((50 * 3600 / ELAPSED_BATCH))
    echo "Vanilla throughput: ${THROUGHPUT_BATCH} sessions/hr"
fi
if [ $ELAPSED_SEQ -gt 0 ]; then
    THROUGHPUT_SEQ=$((50 * 3600 / ELAPSED_SEQ))
    echo "RAG throughput: ${THROUGHPUT_SEQ} sessions/hr"
fi

echo ""
echo "Results saved to:"
echo "  ../results/quick_test_batch/"
echo "  ../results/quick_test_sequential/"
echo ""
date
diff --git a/collaborativeagents/scripts/quick_test_h200.sbatch b/collaborativeagents/scripts/quick_test_h200.sbatch
new file mode 100644
index 0000000..a1f115d
--- /dev/null
+++ b/collaborativeagents/scripts/quick_test_h200.sbatch
@@ -0,0 +1,137 @@
#!/bin/bash
#SBATCH --job-name=quick_batch_test
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=01:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_test-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_test-%j.err

# Quick test: 10 profiles × 5 sessions = 50 sessions
# Tests batch (vanilla) processing on H200
#
# Topology: two 2-GPU vLLM servers on one node —
# user simulator on GPUs 0-1 (port 8004), agent on GPUs 2-3 (port 8003).
# set -e aborts the whole job if any command (including the experiment) fails.

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

echo "============================================"
echo "Quick Test: Batch Processing on H200"
echo "============================================"
echo "Profiles: 10"
echo "Sessions/profile: 5"
echo "Total: 50 sessions"
echo ""
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing servers (left over from a previous job on this node)
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start vLLM servers (backgrounded; PIDs kept for cleanup)
echo "Starting 8B user simulator (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 8192 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

echo "Starting 8B agent (GPU 2-3, TP=2)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 8192 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

# Poll both /health endpoints every 3s, up to 200 tries (~10 min total).
echo "Waiting for servers (may take 5-10 min for CUDA graph compilation)..."
for i in $(seq 1 200); do
    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((i*3))s"
        break
    fi
    if [ $((i % 20)) -eq 0 ]; then
        echo " Still waiting... ($((i*3))s)"
    fi
    sleep 3
done

# The wait loop exits silently on timeout, so re-verify both endpoints and
# hard-fail (killing the background servers) if either never came up.
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "Both servers healthy"
echo ""

# Run quick test with vanilla (batch)
echo "============================================"
echo "Test: BATCH processing (vanilla method)"
echo "============================================"
START=$(date +%s)

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_100.jsonl"

python scripts/run_experiments.py \
    --methods vanilla \
    --datasets math-hard \
    --n-profiles 10 \
    --n-sessions 5 \
    --max-turns 15 \
    --use-vllm \
    --batch-size 50 \
    --parallel-profiles 10 \
    --output-dir ../results/quick_test_batch_h200 \
    --profile-path "$PROFILE_PATH"

END=$(date +%s)
ELAPSED=$((END-START))
echo ""
echo "Vanilla (batch) completed in ${ELAPSED}s"

# Cleanup
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
echo "============================================"
echo "QUICK TEST RESULTS"
echo "============================================"
echo ""
echo "Vanilla (BATCH): ${ELAPSED}s for 50 sessions"
echo ""

# Throughput = 50 sessions scaled to one hour; guard avoids divide-by-zero.
if [ $ELAPSED -gt 0 ]; then
    THROUGHPUT=$((50 * 3600 / ELAPSED))
    echo "Throughput: ${THROUGHPUT} sessions/hr"
fi

echo ""
echo "Results saved to: ../results/quick_test_batch_h200/"
echo ""
date
diff --git a/collaborativeagents/scripts/rag_debug_interactive.sbatch b/collaborativeagents/scripts/rag_debug_interactive.sbatch
new file mode 100644
index 0000000..40a396c
--- /dev/null
+++ b/collaborativeagents/scripts/rag_debug_interactive.sbatch
@@ -0,0 +1,87 @@
#!/bin/bash
#SBATCH --job-name=rag_debug
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8-interactive
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=00:40:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.err

# Debug test on interactive partition: 5 profiles, 15 sessions
#
# Fix: the health-wait loops used to fall through silently on timeout, so the
# experiment could start against dead servers and fail with confusing errors.
# Both endpoints are now verified after the wait (same pattern as
# quick_test_h200.sbatch); on failure the background servers are killed and
# the job exits non-zero.

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

echo "=== RAG Debug Test (Interactive) ==="
echo "5 profiles, 15 sessions - with debug output"
date

# Clear empty store (truncate card file, drop stale embeddings)
> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

# Start vLLM servers: 70B user simulator on GPUs 0-1, 8B agent on GPUs 2-3.
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $USER_MODEL \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $AGENT_MODEL \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
    --max-model-len 16384 --dtype bfloat16 &

# Poll health endpoints every 5s (70B load is slow; up to ~17 min / ~5 min).
echo "Waiting for vLLM servers..."
for i in {1..200}; do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "User simulator ready after $((i*5))s"
        break
    fi
    sleep 5
done
for i in {1..60}; do
    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
        echo "Agent ready after $((i*5))s"
        break
    fi
    sleep 5
done

# Abort early (and kill the background servers) if either endpoint never
# became healthy — the loops above exit silently on timeout.
if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "ERROR: User simulator (port 8004) not healthy after wait"
    pkill -f "vllm.entrypoints" 2>/dev/null || true
    exit 1
fi
if ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "ERROR: Agent (port 8003) not healthy after wait"
    pkill -f "vllm.entrypoints" 2>/dev/null || true
    exit 1
fi
sleep 5

OUTPUT_DIR="../results/rag_debug_$(date +%Y%m%d_%H%M%S)"

for METHOD in vanilla rag rag_vector; do
    echo ""
    echo "============================================"
    echo "Testing: $METHOD"
    echo "============================================"

    # Clear memory store before each method so runs don't cross-contaminate
    > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
    rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

    date
    python scripts/run_experiments.py --methods $METHOD \
        --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
        --use-vllm --no-batch-processing --parallel-profiles 5 \
        --output-dir "$OUTPUT_DIR" --profile-path "$PROFILE_PATH"

    echo "Memory cards: $(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl)"
done

echo ""
echo "=== Done ==="
date

pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/rag_test_v4.sbatch b/collaborativeagents/scripts/rag_test_v4.sbatch
new file mode 100644
index 0000000..ab3c8f6
--- /dev/null
+++ b/collaborativeagents/scripts/rag_test_v4.sbatch
@@ -0,0 +1,92 @@
#!/bin/bash
#SBATCH --job-name=rag_test
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8-interactive
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=00:40:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.err

# Test with:
# 1. Skip reranking when few candidates
# 2. Reduced vLLM memory (0.35 for agent) to leave room for reranker
#
# Topology: 70B user simulator on GPUs 0-1 (port 8004),
# 8B agent on GPUs 2-3 (port 8003, lowered to 0.35 utilization).
# NOTE(review): unlike quick_test_h200.sbatch there is no post-wait health
# check — on server-start timeout the experiment still runs and fails later.

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

echo "=== RAG Test v4 ==="
echo "Changes: Skip rerank when <k candidates, reduced vLLM memory (0.35)"
echo "5 profiles, 15 sessions"
date

# Clear empty store (truncate memory cards, drop stale embedding cache)
> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

# Start vLLM servers with adjusted memory
# User simulator: 0.90 (unchanged)
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $USER_MODEL \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &

# Agent: reduced from 0.45 to 0.35 to leave room for reranker/embedding
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $AGENT_MODEL \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.35 \
    --max-model-len 16384 --dtype bfloat16 &

# Poll health endpoints every 5s (~17 min max for the 70B, ~5 min for the 8B)
echo "Waiting for vLLM servers..."
for i in {1..200}; do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "User simulator ready after $((i*5))s"
        break
    fi
    sleep 5
done
for i in {1..60}; do
    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
        echo "Agent ready after $((i*5))s"
        break
    fi
    sleep 5
done
sleep 5

OUTPUT_DIR="../results/rag_test_v4_$(date +%Y%m%d_%H%M%S)"

# Run all three methods into one timestamped output dir
for METHOD in vanilla rag rag_vector; do
    echo ""
    echo "============================================"
    echo "Testing: $METHOD"
    echo "============================================"

    # Clear memory store before each method
    > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
    rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

    date
    python scripts/run_experiments.py --methods $METHOD \
        --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
        --use-vllm --no-batch-processing --parallel-profiles 5 \
        --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH

    echo "Method $METHOD completed"
done

echo ""
echo "=== Done ==="
date

pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/rag_test_v5.sbatch b/collaborativeagents/scripts/rag_test_v5.sbatch
new file mode 100644
index 0000000..d739253
--- /dev/null
+++ b/collaborativeagents/scripts/rag_test_v5.sbatch
@@ -0,0 +1,96 @@
#!/bin/bash
#SBATCH --job-name=rag_test
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8-interactive
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=00:40:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.err

# Test with explicit device assignment:
# - vLLM user sim: GPUs 0,1
# - vLLM agent: GPUs 2,3 (0.45 memory)
# - Embedding: cuda:2
# - Reranker: cuda:3
# - Extractor: cuda:2
#
# NOTE(review): the cuda:2/cuda:3 assignments listed above are presumably
# configured inside run_experiments.py / its config, not in this script —
# only the two vLLM servers are placed here. Confirm against the runner.

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

echo "=== RAG Test v5 ==="
echo "Explicit device assignment: embed->cuda:2, reranker->cuda:3, extractor->cuda:2"
echo "5 profiles, 15 sessions"
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Clear empty store (truncate memory cards, drop stale embedding cache)
> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

# Start vLLM servers
# User simulator: GPUs 0,1
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $USER_MODEL \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &

# Agent: GPUs 2,3 (restored to 0.45 since HF models now explicitly assigned)
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $AGENT_MODEL \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
    --max-model-len 16384 --dtype bfloat16 &

# Poll health endpoints every 5s; loops fall through silently on timeout.
echo "Waiting for vLLM servers..."
for i in {1..200}; do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "User simulator ready after $((i*5))s"
        break
    fi
    sleep 5
done
for i in {1..60}; do
    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
        echo "Agent ready after $((i*5))s"
        break
    fi
    sleep 5
done
sleep 5

OUTPUT_DIR="../results/rag_test_v5_$(date +%Y%m%d_%H%M%S)"

# Run all three methods into one timestamped output dir
for METHOD in vanilla rag rag_vector; do
    echo ""
    echo "============================================"
    echo "Testing: $METHOD"
    echo "============================================"

    # Clear memory store before each method
    > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
    rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

    date
    python scripts/run_experiments.py --methods $METHOD \
        --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
        --use-vllm --no-batch-processing --parallel-profiles 5 \
        --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH

    echo "Method $METHOD completed"
done

echo ""
echo "=== Done ==="
date

pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/run.py b/collaborativeagents/scripts/run.py
new file mode 100644
index 0000000..f6ed79e
--- /dev/null
+++ b/collaborativeagents/scripts/run.py
@@ -0,0 +1,504 @@
+import argparse
+import json
+import os
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from collaborativeagents.conversation_generator import ConversationGenerator
+from collaborativeagents.conversation_evaluator import ConversationEvaluator
+from collaborativeagents.datasets import datasets_info
+from collaborativeagents.agents import CollaboratorAgent,UserAgent
+from collaborativeagents.prompts import agent_system_prompt_no_user
+
+# import litellm
+# litellm._turn_on_debug()
+
+
def load_dataset(dataset_name, eval_size, training=False):
    """Build the dataset registered under *dataset_name* and return its samples.

    Returns:
        A ``(dataset, user_task_description)`` tuple.

    Raises:
        ValueError: if *dataset_name* is not registered in ``datasets_info``.
    """
    if dataset_name not in datasets_info:
        raise ValueError(f"Dataset '{dataset_name}' not found. Available datasets: {list(datasets_info.keys())}")

    entry = datasets_info[dataset_name]
    user_task_description = entry['task_description']
    dataset = entry['class'](eval_size=eval_size, training=training).get_dataset()
    print(f"Loaded {len(dataset)} samples from {dataset_name}")
    return dataset, user_task_description
+
def load_user_profiles(training=False, profiles_path=None):
    """Load user profiles from a JSON file.

    Args:
        training: when True, load the training-split profiles; otherwise the
            evaluation profiles. Ignored if *profiles_path* is given.
        profiles_path: optional explicit path to a profiles JSON file.
            Generalizes the previously hard-coded absolute paths while
            keeping the old call signature working unchanged.

    Returns:
        The deserialized profiles (a list of profile dicts).
    """
    if profiles_path is None:
        # Default locations on the shared cluster storage.
        base_dir = "/shared/storage-01/users/mehri2/mem/collaborativeagents/collaborativeagents/user_profiles"
        filename = "training_user_profiles.json" if training else "user_profiles.json"
        profiles_path = os.path.join(base_dir, filename)
    with open(profiles_path, 'r') as f:
        return json.load(f)
+
+
def run_no_user(
        dataset_name="math-hard",
        eval_size=20,
        batch_size=50,
        collaborator_model_name="gpt-4.1-mini",
        collaborator_api_base=None,
        collaborator_api_key=None,
        judge_model_name="gpt-4.1-mini",
        judge_api_base=None,
        judge_api_key=None,
        output_file=None
        ):
    """Generate one-shot collaborator answers (no user simulator) and evaluate.

    If *output_file* already holds at least one result record, the stored
    results are reprinted and nothing is regenerated (resume behavior).
    Otherwise answers are generated in batches, evaluated by the judge
    model, and one aggregate JSON record is appended to *output_file*.
    """
    # Resume path: a previous run already wrote an evaluation record.
    if os.path.exists(output_file):
        evaluation_results = []
        with open(output_file, 'r') as f:
            for line in f:
                if line.strip() == "":
                    continue
                evaluation_results.append(json.loads(line))
        if evaluation_results:
            print(f"\n\n\nAll conversations generation and evaluation complete!")
            print(f" # Total conversations: {len(evaluation_results)}")
            print("\nEvaluation Results:")
            print(f" # Overall average accuracy: {evaluation_results[0]['average_accuracy']}")
            print(f" # Overall average conversation length (# messages): {evaluation_results[0]['average_conversation_length']}")
            return
        # Fix: an existing-but-empty output file previously crashed with
        # IndexError on evaluation_results[0]; fall through and regenerate.

    dataset, _ = load_dataset(dataset_name, eval_size)

    collaborator_agent = CollaboratorAgent(
        model_name=collaborator_model_name,
        api_base=collaborator_api_base,
        api_key=collaborator_api_key,
    )
    conversationEvaluator = ConversationEvaluator(
        dataset_name=dataset_name,
        model_name=judge_model_name,
        api_base=judge_api_base,
        api_key=judge_api_key
    )

    # Generate and evaluate conversations
    print(f"\n\n\nGenerating answers for {len(dataset)} {dataset_name} samples\n")
    generated_conversations = []

    total_batches = (len(dataset) + batch_size - 1) // batch_size  # ceil division
    with tqdm(total=total_batches, desc="Generating conversations") as progress_bar:
        for i in range(0, len(dataset), batch_size):
            batch_samples = dataset[i:i+batch_size]
            # Prepare single-turn conversations for the collaborator
            batch_conversations = [[{"role": "user", "content": s['problem']}] for s in batch_samples]

            # Batched collaborator responses
            collab_responses = collaborator_agent.generate_collaborator_responses_batch(batch_conversations)

            # Assemble results
            for sample, conv, collab_response in zip(batch_samples, batch_conversations, collab_responses):
                if collab_response is None:
                    # Skip failed items; they will be counted downstream if needed
                    continue
                conv.append({"role": "assistant", "content": str(collab_response["response"])})

                # Add draft_answer key for evaluator compatibility
                collab_response["draft_answer"] = collab_response["response"]
                full_conversation_log = [collab_response]

                res = {
                    "sample": sample,
                    "conversation": conv,
                    "full_conversation_log": full_conversation_log
                }
                generated_conversations.append(res)

            progress_bar.update(1)

    evaluation_results = conversationEvaluator.evaluate_conversations(generated_conversations)

    # Append the aggregate record as one JSONL line.
    with open(output_file, 'a') as f:
        f.write(json.dumps(evaluation_results) + "\n")
        f.flush()

    print(f"\n\n\nAll conversations generation and evaluation complete!")
    print(f" # Total conversations: {len(generated_conversations)}")
    print("\nEvaluation Results:")
    print(f" # Overall average accuracy: {evaluation_results['average_accuracy']}")
    print(f" # Overall average conversation length (# messages): {evaluation_results['average_conversation_length']}")
+
def run_user_no_profile(
        dataset_name="math-hard",
        eval_size=20,
        max_turns=10,
        batch_size=100,
        user_model_name="gpt-4.1-mini",
        user_api_base=None,
        user_api_key=None,
        collaborator_model_name="gpt-4.1-mini",
        collaborator_api_base=None,
        collaborator_api_key=None,
        judge_model_name="gpt-4.1-mini",
        judge_api_base=None,
        judge_api_key=None,
        output_file=None
        ):
    """Generate multi-turn conversations with a preference-free user simulator.

    The user simulator has no persona/preferences and the agent receives no
    preference information. Existing results in *output_file* are reprinted
    instead of regenerating (resume behavior); otherwise conversations are
    generated, judged, and one aggregate JSONL record is appended.
    """
    # Resume path: a previous run already wrote an evaluation record.
    if os.path.exists(output_file):
        evaluation_results = []
        with open(output_file, 'r') as f:
            for line in f:
                if line.strip() == "":
                    continue
                evaluation_results.append(json.loads(line))
        if evaluation_results:
            print(f"\n\n\nAll conversations generation and evaluation complete!")
            print(f" # Total conversations: {len(evaluation_results)}")
            print("\nEvaluation Results:")
            print(f" # Overall average accuracy: {evaluation_results[0]['average_accuracy']}")
            print(f" # Overall average conversation length (# messages): {evaluation_results[0]['average_conversation_length']}")
            return
        # Fix: an existing-but-empty output file previously crashed with
        # IndexError on evaluation_results[0]; fall through and regenerate.

    dataset, user_task_description = load_dataset(dataset_name, eval_size)

    print(f"\n\n\nStarting generation conversations for user no preferences\n")

    conversationGenerator = ConversationGenerator(
        user_task_description=user_task_description,
        user_persona=None,
        user_preferences=None,
        max_turns=max_turns,
        agent_with_user_preferences=False,
        batch_size=batch_size,
        user_model_name=user_model_name,
        user_api_base=user_api_base,
        user_api_key=user_api_key,
        collaborator_model_name=collaborator_model_name,
        collaborator_api_base=collaborator_api_base,
        collaborator_api_key=collaborator_api_key
    )
    generated_conversations = conversationGenerator.generate_conversations_parallel(dataset)

    conversationEvaluator = ConversationEvaluator(
        dataset_name=dataset_name,
        model_name=judge_model_name,
        api_base=judge_api_base,
        api_key=judge_api_key
    )
    evaluation_results = conversationEvaluator.evaluate_conversations(generated_conversations)

    # Append the aggregate record as one JSONL line.
    with open(output_file, 'a') as f:
        f.write(json.dumps(evaluation_results) + "\n")
        f.flush()

    print(f"\n\n\nAll conversations generation and evaluation complete!")
    print(f" # Total conversations: {len(generated_conversations)}")
    print("\nEvaluation Results:")
    print(f" # Overall average accuracy: {evaluation_results['average_accuracy']}")
    print(f" # Overall average conversation length (# messages): {evaluation_results['average_conversation_length']}")
+
def run_user_profiles(
        dataset_name="math-hard",
        training=False,
        user_profiles=None,
        user_with_preferences=False,
        agent_with_user_preferences=False,
        agent_with_reflection=False,
        with_scaffolding=False,
        with_proper_scaffolding=False,
        eval_size=20,
        max_turns=10,
        batch_size=100,
        user_model_name="gpt-4.1-mini",
        user_api_base=None,
        user_api_key=None,
        collaborator_model_name="gpt-4.1-mini",
        collaborator_api_base=None,
        collaborator_api_key=None,
        judge_model_name="gpt-4.1-mini",
        judge_api_base=None,
        judge_api_key=None,
        output_file=None
        ):
    """Run conversation generation + evaluation for each user profile.

    Profiles already present in *output_file* (matched by their "i" key) are
    skipped, so interrupted runs can be resumed. Remaining profiles are
    processed in thread-pool batches of *batch_size*; each finished profile
    is appended to *output_file* as one JSONL record. Finally, aggregate
    accuracy / length / enforced-preference statistics are printed.
    """
    dataset, user_task_description = load_dataset(dataset_name, eval_size, training=training)

    # Resume support: collect already-processed profiles from output_file.
    generated_user_sessions = []
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            seen_users = set()
            for line in f:
                if line.strip() == "":
                    continue
                curr_result = json.loads(line)
                seen_users.add(curr_result["i"])
                generated_user_sessions.append(curr_result)
        user_profiles = [user_profile_elem for user_profile_elem in user_profiles if user_profile_elem["i"] not in seen_users]

    def generate_and_evaluate_single_user_profile(user_profile_elem):
        # Generate all sessions for one profile, evaluate them, and attach
        # both conversations and evaluation to the profile dict (mutated).
        user_profile_i = user_profile_elem["i"]
        user_persona = user_profile_elem["persona"]
        if user_with_preferences:
            user_preferences = "\n".join([f"{i+1}. {pref}" for i, pref in enumerate(user_profile_elem["preferences"])])
        else:
            user_preferences = None

        # Common generator kwargs; the reflection variant adds scaffolding flags.
        generator_kwargs = dict(
            user_task_description=user_task_description,
            user_persona=user_persona,
            user_preferences=user_preferences,
            agent_with_user_preferences=agent_with_user_preferences,
            max_turns=max_turns,
            batch_size=batch_size,
            user_model_name=user_model_name,
            user_api_base=user_api_base,
            user_api_key=user_api_key,
            collaborator_model_name=collaborator_model_name,
            collaborator_api_base=collaborator_api_base,
            collaborator_api_key=collaborator_api_key
        )

        print(f"Starting generation conversation sessions for User {user_profile_i}")
        if agent_with_reflection:
            conversationGenerator = ConversationGenerator(
                with_scaffolding=with_scaffolding,
                with_proper_scaffolding=with_proper_scaffolding,
                **generator_kwargs
            )
            generated_conversations = conversationGenerator.generate_conversations_with_reflective_agent(dataset, training=training)
        else:
            conversationGenerator = ConversationGenerator(**generator_kwargs)
            generated_conversations = conversationGenerator.generate_conversations_parallel(dataset)
        print(f"Finished generation conversation sessions for User {user_profile_i}")
        print(f" # succeeded user conversation sessions: {len(generated_conversations)}")
        print(f" # failed user conversation sessions: {len(dataset) - len(generated_conversations)}")

        # Evaluate conversations
        conversationEvaluator = ConversationEvaluator(
            dataset_name=dataset_name,
            model_name=judge_model_name,
            api_base=judge_api_base,
            api_key=judge_api_key
        )
        evaluation_results = conversationEvaluator.evaluate_conversations(generated_conversations)
        user_profile_elem["generated_conversations"] = generated_conversations
        user_profile_elem["evaluation"] = evaluation_results

        return user_profile_elem

    # Process profiles in batches; stream each finished record to disk so a
    # crash loses at most the in-flight batch.
    with open(output_file, 'a') as f:
        with tqdm(total=len(user_profiles), desc="Processing user profiles") as progress_bar:
            for i in range(0, len(user_profiles), batch_size):
                batch = user_profiles[i:i+batch_size]

                with ThreadPoolExecutor(max_workers=min(batch_size, len(batch))) as executor:
                    futures_to_profile = {
                        executor.submit(generate_and_evaluate_single_user_profile, user_profile_elem): user_profile_elem
                        for user_profile_elem in batch
                    }

                    for future in as_completed(futures_to_profile):
                        curr_result = future.result()
                        generated_user_sessions.append(curr_result)

                        f.write(json.dumps(curr_result) + "\n")
                        f.flush()

                        progress_bar.update(1)

    # Fix: previously an empty session list crashed with ZeroDivisionError.
    if not generated_user_sessions:
        print("\n\n\nNo user sessions were generated; nothing to aggregate.")
        return

    # Aggregate evaluation results from all user sessions
    avg_accuracy = sum(user_session['evaluation']['average_accuracy'] for user_session in generated_user_sessions) / len(generated_user_sessions)
    avg_length = sum(user_session['evaluation']['average_conversation_length'] for user_session in generated_user_sessions) / len(generated_user_sessions)

    num_enforced_preferences_per_conversation = []
    for generated_user_session in generated_user_sessions:
        for generated_conversation in generated_user_session['generated_conversations']:
            curr_num_enforced_preferences = 0
            for message in generated_conversation['full_conversation_log']:
                # The generator may emit the flag as a bool or as the string "True".
                if message.get("enforce_preferences") in (True, "True"):
                    curr_num_enforced_preferences += 1
            num_enforced_preferences_per_conversation.append(curr_num_enforced_preferences)

    print(f"\n\n\nAll user profiles generation and evaluation complete!")
    print(f" # Total user profiles processed: {len(generated_user_sessions)}")
    print(f" # Total conversations: {sum(len(user_session['generated_conversations']) for user_session in generated_user_sessions)}")
    print("\nEvaluation Results:")
    print(f" # Overall average accuracy: {avg_accuracy}")
    print(f" # Overall average conversation length (# messages): {avg_length}")
    # Fix: guard the division — previously crashed when no conversations existed.
    if num_enforced_preferences_per_conversation:
        print(f" # Overall average number of enforced preferences: {sum(num_enforced_preferences_per_conversation) / len(num_enforced_preferences_per_conversation)}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--experiment_type", type=str)
+ parser.add_argument("--dataset", type=str)
+ parser.add_argument("--eval_size", type=int)
+ parser.add_argument("--output_file", type=str)
+ parser.add_argument("--max_turns", type=int)
+ parser.add_argument("--batch_size", type=int)
+ parser.add_argument("--user_model_name", type=str)
+ parser.add_argument("--user_api_base", type=str)
+ parser.add_argument("--user_api_key", type=str)
+ parser.add_argument("--collaborator_model_name", type=str)
+ parser.add_argument("--collaborator_api_base", type=str)
+ parser.add_argument("--collaborator_api_key", type=str)
+ parser.add_argument("--judge_model_name", type=str)
+ parser.add_argument("--judge_api_base", type=str)
+ parser.add_argument("--judge_api_key", type=str)
+ args = parser.parse_args()
+
+ if args.experiment_type == "no_user":
+ run_no_user(
+ dataset_name=args.dataset,
+ eval_size=args.eval_size,
+ batch_size=args.batch_size,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "user_no_profile":
+ run_user_no_profile(
+ dataset_name=args.dataset,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "user_profiles_without_preferences":
+ user_profiles = load_user_profiles()
+ run_user_profiles(
+ dataset_name=args.dataset,
+ training=False,
+ user_profiles=user_profiles,
+ user_with_preferences=False,
+ agent_with_reflection=False,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ batch_size=args.batch_size,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "user_profiles_with_preferences":
+ user_profiles = load_user_profiles()
+ run_user_profiles(
+ dataset_name=args.dataset,
+ training=False,
+ user_profiles=user_profiles,
+ user_with_preferences=True,
+ agent_with_reflection=False,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ batch_size=args.batch_size,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "agent_with_user_preferences":
+ user_profiles = load_user_profiles()
+ run_user_profiles(
+ dataset_name=args.dataset,
+ training=False,
+ user_profiles=user_profiles,
+ user_with_preferences=True,
+ agent_with_user_preferences=True,
+ agent_with_reflection=False,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ batch_size=args.batch_size,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "agent_with_reflection":
+ user_profiles = load_user_profiles()
+ run_user_profiles(
+ dataset_name=args.dataset,
+ training=False,
+ user_profiles=user_profiles,
+ user_with_preferences=True,
+ agent_with_user_preferences=True,
+ agent_with_reflection=True,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ batch_size=args.batch_size,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "agent_with_reflection_and_scaffolding":
+ user_profiles = load_user_profiles()
+ run_user_profiles(
+ dataset_name=args.dataset,
+ training=False,
+ user_profiles=user_profiles,
+ user_with_preferences=True,
+ agent_with_user_preferences=True,
+ agent_with_reflection=True,
+ with_scaffolding=True,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ batch_size=args.batch_size,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "agent_with_reflection_and_proper_scaffolding":
+ user_profiles = load_user_profiles()
+ run_user_profiles(
+ dataset_name=args.dataset,
+ training=False,
+ user_profiles=user_profiles,
+ user_with_preferences=True,
+ agent_with_user_preferences=True,
+ agent_with_reflection=True,
+ with_scaffolding=True,
+ with_proper_scaffolding=True,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ batch_size=args.batch_size,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ elif args.experiment_type == "training_data_with_user_profiles_with_preferences":
+ user_profiles = load_user_profiles(training=True)
+ run_user_profiles(
+ dataset_name=args.dataset,
+ training=True,
+ user_profiles=user_profiles,
+ user_with_preferences=True,
+ agent_with_user_preferences=True,
+ agent_with_reflection=True,
+ eval_size=args.eval_size,
+ max_turns=args.max_turns,
+ batch_size=args.batch_size,
+ user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key,
+ collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key,
+ judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key,
+ output_file=args.output_file
+ )
+ else:
+ raise ValueError(f"Invalid experiment type: {args.experiment_type}") \ No newline at end of file
diff --git a/collaborativeagents/scripts/run.sh b/collaborativeagents/scripts/run.sh
new file mode 100644
index 0000000..87d9234
--- /dev/null
+++ b/collaborativeagents/scripts/run.sh
@@ -0,0 +1,98 @@
#!/bin/bash
# Experiment launcher: sweeps run.py over a grid of datasets and eval sizes.
# Start one of the model servers below (in a separate shell) before running.
#
# vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8004 --tensor-parallel-size 4 --max-model-len 16384 --gpu-memory-utilization 0.9
# python -m sglang.launch_server --model-path meta-llama/Llama-3.3-70B-Instruct --port 8004 --tp-size 4 --context-length 16384
# python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 8003 --tp-size 4 --context-length 16384
# python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --port 8003 --tp-size 4 --context-length 16384


# SFT Models
# python -m sglang.launch_server --model-path /shared/storage-01/users/mehri2/LLaMA-Factory/saves/llama-3.1-8b-instruct/full/sft_session_level_reflection/checkpoint-628 --served-model-name meta-llama/Llama-3.1-8B-Instruct --port 8003 --tp-size 4 --context-length 16384

# python -m sglang.launch_server --model-path /shared/storage-01/users/mehri2/LLaMA-Factory/saves/qwen2.5-7b/full/sft_session_level_reflection/checkpoint-628 --served-model-name Qwen/Qwen2.5-7B-Instruct --port 8003 --tp-size 4 --context-length 16384

# GRPO Models

# python -m verl.model_merger merge \
#     --backend fsdp \
#     --local_dir /shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/results/v3/global_step_200/actor \
#     --target_dir /shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/results/v3/global_step_200_merged_hf

# python -m sglang.launch_server --model-path /shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/results/v3/global_step_200_merged_hf --served-model-name meta-llama/Llama-3.1-8B-Instruct --port 8003 --tp-size 4 --context-length 16384


# NOTE(review): the first assignment was dead code (immediately overwritten);
# kept commented out for quick toggling, matching run_debug.sh's convention.
# BATCH_SIZE=100
BATCH_SIZE=50

# Root directory for the active experiments' .jsonl results and .out logs.
RESULTS_ROOT=/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b_grpo_v3_ckpt200

# Loop over eval sizes and datasets
for EVAL_SIZE in 20; do
    for DATASET in math-hard math-500 logiqa mmlu medqa; do # humaneval bigcodebench
        # Convert dataset name for file paths (replace - with _)
        DATASET_FILE=$(echo ${DATASET} | tr '-' '_')

        echo "Running experiments for dataset: ${DATASET} with eval_size ${EVAL_SIZE}"

        # Create output directories up front: the >> redirections below fail
        # immediately if the target directory does not exist yet.
        mkdir -p "${RESULTS_ROOT}/user_profiles_with_preferences" \
                 "${RESULTS_ROOT}/agent_with_reflection_and_proper_scaffolding"

        # # no_user experiment
        # python3 run.py --experiment_type no_user --dataset ${DATASET} --eval_size ${EVAL_SIZE} --batch_size ${BATCH_SIZE} \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/no_user/${DATASET_FILE}_llama70b_user_llama8b_agent_no_user_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_temp_1_llama8b/no_user/${DATASET_FILE}_llama70b_user_llama8b_agent_no_user_eval_size_${EVAL_SIZE}.out 2>&1

        # # user_no_profile experiment
        # python3 run.py --experiment_type user_no_profile --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/user_no_profile/${DATASET_FILE}_llama70b_user_llama8b_agent_user_no_profile_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_temp_1_llama8b/user_no_profile/${DATASET_FILE}_llama70b_user_llama8b_agent_user_no_profile_eval_size_${EVAL_SIZE}.out 2>&1

        # # user_profiles_without_preferences experiment
        # python3 run.py --experiment_type user_profiles_without_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_temp_1_llama8b/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.out 2>&1

        # user_profiles_with_preferences experiment
        python3 run.py --experiment_type user_profiles_with_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
            --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
            --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
            --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
            --output_file ${RESULTS_ROOT}/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.jsonl \
            >> ${RESULTS_ROOT}/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.out 2>&1

        # # agent_with_user_preferences experiment
        # python3 run.py --experiment_type agent_with_user_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.jsonl \
        #     >> ./runs/llama70b_temp_1_llama8b/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.out 2>&1

        # # agent_with_reflection experiment
        # python3 run.py --experiment_type agent_with_reflection --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/agent_with_reflection_v3/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_temp_1_llama8b/agent_with_reflection_v3/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.out 2>&1

        # # agent_with_reflection_and_scaffolding
        # python3 run.py --experiment_type agent_with_reflection_and_scaffolding --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/agent_with_reflection_and_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_scaffolding_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_temp_1_llama8b/agent_with_reflection_and_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_scaffolding_eval_size_${EVAL_SIZE}.out 2>&1

        # agent_with_reflection_and_proper_scaffolding
        python3 run.py --experiment_type agent_with_reflection_and_proper_scaffolding --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
            --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
            --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \
            --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
            --output_file ${RESULTS_ROOT}/agent_with_reflection_and_proper_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_proper_scaffolding_eval_size_${EVAL_SIZE}.jsonl \
            >> ${RESULTS_ROOT}/agent_with_reflection_and_proper_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_proper_scaffolding_eval_size_${EVAL_SIZE}.out 2>&1

    done
done
diff --git a/collaborativeagents/scripts/run_baseline_comparison.py b/collaborativeagents/scripts/run_baseline_comparison.py
new file mode 100644
index 0000000..0bdbcb5
--- /dev/null
+++ b/collaborativeagents/scripts/run_baseline_comparison.py
@@ -0,0 +1,608 @@
+"""
+Run baseline comparison experiments for personalization methods.
+
+Baselines:
+1. Vanilla - No memory
+2. Contextual Memory - Full history in context (summarize if exceeds limit)
+3. Reflection Memory - CollaborativeAgents' agent_notes approach
+4. Reflection + GRPO - Trained version of reflection
+5. All Memory Cards in Context - Extract all, no retrieval
+6. Extractor + RAG - Retrieval without user vector
+7. Extractor + RAG + User Vector - Full personalization
+
+Metrics:
+- Task Accuracy
+- User Effort (user token count)
+- Total Efficiency (all tokens)
+- Conflict Resolution Accuracy (new)
+- User Vector Similarity to Ground Truth (new)
+"""
+
+import json
+import time
+from pathlib import Path
+from dataclasses import dataclass, field, asdict
+from typing import Optional, Callable
+from abc import ABC, abstractmethod
+import numpy as np
+
+# ============================================================================
+# Metrics
+# ============================================================================
+
@dataclass
class ConversationMetrics:
    """Measurements collected from one simulated conversation."""
    task_accuracy: float                    # 0 or 1: did the final answer match?
    user_tokens: int                        # tokens in the user's messages
    assistant_tokens: int                   # tokens in the assistant's messages
    total_tokens: int                       # all tokens exchanged
    num_turns: int                          # conversation turns
    num_preference_enforcements: int        # times the user had to re-assert a preference
    conflict_resolution_correct: Optional[bool] = None  # set only for conflict-test runs
    latency_seconds: float = 0.0            # wall-clock duration of the conversation

    @property
    def user_effort(self) -> int:
        """Proxy for user effort: the user's own token count (lower is better)."""
        return self.user_tokens

    @property
    def efficiency(self) -> float:
        """Accuracy per thousand tokens (higher is better); 0.0 for empty conversations."""
        return 0.0 if self.total_tokens == 0 else self.task_accuracy / self.total_tokens * 1000
+
+
@dataclass
class ExperimentResults:
    """Running aggregation of per-conversation metrics for one baseline."""
    baseline_name: str      # which baseline produced these numbers
    num_conversations: int  # running count, incremented by the caller
    metrics: dict = field(default_factory=dict)  # metric name -> list of per-conversation values

    # Scalar attributes copied verbatim from every ConversationMetrics.
    _SCALAR_KEYS = ('task_accuracy', 'user_tokens', 'assistant_tokens',
                    'total_tokens', 'num_turns', 'num_preference_enforcements')

    def add_conversation(self, conv_metrics: ConversationMetrics):
        """Append one conversation's metrics to the per-metric lists."""
        for key in self._SCALAR_KEYS:
            self.metrics.setdefault(key, []).append(getattr(conv_metrics, key))

        resolved = conv_metrics.conflict_resolution_correct
        if resolved is not None:
            # Stored as 1.0/0.0 so the mean is the conflict-resolution accuracy.
            self.metrics.setdefault('conflict_resolution_correct', []).append(
                1.0 if resolved else 0.0
            )

    def summary(self) -> dict:
        """Return mean and std for every recorded metric, plus identifying fields."""
        stats = {"baseline": self.baseline_name, "n": self.num_conversations}
        for key, values in self.metrics.items():
            if not values:
                continue
            stats[f"{key}_mean"] = np.mean(values)
            stats[f"{key}_std"] = np.std(values)
        return stats
+
+
+# ============================================================================
+# Baseline Implementations (Abstract)
+# ============================================================================
+
class BaselineMethod(ABC):
    """Abstract base class for all baseline memory/personalization methods.

    Subclasses implement the session lifecycle: initialize_session ->
    generate_response / update_memory, with get_context_for_prompt supplying
    the memory text to inject into prompts.
    """

    def __init__(self, name: str, config: dict = None):
        # `name` identifies the baseline in results tables; `config` carries
        # method-specific options (defaults to an empty dict).
        self.name = name
        self.config = config or {}

    @abstractmethod
    def initialize_session(self, user_id: str, user_profile: dict):
        """Initialize a new session for a user."""
        pass

    @abstractmethod
    def generate_response(self, query: str, conversation_history: list) -> str:
        """Generate a response given query and history."""
        pass

    @abstractmethod
    def update_memory(self, conversation: list, feedback: dict = None):
        """Update memory after a conversation or turn."""
        pass

    @abstractmethod
    def get_context_for_prompt(self) -> str:
        """Get the memory/context to include in prompts."""
        pass

    def count_tokens(self, text: str) -> int:
        """Estimate the token count of `text`.

        Rough heuristic: ~1.3 tokens per whitespace-separated word. The
        result is truncated to int so the return type matches the annotation
        (the previous version returned a float despite `-> int`).
        """
        return int(len(text.split()) * 1.3)
+
+
class VanillaBaseline(BaselineMethod):
    """Stateless baseline: every query is answered with a fresh context."""

    def __init__(self):
        super().__init__("vanilla")

    def initialize_session(self, user_id: str, user_profile: dict):
        # Only record who we are talking to; there is no memory to set up.
        self.user_id = user_id

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Placeholder: the actual LLM call is wired in by the runner.
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Stateless by design: nothing to persist.
        pass

    def get_context_for_prompt(self) -> str:
        # No memory means no extra context.
        return ""
+
+
class ContextualMemoryBaseline(BaselineMethod):
    """Keeps the full conversation history in the prompt context.

    When the accumulated history exceeds the token budget, older turns are
    summarized (placeholder) and evicted — the information loss inherent to
    this baseline happens there.
    """

    def __init__(self, max_context_tokens: int = 32000):
        super().__init__("contextual_memory")
        self.max_context_tokens = max_context_tokens
        self.full_history = []        # verbatim messages, oldest first
        self.summarized_history = ""  # summary text for evicted messages

    def initialize_session(self, user_id: str, user_profile: dict):
        # History is deliberately carried over across sessions.
        self.user_id = user_id

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Placeholder: the actual LLM call is wired in by the runner.
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        self.full_history.extend(conversation)

        # Summarize once the verbatim history outgrows the context budget.
        history_tokens = sum(self.count_tokens(m['content']) for m in self.full_history)
        if history_tokens > self.max_context_tokens:
            self._summarize_old_history()

    def _summarize_old_history(self):
        """Collapse older turns so the history fits the context budget."""
        tail_size = 10  # most-recent turns kept verbatim
        older, recent = self.full_history[:-tail_size], self.full_history[-tail_size:]

        # Placeholder for the summarization call over `older`:
        # self.summarized_history = summarize_with_llm(older)
        self.full_history = recent

    def get_context_for_prompt(self) -> str:
        pieces = []
        if self.summarized_history:
            pieces.append(f"Previous conversation summary:\n{self.summarized_history}\n\n")
        pieces.append("Recent conversation:\n")
        for msg in self.full_history[-20:]:  # only the last 20 messages
            pieces.append(f"{msg['role']}: {msg['content']}\n")
        return "".join(pieces)
+
+
class ReflectionMemoryBaseline(BaselineMethod):
    """CollaborativeAgents-style memory: free-text agent notes per user,
    updated by reflecting after each conversation."""

    def __init__(self):
        super().__init__("reflection_memory")
        self.agent_notes = {}  # user_id -> accumulated notes string

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id
        self.agent_notes.setdefault(user_id, "")

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Placeholder: the actual LLM call is wired in by the runner.
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Placeholder for the reflection step (their update_agent_notes_prompt
        # approach) that would rewrite self.agent_notes[self.user_id].
        pass

    def get_context_for_prompt(self) -> str:
        return f"Notes about this user:\n{self.agent_notes.get(self.user_id, '')}"
+
+
class AllMemoryCardsBaseline(BaselineMethod):
    """Extracts preferences into memory cards but puts ALL of them in the
    prompt — no retrieval step. The indiscriminate dump is the deliberate
    weakness of this baseline."""

    def __init__(self, max_cards_in_context: int = 100):
        super().__init__("all_memory_cards")
        self.max_cards = max_cards_in_context
        self.memory_cards = {}  # user_id -> list of cards

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id
        self.memory_cards.setdefault(user_id, [])

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Placeholder: the actual LLM call is wired in by the runner.
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Placeholder: would run the preference extractor over `conversation`
        # and append the resulting cards.
        pass

    def get_context_for_prompt(self) -> str:
        cards = self.memory_cards.get(self.user_id, [])
        if not cards:
            return ""

        # Dump every known card (up to the cap) — no relevance filtering.
        lines = ["User preferences (all known):\n"]
        for rank, card in enumerate(cards[:self.max_cards], start=1):
            lines.append(f"{rank}. When {card['condition']}: {card['action']}\n")
        return "".join(lines)
+
+
class ExtractorRAGBaseline(BaselineMethod):
    """Preference extraction plus plain RAG: top-k relevance retrieval,
    with no user vector to personalize the ranking."""

    def __init__(self, top_k: int = 5):
        super().__init__("extractor_rag")
        self.top_k = top_k
        self.memory_store = None  # would hold the vector store

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Placeholder: the actual LLM call is wired in by the runner.
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Placeholder: would extract preferences and index them in the
        # vector store.
        pass

    def get_context_for_prompt(self) -> str:
        # Placeholder: would retrieve the top_k most relevant memories.
        return "Retrieved preferences:\n..."
+
+
class ExtractorRAGUserVectorBaseline(BaselineMethod):
    """Full proposed method: extraction + RAG + a learned user vector that
    personalizes retrieval."""

    def __init__(self, top_k: int = 5):
        super().__init__("extractor_rag_user_vector")
        self.top_k = top_k
        # Would integrate with the PersonalizedLLM stack.

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Placeholder: the actual LLM call is wired in by the runner.
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Placeholder: would extract, store, and update the user vector
        # via REINFORCE.
        pass

    def get_context_for_prompt(self) -> str:
        # Placeholder: would run policy-based (user-vector-aware) retrieval.
        return "Retrieved preferences (personalized):\n..."
+
+
+# ============================================================================
+# Experiment Runner
+# ============================================================================
+
@dataclass
class ExperimentConfig:
    """Configuration for an experiment run."""
    baselines: list  # List of baseline names to run (keys of ExperimentRunner.BASELINE_CLASSES)
    dataset: str  # Dataset to use
    num_sessions: int = 10  # Sessions per user
    num_users: int = 20  # Number of user profiles
    max_turns_per_session: int = 15  # NOTE(review): not referenced in this file yet — presumably a per-conversation cap
    profile_path: str = "collaborativeagents/data/complex_profiles/profiles.jsonl"  # JSONL, one profile per line
    output_dir: str = "collaborativeagents/results"  # where summary.json and per-baseline detail files are written
    include_conflict_tests: bool = True  # also run each profile's 'conflict_tests' probes
    seed: int = 42  # NOTE(review): not consumed in this file yet — presumably for RNG seeding
+
+
class ExperimentRunner:
    """Runs the baseline comparison experiment described by an ExperimentConfig.

    Lifecycle: construct with a config, call run_experiment(), then
    print_comparison_table() and/or save_results().
    """

    # Registry of runnable baselines, keyed by the names accepted on the CLI.
    BASELINE_CLASSES = {
        "vanilla": VanillaBaseline,
        "contextual_memory": ContextualMemoryBaseline,
        "reflection_memory": ReflectionMemoryBaseline,
        "all_memory_cards": AllMemoryCardsBaseline,
        "extractor_rag": ExtractorRAGBaseline,
        "extractor_rag_user_vector": ExtractorRAGUserVectorBaseline,
    }

    def __init__(self, config: ExperimentConfig):
        self.config = config
        self.results = {}  # baseline name -> ExperimentResults

    def load_profiles(self) -> list:
        """Load user profiles (JSONL, one JSON object per line).

        Returns at most config.num_users profiles. Blank lines are skipped so
        a trailing newline in the file does not raise a JSON decode error.
        """
        with open(self.config.profile_path) as f:
            profiles = [json.loads(line) for line in f if line.strip()]
        return profiles[:self.config.num_users]

    def load_dataset(self) -> list:
        """Load the evaluation dataset.

        Placeholder implementation: returns an empty list rather than None so
        that the slicing in run_experiment() cannot raise a TypeError (the
        previous bare `pass` returned None).
        """
        # Would load from collaborativeagents datasets
        return []

    def run_single_conversation(
        self,
        baseline: BaselineMethod,
        user_profile: dict,
        problem: dict,
        session_num: int
    ) -> ConversationMetrics:
        """Run a single conversation and collect metrics.

        Placeholder: the conversation loop (UserAgent + actual LLM calls) is
        not wired up yet, so accuracy and token counts are reported as zeros.
        """
        baseline.initialize_session(user_profile['user_id'], user_profile)

        conversation = []
        user_tokens = 0
        assistant_tokens = 0
        num_enforcements = 0

        start_time = time.time()
        # ... conversation loop (UserAgent + actual LLM calls) would go here ...
        latency = time.time() - start_time

        return ConversationMetrics(
            task_accuracy=0.0,  # would be evaluated against the gold answer
            user_tokens=user_tokens,
            assistant_tokens=assistant_tokens,
            total_tokens=user_tokens + assistant_tokens,
            num_turns=len(conversation) // 2,  # one turn = user msg + reply
            num_preference_enforcements=num_enforcements,
            latency_seconds=latency
        )

    def run_conflict_test(
        self,
        baseline: BaselineMethod,
        user_profile: dict,
        conflict_test: dict
    ) -> bool:
        """Test if baseline correctly resolves a preference conflict.

        Placeholder: always returns False until response analysis against
        conflict_test['correct_pref_id'] is implemented.
        """
        baseline.initialize_session(user_profile['user_id'], user_profile)

        # Generate a response to the deliberately conflicting query.
        query = conflict_test['query']
        response = baseline.generate_response(query, [])

        # Would compare `response` against the preference identified by
        # conflict_test['correct_pref_id'] to decide which one was followed.
        return False

    def run_experiment(self):
        """Run the full experiment across all configured baselines.

        Returns self.results (baseline name -> ExperimentResults).
        """
        profiles = self.load_profiles()
        dataset = self.load_dataset()

        for baseline_name in self.config.baselines:
            print(f"\n{'='*60}")
            print(f"Running baseline: {baseline_name}")
            print(f"{'='*60}")

            # Fresh baseline instance per run so memory never leaks between
            # baselines.
            baseline = self.BASELINE_CLASSES[baseline_name]()

            results = ExperimentResults(
                baseline_name=baseline_name,
                num_conversations=0
            )

            for user_profile in profiles:
                user_id = user_profile['user_id']
                print(f"\nUser: {user_id}")

                # Multiple sessions per user; each session covers 3 problems.
                for session in range(self.config.num_sessions):
                    session_problems = dataset[session * 3:(session + 1) * 3]

                    for problem in session_problems:
                        metrics = self.run_single_conversation(
                            baseline, user_profile, problem, session
                        )
                        results.add_conversation(metrics)
                        results.num_conversations += 1

                # Conflict tests probe whether the correct preference wins.
                if self.config.include_conflict_tests:
                    for conflict_test in user_profile.get('conflict_tests', []):
                        correct = self.run_conflict_test(
                            baseline, user_profile, conflict_test
                        )
                        # TODO: fold `correct` into results once implemented.

            self.results[baseline_name] = results

        return self.results

    def compute_user_vector_similarity(
        self,
        learned_vector: np.ndarray,
        ground_truth_profile: dict
    ) -> float:
        """
        Compute similarity between learned user vector and ground truth.

        Ground truth is derived from the preference profile:
        - One-hot encode preference categories
        - Weight by how often each preference was triggered

        Placeholder: not implemented yet (returns None).
        """
        # Key metric for the user-vector method; would build the ground-truth
        # vector from the profile and return a cosine similarity.
        pass

    def save_results(self):
        """Write summary.json plus one <baseline>_detailed.json per baseline."""
        output_dir = Path(self.config.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Summary table: one summary dict per baseline.
        summary = [res.summary() for res in self.results.values()]
        with open(output_dir / "summary.json", 'w') as f:
            json.dump(summary, f, indent=2)

        # Detailed per-conversation metric lists.
        for name, results in self.results.items():
            with open(output_dir / f"{name}_detailed.json", 'w') as f:
                json.dump(asdict(results), f, indent=2)

        print(f"\nResults saved to {output_dir}")

    def print_comparison_table(self):
        """Print a fixed-width comparison table of all baselines."""
        print("\n" + "=" * 80)
        print("BASELINE COMPARISON RESULTS")
        print("=" * 80)

        headers = ["Baseline", "Accuracy", "User Effort", "Total Tokens", "Conflict Acc"]
        row_format = "{:<30} {:>10} {:>12} {:>14} {:>12}"

        print(row_format.format(*headers))
        print("-" * 80)

        for name, results in self.results.items():
            summary = results.summary()
            print(row_format.format(
                name,
                f"{summary.get('task_accuracy_mean', 0):.3f}",
                f"{summary.get('user_tokens_mean', 0):.0f}",
                f"{summary.get('total_tokens_mean', 0):.0f}",
                f"{summary.get('conflict_resolution_correct_mean', 0):.3f}"
            ))
+
+
+# ============================================================================
+# Analysis Functions
+# ============================================================================
+
def analyze_context_overflow(results: dict) -> dict:
    """
    Analyze how methods degrade as accumulated context grows.

    Returns one per-session mapping per baseline. Placeholder: the
    session-level accuracy-degradation curves are not computed yet, so each
    mapping is empty.
    """
    # Would group each baseline's conversations by session number and track
    # accuracy over sessions.
    return {baseline_name: {} for baseline_name in results}
+
+
def analyze_conflict_resolution(results: dict, conflict_tests: list) -> dict:
    """
    Break conflict-resolution accuracy down by conflict type.

    Placeholder: every (baseline, conflict type) accuracy is currently 0.0.
    Returns an empty dict when `conflict_tests` is empty.
    """
    analysis = {}
    conflict_groups = {test['conflict_group'] for test in conflict_tests}

    for group in conflict_groups:
        for baseline_name in results:
            # Would compute the real per-type accuracy here.
            analysis.setdefault(baseline_name, {})[group] = 0.0

    return analysis
+
+
def analyze_user_vector_quality(
    learned_vectors: dict,
    ground_truth_profiles: list
) -> dict:
    """
    Assess how well learned user vectors capture user identity.

    Intended tests (not yet implemented):
    1. Same user across sessions -> high similarity
    2. Different users -> low similarity
    3. Users with similar preferences -> moderate similarity
    """
    # Placeholder: the similarity computations are not wired in yet, so the
    # report skeleton is returned empty.
    return {
        "intra_user_similarity": [],   # same user, different sessions
        "inter_user_similarity": [],   # different users
        "preference_cluster_quality": 0.0,  # clustering by preference
    }
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
if __name__ == "__main__":
    import argparse

    # All six baselines run by default; pass --baselines to narrow the set.
    DEFAULT_BASELINES = [
        "vanilla", "contextual_memory", "reflection_memory",
        "all_memory_cards", "extractor_rag", "extractor_rag_user_vector",
    ]

    cli = argparse.ArgumentParser()
    cli.add_argument("--baselines", nargs="+", default=DEFAULT_BASELINES)
    cli.add_argument("--dataset", default="math-500")
    cli.add_argument("--num_sessions", type=int, default=10)
    cli.add_argument("--num_users", type=int, default=20)
    cli.add_argument("--output_dir", default="collaborativeagents/results")
    cli.add_argument("--seed", type=int, default=42)
    opts = cli.parse_args()

    experiment_config = ExperimentConfig(
        baselines=opts.baselines,
        dataset=opts.dataset,
        num_sessions=opts.num_sessions,
        num_users=opts.num_users,
        output_dir=opts.output_dir,
        seed=opts.seed,
    )

    runner = ExperimentRunner(experiment_config)
    runner.run_experiment()
    runner.print_comparison_table()
    runner.save_results()
diff --git a/collaborativeagents/scripts/run_debug.sh b/collaborativeagents/scripts/run_debug.sh
new file mode 100644
index 0000000..1f82d70
--- /dev/null
+++ b/collaborativeagents/scripts/run_debug.sh
@@ -0,0 +1,24 @@
#!/bin/bash
# Debug driver: runs the `debug` experiment for each (dataset, eval_size)
# pair against a locally served Llama-3.3-70B vLLM endpoint on port 8004.
# Start the model server first, e.g.:
# vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8004 --tensor-parallel-size 4 --max-model-len 16384 --gpu-memory-utilization 0.9


BATCH_SIZE=100
# BATCH_SIZE=20

# Loop over eval sizes and datasets
for EVAL_SIZE in 20; do
    for DATASET in math-500; do
        # Convert dataset name for file paths (replace - with _)
        DATASET_FILE=$(echo ${DATASET} | tr '-' '_')

        echo "Running experiments for dataset: ${DATASET} with eval_size ${EVAL_SIZE}"

        # FIX: ensure the log directory exists before redirecting into it;
        # otherwise the >> redirect below fails on a fresh checkout.
        mkdir -p ./runs/llama70b/debug

        # debug experiment
        python3 run.py --experiment_type debug --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
            --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
            --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
            --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
            --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/debug/${DATASET_FILE}_llama70b_user_llama70b_agent_debug_eval_size_${EVAL_SIZE}.jsonl \
            >> ./runs/llama70b/debug/${DATASET_FILE}_llama70b_user_llama70b_agent_debug_eval_size_${EVAL_SIZE}.out 2>&1

    done
done
diff --git a/collaborativeagents/scripts/run_experiments.py b/collaborativeagents/scripts/run_experiments.py
new file mode 100644
index 0000000..0ba0ba0
--- /dev/null
+++ b/collaborativeagents/scripts/run_experiments.py
@@ -0,0 +1,1328 @@
+#!/usr/bin/env python3
+"""
+Main experiment orchestrator for personalization benchmark.
+
+This script runs all baselines and the proposed methods with PROPER multi-turn
+conversation simulation, user preference enforcement, and LLM-based evaluation.
+
+Usage:
+ python run_experiments.py --config config.yaml
+ python run_experiments.py --methods vanilla,rag,rag_vector --datasets gpqa,aime
+"""
+
+import argparse
+import json
+import yaml
+import os
+import sys
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass, asdict
+import logging
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+import time
+
+# Add paths
+sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from datasets_extended import get_dataset, get_all_datasets, get_challenging_datasets
+from evaluation.llm_judge import LLMJudge, BatchEvaluator, ConversationMetrics
+from conflict_scenario_generator import ConflictScenarioGenerator
+from adapters.personalized_llm_adapter import PersonalizedLLMAdapter, create_baseline_adapter
+from agents.local_user_agent import LocalUserAgent, SharedLocalUserAgent, TERMINATION_SIGNAL
+from agents.vllm_user_agent import VLLMUserAgent, VLLMAgentClient
+from agents.openai_user_agent import OpenAIUserAgent
+from agents.batch_vllm_agent import BatchConversationGenerator, BatchVLLMClient
+
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
@dataclass
class ExperimentConfig:
    """Configuration for an experiment run.

    Instances are serialized with ``yaml.dump(asdict(config))`` into the run
    directory by ExperimentRunner, so every field must remain a plain,
    YAML-safe value.
    """
    # Methods to compare (must be keys of AVAILABLE_METHODS)
    methods: List[str]

    # Datasets to use (names resolvable by get_dataset)
    datasets: List[str]

    # User profiles
    n_profiles: int = 200
    profile_path: Optional[str] = None  # JSONL file; placeholders generated if unset/missing

    # Profile range (for splitting jobs across SLURM tasks)
    start_profile: int = 0  # Inclusive, 0-indexed
    end_profile: Optional[int] = None  # Exclusive, None means all

    # Session settings
    n_sessions_per_profile: int = 30
    max_turns_per_session: int = 15  # Increased for harder tasks

    # Model settings
    user_model: str = "meta-llama/Llama-3.3-70B-Instruct"
    agent_model: str = "meta-llama/Llama-3.1-8B-Instruct"
    judge_model: str = "meta-llama/Llama-3.3-70B-Instruct"

    # Output settings
    output_dir: str = "results"  # a timestamped subdirectory is created inside
    save_conversations: bool = True  # include full transcripts in result dicts

    # Conflict testing
    conflict_ratio: float = 0.3  # proportion of queries that trigger conflicts

    # Compute settings
    batch_size: int = 4
    n_gpus: int = 4

    # vLLM settings (for high-performance inference)
    use_vllm: bool = False
    vllm_user_url: str = "http://localhost:8004/v1"  # 70B user simulator
    vllm_agent_url: str = "http://localhost:8003/v1"  # 8B agent

    # OpenAI user simulator (alternative to vLLM user agent)
    use_openai_user: bool = False
    openai_user_model: str = "gpt-5"  # Model name for OpenAI user agent

    # Reward mode: "keyword" (implicit user signals) or "llm" (GPT-5-nano judge)
    # This is a global option applied to ALL methods that use RL updates
    reward_mode: str = "keyword"

    # Parallel/Batch processing
    parallel_profiles: int = 50  # Number of profiles to process in parallel
    use_batch_processing: bool = True  # Use turn-synchronous batch processing for vanilla/all_memory
    batch_size_conversations: int = 50  # Number of conversations to batch together

    # Continue from existing experiment (for extending sessions)
    continue_from: Optional[str] = None  # Path to existing output directory to continue from
+
+
# Available methods
# Registry of runnable baselines: maps the method name accepted in
# ExperimentConfig.methods (and on the CLI) to a one-line human description.
AVAILABLE_METHODS = {
    "vanilla": "No memory, no personalization",
    "contextual": "Full history in context, summarize when overflow",
    "reflection": "CollaborativeAgents' agent_notes approach",
    "reflection_grpo": "Reflection + GRPO training",
    "all_memory": "All extracted memories in context (no retrieval)",
    "rag": "Extractor + RAG (no user vector)",
    "rag_vector": "Extractor + RAG + user vector (proposed method)",
    "rag_bge": "Extractor + RAG with BGE reranker (278M)",
    "rag_vector_bge": "Extractor + RAG + user vector with BGE reranker (278M)",
}
+
+
+class ExperimentRunner:
+ """Main experiment runner."""
+
    def __init__(self, config: ExperimentConfig):
        """Set up the run directory, judge, datasets, and profiles.

        Side effects: creates (or reuses, when ``continue_from`` is set) the
        output directory, writes ``config.yaml`` inside it, instantiates the
        LLM judge/evaluator, and loads every configured dataset.

        Raises:
            ValueError: if ``continue_from`` points at a missing directory.
        """
        self.config = config

        # Use existing directory if continuing, otherwise create new timestamped one
        if config.continue_from:
            self.output_dir = Path(config.continue_from)
            if not self.output_dir.exists():
                raise ValueError(f"Continue-from directory does not exist: {config.continue_from}")
            logger.info(f"Continuing from existing experiment: {self.output_dir}")
        else:
            self.output_dir = Path(config.output_dir) / datetime.now().strftime("%Y%m%d_%H%M%S")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Save/update config — note this overwrites any previous config.yaml
        # when continuing an existing run.
        with open(self.output_dir / "config.yaml", "w") as f:
            yaml.dump(asdict(config), f)

        # Initialize components
        self.judge = LLMJudge(model_name=config.judge_model)
        self.batch_evaluator = BatchEvaluator(self.judge)
        self.conflict_generator = ConflictScenarioGenerator()

        # Load datasets; a failing dataset is logged and skipped rather than
        # aborting the whole run.
        self.datasets = {}
        for ds_name in config.datasets:
            try:
                self.datasets[ds_name] = get_dataset(ds_name)
                logger.info(f"Loaded dataset: {ds_name}")
            except Exception as e:
                logger.warning(f"Failed to load dataset {ds_name}: {e}")

        # Load or generate profiles
        self.profiles = self._load_profiles()
+
+ def _load_profiles(self) -> List[Dict]:
+ """Load user profiles from file or generate."""
+ logger.info(f"Profile path configured: {self.config.profile_path}")
+
+ if self.config.profile_path:
+ profile_path = Path(self.config.profile_path)
+ if profile_path.exists():
+ profiles = []
+ with open(profile_path) as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ profiles.append(json.loads(line))
+ logger.info(f"Loaded {len(profiles)} profiles from {self.config.profile_path}")
+ return profiles[:self.config.n_profiles]
+ else:
+ logger.warning(f"Profile path does not exist: {self.config.profile_path}")
+
+ # Generate simple placeholder profiles if no file provided
+ logger.info(f"Generating {self.config.n_profiles} placeholder profiles...")
+ profiles = []
+ for i in range(self.config.n_profiles):
+ profiles.append({
+ "id": i,
+ "persona": f"User {i+1} is a curious individual seeking help with problem solving.",
+ "preferences": [
+ "Provide clear, step-by-step explanations",
+ "Use simple language when possible",
+ "Give examples to illustrate concepts",
+ "Be concise but thorough",
+ "Acknowledge when something is uncertain"
+ ]
+ })
+
+ # Save generated profiles
+ profile_path = self.output_dir / "generated_profiles.json"
+ with open(profile_path, "w") as f:
+ json.dump(profiles, f, indent=2)
+
+ logger.info(f"Generated and saved {len(profiles)} placeholder profiles")
+ return profiles
+
+ def _create_method_adapter(self, method: str, profile: Dict, use_shared_models: bool = False) -> Any:
+ """Create adapter for a specific method.
+
+ Args:
+ method: One of the baseline method names
+ profile: User profile dict (used later in start_session, not constructor)
+ use_shared_models: If True, share embedding/reranker models across parallel
+ workers. ESSENTIAL for parallel profile processing to avoid OOM.
+
+ Returns:
+ Configured adapter instance
+ """
+ # Auto-detect available GPUs and set device assignment accordingly
+ # Layout with local 70B user (4 GPUs):
+ # GPU 0-1: 70B user simulator (TP=2)
+ # GPU 2: 8B agent vLLM server
+ # GPU 3: Embedding + Reranker + Extractor
+ # Layout with OpenAI user (2 GPUs):
+ # GPU 0: 8B agent vLLM server
+ # GPU 1: Embedding + Reranker + Extractor
+ device_assignment = None
+ try:
+ import torch
+ n_gpus = torch.cuda.device_count()
+ if n_gpus >= 4:
+ # 4 GPU layout: 70B user on 0-1, agent on 2, adapters on 3
+ device_assignment = {
+ "embed": "cuda:3",
+ "reranker": "cuda:3",
+ "extractor": "cuda:3",
+ }
+ elif n_gpus >= 2:
+ # 2 GPU layout: agent on 0, adapters on 1
+ device_assignment = {
+ "embed": "cuda:1",
+ "reranker": "cuda:1",
+ "extractor": "cuda:1",
+ }
+ elif n_gpus == 1:
+ device_assignment = {
+ "embed": "cuda:0",
+ "reranker": "cuda:0",
+ "extractor": "cuda:0",
+ }
+ except ImportError:
+ pass
+
+ adapter = create_baseline_adapter(
+ method,
+ device_assignment=device_assignment,
+ use_vllm=self.config.use_vllm,
+ use_shared_models=use_shared_models,
+ reward_mode=self.config.reward_mode,
+ )
+ # Profile will be passed to start_session() when the conversation begins
+ return adapter
+
+ def run_single_session(
+ self,
+ method: str,
+ profile: Dict,
+ problem: Dict,
+ is_conflict_query: bool = False,
+ adapter: Any = None,
+ user_agent: Any = None
+ ) -> Dict:
+ """Run a single session with PROPER multi-turn conversation and user simulation.
+
+ This implements:
+ 1. User simulator that role-plays with preferences
+ 2. Multi-turn conversation (up to max_turns)
+ 3. Preference enforcement by simulated user
+ 4. Proper metrics extraction from conversation
+ """
+ # Use provided adapter (reused across sessions) or create new one
+ agent_adapter = adapter if adapter else self._create_method_adapter(method, profile)
+
+ # Prepare conflict scenario if needed
+ conflict_scenario = None
+ original_problem = problem.get("problem", problem.get("question", ""))
+ if is_conflict_query:
+ conflict_scenario = self.conflict_generator.generate_for_profile(
+ profile["preferences"],
+ problem.get("domain", "general")
+ )
+ if conflict_scenario:
+ problem = dict(problem)
+ problem["problem"] = conflict_scenario["query"]
+
+ query = problem.get("problem", problem.get("question", ""))
+
+ # Extract user preferences as formatted string
+ user_prefs = profile.get("preferences", [])
+ if isinstance(user_prefs, list) and len(user_prefs) > 0:
+ if isinstance(user_prefs[0], dict):
+ # Structured preferences with condition/action
+ pref_str = "\n".join([
+ f"- When {p.get('condition', '')}, {p.get('action', '')}"
+ for p in user_prefs[:10] # Top 10 preferences
+ ])
+ else:
+ # Simple string preferences
+ pref_str = "\n".join([f"- {p}" for p in user_prefs[:10]])
+ else:
+ pref_str = str(user_prefs)
+
+ user_persona = profile.get("persona", "A user seeking help with problem solving.")
+
+ # Create user agent for this session (or reuse provided one)
+ if user_agent is None:
+ if self.config.use_openai_user:
+ user_agent = OpenAIUserAgent(
+ user_task_description="Help the user solve their problem.",
+ problem=query,
+ user_persona=user_persona,
+ user_preferences=pref_str,
+ model=self.config.openai_user_model,
+ )
+ elif self.config.use_vllm:
+ user_agent = VLLMUserAgent(
+ user_task_description="Help the user solve their problem.",
+ problem=query,
+ user_persona=user_persona,
+ user_preferences=pref_str,
+ vllm_url=self.config.vllm_user_url,
+ )
+ else:
+ user_agent = SharedLocalUserAgent(
+ user_task_description="Help the user solve their problem.",
+ problem=query,
+ user_persona=user_persona,
+ user_preferences=pref_str,
+ )
+
+ # Initialize conversation
+ turns = []
+ full_user_log = [] # Detailed user agent outputs
+
+ # Metrics tracking
+ enforcement_count = 0
+ disappointment_count = 0
+ user_token_count = 0
+ agent_token_count = 0
+ preference_compliance_scores = []
+
+ try:
+ # Initialize adapter for this user
+ if hasattr(agent_adapter, 'initialize'):
+ agent_adapter.initialize()
+ if hasattr(agent_adapter, 'start_session'):
+ agent_adapter.start_session(
+ user_id=profile.get("user_id", "test_user"),
+ user_profile={"preferences": user_prefs, "persona": user_persona}
+ )
+
+ # Start with agent greeting
+ conversation = [{"role": "assistant", "content": "How can I help you today?"}]
+
+ # Multi-turn conversation loop
+ for turn_num in range(self.config.max_turns_per_session):
+ # === User Turn ===
+ user_response = user_agent.generate_user_response(conversation)
+
+ if user_response is None:
+ logger.warning(f"User agent failed to respond at turn {turn_num}")
+ break
+
+ user_message = str(user_response.get("response", ""))
+ user_token_count += len(user_message.split())
+
+ # Add to conversation
+ conversation.append({"role": "user", "content": user_message})
+ turns.append({"role": "user", "content": user_message})
+ full_user_log.append(user_response)
+
+ # Check for termination
+ if user_response.get("should_terminate", False) or TERMINATION_SIGNAL in user_message:
+ break
+
+ # Detect preference enforcement (user correcting agent)
+ enforcement_keywords = ["please", "i asked", "i said", "i prefer", "can you", "could you", "instead"]
+ if any(kw in user_message.lower() for kw in enforcement_keywords):
+ enforcement_count += 1
+
+ # === Agent Turn ===
+ if hasattr(agent_adapter, 'generate_response'):
+ response = agent_adapter.generate_response(user_message, conversation[:-1])
+ agent_content = response.get("response", str(response)) if isinstance(response, dict) else str(response)
+ elif callable(agent_adapter):
+ agent_content = agent_adapter(conversation)
+ else:
+ agent_content = "[Error: Adapter not properly configured]"
+
+ agent_token_count += len(agent_content.split())
+
+ # Add to conversation
+ conversation.append({"role": "assistant", "content": agent_content})
+ turns.append({"role": "assistant", "content": agent_content})
+
+ # Estimate preference compliance for this turn (heuristic based on user satisfaction)
+ # If user doesn't enforce in next turn, assume compliance
+ # This is a simplified heuristic - LLM judge would be more accurate
+ compliance_score = 0.8 if enforcement_count == 0 else max(0.2, 1.0 - 0.2 * enforcement_count)
+ preference_compliance_scores.append(compliance_score)
+
+ # End session
+ if hasattr(agent_adapter, 'end_session'):
+ adapter_metrics = agent_adapter.end_session(task_success=True)
+ else:
+ adapter_metrics = {}
+
+ except Exception as e:
+ import traceback
+ logger.error(f"Error in session: {e}")
+ logger.error(f"Full traceback:\n{traceback.format_exc()}")
+ turns.append({"role": "assistant", "content": f"[Error: {e}]"})
+
+ # Compute metrics
+ total_turns = len(turns)
+ total_token_count = user_token_count + agent_token_count
+
+ # Check if user reached a satisfactory answer (from last user response)
+ task_success = False
+ if full_user_log:
+ last_user = full_user_log[-1]
+ if last_user.get("should_terminate", False):
+ draft = last_user.get("draft_answer", "")
+ # Consider success if draft answer is not empty/"I don't know"
+ task_success = bool(draft) and draft.lower() != "i don't know"
+
+ # Compute average compliance
+ avg_compliance = sum(preference_compliance_scores) / len(preference_compliance_scores) if preference_compliance_scores else 0.5
+
+ # Conflict resolution (if this was a conflict test)
+ conflict_accuracy = 0.0
+ if is_conflict_query and conflict_scenario:
+ # Check if the correct preference was applied
+ expected_pref = conflict_scenario.get("expected_preference", "")
+ # Simple heuristic: check if expected preference keywords appear in agent responses
+ agent_texts = " ".join([t["content"] for t in turns if t["role"] == "assistant"])
+ if expected_pref and any(kw in agent_texts.lower() for kw in expected_pref.lower().split()[:3]):
+ conflict_accuracy = 1.0
+
+ # Over-personalization detection (heuristic: if agent mentions preferences not in profile)
+ over_personalization = 0.0
+
+ metrics = ConversationMetrics(
+ task_success=task_success,
+ turns_to_success=total_turns if task_success else -1,
+ total_turns=total_turns,
+ user_token_count=user_token_count,
+ enforcement_count=enforcement_count,
+ disappointment_count=disappointment_count,
+ total_token_count=total_token_count,
+ agent_token_count=agent_token_count,
+ preference_compliance_scores=preference_compliance_scores,
+ conflict_resolution_accuracy=conflict_accuracy,
+ over_personalization_rate=over_personalization,
+ )
+
+ return {
+ "method": method,
+ "profile_id": profile.get("user_id", "unknown"),
+ "problem_id": problem.get("problem_id", str(hash(query))[:8]),
+ "problem": original_problem,
+ "ground_truth_solution": problem.get("solution", problem.get("answer", "")),
+ "is_conflict_test": is_conflict_query,
+ "conflict_scenario": conflict_scenario,
+ "conversation": {"turns": turns} if self.config.save_conversations else None,
+ "full_user_log": full_user_log if self.config.save_conversations else None,
+ "metrics": asdict(metrics),
+ "adapter_metrics": adapter_metrics if 'adapter_metrics' in dir() else {},
+ }
+
+ def _run_profile_sessions(
+ self,
+ method: str,
+ profile_idx: int,
+ profile: Dict,
+ adapter: Any = None
+ ) -> List[Dict]:
+ """Run all sessions for a single profile. Thread-safe for parallel execution."""
+ profile_results = []
+
+ # Create vLLM-based agent client if using vLLM (for methods that need it)
+ vllm_agent = None
+ if self.config.use_vllm and method == "vanilla":
+ vllm_agent = VLLMAgentClient(
+ vllm_url=self.config.vllm_agent_url,
+ system_prompt="You are a helpful AI assistant for problem-solving tasks."
+ )
+
+ # Run sessions across datasets
+ session_idx = 0
+ for ds_name, dataset in self.datasets.items():
+ samples = dataset.get_testset()
+
+ for sample in samples:
+ if session_idx >= self.config.n_sessions_per_profile:
+ break
+
+ # Decide if this is a conflict query
+ is_conflict = (session_idx % int(1 / self.config.conflict_ratio)) == 0
+
+ problem = {
+ "problem": sample.problem,
+ "solution": sample.solution,
+ "problem_id": sample.problem_id,
+ "domain": sample.domain,
+ }
+
+ try:
+ result = self.run_single_session(
+ method=method,
+ profile=profile,
+ problem=problem,
+ is_conflict_query=is_conflict,
+ adapter=vllm_agent if vllm_agent else adapter
+ )
+ profile_results.append(result)
+ except Exception as e:
+ logger.error(f"Error in session for profile {profile_idx}: {e}")
+
+ session_idx += 1
+
+ return profile_results
+
    def run_method(self, method: str) -> List[Dict]:
        """Run all sessions for a single method with checkpointing and parallel processing.

        Resumes from ``<output>/<method>/checkpoint.json`` when present, so a
        killed job can be restarted without redoing finished profiles, and
        dispatches to batch / parallel / sequential execution depending on
        the configuration.

        Returns:
            Accumulated result dicts, including any loaded from a prior run.
        """
        logger.info(f"Running method: {method}")

        # Setup method directory and checkpoint
        method_dir = self.output_dir / method
        method_dir.mkdir(exist_ok=True)
        checkpoint_file = method_dir / "checkpoint.json"
        results_file = method_dir / "results.json"

        # Load existing results and checkpoint
        results = []
        completed_profiles = set()
        sessions_per_profile = {}  # Track session count per profile for continue functionality
        if checkpoint_file.exists():
            with open(checkpoint_file, "r") as f:
                checkpoint = json.load(f)
            completed_profiles = set(checkpoint.get("completed_profiles", []))
            sessions_per_profile = checkpoint.get("sessions_per_profile", {})
            logger.info(f"  Resuming from checkpoint: {len(completed_profiles)} profiles completed")
            if sessions_per_profile:
                total_sessions = sum(sessions_per_profile.values())
                logger.info(f"  Session-level tracking: {total_sessions} sessions across {len(sessions_per_profile)} profiles")
        if results_file.exists():
            with open(results_file, "r") as f:
                results = json.load(f)

        # Determine profile range (end_profile is exclusive; None means all)
        start_idx = self.config.start_profile
        end_idx = self.config.end_profile if self.config.end_profile else len(self.profiles)

        # Build list of profiles that still need more sessions
        profiles_to_run = []
        for idx in range(start_idx, min(end_idx, len(self.profiles))):
            existing_sessions = sessions_per_profile.get(str(idx), 0)
            if existing_sessions < self.config.n_sessions_per_profile:
                profiles_to_run.append(idx)

        # Log what we're running
        if sessions_per_profile:
            total_existing = sum(sessions_per_profile.get(str(idx), 0) for idx in profiles_to_run)
            total_needed = len(profiles_to_run) * self.config.n_sessions_per_profile
            logger.info(f"  Running profiles {start_idx} to {end_idx-1}: {len(profiles_to_run)} profiles need sessions")
            logger.info(f"  Sessions: {total_existing} existing, {total_needed - total_existing} remaining")
        else:
            logger.info(f"  Running profiles {start_idx} to {end_idx-1} ({len(profiles_to_run)} remaining)")

        # When using batch processing with vLLM or OpenAI user: use turn-synchronous batch mode
        # This batches both user and agent calls for maximum throughput
        # NOTE(review): this guard requires use_vllm even on the OpenAI-user
        # path (the agent side is always vLLM-served in batch mode) — confirm
        # intent if running an OpenAI user without a local vLLM agent.
        if self.config.use_batch_processing and self.config.use_vllm:
            user_type = "OpenAI" if self.config.use_openai_user else "local vLLM"
            logger.info(f"  Using BATCH processing ({user_type} user) for {method}")
            return self._run_method_batch(
                method, profiles_to_run, results, completed_profiles,
                sessions_per_profile, checkpoint_file, results_file
            )

        # Decide on parallelization for sequential methods
        n_parallel = self.config.parallel_profiles if (self.config.use_vllm or self.config.use_openai_user) else 1

        if n_parallel > 1:
            logger.info(f"  Using parallel processing with {n_parallel} workers")
            # Mutates results/completed_profiles/sessions_per_profile in place.
            self._run_method_parallel(
                method, profiles_to_run, results, completed_profiles,
                sessions_per_profile, checkpoint_file, results_file
            )
        else:
            # Sequential execution (original behavior)
            # Create ONE adapter per method and reuse it (avoids GPU OOM from repeated model loading)
            adapter = self._create_method_adapter(method, None)
            adapter.initialize()

            for profile_idx in profiles_to_run:
                profile = self.profiles[profile_idx]
                logger.info(f"  Profile {profile_idx + 1}/{len(self.profiles)}")

                profile_results = self._run_profile_sessions(method, profile_idx, profile, adapter)

                # Add profile results to overall results
                results.extend(profile_results)
                completed_profiles.add(profile_idx)
                sessions_per_profile[str(profile_idx)] = self.config.n_sessions_per_profile

                # Save checkpoint and results after each profile
                with open(checkpoint_file, "w") as f:
                    json.dump({
                        "completed_profiles": sorted(list(completed_profiles)),
                        "sessions_per_profile": sessions_per_profile
                    }, f)
                with open(results_file, "w") as f:
                    json.dump(results, f, indent=2)
                logger.info(f"  Profile {profile_idx + 1} completed and checkpointed")

        return results
+
    def _run_method_parallel(
        self,
        method: str,
        profiles_to_run: List[int],
        results: List[Dict],
        completed_profiles: set,
        sessions_per_profile: Dict[str, int],
        checkpoint_file: Path,
        results_file: Path
    ):
        """Run profiles in parallel using ThreadPoolExecutor.

        Uses shared model singletons for embedding/reranker to avoid OOM
        when multiple workers try to load their own copies.

        Mutates ``results``, ``completed_profiles`` and ``sessions_per_profile``
        in place and checkpoints both files after every finished profile.
        """
        n_parallel = self.config.parallel_profiles
        results_lock = threading.Lock()
        start_time = time.time()
        profiles_completed = 0

        def process_profile(profile_idx: int) -> tuple:
            """Process a single profile and return (profile_idx, results)."""
            profile = self.profiles[profile_idx]
            # Create adapter with shared models to avoid OOM from duplicate model loading
            adapter = self._create_method_adapter(method, profile, use_shared_models=True)
            profile_results = self._run_profile_sessions(method, profile_idx, profile, adapter)
            return profile_idx, profile_results

        with ThreadPoolExecutor(max_workers=n_parallel) as executor:
            # Submit all profile jobs
            future_to_profile = {
                executor.submit(process_profile, idx): idx
                for idx in profiles_to_run
            }

            # Process completed profiles. NOTE: this consumer loop runs only
            # in the main thread, so the accumulators and file writes below
            # are not contended; the lock is belt-and-braces.
            for future in as_completed(future_to_profile):
                profile_idx = future_to_profile[future]
                try:
                    idx, profile_results = future.result()

                    with results_lock:
                        results.extend(profile_results)
                        completed_profiles.add(idx)
                        sessions_per_profile[str(idx)] = self.config.n_sessions_per_profile
                        profiles_completed += 1

                    # Save checkpoint with session-level tracking
                    with open(checkpoint_file, "w") as f:
                        json.dump({
                            "completed_profiles": sorted(list(completed_profiles)),
                            "sessions_per_profile": sessions_per_profile
                        }, f)
                    with open(results_file, "w") as f:
                        json.dump(results, f, indent=2)

                    # Log progress with throughput estimate
                    elapsed = time.time() - start_time
                    profiles_per_hour = profiles_completed / elapsed * 3600 if elapsed > 0 else 0
                    sessions_per_hour = len(results) / elapsed * 3600 if elapsed > 0 else 0
                    logger.info(
                        f"  Profile {idx + 1} completed "
                        f"({profiles_completed}/{len(profiles_to_run)}) - "
                        f"{profiles_per_hour:.1f} profiles/hr, {sessions_per_hour:.1f} sessions/hr"
                    )

                except Exception as e:
                    # A failed profile is logged and skipped; the rest continue.
                    logger.error(f"  Profile {profile_idx} failed: {e}")
+
+ def _run_method_batch(
+ self,
+ method: str,
+ profiles_to_run: List[int],
+ results: List[Dict],
+ completed_profiles: set,
+ sessions_per_profile: Dict[str, int],
+ checkpoint_file: Path,
+ results_file: Path
+ ) -> List[Dict]:
+ """
+ Turn-synchronous batch processing for ALL methods.
+
+ At each turn, user calls are batched concurrently via AsyncOpenAI,
+ then agent responses go through personalization adapters.
+ Sessions within a profile run sequentially (for stateful memory).
+ """
+ from agents.batch_vllm_agent import BatchOpenAIClient, BatchVLLMClient, TERMINATION_SIGNAL
+ from json_repair import repair_json
+
+ start_time = time.time()
+
+ # Create user client (OpenAI API or local vLLM)
+ if self.config.use_openai_user:
+ user_client = BatchOpenAIClient(
+ model=self.config.openai_user_model,
+ max_tokens=4096,
+ max_concurrent=32,
+ api_key=os.environ.get("OPENAI_API_KEY"),
+ )
+ logger.info(f" Using OpenAI user simulator: {self.config.openai_user_model}")
+ else:
+ user_client = BatchVLLMClient(
+ vllm_url=self.config.vllm_user_url,
+ max_tokens=4096,
+ temperature=1.0,
+ timeout=None,
+ max_concurrent=100,
+ json_mode=True, # User simulator needs JSON output
+ )
+ logger.info(f" Using local vLLM user simulator: {self.config.vllm_user_url}")
+
+ # Create async agent client for batched vLLM calls
+ agent_client = BatchVLLMClient(
+ vllm_url=self.config.vllm_agent_url,
+ max_tokens=2048,
+ temperature=0.7,
+ timeout=None, # Infinite timeout for long generations
+ max_concurrent=100,
+ )
+
+ USER_PROMPT_TEMPLATE = (
+ "You are a user simulator collaborating with an agent to solve a problem. "
+ "You will be provided with a problem description, and you must get the agent to help you solve it. "
+ "You will also be provided with user preferences, which you must follow and actively enforce throughout the conversation.\n\n"
+ "# Problem Description\n{problem}\nNote: the agent cannot see this problem description.\n\n"
+ "# User Persona\n{user_persona}\n\n"
+ "# User Preferences\n{user_preferences}\n"
+ "These preferences are NON-NEGOTIABLE that define how you prefer the agent to behave. They must be strictly enforced:\n"
+ " - **Answer clarifying questions**: The agent may ask clarifying questions before attempting an answer. "
+ "Answer such questions, and do not enforce preferences about answer format or content while the agent is clarifying.\n"
+ " - **Enforce immediately**: Every agent response must satisfy your preferences before you can proceed. "
+ "Explicitly ask the agent to adjust their response until it complies.\n"
+ " - **Never proceed without compliance**: Do NOT update your draft answer, do NOT consider terminating, "
+ "and do NOT move forward until the agent follows your preferences.\n\n"
+ "# Draft Answer Management\n"
+ "- **Maintain a working draft**: Start with \"I don't know\". Update your draft answer based on what you learn from agent responses.\n"
+ "- **Don't update when enforcing preferences**: If the agent response does not follow your preferences, "
+ "do NOT update your draft answer, regardless of whether the agent provides helpful information.\n\n"
+ "# Conversation Termination\n"
+ "Before generating your response, determine if you should terminate:\n"
+ " - Do you feel like your draft answer is a good answer to the problem?\n"
+ " - Do you feel like the agent cannot help further?\n"
+ "If the agent response does not follow your preferences, you must NOT terminate - instead, enforce the preferences.\n"
+ "When ready to terminate, respond with \"TERMINATE\".\n\n"
+ "# Output Format (respond in JSON):\n"
+ "{{\n"
+ " \"preferences_check\": \"For EACH relevant preference, evaluate: is it satisfied?\",\n"
+ " \"enforce_preferences\": true/false,\n"
+ " \"reasoning\": \"Brief reasoning (2-3 sentences). Does agent follow preferences? If no, enforce. If yes, update draft.\",\n"
+ " \"draft_answer\": \"Your current working draft answer\",\n"
+ " \"should_terminate\": true/false,\n"
+ " \"response\": \"Your response to the agent\"\n"
+ "}}"
+ )
+
+ def parse_user_response(content):
+ if not content:
+ return None
+ try:
+ parsed = repair_json(content, return_objects=True)
+ if isinstance(parsed, dict) and "response" in parsed:
+ return parsed
+ except:
+ pass
+ if TERMINATION_SIGNAL in (content or ""):
+ return {"reasoning": "", "draft_answer": "", "should_terminate": True, "response": TERMINATION_SIGNAL}
+ return {"reasoning": "", "draft_answer": "", "should_terminate": False, "response": content or ""}
+
+ def reverse_roles(conversation):
+ return [
+ {"role": "user" if m["role"] == "assistant" else "assistant", "content": m["content"]}
+ for m in conversation
+ ]
+
+ # Create per-profile adapters
+ adapters = {}
+ profile_sessions = {}
+
+ for profile_idx in profiles_to_run:
+ profile = self.profiles[profile_idx]
+ adapter = self._create_method_adapter(method, profile, use_shared_models=True)
+ if hasattr(adapter, 'initialize'):
+ adapter.initialize()
+ adapters[profile_idx] = adapter
+
+ # NOTE(review): this session list is rebuilt identically on every loop
+ # iteration (it does not depend on profile_idx) — it could be hoisted
+ # above the loop; confirm no per-profile dataset state before changing.
+ sessions = []
+ for ds_name, ds_obj in self.datasets.items():
+ ds_items = ds_obj.get_testset()
+ for item in ds_items[:self.config.n_sessions_per_profile]:
+ sessions.append({"problem": item.problem, "solution": item.solution, "domain": ds_obj.domain})
+ # NOTE(review): double truncation — each dataset contributes up to
+ # n_sessions_per_profile items, then the combined list is cut to the
+ # same cap, so if the first dataset fills the quota later datasets
+ # never contribute any sessions. Verify this is intended.
+ sessions = sessions[:self.config.n_sessions_per_profile]
+ n_conflict = int(len(sessions) * self.config.conflict_ratio)
+ # Conflict-flagged sessions are always the FIRST n_conflict entries
+ # (no shuffling), so conflicts cluster at the start of each profile's run.
+ profile_sessions[profile_idx] = [(s, idx < n_conflict) for idx, s in enumerate(sessions)]
+
+ n_sessions = self.config.n_sessions_per_profile
+
+ # Calculate sessions to run per profile (accounting for existing sessions)
+ sessions_to_run_per_profile = {}
+ for profile_idx in profiles_to_run:
+ existing = sessions_per_profile.get(str(profile_idx), 0)
+ remaining = n_sessions - existing
+ if remaining > 0:
+ sessions_to_run_per_profile[profile_idx] = (existing, remaining) # (start_session, count)
+
+ if sessions_to_run_per_profile:
+ total_remaining = sum(v[1] for v in sessions_to_run_per_profile.values())
+ logger.info(f" Batch: {len(sessions_to_run_per_profile)} profiles, {total_remaining} sessions remaining")
+ else:
+ # Everything already checkpointed: return the previously loaded results.
+ logger.info(f" Batch: All sessions already completed")
+ return results
+
+ # Process sessions in rounds
+ for session_idx in range(n_sessions):
+ # Initialize all conversations for this round
+ all_states = {} # profile_idx -> state dict
+ active_set = set()
+
+ for profile_idx in profiles_to_run:
+ # Skip if this profile doesn't need this session
+ if profile_idx not in sessions_to_run_per_profile:
+ continue
+ start_session, _ = sessions_to_run_per_profile[profile_idx]
+ if session_idx < start_session:
+ continue # Already completed this session
+ if session_idx >= len(profile_sessions[profile_idx]):
+ continue
+ problem_dict, is_conflict = profile_sessions[profile_idx][session_idx]
+ profile = self.profiles[profile_idx]
+ query = problem_dict["problem"]
+
+ # For conflict sessions, replace the query with one crafted to
+ # contradict the profile's stored preferences (if generation succeeds).
+ if is_conflict:
+ cs = self.conflict_generator.generate_for_profile(
+ profile.get("preferences", []), problem_dict.get("domain", "general"))
+ if cs:
+ query = cs["query"]
+
+ # Render at most the first 10 preferences into the user-sim prompt;
+ # dict-form preferences become "When <condition>, <action>" lines.
+ user_prefs = profile.get("preferences", [])
+ if isinstance(user_prefs, list) and user_prefs:
+ if isinstance(user_prefs[0], dict):
+ pref_str = "\n".join([f"- When {p.get('condition','')}, {p.get('action','')}" for p in user_prefs[:10]])
+ else:
+ pref_str = "\n".join([f"- {p}" for p in user_prefs[:10]])
+ else:
+ pref_str = str(user_prefs)
+
+ user_persona = profile.get("persona", "A user seeking help with problem solving.")
+ adapter = adapters[profile_idx]
+ if hasattr(adapter, 'start_session'):
+ adapter.start_session(
+ user_id=profile.get("user_id", f"user_{profile_idx}"),
+ user_profile={"preferences": user_prefs, "persona": user_persona}
+ )
+
+ # Seed the conversation with an assistant greeting so the user
+ # simulator (role-reversed) speaks first.
+ all_states[profile_idx] = {
+ "conversation": [{"role": "assistant", "content": "How can I help you today?"}],
+ "full_log": [],
+ "system_prompt": USER_PROMPT_TEMPLATE.format(
+ problem=query, user_persona=user_persona, user_preferences=pref_str),
+ "problem_dict": problem_dict,
+ "is_conflict": is_conflict,
+ "enforcement_count": 0,
+ }
+ active_set.add(profile_idx)
+
+ # Turn-synchronous loop: every active conversation advances one
+ # (user, agent) exchange per iteration so LLM calls can be batched.
+ for turn in range(self.config.max_turns_per_session):
+ if not active_set:
+ break
+
+ # Batch user calls
+ active_list = sorted(active_set)
+ user_msgs_batch = []
+ for pidx in active_list:
+ state = all_states[pidx]
+ msgs = [{"role": "system", "content": state["system_prompt"]}]
+ msgs.extend(reverse_roles(state["conversation"]))
+ user_msgs_batch.append(msgs)
+
+ user_responses = user_client.batch_completion(user_msgs_batch)
+
+ # Process user responses and prepare agent prompts for batching
+ to_remove = []
+ agent_prompts_batch = [] # List of (pidx, messages, context)
+ for i, pidx in enumerate(active_list):
+ state = all_states[pidx]
+ parsed = parse_user_response(user_responses[i])
+
+ # None means the user-sim call failed outright; drop the conversation.
+ if parsed is None:
+ to_remove.append(pidx)
+ continue
+
+ user_msg = str(parsed.get("response", ""))
+ state["conversation"].append({"role": "user", "content": user_msg})
+ state["full_log"].append(parsed)
+
+ if parsed.get("enforce_preferences", False):
+ state["enforcement_count"] += 1
+
+ # Terminated conversations keep the final user message but get no
+ # agent reply for this turn.
+ if parsed.get("should_terminate", False) or TERMINATION_SIGNAL in user_msg:
+ to_remove.append(pidx)
+ continue
+
+ # Prepare agent prompt for batching (don't call LLM yet)
+ try:
+ adapter = adapters[pidx]
+ if hasattr(adapter, 'prepare_prompt'):
+ # History excludes the just-appended user message ([:-1]);
+ # the adapter receives it separately as user_msg.
+ messages, context = adapter.prepare_prompt(user_msg, state["conversation"][:-1])
+ agent_prompts_batch.append((pidx, messages, context))
+ elif hasattr(adapter, 'generate_response'):
+ # Fallback for adapters without prepare_prompt
+ agent_prompts_batch.append((pidx, None, None))
+ else:
+ state["conversation"].append({"role": "assistant", "content": "[Error: Adapter not configured]"})
+ except Exception as e:
+ logger.error(f" Agent prepare error p{pidx} t{turn}: {e}")
+ state["conversation"].append({"role": "assistant", "content": "I apologize, I encountered an error. Could you rephrase?"})
+
+ # Batch vLLM call for all agent prompts
+ if agent_prompts_batch:
+ # Separate prompts that can be batched from fallback
+ batchable = [(pidx, msgs, ctx) for pidx, msgs, ctx in agent_prompts_batch if msgs is not None]
+ fallback = [(pidx, msgs, ctx) for pidx, msgs, ctx in agent_prompts_batch if msgs is None]
+
+ # Batch call for batchable prompts
+ if batchable:
+ batch_messages = [msgs for _, msgs, _ in batchable]
+ batch_responses = agent_client.batch_completion(batch_messages)
+
+ # Process batched responses
+ for (pidx, _, context), response in zip(batchable, batch_responses):
+ try:
+ adapter = adapters[pidx]
+ state = all_states[pidx]
+ if response is not None:
+ result = adapter.process_response(response, context)
+ agent_content = result.get("response", str(result)) if isinstance(result, dict) else str(result)
+ else:
+ agent_content = "I apologize, I encountered an error. Could you rephrase?"
+ state["conversation"].append({"role": "assistant", "content": agent_content})
+ except Exception as e:
+ logger.error(f" Agent process error p{pidx} t{turn}: {e}")
+ all_states[pidx]["conversation"].append({"role": "assistant", "content": "I apologize, I encountered an error. Could you rephrase?"})
+
+ # Handle fallback (adapters without prepare_prompt - sequential calls)
+ for pidx, _, _ in fallback:
+ try:
+ adapter = adapters[pidx]
+ state = all_states[pidx]
+ user_msg = state["conversation"][-1]["content"]
+ resp = adapter.generate_response(user_msg, state["conversation"][:-1])
+ agent_content = resp.get("response", str(resp)) if isinstance(resp, dict) else str(resp)
+ state["conversation"].append({"role": "assistant", "content": agent_content})
+ except Exception as e:
+ logger.error(f" Agent fallback error p{pidx} t{turn}: {e}")
+ all_states[pidx]["conversation"].append({"role": "assistant", "content": "I apologize, I encountered an error. Could you rephrase?"})
+
+ # Deactivate failed/terminated conversations only after the agent batch,
+ # so their final user message is still logged above.
+ active_set -= set(to_remove)
+
+ # Save results for this session round
+ for profile_idx in profiles_to_run:
+ if profile_idx not in all_states:
+ continue
+ state = all_states[profile_idx]
+ problem_dict = state["problem_dict"]
+ conversation = state["conversation"]
+ full_log = state["full_log"]
+
+ # NOTE(review): "token" counts here are whitespace-split word counts,
+ # an approximation of real tokenizer counts — fine for relative
+ # comparison, not for billing/context-length math.
+ user_tokens = sum(len(m["content"].split()) for m in conversation if m["role"] == "user")
+ agent_tokens = sum(len(m["content"].split()) for m in conversation if m["role"] == "assistant")
+
+ enforcement_count = state["enforcement_count"]
+ # Heuristic success: the user terminated with a non-trivial draft
+ # answer (>20 chars) that doesn't admit ignorance.
+ task_success = 0
+ for entry in full_log:
+ if entry.get("should_terminate", False):
+ draft = entry.get("draft_answer", "")
+ if draft and "don't know" not in draft.lower() and len(draft) > 20:
+ task_success = 1
+
+ results.append({
+ "method": method,
+ "profile_id": self.profiles[profile_idx].get("user_id", f"user_{profile_idx}"),
+ "problem_id": f"session_{session_idx}",
+ "problem": problem_dict.get("problem", ""),
+ "ground_truth_solution": problem_dict.get("solution", ""),
+ "is_conflict_test": state["is_conflict"],
+ "conversation": {"turns": conversation},
+ "full_user_log": full_log,
+ "metrics": {
+ "task_success": bool(task_success),
+ "total_turns": len(conversation),
+ "user_token_count": user_tokens,
+ "agent_token_count": agent_tokens,
+ "total_token_count": user_tokens + agent_tokens,
+ "enforcement_count": enforcement_count,
+ # Placeholders below are filled by downstream judging, if any.
+ "disappointment_count": 0,
+ "preference_compliance_scores": [],
+ "conflict_resolution_accuracy": 0,
+ "over_personalization_rate": 0,
+ },
+ "adapter_metrics": {},
+ })
+
+ # Checkpoint after each session round with session-level tracking
+ # Only increment for profiles that actually ran in this round (those in all_states)
+ for profile_idx in all_states.keys():
+ sessions_per_profile[str(profile_idx)] = sessions_per_profile.get(str(profile_idx), 0) + 1
+ if sessions_per_profile[str(profile_idx)] >= self.config.n_sessions_per_profile:
+ completed_profiles.add(profile_idx)
+
+ # Rewrite both checkpoint and full results every round so a killed job
+ # can resume at the last completed session.
+ with open(checkpoint_file, "w") as f:
+ json.dump({
+ "completed_profiles": sorted(list(completed_profiles)),
+ "sessions_per_profile": sessions_per_profile
+ }, f)
+ with open(results_file, "w") as f:
+ json.dump(results, f, indent=2)
+
+ elapsed = time.time() - start_time
+ sessions_done = len(results)
+ rate = sessions_done / elapsed * 3600 if elapsed > 0 else 0
+ logger.info(f" Session round {session_idx+1}/{n_sessions}: {sessions_done} total, {rate:.0f} sessions/hr")
+
+ # Explicitly free adapter models to prevent GPU OOM across methods
+ for pidx, adapter in adapters.items():
+ if hasattr(adapter, 'cleanup'):
+ adapter.cleanup()
+ del adapters
+
+ return results
+
+ def run_all(self) -> Dict[str, Any]:
+ """Run all methods and generate comparative analysis.
+
+ Returns the analysis dict produced by _analyze_results; also writes
+ analysis.json and a markdown report to self.output_dir as side effects.
+ """
+ all_results = {}
+
+ for method in self.config.methods:
+ if method not in AVAILABLE_METHODS:
+ logger.warning(f"Unknown method: {method}, skipping")
+ continue
+
+ results = self.run_method(method)
+ all_results[method] = results
+
+ # Free GPU memory between methods to prevent OOM on later adapters
+ # (best-effort: both cleanup paths are optional imports).
+ try:
+ from personalization.serving.personalized_llm import clear_shared_models
+ clear_shared_models()
+ except ImportError:
+ pass
+ try:
+ import gc
+ import torch
+ gc.collect()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ logger.info(f" GPU memory freed after {method}: {torch.cuda.memory_allocated()/1e9:.1f}GB allocated")
+ except ImportError:
+ pass
+
+ # Comparative analysis
+ analysis = self._analyze_results(all_results)
+
+ # Save analysis
+ with open(self.output_dir / "analysis.json", "w") as f:
+ json.dump(analysis, f, indent=2)
+
+ # Generate report
+ self._generate_report(analysis)
+
+ return analysis
+
+ def _analyze_results(self, all_results: Dict[str, List[Dict]]) -> Dict:
+ """Analyze results across all methods.
+
+ Aggregates per-session metrics into per-method averages, then picks a
+ best method per metric (direction-aware). Methods with zero sessions
+ are silently dropped from both tables.
+ """
+ analysis = {
+ "per_method": {},
+ "comparison": {},
+ }
+
+ for method, results in all_results.items():
+ n = len(results)
+ if n == 0:
+ continue
+
+ # Aggregate metrics
+ task_success = sum(r["metrics"]["task_success"] for r in results) / n
+ avg_user_tokens = sum(r["metrics"]["user_token_count"] for r in results) / n
+ avg_total_tokens = sum(r["metrics"]["total_token_count"] for r in results) / n
+ avg_enforcement = sum(r["metrics"]["enforcement_count"] for r in results) / n
+ avg_turns = sum(r["metrics"]["total_turns"] for r in results) / n
+
+ # Compliance and conflict metrics
+ # Sessions with no per-preference scores default to 0.5 (neutral prior).
+ compliance_scores = [
+ sum(r["metrics"]["preference_compliance_scores"]) / len(r["metrics"]["preference_compliance_scores"])
+ if r["metrics"]["preference_compliance_scores"] else 0.5
+ for r in results
+ ]
+ avg_compliance = sum(compliance_scores) / len(compliance_scores)
+
+ # Conflict accuracy is averaged over conflict-flagged sessions only.
+ conflict_results = [r for r in results if r["is_conflict_test"]]
+ conflict_accuracy = sum(
+ r["metrics"]["conflict_resolution_accuracy"] for r in conflict_results
+ ) / len(conflict_results) if conflict_results else 0
+
+ over_personalization = sum(
+ r["metrics"]["over_personalization_rate"] for r in results
+ ) / n
+
+ analysis["per_method"][method] = {
+ "n_sessions": n,
+ "task_success_rate": task_success,
+ "avg_user_tokens": avg_user_tokens,
+ "avg_total_tokens": avg_total_tokens,
+ "avg_enforcement_count": avg_enforcement,
+ "avg_turns": avg_turns,
+ "avg_preference_compliance": avg_compliance,
+ "conflict_resolution_accuracy": conflict_accuracy,
+ "over_personalization_rate": over_personalization,
+ }
+
+ # Comparison
+ metrics_to_compare = [
+ ("task_success_rate", True), # higher is better
+ ("avg_user_tokens", False), # lower is better
+ ("avg_total_tokens", False), # lower is better
+ ("avg_enforcement_count", False), # lower is better
+ ("avg_preference_compliance", True), # higher is better
+ ("conflict_resolution_accuracy", True), # higher is better
+ ("over_personalization_rate", False), # lower is better
+ ]
+
+ for metric, higher_better in metrics_to_compare:
+ values = {m: analysis["per_method"][m][metric] for m in analysis["per_method"]}
+ if not values:
+ logger.warning(f"No values for metric {metric}, skipping comparison")
+ continue
+ if higher_better:
+ best = max(values, key=values.get)
+ else:
+ best = min(values, key=values.get)
+
+ analysis["comparison"][metric] = {
+ "values": values,
+ "best_method": best,
+ "best_value": values[best],
+ }
+
+ return analysis
+
+ def _generate_report(self, analysis: Dict) -> None:
+ """Generate a human-readable report.
+
+ Renders a markdown comparison table plus method-pair highlights and
+ writes it to <output_dir>/report.md.
+ """
+ report_lines = [
+ "# Personalization Experiment Report",
+ f"\nGenerated: {datetime.now().isoformat()}",
+ f"\nConfig: {self.config.n_profiles} profiles, {self.config.n_sessions_per_profile} sessions each",
+ "\n## Method Comparison\n",
+ ]
+
+ # Create comparison table
+ metrics_display = [
+ ("Task Success", "task_success_rate", "{:.1%}"),
+ ("User Effort (tokens)", "avg_user_tokens", "{:.0f}"),
+ ("Total Tokens", "avg_total_tokens", "{:.0f}"),
+ ("Enforcement Count", "avg_enforcement_count", "{:.2f}"),
+ ("Preference Compliance", "avg_preference_compliance", "{:.1%}"),
+ ("Conflict Resolution", "conflict_resolution_accuracy", "{:.1%}"),
+ ("Over-personalization", "over_personalization_rate", "{:.1%}"),
+ ]
+
+ methods = list(analysis["per_method"].keys())
+
+ # Header
+ header = "| Metric |" + "|".join(f" {m} " for m in methods) + "| Best |"
+ separator = "|" + "|".join(["-" * (len(m) + 2) for m in ["Metric"] + methods + ["Best"]]) + "|"
+
+ report_lines.extend([header, separator])
+
+ for display_name, metric_key, fmt in metrics_display:
+ row = f"| {display_name} |"
+ for m in methods:
+ val = analysis["per_method"].get(m, {}).get(metric_key, 0)
+ row += f" {fmt.format(val)} |"
+
+ if metric_key in analysis.get("comparison", {}):
+ best = analysis["comparison"][metric_key]["best_method"]
+ else:
+ best = "N/A"
+ row += f" {best} |"
+ report_lines.append(row)
+
+ # Key findings
+ report_lines.extend([
+ "\n## Key Findings\n",
+ ])
+
+ # Find advantages of proposed methods
+ rag_vector = analysis["per_method"].get("rag_vector", {})
+ rag = analysis["per_method"].get("rag", {})
+ contextual = analysis["per_method"].get("contextual", {})
+ all_memory = analysis["per_method"].get("all_memory", {})
+
+ if rag_vector and contextual:
+ # NOTE(review): the .get(..., 1) default only applies when the key is
+ # MISSING; if "avg_total_tokens" is present with value 0 this divides
+ # by zero. Consider `or 1` if zero-token runs are possible — verify.
+ token_reduction = (contextual.get("avg_total_tokens", 0) - rag_vector.get("avg_total_tokens", 0)) / contextual.get("avg_total_tokens", 1) * 100
+ report_lines.append(f"- **Token Efficiency**: RAG+Vector uses {token_reduction:.1f}% fewer tokens than contextual memory")
+
+ if rag_vector and all_memory:
+ conflict_improvement = rag_vector.get("conflict_resolution_accuracy", 0) - all_memory.get("conflict_resolution_accuracy", 0)
+ report_lines.append(f"- **Conflict Resolution**: RAG+Vector improves by {conflict_improvement:.1%} over all-memory baseline")
+
+ if rag_vector:
+ report_lines.append(f"- **Over-personalization**: RAG+Vector rate: {rag_vector.get('over_personalization_rate', 0):.1%}")
+
+ # Save report
+ report_path = self.output_dir / "report.md"
+ with open(report_path, "w") as f:
+ f.write("\n".join(report_lines))
+
+ logger.info(f"Report saved to {report_path}")
+
+
+def main():
+ """CLI entry point: parse args, build ExperimentConfig, run all methods."""
+ parser = argparse.ArgumentParser(description="Run personalization experiments")
+ parser.add_argument("--config", type=str, help="Path to config YAML file")
+ parser.add_argument("--methods", type=str, default="vanilla,contextual,rag,rag_vector",
+ help="Comma-separated list of methods to compare")
+ parser.add_argument("--datasets", type=str, default="math-hard,math-500,bigcodebench",
+ help="Comma-separated list of datasets")
+ parser.add_argument("--n-profiles", type=int, default=200, help="Number of user profiles")
+ parser.add_argument("--n-sessions", type=int, default=30, help="Sessions per profile")
+ parser.add_argument("--max-turns", type=int, default=15, help="Max turns per session")
+ parser.add_argument("--output-dir", type=str, default="results", help="Output directory")
+ parser.add_argument("--profile-path", type=str, help="Path to pre-generated profiles")
+ parser.add_argument("--start-profile", type=int, default=0,
+ help="Start profile index (inclusive, 0-indexed)")
+ parser.add_argument("--end-profile", type=int, default=None,
+ help="End profile index (exclusive). If not set, runs all profiles from start")
+
+ # vLLM and parallel processing options
+ parser.add_argument("--use-vllm", action="store_true",
+ help="Use vLLM servers for inference (much faster)")
+ parser.add_argument("--vllm-user-url", type=str, default="http://localhost:8004/v1",
+ help="vLLM server URL for user simulator (70B)")
+ parser.add_argument("--vllm-agent-url", type=str, default="http://localhost:8003/v1",
+ help="vLLM server URL for agent (8B)")
+ # OpenAI user agent options
+ parser.add_argument("--use-openai-user", action="store_true",
+ help="Use OpenAI API (GPT-5) for user simulation instead of vLLM")
+ parser.add_argument("--openai-user-model", type=str, default="gpt-5",
+ help="OpenAI model name for user simulator (default: gpt-5)")
+ parser.add_argument("--reward-mode", type=str, default="keyword", choices=["keyword", "llm"],
+ help="Reward mode for RL updates: 'keyword' (user signals) or 'llm' (GPT-5-nano judge)")
+
+ parser.add_argument("--parallel-profiles", type=int, default=50,
+ help="Number of profiles to process in parallel (requires --use-vllm)")
+ # NOTE(review): with default=True this store_true flag is a no-op;
+ # only --no-batch-processing can change the value.
+ parser.add_argument("--use-batch-processing", action="store_true", default=True,
+ help="Use turn-synchronous batch processing for vanilla/all_memory")
+ parser.add_argument("--no-batch-processing", action="store_false", dest="use_batch_processing",
+ help="Disable batch processing")
+ parser.add_argument("--batch-size", type=int, default=50,
+ help="Number of conversations to batch together")
+ parser.add_argument("--continue-from", type=str, default=None,
+ help="Path to existing output directory to continue from (for extending sessions)")
+
+ args = parser.parse_args()
+
+ # Load or create config
+ # NOTE(review): when --config is given, all other CLI flags are ignored
+ # (the YAML fully determines the config) — confirm that is intended.
+ if args.config and Path(args.config).exists():
+ with open(args.config) as f:
+ config_dict = yaml.safe_load(f)
+ config = ExperimentConfig(**config_dict)
+ else:
+ config = ExperimentConfig(
+ methods=args.methods.split(","),
+ datasets=args.datasets.split(","),
+ n_profiles=args.n_profiles,
+ n_sessions_per_profile=args.n_sessions,
+ max_turns_per_session=args.max_turns,
+ output_dir=args.output_dir,
+ profile_path=args.profile_path,
+ start_profile=args.start_profile,
+ end_profile=args.end_profile,
+ use_vllm=args.use_vllm,
+ vllm_user_url=args.vllm_user_url,
+ vllm_agent_url=args.vllm_agent_url,
+ use_openai_user=args.use_openai_user,
+ openai_user_model=args.openai_user_model,
+ reward_mode=args.reward_mode,
+ parallel_profiles=args.parallel_profiles,
+ use_batch_processing=args.use_batch_processing,
+ batch_size_conversations=args.batch_size,
+ continue_from=args.continue_from,
+ )
+
+ # Run experiments
+ runner = ExperimentRunner(config)
+ analysis = runner.run_all()
+
+ print("\n" + "=" * 60)
+ print("EXPERIMENT COMPLETE")
+ print("=" * 60)
+ print(f"\nResults saved to: {runner.output_dir}")
+ if analysis.get("comparison"):
+ print("\nBest methods per metric:")
+ for metric, data in analysis["comparison"].items():
+ print(f" {metric}: {data['best_method']} ({data['best_value']:.3f})")
+ else:
+ print("\nNo comparison data available (sessions may have failed)")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/collaborativeagents/scripts/run_fp8.sh b/collaborativeagents/scripts/run_fp8.sh
new file mode 100644
index 0000000..54537fa
--- /dev/null
+++ b/collaborativeagents/scripts/run_fp8.sh
@@ -0,0 +1,65 @@
+# vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8004 --tensor-parallel-size 4 --max-model-len 16384 --gpu-memory-utilization 0.9 --quantization fp8 --enforce-eager
+# python -m sglang.launch_server --model-path meta-llama/Llama-3.3-70B-Instruct --port 8004 --tp-size 4 --context-length 16384 --mem-fraction-static 0.9 --quantization fp8
+
+# NOTE(review): no shebang — this script must be invoked via `bash run_fp8.sh`.
+# Only the agent_with_reflection experiment is active; the other experiment
+# variants below are kept commented out for reference.
+
+BATCH_SIZE=1
+# BATCH_SIZE=20
+
+# Loop over eval sizes and datasets
+for EVAL_SIZE in 5; do
+ for DATASET in logiqa; do # mmlu medqa humaneval bigcodebench math-500 math-hard
+ # Convert dataset name for file paths (replace - with _)
+ DATASET_FILE=$(echo ${DATASET} | tr '-' '_')
+
+ echo "Running experiments for dataset: ${DATASET} with eval_size ${EVAL_SIZE}"
+
+ # # no_user experiment
+ # python3 run.py --experiment_type no_user --dataset ${DATASET} --eval_size ${EVAL_SIZE} --batch_size ${BATCH_SIZE} \
+ # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
+ # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
+ # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/no_user/${DATASET_FILE}_llama70b_user_llama70b_agent_no_user_eval_size_${EVAL_SIZE}.jsonl \
+ # >> ./runs/llama70b_fp8/no_user/${DATASET_FILE}_llama70b_user_llama70b_agent_no_user_eval_size_${EVAL_SIZE}.out 2>&1
+
+ # # user_no_profile experiment
+ # python3 run.py --experiment_type user_no_profile --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
+ # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
+ # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
+ # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
+ # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/user_no_profile/${DATASET_FILE}_llama70b_user_llama70b_agent_user_no_profile_eval_size_${EVAL_SIZE}.jsonl \
+ # >> ./runs/llama70b_fp8/user_no_profile/${DATASET_FILE}_llama70b_user_llama70b_agent_user_no_profile_eval_size_${EVAL_SIZE}.out 2>&1
+
+ # # user_profiles_without_preferences experiment
+ # python3 run.py --experiment_type user_profiles_without_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
+ # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
+ # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
+ # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
+ # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.jsonl \
+ # >> ./runs/llama70b_fp8/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.out 2>&1
+
+ # # user_profiles_with_preferences experiment
+ # python3 run.py --experiment_type user_profiles_with_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
+ # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
+ # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
+ # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
+ # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.jsonl \
+ # >> ./runs/llama70b_fp8/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.out 2>&1
+
+ # # agent_with_user_preferences experiment
+ # python3 run.py --experiment_type agent_with_user_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
+ # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
+ # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
+ # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
+ # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.jsonl \
+ # >> ./runs/llama70b_fp8/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.out 2>&1
+
+ # agent_with_reflection experiment
+ # NOTE(review): the >> redirect assumes ./runs/llama70b_fp8/agent_with_reflection/
+ # already exists; consider a mkdir -p before running — verify.
+ python3 run.py --experiment_type agent_with_reflection --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
+ --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
+ --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
+ --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
+ --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/agent_with_reflection/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.jsonl \
+ >> ./runs/llama70b_fp8/agent_with_reflection/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.out 2>&1
+
+
+ done
+done \ No newline at end of file
diff --git a/collaborativeagents/scripts/run_preflight_test.sh b/collaborativeagents/scripts/run_preflight_test.sh
new file mode 100755
index 0000000..8647a0b
--- /dev/null
+++ b/collaborativeagents/scripts/run_preflight_test.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# Run pre-flight tests before full experiments
+# Spins up two 8B vLLM servers (user simulator + agent), runs
+# scripts/preflight_test.py against them, then tears the servers down.
+
+set -e
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PORT_USER=8004
+PORT_AGENT=8003
+
+echo "============================================"
+echo "Pre-Flight Tests for Full Experiments"
+echo "============================================"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+echo ""
+
+# Kill any existing servers
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start servers
+echo "Starting 8B user simulator (GPU 0-1)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B \
+ --port $PORT_USER \
+ --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.85 \
+ --max-model-len 4096 \
+ --disable-log-requests \
+ --dtype bfloat16 &
+SERVER_USER_PID=$!
+
+echo "Starting 8B agent (GPU 2-3)..."
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B \
+ --port $PORT_AGENT \
+ --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.85 \
+ --max-model-len 4096 \
+ --disable-log-requests \
+ --dtype bfloat16 &
+SERVER_AGENT_PID=$!
+
+# Poll /health on both ports; up to ~300s (100 x 3s) before the hard check below.
+echo "Waiting for servers..."
+for i in $(seq 1 100); do
+ READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
+ READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
+ if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
+ echo "Both servers ready after $((i*3))s"
+ break
+ fi
+ if [ $((i % 20)) -eq 0 ]; then
+ echo " Still waiting... ($((i*3))s)"
+ fi
+ sleep 3
+done
+
+if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
+ echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
+fi
+if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
+ echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
+fi
+echo "✓ Both servers healthy"
+echo ""
+
+# Run pre-flight tests
+# NOTE(review): with `set -e` active, a nonzero exit from this python call
+# aborts the script immediately, so TEST_RESULT below is only ever 0 and the
+# cleanup kill never runs on failure — consider `set +e` around this call.
+python scripts/preflight_test.py \
+ http://localhost:$PORT_USER/v1 \
+ http://localhost:$PORT_AGENT/v1
+
+TEST_RESULT=$?
+
+# Cleanup
+echo ""
+echo "Cleaning up..."
+kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true
+
+echo ""
+date
+
+exit $TEST_RESULT
diff --git a/collaborativeagents/scripts/scale_test_batch1.sbatch b/collaborativeagents/scripts/scale_test_batch1.sbatch
new file mode 100644
index 0000000..119be9b
--- /dev/null
+++ b/collaborativeagents/scripts/scale_test_batch1.sbatch
@@ -0,0 +1,121 @@
+#!/bin/bash
+#SBATCH --job-name=scale_b1
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b1-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b1-%j.err
+
+# Scale Test Batch 1: Users 1-5, 15 sessions each, 3 methods
+# With CollaborativeAgents-style prompts
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+MEMORY_STORE="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store"
+
+echo "=== Scale Test Batch 1: 5 users × 15 sessions × 3 methods ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Start vLLM servers: 70B user simulator on GPUs 0-1 (port 8004),
+# 8B agent on GPUs 2-3 (port 8003).
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model $USER_MODEL \
+ --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+ --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+ --model $AGENT_MODEL \
+ --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
+ --max-model-len 16384 --dtype bfloat16 &
+
+# NOTE(review): these wait loops fall through after the timeout without
+# failing the job if a server never becomes healthy — verify intended.
+echo "Waiting for vLLM servers..."
+for i in {1..200}; do
+ if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+ echo "User simulator ready after $((i*5))s"
+ break
+ fi
+ sleep 5
+done
+for i in {1..60}; do
+ if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+ echo "Agent ready after $((i*5))s"
+ break
+ fi
+ sleep 5
+done
+sleep 5
+
+OUTPUT_DIR="../results/scale_test_$(date +%Y%m%d_%H%M%S)"
+
+# Run each method with 5 profiles, 15 sessions
+for METHOD in vanilla rag rag_vector; do
+ echo ""
+ echo "============================================"
+ echo "Testing: $METHOD (5 users × 15 sessions)"
+ echo "============================================"
+
+ # Clear memory store before each method
+ > ${MEMORY_STORE}/memory_cards.jsonl
+ rm -f ${MEMORY_STORE}/memory_embeddings.npy
+
+ date
+ python scripts/run_experiments.py --methods $METHOD \
+ --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+ --use-vllm --no-batch-processing --parallel-profiles 1 \
+ --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+ echo "Method $METHOD completed"
+ if [ "$METHOD" != "vanilla" ]; then
+ echo "Final memory cards: $(wc -l < ${MEMORY_STORE}/memory_cards.jsonl 2>/dev/null || echo 0)"
+ fi
+done
+
+echo ""
+echo "=== Scale Test Batch 1 Complete ==="
+date
+
+# Generate comparison (inline Python summarizer over the newest scale_test_* dir)
+python3 << 'PYEOF'
+import json
+from pathlib import Path
+
+output_base = sorted(Path("../results").glob("scale_test_*"))[-1]
+print(f"\n=== Results Summary ===\nDir: {output_base}\n")
+
+methods = ["vanilla", "rag", "rag_vector"]
+results = {}
+
+# NOTE(review): assumes layout <run_dir>/<method>/results.json one level below
+# the output dir — confirm this matches run_experiments.py's output structure.
+for subdir in output_base.iterdir():
+ if subdir.is_dir():
+ for method in methods:
+ result_file = subdir / method / "results.json"
+ if result_file.exists() and method not in results:
+ with open(result_file) as f:
+ results[method] = json.load(f)
+
+if results:
+ print(f"{'Method':<12} {'Success':<10} {'Turns':<10} {'Enforce':<10} {'Sessions':<10}")
+ print("-" * 55)
+ for method in methods:
+ if method in results:
+ data = results[method]
+ n = len(data)
+ succ = sum(r['metrics']['task_success'] for r in data) / n
+ turns = sum(r['metrics']['total_turns'] for r in data) / n
+ enf = sum(r['metrics']['enforcement_count'] for r in data) / n
+ print(f"{method:<12} {succ:<10.1%} {turns:<10.1f} {enf:<10.1f} {n:<10}")
+PYEOF
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/scale_test_batch2.sbatch b/collaborativeagents/scripts/scale_test_batch2.sbatch
new file mode 100644
index 0000000..6a1fb27
--- /dev/null
+++ b/collaborativeagents/scripts/scale_test_batch2.sbatch
@@ -0,0 +1,126 @@
+#!/bin/bash
+#SBATCH --job-name=scale_b2
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b2-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b2-%j.err
+
+# Scale Test Batch 2: Users 6-10, 15 sessions each, 3 methods
+# With CollaborativeAgents-style prompts
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+# Batch-2-specific store (empty_store_b2) so memory cards never mix with batch 1's run.
+MEMORY_STORE="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store_b2"
+
+echo "=== Scale Test Batch 2: 5 users × 15 sessions × 3 methods ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Create separate memory store for batch 2
+mkdir -p ${MEMORY_STORE}
+> ${MEMORY_STORE}/memory_cards.jsonl
+
+# Start vLLM servers
+# 70B user simulator on GPUs 0,1 at 90% memory; 8B agent on GPUs 2,3 at 45%
+# (45% presumably leaves headroom for embedding/reranker models, as documented
+# in the sibling smallscale_test.sbatch — confirm against the RAG adapters).
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+  --model $USER_MODEL \
+  --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+  --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+  --model $AGENT_MODEL \
+  --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
+  --max-model-len 16384 --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+# Poll health endpoints: up to ~1000s for the 70B user model, ~300s for the 8B agent.
+for i in {1..200}; do
+  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+    echo "User simulator ready after $((i*5))s"
+    break
+  fi
+  sleep 5
+done
+for i in {1..60}; do
+  if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+    echo "Agent ready after $((i*5))s"
+    break
+  fi
+  sleep 5
+done
+sleep 5
+
+OUTPUT_DIR="../results/scale_test_b2_$(date +%Y%m%d_%H%M%S)"
+
+# Run each method with profiles 6-10 (skip first 5)
+for METHOD in vanilla rag rag_vector; do
+  echo ""
+  echo "============================================"
+  echo "Testing: $METHOD (users 6-10 × 15 sessions)"
+  echo "============================================"
+
+  # Clear memory store before each method
+  > ${MEMORY_STORE}/memory_cards.jsonl
+  rm -f ${MEMORY_STORE}/memory_embeddings.npy
+
+  date
+  python scripts/run_experiments.py --methods $METHOD \
+    --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+    --use-vllm --no-batch-processing --parallel-profiles 1 \
+    --profile-offset 5 \
+    --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+  echo "Method $METHOD completed"
+  if [ "$METHOD" != "vanilla" ]; then
+    echo "Final memory cards: $(wc -l < ${MEMORY_STORE}/memory_cards.jsonl 2>/dev/null || echo 0)"
+  fi
+done
+
+echo ""
+echo "=== Scale Test Batch 2 Complete ==="
+date
+
+# Generate comparison
+python3 << 'PYEOF'
+import json
+from pathlib import Path
+
+# Lexicographically latest scale_test_b2_* dir == the run started above (timestamped names).
+output_base = sorted(Path("../results").glob("scale_test_b2_*"))[-1]
+print(f"\n=== Results Summary (Batch 2) ===\nDir: {output_base}\n")
+
+methods = ["vanilla", "rag", "rag_vector"]
+results = {}
+
+# First results.json found per method wins; duplicates in other subdirs are ignored.
+for subdir in output_base.iterdir():
+    if subdir.is_dir():
+        for method in methods:
+            result_file = subdir / method / "results.json"
+            if result_file.exists() and method not in results:
+                with open(result_file) as f:
+                    results[method] = json.load(f)
+
+if results:
+    print(f"{'Method':<12} {'Success':<10} {'Turns':<10} {'Enforce':<10} {'Sessions':<10}")
+    print("-" * 55)
+    for method in methods:
+        if method in results:
+            data = results[method]
+            n = len(data)
+            succ = sum(r['metrics']['task_success'] for r in data) / n
+            turns = sum(r['metrics']['total_turns'] for r in data) / n
+            enf = sum(r['metrics']['enforcement_count'] for r in data) / n
+            print(f"{method:<12} {succ:<10.1%} {turns:<10.1f} {enf:<10.1f} {n:<10}")
+PYEOF
+
+# Best-effort cleanup of the background vLLM servers.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/scale_test_ctx_refl.sbatch b/collaborativeagents/scripts/scale_test_ctx_refl.sbatch
new file mode 100644
index 0000000..1055e16
--- /dev/null
+++ b/collaborativeagents/scripts/scale_test_ctx_refl.sbatch
@@ -0,0 +1,114 @@
+#!/bin/bash
+#SBATCH --job-name=scale_cr
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_cr-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_cr-%j.err
+
+# Scale Test: Contextual and Reflection methods
+# 5 users × 15 sessions × 2 methods = 150 sessions
+# With CollaborativeAgents-style prompts
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== Scale Test: Contextual & Reflection (5 users × 15 sessions × 2 methods) ==="
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Start vLLM servers
+# NOTE: the agent server uses 0.90 here, unlike the 0.45 used in the RAG batches —
+# contextual/reflection presumably load no embedding/reranker models; confirm.
+# Also no MEMORY_STORE is created/cleared here, consistent with these two methods.
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+  --model $USER_MODEL \
+  --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+  --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+  --model $AGENT_MODEL \
+  --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+  --max-model-len 16384 --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+# Poll health endpoints: up to ~1000s for the 70B user model, ~300s for the 8B agent.
+for i in {1..200}; do
+  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+    echo "User simulator ready after $((i*5))s"
+    break
+  fi
+  sleep 5
+done
+for i in {1..60}; do
+  if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+    echo "Agent ready after $((i*5))s"
+    break
+  fi
+  sleep 5
+done
+sleep 5
+
+OUTPUT_DIR="../results/scale_test_ctx_refl_$(date +%Y%m%d_%H%M%S)"
+
+# Run contextual and reflection methods
+for METHOD in contextual reflection; do
+  echo ""
+  echo "============================================"
+  echo "Testing: $METHOD (5 users × 15 sessions)"
+  echo "============================================"
+
+  date
+  python scripts/run_experiments.py --methods $METHOD \
+    --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+    --use-vllm --no-batch-processing --parallel-profiles 1 \
+    --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+  echo "Method $METHOD completed"
+done
+
+echo ""
+echo "=== Contextual & Reflection Test Complete ==="
+date
+
+# Generate comparison
+python3 << 'PYEOF'
+import json
+from pathlib import Path
+
+# Latest timestamped run dir created above.
+output_base = sorted(Path("../results").glob("scale_test_ctx_refl_*"))[-1]
+print(f"\n=== Results Summary (Contextual & Reflection) ===\nDir: {output_base}\n")
+
+methods = ["contextual", "reflection"]
+results = {}
+
+# First results.json found per method wins; duplicates in other subdirs are ignored.
+for subdir in output_base.iterdir():
+    if subdir.is_dir():
+        for method in methods:
+            result_file = subdir / method / "results.json"
+            if result_file.exists() and method not in results:
+                with open(result_file) as f:
+                    results[method] = json.load(f)
+
+if results:
+    print(f"{'Method':<12} {'Success':<10} {'Turns':<10} {'Enforce':<10} {'Sessions':<10}")
+    print("-" * 55)
+    for method in methods:
+        if method in results:
+            data = results[method]
+            n = len(data)
+            succ = sum(r['metrics']['task_success'] for r in data) / n
+            turns = sum(r['metrics']['total_turns'] for r in data) / n
+            enf = sum(r['metrics']['enforcement_count'] for r in data) / n
+            print(f"{method:<12} {succ:<10.1%} {turns:<10.1f} {enf:<10.1f} {n:<10}")
+PYEOF
+
+# Best-effort cleanup of the background vLLM servers.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/smallscale_test.sbatch b/collaborativeagents/scripts/smallscale_test.sbatch
new file mode 100644
index 0000000..774575e
--- /dev/null
+++ b/collaborativeagents/scripts/smallscale_test.sbatch
@@ -0,0 +1,87 @@
+#!/bin/bash
+#SBATCH --job-name=smalltest
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=02:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/smalltest-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/smalltest-%j.err
+
+# Small-scale test: 5 profiles, 5 sessions, all 6 methods
+# Full settings (70B user sim, 8B agent) but fewer questions
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== Small-scale Test: All 6 Methods ==="
+echo "Settings: 5 profiles, 5 sessions each, max 15 turns"
+echo "User simulator: $USER_MODEL (70B)"
+echo "Agent: $AGENT_MODEL (8B)"
+date
+
+# Start vLLM servers
+# User simulator on GPUs 0,1 (70B, TP=2)
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+  --model $USER_MODEL \
+  --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+  --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+# Agent on GPUs 2,3 (8B, TP=2, lower memory for embedding/reranker)
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+  --model $AGENT_MODEL \
+  --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
+  --max-model-len 16384 --dtype bfloat16 &
+
+# Wait for servers
+# Poll health endpoints: up to ~1000s for the 70B user model, ~300s for the 8B agent.
+echo "Waiting for vLLM servers..."
+for i in {1..200}; do
+  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+    echo "User simulator (8004) ready after $((i*5)) seconds"
+    break
+  fi
+  sleep 5
+done
+for i in {1..60}; do
+  if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+    echo "Agent (8003) ready after $((i*5)) seconds"
+    break
+  fi
+  sleep 5
+done
+echo "Both vLLM servers ready"
+sleep 10
+
+# Run all 6 methods sequentially with small scale
+# NOTE: unlike the scale-test batches, this runs 5 profiles in parallel and
+# writes to a fixed (non-timestamped) output dir, so re-runs overwrite it.
+for METHOD in vanilla contextual reflection all_memory rag rag_vector; do
+  echo ""
+  echo "=== Testing method: $METHOD ==="
+  date
+
+  python scripts/run_experiments.py --methods $METHOD \
+    --datasets math-hard --n-profiles 5 --n-sessions 5 --max-turns 15 \
+    --use-vllm --no-batch-processing --parallel-profiles 5 \
+    --output-dir ../results/smalltest --profile-path $PROFILE_PATH
+
+  # Per-method pass/fail based on the experiment runner's exit code;
+  # a failure does not stop the remaining methods.
+  if [ $? -eq 0 ]; then
+    echo "Method $METHOD: SUCCESS"
+  else
+    echo "Method $METHOD: FAILED"
+  fi
+done
+
+echo ""
+echo "=== Small-scale test complete ==="
+date
+
+# Best-effort cleanup of the background vLLM servers.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_70b_pilot.py b/collaborativeagents/scripts/test_70b_pilot.py
new file mode 100644
index 0000000..4bb27a3
--- /dev/null
+++ b/collaborativeagents/scripts/test_70b_pilot.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Pilot test for 70B AWQ user model.
+
+Tests:
+1. 70B AWQ model loads without OOM
+2. User simulation works correctly
+3. Multi-turn conversation completes
+4. Memory usage is acceptable
+
+Run with 4xA100 GPUs.
+"""
+
+import sys
+import json
+import torch
+from pathlib import Path
+
+# Add paths
+sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+
+def print_gpu_memory() -> None:
+    """Print per-GPU memory usage in decimal GB (bytes / 1e9).
+
+    Reports PyTorch's caching-allocator view (memory_allocated/memory_reserved)
+    for this process only; memory held by separate vLLM server processes on the
+    same GPUs is not reflected here.
+    """
+    print("\n=== GPU Memory Usage ===")
+    for i in range(torch.cuda.device_count()):
+        total = torch.cuda.get_device_properties(i).total_memory / 1e9
+        allocated = torch.cuda.memory_allocated(i) / 1e9
+        reserved = torch.cuda.memory_reserved(i) / 1e9
+        print(f"  GPU {i}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, {total:.1f}GB total")
+    print()
+
+
+def test_70b_user_agent() -> bool:
+    """TEST 1: load the 70B AWQ user agent and generate a single response.
+
+    Returns True if LocalUserAgent produced a non-None response dict,
+    False otherwise. Import/loading exceptions propagate to the caller
+    (handled in the __main__ harness below).
+    """
+    print("=" * 60)
+    print("TEST 1: 70B AWQ User Agent Loading")
+    print("=" * 60)
+
+    from agents.local_user_agent import LocalUserAgent, DEFAULT_MODEL_PATH
+
+    print(f"Default model path: {DEFAULT_MODEL_PATH}")
+    print(f"Is AWQ model: {'awq' in DEFAULT_MODEL_PATH.lower()}")
+
+    # Create user agent
+    user_agent = LocalUserAgent(
+        user_task_description="Help solve a math problem",
+        problem="What is 2 + 2?",
+        user_persona="A student learning math",
+        user_preferences="- Show step by step solutions\n- Use simple language",
+    )
+
+    print("\nGenerating user response...")
+    print_gpu_memory()
+
+    # Simulate a conversation: one assistant greeting is enough to elicit
+    # the first simulated-user turn.
+    conversation = [{"role": "assistant", "content": "How can I help you today?"}]
+    response = user_agent.generate_user_response(conversation)
+
+    print_gpu_memory()
+
+    # Response is expected to be a dict with "response"/"should_terminate";
+    # None signals generation failure.
+    if response:
+        print(f"SUCCESS! User response: {response.get('response', 'N/A')[:200]}...")
+        print(f"Should terminate: {response.get('should_terminate', 'N/A')}")
+        return True
+    else:
+        print("FAILED! User agent returned None")
+        return False
+
+
+def test_multiturn_with_70b() -> bool:
+    """TEST 2: run a short multi-turn conversation (70B user sim vs vanilla agent).
+
+    Loads the first profile from profiles_100.jsonl (assumes rows carry
+    "user_id"/"persona"/"preferences" keys — confirm against the generator),
+    alternates user/agent turns up to max_turns, and returns True when more
+    than two turns were recorded, i.e. the dialogue got past a single exchange.
+    """
+    print("\n" + "=" * 60)
+    print("TEST 2: Multi-turn Conversation with 70B User Model")
+    print("=" * 60)
+
+    from agents.local_user_agent import SharedLocalUserAgent, TERMINATION_SIGNAL
+    from adapters.personalized_llm_adapter import create_baseline_adapter
+
+    # Create vanilla adapter (uses Qwen 1.5B for agent)
+    print("\nCreating vanilla adapter...")
+    adapter = create_baseline_adapter("vanilla")
+    adapter.initialize()
+
+    print_gpu_memory()
+
+    # Load a test profile (first JSONL row only)
+    profile_path = Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl"
+    with open(profile_path) as f:
+        profile = json.loads(f.readline())
+
+    print(f"Loaded profile: {profile.get('user_id', 'unknown')}")
+
+    # Create user agent with 70B model; only the first 3 preferences are used.
+    problem = "What is 15% of 80?"
+    user_prefs = profile.get("preferences", [])[:3]
+    pref_str = "\n".join([f"- {p}" for p in user_prefs])
+
+    print(f"\nUser preferences:\n{pref_str}")
+
+    user_agent = SharedLocalUserAgent(
+        user_task_description="Solve the math problem",
+        problem=problem,
+        user_persona=profile.get("persona", "A user"),
+        user_preferences=pref_str,
+    )
+
+    print_gpu_memory()
+
+    # Start session
+    adapter.start_session(user_id=profile.get("user_id", "test"))
+
+    # Run multi-turn conversation, seeded with an assistant greeting.
+    conversation = [{"role": "assistant", "content": "How can I help you today?"}]
+    turns = []
+    max_turns = 5
+
+    print(f"\nStarting {max_turns}-turn conversation...")
+
+    for turn_num in range(max_turns):
+        print(f"\n--- Turn {turn_num + 1} ---")
+
+        # User turn; None means the simulator failed to generate.
+        user_response = user_agent.generate_user_response(conversation)
+        if user_response is None:
+            print("User agent failed!")
+            break
+
+        user_msg = user_response.get("response", "")
+        print(f"USER: {user_msg[:150]}...")
+
+        conversation.append({"role": "user", "content": user_msg})
+        turns.append({"role": "user", "content": user_msg})
+
+        # Check termination: either the structured flag or the in-text signal.
+        if user_response.get("should_terminate", False) or TERMINATION_SIGNAL in user_msg:
+            print("\n[User terminated conversation]")
+            break
+
+        # Agent turn: pass history excluding the just-appended user message,
+        # since user_msg is handed over as the first argument.
+        response = adapter.generate_response(user_msg, conversation[:-1])
+        # Adapter may return a dict or a plain string; normalize defensively.
+        agent_msg = response.get("response", str(response)) if isinstance(response, dict) else str(response)
+        print(f"AGENT: {agent_msg[:150]}...")
+
+        conversation.append({"role": "assistant", "content": agent_msg})
+        turns.append({"role": "assistant", "content": agent_msg})
+
+    # End session
+    adapter.end_session()
+
+    print(f"\n--- Results ---")
+    print(f"Total turns: {len(turns)}")
+    print(f"User turns: {len([t for t in turns if t['role'] == 'user'])}")
+    print(f"Agent turns: {len([t for t in turns if t['role'] == 'assistant'])}")
+
+    print_gpu_memory()
+
+    return len(turns) > 2  # Success if more than single turn
+
+
+def test_memory_after_multiple_sessions() -> bool:
+    """TEST 3: run several short sessions and eyeball GPU-memory growth.
+
+    Prints memory after each session; stability is judged manually from the
+    printed numbers — this function always returns True if no exception is
+    raised.
+    """
+    print("\n" + "=" * 60)
+    print("TEST 3: Memory Stability Across Sessions")
+    print("=" * 60)
+
+    from agents.local_user_agent import SharedLocalUserAgent, TERMINATION_SIGNAL
+    from adapters.personalized_llm_adapter import create_baseline_adapter
+
+    adapter = create_baseline_adapter("vanilla")
+    adapter.initialize()
+
+    profile_path = Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl"
+    with open(profile_path) as f:
+        profile = json.loads(f.readline())
+
+    n_sessions = 3
+    print(f"\nRunning {n_sessions} sessions to check memory stability...")
+
+    for session_idx in range(n_sessions):
+        print(f"\n--- Session {session_idx + 1}/{n_sessions} ---")
+
+        # Fresh user agent per session; problem varies with the session index
+        # so each conversation differs slightly.
+        user_agent = SharedLocalUserAgent(
+            user_task_description="Solve math",
+            problem=f"What is {session_idx + 1} + {session_idx + 2}?",
+            user_persona="A student",
+            user_preferences="- Be concise",
+        )
+
+        adapter.start_session(user_id=profile.get("user_id", "test"))
+
+        conversation = [{"role": "assistant", "content": "How can I help?"}]
+        for turn in range(3):
+            user_response = user_agent.generate_user_response(conversation)
+            if user_response is None or user_response.get("should_terminate"):
+                break
+            conversation.append({"role": "user", "content": user_response.get("response", "")})
+
+            # NOTE(review): unlike TEST 2, this assumes generate_response returns
+            # a dict (calls .get directly, no isinstance guard) — a plain-string
+            # return would raise AttributeError here.
+            response = adapter.generate_response(user_response.get("response", ""), conversation[:-1])
+            conversation.append({"role": "assistant", "content": response.get("response", str(response))})
+
+        adapter.end_session()
+        print_gpu_memory()
+
+        # Force garbage collection so the next session starts from a clean
+        # allocator state (otherwise cached blocks mask real growth).
+        import gc
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    print("\nMemory stability test completed.")
+    return True
+
+
+if __name__ == "__main__":
+    import os
+    os.environ["HF_HOME"] = "/projects/bfqt/users/yurenh2/hf_cache/huggingface"
+
+    print("\n" + "=" * 60)
+    print("70B AWQ USER MODEL PILOT TEST")
+    print("=" * 60)
+    print(f"PyTorch version: {torch.__version__}")
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    print(f"GPU count: {torch.cuda.device_count()}")
+
+    for i in range(torch.cuda.device_count()):
+        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
+
+    print_gpu_memory()
+
+    # Tests run in dependency order: each later test is skipped (recorded as
+    # FAIL) when the previous one failed, so a broken model load doesn't
+    # cascade into misleading downstream errors.
+    results = {}
+
+    # Test 1: User agent loading
+    try:
+        results["70b_load"] = test_70b_user_agent()
+    except Exception as e:
+        print(f"TEST 1 FAILED: {e}")
+        import traceback
+        traceback.print_exc()
+        results["70b_load"] = False
+
+    # Test 2: Multi-turn conversation (only if test 1 passed)
+    if results.get("70b_load", False):
+        try:
+            results["multiturn"] = test_multiturn_with_70b()
+        except Exception as e:
+            print(f"TEST 2 FAILED: {e}")
+            import traceback
+            traceback.print_exc()
+            results["multiturn"] = False
+    else:
+        print("\nSkipping TEST 2 (TEST 1 failed)")
+        results["multiturn"] = False
+
+    # Test 3: Memory stability (only if test 2 passed)
+    if results.get("multiturn", False):
+        try:
+            results["memory_stable"] = test_memory_after_multiple_sessions()
+        except Exception as e:
+            print(f"TEST 3 FAILED: {e}")
+            import traceback
+            traceback.print_exc()
+            results["memory_stable"] = False
+    else:
+        print("\nSkipping TEST 3 (TEST 2 failed)")
+        results["memory_stable"] = False
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("TEST SUMMARY")
+    print("=" * 60)
+    for test_name, passed in results.items():
+        status = "PASS" if passed else "FAIL"
+        print(f"  {test_name}: {status}")
+
+    all_passed = all(results.values())
+    print(f"\nOverall: {'ALL TESTS PASSED - Ready for full experiment!' if all_passed else 'SOME TESTS FAILED'}")
+
+    print_gpu_memory()
+
+    # Exit code mirrors the overall result so sbatch wrappers can branch on it.
+    sys.exit(0 if all_passed else 1)
diff --git a/collaborativeagents/scripts/test_all_a100x8.sbatch b/collaborativeagents/scripts/test_all_a100x8.sbatch
new file mode 100644
index 0000000..3f117e1
--- /dev/null
+++ b/collaborativeagents/scripts/test_all_a100x8.sbatch
@@ -0,0 +1,124 @@
+#!/bin/bash
+#SBATCH --job-name=test_all_a100x8
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuA100x8-interactive
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=256G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100x8-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100x8-%j.err
+
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== Parallel Speed Test: ALL 6 METHODS (A100x8) ==="
+echo "Scale: 10 profiles × 3 sessions = 30 sessions per method"
+echo "vLLM memory: 45% (leaves room for embedding+reranker)"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start TWO vLLM servers with REDUCED memory (45%) to leave room for embedding+reranker
+# Both servers run the same 8B model here (speed test, no 70B user simulator).
+echo ""
+echo "Starting vLLM servers (45% GPU memory)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.45 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.45 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+# Poll both health endpoints together, up to ~240s.
+for i in $(seq 1 120); do
+  u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
+  a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
+  [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break
+  sleep 2
+done
+
+sleep 30
+echo "Starting experiments..."
+
+# Shared CLI args for tests 2-6; test 1 spells its args out because it
+# switches on batch processing instead of --no-batch-processing.
+COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/test_a100x8 --profile-path $PROFILE_PATH"
+
+# NOTE(review): the sessions/hr lines below compute $((30 * 3600 / ELAPSED));
+# if a run ever finishes in under 1s, ELAPSED=0 makes the arithmetic expansion
+# fail and, with set -e above, aborts the whole job.
+
+# Test 1: vanilla (batch processing)
+echo ""
+echo "=== TEST 1: vanilla (batch processing) ==="
+START=$(date +%s)
+python scripts/run_experiments.py \
+    --methods vanilla \
+    --datasets math-hard \
+    --n-profiles 10 --n-sessions 3 --max-turns 10 \
+    --use-vllm --parallel-profiles 10 \
+    --use-batch-processing --batch-size 30 \
+    --output-dir ../results/test_a100x8 \
+    --profile-path "$PROFILE_PATH"
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 2: contextual
+echo ""
+echo "=== TEST 2: contextual ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods contextual $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 3: reflection
+echo ""
+echo "=== TEST 3: reflection ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods reflection $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 4: all_memory
+echo ""
+echo "=== TEST 4: all_memory ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods all_memory $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 5: rag
+echo ""
+echo "=== TEST 5: rag ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 6: rag_vector
+echo ""
+echo "=== TEST 6: rag_vector ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Best-effort cleanup of the background vLLM servers.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+
+echo ""
+echo "=== ALL SPEED TESTS COMPLETE ==="
+date
new file mode 100644
index 0000000..cc37a39
--- /dev/null
+++ b/collaborativeagents/scripts/test_all_h200.sbatch
@@ -0,0 +1,126 @@
+#!/bin/bash
+#SBATCH --job-name=test_all_h200
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=256G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_h200-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_h200-%j.err
+
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== Parallel Speed Test: ALL 6 METHODS (H200) ==="
+echo "Scale: 10 profiles × 3 sessions = 30 sessions per method"
+echo "vLLM memory: 45% (leaves room for embedding+reranker)"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start TWO vLLM servers with REDUCED memory (45%) to leave room for embedding+reranker
+# Both servers run the same 8B model here (speed test, no 70B user simulator).
+echo ""
+echo "Starting vLLM servers (45% GPU memory)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.45 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.45 --max-model-len 8192 \
+    --disable-log-requests --dtype bfloat16 &
+
+# Poll both health endpoints together, up to ~240s.
+for i in $(seq 1 120); do
+  u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
+  a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
+  [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break
+  sleep 2
+done
+
+sleep 30
+echo "Starting experiments..."
+
+# All methods can now run in parallel thanks to shared model singletons
+# Shared models: embedding (8B) and reranker (8B) are loaded ONCE and shared across all parallel workers
+COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/test_h200 --profile-path $PROFILE_PATH"
+
+# NOTE(review): the sessions/hr lines below compute $((30 * 3600 / ELAPSED));
+# if a run ever finishes in under 1s, ELAPSED=0 makes the arithmetic expansion
+# fail and, with set -e above, aborts the whole job.
+
+# Test 1: vanilla (batch processing)
+echo ""
+echo "=== TEST 1: vanilla (batch processing) ==="
+START=$(date +%s)
+python scripts/run_experiments.py \
+    --methods vanilla \
+    --datasets math-hard \
+    --n-profiles 10 --n-sessions 3 --max-turns 10 \
+    --use-vllm --parallel-profiles 10 \
+    --use-batch-processing --batch-size 30 \
+    --output-dir ../results/test_h200 \
+    --profile-path "$PROFILE_PATH"
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 2: contextual
+echo ""
+echo "=== TEST 2: contextual ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods contextual $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 3: reflection
+echo ""
+echo "=== TEST 3: reflection ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods reflection $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 4: all_memory (parallel with shared models)
+echo ""
+echo "=== TEST 4: all_memory (parallel with shared models) ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods all_memory $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 5: rag (parallel with shared models)
+echo ""
+echo "=== TEST 5: rag (parallel with shared models) ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 6: rag_vector (parallel with shared models)
+echo ""
+echo "=== TEST 6: rag_vector (parallel with shared models) ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Best-effort cleanup of the background vLLM servers.
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+
+echo ""
+echo "=== ALL SPEED TESTS COMPLETE ==="
+date
new file mode 100644
index 0000000..6550cdf
--- /dev/null
+++ b/collaborativeagents/scripts/test_all_methods.sbatch
@@ -0,0 +1,91 @@
#!/bin/bash
#SBATCH --job-name=test_all
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=02:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_methods_%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_methods_%j.err

# Small-scale smoke test of every memory method against a 70B user simulator
# (GPUs 0-1) and an 8B agent (GPUs 2-3): 5 profiles x 3 sessions per method.

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

echo "=== Small-scale test: ALL methods with 70B user sim ==="
echo "Scale: 5 profiles × 3 sessions = 15 sessions per method"
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Start 70B user simulator on GPUs 0,1
echo ""
echo "Starting 70B user simulator..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $USER_MODEL \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 8192 --dtype bfloat16 --download-dir $HF_HOME &
USER_PID=$!

# Start 8B agent on GPUs 2,3 (0.45 for RAG methods)
echo "Starting 8B agent (0.45 memory for embedding/reranker)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $AGENT_MODEL \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
    --max-model-len 8192 --dtype bfloat16 &
AGENT_PID=$!

# Wait for servers
echo "Waiting for vLLM servers (70B takes ~8 min)..."
for i in {1..200}; do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "70B user simulator ready after $((i*5))s"
        break
    fi
    sleep 5
done
for i in {1..60}; do
    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
        echo "8B agent ready after $((i*5))s"
        break
    fi
    sleep 5
done

# FIX: the original proceeded even when the wait loops timed out, so every
# experiment below failed slowly with connection errors. Abort early if
# either server never became healthy (matches sibling test scripts).
if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "ERROR: 70B user simulator not healthy"; kill $USER_PID $AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "ERROR: 8B agent not healthy"; kill $USER_PID $AGENT_PID 2>/dev/null; exit 1
fi

echo ""
echo "=== GPU Memory after vLLM servers ==="
nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv

# Test each method sequentially against the shared servers
for METHOD in vanilla contextual reflection all_memory rag rag_vector; do
    echo ""
    echo "=============================================="
    echo "Testing method: $METHOD"
    echo "=============================================="
    date

    python scripts/run_experiments.py --methods $METHOD \
        --datasets math-hard --n-profiles 5 --n-sessions 3 --max-turns 10 \
        --use-vllm --no-batch-processing --parallel-profiles 5 \
        --output-dir ../results/test_all_methods --profile-path $PROFILE_PATH

    echo ""
    echo "=== GPU Memory after $METHOD ==="
    nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv
done

echo ""
echo "=============================================="
echo "ALL METHODS TESTED"
echo "=============================================="
date

# Best-effort teardown of both vLLM servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_batch_50.py b/collaborativeagents/scripts/test_batch_50.py
new file mode 100644
index 0000000..b3f1c37
--- /dev/null
+++ b/collaborativeagents/scripts/test_batch_50.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+"""
+Test batch processing with 50 conversations (matching paper's setup).
+"""
+
+import sys
+import time
+sys.path.insert(0, '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents')
+
+from agents.batch_vllm_agent import BatchConversationGenerator
+
def main():
    """Run one batched conversation-generation benchmark and print stats.

    Positional CLI args (all optional):
        1: user-simulator vLLM base URL (default ``http://localhost:8004/v1``)
        2: agent vLLM base URL          (default ``http://localhost:8003/v1``)
        3: batch size                   (default 50)
        4: max turns per conversation   (default 10)

    Returns:
        The list of per-conversation results from the generator (entries are
        None for conversations that failed).
    """
    user_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8004/v1"
    agent_url = sys.argv[2] if len(sys.argv) > 2 else "http://localhost:8003/v1"
    batch_size = int(sys.argv[3]) if len(sys.argv) > 3 else 50
    max_turns = int(sys.argv[4]) if len(sys.argv) > 4 else 10

    print(f"\n{'='*60}")
    print(f"Batch Processing Test (Paper Configuration)")
    print(f"{'='*60}")
    print(f"Batch size: {batch_size}")
    print(f"Max turns: {max_turns}")
    print(f"User URL: {user_url}")
    print(f"Agent URL: {agent_url}")
    print()

    # Create samples (simulating MMLU-style questions)
    samples = [
        {
            "problem": f"Question {i+1}: What is the capital of country number {i+1}? "
                      f"A) City A B) City B C) City C D) City D. "
                      f"Please explain your reasoning step by step.",
            "solution": "City A"
        }
        for i in range(batch_size)
    ]

    generator = BatchConversationGenerator(
        user_vllm_url=user_url,
        agent_vllm_url=agent_url,
        max_turns=max_turns,
        user_max_tokens=512,
        agent_max_tokens=1024,
        temperature=0.7,
    )

    print(f"Starting batch generation of {batch_size} conversations...")
    print(f"Expected: ~{batch_size * max_turns * 2} total LLM calls batched into ~{max_turns * 2} batch requests")
    print()

    start = time.time()
    results = generator.generate_batch(
        samples=samples,
        user_persona="A curious student seeking help with exam questions.",
        user_preferences="1. Explain your reasoning step by step\n2. Be concise but thorough\n3. Highlight the key concept",
        agent_system_prompt="You are a helpful tutor. Answer questions clearly and explain your reasoning.",
    )
    # FIX: clamp elapsed away from zero so the throughput divisions below can
    # never raise ZeroDivisionError on a degenerate (instant/failed) run.
    elapsed = max(time.time() - start, 1e-9)

    successes = sum(1 for r in results if r is not None)
    # Each conversation alternates user/agent messages, so turns = messages // 2.
    total_turns = sum(
        len(r['conversation']) // 2 if r else 0
        for r in results
    )

    print(f"\n{'='*60}")
    print(f"RESULTS")
    print(f"{'='*60}")
    print(f"Batch size: {batch_size}")
    print(f"Max turns: {max_turns}")
    print(f"Successes: {successes}/{batch_size}")
    print(f"Total conversation turns: {total_turns}")
    print(f"Time: {elapsed:.1f}s")
    print()
    print(f"Throughput: {successes * 3600 / elapsed:.0f} conversations/hr")
    print(f"Sessions/hr (3 sessions/profile): {successes * 3 * 3600 / elapsed:.0f}")
    print()

    # Compare with paper's claimed performance
    paper_sessions = 2000  # sessions per hour claimed
    our_sessions = successes * 3 * 3600 / elapsed
    print(f"Paper's claimed throughput: ~{paper_sessions} sessions/hr")
    print(f"Our throughput: {our_sessions:.0f} sessions/hr")
    print(f"Ratio: {our_sessions / paper_sessions * 100:.1f}% of paper's performance")
    print()

    # Show sample conversation.
    # FIX: guard against an empty results list — `results[0]` alone raised
    # IndexError when the generator produced no results at all.
    if results and results[0]:
        print(f"Sample conversation (first 4 messages):")
        for msg in results[0]['conversation'][:4]:
            role = msg['role'].upper()
            content = msg['content'][:100] + "..." if len(msg['content']) > 100 else msg['content']
            print(f"  [{role}]: {content}")

    return results

if __name__ == "__main__":
    main()
diff --git a/collaborativeagents/scripts/test_batch_50.sh b/collaborativeagents/scripts/test_batch_50.sh
new file mode 100755
index 0000000..35f4440
--- /dev/null
+++ b/collaborativeagents/scripts/test_batch_50.sh
@@ -0,0 +1,107 @@
#!/bin/bash
# Test batch processing with 50 conversations (paper's configuration).
# Spins up two 8B vLLM servers (user simulator + agent, TP=2 each), then
# runs scripts/test_batch_50.py at three batch/turn settings.

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

echo "============================================"
echo "Batch Processing Test (Paper Configuration)"
echo "Batch Size: 50 conversations"
echo "============================================"
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing vLLM servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# FIX: under `set -e` any failing test aborted the script before the final
# cleanup, leaving both background vLLM servers holding the GPUs. Always
# tear them down on exit, success or failure.
trap 'kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true' EXIT

# Start servers with TP=2
echo "Starting 8B user simulator server (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

echo "Starting 8B agent server (GPU 2-3, TP=2)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

echo "Waiting for servers..."
for i in $(seq 1 100); do
    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((i*3)) seconds"
        break
    fi
    if [ $((i % 20)) -eq 0 ]; then
        echo "  Still waiting... ($((i*3))s)"
    fi
    sleep 3
done

# Abort (the EXIT trap cleans up) if either server never came up
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "✓ Both servers healthy"

echo ""
echo "============================================"
echo "Test 1: Batch=50, Turns=5"
echo "============================================"
python scripts/test_batch_50.py \
    http://localhost:$PORT_USER/v1 \
    http://localhost:$PORT_AGENT/v1 \
    50 5

echo ""
echo "============================================"
echo "Test 2: Batch=50, Turns=10 (paper config)"
echo "============================================"
python scripts/test_batch_50.py \
    http://localhost:$PORT_USER/v1 \
    http://localhost:$PORT_AGENT/v1 \
    50 10

echo ""
echo "============================================"
echo "Test 3: Batch=100, Turns=10 (stress test)"
echo "============================================"
python scripts/test_batch_50.py \
    http://localhost:$PORT_USER/v1 \
    http://localhost:$PORT_AGENT/v1 \
    100 10

# Cleanup (redundant with the EXIT trap, kept for explicitness)
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
date
diff --git a/collaborativeagents/scripts/test_batch_vs_parallel.sh b/collaborativeagents/scripts/test_batch_vs_parallel.sh
new file mode 100755
index 0000000..616c593
--- /dev/null
+++ b/collaborativeagents/scripts/test_batch_vs_parallel.sh
@@ -0,0 +1,151 @@
#!/bin/bash
# Compare batch processing vs parallel profile processing on A100x4
#
# Expected result: Batch should be significantly faster because:
#  - Turn-synchronous: ALL conversations processed at same turn together
#  - Maximizes vLLM continuous batching
#  - Fewer total HTTP requests

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

echo "============================================"
echo "Batch vs Parallel Processing Comparison"
echo "============================================"
date
echo "Node: $(hostname)"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing vLLM servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# FIX: with `set -e`, a failing benchmark aborted the script before the
# cleanup at the bottom, orphaning both vLLM servers on the GPUs. Always
# tear them down on exit.
trap 'kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true' EXIT

# Start user simulator server (8B) on GPU 0-1
echo "Starting 8B user simulator server (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

# Start agent server (8B) on GPU 2-3
echo "Starting 8B agent server (GPU 2-3, TP=2)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

echo "Waiting for servers (up to 5 min)..."

# Wait for servers
for i in $(seq 1 100); do
    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)

    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((i*3)) seconds"
        break
    fi
    if [ $((i % 20)) -eq 0 ]; then
        echo "  Still waiting... user=$READY_USER, agent=$READY_AGENT ($((i*3))s)"
    fi
    sleep 3
done

# Check health (the EXIT trap handles cleanup on abort)
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "✓ Both servers healthy"

echo ""
echo "============================================"
echo "Test 1: NEW Batch Processing (20 samples)"
echo "============================================"
echo "This batches ALL user requests together, then ALL agent requests."
echo ""

START=$(date +%s)
python agents/batch_vllm_agent.py \
    http://localhost:$PORT_USER/v1 \
    http://localhost:$PORT_AGENT/v1 \
    20
END=$(date +%s)
ELAPSED_BATCH=$((END-START))
echo ""
echo "Batch processing time: ${ELAPSED_BATCH} seconds"

echo ""
echo "============================================"
echo "Test 2: OLD Parallel Profile Processing (20 samples)"
echo "============================================"
echo "This runs 20 profiles in parallel, but each makes separate requests."
echo ""

cd scripts
START=$(date +%s)
python run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles 20 \
    --n-sessions 1 \
    --use-vllm \
    --vllm-user-url http://localhost:$PORT_USER/v1 \
    --vllm-agent-url http://localhost:$PORT_AGENT/v1 \
    --parallel-profiles 20 \
    --output-dir ../results/batch_compare_parallel \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30
END=$(date +%s)
ELAPSED_PARALLEL=$((END-START))
echo ""
echo "Parallel profile processing time: ${ELAPSED_PARALLEL} seconds"

cd ..

# Cleanup (redundant with the EXIT trap, kept for explicitness)
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
echo "============================================"
echo "RESULTS COMPARISON"
echo "============================================"
echo ""
echo "NEW Batch processing (20 conv): ${ELAPSED_BATCH}s"
echo "OLD Parallel profiles (20 conv): ${ELAPSED_PARALLEL}s"
echo ""
# Guarded division: skip the ratio on a zero-second batch run
if [ $ELAPSED_BATCH -gt 0 ]; then
    SPEEDUP=$(echo "scale=2; $ELAPSED_PARALLEL / $ELAPSED_BATCH" | bc)
    echo "Speedup with batch processing: ${SPEEDUP}x"
fi
echo ""
echo "Expected: Batch should be 5-10x faster due to:"
echo "  - Turn-synchronous processing (all convs at same turn batched)"
echo "  - Fewer HTTP request overhead"
echo "  - Better vLLM continuous batching utilization"
echo ""
date
diff --git a/collaborativeagents/scripts/test_extractor.py b/collaborativeagents/scripts/test_extractor.py
new file mode 100644
index 0000000..a2b4ac1
--- /dev/null
+++ b/collaborativeagents/scripts/test_extractor.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""Quick test for the preference extractor."""

import sys
sys.path.insert(0, "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src")

from personalization.config.registry import get_preference_extractor

# Shared banner rule so all section headers look identical.
RULE = "=" * 60

print(RULE)
print("PREFERENCE EXTRACTOR TEST")
print(RULE)

print("\nLoading extractor (qwen3_0_6b_sft)...")
extractor = get_preference_extractor("qwen3_0_6b_sft")
print("Extractor loaded successfully!")

# Queries covering explicit preferences, formatting requests, and
# skill-level hints.
TEST_QUERIES = (
    "I prefer Python over Java for scripting tasks",
    "Please use bullet points instead of numbered lists",
    "Can you explain this in simpler terms? I'm a beginner.",
    "I like concise answers, not long explanations",
    "Always show code examples when explaining programming concepts",
)

print("\n" + RULE)
print("EXTRACTION TESTS")
print(RULE)

for idx, query in enumerate(TEST_QUERIES, 1):
    print(f"\n--- Test {idx} ---")
    print(f"Query: {query}")
    extraction = extractor.extract_preferences(query)
    print(f"Extracted: {extraction}")

    prefs = extraction.get("preferences")
    if not prefs:
        print("  (No preferences extracted)")
    else:
        for pref in prefs:
            print(f"  - condition: {pref.get('condition', 'N/A')}")
            print(f"    action: {pref.get('action', 'N/A')}")
            print(f"    confidence: {pref.get('confidence', 'N/A')}")

print("\n" + RULE)
print("TEST COMPLETE")
print(RULE)
diff --git a/collaborativeagents/scripts/test_multiturn.py b/collaborativeagents/scripts/test_multiturn.py
new file mode 100644
index 0000000..1909c34
--- /dev/null
+++ b/collaborativeagents/scripts/test_multiturn.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""
+Minimal test script to validate multi-turn conversation works correctly.
+
+This runs a single profile with a single session to verify:
+1. LocalUserAgent loads and generates responses
+2. Multi-turn conversation loop works
+3. Metrics are properly extracted
+"""
+
+import sys
+import json
+from pathlib import Path
+
+# Add paths
+sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from agents.local_user_agent import LocalUserAgent, SharedLocalUserAgent, TERMINATION_SIGNAL
+
def test_user_agent_standalone():
    """Test LocalUserAgent in isolation."""
    print("=" * 60)
    print("TEST 1: LocalUserAgent Standalone")
    print("=" * 60)

    agent = LocalUserAgent(
        user_task_description="Help solve a math problem",
        problem="What is 2 + 2?",
        user_persona="A student learning math",
        user_preferences="- Show step by step solutions\n- Use simple language",
    )

    # Seed the history with the canonical assistant greeting so the user
    # model has a message to react to.
    history = [{"role": "assistant", "content": "How can I help you today?"}]

    print("\nGenerating user response...")
    reply = agent.generate_user_response(history)

    # Guard clause: a falsy reply (None) means the agent failed.
    if not reply:
        print("FAILED! User agent returned None")
        return False

    print(f"SUCCESS! User response: {reply.get('response', 'N/A')[:200]}...")
    print(f"Should terminate: {reply.get('should_terminate', 'N/A')}")
    print(f"Draft answer: {reply.get('draft_answer', 'N/A')[:100]}...")
    return True
+
+
def test_multiturn_conversation():
    """Test full multi-turn conversation with agent adapter.

    Drives up to 5 user/agent turns between a SharedLocalUserAgent (built
    from the first profile in profiles_100.jsonl) and a vanilla baseline
    adapter. Returns True when more than 2 messages were exchanged, i.e.
    the loop survived beyond a single turn.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Multi-turn Conversation")
    print("=" * 60)

    from adapters.personalized_llm_adapter import create_baseline_adapter

    # Create a simple agent adapter (vanilla mode)
    print("\nCreating vanilla adapter...")
    adapter = create_baseline_adapter("vanilla")
    adapter.initialize()

    # Load a test profile: only the first JSONL line (one profile) is read.
    profile_path = Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl"
    with open(profile_path) as f:
        profile = json.loads(f.readline())

    print(f"Loaded profile: {profile.get('user_id', 'unknown')}")

    # Create user agent from a fixed problem and the profile's first 3
    # preferences, rendered as a bulleted string.
    problem = "What is 15% of 80?"
    user_prefs = profile.get("preferences", [])[:3]
    pref_str = "\n".join([f"- {p}" for p in user_prefs])

    print(f"\nUser preferences:\n{pref_str}")

    user_agent = SharedLocalUserAgent(
        user_task_description="Solve the math problem",
        problem=problem,
        user_persona=profile.get("persona", "A user"),
        user_preferences=pref_str,
    )

    # Start session
    adapter.start_session(user_id=profile.get("user_id", "test"))

    # Run multi-turn conversation, seeded with an assistant greeting.
    # `conversation` is the full transcript; `turns` counts only the
    # messages generated inside this loop.
    conversation = [{"role": "assistant", "content": "How can I help you today?"}]
    turns = []
    max_turns = 5

    print(f"\nStarting {max_turns}-turn conversation...")

    for turn_num in range(max_turns):
        print(f"\n--- Turn {turn_num + 1} ---")

        # User turn
        user_response = user_agent.generate_user_response(conversation)
        if user_response is None:
            print("User agent failed!")
            break

        user_msg = user_response.get("response", "")
        print(f"USER: {user_msg[:150]}...")

        conversation.append({"role": "user", "content": user_msg})
        turns.append({"role": "user", "content": user_msg})

        # Check termination: either the structured flag or the sentinel
        # string embedded in the user's message ends the session.
        if user_response.get("should_terminate", False) or TERMINATION_SIGNAL in user_msg:
            print("\n[User terminated conversation]")
            break

        # Agent turn. The history passed is conversation[:-1] — everything
        # up to (but excluding) the user message just appended, which is
        # supplied separately as the current message.
        response = adapter.generate_response(user_msg, conversation[:-1])
        # Adapters may return either a dict with a "response" key or a
        # plain value; normalize both to a string.
        agent_msg = response.get("response", str(response)) if isinstance(response, dict) else str(response)
        print(f"AGENT: {agent_msg[:150]}...")

        conversation.append({"role": "assistant", "content": agent_msg})
        turns.append({"role": "assistant", "content": agent_msg})

    # End session
    adapter.end_session()

    print(f"\n--- Results ---")
    print(f"Total turns: {len(turns)}")
    print(f"User turns: {len([t for t in turns if t['role'] == 'user'])}")
    print(f"Agent turns: {len([t for t in turns if t['role'] == 'assistant'])}")

    return len(turns) > 2  # Success if more than single turn
+
+
def test_full_session():
    """Test run_single_session from ExperimentRunner.

    Builds a minimal one-profile, one-session config, runs a single vanilla
    session end-to-end, prints its metrics, and returns True when the
    session produced more than 2 turns.
    """
    print("\n" + "=" * 60)
    print("TEST 3: Full run_single_session")
    print("=" * 60)

    from run_experiments import ExperimentRunner, ExperimentConfig
    from adapters.personalized_llm_adapter import create_baseline_adapter

    # Smallest possible experiment: 1 profile x 1 session x <=5 turns,
    # writing throwaway output to /tmp.
    config = ExperimentConfig(
        methods=["vanilla"],
        datasets=["math-500"],
        n_profiles=1,
        n_sessions_per_profile=1,
        max_turns_per_session=5,
        output_dir="/tmp/test_multiturn",
        profile_path=str(Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl"),
    )

    print("\nCreating ExperimentRunner...")
    runner = ExperimentRunner(config)

    # Get first profile and problem from the runner's loaded data
    profile = runner.profiles[0]
    dataset = list(runner.datasets.values())[0]
    sample = dataset.get_testset()[0]

    # run_single_session expects the problem as a plain dict, not the
    # dataset's sample object.
    problem = {
        "problem": sample.problem,
        "solution": sample.solution,
        "problem_id": sample.problem_id,
        "domain": sample.domain,
    }

    print(f"\nRunning single session...")
    print(f"Profile: {profile.get('user_id', 'unknown')}")
    print(f"Problem: {problem['problem'][:100]}...")

    # Create adapter
    adapter = create_baseline_adapter("vanilla")
    adapter.initialize()

    result = runner.run_single_session(
        method="vanilla",
        profile=profile,
        problem=problem,
        is_conflict_query=False,
        adapter=adapter,
    )

    print(f"\n--- Session Results ---")
    print(f"Total turns: {result['metrics']['total_turns']}")
    print(f"Task success: {result['metrics']['task_success']}")
    print(f"Enforcement count: {result['metrics']['enforcement_count']}")
    print(f"User tokens: {result['metrics']['user_token_count']}")
    print(f"Agent tokens: {result['metrics']['agent_token_count']}")
    print(f"Compliance scores: {result['metrics']['preference_compliance_scores']}")

    # Preview at most the first 6 messages of the transcript
    if result['conversation']:
        print(f"\nConversation ({len(result['conversation']['turns'])} messages):")
        for i, turn in enumerate(result['conversation']['turns'][:6]):
            print(f"  [{turn['role']}]: {turn['content'][:80]}...")

    return result['metrics']['total_turns'] > 2
+
+
+if __name__ == "__main__":
+ print("\n" + "=" * 60)
+ print("MULTI-TURN CONVERSATION VALIDATION TEST")
+ print("=" * 60)
+
+ results = {}
+
+ # Test 1: User agent standalone
+ try:
+ results["user_agent"] = test_user_agent_standalone()
+ except Exception as e:
+ print(f"TEST 1 FAILED: {e}")
+ import traceback
+ traceback.print_exc()
+ results["user_agent"] = False
+
+ # Test 2: Multi-turn conversation
+ try:
+ results["multiturn"] = test_multiturn_conversation()
+ except Exception as e:
+ print(f"TEST 2 FAILED: {e}")
+ import traceback
+ traceback.print_exc()
+ results["multiturn"] = False
+
+ # Test 3: Full session (only if test 2 passed)
+ if results.get("multiturn", False):
+ try:
+ results["full_session"] = test_full_session()
+ except Exception as e:
+ print(f"TEST 3 FAILED: {e}")
+ import traceback
+ traceback.print_exc()
+ results["full_session"] = False
+ else:
+ print("\nSkipping TEST 3 (TEST 2 failed)")
+ results["full_session"] = False
+
+ # Summary
+ print("\n" + "=" * 60)
+ print("TEST SUMMARY")
+ print("=" * 60)
+ for test_name, passed in results.items():
+ status = "PASS" if passed else "FAIL"
+ print(f" {test_name}: {status}")
+
+ all_passed = all(results.values())
+ print(f"\nOverall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}")
+
+ sys.exit(0 if all_passed else 1)
diff --git a/collaborativeagents/scripts/test_parallel_a100.sh b/collaborativeagents/scripts/test_parallel_a100.sh
new file mode 100755
index 0000000..dfd74bc
--- /dev/null
+++ b/collaborativeagents/scripts/test_parallel_a100.sh
@@ -0,0 +1,172 @@
#!/bin/bash
# Quick test of parallel vLLM processing on A100x4-interactive.
# Measures sessions/hr at 1, 4, and 8 parallel profiles against two 8B
# vLLM servers (user simulator on GPUs 0-1, agent on GPUs 2-3).

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

# Configuration - using only 8B model for user sim to fit in A100
# (70B AWQ needs TP=2 which leaves only 2 GPUs for 8B)
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

echo "============================================"
echo "Quick Parallel vLLM Test (A100x4)"
echo "============================================"
date
echo "Node: $(hostname)"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing vLLM servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# FIX: with `set -e`, a failing test aborted the script before the cleanup
# at the bottom, orphaning both vLLM servers on the GPUs. Always tear them
# down on exit, success or failure.
trap 'kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true' EXIT

# For A100 test, use 8B for both user and agent (to test parallelism)
# In production, user would be 70B AWQ with TP=2

# Start user simulator server (8B) on GPU 0-1
echo "Starting 8B user simulator server (GPU 0-1)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

# Start agent server (8B) on GPU 2-3
echo "Starting 8B agent server (GPU 2-3)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

echo "Waiting for servers..."

# Wait for servers (up to 5 minutes - A100 needs more time than H200)
for i in $(seq 1 100); do
    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)

    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((i*3)) seconds"
        break
    fi
    if [ $((i % 20)) -eq 0 ]; then
        echo "  Still waiting... user=$READY_USER, agent=$READY_AGENT ($((i*3))s)"
    fi
    sleep 3
done

# Check health (EXIT trap handles cleanup on abort)
if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "✓ Both servers healthy"

cd scripts

echo ""
echo "============================================"
echo "Running throughput tests..."
echo "============================================"
echo "Note: Using 8B for both user and agent (parallelism test)"
echo ""

# Test 1: Sequential (1 profile, 3 sessions)
echo "--- Test 1: Sequential (1 profile, 3 sessions) ---"
START=$(date +%s)
python run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles 1 \
    --n-sessions 3 \
    --use-vllm \
    --vllm-user-url http://localhost:$PORT_USER/v1 \
    --vllm-agent-url http://localhost:$PORT_AGENT/v1 \
    --parallel-profiles 1 \
    --output-dir ../results/a100_test_1 \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30
END=$(date +%s)
ELAPSED_1=$((END-START))
echo ""
echo "Time for 1 profile (3 sessions): ${ELAPSED_1} seconds"
echo "Throughput: ~$(echo "scale=1; 3 * 3600 / $ELAPSED_1" | bc) sessions/hr"

# Test 2: Parallel (4 profiles, 3 sessions each = 12 total)
echo ""
echo "--- Test 2: Parallel (4 profiles, 3 sessions each = 12 total) ---"
START=$(date +%s)
python run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles 4 \
    --n-sessions 3 \
    --use-vllm \
    --vllm-user-url http://localhost:$PORT_USER/v1 \
    --vllm-agent-url http://localhost:$PORT_AGENT/v1 \
    --parallel-profiles 4 \
    --output-dir ../results/a100_test_4 \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30
END=$(date +%s)
ELAPSED_4=$((END-START))
echo ""
echo "Time for 4 profiles (12 sessions): ${ELAPSED_4} seconds"
echo "Throughput: ~$(echo "scale=1; 12 * 3600 / $ELAPSED_4" | bc) sessions/hr"

# Test 3: More parallel (8 profiles)
echo ""
echo "--- Test 3: Parallel (8 profiles, 3 sessions each = 24 total) ---"
START=$(date +%s)
python run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles 8 \
    --n-sessions 3 \
    --use-vllm \
    --vllm-user-url http://localhost:$PORT_USER/v1 \
    --vllm-agent-url http://localhost:$PORT_AGENT/v1 \
    --parallel-profiles 8 \
    --output-dir ../results/a100_test_8 \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30
END=$(date +%s)
ELAPSED_8=$((END-START))
echo ""
echo "Time for 8 profiles (24 sessions): ${ELAPSED_8} seconds"
echo "Throughput: ~$(echo "scale=1; 24 * 3600 / $ELAPSED_8" | bc) sessions/hr"

# Cleanup (redundant with the EXIT trap, kept for explicitness)
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
echo "============================================"
echo "RESULTS SUMMARY"
echo "============================================"
echo ""
echo "1 profile (3 sessions):  ${ELAPSED_1}s -> $(echo "scale=0; 3 * 3600 / $ELAPSED_1" | bc) sessions/hr"
echo "4 profiles (12 sessions): ${ELAPSED_4}s -> $(echo "scale=0; 12 * 3600 / $ELAPSED_4" | bc) sessions/hr"
echo "8 profiles (24 sessions): ${ELAPSED_8}s -> $(echo "scale=0; 24 * 3600 / $ELAPSED_8" | bc) sessions/hr"
echo ""
echo "Speedup 4x parallel: $(echo "scale=2; ($ELAPSED_1 * 4) / $ELAPSED_4" | bc)x"
echo "Speedup 8x parallel: $(echo "scale=2; ($ELAPSED_1 * 8) / $ELAPSED_8" | bc)x"
echo ""
date
diff --git a/collaborativeagents/scripts/test_parallel_quick.sh b/collaborativeagents/scripts/test_parallel_quick.sh
new file mode 100755
index 0000000..8429da7
--- /dev/null
+++ b/collaborativeagents/scripts/test_parallel_quick.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+# Quick test of parallel vLLM processing on H200x8-interactive
+# Simplified version for 1 hour time limit
+
+set -e
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"
+
+# Configuration
+MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PORT_70B=8004
+PORT_8B=8003
+
+echo "============================================"
+echo "Quick Parallel vLLM Test (H200)"
+echo "============================================"
+date
+echo "Node: $(hostname)"
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+echo ""
+
+# Kill any existing vLLM servers
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start 70B AWQ server on GPU 0-1 (TP=2)
+echo "Starting 70B AWQ server (GPU 0-1, TP=2)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_70B_AWQ \
+ --port $PORT_70B \
+ --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.90 \
+ --max-model-len 4096 \
+ --disable-log-requests \
+ --quantization awq \
+ --dtype float16 &
+SERVER_70B_PID=$!
+
+# Start 8B server on GPU 2
+echo "Starting 8B server (GPU 2)..."
+CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B \
+ --port $PORT_8B \
+ --gpu-memory-utilization 0.90 \
+ --max-model-len 4096 \
+ --disable-log-requests \
+ --dtype bfloat16 &
+SERVER_8B_PID=$!
+
+echo "Waiting for servers (up to 5 min)..."
+
+# Wait for servers
+for i in $(seq 1 100); do
+ READY_70B=$(curl -s http://localhost:$PORT_70B/health > /dev/null 2>&1 && echo 1 || echo 0)
+ READY_8B=$(curl -s http://localhost:$PORT_8B/health > /dev/null 2>&1 && echo 1 || echo 0)
+
+ if [ "$READY_70B" = "1" ] && [ "$READY_8B" = "1" ]; then
+ echo "Both servers ready after $((i*3)) seconds"
+ break
+ fi
+ if [ $((i % 20)) -eq 0 ]; then
+ echo " Still waiting... 70B=$READY_70B, 8B=$READY_8B ($((i*3))s)"
+ fi
+ sleep 3
+done
+
+# Check health
+if ! curl -s http://localhost:$PORT_70B/health > /dev/null; then
+ echo "ERROR: 70B server not healthy"; kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null; exit 1
+fi
+if ! curl -s http://localhost:$PORT_8B/health > /dev/null; then
+ echo "ERROR: 8B server not healthy"; kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null; exit 1
+fi
+echo "✓ Both servers healthy"
+
+cd scripts
+
+echo ""
+echo "============================================"
+echo "Running throughput tests..."
+echo "============================================"
+
+# Test 1: Sequential (1 profile, 2 sessions)
+echo ""
+echo "--- Test 1: Sequential (1 profile) ---"
+START=$(date +%s)
+python run_experiments.py \
+ --methods vanilla \
+ --datasets mmlu \
+ --n-profiles 1 \
+ --n-sessions 2 \
+ --use-vllm \
+ --vllm-user-url http://localhost:$PORT_70B/v1 \
+ --vllm-agent-url http://localhost:$PORT_8B/v1 \
+ --parallel-profiles 1 \
+ --output-dir ../results/quick_test_1 \
+ --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20
+END=$(date +%s)
+echo "Time: $((END-START)) seconds"
+
+# Test 2: Parallel (4 profiles, 2 sessions each)
+echo ""
+echo "--- Test 2: Parallel (4 profiles) ---"
+START=$(date +%s)
+python run_experiments.py \
+ --methods vanilla \
+ --datasets mmlu \
+ --n-profiles 4 \
+ --n-sessions 2 \
+ --use-vllm \
+ --vllm-user-url http://localhost:$PORT_70B/v1 \
+ --vllm-agent-url http://localhost:$PORT_8B/v1 \
+ --parallel-profiles 4 \
+ --output-dir ../results/quick_test_4 \
+ --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20
+END=$(date +%s)
+echo "Time: $((END-START)) seconds"
+
+# Test 3: Parallel (8 profiles, 2 sessions each)
+echo ""
+echo "--- Test 3: Parallel (8 profiles) ---"
+START=$(date +%s)
+python run_experiments.py \
+ --methods vanilla \
+ --datasets mmlu \
+ --n-profiles 8 \
+ --n-sessions 2 \
+ --use-vllm \
+ --vllm-user-url http://localhost:$PORT_70B/v1 \
+ --vllm-agent-url http://localhost:$PORT_8B/v1 \
+ --parallel-profiles 8 \
+ --output-dir ../results/quick_test_8 \
+ --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20
+END=$(date +%s)
+echo "Time: $((END-START)) seconds"
+
+# Cleanup
+echo ""
+echo "Cleaning up..."
+kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null || true
+
+echo ""
+echo "============================================"
+echo "TEST COMPLETE!"
+echo "============================================"
+echo ""
+echo "Summary: Compare timing above"
+echo " - Sequential (1 profile): baseline"
+echo " - Parallel (4 profiles): should be faster per profile"
+echo " - Parallel (8 profiles): should show more speedup"
+echo ""
+date
diff --git a/collaborativeagents/scripts/test_parallel_speed.sbatch b/collaborativeagents/scripts/test_parallel_speed.sbatch
new file mode 100644
index 0000000..28c5b79
--- /dev/null
+++ b/collaborativeagents/scripts/test_parallel_speed.sbatch
@@ -0,0 +1,126 @@
+#!/bin/bash
+#SBATCH --job-name=test_parallel
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=256G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_parallel-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_parallel-%j.err
+
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== Parallel Speed Test: ALL 6 METHODS ==="
+echo "Scale: 10 profiles × 3 sessions = 30 sessions per method"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start TWO vLLM servers (user on 8004, agent on 8003)
+echo ""
+echo "Starting vLLM servers..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.90 --max-model-len 8192 \
+ --disable-log-requests --dtype bfloat16 &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.90 --max-model-len 8192 \
+ --disable-log-requests --dtype bfloat16 &
+
+for i in $(seq 1 120); do
+ u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
+ a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
+ [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break
+ sleep 2
+done
+
+# Additional wait for servers to fully load models
+echo "Waiting 30s for servers to fully initialize..."
+sleep 30
+echo "Starting experiments..."
+
+# Common parameters for all tests
+COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/parallel_test_all --profile-path $PROFILE_PATH"
+
+# Test 1: vanilla (batch processing)
+echo ""
+echo "=== TEST 1: vanilla (batch processing) ==="
+START=$(date +%s)
+python scripts/run_experiments.py \
+ --methods vanilla \
+ --datasets math-hard \
+ --n-profiles 10 --n-sessions 3 --max-turns 10 \
+ --use-vllm --parallel-profiles 10 \
+ --use-batch-processing --batch-size 30 \
+ --output-dir ../results/parallel_test_all \
+ --profile-path "$PROFILE_PATH"
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 2: contextual
+echo ""
+echo "=== TEST 2: contextual ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods contextual $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 3: reflection
+echo ""
+echo "=== TEST 3: reflection ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods reflection $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 4: all_memory
+echo ""
+echo "=== TEST 4: all_memory ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods all_memory $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 5: rag
+echo ""
+echo "=== TEST 5: rag ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 6: rag_vector
+echo ""
+echo "=== TEST 6: rag_vector ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+
+echo ""
+echo "=== ALL SPEED TESTS COMPLETE ==="
+date
diff --git a/collaborativeagents/scripts/test_parallel_speed_a100.sbatch b/collaborativeagents/scripts/test_parallel_speed_a100.sbatch
new file mode 100755
index 0000000..f3d0848
--- /dev/null
+++ b/collaborativeagents/scripts/test_parallel_speed_a100.sbatch
@@ -0,0 +1,126 @@
+#!/bin/bash
+#SBATCH --job-name=test_all_a100
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuA100x4
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=128G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100-%j.err
+
+set -e
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"
+
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+
+echo "=== Parallel Speed Test: ALL 6 METHODS (A100) ==="
+echo "Scale: 10 profiles × 3 sessions = 30 sessions per method"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start TWO vLLM servers (user on 8004, agent on 8003)
+echo ""
+echo "Starting vLLM servers..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.45 --max-model-len 8192 \
+ --disable-log-requests --dtype bfloat16 &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.45 --max-model-len 8192 \
+ --disable-log-requests --dtype bfloat16 &
+
+for i in $(seq 1 120); do
+ u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
+ a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
+ [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break
+ sleep 2
+done
+
+# Additional wait for servers to fully load models
+echo "Waiting 30s for servers to fully initialize..."
+sleep 30
+echo "Starting experiments..."
+
+# Common parameters for all tests
+COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/parallel_test_a100 --profile-path $PROFILE_PATH"
+
+# Test 1: vanilla (batch processing)
+echo ""
+echo "=== TEST 1: vanilla (batch processing) ==="
+START=$(date +%s)
+python scripts/run_experiments.py \
+ --methods vanilla \
+ --datasets math-hard \
+ --n-profiles 10 --n-sessions 3 --max-turns 10 \
+ --use-vllm --parallel-profiles 10 \
+ --use-batch-processing --batch-size 30 \
+ --output-dir ../results/parallel_test_a100 \
+ --profile-path "$PROFILE_PATH"
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 2: contextual
+echo ""
+echo "=== TEST 2: contextual ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods contextual $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 3: reflection
+echo ""
+echo "=== TEST 3: reflection ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods reflection $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 4: all_memory
+echo ""
+echo "=== TEST 4: all_memory ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods all_memory $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 5: rag
+echo ""
+echo "=== TEST 5: rag ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+# Test 6: rag_vector
+echo ""
+echo "=== TEST 6: rag_vector ==="
+START=$(date +%s)
+python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS
+END=$(date +%s)
+ELAPSED=$((END - START))
+echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr"
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+
+echo ""
+echo "=== ALL SPEED TESTS COMPLETE ==="
+date
diff --git a/collaborativeagents/scripts/test_parallel_vllm.sh b/collaborativeagents/scripts/test_parallel_vllm.sh
new file mode 100755
index 0000000..0cd0f1f
--- /dev/null
+++ b/collaborativeagents/scripts/test_parallel_vllm.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+# Test parallel vLLM processing on H200x8-interactive
+# Usage: Run this on an interactive H200 node
+#
+# srun --account=bfqt-delta-gpu --partition=gpuH200x8-interactive \
+# --nodes=1 --gpus-per-node=4 --time=02:00:00 --mem=200G --pty bash
+# cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+# bash scripts/test_parallel_vllm.sh
+
+set -e
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"
+
+# Configuration
+MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PORT_70B=8004
+PORT_8B=8003
+
+echo "============================================"
+echo "Parallel vLLM Experiment Test"
+echo "============================================"
+echo "Date: $(date)"
+echo "Node: $(hostname)"
+echo ""
+
+echo "=== GPU Info ==="
+nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv
+echo ""
+
+# Kill any existing vLLM servers
+echo "Cleaning up any existing vLLM servers..."
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+echo "============================================"
+echo "Starting vLLM Servers"
+echo "============================================"
+
+# Start 70B AWQ server on GPU 0-1 (needs 2 GPUs for tensor parallelism)
+echo ""
+echo "Starting 70B AWQ vLLM Server (GPU 0-1, TP=2)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_70B_AWQ \
+ --port $PORT_70B \
+ --tensor-parallel-size 2 \
+ --gpu-memory-utilization 0.90 \
+ --max-model-len 4096 \
+ --disable-log-requests \
+ --quantization awq \
+ --dtype float16 &
+SERVER_70B_PID=$!
+echo "70B Server PID: $SERVER_70B_PID"
+
+# Start 8B server on GPU 2
+echo ""
+echo "Starting 8B vLLM Server (GPU 2)..."
+CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
+ --model $MODEL_8B \
+ --port $PORT_8B \
+ --gpu-memory-utilization 0.90 \
+ --max-model-len 4096 \
+ --disable-log-requests \
+ --dtype bfloat16 &
+SERVER_8B_PID=$!
+echo "8B Server PID: $SERVER_8B_PID"
+
+echo ""
+echo "Waiting for servers to start..."
+
+# Wait for 70B (may take 3-5 minutes)
+for i in $(seq 1 120); do
+ if curl -s http://localhost:$PORT_70B/health > /dev/null 2>&1; then
+ echo "70B Server ready after $((i*3)) seconds"
+ break
+ fi
+ if [ $((i % 20)) -eq 0 ]; then
+ echo " Waiting for 70B... ($((i*3)) seconds)"
+ fi
+ sleep 3
+done
+
+# Wait for 8B
+for i in $(seq 1 60); do
+ if curl -s http://localhost:$PORT_8B/health > /dev/null 2>&1; then
+ echo "8B Server ready after $((i*2)) seconds"
+ break
+ fi
+ sleep 2
+done
+
+# Check both servers
+echo ""
+if ! curl -s http://localhost:$PORT_70B/health > /dev/null 2>&1; then
+ echo "ERROR: 70B server failed to start"
+ kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null
+ exit 1
+fi
+echo "✓ 70B server healthy"
+
+if ! curl -s http://localhost:$PORT_8B/health > /dev/null 2>&1; then
+ echo "ERROR: 8B server failed to start"
+ kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null
+ exit 1
+fi
+echo "✓ 8B server healthy"
+
+echo ""
+echo "=== vLLM Server Info ==="
+echo "70B model:"
+curl -s http://localhost:$PORT_70B/v1/models | python -m json.tool 2>/dev/null | head -10
+echo ""
+echo "8B model:"
+curl -s http://localhost:$PORT_8B/v1/models | python -m json.tool 2>/dev/null | head -10
+
+echo ""
+echo "============================================"
+echo "Test 1: Sequential Processing (1 profile)"
+echo "============================================"
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts
+
+time python run_experiments.py \
+ --methods vanilla \
+ --datasets mmlu \
+ --n-profiles 1 \
+ --n-sessions 3 \
+ --use-vllm \
+ --vllm-user-url http://localhost:$PORT_70B/v1 \
+ --vllm-agent-url http://localhost:$PORT_8B/v1 \
+ --parallel-profiles 1 \
+ --output-dir ../results/parallel_test_seq \
+ --profile-path ../data/complex_profiles_v2/profiles_100.jsonl
+
+echo ""
+echo "============================================"
+echo "Test 2: Parallel Processing (4 profiles)"
+echo "============================================"
+
+time python run_experiments.py \
+ --methods vanilla \
+ --datasets mmlu \
+ --n-profiles 4 \
+ --n-sessions 3 \
+ --use-vllm \
+ --vllm-user-url http://localhost:$PORT_70B/v1 \
+ --vllm-agent-url http://localhost:$PORT_8B/v1 \
+ --parallel-profiles 4 \
+ --output-dir ../results/parallel_test_4 \
+ --profile-path ../data/complex_profiles_v2/profiles_100.jsonl
+
+echo ""
+echo "============================================"
+echo "Test 3: Parallel Processing (8 profiles)"
+echo "============================================"
+
+time python run_experiments.py \
+ --methods vanilla \
+ --datasets mmlu \
+ --n-profiles 8 \
+ --n-sessions 3 \
+ --use-vllm \
+ --vllm-user-url http://localhost:$PORT_70B/v1 \
+ --vllm-agent-url http://localhost:$PORT_8B/v1 \
+ --parallel-profiles 8 \
+ --output-dir ../results/parallel_test_8 \
+ --profile-path ../data/complex_profiles_v2/profiles_100.jsonl
+
+echo ""
+echo "============================================"
+echo "Test 4: Parallel Processing (16 profiles)"
+echo "============================================"
+
+time python run_experiments.py \
+ --methods vanilla \
+ --datasets mmlu \
+ --n-profiles 16 \
+ --n-sessions 3 \
+ --use-vllm \
+ --vllm-user-url http://localhost:$PORT_70B/v1 \
+ --vllm-agent-url http://localhost:$PORT_8B/v1 \
+ --parallel-profiles 16 \
+ --output-dir ../results/parallel_test_16 \
+ --profile-path ../data/complex_profiles_v2/profiles_100.jsonl
+
+# Cleanup
+echo ""
+echo "Cleaning up..."
+kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null || true
+wait $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null || true
+
+echo ""
+echo "============================================"
+echo "TEST COMPLETE!"
+echo "============================================"
+echo ""
+echo "Compare the timing results above to estimate optimal parallelism."
+echo "Expected scaling: Higher parallelism → Higher throughput (until bottleneck)"
+echo ""
+date
diff --git a/collaborativeagents/scripts/test_rag_empty.sbatch b/collaborativeagents/scripts/test_rag_empty.sbatch
new file mode 100644
index 0000000..735adbc
--- /dev/null
+++ b/collaborativeagents/scripts/test_rag_empty.sbatch
@@ -0,0 +1,143 @@
+#!/bin/bash
+#SBATCH --job-name=rag_empty
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=250G
+#SBATCH --time=03:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.err
+
+# Test RAG with EMPTY memory store - start fresh and accumulate
+# 5 profiles, 15 sessions each (more sessions to test accumulation)
+# Compare: vanilla, rag, rag_vector
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== RAG Empty Memory Store Test ==="
+echo "Key change: Starting with EMPTY memory store"
+echo " - RAG will accumulate memories during evaluation"
+echo " - Each user builds their own memory basket from scratch"
+echo ""
+echo "Settings: 5 profiles, 15 sessions each"
+echo "User simulator: $USER_MODEL (70B)"
+echo "Agent: $AGENT_MODEL (8B)"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Clear empty store before each run to ensure fresh start
+echo ""
+echo "Clearing empty memory store..."
+> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+echo "Memory store cleared."
+
+# Start vLLM servers with adjusted memory allocation
+echo ""
+echo "Starting vLLM servers..."
+
+# User simulator on GPUs 0,1 (70B, TP=2)
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+ --model $USER_MODEL \
+ --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.85 \
+ --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME \
+ --disable-log-requests &
+USER_PID=$!
+
+# Agent on GPUs 2,3 (8B, TP=2) - reduced memory for embedding/reranker headroom
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+ --model $AGENT_MODEL \
+ --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.40 \
+ --max-model-len 16384 --dtype bfloat16 \
+ --disable-log-requests &
+AGENT_PID=$!
+
+# Wait for servers
+echo "Waiting for vLLM servers (may take 5-10 min)..."
+for i in {1..200}; do
+ if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+ echo "User simulator (8004) ready after $((i*5)) seconds"
+ break
+ fi
+ sleep 5
+done
+for i in {1..60}; do
+ if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+ echo "Agent (8003) ready after $((i*5)) seconds"
+ break
+ fi
+ sleep 5
+done
+
+if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then
+ echo "ERROR: User server not healthy"
+ kill $USER_PID $AGENT_PID 2>/dev/null
+ exit 1
+fi
+if ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
+ echo "ERROR: Agent server not healthy"
+ kill $USER_PID $AGENT_PID 2>/dev/null
+ exit 1
+fi
+echo "Both vLLM servers ready"
+sleep 5
+
+OUTPUT_DIR="../results/rag_empty_test_$(date +%Y%m%d_%H%M%S)"
+
+# Run methods sequentially (each starts with fresh empty memory)
+for METHOD in vanilla rag rag_vector; do
+ echo ""
+ echo "============================================"
+ echo "Testing method: $METHOD"
+ echo "============================================"
+
+ # Clear memory store before each method for fair comparison
+ > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+ rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+ echo "Memory store cleared for $METHOD"
+
+ date
+ START=$(date +%s)
+
+  python scripts/run_experiments.py --methods $METHOD \
+    --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+    --use-vllm --no-batch-processing --parallel-profiles 5 \
+    --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+  STATUS=$?
+  END=$(date +%s)
+  ELAPSED=$((END-START))
+
+  # Show memory accumulation stats for RAG methods
+  if [[ "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then
+    CARD_COUNT=$(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl 2>/dev/null || echo 0)
+    echo "Memory cards accumulated: $CARD_COUNT"
+  fi
+
+  if [ $STATUS -eq 0 ]; then
+    echo "Method $METHOD: SUCCESS (${ELAPSED}s)"
+  else
+    echo "Method $METHOD: FAILED after ${ELAPSED}s"
+  fi
+done
+
+echo ""
+echo "============================================"
+echo "RAG Empty Memory Test Complete"
+echo "============================================"
+echo "Results saved to: $OUTPUT_DIR"
+date
+
+# Cleanup
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_rag_empty_v2.sbatch b/collaborativeagents/scripts/test_rag_empty_v2.sbatch
new file mode 100644
index 0000000..834dccb
--- /dev/null
+++ b/collaborativeagents/scripts/test_rag_empty_v2.sbatch
@@ -0,0 +1,124 @@
+#!/bin/bash
+#SBATCH --job-name=rag_empty
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:8
+#SBATCH --mem=400G
+#SBATCH --time=04:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.err
+
+# Test RAG with EMPTY memory store - start fresh and accumulate
+# Using 8 GPUs with TP=4 for 70B to avoid CUDA errors
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export NCCL_P2P_DISABLE=1
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== RAG Empty Memory Store Test v2 ==="
+echo "Using 8 GPUs: TP=4 for 70B user sim, TP=2 for 8B agent"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Clear empty store
+echo "Clearing empty memory store..."
+> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+# Start vLLM servers
+echo "Starting vLLM servers..."
+
+# User simulator on GPUs 0-3 (70B, TP=4)
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server \
+ --model $USER_MODEL \
+ --port 8004 --tensor-parallel-size 4 --gpu-memory-utilization 0.90 \
+ --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME \
+ --disable-log-requests &
+USER_PID=$!
+
+# Agent on GPUs 4-5 (8B, TP=2)
+CUDA_VISIBLE_DEVICES=4,5 python -m vllm.entrypoints.openai.api_server \
+ --model $AGENT_MODEL \
+ --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.40 \
+ --max-model-len 16384 --dtype bfloat16 \
+ --disable-log-requests &
+AGENT_PID=$!
+
+# Wait for servers
+echo "Waiting for vLLM servers..."
+for i in {1..300}; do
+ if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+ echo "User simulator (8004) ready after $((i*5)) seconds"
+ break
+ fi
+ sleep 5
+done
+for i in {1..120}; do
+ if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+ echo "Agent (8003) ready after $((i*5)) seconds"
+ break
+ fi
+ sleep 5
+done
+
+if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then
+ echo "ERROR: User server not healthy"
+ kill $USER_PID $AGENT_PID 2>/dev/null
+ exit 1
+fi
+if ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
+ echo "ERROR: Agent server not healthy"
+ kill $USER_PID $AGENT_PID 2>/dev/null
+ exit 1
+fi
+echo "Both vLLM servers ready"
+sleep 5
+
+OUTPUT_DIR="../results/rag_empty_test_$(date +%Y%m%d_%H%M%S)"
+
+for METHOD in vanilla rag rag_vector; do
+ echo ""
+ echo "============================================"
+ echo "Testing method: $METHOD"
+ echo "============================================"
+
+ # Clear memory store before each method
+ > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+ rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+ date
+ START=$(date +%s)
+
+ python scripts/run_experiments.py --methods $METHOD \
+ --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+ --use-vllm --no-batch-processing --parallel-profiles 5 \
+ --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+ END=$(date +%s)
+ ELAPSED=$((END-START))
+
+ if [[ "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then
+ CARD_COUNT=$(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl 2>/dev/null || echo 0)
+ echo "Memory cards accumulated: $CARD_COUNT"
+ fi
+
+ echo "Method $METHOD: completed in ${ELAPSED}s"
+done
+
+echo ""
+echo "=== Test Complete ==="
+echo "Results: $OUTPUT_DIR"
+date
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_rag_empty_v3.sbatch b/collaborativeagents/scripts/test_rag_empty_v3.sbatch
new file mode 100644
index 0000000..db9bd5c
--- /dev/null
+++ b/collaborativeagents/scripts/test_rag_empty_v3.sbatch
@@ -0,0 +1,110 @@
#!/bin/bash
#SBATCH --job-name=rag_empty
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=03:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.err

# Test RAG with EMPTY memory store using previous working settings
# 4 GPUs, TP=2 for both models (same as smallscale_test.sbatch)
#
# Fix: the per-method SUCCESS/FAILED report previously tested $? AFTER the
# timing arithmetic and the card-count block had already overwritten it, so
# every method reported SUCCESS. The exit status of run_experiments.py is
# now captured into STATUS immediately after the call.

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

echo "=== RAG Empty Memory Store Test ==="
echo "Using previous working settings (4 GPUs, TP=2)"
echo "Settings: 5 profiles, 15 sessions each"
date

# Clear empty store
echo "Clearing empty memory store..."
> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy

# Start vLLM servers (same settings as smallscale_test.sbatch)
# User simulator on GPUs 0,1 (70B, TP=2)
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $USER_MODEL \
  --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
  --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &

# Agent on GPUs 2,3 (8B, TP=2, lower memory for embedding/reranker)
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
  --model $AGENT_MODEL \
  --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
  --max-model-len 16384 --dtype bfloat16 &

# Wait for servers (poll /health every 5s)
echo "Waiting for vLLM servers..."
for i in {1..200}; do
  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "User simulator (8004) ready after $((i*5)) seconds"
    break
  fi
  sleep 5
done
for i in {1..60}; do
  if curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "Agent (8003) ready after $((i*5)) seconds"
    break
  fi
  sleep 5
done
echo "Both vLLM servers ready"
sleep 10

OUTPUT_DIR="../results/rag_empty_test_$(date +%Y%m%d_%H%M%S)"

for METHOD in vanilla rag rag_vector; do
  echo ""
  echo "============================================"
  echo "Testing method: $METHOD"
  echo "============================================"

  # Clear memory store before each method for fair comparison
  > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
  rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
  echo "Memory store cleared for $METHOD"

  date
  START=$(date +%s)

  python scripts/run_experiments.py --methods $METHOD \
    --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
    --use-vllm --no-batch-processing --parallel-profiles 5 \
    --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
  STATUS=$?  # capture before anything below clobbers $?

  END=$(date +%s)
  ELAPSED=$((END-START))

  if [[ "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then
    CARD_COUNT=$(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl 2>/dev/null || echo 0)
    echo "Memory cards accumulated: $CARD_COUNT"
  fi

  if [ $STATUS -eq 0 ]; then
    echo "Method $METHOD: SUCCESS (${ELAPSED}s)"
  else
    echo "Method $METHOD: FAILED after ${ELAPSED}s"
  fi
done

echo ""
echo "=== Test Complete ==="
echo "Results: $OUTPUT_DIR"
date

pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_rag_fix.sbatch b/collaborativeagents/scripts/test_rag_fix.sbatch
new file mode 100644
index 0000000..b07d286
--- /dev/null
+++ b/collaborativeagents/scripts/test_rag_fix.sbatch
@@ -0,0 +1,123 @@
#!/bin/bash
#SBATCH --job-name=test_rag_fix
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=02:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_rag_fix-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_rag_fix-%j.err

# Small-scale test: 5 profiles, 10 sessions each
# Tests RAG fixes: extract_session accumulation, nopersonal mode
# Compare: vanilla, rag, rag_vector
#
# Fix: the SUCCESS/FAILED check previously ran `if [ $? -eq 0 ]` AFTER
# `ELAPSED=$((END-START))`, whose (always-zero) status replaced the python
# exit code — every method reported SUCCESS. Capture STATUS immediately.

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"

echo "=== RAG Fix Verification Test ==="
echo "Testing fixes:"
echo "  1. extract_session (accumulate all turns)"
echo "  2. RAG mode=nopersonal (pure dense+rerank)"
echo "  3. Explicit normalize=True"
echo ""
echo "Settings: 5 profiles, 10 sessions each, max 15 turns"
echo "User simulator: $USER_MODEL (70B)"
echo "Agent: $AGENT_MODEL (8B)"
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Start vLLM servers
# User simulator on GPUs 0,1 (70B, TP=2)
echo ""
echo "Starting vLLM servers..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $USER_MODEL \
  --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
  --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME \
  --disable-log-requests &
USER_PID=$!

# Agent on GPUs 2,3 (8B, TP=2, lower memory for embedding/reranker)
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
  --model $AGENT_MODEL \
  --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
  --max-model-len 16384 --dtype bfloat16 \
  --disable-log-requests &
AGENT_PID=$!

# Wait for servers (poll /health every 5s)
echo "Waiting for vLLM servers (may take 5-10 min)..."
for i in {1..200}; do
  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "User simulator (8004) ready after $((i*5)) seconds"
    break
  fi
  sleep 5
done
for i in {1..60}; do
  if curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "Agent (8003) ready after $((i*5)) seconds"
    break
  fi
  sleep 5
done

# Abort early if either server never came up
if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then
  echo "ERROR: User server not healthy"
  kill $USER_PID $AGENT_PID 2>/dev/null
  exit 1
fi
if ! curl -s http://localhost:8003/health > /dev/null 2>&1; then
  echo "ERROR: Agent server not healthy"
  kill $USER_PID $AGENT_PID 2>/dev/null
  exit 1
fi
echo "Both vLLM servers ready"
sleep 5

# Test methods: vanilla (baseline), rag (fixed), rag_vector (fixed)
OUTPUT_DIR="../results/rag_fix_test_$(date +%Y%m%d_%H%M%S)"

for METHOD in vanilla rag rag_vector; do
  echo ""
  echo "============================================"
  echo "Testing method: $METHOD"
  echo "============================================"
  date
  START=$(date +%s)

  python scripts/run_experiments.py --methods $METHOD \
    --datasets math-hard --n-profiles 5 --n-sessions 10 --max-turns 15 \
    --use-vllm --no-batch-processing --parallel-profiles 5 \
    --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
  STATUS=$?  # must be read before the timing arithmetic resets $?

  END=$(date +%s)
  ELAPSED=$((END-START))

  if [ $STATUS -eq 0 ]; then
    echo "Method $METHOD: SUCCESS (${ELAPSED}s)"
  else
    echo "Method $METHOD: FAILED after ${ELAPSED}s"
  fi
done

echo ""
echo "============================================"
echo "RAG Fix Test Complete"
echo "============================================"
echo "Results saved to: $OUTPUT_DIR"
date

# Cleanup
pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_real_speed.sbatch b/collaborativeagents/scripts/test_real_speed.sbatch
new file mode 100644
index 0000000..ff914e6
--- /dev/null
+++ b/collaborativeagents/scripts/test_real_speed.sbatch
@@ -0,0 +1,87 @@
#!/bin/bash
#SBATCH --job-name=test_real
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8-interactive
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH --time=00:30:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_real-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_real-%j.err

# End-to-end timing run: 5 profiles x 5 sessions for two methods,
# each measured against freshly started vLLM servers.
set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"

echo "=== Real Speed Test (5 profiles, 5 sessions) ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Make sure no stale servers are still holding GPU memory.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# --- Test 1: contextual method, user + agent on separate vLLM servers ---
echo ""
echo "=== TEST 1: contextual (2 vLLM servers) ==="
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

# Poll both health endpoints every 2s, for at most 4 minutes.
for i in {1..120}; do
  if curl -s http://localhost:8004/health > /dev/null 2>&1 && \
     curl -s http://localhost:8003/health > /dev/null 2>&1; then
    echo "Ready after $((i*2))s"
    break
  fi
  sleep 2
done

time python scripts/run_experiments.py \
  --methods contextual \
  --datasets math-hard \
  --n-profiles 5 --n-sessions 5 --max-turns 10 \
  --use-vllm --parallel-profiles 5 \
  --output-dir ../results/speed_test \
  --profile-path "$PROFILE_PATH"

pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 5

# --- Test 2: all_memory method, vLLM user simulator + transformers adapter ---
echo ""
echo "=== TEST 2: all_memory (vLLM user + transformers adapter) ==="
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

for i in {1..120}; do
  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "Ready after $((i*2))s"
    break
  fi
  sleep 2
done

CUDA_VISIBLE_DEVICES=2,3 time python scripts/run_experiments.py \
  --methods all_memory \
  --datasets math-hard \
  --n-profiles 5 --n-sessions 5 --max-turns 10 \
  --use-vllm --parallel-profiles 5 \
  --output-dir ../results/speed_test \
  --profile-path "$PROFILE_PATH"

pkill -f "vllm.entrypoints" 2>/dev/null || true

echo ""
echo "=== DONE ==="
date
diff --git a/collaborativeagents/scripts/test_vllm_adapter.sh b/collaborativeagents/scripts/test_vllm_adapter.sh
new file mode 100755
index 0000000..af22667
--- /dev/null
+++ b/collaborativeagents/scripts/test_vllm_adapter.sh
@@ -0,0 +1,74 @@
#!/bin/bash
# Test vLLM with 45% memory + ContextualAdapter loading
#
# Verifies that a vLLM server capped at 45% GPU memory leaves room for a
# ContextualAdapter to load and generate in the same job. Prints GPU memory
# before/after each stage so the headroom can be eyeballed from the log.

set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"

echo "=== Testing vLLM 45% memory + Adapter ==="
echo "GPUs available:"
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Kill any existing vLLM
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

echo ""
echo "Memory before vLLM:"
nvidia-smi --query-gpu=index,memory.used --format=csv

echo ""
echo "Starting vLLM with 45% memory on GPU 0,1..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.45 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

VLLM_PID=$!
echo "vLLM PID: $VLLM_PID"

# Poll /health every 2s for up to 120s. If the server never comes up we
# fall through; the adapter test below will then fail on its own.
echo "Waiting for vLLM to start..."
for i in $(seq 1 60); do
  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "vLLM ready after $((i*2))s"
    break
  fi
  sleep 2
done

echo ""
echo "Memory after vLLM started:"
nvidia-smi --query-gpu=index,memory.used --format=csv

echo ""
echo "Testing ContextualAdapter loading..."
# NOTE(review): cwd is already .../collaborativeagents, so these inserts
# resolve to collaborativeagents/collaborativeagents and
# collaborativeagents/src relative to cwd. Sibling scripts in this directory
# insert '.' instead — confirm which paths are intended here.
python -c "
import sys
sys.path.insert(0, 'collaborativeagents')
sys.path.insert(0, 'src')

from adapters.contextual_adapter import ContextualAdapter
print('Creating ContextualAdapter...')
adapter = ContextualAdapter()
print('Initializing (loading model)...')
adapter.initialize()
print('Testing generation...')
adapter.start_session('test')
result = adapter.generate_response('What is 2+2?')
print(f'Response: {result[\"response\"][:100]}')
print('SUCCESS: ContextualAdapter works with vLLM running!')
"

echo ""
echo "Final memory usage:"
nvidia-smi --query-gpu=index,memory.used --format=csv

# Cleanup
pkill -f "vllm.entrypoints" 2>/dev/null || true
echo "Test complete!"
diff --git a/collaborativeagents/scripts/test_vllm_interactive.sh b/collaborativeagents/scripts/test_vllm_interactive.sh
new file mode 100755
index 0000000..5da73b4
--- /dev/null
+++ b/collaborativeagents/scripts/test_vllm_interactive.sh
@@ -0,0 +1,212 @@
#!/bin/bash
# Test vLLM inference speed on interactive node
#
# Usage:
#   1. Get an interactive node:
#      srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash
#
#   2. Run this script:
#      bash scripts/test_vllm_interactive.sh
#
# This script will:
#   1. Start vLLM server for 8B model (agent)
#   2. Start vLLM server for 70B AWQ model (user simulator)
#   3. Run benchmarks
#   4. Compare with paper's 2000 conv/hr target

# Abort on the first unhandled failing command.
set -e

# Paths
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface"

# Ports
PORT_8B=8003
PORT_70B=8004

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN} vLLM Inference Speed Test${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""

# Check GPU availability
echo -e "${YELLOW}Checking GPUs...${NC}"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs"
echo ""

if [ "$NUM_GPUS" -lt 4 ]; then
    echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}"
fi

# Setup environment
export HF_HOME=$HF_CACHE
# NOTE(review): TRANSFORMERS_CACHE is superseded by HF_HOME in recent
# transformers releases — harmless to keep, but confirm the installed version.
export TRANSFORMERS_CACHE=$HF_CACHE

# Activate conda environment if needed
# source /path/to/conda/etc/profile.d/conda.sh
# conda activate your_env

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+
# Function to start vLLM server in the background.
#   $1 model path/name   $2 port   $3 CUDA_VISIBLE_DEVICES list
#   $4 extra CLI args    $5 log file
# Prints ONLY the background PID on stdout so callers can capture it with
# $(start_vllm_server ...).
#
# Fix: the progress messages previously went to stdout, so the command
# substitution in `PID_8B=$(start_vllm_server ...)` captured the banner text
# along with the PID, and the later `kill $PID_8B` was passed garbage words.
# Informational output now goes to stderr.
start_vllm_server() {
    local model=$1
    local port=$2
    local gpus=$3
    local extra_args=$4
    local logfile=$5

    echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" >&2
    echo "Model: $model" >&2

    CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \
        --model $model \
        --port $port \
        --gpu-memory-utilization 0.9 \
        --max-model-len 8192 \
        $extra_args \
        > $logfile 2>&1 &

    echo $!
}
+
# Function to wait for server to be ready: poll http://localhost:$1/health
# every 5 seconds, up to 5 minutes (60 attempts). Returns 0 once the server
# answers, 1 on timeout.
wait_for_server() {
    local port=$1
    local attempt

    echo -n "Waiting for server on port $port"
    for attempt in $(seq 1 60); do
        if curl -s http://localhost:$port/health > /dev/null 2>&1; then
            echo -e " ${GREEN}Ready!${NC}"
            return 0
        fi
        echo -n "."
        sleep 5
    done

    echo -e " ${RED}Timeout!${NC}"
    return 1
}
+
# Cleanup function: best-effort shutdown of any server this script started.
# Runs on every exit path via the EXIT trap below.
cleanup() {
    echo -e "\n${YELLOW}Cleaning up...${NC}"
    [ -n "$PID_8B" ] && kill $PID_8B 2>/dev/null || true
    [ -n "$PID_70B" ] && kill $PID_70B 2>/dev/null || true
    echo "Done."
}

trap cleanup EXIT
+
# Create log directory for per-server vLLM logs
mkdir -p logs

# ============================================
# Test 1: 8B model only (single GPU)
# ============================================
echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}"

PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b.log")
echo "Server PID: $PID_8B"

if wait_for_server $PORT_8B; then
    echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 20

    echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 50 --concurrent
else
    echo -e "${RED}Failed to start 8B server${NC}"
fi

# Stop 8B server
kill $PID_8B 2>/dev/null || true
sleep 5

# ============================================
# Test 2: 70B AWQ model (4 GPUs with tensor parallelism)
# ============================================
echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
    PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log")
    echo "Server PID: $PID_70B"

    if wait_for_server $PORT_70B; then
        echo -e "\n${YELLOW}Running 70B benchmark (20 requests)...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 20

        echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 50 --concurrent
    else
        echo -e "${RED}Failed to start 70B server${NC}"
        echo "Check logs/vllm_70b.log for errors"
    fi

    # Stop 70B server
    kill $PID_70B 2>/dev/null || true
    sleep 5
else
    echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}"
fi

# ============================================
# Test 3: Full conversation simulation
# ============================================
echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
    # Start both servers
    # 8B on GPU 0, 70B on GPUs 1,2,3 (tensor parallel 3)
    # Or split differently based on memory

    echo "Starting 8B server on GPU 0..."
    PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b_conv.log")

    echo "Starting 70B server on GPUs 1,2,3..."
    PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log")

    # Fix: the original ran both waits as plain statements and then tested
    # $?, which only reflected the SECOND wait (and under `set -e` a timeout
    # aborted the script before the test). Require both servers explicitly;
    # a timeout now skips the benchmark instead of killing the script.
    if wait_for_server $PORT_8B && wait_for_server $PORT_70B; then
        echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}"
        python scripts/benchmark_inference.py --mode conversation \
            --url-8b http://localhost:$PORT_8B/v1 \
            --url-70b http://localhost:$PORT_70B/v1 \
            -n 10
    fi
else
    echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}"
fi

# ============================================
# Summary
# ============================================
echo -e "\n${GREEN}======================================${NC}"
echo -e "${GREEN} Test Complete!${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""
echo "Target: 2000 conversations/hour (paper on H100x8)"
echo ""
echo "Check the benchmark results above to see how close we are."
echo "If throughput is still low, check:"
echo "  1. GPU utilization during tests (nvidia-smi dmon -s u)"
echo "  2. vLLM logs in logs/*.log"
echo "  3. Network latency if using remote servers"
diff --git a/collaborativeagents/scripts/test_vllm_speed.sbatch b/collaborativeagents/scripts/test_vllm_speed.sbatch
new file mode 100644
index 0000000..070df5d
--- /dev/null
+++ b/collaborativeagents/scripts/test_vllm_speed.sbatch
@@ -0,0 +1,130 @@
#!/bin/bash
#SBATCH --job-name=test_vllm_speed
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=00:30:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_speed-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_speed-%j.err

# Micro-benchmark of two adapter stacks on H200s:
#   Test 1 — ContextualAdapter calling a vLLM server over HTTP
#   Test 2 — PersonalizedLLMAdapter (transformers) with a vLLM user simulator
# Each test prints requests/s and an extrapolated sessions/hr figure.
set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"

echo "=== vLLM Speed Test ==="
date
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv

# Clear out any servers left over from a previous job on this node.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

echo ""
echo "=== Test 1: ContextualAdapter with vLLM (2 servers) ==="
echo "Starting vLLM servers on GPU 0,1 (user) and GPU 2,3 (agent)..."

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

# Poll both health endpoints every 2s for up to 4 minutes. Each $(...)
# always exits 0 (echo 1/echo 0), so this loop is safe under `set -e`.
for i in $(seq 1 120); do
  u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
  a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
  if [ "$u" = "1" ] && [ "$a" = "1" ]; then
    echo "Both servers ready after $((i*2))s"; break
  fi
  sleep 2
done

# Inline benchmark driver; relies on cwd being collaborativeagents so that
# sys.path.insert(0, '.') makes the adapters package importable.
python -c "
import time
import sys
sys.path.insert(0, '.')
from adapters.contextual_adapter import ContextualAdapter

print('Testing ContextualAdapter with vLLM...')
adapter = ContextualAdapter(vllm_url='http://localhost:8003/v1')
adapter.initialize()
adapter.start_session('test_user')

# Warm up
adapter.generate_response('Hello')

# Benchmark
n_requests = 20
start = time.time()
for i in range(n_requests):
    resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? Give a brief answer.')
elapsed = time.time() - start

print(f'ContextualAdapter (vLLM): {n_requests} requests in {elapsed:.2f}s')
print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr')
print(f'Estimated sessions/hr (assuming 5 turns/session): {n_requests/elapsed*3600/5:.0f}')
"

# Tear down Test 1 servers before reconfiguring GPUs for Test 2.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 5

echo ""
echo "=== Test 2: PersonalizedLLMAdapter (vLLM user + transformers adapter) ==="
echo "Starting vLLM on GPU 0,1 for user simulation..."

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

for i in $(seq 1 120); do
  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "Server ready after $((i*2))s"; break
  fi
  sleep 2
done

# The transformers-based adapter loads its model directly on GPUs 2,3.
echo "Loading PersonalizedLLMAdapter on GPU 2,3..."
CUDA_VISIBLE_DEVICES=2,3 python -c "
import time
import sys
sys.path.insert(0, '.')
from adapters.personalized_llm_adapter import create_baseline_adapter

print('Testing PersonalizedLLMAdapter (all_memory mode)...')
adapter = create_baseline_adapter('all_memory')
adapter.initialize()
adapter.start_session('test_user')

# Warm up
adapter.generate_response('Hello')

# Benchmark
n_requests = 10
start = time.time()
for i in range(n_requests):
    resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? Give a brief answer.')
elapsed = time.time() - start

print(f'PersonalizedLLMAdapter (transformers): {n_requests} requests in {elapsed:.2f}s')
print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr')
print(f'Estimated sessions/hr (assuming 5 turns/session): {n_requests/elapsed*3600/5:.0f}')
"

pkill -f "vllm.entrypoints" 2>/dev/null || true

echo ""
echo "=== Test Complete ==="
date
diff --git a/collaborativeagents/scripts/test_vllm_speed_a100.sbatch b/collaborativeagents/scripts/test_vllm_speed_a100.sbatch
new file mode 100644
index 0000000..7695cfc
--- /dev/null
+++ b/collaborativeagents/scripts/test_vllm_speed_a100.sbatch
@@ -0,0 +1,126 @@
#!/bin/bash
#SBATCH --job-name=test_vllm_a100
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=00:30:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_a100-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_a100-%j.err

# A100 variant of the vLLM speed test: same two benchmarks as the H200
# script (ContextualAdapter over vLLM, then PersonalizedLLMAdapter with a
# vLLM user simulator), run on the gpuA100x4 partition for comparison.
set -e
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"

echo "=== vLLM Speed Test (A100) ==="
date
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv

# Clear out any servers left over from a previous job on this node.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

echo ""
echo "=== Test 1: ContextualAdapter with vLLM (2 servers) ==="

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

# Poll both health endpoints every 2s for up to 4 minutes. Each $(...)
# always exits 0 (echo 1/echo 0), so this loop is safe under `set -e`.
for i in $(seq 1 120); do
  u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
  a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
  if [ "$u" = "1" ] && [ "$a" = "1" ]; then
    echo "Both servers ready after $((i*2))s"; break
  fi
  sleep 2
done

# Inline benchmark driver; relies on cwd being collaborativeagents so that
# sys.path.insert(0, '.') makes the adapters package importable.
python -c "
import time
import sys
sys.path.insert(0, '.')
from adapters.contextual_adapter import ContextualAdapter

print('Testing ContextualAdapter with vLLM...')
adapter = ContextualAdapter(vllm_url='http://localhost:8003/v1')
adapter.initialize()
adapter.start_session('test_user')

# Warm up
adapter.generate_response('Hello')

# Benchmark
n_requests = 20
start = time.time()
for i in range(n_requests):
    resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? Give a brief answer.')
elapsed = time.time() - start

print(f'ContextualAdapter (vLLM): {n_requests} requests in {elapsed:.2f}s')
print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr')
print(f'Estimated sessions/hr (5 turns/session): {n_requests/elapsed*3600/5:.0f}')
"

# Tear down Test 1 servers before reconfiguring GPUs for Test 2.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 5

echo ""
echo "=== Test 2: PersonalizedLLMAdapter (vLLM user + transformers) ==="

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 --max-model-len 8192 \
  --disable-log-requests --dtype bfloat16 &

for i in $(seq 1 120); do
  if curl -s http://localhost:8004/health > /dev/null 2>&1; then
    echo "Server ready after $((i*2))s"; break
  fi
  sleep 2
done

# The transformers-based adapter loads its model directly on GPUs 2,3.
CUDA_VISIBLE_DEVICES=2,3 python -c "
import time
import sys
sys.path.insert(0, '.')
from adapters.personalized_llm_adapter import create_baseline_adapter

print('Testing PersonalizedLLMAdapter (all_memory)...')
adapter = create_baseline_adapter('all_memory')
adapter.initialize()
adapter.start_session('test_user')

# Warm up
adapter.generate_response('Hello')

# Benchmark
n_requests = 10
start = time.time()
for i in range(n_requests):
    resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? Give a brief answer.')
elapsed = time.time() - start

print(f'PersonalizedLLMAdapter (transformers): {n_requests} requests in {elapsed:.2f}s')
print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr')
print(f'Estimated sessions/hr (5 turns/session): {n_requests/elapsed*3600/5:.0f}')
"

pkill -f "vllm.entrypoints" 2>/dev/null || true
echo ""
echo "=== Test Complete ==="
date
diff --git a/collaborativeagents/scripts/visualize.py b/collaborativeagents/scripts/visualize.py
new file mode 100644
index 0000000..2cf7369
--- /dev/null
+++ b/collaborativeagents/scripts/visualize.py
@@ -0,0 +1,492 @@
+import json
+from itertools import zip_longest
+import textwrap
+
def load_data(filepath):
    """Read a JSONL file and return its records as a list of dicts.

    Blank lines are skipped; every other line must be valid JSON.
    """
    with open(filepath, 'r') as f:
        return [json.loads(record) for record in f if record.strip()]
+
def format_conversation(conv, file_label):
    """Format a conversation record into a list of display lines.

    Renders the message transcript, tags turns whose matching
    'full_conversation_log' entry has 'enforce_preferences' set, and appends
    an [EVALUATION] summary block when present.

    Args:
        conv: conversation dict; may contain 'conversation',
            'full_conversation_log', and 'evaluation' keys.
        file_label: header label identifying the source file.

    Returns:
        List of unwrapped text lines.
    """
    def is_enforced(value):
        # Logs store the flag either as a bool or as the string "True".
        return value in (True, "True")

    # Fix: fetch the log defensively — the side-by-side callers pass a
    # {'conversation': [], 'evaluation': {}} placeholder without this key,
    # which previously raised KeyError in the evaluation section below.
    full_log = conv.get('full_conversation_log', [])

    lines = [f">>> {file_label} <<<", ""]

    if 'conversation' in conv:
        for i, msg in enumerate(conv['conversation'], 1):
            role = msg.get('role', 'unknown').upper()
            content = msg.get('content', '')
            lines.append(f"[{i}] {role}:")
            # Split content into lines and indent under the header.
            for content_line in content.split('\n'):
                lines.append(f"    {content_line}")

            # Message i pairs with log entry i-2. Fix: guard the lower bound
            # so message 1 (index -1) no longer wraps around to full_log[-1].
            log_idx = i - 2
            if 0 <= log_idx < len(full_log):
                if is_enforced(full_log[log_idx].get('enforce_preferences')):
                    lines.append("<<<<< Enforced preference >>>>>")

            lines.append("")  # Empty line after each message

    # Format evaluation
    if 'evaluation' in conv:
        lines.append("[EVALUATION]")
        eval_data = conv['evaluation']

        if 'final_answer' in eval_data:
            lines.append(f"• Final Answer: {eval_data['final_answer']}")

        if 'accuracy' in eval_data:
            acc = eval_data['accuracy']['accuracy']
            acc_symbol = "✓" if acc == 1 else "✗"
            lines.append(f"• Accuracy: {acc} {acc_symbol}")

        num_enforced_preferences = sum(
            1 for message in full_log
            if is_enforced(message.get('enforce_preferences'))
        )
        lines.append(f"• Number of enforced preferences: {num_enforced_preferences}")

        if 'conversation_length' in eval_data:
            lines.append(f"• Length: {eval_data['conversation_length']} msgs")

    return lines
+
def wrap_lines(lines, width):
    """Hard-wrap every line to at most *width* characters.

    Lines already within the limit pass through unchanged; longer ones are
    split with textwrap (long words broken, hyphen breaking disabled).
    """
    wrapped = []
    for text in lines:
        if len(text) > width:
            wrapped.extend(
                textwrap.wrap(text, width=width,
                              break_long_words=True, break_on_hyphens=False))
        else:
            wrapped.append(text)
    return wrapped
+
def calculate_aggregate_stats(users_data):
    """Average accuracy, conversation length, and enforced-preference count
    over every conversation of every user in *users_data*.

    Returns a (avg_accuracy, avg_length, avg_enforced) tuple; each average
    is 0 when no data points were found for it.
    """
    accuracies, lengths, enforced_counts = [], [], []

    for user in users_data:
        for conv in user.get('generated_conversations', []):
            evaluation = conv.get('evaluation', {})

            if 'accuracy' in evaluation:
                accuracies.append(evaluation['accuracy']['accuracy'])

            if 'conversation_length' in evaluation:
                lengths.append(evaluation['conversation_length'])

            if 'full_conversation_log' in conv:
                enforced_counts.append(sum(
                    1 for msg in conv['full_conversation_log']
                    if msg.get('enforce_preferences') in (True, "True")))

    def mean(values):
        # Empty sample averages to 0 (matches the original's fallback).
        return sum(values) / len(values) if values else 0

    return mean(accuracies), mean(lengths), mean(enforced_counts)
+
def print_side_by_side(conv1, conv2, label1, label2, col_width=60):
    """Print two formatted conversations as aligned columns joined by ' | '."""
    left = wrap_lines(format_conversation(conv1, label1), col_width)
    right = wrap_lines(format_conversation(conv2, label2), col_width)

    # Column headers and a divider row.
    print(f"\n{label1:<{col_width}} | {label2}")
    divider = '-' * col_width
    print(f"{divider} | {divider}")

    # Shorter column is padded with empty strings; left side is space-padded
    # so the separator stays aligned.
    for left_line, right_line in zip_longest(left, right, fillvalue=''):
        print(f"{left_line.ljust(col_width)} | {right_line}")
+
def print_side_by_side_3(conv1, conv2, conv3, label1, label2, label3, col_width=42):
    """Print three formatted conversations as aligned columns joined by ' | '."""
    columns = [
        wrap_lines(format_conversation(conv, label), col_width)
        for conv, label in ((conv1, label1), (conv2, label2), (conv3, label3))
    ]

    # Column headers and a divider row.
    print(f"\n{label1:<{col_width}} | {label2:<{col_width}} | {label3}")
    divider = '-' * col_width
    print(f"{divider} | {divider} | {divider}")

    # First two columns are space-padded so the separators stay aligned;
    # the last column is printed as-is.
    for a, b, c in zip_longest(*columns, fillvalue=''):
        print(f"{a.ljust(col_width)} | {b.ljust(col_width)} | {c}")
+
def format_detailed_full_log(conv, file_label):
    """Format a conversation's full_conversation_log, including every field.

    Renders each log entry as: the 'response' text, an enforcement banner,
    a fixed-order bullet list of well-known keys, then any remaining keys
    under "Other fields". Ends with a pretty-printed [EVALUATION — FULL]
    block when 'evaluation' is present.

    Args:
        conv: conversation dict; may contain 'full_conversation_log' (list
            of per-turn dicts) and 'evaluation'.
        file_label: header label identifying the source file.

    Returns:
        List of unwrapped text lines.
    """
    lines = []
    lines.append(f">>> {file_label} — FULL LOG <<<")
    lines.append("")

    if 'full_conversation_log' in conv and conv['full_conversation_log']:
        for j, msg in enumerate(conv['full_conversation_log'], 1):
            # Alternate roles starting with USER
            # NOTE(review): role is inferred purely from index parity —
            # assumes the log strictly alternates user/assistant turns;
            # confirm against the log writer.
            role_label = 'USER' if j % 2 == 1 else 'ASSISTANT'
            lines.append(f"[{j}] {role_label}:")

            # The flag may be stored as a bool or a string; accept all forms.
            # (Helper is re-created every iteration; harmless, just redundant.)
            def is_enforced(value):
                return value is True or value == "True" or value == "true"

            # 1) Response first (as plain text)
            response_text = msg.get('response')
            if response_text is not None:
                for line in str(response_text).split('\n'):
                    lines.append(f"{line}")

            # 1a) Enforcement tag if applicable
            if 'enforce_preferences' in msg and is_enforced(msg['enforce_preferences']):
                lines.append("<<<<< Preferences Enforced >>>>>")

            # 2) Ordered keys as bulleted items
            ordered_keys = [
                'preference_1_satisfied',
                'preference_2_satisfied',
                'preference_3_satisfied',
                'enforce_preferences',
                'draft_answer',
                'reasoning',
                'should_terminate',
            ]

            # Render one "key: value" bullet; dicts/lists are pretty-printed
            # JSON (falling back to str() if not serializable), multi-line
            # scalars get indented continuation lines.
            def append_bullet(key, value):
                if isinstance(value, (dict, list)):
                    try:
                        pretty_value = json.dumps(value, indent=2, sort_keys=True, ensure_ascii=False)
                    except Exception:
                        pretty_value = str(value)
                    lines.append(f"  - {key}:")
                    for ln in pretty_value.split('\n'):
                        lines.append(f"      {ln}")
                else:
                    value_str = str(value) if value is not None else ""
                    value_lines = value_str.split('\n') if value_str else [""]
                    # First line on the bullet
                    lines.append(f"  - {key}: {value_lines[0]}")
                    # Continuation lines indented slightly further
                    for cont in value_lines[1:]:
                        lines.append(f"      {cont}")

            for key in ordered_keys:
                if key in msg:
                    append_bullet(key, msg.get(key))

            # 3) Remaining keys grouped under Other fields
            shown_keys = set(['response'] + ordered_keys)
            remaining_keys = [k for k in msg.keys() if k not in shown_keys]
            if remaining_keys:
                lines.append("  - Other fields:")
                for k in sorted(remaining_keys):
                    v = msg[k]
                    if isinstance(v, (dict, list)):
                        try:
                            pretty_v = json.dumps(v, indent=2, sort_keys=True, ensure_ascii=False)
                        except Exception:
                            pretty_v = str(v)
                        lines.append(f"      {k}:")
                        for ln in pretty_v.split('\n'):
                            lines.append(f"        {ln}")
                    else:
                        v_str = str(v)
                        v_lines = v_str.split('\n') if v_str else [""]
                        lines.append(f"      {k}: {v_lines[0]}")
                        for cont in v_lines[1:]:
                            lines.append(f"        {cont}")

            lines.append("")
    else:
        lines.append("[No full_conversation_log available]")

    # Include evaluation details if present
    if 'evaluation' in conv:
        lines.append("[EVALUATION — FULL]")
        try:
            eval_pretty = json.dumps(conv['evaluation'], indent=2, sort_keys=True, ensure_ascii=False)
        except Exception:
            eval_pretty = str(conv['evaluation'])
        for line in eval_pretty.split('\n'):
            lines.append(f"  {line}")

    return lines
+
def print_detailed_logs_3(conv1, conv2, conv3, label1, label2, label3, col_width=42):
    """Print full detailed logs for three conversations as aligned columns."""
    columns = [
        wrap_lines(format_detailed_full_log(conv, label), col_width)
        for conv, label in ((conv1, label1), (conv2, label2), (conv3, label3))
    ]

    print(f"\n{label1:<{col_width}} | {label2:<{col_width}} | {label3}")
    divider = '-' * col_width
    print(f"{divider} | {divider} | {divider}")

    for a, b, c in zip_longest(*columns, fillvalue=''):
        print(f"{a.ljust(col_width)} | {b.ljust(col_width)} | {c}")
+
def print_user_info(user_data):
    """Print a user's profile (id, persona, preferences) to stdout.

    Each field is optional; missing keys are simply skipped.
    """
    print("\n[USER PROFILE]")
    if 'i' in user_data:
        print(f"User ID: {user_data['i']}")
    if 'persona' in user_data:
        print(f"Persona: {user_data['persona']}")
    if 'preferences' in user_data:
        print("Preferences:")
        for pref in user_data['preferences']:
            print(f"  - {pref}")
+
+
+
+
+
+
# Compare four experiment runs side by side for a single hand-picked user.
for task in ["bigcodebench"]: # ["math_500", "logiqa", "math_hard", "medqa", "mmlu"]:
    # NOTE(review): `task` is only used in the banner print below — all four
    # path literals are f-strings WITHOUT placeholders, pinned to the
    # "logiqa" llama70b_temp_1 run. Confirm whether {task} interpolation was
    # intended (the commented-out section at the bottom of this file does
    # interpolate {task}).
    user_profiles_without_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/user_profiles_without_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_20.jsonl"
    user_profiles_with_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/user_profiles_with_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_20.jsonl"
    agent_with_userpreferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/agent_with_user_preferences/logiqa_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_20_v2.jsonl"
    # NOTE(review): "agnet" is a typo for "agent" (local name only, harmless).
    agnet_with_reflection_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/agent_with_reflection_v3/logiqa_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_20.jsonl"


    file1_path = user_profiles_without_preferences_path
    file2_path = user_profiles_with_preferences_path
    file3_path = agent_with_userpreferences_path
    file4_path = agnet_with_reflection_path


    # Load users from all three files
    data1 = load_data(file1_path)
    data2 = load_data(file2_path)
    data3 = load_data(file3_path)
    data4 = load_data(file4_path)

    # Union of user ids over all four runs; on duplicate 'i' the LAST list
    # wins (data4 over data3 over data2 over data1), so the profile printed
    # below comes from the last file containing that user.
    id_to_user_data1 = {elem['i']: elem for elem in data1+data2+data3+data4}
    id_to_user_data2 = {elem['i']: elem for elem in data2}
    id_to_user_data3 = {elem['i']: elem for elem in data3}
    id_to_user_data4 = {elem['i']: elem for elem in data4}



    # `id` shadows the builtin — left as-is (rename would be cosmetic here).
    for id in id_to_user_data1:
        if id != 23: continue  # NOTE(review): debug filter — only user 23 is shown.
        # Per-user averages for each run. Raises KeyError if user 23 is
        # absent from any individual run's file.
        user_avg_acc1, user_avg_len1, user_avg_enf1 = calculate_aggregate_stats([id_to_user_data1[id]])
        user_avg_acc2, user_avg_len2, user_avg_enf2 = calculate_aggregate_stats([id_to_user_data2[id]])
        user_avg_acc3, user_avg_len3, user_avg_enf3 = calculate_aggregate_stats([id_to_user_data3[id]])
        user_avg_acc4, user_avg_len4, user_avg_enf4 = calculate_aggregate_stats([id_to_user_data4[id]])


        # print user info
        print("\n" + "="*125 + "\n")
        print(f"### Task: {task}\n")
        print("LOGGING FOR USER ID: ", id)
        print_user_info(id_to_user_data1[id])

        # Print the average performance for id_to_user_data1[id]
        # Print the average performance for id_to_user_data2[id]
        print("\n" + "-"*125)
        print("COMPARISON FOR THIS USER")
        print("-"*125)


        print("\nUser Without Preferences:")
        print(f"  Average Accuracy: {user_avg_acc1:.2f}")
        print(f"  Average # Messages: {user_avg_len1:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf1:.2f}")

        print("\nUser With Preferences:")
        print(f"  Average Accuracy: {user_avg_acc2:.2f}")
        print(f"  Average # Messages: {user_avg_len2:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf2:.2f}")

        print("\nAgent With User Preferences:")
        print(f"  Average Accuracy: {user_avg_acc3:.2f}")
        print(f"  Average # Messages: {user_avg_len3:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf3:.2f}")

        print("\nAgent With Reflection:")
        print(f"  Average Accuracy: {user_avg_acc4:.2f}")
        print(f"  Average # Messages: {user_avg_len4:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf4:.2f}")


        # print conversations
        # NOTE(review): keyed by problem text — assumes runs 2 and 3 saw the
        # same problems as run 1. Run 3 is .get()-guarded below, but run 2 is
        # not: a problem missing from file 2 raises KeyError.
        problem_to_conversation1 = {conv['sample']['problem']: conv for conv in id_to_user_data1[id]['generated_conversations']}
        problem_to_conversation2 = {conv['sample']['problem']: conv for conv in id_to_user_data2[id]['generated_conversations']}
        problem_to_conversation3 = {conv['sample']['problem']: conv for conv in id_to_user_data3[id]['generated_conversations']}

        for problem in problem_to_conversation1:
            print("\n" + "="*125)
            print(f"\n[PROBLEM]")
            print(problem)
            print(f"\n[SOLUTION]")
            print(problem_to_conversation1[problem]['sample']['solution'])
            print("\n" + "="*125)

            print_side_by_side_3(
                problem_to_conversation1[problem],
                problem_to_conversation2[problem],
                problem_to_conversation3.get(problem, {'conversation': [], 'evaluation': {}}),
                "FILE 1 (WITHOUT PREFERENCES)",
                "FILE 2 (WITH PREFERENCES)",
                "FILE 3 (AGENT WITH USER PREFS)",
                col_width=55
            )

            # Detailed logs below with all fields
            print("\n" + "-"*125)
            print("DETAILED FULL LOGS")
            print("-"*125)
            print_detailed_logs_3(
                problem_to_conversation1[problem],
                problem_to_conversation2[problem],
                problem_to_conversation3.get(problem, {'conversation': [], 'evaluation': {}}),
                "FILE 1 (WITHOUT PREFERENCES)",
                "FILE 2 (WITH PREFERENCES)",
                "FILE 3 (AGENT WITH USER PREFS)",
                col_width=55
            )

        # break
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# # ==============================================================================
+# # SEPARATE SECTION: Per-User Statistics Averaged Over All Tasks
+# # ==============================================================================
+
+# print("\n" + "="*125)
+# print("="*125)
+# print("STATISTICS FOR EACH USER, AVERAGED OVER ALL TASKS")
+# print("="*125)
+# print("="*125 + "\n")
+
+# # Dictionary to store all data for each user across all tasks
+# user_to_all_data = {}
+
+# # Collect data for all users across all tasks
+# for task in ["math_500", "logiqa", "math_hard", "medqa", "mmlu"]:
+# user_profiles_without_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/user_profiles_without_preferences/{task}_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_20.jsonl"
+# user_profiles_with_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/user_profiles_with_preferences/{task}_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_20.jsonl"
+# agent_with_userpreferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/agent_with_user_preferences/{task}_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_20_v2.jsonl"
+# agent_with_reflection_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/agent_with_reflection/{task}_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_20.jsonl"
+
+# data1 = load_data(user_profiles_without_preferences_path)
+# data2 = load_data(user_profiles_with_preferences_path)
+# data3 = load_data(agent_with_userpreferences_path)
+# data4 = load_data(agent_with_reflection_path)
+
+# # For each user in this task, store their data
+# for user_data in data1:
+# user_id = user_data['i']
+# if user_id not in user_to_all_data:
+# user_to_all_data[user_id] = {
+# 'persona': user_data.get('persona'),
+# 'preferences': user_data.get('preferences'),
+# 'data1': [], # without preferences
+# 'data2': [], # with preferences
+# 'data3': [], # agent with user preferences
+# 'data4': [] # agent with reflection
+# }
+# user_to_all_data[user_id]['data1'].append(user_data)
+
+# for user_data in data2:
+# user_id = user_data['i']
+# if user_id in user_to_all_data:
+# user_to_all_data[user_id]['data2'].append(user_data)
+
+# for user_data in data3:
+# user_id = user_data['i']
+# if user_id in user_to_all_data:
+# user_to_all_data[user_id]['data3'].append(user_data)
+
+# for user_data in data4:
+# user_id = user_data['i']
+# if user_id in user_to_all_data:
+# user_to_all_data[user_id]['data4'].append(user_data)
+
+# # Now print statistics for each user, averaged over all tasks
+# for user_id in sorted(user_to_all_data.keys()):
+# user_info = user_to_all_data[user_id]
+
+# # Calculate aggregate stats across all tasks for this user
+# user_avg_acc1, user_avg_len1, user_avg_enf1 = calculate_aggregate_stats(user_info['data1'])
+# user_avg_acc2, user_avg_len2, user_avg_enf2 = calculate_aggregate_stats(user_info['data2'])
+# user_avg_acc3, user_avg_len3, user_avg_enf3 = calculate_aggregate_stats(user_info['data3'])
+# user_avg_acc4, user_avg_len4, user_avg_enf4 = calculate_aggregate_stats(user_info['data4'])
+
+# print("\n" + "="*125)
+# print(f"USER ID: {user_id}")
+# print("="*125)
+
+# # Print user profile info
+# if user_info['persona']:
+# print(f"Persona: {user_info['persona']}")
+# if user_info['preferences']:
+# print(f"Preferences:")
+# for preference in user_info['preferences']:
+# print(f" - {preference}")
+
+# print("\n" + "-"*125)
+# print("STATISTICS AVERAGED OVER ALL TASKS")
+# print("-"*125)
+
+# print("\nUser Without Preferences:")
+# print(f" Average Accuracy: {user_avg_acc1:.2f}")
+# print(f" Average # Messages: {user_avg_len1:.2f}")
+# print(f" Average # Enforced Preferences: {user_avg_enf1:.2f}")
+
+# print("\nUser With Preferences:")
+# print(f" Average Accuracy: {user_avg_acc2:.2f}")
+# print(f" Average # Messages: {user_avg_len2:.2f}")
+# print(f" Average # Enforced Preferences: {user_avg_enf2:.2f}")
+
+# print("\nAgent With User Preferences:")
+# print(f" Average Accuracy: {user_avg_acc3:.2f}")
+# print(f" Average # Messages: {user_avg_len3:.2f}")
+# print(f" Average # Enforced Preferences: {user_avg_enf3:.2f}")
+
+# print("\nAgent With Reflection:")
+# print(f" Average Accuracy: {user_avg_acc4:.2f}")
+# print(f" Average # Messages: {user_avg_len4:.2f}")
+# print(f" Average # Enforced Preferences: {user_avg_enf4:.2f}")
+
+# print("\n" + "="*125)
+# print("END OF PER-USER STATISTICS")
+# print("="*125 + "\n")
+