From dc801c07cf38b0c495686463e6ca6f871a64440e Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Tue, 27 Jan 2026 09:57:37 -0600 Subject: Add collaborativeagents module and update gitignore - Add collaborativeagents subproject with adapters, agents, and evaluation modules - Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results Co-Authored-By: Claude Opus 4.5 --- collaborativeagents/scripts/benchmark_inference.py | 429 +++++++ .../scripts/configs/local_models.yaml | 1 + .../scripts/conflict_scenario_generator.py | 637 ++++++++++ .../scripts/contextual_test_small.sbatch | 80 ++ collaborativeagents/scripts/controlled_test.sbatch | 173 +++ collaborativeagents/scripts/exp_all_memory.sbatch | 59 + collaborativeagents/scripts/exp_contextual.sbatch | 66 + collaborativeagents/scripts/exp_rag.sbatch | 59 + collaborativeagents/scripts/exp_rag_vector.sbatch | 59 + collaborativeagents/scripts/exp_reflection.sbatch | 66 + .../scripts/exp_reflection_grpo.sbatch | 59 + collaborativeagents/scripts/exp_template.sbatch | 59 + collaborativeagents/scripts/exp_vanilla.sbatch | 59 + collaborativeagents/scripts/extend_profiles.py | 195 +++ .../scripts/full_experiment_batch.sbatch | 132 ++ .../scripts/full_experiment_sequential.sbatch | 131 ++ .../scripts/fullscale_method.sbatch | 92 ++ .../scripts/fullscale_vanilla.sbatch | 43 + .../scripts/generate_complex_profiles.py | 719 +++++++++++ .../scripts/generate_profiles_v2.py | 475 +++++++ .../scripts/generate_training_data.sh | 22 + collaborativeagents/scripts/preflight_test.py | 311 +++++ collaborativeagents/scripts/quick_rag_debug.sbatch | 78 ++ collaborativeagents/scripts/quick_test_a100.sbatch | 136 ++ collaborativeagents/scripts/quick_test_batch.sh | 137 ++ collaborativeagents/scripts/quick_test_h200.sbatch | 137 ++ .../scripts/rag_debug_interactive.sbatch | 87 ++ collaborativeagents/scripts/rag_test_v4.sbatch | 92 ++ collaborativeagents/scripts/rag_test_v5.sbatch | 96 ++ collaborativeagents/scripts/run.py 
| 504 ++++++++ collaborativeagents/scripts/run.sh | 98 ++ .../scripts/run_baseline_comparison.py | 608 +++++++++ collaborativeagents/scripts/run_debug.sh | 24 + collaborativeagents/scripts/run_experiments.py | 1328 ++++++++++++++++++++ collaborativeagents/scripts/run_fp8.sh | 65 + collaborativeagents/scripts/run_preflight_test.sh | 89 ++ .../scripts/scale_test_batch1.sbatch | 121 ++ .../scripts/scale_test_batch2.sbatch | 126 ++ .../scripts/scale_test_ctx_refl.sbatch | 114 ++ collaborativeagents/scripts/smallscale_test.sbatch | 87 ++ collaborativeagents/scripts/test_70b_pilot.py | 281 +++++ collaborativeagents/scripts/test_all_a100x8.sbatch | 124 ++ collaborativeagents/scripts/test_all_h200.sbatch | 126 ++ .../scripts/test_all_methods.sbatch | 91 ++ collaborativeagents/scripts/test_batch_50.py | 98 ++ collaborativeagents/scripts/test_batch_50.sh | 107 ++ .../scripts/test_batch_vs_parallel.sh | 151 +++ collaborativeagents/scripts/test_extractor.py | 46 + collaborativeagents/scripts/test_multiturn.py | 248 ++++ collaborativeagents/scripts/test_parallel_a100.sh | 172 +++ collaborativeagents/scripts/test_parallel_quick.sh | 158 +++ .../scripts/test_parallel_speed.sbatch | 126 ++ .../scripts/test_parallel_speed_a100.sbatch | 126 ++ collaborativeagents/scripts/test_parallel_vllm.sh | 205 +++ collaborativeagents/scripts/test_rag_empty.sbatch | 143 +++ .../scripts/test_rag_empty_v2.sbatch | 124 ++ .../scripts/test_rag_empty_v3.sbatch | 110 ++ collaborativeagents/scripts/test_rag_fix.sbatch | 123 ++ collaborativeagents/scripts/test_real_speed.sbatch | 87 ++ collaborativeagents/scripts/test_vllm_adapter.sh | 74 ++ .../scripts/test_vllm_interactive.sh | 212 ++++ collaborativeagents/scripts/test_vllm_speed.sbatch | 130 ++ .../scripts/test_vllm_speed_a100.sbatch | 126 ++ collaborativeagents/scripts/visualize.py | 492 ++++++++ 64 files changed, 11533 insertions(+) create mode 100755 collaborativeagents/scripts/benchmark_inference.py create mode 120000 
collaborativeagents/scripts/configs/local_models.yaml create mode 100644 collaborativeagents/scripts/conflict_scenario_generator.py create mode 100644 collaborativeagents/scripts/contextual_test_small.sbatch create mode 100644 collaborativeagents/scripts/controlled_test.sbatch create mode 100644 collaborativeagents/scripts/exp_all_memory.sbatch create mode 100644 collaborativeagents/scripts/exp_contextual.sbatch create mode 100644 collaborativeagents/scripts/exp_rag.sbatch create mode 100644 collaborativeagents/scripts/exp_rag_vector.sbatch create mode 100644 collaborativeagents/scripts/exp_reflection.sbatch create mode 100644 collaborativeagents/scripts/exp_reflection_grpo.sbatch create mode 100644 collaborativeagents/scripts/exp_template.sbatch create mode 100644 collaborativeagents/scripts/exp_vanilla.sbatch create mode 100644 collaborativeagents/scripts/extend_profiles.py create mode 100644 collaborativeagents/scripts/full_experiment_batch.sbatch create mode 100644 collaborativeagents/scripts/full_experiment_sequential.sbatch create mode 100644 collaborativeagents/scripts/fullscale_method.sbatch create mode 100644 collaborativeagents/scripts/fullscale_vanilla.sbatch create mode 100644 collaborativeagents/scripts/generate_complex_profiles.py create mode 100644 collaborativeagents/scripts/generate_profiles_v2.py create mode 100644 collaborativeagents/scripts/generate_training_data.sh create mode 100644 collaborativeagents/scripts/preflight_test.py create mode 100644 collaborativeagents/scripts/quick_rag_debug.sbatch create mode 100644 collaborativeagents/scripts/quick_test_a100.sbatch create mode 100755 collaborativeagents/scripts/quick_test_batch.sh create mode 100644 collaborativeagents/scripts/quick_test_h200.sbatch create mode 100644 collaborativeagents/scripts/rag_debug_interactive.sbatch create mode 100644 collaborativeagents/scripts/rag_test_v4.sbatch create mode 100644 collaborativeagents/scripts/rag_test_v5.sbatch create mode 100644 
collaborativeagents/scripts/run.py create mode 100644 collaborativeagents/scripts/run.sh create mode 100644 collaborativeagents/scripts/run_baseline_comparison.py create mode 100644 collaborativeagents/scripts/run_debug.sh create mode 100644 collaborativeagents/scripts/run_experiments.py create mode 100644 collaborativeagents/scripts/run_fp8.sh create mode 100755 collaborativeagents/scripts/run_preflight_test.sh create mode 100644 collaborativeagents/scripts/scale_test_batch1.sbatch create mode 100644 collaborativeagents/scripts/scale_test_batch2.sbatch create mode 100644 collaborativeagents/scripts/scale_test_ctx_refl.sbatch create mode 100644 collaborativeagents/scripts/smallscale_test.sbatch create mode 100644 collaborativeagents/scripts/test_70b_pilot.py create mode 100644 collaborativeagents/scripts/test_all_a100x8.sbatch create mode 100644 collaborativeagents/scripts/test_all_h200.sbatch create mode 100644 collaborativeagents/scripts/test_all_methods.sbatch create mode 100644 collaborativeagents/scripts/test_batch_50.py create mode 100755 collaborativeagents/scripts/test_batch_50.sh create mode 100755 collaborativeagents/scripts/test_batch_vs_parallel.sh create mode 100644 collaborativeagents/scripts/test_extractor.py create mode 100644 collaborativeagents/scripts/test_multiturn.py create mode 100755 collaborativeagents/scripts/test_parallel_a100.sh create mode 100755 collaborativeagents/scripts/test_parallel_quick.sh create mode 100644 collaborativeagents/scripts/test_parallel_speed.sbatch create mode 100755 collaborativeagents/scripts/test_parallel_speed_a100.sbatch create mode 100755 collaborativeagents/scripts/test_parallel_vllm.sh create mode 100644 collaborativeagents/scripts/test_rag_empty.sbatch create mode 100644 collaborativeagents/scripts/test_rag_empty_v2.sbatch create mode 100644 collaborativeagents/scripts/test_rag_empty_v3.sbatch create mode 100644 collaborativeagents/scripts/test_rag_fix.sbatch create mode 100644 
collaborativeagents/scripts/test_real_speed.sbatch create mode 100755 collaborativeagents/scripts/test_vllm_adapter.sh create mode 100755 collaborativeagents/scripts/test_vllm_interactive.sh create mode 100644 collaborativeagents/scripts/test_vllm_speed.sbatch create mode 100644 collaborativeagents/scripts/test_vllm_speed_a100.sbatch create mode 100644 collaborativeagents/scripts/visualize.py (limited to 'collaborativeagents/scripts') diff --git a/collaborativeagents/scripts/benchmark_inference.py b/collaborativeagents/scripts/benchmark_inference.py new file mode 100755 index 0000000..6a2ee13 --- /dev/null +++ b/collaborativeagents/scripts/benchmark_inference.py @@ -0,0 +1,429 @@ +#!/usr/bin/env python3 +""" +Benchmark inference speed: Transformers vs vLLM. + +This script helps diagnose the 100x slowdown issue by comparing: +1. Raw transformers inference (current implementation) +2. vLLM server inference (target implementation) + +Usage: + # First, start vLLM server: + # CUDA_VISIBLE_DEVICES=0 vllm serve /path/to/model --port 8003 + + # Then run benchmark: + python benchmark_inference.py --mode both --n 20 + python benchmark_inference.py --mode vllm --url http://localhost:8003/v1 --n 50 + python benchmark_inference.py --mode transformers --model /path/to/model --n 10 +""" + +import argparse +import json +import time +import sys +from pathlib import Path +from typing import List, Dict, Any +from dataclasses import dataclass + +# Add paths +sys.path.insert(0, str(Path(__file__).parent.parent)) +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) + + +@dataclass +class BenchmarkResult: + mode: str + n_requests: int + total_time_s: float + avg_latency_ms: float + min_latency_ms: float + max_latency_ms: float + throughput_req_per_s: float + throughput_conv_per_hr: float # Estimated conversations per hour + errors: int + + +def benchmark_transformers( + model_path: str, + n_requests: int = 10, + device: str = "cuda:0", +) -> BenchmarkResult: + 
"""Benchmark raw transformers inference.""" + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + print(f"Loading model from {model_path}...") + load_start = time.time() + + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + device_map=device, + ) + if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + + load_time = time.time() - load_start + print(f"Model loaded in {load_time:.1f}s") + + # Test prompt (simulating a typical user simulator turn) + test_messages = [ + {"role": "system", "content": "You are a user simulator. Output JSON with reasoning, draft_answer, should_terminate, and response fields."}, + {"role": "user", "content": "The agent said: 'Hello, how can I help you today?' Respond as the user."}, + ] + + prompt = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True) + + latencies = [] + errors = 0 + + print(f"Running {n_requests} inference requests...") + start_time = time.time() + + for i in range(n_requests): + try: + req_start = time.time() + + inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096) + inputs = {k: v.to(model.device) for k, v in inputs.items()} + + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=256, + do_sample=True, + temperature=0.7, + top_p=0.9, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, + ) + + # Decode output + input_len = inputs["input_ids"].shape[1] + gen_ids = outputs[0][input_len:] + response = tokenizer.decode(gen_ids, skip_special_tokens=True) + + latency_ms = (time.time() - req_start) * 1000 + latencies.append(latency_ms) + + if (i + 1) % 5 == 0: + print(f" Completed {i + 1}/{n_requests}, last latency: {latency_ms:.0f}ms") + + except Exception as e: + errors += 1 + print(f" Error on request {i + 1}: {e}") + + total_time = time.time() - start_time + 
+ if not latencies: + return BenchmarkResult( + mode="transformers", + n_requests=n_requests, + total_time_s=total_time, + avg_latency_ms=0, + min_latency_ms=0, + max_latency_ms=0, + throughput_req_per_s=0, + throughput_conv_per_hr=0, + errors=errors, + ) + + avg_latency = sum(latencies) / len(latencies) + # Estimate: ~10 turns per conversation, so conv/hr = (req/s) * 3600 / 10 + throughput = len(latencies) / total_time + conv_per_hr = throughput * 3600 / 10 + + return BenchmarkResult( + mode="transformers", + n_requests=n_requests, + total_time_s=total_time, + avg_latency_ms=avg_latency, + min_latency_ms=min(latencies), + max_latency_ms=max(latencies), + throughput_req_per_s=throughput, + throughput_conv_per_hr=conv_per_hr, + errors=errors, + ) + + +def benchmark_vllm( + base_url: str = "http://localhost:8003/v1", + n_requests: int = 10, + concurrent: bool = False, + n_workers: int = 4, +) -> BenchmarkResult: + """Benchmark vLLM server inference.""" + from utils.vllm_client import VLLMClient + + client = VLLMClient(base_url=base_url) + + # Check health + if not client.health_check(): + print(f"ERROR: vLLM server at {base_url} is not responding") + return BenchmarkResult( + mode="vllm", + n_requests=n_requests, + total_time_s=0, + avg_latency_ms=0, + min_latency_ms=0, + max_latency_ms=0, + throughput_req_per_s=0, + throughput_conv_per_hr=0, + errors=n_requests, + ) + + print(f"vLLM server healthy: {client.get_model_info()}") + + # Test messages + test_messages = [ + {"role": "system", "content": "You are a user simulator. Output JSON with reasoning, draft_answer, should_terminate, and response fields."}, + {"role": "user", "content": "The agent said: 'Hello, how can I help you today?' 
Respond as the user."}, + ] + + latencies = [] + errors = 0 + + print(f"Running {n_requests} inference requests (concurrent={concurrent})...") + start_time = time.time() + + if concurrent: + from concurrent.futures import ThreadPoolExecutor, as_completed + + with ThreadPoolExecutor(max_workers=n_workers) as executor: + futures = [ + executor.submit(client.chat, test_messages, 256, 0.7) + for _ in range(n_requests) + ] + for i, future in enumerate(as_completed(futures)): + try: + result = future.result() + latencies.append(result["latency_ms"]) + if (i + 1) % 10 == 0: + print(f" Completed {i + 1}/{n_requests}") + except Exception as e: + errors += 1 + print(f" Error: {e}") + else: + for i in range(n_requests): + try: + result = client.chat(test_messages, 256, 0.7) + latencies.append(result["latency_ms"]) + + if (i + 1) % 5 == 0: + print(f" Completed {i + 1}/{n_requests}, last latency: {result['latency_ms']:.0f}ms") + + except Exception as e: + errors += 1 + print(f" Error on request {i + 1}: {e}") + + total_time = time.time() - start_time + + if not latencies: + return BenchmarkResult( + mode="vllm" + ("_concurrent" if concurrent else ""), + n_requests=n_requests, + total_time_s=total_time, + avg_latency_ms=0, + min_latency_ms=0, + max_latency_ms=0, + throughput_req_per_s=0, + throughput_conv_per_hr=0, + errors=errors, + ) + + avg_latency = sum(latencies) / len(latencies) + throughput = len(latencies) / total_time + conv_per_hr = throughput * 3600 / 10 + + return BenchmarkResult( + mode="vllm" + ("_concurrent" if concurrent else ""), + n_requests=n_requests, + total_time_s=total_time, + avg_latency_ms=avg_latency, + min_latency_ms=min(latencies), + max_latency_ms=max(latencies), + throughput_req_per_s=throughput, + throughput_conv_per_hr=conv_per_hr, + errors=errors, + ) + + +def benchmark_full_conversation( + vllm_url_70b: str, + vllm_url_8b: str, + n_conversations: int = 5, + max_turns: int = 10, +) -> Dict[str, Any]: + """ + Benchmark a full multi-turn 
conversation with user simulator and agent. + This simulates the actual experiment loop. + """ + from utils.vllm_client import VLLMClient, VLLMUserSimulator, VLLMAgentAdapter + + user_client = VLLMClient(base_url=vllm_url_70b) + agent_client = VLLMClient(base_url=vllm_url_8b) + + if not user_client.health_check(): + print(f"ERROR: 70B server at {vllm_url_70b} not responding") + return {"error": "70B server not available"} + + if not agent_client.health_check(): + print(f"ERROR: 8B server at {vllm_url_8b} not responding") + return {"error": "8B server not available"} + + print(f"Running {n_conversations} full conversations (max {max_turns} turns each)...") + + conversation_times = [] + total_turns = 0 + + start_time = time.time() + + for conv_idx in range(n_conversations): + conv_start = time.time() + + # Create user simulator + user_sim = VLLMUserSimulator( + problem="What is 2 + 2? Explain your reasoning step by step.", + user_persona="A student learning math", + user_preferences="- I prefer step-by-step explanations\n- Always show your work", + vllm_client=user_client, + ) + + # Create agent + agent = VLLMAgentAdapter( + vllm_client=agent_client, + system_prompt="You are a helpful math tutor. Explain concepts clearly." 
+ ) + + # Run conversation + conversation = [{"role": "assistant", "content": "How can I help you today?"}] + + for turn in range(max_turns): + # User turn + user_response = user_sim.generate_user_response(conversation) + if user_response is None: + break + + conversation.append({"role": "user", "content": user_response["response"]}) + + if user_response.get("should_terminate", False): + break + + # Agent turn + agent_response = agent.generate_response(user_response["response"]) + conversation.append({"role": "assistant", "content": agent_response["response"]}) + + total_turns += 1 + + conv_time = time.time() - conv_start + conversation_times.append(conv_time) + print(f" Conversation {conv_idx + 1}/{n_conversations}: {len(conversation)} messages, {conv_time:.1f}s") + + total_time = time.time() - start_time + + return { + "n_conversations": n_conversations, + "total_turns": total_turns, + "total_time_s": total_time, + "avg_conv_time_s": sum(conversation_times) / len(conversation_times) if conversation_times else 0, + "throughput_conv_per_hr": n_conversations / total_time * 3600, + "throughput_turns_per_hr": total_turns / total_time * 3600, + } + + +def print_results(results: List[BenchmarkResult]): + """Print benchmark results in a nice table.""" + print("\n" + "=" * 80) + print("BENCHMARK RESULTS") + print("=" * 80) + + print(f"\n{'Mode':<20} {'Requests':<10} {'Avg Latency':<12} {'Throughput':<15} {'Conv/hr':<12} {'Errors':<8}") + print("-" * 80) + + for r in results: + print(f"{r.mode:<20} {r.n_requests:<10} {r.avg_latency_ms:>8.0f}ms {r.throughput_req_per_s:>10.2f}/s {r.throughput_conv_per_hr:>8.0f} {r.errors:<8}") + + print("-" * 80) + + # Compare speedup + if len(results) >= 2: + transformers_result = next((r for r in results if r.mode == "transformers"), None) + vllm_result = next((r for r in results if "vllm" in r.mode and r.throughput_req_per_s > 0), None) + + if transformers_result and vllm_result and transformers_result.throughput_req_per_s > 0: + speedup 
= vllm_result.throughput_req_per_s / transformers_result.throughput_req_per_s + print(f"\nvLLM speedup over transformers: {speedup:.1f}x") + + # Target comparison + target_conv_per_hr = 2000 + for r in results: + if r.throughput_conv_per_hr > 0: + ratio = r.throughput_conv_per_hr / target_conv_per_hr + status = "✓" if ratio >= 0.5 else "✗" + print(f"{status} {r.mode}: {r.throughput_conv_per_hr:.0f} conv/hr ({ratio:.1%} of paper's 2000 conv/hr)") + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark inference speed") + parser.add_argument("--mode", choices=["transformers", "vllm", "both", "conversation"], default="vllm") + parser.add_argument("--model", type=str, default="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct", + help="Model path for transformers benchmark") + parser.add_argument("--url", type=str, default="http://localhost:8003/v1", + help="vLLM server URL") + parser.add_argument("--url-70b", type=str, default="http://localhost:8004/v1", + help="vLLM server URL for 70B model (user simulator)") + parser.add_argument("--url-8b", type=str, default="http://localhost:8003/v1", + help="vLLM server URL for 8B model (agent)") + parser.add_argument("-n", type=int, default=20, help="Number of requests") + parser.add_argument("--concurrent", action="store_true", help="Run vLLM benchmark with concurrent requests") + parser.add_argument("--device", type=str, default="cuda:0", help="Device for transformers") + + args = parser.parse_args() + + results = [] + + if args.mode == "conversation": + # Full conversation benchmark + conv_results = benchmark_full_conversation( + args.url_70b, + args.url_8b, + n_conversations=args.n, + ) + print("\n" + "=" * 80) + print("FULL CONVERSATION BENCHMARK") + print("=" * 80) + print(json.dumps(conv_results, indent=2)) + + if "throughput_conv_per_hr" in conv_results: + target = 2000 + actual = conv_results["throughput_conv_per_hr"] + print(f"\nTarget: {target} conv/hr 
(paper)") + print(f"Actual: {actual:.0f} conv/hr ({actual/target:.1%} of target)") + + else: + if args.mode in ["transformers", "both"]: + print("\n" + "=" * 40) + print("TRANSFORMERS BENCHMARK") + print("=" * 40) + result = benchmark_transformers(args.model, args.n, args.device) + results.append(result) + + if args.mode in ["vllm", "both"]: + print("\n" + "=" * 40) + print("vLLM BENCHMARK (sequential)") + print("=" * 40) + result = benchmark_vllm(args.url, args.n, concurrent=False) + results.append(result) + + if args.concurrent: + print("\n" + "=" * 40) + print("vLLM BENCHMARK (concurrent)") + print("=" * 40) + result = benchmark_vllm(args.url, args.n, concurrent=True, n_workers=4) + results.append(result) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/collaborativeagents/scripts/configs/local_models.yaml b/collaborativeagents/scripts/configs/local_models.yaml new file mode 120000 index 0000000..b6f8fad --- /dev/null +++ b/collaborativeagents/scripts/configs/local_models.yaml @@ -0,0 +1 @@ +/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/configs/local_models.yaml \ No newline at end of file diff --git a/collaborativeagents/scripts/conflict_scenario_generator.py b/collaborativeagents/scripts/conflict_scenario_generator.py new file mode 100644 index 0000000..9d00de8 --- /dev/null +++ b/collaborativeagents/scripts/conflict_scenario_generator.py @@ -0,0 +1,637 @@ +""" +Conflict Scenario Generator + +Generates queries that deliberately trigger preference conflicts. +The key insight: RAG naturally resolves conflicts by retrieving ONLY +the relevant preference, while context-based methods see ALL preferences +and get confused. + +Design principles: +1. Every test query should trigger 2+ conflicting preferences +2. Only ONE preference is correct given the full context +3. RAG retrieves the correct one (high similarity to query) +4. 
Context methods see both and often pick wrong one or try to satisfy both +""" + +import json +import random +from dataclasses import dataclass, field +from typing import Optional +from pathlib import Path + + +# ============================================================================ +# Conflict Templates +# ============================================================================ + +@dataclass +class ConflictScenario: + """A scenario that triggers a preference conflict.""" + scenario_id: str + conflict_group: str + query: str + context_cues: list # What makes the correct preference clear + triggered_prefs: list # Preference IDs that could apply + correct_pref_id: str # The one that SHOULD apply + wrong_pref_ids: list # The ones that should NOT apply + why_correct: str # Explanation for ground truth + expected_rag_behavior: str # What RAG should do + expected_context_failure: str # How context methods fail + + +# Core conflict scenarios - each designed to fail context methods +CONFLICT_TEMPLATES = { + # ========================================================================= + # FORMAT CONFLICTS + # ========================================================================= + "format_bullets_vs_numbered": [ + { + "query": "What are the steps to deploy a Docker container? 
Also list the common mistakes to avoid.", + "context_cues": ["steps to deploy = procedure", "list mistakes = enumeration"], + "correct_for": "both apply to different parts", + "why_context_fails": "Context sees both prefs, might use one format for everything", + "why_rag_wins": "RAG retrieves procedure-pref for deploy part, list-pref for mistakes part" + }, + { + "query": "Walk me through setting up CI/CD - what tools should I consider?", + "context_cues": ["walk through = sequential", "consider = options"], + "correct_for": "numbered for walkthrough, bullets for tools", + "why_context_fails": "Mixes formats inconsistently", + "why_rag_wins": "Retrieves appropriate format preference per section" + }, + { + "query": "How do I configure nginx? Give me the key parameters.", + "context_cues": ["how do I = procedure", "key parameters = list"], + "correct_for": "numbered steps + bulleted parameters", + "why_context_fails": "Context methods apply one format to all", + "why_rag_wins": "Separate retrieval for procedure vs enumeration context" + } + ], + + "format_answer_first_vs_buildup": [ + { + "query": "What's the time complexity of quicksort and why?", + "context_cues": ["what's = direct question", "why = needs explanation"], + "correct_for": "answer first (O(n log n)), then explain why", + "why_context_fails": "Either gives answer without why, or long buildup first", + "why_rag_wins": "Retrieves 'answer first' for 'what's', builds explanation for 'why'" + }, + { + "query": "Explain how neural networks learn - what's backpropagation?", + "context_cues": ["explain how = learning", "what's = definition needed"], + "correct_for": "build up intuition for 'how', then define backprop", + "why_context_fails": "Starts with backprop definition (answer first) losing context", + "why_rag_wins": "Identifies learning intent first, answer-seeking second" + } + ], + + # ========================================================================= + # VERBOSITY CONFLICTS + # 
========================================================================= + "verbosity_concise_vs_detailed": [ + { + "query": "Quick question - how does the GIL work in Python?", + "context_cues": ["quick question = brevity cue", "GIL = complex topic"], + "correct_for": "concise (user said quick)", + "why_context_fails": "Sees 'complex topic' pref, gives long explanation", + "why_rag_wins": "Explicit brevity cue has higher retrieval score" + }, + { + "query": "Briefly explain the proof of the halting problem.", + "context_cues": ["briefly = brevity", "proof = normally detailed"], + "correct_for": "concise - user explicitly asked for brief", + "why_context_fails": "Proof preference triggers long format", + "why_rag_wins": "'Briefly' in query matches concise preference strongly" + }, + { + "query": "TL;DR on microservices vs monolith for a startup?", + "context_cues": ["TL;DR = max brevity", "comparison = could be detailed"], + "correct_for": "ultra-concise comparison", + "why_context_fails": "Comparison pref might trigger table/detailed analysis", + "why_rag_wins": "TL;DR keyword retrieves brevity preference" + }, + { + "query": "In detail, what's 2+2?", + "context_cues": ["in detail = verbosity cue", "2+2 = trivial"], + "correct_for": "brief (topic too simple for detail)", + "why_context_fails": "Might over-explain simple arithmetic", + "why_rag_wins": "Query simplicity context overrides detail cue" + } + ], + + # ========================================================================= + # CODE STYLE CONFLICTS + # ========================================================================= + "code_naming_convention": [ + { + "query": "Write a function to parse JSON, show it in Python and JavaScript.", + "context_cues": ["Python = snake_case", "JavaScript = camelCase"], + "correct_for": "snake_case for Python version, camelCase for JS version", + "why_context_fails": "Picks one convention for both, or inconsistent", + "why_rag_wins": "Language detection triggers 
correct convention per block" + }, + { + "query": "Convert this Python script to TypeScript: def get_user_data(): ...", + "context_cues": ["Python source = snake_case", "TypeScript target = camelCase"], + "correct_for": "convert snake_case to camelCase in TypeScript output", + "why_context_fails": "Might keep snake_case in TypeScript", + "why_rag_wins": "Output language triggers appropriate convention" + }, + { + "query": "Write SQL to join users and orders, then show Python code to run it.", + "context_cues": ["SQL = UPPERCASE keywords", "Python = snake_case"], + "correct_for": "SQL: SELECT, FROM; Python: result_set, fetch_data", + "why_context_fails": "Style bleeds across languages", + "why_rag_wins": "Separate retrieval for each language context" + } + ], + + "code_comment_style": [ + { + "query": "Here's a 5-line utility function, explain what each part does.", + "context_cues": ["5-line = short", "explain each part = inline comments"], + "correct_for": "inline comments for each line", + "why_context_fails": "Might use docstring style for short code", + "why_rag_wins": "Short code + explanation request = inline comments" + }, + { + "query": "Write a complete data processing class with documentation.", + "context_cues": ["complete class = production code", "documentation = docstrings"], + "correct_for": "docstrings at class/method level, minimal inline", + "why_context_fails": "Over-comments with inline explanations", + "why_rag_wins": "Class + documentation context triggers docstring pref" + } + ], + + "code_review_scope": [ + { + "query": "Review this code for bugs, I need to ship it today.", + "context_cues": ["review = code review", "ship today = urgent, bugs only"], + "correct_for": "bugs only, skip style", + "why_context_fails": "Still comments on style issues", + "why_rag_wins": "Urgency cue + 'bugs' retrieves bugs-only preference" + }, + { + "query": "Look at my code and help me improve it for the codebase.", + "context_cues": ["improve = refactor 
scope", "for codebase = style matters"], + "correct_for": "both logic and style suggestions", + "why_context_fails": "Might only focus on bugs", + "why_rag_wins": "'Improve' and 'codebase' retrieve full-review pref" + } + ], + + # ========================================================================= + # INTERACTION CONFLICTS + # ========================================================================= + "interaction_autonomy": [ + { + "query": "Refactor the authentication module.", + "context_cues": ["refactor = significant change", "no specific instruction"], + "correct_for": "confirm approach first", + "why_context_fails": "Might just start refactoring without plan", + "why_rag_wins": "Ambiguous scope triggers confirmation pref" + }, + { + "query": "Change the variable name from 'x' to 'count' in line 5.", + "context_cues": ["specific instruction", "single change"], + "correct_for": "execute directly, no confirmation needed", + "why_context_fails": "Might still ask for confirmation", + "why_rag_wins": "Specific instruction retrieves execute-directly pref" + }, + { + "query": "Update the database schema to add user preferences - it's complex.", + "context_cues": ["update schema = significant", "complex = acknowledged"], + "correct_for": "definitely confirm - user said it's complex", + "why_context_fails": "Might dive in because 'update' sounds actionable", + "why_rag_wins": "'Complex' keyword strongly triggers confirmation" + } + ], + + "interaction_guidance": [ + { + "query": "Should I use Redis or Memcached for caching?", + "context_cues": ["should I = asking for recommendation", "or = comparison"], + "correct_for": "give recommendation with rationale", + "why_context_fails": "Gives neutral pros/cons without recommendation", + "why_rag_wins": "'Should I' retrieves recommendation preference" + }, + { + "query": "Compare React, Vue, and Angular for my project.", + "context_cues": ["compare = explicit comparison", "my project = context needed"], + 
"correct_for": "table format with tradeoffs", + "why_context_fails": "Might just recommend one or give long prose", + "why_rag_wins": "'Compare' retrieves comparison-table preference" + } + ], + + # ========================================================================= + # MATH/EXPLANATION CONFLICTS + # ========================================================================= + "math_detail_level": [ + { + "query": "What's the derivative of x^2? I'm preparing for an exam.", + "context_cues": ["what's = direct ask", "exam prep = practice context"], + "correct_for": "show steps + give practice problem", + "why_context_fails": "Just gives answer (2x) without exam context", + "why_rag_wins": "'Exam' retrieves practice-problem preference" + }, + { + "query": "Verify my answer: integral of sin(x) = -cos(x) + C. Is this right?", + "context_cues": ["verify = checking work", "is this right = confirmation"], + "correct_for": "check step by step, confirm or point out issue", + "why_context_fails": "Might re-derive from scratch", + "why_rag_wins": "'Verify' retrieves check-their-work preference" + } + ], + + "math_approach": [ + { + "query": "What's the probability of rolling two sixes?", + "context_cues": ["probability = statistics", "rolling dice = intuitive example"], + "correct_for": "intuition first (1 in 36), then formula", + "why_context_fails": "Starts with P(A∩B) = P(A)P(B) formula", + "why_rag_wins": "Statistics topic retrieves intuition-first preference" + }, + { + "query": "Prove that the sum of angles in a triangle is 180°.", + "context_cues": ["prove = formal proof", "geometry = visual possible"], + "correct_for": "structured proof format per preference", + "why_context_fails": "Might give intuitive explanation instead of proof", + "why_rag_wins": "'Prove' retrieves proof-format preference" + } + ], + + # ========================================================================= + # DOMAIN CONFLICTS + # 
========================================================================= + "domain_example_position": [ + { + "query": "How do I use the requests library in Python?", + "context_cues": ["how do I use = practical/API", "library = code example helpful"], + "correct_for": "minimal example first, then explain parameters", + "why_context_fails": "Explains parameters first, example last", + "why_rag_wins": "API/library context retrieves example-first preference" + }, + { + "query": "What is dynamic programming?", + "context_cues": ["what is = concept/theory", "definition needed"], + "correct_for": "definition first, then example, then edge cases", + "why_context_fails": "Might lead with example (Fibonacci)", + "why_rag_wins": "Theory context retrieves definition-first preference" + } + ], + + # ========================================================================= + # OUTPUT ARTIFACT CONFLICTS + # ========================================================================= + "output_code_presentation": [ + { + "query": "Give me a sorting function I can use, I'm in a hurry.", + "context_cues": ["give me = copyable", "in a hurry = no explanation"], + "correct_for": "single code block, no prose", + "why_context_fails": "Adds explanatory prose between code", + "why_rag_wins": "'Give me' + 'hurry' retrieves copy-paste preference" + }, + { + "query": "Teach me how to implement quicksort step by step.", + "context_cues": ["teach me = learning", "step by step = chunked"], + "correct_for": "code in small chunks with explanation between", + "why_context_fails": "Gives full implementation at once", + "why_rag_wins": "'Teach' + 'step by step' retrieves chunked preference" + } + ], + + # ========================================================================= + # CORRECTION STYLE CONFLICTS + # ========================================================================= + "correction_severity": [ + { + "query": "I'm using a hashmap to store my data, is this right?", + "context_cues": 
["hashmap = might mean dict/map", "is this right = validation"], + "correct_for": "gentle inline (hashmap is fine, also called dict)", + "why_context_fails": "Might pedantically correct terminology", + "why_rag_wins": "Minor terminology + validation retrieves gentle-correction pref" + }, + { + "query": "I think recursion is just loops with extra steps, right?", + "context_cues": ["fundamental misconception", "asking for validation"], + "correct_for": "directly address misconception before proceeding", + "why_context_fails": "Might gloss over and just show recursion", + "why_rag_wins": "Fundamental error retrieves explicit-correction preference" + } + ], + + # ========================================================================= + # MULTI-DOMAIN CONFLICTS (hardest!) + # ========================================================================= + "multi_domain_complex": [ + { + "query": "Quick question - walk me through implementing a binary tree in Python with proper documentation.", + "context_cues": ["quick = brief", "walk through = detailed", "documentation = thorough"], + "correct_for": "quick wins (explicit), but include docstrings (documentation ask)", + "why_context_fails": "Confused by conflicting signals, inconsistent response", + "why_rag_wins": "Explicit brevity cue retrieved, documentation pref adds docstrings" + }, + { + "query": "I'm debugging my ML model and it's not converging. This is frustrating! Compare Adam vs SGD for me.", + "context_cues": ["debugging = focus on issue", "frustrating = emotional", "compare = table"], + "correct_for": "acknowledge frustration, then comparison table for optimizers", + "why_context_fails": "Might skip emotional acknowledgment or wrong format", + "why_rag_wins": "Frustration pref + comparison pref both retrieved, applied in order" + }, + { + "query": "Review this Python code and convert it to JavaScript. 
Focus on bugs first.", + "context_cues": ["review = bugs per 'focus' cue", "convert = language change"], + "correct_for": "Python review (bugs only) + JS conversion (camelCase)", + "why_context_fails": "Applies wrong scope or wrong naming convention", + "why_rag_wins": "Multiple relevant prefs retrieved per task segment" + } + ] +} + + +# ============================================================================ +# Scenario Generator +# ============================================================================ + +class ConflictScenarioGenerator: + """Generates conflict scenarios from templates and user profiles.""" + + def __init__(self, profile: dict = None, seed: int = 42): + self.profile = profile + self.preferences = {p['pref_id']: p for p in profile['preferences']} if profile else {} + self.random = random.Random(seed) + + def generate_for_profile(self, preferences: list, domain: str = None) -> dict: + """Generate a single conflict scenario for given preferences and domain.""" + # Find conflict groups in these preferences + conflict_groups = {} + for pref in preferences: + cg = pref.get('conflict_group') + if cg: + if cg not in conflict_groups: + conflict_groups[cg] = [] + conflict_groups[cg].append(pref) + + # Find a conflict group with at least 2 preferences + for cg, prefs in conflict_groups.items(): + if len(prefs) >= 2 and cg in CONFLICT_TEMPLATES: + templates = CONFLICT_TEMPLATES[cg] + template = self.random.choice(templates) + return { + "query": template['query'], + "conflict_group": cg, + "preferences": prefs, + "expected_preference": prefs[0]['pref_id'], # First one as expected + } + return None + + def generate_scenarios(self, num_per_conflict_type: int = 3) -> list: + """Generate conflict scenarios based on profile's preferences.""" + scenarios = [] + + for conflict_group, templates in CONFLICT_TEMPLATES.items(): + # Check if this conflict group exists in user's preferences + relevant_prefs = [ + p for p in self.profile['preferences'] + if 
p.get('conflict_group') == conflict_group + ] + + if len(relevant_prefs) < 2: + continue # Need at least 2 prefs to have a conflict + + # Generate scenarios from templates + selected_templates = self.random.sample( + templates, + min(num_per_conflict_type, len(templates)) + ) + + for i, template in enumerate(selected_templates): + scenario = self._create_scenario( + conflict_group, template, relevant_prefs, i + ) + if scenario: + scenarios.append(scenario) + + return scenarios + + def _create_scenario( + self, + conflict_group: str, + template: dict, + relevant_prefs: list, + index: int + ) -> ConflictScenario: + """Create a scenario from a template.""" + # Determine which preference is correct + # Based on context cues in the query + query = template['query'] + correct_pref = self._determine_correct_preference(query, relevant_prefs) + wrong_prefs = [p for p in relevant_prefs if p['pref_id'] != correct_pref['pref_id']] + + return ConflictScenario( + scenario_id=f"{conflict_group}_{index:03d}", + conflict_group=conflict_group, + query=query, + context_cues=template.get('context_cues', []), + triggered_prefs=[p['pref_id'] for p in relevant_prefs], + correct_pref_id=correct_pref['pref_id'], + wrong_pref_ids=[p['pref_id'] for p in wrong_prefs], + why_correct=template.get('correct_for', ''), + expected_rag_behavior=template.get('why_rag_wins', ''), + expected_context_failure=template.get('why_context_fails', '') + ) + + def _determine_correct_preference(self, query: str, prefs: list) -> dict: + """ + Determine which preference is correct for a query. + Uses keyword matching on priority_context. 
+ """ + query_lower = query.lower() + scores = [] + + for pref in prefs: + score = 0 + for keyword in pref.get('priority_context', []): + if keyword.lower() in query_lower: + score += 1 + # Bonus for condition match + if pref.get('condition', '').lower() in query_lower: + score += 2 + scores.append((pref, score)) + + # Return highest scoring preference + scores.sort(key=lambda x: x[1], reverse=True) + return scores[0][0] if scores else prefs[0] + + +def generate_conflict_enriched_dataset( + profiles_path: str, + output_path: str, + scenarios_per_conflict: int = 3, + seed: int = 42 +): + """ + Generate a dataset where every query triggers at least one conflict. + """ + profiles = [] + with open(profiles_path) as f: + for line in f: + profiles.append(json.loads(line)) + + all_scenarios = [] + conflict_coverage = {} + + for profile in profiles: + generator = ConflictScenarioGenerator(profile, seed) + scenarios = generator.generate_scenarios(scenarios_per_conflict) + + for scenario in scenarios: + scenario_dict = { + 'user_id': profile['user_id'], + 'scenario_id': scenario.scenario_id, + 'conflict_group': scenario.conflict_group, + 'query': scenario.query, + 'context_cues': scenario.context_cues, + 'triggered_prefs': scenario.triggered_prefs, + 'correct_pref_id': scenario.correct_pref_id, + 'wrong_pref_ids': scenario.wrong_pref_ids, + 'why_correct': scenario.why_correct, + 'expected_rag_behavior': scenario.expected_rag_behavior, + 'expected_context_failure': scenario.expected_context_failure + } + all_scenarios.append(scenario_dict) + + # Track coverage + cg = scenario.conflict_group + conflict_coverage[cg] = conflict_coverage.get(cg, 0) + 1 + + # Save + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w') as f: + for scenario in all_scenarios: + f.write(json.dumps(scenario) + '\n') + + print(f"Generated {len(all_scenarios)} conflict scenarios") + print(f"Coverage by conflict type:") + for cg, count in 
sorted(conflict_coverage.items()): + print(f" {cg}: {count}") + + return all_scenarios + + +def create_evaluation_harness(scenarios: list) -> dict: + """ + Create an evaluation harness that programmatically checks + if the correct preference was applied. + """ + harness = { + "total_scenarios": len(scenarios), + "by_conflict_type": {}, + "evaluation_functions": {} + } + + # Group by conflict type + for scenario in scenarios: + cg = scenario['conflict_group'] + if cg not in harness['by_conflict_type']: + harness['by_conflict_type'][cg] = [] + harness['by_conflict_type'][cg].append(scenario) + + # Add evaluation functions for each conflict type + harness['evaluation_functions'] = { + "format_structure": check_format_structure, + "verbosity": check_verbosity, + "naming_convention": check_naming_convention, + "answer_position": check_answer_position, + # ... more evaluators + } + + return harness + + +# ============================================================================ +# Evaluation Functions (check if correct preference was applied) +# ============================================================================ + +def check_format_structure(response: str, correct_pref: dict) -> bool: + """Check if response uses correct format (bullets vs numbered).""" + has_bullets = bool(any(c in response for c in ['•', '-', '*'])) + has_numbers = bool(any(f"{i}." 
in response or f"{i})" in response for i in range(1, 10))) + + if 'bullet' in correct_pref.get('action', '').lower(): + return has_bullets and not has_numbers + elif 'numbered' in correct_pref.get('action', '').lower(): + return has_numbers + return True # Can't determine + + +def check_verbosity(response: str, correct_pref: dict) -> bool: + """Check if response matches verbosity preference.""" + word_count = len(response.split()) + + if 'concise' in correct_pref.get('action', '').lower() or \ + '3 sentences' in correct_pref.get('action', '').lower(): + return word_count < 100 # Rough threshold + elif 'detailed' in correct_pref.get('action', '').lower(): + return word_count > 150 + return True + + +def check_naming_convention(response: str, correct_pref: dict) -> bool: + """Check if code uses correct naming convention.""" + import re + + # Look for function/variable definitions + if 'snake_case' in correct_pref.get('action', '').lower(): + # Should have underscores, no camelCase + has_snake = bool(re.search(r'[a-z]+_[a-z]+', response)) + has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response)) + return has_snake and not has_camel + + elif 'camelCase' in correct_pref.get('action', '').lower(): + has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response)) + return has_camel + + return True + + +def check_answer_position(response: str, correct_pref: dict) -> bool: + """Check if answer comes first or explanation builds up.""" + # Simplified: check if response starts with answer-like content + first_sentence = response.split('.')[0] if '.' 
in response else response[:100] + + if 'answer first' in correct_pref.get('action', '').lower(): + # First sentence should be direct + direct_indicators = ['is', 'are', 'the answer', 'yes', 'no', 'it\'s'] + return any(ind in first_sentence.lower() for ind in direct_indicators) + + elif 'build up' in correct_pref.get('action', '').lower(): + # First sentence should be explanatory + buildup_indicators = ['let\'s', 'first', 'to understand', 'consider'] + return any(ind in first_sentence.lower() for ind in buildup_indicators) + + return True + + +# ============================================================================ +# Main +# ============================================================================ + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--profiles", default="collaborativeagents/data/complex_profiles/profiles.jsonl") + parser.add_argument("--output", default="collaborativeagents/data/conflict_scenarios.jsonl") + parser.add_argument("--scenarios_per_conflict", type=int, default=3) + parser.add_argument("--seed", type=int, default=42) + + args = parser.parse_args() + + scenarios = generate_conflict_enriched_dataset( + args.profiles, + args.output, + args.scenarios_per_conflict, + args.seed + ) diff --git a/collaborativeagents/scripts/contextual_test_small.sbatch b/collaborativeagents/scripts/contextual_test_small.sbatch new file mode 100644 index 0000000..83c20ef --- /dev/null +++ b/collaborativeagents/scripts/contextual_test_small.sbatch @@ -0,0 +1,80 @@ +#!/bin/bash +#SBATCH --job-name=ctx_test +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8-interactive +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:4 +#SBATCH --mem=100G +#SBATCH --time=00:20:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctx_test-%j.out +#SBATCH 
--error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctx_test-%j.err + +# Small-scale contextual test: 1 profile, 15 sessions +# Testing fix: token estimation ratio changed from 4:1 to 2.5:1 + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +echo "=== Contextual Test (Token Fix) ===" +echo "Fix: token estimation 4:1 -> 2.5:1" +echo "1 profile, 15 sessions" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Start vLLM servers +# User simulator: GPUs 0,1 +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +# Agent: GPUs 2,3 +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 & + +echo "Waiting for vLLM servers..." 
+for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator ready after $((i*5))s" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent ready after $((i*5))s" + break + fi + sleep 5 +done +sleep 5 + +OUTPUT_DIR="../results/contextual_test_$(date +%Y%m%d_%H%M%S)" + +echo "" +echo "============================================" +echo "Testing: contextual (with token fix)" +echo "============================================" +date + +python scripts/run_experiments.py --methods contextual \ + --datasets math-hard --n-profiles 1 --n-sessions 15 --max-turns 15 \ + --use-vllm --no-batch-processing --parallel-profiles 1 \ + --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH + +echo "" +echo "=== Done ===" +date + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/controlled_test.sbatch b/collaborativeagents/scripts/controlled_test.sbatch new file mode 100644 index 0000000..607b93b --- /dev/null +++ b/collaborativeagents/scripts/controlled_test.sbatch @@ -0,0 +1,173 @@ +#!/bin/bash +#SBATCH --job-name=ctrl_test +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8-interactive +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH --time=00:45:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctrl_test-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/ctrl_test-%j.err + +# Controlled Test: Same user profile, same questions, 3 methods +# Tests: +# 1. Stronger user enforcement prompts +# 2. Memory retrieval debug output +# 3. 
Comparison across vanilla/rag/rag_vector + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +# Use first profile only for controlled comparison +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" +MEMORY_STORE="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store" + +echo "=== Controlled Comparison Test ===" +echo "Same user profile (1st), same 15 questions, 3 methods" +echo "Testing: stronger enforcement + retrieval debug" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Start vLLM servers +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \ + --max-model-len 16384 --dtype bfloat16 & + +echo "Waiting for vLLM servers..." 
+for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator ready after $((i*5))s" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent ready after $((i*5))s" + break + fi + sleep 5 +done +sleep 5 + +OUTPUT_DIR="../results/controlled_test_$(date +%Y%m%d_%H%M%S)" + +# Run each method with SAME user (1 profile, 15 sessions) +for METHOD in vanilla rag rag_vector; do + echo "" + echo "============================================" + echo "Testing: $METHOD" + echo "============================================" + + # Clear memory store before each method (fresh start) + > ${MEMORY_STORE}/memory_cards.jsonl + rm -f ${MEMORY_STORE}/memory_embeddings.npy + echo "Memory store cleared" + + date + python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 1 --n-sessions 15 --max-turns 15 \ + --use-vllm --no-batch-processing --parallel-profiles 1 \ + --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH + + echo "Method $METHOD completed" + + # Show memory count for rag methods + if [ "$METHOD" != "vanilla" ]; then + echo "Final memory cards: $(wc -l < ${MEMORY_STORE}/memory_cards.jsonl)" + fi +done + +echo "" +echo "=== Done ===" +date + +# Generate comparison summary +python3 << 'EOF' +import json +import os +from pathlib import Path + +output_base = sorted(Path("../results").glob("controlled_test_*"))[-1] +print(f"\n=== Comparison Summary ===\n") +print(f"Results dir: {output_base}") + +methods = ["vanilla", "rag", "rag_vector"] +results = {} + +for subdir in output_base.iterdir(): + if subdir.is_dir(): + for method in methods: + result_file = subdir / method / "results.json" + if result_file.exists(): + with open(result_file) as f: + results[method] = json.load(f) + break + +if results: + print(f"\n{'Metric':<25} {'vanilla':<12} {'rag':<12} {'rag_vector':<12}") + print("-" * 60) + + for method in methods: + if method 
not in results: + continue + data = results[method] + task_succ = sum(r['metrics']['task_success'] for r in data) / len(data) + avg_turns = sum(r['metrics']['total_turns'] for r in data) / len(data) + avg_enf = sum(r['metrics']['enforcement_count'] for r in data) / len(data) + + if method == methods[0]: + print(f"{'Task Success':<25} {task_succ:<12.1%} ", end="") + else: + print(f"{task_succ:<12.1%} ", end="") + print() + + for method in methods: + if method not in results: + continue + data = results[method] + avg_turns = sum(r['metrics']['total_turns'] for r in data) / len(data) + if method == methods[0]: + print(f"{'Avg Turns':<25} {avg_turns:<12.1f} ", end="") + else: + print(f"{avg_turns:<12.1f} ", end="") + print() + + for method in methods: + if method not in results: + continue + data = results[method] + avg_enf = sum(r['metrics']['enforcement_count'] for r in data) / len(data) + if method == methods[0]: + print(f"{'Avg Enforcement':<25} {avg_enf:<12.1f} ", end="") + else: + print(f"{avg_enf:<12.1f} ", end="") + print() + + # Session-by-session comparison + print(f"\n=== Session-by-Session Turns ===") + print(f"{'Session':<10} {'vanilla':<12} {'rag':<12} {'rag_vector':<12}") + print("-" * 50) + for i in range(min(15, len(results.get('vanilla', [])))): + print(f"{i+1:<10} ", end="") + for method in methods: + if method in results and i < len(results[method]): + turns = results[method][i]['metrics']['total_turns'] + print(f"{turns:<12} ", end="") + print() +EOF + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/exp_all_memory.sbatch b/collaborativeagents/scripts/exp_all_memory.sbatch new file mode 100644 index 0000000..c6310ee --- /dev/null +++ b/collaborativeagents/scripts/exp_all_memory.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=exp_all_memory +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH 
--mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_all_memory-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_all_memory-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== all_memory (PersonalizedLLMAdapter with local transformers) ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# GPU 0,1: vLLM server for user simulator (port 8004) +# PersonalizedLLMAdapter uses local transformers for embedding/reranker/chat/extractor +echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for vLLM server..." 
+for i in $(seq 1 200); do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "Server ready after $((i*3))s"; break + fi + sleep 3 +done + +# GPU 2,3: PersonalizedLLMAdapter's transformers models +# (embedding ~8B, reranker ~8B, chat ~1.5B, extractor ~0.6B) +CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py \ + --methods all_memory \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --parallel-profiles 10 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/exp_contextual.sbatch b/collaborativeagents/scripts/exp_contextual.sbatch new file mode 100644 index 0000000..2c06bb8 --- /dev/null +++ b/collaborativeagents/scripts/exp_contextual.sbatch @@ -0,0 +1,66 @@ +#!/bin/bash +#SBATCH --job-name=exp_contextual +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_contextual-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_contextual-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== contextual (vLLM-based) ===" +date +nvidia-smi 
--query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# GPU 0,1: vLLM server for user simulator (port 8004) +echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +# GPU 2,3: vLLM server for agent (port 8003) +echo "Starting agent vLLM server on GPU 2,3 (port 8003)..." +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for vLLM servers..." +for i in $(seq 1 200); do + user_ready=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + agent_ready=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$user_ready" = "1" ] && [ "$agent_ready" = "1" ]; then + echo "Both servers ready after $((i*3))s"; break + fi + sleep 3 +done + +# Run experiment (uses vLLM HTTP API, no local GPU needed) +python scripts/run_experiments.py \ + --methods contextual \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --parallel-profiles 50 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/exp_rag.sbatch b/collaborativeagents/scripts/exp_rag.sbatch new file mode 100644 index 0000000..7dcad65 --- /dev/null +++ b/collaborativeagents/scripts/exp_rag.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=exp_rag +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH 
--cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== rag (PersonalizedLLMAdapter with local transformers) ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# GPU 0,1: vLLM server for user simulator (port 8004) +# PersonalizedLLMAdapter uses local transformers for embedding/reranker/chat/extractor +echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for vLLM server..." 
+for i in $(seq 1 200); do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "Server ready after $((i*3))s"; break + fi + sleep 3 +done + +# GPU 2,3: PersonalizedLLMAdapter's transformers models +# (embedding ~8B, reranker ~8B, chat ~1.5B, extractor ~0.6B) +CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py \ + --methods rag \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --parallel-profiles 10 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/exp_rag_vector.sbatch b/collaborativeagents/scripts/exp_rag_vector.sbatch new file mode 100644 index 0000000..f63bd26 --- /dev/null +++ b/collaborativeagents/scripts/exp_rag_vector.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=exp_rag_vector +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag_vector-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_rag_vector-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== rag_vector (PersonalizedLLMAdapter with local transformers) ===" 
+date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# GPU 0,1: vLLM server for user simulator (port 8004) +# PersonalizedLLMAdapter uses local transformers for embedding/reranker/chat/extractor +echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for vLLM server..." +for i in $(seq 1 200); do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "Server ready after $((i*3))s"; break + fi + sleep 3 +done + +# GPU 2,3: PersonalizedLLMAdapter's transformers models +# (embedding ~8B, reranker ~8B, chat ~1.5B, extractor ~0.6B) +CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py \ + --methods rag_vector \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --parallel-profiles 10 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/exp_reflection.sbatch b/collaborativeagents/scripts/exp_reflection.sbatch new file mode 100644 index 0000000..2c94495 --- /dev/null +++ b/collaborativeagents/scripts/exp_reflection.sbatch @@ -0,0 +1,66 @@ +#!/bin/bash +#SBATCH --job-name=exp_reflection +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection-%j.err + +set -e +cd 
/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== reflection (vLLM-based) ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# GPU 0,1: vLLM server for user simulator (port 8004) +echo "Starting user simulator vLLM server on GPU 0,1 (port 8004)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +# GPU 2,3: vLLM server for agent (port 8003) +echo "Starting agent vLLM server on GPU 2,3 (port 8003)..." +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for vLLM servers..." 
+for i in $(seq 1 200); do + user_ready=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + agent_ready=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$user_ready" = "1" ] && [ "$agent_ready" = "1" ]; then + echo "Both servers ready after $((i*3))s"; break + fi + sleep 3 +done + +# Run experiment (uses vLLM HTTP API, no local GPU needed) +python scripts/run_experiments.py \ + --methods reflection \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --parallel-profiles 50 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/exp_reflection_grpo.sbatch b/collaborativeagents/scripts/exp_reflection_grpo.sbatch new file mode 100644 index 0000000..10b5a4f --- /dev/null +++ b/collaborativeagents/scripts/exp_reflection_grpo.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=exp_reflection_grpo +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection_grpo-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_reflection_grpo-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" 
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== reflection_grpo (SEQUENTIAL) ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for servers..." +for i in $(seq 1 200); do + if curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Servers ready after $((i*3))s"; break + fi + sleep 3 +done + +python scripts/run_experiments.py \ + --methods reflection_grpo \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --parallel-profiles 10 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/exp_template.sbatch b/collaborativeagents/scripts/exp_template.sbatch new file mode 100644 index 0000000..8f6ba04 --- /dev/null +++ b/collaborativeagents/scripts/exp_template.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=exp_METHOD +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_METHOD-%j.out +#SBATCH 
--error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_METHOD-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== METHOD (SEQUENTIAL) ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for servers..." 
+for i in $(seq 1 200); do + if curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Servers ready after $((i*3))s"; break + fi + sleep 3 +done + +python scripts/run_experiments.py \ + --methods METHOD \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --parallel-profiles 10 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/exp_vanilla.sbatch b/collaborativeagents/scripts/exp_vanilla.sbatch new file mode 100644 index 0000000..445f771 --- /dev/null +++ b/collaborativeagents/scripts/exp_vanilla.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=exp_vanilla +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=12:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_vanilla-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/exp_vanilla-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== VANILLA (BATCH) ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + 
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +echo "Waiting for servers..." +for i in $(seq 1 200); do + if curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Servers ready after $((i*3))s"; break + fi + sleep 3 +done + +python scripts/run_experiments.py \ + --methods vanilla \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --batch-size 50 --parallel-profiles 50 \ + --output-dir ../results/full_h200 \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Done: $(date)" diff --git a/collaborativeagents/scripts/extend_profiles.py b/collaborativeagents/scripts/extend_profiles.py new file mode 100644 index 0000000..d780697 --- /dev/null +++ b/collaborativeagents/scripts/extend_profiles.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +""" +Generate additional profiles by remixing preferences from existing profiles. +This creates diverse profile combinations without requiring LLM calls. 
#!/usr/bin/env python3
"""
Generate additional profiles by remixing preferences from existing profiles.

This creates diverse profile combinations without requiring LLM calls.
"""

import json
import random
import hashlib
from pathlib import Path
from typing import List, Dict
import argparse


def load_profiles(path: Path) -> List[Dict]:
    """Load profiles from a JSONL file.

    Blank lines (e.g. a trailing newline or blank separators in
    hand-edited files) are skipped instead of being passed to
    ``json.loads``, which raises on an empty string.
    """
    profiles = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                profiles.append(json.loads(line))
    return profiles


def extract_all_preferences(profiles: List[Dict]) -> Dict[str, List[Dict]]:
    """Extract all unique preferences grouped by category (pref_id prefix).

    Deduplicates by ``pref_id`` across profiles, so each preference
    appears at most once in the returned pool.
    """
    categories: Dict[str, List[Dict]] = {}
    seen_ids = set()

    for profile in profiles:
        for pref in profile.get("preferences", []):
            pref_id = pref.get("pref_id", "unknown")
            if pref_id in seen_ids:
                continue
            seen_ids.add(pref_id)

            # Category is the pref_id prefix (e.g. "rf_001" -> "rf").
            prefix = pref_id.split("_")[0] if "_" in pref_id else "other"
            categories.setdefault(prefix, []).append(pref)

    return categories


def extract_personas(profiles: List[Dict]) -> List[str]:
    """Extract unique, non-empty personas in first-seen order."""
    personas = []
    seen = set()
    for profile in profiles:
        persona = profile.get("persona", "")
        if persona and persona not in seen:
            personas.append(persona)
            seen.add(persona)
    return personas


def generate_new_profile(
    user_id: str,
    preference_pool: Dict[str, List[Dict]],
    personas: List[str],
    target_prefs: int = 43,
    rng: random.Random = None
) -> Dict:
    """Generate a new profile by sampling from the preference pool.

    Samples a few preferences from every category (to keep the mix
    diverse), then tops up / trims to exactly ``target_prefs``
    preferences — or as many as the pool holds, if smaller.
    """
    if rng is None:
        rng = random.Random()

    selected_prefs: List[Dict] = []

    # Sample from each category to maintain diversity. The inner max()
    # guards against an empty pool (ZeroDivisionError in the original).
    prefs_per_cat = max(1, target_prefs // max(1, len(preference_pool)))

    for prefs in preference_pool.values():
        # Sample with some randomness around the per-category quota.
        n_sample = min(len(prefs), prefs_per_cat + rng.randint(-1, 2))
        n_sample = max(1, n_sample)
        selected_prefs.extend(rng.sample(prefs, min(n_sample, len(prefs))))

    # Top up to the target from the not-yet-chosen remainder. Membership
    # is tracked by pref_id (unique within the pool by construction of
    # extract_all_preferences) instead of rescanning the selected list
    # with dict equality on every iteration (O(n^2) in the original).
    chosen_ids = {p.get("pref_id", "unknown") for p in selected_prefs}
    if len(selected_prefs) < target_prefs:
        remaining = [
            p for prefs in preference_pool.values() for p in prefs
            if p.get("pref_id", "unknown") not in chosen_ids
        ]
        rng.shuffle(remaining)
        while len(selected_prefs) < target_prefs and remaining:
            selected_prefs.append(remaining.pop())

    # Trim overshoot (per-category sampling can exceed the target).
    while len(selected_prefs) > target_prefs:
        selected_prefs.pop(rng.randint(0, len(selected_prefs) - 1))

    # Group the selected preferences by their declared conflict group.
    conflict_groups: Dict[str, List[str]] = {}
    for pref in selected_prefs:
        cg = pref.get("conflict_group")
        if cg:
            conflict_groups.setdefault(cg, []).append(pref["pref_id"])

    return {
        "user_id": user_id,
        "persona": rng.choice(personas),
        "preferences": selected_prefs,
        "conflict_groups": conflict_groups,
        "meta": {
            "total_preferences": len(selected_prefs),
            "total_conflict_groups": len(conflict_groups),
            "generator": "extend_profiles.py"
        }
    }


def main():
    """CLI entry point: remix existing profiles into new ones."""
    parser = argparse.ArgumentParser(
        description="Generate additional profiles by remixing existing ones"
    )
    parser.add_argument("--input", type=str, required=True,
                        help="Path to existing profiles JSONL")
    parser.add_argument("--output", type=str, required=True,
                        help="Path for output profiles JSONL")
    parser.add_argument("--num-new", type=int, default=100,
                        help="Number of new profiles to generate")
    parser.add_argument("--seed", type=int, default=142,
                        help="Random seed (use different from original)")
    parser.add_argument("--target-prefs", type=int, default=43,
                        help="Target number of preferences per profile")
    parser.add_argument("--merge", action="store_true",
                        help="Merge with existing profiles in output")

    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    print(f"Loading profiles from: {input_path}")
    profiles = load_profiles(input_path)
    print(f"  Loaded {len(profiles)} profiles")

    # Extract preference pool and personas
    pref_pool = extract_all_preferences(profiles)
    personas = extract_personas(profiles)

    print(f"\nPreference pool:")
    for cat, prefs in pref_pool.items():
        print(f"  {cat}: {len(prefs)} preferences")
    print(f"  Total unique preferences: {sum(len(p) for p in pref_pool.values())}")
    print(f"  Unique personas: {len(personas)}")

    # Generate new profiles
    rng = random.Random(args.seed)
    new_profiles = []

    print(f"\nGenerating {args.num_new} new profiles...")
    for i in range(args.num_new):
        # Deterministic, seed-dependent user ids (md5 here is naming,
        # not security).
        user_id = f"user_{hashlib.md5(f'{args.seed}_{i}'.encode()).hexdigest()[:8]}"
        new_profiles.append(generate_new_profile(
            user_id=user_id,
            preference_pool=pref_pool,
            personas=personas,
            target_prefs=args.target_prefs,
            rng=rng
        ))

        if (i + 1) % 20 == 0:
            print(f"  Generated {i + 1}/{args.num_new}")

    # Optionally merge with original
    if args.merge:
        output_profiles = profiles + new_profiles
        print(f"\nMerging: {len(profiles)} original + {len(new_profiles)} new = {len(output_profiles)}")
    else:
        output_profiles = new_profiles

    # Save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for profile in output_profiles:
            f.write(json.dumps(profile) + '\n')

    print(f"\nSaved {len(output_profiles)} profiles to: {output_path}")

    # Summary stats. Guards two crashes in the original: min()/max() on
    # an empty list when --num-new 0 is used without --merge, and a
    # KeyError on merged source profiles that carry no "meta" block
    # (counting preferences directly is equivalent for generated ones).
    if output_profiles:
        pref_counts = [len(p.get("preferences", [])) for p in output_profiles]
        print(f"\nProfile statistics:")
        print(f"  Min preferences: {min(pref_counts)}")
        print(f"  Max preferences: {max(pref_counts)}")
        print(f"  Avg preferences: {sum(pref_counts)/len(pref_counts):.1f}")


if __name__ == "__main__":
    main()
--ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=24:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/batch_exp-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/batch_exp-%j.err + +# Full experiment: Batch-processable methods (vanilla, all_memory) +# 200 profiles × 30 sessions = 6,000 sessions per method +# Using turn-synchronous batch processing (paper's approach) + +set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_USER=8004 +PORT_AGENT=8003 +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "============================================" +echo "Full Experiment: Batch Methods" +echo "============================================" +echo "Methods: vanilla, all_memory" +echo "Profiles: 200" +echo "Sessions/profile: 30" +echo "Total: 6,000 sessions per method" +echo "" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv +echo "" + +# Kill any existing servers +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start vLLM servers with optimized settings +echo "Starting 8B user simulator (GPU 0-1, TP=2)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_USER \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 \ + --max-model-len 8192 \ + --disable-log-requests \ + --dtype bfloat16 \ + --max-num-seqs 256 & +SERVER_USER_PID=$! + +echo "Starting 8B agent (GPU 2-3, TP=2)..." 
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_AGENT \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 \ + --max-model-len 8192 \ + --disable-log-requests \ + --dtype bfloat16 \ + --max-num-seqs 256 & +SERVER_AGENT_PID=$! + +echo "Waiting for servers (may take 5-10 min for CUDA graph compilation)..." +for i in $(seq 1 200); do + READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0) + READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then + echo "Both servers ready after $((i*3))s" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Still waiting... ($((i*3))s)" + fi + sleep 3 +done + +if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then + echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then + echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +echo "Both servers healthy" +echo "" + +# Run batch experiment (only vanilla is truly stateless) +for METHOD in vanilla; do + echo "============================================" + echo "Running: $METHOD (BATCH processing)" + echo "============================================" + START=$(date +%s) + + python scripts/run_experiments.py \ + --methods $METHOD \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 \ + --n-sessions 30 \ + --max-turns 15 \ + --use-vllm \ + --batch-size 50 \ + --parallel-profiles 50 \ + --output-dir ../results/full_experiment_h200 \ + --profile-path "$PROFILE_PATH" + + END=$(date +%s) + ELAPSED=$((END-START)) + echo "" + echo "$METHOD completed in ${ELAPSED}s" + THROUGHPUT=$((6000 * 3600 / ELAPSED)) + echo "Throughput: ${THROUGHPUT} sessions/hr" + echo "" +done + +# Cleanup +echo "Cleaning 
up..." +kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true + +echo "" +echo "============================================" +echo "BATCH EXPERIMENT COMPLETE" +echo "============================================" +date diff --git a/collaborativeagents/scripts/full_experiment_sequential.sbatch b/collaborativeagents/scripts/full_experiment_sequential.sbatch new file mode 100644 index 0000000..2f3bd4b --- /dev/null +++ b/collaborativeagents/scripts/full_experiment_sequential.sbatch @@ -0,0 +1,131 @@ +#!/bin/bash +#SBATCH --job-name=seq_exp +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=48:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/seq_exp-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/seq_exp-%j.err + +# Full experiment: Sequential methods (rag, rag_vector, contextual, reflection) +# These methods require state tracking between sessions +# 200 profiles × 30 sessions = 6,000 sessions per method + +set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_USER=8004 +PORT_AGENT=8003 +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "============================================" +echo "Full Experiment: Sequential Methods" +echo "============================================" +echo "Methods: rag, rag_vector, contextual, reflection" +echo "Profiles: 200" +echo "Sessions/profile: 
30" +echo "Total: 6,000 sessions per method" +echo "" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv +echo "" + +# Kill any existing servers +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start vLLM servers +echo "Starting 8B user simulator (GPU 0-1, TP=2)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_USER \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 \ + --max-model-len 8192 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_USER_PID=$! + +echo "Starting 8B agent (GPU 2-3, TP=2)..." +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_AGENT \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 \ + --max-model-len 8192 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_AGENT_PID=$! + +echo "Waiting for servers..." +for i in $(seq 1 120); do + READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0) + READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then + echo "Both servers ready after $((i*3))s" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Still waiting... ($((i*3))s)" + fi + sleep 3 +done + +if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then + echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +if ! 
curl -s http://localhost:$PORT_AGENT/health > /dev/null; then + echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +echo "Both servers healthy" +echo "" + +# Run sequential experiments (all stateful methods) +for METHOD in all_memory rag rag_vector contextual reflection; do + echo "============================================" + echo "Running: $METHOD (SEQUENTIAL processing)" + echo "============================================" + START=$(date +%s) + + python scripts/run_experiments.py \ + --methods $METHOD \ + --datasets math-hard,math-500,bigcodebench \ + --n-profiles 200 \ + --n-sessions 30 \ + --max-turns 15 \ + --use-vllm \ + --parallel-profiles 10 \ + --output-dir ../results/full_experiment_h200 \ + --profile-path "$PROFILE_PATH" + + END=$(date +%s) + ELAPSED=$((END-START)) + echo "" + echo "$METHOD completed in ${ELAPSED}s" + if [ $ELAPSED -gt 0 ]; then + THROUGHPUT=$((6000 * 3600 / ELAPSED)) + echo "Throughput: ${THROUGHPUT} sessions/hr" + fi + echo "" +done + +# Cleanup +echo "Cleaning up..." 
+kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true + +echo "" +echo "============================================" +echo "SEQUENTIAL EXPERIMENT COMPLETE" +echo "============================================" +date diff --git a/collaborativeagents/scripts/fullscale_method.sbatch b/collaborativeagents/scripts/fullscale_method.sbatch new file mode 100644 index 0000000..6847f4e --- /dev/null +++ b/collaborativeagents/scripts/fullscale_method.sbatch @@ -0,0 +1,92 @@ +#!/bin/bash +#SBATCH --job-name=fs_%x +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH --time=30:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_%x-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_%x-%j.err + +# Usage: sbatch --job-name=vanilla fullscale_method.sbatch vanilla +METHOD=$1 + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +# Full-precision 70B for user simulator (H200 143GB/GPU can handle it with TP=2) +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +# vLLM memory and parallel workers for methods needing preference extractor +# These methods need GPU memory for embedding/reranker/extractor models on GPUs 2,3 +if [[ "$METHOD" == "all_memory" || "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then + AGENT_MEM=0.40 # 
Leave 60% free for embedding/reranker/extractor + PARALLEL_PROFILES=30 # With CUDA_VISIBLE_DEVICES=2,3, extractor uses correct GPUs +else + AGENT_MEM=0.90 + PARALLEL_PROFILES=50 +fi + +echo "=== Starting vLLM servers ===" +echo "Method: $METHOD" +echo "User simulator: $USER_MODEL (70B full-precision)" +echo "Agent: $AGENT_MODEL (8B)" +echo "Agent memory: $AGENT_MEM" +date + +# User simulator on GPUs 0,1 (70B full-precision, ~70GB/GPU with TP=2) +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +# Agent on GPUs 2,3 +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization $AGENT_MEM \ + --max-model-len 16384 --dtype bfloat16 & + +# Wait for 70B model to load (takes 9-12 minutes) +echo "Waiting for vLLM servers to be ready (this may take 10-15 minutes for 70B)..." 
+for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator (8004) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent (8003) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +echo "Both vLLM servers ready" +sleep 10 + +# Batch processing only for vanilla +if [[ "$METHOD" == "vanilla" ]]; then + EXTRA_ARGS="--use-batch-processing --batch-size 100" +else + EXTRA_ARGS="--no-batch-processing" +fi + +echo "Parallel profiles: $PARALLEL_PROFILES" + +# Run experiment with CUDA_VISIBLE_DEVICES=2,3 so preference extractor/embedding/reranker +# use GPUs 2,3 (which have more headroom) instead of GPUs 0,1 (saturated by 70B model) +CUDA_VISIBLE_DEVICES=2,3 python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm $EXTRA_ARGS --parallel-profiles $PARALLEL_PROFILES \ + --output-dir ../results/fullscale --profile-path $PROFILE_PATH + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/fullscale_vanilla.sbatch b/collaborativeagents/scripts/fullscale_vanilla.sbatch new file mode 100644 index 0000000..798dc5f --- /dev/null +++ b/collaborativeagents/scripts/fullscale_vanilla.sbatch @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=fs_vanilla +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH --time=8:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_vanilla-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/fs_vanilla-%j.err + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export 
HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/user_profiles.jsonl" + +# Start vLLM servers +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 8192 --dtype float16 --download-dir $HF_HOME & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 8192 --dtype bfloat16 & + +for i in {1..60}; do + curl -s http://localhost:8004/health > /dev/null 2>&1 && curl -s http://localhost:8003/health > /dev/null 2>&1 && break + sleep 5 +done +sleep 30 + +python scripts/run_experiments.py --methods vanilla \ + --datasets math-hard --n-profiles 200 --n-sessions 30 --max-turns 15 \ + --use-vllm --use-batch-processing --batch-size 100 --parallel-profiles 50 \ + --output-dir ../results/fullscale --profile-path $PROFILE_PATH + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/generate_complex_profiles.py b/collaborativeagents/scripts/generate_complex_profiles.py new file mode 100644 index 0000000..3838413 --- /dev/null +++ b/collaborativeagents/scripts/generate_complex_profiles.py @@ -0,0 +1,719 @@ +""" +Generate complex user profiles with conditional preferences using LLM. + +This script generates user profiles with ~40 situation-dependent preferences +designed to stress-test retrieval-based personalization systems. 
+""" + +import json +import random +from pathlib import Path +from typing import Optional +from dataclasses import dataclass, field, asdict +import hashlib + +# Will use litellm for generation +try: + import litellm +except ImportError: + litellm = None + + +# ============================================================================ +# Schema Definitions +# ============================================================================ + +@dataclass +class ConditionalPreference: + """A preference that applies under specific conditions.""" + pref_id: str + condition: str # When this preference applies + action: str # What the user prefers + conflict_group: Optional[str] = None # Which preferences this might conflict with + priority_context: list = field(default_factory=list) # Keywords that trigger this pref + + def to_natural_language(self) -> str: + """Convert to natural language statement.""" + return f"When {self.condition}, {self.action}." + + def to_memory_card_format(self) -> dict: + """Convert to format compatible with personalization system's MemoryCard.""" + return { + "condition": self.condition, + "action": self.action, + "confidence": 1.0, + "source": "user_profile", + "pref_id": self.pref_id, + "conflict_group": self.conflict_group, + "priority_context": self.priority_context + } + + +@dataclass +class ConflictGroup: + """Defines a group of preferences that may conflict.""" + group_id: str + description: str + resolution_rule: str # How to programmatically resolve + member_pref_ids: list = field(default_factory=list) + + +@dataclass +class UserProfile: + """A complex user profile with conditional preferences.""" + user_id: str + persona: str # High-level description + preferences: list # List of ConditionalPreference + conflict_groups: dict = field(default_factory=dict) # group_id -> ConflictGroup + + def get_preferences_by_category(self) -> dict: + """Group preferences by their category (derived from pref_id prefix).""" + categories = {} + for pref in 
self.preferences: + cat = pref.pref_id.split('_')[0] + if cat not in categories: + categories[cat] = [] + categories[cat].append(pref) + return categories + + def get_conflicting_preferences(self, query: str) -> list: + """Find preferences that might conflict for a given query.""" + # Simple keyword matching - in practice, use embeddings + triggered = [] + query_lower = query.lower() + for pref in self.preferences: + for keyword in pref.priority_context: + if keyword.lower() in query_lower: + triggered.append(pref) + break + + # Group by conflict group + conflicts = {} + for pref in triggered: + if pref.conflict_group: + if pref.conflict_group not in conflicts: + conflicts[pref.conflict_group] = [] + conflicts[pref.conflict_group].append(pref) + + # Return groups with more than one triggered preference + return {k: v for k, v in conflicts.items() if len(v) > 1} + + def to_dict(self) -> dict: + return { + "user_id": self.user_id, + "persona": self.persona, + "preferences": [asdict(p) for p in self.preferences], + "conflict_groups": {k: asdict(v) for k, v in self.conflict_groups.items()}, + "meta": { + "total_preferences": len(self.preferences), + "total_conflict_groups": len(self.conflict_groups) + } + } + + +# ============================================================================ +# Preference Templates for LLM Generation +# ============================================================================ + +PREFERENCE_CATEGORIES = { + "response_format": { + "description": "How responses should be structured", + "num_preferences": 4, + "example_conflicts": ["bullets vs numbered", "answer-first vs build-up"], + "generation_prompt": """Generate {n} preferences about response formatting. 
+Include conflicting pairs like: +- When to use bullet points vs numbered lists +- When to give answer first vs build up to it +Each preference must have a specific condition (when it applies) and action (what to do).""" + }, + "verbosity": { + "description": "How detailed responses should be", + "num_preferences": 5, + "example_conflicts": ["concise vs detailed", "explain why vs just answer"], + "generation_prompt": """Generate {n} preferences about response verbosity. +Include conflicting pairs like: +- Brief responses vs detailed explanations +- When to explain reasoning vs just give answer +Conditions should include cue phrases like 'quick question', 'briefly', etc.""" + }, + "code_style": { + "description": "Programming and code preferences", + "num_preferences": 8, + "example_conflicts": ["naming conventions by language", "comment styles", "review focus"], + "generation_prompt": """Generate {n} preferences about code style. +Include: +- Language-specific naming conventions (Python snake_case, JS camelCase, etc.) +- Comment styles for different code lengths +- Code review focus (bugs only vs style too) +- Error handling preferences""" + }, + "math_style": { + "description": "Mathematical explanation preferences", + "num_preferences": 6, + "example_conflicts": ["step-by-step vs intuition", "formal vs informal"], + "generation_prompt": """Generate {n} preferences about mathematical explanations. +Include: +- When to show detailed steps vs high-level approach +- Intuition-first vs formula-first for statistics +- How to structure proofs +- Verification requests""" + }, + "interaction_pattern": { + "description": "How to interact with user", + "num_preferences": 6, + "example_conflicts": ["confirm vs execute", "recommend vs list options"], + "generation_prompt": """Generate {n} preferences about interaction patterns. 
+Include: +- When to confirm before acting vs execute directly +- When to recommend vs present options +- How to handle user emotions (frustration, gratitude)""" + }, + "domain_specific": { + "description": "Preferences for specific technical domains", + "num_preferences": 6, + "example_conflicts": ["example-first vs definition-first"], + "generation_prompt": """Generate {n} domain-specific preferences for: +- Machine learning explanations +- System design discussions +- API/library usage +- Data structures (include complexity)""" + }, + "error_correction": { + "description": "How to handle user mistakes", + "num_preferences": 4, + "example_conflicts": ["gentle vs direct correction"], + "generation_prompt": """Generate {n} preferences about error correction. +Include: +- Minor terminology errors vs fundamental misconceptions +- Code bugs +- Correcting own previous responses""" + }, + "output_artifacts": { + "description": "How to present code and commands", + "num_preferences": 4, + "example_conflicts": ["single block vs chunked"], + "generation_prompt": """Generate {n} preferences about output artifacts. +Include: +- Copyable code blocks vs explained chunks +- Command presentation +- Language specification in code fences""" + } +} + + +LLM_GENERATION_PROMPT = """You are generating user preferences for a personalization benchmark. + +## Task +Generate {num_prefs} conditional preferences for the category: {category_name} +Description: {category_description} + +## Requirements +1. Each preference must have: + - A specific CONDITION (when it applies, including trigger phrases/situations) + - An ACTION (what the user prefers to happen) + - A CONFLICT_GROUP (if this preference might conflict with another) + - PRIORITY_CONTEXT (list of keywords that trigger this preference) + +2. Include at least one pair of CONFLICTING preferences that could both be triggered + by different aspects of the same query. 
The conflict should be resolvable by + looking at the specific context. + +3. Conditions should be: + - Specific and observable (not vague like "when appropriate") + - Include trigger phrases users might say + - Cover different situations within this category + +4. Example conflicts for this category: {example_conflicts} + +## Additional Context (if any) +{extra_context} + +## Output Format +Return a JSON array of preferences: +```json +[ + {{ + "pref_id": "{category_prefix}_001", + "condition": "specific situation or trigger phrase", + "action": "what the user prefers", + "conflict_group": "group_name or null", + "priority_context": ["keyword1", "keyword2"] + }}, + ... +] +``` + +Generate exactly {num_prefs} preferences.""" + + +PERSONA_GENERATION_PROMPT = """Generate a realistic user persona for a software developer/researcher. + +## Requirements +1. The persona should feel like a real person with: + - A professional background (role, experience level, domain) + - Communication style tendencies + - Learning preferences + - Work context (startup vs enterprise, solo vs team) + +2. The persona should naturally motivate the preferences that will be assigned. + +3. Keep it to 2-3 sentences. + +## Preference Summary +This user will have preferences in these areas: +{preference_summary} + +## Examples of good personas: +- "A senior backend engineer at a fintech startup who values efficiency and directness. Prefers practical solutions over theoretical discussions, and likes to understand the 'why' behind recommendations." +- "A PhD student in machine learning who is meticulous about mathematical rigor. Appreciates step-by-step derivations and often cross-references multiple sources before accepting an explanation." +- "A junior developer transitioning from frontend to full-stack. Learns best through examples and appreciates patient, incremental explanations without condescension." 
"""
## Output
Return only the persona text (2-3 sentences), no JSON or formatting."""


# ============================================================================
# Conflict Resolution Logic
# ============================================================================

# For each conflict group: which query keywords signal each competing style,
# plus a human-readable description of the resolution rule.
CONFLICT_RESOLUTION_RULES = {
    "format_structure": {
        "signals": {
            "bullets": ["options", "alternatives", "list", "multiple", "comparison", "pros and cons"],
            "numbered": ["steps", "procedure", "how to", "setup", "install", "first", "then", "sequence"]
        },
        "resolution": "sequential_process -> numbered; parallel_items -> bullets"
    },
    "answer_position": {
        "signals": {
            "answer_first": ["what is", "what's", "tell me", "give me", "?"],
            "build_up": ["explain", "why", "how does", "teach", "help me understand"]
        },
        "resolution": "direct_question -> answer_first; learning_intent -> build_up"
    },
    "response_length": {
        "signals": {
            "concise": ["quick", "brief", "short", "tldr", "in a nutshell", "one line"],
            "detailed": ["explain", "elaborate", "in detail", "thoroughly", "complex", "proof"]
        },
        "resolution": "explicit_brevity_cue -> concise (overrides topic complexity)"
    },
    "naming_convention": {
        "signals": {
            "snake_case": ["python", ".py", "def ", "import "],
            "camelCase": ["javascript", "typescript", ".js", ".ts", "const ", "let ", "function "],
            "UPPER_keywords": ["sql", "SELECT", "FROM", "WHERE", "database"]
        },
        "resolution": "determined by programming language detection"
    },
    "autonomy": {
        "signals": {
            "confirm": ["should I", "would you like", "complex", "multiple parts", "project"],
            "execute": ["do this", "make this", "just", "please", "now"]
        },
        "resolution": "ambiguous_task -> confirm; clear_instruction -> execute"
    },
    "code_presentation": {
        "signals": {
            "single_block": ["copy", "paste", "use this", "give me the code", "full code"],
            "chunked": ["teach", "explain", "understand", "walk through", "learn"]
        },
        "resolution": "copy_intent -> single_block; learning_intent -> chunked"
    }
}


def resolve_conflict(conflict_group: str, query: str, candidates: list) -> Optional[str]:
    """
    Programmatically resolve which preference wins in a conflict.

    Args:
        conflict_group: The conflict group ID
        query: The user query
        candidates: List of ConditionalPreference objects in this conflict

    Returns:
        pref_id of the winning preference, or None if cannot resolve
    """
    if conflict_group not in CONFLICT_RESOLUTION_RULES:
        return None

    rules = CONFLICT_RESOLUTION_RULES[conflict_group]
    query_lower = query.lower()

    # Score each candidate based on signal matches.
    # BUG FIX: the original initialized the scores in one loop over
    # `candidates` and then ran the signal-matching loops AFTER that loop
    # ended, so `pref` was the stale last loop variable and only the LAST
    # candidate was ever scored. Scoring must happen per candidate.
    scores = {}
    for pref in candidates:
        scores[pref.pref_id] = 0

        # Check each signal category against this candidate
        for signal_category, keywords in rules["signals"].items():
            for keyword in keywords:
                if keyword.lower() in query_lower:
                    # Check if this signal category matches this preference
                    for ctx in pref.priority_context:
                        if ctx.lower() in signal_category.lower() or signal_category.lower() in ctx.lower():
                            scores[pref.pref_id] += 1
                        # Also check if keyword is in priority context
                        if keyword.lower() in ctx.lower():
                            scores[pref.pref_id] += 1

    # Return highest scoring preference (must have at least one signal hit)
    if scores:
        winner = max(scores, key=scores.get)
        if scores[winner] > 0:
            return winner

    return None


def create_conflict_test_case(conflict_group: str, preferences: list) -> Optional[dict]:
    """
    Create a test case that triggers a specific conflict.

    Returns a dict with:
    - query: A query that triggers multiple preferences
    - triggered_prefs: List of preference IDs triggered
    - correct_pref: The preference that should win
    - resolution_reason: Why this preference wins

    Returns None for groups without a canned ambiguous query.
    """
    if conflict_group not in CONFLICT_RESOLUTION_RULES:
        return None

    # NOTE: the original also fetched CONFLICT_RESOLUTION_RULES[conflict_group]
    # into an unused local `rules`; removed as dead code.

    # Create queries that trigger conflicts
    test_cases = {
        "format_structure": {
            "query": "How do I set up a Python virtual environment? List the main options.",
            "ambiguity": "Both 'set up' (procedure->numbered) and 'list options' (parallel->bullets)",
            "resolution": "Primary intent is setup procedure -> numbered steps"
        },
        "response_length": {
            "query": "Quick question - how does backpropagation work?",
            "ambiguity": "'Quick question' (concise) vs 'how does X work' (complex topic)",
            "resolution": "Explicit brevity cue 'quick question' overrides topic complexity"
        },
        "answer_position": {
            "query": "What is gradient descent and why is it used?",
            "ambiguity": "'What is' (answer first) vs 'why' (build up explanation)",
            "resolution": "Combined question: give brief answer, then explain why"
        },
        "naming_convention": {
            "query": "Write a function to parse JSON in both Python and JavaScript",
            "ambiguity": "Two languages with different conventions",
            "resolution": "Use appropriate convention for each: snake_case for Python, camelCase for JS"
        },
        "autonomy": {
            "query": "Refactor this authentication module to use JWT",
            "ambiguity": "'Refactor' is complex, but instruction is specific",
            "resolution": "Should confirm approach before major refactor"
        },
        "code_presentation": {
            "query": "I want to understand how this sorting algorithm works, give me the code",
            "ambiguity": "'understand' (chunked) vs 'give me the code' (single block)",
            "resolution": "Learning intent detected -> chunked with explanations"
        }
    }

    if conflict_group in test_cases:
        tc = test_cases[conflict_group]
        # Find which preferences are triggered
        triggered = [p for p in preferences if p.conflict_group == conflict_group]
        winner = resolve_conflict(conflict_group, tc["query"], triggered)

        return {
            "conflict_group": conflict_group,
            "query": tc["query"],
            "ambiguity": tc["ambiguity"],
            "triggered_pref_ids": [p.pref_id for p in triggered],
            "correct_pref_id": winner,
            "resolution_reason": tc["resolution"]
        }

    return None


# ============================================================================
# LLM-based Profile Generation
# ============================================================================

def generate_preferences_with_llm(
    category: str,
    model: str = "gpt-4o-mini",
    extra_context: str = ""
) -> list:
    """Generate preferences for a category using LLM."""
    if litellm is None:
        raise ImportError("litellm required for LLM generation")

    cat_info = PREFERENCE_CATEGORIES[category]
    prompt = LLM_GENERATION_PROMPT.format(
        num_prefs=cat_info["num_preferences"],
        category_name=category,
        category_description=cat_info["description"],
        example_conflicts=", ".join(cat_info["example_conflicts"]),
        category_prefix=category[:2],
        extra_context=extra_context or "None"
    )

    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    content = response.choices[0].message.content
    # Extract JSON from response
    try:
        data = json.loads(content)
        if isinstance(data, dict) and "preferences" in data:
            data = data["preferences"]
        return [ConditionalPreference(**p) for p in data]
    except json.JSONDecodeError:
        # Try to extract JSON array from markdown code block
        import re
        match = re.search(r'\[[\s\S]*\]', content)
        if match:
            data = json.loads(match.group())
            return [ConditionalPreference(**p) for p in data]
        raise


def generate_persona_with_llm(
    preferences: list,
    model: str = "gpt-4o-mini"
) -> str:
    """Generate a persona that matches the preferences."""
def generate_persona_with_llm(
    preferences: list,
    model: str = "gpt-4o-mini"
) -> str:
    """Generate a persona that matches the preferences."""
    if litellm is None:
        raise ImportError("litellm required for LLM generation")

    # Summarize preferences by category (pref_id prefix before '_')
    by_cat = {}
    for p in preferences:
        cat = p.pref_id.split('_')[0]
        if cat not in by_cat:
            by_cat[cat] = []
        by_cat[cat].append(p.action[:50] + "...")

    summary = "\n".join([f"- {cat}: {', '.join(actions[:3])}" for cat, actions in by_cat.items()])

    prompt = PERSONA_GENERATION_PROMPT.format(preference_summary=summary)

    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content.strip()


def generate_full_profile(
    user_id: str,
    model: str = "gpt-4o-mini",
    categories: Optional[list] = None
) -> UserProfile:
    """Generate a complete user profile with all preferences.

    Args:
        user_id: Identifier stored on the resulting profile.
        model: Model name forwarded to the LLM generators.
        categories: Preference categories to generate; defaults to all of
            PREFERENCE_CATEGORIES. (FIX: annotated Optional[list]; the
            original said ``list`` while defaulting to None.)
    """
    if categories is None:
        categories = list(PREFERENCE_CATEGORIES.keys())

    all_preferences = []
    for cat in categories:
        prefs = generate_preferences_with_llm(cat, model)
        all_preferences.extend(prefs)

    persona = generate_persona_with_llm(all_preferences, model)

    # Build conflict groups from the generated preferences' markers
    conflict_groups = {}
    for pref in all_preferences:
        if pref.conflict_group:
            if pref.conflict_group not in conflict_groups:
                conflict_groups[pref.conflict_group] = ConflictGroup(
                    group_id=pref.conflict_group,
                    description=CONFLICT_RESOLUTION_RULES.get(pref.conflict_group, {}).get("resolution", ""),
                    resolution_rule=CONFLICT_RESOLUTION_RULES.get(pref.conflict_group, {}).get("resolution", ""),
                    member_pref_ids=[]
                )
            conflict_groups[pref.conflict_group].member_pref_ids.append(pref.pref_id)

    return UserProfile(
        user_id=user_id,
        persona=persona,
        preferences=all_preferences,
        conflict_groups=conflict_groups
    )


# ============================================================================
# Dataset Loading and Challenging Question Selection
# ============================================================================

CHALLENGING_DATASETS = {
    # Existing datasets with difficulty filtering
    "math-hard": {
        "source": "lighteval/MATH-Hard",
        "filter": lambda x: x.get("level") in ["Level 4", "Level 5"],
        "encourage_step_by_step": True
    },
    "humaneval-hard": {
        "source": "openai_humaneval",
        "filter": lambda x: len(x.get("prompt", "")) > 200,  # Longer problems
        "encourage_step_by_step": True
    },

    # New challenging datasets to add
    "gpqa": {
        "source": "Idavidrein/gpqa",
        "description": "PhD-level science questions",
        "filter": lambda x: x.get("difficulty") == "hard",
        "encourage_step_by_step": True
    },
    "theoremqa": {
        "source": "wenhu/TheoremQA",
        "description": "Theorem-based math requiring multi-step proofs",
        "filter": None,
        "encourage_step_by_step": True
    },
    "livecodebench": {
        "source": "livecodebench/livecodebench",
        "description": "Recent competitive programming problems",
        "filter": lambda x: x.get("difficulty") in ["medium", "hard"],
        "encourage_step_by_step": True
    },
    "aime": {
        "source": "AI-MO/aimo-progress-prize",
        "description": "American Invitational Mathematics Examination",
        "filter": None,
        "encourage_step_by_step": True
    },
    "scicode": {
        "source": "scicode-bench/SciCode",
        "description": "Scientific computing problems",
        "filter": None,
        "encourage_step_by_step": True
    }
}


STEP_BY_STEP_PROMPT_ADDITIONS = {
    "math": """
When solving this problem:
1. First identify what type of problem this is
2. State the key concepts/theorems needed
3. Work through the solution step by step
4. Verify your answer
Take your time and show your reasoning at each step.""",

    "code": """
When solving this problem:
1. First understand the requirements and edge cases
2. Outline your approach before writing code
3. Implement step by step, explaining your logic
4. Consider time/space complexity
5. Test with example inputs
Show your reasoning throughout.""",

    "reasoning": """
When solving this problem:
1. Carefully read and identify the key information
2. State any assumptions you're making
3. Work through the logic step by step
4. Check for any flaws in your reasoning
5. State your conclusion clearly
Take your time and explain your thought process."""
}


# ============================================================================
# Batch Generation Script
# ============================================================================

def generate_profiles_batch(
    num_profiles: int,
    output_path: Path,
    model: str = "gpt-4o-mini",
    seed: int = 42
) -> list:
    """Generate multiple user profiles and save them as JSON lines.

    Args:
        num_profiles: Number of profiles to attempt.
        output_path: Destination .jsonl file; parent dirs are created.
        model: Model name forwarded to the LLM-backed generators.
        seed: Seeds the module RNG and the deterministic user_id hashes.

    Returns:
        List of successfully generated UserProfile objects (failed
        generations are logged and skipped, not retried).
    """
    random.seed(seed)
    profiles = []

    for i in range(num_profiles):
        # Deterministic short id derived from (seed, index)
        user_id = f"user_{hashlib.md5(f'{seed}_{i}'.encode()).hexdigest()[:8]}"

        # NOTE: the original computed per-category random weights here
        # ("category_weights") but never used them; removed as dead code.
        try:
            profile = generate_full_profile(user_id, model)
            profiles.append(profile)
            print(f"Generated profile {i+1}/{num_profiles}: {user_id}")
        except Exception as e:
            # Best-effort batch generation: log and continue
            print(f"Error generating profile {i+1}: {e}")
            continue

    # Save profiles, one JSON object per line
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for profile in profiles:
            f.write(json.dumps(profile.to_dict()) + '\n')

    print(f"Saved {len(profiles)} profiles to {output_path}")
    return profiles


def generate_conflict_test_suite(profiles: list, output_path: Path) -> list:
    """Generate test cases for conflict resolution evaluation.

    Emits one test case per (profile, conflict_group) pair for which
    create_conflict_test_case has a canned ambiguous query.
    """
    test_cases = []

    for profile in profiles:
        for conflict_group in profile.conflict_groups:
            tc = create_conflict_test_case(
                conflict_group,
                profile.preferences
            )
            if tc:
                tc["user_id"] = profile.user_id
                test_cases.append(tc)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(test_cases, f, indent=2)

    print(f"Generated {len(test_cases)} conflict test cases")
    return test_cases
test_cases + + +# ============================================================================ +# Main +# ============================================================================ + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--num_profiles", type=int, default=10) + parser.add_argument("--output_dir", type=str, default="collaborativeagents/data/complex_profiles") + parser.add_argument("--model", type=str, default="gpt-4o-mini") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--generate_conflicts", action="store_true") + + args = parser.parse_args() + + output_dir = Path(args.output_dir) + + # Generate profiles + profiles = generate_profiles_batch( + num_profiles=args.num_profiles, + output_path=output_dir / "profiles.jsonl", + model=args.model, + seed=args.seed + ) + + # Generate conflict test cases + if args.generate_conflicts: + generate_conflict_test_suite( + profiles, + output_path=output_dir / "conflict_tests.json" + ) diff --git a/collaborativeagents/scripts/generate_profiles_v2.py b/collaborativeagents/scripts/generate_profiles_v2.py new file mode 100644 index 0000000..c431302 --- /dev/null +++ b/collaborativeagents/scripts/generate_profiles_v2.py @@ -0,0 +1,475 @@ +""" +Generate 100 complex user profiles with ~40 conditional preferences using LLM. + +Key differences from original CollaborativeAgents: +1. 40 conditional preferences (vs their 3 flat preferences) +2. Preferences have explicit conditions for when they apply +3. Conflict groups marked for testing conflict resolution +4. 
LLM-based batch generation with quality control +""" + +import json +import random +import hashlib +from pathlib import Path +from dataclasses import dataclass, field, asdict +from typing import Optional, List, Dict, Any +import argparse + +try: + import litellm +except ImportError: + litellm = None + + +# ============================================================================= +# Preference Category Definitions +# ============================================================================= + +PREFERENCE_CATEGORIES = { + "response_format": { + "num_preferences": 4, + "conflicts": [("rf_bullets", "rf_numbered"), ("rf_answer_first", "rf_build_up")], + "prompt": """Generate 4 preferences about response FORMAT: +1. When to use bullet points vs numbered lists +2. When to lead with the answer vs build up to it + +Each must have: +- A SPECIFIC condition (trigger phrase or situation) +- A clear action (what to do) +- Conflict group (format_structure or answer_position) +- Priority keywords that trigger this preference + +Make conditions mutually exclusive within each conflict group.""" + }, + + "verbosity": { + "num_preferences": 5, + "conflicts": [("vb_concise", "vb_detailed"), ("vb_explain_why", "vb_just_answer")], + "prompt": """Generate 5 preferences about VERBOSITY/LENGTH: +1. When to be concise (user says "quick", "briefly", "TL;DR") +2. When to be detailed (complex topics, "explain", "in depth") +3. When to explain reasoning vs just give answer + +Include explicit trigger phrases in conditions. +Conflict groups: response_length, explanation_depth""" + }, + + "code_style": { + "num_preferences": 8, + "conflicts": [ + ("cs_snake", "cs_camel", "cs_sql_upper"), # By language + ("cs_inline_comments", "cs_docstrings"), # Comment style + ("cs_bugs_only", "cs_full_review") # Review scope + ], + "prompt": """Generate 8 preferences about CODE STYLE: +1-3. Naming conventions BY LANGUAGE (Python=snake_case, JS=camelCase, SQL=UPPERCASE) +4-5. 
Comment styles for short snippets vs production code +6-7. Code review scope (bugs only vs style too) +8. Error handling preference + +Conflict groups: naming_convention, comment_style, review_scope""" + }, + + "math_style": { + "num_preferences": 6, + "conflicts": [("ms_show_steps", "ms_high_level"), ("ms_intuition", "ms_formula")], + "prompt": """Generate 6 preferences about MATHEMATICAL explanations: +1-2. When to show detailed steps vs high-level approach +3-4. When to lead with intuition vs formula (statistics vs pure math) +5. How to structure proofs +6. Practice problems when studying for exams + +Conflict groups: math_detail, math_approach""" + }, + + "interaction_pattern": { + "num_preferences": 6, + "conflicts": [("ip_confirm", "ip_execute"), ("ip_recommend", "ip_compare")], + "prompt": """Generate 6 preferences about INTERACTION patterns: +1-2. When to confirm before acting vs execute directly +3-4. When to recommend vs present options/comparison +5. How to handle user frustration +6. How to handle user thanks/satisfaction + +Conflict groups: autonomy, guidance_style""" + }, + + "domain_specific": { + "num_preferences": 6, + "conflicts": [("ds_example_first", "ds_definition_first")], + "prompt": """Generate 6 DOMAIN-SPECIFIC preferences: +1. ML explanations (include math formulation) +2. System design (components list before interactions) +3. API/library usage (example first) +4. Theoretical concepts (definition first) +5. Data structures (include complexity) +6. Documentation style + +Conflict group: example_position""" + }, + + "error_correction": { + "num_preferences": 4, + "conflicts": [("ec_gentle", "ec_direct")], + "prompt": """Generate 4 preferences about ERROR CORRECTION: +1. Minor terminology errors (correct gently inline) +2. Fundamental misconceptions (address directly) +3. Code bugs +4. 
Agent's own mistakes + +Conflict group: correction_style""" + }, + + "output_artifacts": { + "num_preferences": 4, + "conflicts": [("oa_single_block", "oa_chunked")], + "prompt": """Generate 4 preferences about OUTPUT format: +1. Copyable code (single block) +2. Teaching code (chunked with explanations) +3. Terminal commands (bash blocks with expected output) +4. Always specify language in code fences + +Conflict group: code_presentation""" + } +} + + +LLM_PREFERENCE_GENERATION_PROMPT = """You are generating CONDITIONAL user preferences for a personalization benchmark. + +# Category: {category_name} +# Number of preferences to generate: {num_preferences} + +{category_prompt} + +# Output Requirements +Generate exactly {num_preferences} preferences in this JSON format: +```json +{{ + "preferences": [ + {{ + "pref_id": "{prefix}_001", + "condition": "When X happens / When user says Y / For Z type of content", + "action": "Do A, B, C (be specific)", + "conflict_group": "group_name_or_null", + "priority_context": ["keyword1", "keyword2", "phrase1"] + }}, + ... + ] +}} +``` + +# Critical Rules: +1. Conditions must be SPECIFIC and OBSERVABLE (include trigger phrases) +2. Within a conflict group, conditions must be MUTUALLY EXCLUSIVE +3. Priority_context keywords should appear in queries that trigger this preference +4. Actions must be concrete and verifiable + +Generate preferences that will: +- Create interesting conflicts (RAG should resolve correctly, context methods fail) +- Be testable (we can verify if an agent followed them) +- Be realistic (based on actual user behavior) + +Output ONLY the JSON, no other text.""" + + +PERSONA_GENERATION_PROMPT = """Generate a realistic user persona (2-3 sentences) that would naturally have these preference categories: +{categories} + +The persona should be a software developer, researcher, or technical professional. 
Include: +- Professional background (role, experience level, domain) +- Communication style tendencies +- Work context + +Output ONLY the persona text, no JSON or formatting.""" + + +# ============================================================================= +# Profile Generator +# ============================================================================= + +class ProfileGenerator: + """Generate complex user profiles with conditional preferences.""" + + def __init__(self, model: str = "meta-llama/Llama-3.1-70B-Instruct", seed: int = 42): + self.model = model + self.random = random.Random(seed) + + if litellm is None: + raise ImportError("litellm required for profile generation") + + def _call_llm(self, prompt: str, json_mode: bool = True) -> str: + """Call LLM with prompt.""" + kwargs = { + "model": self.model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.7, + "max_tokens": 4096, + } + + if json_mode: + kwargs["response_format"] = {"type": "json_object"} + + response = litellm.completion(**kwargs) + return response.choices[0].message.content + + def _parse_json(self, text: str) -> dict: + """Parse JSON from response.""" + import re + + try: + return json.loads(text) + except json.JSONDecodeError: + pass + + # Try markdown code block + match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text) + if match: + try: + return json.loads(match.group(1)) + except: + pass + + # Try to find JSON object + match = re.search(r'\{[\s\S]*\}', text) + if match: + try: + return json.loads(match.group()) + except: + pass + + raise ValueError(f"Failed to parse JSON from: {text[:500]}") + + def generate_preferences_for_category( + self, + category: str, + prefix: str + ) -> List[Dict]: + """Generate preferences for a single category.""" + cat_info = PREFERENCE_CATEGORIES[category] + + prompt = LLM_PREFERENCE_GENERATION_PROMPT.format( + category_name=category, + num_preferences=cat_info["num_preferences"], + category_prompt=cat_info["prompt"], + 
prefix=prefix + ) + + response = self._call_llm(prompt, json_mode=True) + data = self._parse_json(response) + + prefs = data.get("preferences", data) + if isinstance(prefs, dict): + prefs = list(prefs.values()) + + # Validate and fix pref_ids + for i, pref in enumerate(prefs): + if "pref_id" not in pref: + pref["pref_id"] = f"{prefix}_{i+1:03d}" + + return prefs + + def generate_persona(self, categories: List[str]) -> str: + """Generate a persona for the given preference categories.""" + prompt = PERSONA_GENERATION_PROMPT.format( + categories=", ".join(categories) + ) + + return self._call_llm(prompt, json_mode=False).strip() + + def generate_profile(self, user_id: str) -> Dict: + """Generate a complete user profile with ~40 preferences.""" + all_preferences = [] + category_prefixes = { + "response_format": "rf", + "verbosity": "vb", + "code_style": "cs", + "math_style": "ms", + "interaction_pattern": "ip", + "domain_specific": "ds", + "error_correction": "ec", + "output_artifacts": "oa" + } + + print(f" Generating preferences for {user_id}...") + for category, prefix in category_prefixes.items(): + try: + prefs = self.generate_preferences_for_category(category, prefix) + all_preferences.extend(prefs) + print(f" {category}: {len(prefs)} preferences") + except Exception as e: + print(f" ERROR in {category}: {e}") + + # Generate persona + print(f" Generating persona...") + persona = self.generate_persona(list(category_prefixes.keys())) + + # Build conflict groups mapping + conflict_groups = {} + for pref in all_preferences: + cg = pref.get("conflict_group") + if cg: + if cg not in conflict_groups: + conflict_groups[cg] = [] + conflict_groups[cg].append(pref["pref_id"]) + + return { + "user_id": user_id, + "persona": persona, + "preferences": all_preferences, + "conflict_groups": conflict_groups, + "meta": { + "total_preferences": len(all_preferences), + "total_conflict_groups": len(conflict_groups), + "generator": "generate_profiles_v2.py" + } + } + + +def 
generate_profiles_batch( + num_profiles: int, + output_path: Path, + model: str = "meta-llama/Llama-3.1-70B-Instruct", + seed: int = 42 +): + """Generate multiple profiles.""" + generator = ProfileGenerator(model=model, seed=seed) + profiles = [] + + for i in range(num_profiles): + user_id = f"user_{hashlib.md5(f'{seed}_{i}'.encode()).hexdigest()[:8]}" + print(f"\n[{i+1}/{num_profiles}] Generating profile: {user_id}") + + try: + profile = generator.generate_profile(user_id) + profiles.append(profile) + print(f" Generated {profile['meta']['total_preferences']} preferences") + except Exception as e: + print(f" ERROR: {e}") + continue + + # Save + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w') as f: + for profile in profiles: + f.write(json.dumps(profile) + '\n') + + print(f"\n{'='*60}") + print(f"Generated {len(profiles)} profiles") + print(f"Saved to: {output_path}") + + return profiles + + +# ============================================================================= +# Fallback: Generate from Schema (No LLM Required) +# ============================================================================= + +def generate_profiles_from_schema( + num_profiles: int, + schema_path: Path, + output_path: Path, + seed: int = 42 +) -> List[Dict]: + """ + Generate profiles from the predefined schema (no LLM calls). + Useful for testing or when API is unavailable. + """ + with open(schema_path) as f: + schema = json.load(f) + + random.seed(seed) + profiles = [] + + # Extract all preferences from schema + all_prefs = [] + for cat in schema["preference_categories"]: + all_prefs.extend(cat["preferences"]) + + # Sample personas + sample_personas = [ + "A senior backend engineer who values efficiency and directness. Prefers practical solutions over theoretical discussions.", + "A PhD student in ML who is meticulous about mathematical rigor. Appreciates step-by-step derivations.", + "A junior developer learning full-stack. 
Prefers patient, incremental explanations with examples.", + "A DevOps engineer focused on automation. Wants concise, actionable answers with commands to run.", + "A data scientist who thinks visually. Prefers intuition before formulas and lots of examples.", + "A tech lead reviewing code from their team. Focuses on maintainability and best practices.", + "A researcher prototyping quickly. Wants working code fast, willing to refactor later.", + "A student preparing for technical interviews. Needs step-by-step problem solving practice.", + ] + + for i in range(num_profiles): + user_id = f"user_{hashlib.md5(f'{seed}_{i}'.encode()).hexdigest()[:8]}" + + # Select random subset of preferences (35-45) + num_prefs = random.randint(35, 45) + selected_prefs = random.sample(all_prefs, min(num_prefs, len(all_prefs))) + + # Build conflict groups + conflict_groups = {} + for pref in selected_prefs: + cg = pref.get("conflict_group") + if cg: + if cg not in conflict_groups: + conflict_groups[cg] = [] + conflict_groups[cg].append(pref["pref_id"]) + + profile = { + "user_id": user_id, + "persona": random.choice(sample_personas), + "preferences": selected_prefs, + "conflict_groups": conflict_groups, + "meta": { + "total_preferences": len(selected_prefs), + "total_conflict_groups": len(conflict_groups), + "generator": "schema_based" + } + } + profiles.append(profile) + + # Save + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w') as f: + for profile in profiles: + f.write(json.dumps(profile) + '\n') + + print(f"Generated {len(profiles)} profiles from schema") + return profiles + + +# ============================================================================= +# Main +# ============================================================================= + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate complex user profiles with conditional preferences" + ) + parser.add_argument("--num_profiles", type=int, 
default=100, + help="Number of profiles to generate") + parser.add_argument("--output", type=str, + default="collaborativeagents/data/complex_profiles_v2/profiles.jsonl") + parser.add_argument("--model", type=str, + default="meta-llama/Llama-3.1-70B-Instruct", + help="LLM model for generation") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--from_schema", type=str, default=None, + help="Generate from schema file instead of LLM") + + args = parser.parse_args() + output_path = Path(args.output) + + if args.from_schema: + generate_profiles_from_schema( + num_profiles=args.num_profiles, + schema_path=Path(args.from_schema), + output_path=output_path, + seed=args.seed + ) + else: + generate_profiles_batch( + num_profiles=args.num_profiles, + output_path=output_path, + model=args.model, + seed=args.seed + ) diff --git a/collaborativeagents/scripts/generate_training_data.sh b/collaborativeagents/scripts/generate_training_data.sh new file mode 100644 index 0000000..bdd5fba --- /dev/null +++ b/collaborativeagents/scripts/generate_training_data.sh @@ -0,0 +1,22 @@ +# python -m sglang.launch_server --model-path meta-llama/Llama-3.3-70B-Instruct --port 8004 --tp-size 4 --context-length 16384 + +BATCH_SIZE=100 + +# Loop over eval sizes and datasets +for EVAL_SIZE in 20; do + for DATASET in math-hard math-500 logiqa mmlu medqa; do + # Convert dataset name for file paths (replace - with _) + DATASET_FILE=$(echo ${DATASET} | tr '-' '_') + + echo "Generating training data for dataset: ${DATASET} with eval_size ${EVAL_SIZE}" + + # training_data_with_user_profiles_with_preferences + python3 run.py --experiment_type training_data_with_user_profiles_with_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct 
--collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \ + --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/training/training_data/${DATASET_FILE}_llama70b_user_llama70b_agent_training_data_with_reflection_eval_size_${EVAL_SIZE}.jsonl \ + >> /shared/storage-01/users/mehri2/mem/collaborativeagents/training/training_data/${DATASET_FILE}_llama70b_user_llama70b_agent_training_data_with_reflection_eval_size_${EVAL_SIZE}.out 2>&1 + + done +done \ No newline at end of file diff --git a/collaborativeagents/scripts/preflight_test.py b/collaborativeagents/scripts/preflight_test.py new file mode 100644 index 0000000..2411f1f --- /dev/null +++ b/collaborativeagents/scripts/preflight_test.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python +""" +Pre-flight tests before running full experiments. + +Tests: +1. Timeout handling (infinite timeout) +2. Large batch stress test (batch=100) +3. Context length handling (auto-reduce max_tokens) +4. Error recovery (partial failures) +5. Sequential profile processing (for RAG/reflection methods) +6. 
Memory usage estimation +""" + +import sys +import os +import time +import json +import asyncio + +sys.path.insert(0, '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents') + +from agents.batch_vllm_agent import BatchVLLMClient, BatchConversationGenerator + + +def test_1_timeout_handling(user_url: str): + """Test 1: Infinite timeout configuration.""" + print("\n" + "="*60) + print("TEST 1: Timeout Handling (Infinite Timeout)") + print("="*60) + + # Create client with infinite timeout + client = BatchVLLMClient( + vllm_url=user_url, + max_tokens=256, + temperature=0.7, + timeout=None, # Infinite timeout + max_concurrent=50 + ) + + print(f"✓ Client created with timeout=None (infinite)") + print(f" Model: {client.model_name}") + print(f" Max concurrent: {client.max_concurrent}") + + # Test with a simple request + messages = [[{"role": "user", "content": "Say 'hello' and nothing else."}]] + + start = time.time() + results = client.batch_completion(messages) + elapsed = time.time() - start + + if results[0]: + print(f"✓ Single request succeeded in {elapsed:.1f}s") + print(f" Response: {results[0][:50]}...") + return True + else: + print(f"✗ Single request failed") + return False + + +def test_2_large_batch(user_url: str, batch_size: int = 100): + """Test 2: Large batch stress test.""" + print("\n" + "="*60) + print(f"TEST 2: Large Batch Stress Test (batch={batch_size})") + print("="*60) + + client = BatchVLLMClient( + vllm_url=user_url, + max_tokens=128, # Small to speed up test + temperature=0.7, + timeout=None, + max_concurrent=100 + ) + + # Create batch of simple requests + messages_list = [ + [{"role": "user", "content": f"Count from 1 to 5. 
Request #{i+1}"}] + for i in range(batch_size) + ] + + print(f"Sending {batch_size} concurrent requests...") + start = time.time() + results = client.batch_completion(messages_list) + elapsed = time.time() - start + + successes = sum(1 for r in results if r is not None) + + print(f"\nResults:") + print(f" Successes: {successes}/{batch_size}") + print(f" Time: {elapsed:.1f}s") + print(f" Throughput: {successes * 3600 / elapsed:.0f} requests/hr") + + if successes >= batch_size * 0.9: + print(f"✓ Batch test PASSED (>90% success)") + return True + else: + print(f"✗ Batch test FAILED (<90% success)") + return False + + +def test_3_context_length_handling(user_url: str): + """Test 3: Context length error handling.""" + print("\n" + "="*60) + print("TEST 3: Context Length Handling") + print("="*60) + + client = BatchVLLMClient( + vllm_url=user_url, + max_tokens=512, # Request large output + temperature=0.7, + timeout=None, + max_concurrent=10 + ) + + # Create request with very long input (near 4096 token limit) + long_text = "This is a test. 
" * 500 # ~2000 tokens + messages_list = [ + [{"role": "user", "content": f"Summarize: {long_text}"}], # Will hit limit + [{"role": "user", "content": "Say hello."}], # Should succeed + ] + + print("Testing with 1 long + 1 short request...") + results = client.batch_completion(messages_list) + + # The long one might fail or get reduced max_tokens + # The short one should succeed + short_success = results[1] is not None + + if short_success: + print(f"✓ Short request succeeded despite long request") + print(f" Long request result: {'OK' if results[0] else 'Handled gracefully'}") + return True + else: + print(f"✗ Short request should not have failed") + return False + + +def test_4_error_recovery(user_url: str, agent_url: str): + """Test 4: Error recovery in batch processing.""" + print("\n" + "="*60) + print("TEST 4: Error Recovery (Partial Failures)") + print("="*60) + + generator = BatchConversationGenerator( + user_vllm_url=user_url, + agent_vllm_url=agent_url, + max_turns=3, + user_max_tokens=256, + agent_max_tokens=256, + ) + + # Mix of valid and problematic samples + samples = [ + {"problem": "What is 2+2?", "solution": "4"}, + {"problem": "What is 3+3?", "solution": "6"}, + {"problem": "What is 4+4?", "solution": "8"}, + ] + + print("Testing batch generation with 3 samples, 3 turns...") + start = time.time() + results = generator.generate_batch( + samples=samples, + user_persona="A student.", + user_preferences=None, + ) + elapsed = time.time() - start + + successes = sum(1 for r in results if r is not None) + print(f"\nResults:") + print(f" Successes: {successes}/{len(samples)}") + print(f" Time: {elapsed:.1f}s") + + if successes >= 2: + print(f"✓ Error recovery PASSED") + return True + else: + print(f"✗ Error recovery FAILED") + return False + + +def test_5_sequential_profile(user_url: str, agent_url: str): + """Test 5: Sequential profile processing (simulating RAG/reflection).""" + print("\n" + "="*60) + print("TEST 5: Sequential Profile Processing 
(RAG/Reflection Simulation)") + print("="*60) + + # Simulate 3 profiles, each with 2 sequential sessions + # This is how RAG/reflection methods work - sequential within profile + + generator = BatchConversationGenerator( + user_vllm_url=user_url, + agent_vllm_url=agent_url, + max_turns=2, + user_max_tokens=256, + agent_max_tokens=256, + ) + + n_profiles = 3 + sessions_per_profile = 2 + total_time = 0 + total_sessions = 0 + + for profile_idx in range(n_profiles): + profile_start = time.time() + + # Sequential sessions for this profile + for session_idx in range(sessions_per_profile): + samples = [ + {"problem": f"Profile {profile_idx+1}, Session {session_idx+1}: What is {profile_idx+session_idx}+1?", + "solution": str(profile_idx + session_idx + 1)} + ] + + results = generator.generate_batch( + samples=samples, + user_persona=f"User profile {profile_idx+1}", + user_preferences="Be concise.", + ) + + if results[0]: + total_sessions += 1 + + profile_elapsed = time.time() - profile_start + total_time += profile_elapsed + print(f" Profile {profile_idx+1}: {profile_elapsed:.1f}s for {sessions_per_profile} sessions") + + print(f"\nResults:") + print(f" Total sessions: {total_sessions}/{n_profiles * sessions_per_profile}") + print(f" Total time: {total_time:.1f}s") + print(f" Throughput: {total_sessions * 3600 / total_time:.0f} sessions/hr") + + if total_sessions >= n_profiles * sessions_per_profile * 0.8: + print(f"✓ Sequential profile test PASSED") + return True + else: + print(f"✗ Sequential profile test FAILED") + return False + + +def test_6_memory_estimation(): + """Test 6: Memory usage estimation.""" + print("\n" + "="*60) + print("TEST 6: Memory Usage Estimation") + print("="*60) + + try: + import subprocess + result = subprocess.run( + ['nvidia-smi', '--query-gpu=index,memory.used,memory.total', '--format=csv,noheader,nounits'], + capture_output=True, text=True + ) + + print("GPU Memory Usage:") + for line in result.stdout.strip().split('\n'): + parts = 
line.split(', ') + if len(parts) == 3: + gpu_idx, used, total = parts + used_pct = float(used) / float(total) * 100 + print(f" GPU {gpu_idx}: {used}/{total} MiB ({used_pct:.1f}%)") + + print("✓ Memory estimation completed") + return True + except Exception as e: + print(f"✗ Could not get memory info: {e}") + return False + + +def run_all_tests(user_url: str, agent_url: str): + """Run all pre-flight tests.""" + print("\n" + "="*60) + print("PRE-FLIGHT TESTS FOR FULL EXPERIMENTS") + print("="*60) + print(f"User URL: {user_url}") + print(f"Agent URL: {agent_url}") + print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S')}") + + results = {} + + # Run each test + results['timeout'] = test_1_timeout_handling(user_url) + results['large_batch'] = test_2_large_batch(user_url, batch_size=50) + results['context_length'] = test_3_context_length_handling(user_url) + results['error_recovery'] = test_4_error_recovery(user_url, agent_url) + results['sequential_profile'] = test_5_sequential_profile(user_url, agent_url) + results['memory'] = test_6_memory_estimation() + + # Summary + print("\n" + "="*60) + print("PRE-FLIGHT TEST SUMMARY") + print("="*60) + + all_passed = True + for test_name, passed in results.items(): + status = "✓ PASSED" if passed else "✗ FAILED" + print(f" {test_name}: {status}") + if not passed: + all_passed = False + + print() + if all_passed: + print("✓ ALL TESTS PASSED - Ready for full experiments!") + else: + print("✗ SOME TESTS FAILED - Review before proceeding") + + return all_passed + + +if __name__ == "__main__": + user_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8004/v1" + agent_url = sys.argv[2] if len(sys.argv) > 2 else "http://localhost:8003/v1" + + success = run_all_tests(user_url, agent_url) + sys.exit(0 if success else 1) diff --git a/collaborativeagents/scripts/quick_rag_debug.sbatch b/collaborativeagents/scripts/quick_rag_debug.sbatch new file mode 100644 index 0000000..efc4c31 --- /dev/null +++ 
b/collaborativeagents/scripts/quick_rag_debug.sbatch @@ -0,0 +1,78 @@ +#!/bin/bash +#SBATCH --job-name=rag_debug +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH --time=00:40:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.err + +# Quick debug test: 2 profiles, 5 sessions - should complete in ~20 min + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +echo "=== Quick RAG Debug Test ===" +echo "2 profiles, 5 sessions - checking if extraction/storage works" +date + +# Clear empty store +> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl +rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy + +# Start vLLM servers +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 
--tensor-parallel-size 2 --gpu-memory-utilization 0.45 \ + --max-model-len 16384 --dtype bfloat16 & + +echo "Waiting for vLLM servers..." +for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator ready after $((i*5))s" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent ready after $((i*5))s" + break + fi + sleep 5 +done +sleep 5 + +OUTPUT_DIR="../results/rag_debug_$(date +%Y%m%d_%H%M%S)" + +# Only test RAG to see debug output +echo "============================================" +echo "Testing RAG with debug output" +echo "============================================" + +python scripts/run_experiments.py --methods rag \ + --datasets math-hard --n-profiles 2 --n-sessions 5 --max-turns 10 \ + --use-vllm --no-batch-processing --parallel-profiles 2 \ + --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH + +echo "Memory cards in file: $(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl)" +echo "=== Done ===" +date + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/quick_test_a100.sbatch b/collaborativeagents/scripts/quick_test_a100.sbatch new file mode 100644 index 0000000..0d823f5 --- /dev/null +++ b/collaborativeagents/scripts/quick_test_a100.sbatch @@ -0,0 +1,136 @@ +#!/bin/bash +#SBATCH --job-name=quick_batch_a100 +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuA100x4 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=128G +#SBATCH --time=01:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_a100-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_a100-%j.err + +# Quick test: 10 profiles × 5 sessions = 50 sessions on A100 +# Tests batch (vanilla) processing 
while H200 queue is busy + +set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_USER=8004 +PORT_AGENT=8003 + +echo "============================================" +echo "Quick Test: Batch Processing on A100" +echo "============================================" +echo "Profiles: 10" +echo "Sessions/profile: 5" +echo "Total: 50 sessions" +echo "" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv +echo "" + +# Kill any existing servers +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start vLLM servers +echo "Starting 8B user simulator (GPU 0-1, TP=2)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_USER \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_USER_PID=$! + +echo "Starting 8B agent (GPU 2-3, TP=2)..." +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_AGENT \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_AGENT_PID=$! + +echo "Waiting for servers..." +for i in $(seq 1 100); do + READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0) + READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then + echo "Both servers ready after $((i*3))s" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Still waiting... 
($((i*3))s)" + fi + sleep 3 +done + +if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then + echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then + echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +echo "Both servers healthy" +echo "" + +# Run quick test with vanilla (batch) +echo "============================================" +echo "Test: BATCH processing (vanilla method)" +echo "============================================" +START=$(date +%s) + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +python scripts/run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 10 \ + --n-sessions 5 \ + --use-vllm \ + --batch-size 50 \ + --parallel-profiles 10 \ + --output-dir ../results/quick_test_batch_a100 \ + --profile-path "$PROFILE_PATH" + +END=$(date +%s) +ELAPSED=$((END-START)) +echo "" +echo "Vanilla (batch) completed in ${ELAPSED}s" + +# Cleanup +echo "" +echo "Cleaning up..." 
+# Stop both vLLM servers; ignore errors if they already exited.
+kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true
+
+echo ""
+echo "============================================"
+echo "QUICK TEST RESULTS (A100)"
+echo "============================================"
+echo ""
+echo "Vanilla (BATCH): ${ELAPSED}s for 50 sessions"
+echo ""
+
+# Integer throughput; the guard avoids division by zero if timing collapsed.
+if [ $ELAPSED -gt 0 ]; then
+    THROUGHPUT=$((50 * 3600 / ELAPSED))
+    echo "Throughput: ${THROUGHPUT} sessions/hr"
+fi
+
+echo ""
+echo "Results saved to: ../results/quick_test_batch_a100/"
+echo ""
+date
diff --git a/collaborativeagents/scripts/quick_test_batch.sh b/collaborativeagents/scripts/quick_test_batch.sh
new file mode 100755
index 0000000..4be6573
--- /dev/null
+++ b/collaborativeagents/scripts/quick_test_batch.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+# Quick test: 10 profiles × 5 sessions = 50 sessions
+# Tests both batch (vanilla) and sequential (rag) processing
+
+# Abort on the first failing command.
+set -e
+
+# Enter the project and activate the evaluation conda environment.
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+# Shared HF cache and project import path.
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"
+
+# Same local 8B checkpoint serves as both user simulator and agent,
+# on two separate ports.
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PORT_USER=8004
+PORT_AGENT=8003
+
+echo "============================================"
+echo "Quick Test: Batch Processing Verification"
+echo "============================================"
+echo "Profiles: 10"
+echo "Sessions/profile: 5"
+echo "Total: 50 sessions"
+echo ""
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+echo ""
+
+# Kill any existing servers
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start vLLM servers
+echo "Starting 8B user simulator (GPU 0-1, TP=2)..."
+# User simulator: OpenAI-compatible vLLM server on GPUs 0-1, tensor-parallel 2.
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B \
+    --port $PORT_USER \
+    --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.85 \
+    --max-model-len 4096 \
+    --disable-log-requests \
+    --dtype bfloat16 &
+SERVER_USER_PID=$!
+
+echo "Starting 8B agent (GPU 2-3, TP=2)..."
+# Agent: identical server config on GPUs 2-3, second port.
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B \
+    --port $PORT_AGENT \
+    --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.85 \
+    --max-model-len 4096 \
+    --disable-log-requests \
+    --dtype bfloat16 &
+SERVER_AGENT_PID=$!
+
+echo "Waiting for servers..."
+# Poll both /health endpoints up to 100 times (3 s apart, ~5 min total);
+# print a progress line every 20 polls.
+for i in $(seq 1 100); do
+    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
+    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
+    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
+        echo "Both servers ready after $((i*3))s"
+        break
+    fi
+    if [ $((i % 20)) -eq 0 ]; then
+        echo "  Still waiting... ($((i*3))s)"
+    fi
+    sleep 3
+done
+
+# Final health check: if either server never came up, kill both and abort.
+if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
+    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
+fi
+if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
+    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
+fi
+echo "✓ Both servers healthy"
+echo ""
+
+# Run quick test with vanilla (batch) and rag (sequential)
+echo "============================================"
+echo "Test 1: BATCH processing (vanilla method)"
+echo "============================================"
+START=$(date +%s)
+
+# Use absolute path for profile (your 100 profiles with ~40 preferences each)
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_100.jsonl"
+
+# 10 profiles × 5 sessions on mmlu, processed as one big batch.
+python scripts/run_experiments.py \
+    --methods vanilla \
+    --datasets mmlu \
+    --n-profiles 10 \
+    --n-sessions 5 \
+    --use-vllm \
+    --batch-size 50 \
+    --parallel-profiles 10 \
+    --output-dir ../results/quick_test_batch \
+    --profile-path "$PROFILE_PATH"
+
+# Wall-clock timing for the batch run.
+END=$(date +%s)
+ELAPSED_BATCH=$((END-START))
+echo ""
+echo "Vanilla (batch) completed in ${ELAPSED_BATCH}s"
+
+ELAPSED_SEQ=0
+# Skip sequential test for now - just validate batch processing works
+echo ""
+echo "Skipping Test 2 (sequential) for quick validation..."
+
+# Cleanup
+echo ""
+echo "Cleaning up..."
+# Stop both vLLM servers; ignore errors if they already exited.
+kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true
+
+echo ""
+echo "============================================"
+echo "QUICK TEST RESULTS"
+echo "============================================"
+echo ""
+echo "Vanilla (BATCH): ${ELAPSED_BATCH}s for 50 sessions"
+echo "RAG (SEQUENTIAL): ${ELAPSED_SEQ}s for 50 sessions"
+echo ""
+
+# Integer throughput; guards avoid division by zero (ELAPSED_SEQ is 0
+# because the sequential test is skipped above).
+if [ $ELAPSED_BATCH -gt 0 ]; then
+    THROUGHPUT_BATCH=$((50 * 3600 / ELAPSED_BATCH))
+    echo "Vanilla throughput: ${THROUGHPUT_BATCH} sessions/hr"
+fi
+if [ $ELAPSED_SEQ -gt 0 ]; then
+    THROUGHPUT_SEQ=$((50 * 3600 / ELAPSED_SEQ))
+    echo "RAG throughput: ${THROUGHPUT_SEQ} sessions/hr"
+fi
+
+echo ""
+echo "Results saved to:"
+echo "  ../results/quick_test_batch/"
+echo "  ../results/quick_test_sequential/"
+echo ""
+date
diff --git a/collaborativeagents/scripts/quick_test_h200.sbatch b/collaborativeagents/scripts/quick_test_h200.sbatch
new file mode 100644
index 0000000..a1f115d
--- /dev/null
+++ b/collaborativeagents/scripts/quick_test_h200.sbatch
@@ -0,0 +1,137 @@
+#!/bin/bash
+#SBATCH --job-name=quick_batch_test
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --gres=gpu:4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=128G
+#SBATCH --time=01:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_test-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/quick_batch_test-%j.err
+
+# Quick test: 10 profiles × 5 sessions = 50 sessions
+# Tests batch (vanilla) processing on H200
+
+# Abort on the first failing command.
+set -e
+
+# Enter the project and activate the evaluation conda environment.
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+# Shared HF cache and project import path.
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"
+
+# Local 8B checkpoint served as both user simulator and agent.
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+PORT_USER=8004
+PORT_AGENT=8003
+
+echo "============================================"
+echo "Quick Test: Batch Processing on H200"
+echo "============================================"
+echo "Profiles: 10"
+echo "Sessions/profile: 5"
+echo "Total: 50 sessions"
+echo ""
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+echo ""
+
+# Kill any existing servers
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+# Start vLLM servers
+# Both endpoints serve the same local 8B checkpoint, each pinned to its own
+# GPU pair via CUDA_VISIBLE_DEVICES with tensor-parallel size 2.
+echo "Starting 8B user simulator (GPU 0-1, TP=2)..."
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B \
+    --port $PORT_USER \
+    --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.85 \
+    --max-model-len 8192 \
+    --disable-log-requests \
+    --dtype bfloat16 &
+SERVER_USER_PID=$!
+
+echo "Starting 8B agent (GPU 2-3, TP=2)..."
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_8B \
+    --port $PORT_AGENT \
+    --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.85 \
+    --max-model-len 8192 \
+    --disable-log-requests \
+    --dtype bfloat16 &
+SERVER_AGENT_PID=$!
+
+echo "Waiting for servers (may take 5-10 min for CUDA graph compilation)..."
+# Poll both /health endpoints: 200 iterations x 3 s = 10 min ceiling.
+for i in $(seq 1 200); do
+    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
+    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
+    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
+        echo "Both servers ready after $((i*3))s"
+        break
+    fi
+    if [ $((i % 20)) -eq 0 ]; then
+        echo "  Still waiting... ($((i*3))s)"
+    fi
+    sleep 3
+done
+
+# Final health check: bail out (and reap both server processes) if either
+# endpoint never became healthy within the polling window.
+if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
+    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
+fi
+if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
+    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
+fi
+echo "Both servers healthy"
+echo ""
+
+# Run quick test with vanilla (batch)
+echo "============================================"
+echo "Test: BATCH processing (vanilla method)"
+echo "============================================"
+START=$(date +%s)
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_100.jsonl"
+
+python scripts/run_experiments.py \
+    --methods vanilla \
+    --datasets math-hard \
+    --n-profiles 10 \
+    --n-sessions 5 \
+    --max-turns 15 \
+    --use-vllm \
+    --batch-size 50 \
+    --parallel-profiles 10 \
+    --output-dir ../results/quick_test_batch_h200 \
+    --profile-path "$PROFILE_PATH"
+
+END=$(date +%s)
+ELAPSED=$((END-START))
+echo ""
+echo "Vanilla (batch) completed in ${ELAPSED}s"
+
+# Cleanup
+echo ""
+echo "Cleaning up..."
+kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true
+
+echo ""
+echo "============================================"
+echo "QUICK TEST RESULTS"
+echo "============================================"
+echo ""
+echo "Vanilla (BATCH): ${ELAPSED}s for 50 sessions"
+echo ""
+
+# Sessions/hour via integer shell arithmetic, guarded against ELAPSED == 0.
+if [ $ELAPSED -gt 0 ]; then
+    THROUGHPUT=$((50 * 3600 / ELAPSED))
+    echo "Throughput: ${THROUGHPUT} sessions/hr"
+fi
+
+echo ""
+echo "Results saved to: ../results/quick_test_batch_h200/"
+echo ""
+date
diff --git a/collaborativeagents/scripts/rag_debug_interactive.sbatch b/collaborativeagents/scripts/rag_debug_interactive.sbatch
new file mode 100644
index 0000000..40a396c
--- /dev/null
+++ b/collaborativeagents/scripts/rag_debug_interactive.sbatch
@@ -0,0 +1,87 @@
+#!/bin/bash
+#SBATCH --job-name=rag_debug
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=00:40:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_debug-%j.err
+
+# Debug test on interactive partition: 5 profiles, 15 sessions
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== RAG Debug Test (Interactive) ==="
+echo "5 profiles, 15 sessions - with debug output"
+date
+
+# Clear empty store
+# (truncate the memory-card file and drop cached embeddings so the run starts cold)
+> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+# Start vLLM servers
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $USER_MODEL \
+    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $AGENT_MODEL \
+    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
+    --max-model-len 16384 --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+# Sequential polls: up to ~17 min for the 70B simulator, ~5 min for the 8B agent.
+for i in {1..200}; do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "User simulator ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+for i in {1..60}; do
+    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+        echo "Agent ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+sleep 5
+
+OUTPUT_DIR="../results/rag_debug_$(date +%Y%m%d_%H%M%S)"
+
+for METHOD in vanilla rag rag_vector; do
+    echo ""
+    echo "============================================"
+    echo "Testing: $METHOD"
+    echo "============================================"
+
+    # Clear memory store before each method
+    > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+    rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+    date
+    python scripts/run_experiments.py --methods $METHOD \
+        --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+        --use-vllm --no-batch-processing --parallel-profiles 5 \
+        --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+    echo "Memory cards: $(wc -l < 
/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl)"
+done
+
+echo ""
+echo "=== Done ==="
+date
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/rag_test_v4.sbatch b/collaborativeagents/scripts/rag_test_v4.sbatch
new file mode 100644
index 0000000..ab3c8f6
--- /dev/null
+++ b/collaborativeagents/scripts/rag_test_v4.sbatch
@@ -0,0 +1,92 @@
+#!/bin/bash
+#SBATCH --job-name=rag_test
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=00:40:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.err
+
+# Test with:
+# 1. Skip reranking when few candidates
+# 2. Reduced vLLM memory (0.35 for agent) to leave room for reranker
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== RAG Test v4 ==="
+# NOTE(review): the next lines appear garbled in this hunk — the tail of this
+# echo (likely "...when <N candidates") plus an intervening `date`/store-clearing
+# `>` redirection seem to have been fused during extraction; confirm against the
+# original script before relying on this text.
+echo "Changes: Skip rerank when /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+# Start vLLM servers with adjusted memory
+# User simulator: 0.90 (unchanged)
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $USER_MODEL \
+    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+# Agent: reduced from 0.45 to 0.35 to leave room for reranker/embedding
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $AGENT_MODEL \
+    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.35 \
+    --max-model-len 16384 --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+for i in {1..200}; do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "User simulator ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+for i in {1..60}; do
+    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+        echo "Agent ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+sleep 5
+
+OUTPUT_DIR="../results/rag_test_v4_$(date +%Y%m%d_%H%M%S)"
+
+for METHOD in vanilla rag rag_vector; do
+    echo ""
+    echo "============================================"
+    echo "Testing: $METHOD"
+    echo "============================================"
+
+    # Clear memory store before each method
+    > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+    rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+    date
+    python scripts/run_experiments.py --methods $METHOD \
+        --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+        --use-vllm --no-batch-processing --parallel-profiles 5 \
+        --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+    echo "Method $METHOD completed"
+done
+
+echo ""
+echo "=== Done ==="
+date
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/rag_test_v5.sbatch b/collaborativeagents/scripts/rag_test_v5.sbatch
new file mode 100644
index 
0000000..d739253
--- /dev/null
+++ b/collaborativeagents/scripts/rag_test_v5.sbatch
@@ -0,0 +1,96 @@
+#!/bin/bash
+#SBATCH --job-name=rag_test
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8-interactive
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=00:40:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_test-%j.err
+
+# Test with explicit device assignment:
+# - vLLM user sim: GPUs 0,1
+# - vLLM agent: GPUs 2,3 (0.45 memory)
+# - Embedding: cuda:2
+# - Reranker: cuda:3
+# - Extractor: cuda:2
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== RAG Test v5 ==="
+echo "Explicit device assignment: embed->cuda:2, reranker->cuda:3, extractor->cuda:2"
+echo "5 profiles, 15 sessions"
+date
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+# Clear empty store
+# (truncate memory cards and drop cached embeddings so the run starts cold)
+> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+# Start vLLM servers
+# User simulator: GPUs 0,1
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
+    --model $USER_MODEL \
+    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
+    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &
+
+# Agent: GPUs 2,3 (restored to 0.45 since HF models now explicitly assigned)
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
+    --model $AGENT_MODEL \
+    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
+    --max-model-len 16384 --dtype bfloat16 &
+
+echo "Waiting for vLLM servers..."
+for i in {1..200}; do
+    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
+        echo "User simulator ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+for i in {1..60}; do
+    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
+        echo "Agent ready after $((i*5))s"
+        break
+    fi
+    sleep 5
+done
+sleep 5
+
+OUTPUT_DIR="../results/rag_test_v5_$(date +%Y%m%d_%H%M%S)"
+
+for METHOD in vanilla rag rag_vector; do
+    echo ""
+    echo "============================================"
+    echo "Testing: $METHOD"
+    echo "============================================"
+
+    # Clear memory store before each method
+    > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+    rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+
+    date
+    python scripts/run_experiments.py --methods $METHOD \
+        --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+        --use-vllm --no-batch-processing --parallel-profiles 5 \
+        --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+
+    echo "Method $METHOD completed"
+done
+
+echo ""
+echo "=== Done ==="
+date
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/run.py b/collaborativeagents/scripts/run.py
new file mode 100644
index 0000000..f6ed79e
--- /dev/null
+++ b/collaborativeagents/scripts/run.py
@@ -0,0 +1,504 @@
+import argparse
+import json
+import os
+from tqdm import tqdm
+from 
concurrent.futures import ThreadPoolExecutor, as_completed + +from collaborativeagents.conversation_generator import ConversationGenerator +from collaborativeagents.conversation_evaluator import ConversationEvaluator +from collaborativeagents.datasets import datasets_info +from collaborativeagents.agents import CollaboratorAgent,UserAgent +from collaborativeagents.prompts import agent_system_prompt_no_user + +# import litellm +# litellm._turn_on_debug() + + +def load_dataset(dataset_name, eval_size, training=False): + if dataset_name not in datasets_info: + raise ValueError(f"Dataset '{dataset_name}' not found. Available datasets: {list(datasets_info.keys())}") + + dataset_class,user_task_description = datasets_info[dataset_name]['class'],datasets_info[dataset_name]['task_description'] + dataset_instance = dataset_class(eval_size=eval_size, training=training) + dataset = dataset_instance.get_dataset() + print(f"Loaded {len(dataset)} samples from {dataset_name}") + + return dataset,user_task_description + +def load_user_profiles(training=False): + if training: + with open("/shared/storage-01/users/mehri2/mem/collaborativeagents/collaborativeagents/user_profiles/training_user_profiles.json", 'r') as f: + user_profiles = json.load(f) + else: + with open("/shared/storage-01/users/mehri2/mem/collaborativeagents/collaborativeagents/user_profiles/user_profiles.json", 'r') as f: + user_profiles = json.load(f) + return user_profiles + + +def run_no_user( + dataset_name="math-hard", + eval_size=20, + batch_size=50, + collaborator_model_name="gpt-4.1-mini", + collaborator_api_base=None, + collaborator_api_key=None, + judge_model_name="gpt-4.1-mini", + judge_api_base=None, + judge_api_key=None, + output_file=None + ): + if os.path.exists(output_file): + with open(output_file, 'r') as f: + evaluation_results = [] + for line in f: + if line.strip() == "": + continue + evaluation_result = json.loads(line) + evaluation_results.append(evaluation_result) + + print(f"\n\n\nAll 
conversations generation and evaluation complete!") + print(f" # Total conversations: {len(evaluation_results)}") + print("\nEvaluation Results:") + print(f" # Overall average accuracy: {evaluation_results[0]['average_accuracy']}") + print(f" # Overall average conversation length (# messages): {evaluation_results[0]['average_conversation_length']}") + return + + dataset,_ = load_dataset(dataset_name, eval_size) + + collaborator_agent = CollaboratorAgent( + model_name=collaborator_model_name, + api_base=collaborator_api_base, + api_key=collaborator_api_key, + ) + conversationEvaluator = ConversationEvaluator( + dataset_name=dataset_name, + model_name=judge_model_name, + api_base=judge_api_base, + api_key=judge_api_key + ) + + # Generate and evaluate conversations + print(f"\n\n\nGenerating answers for {len(dataset)} {dataset_name} samples\n") + generated_conversations = [] + + total_batches = (len(dataset) + batch_size - 1) // batch_size + with tqdm(total=total_batches, desc="Generating conversations") as progress_bar: + for i in range(0, len(dataset), batch_size): + batch_samples = dataset[i:i+batch_size] + # Prepare conversations for the collaborator + batch_conversations = [[{"role": "user", "content": s['problem']} ] for s in batch_samples] + + # Batched collaborator responses + collab_responses = collaborator_agent.generate_collaborator_responses_batch(batch_conversations) + + # Assemble results + for sample, conv, collab_response in zip(batch_samples, batch_conversations, collab_responses): + if collab_response is None: + # Skip failed items; they will be counted downstream if needed + continue + conv.append({"role": "assistant", "content": str(collab_response["response"])}) + + # Add draft_answer key for evaluator compatibility + collab_response["draft_answer"] = collab_response["response"] + full_conversation_log = [collab_response] + + res = { + "sample": sample, + "conversation": conv, + "full_conversation_log": full_conversation_log + } + 
generated_conversations.append(res) + + progress_bar.update(1) + + evaluation_results = conversationEvaluator.evaluate_conversations(generated_conversations) + + with open(output_file, 'a') as f: + f.write(json.dumps(evaluation_results) + "\n") + f.flush() + + print(f"\n\n\nAll conversations generation and evaluation complete!") + print(f" # Total conversations: {len(generated_conversations)}") + print("\nEvaluation Results:") + print(f" # Overall average accuracy: {evaluation_results['average_accuracy']}") + print(f" # Overall average conversation length (# messages): {evaluation_results['average_conversation_length']}") + +def run_user_no_profile( + dataset_name="math-hard", + eval_size=20, + max_turns=10, + batch_size=100, + user_model_name="gpt-4.1-mini", + user_api_base=None, + user_api_key=None, + collaborator_model_name="gpt-4.1-mini", + collaborator_api_base=None, + collaborator_api_key=None, + judge_model_name="gpt-4.1-mini", + judge_api_base=None, + judge_api_key=None, + output_file=None + ): + if os.path.exists(output_file): + with open(output_file, 'r') as f: + evaluation_results = [] + for line in f: + if line.strip() == "": + continue + evaluation_result = json.loads(line) + evaluation_results.append(evaluation_result) + + print(f"\n\n\nAll conversations generation and evaluation complete!") + print(f" # Total conversations: {len(evaluation_results)}") + print("\nEvaluation Results:") + print(f" # Overall average accuracy: {evaluation_results[0]['average_accuracy']}") + print(f" # Overall average conversation length (# messages): {evaluation_results[0]['average_conversation_length']}") + return + + dataset,user_task_description = load_dataset(dataset_name, eval_size) + + # Generate conversations + generated_conversations = [] + + print(f"\n\n\nStarting generation conversations for user no preferences\n") + + conversationGenerator = ConversationGenerator( + user_task_description=user_task_description, + user_persona=None, + user_preferences=None, + 
max_turns=max_turns, + agent_with_user_preferences=False, + batch_size=batch_size, + user_model_name=user_model_name, + user_api_base=user_api_base, + user_api_key=user_api_key, + collaborator_model_name=collaborator_model_name, + collaborator_api_base=collaborator_api_base, + collaborator_api_key=collaborator_api_key + ) + generated_conversations = conversationGenerator.generate_conversations_parallel(dataset) + + conversationEvaluator = ConversationEvaluator( + dataset_name=dataset_name, + model_name=judge_model_name, + api_base=judge_api_base, + api_key=judge_api_key + ) + evaluation_results = conversationEvaluator.evaluate_conversations(generated_conversations) + + with open(output_file, 'a') as f: + f.write(json.dumps(evaluation_results) + "\n") + f.flush() + + print(f"\n\n\nAll conversations generation and evaluation complete!") + print(f" # Total conversations: {len(generated_conversations)}") + print("\nEvaluation Results:") + print(f" # Overall average accuracy: {evaluation_results['average_accuracy']}") + print(f" # Overall average conversation length (# messages): {evaluation_results['average_conversation_length']}") + +def run_user_profiles( + dataset_name="math-hard", + training=False, + user_profiles=None, + user_with_preferences=False, + agent_with_user_preferences=False, + agent_with_reflection=False, + with_scaffolding=False, + with_proper_scaffolding=False, + eval_size=20, + max_turns=10, + batch_size=100, + user_model_name="gpt-4.1-mini", + user_api_base=None, + user_api_key=None, + collaborator_model_name="gpt-4.1-mini", + collaborator_api_base=None, + collaborator_api_key=None, + judge_model_name="gpt-4.1-mini", + judge_api_base=None, + judge_api_key=None, + output_file=None + ): + dataset,user_task_description = load_dataset(dataset_name, eval_size, training=training) + + generated_user_sessions = [] + if os.path.exists(output_file): + with open(output_file, 'r') as f: + seen_users = set() + for line in f: + if line.strip() == "": + continue + 
curr_result = json.loads(line) + seen_users.add(curr_result["i"]) + generated_user_sessions.append(curr_result) + user_profiles = [user_profile_elem for user_profile_elem in user_profiles if user_profile_elem["i"] not in seen_users] + + def generate_and_evaluate_single_user_profile(user_profile_elem): + user_profile_i = user_profile_elem["i"] + user_persona = user_profile_elem["persona"] + if user_with_preferences: + user_preferences = "\n".join([f"{i+1}. {pref}" for i, pref in enumerate(user_profile_elem["preferences"])]) + else: + user_preferences = None + + # Generate conversations + if agent_with_reflection: + print(f"Starting generation conversation sessions for User {user_profile_i}") + conversationGenerator = ConversationGenerator( + user_task_description=user_task_description, + user_persona=user_persona, + user_preferences=user_preferences, + agent_with_user_preferences=agent_with_user_preferences, + max_turns=max_turns, + with_scaffolding=with_scaffolding, + with_proper_scaffolding=with_proper_scaffolding, + batch_size=batch_size, + user_model_name=user_model_name, + user_api_base=user_api_base, + user_api_key=user_api_key, + collaborator_model_name=collaborator_model_name, + collaborator_api_base=collaborator_api_base, + collaborator_api_key=collaborator_api_key + ) + generated_conversations = conversationGenerator.generate_conversations_with_reflective_agent(dataset, training=training) + print(f"Finished generation conversation sessions for User {user_profile_i}") + print(f" # succeeded user conversation sessions: {len(generated_conversations)}") + print(f" # failed user conversation sessions: {len(dataset) - len(generated_conversations)}") + else: + print(f"Starting generation conversation sessions for User {user_profile_i}") + conversationGenerator = ConversationGenerator( + user_task_description=user_task_description, + user_persona=user_persona, + user_preferences=user_preferences, + agent_with_user_preferences=agent_with_user_preferences, + 
max_turns=max_turns, + batch_size=batch_size, + user_model_name=user_model_name, + user_api_base=user_api_base, + user_api_key=user_api_key, + collaborator_model_name=collaborator_model_name, + collaborator_api_base=collaborator_api_base, + collaborator_api_key=collaborator_api_key + ) + generated_conversations = conversationGenerator.generate_conversations_parallel(dataset) + print(f"Finished generation conversation sessions for User {user_profile_i}") + print(f" # succeeded user conversation sessions: {len(generated_conversations)}") + print(f" # failed user conversation sessions: {len(dataset) - len(generated_conversations)}") + + # Evaluate conversations + conversationEvaluator = ConversationEvaluator( + dataset_name=dataset_name, + model_name=judge_model_name, + api_base=judge_api_base, + api_key=judge_api_key + ) + evaluation_results = conversationEvaluator.evaluate_conversations(generated_conversations) + user_profile_elem["generated_conversations"] = generated_conversations + user_profile_elem["evaluation"] = evaluation_results + + return user_profile_elem + + + with open(output_file, 'a') as f: + with tqdm(total=len(user_profiles), desc="Processing user profiles") as progress_bar: + for i in range(0, len(user_profiles), batch_size): + batch = user_profiles[i:i+batch_size] + + with ThreadPoolExecutor(max_workers=min(batch_size, len(batch))) as executor: + futures_to_profile = { + executor.submit(generate_and_evaluate_single_user_profile, user_profile_elem): user_profile_elem + for user_profile_elem in batch + } + + for future in as_completed(futures_to_profile): + curr_result = future.result() + generated_user_sessions.append(curr_result) + + f.write(json.dumps(curr_result) + "\n") + f.flush() + + progress_bar.update(1) + + # Aggregate evaluation results from all user sessions + avg_accuracy = sum([user_session['evaluation']['average_accuracy'] for user_session in generated_user_sessions]) / len(generated_user_sessions) + avg_length = 
sum([user_session['evaluation']['average_conversation_length'] for user_session in generated_user_sessions]) / len(generated_user_sessions) + + num_enforced_preferences_per_conversation = [] + for generated_user_session in generated_user_sessions: + for generated_conversation in generated_user_session['generated_conversations']: + curr_num_enforced_preferences = 0 + for message in generated_conversation['full_conversation_log']: + if 'enforce_preferences' in message: + if message["enforce_preferences"] == True or message["enforce_preferences"] == "True": + curr_num_enforced_preferences += 1 + num_enforced_preferences_per_conversation.append(curr_num_enforced_preferences) + + print(f"\n\n\nAll user profiles generation and evaluation complete!") + print(f" # Total user profiles processed: {len(generated_user_sessions)}") + print(f" # Total conversations: {sum([len(user_session['generated_conversations']) for user_session in generated_user_sessions])}") + print("\nEvaluation Results:") + print(f" # Overall average accuracy: {avg_accuracy}") + print(f" # Overall average conversation length (# messages): {avg_length}") + print(f" # Overall average number of enforced preferences: {sum(num_enforced_preferences_per_conversation) / len(num_enforced_preferences_per_conversation)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--experiment_type", type=str) + parser.add_argument("--dataset", type=str) + parser.add_argument("--eval_size", type=int) + parser.add_argument("--output_file", type=str) + parser.add_argument("--max_turns", type=int) + parser.add_argument("--batch_size", type=int) + parser.add_argument("--user_model_name", type=str) + parser.add_argument("--user_api_base", type=str) + parser.add_argument("--user_api_key", type=str) + parser.add_argument("--collaborator_model_name", type=str) + parser.add_argument("--collaborator_api_base", type=str) + parser.add_argument("--collaborator_api_key", type=str) + 
parser.add_argument("--judge_model_name", type=str) + parser.add_argument("--judge_api_base", type=str) + parser.add_argument("--judge_api_key", type=str) + args = parser.parse_args() + + if args.experiment_type == "no_user": + run_no_user( + dataset_name=args.dataset, + eval_size=args.eval_size, + batch_size=args.batch_size, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif args.experiment_type == "user_no_profile": + run_user_no_profile( + dataset_name=args.dataset, + eval_size=args.eval_size, + max_turns=args.max_turns, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif args.experiment_type == "user_profiles_without_preferences": + user_profiles = load_user_profiles() + run_user_profiles( + dataset_name=args.dataset, + training=False, + user_profiles=user_profiles, + user_with_preferences=False, + agent_with_reflection=False, + eval_size=args.eval_size, + max_turns=args.max_turns, + batch_size=args.batch_size, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif args.experiment_type == 
"user_profiles_with_preferences": + user_profiles = load_user_profiles() + run_user_profiles( + dataset_name=args.dataset, + training=False, + user_profiles=user_profiles, + user_with_preferences=True, + agent_with_reflection=False, + eval_size=args.eval_size, + max_turns=args.max_turns, + batch_size=args.batch_size, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif args.experiment_type == "agent_with_user_preferences": + user_profiles = load_user_profiles() + run_user_profiles( + dataset_name=args.dataset, + training=False, + user_profiles=user_profiles, + user_with_preferences=True, + agent_with_user_preferences=True, + agent_with_reflection=False, + eval_size=args.eval_size, + max_turns=args.max_turns, + batch_size=args.batch_size, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif args.experiment_type == "agent_with_reflection": + user_profiles = load_user_profiles() + run_user_profiles( + dataset_name=args.dataset, + training=False, + user_profiles=user_profiles, + user_with_preferences=True, + agent_with_user_preferences=True, + agent_with_reflection=True, + eval_size=args.eval_size, + max_turns=args.max_turns, + batch_size=args.batch_size, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + 
collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif args.experiment_type == "agent_with_reflection_and_scaffolding": + user_profiles = load_user_profiles() + run_user_profiles( + dataset_name=args.dataset, + training=False, + user_profiles=user_profiles, + user_with_preferences=True, + agent_with_user_preferences=True, + agent_with_reflection=True, + with_scaffolding=True, + eval_size=args.eval_size, + max_turns=args.max_turns, + batch_size=args.batch_size, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif args.experiment_type == "agent_with_reflection_and_proper_scaffolding": + user_profiles = load_user_profiles() + run_user_profiles( + dataset_name=args.dataset, + training=False, + user_profiles=user_profiles, + user_with_preferences=True, + agent_with_user_preferences=True, + agent_with_reflection=True, + with_scaffolding=True, + with_proper_scaffolding=True, + eval_size=args.eval_size, + max_turns=args.max_turns, + batch_size=args.batch_size, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + elif 
args.experiment_type == "training_data_with_user_profiles_with_preferences": + user_profiles = load_user_profiles(training=True) + run_user_profiles( + dataset_name=args.dataset, + training=True, + user_profiles=user_profiles, + user_with_preferences=True, + agent_with_user_preferences=True, + agent_with_reflection=True, + eval_size=args.eval_size, + max_turns=args.max_turns, + batch_size=args.batch_size, + user_model_name=args.user_model_name, user_api_base=args.user_api_base, user_api_key=args.user_api_key, + collaborator_model_name=args.collaborator_model_name, collaborator_api_base=args.collaborator_api_base, collaborator_api_key=args.collaborator_api_key, + judge_model_name=args.judge_model_name, judge_api_base=args.judge_api_base, judge_api_key=args.judge_api_key, + output_file=args.output_file + ) + else: + raise ValueError(f"Invalid experiment type: {args.experiment_type}") \ No newline at end of file diff --git a/collaborativeagents/scripts/run.sh b/collaborativeagents/scripts/run.sh new file mode 100644 index 0000000..87d9234 --- /dev/null +++ b/collaborativeagents/scripts/run.sh @@ -0,0 +1,98 @@ +# vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8004 --tensor-parallel-size 4 --max-model-len 16384 --gpu-memory-utilization 0.9 +# python -m sglang.launch_server --model-path meta-llama/Llama-3.3-70B-Instruct --port 8004 --tp-size 4 --context-length 16384 +# python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 8003 --tp-size 4 --context-length 16384 +# python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --port 8003 --tp-size 4 --context-length 16384 + + +# SFT Models +# python -m sglang.launch_server --model-path /shared/storage-01/users/mehri2/LLaMA-Factory/saves/llama-3.1-8b-instruct/full/sft_session_level_reflection/checkpoint-628 --served-model-name meta-llama/Llama-3.1-8B-Instruct --port 8003 --tp-size 4 --context-length 16384 + +# python -m sglang.launch_server --model-path 
/shared/storage-01/users/mehri2/LLaMA-Factory/saves/qwen2.5-7b/full/sft_session_level_reflection/checkpoint-628 --served-model-name Qwen/Qwen2.5-7B-Instruct --port 8003 --tp-size 4 --context-length 16384 + +# GRPO Models + +# python -m verl.model_merger merge \ +# --backend fsdp \ +# --local_dir /shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/results/v3/global_step_200/actor \ +# --target_dir /shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/results/v3/global_step_200_merged_hf + +# python -m sglang.launch_server --model-path /shared/storage-01/users/mehri2/mem/collaborativeagents/training/grpo_verl/results/v3/global_step_200_merged_hf --served-model-name meta-llama/Llama-3.1-8B-Instruct --port 8003 --tp-size 4 --context-length 16384 + + + +BATCH_SIZE=100 +BATCH_SIZE=50 + +# Loop over eval sizes and datasets +for EVAL_SIZE in 20; do + for DATASET in math-hard math-500 logiqa mmlu medqa; do # humaneval bigcodebench + # Convert dataset name for file paths (replace - with _) + DATASET_FILE=$(echo ${DATASET} | tr '-' '_') + + echo "Running experiments for dataset: ${DATASET} with eval_size ${EVAL_SIZE}" + + # # no_user experiment + # python3 run.py --experiment_type no_user --dataset ${DATASET} --eval_size ${EVAL_SIZE} --batch_size ${BATCH_SIZE} \ + # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/no_user/${DATASET_FILE}_llama70b_user_llama8b_agent_no_user_eval_size_${EVAL_SIZE}.jsonl \ + # >> ./runs/llama70b_temp_1_llama8b/no_user/${DATASET_FILE}_llama70b_user_llama8b_agent_no_user_eval_size_${EVAL_SIZE}.out 2>&1 + + # # user_no_profile experiment + # python3 run.py 
--experiment_type user_no_profile --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/user_no_profile/${DATASET_FILE}_llama70b_user_llama8b_agent_user_no_profile_eval_size_${EVAL_SIZE}.jsonl \ + # >> ./runs/llama70b_temp_1_llama8b/user_no_profile/${DATASET_FILE}_llama70b_user_llama8b_agent_user_no_profile_eval_size_${EVAL_SIZE}.out 2>&1 + + # # user_profiles_without_preferences experiment + # python3 run.py --experiment_type user_profiles_without_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.jsonl \ + # >> ./runs/llama70b_temp_1_llama8b/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.out 2>&1 + + # 
user_profiles_with_preferences experiment + python3 run.py --experiment_type user_profiles_with_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b_grpo_v3_ckpt200/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.jsonl \ + >> /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b_grpo_v3_ckpt200/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.out 2>&1 + + # # agent_with_user_preferences experiment + # python3 run.py --experiment_type agent_with_user_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.jsonl \ + 
# >> ./runs/llama70b_temp_1_llama8b/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.out 2>&1 + + # # agent_with_reflection experiment + # python3 run.py --experiment_type agent_with_reflection --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + # --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/agent_with_reflection_v3/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.jsonl \ + # >> ./runs/llama70b_temp_1_llama8b/agent_with_reflection_v3/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.out 2>&1 + + # # agent_with_reflection_and_scaffolding + # python3 run.py --experiment_type agent_with_reflection_and_scaffolding --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + # --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + # --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + # --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + # --output_file 
/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b/agent_with_reflection_and_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_scaffolding_eval_size_${EVAL_SIZE}.jsonl \ + # >> ./runs/llama70b_temp_1_llama8b/agent_with_reflection_and_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_scaffolding_eval_size_${EVAL_SIZE}.out 2>&1 + + # agent_with_reflection_and_proper_scaffolding + python3 run.py --experiment_type agent_with_reflection_and_proper_scaffolding --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + --collaborator_model_name hosted_vllm/meta-llama/Llama-3.1-8B-Instruct --collaborator_api_base http://localhost:8003/v1 --collaborator_api_key EMPTY \ + --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b_grpo_v3_ckpt200/agent_with_reflection_and_proper_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_proper_scaffolding_eval_size_${EVAL_SIZE}.jsonl \ + >> /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1_llama8b_grpo_v3_ckpt200/agent_with_reflection_and_proper_scaffolding/${DATASET_FILE}_llama70b_user_llama8b_agent_agent_with_reflection_and_proper_scaffolding_eval_size_${EVAL_SIZE}.out 2>&1 + + done +done diff --git a/collaborativeagents/scripts/run_baseline_comparison.py b/collaborativeagents/scripts/run_baseline_comparison.py new file mode 100644 index 0000000..0bdbcb5 --- /dev/null +++ b/collaborativeagents/scripts/run_baseline_comparison.py @@ -0,0 +1,608 @@ +""" +Run baseline comparison experiments for personalization methods. 
+ +Baselines: +1. Vanilla - No memory +2. Contextual Memory - Full history in context (summarize if exceeds limit) +3. Reflection Memory - CollaborativeAgents' agent_notes approach +4. Reflection + GRPO - Trained version of reflection +5. All Memory Cards in Context - Extract all, no retrieval +6. Extractor + RAG - Retrieval without user vector +7. Extractor + RAG + User Vector - Full personalization + +Metrics: +- Task Accuracy +- User Effort (user token count) +- Total Efficiency (all tokens) +- Conflict Resolution Accuracy (new) +- User Vector Similarity to Ground Truth (new) +""" + +import json +import time +from pathlib import Path +from dataclasses import dataclass, field, asdict +from typing import Optional, Callable +from abc import ABC, abstractmethod +import numpy as np + +# ============================================================================ +# Metrics +# ============================================================================ + +@dataclass +class ConversationMetrics: + """Metrics for a single conversation.""" + task_accuracy: float # 0 or 1 for correct answer + user_tokens: int # Total tokens from user messages + assistant_tokens: int # Total tokens from assistant messages + total_tokens: int # All tokens + num_turns: int # Number of conversation turns + num_preference_enforcements: int # How many times user enforced preferences + conflict_resolution_correct: Optional[bool] = None # If conflict test, was it resolved correctly? 
@dataclass
class ConversationMetrics:
    """Metrics for a single conversation.

    All token counts are rough estimates produced by
    ``BaselineMethod.count_tokens`` (whitespace-word heuristic).
    """
    task_accuracy: float  # 0 or 1 for correct answer
    user_tokens: int  # Total tokens from user messages
    assistant_tokens: int  # Total tokens from assistant messages
    total_tokens: int  # All tokens
    num_turns: int  # Number of conversation turns
    num_preference_enforcements: int  # How many times user enforced preferences
    conflict_resolution_correct: Optional[bool] = None  # Only set for conflict tests
    latency_seconds: float = 0.0  # Wall-clock duration of the conversation

    @property
    def user_effort(self) -> int:
        """User effort = user tokens (lower is better)."""
        return self.user_tokens

    @property
    def efficiency(self) -> float:
        """Efficiency = accuracy / total_tokens * 1000 (higher is better).

        Returns 0.0 for an empty conversation to avoid division by zero.
        """
        if self.total_tokens == 0:
            return 0.0
        return self.task_accuracy / self.total_tokens * 1000


@dataclass
class ExperimentResults:
    """Aggregated results for an experiment: per-metric value lists per baseline."""
    baseline_name: str  # Name of the baseline these results belong to
    num_conversations: int  # Running count, incremented by the caller
    metrics: dict = field(default_factory=dict)  # metric name -> list of values

    def add_conversation(self, conv_metrics: ConversationMetrics):
        """Append one conversation's measurements to the running metric lists."""
        for key in ['task_accuracy', 'user_tokens', 'assistant_tokens',
                    'total_tokens', 'num_turns', 'num_preference_enforcements']:
            if key not in self.metrics:
                self.metrics[key] = []
            self.metrics[key].append(getattr(conv_metrics, key))

        # Conflict resolution is optional: only conflict-test conversations set it.
        if conv_metrics.conflict_resolution_correct is not None:
            if 'conflict_resolution_correct' not in self.metrics:
                self.metrics['conflict_resolution_correct'] = []
            self.metrics['conflict_resolution_correct'].append(
                1.0 if conv_metrics.conflict_resolution_correct else 0.0
            )

    def summary(self) -> dict:
        """Compute mean/std summary statistics for every recorded metric."""
        summary = {"baseline": self.baseline_name, "n": self.num_conversations}
        for key, values in self.metrics.items():
            if values:
                # Cast numpy scalars to builtin float so the summary stays
                # JSON-serializable regardless of numpy version/dtype.
                summary[f"{key}_mean"] = float(np.mean(values))
                summary[f"{key}_std"] = float(np.std(values))
        return summary


# ============================================================================
# Baseline Implementations (Abstract)
# ============================================================================

class BaselineMethod(ABC):
    """Abstract base class for all baseline memory/personalization methods."""

    def __init__(self, name: str, config: dict = None):
        self.name = name
        self.config = config or {}

    @abstractmethod
    def initialize_session(self, user_id: str, user_profile: dict):
        """Initialize a new session for a user."""
        pass

    @abstractmethod
    def generate_response(self, query: str, conversation_history: list) -> str:
        """Generate a response given query and history."""
        pass

    @abstractmethod
    def update_memory(self, conversation: list, feedback: dict = None):
        """Update memory after a conversation or turn."""
        pass

    @abstractmethod
    def get_context_for_prompt(self) -> str:
        """Get the memory/context to include in prompts."""
        pass

    def count_tokens(self, text: str) -> int:
        """Estimate token count: ~1.3 tokens per whitespace-separated word.

        Fixed to truncate to ``int`` so the return value matches the declared
        annotation (the previous version returned a float such as 3.9).
        """
        return int(len(text.split()) * 1.3)


class VanillaBaseline(BaselineMethod):
    """No memory - fresh context each time."""

    def __init__(self):
        super().__init__("vanilla")

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id
        # No memory initialization needed

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Would call LLM here (not implemented in this skeleton)
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # No memory to update
        pass

    def get_context_for_prompt(self) -> str:
        return ""  # No additional context


class ContextualMemoryBaseline(BaselineMethod):
    """
    Full conversation history in context.
    Summarize when exceeds context limit.
    """

    def __init__(self, max_context_tokens: int = 32000):
        super().__init__("contextual_memory")
        self.max_context_tokens = max_context_tokens
        self.full_history = []  # Raw messages kept verbatim
        self.summarized_history = ""  # LLM summary of evicted messages

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id
        # Keep accumulated history across sessions

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Would call LLM here (not implemented in this skeleton)
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        self.full_history.extend(conversation)

        # Check if we need to summarize
        total_tokens = sum(self.count_tokens(msg['content']) for msg in self.full_history)
        if total_tokens > self.max_context_tokens:
            self._summarize_old_history()

    def _summarize_old_history(self):
        """Summarize older parts of history to fit context.

        Keeps recent turns verbatim and drops the rest; this is where
        information loss happens for this baseline.
        """
        keep_recent = 10  # Keep last 10 turns verbatim
        to_summarize = self.full_history[:-keep_recent]
        recent = self.full_history[-keep_recent:]

        # Would call LLM to summarize here
        # self.summarized_history = summarize_with_llm(to_summarize)
        self.full_history = recent

    def get_context_for_prompt(self) -> str:
        context = ""
        if self.summarized_history:
            context += f"Previous conversation summary:\n{self.summarized_history}\n\n"
        context += "Recent conversation:\n"
        for msg in self.full_history[-20:]:  # Last 20 messages
            context += f"{msg['role']}: {msg['content']}\n"
        return context


class ReflectionMemoryBaseline(BaselineMethod):
    """
    CollaborativeAgents' approach: maintain agent_notes that are
    updated after each conversation via reflection.
    """

    def __init__(self):
        super().__init__("reflection_memory")
        self.agent_notes = {}  # user_id -> free-form notes string

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id
        if user_id not in self.agent_notes:
            self.agent_notes[user_id] = ""

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Would call LLM here (not implemented in this skeleton)
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # After conversation, reflect and update notes
        # This is their update_agent_notes_prompt approach
        pass

    def get_context_for_prompt(self) -> str:
        return f"Notes about this user:\n{self.agent_notes.get(self.user_id, '')}"


class AllMemoryCardsBaseline(BaselineMethod):
    """
    Extract preferences into memory cards, but put ALL in context.
    No retrieval - just dump everything.
    """

    def __init__(self, max_cards_in_context: int = 100):
        super().__init__("all_memory_cards")
        self.max_cards = max_cards_in_context
        self.memory_cards = {}  # user_id -> list of cards

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id
        if user_id not in self.memory_cards:
            self.memory_cards[user_id] = []

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Would call LLM here (not implemented in this skeleton)
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Extract preferences from conversation and add to cards
        # Would use preference_extractor here
        pass

    def get_context_for_prompt(self) -> str:
        cards = self.memory_cards.get(self.user_id, [])
        if not cards:
            return ""

        # Just dump all cards - this is the weakness!
        context = "User preferences (all known):\n"
        for i, card in enumerate(cards[:self.max_cards]):
            context += f"{i+1}. When {card['condition']}: {card['action']}\n"
        return context


class ExtractorRAGBaseline(BaselineMethod):
    """
    Extract preferences + RAG retrieval.
    No user vector - just relevance-based retrieval.
    """

    def __init__(self, top_k: int = 5):
        super().__init__("extractor_rag")
        self.top_k = top_k
        self.memory_store = None  # Would be vector store

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Would call LLM here (not implemented in this skeleton)
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Extract and store in vector DB
        pass

    def get_context_for_prompt(self) -> str:
        # Would retrieve relevant memories here
        return "Retrieved preferences:\n..."


class ExtractorRAGUserVectorBaseline(BaselineMethod):
    """
    Full method: Extract + RAG + User Vector for personalized retrieval.
    """

    def __init__(self, top_k: int = 5):
        super().__init__("extractor_rag_user_vector")
        self.top_k = top_k
        # Would integrate with your PersonalizedLLM

    def initialize_session(self, user_id: str, user_profile: dict):
        self.user_id = user_id

    def generate_response(self, query: str, conversation_history: list) -> str:
        # Would call LLM here (not implemented in this skeleton)
        pass

    def update_memory(self, conversation: list, feedback: dict = None):
        # Extract, store, and update user vector via REINFORCE
        pass

    def get_context_for_prompt(self) -> str:
        # Would use policy-based retrieval here
        return "Retrieved preferences (personalized):\n..."
# ============================================================================
# Experiment Runner
# ============================================================================

@dataclass
class ExperimentConfig:
    """Configuration for an experiment run."""
    baselines: list  # List of baseline names to run
    dataset: str  # Dataset to use
    num_sessions: int = 10  # Sessions per user
    num_users: int = 20  # Number of user profiles
    max_turns_per_session: int = 15
    profile_path: str = "collaborativeagents/data/complex_profiles/profiles.jsonl"
    output_dir: str = "collaborativeagents/results"
    include_conflict_tests: bool = True
    seed: int = 42


class ExperimentRunner:
    """Drives the baseline comparison: loads data, runs sessions, aggregates results."""

    # Registry mapping config names to baseline implementations.
    BASELINE_CLASSES = {
        "vanilla": VanillaBaseline,
        "contextual_memory": ContextualMemoryBaseline,
        "reflection_memory": ReflectionMemoryBaseline,
        "all_memory_cards": AllMemoryCardsBaseline,
        "extractor_rag": ExtractorRAGBaseline,
        "extractor_rag_user_vector": ExtractorRAGUserVectorBaseline,
    }

    def __init__(self, config: ExperimentConfig):
        self.config = config
        self.results = {}  # baseline name -> ExperimentResults

    def load_profiles(self) -> list:
        """Read user profiles (JSONL, one profile per line), keep first num_users."""
        with open(self.config.profile_path) as fh:
            loaded = [json.loads(row) for row in fh]
        return loaded[:self.config.num_users]

    def load_dataset(self) -> list:
        """Load evaluation dataset (placeholder, not wired up yet)."""
        # Would load from collaborativeagents datasets
        pass

    def run_single_conversation(
        self,
        baseline: BaselineMethod,
        user_profile: dict,
        problem: dict,
        session_num: int
    ) -> ConversationMetrics:
        """Run a single conversation and collect metrics (loop body still stubbed)."""
        baseline.initialize_session(user_profile['user_id'], user_profile)

        transcript = []
        tokens_from_user = 0
        tokens_from_assistant = 0
        enforcement_count = 0

        # Simulate conversation
        # In practice, would use UserAgent and actual LLM calls
        started = time.time()

        # ... conversation loop ...

        elapsed = time.time() - started

        return ConversationMetrics(
            task_accuracy=0.0,  # Would evaluate
            user_tokens=tokens_from_user,
            assistant_tokens=tokens_from_assistant,
            total_tokens=tokens_from_user + tokens_from_assistant,
            num_turns=len(transcript) // 2,
            num_preference_enforcements=enforcement_count,
            latency_seconds=elapsed
        )

    def run_conflict_test(
        self,
        baseline: BaselineMethod,
        user_profile: dict,
        conflict_test: dict
    ) -> bool:
        """Test if baseline correctly resolves a preference conflict."""
        baseline.initialize_session(user_profile['user_id'], user_profile)

        # Generate a response to the deliberately conflicting query.
        prompt = conflict_test['query']
        reply = baseline.generate_response(prompt, [])

        # Which preference should win; response analysis is still a stub.
        expected_pref = conflict_test['correct_pref_id']
        # Would analyze response to check which preference was followed

        return False  # Placeholder

    def run_experiment(self):
        """Run full experiment across all baselines."""
        profiles = self.load_profiles()
        dataset = self.load_dataset()

        for baseline_name in self.config.baselines:
            banner = '=' * 60
            print(f"\n{banner}")
            print(f"Running baseline: {baseline_name}")
            print(f"{banner}")

            baseline = self.BASELINE_CLASSES[baseline_name]()

            results = ExperimentResults(
                baseline_name=baseline_name,
                num_conversations=0
            )

            for profile in profiles:
                print(f"\nUser: {profile['user_id']}")

                # Run multiple sessions, three problems per session, in order.
                for session_idx in range(self.config.num_sessions):
                    for problem in dataset[session_idx * 3:(session_idx + 1) * 3]:
                        conv = self.run_single_conversation(
                            baseline, profile, problem, session_idx
                        )
                        results.add_conversation(conv)
                        results.num_conversations += 1

                # Run conflict tests
                if self.config.include_conflict_tests:
                    for conflict_test in profile.get('conflict_tests', []):
                        resolved = self.run_conflict_test(
                            baseline, profile, conflict_test
                        )
                        # Would add to results

            self.results[baseline_name] = results

        return self.results

    def compute_user_vector_similarity(
        self,
        learned_vector: np.ndarray,
        ground_truth_profile: dict
    ) -> float:
        """
        Compute similarity between learned user vector and ground truth.

        Ground truth is derived from the preference profile:
        - One-hot encode preference categories
        - Weight by how often each preference was triggered
        """
        # Create ground truth vector from profile (not implemented yet).
        pass

    def save_results(self):
        """Write summary.json plus one detailed JSON file per baseline."""
        output_dir = Path(self.config.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Summary table: one row per baseline.
        overview = [res.summary() for res in self.results.values()]
        with open(output_dir / "summary.json", 'w') as f:
            json.dump(overview, f, indent=2)

        # Detailed results
        for name, res in self.results.items():
            with open(output_dir / f"{name}_detailed.json", 'w') as f:
                json.dump(asdict(res), f, indent=2)

        print(f"\nResults saved to {output_dir}")

    def print_comparison_table(self):
        """Print a fixed-width comparison table of all baselines."""
        print("\n" + "=" * 80)
        print("BASELINE COMPARISON RESULTS")
        print("=" * 80)

        layout = "{:<30} {:>10} {:>12} {:>14} {:>12}"
        print(layout.format("Baseline", "Accuracy", "User Effort",
                            "Total Tokens", "Conflict Acc"))
        print("-" * 80)

        for name, res in self.results.items():
            stats = res.summary()
            print(layout.format(
                name,
                f"{stats.get('task_accuracy_mean', 0):.3f}",
                f"{stats.get('user_tokens_mean', 0):.0f}",
                f"{stats.get('total_tokens_mean', 0):.0f}",
                f"{stats.get('conflict_resolution_correct_mean', 0):.3f}"
            ))


# ============================================================================
# Analysis Functions
# ============================================================================

def analyze_context_overflow(results: dict) -> dict:
    """
    Analyze how methods degrade as context grows.

    Returns degradation curves for each method (currently empty placeholders).
    """
    curves = {}
    for baseline_name, baseline_results in results.items():
        # Group by session number; accuracy-degradation analysis is a stub.
        per_session = {}
        curves[baseline_name] = per_session
    return curves


def analyze_conflict_resolution(results: dict, conflict_tests: list) -> dict:
    """
    Analyze conflict resolution accuracy by conflict type.

    Returns {baseline: {conflict_group: accuracy}}; accuracies are stubbed at 0.0.
    """
    breakdown = {}
    for group in {t['conflict_group'] for t in conflict_tests}:
        # Tests belonging to this conflict group (per-type accuracy still a stub).
        group_tests = [t for t in conflict_tests if t['conflict_group'] == group]
        for baseline_name in results:
            breakdown.setdefault(baseline_name, {})[group] = 0.0
    return breakdown


def analyze_user_vector_quality(
    learned_vectors: dict,
    ground_truth_profiles: list
) -> dict:
    """
    Analyze how well user vectors capture user identity.

    Tests:
    1. Same user across sessions -> high similarity
    2. Different users -> low similarity
    3. Users with similar preferences -> moderate similarity
    """
    report = {
        "intra_user_similarity": [],  # Same user, different sessions
        "inter_user_similarity": [],  # Different users
        "preference_cluster_quality": 0.0  # How well vectors cluster by preference
    }
    # Similarity computation not implemented yet.
    return report
Users with similar preferences -> moderate similarity + """ + analysis = { + "intra_user_similarity": [], # Same user, different sessions + "inter_user_similarity": [], # Different users + "preference_cluster_quality": 0.0 # How well vectors cluster by preference + } + + # Would compute similarities + return analysis + + +# ============================================================================ +# Main +# ============================================================================ + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--baselines", nargs="+", default=[ + "vanilla", "contextual_memory", "reflection_memory", + "all_memory_cards", "extractor_rag", "extractor_rag_user_vector" + ]) + parser.add_argument("--dataset", default="math-500") + parser.add_argument("--num_sessions", type=int, default=10) + parser.add_argument("--num_users", type=int, default=20) + parser.add_argument("--output_dir", default="collaborativeagents/results") + parser.add_argument("--seed", type=int, default=42) + + args = parser.parse_args() + + config = ExperimentConfig( + baselines=args.baselines, + dataset=args.dataset, + num_sessions=args.num_sessions, + num_users=args.num_users, + output_dir=args.output_dir, + seed=args.seed + ) + + runner = ExperimentRunner(config) + results = runner.run_experiment() + runner.print_comparison_table() + runner.save_results() diff --git a/collaborativeagents/scripts/run_debug.sh b/collaborativeagents/scripts/run_debug.sh new file mode 100644 index 0000000..1f82d70 --- /dev/null +++ b/collaborativeagents/scripts/run_debug.sh @@ -0,0 +1,24 @@ +# vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8004 --tensor-parallel-size 4 --max-model-len 16384 --gpu-memory-utilization 0.9 + + +BATCH_SIZE=100 +# BATCH_SIZE=20 + +# Loop over eval sizes and datasets +for EVAL_SIZE in 20; do + for DATASET in math-500; do + # Convert dataset name for file paths (replace - with _) + DATASET_FILE=$(echo 
${DATASET} | tr '-' '_') + + echo "Running experiments for dataset: ${DATASET} with eval_size ${EVAL_SIZE}" + + # debug experiment + python3 run.py --experiment_type debug --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \ + --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \ + --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \ + --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \ + --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/debug/${DATASET_FILE}_llama70b_user_llama70b_agent_debug_eval_size_${EVAL_SIZE}.jsonl \ + >> ./runs/llama70b/debug/${DATASET_FILE}_llama70b_user_llama70b_agent_debug_eval_size_${EVAL_SIZE}.out 2>&1 + + done +done \ No newline at end of file diff --git a/collaborativeagents/scripts/run_experiments.py b/collaborativeagents/scripts/run_experiments.py new file mode 100644 index 0000000..0ba0ba0 --- /dev/null +++ b/collaborativeagents/scripts/run_experiments.py @@ -0,0 +1,1328 @@ +#!/usr/bin/env python3 +""" +Main experiment orchestrator for personalization benchmark. + +This script runs all baselines and the proposed methods with PROPER multi-turn +conversation simulation, user preference enforcement, and LLM-based evaluation. 
+ +Usage: + python run_experiments.py --config config.yaml + python run_experiments.py --methods vanilla,rag,rag_vector --datasets gpqa,aime +""" + +import argparse +import json +import yaml +import os +import sys +from pathlib import Path +from datetime import datetime +from typing import List, Dict, Any, Optional +from dataclasses import dataclass, asdict +import logging +import re +from concurrent.futures import ThreadPoolExecutor, as_completed +import threading +import time + +# Add paths +sys.path.insert(0, str(Path(__file__).parent.parent)) +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from datasets_extended import get_dataset, get_all_datasets, get_challenging_datasets +from evaluation.llm_judge import LLMJudge, BatchEvaluator, ConversationMetrics +from conflict_scenario_generator import ConflictScenarioGenerator +from adapters.personalized_llm_adapter import PersonalizedLLMAdapter, create_baseline_adapter +from agents.local_user_agent import LocalUserAgent, SharedLocalUserAgent, TERMINATION_SIGNAL +from agents.vllm_user_agent import VLLMUserAgent, VLLMAgentClient +from agents.openai_user_agent import OpenAIUserAgent +from agents.batch_vllm_agent import BatchConversationGenerator, BatchVLLMClient + + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class ExperimentConfig: + """Configuration for an experiment run.""" + # Methods to compare + methods: List[str] + + # Datasets to use + datasets: List[str] + + # User profiles + n_profiles: int = 200 + profile_path: Optional[str] = None + + # Profile range (for splitting jobs) + start_profile: int = 0 # Inclusive, 0-indexed + end_profile: Optional[int] = None # Exclusive, None means all + + # Session settings + n_sessions_per_profile: int = 30 + max_turns_per_session: int = 15 # Increased for harder tasks + + # Model settings + user_model: str = "meta-llama/Llama-3.3-70B-Instruct" + agent_model: 
str = "meta-llama/Llama-3.1-8B-Instruct" + judge_model: str = "meta-llama/Llama-3.3-70B-Instruct" + + # Output settings + output_dir: str = "results" + save_conversations: bool = True + + # Conflict testing + conflict_ratio: float = 0.3 # proportion of queries that trigger conflicts + + # Compute settings + batch_size: int = 4 + n_gpus: int = 4 + + # vLLM settings (for high-performance inference) + use_vllm: bool = False + vllm_user_url: str = "http://localhost:8004/v1" # 70B user simulator + vllm_agent_url: str = "http://localhost:8003/v1" # 8B agent + + # OpenAI user simulator (alternative to vLLM user agent) + use_openai_user: bool = False + openai_user_model: str = "gpt-5" # Model name for OpenAI user agent + + # Reward mode: "keyword" (implicit user signals) or "llm" (GPT-5-nano judge) + # This is a global option applied to ALL methods that use RL updates + reward_mode: str = "keyword" + + # Parallel/Batch processing + parallel_profiles: int = 50 # Number of profiles to process in parallel + use_batch_processing: bool = True # Use turn-synchronous batch processing for vanilla/all_memory + batch_size_conversations: int = 50 # Number of conversations to batch together + + # Continue from existing experiment (for extending sessions) + continue_from: Optional[str] = None # Path to existing output directory to continue from + + +# Available methods +AVAILABLE_METHODS = { + "vanilla": "No memory, no personalization", + "contextual": "Full history in context, summarize when overflow", + "reflection": "CollaborativeAgents' agent_notes approach", + "reflection_grpo": "Reflection + GRPO training", + "all_memory": "All extracted memories in context (no retrieval)", + "rag": "Extractor + RAG (no user vector)", + "rag_vector": "Extractor + RAG + user vector (proposed method)", + "rag_bge": "Extractor + RAG with BGE reranker (278M)", + "rag_vector_bge": "Extractor + RAG + user vector with BGE reranker (278M)", +} + + +class ExperimentRunner: + """Main experiment runner.""" 
+ + def __init__(self, config: ExperimentConfig): + self.config = config + + # Use existing directory if continuing, otherwise create new timestamped one + if config.continue_from: + self.output_dir = Path(config.continue_from) + if not self.output_dir.exists(): + raise ValueError(f"Continue-from directory does not exist: {config.continue_from}") + logger.info(f"Continuing from existing experiment: {self.output_dir}") + else: + self.output_dir = Path(config.output_dir) / datetime.now().strftime("%Y%m%d_%H%M%S") + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Save/update config + with open(self.output_dir / "config.yaml", "w") as f: + yaml.dump(asdict(config), f) + + # Initialize components + self.judge = LLMJudge(model_name=config.judge_model) + self.batch_evaluator = BatchEvaluator(self.judge) + self.conflict_generator = ConflictScenarioGenerator() + + # Load datasets + self.datasets = {} + for ds_name in config.datasets: + try: + self.datasets[ds_name] = get_dataset(ds_name) + logger.info(f"Loaded dataset: {ds_name}") + except Exception as e: + logger.warning(f"Failed to load dataset {ds_name}: {e}") + + # Load or generate profiles + self.profiles = self._load_profiles() + + def _load_profiles(self) -> List[Dict]: + """Load user profiles from file or generate.""" + logger.info(f"Profile path configured: {self.config.profile_path}") + + if self.config.profile_path: + profile_path = Path(self.config.profile_path) + if profile_path.exists(): + profiles = [] + with open(profile_path) as f: + for line in f: + line = line.strip() + if line: + profiles.append(json.loads(line)) + logger.info(f"Loaded {len(profiles)} profiles from {self.config.profile_path}") + return profiles[:self.config.n_profiles] + else: + logger.warning(f"Profile path does not exist: {self.config.profile_path}") + + # Generate simple placeholder profiles if no file provided + logger.info(f"Generating {self.config.n_profiles} placeholder profiles...") + profiles = [] + for i in 
range(self.config.n_profiles): + profiles.append({ + "id": i, + "persona": f"User {i+1} is a curious individual seeking help with problem solving.", + "preferences": [ + "Provide clear, step-by-step explanations", + "Use simple language when possible", + "Give examples to illustrate concepts", + "Be concise but thorough", + "Acknowledge when something is uncertain" + ] + }) + + # Save generated profiles + profile_path = self.output_dir / "generated_profiles.json" + with open(profile_path, "w") as f: + json.dump(profiles, f, indent=2) + + logger.info(f"Generated and saved {len(profiles)} placeholder profiles") + return profiles + + def _create_method_adapter(self, method: str, profile: Dict, use_shared_models: bool = False) -> Any: + """Create adapter for a specific method. + + Args: + method: One of the baseline method names + profile: User profile dict (used later in start_session, not constructor) + use_shared_models: If True, share embedding/reranker models across parallel + workers. ESSENTIAL for parallel profile processing to avoid OOM. 
+ + Returns: + Configured adapter instance + """ + # Auto-detect available GPUs and set device assignment accordingly + # Layout with local 70B user (4 GPUs): + # GPU 0-1: 70B user simulator (TP=2) + # GPU 2: 8B agent vLLM server + # GPU 3: Embedding + Reranker + Extractor + # Layout with OpenAI user (2 GPUs): + # GPU 0: 8B agent vLLM server + # GPU 1: Embedding + Reranker + Extractor + device_assignment = None + try: + import torch + n_gpus = torch.cuda.device_count() + if n_gpus >= 4: + # 4 GPU layout: 70B user on 0-1, agent on 2, adapters on 3 + device_assignment = { + "embed": "cuda:3", + "reranker": "cuda:3", + "extractor": "cuda:3", + } + elif n_gpus >= 2: + # 2 GPU layout: agent on 0, adapters on 1 + device_assignment = { + "embed": "cuda:1", + "reranker": "cuda:1", + "extractor": "cuda:1", + } + elif n_gpus == 1: + device_assignment = { + "embed": "cuda:0", + "reranker": "cuda:0", + "extractor": "cuda:0", + } + except ImportError: + pass + + adapter = create_baseline_adapter( + method, + device_assignment=device_assignment, + use_vllm=self.config.use_vllm, + use_shared_models=use_shared_models, + reward_mode=self.config.reward_mode, + ) + # Profile will be passed to start_session() when the conversation begins + return adapter + + def run_single_session( + self, + method: str, + profile: Dict, + problem: Dict, + is_conflict_query: bool = False, + adapter: Any = None, + user_agent: Any = None + ) -> Dict: + """Run a single session with PROPER multi-turn conversation and user simulation. + + This implements: + 1. User simulator that role-plays with preferences + 2. Multi-turn conversation (up to max_turns) + 3. Preference enforcement by simulated user + 4. 
Proper metrics extraction from conversation + """ + # Use provided adapter (reused across sessions) or create new one + agent_adapter = adapter if adapter else self._create_method_adapter(method, profile) + + # Prepare conflict scenario if needed + conflict_scenario = None + original_problem = problem.get("problem", problem.get("question", "")) + if is_conflict_query: + conflict_scenario = self.conflict_generator.generate_for_profile( + profile["preferences"], + problem.get("domain", "general") + ) + if conflict_scenario: + problem = dict(problem) + problem["problem"] = conflict_scenario["query"] + + query = problem.get("problem", problem.get("question", "")) + + # Extract user preferences as formatted string + user_prefs = profile.get("preferences", []) + if isinstance(user_prefs, list) and len(user_prefs) > 0: + if isinstance(user_prefs[0], dict): + # Structured preferences with condition/action + pref_str = "\n".join([ + f"- When {p.get('condition', '')}, {p.get('action', '')}" + for p in user_prefs[:10] # Top 10 preferences + ]) + else: + # Simple string preferences + pref_str = "\n".join([f"- {p}" for p in user_prefs[:10]]) + else: + pref_str = str(user_prefs) + + user_persona = profile.get("persona", "A user seeking help with problem solving.") + + # Create user agent for this session (or reuse provided one) + if user_agent is None: + if self.config.use_openai_user: + user_agent = OpenAIUserAgent( + user_task_description="Help the user solve their problem.", + problem=query, + user_persona=user_persona, + user_preferences=pref_str, + model=self.config.openai_user_model, + ) + elif self.config.use_vllm: + user_agent = VLLMUserAgent( + user_task_description="Help the user solve their problem.", + problem=query, + user_persona=user_persona, + user_preferences=pref_str, + vllm_url=self.config.vllm_user_url, + ) + else: + user_agent = SharedLocalUserAgent( + user_task_description="Help the user solve their problem.", + problem=query, + user_persona=user_persona, + 
user_preferences=pref_str, + ) + + # Initialize conversation + turns = [] + full_user_log = [] # Detailed user agent outputs + + # Metrics tracking + enforcement_count = 0 + disappointment_count = 0 + user_token_count = 0 + agent_token_count = 0 + preference_compliance_scores = [] + + try: + # Initialize adapter for this user + if hasattr(agent_adapter, 'initialize'): + agent_adapter.initialize() + if hasattr(agent_adapter, 'start_session'): + agent_adapter.start_session( + user_id=profile.get("user_id", "test_user"), + user_profile={"preferences": user_prefs, "persona": user_persona} + ) + + # Start with agent greeting + conversation = [{"role": "assistant", "content": "How can I help you today?"}] + + # Multi-turn conversation loop + for turn_num in range(self.config.max_turns_per_session): + # === User Turn === + user_response = user_agent.generate_user_response(conversation) + + if user_response is None: + logger.warning(f"User agent failed to respond at turn {turn_num}") + break + + user_message = str(user_response.get("response", "")) + user_token_count += len(user_message.split()) + + # Add to conversation + conversation.append({"role": "user", "content": user_message}) + turns.append({"role": "user", "content": user_message}) + full_user_log.append(user_response) + + # Check for termination + if user_response.get("should_terminate", False) or TERMINATION_SIGNAL in user_message: + break + + # Detect preference enforcement (user correcting agent) + enforcement_keywords = ["please", "i asked", "i said", "i prefer", "can you", "could you", "instead"] + if any(kw in user_message.lower() for kw in enforcement_keywords): + enforcement_count += 1 + + # === Agent Turn === + if hasattr(agent_adapter, 'generate_response'): + response = agent_adapter.generate_response(user_message, conversation[:-1]) + agent_content = response.get("response", str(response)) if isinstance(response, dict) else str(response) + elif callable(agent_adapter): + agent_content = 
agent_adapter(conversation) + else: + agent_content = "[Error: Adapter not properly configured]" + + agent_token_count += len(agent_content.split()) + + # Add to conversation + conversation.append({"role": "assistant", "content": agent_content}) + turns.append({"role": "assistant", "content": agent_content}) + + # Estimate preference compliance for this turn (heuristic based on user satisfaction) + # If user doesn't enforce in next turn, assume compliance + # This is a simplified heuristic - LLM judge would be more accurate + compliance_score = 0.8 if enforcement_count == 0 else max(0.2, 1.0 - 0.2 * enforcement_count) + preference_compliance_scores.append(compliance_score) + + # End session + if hasattr(agent_adapter, 'end_session'): + adapter_metrics = agent_adapter.end_session(task_success=True) + else: + adapter_metrics = {} + + except Exception as e: + import traceback + logger.error(f"Error in session: {e}") + logger.error(f"Full traceback:\n{traceback.format_exc()}") + turns.append({"role": "assistant", "content": f"[Error: {e}]"}) + + # Compute metrics + total_turns = len(turns) + total_token_count = user_token_count + agent_token_count + + # Check if user reached a satisfactory answer (from last user response) + task_success = False + if full_user_log: + last_user = full_user_log[-1] + if last_user.get("should_terminate", False): + draft = last_user.get("draft_answer", "") + # Consider success if draft answer is not empty/"I don't know" + task_success = bool(draft) and draft.lower() != "i don't know" + + # Compute average compliance + avg_compliance = sum(preference_compliance_scores) / len(preference_compliance_scores) if preference_compliance_scores else 0.5 + + # Conflict resolution (if this was a conflict test) + conflict_accuracy = 0.0 + if is_conflict_query and conflict_scenario: + # Check if the correct preference was applied + expected_pref = conflict_scenario.get("expected_preference", "") + # Simple heuristic: check if expected preference keywords 
appear in agent responses + agent_texts = " ".join([t["content"] for t in turns if t["role"] == "assistant"]) + if expected_pref and any(kw in agent_texts.lower() for kw in expected_pref.lower().split()[:3]): + conflict_accuracy = 1.0 + + # Over-personalization detection (heuristic: if agent mentions preferences not in profile) + over_personalization = 0.0 + + metrics = ConversationMetrics( + task_success=task_success, + turns_to_success=total_turns if task_success else -1, + total_turns=total_turns, + user_token_count=user_token_count, + enforcement_count=enforcement_count, + disappointment_count=disappointment_count, + total_token_count=total_token_count, + agent_token_count=agent_token_count, + preference_compliance_scores=preference_compliance_scores, + conflict_resolution_accuracy=conflict_accuracy, + over_personalization_rate=over_personalization, + ) + + return { + "method": method, + "profile_id": profile.get("user_id", "unknown"), + "problem_id": problem.get("problem_id", str(hash(query))[:8]), + "problem": original_problem, + "ground_truth_solution": problem.get("solution", problem.get("answer", "")), + "is_conflict_test": is_conflict_query, + "conflict_scenario": conflict_scenario, + "conversation": {"turns": turns} if self.config.save_conversations else None, + "full_user_log": full_user_log if self.config.save_conversations else None, + "metrics": asdict(metrics), + "adapter_metrics": adapter_metrics if 'adapter_metrics' in dir() else {}, + } + + def _run_profile_sessions( + self, + method: str, + profile_idx: int, + profile: Dict, + adapter: Any = None + ) -> List[Dict]: + """Run all sessions for a single profile. 
Thread-safe for parallel execution.""" + profile_results = [] + + # Create vLLM-based agent client if using vLLM (for methods that need it) + vllm_agent = None + if self.config.use_vllm and method == "vanilla": + vllm_agent = VLLMAgentClient( + vllm_url=self.config.vllm_agent_url, + system_prompt="You are a helpful AI assistant for problem-solving tasks." + ) + + # Run sessions across datasets + session_idx = 0 + for ds_name, dataset in self.datasets.items(): + samples = dataset.get_testset() + + for sample in samples: + if session_idx >= self.config.n_sessions_per_profile: + break + + # Decide if this is a conflict query + is_conflict = (session_idx % int(1 / self.config.conflict_ratio)) == 0 + + problem = { + "problem": sample.problem, + "solution": sample.solution, + "problem_id": sample.problem_id, + "domain": sample.domain, + } + + try: + result = self.run_single_session( + method=method, + profile=profile, + problem=problem, + is_conflict_query=is_conflict, + adapter=vllm_agent if vllm_agent else adapter + ) + profile_results.append(result) + except Exception as e: + logger.error(f"Error in session for profile {profile_idx}: {e}") + + session_idx += 1 + + return profile_results + + def run_method(self, method: str) -> List[Dict]: + """Run all sessions for a single method with checkpointing and parallel processing.""" + logger.info(f"Running method: {method}") + + # Setup method directory and checkpoint + method_dir = self.output_dir / method + method_dir.mkdir(exist_ok=True) + checkpoint_file = method_dir / "checkpoint.json" + results_file = method_dir / "results.json" + + # Load existing results and checkpoint + results = [] + completed_profiles = set() + sessions_per_profile = {} # Track session count per profile for continue functionality + if checkpoint_file.exists(): + with open(checkpoint_file, "r") as f: + checkpoint = json.load(f) + completed_profiles = set(checkpoint.get("completed_profiles", [])) + sessions_per_profile = 
checkpoint.get("sessions_per_profile", {}) + logger.info(f" Resuming from checkpoint: {len(completed_profiles)} profiles completed") + if sessions_per_profile: + total_sessions = sum(sessions_per_profile.values()) + logger.info(f" Session-level tracking: {total_sessions} sessions across {len(sessions_per_profile)} profiles") + if results_file.exists(): + with open(results_file, "r") as f: + results = json.load(f) + + # Determine profile range + start_idx = self.config.start_profile + end_idx = self.config.end_profile if self.config.end_profile else len(self.profiles) + + # Build list of profiles that need more sessions + profiles_to_run = [] + for idx in range(start_idx, min(end_idx, len(self.profiles))): + existing_sessions = sessions_per_profile.get(str(idx), 0) + if existing_sessions < self.config.n_sessions_per_profile: + profiles_to_run.append(idx) + + # Log what we're running + if sessions_per_profile: + total_existing = sum(sessions_per_profile.get(str(idx), 0) for idx in profiles_to_run) + total_needed = len(profiles_to_run) * self.config.n_sessions_per_profile + logger.info(f" Running profiles {start_idx} to {end_idx-1}: {len(profiles_to_run)} profiles need sessions") + logger.info(f" Sessions: {total_existing} existing, {total_needed - total_existing} remaining") + else: + logger.info(f" Running profiles {start_idx} to {end_idx-1} ({len(profiles_to_run)} remaining)") + + # When using batch processing with vLLM or OpenAI user: use turn-synchronous batch mode + # This batches both user and agent calls for maximum throughput + if self.config.use_batch_processing and self.config.use_vllm: + user_type = "OpenAI" if self.config.use_openai_user else "local vLLM" + logger.info(f" Using BATCH processing ({user_type} user) for {method}") + return self._run_method_batch( + method, profiles_to_run, results, completed_profiles, + sessions_per_profile, checkpoint_file, results_file + ) + + # Decide on parallelization for sequential methods + n_parallel = 
self.config.parallel_profiles if (self.config.use_vllm or self.config.use_openai_user) else 1 + + if n_parallel > 1: + logger.info(f" Using parallel processing with {n_parallel} workers") + self._run_method_parallel( + method, profiles_to_run, results, completed_profiles, + sessions_per_profile, checkpoint_file, results_file + ) + else: + # Sequential execution (original behavior) + # Create ONE adapter per method and reuse it (avoids GPU OOM from repeated model loading) + adapter = self._create_method_adapter(method, None) + adapter.initialize() + + for profile_idx in profiles_to_run: + profile = self.profiles[profile_idx] + logger.info(f" Profile {profile_idx + 1}/{len(self.profiles)}") + + profile_results = self._run_profile_sessions(method, profile_idx, profile, adapter) + + # Add profile results to overall results + results.extend(profile_results) + completed_profiles.add(profile_idx) + sessions_per_profile[str(profile_idx)] = self.config.n_sessions_per_profile + + # Save checkpoint and results after each profile + with open(checkpoint_file, "w") as f: + json.dump({ + "completed_profiles": sorted(list(completed_profiles)), + "sessions_per_profile": sessions_per_profile + }, f) + with open(results_file, "w") as f: + json.dump(results, f, indent=2) + logger.info(f" Profile {profile_idx + 1} completed and checkpointed") + + return results + + def _run_method_parallel( + self, + method: str, + profiles_to_run: List[int], + results: List[Dict], + completed_profiles: set, + sessions_per_profile: Dict[str, int], + checkpoint_file: Path, + results_file: Path + ): + """Run profiles in parallel using ThreadPoolExecutor. + + Uses shared model singletons for embedding/reranker to avoid OOM + when multiple workers try to load their own copies. 
+ """ + n_parallel = self.config.parallel_profiles + results_lock = threading.Lock() + start_time = time.time() + profiles_completed = 0 + + def process_profile(profile_idx: int) -> tuple: + """Process a single profile and return (profile_idx, results).""" + profile = self.profiles[profile_idx] + # Create adapter with shared models to avoid OOM from duplicate model loading + adapter = self._create_method_adapter(method, profile, use_shared_models=True) + profile_results = self._run_profile_sessions(method, profile_idx, profile, adapter) + return profile_idx, profile_results + + with ThreadPoolExecutor(max_workers=n_parallel) as executor: + # Submit all profile jobs + future_to_profile = { + executor.submit(process_profile, idx): idx + for idx in profiles_to_run + } + + # Process completed profiles + for future in as_completed(future_to_profile): + profile_idx = future_to_profile[future] + try: + idx, profile_results = future.result() + + with results_lock: + results.extend(profile_results) + completed_profiles.add(idx) + sessions_per_profile[str(idx)] = self.config.n_sessions_per_profile + profiles_completed += 1 + + # Save checkpoint with session-level tracking + with open(checkpoint_file, "w") as f: + json.dump({ + "completed_profiles": sorted(list(completed_profiles)), + "sessions_per_profile": sessions_per_profile + }, f) + with open(results_file, "w") as f: + json.dump(results, f, indent=2) + + # Log progress with throughput estimate + elapsed = time.time() - start_time + profiles_per_hour = profiles_completed / elapsed * 3600 if elapsed > 0 else 0 + sessions_per_hour = len(results) / elapsed * 3600 if elapsed > 0 else 0 + logger.info( + f" Profile {idx + 1} completed " + f"({profiles_completed}/{len(profiles_to_run)}) - " + f"{profiles_per_hour:.1f} profiles/hr, {sessions_per_hour:.1f} sessions/hr" + ) + + except Exception as e: + logger.error(f" Profile {profile_idx} failed: {e}") + + def _run_method_batch( + self, + method: str, + profiles_to_run: 
List[int], + results: List[Dict], + completed_profiles: set, + sessions_per_profile: Dict[str, int], + checkpoint_file: Path, + results_file: Path + ) -> List[Dict]: + """ + Turn-synchronous batch processing for ALL methods. + + At each turn, user calls are batched concurrently via AsyncOpenAI, + then agent responses go through personalization adapters. + Sessions within a profile run sequentially (for stateful memory). + """ + from agents.batch_vllm_agent import BatchOpenAIClient, BatchVLLMClient, TERMINATION_SIGNAL + from json_repair import repair_json + + start_time = time.time() + + # Create user client (OpenAI API or local vLLM) + if self.config.use_openai_user: + user_client = BatchOpenAIClient( + model=self.config.openai_user_model, + max_tokens=4096, + max_concurrent=32, + api_key=os.environ.get("OPENAI_API_KEY"), + ) + logger.info(f" Using OpenAI user simulator: {self.config.openai_user_model}") + else: + user_client = BatchVLLMClient( + vllm_url=self.config.vllm_user_url, + max_tokens=4096, + temperature=1.0, + timeout=None, + max_concurrent=100, + json_mode=True, # User simulator needs JSON output + ) + logger.info(f" Using local vLLM user simulator: {self.config.vllm_user_url}") + + # Create async agent client for batched vLLM calls + agent_client = BatchVLLMClient( + vllm_url=self.config.vllm_agent_url, + max_tokens=2048, + temperature=0.7, + timeout=None, # Infinite timeout for long generations + max_concurrent=100, + ) + + USER_PROMPT_TEMPLATE = ( + "You are a user simulator collaborating with an agent to solve a problem. " + "You will be provided with a problem description, and you must get the agent to help you solve it. 
" + "You will also be provided with user preferences, which you must follow and actively enforce throughout the conversation.\n\n" + "# Problem Description\n{problem}\nNote: the agent cannot see this problem description.\n\n" + "# User Persona\n{user_persona}\n\n" + "# User Preferences\n{user_preferences}\n" + "These preferences are NON-NEGOTIABLE that define how you prefer the agent to behave. They must be strictly enforced:\n" + " - **Answer clarifying questions**: The agent may ask clarifying questions before attempting an answer. " + "Answer such questions, and do not enforce preferences about answer format or content while the agent is clarifying.\n" + " - **Enforce immediately**: Every agent response must satisfy your preferences before you can proceed. " + "Explicitly ask the agent to adjust their response until it complies.\n" + " - **Never proceed without compliance**: Do NOT update your draft answer, do NOT consider terminating, " + "and do NOT move forward until the agent follows your preferences.\n\n" + "# Draft Answer Management\n" + "- **Maintain a working draft**: Start with \"I don't know\". 
Update your draft answer based on what you learn from agent responses.\n" + "- **Don't update when enforcing preferences**: If the agent response does not follow your preferences, " + "do NOT update your draft answer, regardless of whether the agent provides helpful information.\n\n" + "# Conversation Termination\n" + "Before generating your response, determine if you should terminate:\n" + " - Do you feel like your draft answer is a good answer to the problem?\n" + " - Do you feel like the agent cannot help further?\n" + "If the agent response does not follow your preferences, you must NOT terminate - instead, enforce the preferences.\n" + "When ready to terminate, respond with \"TERMINATE\".\n\n" + "# Output Format (respond in JSON):\n" + "{{\n" + " \"preferences_check\": \"For EACH relevant preference, evaluate: is it satisfied?\",\n" + " \"enforce_preferences\": true/false,\n" + " \"reasoning\": \"Brief reasoning (2-3 sentences). Does agent follow preferences? If no, enforce. If yes, update draft.\",\n" + " \"draft_answer\": \"Your current working draft answer\",\n" + " \"should_terminate\": true/false,\n" + " \"response\": \"Your response to the agent\"\n" + "}}" + ) + + def parse_user_response(content): + if not content: + return None + try: + parsed = repair_json(content, return_objects=True) + if isinstance(parsed, dict) and "response" in parsed: + return parsed + except: + pass + if TERMINATION_SIGNAL in (content or ""): + return {"reasoning": "", "draft_answer": "", "should_terminate": True, "response": TERMINATION_SIGNAL} + return {"reasoning": "", "draft_answer": "", "should_terminate": False, "response": content or ""} + + def reverse_roles(conversation): + return [ + {"role": "user" if m["role"] == "assistant" else "assistant", "content": m["content"]} + for m in conversation + ] + + # Create per-profile adapters + adapters = {} + profile_sessions = {} + + for profile_idx in profiles_to_run: + profile = self.profiles[profile_idx] + adapter = 
self._create_method_adapter(method, profile, use_shared_models=True) + if hasattr(adapter, 'initialize'): + adapter.initialize() + adapters[profile_idx] = adapter + + sessions = [] + for ds_name, ds_obj in self.datasets.items(): + ds_items = ds_obj.get_testset() + for item in ds_items[:self.config.n_sessions_per_profile]: + sessions.append({"problem": item.problem, "solution": item.solution, "domain": ds_obj.domain}) + sessions = sessions[:self.config.n_sessions_per_profile] + n_conflict = int(len(sessions) * self.config.conflict_ratio) + profile_sessions[profile_idx] = [(s, idx < n_conflict) for idx, s in enumerate(sessions)] + + n_sessions = self.config.n_sessions_per_profile + + # Calculate sessions to run per profile (accounting for existing sessions) + sessions_to_run_per_profile = {} + for profile_idx in profiles_to_run: + existing = sessions_per_profile.get(str(profile_idx), 0) + remaining = n_sessions - existing + if remaining > 0: + sessions_to_run_per_profile[profile_idx] = (existing, remaining) # (start_session, count) + + if sessions_to_run_per_profile: + total_remaining = sum(v[1] for v in sessions_to_run_per_profile.values()) + logger.info(f" Batch: {len(sessions_to_run_per_profile)} profiles, {total_remaining} sessions remaining") + else: + logger.info(f" Batch: All sessions already completed") + return results + + # Process sessions in rounds + for session_idx in range(n_sessions): + # Initialize all conversations for this round + all_states = {} # profile_idx -> state dict + active_set = set() + + for profile_idx in profiles_to_run: + # Skip if this profile doesn't need this session + if profile_idx not in sessions_to_run_per_profile: + continue + start_session, _ = sessions_to_run_per_profile[profile_idx] + if session_idx < start_session: + continue # Already completed this session + if session_idx >= len(profile_sessions[profile_idx]): + continue + problem_dict, is_conflict = profile_sessions[profile_idx][session_idx] + profile = 
self.profiles[profile_idx] + query = problem_dict["problem"] + + if is_conflict: + cs = self.conflict_generator.generate_for_profile( + profile.get("preferences", []), problem_dict.get("domain", "general")) + if cs: + query = cs["query"] + + user_prefs = profile.get("preferences", []) + if isinstance(user_prefs, list) and user_prefs: + if isinstance(user_prefs[0], dict): + pref_str = "\n".join([f"- When {p.get('condition','')}, {p.get('action','')}" for p in user_prefs[:10]]) + else: + pref_str = "\n".join([f"- {p}" for p in user_prefs[:10]]) + else: + pref_str = str(user_prefs) + + user_persona = profile.get("persona", "A user seeking help with problem solving.") + adapter = adapters[profile_idx] + if hasattr(adapter, 'start_session'): + adapter.start_session( + user_id=profile.get("user_id", f"user_{profile_idx}"), + user_profile={"preferences": user_prefs, "persona": user_persona} + ) + + all_states[profile_idx] = { + "conversation": [{"role": "assistant", "content": "How can I help you today?"}], + "full_log": [], + "system_prompt": USER_PROMPT_TEMPLATE.format( + problem=query, user_persona=user_persona, user_preferences=pref_str), + "problem_dict": problem_dict, + "is_conflict": is_conflict, + "enforcement_count": 0, + } + active_set.add(profile_idx) + + # Turn-synchronous loop + for turn in range(self.config.max_turns_per_session): + if not active_set: + break + + # Batch user calls + active_list = sorted(active_set) + user_msgs_batch = [] + for pidx in active_list: + state = all_states[pidx] + msgs = [{"role": "system", "content": state["system_prompt"]}] + msgs.extend(reverse_roles(state["conversation"])) + user_msgs_batch.append(msgs) + + user_responses = user_client.batch_completion(user_msgs_batch) + + # Process user responses and prepare agent prompts for batching + to_remove = [] + agent_prompts_batch = [] # List of (pidx, messages, context) + for i, pidx in enumerate(active_list): + state = all_states[pidx] + parsed = 
parse_user_response(user_responses[i]) + + if parsed is None: + to_remove.append(pidx) + continue + + user_msg = str(parsed.get("response", "")) + state["conversation"].append({"role": "user", "content": user_msg}) + state["full_log"].append(parsed) + + if parsed.get("enforce_preferences", False): + state["enforcement_count"] += 1 + + if parsed.get("should_terminate", False) or TERMINATION_SIGNAL in user_msg: + to_remove.append(pidx) + continue + + # Prepare agent prompt for batching (don't call LLM yet) + try: + adapter = adapters[pidx] + if hasattr(adapter, 'prepare_prompt'): + messages, context = adapter.prepare_prompt(user_msg, state["conversation"][:-1]) + agent_prompts_batch.append((pidx, messages, context)) + elif hasattr(adapter, 'generate_response'): + # Fallback for adapters without prepare_prompt + agent_prompts_batch.append((pidx, None, None)) + else: + state["conversation"].append({"role": "assistant", "content": "[Error: Adapter not configured]"}) + except Exception as e: + logger.error(f" Agent prepare error p{pidx} t{turn}: {e}") + state["conversation"].append({"role": "assistant", "content": "I apologize, I encountered an error. 
Could you rephrase?"}) + + # Batch vLLM call for all agent prompts + if agent_prompts_batch: + # Separate prompts that can be batched from fallback + batchable = [(pidx, msgs, ctx) for pidx, msgs, ctx in agent_prompts_batch if msgs is not None] + fallback = [(pidx, msgs, ctx) for pidx, msgs, ctx in agent_prompts_batch if msgs is None] + + # Batch call for batchable prompts + if batchable: + batch_messages = [msgs for _, msgs, _ in batchable] + batch_responses = agent_client.batch_completion(batch_messages) + + # Process batched responses + for (pidx, _, context), response in zip(batchable, batch_responses): + try: + adapter = adapters[pidx] + state = all_states[pidx] + if response is not None: + result = adapter.process_response(response, context) + agent_content = result.get("response", str(result)) if isinstance(result, dict) else str(result) + else: + agent_content = "I apologize, I encountered an error. Could you rephrase?" + state["conversation"].append({"role": "assistant", "content": agent_content}) + except Exception as e: + logger.error(f" Agent process error p{pidx} t{turn}: {e}") + all_states[pidx]["conversation"].append({"role": "assistant", "content": "I apologize, I encountered an error. Could you rephrase?"}) + + # Handle fallback (adapters without prepare_prompt - sequential calls) + for pidx, _, _ in fallback: + try: + adapter = adapters[pidx] + state = all_states[pidx] + user_msg = state["conversation"][-1]["content"] + resp = adapter.generate_response(user_msg, state["conversation"][:-1]) + agent_content = resp.get("response", str(resp)) if isinstance(resp, dict) else str(resp) + state["conversation"].append({"role": "assistant", "content": agent_content}) + except Exception as e: + logger.error(f" Agent fallback error p{pidx} t{turn}: {e}") + all_states[pidx]["conversation"].append({"role": "assistant", "content": "I apologize, I encountered an error. 
Could you rephrase?"}) + + active_set -= set(to_remove) + + # Save results for this session round + for profile_idx in profiles_to_run: + if profile_idx not in all_states: + continue + state = all_states[profile_idx] + problem_dict = state["problem_dict"] + conversation = state["conversation"] + full_log = state["full_log"] + + user_tokens = sum(len(m["content"].split()) for m in conversation if m["role"] == "user") + agent_tokens = sum(len(m["content"].split()) for m in conversation if m["role"] == "assistant") + + enforcement_count = state["enforcement_count"] + task_success = 0 + for entry in full_log: + if entry.get("should_terminate", False): + draft = entry.get("draft_answer", "") + if draft and "don't know" not in draft.lower() and len(draft) > 20: + task_success = 1 + + results.append({ + "method": method, + "profile_id": self.profiles[profile_idx].get("user_id", f"user_{profile_idx}"), + "problem_id": f"session_{session_idx}", + "problem": problem_dict.get("problem", ""), + "ground_truth_solution": problem_dict.get("solution", ""), + "is_conflict_test": state["is_conflict"], + "conversation": {"turns": conversation}, + "full_user_log": full_log, + "metrics": { + "task_success": bool(task_success), + "total_turns": len(conversation), + "user_token_count": user_tokens, + "agent_token_count": agent_tokens, + "total_token_count": user_tokens + agent_tokens, + "enforcement_count": enforcement_count, + "disappointment_count": 0, + "preference_compliance_scores": [], + "conflict_resolution_accuracy": 0, + "over_personalization_rate": 0, + }, + "adapter_metrics": {}, + }) + + # Checkpoint after each session round with session-level tracking + # Only increment for profiles that actually ran in this round (those in all_states) + for profile_idx in all_states.keys(): + sessions_per_profile[str(profile_idx)] = sessions_per_profile.get(str(profile_idx), 0) + 1 + if sessions_per_profile[str(profile_idx)] >= self.config.n_sessions_per_profile: + 
def run_all(self) -> Dict[str, Any]:
    """Run every configured method, then build and persist the comparison.

    Returns:
        The analysis dict produced by ``_analyze_results``.  As side effects,
        writes ``analysis.json`` and a markdown report into ``self.output_dir``.
    """
    results_by_method: Dict[str, Any] = {}

    for method_name in self.config.methods:
        if method_name not in AVAILABLE_METHODS:
            logger.warning(f"Unknown method: {method_name}, skipping")
            continue

        results_by_method[method_name] = self.run_method(method_name)

        # Free GPU memory between methods to prevent OOM on later adapters.
        try:
            from personalization.serving.personalized_llm import clear_shared_models
            clear_shared_models()
        except ImportError:
            pass
        try:
            import gc
            import torch
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                logger.info(f"  GPU memory freed after {method_name}: {torch.cuda.memory_allocated()/1e9:.1f}GB allocated")
        except ImportError:
            pass

    # Cross-method analysis, persisted next to the raw results.
    analysis = self._analyze_results(results_by_method)
    with open(self.output_dir / "analysis.json", "w") as f:
        json.dump(analysis, f, indent=2)
    self._generate_report(analysis)
    return analysis


def _analyze_results(self, all_results: Dict[str, List[Dict]]) -> Dict:
    """Aggregate per-session metrics per method and rank methods per metric.

    Args:
        all_results: method name -> list of session result dicts (each with a
            ``metrics`` dict and an ``is_conflict_test`` flag).

    Returns:
        ``{"per_method": {...}, "comparison": {...}}``.
    """
    out: Dict[str, Any] = {"per_method": {}, "comparison": {}}

    for method, sessions in all_results.items():
        count = len(sessions)
        if count == 0:
            continue  # no completed sessions for this method

        def mean_of(key: str) -> float:
            # Mean of one numeric metric over this method's sessions.
            return sum(s["metrics"][key] for s in sessions) / count

        # Per-session compliance; 0.5 is the neutral default when a session
        # recorded no compliance scores.
        per_session_compliance = []
        for s in sessions:
            scores = s["metrics"]["preference_compliance_scores"]
            per_session_compliance.append(sum(scores) / len(scores) if scores else 0.5)

        conflict_sessions = [s for s in sessions if s["is_conflict_test"]]
        if conflict_sessions:
            conflict_accuracy = sum(
                s["metrics"]["conflict_resolution_accuracy"] for s in conflict_sessions
            ) / len(conflict_sessions)
        else:
            conflict_accuracy = 0

        out["per_method"][method] = {
            "n_sessions": count,
            "task_success_rate": mean_of("task_success"),
            "avg_user_tokens": mean_of("user_token_count"),
            "avg_total_tokens": mean_of("total_token_count"),
            "avg_enforcement_count": mean_of("enforcement_count"),
            "avg_turns": mean_of("total_turns"),
            "avg_preference_compliance": sum(per_session_compliance) / len(per_session_compliance),
            "conflict_resolution_accuracy": conflict_accuracy,
            "over_personalization_rate": mean_of("over_personalization_rate"),
        }

    # (metric, higher_is_better) pairs to rank methods on.
    comparable_metrics = [
        ("task_success_rate", True),
        ("avg_user_tokens", False),
        ("avg_total_tokens", False),
        ("avg_enforcement_count", False),
        ("avg_preference_compliance", True),
        ("conflict_resolution_accuracy", True),
        ("over_personalization_rate", False),
    ]
    for metric, higher_is_better in comparable_metrics:
        values = {name: stats[metric] for name, stats in out["per_method"].items()}
        if not values:
            logger.warning(f"No values for metric {metric}, skipping comparison")
            continue
        pick = max if higher_is_better else min
        # Ties resolve to the first method in insertion order, matching
        # max/min over a dict keyed by values.get.
        winner = pick(values, key=values.get)
        out["comparison"][metric] = {
            "values": values,
            "best_method": winner,
            "best_value": values[winner],
        }

    return out


def _generate_report(self, analysis: Dict) -> None:
    """Render ``analysis`` as markdown and write ``self.output_dir / 'report.md'``."""
    lines = [
        "# Personalization Experiment Report",
        f"\nGenerated: {datetime.now().isoformat()}",
        f"\nConfig: {self.config.n_profiles} profiles, {self.config.n_sessions_per_profile} sessions each",
        "\n## Method Comparison\n",
    ]

    # (display label, metric key, value format) for each table row.
    table_rows = [
        ("Task Success", "task_success_rate", "{:.1%}"),
        ("User Effort (tokens)", "avg_user_tokens", "{:.0f}"),
        ("Total Tokens", "avg_total_tokens", "{:.0f}"),
        ("Enforcement Count", "avg_enforcement_count", "{:.2f}"),
        ("Preference Compliance", "avg_preference_compliance", "{:.1%}"),
        ("Conflict Resolution", "conflict_resolution_accuracy", "{:.1%}"),
        ("Over-personalization", "over_personalization_rate", "{:.1%}"),
    ]

    method_names = list(analysis["per_method"].keys())

    # Markdown table header + separator row.
    lines.append("| Metric |" + "|".join(f" {m} " for m in method_names) + "| Best |")
    lines.append("|" + "|".join("-" * (len(col) + 2) for col in ["Metric"] + method_names + ["Best"]) + "|")

    for label, key, fmt in table_rows:
        cells = [f"| {label} |"]
        for m in method_names:
            cells.append(f" {fmt.format(analysis['per_method'].get(m, {}).get(key, 0))} |")
        comparison = analysis.get("comparison", {})
        winner = comparison[key]["best_method"] if key in comparison else "N/A"
        cells.append(f" {winner} |")
        lines.append("".join(cells))

    lines.append("\n## Key Findings\n")

    # Highlight advantages of the proposed methods over the baselines.
    rag_vector = analysis["per_method"].get("rag_vector", {})
    rag = analysis["per_method"].get("rag", {})  # currently unused below
    contextual = analysis["per_method"].get("contextual", {})
    all_memory = analysis["per_method"].get("all_memory", {})

    if rag_vector and contextual:
        token_reduction = (contextual.get("avg_total_tokens", 0) - rag_vector.get("avg_total_tokens", 0)) / contextual.get("avg_total_tokens", 1) * 100
        lines.append(f"- **Token Efficiency**: RAG+Vector uses {token_reduction:.1f}% fewer tokens than contextual memory")
    if rag_vector and all_memory:
        conflict_improvement = rag_vector.get("conflict_resolution_accuracy", 0) - all_memory.get("conflict_resolution_accuracy", 0)
        lines.append(f"- **Conflict Resolution**: RAG+Vector improves by {conflict_improvement:.1%} over all-memory baseline")
    if rag_vector:
        lines.append(f"- **Over-personalization**: RAG+Vector rate: {rag_vector.get('over_personalization_rate', 0):.1%}")

    report_path = self.output_dir / "report.md"
    with open(report_path, "w") as f:
        f.write("\n".join(lines))

    logger.info(f"Report saved to {report_path}")


def main():
    """CLI entry point: parse args, build the config, run all experiments."""
    ap = argparse.ArgumentParser(description="Run personalization experiments")
    ap.add_argument("--config", type=str, help="Path to config YAML file")
    ap.add_argument("--methods", type=str, default="vanilla,contextual,rag,rag_vector",
                    help="Comma-separated list of methods to compare")
    ap.add_argument("--datasets", type=str, default="math-hard,math-500,bigcodebench",
                    help="Comma-separated list of datasets")
    ap.add_argument("--n-profiles", type=int, default=200, help="Number of user profiles")
    ap.add_argument("--n-sessions", type=int, default=30, help="Sessions per profile")
    ap.add_argument("--max-turns", type=int, default=15, help="Max turns per session")
    ap.add_argument("--output-dir", type=str, default="results", help="Output directory")
    ap.add_argument("--profile-path", type=str, help="Path to pre-generated profiles")
    ap.add_argument("--start-profile", type=int, default=0,
                    help="Start profile index (inclusive, 0-indexed)")
    ap.add_argument("--end-profile", type=int, default=None,
                    help="End profile index (exclusive). If not set, runs all profiles from start")

    # vLLM and parallel processing options
    ap.add_argument("--use-vllm", action="store_true",
                    help="Use vLLM servers for inference (much faster)")
    ap.add_argument("--vllm-user-url", type=str, default="http://localhost:8004/v1",
                    help="vLLM server URL for user simulator (70B)")
    ap.add_argument("--vllm-agent-url", type=str, default="http://localhost:8003/v1",
                    help="vLLM server URL for agent (8B)")
    # OpenAI user agent options
    ap.add_argument("--use-openai-user", action="store_true",
                    help="Use OpenAI API (GPT-5) for user simulation instead of vLLM")
    ap.add_argument("--openai-user-model", type=str, default="gpt-5",
                    help="OpenAI model name for user simulator (default: gpt-5)")
    ap.add_argument("--reward-mode", type=str, default="keyword", choices=["keyword", "llm"],
                    help="Reward mode for RL updates: 'keyword' (user signals) or 'llm' (GPT-5-nano judge)")

    ap.add_argument("--parallel-profiles", type=int, default=50,
                    help="Number of profiles to process in parallel (requires --use-vllm)")
    ap.add_argument("--use-batch-processing", action="store_true", default=True,
                    help="Use turn-synchronous batch processing for vanilla/all_memory")
    ap.add_argument("--no-batch-processing", action="store_false", dest="use_batch_processing",
                    help="Disable batch processing")
    ap.add_argument("--batch-size", type=int, default=50,
                    help="Number of conversations to batch together")
    ap.add_argument("--continue-from", type=str, default=None,
                    help="Path to existing output directory to continue from (for extending sessions)")

    opts = ap.parse_args()

    # A YAML config file, when given and present, takes precedence over CLI flags.
    if opts.config and Path(opts.config).exists():
        with open(opts.config) as f:
            config = ExperimentConfig(**yaml.safe_load(f))
    else:
        config = ExperimentConfig(
            methods=opts.methods.split(","),
            datasets=opts.datasets.split(","),
            n_profiles=opts.n_profiles,
            n_sessions_per_profile=opts.n_sessions,
            max_turns_per_session=opts.max_turns,
            output_dir=opts.output_dir,
            profile_path=opts.profile_path,
            start_profile=opts.start_profile,
            end_profile=opts.end_profile,
            use_vllm=opts.use_vllm,
            vllm_user_url=opts.vllm_user_url,
            vllm_agent_url=opts.vllm_agent_url,
            use_openai_user=opts.use_openai_user,
            openai_user_model=opts.openai_user_model,
            reward_mode=opts.reward_mode,
            parallel_profiles=opts.parallel_profiles,
            use_batch_processing=opts.use_batch_processing,
            batch_size_conversations=opts.batch_size,
            continue_from=opts.continue_from,
        )

    # Run experiments
    runner = ExperimentRunner(config)
    summary = runner.run_all()

    print("\n" + "=" * 60)
    print("EXPERIMENT COMPLETE")
    print("=" * 60)
    print(f"\nResults saved to: {runner.output_dir}")
    if summary.get("comparison"):
        print("\nBest methods per metric:")
        for metric, data in summary["comparison"].items():
            print(f"  {metric}: {data['best_method']} ({data['best_value']:.3f})")
    else:
        print("\nNo comparison data available (sessions may have failed)")


if __name__ == "__main__":
    main()

# ---- next file in this patch: collaborativeagents/scripts/run_fp8.sh (head) ----
# vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8004 --tensor-parallel-size 4 --max-model-len 16384 --gpu-memory-utilization 0.9 --quantization fp8 --enforce-eager
# python -m sglang.launch_server --model-path meta-llama/Llama-3.3-70B-Instruct --port 8004 --tp-size 4 --context-length 16384 --mem-fraction-static 0.9 --quantization fp8
# Batch evaluation driver for the llama70b_fp8 runs.  Uncomment one of the
# experiment blocks below to run a different experiment type.
# NOTE(review): this file has no shebang (it opens with the commented server
# launch commands above) — run it with `bash run_fp8.sh`.

BATCH_SIZE=1
# BATCH_SIZE=20

# Loop over eval sizes and datasets
for EVAL_SIZE in 5; do
    for DATASET in logiqa; do # mmlu medqa humaneval bigcodebench math-500 math-hard
        # Convert dataset name for file paths (replace - with _)
        DATASET_FILE=$(echo ${DATASET} | tr '-' '_')

        echo "Running experiments for dataset: ${DATASET} with eval_size ${EVAL_SIZE}"

        # # no_user experiment
        # python3 run.py --experiment_type no_user --dataset ${DATASET} --eval_size ${EVAL_SIZE} --batch_size ${BATCH_SIZE} \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/no_user/${DATASET_FILE}_llama70b_user_llama70b_agent_no_user_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_fp8/no_user/${DATASET_FILE}_llama70b_user_llama70b_agent_no_user_eval_size_${EVAL_SIZE}.out 2>&1

        # # user_no_profile experiment
        # python3 run.py --experiment_type user_no_profile --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/user_no_profile/${DATASET_FILE}_llama70b_user_llama70b_agent_user_no_profile_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_fp8/user_no_profile/${DATASET_FILE}_llama70b_user_llama70b_agent_user_no_profile_eval_size_${EVAL_SIZE}.out 2>&1

        # # user_profiles_without_preferences experiment
        # python3 run.py --experiment_type user_profiles_without_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_fp8/user_profiles_without_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_${EVAL_SIZE}.out 2>&1

        # # user_profiles_with_preferences experiment
        # python3 run.py --experiment_type user_profiles_with_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.jsonl \
        #     >> ./runs/llama70b_fp8/user_profiles_with_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_${EVAL_SIZE}.out 2>&1

        # # agent_with_user_preferences experiment
        # python3 run.py --experiment_type agent_with_user_preferences --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
        #     --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
        #     --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
        #     --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
        #     --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.jsonl \
        #     >> ./runs/llama70b_fp8/agent_with_user_preferences/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_${EVAL_SIZE}_v2.out 2>&1

        # agent_with_reflection experiment
        # FIX: create the output directories first — both the --output_file path
        # and the >> log redirection fail if the directory does not exist yet.
        mkdir -p /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/agent_with_reflection
        mkdir -p ./runs/llama70b_fp8/agent_with_reflection
        python3 run.py --experiment_type agent_with_reflection --dataset ${DATASET} --eval_size ${EVAL_SIZE} --max_turns 10 --batch_size ${BATCH_SIZE} \
            --user_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --user_api_base http://localhost:8004/v1 --user_api_key EMPTY \
            --collaborator_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --collaborator_api_base http://localhost:8004/v1 --collaborator_api_key EMPTY \
            --judge_model_name hosted_vllm/meta-llama/Llama-3.3-70B-Instruct --judge_api_base http://localhost:8004/v1 --judge_api_key EMPTY \
            --output_file /shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_fp8/agent_with_reflection/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.jsonl \
            >> ./runs/llama70b_fp8/agent_with_reflection/${DATASET_FILE}_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_${EVAL_SIZE}.out 2>&1

    done
done

# ---- next file in this patch: collaborativeagents/scripts/run_preflight_test.sh (head) ----
#!/bin/bash
# Run pre-flight tests before full experiments

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

echo "============================================"
echo "Pre-Flight Tests for Full Experiments"
echo "============================================"
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start servers
echo "Starting 8B user simulator (GPU 0-1)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_USER \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_USER_PID=$!

echo "Starting 8B agent (GPU 2-3)..."
# Launch the 8B agent server on GPUs 2-3 (the user simulator is already up).
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_8B \
    --port $PORT_AGENT \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_AGENT_PID=$!

# Poll both health endpoints for up to 300s (100 x 3s).
echo "Waiting for servers..."
for i in $(seq 1 100); do
    READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0)
    READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0)
    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((i*3))s"
        break
    fi
    if [ $((i % 20)) -eq 0 ]; then
        echo " Still waiting... ($((i*3))s)"
    fi
    sleep 3
done

if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then
    echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then
    echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1
fi
echo "✓ Both servers healthy"
echo ""

# Run pre-flight tests
# FIX: this script runs under `set -e`, so a non-zero exit from
# preflight_test.py used to abort right here — TEST_RESULT was never captured
# and the vLLM servers were left running. Disable errexit just around the
# test invocation so the exit code can be captured and cleanup always runs.
set +e
python scripts/preflight_test.py \
    http://localhost:$PORT_USER/v1 \
    http://localhost:$PORT_AGENT/v1
TEST_RESULT=$?
set -e

# Cleanup
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true

echo ""
date

exit $TEST_RESULT

# ---- next file in this patch: collaborativeagents/scripts/scale_test_batch1.sbatch (head) ----
#!/bin/bash
#SBATCH --job-name=scale_b1
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8-interactive
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=01:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b1-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b1-%j.err

# Scale Test Batch 1: Users 1-5, 15 sessions each, 3 methods
# With CollaborativeAgents-style prompts

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
MEMORY_STORE="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store"

echo "=== Scale Test Batch 1: 5 users × 15 sessions × 3 methods ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Start vLLM servers
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model $USER_MODEL \
    --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \
    --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model $AGENT_MODEL \
    --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \
    --max-model-len 16384 --dtype bfloat16 &

# Poll each server's health endpoint: up to ~1000s (200 x 5s) for the 70B user
# simulator, up to ~300s (60 x 5s) for the 8B agent.
echo "Waiting for vLLM servers..."
for i in {1..200}; do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "User simulator ready after $((i*5))s"
        break
    fi
    sleep 5
done
for i in {1..60}; do
    if curl -s http://localhost:8003/health > /dev/null 2>&1; then
        echo "Agent ready after $((i*5))s"
        break
    fi
    sleep 5
done
sleep 5

OUTPUT_DIR="../results/scale_test_$(date +%Y%m%d_%H%M%S)"

# Run each method with 5 profiles, 15 sessions
for METHOD in vanilla rag rag_vector; do
    echo ""
    echo "============================================"
    echo "Testing: $METHOD (5 users × 15 sessions)"
    echo "============================================"

    # Clear memory store before each method
    # (truncate the cards file and drop cached embeddings so one method's
    # memories are not visible to the next)
    > ${MEMORY_STORE}/memory_cards.jsonl
    rm -f ${MEMORY_STORE}/memory_embeddings.npy

    date
    python scripts/run_experiments.py --methods $METHOD \
        --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
        --use-vllm --no-batch-processing --parallel-profiles 1 \
        --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH

    echo "Method $METHOD completed"
    if [ "$METHOD" != "vanilla" ]; then
        echo "Final memory cards: $(wc -l < ${MEMORY_STORE}/memory_cards.jsonl 2>/dev/null || echo 0)"
    fi
done

echo ""
echo "=== Scale Test Batch 1 Complete ==="
date

# Generate comparison
# NOTE(review): the heredoc below summarizes the newest ../results/scale_test_*
# directory, assumed to be the one created above — verify if batches run
# concurrently.
python3 << 'PYEOF'
import json
from pathlib import Path

output_base = sorted(Path("../results").glob("scale_test_*"))[-1]
print(f"\n=== Results Summary ===\nDir: {output_base}\n")

methods = ["vanilla", "rag", "rag_vector"]
results = {}

for subdir in output_base.iterdir():
    if subdir.is_dir():
        for method in methods:
            result_file = subdir / method / "results.json"
            if result_file.exists() and method not in results:
                with open(result_file) as f:
                    results[method] = json.load(f)

if results:
    print(f"{'Method':<12} {'Success':<10} {'Turns':<10} {'Enforce':<10} {'Sessions':<10}")
    print("-" * 55)
    for method in methods:
        if method in results:
            data = results[method]
            n = len(data)
            succ = sum(r['metrics']['task_success'] for r in data) / n
            turns = sum(r['metrics']['total_turns'] for r in data) / n
            enf = sum(r['metrics']['enforcement_count'] for r in data) / n
            print(f"{method:<12} {succ:<10.1%} {turns:<10.1f} {enf:<10.1f} {n:<10}")
PYEOF

pkill -f "vllm.entrypoints" 2>/dev/null || true

# ---- next file in this patch: collaborativeagents/scripts/scale_test_batch2.sbatch (head) ----
#!/bin/bash
#SBATCH --job-name=scale_b2
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8-interactive
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=01:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b2-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_b2-%j.err

# Scale Test Batch 2: Users 6-10, 15 sessions each, 3 methods
# With CollaborativeAgents-style prompts

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"

PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" +MEMORY_STORE="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store_b2" + +echo "=== Scale Test Batch 2: 5 users × 15 sessions × 3 methods ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Create separate memory store for batch 2 +mkdir -p ${MEMORY_STORE} +> ${MEMORY_STORE}/memory_cards.jsonl + +# Start vLLM servers +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \ + --max-model-len 16384 --dtype bfloat16 & + +echo "Waiting for vLLM servers..." 
+for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator ready after $((i*5))s" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent ready after $((i*5))s" + break + fi + sleep 5 +done +sleep 5 + +OUTPUT_DIR="../results/scale_test_b2_$(date +%Y%m%d_%H%M%S)" + +# Run each method with profiles 6-10 (skip first 5) +for METHOD in vanilla rag rag_vector; do + echo "" + echo "============================================" + echo "Testing: $METHOD (users 6-10 × 15 sessions)" + echo "============================================" + + # Clear memory store before each method + > ${MEMORY_STORE}/memory_cards.jsonl + rm -f ${MEMORY_STORE}/memory_embeddings.npy + + date + python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \ + --use-vllm --no-batch-processing --parallel-profiles 1 \ + --profile-offset 5 \ + --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH + + echo "Method $METHOD completed" + if [ "$METHOD" != "vanilla" ]; then + echo "Final memory cards: $(wc -l < ${MEMORY_STORE}/memory_cards.jsonl 2>/dev/null || echo 0)" + fi +done + +echo "" +echo "=== Scale Test Batch 2 Complete ===" +date + +# Generate comparison +python3 << 'PYEOF' +import json +from pathlib import Path + +output_base = sorted(Path("../results").glob("scale_test_b2_*"))[-1] +print(f"\n=== Results Summary (Batch 2) ===\nDir: {output_base}\n") + +methods = ["vanilla", "rag", "rag_vector"] +results = {} + +for subdir in output_base.iterdir(): + if subdir.is_dir(): + for method in methods: + result_file = subdir / method / "results.json" + if result_file.exists() and method not in results: + with open(result_file) as f: + results[method] = json.load(f) + +if results: + print(f"{'Method':<12} {'Success':<10} {'Turns':<10} {'Enforce':<10} {'Sessions':<10}") + print("-" * 55) + for method in methods: + if 
method in results: + data = results[method] + n = len(data) + succ = sum(r['metrics']['task_success'] for r in data) / n + turns = sum(r['metrics']['total_turns'] for r in data) / n + enf = sum(r['metrics']['enforcement_count'] for r in data) / n + print(f"{method:<12} {succ:<10.1%} {turns:<10.1f} {enf:<10.1f} {n:<10}") +PYEOF + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/scale_test_ctx_refl.sbatch b/collaborativeagents/scripts/scale_test_ctx_refl.sbatch new file mode 100644 index 0000000..1055e16 --- /dev/null +++ b/collaborativeagents/scripts/scale_test_ctx_refl.sbatch @@ -0,0 +1,114 @@ +#!/bin/bash +#SBATCH --job-name=scale_cr +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8-interactive +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH --time=01:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_cr-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/scale_cr-%j.err + +# Scale Test: Contextual and Reflection methods +# 5 users × 15 sessions × 2 methods = 150 sessions +# With CollaborativeAgents-style prompts + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +echo "=== Scale Test: Contextual & Reflection (5 users × 15 sessions × 2 methods) ===" +date +nvidia-smi 
--query-gpu=index,name,memory.total --format=csv + +# Start vLLM servers +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 & + +echo "Waiting for vLLM servers..." +for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator ready after $((i*5))s" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent ready after $((i*5))s" + break + fi + sleep 5 +done +sleep 5 + +OUTPUT_DIR="../results/scale_test_ctx_refl_$(date +%Y%m%d_%H%M%S)" + +# Run contextual and reflection methods +for METHOD in contextual reflection; do + echo "" + echo "============================================" + echo "Testing: $METHOD (5 users × 15 sessions)" + echo "============================================" + + date + python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \ + --use-vllm --no-batch-processing --parallel-profiles 1 \ + --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH + + echo "Method $METHOD completed" +done + +echo "" +echo "=== Contextual & Reflection Test Complete ===" +date + +# Generate comparison +python3 << 'PYEOF' +import json +from pathlib import Path + +output_base = sorted(Path("../results").glob("scale_test_ctx_refl_*"))[-1] +print(f"\n=== Results Summary (Contextual & Reflection) ===\nDir: {output_base}\n") + +methods = ["contextual", "reflection"] +results = {} + +for subdir in output_base.iterdir(): + if subdir.is_dir(): + for method in methods: + result_file = subdir / method / 
"results.json" + if result_file.exists() and method not in results: + with open(result_file) as f: + results[method] = json.load(f) + +if results: + print(f"{'Method':<12} {'Success':<10} {'Turns':<10} {'Enforce':<10} {'Sessions':<10}") + print("-" * 55) + for method in methods: + if method in results: + data = results[method] + n = len(data) + succ = sum(r['metrics']['task_success'] for r in data) / n + turns = sum(r['metrics']['total_turns'] for r in data) / n + enf = sum(r['metrics']['enforcement_count'] for r in data) / n + print(f"{method:<12} {succ:<10.1%} {turns:<10.1f} {enf:<10.1f} {n:<10}") +PYEOF + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/smallscale_test.sbatch b/collaborativeagents/scripts/smallscale_test.sbatch new file mode 100644 index 0000000..774575e --- /dev/null +++ b/collaborativeagents/scripts/smallscale_test.sbatch @@ -0,0 +1,87 @@ +#!/bin/bash +#SBATCH --job-name=smalltest +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH --time=02:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/smalltest-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/smalltest-%j.err + +# Small-scale test: 5 profiles, 5 sessions, all 6 methods +# Full settings (70B user sim, 8B agent) but fewer questions + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" 
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +echo "=== Small-scale Test: All 6 Methods ===" +echo "Settings: 5 profiles, 5 sessions each, max 15 turns" +echo "User simulator: $USER_MODEL (70B)" +echo "Agent: $AGENT_MODEL (8B)" +date + +# Start vLLM servers +# User simulator on GPUs 0,1 (70B, TP=2) +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +# Agent on GPUs 2,3 (8B, TP=2, lower memory for embedding/reranker) +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \ + --max-model-len 16384 --dtype bfloat16 & + +# Wait for servers +echo "Waiting for vLLM servers..." +for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator (8004) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent (8003) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +echo "Both vLLM servers ready" +sleep 10 + +# Run all 6 methods sequentially with small scale +for METHOD in vanilla contextual reflection all_memory rag rag_vector; do + echo "" + echo "=== Testing method: $METHOD ===" + date + + python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 5 --n-sessions 5 --max-turns 15 \ + --use-vllm --no-batch-processing --parallel-profiles 5 \ + --output-dir ../results/smalltest --profile-path $PROFILE_PATH + + if [ $? 
-eq 0 ]; then + echo "Method $METHOD: SUCCESS" + else + echo "Method $METHOD: FAILED" + fi +done + +echo "" +echo "=== Small-scale test complete ===" +date + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/test_70b_pilot.py b/collaborativeagents/scripts/test_70b_pilot.py new file mode 100644 index 0000000..4bb27a3 --- /dev/null +++ b/collaborativeagents/scripts/test_70b_pilot.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Pilot test for 70B AWQ user model. + +Tests: +1. 70B AWQ model loads without OOM +2. User simulation works correctly +3. Multi-turn conversation completes +4. Memory usage is acceptable + +Run with 4xA100 GPUs. +""" + +import sys +import json +import torch +from pathlib import Path + +# Add paths +sys.path.insert(0, str(Path(__file__).parent.parent)) +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + + +def print_gpu_memory(): + """Print current GPU memory usage.""" + print("\n=== GPU Memory Usage ===") + for i in range(torch.cuda.device_count()): + total = torch.cuda.get_device_properties(i).total_memory / 1e9 + allocated = torch.cuda.memory_allocated(i) / 1e9 + reserved = torch.cuda.memory_reserved(i) / 1e9 + print(f" GPU {i}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, {total:.1f}GB total") + print() + + +def test_70b_user_agent(): + """Test 70B user agent standalone.""" + print("=" * 60) + print("TEST 1: 70B AWQ User Agent Loading") + print("=" * 60) + + from agents.local_user_agent import LocalUserAgent, DEFAULT_MODEL_PATH + + print(f"Default model path: {DEFAULT_MODEL_PATH}") + print(f"Is AWQ model: {'awq' in DEFAULT_MODEL_PATH.lower()}") + + # Create user agent + user_agent = LocalUserAgent( + user_task_description="Help solve a math problem", + problem="What is 2 + 2?", + user_persona="A student learning math", + user_preferences="- Show step by step solutions\n- Use simple language", + ) + + print("\nGenerating user response...") + print_gpu_memory() + + # Simulate a 
conversation + conversation = [{"role": "assistant", "content": "How can I help you today?"}] + response = user_agent.generate_user_response(conversation) + + print_gpu_memory() + + if response: + print(f"SUCCESS! User response: {response.get('response', 'N/A')[:200]}...") + print(f"Should terminate: {response.get('should_terminate', 'N/A')}") + return True + else: + print("FAILED! User agent returned None") + return False + + +def test_multiturn_with_70b(): + """Test multi-turn conversation with 70B user model.""" + print("\n" + "=" * 60) + print("TEST 2: Multi-turn Conversation with 70B User Model") + print("=" * 60) + + from agents.local_user_agent import SharedLocalUserAgent, TERMINATION_SIGNAL + from adapters.personalized_llm_adapter import create_baseline_adapter + + # Create vanilla adapter (uses Qwen 1.5B for agent) + print("\nCreating vanilla adapter...") + adapter = create_baseline_adapter("vanilla") + adapter.initialize() + + print_gpu_memory() + + # Load a test profile + profile_path = Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl" + with open(profile_path) as f: + profile = json.loads(f.readline()) + + print(f"Loaded profile: {profile.get('user_id', 'unknown')}") + + # Create user agent with 70B model + problem = "What is 15% of 80?" 
+ user_prefs = profile.get("preferences", [])[:3] + pref_str = "\n".join([f"- {p}" for p in user_prefs]) + + print(f"\nUser preferences:\n{pref_str}") + + user_agent = SharedLocalUserAgent( + user_task_description="Solve the math problem", + problem=problem, + user_persona=profile.get("persona", "A user"), + user_preferences=pref_str, + ) + + print_gpu_memory() + + # Start session + adapter.start_session(user_id=profile.get("user_id", "test")) + + # Run multi-turn conversation + conversation = [{"role": "assistant", "content": "How can I help you today?"}] + turns = [] + max_turns = 5 + + print(f"\nStarting {max_turns}-turn conversation...") + + for turn_num in range(max_turns): + print(f"\n--- Turn {turn_num + 1} ---") + + # User turn + user_response = user_agent.generate_user_response(conversation) + if user_response is None: + print("User agent failed!") + break + + user_msg = user_response.get("response", "") + print(f"USER: {user_msg[:150]}...") + + conversation.append({"role": "user", "content": user_msg}) + turns.append({"role": "user", "content": user_msg}) + + # Check termination + if user_response.get("should_terminate", False) or TERMINATION_SIGNAL in user_msg: + print("\n[User terminated conversation]") + break + + # Agent turn + response = adapter.generate_response(user_msg, conversation[:-1]) + agent_msg = response.get("response", str(response)) if isinstance(response, dict) else str(response) + print(f"AGENT: {agent_msg[:150]}...") + + conversation.append({"role": "assistant", "content": agent_msg}) + turns.append({"role": "assistant", "content": agent_msg}) + + # End session + adapter.end_session() + + print(f"\n--- Results ---") + print(f"Total turns: {len(turns)}") + print(f"User turns: {len([t for t in turns if t['role'] == 'user'])}") + print(f"Agent turns: {len([t for t in turns if t['role'] == 'assistant'])}") + + print_gpu_memory() + + return len(turns) > 2 # Success if more than single turn + + +def test_memory_after_multiple_sessions(): + 
"""Test memory doesn't grow unboundedly after multiple sessions.""" + print("\n" + "=" * 60) + print("TEST 3: Memory Stability Across Sessions") + print("=" * 60) + + from agents.local_user_agent import SharedLocalUserAgent, TERMINATION_SIGNAL + from adapters.personalized_llm_adapter import create_baseline_adapter + + adapter = create_baseline_adapter("vanilla") + adapter.initialize() + + profile_path = Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl" + with open(profile_path) as f: + profile = json.loads(f.readline()) + + n_sessions = 3 + print(f"\nRunning {n_sessions} sessions to check memory stability...") + + for session_idx in range(n_sessions): + print(f"\n--- Session {session_idx + 1}/{n_sessions} ---") + + user_agent = SharedLocalUserAgent( + user_task_description="Solve math", + problem=f"What is {session_idx + 1} + {session_idx + 2}?", + user_persona="A student", + user_preferences="- Be concise", + ) + + adapter.start_session(user_id=profile.get("user_id", "test")) + + conversation = [{"role": "assistant", "content": "How can I help?"}] + for turn in range(3): + user_response = user_agent.generate_user_response(conversation) + if user_response is None or user_response.get("should_terminate"): + break + conversation.append({"role": "user", "content": user_response.get("response", "")}) + + response = adapter.generate_response(user_response.get("response", ""), conversation[:-1]) + conversation.append({"role": "assistant", "content": response.get("response", str(response))}) + + adapter.end_session() + print_gpu_memory() + + # Force garbage collection + import gc + gc.collect() + torch.cuda.empty_cache() + + print("\nMemory stability test completed.") + return True + + +if __name__ == "__main__": + import os + os.environ["HF_HOME"] = "/projects/bfqt/users/yurenh2/hf_cache/huggingface" + + print("\n" + "=" * 60) + print("70B AWQ USER MODEL PILOT TEST") + print("=" * 60) + print(f"PyTorch version: {torch.__version__}") + 
print(f"CUDA available: {torch.cuda.is_available()}") + print(f"GPU count: {torch.cuda.device_count()}") + + for i in range(torch.cuda.device_count()): + print(f" GPU {i}: {torch.cuda.get_device_name(i)}") + + print_gpu_memory() + + results = {} + + # Test 1: User agent loading + try: + results["70b_load"] = test_70b_user_agent() + except Exception as e: + print(f"TEST 1 FAILED: {e}") + import traceback + traceback.print_exc() + results["70b_load"] = False + + # Test 2: Multi-turn conversation (only if test 1 passed) + if results.get("70b_load", False): + try: + results["multiturn"] = test_multiturn_with_70b() + except Exception as e: + print(f"TEST 2 FAILED: {e}") + import traceback + traceback.print_exc() + results["multiturn"] = False + else: + print("\nSkipping TEST 2 (TEST 1 failed)") + results["multiturn"] = False + + # Test 3: Memory stability (only if test 2 passed) + if results.get("multiturn", False): + try: + results["memory_stable"] = test_memory_after_multiple_sessions() + except Exception as e: + print(f"TEST 3 FAILED: {e}") + import traceback + traceback.print_exc() + results["memory_stable"] = False + else: + print("\nSkipping TEST 3 (TEST 2 failed)") + results["memory_stable"] = False + + # Summary + print("\n" + "=" * 60) + print("TEST SUMMARY") + print("=" * 60) + for test_name, passed in results.items(): + status = "PASS" if passed else "FAIL" + print(f" {test_name}: {status}") + + all_passed = all(results.values()) + print(f"\nOverall: {'ALL TESTS PASSED - Ready for full experiment!' 
if all_passed else 'SOME TESTS FAILED'}") + + print_gpu_memory() + + sys.exit(0 if all_passed else 1) diff --git a/collaborativeagents/scripts/test_all_a100x8.sbatch b/collaborativeagents/scripts/test_all_a100x8.sbatch new file mode 100644 index 0000000..3f117e1 --- /dev/null +++ b/collaborativeagents/scripts/test_all_a100x8.sbatch @@ -0,0 +1,124 @@ +#!/bin/bash +#SBATCH --job-name=test_all_a100x8 +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuA100x8-interactive +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=01:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100x8-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100x8-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== Parallel Speed Test: ALL 6 METHODS (A100x8) ===" +echo "Scale: 10 profiles × 3 sessions = 30 sessions per method" +echo "vLLM memory: 45% (leaves room for embedding+reranker)" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start TWO vLLM servers with REDUCED memory (45%) to leave room for embedding+reranker +echo "" +echo "Starting vLLM servers (45% GPU memory)..." 
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.45 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.45 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break + sleep 2 +done + +sleep 30 +echo "Starting experiments..." + +COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/test_a100x8 --profile-path $PROFILE_PATH" + +# Test 1: vanilla (batch processing) +echo "" +echo "=== TEST 1: vanilla (batch processing) ===" +START=$(date +%s) +python scripts/run_experiments.py \ + --methods vanilla \ + --datasets math-hard \ + --n-profiles 10 --n-sessions 3 --max-turns 10 \ + --use-vllm --parallel-profiles 10 \ + --use-batch-processing --batch-size 30 \ + --output-dir ../results/test_a100x8 \ + --profile-path "$PROFILE_PATH" +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 2: contextual +echo "" +echo "=== TEST 2: contextual ===" +START=$(date +%s) +python scripts/run_experiments.py --methods contextual $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 3: reflection +echo "" +echo "=== TEST 3: reflection ===" +START=$(date +%s) +python scripts/run_experiments.py --methods reflection $COMMON_ARGS 
+END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 4: all_memory +echo "" +echo "=== TEST 4: all_memory ===" +START=$(date +%s) +python scripts/run_experiments.py --methods all_memory $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 5: rag +echo "" +echo "=== TEST 5: rag ===" +START=$(date +%s) +python scripts/run_experiments.py --methods rag $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 6: rag_vector +echo "" +echo "=== TEST 6: rag_vector ===" +START=$(date +%s) +python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +pkill -f "vllm.entrypoints" 2>/dev/null || true + +echo "" +echo "=== ALL SPEED TESTS COMPLETE ===" +date diff --git a/collaborativeagents/scripts/test_all_h200.sbatch b/collaborativeagents/scripts/test_all_h200.sbatch new file mode 100644 index 0000000..cc37a39 --- /dev/null +++ b/collaborativeagents/scripts/test_all_h200.sbatch @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=test_all_h200 +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8-interactive +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=01:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_h200-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_h200-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export 
HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== Parallel Speed Test: ALL 6 METHODS (H200) ===" +echo "Scale: 10 profiles × 3 sessions = 30 sessions per method" +echo "vLLM memory: 45% (leaves room for embedding+reranker)" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start TWO vLLM servers with REDUCED memory (45%) to leave room for embedding+reranker +echo "" +echo "Starting vLLM servers (45% GPU memory)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.45 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.45 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break + sleep 2 +done + +sleep 30 +echo "Starting experiments..." 
+ +# All methods can now run in parallel thanks to shared model singletons +# Shared models: embedding (8B) and reranker (8B) are loaded ONCE and shared across all parallel workers +COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/test_h200 --profile-path $PROFILE_PATH" + +# Test 1: vanilla (batch processing) +echo "" +echo "=== TEST 1: vanilla (batch processing) ===" +START=$(date +%s) +python scripts/run_experiments.py \ + --methods vanilla \ + --datasets math-hard \ + --n-profiles 10 --n-sessions 3 --max-turns 10 \ + --use-vllm --parallel-profiles 10 \ + --use-batch-processing --batch-size 30 \ + --output-dir ../results/test_h200 \ + --profile-path "$PROFILE_PATH" +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 2: contextual +echo "" +echo "=== TEST 2: contextual ===" +START=$(date +%s) +python scripts/run_experiments.py --methods contextual $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 3: reflection +echo "" +echo "=== TEST 3: reflection ===" +START=$(date +%s) +python scripts/run_experiments.py --methods reflection $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 4: all_memory (parallel with shared models) +echo "" +echo "=== TEST 4: all_memory (parallel with shared models) ===" +START=$(date +%s) +python scripts/run_experiments.py --methods all_memory $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 5: rag (parallel with shared models) +echo "" +echo "=== TEST 5: rag (parallel with shared models) ===" +START=$(date +%s) +python 
scripts/run_experiments.py --methods rag $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 6: rag_vector (parallel with shared models) +echo "" +echo "=== TEST 6: rag_vector (parallel with shared models) ===" +START=$(date +%s) +python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +pkill -f "vllm.entrypoints" 2>/dev/null || true + +echo "" +echo "=== ALL SPEED TESTS COMPLETE ===" +date diff --git a/collaborativeagents/scripts/test_all_methods.sbatch b/collaborativeagents/scripts/test_all_methods.sbatch new file mode 100644 index 0000000..6550cdf --- /dev/null +++ b/collaborativeagents/scripts/test_all_methods.sbatch @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=test_all +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH --time=02:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_methods_%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_methods_%j.err + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" 
+ +echo "=== Small-scale test: ALL methods with 70B user sim ===" +echo "Scale: 5 profiles × 3 sessions = 15 sessions per method" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Start 70B user simulator on GPUs 0,1 +echo "" +echo "Starting 70B user simulator..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 8192 --dtype bfloat16 --download-dir $HF_HOME & +USER_PID=$! + +# Start 8B agent on GPUs 2,3 (0.45 for RAG methods) +echo "Starting 8B agent (0.45 memory for embedding/reranker)..." +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \ + --max-model-len 8192 --dtype bfloat16 & +AGENT_PID=$! + +# Wait for servers +echo "Waiting for vLLM servers (70B takes ~8 min)..." +for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "70B user simulator ready after $((i*5))s" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "8B agent ready after $((i*5))s" + break + fi + sleep 5 +done + +echo "" +echo "=== GPU Memory after vLLM servers ===" +nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv + +# Test each method sequentially +for METHOD in vanilla contextual reflection all_memory rag rag_vector; do + echo "" + echo "==============================================" + echo "Testing method: $METHOD" + echo "==============================================" + date + + python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 5 --n-sessions 3 --max-turns 10 \ + --use-vllm --no-batch-processing --parallel-profiles 5 \ + --output-dir ../results/test_all_methods --profile-path $PROFILE_PATH + + echo "" + echo "=== GPU Memory after $METHOD ===" + 
nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv +done + +echo "" +echo "==============================================" +echo "ALL METHODS TESTED" +echo "==============================================" +date + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/test_batch_50.py b/collaborativeagents/scripts/test_batch_50.py new file mode 100644 index 0000000..b3f1c37 --- /dev/null +++ b/collaborativeagents/scripts/test_batch_50.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +""" +Test batch processing with 50 conversations (matching paper's setup). +""" + +import sys +import time +sys.path.insert(0, '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents') + +from agents.batch_vllm_agent import BatchConversationGenerator + +def main(): + user_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8004/v1" + agent_url = sys.argv[2] if len(sys.argv) > 2 else "http://localhost:8003/v1" + batch_size = int(sys.argv[3]) if len(sys.argv) > 3 else 50 + max_turns = int(sys.argv[4]) if len(sys.argv) > 4 else 10 + + print(f"\n{'='*60}") + print(f"Batch Processing Test (Paper Configuration)") + print(f"{'='*60}") + print(f"Batch size: {batch_size}") + print(f"Max turns: {max_turns}") + print(f"User URL: {user_url}") + print(f"Agent URL: {agent_url}") + print() + + # Create samples (simulating MMLU-style questions) + samples = [ + { + "problem": f"Question {i+1}: What is the capital of country number {i+1}? " + f"A) City A B) City B C) City C D) City D. 
" + f"Please explain your reasoning step by step.", + "solution": "City A" + } + for i in range(batch_size) + ] + + generator = BatchConversationGenerator( + user_vllm_url=user_url, + agent_vllm_url=agent_url, + max_turns=max_turns, + user_max_tokens=512, + agent_max_tokens=1024, + temperature=0.7, + ) + + print(f"Starting batch generation of {batch_size} conversations...") + print(f"Expected: ~{batch_size * max_turns * 2} total LLM calls batched into ~{max_turns * 2} batch requests") + print() + + start = time.time() + results = generator.generate_batch( + samples=samples, + user_persona="A curious student seeking help with exam questions.", + user_preferences="1. Explain your reasoning step by step\n2. Be concise but thorough\n3. Highlight the key concept", + agent_system_prompt="You are a helpful tutor. Answer questions clearly and explain your reasoning.", + ) + elapsed = time.time() - start + + successes = sum(1 for r in results if r is not None) + total_turns = sum( + len(r['conversation']) // 2 if r else 0 + for r in results + ) + + print(f"\n{'='*60}") + print(f"RESULTS") + print(f"{'='*60}") + print(f"Batch size: {batch_size}") + print(f"Max turns: {max_turns}") + print(f"Successes: {successes}/{batch_size}") + print(f"Total conversation turns: {total_turns}") + print(f"Time: {elapsed:.1f}s") + print() + print(f"Throughput: {successes * 3600 / elapsed:.0f} conversations/hr") + print(f"Sessions/hr (3 sessions/profile): {successes * 3 * 3600 / elapsed:.0f}") + print() + + # Compare with paper's claimed performance + paper_sessions = 2000 # sessions per hour claimed + our_sessions = successes * 3 * 3600 / elapsed + print(f"Paper's claimed throughput: ~{paper_sessions} sessions/hr") + print(f"Our throughput: {our_sessions:.0f} sessions/hr") + print(f"Ratio: {our_sessions / paper_sessions * 100:.1f}% of paper's performance") + print() + + # Show sample conversation + if results[0]: + print(f"Sample conversation (first 4 messages):") + for msg in 
results[0]['conversation'][:4]: + role = msg['role'].upper() + content = msg['content'][:100] + "..." if len(msg['content']) > 100 else msg['content'] + print(f" [{role}]: {content}") + + return results + +if __name__ == "__main__": + main() diff --git a/collaborativeagents/scripts/test_batch_50.sh b/collaborativeagents/scripts/test_batch_50.sh new file mode 100755 index 0000000..35f4440 --- /dev/null +++ b/collaborativeagents/scripts/test_batch_50.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Test batch processing with 50 conversations (paper's configuration) + +set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_USER=8004 +PORT_AGENT=8003 + +echo "============================================" +echo "Batch Processing Test (Paper Configuration)" +echo "Batch Size: 50 conversations" +echo "============================================" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv +echo "" + +# Kill any existing vLLM servers +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start servers with TP=2 +echo "Starting 8B user simulator server (GPU 0-1, TP=2)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_USER \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_USER_PID=$! + +echo "Starting 8B agent server (GPU 2-3, TP=2)..." 
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_AGENT \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_AGENT_PID=$! + +echo "Waiting for servers..." +for i in $(seq 1 100); do + READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0) + READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then + echo "Both servers ready after $((i*3)) seconds" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Still waiting... ($((i*3))s)" + fi + sleep 3 +done + +if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then + echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then + echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +echo "✓ Both servers healthy" + +echo "" +echo "============================================" +echo "Test 1: Batch=50, Turns=5" +echo "============================================" +python scripts/test_batch_50.py \ + http://localhost:$PORT_USER/v1 \ + http://localhost:$PORT_AGENT/v1 \ + 50 5 + +echo "" +echo "============================================" +echo "Test 2: Batch=50, Turns=10 (paper config)" +echo "============================================" +python scripts/test_batch_50.py \ + http://localhost:$PORT_USER/v1 \ + http://localhost:$PORT_AGENT/v1 \ + 50 10 + +echo "" +echo "============================================" +echo "Test 3: Batch=100, Turns=10 (stress test)" +echo "============================================" +python scripts/test_batch_50.py \ + http://localhost:$PORT_USER/v1 \ + http://localhost:$PORT_AGENT/v1 \ + 100 10 + +# Cleanup +echo "" +echo "Cleaning up..." 
+kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true + +echo "" +date diff --git a/collaborativeagents/scripts/test_batch_vs_parallel.sh b/collaborativeagents/scripts/test_batch_vs_parallel.sh new file mode 100755 index 0000000..616c593 --- /dev/null +++ b/collaborativeagents/scripts/test_batch_vs_parallel.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# Compare batch processing vs parallel profile processing on A100x4 +# +# Expected result: Batch should be significantly faster because: +# - Turn-synchronous: ALL conversations processed at same turn together +# - Maximizes vLLM continuous batching +# - Fewer total HTTP requests + +set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_USER=8004 +PORT_AGENT=8003 + +echo "============================================" +echo "Batch vs Parallel Processing Comparison" +echo "============================================" +date +echo "Node: $(hostname)" +nvidia-smi --query-gpu=index,name,memory.total --format=csv +echo "" + +# Kill any existing vLLM servers +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start user simulator server (8B) on GPU 0-1 +echo "Starting 8B user simulator server (GPU 0-1, TP=2)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_USER \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_USER_PID=$! + +# Start agent server (8B) on GPU 2-3 +echo "Starting 8B agent server (GPU 2-3, TP=2)..." 
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_AGENT \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_AGENT_PID=$! + +echo "Waiting for servers (up to 5 min)..." + +# Wait for servers +for i in $(seq 1 100); do + READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0) + READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0) + + if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then + echo "Both servers ready after $((i*3)) seconds" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Still waiting... user=$READY_USER, agent=$READY_AGENT ($((i*3))s)" + fi + sleep 3 +done + +# Check health +if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then + echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then + echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +echo "✓ Both servers healthy" + +echo "" +echo "============================================" +echo "Test 1: NEW Batch Processing (20 samples)" +echo "============================================" +echo "This batches ALL user requests together, then ALL agent requests." +echo "" + +START=$(date +%s) +python agents/batch_vllm_agent.py \ + http://localhost:$PORT_USER/v1 \ + http://localhost:$PORT_AGENT/v1 \ + 20 +END=$(date +%s) +ELAPSED_BATCH=$((END-START)) +echo "" +echo "Batch processing time: ${ELAPSED_BATCH} seconds" + +echo "" +echo "============================================" +echo "Test 2: OLD Parallel Profile Processing (20 samples)" +echo "============================================" +echo "This runs 20 profiles in parallel, but each makes separate requests." 
+echo "" + +cd scripts +START=$(date +%s) +python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 20 \ + --n-sessions 1 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_USER/v1 \ + --vllm-agent-url http://localhost:$PORT_AGENT/v1 \ + --parallel-profiles 20 \ + --output-dir ../results/batch_compare_parallel \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30 +END=$(date +%s) +ELAPSED_PARALLEL=$((END-START)) +echo "" +echo "Parallel profile processing time: ${ELAPSED_PARALLEL} seconds" + +cd .. + +# Cleanup +echo "" +echo "Cleaning up..." +kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true + +echo "" +echo "============================================" +echo "RESULTS COMPARISON" +echo "============================================" +echo "" +echo "NEW Batch processing (20 conv): ${ELAPSED_BATCH}s" +echo "OLD Parallel profiles (20 conv): ${ELAPSED_PARALLEL}s" +echo "" +if [ $ELAPSED_BATCH -gt 0 ]; then + SPEEDUP=$(echo "scale=2; $ELAPSED_PARALLEL / $ELAPSED_BATCH" | bc) + echo "Speedup with batch processing: ${SPEEDUP}x" +fi +echo "" +echo "Expected: Batch should be 5-10x faster due to:" +echo " - Turn-synchronous processing (all convs at same turn batched)" +echo " - Fewer HTTP request overhead" +echo " - Better vLLM continuous batching utilization" +echo "" +date diff --git a/collaborativeagents/scripts/test_extractor.py b/collaborativeagents/scripts/test_extractor.py new file mode 100644 index 0000000..a2b4ac1 --- /dev/null +++ b/collaborativeagents/scripts/test_extractor.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +"""Quick test for the preference extractor.""" + +import sys +sys.path.insert(0, "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src") + +from personalization.config.registry import get_preference_extractor + +print("="*60) +print("PREFERENCE EXTRACTOR TEST") +print("="*60) + +print("\nLoading extractor (qwen3_0_6b_sft)...") +extractor = 
get_preference_extractor("qwen3_0_6b_sft") +print("Extractor loaded successfully!") + +# Test extraction with various queries +test_queries = [ + "I prefer Python over Java for scripting tasks", + "Please use bullet points instead of numbered lists", + "Can you explain this in simpler terms? I'm a beginner.", + "I like concise answers, not long explanations", + "Always show code examples when explaining programming concepts", +] + +print("\n" + "="*60) +print("EXTRACTION TESTS") +print("="*60) + +for i, query in enumerate(test_queries, 1): + print(f"\n--- Test {i} ---") + print(f"Query: {query}") + result = extractor.extract_preferences(query) + print(f"Extracted: {result}") + + if result.get("preferences"): + for pref in result["preferences"]: + print(f" - condition: {pref.get('condition', 'N/A')}") + print(f" action: {pref.get('action', 'N/A')}") + print(f" confidence: {pref.get('confidence', 'N/A')}") + else: + print(" (No preferences extracted)") + +print("\n" + "="*60) +print("TEST COMPLETE") +print("="*60) diff --git a/collaborativeagents/scripts/test_multiturn.py b/collaborativeagents/scripts/test_multiturn.py new file mode 100644 index 0000000..1909c34 --- /dev/null +++ b/collaborativeagents/scripts/test_multiturn.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +""" +Minimal test script to validate multi-turn conversation works correctly. + +This runs a single profile with a single session to verify: +1. LocalUserAgent loads and generates responses +2. Multi-turn conversation loop works +3. 
Metrics are properly extracted +""" + +import sys +import json +from pathlib import Path + +# Add paths +sys.path.insert(0, str(Path(__file__).parent.parent)) +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from agents.local_user_agent import LocalUserAgent, SharedLocalUserAgent, TERMINATION_SIGNAL + +def test_user_agent_standalone(): + """Test LocalUserAgent in isolation.""" + print("=" * 60) + print("TEST 1: LocalUserAgent Standalone") + print("=" * 60) + + user_agent = LocalUserAgent( + user_task_description="Help solve a math problem", + problem="What is 2 + 2?", + user_persona="A student learning math", + user_preferences="- Show step by step solutions\n- Use simple language", + ) + + # Simulate a conversation + conversation = [{"role": "assistant", "content": "How can I help you today?"}] + + print("\nGenerating user response...") + response = user_agent.generate_user_response(conversation) + + if response: + print(f"SUCCESS! User response: {response.get('response', 'N/A')[:200]}...") + print(f"Should terminate: {response.get('should_terminate', 'N/A')}") + print(f"Draft answer: {response.get('draft_answer', 'N/A')[:100]}...") + return True + else: + print("FAILED! User agent returned None") + return False + + +def test_multiturn_conversation(): + """Test full multi-turn conversation with agent adapter.""" + print("\n" + "=" * 60) + print("TEST 2: Multi-turn Conversation") + print("=" * 60) + + from adapters.personalized_llm_adapter import create_baseline_adapter + + # Create a simple agent adapter (vanilla mode) + print("\nCreating vanilla adapter...") + adapter = create_baseline_adapter("vanilla") + adapter.initialize() + + # Load a test profile + profile_path = Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl" + with open(profile_path) as f: + profile = json.loads(f.readline()) + + print(f"Loaded profile: {profile.get('user_id', 'unknown')}") + + # Create user agent + problem = "What is 15% of 80?" 
+ user_prefs = profile.get("preferences", [])[:3] + pref_str = "\n".join([f"- {p}" for p in user_prefs]) + + print(f"\nUser preferences:\n{pref_str}") + + user_agent = SharedLocalUserAgent( + user_task_description="Solve the math problem", + problem=problem, + user_persona=profile.get("persona", "A user"), + user_preferences=pref_str, + ) + + # Start session + adapter.start_session(user_id=profile.get("user_id", "test")) + + # Run multi-turn conversation + conversation = [{"role": "assistant", "content": "How can I help you today?"}] + turns = [] + max_turns = 5 + + print(f"\nStarting {max_turns}-turn conversation...") + + for turn_num in range(max_turns): + print(f"\n--- Turn {turn_num + 1} ---") + + # User turn + user_response = user_agent.generate_user_response(conversation) + if user_response is None: + print("User agent failed!") + break + + user_msg = user_response.get("response", "") + print(f"USER: {user_msg[:150]}...") + + conversation.append({"role": "user", "content": user_msg}) + turns.append({"role": "user", "content": user_msg}) + + # Check termination + if user_response.get("should_terminate", False) or TERMINATION_SIGNAL in user_msg: + print("\n[User terminated conversation]") + break + + # Agent turn + response = adapter.generate_response(user_msg, conversation[:-1]) + agent_msg = response.get("response", str(response)) if isinstance(response, dict) else str(response) + print(f"AGENT: {agent_msg[:150]}...") + + conversation.append({"role": "assistant", "content": agent_msg}) + turns.append({"role": "assistant", "content": agent_msg}) + + # End session + adapter.end_session() + + print(f"\n--- Results ---") + print(f"Total turns: {len(turns)}") + print(f"User turns: {len([t for t in turns if t['role'] == 'user'])}") + print(f"Agent turns: {len([t for t in turns if t['role'] == 'assistant'])}") + + return len(turns) > 2 # Success if more than single turn + + +def test_full_session(): + """Test run_single_session from ExperimentRunner.""" + print("\n" 
+ "=" * 60) + print("TEST 3: Full run_single_session") + print("=" * 60) + + from run_experiments import ExperimentRunner, ExperimentConfig + from adapters.personalized_llm_adapter import create_baseline_adapter + + config = ExperimentConfig( + methods=["vanilla"], + datasets=["math-500"], + n_profiles=1, + n_sessions_per_profile=1, + max_turns_per_session=5, + output_dir="/tmp/test_multiturn", + profile_path=str(Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl"), + ) + + print("\nCreating ExperimentRunner...") + runner = ExperimentRunner(config) + + # Get first profile and problem + profile = runner.profiles[0] + dataset = list(runner.datasets.values())[0] + sample = dataset.get_testset()[0] + + problem = { + "problem": sample.problem, + "solution": sample.solution, + "problem_id": sample.problem_id, + "domain": sample.domain, + } + + print(f"\nRunning single session...") + print(f"Profile: {profile.get('user_id', 'unknown')}") + print(f"Problem: {problem['problem'][:100]}...") + + # Create adapter + adapter = create_baseline_adapter("vanilla") + adapter.initialize() + + result = runner.run_single_session( + method="vanilla", + profile=profile, + problem=problem, + is_conflict_query=False, + adapter=adapter, + ) + + print(f"\n--- Session Results ---") + print(f"Total turns: {result['metrics']['total_turns']}") + print(f"Task success: {result['metrics']['task_success']}") + print(f"Enforcement count: {result['metrics']['enforcement_count']}") + print(f"User tokens: {result['metrics']['user_token_count']}") + print(f"Agent tokens: {result['metrics']['agent_token_count']}") + print(f"Compliance scores: {result['metrics']['preference_compliance_scores']}") + + if result['conversation']: + print(f"\nConversation ({len(result['conversation']['turns'])} messages):") + for i, turn in enumerate(result['conversation']['turns'][:6]): + print(f" [{turn['role']}]: {turn['content'][:80]}...") + + return result['metrics']['total_turns'] > 2 + + +if 
__name__ == "__main__": + print("\n" + "=" * 60) + print("MULTI-TURN CONVERSATION VALIDATION TEST") + print("=" * 60) + + results = {} + + # Test 1: User agent standalone + try: + results["user_agent"] = test_user_agent_standalone() + except Exception as e: + print(f"TEST 1 FAILED: {e}") + import traceback + traceback.print_exc() + results["user_agent"] = False + + # Test 2: Multi-turn conversation + try: + results["multiturn"] = test_multiturn_conversation() + except Exception as e: + print(f"TEST 2 FAILED: {e}") + import traceback + traceback.print_exc() + results["multiturn"] = False + + # Test 3: Full session (only if test 2 passed) + if results.get("multiturn", False): + try: + results["full_session"] = test_full_session() + except Exception as e: + print(f"TEST 3 FAILED: {e}") + import traceback + traceback.print_exc() + results["full_session"] = False + else: + print("\nSkipping TEST 3 (TEST 2 failed)") + results["full_session"] = False + + # Summary + print("\n" + "=" * 60) + print("TEST SUMMARY") + print("=" * 60) + for test_name, passed in results.items(): + status = "PASS" if passed else "FAIL" + print(f" {test_name}: {status}") + + all_passed = all(results.values()) + print(f"\nOverall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}") + + sys.exit(0 if all_passed else 1) diff --git a/collaborativeagents/scripts/test_parallel_a100.sh b/collaborativeagents/scripts/test_parallel_a100.sh new file mode 100755 index 0000000..dfd74bc --- /dev/null +++ b/collaborativeagents/scripts/test_parallel_a100.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# Quick test of parallel vLLM processing on A100x4-interactive + +set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +# Configuration - using only 8B model for user sim to 
fit in A100 +# (70B AWQ needs TP=2 which leaves only 2 GPUs for 8B) +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_USER=8004 +PORT_AGENT=8003 + +echo "============================================" +echo "Quick Parallel vLLM Test (A100x4)" +echo "============================================" +date +echo "Node: $(hostname)" +nvidia-smi --query-gpu=index,name,memory.total --format=csv +echo "" + +# Kill any existing vLLM servers +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# For A100 test, use 8B for both user and agent (to test parallelism) +# In production, user would be 70B AWQ with TP=2 + +# Start user simulator server (8B) on GPU 0-1 +echo "Starting 8B user simulator server (GPU 0-1)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_USER \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_USER_PID=$! + +# Start agent server (8B) on GPU 2-3 +echo "Starting 8B agent server (GPU 2-3)..." +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_AGENT \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_AGENT_PID=$! + +echo "Waiting for servers..." + +# Wait for servers (up to 5 minutes - A100 needs more time than H200) +for i in $(seq 1 100); do + READY_USER=$(curl -s http://localhost:$PORT_USER/health > /dev/null 2>&1 && echo 1 || echo 0) + READY_AGENT=$(curl -s http://localhost:$PORT_AGENT/health > /dev/null 2>&1 && echo 1 || echo 0) + + if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then + echo "Both servers ready after $((i*3)) seconds" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Still waiting... 
user=$READY_USER, agent=$READY_AGENT ($((i*3))s)" + fi + sleep 3 +done + +# Check health +if ! curl -s http://localhost:$PORT_USER/health > /dev/null; then + echo "ERROR: User server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +if ! curl -s http://localhost:$PORT_AGENT/health > /dev/null; then + echo "ERROR: Agent server not healthy"; kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null; exit 1 +fi +echo "✓ Both servers healthy" + +cd scripts + +echo "" +echo "============================================" +echo "Running throughput tests..." +echo "============================================" +echo "Note: Using 8B for both user and agent (parallelism test)" +echo "" + +# Test 1: Sequential (1 profile, 3 sessions) +echo "--- Test 1: Sequential (1 profile, 3 sessions) ---" +START=$(date +%s) +python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 1 \ + --n-sessions 3 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_USER/v1 \ + --vllm-agent-url http://localhost:$PORT_AGENT/v1 \ + --parallel-profiles 1 \ + --output-dir ../results/a100_test_1 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30 +END=$(date +%s) +ELAPSED_1=$((END-START)) +echo "" +echo "Time for 1 profile (3 sessions): ${ELAPSED_1} seconds" +echo "Throughput: ~$(echo "scale=1; 3 * 3600 / $ELAPSED_1" | bc) sessions/hr" + +# Test 2: Parallel (4 profiles, 3 sessions each = 12 total) +echo "" +echo "--- Test 2: Parallel (4 profiles, 3 sessions each = 12 total) ---" +START=$(date +%s) +python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 4 \ + --n-sessions 3 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_USER/v1 \ + --vllm-agent-url http://localhost:$PORT_AGENT/v1 \ + --parallel-profiles 4 \ + --output-dir ../results/a100_test_4 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30 +END=$(date +%s) +ELAPSED_4=$((END-START)) +echo "" +echo 
"Time for 4 profiles (12 sessions): ${ELAPSED_4} seconds" +echo "Throughput: ~$(echo "scale=1; 12 * 3600 / $ELAPSED_4" | bc) sessions/hr" + +# Test 3: More parallel (8 profiles) +echo "" +echo "--- Test 3: Parallel (8 profiles, 3 sessions each = 24 total) ---" +START=$(date +%s) +python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 8 \ + --n-sessions 3 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_USER/v1 \ + --vllm-agent-url http://localhost:$PORT_AGENT/v1 \ + --parallel-profiles 8 \ + --output-dir ../results/a100_test_8 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30 +END=$(date +%s) +ELAPSED_8=$((END-START)) +echo "" +echo "Time for 8 profiles (24 sessions): ${ELAPSED_8} seconds" +echo "Throughput: ~$(echo "scale=1; 24 * 3600 / $ELAPSED_8" | bc) sessions/hr" + +# Cleanup +echo "" +echo "Cleaning up..." +kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true + +echo "" +echo "============================================" +echo "RESULTS SUMMARY" +echo "============================================" +echo "" +echo "1 profile (3 sessions): ${ELAPSED_1}s -> $(echo "scale=0; 3 * 3600 / $ELAPSED_1" | bc) sessions/hr" +echo "4 profiles (12 sessions): ${ELAPSED_4}s -> $(echo "scale=0; 12 * 3600 / $ELAPSED_4" | bc) sessions/hr" +echo "8 profiles (24 sessions): ${ELAPSED_8}s -> $(echo "scale=0; 24 * 3600 / $ELAPSED_8" | bc) sessions/hr" +echo "" +echo "Speedup 4x parallel: $(echo "scale=2; ($ELAPSED_1 * 4) / $ELAPSED_4" | bc)x" +echo "Speedup 8x parallel: $(echo "scale=2; ($ELAPSED_1 * 8) / $ELAPSED_8" | bc)x" +echo "" +date diff --git a/collaborativeagents/scripts/test_parallel_quick.sh b/collaborativeagents/scripts/test_parallel_quick.sh new file mode 100755 index 0000000..8429da7 --- /dev/null +++ b/collaborativeagents/scripts/test_parallel_quick.sh @@ -0,0 +1,158 @@ +#!/bin/bash +# Quick test of parallel vLLM processing on H200x8-interactive +# Simplified version for 1 hour time limit + 
+set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +# Configuration +MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4" +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_70B=8004 +PORT_8B=8003 + +echo "============================================" +echo "Quick Parallel vLLM Test (H200)" +echo "============================================" +date +echo "Node: $(hostname)" +nvidia-smi --query-gpu=index,name,memory.total --format=csv +echo "" + +# Kill any existing vLLM servers +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start 70B AWQ server on GPU 0-1 (TP=2) +echo "Starting 70B AWQ server (GPU 0-1, TP=2)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_70B_AWQ \ + --port $PORT_70B \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 \ + --max-model-len 4096 \ + --disable-log-requests \ + --quantization awq \ + --dtype float16 & +SERVER_70B_PID=$! + +# Start 8B server on GPU 2 +echo "Starting 8B server (GPU 2)..." +CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_8B \ + --gpu-memory-utilization 0.90 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_8B_PID=$! + +echo "Waiting for servers (up to 5 min)..." 
+ +# Wait for servers +for i in $(seq 1 100); do + READY_70B=$(curl -s http://localhost:$PORT_70B/health > /dev/null 2>&1 && echo 1 || echo 0) + READY_8B=$(curl -s http://localhost:$PORT_8B/health > /dev/null 2>&1 && echo 1 || echo 0) + + if [ "$READY_70B" = "1" ] && [ "$READY_8B" = "1" ]; then + echo "Both servers ready after $((i*3)) seconds" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Still waiting... 70B=$READY_70B, 8B=$READY_8B ($((i*3))s)" + fi + sleep 3 +done + +# Check health +if ! curl -s http://localhost:$PORT_70B/health > /dev/null; then + echo "ERROR: 70B server not healthy"; kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null; exit 1 +fi +if ! curl -s http://localhost:$PORT_8B/health > /dev/null; then + echo "ERROR: 8B server not healthy"; kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null; exit 1 +fi +echo "✓ Both servers healthy" + +cd scripts + +echo "" +echo "============================================" +echo "Running throughput tests..." +echo "============================================" + +# Test 1: Sequential (1 profile, 2 sessions) +echo "" +echo "--- Test 1: Sequential (1 profile) ---" +START=$(date +%s) +python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 1 \ + --n-sessions 2 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_70B/v1 \ + --vllm-agent-url http://localhost:$PORT_8B/v1 \ + --parallel-profiles 1 \ + --output-dir ../results/quick_test_1 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20 +END=$(date +%s) +echo "Time: $((END-START)) seconds" + +# Test 2: Parallel (4 profiles, 2 sessions each) +echo "" +echo "--- Test 2: Parallel (4 profiles) ---" +START=$(date +%s) +python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 4 \ + --n-sessions 2 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_70B/v1 \ + --vllm-agent-url http://localhost:$PORT_8B/v1 \ + --parallel-profiles 4 \ + --output-dir ../results/quick_test_4 \ + 
--profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20 +END=$(date +%s) +echo "Time: $((END-START)) seconds" + +# Test 3: Parallel (8 profiles, 2 sessions each) +echo "" +echo "--- Test 3: Parallel (8 profiles) ---" +START=$(date +%s) +python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 8 \ + --n-sessions 2 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_70B/v1 \ + --vllm-agent-url http://localhost:$PORT_8B/v1 \ + --parallel-profiles 8 \ + --output-dir ../results/quick_test_8 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20 +END=$(date +%s) +echo "Time: $((END-START)) seconds" + +# Cleanup +echo "" +echo "Cleaning up..." +kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null || true + +echo "" +echo "============================================" +echo "TEST COMPLETE!" +echo "============================================" +echo "" +echo "Summary: Compare timing above" +echo " - Sequential (1 profile): baseline" +echo " - Parallel (4 profiles): should be faster per profile" +echo " - Parallel (8 profiles): should show more speedup" +echo "" +date diff --git a/collaborativeagents/scripts/test_parallel_speed.sbatch b/collaborativeagents/scripts/test_parallel_speed.sbatch new file mode 100644 index 0000000..28c5b79 --- /dev/null +++ b/collaborativeagents/scripts/test_parallel_speed.sbatch @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=test_parallel +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8-interactive +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=01:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_parallel-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_parallel-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source 
/u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== Parallel Speed Test: ALL 6 METHODS ===" +echo "Scale: 10 profiles × 3 sessions = 30 sessions per method" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start TWO vLLM servers (user on 8004, agent on 8003) +echo "" +echo "Starting vLLM servers..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break + sleep 2 +done + +# Additional wait for servers to fully load models +echo "Waiting 30s for servers to fully initialize..." +sleep 30 +echo "Starting experiments..." 
+ +# Common parameters for all tests +COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/parallel_test_all --profile-path $PROFILE_PATH" + +# Test 1: vanilla (batch processing) +echo "" +echo "=== TEST 1: vanilla (batch processing) ===" +START=$(date +%s) +python scripts/run_experiments.py \ + --methods vanilla \ + --datasets math-hard \ + --n-profiles 10 --n-sessions 3 --max-turns 10 \ + --use-vllm --parallel-profiles 10 \ + --use-batch-processing --batch-size 30 \ + --output-dir ../results/parallel_test_all \ + --profile-path "$PROFILE_PATH" +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 2: contextual +echo "" +echo "=== TEST 2: contextual ===" +START=$(date +%s) +python scripts/run_experiments.py --methods contextual $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 3: reflection +echo "" +echo "=== TEST 3: reflection ===" +START=$(date +%s) +python scripts/run_experiments.py --methods reflection $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 4: all_memory +echo "" +echo "=== TEST 4: all_memory ===" +START=$(date +%s) +python scripts/run_experiments.py --methods all_memory $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 5: rag +echo "" +echo "=== TEST 5: rag ===" +START=$(date +%s) +python scripts/run_experiments.py --methods rag $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 6: rag_vector +echo "" +echo "=== TEST 6: rag_vector ===" +START=$(date 
+%s) +python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +pkill -f "vllm.entrypoints" 2>/dev/null || true + +echo "" +echo "=== ALL SPEED TESTS COMPLETE ===" +date diff --git a/collaborativeagents/scripts/test_parallel_speed_a100.sbatch b/collaborativeagents/scripts/test_parallel_speed_a100.sbatch new file mode 100755 index 0000000..f3d0848 --- /dev/null +++ b/collaborativeagents/scripts/test_parallel_speed_a100.sbatch @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=test_all_a100 +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuA100x4 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=128G +#SBATCH --time=01:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_all_a100-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== Parallel Speed Test: ALL 6 METHODS (A100) ===" +echo "Scale: 10 profiles × 3 sessions = 30 sessions per method" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Start TWO vLLM servers (user on 8004, agent on 8003) +echo "" +echo "Starting vLLM servers..." 
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.45 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.45 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Both servers ready after $((i*2))s" && break + sleep 2 +done + +# Additional wait for servers to fully load models +echo "Waiting 30s for servers to fully initialize..." +sleep 30 +echo "Starting experiments..." + +# Common parameters for all tests +COMMON_ARGS="--datasets math-hard --n-profiles 10 --n-sessions 3 --max-turns 10 --use-vllm --parallel-profiles 10 --no-batch-processing --output-dir ../results/parallel_test_a100 --profile-path $PROFILE_PATH" + +# Test 1: vanilla (batch processing) +echo "" +echo "=== TEST 1: vanilla (batch processing) ===" +START=$(date +%s) +python scripts/run_experiments.py \ + --methods vanilla \ + --datasets math-hard \ + --n-profiles 10 --n-sessions 3 --max-turns 10 \ + --use-vllm --parallel-profiles 10 \ + --use-batch-processing --batch-size 30 \ + --output-dir ../results/parallel_test_a100 \ + --profile-path "$PROFILE_PATH" +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> vanilla: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 2: contextual +echo "" +echo "=== TEST 2: contextual ===" +START=$(date +%s) +python scripts/run_experiments.py --methods contextual $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> contextual: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# 
Test 3: reflection +echo "" +echo "=== TEST 3: reflection ===" +START=$(date +%s) +python scripts/run_experiments.py --methods reflection $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> reflection: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 4: all_memory +echo "" +echo "=== TEST 4: all_memory ===" +START=$(date +%s) +python scripts/run_experiments.py --methods all_memory $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> all_memory: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 5: rag +echo "" +echo "=== TEST 5: rag ===" +START=$(date +%s) +python scripts/run_experiments.py --methods rag $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +# Test 6: rag_vector +echo "" +echo "=== TEST 6: rag_vector ===" +START=$(date +%s) +python scripts/run_experiments.py --methods rag_vector $COMMON_ARGS +END=$(date +%s) +ELAPSED=$((END - START)) +echo ">>> rag_vector: 30 sessions in ${ELAPSED}s = $((30 * 3600 / ELAPSED)) sessions/hr" + +pkill -f "vllm.entrypoints" 2>/dev/null || true + +echo "" +echo "=== ALL SPEED TESTS COMPLETE ===" +date diff --git a/collaborativeagents/scripts/test_parallel_vllm.sh b/collaborativeagents/scripts/test_parallel_vllm.sh new file mode 100755 index 0000000..0cd0f1f --- /dev/null +++ b/collaborativeagents/scripts/test_parallel_vllm.sh @@ -0,0 +1,205 @@ +#!/bin/bash +# Test parallel vLLM processing on H200x8-interactive +# Usage: Run this on an interactive H200 node +# +# srun --account=bfqt-delta-gpu --partition=gpuH200x8-interactive \ +# --nodes=1 --gpus-per-node=4 --time=02:00:00 --mem=200G --pty bash +# cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +# bash scripts/test_parallel_vllm.sh + +set -e + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source 
/u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH}" + +# Configuration +MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4" +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PORT_70B=8004 +PORT_8B=8003 + +echo "============================================" +echo "Parallel vLLM Experiment Test" +echo "============================================" +echo "Date: $(date)" +echo "Node: $(hostname)" +echo "" + +echo "=== GPU Info ===" +nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv +echo "" + +# Kill any existing vLLM servers +echo "Cleaning up any existing vLLM servers..." +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +echo "============================================" +echo "Starting vLLM Servers" +echo "============================================" + +# Start 70B AWQ server on GPU 0-1 (needs 2 GPUs for tensor parallelism) +echo "" +echo "Starting 70B AWQ vLLM Server (GPU 0-1, TP=2)..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_70B_AWQ \ + --port $PORT_70B \ + --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 \ + --max-model-len 4096 \ + --disable-log-requests \ + --quantization awq \ + --dtype float16 & +SERVER_70B_PID=$! +echo "70B Server PID: $SERVER_70B_PID" + +# Start 8B server on GPU 2 +echo "" +echo "Starting 8B vLLM Server (GPU 2)..." +CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B \ + --port $PORT_8B \ + --gpu-memory-utilization 0.90 \ + --max-model-len 4096 \ + --disable-log-requests \ + --dtype bfloat16 & +SERVER_8B_PID=$! +echo "8B Server PID: $SERVER_8B_PID" + +echo "" +echo "Waiting for servers to start..." 
+ +# Wait for 70B (may take 3-5 minutes) +for i in $(seq 1 120); do + if curl -s http://localhost:$PORT_70B/health > /dev/null 2>&1; then + echo "70B Server ready after $((i*3)) seconds" + break + fi + if [ $((i % 20)) -eq 0 ]; then + echo " Waiting for 70B... ($((i*3)) seconds)" + fi + sleep 3 +done + +# Wait for 8B +for i in $(seq 1 60); do + if curl -s http://localhost:$PORT_8B/health > /dev/null 2>&1; then + echo "8B Server ready after $((i*2)) seconds" + break + fi + sleep 2 +done + +# Check both servers +echo "" +if ! curl -s http://localhost:$PORT_70B/health > /dev/null 2>&1; then + echo "ERROR: 70B server failed to start" + kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null + exit 1 +fi +echo "✓ 70B server healthy" + +if ! curl -s http://localhost:$PORT_8B/health > /dev/null 2>&1; then + echo "ERROR: 8B server failed to start" + kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null + exit 1 +fi +echo "✓ 8B server healthy" + +echo "" +echo "=== vLLM Server Info ===" +echo "70B model:" +curl -s http://localhost:$PORT_70B/v1/models | python -m json.tool 2>/dev/null | head -10 +echo "" +echo "8B model:" +curl -s http://localhost:$PORT_8B/v1/models | python -m json.tool 2>/dev/null | head -10 + +echo "" +echo "============================================" +echo "Test 1: Sequential Processing (1 profile)" +echo "============================================" + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts + +time python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 1 \ + --n-sessions 3 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_70B/v1 \ + --vllm-agent-url http://localhost:$PORT_8B/v1 \ + --parallel-profiles 1 \ + --output-dir ../results/parallel_test_seq \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl + +echo "" +echo "============================================" +echo "Test 2: Parallel Processing (4 profiles)" +echo 
"============================================" + +time python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 4 \ + --n-sessions 3 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_70B/v1 \ + --vllm-agent-url http://localhost:$PORT_8B/v1 \ + --parallel-profiles 4 \ + --output-dir ../results/parallel_test_4 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl + +echo "" +echo "============================================" +echo "Test 3: Parallel Processing (8 profiles)" +echo "============================================" + +time python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 8 \ + --n-sessions 3 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_70B/v1 \ + --vllm-agent-url http://localhost:$PORT_8B/v1 \ + --parallel-profiles 8 \ + --output-dir ../results/parallel_test_8 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl + +echo "" +echo "============================================" +echo "Test 4: Parallel Processing (16 profiles)" +echo "============================================" + +time python run_experiments.py \ + --methods vanilla \ + --datasets mmlu \ + --n-profiles 16 \ + --n-sessions 3 \ + --use-vllm \ + --vllm-user-url http://localhost:$PORT_70B/v1 \ + --vllm-agent-url http://localhost:$PORT_8B/v1 \ + --parallel-profiles 16 \ + --output-dir ../results/parallel_test_16 \ + --profile-path ../data/complex_profiles_v2/profiles_100.jsonl + +# Cleanup +echo "" +echo "Cleaning up..." +kill $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null +wait $SERVER_70B_PID $SERVER_8B_PID 2>/dev/null + +echo "" +echo "============================================" +echo "TEST COMPLETE!" +echo "============================================" +echo "" +echo "Compare the timing results above to estimate optimal parallelism." 
+echo "Expected scaling: Higher parallelism → Higher throughput (until bottleneck)" +echo "" +date diff --git a/collaborativeagents/scripts/test_rag_empty.sbatch b/collaborativeagents/scripts/test_rag_empty.sbatch new file mode 100644 index 0000000..735adbc --- /dev/null +++ b/collaborativeagents/scripts/test_rag_empty.sbatch @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=rag_empty +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=250G +#SBATCH --time=03:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.err + +# Test RAG with EMPTY memory store - start fresh and accumulate +# 5 profiles, 15 sessions each (more sessions to test accumulation) +# Compare: vanilla, rag, rag_vector + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +echo "=== RAG Empty Memory Store Test ===" +echo "Key change: Starting with EMPTY memory store" +echo " - RAG will accumulate memories during evaluation" +echo " - Each user builds their own memory basket from scratch" +echo "" +echo "Settings: 5 profiles, 15 sessions each" +echo "User simulator: $USER_MODEL (70B)" +echo 
"Agent: $AGENT_MODEL (8B)" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Clear empty store before each run to ensure fresh start +echo "" +echo "Clearing empty memory store..." +> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl +rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy +echo "Memory store cleared." + +# Start vLLM servers with adjusted memory allocation +echo "" +echo "Starting vLLM servers..." + +# User simulator on GPUs 0,1 (70B, TP=2) +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.85 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME \ + --disable-log-requests & +USER_PID=$! + +# Agent on GPUs 2,3 (8B, TP=2) - reduced memory for embedding/reranker headroom +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.40 \ + --max-model-len 16384 --dtype bfloat16 \ + --disable-log-requests & +AGENT_PID=$! + +# Wait for servers +echo "Waiting for vLLM servers (may take 5-10 min)..." +for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator (8004) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent (8003) ready after $((i*5)) seconds" + break + fi + sleep 5 +done + +if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "ERROR: User server not healthy" + kill $USER_PID $AGENT_PID 2>/dev/null + exit 1 +fi +if ! 
curl -s http://localhost:8003/health > /dev/null 2>&1; then
+ echo "ERROR: Agent server not healthy"
+ kill $USER_PID $AGENT_PID 2>/dev/null
+ exit 1
+fi
+echo "Both vLLM servers ready"
+sleep 5
+
+OUTPUT_DIR="../results/rag_empty_test_$(date +%Y%m%d_%H%M%S)"
+
+# Run methods sequentially (each starts with fresh empty memory)
+for METHOD in vanilla rag rag_vector; do
+ echo ""
+ echo "============================================"
+ echo "Testing method: $METHOD"
+ echo "============================================"
+
+ # Clear memory store before each method for fair comparison
+ > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl
+ rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy
+ echo "Memory store cleared for $METHOD"
+
+ date
+ START=$(date +%s)
+
+ python scripts/run_experiments.py --methods $METHOD \
+ --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+ --use-vllm --no-batch-processing --parallel-profiles 5 \
+ --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+ # BUGFIX: capture the experiment's exit status immediately. The date/wc/echo
+ # statements below overwrite $?, so the original SUCCESS/FAILED check tested
+ # the status of the last echo and always reported SUCCESS.
+ RC=$?
+
+ END=$(date +%s)
+ ELAPSED=$((END-START))
+
+ # Show memory accumulation stats for RAG methods
+ if [[ "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then
+ CARD_COUNT=$(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl 2>/dev/null || echo 0)
+ echo "Memory cards accumulated: $CARD_COUNT"
+ fi
+
+ if [ $RC -eq 0 ]; then
+ echo "Method $METHOD: SUCCESS (${ELAPSED}s)"
+ else
+ echo "Method $METHOD: FAILED after ${ELAPSED}s"
+ fi
+done
+
+echo ""
+echo "============================================"
+echo "RAG Empty Memory Test Complete"
+echo "============================================"
+echo "Results saved to: $OUTPUT_DIR"
+date
+
+# Cleanup
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_rag_empty_v2.sbatch b/collaborativeagents/scripts/test_rag_empty_v2.sbatch
new file mode 100644
index 0000000..834dccb
--- /dev/null
+++ b/collaborativeagents/scripts/test_rag_empty_v2.sbatch
@@ -0,0 +1,124 @@
+#!/bin/bash
+#SBATCH --job-name=rag_empty
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:8
+#SBATCH --mem=400G
+#SBATCH --time=04:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.err
+
+# Test RAG with EMPTY memory store - start fresh and accumulate
+# Using 8 GPUs with TP=4 for 70B to avoid CUDA errors
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export NCCL_P2P_DISABLE=1
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
+AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
+USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
+
+echo "=== RAG Empty Memory Store Test v2 ==="
+echo "Using 8 GPUs: TP=4 for 70B user sim, TP=2 for 8B agent" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Clear empty store +echo "Clearing empty memory store..." +> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl +rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy + +# Start vLLM servers +echo "Starting vLLM servers..." + +# User simulator on GPUs 0-3 (70B, TP=4) +CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 4 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME \ + --disable-log-requests & +USER_PID=$! + +# Agent on GPUs 4-5 (8B, TP=2) +CUDA_VISIBLE_DEVICES=4,5 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.40 \ + --max-model-len 16384 --dtype bfloat16 \ + --disable-log-requests & +AGENT_PID=$! + +# Wait for servers +echo "Waiting for vLLM servers..." +for i in {1..300}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator (8004) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +for i in {1..120}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent (8003) ready after $((i*5)) seconds" + break + fi + sleep 5 +done + +if ! curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "ERROR: User server not healthy" + kill $USER_PID $AGENT_PID 2>/dev/null + exit 1 +fi +if ! 
curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "ERROR: Agent server not healthy" + kill $USER_PID $AGENT_PID 2>/dev/null + exit 1 +fi +echo "Both vLLM servers ready" +sleep 5 + +OUTPUT_DIR="../results/rag_empty_test_$(date +%Y%m%d_%H%M%S)" + +for METHOD in vanilla rag rag_vector; do + echo "" + echo "============================================" + echo "Testing method: $METHOD" + echo "============================================" + + # Clear memory store before each method + > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl + rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy + + date + START=$(date +%s) + + python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \ + --use-vllm --no-batch-processing --parallel-profiles 5 \ + --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH + + END=$(date +%s) + ELAPSED=$((END-START)) + + if [[ "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then + CARD_COUNT=$(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl 2>/dev/null || echo 0) + echo "Memory cards accumulated: $CARD_COUNT" + fi + + echo "Method $METHOD: completed in ${ELAPSED}s" +done + +echo "" +echo "=== Test Complete ===" +echo "Results: $OUTPUT_DIR" +date + +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/test_rag_empty_v3.sbatch b/collaborativeagents/scripts/test_rag_empty_v3.sbatch new file mode 100644 index 0000000..db9bd5c --- /dev/null +++ b/collaborativeagents/scripts/test_rag_empty_v3.sbatch @@ -0,0 +1,110 @@ +#!/bin/bash +#SBATCH --job-name=rag_empty +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:4 +#SBATCH --mem=200G +#SBATCH 
--time=03:00:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.err + +# Test RAG with EMPTY memory store using previous working settings +# 4 GPUs, TP=2 for both models (same as smallscale_test.sbatch) + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH" + +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +echo "=== RAG Empty Memory Store Test ===" +echo "Using previous working settings (4 GPUs, TP=2)" +echo "Settings: 5 profiles, 15 sessions each" +date + +# Clear empty store +echo "Clearing empty memory store..." 
+> /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl +rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy + +# Start vLLM servers (same settings as smallscale_test.sbatch) +# User simulator on GPUs 0,1 (70B, TP=2) +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME & + +# Agent on GPUs 2,3 (8B, TP=2, lower memory for embedding/reranker) +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \ + --max-model-len 16384 --dtype bfloat16 & + +# Wait for servers +echo "Waiting for vLLM servers..." +for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator (8004) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent (8003) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +echo "Both vLLM servers ready" +sleep 10 + +OUTPUT_DIR="../results/rag_empty_test_$(date +%Y%m%d_%H%M%S)" + +for METHOD in vanilla rag rag_vector; do + echo "" + echo "============================================" + echo "Testing method: $METHOD" + echo "============================================" + + # Clear memory store before each method for fair comparison + > /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl + rm -f /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_embeddings.npy + echo "Memory store cleared for $METHOD" + + date + START=$(date +%s) + + python scripts/run_experiments.py --methods $METHOD \ 
+ --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
+ --use-vllm --no-batch-processing --parallel-profiles 5 \
+ --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH
+ # BUGFIX: capture the experiment's exit status immediately. The date/wc/echo
+ # statements below overwrite $?, so the original SUCCESS/FAILED check tested
+ # the status of the last echo and always reported SUCCESS.
+ RC=$?
+
+ END=$(date +%s)
+ ELAPSED=$((END-START))
+
+ if [[ "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then
+ CARD_COUNT=$(wc -l < /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store/memory_cards.jsonl 2>/dev/null || echo 0)
+ echo "Memory cards accumulated: $CARD_COUNT"
+ fi
+
+ if [ $RC -eq 0 ]; then
+ echo "Method $METHOD: SUCCESS (${ELAPSED}s)"
+ else
+ echo "Method $METHOD: FAILED after ${ELAPSED}s"
+ fi
+done
+
+echo ""
+echo "=== Test Complete ==="
+echo "Results: $OUTPUT_DIR"
+date
+
+pkill -f "vllm.entrypoints" 2>/dev/null || true
diff --git a/collaborativeagents/scripts/test_rag_fix.sbatch b/collaborativeagents/scripts/test_rag_fix.sbatch
new file mode 100644
index 0000000..b07d286
--- /dev/null
+++ b/collaborativeagents/scripts/test_rag_fix.sbatch
@@ -0,0 +1,123 @@
+#!/bin/bash
+#SBATCH --job-name=test_rag_fix
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:4
+#SBATCH --mem=200G
+#SBATCH --time=02:00:00
+#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_rag_fix-%j.out
+#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_rag_fix-%j.err
+
+# Small-scale test: 5 profiles, 10 sessions each
+# Tests RAG fixes: extract_session accumulation, nopersonal mode
+# Compare: vanilla, rag, rag_vector
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src:$PYTHONPATH"
+
+PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" +AGENT_MODEL="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +USER_MODEL="meta-llama/Llama-3.1-70B-Instruct" + +echo "=== RAG Fix Verification Test ===" +echo "Testing fixes:" +echo " 1. extract_session (accumulate all turns)" +echo " 2. RAG mode=nopersonal (pure dense+rerank)" +echo " 3. Explicit normalize=True" +echo "" +echo "Settings: 5 profiles, 10 sessions each, max 15 turns" +echo "User simulator: $USER_MODEL (70B)" +echo "Agent: $AGENT_MODEL (8B)" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Start vLLM servers +# User simulator on GPUs 0,1 (70B, TP=2) +echo "" +echo "Starting vLLM servers..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $USER_MODEL \ + --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.90 \ + --max-model-len 16384 --dtype bfloat16 --download-dir $HF_HOME \ + --disable-log-requests & +USER_PID=$! + +# Agent on GPUs 2,3 (8B, TP=2, lower memory for embedding/reranker) +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $AGENT_MODEL \ + --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.45 \ + --max-model-len 16384 --dtype bfloat16 \ + --disable-log-requests & +AGENT_PID=$! + +# Wait for servers +echo "Waiting for vLLM servers (may take 5-10 min)..." +for i in {1..200}; do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "User simulator (8004) ready after $((i*5)) seconds" + break + fi + sleep 5 +done +for i in {1..60}; do + if curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "Agent (8003) ready after $((i*5)) seconds" + break + fi + sleep 5 +done + +if ! 
curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "ERROR: User server not healthy" + kill $USER_PID $AGENT_PID 2>/dev/null + exit 1 +fi +if ! curl -s http://localhost:8003/health > /dev/null 2>&1; then + echo "ERROR: Agent server not healthy" + kill $USER_PID $AGENT_PID 2>/dev/null + exit 1 +fi +echo "Both vLLM servers ready" +sleep 5 + +# Test methods: vanilla (baseline), rag (fixed), rag_vector (fixed) +OUTPUT_DIR="../results/rag_fix_test_$(date +%Y%m%d_%H%M%S)" + +for METHOD in vanilla rag rag_vector; do + echo "" + echo "============================================" + echo "Testing method: $METHOD" + echo "============================================" + date + START=$(date +%s) + + python scripts/run_experiments.py --methods $METHOD \ + --datasets math-hard --n-profiles 5 --n-sessions 10 --max-turns 15 \ + --use-vllm --no-batch-processing --parallel-profiles 5 \ + --output-dir $OUTPUT_DIR --profile-path $PROFILE_PATH + + RC=$?; END=$(date +%s) + ELAPSED=$((END-START)) + + if [ $RC
-eq 0 ]; then + echo "Method $METHOD: SUCCESS (${ELAPSED}s)" + else + echo "Method $METHOD: FAILED after ${ELAPSED}s" + fi +done + +echo "" +echo "============================================" +echo "RAG Fix Test Complete" +echo "============================================" +echo "Results saved to: $OUTPUT_DIR" +date + +# Cleanup +pkill -f "vllm.entrypoints" 2>/dev/null || true diff --git a/collaborativeagents/scripts/test_real_speed.sbatch b/collaborativeagents/scripts/test_real_speed.sbatch new file mode 100644 index 0000000..ff914e6 --- /dev/null +++ b/collaborativeagents/scripts/test_real_speed.sbatch @@ -0,0 +1,87 @@ +#!/bin/bash +#SBATCH --job-name=test_real +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8-interactive +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH --time=00:30:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_real-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_real-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +PROFILE_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl" + +echo "=== Real Speed Test (5 profiles, 5 sessions) ===" +date +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +# Test 1: contextual (vLLM-based) +echo "" +echo "=== TEST 1: contextual (2 vLLM servers) ===" +CUDA_VISIBLE_DEVICES=0,1 python -m 
vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + [ "$u" = "1" ] && [ "$a" = "1" ] && echo "Ready after $((i*2))s" && break + sleep 2 +done + +time python scripts/run_experiments.py \ + --methods contextual \ + --datasets math-hard \ + --n-profiles 5 --n-sessions 5 --max-turns 10 \ + --use-vllm --parallel-profiles 5 \ + --output-dir ../results/speed_test \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 5 + +# Test 2: all_memory (vLLM + transformers) +echo "" +echo "=== TEST 2: all_memory (vLLM user + transformers adapter) ===" +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + curl -s http://localhost:8004/health > /dev/null 2>&1 && echo "Ready after $((i*2))s" && break + sleep 2 +done + +CUDA_VISIBLE_DEVICES=2,3 time python scripts/run_experiments.py \ + --methods all_memory \ + --datasets math-hard \ + --n-profiles 5 --n-sessions 5 --max-turns 10 \ + --use-vllm --parallel-profiles 5 \ + --output-dir ../results/speed_test \ + --profile-path "$PROFILE_PATH" + +pkill -f "vllm.entrypoints" 2>/dev/null || true + +echo "" +echo "=== DONE ===" +date diff --git a/collaborativeagents/scripts/test_vllm_adapter.sh b/collaborativeagents/scripts/test_vllm_adapter.sh 
new file mode 100755 index 0000000..af22667 --- /dev/null +++ b/collaborativeagents/scripts/test_vllm_adapter.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Test vLLM with 45% memory + ContextualAdapter loading + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" + +echo "=== Testing vLLM 45% memory + Adapter ===" +echo "GPUs available:" +nvidia-smi --query-gpu=index,name,memory.total --format=csv + +# Kill any existing vLLM +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +echo "" +echo "Memory before vLLM:" +nvidia-smi --query-gpu=index,memory.used --format=csv + +echo "" +echo "Starting vLLM with 45% memory on GPU 0,1..." +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.45 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +VLLM_PID=$! +echo "vLLM PID: $VLLM_PID" + +echo "Waiting for vLLM to start..." +for i in $(seq 1 60); do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "vLLM ready after $((i*2))s" + break + fi + sleep 2 +done + +echo "" +echo "Memory after vLLM started:" +nvidia-smi --query-gpu=index,memory.used --format=csv + +echo "" +echo "Testing ContextualAdapter loading..." 
+python -c " +import sys +sys.path.insert(0, 'collaborativeagents') +sys.path.insert(0, 'src') + +from adapters.contextual_adapter import ContextualAdapter +print('Creating ContextualAdapter...') +adapter = ContextualAdapter() +print('Initializing (loading model)...') +adapter.initialize() +print('Testing generation...') +adapter.start_session('test') +result = adapter.generate_response('What is 2+2?') +print(f'Response: {result[\"response\"][:100]}') +print('SUCCESS: ContextualAdapter works with vLLM running!') +" + +echo "" +echo "Final memory usage:" +nvidia-smi --query-gpu=index,memory.used --format=csv + +# Cleanup +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "Test complete!" diff --git a/collaborativeagents/scripts/test_vllm_interactive.sh b/collaborativeagents/scripts/test_vllm_interactive.sh new file mode 100755 index 0000000..5da73b4 --- /dev/null +++ b/collaborativeagents/scripts/test_vllm_interactive.sh @@ -0,0 +1,212 @@ +#!/bin/bash +# Test vLLM inference speed on interactive node +# +# Usage: +# 1. Get an interactive node: +# srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash +# +# 2. Run this script: +# bash scripts/test_vllm_interactive.sh +# +# This script will: +# 1. Start vLLM server for 8B model (agent) +# 2. Start vLLM server for 70B AWQ model (user simulator) +# 3. Run benchmarks +# 4. 
Compare with paper's 2000 conv/hr target + +set -e + +# Paths +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" +MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4" +HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface" + +# Ports +PORT_8B=8003 +PORT_70B=8004 + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}======================================${NC}" +echo -e "${GREEN} vLLM Inference Speed Test${NC}" +echo -e "${GREEN}======================================${NC}" +echo "" + +# Check GPU availability +echo -e "${YELLOW}Checking GPUs...${NC}" +nvidia-smi --query-gpu=index,name,memory.total --format=csv +NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l) +echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs" +echo "" + +if [ "$NUM_GPUS" -lt 4 ]; then + echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}" +fi + +# Setup environment +export HF_HOME=$HF_CACHE +export TRANSFORMERS_CACHE=$HF_CACHE + +# Activate conda environment if needed +# source /path/to/conda/etc/profile.d/conda.sh +# conda activate your_env + +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents + +# Function to start vLLM server +start_vllm_server() { + local model=$1 + local port=$2 + local gpus=$3 + local extra_args=$4 + local logfile=$5 + + echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" + echo "Model: $model" + + CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port $port \ + --gpu-memory-utilization 0.9 \ + --max-model-len 8192 \ + $extra_args \ + > $logfile 2>&1 & + + echo $! 
+} + +# Function to wait for server to be ready +wait_for_server() { + local port=$1 + local max_wait=300 # 5 minutes + local waited=0 + + echo -n "Waiting for server on port $port" + while [ $waited -lt $max_wait ]; do + if curl -s http://localhost:$port/health > /dev/null 2>&1; then + echo -e " ${GREEN}Ready!${NC}" + return 0 + fi + echo -n "." + sleep 5 + waited=$((waited + 5)) + done + + echo -e " ${RED}Timeout!${NC}" + return 1 +} + +# Cleanup function +cleanup() { + echo -e "\n${YELLOW}Cleaning up...${NC}" + if [ ! -z "$PID_8B" ]; then + kill $PID_8B 2>/dev/null || true + fi + if [ ! -z "$PID_70B" ]; then + kill $PID_70B 2>/dev/null || true + fi + echo "Done." +} + +trap cleanup EXIT + +# Create log directory +mkdir -p logs + +# ============================================ +# Test 1: 8B model only (single GPU) +# ============================================ +echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}" + +PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b.log") +echo "Server PID: $PID_8B" + +if wait_for_server $PORT_8B; then + echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}" + python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 20 + + echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}" + python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 50 --concurrent +else + echo -e "${RED}Failed to start 8B server${NC}" +fi + +# Stop 8B server +kill $PID_8B 2>/dev/null || true +sleep 5 + +# ============================================ +# Test 2: 70B AWQ model (4 GPUs with tensor parallelism) +# ============================================ +echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}" + +if [ "$NUM_GPUS" -ge 4 ]; then + PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log") + echo "Server PID: $PID_70B" + + if wait_for_server $PORT_70B; then + echo -e 
"\n${YELLOW}Running 70B benchmark (20 requests)...${NC}" + python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 20 + + echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}" + python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 50 --concurrent + else + echo -e "${RED}Failed to start 70B server${NC}" + echo "Check logs/vllm_70b.log for errors" + fi + + # Stop 70B server + kill $PID_70B 2>/dev/null || true + sleep 5 +else + echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}" +fi + +# ============================================ +# Test 3: Full conversation simulation +# ============================================ +echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}" + +if [ "$NUM_GPUS" -ge 4 ]; then + # Start both servers + # 8B on GPU 0, 70B on GPUs 1,2,3 (tensor parallel 3) + # Or split differently based on memory + + echo "Starting 8B server on GPU 0..." + PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b_conv.log") + + echo "Starting 70B server on GPUs 1,2,3..." + PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log") + + wait_for_server $PORT_8B + wait_for_server $PORT_70B + + if [ $? 
-eq 0 ]; then + echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}" + python scripts/benchmark_inference.py --mode conversation \ + --url-8b http://localhost:$PORT_8B/v1 \ + --url-70b http://localhost:$PORT_70B/v1 \ + -n 10 + fi +else + echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}" +fi + +# ============================================ +# Summary +# ============================================ +echo -e "\n${GREEN}======================================${NC}" +echo -e "${GREEN} Test Complete!${NC}" +echo -e "${GREEN}======================================${NC}" +echo "" +echo "Target: 2000 conversations/hour (paper on H100x8)" +echo "" +echo "Check the benchmark results above to see how close we are." +echo "If throughput is still low, check:" +echo " 1. GPU utilization during tests (nvidia-smi dmon -s u)" +echo " 2. vLLM logs in logs/*.log" +echo " 3. Network latency if using remote servers" diff --git a/collaborativeagents/scripts/test_vllm_speed.sbatch b/collaborativeagents/scripts/test_vllm_speed.sbatch new file mode 100644 index 0000000..070df5d --- /dev/null +++ b/collaborativeagents/scripts/test_vllm_speed.sbatch @@ -0,0 +1,130 @@ +#!/bin/bash +#SBATCH --job-name=test_vllm_speed +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=128G +#SBATCH --time=00:30:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_speed-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_speed-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + 
+MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" + +echo "=== vLLM Speed Test ===" +date +nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +echo "" +echo "=== Test 1: ContextualAdapter with vLLM (2 servers) ===" +echo "Starting vLLM servers on GPU 0,1 (user) and GPU 2,3 (agent)..." + +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$u" = "1" ] && [ "$a" = "1" ]; then + echo "Both servers ready after $((i*2))s"; break + fi + sleep 2 +done + +python -c " +import time +import sys +sys.path.insert(0, '.') +from adapters.contextual_adapter import ContextualAdapter + +print('Testing ContextualAdapter with vLLM...') +adapter = ContextualAdapter(vllm_url='http://localhost:8003/v1') +adapter.initialize() +adapter.start_session('test_user') + +# Warm up +adapter.generate_response('Hello') + +# Benchmark +n_requests = 20 +start = time.time() +for i in range(n_requests): + resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? 
Give a brief answer.') +elapsed = time.time() - start + +print(f'ContextualAdapter (vLLM): {n_requests} requests in {elapsed:.2f}s') +print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr') +print(f'Estimated sessions/hr (assuming 5 turns/session): {n_requests/elapsed*3600/5:.0f}') +" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 5 + +echo "" +echo "=== Test 2: PersonalizedLLMAdapter (vLLM user + transformers adapter) ===" +echo "Starting vLLM on GPU 0,1 for user simulation..." + +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "Server ready after $((i*2))s"; break + fi + sleep 2 +done + +echo "Loading PersonalizedLLMAdapter on GPU 2,3..." +CUDA_VISIBLE_DEVICES=2,3 python -c " +import time +import sys +sys.path.insert(0, '.') +from adapters.personalized_llm_adapter import create_baseline_adapter + +print('Testing PersonalizedLLMAdapter (all_memory mode)...') +adapter = create_baseline_adapter('all_memory') +adapter.initialize() +adapter.start_session('test_user') + +# Warm up +adapter.generate_response('Hello') + +# Benchmark +n_requests = 10 +start = time.time() +for i in range(n_requests): + resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? 
Give a brief answer.') +elapsed = time.time() - start + +print(f'PersonalizedLLMAdapter (transformers): {n_requests} requests in {elapsed:.2f}s') +print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr') +print(f'Estimated sessions/hr (assuming 5 turns/session): {n_requests/elapsed*3600/5:.0f}') +" + +pkill -f "vllm.entrypoints" 2>/dev/null || true + +echo "" +echo "=== Test Complete ===" +date diff --git a/collaborativeagents/scripts/test_vllm_speed_a100.sbatch b/collaborativeagents/scripts/test_vllm_speed_a100.sbatch new file mode 100644 index 0000000..7695cfc --- /dev/null +++ b/collaborativeagents/scripts/test_vllm_speed_a100.sbatch @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=test_vllm_a100 +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuA100x4 +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=128G +#SBATCH --time=00:30:00 +#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_a100-%j.out +#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_a100-%j.err + +set -e +cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents +source /u/yurenh2/miniforge3/etc/profile.d/conda.sh +conda activate eval + +export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface +export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}" + +MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct" + +echo "=== vLLM Speed Test (A100) ===" +date +nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 2 + +echo "" +echo "=== Test 1: ContextualAdapter with vLLM (2 servers) ===" + +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + 
--gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8003 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0) + a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0) + if [ "$u" = "1" ] && [ "$a" = "1" ]; then + echo "Both servers ready after $((i*2))s"; break + fi + sleep 2 +done + +python -c " +import time +import sys +sys.path.insert(0, '.') +from adapters.contextual_adapter import ContextualAdapter + +print('Testing ContextualAdapter with vLLM...') +adapter = ContextualAdapter(vllm_url='http://localhost:8003/v1') +adapter.initialize() +adapter.start_session('test_user') + +# Warm up +adapter.generate_response('Hello') + +# Benchmark +n_requests = 20 +start = time.time() +for i in range(n_requests): + resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? 
Give a brief answer.') +elapsed = time.time() - start + +print(f'ContextualAdapter (vLLM): {n_requests} requests in {elapsed:.2f}s') +print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr') +print(f'Estimated sessions/hr (5 turns/session): {n_requests/elapsed*3600/5:.0f}') +" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +sleep 5 + +echo "" +echo "=== Test 2: PersonalizedLLMAdapter (vLLM user + transformers) ===" + +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_8B --port 8004 --tensor-parallel-size 2 \ + --gpu-memory-utilization 0.90 --max-model-len 8192 \ + --disable-log-requests --dtype bfloat16 & + +for i in $(seq 1 120); do + if curl -s http://localhost:8004/health > /dev/null 2>&1; then + echo "Server ready after $((i*2))s"; break + fi + sleep 2 +done + +CUDA_VISIBLE_DEVICES=2,3 python -c " +import time +import sys +sys.path.insert(0, '.') +from adapters.personalized_llm_adapter import create_baseline_adapter + +print('Testing PersonalizedLLMAdapter (all_memory)...') +adapter = create_baseline_adapter('all_memory') +adapter.initialize() +adapter.start_session('test_user') + +# Warm up +adapter.generate_response('Hello') + +# Benchmark +n_requests = 10 +start = time.time() +for i in range(n_requests): + resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? 
Give a brief answer.') +elapsed = time.time() - start + +print(f'PersonalizedLLMAdapter (transformers): {n_requests} requests in {elapsed:.2f}s') +print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr') +print(f'Estimated sessions/hr (5 turns/session): {n_requests/elapsed*3600/5:.0f}') +" + +pkill -f "vllm.entrypoints" 2>/dev/null || true +echo "" +echo "=== Test Complete ===" +date diff --git a/collaborativeagents/scripts/visualize.py b/collaborativeagents/scripts/visualize.py new file mode 100644 index 0000000..2cf7369 --- /dev/null +++ b/collaborativeagents/scripts/visualize.py @@ -0,0 +1,492 @@ +import json +from itertools import zip_longest +import textwrap + +def load_data(filepath): + """Load user data from a JSONL file""" + users = [] + with open(filepath, 'r') as f: + for line in f: + if line.strip(): + users.append(json.loads(line)) + return users + +def format_conversation(conv, file_label): + """Format a conversation into a list of lines""" + lines = [] + lines.append(f">>> {file_label} <<<") + lines.append("") + + if 'conversation' in conv: + for i, msg in enumerate(conv['conversation'], 1): + role = msg.get('role', 'unknown').upper() + content = msg.get('content', '') + lines.append(f"[{i}] {role}:") + # Split content into lines and indent + for content_line in content.split('\n'): + lines.append(f" {content_line}") + + if i-2 < len(conv['full_conversation_log']): + full_conversation_log_msg = conv['full_conversation_log'][i-2] + if 'enforce_preferences' in full_conversation_log_msg and (full_conversation_log_msg['enforce_preferences'] == True or full_conversation_log_msg['enforce_preferences'] == "True"): + lines.append(f"<<<<< Enforced preference >>>>>") + + lines.append("") # Empty line after each message + + # Format evaluation + if 'evaluation' in conv: + lines.append("[EVALUATION]") + eval_data = conv['evaluation'] + + if 'final_answer' in eval_data: + lines.append(f"• Final Answer: 
{eval_data['final_answer']}") + + if 'accuracy' in eval_data: + acc = eval_data['accuracy']['accuracy'] + acc_symbol = "✓" if acc == 1 else "✗" + lines.append(f"• Accuracy: {acc} {acc_symbol}") + + num_enforced_preferences = len([message for message in conv['full_conversation_log'] if 'enforce_preferences' in message and (message['enforce_preferences'] == True or message['enforce_preferences'] == "True")]) + lines.append(f"• Number of enforced preferences: {num_enforced_preferences}") + + if 'conversation_length' in eval_data: + lines.append(f"• Length: {eval_data['conversation_length']} msgs") + + return lines + +def wrap_lines(lines, width): + """Wrap lines to specified width""" + wrapped = [] + for line in lines: + if len(line) <= width: + wrapped.append(line) + else: + # Wrap the line + wrapped_line = textwrap.wrap(line, width=width, break_long_words=True, break_on_hyphens=False) + wrapped.extend(wrapped_line) + return wrapped + +def calculate_aggregate_stats(users_data): + """Calculate aggregate statistics across all users""" + all_accuracies = [] + all_lengths = [] + all_enforced_counts = [] + + for user_data in users_data: + if 'generated_conversations' in user_data: + for conv in user_data['generated_conversations']: + # Collect accuracy + if 'evaluation' in conv and 'accuracy' in conv['evaluation']: + all_accuracies.append(conv['evaluation']['accuracy']['accuracy']) + + # Collect conversation length + if 'evaluation' in conv and 'conversation_length' in conv['evaluation']: + all_lengths.append(conv['evaluation']['conversation_length']) + + # Collect enforced preferences count + if 'full_conversation_log' in conv: + count = len([msg for msg in conv['full_conversation_log'] + if 'enforce_preferences' in msg and (msg['enforce_preferences'] == True or msg['enforce_preferences'] == "True")]) + all_enforced_counts.append(count) + + avg_accuracy = sum(all_accuracies) / len(all_accuracies) if all_accuracies else 0 + avg_length = sum(all_lengths) / len(all_lengths) 
if all_lengths else 0 + avg_enforced = sum(all_enforced_counts) / len(all_enforced_counts) if all_enforced_counts else 0 + + return avg_accuracy, avg_length, avg_enforced + +def print_side_by_side(conv1, conv2, label1, label2, col_width=60): + """Print two conversations side by side""" + lines1 = format_conversation(conv1, label1) + lines2 = format_conversation(conv2, label2) + + # Wrap lines to fit column width + lines1 = wrap_lines(lines1, col_width) + lines2 = wrap_lines(lines2, col_width) + + # Print header + print(f"\n{label1:<{col_width}} | {label2}") + print(f"{'-'*col_width} | {'-'*col_width}") + + # Print lines side by side + for line1, line2 in zip_longest(lines1, lines2, fillvalue=''): + # Pad line1 to col_width + line1 = line1.ljust(col_width) + + print(f"{line1} | {line2}") + +def print_side_by_side_3(conv1, conv2, conv3, label1, label2, label3, col_width=42): + """Print three conversations side by side""" + lines1 = format_conversation(conv1, label1) + lines2 = format_conversation(conv2, label2) + lines3 = format_conversation(conv3, label3) + + # Wrap lines to fit column width + lines1 = wrap_lines(lines1, col_width) + lines2 = wrap_lines(lines2, col_width) + lines3 = wrap_lines(lines3, col_width) + + # Print header + print(f"\n{label1:<{col_width}} | {label2:<{col_width}} | {label3}") + print(f"{'-'*col_width} | {'-'*col_width} | {'-'*col_width}") + + # Print lines side by side + for line1, line2, line3 in zip_longest(lines1, lines2, lines3, fillvalue=''): + # Pad line1 and line2 to col_width + line1 = line1.ljust(col_width) + line2 = line2.ljust(col_width) + + print(f"{line1} | {line2} | {line3}") + +def format_detailed_full_log(conv, file_label): + """Format detailed conversation including all fields from full_conversation_log""" + lines = [] + lines.append(f">>> {file_label} — FULL LOG <<<") + lines.append("") + + if 'full_conversation_log' in conv and conv['full_conversation_log']: + for j, msg in enumerate(conv['full_conversation_log'], 1): + # 
Alternate roles starting with USER + role_label = 'USER' if j % 2 == 1 else 'ASSISTANT' + lines.append(f"[{j}] {role_label}:") + + def is_enforced(value): + return value is True or value == "True" or value == "true" + + # 1) Response first (as plain text) + response_text = msg.get('response') + if response_text is not None: + for line in str(response_text).split('\n'): + lines.append(f"{line}") + + # 1a) Enforcement tag if applicable + if 'enforce_preferences' in msg and is_enforced(msg['enforce_preferences']): + lines.append("<<<<< Preferences Enforced >>>>>") + + # 2) Ordered keys as bulleted items + ordered_keys = [ + 'preference_1_satisfied', + 'preference_2_satisfied', + 'preference_3_satisfied', + 'enforce_preferences', + 'draft_answer', + 'reasoning', + 'should_terminate', + ] + + def append_bullet(key, value): + if isinstance(value, (dict, list)): + try: + pretty_value = json.dumps(value, indent=2, sort_keys=True, ensure_ascii=False) + except Exception: + pretty_value = str(value) + lines.append(f" - {key}:") + for ln in pretty_value.split('\n'): + lines.append(f" {ln}") + else: + value_str = str(value) if value is not None else "" + value_lines = value_str.split('\n') if value_str else [""] + # First line on the bullet + lines.append(f" - {key}: {value_lines[0]}") + # Continuation lines indented slightly further + for cont in value_lines[1:]: + lines.append(f" {cont}") + + for key in ordered_keys: + if key in msg: + append_bullet(key, msg.get(key)) + + # 3) Remaining keys grouped under Other fields + shown_keys = set(['response'] + ordered_keys) + remaining_keys = [k for k in msg.keys() if k not in shown_keys] + if remaining_keys: + lines.append(" - Other fields:") + for k in sorted(remaining_keys): + v = msg[k] + if isinstance(v, (dict, list)): + try: + pretty_v = json.dumps(v, indent=2, sort_keys=True, ensure_ascii=False) + except Exception: + pretty_v = str(v) + lines.append(f" {k}:") + for ln in pretty_v.split('\n'): + lines.append(f" {ln}") + else: + 
v_str = str(v) + v_lines = v_str.split('\n') if v_str else [""] + lines.append(f" {k}: {v_lines[0]}") + for cont in v_lines[1:]: + lines.append(f" {cont}") + + lines.append("") + else: + lines.append("[No full_conversation_log available]") + + # Include evaluation details if present + if 'evaluation' in conv: + lines.append("[EVALUATION — FULL]") + try: + eval_pretty = json.dumps(conv['evaluation'], indent=2, sort_keys=True, ensure_ascii=False) + except Exception: + eval_pretty = str(conv['evaluation']) + for line in eval_pretty.split('\n'): + lines.append(f" {line}") + + return lines + +def print_detailed_logs_3(conv1, conv2, conv3, label1, label2, label3, col_width=42): + """Print detailed logs for three conversations side by side""" + lines1 = wrap_lines(format_detailed_full_log(conv1, label1), col_width) + lines2 = wrap_lines(format_detailed_full_log(conv2, label2), col_width) + lines3 = wrap_lines(format_detailed_full_log(conv3, label3), col_width) + + print(f"\n{label1:<{col_width}} | {label2:<{col_width}} | {label3}") + print(f"{'-'*col_width} | {'-'*col_width} | {'-'*col_width}") + for line1, line2, line3 in zip_longest(lines1, lines2, lines3, fillvalue=''): + print(f"{line1.ljust(col_width)} | {line2.ljust(col_width)} | {line3}") + +def print_user_info(user_data): + """Print user profile information""" + print(f"\n[USER PROFILE]") + if 'i' in user_data: + print(f"User ID: {user_data['i']}") + if 'persona' in user_data: + print(f"Persona: {user_data['persona']}") + if 'preferences' in user_data: + print(f"Preferences:") + for preference in user_data['preferences']: + print(f" - {preference}") + + + + + + +for task in ["bigcodebench"]: # ["math_500", "logiqa", "math_hard", "medqa", "mmlu"]: + user_profiles_without_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/user_profiles_without_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_20.jsonl" + 
user_profiles_with_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/user_profiles_with_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_20.jsonl" + agent_with_userpreferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/agent_with_user_preferences/logiqa_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_20_v2.jsonl" + agnet_with_reflection_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/agent_with_reflection_v3/logiqa_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_20.jsonl" + + + file1_path = user_profiles_without_preferences_path + file2_path = user_profiles_with_preferences_path + file3_path = agent_with_userpreferences_path + file4_path = agnet_with_reflection_path + + + # Load users from all three files + data1 = load_data(file1_path) + data2 = load_data(file2_path) + data3 = load_data(file3_path) + data4 = load_data(file4_path) + + id_to_user_data1 = {elem['i']: elem for elem in data1+data2+data3+data4} + id_to_user_data2 = {elem['i']: elem for elem in data2} + id_to_user_data3 = {elem['i']: elem for elem in data3} + id_to_user_data4 = {elem['i']: elem for elem in data4} + + + + for id in id_to_user_data1: + if id != 23: continue + user_avg_acc1, user_avg_len1, user_avg_enf1 = calculate_aggregate_stats([id_to_user_data1[id]]) + user_avg_acc2, user_avg_len2, user_avg_enf2 = calculate_aggregate_stats([id_to_user_data2[id]]) + user_avg_acc3, user_avg_len3, user_avg_enf3 = calculate_aggregate_stats([id_to_user_data3[id]]) + user_avg_acc4, user_avg_len4, user_avg_enf4 = calculate_aggregate_stats([id_to_user_data4[id]]) + + + # print user info + print("\n" + "="*125 + "\n") + print(f"### Task: {task}\n") + print("LOGGING FOR USER ID: ", id) + print_user_info(id_to_user_data1[id]) + + # Print the average performance for id_to_user_data1[id] + # 
Print the average performance for id_to_user_data2[id] + print("\n" + "-"*125) + print("COMPARISON FOR THIS USER") + print("-"*125) + + + print("\nUser Without Preferences:") + print(f" Average Accuracy: {user_avg_acc1:.2f}") + print(f" Average # Messages: {user_avg_len1:.2f}") + print(f" Average # Enforced Preferences: {user_avg_enf1:.2f}") + + print("\nUser With Preferences:") + print(f" Average Accuracy: {user_avg_acc2:.2f}") + print(f" Average # Messages: {user_avg_len2:.2f}") + print(f" Average # Enforced Preferences: {user_avg_enf2:.2f}") + + print("\nAgent With User Preferences:") + print(f" Average Accuracy: {user_avg_acc3:.2f}") + print(f" Average # Messages: {user_avg_len3:.2f}") + print(f" Average # Enforced Preferences: {user_avg_enf3:.2f}") + + print("\nAgent With Reflection:") + print(f" Average Accuracy: {user_avg_acc4:.2f}") + print(f" Average # Messages: {user_avg_len4:.2f}") + print(f" Average # Enforced Preferences: {user_avg_enf4:.2f}") + + + # print conversations + problem_to_conversation1 = {conv['sample']['problem']: conv for conv in id_to_user_data1[id]['generated_conversations']} + problem_to_conversation2 = {conv['sample']['problem']: conv for conv in id_to_user_data2[id]['generated_conversations']} + problem_to_conversation3 = {conv['sample']['problem']: conv for conv in id_to_user_data3[id]['generated_conversations']} + + for problem in problem_to_conversation1: + print("\n" + "="*125) + print(f"\n[PROBLEM]") + print(problem) + print(f"\n[SOLUTION]") + print(problem_to_conversation1[problem]['sample']['solution']) + print("\n" + "="*125) + + print_side_by_side_3( + problem_to_conversation1[problem], + problem_to_conversation2[problem], + problem_to_conversation3.get(problem, {'conversation': [], 'evaluation': {}}), + "FILE 1 (WITHOUT PREFERENCES)", + "FILE 2 (WITH PREFERENCES)", + "FILE 3 (AGENT WITH USER PREFS)", + col_width=55 + ) + + # Detailed logs below with all fields + print("\n" + "-"*125) + print("DETAILED FULL LOGS") + 
print("-"*125) + print_detailed_logs_3( + problem_to_conversation1[problem], + problem_to_conversation2[problem], + problem_to_conversation3.get(problem, {'conversation': [], 'evaluation': {}}), + "FILE 1 (WITHOUT PREFERENCES)", + "FILE 2 (WITH PREFERENCES)", + "FILE 3 (AGENT WITH USER PREFS)", + col_width=55 + ) + + # break + + + + + + + + + + + + + + + + + + +# # ============================================================================== +# # SEPARATE SECTION: Per-User Statistics Averaged Over All Tasks +# # ============================================================================== + +# print("\n" + "="*125) +# print("="*125) +# print("STATISTICS FOR EACH USER, AVERAGED OVER ALL TASKS") +# print("="*125) +# print("="*125 + "\n") + +# # Dictionary to store all data for each user across all tasks +# user_to_all_data = {} + +# # Collect data for all users across all tasks +# for task in ["math_500", "logiqa", "math_hard", "medqa", "mmlu"]: +# user_profiles_without_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/user_profiles_without_preferences/{task}_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_20.jsonl" +# user_profiles_with_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/user_profiles_with_preferences/{task}_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_20.jsonl" +# agent_with_userpreferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/agent_with_user_preferences/{task}_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_20_v2.jsonl" +# agent_with_reflection_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/agent_with_reflection/{task}_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_20.jsonl" + +# data1 = load_data(user_profiles_without_preferences_path) +# data2 = 
load_data(user_profiles_with_preferences_path) +# data3 = load_data(agent_with_userpreferences_path) +# data4 = load_data(agent_with_reflection_path) + +# # For each user in this task, store their data +# for user_data in data1: +# user_id = user_data['i'] +# if user_id not in user_to_all_data: +# user_to_all_data[user_id] = { +# 'persona': user_data.get('persona'), +# 'preferences': user_data.get('preferences'), +# 'data1': [], # without preferences +# 'data2': [], # with preferences +# 'data3': [], # agent with user preferences +# 'data4': [] # agent with reflection +# } +# user_to_all_data[user_id]['data1'].append(user_data) + +# for user_data in data2: +# user_id = user_data['i'] +# if user_id in user_to_all_data: +# user_to_all_data[user_id]['data2'].append(user_data) + +# for user_data in data3: +# user_id = user_data['i'] +# if user_id in user_to_all_data: +# user_to_all_data[user_id]['data3'].append(user_data) + +# for user_data in data4: +# user_id = user_data['i'] +# if user_id in user_to_all_data: +# user_to_all_data[user_id]['data4'].append(user_data) + +# # Now print statistics for each user, averaged over all tasks +# for user_id in sorted(user_to_all_data.keys()): +# user_info = user_to_all_data[user_id] + +# # Calculate aggregate stats across all tasks for this user +# user_avg_acc1, user_avg_len1, user_avg_enf1 = calculate_aggregate_stats(user_info['data1']) +# user_avg_acc2, user_avg_len2, user_avg_enf2 = calculate_aggregate_stats(user_info['data2']) +# user_avg_acc3, user_avg_len3, user_avg_enf3 = calculate_aggregate_stats(user_info['data3']) +# user_avg_acc4, user_avg_len4, user_avg_enf4 = calculate_aggregate_stats(user_info['data4']) + +# print("\n" + "="*125) +# print(f"USER ID: {user_id}") +# print("="*125) + +# # Print user profile info +# if user_info['persona']: +# print(f"Persona: {user_info['persona']}") +# if user_info['preferences']: +# print(f"Preferences:") +# for preference in user_info['preferences']: +# print(f" - {preference}") + 
+# print("\n" + "-"*125) +# print("STATISTICS AVERAGED OVER ALL TASKS") +# print("-"*125) + +# print("\nUser Without Preferences:") +# print(f" Average Accuracy: {user_avg_acc1:.2f}") +# print(f" Average # Messages: {user_avg_len1:.2f}") +# print(f" Average # Enforced Preferences: {user_avg_enf1:.2f}") + +# print("\nUser With Preferences:") +# print(f" Average Accuracy: {user_avg_acc2:.2f}") +# print(f" Average # Messages: {user_avg_len2:.2f}") +# print(f" Average # Enforced Preferences: {user_avg_enf2:.2f}") + +# print("\nAgent With User Preferences:") +# print(f" Average Accuracy: {user_avg_acc3:.2f}") +# print(f" Average # Messages: {user_avg_len3:.2f}") +# print(f" Average # Enforced Preferences: {user_avg_enf3:.2f}") + +# print("\nAgent With Reflection:") +# print(f" Average Accuracy: {user_avg_acc4:.2f}") +# print(f" Average # Messages: {user_avg_len4:.2f}") +# print(f" Average # Enforced Preferences: {user_avg_enf4:.2f}") + +# print("\n" + "="*125) +# print("END OF PER-USER STATISTICS") +# print("="*125 + "\n") + -- cgit v1.2.3