From dc801c07cf38b0c495686463e6ca6f871a64440e Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Tue, 27 Jan 2026 09:57:37 -0600 Subject: Add collaborativeagents module and update gitignore - Add collaborativeagents subproject with adapters, agents, and evaluation modules - Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results Co-Authored-By: Claude Opus 4.5 --- .../slurm/logs/all_memory_14360420.err | 127 + .../slurm/logs/contextual_14360423.err | 139 + .../slurm/logs/full_exp_14357783.err | 69 + .../slurm/logs/full_exp_14358390.err | 97 + collaborativeagents/slurm/logs/rag_14360421.err | 9442 ++++++++++++ .../slurm/logs/rag_vector_14360422.err | 129 + .../slurm/logs/reflection_14360424.err | 101 + .../slurm/logs/reflection_grpo_14360425.err | 119 + .../slurm/logs/run_collab_baselines_14355966.err | 14466 +++++++++++++++++++ .../slurm/logs/run_collab_baselines_14355975.err | 70 + .../slurm/logs/run_expts_a100_14355787.err | 15 + .../slurm/logs/run_expts_a100_14355797.err | 14 + .../slurm/logs/run_expts_a100_14355807.err | 10 + .../slurm/logs/run_expts_a100_14355816.err | 22 + .../slurm/logs/run_expts_a100_14355842.err | 23 + .../slurm/logs/run_expts_a100_14355851.err | 28 + .../slurm/logs/run_expts_a100_14355856.err | 10 + .../slurm/logs/run_expts_a100_14355861.err | 14 + .../slurm/logs/run_expts_a100_14355863.err | 11 + .../slurm/logs/run_expts_a100_14355865.err | 19 + .../slurm/logs/run_expts_a100_14355871.err | 16 + .../slurm/logs/run_expts_a100_14355878.err | 109 + .../slurm/logs/run_expts_a100_14355885.err | 17 + .../slurm/logs/run_expts_a100_14355888.err | 19 + .../slurm/logs/run_expts_a100_14355896.err | 12 + .../slurm/logs/run_expts_a100_14355901.err | 47 + .../slurm/logs/run_expts_a100_14355902.err | 185 + .../slurm/logs/run_expts_a100_14355919.err | 97 + .../slurm/logs/run_multiturn_14357110.err | 9 + .../slurm/logs/run_multiturn_14357122.err | 98 + .../slurm/logs/test_70b_14357753.err | 1 + .../slurm/logs/test_70b_14357762.err | 24 + .../slurm/logs/test_extractor_14363568.err | 3 + .../slurm/logs/test_multiturn_14357116.err | 13 + .../slurm/logs/test_multiturn_14357119.err | 46 + .../slurm/logs/vanilla_14360419.err | 138 + .../slurm/logs/vllm_bench_14367333.err | 6 + .../slurm/logs/vllm_bench_70b_8b_14367370.err | 180 + .../slurm/logs/vllm_only_14367345.err | 1 + 39 files changed, 25946 insertions(+) create mode 100644 collaborativeagents/slurm/logs/all_memory_14360420.err create mode 100644 collaborativeagents/slurm/logs/contextual_14360423.err create mode 100644 collaborativeagents/slurm/logs/full_exp_14357783.err create mode 100644 collaborativeagents/slurm/logs/full_exp_14358390.err create mode 100644 collaborativeagents/slurm/logs/rag_14360421.err create mode 100644 collaborativeagents/slurm/logs/rag_vector_14360422.err create mode 100644 collaborativeagents/slurm/logs/reflection_14360424.err create mode 100644 collaborativeagents/slurm/logs/reflection_grpo_14360425.err create mode 100644 collaborativeagents/slurm/logs/run_collab_baselines_14355966.err create mode 100644 collaborativeagents/slurm/logs/run_collab_baselines_14355975.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355787.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355797.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355807.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355816.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355842.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355851.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355856.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355861.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355863.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355865.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355871.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355878.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355885.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355888.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355896.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355901.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355902.err create mode 100644 collaborativeagents/slurm/logs/run_expts_a100_14355919.err create mode 100644 collaborativeagents/slurm/logs/run_multiturn_14357110.err create mode 100644 collaborativeagents/slurm/logs/run_multiturn_14357122.err create mode 100644 collaborativeagents/slurm/logs/test_70b_14357753.err create mode 100644 collaborativeagents/slurm/logs/test_70b_14357762.err create mode 100644 collaborativeagents/slurm/logs/test_extractor_14363568.err create mode 100644 collaborativeagents/slurm/logs/test_multiturn_14357116.err create mode 100644 collaborativeagents/slurm/logs/test_multiturn_14357119.err create mode 100644 collaborativeagents/slurm/logs/vanilla_14360419.err create mode 100644 collaborativeagents/slurm/logs/vllm_bench_14367333.err create mode 100644 collaborativeagents/slurm/logs/vllm_bench_70b_8b_14367370.err create mode 100644 collaborativeagents/slurm/logs/vllm_only_14367345.err (limited to 'collaborativeagents/slurm/logs') diff --git a/collaborativeagents/slurm/logs/all_memory_14360420.err b/collaborativeagents/slurm/logs/all_memory_14360420.err new file mode 100644 index 0000000..93582b3 --- /dev/null +++ b/collaborativeagents/slurm/logs/all_memory_14360420.err @@ -0,0 +1,127 @@ +/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +2025-12-27 01:56:00,092 - INFO - Loaded dataset: mmlu +2025-12-27 01:56:00,092 - INFO - Loaded dataset: aime +2025-12-27 01:56:00,092 - INFO - Loaded dataset: math-hard +2025-12-27 01:56:00,092 - INFO - Loaded dataset: humaneval +2025-12-27 01:56:00,108 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-27 01:56:00,108 - INFO - Running method: all_memory +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/4 [00:00 + main() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 608, in main + analysis = runner.run_all() + ^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 414, in run_all + results = self.run_method(method) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 367, in run_method + samples = dataset.get_testset() + ^^^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/datasets_extended.py", line 71, in get_testset + self._test_data = self._load_data("test")[:self.eval_size] + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/datasets_extended.py", line 153, in _load_data + solution=item["answer"], + ~~~~^^^^^^^^^^ +KeyError: 'answer' diff --git a/collaborativeagents/slurm/logs/full_exp_14358390.err b/collaborativeagents/slurm/logs/full_exp_14358390.err new file mode 100644 index 0000000..65669a6 --- /dev/null +++ b/collaborativeagents/slurm/logs/full_exp_14358390.err @@ -0,0 +1,97 @@ +/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +2025-12-26 11:03:58,116 - INFO - Loaded dataset: mmlu +2025-12-26 11:03:58,116 - INFO - Loaded dataset: aime +2025-12-26 11:03:58,116 - INFO - Loaded dataset: math-hard +2025-12-26 11:03:58,116 - INFO - Loaded dataset: humaneval +2025-12-26 11:03:58,213 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-26 11:03:58,214 - INFO - Running method: vanilla +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/4 [00:00 + from adapters.personalized_llm_adapter import PersonalizedLLMAdapter, create_baseline_adapter + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/adapters/__init__.py", line 3, in + from .personalized_llm_adapter import ( + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/adapters/personalized_llm_adapter.py", line 21, in + from personalization.serving.personalized_llm import ( + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/__init__.py", line 5, in + from personalization.serving.personalized_llm import ( + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/personalized_llm.py", line 35, in + from personalization.models.reranker.qwen3_reranker import Qwen3Reranker + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/models/reranker/qwen3_reranker.py", line 26 + self.model = AutoModelForCausalLM.from_pretrained( + ^ +IndentationError: expected an indented block after 'else' statement on line 24 diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355797.err b/collaborativeagents/slurm/logs/run_expts_a100_14355797.err new file mode 100644 index 0000000..28ce959 --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355797.err @@ -0,0 +1,14 @@ +Traceback (most recent call last): + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 31, in + from adapters.personalized_llm_adapter import PersonalizedLLMAdapter, create_baseline_adapter + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/adapters/__init__.py", line 3, in + from .personalized_llm_adapter import ( + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/adapters/personalized_llm_adapter.py", line 21, in + from personalization.serving.personalized_llm import ( + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/__init__.py", line 5, in + from personalization.serving.personalized_llm import ( + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/personalized_llm.py", line 38, in + from personalization.user_model.features import ItemProjection + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/user_model/features.py", line 3, in + from sklearn.decomposition import PCA +ModuleNotFoundError: No module named 'sklearn' diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355807.err b/collaborativeagents/slurm/logs/run_expts_a100_14355807.err new file mode 100644 index 0000000..33fa583 --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355807.err @@ -0,0 +1,10 @@ +Traceback (most recent call last): + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 484, in + main() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 471, in main + runner = ExperimentRunner(config) + ^^^^^^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 99, in __init__ + self.conflict_generator = ConflictScenarioGenerator() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +TypeError: ConflictScenarioGenerator.__init__() missing 1 required positional argument: 'profile' diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355816.err b/collaborativeagents/slurm/logs/run_expts_a100_14355816.err new file mode 100644 index 0000000..f147ab6 --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355816.err @@ -0,0 +1,22 @@ +2025-12-25 07:53:36,898 - INFO - Loaded dataset: math-500 +Traceback (most recent call last): + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 484, in + main() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 471, in main + runner = ExperimentRunner(config) + ^^^^^^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 111, in __init__ + self.profiles = self._load_profiles() + ^^^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 117, in _load_profiles + profiles = json.load(f) + ^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/json/__init__.py", line 293, in load + return loads(fp.read(), + ^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/json/__init__.py", line 346, in loads + return _default_decoder.decode(s) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/json/decoder.py", line 340, in decode + raise JSONDecodeError("Extra data", s, end) +json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 10782) diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355842.err b/collaborativeagents/slurm/logs/run_expts_a100_14355842.err new file mode 100644 index 0000000..0144ee9 --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355842.err @@ -0,0 +1,23 @@ +2025-12-25 07:55:08,518 - INFO - Loaded dataset: math-500 +2025-12-25 07:55:08,527 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-25 07:55:08,528 - INFO - Running method: rag_vector +2025-12-25 07:55:08,528 - INFO - Profile 1/2 + Generating test split: 0%| | 0/500 [00:00 + main() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 476, in main + analysis = runner.run_all() + ^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 292, in run_all + analysis = self._analyze_results(all_results) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 365, in _analyze_results + best = max(values, key=values.get) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ValueError: max() arg is an empty sequence diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355851.err b/collaborativeagents/slurm/logs/run_expts_a100_14355851.err new file mode 100644 index 0000000..59bbe1a --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355851.err @@ -0,0 +1,28 @@ +2025-12-25 07:58:42,438 - INFO - Loaded dataset: math-500 +2025-12-25 07:58:42,447 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-25 07:58:42,448 - INFO - Running method: rag_vector +2025-12-25 07:58:42,448 - INFO - Profile 1/2 +2025-12-25 07:58:47,959 - ERROR - Error in session: ConversationGenerator.__init__() got an unexpected keyword argument 'user_model' +2025-12-25 07:58:47,959 - ERROR - Error in session: ConversationGenerator.__init__() got an unexpected keyword argument 'user_model' +2025-12-25 07:58:47,960 - INFO - Profile 2/2 +2025-12-25 07:58:47,960 - ERROR - Error in session: ConversationGenerator.__init__() got an unexpected keyword argument 'user_model' +2025-12-25 07:58:47,960 - ERROR - Error in session: ConversationGenerator.__init__() got an unexpected keyword argument 'user_model' +2025-12-25 07:58:47,962 - WARNING - No values for metric task_success_rate, skipping comparison +2025-12-25 07:58:47,962 - WARNING - No values for metric avg_user_tokens, skipping comparison +2025-12-25 07:58:47,962 - WARNING - No values for metric avg_total_tokens, skipping comparison +2025-12-25 07:58:47,962 - WARNING - No values for metric avg_enforcement_count, skipping comparison +2025-12-25 07:58:47,962 - WARNING - No values for metric avg_preference_compliance, skipping comparison +2025-12-25 07:58:47,962 - WARNING - No values for metric conflict_resolution_accuracy, skipping comparison +2025-12-25 07:58:47,962 - WARNING - No values for metric over_personalization_rate, skipping comparison +Traceback (most recent call last): + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 491, in + main() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 479, in main + analysis = runner.run_all() + ^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 299, in run_all + self._generate_report(analysis) + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/run_experiments.py", line 414, in _generate_report + best = analysis["comparison"][metric_key]["best_method"] + ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^ +KeyError: 'task_success_rate' diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355856.err b/collaborativeagents/slurm/logs/run_expts_a100_14355856.err new file mode 100644 index 0000000..608e91f --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355856.err @@ -0,0 +1,10 @@ +2025-12-25 08:08:52,658 - INFO - Loaded dataset: math-500 +2025-12-25 08:08:52,698 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-25 08:08:52,699 - INFO - Running method: rag_vector +2025-12-25 08:08:52,699 - INFO - Profile 1/2 +2025-12-25 08:08:55,383 - ERROR - Error in session: [Errno 2] No such file or directory: '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/configs/local_models.yaml' +2025-12-25 08:08:55,383 - ERROR - Error in session: [Errno 2] No such file or directory: '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/configs/local_models.yaml' +2025-12-25 08:08:55,383 - INFO - Profile 2/2 +2025-12-25 08:08:55,384 - ERROR - Error in session: [Errno 2] No such file or directory: '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/configs/local_models.yaml' +2025-12-25 08:08:55,384 - ERROR - Error in session: [Errno 2] No such file or directory: '/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/configs/local_models.yaml' +2025-12-25 08:08:55,390 - INFO - Report saved to ../results/test_a100_20251225_080844/20251225_080852/report.md diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355861.err b/collaborativeagents/slurm/logs/run_expts_a100_14355861.err new file mode 100644 index 0000000..fac726b --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355861.err @@ -0,0 +1,14 @@ +2025-12-25 08:12:25,638 - INFO - Loaded dataset: math-500 +2025-12-25 08:12:25,647 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-25 08:12:25,648 - INFO - Running method: rag_vector +2025-12-25 08:12:25,648 - INFO - Profile 1/2 +2025-12-25 08:12:28,020 - ERROR - Error in session: models/qwen3-embedding-8b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' +If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=` +2025-12-25 08:12:28,062 - ERROR - Error in session: models/qwen3-embedding-8b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' +If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=` +2025-12-25 08:12:28,062 - INFO - Profile 2/2 +2025-12-25 08:12:28,107 - ERROR - Error in session: models/qwen3-embedding-8b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' +If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=` +2025-12-25 08:12:28,155 - ERROR - Error in session: models/qwen3-embedding-8b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' +If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=` +2025-12-25 08:12:28,189 - INFO - Report saved to ../results/test_a100_20251225_081218/20251225_081225/report.md diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355863.err b/collaborativeagents/slurm/logs/run_expts_a100_14355863.err new file mode 100644 index 0000000..1a41e92 --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355863.err @@ -0,0 +1,11 @@ +2025-12-25 08:15:58,390 - INFO - Loaded dataset: math-500 +2025-12-25 08:15:58,399 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-25 08:15:58,399 - INFO - Running method: rag_vector +2025-12-25 08:15:58,399 - INFO - Profile 1/2 +`torch_dtype` is deprecated! Use `dtype` instead! +2025-12-25 08:16:02,559 - ERROR - Error in session: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate` +2025-12-25 08:16:02,854 - ERROR - Error in session: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate` +2025-12-25 08:16:02,874 - INFO - Profile 2/2 +2025-12-25 08:16:03,126 - ERROR - Error in session: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate` +2025-12-25 08:16:03,394 - ERROR - Error in session: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate` +2025-12-25 08:16:03,459 - INFO - Report saved to ../results/test_a100_20251225_081551/20251225_081558/report.md diff --git a/collaborativeagents/slurm/logs/run_expts_a100_14355865.err b/collaborativeagents/slurm/logs/run_expts_a100_14355865.err new file mode 100644 index 0000000..769bc20 --- /dev/null +++ b/collaborativeagents/slurm/logs/run_expts_a100_14355865.err @@ -0,0 +1,19 @@ +2025-12-25 08:20:02,816 - INFO - Loaded dataset: math-500 +2025-12-25 08:20:02,951 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-25 08:20:02,952 - INFO - Running method: rag_vector +2025-12-25 08:20:02,952 - INFO - Profile 1/2 +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/4 [00:00 + results["full_session"] = test_full_session() + ^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/test_multiturn.py", line 171, in test_full_session + adapter = create_baseline_adapter("vanilla") + ^^^^^^^^^^^^^^^^^^^^^^^ +NameError: name 'create_baseline_adapter' is not defined diff --git a/collaborativeagents/slurm/logs/test_multiturn_14357119.err b/collaborativeagents/slurm/logs/test_multiturn_14357119.err new file mode 100644 index 0000000..1a66fce --- /dev/null +++ b/collaborativeagents/slurm/logs/test_multiturn_14357119.err @@ -0,0 +1,46 @@ +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/4 [00:00 + results["full_session"] = test_full_session() + ^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/test_multiturn.py", line 173, in test_full_session + adapter.initialize() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/adapters/personalized_llm_adapter.py", line 87, in initialize + self._llm = PersonalizedLLM( + ^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/personalized_llm.py", line 227, in __init__ + self._load_models() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/personalized_llm.py", line 295, in _load_models + self._reranker = Qwen3Reranker( + ^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/models/reranker/qwen3_reranker.py", line 26, in __init__ + self.model = AutoModelForCausalLM.from_pretrained( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained + return model_class.from_pretrained( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 277, in _wrapper + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 5048, in from_pretrained + ) = cls._load_pretrained_model( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 5468, in _load_pretrained_model + _error_msgs, disk_offload_index = load_shard_file(args) + ^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 843, in load_shard_file + disk_offload_index = _load_state_dict_into_meta_model( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 770, in _load_state_dict_into_meta_model + _load_parameter_into_model(model, param_name, param.to(param_device)) + ^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 1 has a total capacity of 39.49 GiB of which 30.31 MiB is free. Including non-PyTorch memory, this process has 39.46 GiB memory in use. Of the allocated memory 38.87 GiB is allocated by PyTorch, and 87.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/collaborativeagents/slurm/logs/vanilla_14360419.err b/collaborativeagents/slurm/logs/vanilla_14360419.err new file mode 100644 index 0000000..467f4f7 --- /dev/null +++ b/collaborativeagents/slurm/logs/vanilla_14360419.err @@ -0,0 +1,138 @@ +/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +2025-12-27 01:32:07,469 - INFO - Loaded dataset: mmlu +2025-12-27 01:32:07,469 - INFO - Loaded dataset: aime +2025-12-27 01:32:07,469 - INFO - Loaded dataset: math-hard +2025-12-27 01:32:07,469 - INFO - Loaded dataset: humaneval +2025-12-27 01:32:07,504 - INFO - Loaded 100 profiles from ../data/complex_profiles_v2/profiles_100.jsonl +2025-12-27 01:32:07,505 - INFO - Running method: vanilla +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/4 [00:00", line 198, in _run_module_as_main +(APIServer pid=3643829) File "", line 88, in _run_code +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 1469, in +(APIServer pid=3643829) uvloop.run(run_server(args)) +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/uvloop/__init__.py", line 92, in run +(APIServer pid=3643829) return runner.run(wrapper()) +(APIServer pid=3643829) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/asyncio/runners.py", line 118, in run +(APIServer pid=3643829) return self._loop.run_until_complete(task) +(APIServer pid=3643829) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=3643829) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=3643829) return await main +(APIServer pid=3643829) ^^^^^^^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 1398, in run_server +(APIServer pid=3643829) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 1417, in run_server_worker +(APIServer pid=3643829) async with build_async_engine_client( +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/contextlib.py", line 210, in __aenter__ +(APIServer pid=3643829) return await anext(self.gen) +(APIServer pid=3643829) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 172, in build_async_engine_client +(APIServer pid=3643829) async with build_async_engine_client_from_engine_args( +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/contextlib.py", line 210, in __aenter__ +(APIServer pid=3643829) return await anext(self.gen) +(APIServer pid=3643829) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 213, in build_async_engine_client_from_engine_args +(APIServer pid=3643829) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=3643829) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 215, in from_vllm_config +(APIServer pid=3643829) return cls( +(APIServer pid=3643829) ^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__ +(APIServer pid=3643829) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=3643829) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 121, in make_async_mp_client +(APIServer pid=3643829) return AsyncMPClient(*client_args) +(APIServer pid=3643829) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 820, in __init__ +(APIServer pid=3643829) super().__init__( +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 477, in __init__ +(APIServer pid=3643829) with launch_core_engines(vllm_config, executor_class, log_stats) as ( +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/contextlib.py", line 144, in __exit__ +(APIServer pid=3643829) next(self.gen) +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 903, in launch_core_engines +(APIServer pid=3643829) wait_for_engine_startup( +(APIServer pid=3643829) File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 960, in wait_for_engine_startup +(APIServer pid=3643829) raise RuntimeError( +(APIServer pid=3643829) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} +[2025-12-29T07:04:21.056] error: *** JOB 14367370 ON gpua051 CANCELLED AT 2025-12-29T07:04:21 DUE to SIGNAL Terminated *** diff --git a/collaborativeagents/slurm/logs/vllm_only_14367345.err b/collaborativeagents/slurm/logs/vllm_only_14367345.err new file mode 100644 index 0000000..8713769 --- /dev/null +++ b/collaborativeagents/slurm/logs/vllm_only_14367345.err @@ -0,0 +1 @@ +/usr/bin/python: Error while finding module specification for 'vllm.entrypoints.openai.api_server' (ModuleNotFoundError: No module named 'vllm') -- cgit v1.2.3