From dc801c07cf38b0c495686463e6ca6f871a64440e Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Tue, 27 Jan 2026 09:57:37 -0600 Subject: Add collaborativeagents module and update gitignore - Add collaborativeagents subproject with adapters, agents, and evaluation modules - Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results Co-Authored-By: Claude Opus 4.5 --- .../slurm/logs/test_multiturn_14357119.err | 46 ++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 collaborativeagents/slurm/logs/test_multiturn_14357119.err (limited to 'collaborativeagents/slurm/logs/test_multiturn_14357119.err') diff --git a/collaborativeagents/slurm/logs/test_multiturn_14357119.err b/collaborativeagents/slurm/logs/test_multiturn_14357119.err new file mode 100644 index 0000000..1a66fce --- /dev/null +++ b/collaborativeagents/slurm/logs/test_multiturn_14357119.err @@ -0,0 +1,46 @@ +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/4 [00:00 + results["full_session"] = test_full_session() + ^^^^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts/test_multiturn.py", line 173, in test_full_session + adapter.initialize() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/adapters/personalized_llm_adapter.py", line 87, in initialize + self._llm = PersonalizedLLM( + ^^^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/personalized_llm.py", line 227, in __init__ + self._load_models() + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/serving/personalized_llm.py", line 295, in _load_models + self._reranker = Qwen3Reranker( + ^^^^^^^^^^^^^^ + File "/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/src/personalization/models/reranker/qwen3_reranker.py", line 26, in __init__ + self.model = AutoModelForCausalLM.from_pretrained( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained + return model_class.from_pretrained( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 277, in _wrapper + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 5048, in from_pretrained + ) = cls._load_pretrained_model( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 5468, in _load_pretrained_model + _error_msgs, disk_offload_index = load_shard_file(args) + ^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 843, in load_shard_file + disk_offload_index = _load_state_dict_into_meta_model( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/modeling_utils.py", line 770, in _load_state_dict_into_meta_model + _load_parameter_into_model(model, param_name, param.to(param_device)) + ^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 1 has a total capacity of 39.49 GiB of which 30.31 MiB is free. Including non-PyTorch memory, this process has 39.46 GiB memory in use. Of the allocated memory 38.87 GiB is allocated by PyTorch, and 87.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -- cgit v1.2.3