collaborativeagents/slurm/fullscale/test_local_user_15652698.err


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215

/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
usage: api_server.py [-h] [--headless] [--api-server-count API_SERVER_COUNT]
                     [--config CONFIG] [--host HOST] [--port PORT] [--uds UDS]
                     [--uvicorn-log-level {critical,debug,error,info,trace,warning}]
                     [--disable-uvicorn-access-log | --no-disable-uvicorn-access-log]
                     [--allow-credentials | --no-allow-credentials]
                     [--allowed-origins ALLOWED_ORIGINS]
                     [--allowed-methods ALLOWED_METHODS]
                     [--allowed-headers ALLOWED_HEADERS]
                     [--api-key API_KEY [API_KEY ...]]
                     [--lora-modules LORA_MODULES [LORA_MODULES ...]]
                     [--chat-template CHAT_TEMPLATE]
                     [--chat-template-content-format {auto,openai,string}]
                     [--trust-request-chat-template | --no-trust-request-chat-template]
                     [--response-role RESPONSE_ROLE]
                     [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
                     [--ssl-ca-certs SSL_CA_CERTS]
                     [--enable-ssl-refresh | --no-enable-ssl-refresh]
                     [--ssl-cert-reqs SSL_CERT_REQS] [--root-path ROOT_PATH]
                     [--middleware MIDDLEWARE]
                     [--return-tokens-as-token-ids | --no-return-tokens-as-token-ids]
                     [--disable-frontend-multiprocessing | --no-disable-frontend-multiprocessing]
                     [--enable-request-id-headers | --no-enable-request-id-headers]
                     [--enable-auto-tool-choice | --no-enable-auto-tool-choice]
                     [--exclude-tools-when-tool-choice-none | --no-exclude-tools-when-tool-choice-none]
                     [--tool-call-parser {deepseek_v3,deepseek_v31,deepseek_v32,ernie45,gigachat3,glm45,granite,granite-20b-fc,hermes,hunyuan_a13b,internlm,jamba,kimi_k2,llama3_json,llama4_json,llama4_pythonic,longcat,minimax,minimax_m2,mistral,olmo3,openai,phi4_mini_json,pythonic,qwen3_coder,qwen3_xml,seed_oss,step3,xlam} or name registered in --tool-parser-plugin]
                     [--tool-parser-plugin TOOL_PARSER_PLUGIN]
                     [--tool-server TOOL_SERVER]
                     [--log-config-file LOG_CONFIG_FILE]
                     [--max-log-len MAX_LOG_LEN]
                     [--disable-fastapi-docs | --no-disable-fastapi-docs]
                     [--enable-prompt-tokens-details | --no-enable-prompt-tokens-details]
                     [--enable-server-load-tracking | --no-enable-server-load-tracking]
                     [--enable-force-include-usage | --no-enable-force-include-usage]
                     [--enable-tokenizer-info-endpoint | --no-enable-tokenizer-info-endpoint]
                     [--enable-log-outputs | --no-enable-log-outputs]
                     [--h11-max-incomplete-event-size H11_MAX_INCOMPLETE_EVENT_SIZE]
                     [--h11-max-header-count H11_MAX_HEADER_COUNT]
                     [--log-error-stack | --no-log-error-stack]
                     [--tokens-only | --no-tokens-only] [--model MODEL]
                     [--runner {auto,draft,generate,pooling}]
                     [--convert {auto,classify,embed,none,reward}]
                     [--tokenizer TOKENIZER]
                     [--tokenizer-mode ['auto', 'deepseek_v32', 'hf', 'mistral', 'slow']]
                     [--trust-remote-code | --no-trust-remote-code]
                     [--dtype {auto,bfloat16,float,float16,float32,half}]
                     [--seed SEED] [--hf-config-path HF_CONFIG_PATH]
                     [--allowed-local-media-path ALLOWED_LOCAL_MEDIA_PATH]
                     [--allowed-media-domains ALLOWED_MEDIA_DOMAINS [ALLOWED_MEDIA_DOMAINS ...]]
                     [--revision REVISION] [--code-revision CODE_REVISION]
                     [--tokenizer-revision TOKENIZER_REVISION]
                     [--max-model-len MAX_MODEL_LEN]
                     [--quantization QUANTIZATION]
                     [--enforce-eager | --no-enforce-eager]
                     [--max-logprobs MAX_LOGPROBS]
                     [--logprobs-mode {processed_logits,processed_logprobs,raw_logits,raw_logprobs}]
                     [--disable-sliding-window | --no-disable-sliding-window]
                     [--disable-cascade-attn | --no-disable-cascade-attn]
                     [--skip-tokenizer-init | --no-skip-tokenizer-init]
                     [--enable-prompt-embeds | --no-enable-prompt-embeds]
                     [--served-model-name SERVED_MODEL_NAME [SERVED_MODEL_NAME ...]]
                     [--config-format ['auto', 'hf', 'mistral']]
                     [--hf-token [HF_TOKEN]] [--hf-overrides HF_OVERRIDES]
                     [--pooler-config POOLER_CONFIG]
                     [--logits-processor-pattern LOGITS_PROCESSOR_PATTERN]
                     [--generation-config GENERATION_CONFIG]
                     [--override-generation-config OVERRIDE_GENERATION_CONFIG]
                     [--enable-sleep-mode | --no-enable-sleep-mode]
                     [--model-impl ['auto', 'terratorch', 'transformers', 'vllm']]
                     [--override-attention-dtype OVERRIDE_ATTENTION_DTYPE]
                     [--logits-processors LOGITS_PROCESSORS [LOGITS_PROCESSORS ...]]
                     [--io-processor-plugin IO_PROCESSOR_PLUGIN]
                     [--load-format LOAD_FORMAT] [--download-dir DOWNLOAD_DIR]
                     [--safetensors-load-strategy SAFETENSORS_LOAD_STRATEGY]
                     [--model-loader-extra-config MODEL_LOADER_EXTRA_CONFIG]
                     [--ignore-patterns IGNORE_PATTERNS [IGNORE_PATTERNS ...]]
                     [--use-tqdm-on-load | --no-use-tqdm-on-load]
                     [--pt-load-map-location PT_LOAD_MAP_LOCATION]
                     [--attention-backend ATTENTION_BACKEND]
                     [--reasoning-parser REASONING_PARSER]
                     [--reasoning-parser-plugin REASONING_PARSER_PLUGIN]
                     [--distributed-executor-backend ['external_launcher', 'mp', 'ray', 'uni']]
                     [--pipeline-parallel-size PIPELINE_PARALLEL_SIZE]
                     [--master-addr MASTER_ADDR] [--master-port MASTER_PORT]
                     [--nnodes NNODES] [--node-rank NODE_RANK]
                     [--tensor-parallel-size TENSOR_PARALLEL_SIZE]
                     [--decode-context-parallel-size DECODE_CONTEXT_PARALLEL_SIZE]
                     [--dcp-kv-cache-interleave-size DCP_KV_CACHE_INTERLEAVE_SIZE]
                     [--cp-kv-cache-interleave-size CP_KV_CACHE_INTERLEAVE_SIZE]
                     [--prefill-context-parallel-size PREFILL_CONTEXT_PARALLEL_SIZE]
                     [--data-parallel-size DATA_PARALLEL_SIZE]
                     [--data-parallel-rank DATA_PARALLEL_RANK]
                     [--data-parallel-start-rank DATA_PARALLEL_START_RANK]
                     [--data-parallel-size-local DATA_PARALLEL_SIZE_LOCAL]
                     [--data-parallel-address DATA_PARALLEL_ADDRESS]
                     [--data-parallel-rpc-port DATA_PARALLEL_RPC_PORT]
                     [--data-parallel-backend DATA_PARALLEL_BACKEND]
                     [--data-parallel-hybrid-lb | --no-data-parallel-hybrid-lb | -dph]
                     [--data-parallel-external-lb | --no-data-parallel-external-lb | -dpe]
                     [--enable-expert-parallel | --no-enable-expert-parallel]
                     [--all2all-backend {allgather_reducescatter,deepep_high_throughput,deepep_low_latency,flashinfer_all2allv,naive,pplx,None}]
                     [--enable-dbo | --no-enable-dbo]
                     [--dbo-decode-token-threshold DBO_DECODE_TOKEN_THRESHOLD]
                     [--dbo-prefill-token-threshold DBO_PREFILL_TOKEN_THRESHOLD]
                     [--disable-nccl-for-dp-synchronization | --no-disable-nccl-for-dp-synchronization]
                     [--enable-eplb | --no-enable-eplb]
                     [--eplb-config EPLB_CONFIG]
                     [--expert-placement-strategy {linear,round_robin}]
                     [--max-parallel-loading-workers MAX_PARALLEL_LOADING_WORKERS]
                     [--ray-workers-use-nsight | --no-ray-workers-use-nsight]
                     [--disable-custom-all-reduce | --no-disable-custom-all-reduce]
                     [--worker-cls WORKER_CLS]
                     [--worker-extension-cls WORKER_EXTENSION_CLS]
                     [--block-size {1,8,16,32,64,128,256}]
                     [--gpu-memory-utilization GPU_MEMORY_UTILIZATION]
                     [--kv-cache-memory-bytes KV_CACHE_MEMORY_BYTES]
                     [--swap-space SWAP_SPACE]
                     [--kv-cache-dtype {auto,bfloat16,fp8,fp8_ds_mla,fp8_e4m3,fp8_e5m2,fp8_inc}]
                     [--num-gpu-blocks-override NUM_GPU_BLOCKS_OVERRIDE]
                     [--enable-prefix-caching | --no-enable-prefix-caching]
                     [--prefix-caching-hash-algo {sha256,sha256_cbor,xxhash,xxhash_cbor}]
                     [--cpu-offload-gb CPU_OFFLOAD_GB]
                     [--calculate-kv-scales | --no-calculate-kv-scales]
                     [--kv-sharing-fast-prefill | --no-kv-sharing-fast-prefill]
                     [--mamba-cache-dtype {auto,float16,float32}]
                     [--mamba-ssm-cache-dtype {auto,float16,float32}]
                     [--mamba-block-size MAMBA_BLOCK_SIZE]
                     [--kv-offloading-size KV_OFFLOADING_SIZE]
                     [--kv-offloading-backend {lmcache,native,None}]
                     [--limit-mm-per-prompt LIMIT_MM_PER_PROMPT]
                     [--enable-mm-embeds | --no-enable-mm-embeds]
                     [--media-io-kwargs MEDIA_IO_KWARGS]
                     [--mm-processor-kwargs MM_PROCESSOR_KWARGS]
                     [--mm-processor-cache-gb MM_PROCESSOR_CACHE_GB]
                     [--mm-processor-cache-type {lru,shm}]
                     [--mm-shm-cache-max-object-size-mb MM_SHM_CACHE_MAX_OBJECT_SIZE_MB]
                     [--mm-encoder-tp-mode {data,weights}]
                     [--mm-encoder-attn-backend MM_ENCODER_ATTN_BACKEND]
                     [--interleave-mm-strings | --no-interleave-mm-strings]
                     [--skip-mm-profiling | --no-skip-mm-profiling]
                     [--video-pruning-rate VIDEO_PRUNING_RATE]
                     [--enable-lora | --no-enable-lora]
                     [--max-loras MAX_LORAS]
                     [--max-lora-rank {1,8,16,32,64,128,256,320,512}]
                     [--lora-dtype {auto,bfloat16,float16}]
                     [--max-cpu-loras MAX_CPU_LORAS]
                     [--fully-sharded-loras | --no-fully-sharded-loras]
                     [--default-mm-loras DEFAULT_MM_LORAS]
                     [--show-hidden-metrics-for-version SHOW_HIDDEN_METRICS_FOR_VERSION]
                     [--otlp-traces-endpoint OTLP_TRACES_ENDPOINT]
                     [--collect-detailed-traces {all,model,worker,None} [{all,model,worker,None} ...]]
                     [--kv-cache-metrics | --no-kv-cache-metrics]
                     [--kv-cache-metrics-sample KV_CACHE_METRICS_SAMPLE]
                     [--cudagraph-metrics | --no-cudagraph-metrics]
                     [--enable-layerwise-nvtx-tracing | --no-enable-layerwise-nvtx-tracing]
                     [--max-num-batched-tokens MAX_NUM_BATCHED_TOKENS]
                     [--max-num-seqs MAX_NUM_SEQS]
                     [--max-num-partial-prefills MAX_NUM_PARTIAL_PREFILLS]
                     [--max-long-partial-prefills MAX_LONG_PARTIAL_PREFILLS]
                     [--long-prefill-token-threshold LONG_PREFILL_TOKEN_THRESHOLD]
                     [--scheduling-policy {fcfs,priority}]
                     [--enable-chunked-prefill | --no-enable-chunked-prefill]
                     [--disable-chunked-mm-input | --no-disable-chunked-mm-input]
                     [--scheduler-cls SCHEDULER_CLS]
                     [--disable-hybrid-kv-cache-manager | --no-disable-hybrid-kv-cache-manager]
                     [--async-scheduling | --no-async-scheduling]
                     [--stream-interval STREAM_INTERVAL]
                     [--cudagraph-capture-sizes CUDAGRAPH_CAPTURE_SIZES [CUDAGRAPH_CAPTURE_SIZES ...]]
                     [--max-cudagraph-capture-size MAX_CUDAGRAPH_CAPTURE_SIZE]
                     [--speculative-config SPECULATIVE_CONFIG]
                     [--kv-transfer-config KV_TRANSFER_CONFIG]
                     [--kv-events-config KV_EVENTS_CONFIG]
                     [--ec-transfer-config EC_TRANSFER_CONFIG]
                     [--compilation-config COMPILATION_CONFIG]
                     [--attention-config ATTENTION_CONFIG]
                     [--additional-config ADDITIONAL_CONFIG]
                     [--structured-outputs-config STRUCTURED_OUTPUTS_CONFIG]
                     [--profiler-config PROFILER_CONFIG]
                     [--optimization-level OPTIMIZATION_LEVEL]
                     [--disable-log-stats] [--aggregate-engine-logging]
                     [--enable-log-requests | --no-enable-log-requests]
                     [--disable-log-requests | --no-disable-log-requests]
                     [model_tag]
api_server.py: error: unrecognized arguments: --guided-decoding-backend
/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:06<00:19,  6.52s/it]
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:08<00:07,  3.70s/it]
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:15<00:05,  5.28s/it]
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:21<00:00,  5.57s/it]
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:21<00:00,  5.35s/it]
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/51 [00:00<?, ?it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   4%|▍         | 2/51 [00:00<00:03, 14.83it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   8%|▊         | 4/51 [00:00<00:03, 15.62it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  12%|█▏        | 6/51 [00:00<00:02, 16.45it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  16%|█▌        | 8/51 [00:00<00:02, 16.87it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  22%|██▏       | 11/51 [00:00<00:02, 18.23it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  27%|██▋       | 14/51 [00:00<00:01, 19.00it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  33%|███▎      | 17/51 [00:00<00:01, 19.74it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  39%|███▉      | 20/51 [00:01<00:01, 20.74it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  45%|████▌     | 23/51 [00:01<00:01, 21.38it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  51%|█████     | 26/51 [00:01<00:01, 21.90it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  57%|█████▋    | 29/51 [00:01<00:01, 20.09it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  63%|██████▎   | 32/51 [00:01<00:00, 21.04it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  69%|██████▊   | 35/51 [00:01<00:00, 21.92it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  75%|███████▍  | 38/51 [00:01<00:00, 22.58it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  80%|████████  | 41/51 [00:01<00:00, 23.20it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  86%|████████▋ | 44/51 [00:02<00:00, 23.84it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  92%|█████████▏| 47/51 [00:02<00:00, 23.47it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  98%|█████████▊| 50/51 [00:02<00:00, 24.19it/s]
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 20.91it/s]
[0;36m(EngineCore_DP0 pid=3603174)[0;0m 
Capturing CUDA graphs (decode, FULL):   0%|          | 0/35 [00:00<?, ?it/s]
Capturing CUDA graphs (decode, FULL):   3%|▎         | 1/35 [00:00<00:04,  7.51it/s]
Capturing CUDA graphs (decode, FULL):  11%|█▏        | 4/35 [00:00<00:01, 16.43it/s]
Capturing CUDA graphs (decode, FULL):  20%|██        | 7/35 [00:00<00:01, 19.40it/s]
Capturing CUDA graphs (decode, FULL):  29%|██▊       | 10/35 [00:00<00:01, 20.92it/s]
Capturing CUDA graphs (decode, FULL):  37%|███▋      | 13/35 [00:00<00:01, 21.93it/s]
Capturing CUDA graphs (decode, FULL):  46%|████▌     | 16/35 [00:00<00:00, 22.62it/s]
Capturing CUDA graphs (decode, FULL):  54%|█████▍    | 19/35 [00:00<00:00, 23.03it/s]
Capturing CUDA graphs (decode, FULL):  63%|██████▎   | 22/35 [00:01<00:00, 23.33it/s]
Capturing CUDA graphs (decode, FULL):  71%|███████▏  | 25/35 [00:01<00:00, 23.31it/s]
Capturing CUDA graphs (decode, FULL):  80%|████████  | 28/35 [00:01<00:00, 23.67it/s]
Capturing CUDA graphs (decode, FULL):  89%|████████▊ | 31/35 [00:01<00:00, 24.00it/s]
Capturing CUDA graphs (decode, FULL):  97%|█████████▋| 34/35 [00:01<00:00, 24.21it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 22.44it/s]
[0;36m(APIServer pid=3602630)[0;0m INFO:     Started server process [3602630]
[0;36m(APIServer pid=3602630)[0;0m INFO:     Waiting for application startup.
[0;36m(APIServer pid=3602630)[0;0m INFO:     Application startup complete.
/u/yurenh2/miniforge3/envs/eval/lib/python3.11/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
2026-01-26 12:38:05,935 - INFO - Loaded dataset: math-hard
2026-01-26 12:38:05,935 - INFO - Profile path configured: ../data/complex_profiles_v2/profiles_200.jsonl
2026-01-26 12:38:06,001 - INFO - Loaded 200 profiles from ../data/complex_profiles_v2/profiles_200.jsonl
2026-01-26 12:38:06,003 - INFO - Running method: vanilla
2026-01-26 12:38:06,004 - INFO -   Running profiles 0 to 9 (10 remaining)
2026-01-26 12:38:06,004 - INFO -   Using BATCH processing (local vLLM user) for vanilla
2026-01-26 12:38:06,006 - INFO -   Using local vLLM user simulator: http://localhost:8004/v1
2026-01-26 12:38:11,230 - INFO -   Batch: 10 profiles, 20 sessions remaining
2026-01-26 12:38:11,240 - INFO -   Session round 1/2: 10 total, 6876 sessions/hr
2026-01-26 12:38:11,248 - INFO -   Session round 2/2: 20 total, 13730 sessions/hr
2026-01-26 12:38:11,996 - INFO -   GPU memory freed after vanilla: 0.0GB allocated
2026-01-26 12:38:12,000 - INFO - Report saved to ../results/test_local_user_20260126_123755/20260126_123805/report.md