From 680513b7771a29f27cbbb3ffb009a69a913de6f9 Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Tue, 27 Jan 2026 12:15:45 -0600 Subject: local reward model --- test_local_reward_15667799.out | 161 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 test_local_reward_15667799.out (limited to 'test_local_reward_15667799.out') diff --git a/test_local_reward_15667799.out b/test_local_reward_15667799.out new file mode 100644 index 0000000..a5f0a7d --- /dev/null +++ b/test_local_reward_15667799.out @@ -0,0 +1,161 @@ +=== Local LLM Reward Model Batch Test === +Model: models/llama-3.1-8b-instruct +GPU: NVIDIA A100-SXM4-40GB + +Starting vLLM server on port 8005... +Waiting for vLLM server to start... +(APIServer pid=3030837) INFO 01-27 12:07:55 [api_server.py:1351] vLLM API server version 0.13.0 +(APIServer pid=3030837) INFO 01-27 12:07:55 [utils.py:253] non-default args: {'port': 8005, 'model': 'models/llama-3.1-8b-instruct', 'dtype': 'bfloat16', 'max_model_len': 4096, 'gpu_memory_utilization': 0.85} +(APIServer pid=3030837) INFO 01-27 12:07:55 [model.py:514] Resolved architecture: LlamaForCausalLM +(APIServer pid=3030837) INFO 01-27 12:07:55 [model.py:1661] Using max model len 4096 +(APIServer pid=3030837) INFO 01-27 12:07:56 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=2048. +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:08 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='models/llama-3.1-8b-instruct', speculative_config=None, tokenizer='models/llama-3.1-8b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=models/llama-3.1-8b-instruct, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False}, 'local_cache_dir': None} +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:09 [parallel_state.py:1203] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://141.142.254.46:56145 backend=nccl +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:09 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0 +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:11 [gpu_model_runner.py:3562] Starting to load model models/llama-3.1-8b-instruct... +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:12 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION') +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:26 [default_loader.py:308] Loading weights took 14.16 seconds +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:27 [gpu_model_runner.py:3659] Model loading took 14.9889 GiB memory and 15.238846 seconds +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:37 [backends.py:643] Using cache directory: /u/yurenh2/.cache/vllm/torch_compile_cache/1c763cd906/rank_0_0/backbone for vLLM's torch.compile +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:37 [backends.py:703] Dynamo bytecode transform time: 9.87 s +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:42 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 2048) from the cache, took 1.177 s +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:42 [monitor.py:34] torch.compile takes 11.05 s in total +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:42 [gpu_worker.py:375] Available KV cache memory: 17.35 GiB +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:43 [kv_cache_utils.py:1291] GPU KV cache size: 142,144 tokens +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:43 [kv_cache_utils.py:1296] Maximum concurrency for 4,096 tokens per request: 34.70x +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:47 [gpu_model_runner.py:4587] Graph capturing finished in 5 secs, took 0.56 GiB +(EngineCore_DP0 pid=3030963) INFO 01-27 12:08:47 [core.py:259] init engine (profile, create kv cache, warmup model) took 20.58 seconds +(APIServer pid=3030837) INFO 01-27 12:08:48 [api_server.py:1099] Supported tasks: ['generate'] +(APIServer pid=3030837) WARNING 01-27 12:08:48 [model.py:1487] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=3030837) INFO 01-27 12:08:48 [serving_responses.py:201] Using default chat sampling params from model: {'temperature': 0.6, 'top_p': 0.9} +(APIServer pid=3030837) INFO 01-27 12:08:48 [serving_chat.py:137] Using default chat sampling params from model: {'temperature': 0.6, 'top_p': 0.9} +(APIServer pid=3030837) INFO 01-27 12:08:48 [serving_completion.py:77] Using default completion sampling params from model: {'temperature': 0.6, 'top_p': 0.9} +(APIServer pid=3030837) INFO 01-27 12:08:48 [serving_chat.py:137] Using default chat sampling params from model: {'temperature': 0.6, 'top_p': 0.9} +(APIServer pid=3030837) INFO 01-27 12:08:48 [api_server.py:1425] Starting vLLM API server 0 on http://0.0.0.0:8005 +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:38] Available routes are: +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /openapi.json, Methods: HEAD, GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /docs, Methods: HEAD, GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /redoc, Methods: HEAD, GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /pause, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /resume, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /is_paused, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /health, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /load, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /version, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/audio/transcriptions, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/audio/translations, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /classify, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/embeddings, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /score, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/score, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /rerank, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v1/rerank, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /v2/rerank, Methods: POST +(APIServer pid=3030837) INFO 01-27 12:08:48 [launcher.py:46] Route: /pooling, Methods: POST +(APIServer pid=3030837) INFO: 127.0.0.1:58106 - "GET /health HTTP/1.1" 200 OK +vLLM server ready after 85s +(APIServer pid=3030837) INFO: 127.0.0.1:58122 - "GET /health HTTP/1.1" 200 OK + +Running batch test... +(APIServer pid=3030837) INFO: 127.0.0.1:58132 - "GET /v1/models HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO 01-27 12:08:51 [chat_utils.py:590] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. +(APIServer pid=3030837) INFO: 127.0.0.1:58184 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58232 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58244 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58170 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58222 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58194 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58208 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58200 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58180 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58152 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58144 - "POST /v1/chat/completions HTTP/1.1" 200 OK +(APIServer pid=3030837) INFO: 127.0.0.1:58154 - "POST /v1/chat/completions HTTP/1.1" 200 OK +====================================================================== +Local LLM Reward Model Batch Test +====================================================================== +vLLM URL: http://localhost:8005/v1 + +Model: models/llama-3.1-8b-instruct + +Running batch inference on 12 samples... +Completed in 3.29s (3.6 samples/sec) + +[ 1] neg_constraint_restate - format preference + Expected: neg_constraint_restate Got: neg_correction [WRONG] + Confidence: 0.80, Reward: -0.8, Update: True + +[ 2] neg_constraint_restate - step by step + Expected: neg_constraint_restate Got: neg_constraint_restate [OK] + Confidence: 0.90, Reward: -1.0, Update: True + +[ 3] neg_correction - wrong answer + Expected: neg_correction Got: neg_correction [OK] + Confidence: 0.90, Reward: -0.8, Update: True + +[ 4] neg_confusion - unclear explanation + Expected: neg_confusion Got: neg_confusion [OK] + Confidence: 0.90, Reward: -0.6, Update: True + +[ 5] pos_praise - explicit thanks + Expected: pos_praise Got: pos_praise [OK] + Confidence: 1.00, Reward: +0.8, Update: True + +[ 6] pos_praise - great explanation + Expected: pos_praise Got: pos_praise [OK] + Confidence: 1.00, Reward: +0.8, Update: True + +[ 7] pos_progress - follow-up question + Expected: pos_progress Got: pos_progress [OK] + Confidence: 0.90, Reward: +0.1, Update: True + +[ 8] pos_progress - extension + Expected: pos_progress Got: pos_progress [OK] + Confidence: 0.90, Reward: +0.1, Update: True + +[ 9] neutral - minimal response + Expected: neutral Got: neg_correction [WRONG] + Confidence: 0.80, Reward: -0.8, Update: True + +[10] topic_shift - new topic + Expected: topic_shift Got: topic_shift [OK] + Confidence: 0.90, Reward: +0.0, Update: False + +[11] neg_constraint_restate - language preference + Expected: neg_constraint_restate Got: neg_constraint_restate [OK] + Confidence: 0.80, Reward: -1.0, Update: True + +[12] neg_correction - incomplete answer + Expected: neg_correction Got: neg_correction [OK] + Confidence: 0.90, Reward: -0.8, Update: True + +====================================================================== +SUMMARY +====================================================================== +Accuracy: 83.3% (10/12) +Time: 3.29s +Throughput: 3.6 samples/sec +Avg latency: 274ms per sample (batched) + +Errors (2): + - neg_constraint_restate - format preference: Got neg_correction, Expected neg_constraint_restate + - neutral - minimal response: Got neg_correction, Expected neutral + +=== Test Complete === -- cgit v1.2.3