From f1c2cc22d46a6976df3555391e667c7e61592fad Mon Sep 17 00:00:00 2001
From: YurenHao0426 <blackhao0426@gmail.com>
Date: Wed, 4 Feb 2026 18:59:35 -0600
Subject: Initial commit: RL floating-point noise project

---
 scripts/run_evaluation.sh | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100755 scripts/run_evaluation.sh

(limited to 'scripts/run_evaluation.sh')

diff --git a/scripts/run_evaluation.sh b/scripts/run_evaluation.sh
new file mode 100755
index 0000000..b39c230
--- /dev/null
+++ b/scripts/run_evaluation.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# run_evaluation.sh - evaluate a fine-tuned checkpoint via eval_policy.py
+# Usage: run_evaluation.sh [PRECISION_MODE] [SEED]   (defaults: bf16, 1)
+# Env overrides: CUDA_VISIBLE_DEVICES BASE_MODEL TRAIN_LOGS_DIR EVAL_METRICS_DIR EVAL_CONFIG
+
+# -u rejects unset variables; pipefail keeps a python failure from being masked by tee
+set -euo pipefail
+
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
+
+# HuggingFace cache - use shared HDD storage to avoid quota issues
+export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
+export HF_HUB_CACHE="${HF_HOME}/hub"
+mkdir -p "$HF_HOME" "$HF_HUB_CACHE"
+
+# Positional arguments, then environment-overridable defaults
+PRECISION_MODE=${1:-"bf16"}
+SEED=${2:-1}
+BASE_MODEL=${BASE_MODEL:-"Qwen/Qwen2.5-Math-7B"}
+TRAIN_LOGS_DIR=${TRAIN_LOGS_DIR:-"./results/train_logs"}
+EVAL_METRICS_DIR=${EVAL_METRICS_DIR:-"./results/eval_metrics"}
+EVAL_CONFIG=${EVAL_CONFIG:-"./configs/eval_tasks_config.json"}
+
+# Derived paths (relative defaults resolve against the current working directory)
+FT_CKPT="${TRAIN_LOGS_DIR}/${PRECISION_MODE}_seed${SEED}/final_model"
+OUTPUT_PATH="${EVAL_METRICS_DIR}/${PRECISION_MODE}_seed${SEED}.json"
+LOG_PATH="${EVAL_METRICS_DIR}/${PRECISION_MODE}_seed${SEED}_eval.log"
+
+mkdir -p "$EVAL_METRICS_DIR"
+
+echo "=============================================="
+echo "Model Evaluation"
+echo "=============================================="
+echo "Precision Mode: $PRECISION_MODE"
+echo "Seed: $SEED"
+echo "Base Model: $BASE_MODEL"
+echo "Finetuned Model: $FT_CKPT"
+echo "Output: $OUTPUT_PATH"
+echo "=============================================="
+
+# Abort before launching python if the checkpoint directory is missing (diagnostic on stderr)
+if [[ ! -d "$FT_CKPT" ]]; then
+    echo "Error: Checkpoint not found at $FT_CKPT" >&2
+    exit 1
+fi
+
+# Run evaluation; stdout and stderr are shown on the console and copied to $LOG_PATH
+python eval_policy.py \
+    --base_ckpt "$BASE_MODEL" \
+    --ft_ckpt "$FT_CKPT" \
+    --eval_tasks_config "$EVAL_CONFIG" \
+    --output_path "$OUTPUT_PATH" \
+    --eval_base \
+    --use_amp \
+    2>&1 | tee "$LOG_PATH"
+
+echo "Evaluation complete. Results saved to: $OUTPUT_PATH"
+
-- 
cgit v1.2.3