blob: b39c2303367fc9f66e9869998ba84a355726ae5f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
#!/bin/bash
# run_evaluation.sh
# Run evaluation on a trained (finetuned) model checkpoint and save metrics.
#
# Usage:
#   run_evaluation.sh [PRECISION_MODE] [SEED]
#     PRECISION_MODE  precision tag of the training run directory (default: bf16)
#     SEED            training seed of the run to evaluate      (default: 1)
#
# Environment overrides:
#   CUDA_VISIBLE_DEVICES, BASE_MODEL, TRAIN_LOGS_DIR, EVAL_METRICS_DIR, EVAL_CONFIG
set -euo pipefail  # -e: stop on error; -u: error on unset vars; pipefail: capture exit codes through pipes

# Configuration
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}

# HuggingFace cache - use shared HDD storage to avoid quota issues.
# HF_HUB_CACHE is derived from HF_HOME so the path is defined in one place.
export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
export HF_HUB_CACHE="${HF_HOME}/hub"
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"

# Default values
PRECISION_MODE=${1:-"bf16"}
SEED=${2:-1}
BASE_MODEL=${BASE_MODEL:-"Qwen/Qwen2.5-Math-7B"}
TRAIN_LOGS_DIR=${TRAIN_LOGS_DIR:-"./results/train_logs"}
EVAL_METRICS_DIR=${EVAL_METRICS_DIR:-"./results/eval_metrics"}
EVAL_CONFIG=${EVAL_CONFIG:-"./configs/eval_tasks_config.json"}

# Paths: checkpoint layout must match what the training script produced.
FT_CKPT="${TRAIN_LOGS_DIR}/${PRECISION_MODE}_seed${SEED}/final_model"
OUTPUT_PATH="${EVAL_METRICS_DIR}/${PRECISION_MODE}_seed${SEED}.json"

# Create output directory
mkdir -p "$EVAL_METRICS_DIR"

echo "=============================================="
echo "Model Evaluation"
echo "=============================================="
echo "Precision Mode: $PRECISION_MODE"
echo "Seed: $SEED"
echo "Base Model: $BASE_MODEL"
echo "Finetuned Model: $FT_CKPT"
echo "Output: $OUTPUT_PATH"
echo "=============================================="

# Validate inputs up front so we fail fast instead of mid-evaluation on GPU.
if [ ! -d "$FT_CKPT" ]; then
  echo "Error: Checkpoint not found at $FT_CKPT" >&2
  exit 1
fi
if [ ! -f "$EVAL_CONFIG" ]; then
  echo "Error: Eval tasks config not found at $EVAL_CONFIG" >&2
  exit 1
fi

# Run evaluation; stdout+stderr are mirrored to a per-run log file.
# pipefail (set above) ensures a failing eval is not masked by tee.
python eval_policy.py \
  --base_ckpt "$BASE_MODEL" \
  --ft_ckpt "$FT_CKPT" \
  --eval_tasks_config "$EVAL_CONFIG" \
  --output_path "$OUTPUT_PATH" \
  --eval_base \
  --use_amp \
  2>&1 | tee "${EVAL_METRICS_DIR}/${PRECISION_MODE}_seed${SEED}_eval.log"

echo "Evaluation complete. Results saved to: $OUTPUT_PATH"
|