#!/bin/bash
# run_evaluation.sh
# Script to run evaluation on trained models.
#
# Usage:
#   ./run_evaluation.sh [PRECISION_MODE] [SEED]
#
# Positional arguments:
#   PRECISION_MODE  Prefix of the run name used to build the checkpoint and
#                   output paths (default: bf16).
#   SEED            Seed suffix of the run name (default: 1).
#
# Environment overrides (each falls back to the default set below):
#   CUDA_VISIBLE_DEVICES, HF_CACHE_ROOT, BASE_MODEL, TRAIN_LOGS_DIR,
#   EVAL_METRICS_DIR, EVAL_CONFIG
#
# Exit status:
#   1 if the finetuned checkpoint directory does not exist; otherwise the
#   script aborts with a non-zero status as soon as any command fails.

# -e: stop on the first failing command; -u: treat unset variables as errors;
# pipefail: properly capture exit codes through pipes (so a python failure is
# not masked by tee's success).
set -euo pipefail

# Configuration
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}

# HuggingFace cache - use shared HDD storage to avoid quota issues.
# HF_CACHE_ROOT lets other machines/users relocate the cache without editing
# this script; the default is the original shared location.
HF_CACHE_ROOT=${HF_CACHE_ROOT:-"/work/hdd/bfqt/yurenh2/huggingface_cache"}
export HF_HOME="$HF_CACHE_ROOT"
export HF_HUB_CACHE="${HF_CACHE_ROOT}/hub"
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"

# Default values
PRECISION_MODE=${1:-"bf16"}
SEED=${2:-1}
BASE_MODEL=${BASE_MODEL:-"Qwen/Qwen2.5-Math-7B"}
TRAIN_LOGS_DIR=${TRAIN_LOGS_DIR:-"./results/train_logs"}
EVAL_METRICS_DIR=${EVAL_METRICS_DIR:-"./results/eval_metrics"}
EVAL_CONFIG=${EVAL_CONFIG:-"./configs/eval_tasks_config.json"}

# Paths (all derived from the "<precision>_seed<seed>" run name)
RUN_NAME="${PRECISION_MODE}_seed${SEED}"
FT_CKPT="${TRAIN_LOGS_DIR}/${RUN_NAME}/final_model"
OUTPUT_PATH="${EVAL_METRICS_DIR}/${RUN_NAME}.json"
LOG_PATH="${EVAL_METRICS_DIR}/${RUN_NAME}_eval.log"

# Create output directory (also holds the tee'd log file)
mkdir -p "$EVAL_METRICS_DIR"

echo "=============================================="
echo "Model Evaluation"
echo "=============================================="
echo "Precision Mode: $PRECISION_MODE"
echo "Seed: $SEED"
echo "Base Model: $BASE_MODEL"
echo "Finetuned Model: $FT_CKPT"
echo "Output: $OUTPUT_PATH"
echo "=============================================="

# Check if checkpoint exists; report on stderr so the diagnostic is not lost
# when stdout is redirected or captured.
if [[ ! -d "$FT_CKPT" ]]; then
  echo "Error: Checkpoint not found at $FT_CKPT" >&2
  exit 1
fi

# Run evaluation. Arguments are kept in an array so every value stays a
# single word regardless of spaces in paths.
eval_args=(
  --base_ckpt "$BASE_MODEL"
  --ft_ckpt "$FT_CKPT"
  --eval_tasks_config "$EVAL_CONFIG"
  --output_path "$OUTPUT_PATH"
  --eval_base
  --use_amp
)

# Merge stderr into stdout so the log file captures both streams while the
# output is still shown on the terminal.
python eval_policy.py "${eval_args[@]}" 2>&1 | tee "$LOG_PATH"

echo "Evaluation complete. Results saved to: $OUTPUT_PATH"