Diffstat (limited to 'scripts')
| -rwxr-xr-x | scripts/prepare_data.py        | 258 |
| -rwxr-xr-x | scripts/run_evaluation.sh      |  58 |
| -rwxr-xr-x | scripts/run_full_experiment.sh | 106 |
| -rwxr-xr-x | scripts/run_training.sh        |  50 |
| -rwxr-xr-x | scripts/setup_env.sh           |  79 |
| -rwxr-xr-x | scripts/slurm_train.sh         | 145 |
| -rwxr-xr-x | scripts/submit_all_jobs.sh     |  66 |
| -rwxr-xr-x | scripts/submit_single_job.sh   |  32 |
| -rw-r--r-- | scripts/test_quick.sh          |  78 |
9 files changed, 872 insertions, 0 deletions
diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py
new file mode 100755
index 0000000..5ef3c29
--- /dev/null
+++ b/scripts/prepare_data.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Prepare REAL datasets for RLVR floating-point precision experiments.
+
+Downloads from HuggingFace:
+- Training: GSM8K train (7473 samples)
+- Evaluation: GSM8K test, MATH-500, AIME, AMC, MMLU-STEM, HumanEval
+
+Usage:
+    python scripts/prepare_data.py
+"""
+
+import json
+import os
+import random
+from pathlib import Path
+from datasets import load_dataset
+from tqdm import tqdm
+
+DATA_DIR = Path("data")
+DATA_DIR.mkdir(exist_ok=True)
+
+
+def save_json(data: list, path: Path):
+    """Save data as JSON file."""
+    with open(path, "w") as f:
+        json.dump(data, f, indent=2)
+    print(f"  Saved {len(data)} samples to {path}")
+
+
+def prepare_gsm8k_train():
+    """Prepare GSM8K training data."""
+    print("\n=== Downloading GSM8K Train ===")
+    ds = load_dataset("openai/gsm8k", "main", split="train")
+
+    data = []
+    for i, sample in enumerate(tqdm(ds, desc="Processing")):
+        # Extract answer from "#### N" format
+        answer = sample["answer"].split("####")[-1].strip()
+        data.append({
+            "id": f"gsm8k_train_{i}",
+            "prompt": sample["question"],
+            "answer": answer,
+            "solution": sample["answer"],
+            "source": "gsm8k_train"
+        })
+
+    save_json(data, DATA_DIR / "dm_train.json")
+    return data
+
+
+def prepare_gsm8k_test():
+    """Prepare GSM8K test data for evaluation."""
+    print("\n=== Downloading GSM8K Test ===")
+    ds = load_dataset("openai/gsm8k", "main", split="test")
+
+    data = []
+    for i, sample in enumerate(tqdm(ds, desc="Processing")):
+        answer = sample["answer"].split("####")[-1].strip()
+        data.append({
+            "id": f"gsm8k_test_{i}",
+            "prompt": sample["question"],
+            "answer": answer,
+            "solution": sample["answer"],
+            "source": "gsm8k"
+        })
+
+    save_json(data, DATA_DIR / "gsm8k.json")
+
+    # Also create dm_val as a subset (first 500 for on-task eval)
+    save_json(data[:500], DATA_DIR / "dm_val.json")
+    return data
+
+
+def prepare_math500():
+    """Prepare MATH-500 dataset."""
+    print("\n=== Downloading MATH-500 ===")
+    ds = load_dataset("HuggingFaceH4/MATH-500", split="test")
+
+    data = []
+    for i, sample in enumerate(tqdm(ds, desc="Processing")):
+        data.append({
+            "id": f"math500_{i}",
+            "prompt": sample["problem"],
+            "answer": sample["answer"],
+            "solution": sample["solution"],
+            "subject": sample.get("subject", ""),
+            "level": sample.get("level", ""),
+            "source": "math500"
+        })
+
+    save_json(data, DATA_DIR / "math500.json")
+    return data
+
+
+def prepare_aime():
+    """Prepare AIME dataset from AI-MO."""
+    print("\n=== Downloading AIME ===")
+    ds = load_dataset("AI-MO/aimo-validation-aime", split="train")
+
+    data = []
+    for i, sample in enumerate(tqdm(ds, desc="Processing")):
+        data.append({
+            "id": f"aime_{i}",
+            "prompt": sample["problem"],
+            "answer": str(sample["answer"]),
+            "solution": sample.get("solution", ""),
+            "url": sample.get("url", ""),
+            "source": "aime"
+        })
+
+    # Split into aime24 and aime25
+    # Real AIME has 15 problems per contest, 2 contests per year = 30/year
+    save_json(data[:30], DATA_DIR / "aime24.json")
+    save_json(data[30:60], DATA_DIR / "aime25.json")
+    save_json(data, DATA_DIR / "aime_all.json")
+    return data
+
+
+def prepare_amc():
+    """Prepare AMC dataset from AI-MO."""
+    print("\n=== Downloading AMC ===")
+    ds = load_dataset("AI-MO/aimo-validation-amc", split="train")
+
+    data = []
+    for i, sample in enumerate(tqdm(ds, desc="Processing")):
+        data.append({
+            "id": f"amc_{i}",
+            "prompt": sample["problem"],
+            "answer": str(sample["answer"]),
+            "solution": sample.get("solution", ""),
+            "source": "amc"
+        })
+
+    save_json(data, DATA_DIR / "amc23.json")
+    return data
+
+
+def prepare_mmlu_stem():
+    """Prepare MMLU-STEM subset."""
+    print("\n=== Downloading MMLU-STEM ===")
+
+    stem_subjects = [
+        "abstract_algebra", "astronomy", "college_biology", "college_chemistry",
+        "college_computer_science", "college_mathematics", "college_physics",
+        "computer_security", "conceptual_physics", "electrical_engineering",
+        "elementary_mathematics", "high_school_biology", "high_school_chemistry",
+        "high_school_computer_science", "high_school_mathematics", "high_school_physics",
+        "high_school_statistics", "machine_learning"
+    ]
+
+    data = []
+    for subject in tqdm(stem_subjects, desc="Loading subjects"):
+        try:
+            ds = load_dataset("cais/mmlu", subject, split="test")
+            for i, sample in enumerate(ds):
+                choices = sample["choices"]
+                correct_idx = sample["answer"]
+                # Format as multiple choice
+                prompt = f"{sample['question']}\n"
+                for j, choice in enumerate(choices):
+                    prompt += f"({chr(65+j)}) {choice}\n"
+
+                data.append({
+                    "id": f"mmlu_{subject}_{i}",
+                    "prompt": prompt,
+                    "answer": chr(65 + correct_idx),
+                    "subject": subject,
+                    "source": "mmlu_stem"
+                })
+        except Exception as e:
+            print(f"  Warning: Skipping {subject}: {e}")
+
+    # Take a random subset of 500
+    random.seed(42)
+    if len(data) > 500:
+        data = random.sample(data, 500)
+
+    save_json(data, DATA_DIR / "mmlu_stem.json")
+    return data
+
+
+def prepare_humaneval():
+    """Prepare HumanEval code dataset."""
+    print("\n=== Downloading HumanEval ===")
+    ds = load_dataset("openai/openai_humaneval", split="test")
+
+    data = []
+    for i, sample in enumerate(tqdm(ds, desc="Processing")):
+        data.append({
+            "id": f"humaneval_{i}",
+            "prompt": sample["prompt"],
+            "answer": sample["canonical_solution"],
+            "entry_point": sample["entry_point"],
+            "test": sample["test"],
+            "source": "humaneval"
+        })
+
+    save_json(data, DATA_DIR / "humaneval.json")
+    return data
+
+
+def verify_data():
+    """Verify downloaded data quality."""
+    print("\n" + "=" * 60)
+    print("Verifying Data Quality")
+    print("=" * 60)
+
+    for f in sorted(DATA_DIR.glob("*.json")):
+        with open(f) as fp:
+            data = json.load(fp)
+
+        # Check for unique prompts
+        prompts = [d["prompt"] for d in data]
+        unique = len(set(prompts))
+
+        status = "OK" if unique == len(prompts) else f"WARN: {len(prompts)-unique} duplicates"
+        print(f"  {f.name}: {len(data)} samples, {unique} unique [{status}]")
+
+        # Show first example
+        if data:
+            print(f"    Example: {data[0]['prompt'][:60]}...")
+
+
+def main():
+    print("=" * 60)
+    print("RLVR Real Data Preparation")
+    print("=" * 60)
+
+    # Backup old data
+    backup_dir = DATA_DIR / "backup_synthetic"
+    if not backup_dir.exists() and any(DATA_DIR.glob("*.json")):
+        backup_dir.mkdir(exist_ok=True)
+        for f in DATA_DIR.glob("*.json"):
+            f.rename(backup_dir / f.name)
+        print(f"Backed up synthetic data to {backup_dir}")
+
+    # Training data
+    prepare_gsm8k_train()
+
+    # Evaluation data
+    prepare_gsm8k_test()
+    prepare_math500()
+    prepare_aime()
+    prepare_amc()
+    prepare_mmlu_stem()
+    prepare_humaneval()
+
+    # Verify
+    verify_data()
+
+    print("\n" + "=" * 60)
+    print("Data preparation complete!")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
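Note: the answer-extraction convention above (GSM8K solutions end in `#### <answer>`) and the record shape it produces can be sanity-checked offline. A minimal sketch with a hard-coded sample, so it runs without network access or the `datasets` dependency; most sources share this core shape, with some adding fields such as `level` or `entry_point`:

    # Offline check of the "#### N" extraction used in prepare_gsm8k_train().
    raw = "Janet has 3 apples.\nShe buys 5 more.\n3 + 5 = 8\n#### 8"
    answer = raw.split("####")[-1].strip()
    assert answer == "8"

    record = {
        "id": "gsm8k_train_0",
        "prompt": "Janet has 3 apples. She buys 5 more. How many does she have?",
        "answer": answer,
        "solution": raw,
        "source": "gsm8k_train",
    }
    print(record["answer"])  # -> 8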
sample["problem"], + "answer": str(sample["answer"]), + "solution": sample.get("solution", ""), + "source": "amc" + }) + + save_json(data, DATA_DIR / "amc23.json") + return data + + +def prepare_mmlu_stem(): + """Prepare MMLU-STEM subset.""" + print("\n=== Downloading MMLU-STEM ===") + + stem_subjects = [ + "abstract_algebra", "astronomy", "college_biology", "college_chemistry", + "college_computer_science", "college_mathematics", "college_physics", + "computer_security", "conceptual_physics", "electrical_engineering", + "elementary_mathematics", "high_school_biology", "high_school_chemistry", + "high_school_computer_science", "high_school_mathematics", "high_school_physics", + "high_school_statistics", "machine_learning" + ] + + data = [] + for subject in tqdm(stem_subjects, desc="Loading subjects"): + try: + ds = load_dataset("cais/mmlu", subject, split="test") + for i, sample in enumerate(ds): + choices = sample["choices"] + correct_idx = sample["answer"] + # Format as multiple choice + prompt = f"{sample['question']}\n" + for j, choice in enumerate(choices): + prompt += f"({chr(65+j)}) {choice}\n" + + data.append({ + "id": f"mmlu_{subject}_{i}", + "prompt": prompt, + "answer": chr(65 + correct_idx), + "subject": subject, + "source": "mmlu_stem" + }) + except Exception as e: + print(f" Warning: Skipping {subject}: {e}") + + # Take a random subset of 500 + random.seed(42) + if len(data) > 500: + data = random.sample(data, 500) + + save_json(data, DATA_DIR / "mmlu_stem.json") + return data + + +def prepare_humaneval(): + """Prepare HumanEval code dataset.""" + print("\n=== Downloading HumanEval ===") + ds = load_dataset("openai/openai_humaneval", split="test") + + data = [] + for i, sample in enumerate(tqdm(ds, desc="Processing")): + data.append({ + "id": f"humaneval_{i}", + "prompt": sample["prompt"], + "answer": sample["canonical_solution"], + "entry_point": sample["entry_point"], + "test": sample["test"], + "source": "humaneval" + }) + + save_json(data, DATA_DIR / "humaneval.json") + return data + + +def verify_data(): + """Verify downloaded data quality.""" + print("\n" + "=" * 60) + print("Verifying Data Quality") + print("=" * 60) + + for f in sorted(DATA_DIR.glob("*.json")): + with open(f) as fp: + data = json.load(fp) + + # Check for unique prompts + prompts = [d["prompt"] for d in data] + unique = len(set(prompts)) + + status = "OK" if unique == len(prompts) else f"WARN: {len(prompts)-unique} duplicates" + print(f" {f.name}: {len(data)} samples, {unique} unique [{status}]") + + # Show first example + if data: + print(f" Example: {data[0]['prompt'][:60]}...") + + +def main(): + print("=" * 60) + print("RLVR Real Data Preparation") + print("=" * 60) + + # Backup old data + backup_dir = DATA_DIR / "backup_synthetic" + if not backup_dir.exists() and any(DATA_DIR.glob("*.json")): + backup_dir.mkdir(exist_ok=True) + for f in DATA_DIR.glob("*.json"): + f.rename(backup_dir / f.name) + print(f"Backed up synthetic data to {backup_dir}") + + # Training data + prepare_gsm8k_train() + + # Evaluation data + prepare_gsm8k_test() + prepare_math500() + prepare_aime() + prepare_amc() + prepare_mmlu_stem() + prepare_humaneval() + + # Verify + verify_data() + + print("\n" + "=" * 60) + print("Data preparation complete!") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_evaluation.sh b/scripts/run_evaluation.sh new file mode 100755 index 0000000..b39c230 --- /dev/null +++ b/scripts/run_evaluation.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# run_evaluation.sh +# Script to run 
diff --git a/scripts/run_full_experiment.sh b/scripts/run_full_experiment.sh
new file mode 100755
index 0000000..43e9dd5
--- /dev/null
+++ b/scripts/run_full_experiment.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+# run_full_experiment.sh
+# Master script to run the complete RLVR floating-point precision experiment
+
+set -e
+
+# Configuration
+SEEDS=(1 2 3 4 5)
+PRECISION_MODES=("fp32" "bf16")
+TRAIN_DATA=${TRAIN_DATA:-"./data/dm_train.json"}
+OUTPUT_BASE=${OUTPUT_BASE:-"./results"}
+
+echo "=============================================="
+echo "RLVR Floating-Point Precision Experiment"
+echo "=============================================="
+echo "Seeds: ${SEEDS[*]}"
+echo "Precision Modes: ${PRECISION_MODES[*]}"
+echo "Output: $OUTPUT_BASE"
+echo "=============================================="
+
+# Create directories
+mkdir -p "$OUTPUT_BASE/train_logs"
+mkdir -p "$OUTPUT_BASE/eval_metrics"
+mkdir -p "$OUTPUT_BASE/analysis"
+
+# Phase 1: Training
+echo ""
+echo "=============================================="
+echo "PHASE 1: TRAINING"
+echo "=============================================="
+
+for precision in "${PRECISION_MODES[@]}"; do
+    for seed in "${SEEDS[@]}"; do
+        echo "Training: precision=$precision, seed=$seed"
+
+        OUTPUT_DIR="$OUTPUT_BASE/train_logs/${precision}_seed${seed}"
+
+        # Skip if already completed
+        if [ -d "$OUTPUT_DIR/final_model" ]; then
+            echo "  -> Skipping (already completed)"
+            continue
+        fi
+
+        # Run training
+        bash scripts/run_training.sh "$precision" "$seed"
+    done
+done
+
+# Phase 2: Evaluation
+echo ""
+echo "=============================================="
+echo "PHASE 2: EVALUATION"
+echo "=============================================="
+
+for precision in "${PRECISION_MODES[@]}"; do
+    for seed in "${SEEDS[@]}"; do
+        echo "Evaluating: precision=$precision, seed=$seed"
+
+        OUTPUT_PATH="$OUTPUT_BASE/eval_metrics/${precision}_seed${seed}.json"
+
+        # Skip if already completed
+        if [ -f "$OUTPUT_PATH" ]; then
+            echo "  -> Skipping (already completed)"
+            continue
+        fi
+
+        # Run evaluation
+        bash scripts/run_evaluation.sh "$precision" "$seed"
+    done
+done
+
+# Phase 3: bf16 Sparsity Analysis
+echo ""
+echo "=============================================="
+echo "PHASE 3: BF16 SPARSITY ANALYSIS"
+echo "=============================================="
+
+python run_experiments.py --mode sparsity \
+    --base_output_dir "$OUTPUT_BASE" \
+    --seeds "${SEEDS[@]}"
+
+# Phase 4: Results Analysis
+echo ""
+echo "=============================================="
+echo "PHASE 4: RESULTS ANALYSIS"
+echo "=============================================="
+
+python analyze_results.py \
+    --results_dir "$OUTPUT_BASE/eval_metrics" \
+    --output_dir "$OUTPUT_BASE/analysis" \
+    --on_task dm_val \
+    --off_task aime24 aime25 amc23 math500 mmlu_stem humaneval
+
+echo ""
+echo "=============================================="
+echo "EXPERIMENT COMPLETE"
+echo "=============================================="
+echo "Results saved to: $OUTPUT_BASE"
+echo ""
+echo "Key output files:"
+echo "  - Training logs: $OUTPUT_BASE/train_logs/"
+echo "  - Evaluation metrics: $OUTPUT_BASE/eval_metrics/"
+echo "  - Analysis: $OUTPUT_BASE/analysis/full_analysis.json"
+echo "  - Plots: $OUTPUT_BASE/analysis/*.png"
+echo "=============================================="
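Note: because both phases skip completed runs, the experiment resumes safely after an interruption. A quick status sweep over the same 2x5 grid, using the same existence checks the script performs:

    from itertools import product
    from pathlib import Path

    base = Path("./results")
    for precision, seed in product(("fp32", "bf16"), (1, 2, 3, 4, 5)):
        # Same checks as the "Skip if already completed" branches above.
        trained = (base / "train_logs" / f"{precision}_seed{seed}" / "final_model").is_dir()
        evaluated = (base / "eval_metrics" / f"{precision}_seed{seed}.json").is_file()
        print(f"{precision} seed {seed}: trained={trained}, evaluated={evaluated}")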
diff --git a/scripts/run_training.sh b/scripts/run_training.sh
new file mode 100755
index 0000000..38b2fc8
--- /dev/null
+++ b/scripts/run_training.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# run_training.sh
+# Script to run RLVR training experiments with different precision modes
+
+set -e
+set -o pipefail  # Properly capture exit codes through pipes
+
+# Configuration
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0,1"}
+
+# HuggingFace cache - use shared HDD storage to avoid quota issues
+export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
+export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"
+mkdir -p "$HF_HOME" "$HF_HUB_CACHE"
+
+# Default values
+PRECISION_MODE=${1:-"bf16"}
+SEED=${2:-1}
+TRAIN_DATA=${TRAIN_DATA:-"./data/dm_train.json"}
+OUTPUT_BASE=${OUTPUT_BASE:-"./results/train_logs"}
+MODEL_NAME=${MODEL_NAME:-"Qwen/Qwen2.5-Math-7B"}
+NUM_STEPS=${NUM_STEPS:-300}
+
+# Create output directory
+OUTPUT_DIR="${OUTPUT_BASE}/${PRECISION_MODE}_seed${SEED}"
+mkdir -p "$OUTPUT_DIR"
+
+echo "=============================================="
+echo "RLVR Training"
+echo "=============================================="
+echo "Precision Mode: $PRECISION_MODE"
+echo "Seed: $SEED"
+echo "Model: $MODEL_NAME"
+echo "Training Data: $TRAIN_DATA"
+echo "Output: $OUTPUT_DIR"
+echo "Num Steps: $NUM_STEPS"
+echo "=============================================="
+
+# Run training
+python train_rlvr.py \
+    --precision_mode "$PRECISION_MODE" \
+    --seed "$SEED" \
+    --output_dir "$OUTPUT_DIR" \
+    --train_dataset_path "$TRAIN_DATA" \
+    --model_name "$MODEL_NAME" \
+    --num_steps "$NUM_STEPS" \
+    2>&1 | tee "${OUTPUT_DIR}/training.log"
+
+echo "Training complete. Output saved to: $OUTPUT_DIR"
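Note: train_rlvr.py itself is not part of this commit, so how it consumes --precision_mode is not shown here. As a rough, hypothetical illustration of the fp32/bf16 contrast the flag selects (an assumption, not the repo's actual implementation), one common pattern is to wrap the forward pass in bf16 autocast and otherwise stay in fp32:

    import torch

    def forward(precision_mode: str, model, batch):
        # Hypothetical sketch only; not train_rlvr.py's real logic.
        if precision_mode == "bf16":
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                return model(**batch)
        return model(**batch)  # fp32: no autocast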
diff --git a/scripts/setup_env.sh b/scripts/setup_env.sh
new file mode 100755
index 0000000..743f99a
--- /dev/null
+++ b/scripts/setup_env.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# setup_env.sh
+# One-time setup script for the RLVR floating-point precision experiment
+# Run this BEFORE submitting any jobs
+
+set -e
+
+CONDA_ENV="rlvr-fp"
+PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
+
+echo "============================================"
+echo "RLVR Environment Setup"
+echo "============================================"
+
+# Setup HuggingFace cache directories
+echo ""
+echo "Setting up HuggingFace cache..."
+HF_CACHE_DIR="/work/hdd/bfqt/yurenh2/huggingface_cache"
+mkdir -p "$HF_CACHE_DIR/hub" "$HF_CACHE_DIR/transformers"
+echo "  Cache directory: $HF_CACHE_DIR"
+
+# Add to shell profile if not already present
+PROFILE_FILE="$HOME/.bashrc"
+if ! grep -q "HF_HOME.*huggingface_cache" "$PROFILE_FILE" 2>/dev/null; then
+    echo ""
+    echo "Adding HuggingFace cache settings to $PROFILE_FILE..."
+    cat >> "$PROFILE_FILE" << 'EOF'
+
+# HuggingFace cache - shared across all projects (added by RLVR setup)
+export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
+export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"
+export TRANSFORMERS_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/transformers"
+EOF
+    echo "  Added to $PROFILE_FILE"
+else
+    echo "  HuggingFace settings already in $PROFILE_FILE"
+fi
+
+# Source to apply changes
+source "$PROFILE_FILE"
+
+# Check if conda environment exists
+echo ""
+echo "Checking conda environment..."
+source ~/.bashrc
+
+if conda env list | grep -q "^${CONDA_ENV} "; then
+    echo "  Environment '$CONDA_ENV' already exists"
+    echo "  To recreate, run: conda env remove -n $CONDA_ENV && $0"
+else
+    echo "  Creating conda environment: $CONDA_ENV"
+    conda create -n "$CONDA_ENV" python=3.10 -y
+
+    echo ""
+    echo "Installing dependencies..."
+    conda activate "$CONDA_ENV"
+    cd "$PROJECT_DIR"
+    pip install -r requirements.txt
+
+    echo ""
+    echo "Verifying installation..."
+    python -c "import torch; print(f'PyTorch: {torch.__version__}')"
+    python -c "import transformers; print(f'Transformers: {transformers.__version__}')"
+fi
+
+echo ""
+echo "============================================"
+echo "Setup complete!"
+echo "============================================"
+echo ""
+echo "To activate the environment:"
+echo "  conda activate $CONDA_ENV"
+echo ""
+echo "To run experiments:"
+echo "  ./scripts/submit_all_jobs.sh"
+echo ""
+echo "HuggingFace cache location: $HF_CACHE_DIR"
+echo "  (1TB quota, shared across all projects)"
+echo "============================================"
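Note: after sourcing the profile, it is worth confirming that the cache variables are live in the current shell and that the shared disk has headroom; a small check, assuming only the paths set above:

    import os
    import shutil

    # The three variables setup_env.sh appends to ~/.bashrc.
    for var in ("HF_HOME", "HF_HUB_CACHE", "TRANSFORMERS_CACHE"):
        print(f"{var} = {os.environ.get(var, '<unset>')}")

    cache = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
    if os.path.isdir(cache):
        free_tb = shutil.disk_usage(cache).free / 1e12
        print(f"free space at {cache}: {free_tb:.2f} TB")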
+echo "============================================" +echo "" +echo "To activate the environment:" +echo " conda activate $CONDA_ENV" +echo "" +echo "To run experiments:" +echo " ./scripts/submit_all_jobs.sh" +echo "" +echo "HuggingFace cache location: $HF_CACHE_DIR" +echo " (1TB quota, shared across all projects)" +echo "============================================" diff --git a/scripts/slurm_train.sh b/scripts/slurm_train.sh new file mode 100755 index 0000000..36bd5b1 --- /dev/null +++ b/scripts/slurm_train.sh @@ -0,0 +1,145 @@ +#!/bin/bash +#SBATCH --job-name=rlvr_fp_exp +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:h200:4 +#SBATCH --mem=200G +#SBATCH --time=2-00:00:00 +#SBATCH --output=results/slurm_logs/%x_%j.out +#SBATCH --error=results/slurm_logs/%x_%j.err +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=$USER@example.com + +# Exit on error and propagate exit codes through pipes +set -o pipefail + +# ============================================ +# RLVR Floating-Point Precision Experiment +# H200x8 SLURM Job Script +# ============================================ + +# Configuration - modify these as needed +PRECISION_MODE=${PRECISION_MODE:-"bf16"} +SEED=${SEED:-1} +NUM_STEPS=${NUM_STEPS:-150} # ~45 hours on H200 with sequential generation + +# Paths +PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise" +CONDA_ENV="rlvr-fp" # Change to your conda env name +MODEL_NAME="Qwen/Qwen2.5-Math-7B" +TRAIN_DATA="${PROJECT_DIR}/data/dm_train.json" + +# ============================================ +# HuggingFace cache configuration +# Use shared HDD storage to avoid home directory quota issues +# This cache is shared across all projects +# ============================================ +export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache" +export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub" +mkdir -p "$HF_HOME" "$HF_HUB_CACHE" + +# Print job info +echo "============================================" +echo "SLURM Job ID: $SLURM_JOB_ID" +echo "Running on: $(hostname)" +echo "Start time: $(date)" +echo "============================================" +echo "Precision Mode: $PRECISION_MODE" +echo "Seed: $SEED" +echo "Num Steps: $NUM_STEPS" +echo "GPUs: $CUDA_VISIBLE_DEVICES" +echo "HF Cache: $HF_HOME" +echo "============================================" + +# Setup environment +cd "$PROJECT_DIR" +mkdir -p results/slurm_logs + +# Activate conda environment +source ~/.bashrc + +# Check if conda environment exists +if conda env list | grep -q "^${CONDA_ENV} "; then + echo "Activating existing conda environment: $CONDA_ENV" + conda activate "$CONDA_ENV" +else + echo "ERROR: Conda environment '$CONDA_ENV' does not exist!" 
+ echo "Please create it first by running:" + echo " conda create -n $CONDA_ENV python=3.10 -y" + echo " conda activate $CONDA_ENV" + echo " pip install -r requirements.txt" + exit 1 +fi + +# Verify activation succeeded +if [[ "$CONDA_DEFAULT_ENV" != "$CONDA_ENV" ]]; then + echo "ERROR: Failed to activate conda environment '$CONDA_ENV'" + exit 1 +fi +echo "Conda environment activated: $CONDA_DEFAULT_ENV" + +# Check GPU availability +nvidia-smi +echo "CUDA devices: $(python -c 'import torch; print(torch.cuda.device_count())')" + +# Output directory (use /work for large checkpoints - /projects is limited) +OUTPUT_DIR="/work/hdd/bfqt/yurenh2/rlvr_results/${PRECISION_MODE}_seed${SEED}" +mkdir -p "$OUTPUT_DIR" + +# DeepSpeed config (ZeRO-3 for full sharding of model/optimizer/gradients) +DEEPSPEED_CONFIG="${PROJECT_DIR}/configs/deepspeed_zero3.json" + +# Number of GPUs for DeepSpeed (all GPUs, ref model on same GPU as training per rank) +NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())') +echo "Using $NUM_GPUS GPUs for DeepSpeed training (ref model on each rank's GPU)" + +# Use random port to avoid conflicts with other jobs +MASTER_PORT=$((29500 + RANDOM % 1000)) +echo "Using master port: $MASTER_PORT" + +# Run training with DeepSpeed +echo "" +echo "Starting training with DeepSpeed ZeRO-3..." +echo "============================================" + +deepspeed --num_gpus=$NUM_GPUS --master_port=$MASTER_PORT train_rlvr.py \ + --precision_mode "$PRECISION_MODE" \ + --seed "$SEED" \ + --output_dir "$OUTPUT_DIR" \ + --train_dataset_path "$TRAIN_DATA" \ + --model_name "$MODEL_NAME" \ + --num_steps "$NUM_STEPS" \ + --deepspeed "$DEEPSPEED_CONFIG" \ + 2>&1 | tee "${OUTPUT_DIR}/training_slurm.log" + +# IMPORTANT: With set -o pipefail, $? now captures python's exit code, not tee's +TRAIN_EXIT_CODE=$? + +echo "" +echo "============================================" +echo "Training completed with exit code: $TRAIN_EXIT_CODE" +echo "End time: $(date)" +echo "============================================" + +# If training succeeded, run evaluation +if [ $TRAIN_EXIT_CODE -eq 0 ]; then + echo "" + echo "Starting evaluation..." 
+ echo "============================================" + + python eval_policy.py \ + --base_ckpt "$MODEL_NAME" \ + --ft_ckpt "${OUTPUT_DIR}/final_model" \ + --eval_tasks_config configs/eval_tasks_config.json \ + --output_path "results/eval_metrics/${PRECISION_MODE}_seed${SEED}.json" \ + --eval_base \ + --use_amp \ + 2>&1 | tee "${OUTPUT_DIR}/eval_slurm.log" +fi + +echo "" +echo "Job completed at: $(date)" + diff --git a/scripts/submit_all_jobs.sh b/scripts/submit_all_jobs.sh new file mode 100755 index 0000000..86c0f5d --- /dev/null +++ b/scripts/submit_all_jobs.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# submit_all_jobs.sh +# Submit all experiment jobs to SLURM queue +# Jobs will run automatically when resources become available + +set -e + +PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise" +cd "$PROJECT_DIR" + +# Create log directory +mkdir -p results/slurm_logs + +# Configuration +SEEDS=(1 2 3 4 5) +PRECISION_MODES=("fp32" "bf16") + +echo "============================================" +echo "Submitting RLVR Experiment Jobs" +echo "============================================" +echo "Seeds: ${SEEDS[*]}" +echo "Precision Modes: ${PRECISION_MODES[*]}" +echo "Total jobs: $((${#SEEDS[@]} * ${#PRECISION_MODES[@]}))" +echo "============================================" + +# Track submitted job IDs +declare -a JOB_IDS + +for precision in "${PRECISION_MODES[@]}"; do + for seed in "${SEEDS[@]}"; do + JOB_NAME="rlvr_${precision}_s${seed}" + + echo "Submitting: $JOB_NAME" + + # Submit job with environment variables + JOB_ID=$(sbatch \ + --job-name="$JOB_NAME" \ + --export=ALL,PRECISION_MODE="$precision",SEED="$seed" \ + scripts/slurm_train.sh | awk '{print $4}') + + JOB_IDS+=("$JOB_ID") + echo " -> Job ID: $JOB_ID" + done +done + +echo "" +echo "============================================" +echo "All jobs submitted!" +echo "Job IDs: ${JOB_IDS[*]}" +echo "============================================" +echo "" +echo "Monitor with:" +echo " squeue -u $USER" +echo " squeue -j $(IFS=,; echo "${JOB_IDS[*]}")" +echo "" +echo "View logs:" +echo " tail -f results/slurm_logs/rlvr_*.out" +echo "" +echo "Cancel all:" +echo " scancel ${JOB_IDS[*]}" +echo "============================================" + +# Save job IDs for reference +echo "${JOB_IDS[*]}" > results/slurm_logs/submitted_jobs.txt +echo "Job IDs saved to: results/slurm_logs/submitted_jobs.txt" + diff --git a/scripts/submit_single_job.sh b/scripts/submit_single_job.sh new file mode 100755 index 0000000..7fe7492 --- /dev/null +++ b/scripts/submit_single_job.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# submit_single_job.sh +# Submit a single training job +# Usage: ./submit_single_job.sh <precision_mode> <seed> +# Example: ./submit_single_job.sh bf16 1 + +PRECISION_MODE=${1:-"bf16"} +SEED=${2:-1} + +PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise" +cd "$PROJECT_DIR" + +mkdir -p results/slurm_logs + +JOB_NAME="rlvr_${PRECISION_MODE}_s${SEED}" + +echo "Submitting job: $JOB_NAME" +echo " Precision: $PRECISION_MODE" +echo " Seed: $SEED" + +JOB_ID=$(sbatch \ + --job-name="$JOB_NAME" \ + --export=ALL,PRECISION_MODE="$PRECISION_MODE",SEED="$SEED" \ + scripts/slurm_train.sh | awk '{print $4}') + +echo "" +echo "Submitted! 
Job ID: $JOB_ID" +echo "" +echo "Monitor with: squeue -j $JOB_ID" +echo "View output: tail -f results/slurm_logs/${JOB_NAME}_${JOB_ID}.out" +echo "Cancel: scancel $JOB_ID" + diff --git a/scripts/test_quick.sh b/scripts/test_quick.sh new file mode 100644 index 0000000..f66e73b --- /dev/null +++ b/scripts/test_quick.sh @@ -0,0 +1,78 @@ +#!/bin/bash +#SBATCH --job-name=rlvr_test +#SBATCH --account=bfqt-delta-gpu +#SBATCH --partition=gpuH200x8 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:h200:4 +#SBATCH --mem=200G +#SBATCH --time=04:00:00 +#SBATCH --output=results/slurm_logs/test_%j.out +#SBATCH --error=results/slurm_logs/test_%j.err + +set -o pipefail + +PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise" +cd "$PROJECT_DIR" + +source ~/.bashrc +conda activate rlvr-fp + +export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache" +export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub" + +echo "============================================" +echo "Quick test on $(hostname)" +echo "SLURM Job ID: $SLURM_JOB_ID" +nvidia-smi +echo "============================================" + +NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())') +echo "Using $NUM_GPUS GPUs for DeepSpeed" + +# Use random port to avoid conflicts +MASTER_PORT=$((29500 + RANDOM % 1000)) +echo "Using master port: $MASTER_PORT" + +# Test fp32 with just 3 steps +echo "Testing fp32..." +deepspeed --num_gpus=$NUM_GPUS --master_port=$MASTER_PORT train_rlvr.py \ + --precision_mode fp32 \ + --seed 1 \ + --output_dir /work/hdd/bfqt/yurenh2/rlvr_results/test_fp32 \ + --train_dataset_path data/dm_train.json \ + --model_name Qwen/Qwen2.5-Math-7B \ + --num_steps 3 \ + --deepspeed configs/deepspeed_zero3.json + +FP32_EXIT=$? +echo "fp32 test exit code: $FP32_EXIT" + +if [ $FP32_EXIT -eq 0 ]; then + echo "fp32 test PASSED" + + # Also test bf16 + echo "Testing bf16..." + deepspeed --num_gpus=$NUM_GPUS --master_port=$MASTER_PORT train_rlvr.py \ + --precision_mode bf16 \ + --seed 1 \ + --output_dir /work/hdd/bfqt/yurenh2/rlvr_results/test_bf16 \ + --train_dataset_path data/dm_train.json \ + --model_name Qwen/Qwen2.5-Math-7B \ + --num_steps 3 \ + --deepspeed configs/deepspeed_zero3.json + + BF16_EXIT=$? + echo "bf16 test exit code: $BF16_EXIT" + + if [ $BF16_EXIT -eq 0 ]; then + echo "============================================" + echo "ALL TESTS PASSED!" + echo "============================================" + else + echo "bf16 test FAILED" + fi +else + echo "fp32 test FAILED" +fi |
