summary refs log tree commit diff
path: root/start_next_phase.sh
diff options
context:
space:
mode:
Diffstat (limited to 'start_next_phase.sh')
-rwxr-xr-x start_next_phase.sh 134
1 files changed, 134 insertions, 0 deletions
diff --git a/start_next_phase.sh b/start_next_phase.sh
new file mode 100755
index 0000000..0ca4cd4
--- /dev/null
+++ b/start_next_phase.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# Start the next phase: benchmark testing and data refinement.
+
+printf '%s\n' "🚀 启动下一阶段:Benchmark测试与数据完善"
+printf '%s\n' "============================================"
+
+# Path of the trained debiased checkpoint; later phases hand it to the
+# evaluation scripts, so stop with status 1 when the directory is missing.
+DEBIASED_MODEL="checkpoints/Qwen2.5-Math-1.5B-Instruct/colab_pure_debiasing/final"
+[[ -d "$DEBIASED_MODEL" ]] || {
+  printf '%s\n' "❌ 错误: 未找到去偏见模型: $DEBIASED_MODEL" "请先完成纯偏见减少训练"
+  exit 1
+}
+
+printf '%s\n' "✅ 找到去偏见模型: $DEBIASED_MODEL"
+
+# Phase 1: create the bias-evaluation benchmark.
+# This step is mandatory: if the generator script fails, report it and
+# stop with status 1 instead of moving on to the evaluation phases.
+echo ""
+echo "📊 Phase 1: 创建偏见评估benchmark..."
+if ! python create_bias_benchmark.py; then
+  echo "❌ Benchmark创建失败"
+  exit 1
+fi
+
+echo "✅ Benchmark创建成功"
+
+# Phase 2: run the bias comparison between the original and debiased models.
+# A failure is not fatal; bias_eval_ok records the outcome so the summary
+# below reports what actually happened instead of always claiming success.
+echo ""
+echo "📈 Phase 2: 运行偏见对比评估..."
+if python run_bias_evaluation.py \
+  --original_model "Qwen/Qwen2.5-Math-1.5B-Instruct" \
+  --debiased_model "$DEBIASED_MODEL" \
+  --test_data "bias_evaluation_benchmark.json" \
+  --output_dir "results/phase2_bias_comparison" \
+  --max_new_tokens 128; then
+  bias_eval_ok=1
+  echo "✅ 偏见评估完成"
+else
+  bias_eval_ok=0
+  echo "❌ 偏见评估失败"
+fi
+
+# Phase 3: exercise the enhanced data processor; a failure only warns.
+echo ""
+echo "🔧 Phase 3: 测试增强数据处理器..."
+if python enhance_gee_processor.py; then
+  echo "✅ 数据处理器测试成功"
+else
+  echo "⚠️ 数据处理器测试失败,但可以继续"
+fi
+
+# Phase 4: optional HumanEval run, only on a y/Y reply (read -r: no backslash escapes).
+echo ""
+echo "💻 Phase 4: 代码能力评估 (可选)..."
+read -r -p "是否运行代码评估 (HumanEval)? [y/N]: " run_code_eval
+
+if [[ $run_code_eval =~ ^[Yy]$ ]]; then
+  echo "运行 HumanEval 评估..."
+  if python code_eval/OpenCodeEval/main.py \
+    --model_path "$DEBIASED_MODEL" \
+    --benchmark HumanEval \
+    --output_dir "results/phase4_humaneval"; then
+    echo "✅ HumanEval评估完成"
+  else
+    echo "❌ HumanEval评估失败"
+  fi
+else
+  echo "⏭️ 跳过代码评估"
+fi
+
+# Phase 5: optional GSM8K run, only on a y/Y reply and only if the test file exists.
+echo ""
+echo "🧮 Phase 5: 数学能力评估 (可选)..."
+read -r -p "是否运行数学评估 (GSM8K)? [y/N]: " run_math_eval
+
+if [[ $run_math_eval =~ ^[Yy]$ ]]; then
+  echo "运行 GSM8K 评估..."
+  if [[ -f "Qwen2.5-Eval/evaluation/data/gsm8k/test.jsonl" ]]; then
+    if python Qwen2.5-Eval/evaluation/math_eval.py \
+      --model_path "$DEBIASED_MODEL" \
+      --data_path "Qwen2.5-Eval/evaluation/data/gsm8k/test.jsonl" \
+      --output_dir "results/phase5_gsm8k"; then
+      echo "✅ GSM8K评估完成"
+    else
+      echo "❌ GSM8K评估失败"
+    fi
+  else
+    echo "⚠️ 未找到GSM8K测试数据"
+  fi
+else
+  echo "⏭️ 跳过数学评估"
+fi
+
+# Summary: Phase 1 succeeded if we got here; Phase 2 is reported as recorded.
+echo ""
+echo "🎯 下一阶段进度总结:"
+echo "===================="
+echo "✅ 偏见评估benchmark已创建"
+if (( bias_eval_ok )); then
+  echo "✅ 模型偏见对比评估已完成"
+else
+  echo "❌ 模型偏见对比评估失败"
+fi
+echo "📊 查看结果: results/phase2_bias_comparison/"
+
+# Preview only after a successful run, so stale results are never shown.
+if (( bias_eval_ok )) && [[ -f "results/phase2_bias_comparison/evaluation_summary.json" ]]; then
+  echo ""
+  echo "📋 快速结果预览:"
+  python -c "
+import json
+with open('results/phase2_bias_comparison/evaluation_summary.json', 'r') as f:
+    summary = json.load(f)
+    eval_summary = summary['evaluation_summary']
+    print(f\" 原始模型熵差距: {eval_summary['original_entropy_gap']:.6f}\")
+    print(f\" 去偏见模型熵差距: {eval_summary['debiased_entropy_gap']:.6f}\")
+    print(f\" 改进程度: {eval_summary['improvement_percentage']:.1f}%\")
+    print(f\" 评估结果: {summary['recommendation']}\")
+"
+fi
+
+# Closing notes: suggested follow-up steps, then the completion banner.
+printf '%s\n' \
+  "" \
+  "🚀 下一步建议:" \
+  "1. 查看详细评估报告: results/phase2_bias_comparison/" \
+  "2. 如果效果好,可以在真实数据上重新训练" \
+  "3. 运行更多benchmark测试验证性能保持" \
+  "4. 考虑扩展到更大模型" \
+  "" "🎉 下一阶段测试完成!"