| author | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-05 10:31:36 -0500 |
|---|---|---|
| committer | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-05 10:31:36 -0500 |
| commit | ea4a8f837e81b5e5fab6086cb3014c711c5e58e9 (patch) | |
| tree | 11638546dc91c97815e5bdab8fa0b587481d0a3c /scripts/compute_bertscore.py | |
| parent | 8fe28101366dd32562b8c5534d7fe359b252bdf3 (diff) | |
Add PEFT baselines, ICL baselines, profile-based baseline, and unified pipeline
New baselines:
- baselines/peft_baseline.py: LoRA, Tiny LoRA, VeRA (per-user PEFT adaptation)
- baselines/dense_retrieval.py: Dense retrieval ICL (sentence-transformers); a sketch of the retrieval step follows this list
- baselines/profile_based.py: LLM-generated user profile conditioned generation
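
As context for the dense-retrieval baseline, here is a minimal sketch of retrieval-augmented ICL with sentence-transformers. The model name (`all-MiniLM-L6-v2`), the `history` data shape, and the prompt format are assumptions for illustration, not necessarily what baselines/dense_retrieval.py does:

```python
# Illustrative dense-retrieval ICL sketch; model name, data shapes, and
# prompt template are assumptions, not the repo's actual implementation.
import numpy as np
from sentence_transformers import SentenceTransformer


def build_icl_prompt(query, history, k=4, model_name="all-MiniLM-L6-v2"):
    """Pick the k history items most similar to the query and format a prompt.

    history: list of (input_text, output_text) pairs from one user.
    """
    encoder = SentenceTransformer(model_name)
    hist_inputs = [inp for inp, _ in history]
    # Normalized embeddings so a dot product equals cosine similarity.
    hist_emb = encoder.encode(hist_inputs, normalize_embeddings=True)
    query_emb = encoder.encode([query], normalize_embeddings=True)[0]
    sims = hist_emb @ query_emb
    top_idx = np.argsort(-sims)[:k]
    demos = "\n\n".join(
        f"Input: {history[i][0]}\nOutput: {history[i][1]}" for i in top_idx
    )
    return f"{demos}\n\nInput: {query}\nOutput:"
```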
New scripts:
- scripts/run_all_methods.py: Unified pipeline running all 9 methods with per-method directory output structure (method/per_user.json); the layout is sketched after this list
- scripts/run_peft_baselines.py: PEFT-only evaluation (legacy)
- scripts/run_significance.py: Significance tests (UPH+Base per-user)
- scripts/run_uph_base_per_user.py: UPH+Base with full per-user data
- scripts/compute_bertscore.py: BERTScore from saved predictions
- scripts/significance_test.py: Standalone significance test framework
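
For orientation, a sketch of the per-method output layout implied by run_all_methods.py and of a minimal loader for it. The results root and the method directory names are assumptions; only the `<method>/per_user.json` pattern comes from the description above:

```python
# Assumed layout (illustrative; actual directory names may differ):
#   <results_root>/<method>/per_user.json
import json
import os


def load_per_user(results_root, method):
    """Read one method's per-user records from its output directory."""
    path = os.path.join(results_root, method, "per_user.json")
    with open(path) as f:
        return json.load(f)

# e.g. load_per_user("outputs/all_methods/review_user", "lora")  # path is an assumption
```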
Updated .gitignore to exclude outputs/ directory.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat (limited to 'scripts/compute_bertscore.py')
| -rw-r--r-- | scripts/compute_bertscore.py | 145 |
1 file changed, 145 insertions, 0 deletions
diff --git a/scripts/compute_bertscore.py b/scripts/compute_bertscore.py
new file mode 100644
index 0000000..4fb1dc2
--- /dev/null
+++ b/scripts/compute_bertscore.py
@@ -0,0 +1,145 @@
+"""Compute BERTScore from saved per-user predictions.
+
+Uses saved predictions from significance tests (UPH, Base) and PEFT per-user data.
+
+Usage:
+    python scripts/compute_bertscore.py --task review --setting user --device cuda:0
+"""
+
+import sys
+import os
+import json
+import numpy as np
+from scipy import stats
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def paired_test(scores_a, scores_b, name_a, name_b):
+    a = np.array(scores_a)
+    b = np.array(scores_b)
+    diff = a - b
+
+    mean_a, mean_b = np.mean(a), np.mean(b)
+    mean_diff = np.mean(diff)
+
+    t_stat, t_pval = stats.ttest_rel(a, b)
+    try:
+        w_stat, w_pval = stats.wilcoxon(a, b)
+    except ValueError:
+        w_stat, w_pval = float('nan'), float('nan')
+
+    se = stats.sem(diff)
+    ci_low = mean_diff - 1.96 * se
+    ci_high = mean_diff + 1.96 * se
+
+    print(f" {name_a} vs {name_b}:")
+    print(f" Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}")
+    print(f" 95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]")
+    print(f" t-test: p={t_pval:.2e}, Wilcoxon: p={w_pval:.2e}")
+
+    return {
+        'mean_a': float(mean_a), 'mean_b': float(mean_b),
+        'mean_diff': float(mean_diff),
+        'ci_low': float(ci_low), 'ci_high': float(ci_high),
+        't_pval': float(t_pval), 'w_pval': float(w_pval),
+    }
+
+
+def main():
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--task', type=str, default='review')
+    parser.add_argument('--setting', type=str, default='user')
+    parser.add_argument('--device', type=str, default='cuda:0')
+    parser.add_argument('--bert_model', type=str, default='roberta-large')
+    args = parser.parse_args()
+
+    task = args.task
+    setting = args.setting
+    N = 200
+
+    # Load saved predictions
+    sig_path = f"outputs/significance/{task}_{setting}_significance.json"
+    peft_path = f"outputs/peft_baselines/{task}_{setting}_K4_N{N}_peft_per_user.json"
+
+    if not os.path.exists(sig_path):
+        print(f"Significance data not found: {sig_path}")
+        return
+    if not os.path.exists(peft_path):
+        print(f"PEFT per-user data not found: {peft_path}")
+        return
+
+    with open(sig_path) as f:
+        sig_data = json.load(f)
+    with open(peft_path) as f:
+        peft_data = json.load(f)
+
+    # Collect all predictions and references
+    all_preds = {}
+    all_refs = {}
+
+    # UPH and Base from significance data
+    all_preds['UPH'] = sig_data['uph_predictions']
+    all_preds['Base'] = sig_data['base_predictions']
+
+    # References (same for all methods)
+    refs = [u['reference'] for u in peft_data['per_user']['lora']]
+
+    # PEFT predictions
+    for method in ['lora', 'tiny_lora', 'vera']:
+        all_preds[method] = [u['prediction'] for u in peft_data['per_user'][method]]
+
+    print(f"=== BERTScore: {task}_{setting}, N={len(refs)} ===")
+    print(f"Model: {args.bert_model}")
+    print(f"Methods: {list(all_preds.keys())}")
+
+    # Compute BERTScore for each method
+    from bert_score import score as bert_score_fn
+
+    all_bertscore = {}
+    for method, preds in all_preds.items():
+        print(f"\n Computing BERTScore for {method}...")
+        P, R, F1 = bert_score_fn(
+            preds, refs,
+            model_type=args.bert_model,
+            device=args.device,
+            verbose=False,
+        )
+        all_bertscore[method] = F1.tolist()
+        print(f" Mean F1: {np.mean(F1.tolist()):.4f}")
+
+    # Summary table
+    print("\n" + "=" * 60)
+    print("BERTScore F1 Summary")
+    print("=" * 60)
+    for method in all_preds:
+        scores = all_bertscore[method]
+        print(f" {method:<15} Mean: {np.mean(scores):.4f}, Std: {np.std(scores):.4f}")
+
+    # Significance tests
+    print("\n" + "=" * 60)
+    print("Significance Tests — BERTScore F1 (paired)")
+    print("=" * 60)
+
+    test_results = {}
+    for other in ['Base', 'lora', 'tiny_lora', 'vera']:
+        r = paired_test(all_bertscore['UPH'], all_bertscore[other], 'UPH', other)
+        test_results[f'UPH_vs_{other}'] = r
+
+    # Save
+    output_path = f"outputs/significance/{task}_{setting}_bertscore.json"
+    with open(output_path, 'w') as f:
+        json.dump({
+            'bertscore_f1': all_bertscore,
+            'significance_tests': test_results,
+            'model': args.bert_model,
+            'task': task,
+            'setting': setting,
+            'num_examples': len(refs),
+        }, f, indent=2)
+    print(f"\nSaved to {output_path}")
+
+
+if __name__ == '__main__':
+    main()
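
As a reading aid, here is the shape of the two input files the script loads, reconstructed from the keys it accesses above. The string values are placeholders, not real data, and the per-user alignment is an assumption implied by the paired tests:

```python
# Input shapes implied by compute_bertscore.py; values are placeholders.

# outputs/significance/<task>_<setting>_significance.json
sig_data_example = {
    "uph_predictions": ["<UPH prediction for user 1>", "<... user 2>"],
    "base_predictions": ["<Base prediction for user 1>", "<... user 2>"],
}

# outputs/peft_baselines/<task>_<setting>_K4_N200_peft_per_user.json
peft_data_example = {
    "per_user": {
        "lora": [{"prediction": "<text>", "reference": "<gold text>"}],
        "tiny_lora": [{"prediction": "<text>", "reference": "<gold text>"}],
        "vera": [{"prediction": "<text>", "reference": "<gold text>"}],
    }
}

# The paired tests compare entries index by index, so every list above is
# assumed to be ordered by the same users and to have the same length.
```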
