summaryrefslogtreecommitdiff
path: root/scripts/significance_test.py
diff options
context:
space:
mode:
authorYurenHao0426 <Blackhao0426@gmail.com>2026-04-05 10:31:36 -0500
committerYurenHao0426 <Blackhao0426@gmail.com>2026-04-05 10:31:36 -0500
commitea4a8f837e81b5e5fab6086cb3014c711c5e58e9 (patch)
tree11638546dc91c97815e5bdab8fa0b587481d0a3c /scripts/significance_test.py
parent8fe28101366dd32562b8c5534d7fe359b252bdf3 (diff)
Add PEFT baselines, ICL baselines, profile-based, and unified pipeline
New baselines: - baselines/peft_baseline.py: LoRA, Tiny LoRA, VeRA (per-user PEFT adaptation) - baselines/dense_retrieval.py: Dense retrieval ICL (sentence-transformers) - baselines/profile_based.py: LLM-generated user profile conditioned generation New scripts: - scripts/run_all_methods.py: Unified pipeline running all 9 methods with per-method directory output structure (method/per_user.json) - scripts/run_peft_baselines.py: PEFT-only evaluation (legacy) - scripts/run_significance.py: Significance tests (UPH+Base per-user) - scripts/run_uph_base_per_user.py: UPH+Base with full per-user data - scripts/compute_bertscore.py: BERTScore from saved predictions - scripts/significance_test.py: Standalone significance test framework Updated .gitignore to exclude outputs/ directory. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat (limited to 'scripts/significance_test.py')
-rw-r--r--scripts/significance_test.py257
1 files changed, 257 insertions, 0 deletions
diff --git a/scripts/significance_test.py b/scripts/significance_test.py
new file mode 100644
index 0000000..a276b61
--- /dev/null
+++ b/scripts/significance_test.py
@@ -0,0 +1,257 @@
+"""Run significance tests between UPH and PEFT baselines.
+
+Re-runs all methods on review_user (or specified task/setting),
+saves per-user R-L scores, and computes paired significance tests.
+
+Usage:
+ python scripts/significance_test.py --task review --setting user --device cuda:0
+"""
+
+import sys
+import os
+import json
+import time
+import numpy as np
+import torch
+from scipy import stats
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from data.longlamp import load_longlamp, select_k_profile_items
+from data.templates import build_query_prompt
+from models.qwen_wrapper import QwenWrapper
+from models.cvh import UnconditionalHead
+from adapt.cache_hidden import cache_support_hidden_states
+from adapt.fit_theta import fit_theta
+from baselines.peft_baseline import (
+ PEFTBaseline, get_lora_config, get_tiny_lora_config, get_vera_config,
+)
+from eval.metrics import compute_rouge
+
+
def per_user_rouge_l(predictions, references):
    """Score each (prediction, reference) pair independently with ROUGE-L.

    Returns a list of per-example 'rougeL' values aligned with the inputs,
    suitable as paired samples for significance testing.
    """
    return [
        compute_rouge([pred], [ref])['rougeL']
        for pred, ref in zip(predictions, references)
    ]
+
+
def run_base(wrapper, examples, N):
    """Generate predictions from the base model with no personalization.

    Progress is logged every 40 examples; N is the total count used only
    for that progress report.
    """
    from scripts.run_fair_audit import generate_base_with_min

    predictions = []
    for i, ex in enumerate(examples):
        query_prompt = build_query_prompt(ex['query_input'], ex['task'])
        predictions.append(
            generate_base_with_min(wrapper, query_prompt, min_new_tokens=128)
        )
        if (i + 1) % 40 == 0:
            print(f"    Base: {i+1}/{N}")
    return predictions
+
+
def run_uph(wrapper, examples, support_sets, N, device):
    """Run UPH (Uncond-Head) personalization over all examples.

    For each example: cache hidden states over the user's support items,
    fit a small per-user parameter vector theta against the frozen LM head,
    then generate with the unconditional head blended into decoding.
    Falls back to plain base generation when no hidden states were cached.

    Args:
        wrapper: QwenWrapper exposing the model, hidden size, and LM-head weights.
        examples: evaluation examples (dicts with 'query_input' / 'task').
        support_sets: per-example lists of support profile items.
        N: total example count, used only for progress logging.
        device: torch device string for the head and theta fitting.

    Returns:
        List of generated prediction strings, aligned with `examples`.
    """
    H = wrapper.hidden_size
    # d=64 here must match the d passed to fit_theta below.
    uncond = UnconditionalHead(H, d=64, alpha=0.1, basis_seed=42).to(device)
    # Only pass the LM-head bias when the model actually has one.
    lm_head_bias = None
    if hasattr(wrapper.model.lm_head, 'bias') and wrapper.model.lm_head.bias is not None:
        lm_head_bias = wrapper.model.lm_head.bias.data

    preds = []
    for i, (ex, support) in enumerate(zip(examples, support_sets)):
        cached_h = cache_support_hidden_states(wrapper, support, ex['task'])
        if not cached_h:
            # No usable support hidden states: fall back to base generation.
            # NOTE(review): this fallback omits min_new_tokens=128 unlike
            # run_base -- confirm generate_base_with_min's default matches.
            prompt = build_query_prompt(ex['query_input'], ex['task'])
            from scripts.run_fair_audit import generate_base_with_min
            pred = generate_base_with_min(wrapper, prompt)
            preds.append(pred)
            continue

        # Fit the per-user head parameters theta on the cached support states.
        theta = fit_theta(
            cached_h=cached_h,
            lm_head_weight=wrapper.lm_head_weight,
            lm_head_bias=lm_head_bias,
            head_module=uncond,
            d=64, lr=0.05, steps=30, beta=0.05, lam=1e-4,
            max_grad_norm=5.0, device=device,
        )

        prompt = build_query_prompt(ex['query_input'], ex['task'])
        # Greedy decoding (temperature=0.0) with the personalized head
        # blended into the logits at gamma=0.5.
        pred = wrapper.generate_with_head_blended(
            prompt, theta, uncond.forward_fn,
            blend_gamma=0.5, max_new_tokens=512,
            min_new_tokens=128, temperature=0.0,
        )
        preds.append(pred)
        # Free per-user tensors before the next user to bound GPU memory.
        del cached_h, theta
        torch.cuda.empty_cache()

        if (i + 1) % 40 == 0:
            print(f"    UPH: {i+1}/{N}")
    return preds
+
+
def run_peft_method(wrapper, examples, support_sets, N, config, lr, desc):
    """Run one per-user PEFT baseline (LoRA / Tiny LoRA / VeRA) over all examples.

    Args:
        wrapper: shared QwenWrapper holding the frozen base model.
        examples: evaluation examples (dicts with 'query_input' / 'task').
        support_sets: per-example lists of profile items used for adaptation.
        N: total example count, used only for progress logging.
        config: PEFT config from get_lora_config / get_tiny_lora_config / get_vera_config.
        lr: learning rate for the per-user adaptation steps.
        desc: human-readable method name for logging.

    Returns:
        List of generated prediction strings, aligned with `examples`.
    """
    baseline = PEFTBaseline(wrapper, config)
    print(f"  {desc}: {baseline.n_params:,} params")
    preds = []
    # Ensure adapters are detached from the shared wrapper even if
    # adaptation/generation raises mid-loop; otherwise later methods would
    # run on a model still carrying this method's adapters.
    try:
        for i, (ex, support) in enumerate(zip(examples, support_sets)):
            pred = baseline.adapt_and_generate(
                support_items=support,
                query_input=ex['query_input'],
                task=ex['task'],
                lr=lr, steps=30,
                max_new_tokens=512, min_new_tokens=128,
            )
            preds.append(pred)
            if (i + 1) % 40 == 0:
                print(f"      {desc}: {i+1}/{N}")
    finally:
        baseline.cleanup()
    return preds
+
+
def paired_tests(scores_a, scores_b, name_a, name_b):
    """Run paired significance tests between two per-example score vectors.

    Performs a paired t-test and a Wilcoxon signed-rank test on the
    per-example differences, and reports a 95% confidence interval for the
    mean difference using the t critical value (consistent with the paired
    t-test; the previous hard-coded 1.96 is the normal approximation and is
    only correct as n -> inf).

    Args:
        scores_a, scores_b: equal-length sequences of per-example scores.
        name_a, name_b: labels used in the printed report.

    Returns:
        Dict of plain Python floats (JSON-serializable without default=str):
        means, mean difference, CI bounds, t statistic/p-value, Wilcoxon
        statistic/p-value (NaN when all differences are zero).
    """
    a = np.asarray(scores_a, dtype=float)
    b = np.asarray(scores_b, dtype=float)
    if a.shape != b.shape:
        raise ValueError("paired_tests requires equal-length score vectors")
    diff = a - b

    # Cast to plain floats so the result dict serializes cleanly to JSON.
    mean_a = float(np.mean(a))
    mean_b = float(np.mean(b))
    mean_diff = float(np.mean(diff))

    # Paired t-test
    t_stat, t_pval = stats.ttest_rel(a, b)

    # Wilcoxon signed-rank test; scipy raises ValueError when all
    # differences are zero, which we report as NaN.
    try:
        w_stat, w_pval = stats.wilcoxon(a, b)
    except ValueError:
        w_stat, w_pval = float('nan'), float('nan')

    # 95% CI for mean difference via the t distribution with n-1 dof.
    se = stats.sem(diff)
    n = len(diff)
    t_crit = stats.t.ppf(0.975, n - 1) if n > 1 else float('nan')
    ci_low = mean_diff - t_crit * se
    ci_high = mean_diff + t_crit * se

    print(f"\n  {name_a} vs {name_b}:")
    print(f"    Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}")
    print(f"    95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]")
    print(f"    Paired t-test: t={t_stat:.3f}, p={t_pval:.2e}")
    print(f"    Wilcoxon: W={w_stat:.0f}, p={w_pval:.2e}")

    return {
        'mean_a': mean_a, 'mean_b': mean_b, 'mean_diff': mean_diff,
        'ci_low': float(ci_low), 'ci_high': float(ci_high),
        't_stat': float(t_stat), 't_pval': float(t_pval),
        'w_stat': float(w_stat), 'w_pval': float(w_pval),
    }
+
+
def main():
    """Entry point: run all methods, score them, and run significance tests.

    Pipeline:
      1. Load the requested LongLaMP config and take the first N val examples.
      2. Generate predictions with Base, UPH, and three per-user PEFT baselines.
      3. Compute per-example ROUGE-L for each method.
      4. Run paired t-tests / Wilcoxon tests of UPH against each baseline.
      5. Save per-user scores and test results as JSON under --output_dir.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_eval', type=int, default=200)
    parser.add_argument('--task', type=str, default='review', choices=['review', 'topic'])
    parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal'])
    parser.add_argument('--device', type=str, default='cuda:0')
    parser.add_argument('--output_dir', type=str, default='outputs/significance')
    args = parser.parse_args()

    N = args.num_eval
    device = args.device
    task = args.task
    setting = args.setting

    # Map the CLI (task, setting) pair to the LongLaMP dataset config name.
    config_map = {
        ('review', 'user'): 'product_review_user',
        ('review', 'temporal'): 'product_review_temporal',
        ('topic', 'user'): 'topic_writing_user',
        ('topic', 'temporal'): 'topic_writing_temporal',
    }
    config_name = config_map[(task, setting)]

    print(f"=== Significance Tests: {task}_{setting}, N={N} ===")

    print("\nLoading data...")
    examples = load_longlamp(config_name, split='val')[:N]
    # K support items per user; fixed seed so every method is adapted on the
    # same support sets (keeps the comparisons paired).
    K = 4
    support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples]
    references = [ex['target_output'] for ex in examples]

    print(f"Loading model on {device}...")
    wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=device)

    all_preds = {}        # method key -> list of prediction strings
    all_per_user_rl = {}  # method key -> list of per-example ROUGE-L scores

    # Run Base
    print("\n--- Base ---")
    preds = run_base(wrapper, examples, N)
    all_preds['Base'] = preds
    all_per_user_rl['Base'] = per_user_rouge_l(preds, references)
    print(f"  Mean R-L: {np.mean(all_per_user_rl['Base']):.4f}")

    # Run UPH
    print("\n--- UPH ---")
    preds = run_uph(wrapper, examples, support_sets, N, device)
    all_preds['UPH'] = preds
    all_per_user_rl['UPH'] = per_user_rouge_l(preds, references)
    print(f"  Mean R-L: {np.mean(all_per_user_rl['UPH']):.4f}")

    # Run PEFT methods: (result key, PEFT config, adaptation lr, display name)
    peft_methods = [
        ('LoRA_r8', get_lora_config(rank=8), 1e-4, 'LoRA r=8'),
        ('TinyLoRA_r1', get_tiny_lora_config(rank=1), 1e-4, 'Tiny LoRA r=1'),
        ('VeRA_r256', get_vera_config(rank=256), 1e-3, 'VeRA r=256'),
    ]

    for key, config, lr, desc in peft_methods:
        print(f"\n--- {desc} ---")
        preds = run_peft_method(wrapper, examples, support_sets, N, config, lr, desc)
        all_preds[key] = preds
        all_per_user_rl[key] = per_user_rouge_l(preds, references)
        print(f"  Mean R-L: {np.mean(all_per_user_rl[key]):.4f}")

    # Significance tests
    print("\n" + "=" * 80)
    print("SIGNIFICANCE TESTS (ROUGE-L, paired)")
    print("=" * 80)

    # UPH is compared against Base and each PEFT baseline, pairwise.
    test_results = {}
    comparisons = [
        ('UPH', 'Base'),
        ('UPH', 'LoRA_r8'),
        ('UPH', 'TinyLoRA_r1'),
        ('UPH', 'VeRA_r256'),
    ]

    for name_a, name_b in comparisons:
        r = paired_tests(
            all_per_user_rl[name_a],
            all_per_user_rl[name_b],
            name_a, name_b,
        )
        test_results[f'{name_a}_vs_{name_b}'] = r

    # Save results
    os.makedirs(args.output_dir, exist_ok=True)
    output_path = os.path.join(args.output_dir, f'{task}_{setting}_significance.json')

    save_data = {
        'per_user_rougeL': {k: v for k, v in all_per_user_rl.items()},
        'significance_tests': test_results,
        'num_examples': N,
        'task': task,
        'setting': setting,
    }
    # default=str stringifies anything json can't encode natively
    # (e.g. numpy scalar types leaking out of the stats results).
    with open(output_path, 'w') as f:
        json.dump(save_data, f, indent=2, default=str)
    print(f"\nResults saved to {output_path}")


if __name__ == '__main__':
    main()