| author | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-05 10:31:36 -0500 |
|---|---|---|
| committer | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-05 10:31:36 -0500 |
| commit | ea4a8f837e81b5e5fab6086cb3014c711c5e58e9 (patch) | |
| tree | 11638546dc91c97815e5bdab8fa0b587481d0a3c /scripts/significance_test.py | |
| parent | 8fe28101366dd32562b8c5534d7fe359b252bdf3 (diff) | |
Add PEFT baselines, ICL baselines, a profile-based baseline, and a unified pipeline
New baselines:
- baselines/peft_baseline.py: LoRA, Tiny LoRA, VeRA (per-user PEFT adaptation)
- baselines/dense_retrieval.py: Dense retrieval ICL (sentence-transformers)
- baselines/profile_based.py: generation conditioned on an LLM-generated user profile
New scripts:
- scripts/run_all_methods.py: Unified pipeline running all 9 methods with
per-method directory output structure (method/per_user.json)
- scripts/run_peft_baselines.py: PEFT-only evaluation (legacy)
- scripts/run_significance.py: Significance tests (UPH+Base per-user)
- scripts/run_uph_base_per_user.py: UPH+Base with full per-user data
- scripts/compute_bertscore.py: BERTScore from saved predictions
- scripts/significance_test.py: Standalone significance test framework
Updated .gitignore to exclude outputs/ directory.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat (limited to 'scripts/significance_test.py')
| -rw-r--r-- | scripts/significance_test.py | 257 |
1 file changed, 257 insertions, 0 deletions
diff --git a/scripts/significance_test.py b/scripts/significance_test.py
new file mode 100644
index 0000000..a276b61
--- /dev/null
+++ b/scripts/significance_test.py
@@ -0,0 +1,257 @@
+"""Run significance tests between UPH and PEFT baselines.
+
+Re-runs all methods on review_user (or specified task/setting),
+saves per-user R-L scores, and computes paired significance tests.
+
+Usage:
+    python scripts/significance_test.py --task review --setting user --device cuda:0
+"""
+
+import sys
+import os
+import json
+import time
+import numpy as np
+import torch
+from scipy import stats
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from data.longlamp import load_longlamp, select_k_profile_items
+from data.templates import build_query_prompt
+from models.qwen_wrapper import QwenWrapper
+from models.cvh import UnconditionalHead
+from adapt.cache_hidden import cache_support_hidden_states
+from adapt.fit_theta import fit_theta
+from baselines.peft_baseline import (
+    PEFTBaseline, get_lora_config, get_tiny_lora_config, get_vera_config,
+)
+from eval.metrics import compute_rouge
+
+
+def per_user_rouge_l(predictions, references):
+    """Compute per-example ROUGE-L scores."""
+    scores = []
+    for pred, ref in zip(predictions, references):
+        r = compute_rouge([pred], [ref])
+        scores.append(r['rougeL'])
+    return scores
+
+
+def run_base(wrapper, examples, N):
+    """Run base (no personalization)."""
+    from scripts.run_fair_audit import generate_base_with_min
+    preds = []
+    for i, ex in enumerate(examples):
+        prompt = build_query_prompt(ex['query_input'], ex['task'])
+        pred = generate_base_with_min(wrapper, prompt, min_new_tokens=128)
+        preds.append(pred)
+        if (i + 1) % 40 == 0:
+            print(f"  Base: {i+1}/{N}")
+    return preds
+
+
+def run_uph(wrapper, examples, support_sets, N, device):
+    """Run UPH (Uncond-Head)."""
+    H = wrapper.hidden_size
+    uncond = UnconditionalHead(H, d=64, alpha=0.1, basis_seed=42).to(device)
+    lm_head_bias = None
+    if hasattr(wrapper.model.lm_head, 'bias') and wrapper.model.lm_head.bias is not None:
+        lm_head_bias = wrapper.model.lm_head.bias.data
+
+    preds = []
+    for i, (ex, support) in enumerate(zip(examples, support_sets)):
+        cached_h = cache_support_hidden_states(wrapper, support, ex['task'])
+        if not cached_h:
+            prompt = build_query_prompt(ex['query_input'], ex['task'])
+            from scripts.run_fair_audit import generate_base_with_min
+            pred = generate_base_with_min(wrapper, prompt)
+            preds.append(pred)
+            continue
+
+        theta = fit_theta(
+            cached_h=cached_h,
+            lm_head_weight=wrapper.lm_head_weight,
+            lm_head_bias=lm_head_bias,
+            head_module=uncond,
+            d=64, lr=0.05, steps=30, beta=0.05, lam=1e-4,
+            max_grad_norm=5.0, device=device,
+        )
+
+        prompt = build_query_prompt(ex['query_input'], ex['task'])
+        pred = wrapper.generate_with_head_blended(
+            prompt, theta, uncond.forward_fn,
+            blend_gamma=0.5, max_new_tokens=512,
+            min_new_tokens=128, temperature=0.0,
+        )
+        preds.append(pred)
+        del cached_h, theta
+        torch.cuda.empty_cache()
+
+        if (i + 1) % 40 == 0:
+            print(f"  UPH: {i+1}/{N}")
+    return preds
+
+
+def run_peft_method(wrapper, examples, support_sets, N, config, lr, desc):
+    """Run a PEFT method."""
+    baseline = PEFTBaseline(wrapper, config)
+    print(f"  {desc}: {baseline.n_params:,} params")
+    preds = []
+    for i, (ex, support) in enumerate(zip(examples, support_sets)):
+        pred = baseline.adapt_and_generate(
+            support_items=support,
+            query_input=ex['query_input'],
+            task=ex['task'],
+            lr=lr, steps=30,
+            max_new_tokens=512, min_new_tokens=128,
+        )
+        preds.append(pred)
+        if (i + 1) % 40 == 0:
+            print(f"  {desc}: {i+1}/{N}")
+    baseline.cleanup()
+    return preds
+
+
+def paired_tests(scores_a, scores_b, name_a, name_b):
+    """Run paired t-test and Wilcoxon signed-rank test."""
+    a = np.array(scores_a)
+    b = np.array(scores_b)
+    diff = a - b
+
+    mean_a = np.mean(a)
+    mean_b = np.mean(b)
+    mean_diff = np.mean(diff)
+
+    # Paired t-test
+    t_stat, t_pval = stats.ttest_rel(a, b)
+
+    # Wilcoxon signed-rank test
+    try:
+        w_stat, w_pval = stats.wilcoxon(a, b)
+    except ValueError:
+        w_stat, w_pval = float('nan'), float('nan')
+
+    # 95% CI for mean difference
+    se = stats.sem(diff)
+    ci_low = mean_diff - 1.96 * se
+    ci_high = mean_diff + 1.96 * se
+
+    print(f"\n  {name_a} vs {name_b}:")
+    print(f"    Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}")
+    print(f"    95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]")
+    print(f"    Paired t-test: t={t_stat:.3f}, p={t_pval:.2e}")
+    print(f"    Wilcoxon: W={w_stat:.0f}, p={w_pval:.2e}")
+
+    return {
+        'mean_a': mean_a, 'mean_b': mean_b, 'mean_diff': mean_diff,
+        'ci_low': ci_low, 'ci_high': ci_high,
+        't_stat': t_stat, 't_pval': t_pval,
+        'w_stat': float(w_stat), 'w_pval': float(w_pval),
+    }
+
+
+def main():
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num_eval', type=int, default=200)
+    parser.add_argument('--task', type=str, default='review', choices=['review', 'topic'])
+    parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal'])
+    parser.add_argument('--device', type=str, default='cuda:0')
+    parser.add_argument('--output_dir', type=str, default='outputs/significance')
+    args = parser.parse_args()
+
+    N = args.num_eval
+    device = args.device
+    task = args.task
+    setting = args.setting
+
+    config_map = {
+        ('review', 'user'): 'product_review_user',
+        ('review', 'temporal'): 'product_review_temporal',
+        ('topic', 'user'): 'topic_writing_user',
+        ('topic', 'temporal'): 'topic_writing_temporal',
+    }
+    config_name = config_map[(task, setting)]
+
+    print(f"=== Significance Tests: {task}_{setting}, N={N} ===")
+
+    print("\nLoading data...")
+    examples = load_longlamp(config_name, split='val')[:N]
+    K = 4
+    support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples]
+    references = [ex['target_output'] for ex in examples]
+
+    print(f"Loading model on {device}...")
+    wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=device)
+
+    all_preds = {}
+    all_per_user_rl = {}
+
+    # Run Base
+    print("\n--- Base ---")
+    preds = run_base(wrapper, examples, N)
+    all_preds['Base'] = preds
+    all_per_user_rl['Base'] = per_user_rouge_l(preds, references)
+    print(f"  Mean R-L: {np.mean(all_per_user_rl['Base']):.4f}")
+
+    # Run UPH
+    print("\n--- UPH ---")
+    preds = run_uph(wrapper, examples, support_sets, N, device)
+    all_preds['UPH'] = preds
+    all_per_user_rl['UPH'] = per_user_rouge_l(preds, references)
+    print(f"  Mean R-L: {np.mean(all_per_user_rl['UPH']):.4f}")
+
+    # Run PEFT methods
+    peft_methods = [
+        ('LoRA_r8', get_lora_config(rank=8), 1e-4, 'LoRA r=8'),
+        ('TinyLoRA_r1', get_tiny_lora_config(rank=1), 1e-4, 'Tiny LoRA r=1'),
+        ('VeRA_r256', get_vera_config(rank=256), 1e-3, 'VeRA r=256'),
+    ]
+
+    for key, config, lr, desc in peft_methods:
+        print(f"\n--- {desc} ---")
+        preds = run_peft_method(wrapper, examples, support_sets, N, config, lr, desc)
+        all_preds[key] = preds
+        all_per_user_rl[key] = per_user_rouge_l(preds, references)
+        print(f"  Mean R-L: {np.mean(all_per_user_rl[key]):.4f}")
+
+    # Significance tests
+    print("\n" + "=" * 80)
+    print("SIGNIFICANCE TESTS (ROUGE-L, paired)")
+    print("=" * 80)
+
+    test_results = {}
+    comparisons = [
+        ('UPH', 'Base'),
+        ('UPH', 'LoRA_r8'),
+        ('UPH', 'TinyLoRA_r1'),
+        ('UPH', 'VeRA_r256'),
+    ]
+
+    for name_a, name_b in comparisons:
+        r = paired_tests(
+            all_per_user_rl[name_a],
+            all_per_user_rl[name_b],
+            name_a, name_b,
+        )
+        test_results[f'{name_a}_vs_{name_b}'] = r
+
+    # Save results
+    os.makedirs(args.output_dir, exist_ok=True)
+    output_path = os.path.join(args.output_dir, f'{task}_{setting}_significance.json')
+
+    save_data = {
+        'per_user_rougeL': {k: v for k, v in all_per_user_rl.items()},
+        'significance_tests': test_results,
+        'num_examples': N,
+        'task': task,
+        'setting': setting,
+    }
+    with open(output_path, 'w') as f:
+        json.dump(save_data, f, indent=2, default=str)
+    print(f"\nResults saved to {output_path}")
+
+
+if __name__ == '__main__':
+    main()
