"""Run significance tests between UPH and PEFT baselines. Re-runs all methods on review_user (or specified task/setting), saves per-user R-L scores, and computes paired significance tests. Usage: python scripts/significance_test.py --task review --setting user --device cuda:0 """ import sys import os import json import time import numpy as np import torch from scipy import stats sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from data.longlamp import load_longlamp, select_k_profile_items from data.templates import build_query_prompt from models.qwen_wrapper import QwenWrapper from models.cvh import UnconditionalHead from adapt.cache_hidden import cache_support_hidden_states from adapt.fit_theta import fit_theta from baselines.peft_baseline import ( PEFTBaseline, get_lora_config, get_tiny_lora_config, get_vera_config, ) from eval.metrics import compute_rouge def per_user_rouge_l(predictions, references): """Compute per-example ROUGE-L scores.""" scores = [] for pred, ref in zip(predictions, references): r = compute_rouge([pred], [ref]) scores.append(r['rougeL']) return scores def run_base(wrapper, examples, N): """Run base (no personalization).""" from scripts.run_fair_audit import generate_base_with_min preds = [] for i, ex in enumerate(examples): prompt = build_query_prompt(ex['query_input'], ex['task']) pred = generate_base_with_min(wrapper, prompt, min_new_tokens=128) preds.append(pred) if (i + 1) % 40 == 0: print(f" Base: {i+1}/{N}") return preds def run_uph(wrapper, examples, support_sets, N, device): """Run UPH (Uncond-Head).""" H = wrapper.hidden_size uncond = UnconditionalHead(H, d=64, alpha=0.1, basis_seed=42).to(device) lm_head_bias = None if hasattr(wrapper.model.lm_head, 'bias') and wrapper.model.lm_head.bias is not None: lm_head_bias = wrapper.model.lm_head.bias.data preds = [] for i, (ex, support) in enumerate(zip(examples, support_sets)): cached_h = cache_support_hidden_states(wrapper, support, ex['task']) if not cached_h: prompt = build_query_prompt(ex['query_input'], ex['task']) from scripts.run_fair_audit import generate_base_with_min pred = generate_base_with_min(wrapper, prompt) preds.append(pred) continue theta = fit_theta( cached_h=cached_h, lm_head_weight=wrapper.lm_head_weight, lm_head_bias=lm_head_bias, head_module=uncond, d=64, lr=0.05, steps=30, beta=0.05, lam=1e-4, max_grad_norm=5.0, device=device, ) prompt = build_query_prompt(ex['query_input'], ex['task']) pred = wrapper.generate_with_head_blended( prompt, theta, uncond.forward_fn, blend_gamma=0.5, max_new_tokens=512, min_new_tokens=128, temperature=0.0, ) preds.append(pred) del cached_h, theta torch.cuda.empty_cache() if (i + 1) % 40 == 0: print(f" UPH: {i+1}/{N}") return preds def run_peft_method(wrapper, examples, support_sets, N, config, lr, desc): """Run a PEFT method.""" baseline = PEFTBaseline(wrapper, config) print(f" {desc}: {baseline.n_params:,} params") preds = [] for i, (ex, support) in enumerate(zip(examples, support_sets)): pred = baseline.adapt_and_generate( support_items=support, query_input=ex['query_input'], task=ex['task'], lr=lr, steps=30, max_new_tokens=512, min_new_tokens=128, ) preds.append(pred) if (i + 1) % 40 == 0: print(f" {desc}: {i+1}/{N}") baseline.cleanup() return preds def paired_tests(scores_a, scores_b, name_a, name_b): """Run paired t-test and Wilcoxon signed-rank test.""" a = np.array(scores_a) b = np.array(scores_b) diff = a - b mean_a = np.mean(a) mean_b = np.mean(b) mean_diff = np.mean(diff) # Paired t-test t_stat, t_pval = 
    # Wilcoxon signed-rank test (raises ValueError if all differences are zero)
    try:
        w_stat, w_pval = stats.wilcoxon(a, b)
    except ValueError:
        w_stat, w_pval = float('nan'), float('nan')

    # 95% CI for the mean difference (normal approximation)
    se = stats.sem(diff)
    ci_low = mean_diff - 1.96 * se
    ci_high = mean_diff + 1.96 * se

    print(f"\n {name_a} vs {name_b}:")
    print(f" Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}")
    print(f" 95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]")
    print(f" Paired t-test: t={t_stat:.3f}, p={t_pval:.2e}")
    print(f" Wilcoxon: W={w_stat:.0f}, p={w_pval:.2e}")

    # Cast to plain floats so the results serialize cleanly to JSON.
    return {
        'mean_a': float(mean_a),
        'mean_b': float(mean_b),
        'mean_diff': float(mean_diff),
        'ci_low': float(ci_low),
        'ci_high': float(ci_high),
        't_stat': float(t_stat),
        't_pval': float(t_pval),
        'w_stat': float(w_stat),
        'w_pval': float(w_pval),
    }


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_eval', type=int, default=200)
    parser.add_argument('--task', type=str, default='review', choices=['review', 'topic'])
    parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal'])
    parser.add_argument('--device', type=str, default='cuda:0')
    parser.add_argument('--output_dir', type=str, default='outputs/significance')
    args = parser.parse_args()

    N = args.num_eval
    device = args.device
    task = args.task
    setting = args.setting

    config_map = {
        ('review', 'user'): 'product_review_user',
        ('review', 'temporal'): 'product_review_temporal',
        ('topic', 'user'): 'topic_writing_user',
        ('topic', 'temporal'): 'topic_writing_temporal',
    }
    config_name = config_map[(task, setting)]

    print(f"=== Significance Tests: {task}_{setting}, N={N} ===")

    print("\nLoading data...")
    examples = load_longlamp(config_name, split='val')[:N]
    K = 4
    support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples]
    references = [ex['target_output'] for ex in examples]

    print(f"Loading model on {device}...")
    wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=device)

    all_preds = {}
    all_per_user_rl = {}

    # Run Base
    print("\n--- Base ---")
    preds = run_base(wrapper, examples, N)
    all_preds['Base'] = preds
    all_per_user_rl['Base'] = per_user_rouge_l(preds, references)
    print(f" Mean R-L: {np.mean(all_per_user_rl['Base']):.4f}")

    # Run UPH
    print("\n--- UPH ---")
    preds = run_uph(wrapper, examples, support_sets, N, device)
    all_preds['UPH'] = preds
    all_per_user_rl['UPH'] = per_user_rouge_l(preds, references)
    print(f" Mean R-L: {np.mean(all_per_user_rl['UPH']):.4f}")

    # Run PEFT methods
    peft_methods = [
        ('LoRA_r8', get_lora_config(rank=8), 1e-4, 'LoRA r=8'),
        ('TinyLoRA_r1', get_tiny_lora_config(rank=1), 1e-4, 'Tiny LoRA r=1'),
        ('VeRA_r256', get_vera_config(rank=256), 1e-3, 'VeRA r=256'),
    ]
    for key, config, lr, desc in peft_methods:
        print(f"\n--- {desc} ---")
        preds = run_peft_method(wrapper, examples, support_sets, N, config, lr, desc)
        all_preds[key] = preds
        all_per_user_rl[key] = per_user_rouge_l(preds, references)
        print(f" Mean R-L: {np.mean(all_per_user_rl[key]):.4f}")

    # Significance tests
    print("\n" + "=" * 80)
    print("SIGNIFICANCE TESTS (ROUGE-L, paired)")
    print("=" * 80)

    test_results = {}
    comparisons = [
        ('UPH', 'Base'),
        ('UPH', 'LoRA_r8'),
        ('UPH', 'TinyLoRA_r1'),
        ('UPH', 'VeRA_r256'),
    ]
    for name_a, name_b in comparisons:
        r = paired_tests(
            all_per_user_rl[name_a],
            all_per_user_rl[name_b],
            name_a,
            name_b,
        )
        test_results[f'{name_a}_vs_{name_b}'] = r

    # Save results
    os.makedirs(args.output_dir, exist_ok=True)
    output_path = os.path.join(args.output_dir, f'{task}_{setting}_significance.json')
    save_data = {
        'per_user_rougeL': all_per_user_rl,
        'significance_tests': test_results,
        'num_examples': N,
        'task': task,
        'setting': setting,
    }
    with open(output_path, 'w') as f:
        json.dump(save_data, f, indent=2, default=str)
    print(f"\nResults saved to {output_path}")


if __name__ == '__main__':
    main()
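# Minimal post-hoc sketch (kept as a comment, not executed): re-load the saved
# per-user scores and re-run a single paired comparison without regenerating any
# predictions. The path below assumes the default --output_dir/--task/--setting.
#
#   import json
#   from scipy import stats
#   with open('outputs/significance/review_user_significance.json') as f:
#       data = json.load(f)
#   uph = [float(x) for x in data['per_user_rougeL']['UPH']]
#   base = [float(x) for x in data['per_user_rougeL']['Base']]
#   print(stats.ttest_rel(uph, base))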