| author | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-05 10:31:36 -0500 |
|---|---|---|
| committer | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-05 10:31:36 -0500 |
| commit | ea4a8f837e81b5e5fab6086cb3014c711c5e58e9 | |
| tree | 11638546dc91c97815e5bdab8fa0b587481d0a3c /scripts | |
| parent | 8fe28101366dd32562b8c5534d7fe359b252bdf3 | |
Add PEFT baselines, ICL baselines, profile-based baseline, and unified pipeline
New baselines:
- baselines/peft_baseline.py: LoRA, Tiny LoRA, VeRA (per-user PEFT adaptation)
- baselines/dense_retrieval.py: Dense retrieval ICL (sentence-transformers)
- baselines/profile_based.py: Generation conditioned on an LLM-generated user profile
New scripts:
- scripts/run_all_methods.py: Unified pipeline running all 9 methods with a
  per-method output directory structure (method/per_user.json); see the loading sketch below
- scripts/run_peft_baselines.py: PEFT-only evaluation (legacy)
- scripts/run_significance.py: Significance tests (UPH+Base per-user)
- scripts/run_uph_base_per_user.py: UPH+Base with full per-user data
- scripts/compute_bertscore.py: BERTScore from saved predictions
- scripts/significance_test.py: Standalone significance test framework
Updated .gitignore to exclude outputs/ directory.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
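For reference, a minimal sketch (not part of the diff below) of loading the per-method per_user.json files that run_all_methods.py writes. It assumes the script's default output layout, outputs/unified/{task}_{setting}_K{K}/{method}/per_user.json, the review/user setting with K=4, and the per-entry 'metrics' keys saved by the pipeline; adjust the path and method list for your run.

```python
# Sketch: summarize per-method results written by scripts/run_all_methods.py.
# Paths and JSON keys follow the structure described above; the experiment
# directory and method list here are illustrative defaults.
import json
import os

import numpy as np

exp_dir = "outputs/unified/review_user_K4"  # {task}_{setting}_K{K}
methods = ["base", "uph", "lora", "tiny_lora", "vera"]

for method in methods:
    path = os.path.join(exp_dir, method, "per_user.json")
    if not os.path.exists(path):
        continue
    with open(path) as f:
        data = json.load(f)
    per_user = data["per_user"]
    # Each entry stores prediction, reference, and a 'metrics' dict
    # (rouge1, rougeL, meteor, sfd_all, sfd_nolen, length, feature_deltas).
    rl = np.mean([u["metrics"]["rougeL"] for u in per_user])
    sfd = np.mean([u["metrics"]["sfd_nolen"] for u in per_user])
    print(f"{method:<12} R-L={rl:.4f}  SFD_-len={sfd:.4f}  n={data['num_examples']}")
```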
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/compute_bertscore.py | 145 |
| -rw-r--r-- | scripts/run_all_methods.py | 438 |
| -rw-r--r-- | scripts/run_peft_baselines.py | 271 |
| -rw-r--r-- | scripts/run_significance.py | 265 |
| -rw-r--r-- | scripts/run_uph_base_per_user.py | 263 |
| -rw-r--r-- | scripts/significance_test.py | 257 |
6 files changed, 1639 insertions, 0 deletions
diff --git a/scripts/compute_bertscore.py b/scripts/compute_bertscore.py new file mode 100644 index 0000000..4fb1dc2 --- /dev/null +++ b/scripts/compute_bertscore.py @@ -0,0 +1,145 @@ +"""Compute BERTScore from saved per-user predictions. + +Uses saved predictions from significance tests (UPH, Base) and PEFT per-user data. + +Usage: + python scripts/compute_bertscore.py --task review --setting user --device cuda:0 +""" + +import sys +import os +import json +import numpy as np +from scipy import stats + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +def paired_test(scores_a, scores_b, name_a, name_b): + a = np.array(scores_a) + b = np.array(scores_b) + diff = a - b + + mean_a, mean_b = np.mean(a), np.mean(b) + mean_diff = np.mean(diff) + + t_stat, t_pval = stats.ttest_rel(a, b) + try: + w_stat, w_pval = stats.wilcoxon(a, b) + except ValueError: + w_stat, w_pval = float('nan'), float('nan') + + se = stats.sem(diff) + ci_low = mean_diff - 1.96 * se + ci_high = mean_diff + 1.96 * se + + print(f" {name_a} vs {name_b}:") + print(f" Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}") + print(f" 95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]") + print(f" t-test: p={t_pval:.2e}, Wilcoxon: p={w_pval:.2e}") + + return { + 'mean_a': float(mean_a), 'mean_b': float(mean_b), + 'mean_diff': float(mean_diff), + 'ci_low': float(ci_low), 'ci_high': float(ci_high), + 't_pval': float(t_pval), 'w_pval': float(w_pval), + } + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='review') + parser.add_argument('--setting', type=str, default='user') + parser.add_argument('--device', type=str, default='cuda:0') + parser.add_argument('--bert_model', type=str, default='roberta-large') + args = parser.parse_args() + + task = args.task + setting = args.setting + N = 200 + + # Load saved predictions + sig_path = f"outputs/significance/{task}_{setting}_significance.json" + peft_path = f"outputs/peft_baselines/{task}_{setting}_K4_N{N}_peft_per_user.json" + + if not os.path.exists(sig_path): + print(f"Significance data not found: {sig_path}") + return + if not os.path.exists(peft_path): + print(f"PEFT per-user data not found: {peft_path}") + return + + with open(sig_path) as f: + sig_data = json.load(f) + with open(peft_path) as f: + peft_data = json.load(f) + + # Collect all predictions and references + all_preds = {} + all_refs = {} + + # UPH and Base from significance data + all_preds['UPH'] = sig_data['uph_predictions'] + all_preds['Base'] = sig_data['base_predictions'] + + # References (same for all methods) + refs = [u['reference'] for u in peft_data['per_user']['lora']] + + # PEFT predictions + for method in ['lora', 'tiny_lora', 'vera']: + all_preds[method] = [u['prediction'] for u in peft_data['per_user'][method]] + + print(f"=== BERTScore: {task}_{setting}, N={len(refs)} ===") + print(f"Model: {args.bert_model}") + print(f"Methods: {list(all_preds.keys())}") + + # Compute BERTScore for each method + from bert_score import score as bert_score_fn + + all_bertscore = {} + for method, preds in all_preds.items(): + print(f"\n Computing BERTScore for {method}...") + P, R, F1 = bert_score_fn( + preds, refs, + model_type=args.bert_model, + device=args.device, + verbose=False, + ) + all_bertscore[method] = F1.tolist() + print(f" Mean F1: {np.mean(F1.tolist()):.4f}") + + # Summary table + print("\n" + "=" * 60) + print("BERTScore F1 Summary") + print("=" * 60) + for method in all_preds: + scores 
= all_bertscore[method] + print(f" {method:<15} Mean: {np.mean(scores):.4f}, Std: {np.std(scores):.4f}") + + # Significance tests + print("\n" + "=" * 60) + print("Significance Tests — BERTScore F1 (paired)") + print("=" * 60) + + test_results = {} + for other in ['Base', 'lora', 'tiny_lora', 'vera']: + r = paired_test(all_bertscore['UPH'], all_bertscore[other], 'UPH', other) + test_results[f'UPH_vs_{other}'] = r + + # Save + output_path = f"outputs/significance/{task}_{setting}_bertscore.json" + with open(output_path, 'w') as f: + json.dump({ + 'bertscore_f1': all_bertscore, + 'significance_tests': test_results, + 'model': args.bert_model, + 'task': task, + 'setting': setting, + 'num_examples': len(refs), + }, f, indent=2) + print(f"\nSaved to {output_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/run_all_methods.py b/scripts/run_all_methods.py new file mode 100644 index 0000000..c5eb523 --- /dev/null +++ b/scripts/run_all_methods.py @@ -0,0 +1,438 @@ +"""Unified evaluation pipeline: all methods, all per-user data saved. + +Runs Base, UPH, PEFT baselines, and ICL baselines in one script. +Saves complete per-user data (predictions, references, scores, metadata) for ALL methods. + +Usage: + python scripts/run_all_methods.py --task review --setting user --device cuda:0 + python scripts/run_all_methods.py --task review --setting user --methods base,uph,lora +""" + +import sys +import os +import json +import time +import numpy as np +import torch +from scipy import stats + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from data.longlamp import load_longlamp, select_k_profile_items +from data.templates import build_query_prompt, build_prompt_with_examples +from data.style_features import compute_sfd, compute_feature_deltas +from models.qwen_wrapper import QwenWrapper +from models.cvh import UnconditionalHead +from adapt.cache_hidden import cache_support_hidden_states +from adapt.fit_theta import fit_theta +from baselines.peft_baseline import ( + PEFTBaseline, get_lora_config, get_tiny_lora_config, get_vera_config, +) +from baselines.bm25_top1 import bm25_select_top1 +from baselines.dense_retrieval import DenseRetriever +from baselines.profile_based import generate_profile, build_profile_conditioned_prompt +from eval.metrics import compute_rouge, compute_meteor + + +ALL_METHODS = [ + 'base', 'uph', + 'prompt_all_k', 'bm25_top1', 'dense_top1', 'profile_based', + 'lora', 'tiny_lora', 'vera', +] + + +def compute_per_user_metrics(pred, ref, support_texts): + r = compute_rouge([pred], [ref]) + m = compute_meteor([pred], [ref]) + p = pred if pred.strip() else "empty" + sfd_all = compute_sfd(p, support_texts, exclude_length=False) + sfd_nolen = compute_sfd(p, support_texts, exclude_length=True) + deltas = compute_feature_deltas(p, support_texts) + return { + 'rouge1': r['rouge1'], + 'rougeL': r['rougeL'], + 'meteor': m, + 'sfd_all': sfd_all, + 'sfd_nolen': sfd_nolen, + 'length': len(pred.split()), + 'feature_deltas': {k: v['delta'] for k, v in deltas.items()}, + } + + +def generate_greedy(wrapper, prompt, max_new_tokens=512, min_new_tokens=128): + chat_messages = [ + {"role": "system", "content": "You are a helpful writing assistant."}, + {"role": "user", "content": prompt}, + ] + prompt_text = wrapper.tokenizer.apply_chat_template( + chat_messages, tokenize=False, add_generation_prompt=True + ) + input_ids = wrapper.tokenizer.encode(prompt_text, return_tensors="pt").to(wrapper.device) + with torch.no_grad(): + outputs = 
wrapper.model.generate( + input_ids, + max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, + temperature=None, top_p=None, do_sample=False, + pad_token_id=wrapper.tokenizer.pad_token_id, + ) + return wrapper.tokenizer.decode(outputs[0, input_ids.shape[1]:], skip_special_tokens=True) + + +class MethodRunner: + """Encapsulates running a single method across all examples.""" + + def __init__(self, wrapper, device, dense_retriever=None): + self.wrapper = wrapper + self.device = device + self.dense_retriever = dense_retriever + + def run(self, method_name, examples, support_sets, references, support_texts, N): + dispatch = { + 'base': self._run_base, + 'uph': self._run_uph, + 'prompt_all_k': self._run_prompt_all_k, + 'bm25_top1': self._run_bm25_top1, + 'dense_top1': self._run_dense_top1, + 'profile_based': self._run_profile_based, + 'lora': lambda *a: self._run_peft(*a, config=get_lora_config(rank=8), lr=1e-4, desc='LoRA r=8'), + 'tiny_lora': lambda *a: self._run_peft(*a, config=get_tiny_lora_config(rank=1), lr=1e-4, desc='Tiny LoRA r=1'), + 'vera': lambda *a: self._run_peft(*a, config=get_vera_config(rank=256), lr=1e-3, desc='VeRA r=256'), + } + + if method_name not in dispatch: + print(f"Unknown method: {method_name}") + return [] + + print(f"\n--- {method_name} ---") + per_user = dispatch[method_name](examples, support_sets, references, support_texts, N) + + avg_rl = np.mean([u['metrics']['rougeL'] for u in per_user]) + avg_sfd = np.mean([u['metrics']['sfd_nolen'] for u in per_user]) + print(f" Mean R-L: {avg_rl:.4f}, SFD_-len: {avg_sfd:.4f}") + return per_user + + def _make_per_user_entry(self, ex, ref, stexts, K, pred, timing, extra=None): + metrics = compute_per_user_metrics(pred, ref, stexts) + entry = { + 'example_id': ex['example_id'], + 'user_id': ex['user_id'], + 'prediction': pred, + 'reference': ref, + 'support_texts': stexts, + 'K': K, + 'metrics': metrics, + **timing, + } + if extra: + entry.update(extra) + return entry + + def _run_base(self, examples, support_sets, references, support_texts, N): + per_user = [] + for i, ex in enumerate(examples): + t0 = time.time() + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = generate_greedy(self.wrapper, prompt) + entry = self._make_per_user_entry( + ex, references[i], support_texts[i], len(support_sets[i]), + pred, {'gen_time': time.time() - t0} + ) + per_user.append(entry) + if (i + 1) % 40 == 0: + print(f" {i+1}/{N}") + return per_user + + def _run_prompt_all_k(self, examples, support_sets, references, support_texts, N): + per_user = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + prompt = build_prompt_with_examples(ex['query_input'], support, ex['task']) + pred = generate_greedy(self.wrapper, prompt) + entry = self._make_per_user_entry( + ex, references[i], support_texts[i], len(support), + pred, {'gen_time': time.time() - t0} + ) + per_user.append(entry) + if (i + 1) % 40 == 0: + print(f" {i+1}/{N}") + return per_user + + def _run_bm25_top1(self, examples, support_sets, references, support_texts, N): + per_user = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + selected = bm25_select_top1(ex['query_input'], support) + prompt = build_prompt_with_examples(ex['query_input'], selected, ex['task']) + pred = generate_greedy(self.wrapper, prompt) + entry = self._make_per_user_entry( + ex, references[i], support_texts[i], len(support), + pred, {'gen_time': time.time() - t0} + ) + per_user.append(entry) + if (i + 1) % 40 == 0: + 
print(f" {i+1}/{N}") + return per_user + + def _run_dense_top1(self, examples, support_sets, references, support_texts, N): + if self.dense_retriever is None: + self.dense_retriever = DenseRetriever(device='cpu') + per_user = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + selected = self.dense_retriever.retrieve_top_k(ex['query_input'], support, k=1) + prompt = build_prompt_with_examples(ex['query_input'], selected, ex['task']) + pred = generate_greedy(self.wrapper, prompt) + entry = self._make_per_user_entry( + ex, references[i], support_texts[i], len(support), + pred, {'gen_time': time.time() - t0} + ) + per_user.append(entry) + if (i + 1) % 40 == 0: + print(f" {i+1}/{N}") + return per_user + + def _run_profile_based(self, examples, support_sets, references, support_texts, N): + per_user = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + # Step 1: Generate user profile summary from support examples + profile = generate_profile(self.wrapper, support, ex['task']) + # Step 2: Generate conditioned on profile + prompt = build_profile_conditioned_prompt(ex['query_input'], profile, ex['task']) + pred = generate_greedy(self.wrapper, prompt) + entry = self._make_per_user_entry( + ex, references[i], support_texts[i], len(support), + pred, {'gen_time': time.time() - t0}, + extra={'profile_summary': profile}, + ) + per_user.append(entry) + if (i + 1) % 40 == 0: + print(f" {i+1}/{N}") + return per_user + + def _run_uph(self, examples, support_sets, references, support_texts, N): + H = self.wrapper.hidden_size + uncond = UnconditionalHead(H, d=64, alpha=0.1, basis_seed=42).to(self.device) + lm_head_bias = None + if hasattr(self.wrapper.model.lm_head, 'bias') and self.wrapper.model.lm_head.bias is not None: + lm_head_bias = self.wrapper.model.lm_head.bias.data + + per_user = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + cached_h = cache_support_hidden_states(self.wrapper, support, ex['task']) + if not cached_h: + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = generate_greedy(self.wrapper, prompt) + else: + theta = fit_theta( + cached_h=cached_h, + lm_head_weight=self.wrapper.lm_head_weight, + lm_head_bias=lm_head_bias, + head_module=uncond, + d=64, lr=0.05, steps=30, beta=0.05, lam=1e-4, + max_grad_norm=5.0, device=self.device, + ) + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = self.wrapper.generate_with_head_blended( + prompt, theta, uncond.forward_fn, + blend_gamma=0.5, max_new_tokens=512, + min_new_tokens=128, temperature=0.0, + ) + del cached_h, theta + torch.cuda.empty_cache() + + entry = self._make_per_user_entry( + ex, references[i], support_texts[i], len(support), + pred, {'adapt_time': time.time() - t0} + ) + per_user.append(entry) + if (i + 1) % 40 == 0: + avg_rl = np.mean([u['metrics']['rougeL'] for u in per_user]) + print(f" {i+1}/{N} (avg R-L: {avg_rl:.4f})") + return per_user + + def _run_peft(self, examples, support_sets, references, support_texts, N, + config, lr, desc): + baseline = PEFTBaseline(self.wrapper, config) + print(f" Trainable params: {baseline.n_params:,} ({baseline.n_bytes:,} bytes)") + + per_user = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + pred = baseline.adapt_and_generate( + support_items=support, + query_input=ex['query_input'], + task=ex['task'], + lr=lr, steps=30, + max_new_tokens=512, min_new_tokens=128, + ) + entry = self._make_per_user_entry( + ex, 
references[i], support_texts[i], len(support), + pred, {'adapt_time': time.time() - t0}, + extra={'n_params': baseline.n_params, 'n_bytes': baseline.n_bytes}, + ) + per_user.append(entry) + if (i + 1) % 20 == 0: + avg_rl = np.mean([u['metrics']['rougeL'] for u in per_user]) + avg_t = np.mean([u['adapt_time'] for u in per_user]) + print(f" {i+1}/{N} (avg R-L: {avg_rl:.4f}, avg time: {avg_t:.1f}s)") + + baseline.cleanup() + return per_user + + +def paired_test(scores_a, scores_b, name_a, name_b, metric_name): + a, b = np.array(scores_a), np.array(scores_b) + diff = a - b + mean_diff = np.mean(diff) + t_stat, t_pval = stats.ttest_rel(a, b) + try: + w_stat, w_pval = stats.wilcoxon(a, b) + except ValueError: + w_stat, w_pval = float('nan'), float('nan') + se = stats.sem(diff) + ci_low, ci_high = mean_diff - 1.96 * se, mean_diff + 1.96 * se + return { + 'mean_a': float(np.mean(a)), 'mean_b': float(np.mean(b)), + 'mean_diff': float(mean_diff), + 'ci_low': float(ci_low), 'ci_high': float(ci_high), + 't_pval': float(t_pval), 'w_pval': float(w_pval), + } + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--num_eval', type=int, default=200) + parser.add_argument('--task', type=str, default='review', choices=['review', 'topic']) + parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal']) + parser.add_argument('--methods', type=str, default='all', + help='Comma-separated methods or "all"') + parser.add_argument('--device', type=str, default='cuda:0') + parser.add_argument('--K', type=int, default=4) + parser.add_argument('--output_dir', type=str, default='outputs/unified') + args = parser.parse_args() + + N = args.num_eval + task = args.task + setting = args.setting + K = args.K + + config_map = { + ('review', 'user'): 'product_review_user', + ('review', 'temporal'): 'product_review_temporal', + ('topic', 'user'): 'topic_writing_user', + ('topic', 'temporal'): 'topic_writing_temporal', + } + config_name = config_map[(task, setting)] + + if args.methods == 'all': + methods = ALL_METHODS + else: + methods = [m.strip() for m in args.methods.split(',')] + + print(f"=== Unified Eval: {task}_{setting}, N={N}, K={K} ===") + print(f"Methods: {methods}") + print(f"Decode: greedy, min=128, max=512") + + print("\nLoading data...") + examples = load_longlamp(config_name, split='val')[:N] + support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples] + references = [ex['target_output'] for ex in examples] + support_texts = [[s['support_output'] for s in ss] for ss in support_sets] + + print(f"Loading model on {args.device}...") + wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=args.device) + + runner = MethodRunner(wrapper, args.device) + all_per_user = {} + + for method in methods: + per_user = runner.run(method, examples, support_sets, references, support_texts, N) + all_per_user[method] = per_user + + # Summary table + print("\n" + "=" * 90) + print(f"{'Method':<15} {'R-L':<8} {'METEOR':<8} {'SFD_-len':<9} {'Len':<6}") + print("-" * 90) + for method in methods: + pu = all_per_user[method] + rl = np.mean([u['metrics']['rougeL'] for u in pu]) + mt = np.mean([u['metrics']['meteor'] for u in pu]) + sf = np.mean([u['metrics']['sfd_nolen'] for u in pu]) + ln = np.mean([u['metrics']['length'] for u in pu]) + print(f"{method:<15} {rl:<8.4f} {mt:<8.4f} {sf:<9.4f} {ln:<6.0f}") + + # Significance tests (UPH vs all others) + if 'uph' in all_per_user: + print("\n" + "=" * 90) + print("Significance (UPH vs each, 
paired t-test p-value)") + print("=" * 90) + uph_rl = [u['metrics']['rougeL'] for u in all_per_user['uph']] + uph_sf = [u['metrics']['sfd_nolen'] for u in all_per_user['uph']] + sig_results = {} + for method in methods: + if method == 'uph': + continue + other_rl = [u['metrics']['rougeL'] for u in all_per_user[method]] + other_sf = [u['metrics']['sfd_nolen'] for u in all_per_user[method]] + rl_t = paired_test(uph_rl, other_rl, 'uph', method, 'R-L') + sf_t = paired_test(uph_sf, other_sf, 'uph', method, 'SFD') + sig_results[method] = {'rougeL': rl_t, 'sfd_nolen': sf_t} + print(f" vs {method:<12} R-L: diff={rl_t['mean_diff']:+.4f} p={rl_t['t_pval']:.2e} " + f"SFD: diff={sf_t['mean_diff']:+.4f} p={sf_t['t_pval']:.2e}") + + # Save per-method data in separate directories + # Structure: output_dir/task_setting_K{K}/{method}/per_user.json + exp_dir = os.path.join(args.output_dir, f"{task}_{setting}_K{K}") + os.makedirs(exp_dir, exist_ok=True) + + for method in methods: + method_dir = os.path.join(exp_dir, method) + os.makedirs(method_dir, exist_ok=True) + + pu = all_per_user[method] + agg_m = { + 'rougeL': float(np.mean([u['metrics']['rougeL'] for u in pu])), + 'meteor': float(np.mean([u['metrics']['meteor'] for u in pu])), + 'sfd_nolen': float(np.mean([u['metrics']['sfd_nolen'] for u in pu])), + 'avg_len': float(np.mean([u['metrics']['length'] for u in pu])), + } + + with open(os.path.join(method_dir, 'per_user.json'), 'w') as f: + json.dump({ + 'per_user': pu, + 'aggregate': agg_m, + 'num_examples': N, 'task': task, 'setting': setting, 'K': K, + 'method': method, + 'decode_policy': 'greedy, min=128, max=512', + }, f, indent=2, default=str) + + print(f" Saved: {method_dir}/per_user.json") + + # Also save a combined summary (aggregate only, no per-user data) + summary = {} + for method in methods: + pu = all_per_user[method] + summary[method] = { + 'rougeL': float(np.mean([u['metrics']['rougeL'] for u in pu])), + 'meteor': float(np.mean([u['metrics']['meteor'] for u in pu])), + 'sfd_nolen': float(np.mean([u['metrics']['sfd_nolen'] for u in pu])), + 'avg_len': float(np.mean([u['metrics']['length'] for u in pu])), + } + summary_path = os.path.join(exp_dir, 'summary.json') + with open(summary_path, 'w') as f: + json.dump({ + 'aggregate': summary, + 'significance': sig_results if 'uph' in all_per_user else {}, + 'num_examples': N, 'task': task, 'setting': setting, 'K': K, + 'methods': methods, + }, f, indent=2, default=str) + + print(f"\nPer-method data: {exp_dir}/{{method}}/per_user.json") + print(f"Summary: {summary_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/run_peft_baselines.py b/scripts/run_peft_baselines.py new file mode 100644 index 0000000..c23256b --- /dev/null +++ b/scripts/run_peft_baselines.py @@ -0,0 +1,271 @@ +"""Evaluate PEFT baselines (LoRA, Tiny LoRA, VeRA) with fair decode policy. + +Saves complete per-user data: predictions, references, scores, metadata. 
+ +Usage: + python scripts/run_peft_baselines.py --task review --setting user + python scripts/run_peft_baselines.py --task topic --setting user + python scripts/run_peft_baselines.py --task review --setting user --methods lora +""" + +import sys +import os +import json +import time +import torch + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from data.longlamp import load_longlamp, select_k_profile_items +from data.templates import build_query_prompt +from data.style_features import FEATURE_NAMES, compute_sfd, compute_feature_deltas +from models.qwen_wrapper import QwenWrapper +from baselines.peft_baseline import ( + PEFTBaseline, get_lora_config, get_tiny_lora_config, get_vera_config, +) +from eval.metrics import compute_rouge, compute_meteor + + +PEFT_CONFIGS = { + 'lora': { + 'config_fn': lambda: get_lora_config(rank=8), + 'lr': 1e-4, + 'steps': 30, + 'desc': 'LoRA (rank=8, q+v proj)', + }, + 'tiny_lora': { + 'config_fn': lambda: get_tiny_lora_config(rank=1), + 'lr': 1e-4, + 'steps': 30, + 'desc': 'Tiny LoRA (rank=1, q+v proj)', + }, + 'vera': { + 'config_fn': lambda: get_vera_config(rank=256), + 'lr': 1e-3, + 'steps': 30, + 'desc': 'VeRA (rank=256, q+v proj)', + }, +} + + +def compute_per_user_metrics(pred, ref, support_texts): + """Compute all metrics for a single prediction.""" + r = compute_rouge([pred], [ref]) + m = compute_meteor([pred], [ref]) + sfd_all = compute_sfd(pred if pred.strip() else "empty", support_texts, exclude_length=False) + sfd_nolen = compute_sfd(pred if pred.strip() else "empty", support_texts, exclude_length=True) + deltas = compute_feature_deltas(pred if pred.strip() else "empty", support_texts) + + return { + 'rouge1': r['rouge1'], + 'rougeL': r['rougeL'], + 'meteor': m, + 'sfd_all': sfd_all, + 'sfd_nolen': sfd_nolen, + 'length': len(pred.split()), + 'feature_deltas': {k: v['delta'] for k, v in deltas.items()}, + } + + +def run_peft_method(wrapper, examples, support_sets, references, support_texts, + method_name, config_entry, N): + """Run one PEFT baseline, returning per-user results.""" + cfg = config_entry['config_fn']() + lr = config_entry['lr'] + steps = config_entry['steps'] + + print(f"\n--- {config_entry['desc']} ---") + + baseline = PEFTBaseline(wrapper, cfg) + print(f" Trainable params: {baseline.n_params:,} ({baseline.n_bytes:,} bytes)") + + per_user = [] + + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + + pred = baseline.adapt_and_generate( + support_items=support, + query_input=ex['query_input'], + task=ex['task'], + lr=lr, + steps=steps, + max_new_tokens=512, + min_new_tokens=128, + verbose=False, + ) + adapt_time = time.time() - t0 + + # Per-user metrics + metrics = compute_per_user_metrics(pred, references[i], support_texts[i]) + + per_user.append({ + 'example_id': ex['example_id'], + 'user_id': ex['user_id'], + 'prediction': pred, + 'reference': references[i], + 'support_texts': support_texts[i], + 'K': len(support), + 'adapt_time': adapt_time, + 'metrics': metrics, + }) + + if (i + 1) % 20 == 0: + avg_t = sum(u['adapt_time'] for u in per_user) / len(per_user) + avg_rl = sum(u['metrics']['rougeL'] for u in per_user) / len(per_user) + print(f" {i+1}/{N} (avg time: {avg_t:.1f}s, avg R-L: {avg_rl:.4f})") + + # Aggregate metrics + agg = { + 'rouge1': sum(u['metrics']['rouge1'] for u in per_user) / N, + 'rougeL': sum(u['metrics']['rougeL'] for u in per_user) / N, + 'meteor': sum(u['metrics']['meteor'] for u in per_user) / N, + 'sfd_all': sum(u['metrics']['sfd_all'] for 
u in per_user) / N, + 'sfd_nolen': sum(u['metrics']['sfd_nolen'] for u in per_user) / N, + 'avg_len': sum(u['metrics']['length'] for u in per_user) / N, + 'adapt_time': sum(u['adapt_time'] for u in per_user) / N, + 'n_params': baseline.n_params, + 'n_bytes': baseline.n_bytes, + } + + # Cleanup + baseline.cleanup() + + print(f" R-L: {agg['rougeL']:.4f}, METEOR: {agg['meteor']:.4f}, " + f"SFD_-len: {agg['sfd_nolen']:.4f}, len: {agg['avg_len']:.0f}, " + f"adapt: {agg['adapt_time']:.1f}s") + + return per_user, agg + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--num_eval', type=int, default=200) + parser.add_argument('--task', type=str, default='review', choices=['review', 'topic']) + parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal']) + parser.add_argument('--methods', type=str, default='all', + help='Comma-separated methods: lora,tiny_lora,vera or "all"') + parser.add_argument('--output_dir', type=str, default='outputs/peft_baselines') + parser.add_argument('--device', type=str, default='cuda:1') + parser.add_argument('--steps', type=int, default=None, help='Override adaptation steps') + args = parser.parse_args() + + N = args.num_eval + task = args.task + setting = args.setting + + config_map = { + ('review', 'user'): 'product_review_user', + ('review', 'temporal'): 'product_review_temporal', + ('topic', 'user'): 'topic_writing_user', + ('topic', 'temporal'): 'topic_writing_temporal', + } + config_name = config_map[(task, setting)] + + if args.methods == 'all': + methods = list(PEFT_CONFIGS.keys()) + else: + methods = [m.strip() for m in args.methods.split(',')] + for m in methods: + if m not in PEFT_CONFIGS: + print(f"Unknown method: {m}. Available: {list(PEFT_CONFIGS.keys())}") + return + + print(f"=== PEFT Baselines: {task}_{setting}, N={N} ===") + print(f"Methods: {methods}") + print(f"Decode policy: greedy, min_new_tokens=128, max_new_tokens=512") + + print("\nLoading data...") + examples = load_longlamp(config_name, split='val')[:N] + K = 4 + support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples] + references = [ex['target_output'] for ex in examples] + support_texts = [[s['support_output'] for s in ss] for ss in support_sets] + + avg_ref_len = sum(len(r.split()) for r in references) / len(references) + print(f"Examples: {len(examples)}, Avg reference len: {avg_ref_len:.0f}") + + print(f"\nLoading model on {args.device}...") + wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=args.device) + + all_agg = {} + all_per_user = {} + + for method_name in methods: + config_entry = PEFT_CONFIGS[method_name].copy() + if args.steps is not None: + config_entry['steps'] = args.steps + + per_user, agg = run_peft_method( + wrapper, examples, support_sets, references, support_texts, + method_name, config_entry, N, + ) + all_agg[method_name] = agg + all_per_user[method_name] = per_user + + # Print summary + print("\n" + "=" * 100) + print("PEFT BASELINES SUMMARY") + print("=" * 100) + header = (f"{'Method':<25} {'R-L':<8} {'METEOR':<8} {'SFD_-len':<9} " + f"{'Len':<6} {'Params':<12} {'Bytes':<10} {'Time/user':<10}") + print(header) + print("-" * 100) + + uph_path = f"outputs/fair_audit/{task}_{setting}_K4_d64_N{N}_fair_results.json" + if os.path.exists(uph_path): + with open(uph_path) as f: + uph_data = json.load(f) + if 'Uncond-Head' in uph_data.get('results', {}): + uph_r = uph_data['results']['Uncond-Head'] + print(f"{'UPH (reference)':<25} {uph_r['rougeL']:<8.4f} 
{uph_r['meteor']:<8.4f} " + f"{uph_r['sfd_nolen']:<9.4f} {uph_r['avg_len']:<6.0f} " + f"{'64':<12} {'128':<10} {'~7s':<10}") + if 'Base' in uph_data.get('results', {}): + base_r = uph_data['results']['Base'] + print(f"{'Base (reference)':<25} {base_r['rougeL']:<8.4f} {base_r['meteor']:<8.4f} " + f"{base_r['sfd_nolen']:<9.4f} {base_r['avg_len']:<6.0f} " + f"{'0':<12} {'0':<10} {'0s':<10}") + print("-" * 100) + + for name, agg in all_agg.items(): + print(f"{PEFT_CONFIGS[name]['desc']:<25} {agg['rougeL']:<8.4f} {agg['meteor']:<8.4f} " + f"{agg['sfd_nolen']:<9.4f} {agg['avg_len']:<6.0f} " + f"{agg['n_params']:<12,} {agg['n_bytes']:<10,} " + f"{agg['adapt_time']:<10.1f}s") + + # Save complete results with per-user data + os.makedirs(args.output_dir, exist_ok=True) + exp_name = f"{task}_{setting}_K4_N{N}_peft" + + # Aggregate results (lightweight) + agg_path = os.path.join(args.output_dir, f"{exp_name}_results.json") + with open(agg_path, 'w') as f: + json.dump({ + 'aggregate': all_agg, + 'num_examples': N, + 'task': task, + 'setting': setting, + 'K': K, + 'decode_policy': 'greedy, min_new_tokens=128, max_new_tokens=512', + 'methods': {k: PEFT_CONFIGS[k]['desc'] for k in methods}, + }, f, indent=2, default=str) + + # Per-user data (complete) + per_user_path = os.path.join(args.output_dir, f"{exp_name}_per_user.json") + with open(per_user_path, 'w') as f: + json.dump({ + 'per_user': all_per_user, + 'num_examples': N, + 'task': task, + 'setting': setting, + 'K': K, + }, f, indent=2, default=str) + + print(f"\nAggregate results saved to {agg_path}") + print(f"Per-user data saved to {per_user_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/run_significance.py b/scripts/run_significance.py new file mode 100644 index 0000000..c8b2392 --- /dev/null +++ b/scripts/run_significance.py @@ -0,0 +1,265 @@ +"""Run UPH+Base with per-user scores, then compute significance tests vs PEFT baselines. + +Loads PEFT per-user data from run_peft_baselines.py output, runs UPH and Base +to get per-user R-L, then computes paired significance tests. 
+ +Usage: + python scripts/run_significance.py --task review --setting user --device cuda:0 +""" + +import sys +import os +import json +import time +import numpy as np +import torch +from scipy import stats + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from data.longlamp import load_longlamp, select_k_profile_items +from data.templates import build_query_prompt +from models.qwen_wrapper import QwenWrapper +from models.cvh import UnconditionalHead +from adapt.cache_hidden import cache_support_hidden_states +from adapt.fit_theta import fit_theta +from eval.metrics import compute_rouge, compute_meteor + + +def per_user_scores(predictions, references): + """Compute per-example ROUGE-L and METEOR.""" + rl_scores = [] + meteor_scores = [] + for pred, ref in zip(predictions, references): + r = compute_rouge([pred], [ref]) + m = compute_meteor([pred], [ref]) + rl_scores.append(r['rougeL']) + meteor_scores.append(m) + return rl_scores, meteor_scores + + +def generate_base(wrapper, prompt, max_new_tokens=512, min_new_tokens=128): + chat_messages = [ + {"role": "system", "content": "You are a helpful writing assistant."}, + {"role": "user", "content": prompt}, + ] + prompt_text = wrapper.tokenizer.apply_chat_template( + chat_messages, tokenize=False, add_generation_prompt=True + ) + input_ids = wrapper.tokenizer.encode(prompt_text, return_tensors="pt").to(wrapper.device) + with torch.no_grad(): + outputs = wrapper.model.generate( + input_ids, + max_new_tokens=max_new_tokens, + min_new_tokens=min_new_tokens, + temperature=None, top_p=None, do_sample=False, + pad_token_id=wrapper.tokenizer.pad_token_id, + ) + return wrapper.tokenizer.decode(outputs[0, input_ids.shape[1]:], skip_special_tokens=True) + + +def run_base(wrapper, examples, N): + preds = [] + for i, ex in enumerate(examples): + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = generate_base(wrapper, prompt) + preds.append(pred) + if (i + 1) % 40 == 0: + print(f" Base: {i+1}/{N}") + return preds + + +def run_uph(wrapper, examples, support_sets, N, device): + H = wrapper.hidden_size + uncond = UnconditionalHead(H, d=64, alpha=0.1, basis_seed=42).to(device) + lm_head_bias = None + if hasattr(wrapper.model.lm_head, 'bias') and wrapper.model.lm_head.bias is not None: + lm_head_bias = wrapper.model.lm_head.bias.data + + preds = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + cached_h = cache_support_hidden_states(wrapper, support, ex['task']) + if not cached_h: + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = generate_base(wrapper, prompt) + preds.append(pred) + continue + + theta = fit_theta( + cached_h=cached_h, + lm_head_weight=wrapper.lm_head_weight, + lm_head_bias=lm_head_bias, + head_module=uncond, + d=64, lr=0.05, steps=30, beta=0.05, lam=1e-4, + max_grad_norm=5.0, device=device, + ) + + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = wrapper.generate_with_head_blended( + prompt, theta, uncond.forward_fn, + blend_gamma=0.5, max_new_tokens=512, + min_new_tokens=128, temperature=0.0, + ) + preds.append(pred) + del cached_h, theta + torch.cuda.empty_cache() + + if (i + 1) % 40 == 0: + print(f" UPH: {i+1}/{N}") + return preds + + +def paired_tests(scores_a, scores_b, name_a, name_b): + a = np.array(scores_a) + b = np.array(scores_b) + diff = a - b + + mean_a, mean_b = np.mean(a), np.mean(b) + mean_diff = np.mean(diff) + + t_stat, t_pval = stats.ttest_rel(a, b) + try: + w_stat, w_pval = stats.wilcoxon(a, b) + except 
ValueError: + w_stat, w_pval = float('nan'), float('nan') + + se = stats.sem(diff) + ci_low = mean_diff - 1.96 * se + ci_high = mean_diff + 1.96 * se + + print(f"\n {name_a} vs {name_b}:") + print(f" Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}") + print(f" 95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]") + print(f" Paired t-test: t={t_stat:.3f}, p={t_pval:.2e}") + print(f" Wilcoxon: W={w_stat:.0f}, p={w_pval:.2e}") + + return { + 'mean_a': float(mean_a), 'mean_b': float(mean_b), + 'mean_diff': float(mean_diff), + 'ci_low': float(ci_low), 'ci_high': float(ci_high), + 't_stat': float(t_stat), 't_pval': float(t_pval), + 'w_stat': float(w_stat), 'w_pval': float(w_pval), + } + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--num_eval', type=int, default=200) + parser.add_argument('--task', type=str, default='review', choices=['review', 'topic']) + parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal']) + parser.add_argument('--device', type=str, default='cuda:0') + parser.add_argument('--peft_dir', type=str, default='outputs/peft_baselines') + parser.add_argument('--output_dir', type=str, default='outputs/significance') + args = parser.parse_args() + + N = args.num_eval + device = args.device + task = args.task + setting = args.setting + + config_map = { + ('review', 'user'): 'product_review_user', + ('review', 'temporal'): 'product_review_temporal', + ('topic', 'user'): 'topic_writing_user', + ('topic', 'temporal'): 'topic_writing_temporal', + } + config_name = config_map[(task, setting)] + + # Load PEFT per-user data + peft_path = os.path.join(args.peft_dir, f"{task}_{setting}_K4_N{N}_peft_per_user.json") + if not os.path.exists(peft_path): + print(f"PEFT per-user data not found: {peft_path}") + print("Run run_peft_baselines.py first.") + return + + with open(peft_path) as f: + peft_data = json.load(f) + + # Extract PEFT per-user R-L scores + peft_rl = {} + peft_meteor = {} + for method, users in peft_data['per_user'].items(): + peft_rl[method] = [u['metrics']['rougeL'] for u in users] + peft_meteor[method] = [u['metrics']['meteor'] for u in users] + + print(f"=== Significance Tests: {task}_{setting}, N={N} ===") + print(f"Loaded PEFT per-user data: {list(peft_rl.keys())}") + + # Load data and run UPH + Base + print("\nLoading data...") + examples = load_longlamp(config_name, split='val')[:N] + K = 4 + support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples] + references = [ex['target_output'] for ex in examples] + + print(f"Loading model on {device}...") + wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=device) + + # Run Base + print("\n--- Base ---") + base_preds = run_base(wrapper, examples, N) + base_rl, base_meteor = per_user_scores(base_preds, references) + print(f" Mean R-L: {np.mean(base_rl):.4f}, METEOR: {np.mean(base_meteor):.4f}") + + # Run UPH + print("\n--- UPH ---") + uph_preds = run_uph(wrapper, examples, support_sets, N, device) + uph_rl, uph_meteor = per_user_scores(uph_preds, references) + print(f" Mean R-L: {np.mean(uph_rl):.4f}, METEOR: {np.mean(uph_meteor):.4f}") + + # Significance tests + all_rl = {'Base': base_rl, 'UPH': uph_rl} + all_rl.update(peft_rl) + + all_meteor = {'Base': base_meteor, 'UPH': uph_meteor} + all_meteor.update(peft_meteor) + + print("\n" + "=" * 80) + print("SIGNIFICANCE TESTS — ROUGE-L (paired)") + print("=" * 80) + + rl_tests = {} + comparisons = [ + ('UPH', 'Base'), + ('UPH', 'lora'), + ('UPH', 
'tiny_lora'), + ('UPH', 'vera'), + ] + for name_a, name_b in comparisons: + if name_b in all_rl: + r = paired_tests(all_rl[name_a], all_rl[name_b], name_a, name_b) + rl_tests[f'{name_a}_vs_{name_b}'] = r + + print("\n" + "=" * 80) + print("SIGNIFICANCE TESTS — METEOR (paired)") + print("=" * 80) + + meteor_tests = {} + for name_a, name_b in comparisons: + if name_b in all_meteor: + r = paired_tests(all_meteor[name_a], all_meteor[name_b], name_a, name_b) + meteor_tests[f'{name_a}_vs_{name_b}'] = r + + # Save + os.makedirs(args.output_dir, exist_ok=True) + output_path = os.path.join(args.output_dir, f'{task}_{setting}_significance.json') + + save_data = { + 'per_user_rougeL': {k: [float(x) for x in v] for k, v in all_rl.items()}, + 'per_user_meteor': {k: [float(x) for x in v] for k, v in all_meteor.items()}, + 'significance_rougeL': rl_tests, + 'significance_meteor': meteor_tests, + 'num_examples': N, + 'task': task, + 'setting': setting, + 'base_predictions': base_preds, + 'uph_predictions': uph_preds, + } + with open(output_path, 'w') as f: + json.dump(save_data, f, indent=2, default=str) + print(f"\nResults saved to {output_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/run_uph_base_per_user.py b/scripts/run_uph_base_per_user.py new file mode 100644 index 0000000..4a48396 --- /dev/null +++ b/scripts/run_uph_base_per_user.py @@ -0,0 +1,263 @@ +"""Run UPH and Base with complete per-user data saving. + +Saves predictions, references, all per-user metrics (R-L, METEOR, SFD, feature deltas), +and metadata. Then computes significance tests vs PEFT baselines. + +Usage: + python scripts/run_uph_base_per_user.py --task review --setting user --device cuda:0 +""" + +import sys +import os +import json +import time +import numpy as np +import torch +from scipy import stats + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from data.longlamp import load_longlamp, select_k_profile_items +from data.templates import build_query_prompt +from data.style_features import compute_sfd, compute_feature_deltas +from models.qwen_wrapper import QwenWrapper +from models.cvh import UnconditionalHead +from adapt.cache_hidden import cache_support_hidden_states +from adapt.fit_theta import fit_theta +from eval.metrics import compute_rouge, compute_meteor + + +def compute_per_user_metrics(pred, ref, support_texts): + r = compute_rouge([pred], [ref]) + m = compute_meteor([pred], [ref]) + p = pred if pred.strip() else "empty" + sfd_all = compute_sfd(p, support_texts, exclude_length=False) + sfd_nolen = compute_sfd(p, support_texts, exclude_length=True) + deltas = compute_feature_deltas(p, support_texts) + return { + 'rouge1': r['rouge1'], + 'rougeL': r['rougeL'], + 'meteor': m, + 'sfd_all': sfd_all, + 'sfd_nolen': sfd_nolen, + 'length': len(pred.split()), + 'feature_deltas': {k: v['delta'] for k, v in deltas.items()}, + } + + +def generate_base(wrapper, prompt, max_new_tokens=512, min_new_tokens=128): + chat_messages = [ + {"role": "system", "content": "You are a helpful writing assistant."}, + {"role": "user", "content": prompt}, + ] + prompt_text = wrapper.tokenizer.apply_chat_template( + chat_messages, tokenize=False, add_generation_prompt=True + ) + input_ids = wrapper.tokenizer.encode(prompt_text, return_tensors="pt").to(wrapper.device) + with torch.no_grad(): + outputs = wrapper.model.generate( + input_ids, + max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, + temperature=None, top_p=None, do_sample=False, + 
pad_token_id=wrapper.tokenizer.pad_token_id, + ) + return wrapper.tokenizer.decode(outputs[0, input_ids.shape[1]:], skip_special_tokens=True) + + +def paired_test(scores_a, scores_b, name_a, name_b, metric_name): + a, b = np.array(scores_a), np.array(scores_b) + diff = a - b + mean_diff = np.mean(diff) + t_stat, t_pval = stats.ttest_rel(a, b) + try: + w_stat, w_pval = stats.wilcoxon(a, b) + except ValueError: + w_stat, w_pval = float('nan'), float('nan') + se = stats.sem(diff) + ci_low, ci_high = mean_diff - 1.96 * se, mean_diff + 1.96 * se + + print(f" {name_a} vs {name_b} ({metric_name}): " + f"diff={mean_diff:+.4f}, 95% CI=[{ci_low:+.4f}, {ci_high:+.4f}], " + f"t-test p={t_pval:.2e}, Wilcoxon p={w_pval:.2e}") + return { + 'mean_a': float(np.mean(a)), 'mean_b': float(np.mean(b)), + 'mean_diff': float(mean_diff), + 'ci_low': float(ci_low), 'ci_high': float(ci_high), + 't_pval': float(t_pval), 'w_pval': float(w_pval), + } + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--num_eval', type=int, default=200) + parser.add_argument('--task', type=str, default='review', choices=['review', 'topic']) + parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal']) + parser.add_argument('--device', type=str, default='cuda:0') + parser.add_argument('--output_dir', type=str, default='outputs/per_user') + args = parser.parse_args() + + N = args.num_eval + device = args.device + task = args.task + setting = args.setting + + config_map = { + ('review', 'user'): 'product_review_user', + ('review', 'temporal'): 'product_review_temporal', + ('topic', 'user'): 'topic_writing_user', + ('topic', 'temporal'): 'topic_writing_temporal', + } + config_name = config_map[(task, setting)] + + print(f"=== UPH + Base per-user: {task}_{setting}, N={N} ===") + + print("\nLoading data...") + examples = load_longlamp(config_name, split='val')[:N] + K = 4 + support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples] + references = [ex['target_output'] for ex in examples] + support_texts = [[s['support_output'] for s in ss] for ss in support_sets] + + print(f"Loading model on {device}...") + wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=device) + H = wrapper.hidden_size + + all_per_user = {} + + # === Base === + print("\n--- Base ---") + base_per_user = [] + for i, ex in enumerate(examples): + prompt = build_query_prompt(ex['query_input'], ex['task']) + t0 = time.time() + pred = generate_base(wrapper, prompt) + gen_time = time.time() - t0 + + metrics = compute_per_user_metrics(pred, references[i], support_texts[i]) + base_per_user.append({ + 'example_id': ex['example_id'], + 'user_id': ex['user_id'], + 'prediction': pred, + 'reference': references[i], + 'support_texts': support_texts[i], + 'K': K, + 'gen_time': gen_time, + 'metrics': metrics, + }) + if (i + 1) % 40 == 0: + avg_rl = np.mean([u['metrics']['rougeL'] for u in base_per_user]) + print(f" {i+1}/{N} (avg R-L: {avg_rl:.4f})") + + all_per_user['Base'] = base_per_user + avg_rl = np.mean([u['metrics']['rougeL'] for u in base_per_user]) + print(f" Mean R-L: {avg_rl:.4f}") + + # === UPH === + print("\n--- UPH ---") + uncond = UnconditionalHead(H, d=64, alpha=0.1, basis_seed=42).to(device) + lm_head_bias = None + if hasattr(wrapper.model.lm_head, 'bias') and wrapper.model.lm_head.bias is not None: + lm_head_bias = wrapper.model.lm_head.bias.data + + uph_per_user = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + t0 = time.time() + 
cached_h = cache_support_hidden_states(wrapper, support, ex['task']) + if not cached_h: + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = generate_base(wrapper, prompt) + else: + theta = fit_theta( + cached_h=cached_h, + lm_head_weight=wrapper.lm_head_weight, + lm_head_bias=lm_head_bias, + head_module=uncond, + d=64, lr=0.05, steps=30, beta=0.05, lam=1e-4, + max_grad_norm=5.0, device=device, + ) + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = wrapper.generate_with_head_blended( + prompt, theta, uncond.forward_fn, + blend_gamma=0.5, max_new_tokens=512, + min_new_tokens=128, temperature=0.0, + ) + del cached_h, theta + torch.cuda.empty_cache() + + adapt_time = time.time() - t0 + metrics = compute_per_user_metrics(pred, references[i], support_texts[i]) + uph_per_user.append({ + 'example_id': ex['example_id'], + 'user_id': ex['user_id'], + 'prediction': pred, + 'reference': references[i], + 'support_texts': support_texts[i], + 'K': K, + 'adapt_time': adapt_time, + 'metrics': metrics, + }) + if (i + 1) % 40 == 0: + avg_rl = np.mean([u['metrics']['rougeL'] for u in uph_per_user]) + print(f" {i+1}/{N} (avg R-L: {avg_rl:.4f})") + + all_per_user['UPH'] = uph_per_user + avg_rl = np.mean([u['metrics']['rougeL'] for u in uph_per_user]) + print(f" Mean R-L: {avg_rl:.4f}") + + # Save per-user data + os.makedirs(args.output_dir, exist_ok=True) + per_user_path = os.path.join(args.output_dir, f"{task}_{setting}_uph_base_per_user.json") + with open(per_user_path, 'w') as f: + json.dump({ + 'per_user': all_per_user, + 'num_examples': N, 'task': task, 'setting': setting, 'K': K, + }, f, indent=2, default=str) + print(f"\nPer-user data saved to {per_user_path}") + + # === Significance tests vs PEFT === + peft_path = f"outputs/peft_baselines/{task}_{setting}_K4_N{N}_peft_per_user.json" + if os.path.exists(peft_path): + with open(peft_path) as f: + peft_data = json.load(f) + + print("\n" + "=" * 80) + print("SIGNIFICANCE TESTS — ALL METRICS (UPH vs each baseline)") + print("=" * 80) + + uph_rl = [u['metrics']['rougeL'] for u in uph_per_user] + uph_sfd = [u['metrics']['sfd_nolen'] for u in uph_per_user] + uph_meteor = [u['metrics']['meteor'] for u in uph_per_user] + + all_tests = {} + comparisons = { + 'Base': base_per_user, + } + for m in ['lora', 'tiny_lora', 'vera']: + if m in peft_data['per_user']: + comparisons[m] = peft_data['per_user'][m] + + for name, users in comparisons.items(): + other_rl = [u['metrics']['rougeL'] for u in users] + other_sfd = [u['metrics']['sfd_nolen'] for u in users] + other_meteor = [u['metrics']['meteor'] for u in users] + + print(f"\n--- UPH vs {name} ---") + tests = {} + tests['rougeL'] = paired_test(uph_rl, other_rl, 'UPH', name, 'ROUGE-L') + tests['sfd_nolen'] = paired_test(uph_sfd, other_sfd, 'UPH', name, 'SFD_-len') + tests['meteor'] = paired_test(uph_meteor, other_meteor, 'UPH', name, 'METEOR') + all_tests[f'UPH_vs_{name}'] = tests + + # Save significance results + sig_path = os.path.join(args.output_dir, f"{task}_{setting}_all_significance.json") + with open(sig_path, 'w') as f: + json.dump({ + 'significance_tests': all_tests, + 'num_examples': N, 'task': task, 'setting': setting, + }, f, indent=2, default=str) + print(f"\nSignificance tests saved to {sig_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/significance_test.py b/scripts/significance_test.py new file mode 100644 index 0000000..a276b61 --- /dev/null +++ b/scripts/significance_test.py @@ -0,0 +1,257 @@ +"""Run significance tests between UPH and PEFT 
baselines. + +Re-runs all methods on review_user (or specified task/setting), +saves per-user R-L scores, and computes paired significance tests. + +Usage: + python scripts/significance_test.py --task review --setting user --device cuda:0 +""" + +import sys +import os +import json +import time +import numpy as np +import torch +from scipy import stats + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from data.longlamp import load_longlamp, select_k_profile_items +from data.templates import build_query_prompt +from models.qwen_wrapper import QwenWrapper +from models.cvh import UnconditionalHead +from adapt.cache_hidden import cache_support_hidden_states +from adapt.fit_theta import fit_theta +from baselines.peft_baseline import ( + PEFTBaseline, get_lora_config, get_tiny_lora_config, get_vera_config, +) +from eval.metrics import compute_rouge + + +def per_user_rouge_l(predictions, references): + """Compute per-example ROUGE-L scores.""" + scores = [] + for pred, ref in zip(predictions, references): + r = compute_rouge([pred], [ref]) + scores.append(r['rougeL']) + return scores + + +def run_base(wrapper, examples, N): + """Run base (no personalization).""" + from scripts.run_fair_audit import generate_base_with_min + preds = [] + for i, ex in enumerate(examples): + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = generate_base_with_min(wrapper, prompt, min_new_tokens=128) + preds.append(pred) + if (i + 1) % 40 == 0: + print(f" Base: {i+1}/{N}") + return preds + + +def run_uph(wrapper, examples, support_sets, N, device): + """Run UPH (Uncond-Head).""" + H = wrapper.hidden_size + uncond = UnconditionalHead(H, d=64, alpha=0.1, basis_seed=42).to(device) + lm_head_bias = None + if hasattr(wrapper.model.lm_head, 'bias') and wrapper.model.lm_head.bias is not None: + lm_head_bias = wrapper.model.lm_head.bias.data + + preds = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + cached_h = cache_support_hidden_states(wrapper, support, ex['task']) + if not cached_h: + prompt = build_query_prompt(ex['query_input'], ex['task']) + from scripts.run_fair_audit import generate_base_with_min + pred = generate_base_with_min(wrapper, prompt) + preds.append(pred) + continue + + theta = fit_theta( + cached_h=cached_h, + lm_head_weight=wrapper.lm_head_weight, + lm_head_bias=lm_head_bias, + head_module=uncond, + d=64, lr=0.05, steps=30, beta=0.05, lam=1e-4, + max_grad_norm=5.0, device=device, + ) + + prompt = build_query_prompt(ex['query_input'], ex['task']) + pred = wrapper.generate_with_head_blended( + prompt, theta, uncond.forward_fn, + blend_gamma=0.5, max_new_tokens=512, + min_new_tokens=128, temperature=0.0, + ) + preds.append(pred) + del cached_h, theta + torch.cuda.empty_cache() + + if (i + 1) % 40 == 0: + print(f" UPH: {i+1}/{N}") + return preds + + +def run_peft_method(wrapper, examples, support_sets, N, config, lr, desc): + """Run a PEFT method.""" + baseline = PEFTBaseline(wrapper, config) + print(f" {desc}: {baseline.n_params:,} params") + preds = [] + for i, (ex, support) in enumerate(zip(examples, support_sets)): + pred = baseline.adapt_and_generate( + support_items=support, + query_input=ex['query_input'], + task=ex['task'], + lr=lr, steps=30, + max_new_tokens=512, min_new_tokens=128, + ) + preds.append(pred) + if (i + 1) % 40 == 0: + print(f" {desc}: {i+1}/{N}") + baseline.cleanup() + return preds + + +def paired_tests(scores_a, scores_b, name_a, name_b): + """Run paired t-test and Wilcoxon signed-rank test.""" + a = 
np.array(scores_a) + b = np.array(scores_b) + diff = a - b + + mean_a = np.mean(a) + mean_b = np.mean(b) + mean_diff = np.mean(diff) + + # Paired t-test + t_stat, t_pval = stats.ttest_rel(a, b) + + # Wilcoxon signed-rank test + try: + w_stat, w_pval = stats.wilcoxon(a, b) + except ValueError: + w_stat, w_pval = float('nan'), float('nan') + + # 95% CI for mean difference + se = stats.sem(diff) + ci_low = mean_diff - 1.96 * se + ci_high = mean_diff + 1.96 * se + + print(f"\n {name_a} vs {name_b}:") + print(f" Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}") + print(f" 95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]") + print(f" Paired t-test: t={t_stat:.3f}, p={t_pval:.2e}") + print(f" Wilcoxon: W={w_stat:.0f}, p={w_pval:.2e}") + + return { + 'mean_a': mean_a, 'mean_b': mean_b, 'mean_diff': mean_diff, + 'ci_low': ci_low, 'ci_high': ci_high, + 't_stat': t_stat, 't_pval': t_pval, + 'w_stat': float(w_stat), 'w_pval': float(w_pval), + } + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--num_eval', type=int, default=200) + parser.add_argument('--task', type=str, default='review', choices=['review', 'topic']) + parser.add_argument('--setting', type=str, default='user', choices=['user', 'temporal']) + parser.add_argument('--device', type=str, default='cuda:0') + parser.add_argument('--output_dir', type=str, default='outputs/significance') + args = parser.parse_args() + + N = args.num_eval + device = args.device + task = args.task + setting = args.setting + + config_map = { + ('review', 'user'): 'product_review_user', + ('review', 'temporal'): 'product_review_temporal', + ('topic', 'user'): 'topic_writing_user', + ('topic', 'temporal'): 'topic_writing_temporal', + } + config_name = config_map[(task, setting)] + + print(f"=== Significance Tests: {task}_{setting}, N={N} ===") + + print("\nLoading data...") + examples = load_longlamp(config_name, split='val')[:N] + K = 4 + support_sets = [select_k_profile_items(ex['profile_items'], K, seed=0) for ex in examples] + references = [ex['target_output'] for ex in examples] + + print(f"Loading model on {device}...") + wrapper = QwenWrapper('Qwen/Qwen2.5-1.5B-Instruct', device=device) + + all_preds = {} + all_per_user_rl = {} + + # Run Base + print("\n--- Base ---") + preds = run_base(wrapper, examples, N) + all_preds['Base'] = preds + all_per_user_rl['Base'] = per_user_rouge_l(preds, references) + print(f" Mean R-L: {np.mean(all_per_user_rl['Base']):.4f}") + + # Run UPH + print("\n--- UPH ---") + preds = run_uph(wrapper, examples, support_sets, N, device) + all_preds['UPH'] = preds + all_per_user_rl['UPH'] = per_user_rouge_l(preds, references) + print(f" Mean R-L: {np.mean(all_per_user_rl['UPH']):.4f}") + + # Run PEFT methods + peft_methods = [ + ('LoRA_r8', get_lora_config(rank=8), 1e-4, 'LoRA r=8'), + ('TinyLoRA_r1', get_tiny_lora_config(rank=1), 1e-4, 'Tiny LoRA r=1'), + ('VeRA_r256', get_vera_config(rank=256), 1e-3, 'VeRA r=256'), + ] + + for key, config, lr, desc in peft_methods: + print(f"\n--- {desc} ---") + preds = run_peft_method(wrapper, examples, support_sets, N, config, lr, desc) + all_preds[key] = preds + all_per_user_rl[key] = per_user_rouge_l(preds, references) + print(f" Mean R-L: {np.mean(all_per_user_rl[key]):.4f}") + + # Significance tests + print("\n" + "=" * 80) + print("SIGNIFICANCE TESTS (ROUGE-L, paired)") + print("=" * 80) + + test_results = {} + comparisons = [ + ('UPH', 'Base'), + ('UPH', 'LoRA_r8'), + ('UPH', 'TinyLoRA_r1'), + ('UPH', 'VeRA_r256'), + ] + + for 
name_a, name_b in comparisons: + r = paired_tests( + all_per_user_rl[name_a], + all_per_user_rl[name_b], + name_a, name_b, + ) + test_results[f'{name_a}_vs_{name_b}'] = r + + # Save results + os.makedirs(args.output_dir, exist_ok=True) + output_path = os.path.join(args.output_dir, f'{task}_{setting}_significance.json') + + save_data = { + 'per_user_rougeL': {k: v for k, v in all_per_user_rl.items()}, + 'significance_tests': test_results, + 'num_examples': N, + 'task': task, + 'setting': setting, + } + with open(output_path, 'w') as f: + json.dump(save_data, f, indent=2, default=str) + print(f"\nResults saved to {output_path}") + + +if __name__ == '__main__': + main() |
