"""Compute BERTScore from saved per-user predictions.

Uses saved predictions from significance tests (UPH, Base) and PEFT per-user data.

Usage:
    python scripts/compute_bertscore.py --task review --setting user --device cuda:0
"""

import argparse
import json
import os
import sys

import numpy as np
from scipy import stats

# Make the repository root importable when this script is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def paired_test(scores_a, scores_b, name_a, name_b):
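    """Paired t-test and Wilcoxon signed-rank test on per-example differences.

    The 95% CI on the mean difference uses a normal approximation (+/- 1.96 * SEM);
    the Wilcoxon statistic falls back to NaN when it cannot be computed
    (e.g. all-zero differences).
    """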
    a = np.array(scores_a)
    b = np.array(scores_b)
    diff = a - b

    mean_a, mean_b = np.mean(a), np.mean(b)
    mean_diff = np.mean(diff)

    t_stat, t_pval = stats.ttest_rel(a, b)
    try:
        w_stat, w_pval = stats.wilcoxon(a, b)
    except ValueError:
        w_stat, w_pval = float('nan'), float('nan')

    se = stats.sem(diff)
    ci_low = mean_diff - 1.96 * se
    ci_high = mean_diff + 1.96 * se

    print(f"  {name_a} vs {name_b}:")
    print(f"    Mean {name_a}: {mean_a:.4f}, Mean {name_b}: {mean_b:.4f}, Diff: {mean_diff:+.4f}")
    print(f"    95% CI: [{ci_low:+.4f}, {ci_high:+.4f}]")
    print(f"    t-test: p={t_pval:.2e}, Wilcoxon: p={w_pval:.2e}")

    return {
        'mean_a': float(mean_a), 'mean_b': float(mean_b),
        'mean_diff': float(mean_diff),
        'ci_low': float(ci_low), 'ci_high': float(ci_high),
        't_pval': float(t_pval), 'w_pval': float(w_pval),
    }


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='review')
    parser.add_argument('--setting', type=str, default='user')
    parser.add_argument('--device', type=str, default='cuda:0')
    parser.add_argument('--bert_model', type=str, default='roberta-large')
    args = parser.parse_args()

    task = args.task
    setting = args.setting
    N = 200  # matches the N in the saved PEFT results filename

    # Load saved predictions
    sig_path = f"outputs/significance/{task}_{setting}_significance.json"
    peft_path = f"outputs/peft_baselines/{task}_{setting}_K4_N{N}_peft_per_user.json"

    if not os.path.exists(sig_path):
        print(f"Significance data not found: {sig_path}")
        return
    if not os.path.exists(peft_path):
        print(f"PEFT per-user data not found: {peft_path}")
        return

    with open(sig_path) as f:
        sig_data = json.load(f)
    with open(peft_path) as f:
        peft_data = json.load(f)
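
    # Expected JSON layouts (inferred from the fields accessed below):
    #   sig_data:  {"uph_predictions": [...], "base_predictions": [...], ...}
    #   peft_data: {"per_user": {"lora": [{"prediction": ..., "reference": ...}, ...],
    #                            "tiny_lora": [...], "vera": [...]}}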

    # Collect all predictions and references
    all_preds = {}
    all_refs = {}

    # UPH and Base from significance data
    all_preds['UPH'] = sig_data['uph_predictions']
    all_preds['Base'] = sig_data['base_predictions']

    # References (same for all methods)
    refs = [u['reference'] for u in peft_data['per_user']['lora']]

    # PEFT predictions
    for method in ['lora', 'tiny_lora', 'vera']:
        all_preds[method] = [u['prediction'] for u in peft_data['per_user'][method]]

    print(f"=== BERTScore: {task}_{setting}, N={len(refs)} ===")
    print(f"Model: {args.bert_model}")
    print(f"Methods: {list(all_preds.keys())}")

    # Compute BERTScore for each method (bert_score is imported here, after the
    # input files have been checked and loaded)
    from bert_score import score as bert_score_fn

    all_bertscore = {}
    for method, preds in all_preds.items():
        print(f"\n  Computing BERTScore for {method}...")
        P, R, F1 = bert_score_fn(
            preds, refs,
            model_type=args.bert_model,
            device=args.device,
            verbose=False,
        )
        all_bertscore[method] = F1.tolist()
        print(f"    Mean F1: {np.mean(F1.tolist()):.4f}")

    # Summary table
    print("\n" + "=" * 60)
    print("BERTScore F1 Summary")
    print("=" * 60)
    for method in all_preds:
        scores = all_bertscore[method]
        print(f"  {method:<15} Mean: {np.mean(scores):.4f}, Std: {np.std(scores):.4f}")

    # Significance tests
    print("\n" + "=" * 60)
    print("Significance Tests — BERTScore F1 (paired)")
    print("=" * 60)

    test_results = {}
    for other in ['Base', 'lora', 'tiny_lora', 'vera']:
        r = paired_test(all_bertscore['UPH'], all_bertscore[other], 'UPH', other)
        test_results[f'UPH_vs_{other}'] = r

    # Save
    output_path = f"outputs/significance/{task}_{setting}_bertscore.json"
    with open(output_path, 'w') as f:
        json.dump({
            'bertscore_f1': all_bertscore,
            'significance_tests': test_results,
            'model': args.bert_model,
            'task': task,
            'setting': setting,
            'num_examples': len(refs),
        }, f, indent=2)
    print(f"\nSaved to {output_path}")


if __name__ == '__main__':
    main()