1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
"""Style feature extraction for SFD (Style Feature Distance) metric."""
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Ordered names of the style features produced by extract_style_features();
# index i of a feature vector corresponds to FEATURE_NAMES[i].
FEATURE_NAMES = [
    'length', 'avg_sent_len', 'TTR', 'newline_rate', 'exclaim_rate',
    'first_person_rate', 'adj_adv_rate', 'sentiment_score'
]
def extract_style_features(text: str) -> list:
    """Extract a style feature vector from text.

    Features, in FEATURE_NAMES order:
        [length, avg_sent_len, TTR, newline_rate, exclaim_rate,
         first_person_rate, adj_adv_rate, sentiment_score]

    Returns:
        A list of len(FEATURE_NAMES) floats; all zeros for empty or
        whitespace-only input.
    """
    if not text or not text.strip():
        # Keep the zero-vector length in sync with the feature contract.
        return [0.0] * len(FEATURE_NAMES)

    words = word_tokenize(text)
    num_words = max(len(words), 1)
    # Lowercase once; reused by the TTR, first-person, and sentiment counts
    # instead of calling w.lower() in four separate passes.
    lowered = [w.lower() for w in words]

    # 1. Length (word count)
    length = float(num_words)

    # 2. Average sentence length (words per sentence)
    sentences = sent_tokenize(text)
    num_sents = max(len(sentences), 1)
    avg_sent_len = num_words / num_sents

    # 3. Type-Token Ratio over alphabetic tokens only
    alpha_lower = [w for w in lowered if w.isalpha()]
    ttr = len(set(alpha_lower)) / max(len(alpha_lower), 1)

    # 4. Newline rate (newlines per sentence)
    newline_rate = text.count('\n') / num_sents

    # 5. Exclamation rate (exclamation marks per sentence)
    exclaim_rate = text.count('!') / num_sents

    # 6. First-person pronoun rate (fraction of all tokens)
    first_person = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'}
    first_person_rate = sum(1 for w in lowered if w in first_person) / num_words

    # 7. Adjective/Adverb rate via POS tagging
    try:
        tagged = nltk.pos_tag(words)
        adj_adv_tags = {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
        adj_adv_rate = sum(1 for _, tag in tagged if tag in adj_adv_tags) / num_words
    except Exception:
        # Best-effort: a missing tagger model must not break feature extraction.
        adj_adv_rate = 0.0

    # 8. Simple lexicon-based sentiment score: (positive - negative) / total
    positive_words = {
        'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
        'love', 'loved', 'best', 'perfect', 'awesome', 'beautiful',
        'enjoy', 'enjoyed', 'happy', 'glad', 'nice', 'brilliant',
        'outstanding', 'superb', 'delightful', 'pleasant', 'favorite',
        'recommend', 'recommended', 'impressive', 'incredible',
    }
    negative_words = {
        'bad', 'terrible', 'awful', 'horrible', 'worst', 'poor',
        'hate', 'hated', 'boring', 'disappointing', 'disappointed',
        'ugly', 'annoying', 'waste', 'useless', 'mediocre', 'dull',
        'pathetic', 'garbage', 'rubbish', 'disgusting', 'dreadful',
    }
    pos_count = sum(1 for w in lowered if w in positive_words)
    neg_count = sum(1 for w in lowered if w in negative_words)
    sentiment_score = (pos_count - neg_count) / num_words

    return [length, avg_sent_len, ttr, newline_rate, exclaim_rate,
            first_person_rate, adj_adv_rate, sentiment_score]
def compute_sfd(generated_text: str, support_texts: list, exclude_length: bool = False) -> float:
    """Compute Style Feature Distance (SFD).

    Args:
        generated_text: The model's generated output.
        support_texts: List of the user's support set output texts.
        exclude_length: If True, exclude the length feature (index 0) from SFD.

    Returns:
        Scale-normalized L1 distance between the generated text's style
        vector and the mean ("prototype") style vector of the support set.
        0.0 when support_texts is empty.
    """
    gen_features = extract_style_features(generated_text)
    support = [extract_style_features(t) for t in support_texts]
    if not support:
        return 0.0
    # Prototype = element-wise mean over the support feature vectors.
    prototype = [sum(col) / len(support) for col in zip(*support)]
    start_idx = 1 if exclude_length else 0
    # Each per-feature delta is normalized by max(|proto|, 1) so features of
    # very different magnitudes (e.g. length vs. rates) contribute comparably.
    return sum(
        abs(g - p) / max(abs(p), 1.0)
        for g, p in zip(gen_features[start_idx:], prototype[start_idx:])
    )
def compute_feature_deltas(generated_text: str, support_texts: list) -> dict:
    """Compute per-feature deltas between generated text and the user style prototype.

    Args:
        generated_text: The model's generated output.
        support_texts: List of the user's support set output texts.

    Returns:
        Dict mapping feature_name -> {'gen': generated value,
        'proto': support-set mean value, 'delta': gen - proto},
        in FEATURE_NAMES order. Empty dict when support_texts is empty.
    """
    gen_features = extract_style_features(generated_text)
    support = [extract_style_features(t) for t in support_texts]
    if not support:
        return {}
    # Prototype = element-wise mean over the support feature vectors.
    prototype = [sum(col) / len(support) for col in zip(*support)]
    return {
        name: {'gen': g, 'proto': p, 'delta': g - p}
        for name, g, p in zip(FEATURE_NAMES, gen_features, prototype)
    }
|