path: root/data/style_features.py
"""Style feature extraction for SFD (Style Feature Distance) metric."""

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
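# NOTE: the tokenizers and tagger used below rely on NLTK data packages
# ('punkt' for word_tokenize/sent_tokenize, 'averaged_perceptron_tagger' for
# nltk.pos_tag); download them once with nltk.download(...) if not installed.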

FEATURE_NAMES = [
    'length', 'avg_sent_len', 'TTR', 'newline_rate', 'exclaim_rate',
    'first_person_rate', 'adj_adv_rate', 'sentiment_score'
]


def extract_style_features(text: str) -> list:
    """Extract style feature vector from text.

    Returns:
        [length, avg_sent_len, TTR, newline_rate, exclaim_rate,
         first_person_rate, adj_adv_rate, sentiment_score]
    """
    if not text or not text.strip():
        return [0.0] * len(FEATURE_NAMES)

    words = word_tokenize(text)
    num_words = max(len(words), 1)

    # 1. Length (word count)
    length = float(num_words)

    # 2. Average sentence length
    sentences = sent_tokenize(text)
    num_sents = max(len(sentences), 1)
    avg_sent_len = num_words / num_sents

    # 3. Type-Token Ratio
    unique_words = set(w.lower() for w in words if w.isalpha())
    alpha_words = [w for w in words if w.isalpha()]
    ttr = len(unique_words) / max(len(alpha_words), 1)

    # 4. Newline rate
    newline_count = text.count('\n')
    newline_rate = newline_count / num_sents

    # 5. Exclamation rate
    exclaim_count = text.count('!')
    exclaim_rate = exclaim_count / num_sents

    # 6. First-person rate
    first_person = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'}
    fp_count = sum(1 for w in words if w.lower() in first_person)
    first_person_rate = fp_count / num_words

    # 7. Adjective/Adverb rate
    try:
        tagged = nltk.pos_tag(words)
        adj_adv_tags = {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
        adj_adv_count = sum(1 for _, tag in tagged if tag in adj_adv_tags)
        adj_adv_rate = adj_adv_count / num_words
    except Exception:
        adj_adv_rate = 0.0

    # 8. Simple sentiment score (positive - negative word ratio)
    positive_words = {
        'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
        'love', 'loved', 'best', 'perfect', 'awesome', 'beautiful',
        'enjoy', 'enjoyed', 'happy', 'glad', 'nice', 'brilliant',
        'outstanding', 'superb', 'delightful', 'pleasant', 'favorite',
        'recommend', 'recommended', 'impressive', 'incredible',
    }
    negative_words = {
        'bad', 'terrible', 'awful', 'horrible', 'worst', 'poor',
        'hate', 'hated', 'boring', 'disappointing', 'disappointed',
        'ugly', 'annoying', 'waste', 'useless', 'mediocre', 'dull',
        'pathetic', 'garbage', 'rubbish', 'disgusting', 'dreadful',
    }
    pos_count = sum(1 for w in words if w.lower() in positive_words)
    neg_count = sum(1 for w in words if w.lower() in negative_words)
    sentiment_score = (pos_count - neg_count) / num_words

    return [length, avg_sent_len, ttr, newline_rate, exclaim_rate,
            first_person_rate, adj_adv_rate, sentiment_score]


def compute_sfd(generated_text: str, support_texts: list, exclude_length: bool = False) -> float:
    """Compute Style Feature Distance.

    Args:
        generated_text: The model's generated output.
        support_texts: List of the user's support set output texts.
        exclude_length: If True, exclude length feature (index 0) from SFD.

    Returns:
        L1 distance between generated style and user style prototype.
    """
    gen_features = extract_style_features(generated_text)
    support_features_list = [extract_style_features(t) for t in support_texts]
    num_support = len(support_features_list)
    if num_support == 0:
        return 0.0

    prototype = [0.0] * len(gen_features)
    for sf in support_features_list:
        for i in range(len(prototype)):
            prototype[i] += sf[i]
    prototype = [p / num_support for p in prototype]

    start_idx = 1 if exclude_length else 0
    sfd = 0.0
    for i in range(start_idx, len(gen_features)):
        g, p = gen_features[i], prototype[i]
        scale = max(abs(p), 1.0)
        sfd += abs(g - p) / scale
    return sfd


def compute_feature_deltas(generated_text: str, support_texts: list) -> dict:
    """Compute per-feature deltas between generated text and user style prototype.

    Returns dict mapping feature_name -> {'gen': gen_value, 'proto': proto_value,
    'delta': gen_value - proto_value}.
    """
    gen_features = extract_style_features(generated_text)
    support_features_list = [extract_style_features(t) for t in support_texts]
    num_support = len(support_features_list)
    if num_support == 0:
        return {}

    prototype = [0.0] * len(gen_features)
    for sf in support_features_list:
        for i in range(len(prototype)):
            prototype[i] += sf[i]
    prototype = [p / num_support for p in prototype]

    deltas = {}
    for i, name in enumerate(FEATURE_NAMES):
        deltas[name] = {
            'gen': gen_features[i],
            'proto': prototype[i],
            'delta': gen_features[i] - prototype[i],
        }
    return deltas
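

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module):
# assumes the NLTK data noted above ('punkt', 'averaged_perceptron_tagger') is
# already downloaded, and uses made-up example texts.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    generated = "I really loved this movie! The acting was brilliant."
    support = [
        "I enjoyed the book a lot. My favorite part was the ending!",
        "We had a great time. The food was excellent and the staff were nice.",
    ]
    for name, value in zip(FEATURE_NAMES, extract_style_features(generated)):
        print(f"{name}: {value:.3f}")
    print("SFD:", compute_sfd(generated, support))
    print("SFD (no length):", compute_sfd(generated, support, exclude_length=True))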