# Pilot-study labeling script: sample raw queries and label them with an LLM.
import json
import os
import random
import asyncio
from typing import List, Dict, Any
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
# --- Configuration ---
INPUT_FILE = "data/raw_datasets/combined_raw_queries.jsonl"
OUTPUT_FILE = "data/raw_datasets/pilot_study_1000.jsonl"
SAMPLE_SIZE = 1000
MODEL_NAME = "gpt-5.1"  # Or your specific model ID
MAX_CONCURRENCY = 100  # Adjust based on your rate limits

# --- Load System Prompt ---
# The prompt-template file is used verbatim as the system instruction: keeping
# the embedded examples gives few-shot behavior and higher label quality, at
# the cost of extra tokens per request.
with open("fine_tuning_prompt_template.txt", "r", encoding="utf-8") as prompt_file:
    SYSTEM_PROMPT = prompt_file.read()
# --- Async Worker ---
async def label_query(client: AsyncOpenAI, sem: asyncio.Semaphore, item: Dict[str, Any]) -> Dict[str, Any]:
    """Label one raw query with the extraction model.

    Sends ``item["query"]`` to the chat-completions API under SYSTEM_PROMPT
    (JSON output enforced) and converts the response into a result record.

    Args:
        client: Shared AsyncOpenAI client.
        sem: Semaphore bounding the number of concurrent in-flight requests.
        item: Raw dataset row; must contain "query", may contain "source".

    Returns:
        Dict with the original query, its source, the parsed extraction JSON
        (or parse-failure info), and a boolean "has_preference" flag. API or
        transport failures are captured in an "error" field instead of being
        raised, so one bad item never aborts the whole gather().
    """
    query = item["query"]
    async with sem:
        try:
            response = await client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": query}
                ],
                temperature=0.0,  # Deterministic for extraction
                response_format={"type": "json_object"}  # Enforce JSON
            )
            result_text = response.choices[0].message.content
            # Parse to ensure validity. Catch only parse errors here — a bare
            # `except:` would also swallow asyncio.CancelledError and mask
            # task cancellation. TypeError covers a None message content.
            try:
                parsed = json.loads(result_text)
            except (json.JSONDecodeError, TypeError):
                parsed = {"error": "json_parse_fail", "raw": result_text}
                has_pref = False
            else:
                prefs = parsed.get("preferences", []) if isinstance(parsed, dict) else []
                # Guard against a non-list "preferences" value from the model,
                # which would otherwise make len() raise.
                has_pref = isinstance(prefs, list) and len(prefs) > 0
            return {
                "original_query": query,
                "source": item.get("source"),
                "extracted_json": parsed,
                "has_preference": has_pref
            }
        except Exception as e:
            # API/transport failure: record it per-item and keep the batch going.
            return {
                "original_query": query,
                "source": item.get("source"),
                "error": str(e),
                "has_preference": False
            }
async def main():
    """Sample raw queries, label them concurrently, report stats, and save."""
    # 1. Load the raw dataset (one JSON object per non-empty line) and sample.
    print(f"Loading data from {INPUT_FILE}...")
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        all_lines = [json.loads(line) for line in f if line.strip()]

    sampled_data = (
        random.sample(all_lines, SAMPLE_SIZE)
        if len(all_lines) > SAMPLE_SIZE
        else all_lines
    )
    print(f"Sampled {len(sampled_data)} items.")

    # 2. Set up the OpenAI client; bail out early if no API key is configured.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set.")
        return
    client = AsyncOpenAI(api_key=api_key)
    sem = asyncio.Semaphore(MAX_CONCURRENCY)

    # 3. Fan out one labeling task per sampled item; the semaphore inside
    #    label_query bounds actual concurrency.
    tasks = [label_query(client, sem, item) for item in sampled_data]
    results = await tqdm_asyncio.gather(*tasks, desc="Labeling")

    # 4. Report the positive/negative split and persist every record as JSONL.
    pos_count = sum(1 for r in results if r.get("has_preference"))
    total = len(results)
    ratio = (pos_count / total) * 100 if total > 0 else 0
    print(f"\n--- Results ---")
    print(f"Total processed: {total}")
    print(f"Positive (has preferences): {pos_count}")
    print(f"Negative (empty): {total - pos_count}")
    print(f"Positive Ratio: {ratio:.2f}%")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(res, ensure_ascii=False) + "\n" for res in results)
    print(f"Saved detailed results to {OUTPUT_FILE}")
if __name__ == "__main__":
    # Entry point: drive the async labeling pipeline to completion.
    asyncio.run(main())