summaryrefslogtreecommitdiff
path: root/src/personalization/models/preference_extractor/gpt4o_extractor.py
blob: 212bb130a52881bc124ba6e3b813450d05a1f35a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from __future__ import annotations

import json
import logging
import os
from typing import Any, Dict, List, Optional

from openai import OpenAI

from personalization.config.settings import LocalModelsConfig
from personalization.models.preference_extractor.base import PreferenceExtractorBase as PreferenceExtractor
from personalization.retrieval.preference_store.schemas import (
    ChatTurn,
    PreferenceList,
    preference_list_json_schema,
)


class GPT4OExtractor(PreferenceExtractor):
    """Preference extractor backed by the OpenAI chat-completions API (GPT-4o).

    Each user query is sent to the model together with a system prompt
    (loaded from ``fine_tuning_prompt_template.txt`` when that file exists)
    and the JSON response is parsed into preference structures. All API and
    parsing failures degrade to empty results rather than raising.
    """

    # One shared logger for the class; replaces ad-hoc print() error reporting.
    _logger = logging.getLogger(__name__)

    def __init__(self, api_key: str, model: str = "gpt-4o") -> None:
        """Create an extractor.

        Args:
            api_key: OpenAI API key used to construct the client.
            model: Chat model name; defaults to ``"gpt-4o"``.
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model

        # Load system prompt template.
        # NOTE(review): path is CWD-relative — confirm callers run from the
        # expected directory, otherwise the fallback prompt is used silently.
        template_path = "fine_tuning_prompt_template.txt"
        if os.path.exists(template_path):
            with open(template_path, "r", encoding="utf-8") as f:
                self.system_prompt = f.read()
        else:
            # Fallback simple prompt if file missing
            self.system_prompt = (
                "You are a preference extraction assistant. "
                "Extract user preferences from the query into a JSON object."
            )

    @classmethod
    def from_config(cls, cfg: LocalModelsConfig) -> "GPT4OExtractor":
        """Build an extractor from config; the API key comes from the environment.

        Args:
            cfg: Local-models config (currently unused beyond interface parity).

        Raises:
            ValueError: if the ``OPENAI_API_KEY`` environment variable is not set.
        """
        # We rely on env var for API key, config for other potential settings if needed
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        return cls(api_key=api_key)

    def build_preference_prompt(self, query: str) -> str:
        """Return the system prompt; *query* is unused (kept for the base interface)."""
        return self.system_prompt

    def _chat_json(self, text: str) -> Optional[Dict[str, Any]]:
        """Send *text* to the model and parse its JSON reply.

        Shared request path for :meth:`extract_preferences` and
        :meth:`extract_turn` (previously duplicated verbatim).

        Returns:
            The parsed JSON object, or ``None`` when the response carried no
            content. API and JSON errors propagate to the caller.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": text},
            ],
            # Constrain the model to emit a single JSON object.
            response_format={"type": "json_object"},
            temperature=0.0,  # deterministic extraction
        )
        content = response.choices[0].message.content
        return json.loads(content) if content else None

    def extract_preferences(self, query: str) -> Dict[str, Any]:
        """Extract preferences from *query* and return the raw parsed dict.

        Falls back to ``{"preferences": []}`` on any API/parsing error or an
        empty model response.
        """
        try:
            data = self._chat_json(query)
            if data is not None:
                return data
        except Exception:
            # Best effort: log with traceback and degrade to an empty result.
            self._logger.exception("Error calling GPT-4o")
        return {"preferences": []}

    def extract_turn(self, turn: ChatTurn) -> PreferenceList:
        """Extract preferences from a single chat turn.

        Non-user turns, empty responses, and any API/validation error all
        yield an empty :class:`PreferenceList`.
        """
        if turn.role != "user":
            return PreferenceList(preferences=[])

        try:
            data = self._chat_json(turn.text)
            if data is not None:
                # The prompt might return {"preferences": [...]}, validate it
                return PreferenceList.model_validate(data)
        except Exception:
            self._logger.exception("Error calling GPT-4o")
        return PreferenceList(preferences=[])

    def extract_session(self, turns: List[ChatTurn]) -> List[PreferenceList]:
        """Extract preferences for every turn in *turns*, preserving order."""
        return [self.extract_turn(turn) for turn in turns]