Diffstat (limited to 'backend/app/services/llm.py')
-rw-r--r-- backend/app/services/llm.py | 384 ++++++++++++++++++++++++++++++++++++++++----
1 file changed, 344 insertions(+), 40 deletions(-)
diff --git a/backend/app/services/llm.py b/backend/app/services/llm.py
index 958ab4c..b372f9e 100644
--- a/backend/app/services/llm.py
+++ b/backend/app/services/llm.py
@@ -34,57 +34,206 @@ async def stream_openai(messages: list[Message], config: LLMConfig) -> AsyncGenerator[str, None]:
    for msg in messages:
        openai_messages.append({"role": msg.role.value, "content": msg.content})
-    stream = await client.chat.completions.create(
-        model=config.model_name,
-        messages=openai_messages,
-        temperature=config.temperature,
-        max_tokens=config.max_tokens,
-        stream=True
+    # Models that ONLY support Responses API (no Chat Completions fallback)
+    responses_only_models = ['gpt-5-pro']
+
+    # Models that CAN use Responses API (and thus support web_search tool)
+    responses_capable_models = [
+        'gpt-5', 'gpt-5-chat-latest', 'gpt-5-mini', 'gpt-5-nano',
+        'gpt-5-pro', 'gpt-5.1', 'gpt-5.1-chat-latest', 'o3'
+    ]
+
+    # Use Responses API if:
+    # 1. Model ONLY supports Responses API, OR
+    # 2. User wants web search AND model is capable of Responses API
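+    # e.g. 'gpt-5-pro' always takes the Responses path; 'gpt-5.1' takes it only
+    # when web search is enabled; anything else falls through to Chat Completions.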
+    use_responses_api = (
+        config.model_name in responses_only_models or
+        (config.enable_google_search and config.model_name in responses_capable_models)
    )
+    if use_responses_api:
+        # Debug: Confirm config reception
+        # yield f"[Debug: Config Search={config.enable_google_search}, Model={config.model_name}]\n"
+
+        # Use the client.responses.create API with a polling strategy.
+        # Convert messages to Responses API format (same as Chat Completions):
+        # the Responses API accepts input as an array of message objects.
+
+        # Filter out system messages (use instructions instead) and format for Responses API
+        input_messages = []
+        for msg in openai_messages:
+            if msg['role'] != 'system':  # System prompt goes to instructions
+                input_messages.append({
+                    "role": msg['role'],
+                    "content": msg['content']
+                })
+
+        resp_params = {
+            "model": config.model_name,
+            "input": input_messages,  # Full conversation history
+            "stream": False,  # Disable stream to get immediate ID
+            "background": True,  # Enable background mode for async execution
+            "store": True
+        }
+
+        # Add reasoning effort (not supported by chat-latest models)
+        models_without_effort = ['gpt-5-chat-latest', 'gpt-5.1-chat-latest']
+        if config.model_name not in models_without_effort:
+            resp_params["reasoning"] = {"effort": config.reasoning_effort.value}
+
+        # Enable web search if requested (reusing enable_google_search as a generic web_search flag)
+        if config.enable_google_search:
+            resp_params["tools"] = [{"type": "web_search"}]
+            resp_params["tool_choice"] = "auto"
+            # Debugging tool injection:
+            # yield "[Debug: Web Search Tool Injected]"  # Uncomment to debug
+
+        if config.system_prompt:
+            resp_params["instructions"] = config.system_prompt
+
+        # 1. Create Response (Async/Background)
+        # This returns a Response object immediately with status 'queued' or 'in_progress'
+        initial_resp = await client.responses.create(**resp_params)
+        response_id = initial_resp.id
+
+        # 2. Poll for Completion
+        import asyncio
+        # Poll for up to 10 minutes (300 polls, 2 s apart)
+        for _ in range(300):
+            final_resp = await client.responses.retrieve(response_id)
+
+            if final_resp.status == 'completed':
+                # Parse final response object
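+                # Assumed shape of a completed Response, simplified to what the
+                # loop below actually reads (not the full schema):
+                #   output: [
+                #     {"type": "reasoning", ...},
+                #     {"type": "message", "content": [{"type": "output_text", "text": "..."}]}
+                #   ]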
+                found_content = False
+                if hasattr(final_resp, 'output'):
+                    for out in final_resp.output:
+                        out_type = getattr(out, 'type', None)
+                        out_content = getattr(out, 'content', None)
+
+                        if out_type == 'message' and out_content:
+                            for c in out_content:
+                                c_type = getattr(c, 'type', None)
+                                if c_type == 'output_text':
+                                    text_val = getattr(c, 'text', None)
+                                    if text_val:
+                                        yield text_val
+                                        found_content = True
+
+                if not found_content:
+                    yield f"\n[Debug: Completed but no content. Resp: {final_resp}]"
+                return
+
+            elif final_resp.status in ['failed', 'cancelled', 'expired']:
+                error_msg = getattr(final_resp, 'error', 'Unknown error')
+                yield f"\n[Error: Response generation {final_resp.status}: {error_msg}]"
+                return
+
+            # Still queued or in progress; wait before polling again
+            await asyncio.sleep(2)
+
+        yield "\n[Error: Polling timed out]"
+        return
+
+    # Standard Chat Completions API
+    # Prepare parameters
+    req_params = {
+        "model": config.model_name,
+        "messages": openai_messages,
+        "stream": True
+    }
+
+    # Identify reasoning models
+    is_reasoning_model = config.model_name in [
+        'gpt-5', 'gpt-5-chat-latest', 'gpt-5-mini', 'gpt-5-nano',
+        'gpt-5-pro', 'gpt-5.1', 'gpt-5.1-chat-latest', 'o3',
+        'o1', 'o1-mini', 'o1-preview'
+    ]
+
+    if is_reasoning_model:
+        # Reasoning models use max_completion_tokens
+        if config.max_tokens:
+            req_params["max_completion_tokens"] = config.max_tokens
+        # IMPORTANT: Reasoning models often DO NOT support 'temperature'.
+        # We skip adding it.
+    else:
+        req_params["max_tokens"] = config.max_tokens
+        req_params["temperature"] = config.temperature
+
+    stream = await client.chat.completions.create(**req_params)
+
    async for chunk in stream:
-        if chunk.choices[0].delta.content:
-            yield chunk.choices[0].delta.content
+        if chunk.choices and chunk.choices[0].delta:
+            delta = chunk.choices[0].delta
+            if delta.content:
+                yield delta.content
+            elif delta.tool_calls:
+                # Tool-call deltas should not occur on this path since we send no
+                # tools here; ignore them (they may also be empty boundary deltas).
+                pass
+            elif getattr(delta, 'refusal', None):
+                yield f"[Refusal: {delta.refusal}]"
async def stream_google(messages: list[Message], config: LLMConfig) -> AsyncGenerator[str, None]:
-    configure_google(config.api_key)
-    model = genai.GenerativeModel(config.model_name)
-
-    # Google Generative AI history format:
-    # [{"role": "user", "parts": ["..."]}, {"role": "model", "parts": ["..."]}]
-    # System prompt is usually set on model init or prepended.
+    # Use the new Google GenAI SDK (google-genai)
+    from google import genai
+    from google.genai import types
-    history = []
-    # If system prompt exists, we might prepend it to the first user message or use specific system instruction if supported
-    # Gemini 1.5 Pro supports system instructions. For simplicity, let's prepend to history if possible or context.
+    key = config.api_key or os.getenv("GOOGLE_API_KEY")
+    if not key:
+        raise ValueError("Google API Key not found")
+
+    client = genai.Client(api_key=key)
-    system_instruction = config.system_prompt
-    if system_instruction:
-        model = genai.GenerativeModel(config.model_name, system_instruction=system_instruction)
+    # Configure Tools (Google Search)
+    tools = None
+    if config.enable_google_search:
+        # Enable Google Search Grounding
+        tools = [types.Tool(google_search=types.GoogleSearch())]
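+        # Note: grounded responses may also carry citation metadata (not consumed here).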
-    # Convert messages
-    # Note: Gemini strictly requires user/model alternation in history usually.
-    # We will need to handle this. For MVP, we assume the input is clean or we blindly map.
-    for msg in messages:
+    # Configure Generation
+    gen_config = types.GenerateContentConfig(
+        temperature=config.temperature,
+        max_output_tokens=config.max_tokens,
+        system_instruction=config.system_prompt,
+        tools=tools
+    )
+
+    # Prepare History: extract the last user message as the prompt
+    prompt_msg = "..."
+    history_msgs = messages
+    if messages and messages[-1].role == Role.USER:
+        prompt_msg = messages[-1].content
+        history_msgs = messages[:-1]
+
+    history_content = []
+    for msg in history_msgs:
        role = "user" if msg.role == Role.USER else "model"
-        history.append({"role": role, "parts": [msg.content]})
-
-    # The last message should be the prompt, strictly speaking, `chat.send_message` takes the new message
-    # But if we are treating everything as history...
-    # Let's separate the last user message as the prompt if possible.
+        history_content.append(types.Content(
+            role=role,
+            parts=[types.Part(text=msg.content)]
+        ))
+
+    # Use Async Client via .aio
+    chat_session = client.aio.chats.create(
+        model=config.model_name,
+        history=history_content,
+        config=gen_config
+    )
+
+    # Streaming call: in the google-genai SDK, chat streaming goes through
+    # send_message_stream, which returns an async iterator of chunks.
+    response_stream = await chat_session.send_message_stream(prompt_msg)
-    if history and history[-1]["role"] == "user":
-        last_msg = history.pop()
-        chat = model.start_chat(history=history)
-        response_stream = await chat.send_message_async(last_msg["parts"][0], stream=True)
-    else:
-        # If the last message is not user, we might be in a weird state.
-        # Just send an empty prompt or handle error?
-        # For now, assume the user always provides a prompt in the node.
-        chat = model.start_chat(history=history)
-        response_stream = await chat.send_message_async("...", stream=True) # Fallback
-
    async for chunk in response_stream:
+        # Access text safely
        if chunk.text:
            yield chunk.text
@@ -114,3 +263,158 @@ async def llm_streamer(context: Context, user_prompt: str, config: LLMConfig) -> AsyncGenerator[str, None]:
    except Exception as e:
        yield f"Error calling LLM: {str(e)}"
+
+async def generate_title(user_prompt: str, response: str) -> str:
+    """
+    Generate a short topic title (2-3 words) for a Q&A pair using gpt-5-nano.
+    Uses the Responses API (required for the gpt-5 series) in synchronous mode (no background).
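+
+    Example (illustrative):
+        title = await generate_title("How to sort a list in Python?", "Use sorted() or list.sort().")
+        # -> "Python Sorting"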
+ """
+ client = get_openai_client()
+
+ instructions = """TASK: Extract a short topic title from the given Q&A. Do NOT answer the question - only extract the topic.
+
+Rules:
+- Output 2-3 short words OR 2 longer words
+- No punctuation, no quotes, no explanation
+- Capitalize each word
+- Be specific to the topic discussed
+- Output ONLY the title, nothing else
+
+Examples:
+Q: "How to sort a list in Python?" -> "Python Sorting"
+Q: "What is React state?" -> "React State"
+Q: "Explain AWS Lambda pricing" -> "Lambda Pricing"
+Q: "Who are you?" -> "AI Identity"
+Q: "What's the weather in NYC?" -> "NYC Weather\""""
+
+    # Truncate to avoid token limits
+    truncated_prompt = user_prompt[:300] if len(user_prompt) > 300 else user_prompt
+    truncated_response = response[:300] if len(response) > 300 else response
+
+    input_text = f"Question: {truncated_prompt}\n\nAnswer: {truncated_response}"
+
+    try:
+        print(f"[generate_title] Called with prompt: {truncated_prompt[:50]}...")
+
+        # Use Responses API for gpt-5-nano (synchronous, no background)
+        # Note: max_output_tokens includes reasoning tokens, so it needs headroom
+        resp = await client.responses.create(
+            model="gpt-5-nano",
+            input=input_text,
+            instructions=instructions,
+            max_output_tokens=500,  # Higher to accommodate reasoning tokens
+            reasoning={"effort": "low"},  # Minimize reasoning for this simple task
+            stream=False
+        )
+
+ print(f"[generate_title] Response status: {getattr(resp, 'status', 'unknown')}")
+ print(f"[generate_title] Response output: {getattr(resp, 'output', 'no output')}")
+
+ # Response should be completed immediately (no polling needed)
+ if hasattr(resp, 'output'):
+ for out in resp.output:
+ if getattr(out, 'type', None) == 'message':
+ content = getattr(out, 'content', [])
+ for c in content:
+ if getattr(c, 'type', None) == 'output_text':
+ title = getattr(c, 'text', '').strip()
+ # Clean up
+ title = title.strip('"\'')
+ print(f"[generate_title] Extracted title: {title}")
+ if title:
+ return title
+
+ print("[generate_title] No title found, returning default")
+ return "New Question"
+
+ except Exception as e:
+ print(f"Title generation error: {e}")
+ return "New Question"
+
+
+async def summarize_content(content: str, model: str) -> str:
+    """
+    Summarize the given content using the specified model.
+    Supports both OpenAI and Gemini models.
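+
+    Example (illustrative):
+        summary = await summarize_content(article_text, model="gpt-5-mini")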
+ """
+ instructions = """Summarize the following content concisely.
+Keep the key points and main ideas.
+Output only the summary, no preamble."""
+
+    # Truncate very long content
+    max_content = 8000
+    if len(content) > max_content:
+        content = content[:max_content] + "\n\n[Content truncated...]"
+
+    try:
+        if model.startswith('gemini'):
+            # Use Gemini
+            from google import genai
+            from google.genai import types
+            import os
+
+            key = os.getenv("GOOGLE_API_KEY")
+            if not key:
+                return "Error: Google API Key not found"
+
+            client = genai.Client(api_key=key)
+
+            gen_config = types.GenerateContentConfig(
+                temperature=0.3,
+                max_output_tokens=1000,
+                system_instruction=instructions
+            )
+
+            response = await client.aio.models.generate_content(
+                model=model,
+                contents=content,
+                config=gen_config
+            )
+
+            return response.text or "No summary generated"
+
+        else:
+            # Use OpenAI
+            client = get_openai_client()
+
+            # Check if model needs Responses API
+            responses_api_models = [
+                'gpt-5', 'gpt-5-chat-latest', 'gpt-5-mini', 'gpt-5-nano',
+                'gpt-5-pro', 'gpt-5.1', 'gpt-5.1-chat-latest', 'o3'
+            ]
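+            # (Same list as responses_capable_models in stream_openai; keep the two in sync.)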
+
+            if model in responses_api_models:
+                # Use Responses API
+                resp = await client.responses.create(
+                    model=model,
+                    input=content,
+                    instructions=instructions,
+                    max_output_tokens=2000,
+                    stream=False
+                )
+
+                if hasattr(resp, 'output'):
+                    for out in resp.output:
+                        if getattr(out, 'type', None) == 'message':
+                            for c in getattr(out, 'content', []):
+                                if getattr(c, 'type', None) == 'output_text':
+                                    return getattr(c, 'text', '') or "No summary generated"
+
+                return "No summary generated"
+            else:
+                # Use Chat Completions API
+                result = await client.chat.completions.create(
+                    model=model,
+                    messages=[
+                        {"role": "system", "content": instructions},
+                        {"role": "user", "content": content}
+                    ],
+                    max_tokens=1000,
+                    temperature=0.3
+                )
+
+                return result.choices[0].message.content or "No summary generated"
+
+    except Exception as e:
+        print(f"Summarization error: {e}")
+        return f"Error: {str(e)}"