Diffstat (limited to 'backend/app/services/llm.py')
-rw-r--r-- backend/app/services/llm.py | 384 ++++++++++++++++++++++++++++++++++++++++----
1 file changed, 344 insertions(+), 40 deletions(-)
diff --git a/backend/app/services/llm.py b/backend/app/services/llm.py
index 958ab4c..b372f9e 100644
--- a/backend/app/services/llm.py
+++ b/backend/app/services/llm.py
@@ -34,57 +34,206 @@ async def stream_openai(messages: list[Message], config: LLMConfig) -> AsyncGenerator[str, None]:
    for msg in messages:
        openai_messages.append({"role": msg.role.value, "content": msg.content})
-    stream = await client.chat.completions.create(
-        model=config.model_name,
-        messages=openai_messages,
-        temperature=config.temperature,
-        max_tokens=config.max_tokens,
-        stream=True
+    # Models that ONLY support Responses API (no Chat Completions fallback)
+    responses_only_models = ['gpt-5-pro']
+
+    # Models that CAN use Responses API (and thus support web_search tool)
+    responses_capable_models = [
+        'gpt-5', 'gpt-5-chat-latest', 'gpt-5-mini', 'gpt-5-nano',
+        'gpt-5-pro', 'gpt-5.1', 'gpt-5.1-chat-latest', 'o3'
+    ]
+
+    # Use Responses API if:
+    # 1. Model ONLY supports Responses API, OR
+    # 2. User wants web search AND model is capable of Responses API
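+    # e.g. 'gpt-5-pro' always takes the Responses path; 'gpt-5.1' takes it only
+    # when web search is enabled; anything else falls through to Chat Completions.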
+    use_responses_api = (
+        config.model_name in responses_only_models or
+        (config.enable_google_search and config.model_name in responses_capable_models)
    )
+    if use_responses_api:
+        # Debug: Confirm config reception
+        # yield f"[Debug: Config Search={config.enable_google_search}, Model={config.model_name}]\n"
+
+        # Use the client.responses.create API with a polling strategy.
+        # Convert messages to Responses API format (same as Chat Completions):
+        # the Responses API accepts input as an array of message objects.
+
+        # Filter out system messages (use instructions instead) and format for Responses API
+        input_messages = []
+        for msg in openai_messages:
+            if msg['role'] != 'system':  # System prompt goes to instructions
+                input_messages.append({
+                    "role": msg['role'],
+                    "content": msg['content']
+                })
+
+        resp_params = {
+            "model": config.model_name,
+            "input": input_messages,  # Full conversation history
+            "stream": False,  # Disable stream to get immediate ID
+            "background": True,  # Enable background mode for async execution
+            "store": True
+        }
+
+        # Add reasoning effort (not supported by chat-latest models)
+        models_without_effort = ['gpt-5-chat-latest', 'gpt-5.1-chat-latest']
+        if config.model_name not in models_without_effort:
+            resp_params["reasoning"] = {"effort": config.reasoning_effort.value}
+
+        # Enable web search if requested (reusing enable_google_search as a generic web_search flag)
+        if config.enable_google_search:
+            resp_params["tools"] = [{"type": "web_search"}]
+            resp_params["tool_choice"] = "auto"
+            # Debugging tool injection:
+            # yield "[Debug: Web Search Tool Injected]"  # Uncomment to debug
+
+        if config.system_prompt:
+            resp_params["instructions"] = config.system_prompt
+
+        # 1. Create Response (Async/Background)
+        # This returns a Response object immediately with status 'queued' or 'in_progress'
+        initial_resp = await client.responses.create(**resp_params)
+        response_id = initial_resp.id
+
+        # 2. Poll for Completion
+        import asyncio
+        # Poll for up to 10 minutes (300 polls, 2 s apart)
+        for _ in range(300):
+            final_resp = await client.responses.retrieve(response_id)
+
+            if final_resp.status == 'completed':
+                # Parse final response object
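+                # Assumed shape of a completed Response, simplified to what the
+                # loop below actually reads (not the full schema):
+                #   output: [
+                #     {"type": "reasoning", ...},
+                #     {"type": "message", "content": [{"type": "output_text", "text": "..."}]}
+                #   ]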
+                found_content = False
+                if hasattr(final_resp, 'output'):
+                    for out in final_resp.output:
+                        out_type = getattr(out, 'type', None)
+                        out_content = getattr(out, 'content', None)
+
+                        if out_type == 'message' and out_content:
+                            for c in out_content:
+                                c_type = getattr(c, 'type', None)
+                                if c_type == 'output_text':
+                                    text_val = getattr(c, 'text', None)
+                                    if text_val:
+                                        yield text_val
+                                        found_content = True
+
+                if not found_content:
+                    yield f"\n[Debug: Completed but no content. Resp: {final_resp}]"
+                return
+
+            elif final_resp.status in ['failed', 'cancelled', 'expired']:
+                error_msg = getattr(final_resp, 'error', 'Unknown error')
+                yield f"\n[Error: Response generation {final_resp.status}: {error_msg}]"
+                return
+
+            # Still queued or in progress; wait before polling again
+            await asyncio.sleep(2)
+
+        yield "\n[Error: Polling timed out]"
+        return
+
+    # Standard Chat Completions API
+    # Prepare parameters
+    req_params = {
+        "model": config.model_name,
+        "messages": openai_messages,
+        "stream": True
+    }
+
+    # Identify reasoning models
+    is_reasoning_model = config.model_name in [
+        'gpt-5', 'gpt-5-chat-latest', 'gpt-5-mini', 'gpt-5-nano',
+        'gpt-5-pro', 'gpt-5.1', 'gpt-5.1-chat-latest', 'o3',
+        'o1', 'o1-mini', 'o1-preview'
+    ]
+
+    if is_reasoning_model:
+        # Reasoning models use max_completion_tokens
+        if config.max_tokens:
+            req_params["max_completion_tokens"] = config.max_tokens
+        # IMPORTANT: Reasoning models often DO NOT support 'temperature'.
+        # We skip adding it.
+    else:
+        req_params["max_tokens"] = config.max_tokens
+        req_params["temperature"] = config.temperature
+
+    stream = await client.chat.completions.create(**req_params)
+
    async for chunk in stream:
-        if chunk.choices[0].delta.content:
-            yield chunk.choices[0].delta.content
+        if chunk.choices and chunk.choices[0].delta:
+            delta = chunk.choices[0].delta
+            if delta.content:
+                yield delta.content
+            elif delta.tool_calls:
+                # Tool-call deltas should not occur on this path since we send no
+                # tools here; ignore them (they may also be empty boundary deltas).
+                pass
+            elif getattr(delta, 'refusal', None):
+                yield f"[Refusal: {delta.refusal}]"
async def stream_google(messages: list[Message], config: LLMConfig) -> AsyncGenerator[str, None]:
-    configure_google(config.api_key)
-    model = genai.GenerativeModel(config.model_name)
-
-    # Google Generative AI history format:
-    # [{"role": "user", "parts": ["..."]}, {"role": "model", "parts": ["..."]}]
-    # System prompt is usually set on model init or prepended.
+    # Use the new Google GenAI SDK (google-genai)
+    from google import genai
+    from google.genai import types
-    history = []
-    # If system prompt exists, we might prepend it to the first user message or use specific system instruction if supported
-    # Gemini 1.5 Pro supports system instructions. For simplicity, let's prepend to history if possible or context.
+    key = config.api_key or os.getenv("GOOGLE_API_KEY")
+    if not key:
+        raise ValueError("Google API Key not found")
+
+    client = genai.Client(api_key=key)
-    system_instruction = config.system_prompt
-    if system_instruction:
-        model = genai.GenerativeModel(config.model_name, system_instruction=system_instruction)
+    # Configure Tools (Google Search)
+    tools = None
+    if config.enable_google_search:
+        # Enable Google Search Grounding
+        tools = [types.Tool(google_search=types.GoogleSearch())]
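+        # Note: grounded responses may also carry citation metadata (not consumed here).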
-    # Convert messages
-    # Note: Gemini strictly requires user/model alternation in history usually.
-    # We will need to handle this. For MVP, we assume the input is clean or we blindly map.
-    for msg in messages:
+    # Configure Generation
+    gen_config = types.GenerateContentConfig(
+        temperature=config.temperature,
+        max_output_tokens=config.max_tokens,
+        system_instruction=config.system_prompt,
+        tools=tools
+    )
+
+    # Prepare History: extract the last user message as the prompt
+    prompt_msg = "..."
+    history_msgs = messages
+    if messages and messages[-1].role == Role.USER:
+        prompt_msg = messages[-1].content
+        history_msgs = messages[:-1]
+
+    history_content = []
+    for msg in history_msgs:
        role = "user" if msg.role == Role.USER else "model"
-        history.append({"role": role, "parts": [msg.content]})
-
-    # The last message should be the prompt, strictly speaking, `chat.send_message` takes the new message
-    # But if we are treating everything as history...
-    # Let's separate the last user message as the prompt if possible.
+        history_content.append(types.Content(
+            role=role,
+            parts=[types.Part(text=msg.content)]
+        ))
+
+    # Use Async Client via .aio
+    chat_session = client.aio.chats.create(
+        model=config.model_name,
+        history=history_content,
+        config=gen_config
+    )
+
+    # Streaming call: in the google-genai SDK, chat streaming goes through
+    # send_message_stream, which returns an async iterator of chunks.
+    response_stream = await chat_session.send_message_stream(prompt_msg)
-    if history and history[-1]["role"] == "user":
-        last_msg = history.pop()
-        chat = model.start_chat(history=history)
-        response_stream = await chat.send_message_async(last_msg["parts"][0], stream=True)
-    else:
-        # If the last message is not user, we might be in a weird state.
-        # Just send an empty prompt or handle error?
-        # For now, assume the user always provides a prompt in the node.
-        chat = model.start_chat(history=history)
-        response_stream = await chat.send_message_async("...", stream=True) # Fallback
-
    async for chunk in response_stream:
+        # Access text safely
        if chunk.text:
            yield chunk.text
@@ -114,3 +263,158 @@ async def llm_streamer(context: Context, user_prompt: str, config: LLMConfig) -> AsyncGenerator[str, None]:
    except Exception as e:
        yield f"Error calling LLM: {str(e)}"
+
+async def generate_title(user_prompt: str, response: str) -> str:
+    """
+    Generate a short topic title (2-3 words) for a Q&A pair using gpt-5-nano.
+    Uses the Responses API (required for the gpt-5 series) in synchronous mode (no background).
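+
+    Example (illustrative):
+        title = await generate_title("How to sort a list in Python?", "Use sorted() or list.sort().")
+        # -> "Python Sorting"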
+ """
+ client = get_openai_client()
+
+ instructions = """TASK: Extract a short topic title from the given Q&A. Do NOT answer the question - only extract the topic.
+
+Rules:
+- Output 2-3 short words OR 2 longer words
+- No punctuation, no quotes, no explanation
+- Capitalize each word
+- Be specific to the topic discussed
+- Output ONLY the title, nothing else
+
+Examples:
+Q: "How to sort a list in Python?" -> "Python Sorting"
+Q: "What is React state?" -> "React State"
+Q: "Explain AWS Lambda pricing" -> "Lambda Pricing"
+Q: "Who are you?" -> "AI Identity"
+Q: "What's the weather in NYC?" -> "NYC Weather\""""
+
+    # Truncate to avoid token limits
+    truncated_prompt = user_prompt[:300] if len(user_prompt) > 300 else user_prompt
+    truncated_response = response[:300] if len(response) > 300 else response
+
+    input_text = f"Question: {truncated_prompt}\n\nAnswer: {truncated_response}"
+
+    try:
+        print(f"[generate_title] Called with prompt: {truncated_prompt[:50]}...")
+
+        # Use Responses API for gpt-5-nano (synchronous, no background)
+        # Note: max_output_tokens includes reasoning tokens, so it needs headroom
+        resp = await client.responses.create(
+            model="gpt-5-nano",
+            input=input_text,
+            instructions=instructions,
+            max_output_tokens=500,  # Higher to accommodate reasoning tokens
+            reasoning={"effort": "low"},  # Minimize reasoning for this simple task
+            stream=False
+        )
+
+ print(f"[generate_title] Response status: {getattr(resp, 'status', 'unknown')}")
+ print(f"[generate_title] Response output: {getattr(resp, 'output', 'no output')}")
+
+ # Response should be completed immediately (no polling needed)
+ if hasattr(resp, 'output'):
+ for out in resp.output:
+ if getattr(out, 'type', None) == 'message':
+ content = getattr(out, 'content', [])
+ for c in content:
+ if getattr(c, 'type', None) == 'output_text':
+ title = getattr(c, 'text', '').strip()
+ # Clean up
+ title = title.strip('"\'')
+ print(f"[generate_title] Extracted title: {title}")
+ if title:
+ return title
+
+ print("[generate_title] No title found, returning default")
+ return "New Question"
+
+ except Exception as e:
+ print(f"Title generation error: {e}")
+ return "New Question"
+
+
+async def summarize_content(content: str, model: str) -> str:
+    """
+    Summarize the given content using the specified model.
+    Supports both OpenAI and Gemini models.
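+
+    Example (illustrative):
+        summary = await summarize_content(article_text, model="gpt-5-mini")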
+ """
+ instructions = """Summarize the following content concisely.
+Keep the key points and main ideas.
+Output only the summary, no preamble."""
+
+    # Truncate very long content
+    max_content = 8000
+    if len(content) > max_content:
+        content = content[:max_content] + "\n\n[Content truncated...]"
+
+    try:
+        if model.startswith('gemini'):
+            # Use Gemini
+            from google import genai
+            from google.genai import types
+            import os
+
+            key = os.getenv("GOOGLE_API_KEY")
+            if not key:
+                return "Error: Google API Key not found"
+
+            client = genai.Client(api_key=key)
+
+            gen_config = types.GenerateContentConfig(
+                temperature=0.3,
+                max_output_tokens=1000,
+                system_instruction=instructions
+            )
+
+            response = await client.aio.models.generate_content(
+                model=model,
+                contents=content,
+                config=gen_config
+            )
+
+            return response.text or "No summary generated"
+
+        else:
+            # Use OpenAI
+            client = get_openai_client()
+
+            # Check if model needs Responses API
+            responses_api_models = [
+                'gpt-5', 'gpt-5-chat-latest', 'gpt-5-mini', 'gpt-5-nano',
+                'gpt-5-pro', 'gpt-5.1', 'gpt-5.1-chat-latest', 'o3'
+            ]
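+            # (Same list as responses_capable_models in stream_openai; keep the two in sync.)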
+
+            if model in responses_api_models:
+                # Use Responses API
+                resp = await client.responses.create(
+                    model=model,
+                    input=content,
+                    instructions=instructions,
+                    max_output_tokens=2000,
+                    stream=False
+                )
+
+                if hasattr(resp, 'output'):
+                    for out in resp.output:
+                        if getattr(out, 'type', None) == 'message':
+                            for c in getattr(out, 'content', []):
+                                if getattr(c, 'type', None) == 'output_text':
+                                    return getattr(c, 'text', '') or "No summary generated"
+
+                return "No summary generated"
+            else:
+                # Use Chat Completions API
+                result = await client.chat.completions.create(
+                    model=model,
+                    messages=[
+                        {"role": "system", "content": instructions},
+                        {"role": "user", "content": content}
+                    ],
+                    max_tokens=1000,
+                    temperature=0.3
+                )
+
+                return result.choices[0].message.content or "No summary generated"
+
+    except Exception as e:
+        print(f"Summarization error: {e}")
+        return f"Error: {str(e)}"