diff options
Diffstat (limited to 'putnam-bench-anon/loader')
| -rw-r--r-- | putnam-bench-anon/loader/__init__.py | 290 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/anthropic_client.py | 227 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/base.py | 514 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/cross_provider.py | 155 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/gemini_client.py | 239 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/hf_local.py | 375 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/openai_client.py | 603 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/openrouter_client.py | 213 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/prompts.py | 106 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/vllm_direct.py | 313 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/vllm_local.py | 224 | ||||
| -rw-r--r-- | putnam-bench-anon/loader/xai_client.py | 173 |
12 files changed, 3432 insertions, 0 deletions
# ---- putnam-bench-anon/loader/__init__.py ----
"""
Model loader package for mathematical problem solving.

This package provides a unified interface for loading and using different AI models
to solve mathematical problems and grade solutions.

Usage:
    # Create an OpenAI loader
    loader = create_loader("openai", solver_model="gpt-4o-mini", grader_model="o3")

    # Create an OpenRouter loader
    loader = create_loader("openrouter", solver_model="openai/gpt-4o", grader_model="anthropic/claude-3-opus")

    # Or directly instantiate
    from loader import OpenAIModelLoader
    loader = OpenAIModelLoader()

    # Test a problem
    import json
    with open("dataset/1938-A-1.json") as f:
        data = json.load(f)

    result = await loader.test_single_problem(data, variant_type="original")
"""

from .base import ModelLoader
from .openai_client import OpenAIModelLoader, KimiModelLoader
from .anthropic_client import AnthropicModelLoader
from .gemini_client import GeminiModelLoader
from .xai_client import XAIModelLoader
from .openrouter_client import OpenRouterModelLoader
from .vllm_local import VLLMModelLoader
from .vllm_direct import VLLMDirectModelLoader
from .hf_local import HuggingFaceModelLoader
from .cross_provider import CrossProviderLoader
from .prompts import (
    SOLVER_SYSTEM_PROMPT,
    SOLVER_USER_TEMPLATE,
    PROOF_GRADER_SYSTEM_PROMPT,
    CALCULATION_GRADER_SYSTEM_PROMPT,
    PROOF_GRADER_USER_TEMPLATE,
    CALCULATION_GRADER_USER_TEMPLATE,
    RESPONSE_FORMAT,
    DEFAULT_RETRIES,
    DEFAULT_TIMEOUT_BASE
)
from typing import Optional

# Single source of truth for provider dispatch.  create_loader() and
# get_supported_providers() both derive from this table, so the dispatch
# chain, the error message, and the public provider list can never drift
# apart (previously the list was duplicated in three places).
# "hf" is an accepted alias for "huggingface" and is deliberately excluded
# from the canonical supported-provider list reported to users.
_PROVIDER_REGISTRY: dict[str, type[ModelLoader]] = {
    "openai": OpenAIModelLoader,
    "anthropic": AnthropicModelLoader,
    "gemini": GeminiModelLoader,
    "xai": XAIModelLoader,
    "openrouter": OpenRouterModelLoader,
    "kimi": KimiModelLoader,
    "vllm": VLLMModelLoader,
    "vllm_direct": VLLMDirectModelLoader,
    "huggingface": HuggingFaceModelLoader,
    "hf": HuggingFaceModelLoader,  # alias
}

# Default solver/grader model names per provider (see get_default_models).
_DEFAULT_MODELS: dict[str, dict[str, str]] = {
    "openai": {
        "solver_model": "gpt-4o-mini",
        "grader_model": "o3"
    },
    "anthropic": {
        "solver_model": "claude-3-5-haiku-20241022",
        "grader_model": "claude-3-5-sonnet-20241022"
    },
    "gemini": {
        "solver_model": "gemini-1.5-flash",
        "grader_model": "gemini-1.5-pro"
    },
    "xai": {
        "solver_model": "grok-3",
        "grader_model": "grok-3"
    },
    "openrouter": {
        "solver_model": "openai/gpt-4o",
        "grader_model": "openai/gpt-4o"
    },
    "kimi": {
        "solver_model": "moonshot-v1-8k",
        "grader_model": "moonshot-v1-8k"
    },
    "vllm": {
        "solver_model": "meta-llama/Llama-3.2-3B-Instruct",
        "grader_model": "meta-llama/Llama-3.2-8B-Instruct"
    },
    "vllm_direct": {
        "solver_model": "gpt2",
        "grader_model": "gpt2"
    },
    "huggingface": {
        "solver_model": "microsoft/DialoGPT-medium",
        "grader_model": "microsoft/DialoGPT-large"
    }
}


def create_loader(provider: str, **kwargs) -> ModelLoader:
    """
    Factory function to create model loaders.

    Args:
        provider: Provider name ("openai", "anthropic", "gemini", etc.)
        **kwargs: Additional arguments passed to the loader constructor

    Returns:
        ModelLoader instance

    Raises:
        ValueError: If provider is not supported

    Examples:
        # Create OpenAI loader
        loader = create_loader("openai", solver_model="gpt-4o-mini", grader_model="o3")

        # Create loader with custom settings
        loader = create_loader(
            "openai",
            solver_model="gpt-4",
            grader_model="o3",
            retries=5,
            timeout_base=600
        )
    """
    provider_lower = provider.lower()
    loader_cls = _PROVIDER_REGISTRY.get(provider_lower)
    if loader_cls is None:
        supported = get_supported_providers()
        raise ValueError(f"Unsupported provider: {provider}. Supported providers: {supported}")
    return loader_cls(**kwargs)


def create_cross_provider_loader(
    solver_provider: str,
    grader_provider: Optional[str] = None,
    solver_model: Optional[str] = None,
    grader_model: Optional[str] = None,
    **kwargs
) -> ModelLoader:
    """
    Create a loader that can use different providers for solving and grading.

    Args:
        solver_provider: Provider for solving problems
        grader_provider: Provider for grading (if None, uses solver_provider)
        solver_model: Override solver model
        grader_model: Override grader model
        **kwargs: Additional arguments (can include provider-specific settings
            via 'solver_kwargs' / 'grader_kwargs', plus common 'quick'/'debug'
            flags; any remaining kwargs are forwarded to CrossProviderLoader)

    Returns:
        CrossProviderLoader instance, or a plain provider loader when solver
        and grader resolve to the same provider.

    Examples:
        # Use Kimi for solving and OpenAI for grading
        loader = create_cross_provider_loader(
            solver_provider="kimi",
            grader_provider="openai",
            solver_model="Kimi-K2-Instruct",
            grader_model="o3"
        )

        # Use same provider but different models
        loader = create_cross_provider_loader(
            solver_provider="openai",
            solver_model="gpt-4o-mini",
            grader_model="o3"
        )
    """
    # Extract provider-specific kwargs
    solver_kwargs = kwargs.pop('solver_kwargs', {})
    grader_kwargs = kwargs.pop('grader_kwargs', {})

    # Extract common parameters that should be passed to both loaders
    quick = kwargs.pop('quick', False)
    debug = kwargs.pop('debug', False)

    # Add common parameters to both solver and grader kwargs
    solver_kwargs.update({'quick': quick, 'debug': debug})
    grader_kwargs.update({'quick': quick, 'debug': debug})

    # Fill in defaults for anything the caller left unspecified
    if not solver_model:
        solver_defaults = get_default_models(solver_provider)
        solver_model = solver_defaults['solver_model']

    if not grader_provider:
        grader_provider = solver_provider

    if not grader_model:
        grader_defaults = get_default_models(grader_provider)
        grader_model = grader_defaults['grader_model']

    # Create solver loader
    solver_loader = create_loader(
        solver_provider,
        solver_model=solver_model,
        grader_model=solver_model,  # Use solver model for both in solver loader
        **solver_kwargs
    )

    # Create grader loader only when the provider actually differs
    if grader_provider != solver_provider:
        grader_loader = create_loader(
            grader_provider,
            solver_model=grader_model,  # Use grader model for both in grader loader
            grader_model=grader_model,
            **grader_kwargs
        )
        return CrossProviderLoader(solver_loader, grader_loader, **kwargs)

    # Same provider, but possibly different models
    if solver_model != grader_model:
        # Single loader carrying both models.
        # NOTE(review): grader_kwargs are intentionally not applied here —
        # only solver_kwargs configure the shared loader (matches historical
        # behavior); confirm this is the desired precedence.
        return create_loader(
            solver_provider,
            solver_model=solver_model,
            grader_model=grader_model,
            **solver_kwargs
        )

    # Same provider and same model: reuse the solver loader as-is
    return solver_loader


def get_supported_providers() -> list[str]:
    """
    Get list of supported model providers.

    Returns:
        List of supported provider names (the "hf" alias is omitted)
    """
    return [name for name in _PROVIDER_REGISTRY if name != "hf"]


def get_default_models(provider: str) -> dict[str, str]:
    """
    Get default model names for a provider.

    Args:
        provider: Provider name (case-insensitive)

    Returns:
        Dictionary with default solver_model and grader_model

    Raises:
        ValueError: If no defaults exist for the provider
    """
    provider_lower = provider.lower()
    if provider_lower not in _DEFAULT_MODELS:
        raise ValueError(f"No defaults available for provider: {provider}")
    # Return a copy so callers cannot mutate the shared defaults table.
    return dict(_DEFAULT_MODELS[provider_lower])
# Export main classes and functions
__all__ = [
    # Main classes
    "ModelLoader",
    "OpenAIModelLoader",
    "AnthropicModelLoader",
    "GeminiModelLoader",
    "XAIModelLoader",
    "OpenRouterModelLoader",
    "KimiModelLoader",
    "VLLMModelLoader",
    "VLLMDirectModelLoader",
    "HuggingFaceModelLoader",
    "CrossProviderLoader",

    # Factory functions
    "create_loader",
    "create_cross_provider_loader",
    "get_supported_providers",
    "get_default_models",

    # Prompts (for advanced users)
    "SOLVER_SYSTEM_PROMPT",
    "SOLVER_USER_TEMPLATE",
    "PROOF_GRADER_SYSTEM_PROMPT",
    "CALCULATION_GRADER_SYSTEM_PROMPT",
    "PROOF_GRADER_USER_TEMPLATE",
    "CALCULATION_GRADER_USER_TEMPLATE",

    # Configuration constants
    "RESPONSE_FORMAT",
    "DEFAULT_RETRIES",
    "DEFAULT_TIMEOUT_BASE"
]

# ---- putnam-bench-anon/loader/anthropic_client.py ----
"""
Anthropic model loader implementation.
Handles API calls to Anthropic Claude models with proper error handling and retry logic.
"""

import asyncio
import random
from typing import Dict, List, Tuple, Optional

# Optional dependency: if the anthropic SDK is missing, fall back to plain
# Exception placeholders so this module still imports; the constructor then
# raises a clear ImportError on first use.
try:
    from anthropic import AsyncAnthropic, RateLimitError, APIError, APIConnectionError
except ImportError:
    AsyncAnthropic = None
    RateLimitError = Exception
    APIError = Exception
    APIConnectionError = Exception

from .base import ModelLoader
from .prompts import RESPONSE_FORMAT  # NOTE(review): imported but not referenced in this module — confirm whether it is needed


class AnthropicModelLoader(ModelLoader):
    """Anthropic implementation of the ModelLoader."""

    def __init__(self,
                 solver_model: str = "claude-3-5-haiku-20241022",
                 grader_model: str = "claude-3-5-sonnet-20241022",
                 api_key: Optional[str] = None,
                 base_url: Optional[str] = None,
                 **kwargs):
        """
        Initialize Anthropic model loader.

        Args:
            solver_model: Anthropic model for solving problems (default: claude-3-5-haiku)
            grader_model: Anthropic model for grading solutions (default: claude-3-5-sonnet)
            api_key: Anthropic API key (if None, uses environment variable)
            base_url: Custom base URL for Anthropic API
            **kwargs: Additional arguments passed to parent class

        Raises:
            ImportError: If the `anthropic` package is not installed.
        """
        if AsyncAnthropic is None:
            raise ImportError(
                "anthropic package is required for AnthropicModelLoader. "
                "Install with: pip install anthropic"
            )

        super().__init__(solver_model, grader_model, **kwargs)

        # Only pass explicitly-provided credentials/endpoint; otherwise the
        # SDK reads its own defaults (e.g. ANTHROPIC_API_KEY env var).
        client_kwargs = {}
        if api_key:
            client_kwargs["api_key"] = api_key
        if base_url:
            client_kwargs["base_url"] = base_url

        self.client = AsyncAnthropic(**client_kwargs)

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an API call to Anthropic.

        Args:
            model: Anthropic model name
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response) — here both elements are
            the same concatenated text of the response blocks.

        Raises:
            RateLimitError / APIError / APIConnectionError / Exception:
                re-raised after logging (and, for rate limits, after sleeping)
                so the base class retry logic can take over.
        """
        try:
            # Convert OpenAI format to Anthropic format: Anthropic takes the
            # system prompt as a separate top-level parameter.
            system_message = None
            user_messages = []

            for msg in messages:
                if msg["role"] == "system":
                    system_message = msg["content"]
                else:
                    user_messages.append(msg)

            # Prepare API call parameters
            api_params = {
                "model": model,
                "messages": user_messages,
                "max_tokens": 4000,  # Anthropic requires max_tokens
                "temperature": temperature,
            }

            if system_message:
                api_params["system"] = system_message

            # Make the API call
            response = await self.client.messages.create(**api_params)

            # Extract response content: concatenate all text blocks
            content = ""
            if response.content:
                for block in response.content:
                    if hasattr(block, 'text'):
                        content += block.text

            return content, content

        except RateLimitError as e:
            # Handle rate limiting with special logic
            error_str = str(e)
            print(f"🚫 RateLimitError: {error_str}")

            if "insufficient_quota" in error_str.lower():
                print("⏳ Detected quota exhaustion - sleeping 15 minutes")
                await asyncio.sleep(900)  # 15 minutes
            else:
                # Standard rate limit - shorter sleep
                sleep_time = 2 + random.random()
                print(f"  ⏰ Rate limited, sleeping {sleep_time:.1f}s")
                await asyncio.sleep(sleep_time)

            # Re-raise to trigger retry logic
            raise

        except (APIError, APIConnectionError) as e:
            print(f"❌ Anthropic API Error: {str(e)}")
            raise

        except Exception as e:
            print(f"❌ Unexpected error in Anthropic API call: {str(e)}")
            raise

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the configured models."""
        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "anthropic"
        }

    async def health_check(self) -> bool:
        """
        Perform a simple health check to verify API connectivity.

        Returns:
            True if API is accessible, False otherwise
        """
        try:
            # Simple test call
            test_messages = [
                {"role": "user", "content": "Hello, please respond with a simple JSON: {\"status\": \"ok\"}"}
            ]

            result, _ = await self._call_api(
                model=self.solver_model,
                messages=test_messages,
                temperature=0.0
            )

            # Loose check: any response containing "ok" counts as healthy
            if result and "ok" in result.lower():
                print(f"✅ Anthropic API health check passed for {self.solver_model}")
                return True
            else:
                print(f"⚠️ Anthropic API health check returned unexpected response")
                return False

        except Exception as e:
            print(f"❌ Anthropic API health check failed: {str(e)}")
            return False

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate the cost for processing a given number of problems.

        Args:
            num_problems: Number of problems to process (must be > 0 —
                division by num_problems below)
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with cost estimates (solve_cost, grade_cost,
            total_cost, cost_per_problem, currency)
        """
        # Rough token estimates (1 token ≈ 4 characters for English)
        tokens_per_solve = (avg_problem_length + avg_solution_length) // 4
        tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4

        # Anthropic pricing (update with actual Anthropic pricing)
        # These are rough estimates and should be updated with current pricing
        pricing = {
            "claude-3-5-haiku-20241022": {"input": 0.0008, "output": 0.004},  # per 1K tokens
            "claude-3-5-sonnet-20241022": {"input": 0.003, "output": 0.015},  # per 1K tokens
            "claude-3-opus-20240229": {"input": 0.015, "output": 0.075},  # per 1K tokens
            "claude-3-haiku-20240307": {"input": 0.00025, "output": 0.00125},  # per 1K tokens
        }

        def get_model_cost(model: str, input_tokens: int, output_tokens: int) -> float:
            # Unknown models are priced as 3.5-sonnet rather than erroring out.
            if model not in pricing:
                model = "claude-3-5-sonnet-20241022"  # Default fallback

            input_cost = (input_tokens / 1000) * pricing[model]["input"]
            output_cost = (output_tokens / 1000) * pricing[model]["output"]
            return input_cost + output_cost

        # Calculate costs
        solve_cost = get_model_cost(
            self.solver_model,
            tokens_per_solve * num_problems,
            tokens_per_solve * num_problems // 2  # Assume output is ~50% of input
        )

        grade_cost = get_model_cost(
            self.grader_model,
            tokens_per_grade * num_problems,
            tokens_per_grade * num_problems // 3  # Assume output is ~33% of input
        )

        total_cost = solve_cost + grade_cost

        return {
            "solve_cost": round(solve_cost, 4),
            "grade_cost": round(grade_cost, 4),
            "total_cost": round(total_cost, 4),
            "cost_per_problem": round(total_cost / num_problems, 6),
            "currency": "USD"
        }
# ---- putnam-bench-anon/loader/base.py ----
"""
Abstract base class for model loaders.
Defines the interface for mathematical problem solving and grading.
"""

import re
import json
import asyncio
import random
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple, Optional, Any

from .prompts import (
    SOLVER_SYSTEM_PROMPT,
    SOLVER_USER_TEMPLATE,
    PROOF_GRADER_SYSTEM_PROMPT,
    CALCULATION_GRADER_SYSTEM_PROMPT,
    PROOF_GRADER_USER_TEMPLATE,
    CALCULATION_GRADER_USER_TEMPLATE,
    RESPONSE_FORMAT,
    DEFAULT_RETRIES,
    DEFAULT_TIMEOUT_BASE
)

# JSON extraction regex: greedily grabs the outermost {...} span in a response
JSON_RE = re.compile(r"\{[\s\S]*\}")


class ModelLoader(ABC):
    """Abstract base class for model loaders.

    Subclasses implement `_call_api` for a specific provider; this class
    supplies retry/timeout handling, JSON-response recovery, and the
    solve/grade/test workflow built on top of it.
    """

    def __init__(self,
                 solver_model: str,
                 grader_model: str,
                 retries: int = DEFAULT_RETRIES,
                 timeout_base: int = DEFAULT_TIMEOUT_BASE,
                 debug: bool = False,
                 quick: bool = False):
        """
        Initialize the model loader.

        Args:
            solver_model: Model name for solving problems
            grader_model: Model name for grading solutions
            retries: Number of retry attempts for API calls
            timeout_base: Base timeout in seconds for API calls
            debug: Enable debug logging for JSON parsing
            quick: Quick mode - allows one retry with 1200s timeout each attempt
        """
        self.solver_model = solver_model
        self.grader_model = grader_model
        self.retries = retries
        self.timeout_base = timeout_base
        self.debug = debug
        self.quick = quick

        # Override settings for quick mode: quick mode ignores the caller's
        # retries/timeout_base values.
        if self.quick:
            self.retries = 1  # Only try once
            self.timeout_base = 1200  # 20 minutes timeout

    @abstractmethod
    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an API call to the model.

        Args:
            model: Model name to use
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (parsed_response, raw_response)
        """
        pass

    def parse_json_response(self, raw: str, debug: bool = False) -> Optional[Dict]:
        """Parse JSON from LLM response with fallback strategies.

        Strategy order: direct json.loads → outermost-{...} regex extract →
        control-character fix-up → truncated-JSON recovery → manual
        key-by-key regex extraction. Returns None only when every strategy
        fails.
        """
        if not raw:
            return None

        # Try direct JSON parse
        try:
            return json.loads(raw)
        except Exception as e:
            if debug:
                print(f"⚠️ Direct JSON parse failed: {e}")

        # Try to find JSON in the response
        match = JSON_RE.search(raw)
        if match:
            try:
                return json.loads(match.group(0))
            except Exception as e:
                if debug:
                    print(f"⚠️ Regex JSON parse failed: {e}")

        # Try fixing common JSON issues including control characters
        try:
            # Fix escaped quotes and backslashes
            fixed = raw.replace('\\"', '"').replace('\\\\', '\\')

            # Fix unescaped newlines and other control characters in JSON strings
            # This is a more robust approach for LLM responses
            import ast  # NOTE(review): imported but ast.literal_eval is never actually called below

            # Try to use ast.literal_eval if it's a simple dict-like structure
            if fixed.strip().startswith('{') and fixed.strip().endswith('}'):
                try:
                    # Replace common problematic patterns
                    fixed = fixed.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
                    return json.loads(fixed)
                except Exception as e:
                    if debug:
                        print(f"⚠️ Fixed JSON parse failed: {e}")

        except Exception as e:
            if debug:
                print(f"⚠️ JSON fixing failed: {e}")

        # ENHANCED: Try to complete truncated JSON
        try:
            if raw.strip().startswith('{') and not raw.strip().endswith('}'):
                if debug:
                    print("🔧 Attempting to fix truncated JSON...")

                # Try to find the last complete key-value pair
                # Look for solution content
                if '"solution"' in raw:
                    # Extract solution up to the truncation point
                    solution_start = raw.find('"solution"')
                    solution_content = raw[solution_start:]

                    # Find the actual solution text
                    import re  # shadows the module-level re within this scope
                    solution_match = re.search(r'"solution":\s*"([^"]*(?:\\"[^"]*)*)', raw, re.DOTALL)
                    if solution_match:
                        solution_text = solution_match.group(1)
                        # Clean up the solution text
                        solution_text = solution_text.replace('\\"', '"').replace('\\n', '\n')

                        if debug:
                            print(f"🔧 Extracted solution from truncated JSON ({len(solution_text)} chars)")
                        return {
                            "solution": solution_text,
                            "final_answer": "Solution was truncated - see solution field for complete answer"
                        }
        except Exception as e:
            if debug:
                print(f"⚠️ Truncated JSON recovery failed: {e}")

        # Final fallback: try to extract key-value pairs manually
        try:
            if '"solution"' in raw:
                import re

                if debug:
                    print("🔧 Attempting manual key-value extraction...")

                # Extract solution (more robust pattern)
                solution_match = re.search(r'"solution":\s*"([^"]*(?:\\"[^"]*)*)', raw, re.DOTALL)
                solution = solution_match.group(1) if solution_match else ""

                # Extract final_answer if it exists
                answer_match = re.search(r'"final_answer":\s*"([^"]*)"', raw)
                final_answer = answer_match.group(1) if answer_match else ""

                if solution:
                    # Clean up the solution text
                    solution = solution.replace('\\"', '"').replace('\\n', '\n')

                    if debug:
                        print(f"🔧 Manual extraction successful ({len(solution)} chars solution)")
                    return {
                        "solution": solution,
                        "final_answer": final_answer if final_answer else "See solution field for complete answer"
                    }
        except Exception as e:
            if debug:
                print(f"⚠️ Manual extraction failed: {e}")

        if debug:
            print("❌ All JSON parsing strategies failed")
        return None

    def to_str(self, x) -> str:
        """Convert various types to string safely.

        None → "", str passes through, list/tuple → newline-joined items,
        everything else → str(x).
        """
        if x is None:
            return ""
        if isinstance(x, str):
            return x
        if isinstance(x, (list, tuple)):
            return "\n".join(map(str, x))
        return str(x)

    async def call_api_with_retry(self,
                                  model: str,
                                  messages: List[Dict[str, str]],
                                  temperature: float = 0.0) -> Tuple[Optional[Dict], str]:
        """
        Make API call with retry logic and JSON parsing.

        Args:
            model: Model name to use
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (parsed_json_response, raw_response).  On permanent
            failure the dict is {"_max_retries_reached": True, "error": ...};
            callers check that sentinel.
        """
        raw_response = ""

        # In quick mode, we allow one retry with a fixed timeout.
        # Every branch of this loop returns, so the regular-mode loop below
        # is never reached in quick mode.
        if self.quick:
            max_attempts = 2  # Allow one retry in quick mode
            if self.debug:
                print(f"⚡ Quick mode: Up to {max_attempts} attempts with {self.timeout_base}s timeout each")

            for attempt in range(1, max_attempts + 1):
                try:
                    if attempt > 1 and self.debug:
                        print(f"🔄 Quick mode retry attempt {attempt}/{max_attempts}")

                    parsed, raw_response = await asyncio.wait_for(
                        self._call_api(model, messages, temperature),
                        timeout=self.timeout_base
                    )

                    if parsed:
                        # Try to parse as JSON
                        debug_mode = getattr(self, 'debug', False)
                        json_parsed = self.parse_json_response(parsed, debug=debug_mode)
                        if json_parsed:
                            return json_parsed, raw_response
                        return None, raw_response
                    else:
                        raise ValueError("Empty response from API")

                except Exception as e:
                    error_type = type(e).__name__
                    error_msg = str(e)
                    print(f"❌ {error_type} in quick mode (attempt {attempt}/{max_attempts}): {error_msg}")

                    # If this was the last attempt, mark as failed
                    if attempt == max_attempts:
                        return {"_max_retries_reached": True, "error": str(e)}, raw_response

                    # Otherwise, wait a bit before retrying
                    if self.debug:
                        print("⏳ Waiting 5 seconds before retry...")
                    await asyncio.sleep(5)

        # Regular mode with retries
        for attempt in range(1, self.retries + 1):
            # More aggressive timeout scaling for persistent failures
            # Cap timeout at 30 minutes to prevent extremely long waits
            timeout = min(self.timeout_base * (1.5 ** (attempt - 1)), 1800)
            if self.debug:
                print(f"🔄 Attempt {attempt}/{self.retries} with timeout {timeout:.0f}s")
            try:
                parsed, raw_response = await asyncio.wait_for(
                    self._call_api(model, messages, temperature),
                    timeout=timeout
                )

                if parsed:
                    # Try to parse as JSON; a non-JSON (but non-empty)
                    # response returns (None, raw) without retrying.
                    debug_mode = getattr(self, 'debug', False)
                    json_parsed = self.parse_json_response(parsed, debug=debug_mode)
                    if json_parsed:
                        return json_parsed, raw_response
                    return None, raw_response
                else:
                    raise ValueError("Empty response from API")

            except Exception as e:
                error_type = type(e).__name__
                error_msg = str(e)

                # Only show detailed error info on first attempt or in debug mode
                if attempt == 1 or self.debug:
                    print(f"❌ {error_type} (attempt {attempt}/{self.retries}): {error_msg}")

                if attempt == self.retries:
                    print(f"🔥 All {self.retries} retry attempts exhausted for {error_type}")
                    # Return a special marker for max retries reached
                    return {"_max_retries_reached": True, "error": str(e)}, raw_response

                # Custom retry strategy: 600s -> 900s -> 900s -> 1200s...
                if attempt == 1:
                    base_sleep = 600  # 10 minutes
                elif attempt == 2 or attempt == 3:
                    base_sleep = 900  # 15 minutes
                else:
                    base_sleep = 1200  # 20 minutes

                # Add small random jitter to avoid synchronized retries
                jitter = random.uniform(0, 30)  # 0-30 seconds jitter
                sleep_time = base_sleep + jitter

                if self.debug:
                    print(f"  ⏰ Using custom backoff strategy: {base_sleep}s base + {jitter:.1f}s jitter")

                if self.debug:
                    print(f"  ⏰ Retrying in {sleep_time:.1f}s")
                await asyncio.sleep(sleep_time)

        # Reached only when self.retries == 0 (the loop body never ran)
        return None, raw_response

    async def solve_problem(self, problem_statement: str, model: Optional[str] = None) -> Tuple[Optional[Dict], str]:
        """
        Have model solve mathematical problems.

        Args:
            problem_statement: Problem statement
            model: Model name to use for solving (if None, uses default solver_model)

        Returns:
            Tuple of (solving result dictionary, raw response)
            Solving result contains: {"solution": "detailed solution", "final_answer": "final answer"}
        """
        messages = [
            {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
            {"role": "user", "content": SOLVER_USER_TEMPLATE.format(
                problem_statement=problem_statement
            )}
        ]

        # Use specified model or default solver model
        solver_model = model if model is not None else self.solver_model

        # Set temperature based on model
        # o3, o3-mini, and o4-mini require temperature 1.0
        # NOTE(review): substring matching — any model name containing "o3"
        # or "o4-mini" triggers this branch; confirm that is intended.
        if any(model_name in solver_model.lower() for model_name in ['o3', 'o3-mini', 'o4-mini']):
            temperature = 1.0
        else:
            # Use temperature 0.0 for deterministic solving with other models
            temperature = 0.0

        return await self.call_api_with_retry(solver_model, messages, temperature=temperature)

    async def grade_solution(self,
                             problem_statement: str,
                             solution: str,
                             reference_solution: str,
                             problem_type: str = "proof",
                             model: Optional[str] = None) -> Tuple[Optional[Dict], str]:
        """
        Have model grade solution based on problem type.

        Args:
            problem_statement: Problem statement
            solution: Student solution
            reference_solution: Reference solution
            problem_type: Problem type ("proof" strict grading, "calculation" lenient grading)
            model: Model name to use for grading (if None, uses default grader_model)

        Returns:
            Tuple of (grading result dictionary, raw response)
            Grading result contains: {"grade": "CORRECT"/"INCORRECT", "detailed_feedback": "...", ...}
        """
        if problem_type == "calculation":
            system_prompt = CALCULATION_GRADER_SYSTEM_PROMPT
            user_template = CALCULATION_GRADER_USER_TEMPLATE
        else:  # Default to proof (strict grading)
            system_prompt = PROOF_GRADER_SYSTEM_PROMPT
            user_template = PROOF_GRADER_USER_TEMPLATE

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_template.format(
                problem_statement=problem_statement,
                solution=solution,
                reference_solution=reference_solution
            )}
        ]

        # Use specified model or default grader model
        grader_model = model if model is not None else self.grader_model

        # Use temperature 1.0 for grading (as per original script for o3)
        return await self.call_api_with_retry(grader_model, messages, temperature=1.0)

    async def test_single_problem(self,
                                  data: Dict,
                                  variant_type: str = "original",
                                  solver_model: Optional[str] = None,
                                  grader_model: Optional[str] = None) -> Dict:
        """
        Test complete workflow for single problem: solving + grading.

        Args:
            data: Problem data dictionary (expects keys "index",
                "problem_type", "question"/"solution" or a "variants" map)
            variant_type: Problem variant type ("original" or key names in variants)
            solver_model: Model name for solving (if None, uses default solver_model)
            grader_model: Model name for grading (if None, uses default grader_model)

        Returns:
            Test result dictionary with "status" in
            {"completed", "partial", "failed", "skipped", "error"}
        """
        index = data.get("index", "unknown")
        problem_type = data.get("problem_type", "proof")

        try:
            # Get problem and reference solution
            if variant_type == "original":
                question = self.to_str(data.get("question", "")).strip()
                reference_solution = self.to_str(data.get("solution", "")).strip()
            else:
                variant = data.get("variants", {}).get(variant_type)
                if not variant:
                    return {
                        "index": index,
                        "variant_type": variant_type,
                        "status": "skipped",
                        "reason": f"no_{variant_type}_variant"
                    }
                question = self.to_str(variant.get("question", "")).strip()
                reference_solution = self.to_str(variant.get("solution", "")).strip()

            if not question or not reference_solution:
                return {
                    "index": index,
                    "variant_type": variant_type,
                    "status": "skipped",
                    "reason": "missing_fields"
                }

            result = {
                "index": index,
                "variant_type": variant_type,
                "problem_type": problem_type,
                "status": "completed",
                "solve": {},
                "grade": {}
            }

            # 1. Solve problem
            solve_result, solve_raw = await self.solve_problem(question, model=solver_model)

            # Check if max retries reached
            if solve_result and solve_result.get("_max_retries_reached"):
                # Mark as completed but with INCORRECT grade due to max retries
                result["solve"]["status"] = "max_retries"
                result["solve"]["solution"] = "Failed to generate solution after maximum retry attempts"
                result["solve"]["final_answer"] = "No answer - max retries reached"
                result["grade"]["status"] = "auto_failed"
                result["grade"]["grade"] = "INCORRECT"
                result["grade"]["detailed_feedback"] = f"Automatically marked as incorrect due to reaching maximum retry limit ({self.retries} attempts)"
                result["grade"]["major_issues"] = "API call failed after all retry attempts"
                result["grade"]["final_answer_correct"] = False
                result["grade"]["reasoning_rigor_score"] = 0
                result["grade"]["overall_assessment"] = "Failed to generate solution"
                result["correct"] = False
                result["status"] = "completed"  # Mark as completed, not failed
                return result

            if not solve_result:
                result["solve"]["status"] = "failed"
                result["status"] = "failed"
                return result

            student_solution = self.to_str(solve_result.get("solution", "")).strip()
            final_answer = self.to_str(solve_result.get("final_answer", "")).strip()

            result["solve"]["status"] = "success"
            result["solve"]["solution"] = student_solution
            result["solve"]["final_answer"] = final_answer

            # 2. Grade solution
            grade_result, grade_raw = await self.grade_solution(
                question, student_solution, reference_solution, problem_type, model=grader_model
            )

            # Check if grading max retries reached
            if grade_result and grade_result.get("_max_retries_reached"):
                # Mark as completed but with INCORRECT grade due to max retries in grading
                result["grade"]["status"] = "auto_failed"
                result["grade"]["grade"] = "INCORRECT"
                result["grade"]["detailed_feedback"] = f"Automatically marked as incorrect due to grading reaching maximum retry limit ({self.retries} attempts)"
                result["grade"]["major_issues"] = "Grading API call failed after all retry attempts"
                result["grade"]["final_answer_correct"] = False
                result["grade"]["reasoning_rigor_score"] = 0
                result["grade"]["overall_assessment"] = "Failed to grade solution"
                result["correct"] = False
                result["status"] = "completed"  # Mark as completed, not partial/failed
            elif not grade_result:
                result["grade"]["status"] = "failed"
                result["status"] = "partial"  # solving succeeded but grading failed
            else:
                result["grade"]["status"] = "success"
                result["grade"]["grade"] = grade_result.get("grade", "UNKNOWN")
                result["grade"]["detailed_feedback"] = grade_result.get("detailed_feedback", "")
                result["grade"]["major_issues"] = grade_result.get("major_issues", "")
                result["grade"]["final_answer_correct"] = grade_result.get("final_answer_correct", False)
                result["grade"]["reasoning_rigor_score"] = grade_result.get("reasoning_rigor_score", 0)
                result["grade"]["overall_assessment"] = grade_result.get("overall_assessment", "")

                # Mark whether correct
                result["correct"] = grade_result.get("grade") == "CORRECT"

            return result

        except Exception as e:
            # Catch-all so one malformed problem never aborts a batch run
            return {
                "index": index,
                "variant_type": variant_type,
                "status": "error",
                "error": str(e),
                "error_type": type(e).__name__
            }
"""
Cross-provider model loader implementation.
Allows using different providers for solving and grading tasks.
"""

from typing import Dict, Optional, Tuple, Any
from .base import ModelLoader


class CrossProviderLoader(ModelLoader):
    """Wrapper that allows using different providers for solving and grading."""

    def __init__(self,
                 solver_loader: ModelLoader,
                 grader_loader: Optional[ModelLoader] = None,
                 **kwargs):
        """
        Initialize cross-provider loader.

        Args:
            solver_loader: ModelLoader instance for solving problems
            grader_loader: ModelLoader instance for grading (if None, uses solver_loader)
            **kwargs: Additional arguments passed to parent class
        """
        # If no grader loader specified, use the solver loader for both
        self.solver_loader = solver_loader
        self.grader_loader = grader_loader or solver_loader

        # Initialize parent with combined model info
        # (parent sees the solver's solver_model and the grader's grader_model).
        super().__init__(
            solver_model=solver_loader.solver_model,
            grader_model=self.grader_loader.grader_model,
            **kwargs
        )

        # Track if we're using cross-provider.
        # NOTE(review): uses `!=` (equality), not `is not`; loaders without a
        # custom __eq__ compare by identity anyway, so passing the same loader
        # twice keeps this False.
        self.is_cross_provider = grader_loader is not None and grader_loader != solver_loader

    async def _call_api(self,
                        model: str,
                        messages: list[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Route API calls to the appropriate provider based on the model.

        Args:
            model: Model name to use
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response)
        """
        # Determine which loader to use based on the model.
        # If solver and grader share a model name, the solver loader wins
        # (checked first).
        if model == self.solver_model:
            return await self.solver_loader._call_api(model, messages, temperature)
        elif model == self.grader_model:
            return await self.grader_loader._call_api(model, messages, temperature)
        else:
            # Fallback: try to determine based on which loader has the model.
            # (These checks mirror the two above; they only differ for loaders
            # whose solver_model/grader_model attributes were mutated after
            # this wrapper was constructed.)
            if hasattr(self.solver_loader, 'solver_model') and model == self.solver_loader.solver_model:
                return await self.solver_loader._call_api(model, messages, temperature)
            elif hasattr(self.grader_loader, 'grader_model') and model == self.grader_loader.grader_model:
                return await self.grader_loader._call_api(model, messages, temperature)
            else:
                raise ValueError(f"Model {model} not found in either solver or grader loader")

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the configured models and providers."""
        solver_info = self.solver_loader.get_model_info()
        grader_info = self.grader_loader.get_model_info()

        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "solver_provider": solver_info.get("provider", "unknown"),
            "grader_provider": grader_info.get("provider", "unknown"),
            "is_cross_provider": self.is_cross_provider,
            "solver_info": solver_info,
            "grader_info": grader_info
        }

    async def health_check(self) -> bool:
        """
        Perform health checks on both providers.

        Returns:
            True if both providers are healthy, False otherwise
        """
        print("🔍 Checking solver provider health...")
        solver_health = await self.solver_loader.health_check()

        if self.is_cross_provider:
            print("🔍 Checking grader provider health...")
            grader_health = await self.grader_loader.health_check()
            return solver_health and grader_health
        else:
            # Single provider: one check covers both roles.
            return solver_health

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate costs for both providers.

        Args:
            num_problems: Number of problems to process
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with combined cost estimates
        """
        # Get solver costs (the underlying loader estimates both roles;
        # only its "solve_cost" component is used in the combined case).
        solver_costs = await self.solver_loader.estimate_cost(
            num_problems, avg_problem_length, avg_solution_length
        )

        if self.is_cross_provider:
            # Get grader costs separately
            grader_costs = await self.grader_loader.estimate_cost(
                num_problems, avg_problem_length, avg_solution_length
            )

            # Combine costs
            return {
                "solver_cost": solver_costs.get("solve_cost", 0),
                "grader_cost": grader_costs.get("grade_cost", 0),
                "total_cost": solver_costs.get("solve_cost", 0) + grader_costs.get("grade_cost", 0),
                "solver_provider": self.solver_loader.get_model_info().get("provider"),
                "grader_provider": self.grader_loader.get_model_info().get("provider"),
                "solver_model": self.solver_model,
                "grader_model": self.grader_model,
                "num_problems": num_problems,
                "note": "Cross-provider costs combined"
            }
        else:
            # Single provider costs — note the key names differ from the
            # cross-provider branch ("solve_cost"/"grade_cost" vs
            # "solver_cost"/"grader_cost"); callers must handle both shapes.
            return solver_costs

    async def __aenter__(self):
        """Async context manager entry."""
        # Forward entry to whichever wrapped loaders support it.
        if hasattr(self.solver_loader, '__aenter__'):
            await self.solver_loader.__aenter__()
        if self.is_cross_provider and hasattr(self.grader_loader, '__aenter__'):
            await self.grader_loader.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        # Forward exit; when solver and grader are the same loader only the
        # solver branch runs, so the shared loader is closed exactly once.
        if hasattr(self.solver_loader, '__aexit__'):
            await self.solver_loader.__aexit__(exc_type, exc_val, exc_tb)
        if self.is_cross_provider and hasattr(self.grader_loader, '__aexit__'):
            await self.grader_loader.__aexit__(exc_type, exc_val, exc_tb)
"""
Gemini model loader implementation.
Handles API calls to Google Gemini models with proper error handling and retry logic.
"""

import asyncio
import random
from typing import Dict, List, Tuple, Optional

try:
    import google.generativeai as genai
    from google.generativeai.types import generation_types
except ImportError:
    genai = None
    generation_types = None

from .base import ModelLoader
from .prompts import RESPONSE_FORMAT


class GeminiModelLoader(ModelLoader):
    """Gemini implementation of the ModelLoader."""

    def __init__(self,
                 solver_model: str = "gemini-1.5-flash",
                 grader_model: str = "gemini-1.5-pro",
                 api_key: Optional[str] = None,
                 **kwargs):
        """
        Initialize Gemini model loader.

        Args:
            solver_model: Gemini model for solving problems (default: gemini-1.5-flash)
            grader_model: Gemini model for grading solutions (default: gemini-1.5-pro)
            api_key: Google AI API key (if None, uses environment variable GOOGLE_API_KEY)
            **kwargs: Additional arguments passed to parent class

        Raises:
            ImportError: If the google-generativeai package is not installed.
        """
        if genai is None:
            raise ImportError(
                "google-generativeai package is required for GeminiModelLoader. "
                "Install with: pip install google-generativeai"
            )

        super().__init__(solver_model, grader_model, **kwargs)

        # Configure Google AI
        if api_key:
            genai.configure(api_key=api_key)
        else:
            # Will use GOOGLE_API_KEY environment variable
            genai.configure()

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an API call to Gemini.

        Args:
            model: Gemini model name
            messages: List of messages in chat format (OpenAI-style role/content dicts)
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response)

        Raises:
            Exception: Re-raises any API error after logging (and after
                sleeping on rate/quota errors) so the caller's retry logic fires.
        """
        try:
            # Initialize the model
            model_instance = genai.GenerativeModel(model)

            # Convert OpenAI format to Gemini format
            system_instruction = None
            conversation = []

            for msg in messages:
                if msg["role"] == "system":
                    system_instruction = msg["content"]
                elif msg["role"] == "user":
                    conversation.append({"role": "user", "parts": [msg["content"]]})
                elif msg["role"] == "assistant":
                    # Gemini uses "model" for the assistant role
                    conversation.append({"role": "model", "parts": [msg["content"]]})

            # Configure generation parameters
            generation_config = genai.types.GenerationConfig(
                temperature=temperature,
                max_output_tokens=4000,
            )

            # Request JSON format for all Gemini models
            # Flash models now support JSON format as per latest API documentation
            generation_config.response_mime_type = "application/json"

            # Make the API call (generate_content is blocking, so run in a thread)
            if system_instruction and len(conversation) == 1:
                # Single user message with system instruction
                prompt = f"{system_instruction}\n\n{conversation[0]['parts'][0]}"
                response = await asyncio.to_thread(
                    model_instance.generate_content,
                    prompt,
                    generation_config=generation_config
                )
            else:
                # Multi-turn conversation
                if system_instruction:
                    # Prepend system instruction to first user message
                    if conversation and conversation[0]["role"] == "user":
                        conversation[0]["parts"][0] = f"{system_instruction}\n\n{conversation[0]['parts'][0]}"

                response = await asyncio.to_thread(
                    model_instance.generate_content,
                    conversation,
                    generation_config=generation_config
                )

            # Extract response content
            content = ""
            if response.text:
                content = response.text

            return content, content

        except Exception as e:
            error_str = str(e)

            # Handle different types of errors
            if "quota" in error_str.lower() or "rate" in error_str.lower():
                print(f"🚫 Rate/Quota Error: {error_str}")
                if "quota" in error_str.lower():
                    print("⏳ Detected quota exhaustion - sleeping 15 minutes")
                    await asyncio.sleep(900)  # 15 minutes
                else:
                    sleep_time = 2 + random.random()
                    print(f" ⏰ Rate limited, sleeping {sleep_time:.1f}s")
                    await asyncio.sleep(sleep_time)
                # Re-raise to trigger retry logic
                raise
            elif "api" in error_str.lower():
                print(f"❌ Gemini API Error: {error_str}")
                raise
            else:
                print(f"❌ Unexpected error in Gemini API call: {error_str}")
                raise

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the configured models."""
        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "gemini"
        }

    async def health_check(self) -> bool:
        """
        Perform a simple health check to verify API connectivity.

        Returns:
            True if API is accessible, False otherwise
        """
        try:
            # Simple test call
            test_messages = [
                {"role": "user", "content": "Hello, please respond with a simple JSON: {\"status\": \"ok\"}"}
            ]

            result, _ = await self._call_api(
                model=self.solver_model,
                messages=test_messages,
                temperature=0.0
            )

            if result and "ok" in result.lower():
                print(f"✅ Gemini API health check passed for {self.solver_model}")
                return True
            else:
                print(f"⚠️ Gemini API health check returned unexpected response")
                return False

        except Exception as e:
            print(f"❌ Gemini API health check failed: {str(e)}")
            return False

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate the cost for processing a given number of problems.

        Args:
            num_problems: Number of problems to process (0 yields an all-zero estimate)
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with cost estimates
        """
        # Guard against num_problems <= 0: previously this raised
        # ZeroDivisionError on the cost_per_problem computation below.
        if num_problems <= 0:
            return {
                "solve_cost": 0.0,
                "grade_cost": 0.0,
                "total_cost": 0.0,
                "cost_per_problem": 0.0,
                "currency": "USD"
            }

        # Rough token estimates (1 token ≈ 4 characters for English)
        tokens_per_solve = (avg_problem_length + avg_solution_length) // 4
        tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4

        # Gemini pricing (update with actual Google AI pricing)
        # These are rough estimates and should be updated with current pricing
        pricing = {
            "gemini-1.5-flash": {"input": 0.000075, "output": 0.0003},  # per 1K tokens
            "gemini-1.5-pro": {"input": 0.00125, "output": 0.005},  # per 1K tokens
            "gemini-1.0-pro": {"input": 0.0005, "output": 0.0015},  # per 1K tokens
        }

        def get_model_cost(model: str, input_tokens: int, output_tokens: int) -> float:
            # Unknown models fall back to gemini-1.5-pro pricing.
            if model not in pricing:
                model = "gemini-1.5-pro"  # Default fallback

            input_cost = (input_tokens / 1000) * pricing[model]["input"]
            output_cost = (output_tokens / 1000) * pricing[model]["output"]
            return input_cost + output_cost

        # Calculate costs
        solve_cost = get_model_cost(
            self.solver_model,
            tokens_per_solve * num_problems,
            tokens_per_solve * num_problems // 2  # Assume output is ~50% of input
        )

        grade_cost = get_model_cost(
            self.grader_model,
            tokens_per_grade * num_problems,
            tokens_per_grade * num_problems // 3  # Assume output is ~33% of input
        )

        total_cost = solve_cost + grade_cost

        return {
            "solve_cost": round(solve_cost, 4),
            "grade_cost": round(grade_cost, 4),
            "total_cost": round(total_cost, 4),
            "cost_per_problem": round(total_cost / num_problems, 6),
            "currency": "USD"
        }
"""
Hugging Face local model loader implementation.
Handles direct inference with locally loaded transformers models.
"""

import asyncio
import random
from typing import Dict, List, Tuple, Optional
import json

try:
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
    import transformers
except ImportError:
    torch = None
    AutoModelForCausalLM = None
    AutoTokenizer = None
    pipeline = None
    transformers = None

from .base import ModelLoader


class HuggingFaceModelLoader(ModelLoader):
    """Hugging Face local model implementation of the ModelLoader."""

    def __init__(self,
                 solver_model: str = "microsoft/DialoGPT-medium",
                 grader_model: str = "microsoft/DialoGPT-large",
                 device: str = "auto",
                 max_length: int = 4000,
                 **kwargs):
        """
        Initialize Hugging Face model loader.

        Args:
            solver_model: HuggingFace model name for solving problems
            grader_model: HuggingFace model name for grading solutions
            device: Device to run models on ("auto", "cuda", "cpu")
            max_length: Maximum generation length
            **kwargs: Additional arguments passed to parent class

        Raises:
            ImportError: If transformers or torch is not installed.
        """
        if transformers is None or torch is None:
            raise ImportError(
                "transformers and torch packages are required for HuggingFaceModelLoader. "
                "Install with: pip install transformers torch"
            )

        super().__init__(solver_model, grader_model, **kwargs)

        # Device setup
        if device == "auto":
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        self.max_length = max_length

        # Model and tokenizer caches (keyed by model name)
        self._models = {}
        self._tokenizers = {}
        self._pipelines = {}

        print(f"🔧 HuggingFace loader initialized on device: {self.device}")

    async def _load_model(self, model_name: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
        """Load model and tokenizer, with caching."""
        if model_name not in self._models:
            print(f"📥 Loading model: {model_name}")

            try:
                # Load in a separate thread to avoid blocking the event loop
                tokenizer = await asyncio.to_thread(
                    AutoTokenizer.from_pretrained,
                    model_name,
                    trust_remote_code=True
                )

                model = await asyncio.to_thread(
                    AutoModelForCausalLM.from_pretrained,
                    model_name,
                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                    device_map="auto" if self.device == "cuda" else None,
                    trust_remote_code=True
                )

                if self.device == "cpu":
                    model = model.to(self.device)

                # Set pad token if not present
                if tokenizer.pad_token is None:
                    tokenizer.pad_token = tokenizer.eos_token

                self._models[model_name] = model
                self._tokenizers[model_name] = tokenizer

                print(f"✅ Model loaded successfully: {model_name}")

            except Exception as e:
                print(f"❌ Failed to load model {model_name}: {str(e)}")
                raise

        return self._models[model_name], self._tokenizers[model_name]

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make a local inference call using the HuggingFace model.

        Args:
            model: Model name to use
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response)
        """
        try:
            # Load model and tokenizer
            hf_model, tokenizer = await self._load_model(model)

            # Convert messages to prompt format
            prompt = self._format_messages(messages)

            # Generate response
            response = await self._generate_response(
                hf_model, tokenizer, prompt, temperature
            )

            return response, response

        except Exception as e:
            print(f"❌ HuggingFace inference error: {str(e)}")
            raise

    def _format_messages(self, messages: List[Dict[str, str]]) -> str:
        """Convert OpenAI message format to a prompt string."""
        prompt_parts = []

        for msg in messages:
            role = msg["role"]
            content = msg["content"]

            if role == "system":
                prompt_parts.append(f"System: {content}")
            elif role == "user":
                prompt_parts.append(f"User: {content}")
            elif role == "assistant":
                prompt_parts.append(f"Assistant: {content}")

        # Trailing "Assistant:" prompts the model to continue as the assistant
        prompt_parts.append("Assistant:")
        return "\n\n".join(prompt_parts)

    async def _generate_response(self,
                                 model: AutoModelForCausalLM,
                                 tokenizer: AutoTokenizer,
                                 prompt: str,
                                 temperature: float) -> str:
        """Generate response using the loaded model."""

        # Tokenize input
        inputs = await asyncio.to_thread(
            tokenizer.encode,
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048  # Leave room for generation
        )

        if self.device == "cuda":
            inputs = inputs.to(self.device)

        # Generation parameters
        gen_kwargs = {
            "max_new_tokens": min(self.max_length, 2048),
            "temperature": max(temperature, 0.1),  # Avoid 0 temperature
            "do_sample": temperature > 0.0,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "attention_mask": torch.ones_like(inputs)
        }

        if temperature > 0.0:
            gen_kwargs.update({
                "top_p": 0.9,
                "top_k": 50
            })

        # Generate
        with torch.no_grad():
            outputs = await asyncio.to_thread(
                model.generate,
                inputs,
                **gen_kwargs
            )

        # Decode response
        generated_text = await asyncio.to_thread(
            tokenizer.decode,
            outputs[0][inputs.shape[1]:],  # Only new tokens
            skip_special_tokens=True
        )

        return generated_text.strip()

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the configured models."""
        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "huggingface",
            "device": self.device,
            "loaded_models": list(self._models.keys())
        }

    async def health_check(self) -> bool:
        """
        Perform a simple health check by testing model loading and inference.

        Returns:
            True if models can be loaded and run, False otherwise
        """
        try:
            # Simple test
            test_messages = [
                {"role": "user", "content": "Hello, please say 'ok' to confirm you're working."}
            ]

            result, _ = await self._call_api(
                model=self.solver_model,
                messages=test_messages,
                temperature=0.1
            )

            if result and len(result) > 0:
                print(f"✅ HuggingFace health check passed for {self.solver_model}")
                return True
            else:
                print(f"⚠️ HuggingFace health check returned empty response")
                return False

        except Exception as e:
            print(f"❌ HuggingFace health check failed: {str(e)}")
            return False

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate computational cost for processing problems locally.

        Args:
            num_problems: Number of problems to process (0 yields an all-zero estimate)
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with cost estimates (computational cost in arbitrary units)
        """
        # Guard against num_problems <= 0: previously this raised
        # ZeroDivisionError on the cost_per_problem computation below.
        if num_problems <= 0:
            return {
                "solve_cost": 0.0,
                "grade_cost": 0.0,
                "total_cost": 0.0,
                "cost_per_problem": 0.0,
                "currency": "computational_units",
                "device": self.device,
                "note": "Local HuggingFace costs are computational (time/energy/memory)"
            }

        # Rough token estimates (1 token ≈ 4 characters for English)
        tokens_per_solve = (avg_problem_length + avg_solution_length) // 4
        tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4

        # Model size-based cost estimation (FLOPS approximation)
        model_costs = {
            # Small models (< 1B parameters)
            "gpt2": 0.5,
            "distilgpt2": 0.3,
            "dialogpt-small": 0.4,
            "dialogpt-medium": 0.8,

            # Medium models (1B - 10B parameters)
            "dialogpt-large": 1.5,
            "gpt2-medium": 1.0,
            "gpt2-large": 2.0,
            "gpt2-xl": 4.0,

            # Large models (10B+ parameters)
            "llama-7b": 8.0,
            "llama-13b": 15.0,
            "llama-30b": 35.0,
            "llama-65b": 70.0,
        }

        def get_model_cost(model: str) -> float:
            # First substring match wins (insertion order), so e.g. "gpt2-large"
            # matches the "gpt2" entry before its own — kept for compatibility.
            model_lower = model.lower()
            for key, cost in model_costs.items():
                if key in model_lower:
                    return cost

            # Default based on common model sizes
            if any(size in model_lower for size in ["small", "mini"]):
                return 0.5
            elif any(size in model_lower for size in ["medium", "base"]):
                return 1.0
            elif any(size in model_lower for size in ["large", "xl"]):
                return 2.0
            else:
                return 1.5  # Default for unknown models

        # Calculate computational costs
        solver_cost_factor = get_model_cost(self.solver_model)
        grader_cost_factor = get_model_cost(self.grader_model)

        # Device multiplier (GPU is faster but uses more power)
        device_multiplier = 0.3 if self.device == "cuda" else 1.0

        solve_cost = tokens_per_solve * num_problems * solver_cost_factor * device_multiplier / 1000
        grade_cost = tokens_per_grade * num_problems * grader_cost_factor * device_multiplier / 1000

        total_cost = solve_cost + grade_cost

        return {
            "solve_cost": round(solve_cost, 4),
            "grade_cost": round(grade_cost, 4),
            "total_cost": round(total_cost, 4),
            "cost_per_problem": round(total_cost / num_problems, 6),
            "currency": "computational_units",
            "device": self.device,
            "note": "Local HuggingFace costs are computational (time/energy/memory)"
        }

    async def unload_model(self, model_name: str) -> bool:
        """
        Unload a specific model to free memory.

        Args:
            model_name: Name of the model to unload

        Returns:
            True if successfully unloaded, False otherwise
        """
        try:
            if model_name in self._models:
                del self._models[model_name]
                del self._tokenizers[model_name]

                # Force garbage collection so the dropped model's tensors are
                # actually released (the previous code only emptied the CUDA
                # cache, leaving CPU memory to the collector's own schedule).
                import gc
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                print(f"🗑️ Unloaded model: {model_name}")
                return True
            else:
                print(f"⚠️ Model not loaded: {model_name}")
                return False

        except Exception as e:
            print(f"❌ Error unloading model {model_name}: {str(e)}")
            return False

    async def unload_all_models(self) -> bool:
        """
        Unload all models to free memory.

        Returns:
            True if all models successfully unloaded
        """
        try:
            model_names = list(self._models.keys())
            success = True

            for model_name in model_names:
                if not await self.unload_model(model_name):
                    success = False

            return success

        except Exception as e:
            print(f"❌ Error unloading all models: {str(e)}")
            return False
"""
OpenAI model loader implementation.
Handles API calls to OpenAI models with proper error handling and retry logic.
"""

import asyncio
import random
from typing import Dict, List, Tuple, Optional
import os  # Added for KimiModelLoader

from openai import AsyncOpenAI, RateLimitError, APIError, APIConnectionError, BadRequestError

from .base import ModelLoader
from .prompts import RESPONSE_FORMAT


class OpenAIModelLoader(ModelLoader):
    """OpenAI implementation of the ModelLoader."""

    def __init__(self,
                 solver_model: str = "gpt-4o-mini",
                 grader_model: str = "o3",
                 api_key: Optional[str] = None,
                 base_url: Optional[str] = None,
                 **kwargs):
        """
        Initialize OpenAI model loader.

        Args:
            solver_model: OpenAI model for solving problems (default: gpt-4o-mini)
            grader_model: OpenAI model for grading solutions (default: o3)
            api_key: OpenAI API key (if None, uses environment variable)
            base_url: Custom base URL for OpenAI API
            **kwargs: Additional arguments passed to parent class
        """
        super().__init__(solver_model, grader_model, **kwargs)

        # Initialize OpenAI client with custom httpx client for high concurrency
        client_kwargs = {}
        if api_key:
            client_kwargs["api_key"] = api_key
        if base_url:
            client_kwargs["base_url"] = base_url

        # Configure httpx for high concurrency
        import httpx
        limits = httpx.Limits(
            max_connections=1000,  # Total connection pool size
            max_keepalive_connections=500,  # Persistent connections
            keepalive_expiry=30.0  # Keep connections alive for 30s
        )
        timeout = httpx.Timeout(
            timeout=600.0,  # Overall timeout (increased from 300)
            connect=60.0,  # Connection timeout
            read=600.0,  # Read timeout (increased from 300)
            write=60.0  # Write timeout
        )

        http_client = httpx.AsyncClient(
            limits=limits,
            timeout=timeout
        )
        client_kwargs["http_client"] = http_client

        self.client = AsyncOpenAI(**client_kwargs)
        self._http_client = http_client  # Keep reference to close later

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit - close http client."""
        if hasattr(self, '_http_client'):
            await self._http_client.aclose()

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an API call to OpenAI.

        Args:
            model: OpenAI model name
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response)

        Raises:
            RateLimitError, BadRequestError, APIError, APIConnectionError:
                Re-raised (after a backoff sleep where applicable) so the
                caller's retry logic fires.
        """
        try:
            # Override temperature for models that require it
            # o1, o3, o3-mini, and o4-mini only support temperature 1.0
            if any(model_name in model.lower() for model_name in ['o1', 'o3', 'o3-mini', 'o4-mini']):
                actual_temperature = 1.0
                if self.debug and temperature != 1.0:
                    print(f"⚠️ Overriding temperature from {temperature} to 1.0 for model {model}")
            else:
                actual_temperature = temperature

            # Prepare API call parameters
            api_params = {
                "model": model,
                "messages": messages,
                "temperature": actual_temperature,
                # Set max_tokens to avoid truncation
                # Most OpenAI models support at least 4096, newer ones support much more
                "max_tokens": 32000,  # High default that works for GPT-4 and newer models
            }

            # Only add response_format for models that support it
            # o1 models and some older models don't support JSON format
            # Note: o3 and o3-mini DO support response_format (tested and confirmed)
            if not (model.startswith("o1") or model in ["gpt-4", "gpt-3.5-turbo"]):
                api_params["response_format"] = RESPONSE_FORMAT

            # Remove max_tokens for models that don't support it
            # o1 and o3 models don't support max_tokens parameter
            if model.startswith("o1") or model.startswith("o3"):
                api_params.pop("max_tokens", None)

            # Make the API call
            response = await self.client.chat.completions.create(**api_params)

            # Extract response content
            content = response.choices[0].message.content or ""

            return content, content

        except RateLimitError as e:
            # Handle rate limiting with special logic
            error_str = str(e)
            if self.debug:
                print(f"🚫 RateLimitError: {error_str}")

            if "insufficient_quota" in error_str:
                if self.debug:
                    print("⏳ Detected quota exhaustion - sleeping 15 minutes")
                await asyncio.sleep(900)  # 15 minutes
            else:
                # Standard rate limit - shorter sleep
                sleep_time = 2 + random.random()
                if self.debug:
                    print(f" ⏰ Rate limited, sleeping {sleep_time:.1f}s")
                await asyncio.sleep(sleep_time)

            # Re-raise to trigger retry logic
            raise

        except BadRequestError as e:
            # Handle policy violations and other 400 errors with special logic
            error_str = str(e)
            if self.debug:
                print(f"🚫 BadRequestError: {error_str}")

            if "usage policy" in error_str or "flagged" in error_str:
                if self.debug:
                    print("⏳ Detected policy violation - sleeping 30 seconds before retry")
                await asyncio.sleep(30)  # Longer delay for policy violations
            else:
                # Standard bad request - shorter sleep
                sleep_time = 5 + random.random()
                if self.debug:
                    print(f" ⏰ Bad request error, sleeping {sleep_time:.1f}s")
                await asyncio.sleep(sleep_time)

            # Re-raise to trigger retry logic
            raise

        except (APIError, APIConnectionError) as e:
            if self.debug:
                print(f"❌ OpenAI API Error: {str(e)}")
            raise

        except Exception as e:
            if self.debug:
                print(f"❌ Unexpected error in OpenAI API call: {str(e)}")
            raise

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the configured models."""
        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "openai"
        }

    async def health_check(self) -> bool:
        """
        Perform a simple health check to verify API connectivity.

        Returns:
            True if API is accessible, False otherwise
        """
        try:
            # Simple test call
            test_messages = [
                {"role": "user", "content": "Hello, please respond with a simple JSON: {\"status\": \"ok\"}"}
            ]

            # Set temperature based on model
            # o1, o3, o3-mini, and o4-mini require temperature 1.0
            if any(model_name in self.solver_model.lower() for model_name in ['o1', 'o3', 'o3-mini', 'o4-mini']):
                temperature = 1.0
            else:
                # Use temperature 0.0 for deterministic results with other models
                temperature = 0.0

            result, _ = await self._call_api(
                model=self.solver_model,
                messages=test_messages,
                temperature=temperature
            )

            if result and "ok" in result.lower():
                if self.debug:
                    print(f"✅ OpenAI API health check passed for {self.solver_model}")
                return True
            else:
                if self.debug:
                    print(f"⚠️ OpenAI API health check returned unexpected response")
                return False

        except Exception as e:
            if self.debug:
                print(f"❌ OpenAI API health check failed: {str(e)}")
            return False

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate the cost for processing a given number of problems.

        Args:
            num_problems: Number of problems to process (0 yields an all-zero estimate)
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with cost estimates
        """
        # Guard against num_problems <= 0: previously this raised
        # ZeroDivisionError on the cost_per_problem computation below.
        if num_problems <= 0:
            return {
                "solve_cost": 0.0,
                "grade_cost": 0.0,
                "total_cost": 0.0,
                "cost_per_problem": 0.0,
                "currency": "USD"
            }

        # Rough token estimates (1 token ≈ 4 characters for English)
        tokens_per_solve = (avg_problem_length + avg_solution_length) // 4
        tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4

        # Simplified pricing (update with actual OpenAI pricing)
        # These are rough estimates and should be updated with current pricing
        pricing = {
            "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},  # per 1K tokens
            "o3": {"input": 0.03, "output": 0.12},  # per 1K tokens (estimated)
            "gpt-4": {"input": 0.03, "output": 0.06},  # per 1K tokens
        }

        def get_model_cost(model: str, input_tokens: int, output_tokens: int) -> float:
            # Unknown models fall back to gpt-4 pricing.
            if model not in pricing:
                model = "gpt-4"  # Default fallback

            input_cost = (input_tokens / 1000) * pricing[model]["input"]
            output_cost = (output_tokens / 1000) * pricing[model]["output"]
            return input_cost + output_cost

        # Calculate costs
        solve_cost = get_model_cost(
            self.solver_model,
            tokens_per_solve * num_problems,
            tokens_per_solve * num_problems // 2  # Assume output is ~50% of input
        )

        grade_cost = get_model_cost(
            self.grader_model,
            tokens_per_grade * num_problems,
            tokens_per_grade * num_problems // 3  # Assume output is ~33% of input
        )

        total_cost = solve_cost + grade_cost

        return {
            "solve_cost": round(solve_cost, 4),
            "grade_cost": round(grade_cost, 4),
            "total_cost": round(total_cost, 4),
            "cost_per_problem": round(total_cost / num_problems, 6),
            "currency": "USD"
        }
Initialize Kimi model loader. + + Args: + solver_model: Kimi model for solving problems (default: moonshot-v1-8k) + grader_model: Kimi model for grading solutions (default: moonshot-v1-8k) + api_key: Kimi API key (if None, uses MOONSHOT_API_KEY environment variable) + **kwargs: Additional arguments passed to parent class + """ + # Get API key from parameter or environment + if api_key is None: + api_key = os.getenv('MOONSHOT_API_KEY') + + # Initialize with Kimi-specific settings + super().__init__( + solver_model=solver_model, + grader_model=grader_model, + api_key=api_key, + base_url="https://api.moonshot.ai/v1", + **kwargs + ) + + async def _call_api(self, + model: str, + messages: List[Dict[str, str]], + temperature: float = 0.0) -> Tuple[Optional[str], str]: + """ + Make an API call to Kimi with proper error handling. + + Args: + model: Kimi model name + messages: List of messages in chat format + temperature: Temperature for generation + + Returns: + Tuple of (response_content, raw_response) + """ + import time + + start_time = time.time() + if self.debug: + print(f"🔄 Starting Kimi API call with model: {model}") + + try: + # Prepare API call parameters + api_params = { + "model": model, + "messages": messages, + "temperature": temperature, + "response_format": RESPONSE_FORMAT, # Kimi supports JSON format + } + + # Set max_tokens based on model + if "128k" in model: + api_params["max_tokens"] = 32000 # For 128k context models + elif "32k" in model: + api_params["max_tokens"] = 16000 # For 32k context models + elif "8k" in model: + api_params["max_tokens"] = 8000 # For 8k context models + elif "k2" in model.lower(): + api_params["max_tokens"] = 24000 # For K2 models + else: + api_params["max_tokens"] = 16000 # Default high limit + + if self.debug: + print(f"📋 API call parameters: model={model}, messages={len(messages)}, temp={temperature}, max_tokens={api_params['max_tokens']}") + + # Make the API call + response = await 
self.client.chat.completions.create(**api_params) + + elapsed_time = time.time() - start_time + if self.debug: + print(f"✅ Kimi API call completed in {elapsed_time:.2f}s") + + # Extract response content + content = response.choices[0].message.content or "" + if self.debug: + print(f"📄 Response length: {len(content)} characters") + + # Check if response might be truncated + if self.debug and hasattr(response, 'usage'): + completion_tokens = response.usage.completion_tokens + print(f"📊 Completion tokens used: {completion_tokens}") + if completion_tokens >= api_params['max_tokens'] * 0.95: # 95% of limit + print(f"⚠️ WARNING: Response may be truncated (used {completion_tokens}/{api_params['max_tokens']} tokens)") + + # Check if content ends abruptly (truncation signs) + if self.debug and content and not content.strip().endswith(('"}', '"}')): + print("⚠️ WARNING: Response doesn't end with proper JSON closure - likely truncated") + + # ============= RAW RESPONSE LOGGING (DEBUG ONLY) ============= + if self.debug: + import json + from pathlib import Path + from datetime import datetime + + # Create raw response log directory + log_dir = Path("kimi_raw_responses") + log_dir.mkdir(exist_ok=True) + + # Save raw response + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')[:-3] # Include milliseconds + raw_log_file = log_dir / f"kimi_raw_response_{timestamp}.json" + + raw_response_data = { + "timestamp": datetime.now().isoformat(), + "model": model, + "api_params": api_params, + "response_time_seconds": elapsed_time, + "raw_content": content, + "content_length": len(content), + "response_object": { + "choices": [ + { + "message": { + "content": content, + "role": response.choices[0].message.role + } + } + ] + } + } + + try: + with open(raw_log_file, 'w', encoding='utf-8') as f: + json.dump(raw_response_data, f, indent=2, ensure_ascii=False) + print(f"💾 Raw response saved to: {raw_log_file}") + except Exception as save_error: + print(f"❌ Failed to save raw response: 
{save_error}") + + # Also print raw content to console + print(f"📋 RAW RESPONSE CONTENT:") + print(f"{'='*60}") + print(content[:1000] + ("..." if len(content) > 1000 else "")) + print(f"{'='*60}") + # ============= END RAW RESPONSE LOGGING ============= + + return content, content + + except RateLimitError as e: + elapsed_time = time.time() - start_time + error_str = str(e) + if self.debug: + print(f"🚫 Kimi RateLimitError after {elapsed_time:.2f}s: {error_str}") + + # Try to capture response details + if self.debug and hasattr(e, 'response') and e.response: + print(f" Status: {e.response.status_code}") + print(f" Headers: {dict(e.response.headers)}") + print(f" Response: {e.response.text[:500]}...") + + if "insufficient_quota" in error_str: + if self.debug: + print("⏳ Detected Kimi quota exhaustion - sleeping 15 minutes") + await asyncio.sleep(900) # 15 minutes + else: + # Standard rate limit - shorter sleep + sleep_time = 2 + random.random() + if self.debug: + print(f" ⏰ Rate limited on Kimi API, sleeping {sleep_time:.1f}s") + await asyncio.sleep(sleep_time) + + # Re-raise to trigger retry logic + raise + + except (APIError, APIConnectionError) as e: + elapsed_time = time.time() - start_time + error_str = str(e) + if self.debug: + print(f"❌ Kimi API Error after {elapsed_time:.2f}s: {error_str}") + + # Try to capture response details + if self.debug and hasattr(e, 'response') and e.response: + print(f" Status: {e.response.status_code}") + print(f" Headers: {dict(e.response.headers)}") + print(f" Response: {e.response.text[:500]}...") + + # Log request details for debugging + if self.debug and hasattr(e, 'request') and e.request: + print(f" Request URL: {e.request.url}") + print(f" Request method: {e.request.method}") + print(f" Request headers: {dict(e.request.headers)}") + + raise + + except Exception as e: + elapsed_time = time.time() - start_time + error_str = str(e) + if self.debug: + print(f"❌ Unexpected error in Kimi API call after {elapsed_time:.2f}s: 
{error_str}") + print(f" Error type: {type(e).__name__}") + + # Try to capture any additional error details + if self.debug and hasattr(e, 'response'): + try: + print(f" Response status: {e.response.status_code}") + print(f" Response headers: {dict(e.response.headers)}") + print(f" Response text: {e.response.text[:500]}...") + except: + print(" Could not extract response details") + + # Log the full exception + if self.debug: + import traceback + print(f" Full traceback: {traceback.format_exc()}") + + raise + + def get_model_info(self) -> Dict[str, str]: + """Get information about the configured models.""" + return { + "solver_model": self.solver_model, + "grader_model": self.grader_model, + "provider": "kimi", + "base_url": "https://api.moonshot.ai/v1" + } + + async def health_check(self) -> bool: + """ + Perform a simple health check to verify Kimi API connectivity. + + Returns: + True if API is accessible, False otherwise + """ + try: + # Simple test call with Kimi's system prompt + test_messages = [ + {"role": "system", "content": "You are Kimi, an AI assistant provided by Moonshot AI. You are proficient in Chinese and English conversations. You provide users with safe, helpful, and accurate answers. You will reject any questions involving terrorism, racism, or explicit content. 
Moonshot AI is a proper noun and should not be translated."}, + {"role": "user", "content": "Hello, please respond with a simple JSON: {\"status\": \"ok\"}"} + ] + + result, _ = await self._call_api( + model=self.solver_model, + messages=test_messages, + temperature=0.0 + ) + + if result and "ok" in result.lower(): + if self.debug: + print(f"✅ Kimi API health check passed for {self.solver_model}") + return True + else: + if self.debug: + print(f"⚠️ Kimi API health check returned unexpected response") + return False + + except Exception as e: + if self.debug: + print(f"❌ Kimi API health check failed: {str(e)}") + return False + + async def estimate_cost(self, + num_problems: int, + avg_problem_length: int = 1000, + avg_solution_length: int = 2000) -> Dict[str, float]: + """ + Estimate the cost for processing a given number of problems with Kimi models. + + Args: + num_problems: Number of problems to process + avg_problem_length: Average length of problem statements in characters + avg_solution_length: Average length of solutions in characters + + Returns: + Dictionary with cost estimates + """ + # Rough token estimates (1 token ≈ 4 characters for English) + tokens_per_solve = (avg_problem_length + avg_solution_length) // 4 + tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4 + + # Kimi pricing (in USD per 1K tokens) + # These are example prices - update with actual Kimi pricing + pricing = { + "moonshot-v1-8k": {"input": 0.012, "output": 0.012}, + "moonshot-v1-32k": {"input": 0.024, "output": 0.024}, + "moonshot-v1-128k": {"input": 0.06, "output": 0.06}, + } + + def get_model_cost(model: str, input_tokens: int, output_tokens: int) -> float: + if model not in pricing: + model = "moonshot-v1-8k" # Default to 8k pricing + + input_cost = (input_tokens / 1000) * pricing[model]["input"] + output_cost = (output_tokens / 1000) * pricing[model]["output"] + return input_cost + output_cost + + # Calculate costs + solve_cost = get_model_cost( + 
class OpenRouterModelLoader(OpenAIModelLoader):
    """OpenRouter implementation using OpenAI-compatible API."""

    def __init__(self,
                 solver_model: str = "openai/gpt-4o",
                 grader_model: str = "openai/gpt-4o",
                 api_key: Optional[str] = None,
                 site_url: Optional[str] = None,
                 site_name: Optional[str] = None,
                 **kwargs):
        """
        Initialize OpenRouter model loader.

        Args:
            solver_model: Model for solving problems, as "provider/model-name"
                (e.g. "openai/gpt-4o", "anthropic/claude-3-opus").
            grader_model: Model for grading solutions, same format.
            api_key: OpenRouter API key; falls back to the OPENROUTER_API_KEY
                environment variable when omitted.
            site_url: Optional site URL for rankings on openrouter.ai.
            site_name: Optional site name for rankings on openrouter.ai.
            **kwargs: Forwarded to the parent class.

        Raises:
            ValueError: if no key was passed and the environment has none.
        """
        # Resolve the key: explicit argument wins, then the environment.
        if api_key is None:
            api_key = os.getenv('OPENROUTER_API_KEY')
            if not api_key:
                raise ValueError("OpenRouter API key not provided. Set OPENROUTER_API_KEY environment variable or pass api_key parameter")

        # Keep the attribution info around; _call_api turns it into headers.
        self.site_url = site_url
        self.site_name = site_name

        # Route the OpenAI-compatible client at OpenRouter's endpoint.
        super().__init__(
            solver_model=solver_model,
            grader_model=grader_model,
            api_key=api_key,
            base_url="https://openrouter.ai/api/v1",
            **kwargs
        )

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an API call to OpenRouter with proper headers.

        Args:
            model: Model name in "provider/model-name" format.
            messages: Chat-format message list.
            temperature: Sampling temperature.

        Returns:
            Tuple of (response_content, raw_response).

        Raises:
            ValueError: on an empty response/content from the API.
            Any provider exception, re-raised after logging so the caller's
            retry machinery can handle it.
        """
        try:
            # Optional attribution headers for openrouter.ai rankings.
            ranking_headers = {}
            if self.site_url:
                ranking_headers["HTTP-Referer"] = self.site_url
            if self.site_name:
                ranking_headers["X-Title"] = self.site_name

            from .prompts import RESPONSE_FORMAT

            # Set max_tokens to avoid truncation, especially for models like
            # Gemini; 32000 is a reasonable default that works for most models.
            # OpenRouter handles response_format compatibility per model.
            request_kwargs = {
                "model": model,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": 32000,
                "response_format": RESPONSE_FORMAT,
            }
            if ranking_headers:
                request_kwargs["extra_headers"] = ranking_headers

            response = await self.client.chat.completions.create(**request_kwargs)

            # Guard against an empty envelope before touching choices[0].
            if not response or not response.choices or len(response.choices) == 0:
                raise ValueError("Empty response from OpenRouter API")

            content = response.choices[0].message.content
            if not content:
                raise ValueError("Empty content in OpenRouter API response")

            return content, content

        except Exception as e:
            # Re-brand error text so logs point at the right provider.
            error_msg = str(e)
            if "OpenAI API Error" in error_msg:
                error_msg = error_msg.replace("OpenAI API Error", "OpenRouter API Error")

            kind = type(e).__name__
            if "RateLimitError" in kind:
                print(f"🚫 OpenRouter RateLimitError: {error_msg}")
            elif "APIError" in kind or "APIConnectionError" in kind:
                print(f"❌ OpenRouter API Error: {error_msg}")
            else:
                print(f"❌ Unexpected error in OpenRouter API call: {error_msg}")
            raise

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the configured models."""
        info = {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "openrouter",
            "base_url": "https://openrouter.ai/api/v1",
        }
        return info

    async def health_check(self) -> bool:
        """
        Perform a simple health check to verify OpenRouter API connectivity.

        Returns:
            True if API is accessible, False otherwise
        """
        probe = [
            {"role": "user", "content": "Hello, please respond with a simple JSON: {\"status\": \"ok\"}"}
        ]
        try:
            reply, _ = await self._call_api(self.solver_model, probe, temperature=0.0)
        except Exception as e:
            print(f"❌ OpenRouter health check failed: {e}")
            return False
        return reply is not None

    @staticmethod
    def get_available_models() -> List[str]:
        """
        Get a list of commonly available models on OpenRouter.
        Note: This is not exhaustive. Check https://openrouter.ai/models for full list.

        Returns:
            List of model identifiers in "provider/model-name" format
        """
        openai_models = [
            "openai/gpt-4o",
            "openai/gpt-4o-mini",
            "openai/gpt-4-turbo",
            "openai/gpt-3.5-turbo",
            "openai/o1-preview",
            "openai/o1-mini",
        ]
        anthropic_models = [
            "anthropic/claude-3-opus",
            "anthropic/claude-3-sonnet",
            "anthropic/claude-3-haiku",
            "anthropic/claude-2.1",
            "anthropic/claude-2",
        ]
        google_models = [
            "google/gemini-pro",
            "google/gemini-pro-vision",
            "google/palm-2-codechat-bison",
            "google/palm-2-chat-bison",
        ]
        meta_models = [
            "meta-llama/llama-3-70b-instruct",
            "meta-llama/llama-3-8b-instruct",
            "meta-llama/codellama-70b-instruct",
        ]
        mistral_models = [
            "mistralai/mistral-large",
            "mistralai/mistral-medium",
            "mistralai/mistral-small",
            "mistralai/mistral-7b-instruct",
            "mistralai/mixtral-8x7b-instruct",
        ]
        other_models = [
            "cohere/command-r-plus",
            "cohere/command-r",
            "databricks/dbrx-instruct",
            "deepseek/deepseek-coder",
            "deepseek/deepseek-chat",
            "qwen/qwen-2-72b-instruct",
            "qwen/qwen-1.5-110b-chat",
        ]
        return (openai_models + anthropic_models + google_models
                + meta_models + mistral_models + other_models)
"""
Prompt templates for mathematical problem solving and grading.
These prompts have been refined and validated through extensive testing.

Exposes:
    - Solver prompts (system + user template) that request a JSON answer.
    - Two grader personas: a strict one for proof problems and a lenient,
      answer-focused one for calculation problems, each with a matching
      user template that expects the same JSON grading schema.
    - RESPONSE_FORMAT and default retry/timeout knobs shared by the loaders.
"""

# Solver system prompt - used with the solver model (originally tuned on 4o-mini)
SOLVER_SYSTEM_PROMPT = """You are an expert mathematician solving competition-level problems.
Provide detailed, step-by-step solutions with clear mathematical reasoning.

Requirements:
- Show all your work and intermediate steps
- Justify each major step of your reasoning
- Use proper mathematical notation
- Be thorough but concise
- State your final answer clearly

Solve the problem completely and rigorously."""

# User template for the solver; format with problem_statement.
# Doubled braces render literal JSON braces after .format().
SOLVER_USER_TEMPLATE = """Please solve this mathematical problem:

{problem_statement}

Provide a complete solution with detailed reasoning. Return your response in JSON format:
{{"solution": "your complete step-by-step solution with mathematical reasoning",
 "final_answer": "your final answer in a clear, concise form"}}"""

# Proof strict grading system prompt - used with the grader model (originally o3)
PROOF_GRADER_SYSTEM_PROMPT = """You are an extremely strict mathematical grader evaluating competition-level PROOF problems.

GRADING STANDARDS (BE VERY STRICT):
- Mathematical rigor: Every step must be mathematically sound and justified
- Logical flow: The reasoning must be clear, complete, and logically connected
- Correctness: All calculations, algebraic manipulations, and conclusions must be correct
- Completeness: The solution must address all parts of the problem fully
- Precision: Mathematical statements must be precise and unambiguous

FAILING CRITERIA (Mark as INCORRECT if ANY of these apply):
- Any unjustified logical leap or gap in reasoning
- Any computational error, no matter how small
- Missing steps in critical parts of the argument
- Imprecise or ambiguous mathematical statements
- Incorrect final answer, even if approach is partially correct
- Circular reasoning or logical fallacies
- Misuse of mathematical theorems or definitions

BE EXTREMELY STRICT. Competition mathematics proofs require perfect precision."""

# Calculation lenient grading system prompt - used with the grader model (originally o3)
CALCULATION_GRADER_SYSTEM_PROMPT = """You are a mathematical grader evaluating competition-level CALCULATION problems.

GRADING STANDARDS FOR CALCULATION PROBLEMS:
- Primary focus: Is the final answer correct?
- Secondary focus: Is the overall approach reasonable and mathematically sound?
- Computation: Allow minor computational slips if the method is correct and final answer is right

GRADING CRITERIA:
- CORRECT: Final answer is correct AND approach is fundamentally sound
- INCORRECT: Final answer is wrong OR approach is fundamentally flawed

For calculation problems, the final numerical answer is the most important criterion.
Minor intermediate errors are acceptable if they don't affect the final result."""

# User template for proof grading; format with problem_statement, solution,
# reference_solution. Output schema mirrors the calculation template below.
PROOF_GRADER_USER_TEMPLATE = """Grade this PROOF solution with extreme strictness.

PROBLEM:
{problem_statement}

STUDENT SOLUTION:
{solution}

CORRECT REFERENCE SOLUTION:
{reference_solution}

Evaluate with maximum strictness. Every logical step must be perfect. Return JSON with:
{{"grade": "CORRECT" or "INCORRECT",
 "detailed_feedback": "specific detailed analysis of what is right/wrong",
 "major_issues": "list of significant mathematical errors or gaps",
 "final_answer_correct": true or false,
 "reasoning_rigor_score": 0-10 integer (10=perfect rigor, 0=severely flawed),
 "overall_assessment": "comprehensive evaluation summary"}}"""

# User template for calculation grading; same placeholders and output schema
# as PROOF_GRADER_USER_TEMPLATE, but graded answer-first.
CALCULATION_GRADER_USER_TEMPLATE = """Grade this CALCULATION solution with focus on final answer correctness.

PROBLEM:
{problem_statement}

STUDENT SOLUTION:
{solution}

CORRECT REFERENCE SOLUTION:
{reference_solution}

Focus primarily on whether the final answer is correct. Return JSON with:
{{"grade": "CORRECT" or "INCORRECT",
 "detailed_feedback": "specific detailed analysis of what is right/wrong",
 "major_issues": "list of significant mathematical errors or gaps",
 "final_answer_correct": true or false,
 "reasoning_rigor_score": 0-10 integer (10=perfect rigor, 0=severely flawed),
 "overall_assessment": "comprehensive evaluation summary"}}"""

# Response format for JSON output (OpenAI-style structured-output flag)
RESPONSE_FORMAT = {"type": "json_object"}

# Default retry and timeout settings
DEFAULT_RETRIES = 6  # Limited to 6 retries before marking as failed
DEFAULT_TIMEOUT_BASE = 600
class VLLMDirectModelLoader(ModelLoader):
    """VLLM direct Python API implementation of the ModelLoader."""

    def __init__(self,
                 solver_model: str = "gpt2",
                 grader_model: str = "gpt2",
                 max_model_len: int = 512,
                 gpu_memory_utilization: float = 0.4,
                 device: str = "auto",
                 **kwargs):
        """
        Initialize VLLM direct model loader.

        Args:
            solver_model: Model name for solving problems (default: gpt2)
            grader_model: Model name for grading solutions (default: gpt2)
            max_model_len: Maximum sequence length (default: 512 for testing)
            gpu_memory_utilization: GPU memory utilization ratio (default: 0.4)
            device: Device to use ('auto', 'cuda', 'cpu')
            **kwargs: Additional arguments passed to parent class

        Raises:
            ImportError: if the vllm package is not installed.
        """
        if not VLLM_AVAILABLE:
            raise ImportError(
                "vllm package is required for VLLMDirectModelLoader. "
                "Install with: pip install vllm"
            )

        super().__init__(solver_model, grader_model, **kwargs)

        self.max_model_len = max_model_len
        self.gpu_memory_utilization = gpu_memory_utilization
        self.device = device

        # Model instances (lazy loaded on first use)
        self._solver_llm = None
        self._grader_llm = None
        self._loaded_models = []

        print(f"🔧 VLLM Direct loader initialized")
        print(f"   Device: {device}")
        print(f"   Max length: {max_model_len}")
        print(f"   GPU utilization: {gpu_memory_utilization}")

    def _get_vllm_config(self, model: str) -> Dict[str, Any]:
        """Get VLLM configuration for a model."""
        return {
            "model": model,
            "max_model_len": self.max_model_len,
            "gpu_memory_utilization": self.gpu_memory_utilization,
            "trust_remote_code": False,
            "enforce_eager": True,  # Disable graph optimization for faster startup
        }

    async def _load_model(self, model: str, purpose: str) -> LLM:
        """Load a VLLM model instance.

        Args:
            model: Model name to load.
            purpose: Human-readable role ("solver"/"grader"), for logging only.
        """
        print(f"📥 Loading {purpose} model: {model}")

        try:
            config = self._get_vllm_config(model)
            llm = LLM(**config)

            self._loaded_models.append(model)
            print(f"✅ Model loaded successfully: {model}")
            return llm

        except Exception as e:
            print(f"❌ Failed to load model {model}: {e}")
            raise

    async def _get_solver_model(self) -> LLM:
        """Get or load the solver model."""
        if self._solver_llm is None:
            self._solver_llm = await self._load_model(self.solver_model, "solver")
        return self._solver_llm

    async def _get_grader_model(self) -> LLM:
        """Get or load the grader model."""
        if self._grader_llm is None:
            # If solver and grader use the same model, reuse the instance
            if self.solver_model == self.grader_model and self._solver_llm is not None:
                print(f"♻️ Reusing solver model for grading: {self.grader_model}")
                self._grader_llm = self._solver_llm
            else:
                self._grader_llm = await self._load_model(self.grader_model, "grader")
        return self._grader_llm

    def _format_messages_as_prompt(self, messages: List[Dict[str, str]]) -> str:
        """Convert chat messages to a single prompt string.

        Unknown roles are silently dropped; a trailing "Assistant:" cue is
        appended unless the conversation already ends with an assistant turn.
        """
        prompt_parts = []

        for message in messages:
            role = message["role"]
            content = message["content"]

            if role == "system":
                prompt_parts.append(f"System: {content}")
            elif role == "user":
                prompt_parts.append(f"User: {content}")
            elif role == "assistant":
                prompt_parts.append(f"Assistant: {content}")

        # Add final assistant prompt.
        # BUG FIX: the original indexed messages[-1] unconditionally and
        # raised IndexError on an empty message list.
        if not messages or messages[-1]["role"] != "assistant":
            prompt_parts.append("Assistant:")

        return "\n\n".join(prompt_parts)

    def _extract_json_from_response(self, response: str) -> Optional[Dict]:
        """Extract JSON from model response.

        Tries the first brace-delimited span, then the whole string.
        Returns None when nothing parses.
        """
        try:
            # Try to find JSON in the response
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                json_str = json_match.group()
                return json.loads(json_str)

            # If no JSON found, try to parse the entire response
            return json.loads(response.strip())

        except json.JSONDecodeError:
            # If JSON parsing fails, return None
            return None

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an inference call using VLLM.

        Args:
            model: Model name to use (must be the configured solver or grader)
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response)

        Raises:
            ValueError: if model is neither the solver nor the grader model.
        """
        try:
            # Get the appropriate model instance
            if model == self.solver_model:
                llm = await self._get_solver_model()
            elif model == self.grader_model:
                llm = await self._get_grader_model()
            else:
                raise ValueError(f"Unknown model: {model}")

            # Convert messages to prompt
            prompt = self._format_messages_as_prompt(messages)

            # Set up sampling parameters
            sampling_params = SamplingParams(
                temperature=temperature,
                top_p=0.95,
                max_tokens=500,  # Reasonable limit for responses
                stop=["\nUser:", "\nSystem:"]  # Stop at new conversation turns
            )

            # Generate response.
            # NOTE(review): llm.generate is synchronous and blocks the event
            # loop for the duration of inference - acceptable for this
            # single-consumer loader, but worth confirming for concurrent use.
            outputs = llm.generate([prompt], sampling_params)

            if outputs and len(outputs) > 0:
                generated_text = outputs[0].outputs[0].text
                return generated_text.strip(), generated_text
            else:
                return None, ""

        except Exception as e:
            print(f"❌ VLLM inference error: {str(e)}")
            raise

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the configured models.

        Note: "loaded_models" is a list, hence the Dict[str, Any] return.
        """
        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "vllm_direct",
            "device": self.device,
            "loaded_models": self._loaded_models
        }

    async def health_check(self) -> bool:
        """
        Perform a simple health check to verify VLLM functionality.

        Returns:
            True if models can be loaded and generate text, False otherwise
        """
        try:
            print(f"🔍 VLLM health check starting...")

            # Try to load and use the solver model
            test_messages = [
                {"role": "user", "content": "Hello! Please respond with 'Health check OK'."}
            ]

            result, _ = await self._call_api(
                model=self.solver_model,
                messages=test_messages,
                temperature=0.1
            )

            if result and len(result) > 0:
                print(f"✅ VLLM health check passed for {self.solver_model}")
                print(f"   Response: {result[:50]}...")
                return True
            else:
                print(f"❌ VLLM health check failed: empty response")
                return False

        except Exception as e:
            print(f"❌ VLLM health check failed: {str(e)}")
            return False

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate the cost for processing a given number of problems.
        For direct VLLM, cost is computational (time/energy).

        Args:
            num_problems: Number of problems to process
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with cost estimates (in arbitrary computational units)
        """
        # Token estimates (1 token ≈ 4 characters)
        tokens_per_solve = (avg_problem_length + avg_solution_length) // 4
        tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4

        # Model size cost factors (based on parameter count)
        model_costs = {
            "gpt2": 1.0,            # 124M params
            "distilgpt2": 0.5,      # 82M params
            "microsoft/dialo": 1.2, # DialoGPT variants
            "tinyllama": 2.0,       # 1.1B params
        }

        def get_model_cost(model: str) -> float:
            # Substring match against known families; unknown models get a
            # middling default factor.
            model_lower = model.lower()
            for key, cost in model_costs.items():
                if key in model_lower:
                    return cost
            return 1.5  # Default cost

        solver_cost_factor = get_model_cost(self.solver_model)
        grader_cost_factor = get_model_cost(self.grader_model)

        # Computational cost estimation (arbitrary units)
        solve_cost = tokens_per_solve * num_problems * solver_cost_factor / 10000
        grade_cost = tokens_per_grade * num_problems * grader_cost_factor / 10000

        total_cost = solve_cost + grade_cost

        return {
            "solve_cost": round(solve_cost, 4),
            "grade_cost": round(grade_cost, 4),
            "total_cost": round(total_cost, 4),
            "cost_per_problem": round(total_cost / num_problems, 6),
            "currency": "computational_units",
            "note": "Direct VLLM costs are computational (GPU time/energy)"
        }

    async def unload_all_models(self):
        """Unload all loaded models to free GPU memory."""
        try:
            print("🗑️ Unloading VLLM models...")

            # BUG FIX: decide whether the grader shares the solver instance
            # BEFORE resetting _solver_llm. The original compared against
            # self._solver_llm after it was already None, so a shared grader
            # always failed the aliasing check and was deleted anyway.
            grader_is_shared = self._grader_llm is self._solver_llm

            # Clean up model instances
            if self._solver_llm is not None:
                del self._solver_llm
                self._solver_llm = None

            if self._grader_llm is not None and not grader_is_shared:
                del self._grader_llm
            self._grader_llm = None

            # Clear CUDA cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            self._loaded_models.clear()
            print("✅ Models unloaded successfully")

        except Exception as e:
            print(f"⚠️ Error during model cleanup: {e}")
"""
VLLM local model loader implementation.
Handles API calls to locally deployed VLLM services with OpenAI-compatible endpoints.
"""

import asyncio
import random
from typing import Dict, List, Tuple, Optional

try:
    from openai import AsyncOpenAI, RateLimitError, APIError, APIConnectionError
except ImportError:
    # Degrade gracefully when the optional client library is absent; the
    # loader refuses to construct in that case (see __init__).
    AsyncOpenAI = None
    RateLimitError = Exception
    APIError = Exception
    APIConnectionError = Exception

from .base import ModelLoader
from .prompts import RESPONSE_FORMAT


class VLLMModelLoader(ModelLoader):
    """VLLM local model implementation of the ModelLoader."""

    def __init__(self,
                 solver_model: str = "meta-llama/Llama-3.2-3B-Instruct",
                 grader_model: str = "meta-llama/Llama-3.2-8B-Instruct",
                 base_url: str = "http://localhost:8000/v1",
                 api_key: str = "EMPTY",
                 **kwargs):
        """
        Initialize VLLM model loader.

        Args:
            solver_model: Model name for solving problems (default: Llama-3.2-3B-Instruct)
            grader_model: Model name for grading solutions (default: Llama-3.2-8B-Instruct)
            base_url: VLLM server URL (default: http://localhost:8000/v1)
            api_key: API key for VLLM server (default: "EMPTY" for local)
            **kwargs: Additional arguments passed to parent class

        Raises:
            ImportError: If the `openai` package is not installed.
        """
        if AsyncOpenAI is None:
            raise ImportError(
                "openai package is required for VLLMModelLoader. "
                "Install with: pip install openai"
            )

        super().__init__(solver_model, grader_model, **kwargs)

        # VLLM exposes an OpenAI-compatible endpoint, so the stock async
        # client works unchanged against the local server.
        self.client = AsyncOpenAI(
            base_url=base_url,
            api_key=api_key
        )
        self.base_url = base_url

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an API call to VLLM server.

        Args:
            model: Model name to use
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response). Both elements carry the
            same text, since VLLM returns plain chat-completion content.

        Raises:
            RateLimitError / APIError / APIConnectionError: re-raised after
                logging (and a short backoff on rate limits) so the caller's
                retry logic can take over.
        """
        try:
            # Prepare API call parameters
            api_params = {
                "model": model,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": 4000,
            }

            # Request structured JSON output only for deterministic runs.
            # FIX: the original wrapped this plain dict assignment in a bare
            # `try/except: pass`, which was dead code — an assignment cannot
            # raise. If the server rejects response_format, the error surfaces
            # from the create() call below and flows through the normal
            # retry/error path; callers already parse content manually.
            # NOTE(review): many local models ignore response_format — confirm
            # against the deployed server.
            if temperature == 0.0:
                api_params["response_format"] = RESPONSE_FORMAT

            # Make the API call
            response = await self.client.chat.completions.create(**api_params)

            # Guard against a null message body from the server.
            content = response.choices[0].message.content or ""

            return content, content

        except (RateLimitError, APIError, APIConnectionError) as e:
            # Handle various API errors
            error_str = str(e)
            print(f"❌ VLLM API Error: {error_str}")

            # Brief randomized backoff before the caller retries.
            if "rate" in error_str.lower() or "limit" in error_str.lower():
                sleep_time = 2 + random.random()
                print(f" ⏰ Rate limited, sleeping {sleep_time:.1f}s")
                await asyncio.sleep(sleep_time)

            # Re-raise to trigger retry logic
            raise

        except Exception as e:
            print(f"❌ Unexpected error in VLLM API call: {str(e)}")
            raise

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the configured models."""
        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "vllm",
            "base_url": self.base_url
        }

    async def health_check(self) -> bool:
        """
        Perform a simple health check to verify VLLM server connectivity.

        Returns:
            True if server is accessible, False otherwise
        """
        try:
            # Simple test call
            test_messages = [
                {"role": "user", "content": "Hello, please respond with a simple JSON: {\"status\": \"ok\"}"}
            ]

            result, _ = await self._call_api(
                model=self.solver_model,
                messages=test_messages,
                temperature=0.0
            )

            # A loose content check: the prompt invites either an "ok" JSON or
            # a greeting, so accept both.
            if result and ("ok" in result.lower() or "hello" in result.lower()):
                print(f"✅ VLLM API health check passed for {self.solver_model}")
                return True
            else:
                print(f"⚠️ VLLM API health check returned unexpected response")
                return False

        except Exception as e:
            print(f"❌ VLLM API health check failed: {str(e)}")
            print(f" Make sure VLLM server is running at {self.base_url}")
            return False

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate the cost for processing a given number of problems.
        For local VLLM, cost is typically computational (time/energy) rather than monetary.

        Args:
            num_problems: Number of problems to process
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with cost estimates (computational cost in arbitrary units)
        """
        # Rough token estimates (1 token ≈ 4 characters for English)
        tokens_per_solve = (avg_problem_length + avg_solution_length) // 4
        tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4

        # Computational cost estimation (arbitrary units based on model size)
        # Larger models consume more computational resources
        model_costs = {
            "llama-3.2-1b": 1.0,
            "llama-3.2-3b": 2.0,
            "llama-3.2-8b": 4.0,
            "llama-3.1-8b": 4.0,
            "llama-3.1-70b": 20.0,
            "mistral-7b": 3.0,
            "qwen2.5-7b": 3.0,
        }

        def get_model_cost(model: str) -> float:
            # Substring match so full HF repo names resolve to a size tier.
            model_lower = model.lower()
            for key, cost in model_costs.items():
                if key in model_lower:
                    return cost
            return 3.0  # Default cost for unknown models

        # Calculate computational costs
        solver_cost_factor = get_model_cost(self.solver_model)
        grader_cost_factor = get_model_cost(self.grader_model)

        solve_cost = tokens_per_solve * num_problems * solver_cost_factor / 1000
        grade_cost = tokens_per_grade * num_problems * grader_cost_factor / 1000

        total_cost = solve_cost + grade_cost

        # FIX: guard the per-problem average against num_problems == 0,
        # which previously raised ZeroDivisionError.
        cost_per_problem = total_cost / num_problems if num_problems else 0.0

        return {
            "solve_cost": round(solve_cost, 4),
            "grade_cost": round(grade_cost, 4),
            "total_cost": round(total_cost, 4),
            "cost_per_problem": round(cost_per_problem, 6),
            "currency": "computational_units",
            "note": "Local VLLM costs are computational (time/energy) rather than monetary"
        }

    async def list_models(self) -> List[str]:
        """
        List available models on the VLLM server.

        Returns:
            List of available model names; falls back to the configured
            solver/grader pair when the server's model list is unavailable.
        """
        try:
            # Try to get models list from VLLM server
            models_response = await self.client.models.list()
            return [model.id for model in models_response.data]
        except Exception as e:
            print(f"⚠️ Could not retrieve models list: {str(e)}")
            return [self.solver_model, self.grader_model]
"""
xAI model loader implementation.
Handles API calls to xAI Grok models using OpenAI-compatible interface.
"""

import os
from typing import Dict, Optional, List, Tuple

from .openai_client import OpenAIModelLoader


class XAIModelLoader(OpenAIModelLoader):
    """xAI implementation using OpenAI-compatible API."""

    def __init__(self,
                 solver_model: str = "grok-3",
                 grader_model: str = "grok-3",
                 api_key: Optional[str] = None,
                 **kwargs):
        """
        Initialize xAI model loader.

        Args:
            solver_model: xAI model for solving problems (default: grok-3)
            grader_model: xAI model for grading solutions (default: grok-3)
            api_key: xAI API key (if None, uses XAI_API_KEY environment variable)
            **kwargs: Additional arguments passed to parent class
        """
        # Get API key from parameter or environment
        if api_key is None:
            api_key = os.getenv('XAI_API_KEY')

        # Reuse the OpenAI-compatible client, pointed at the xAI endpoint.
        super().__init__(
            solver_model=solver_model,
            grader_model=grader_model,
            api_key=api_key,
            base_url="https://api.x.ai/v1",
            **kwargs
        )

    async def _call_api(self,
                        model: str,
                        messages: List[Dict[str, str]],
                        temperature: float = 0.0) -> Tuple[Optional[str], str]:
        """
        Make an API call to xAI with proper error handling.

        Args:
            model: xAI model name
            messages: List of messages in chat format
            temperature: Temperature for generation

        Returns:
            Tuple of (response_content, raw_response)

        Raises:
            Exception: re-raised from the parent after logging with an
                xAI-specific prefix so callers' retry logic still fires.
        """
        try:
            # Call parent's implementation
            return await super()._call_api(model, messages, temperature)

        except Exception as e:
            # Rebrand the parent's error text for xAI-specific log output;
            # the original exception is always re-raised unchanged.
            error_msg = str(e)
            if "OpenAI API Error" in error_msg:
                error_msg = error_msg.replace("OpenAI API Error", "xAI API Error")

            # Log with xAI-specific prefix
            if "RateLimitError" in type(e).__name__:
                print(f"🚫 xAI RateLimitError: {error_msg}")
                raise
            elif "APIError" in type(e).__name__ or "APIConnectionError" in type(e).__name__:
                print(f"❌ xAI API Error: {error_msg}")
                raise
            else:
                print(f"❌ Unexpected error in xAI API call: {error_msg}")
                raise

    def get_model_info(self) -> Dict[str, str]:
        """Get information about the configured models."""
        return {
            "solver_model": self.solver_model,
            "grader_model": self.grader_model,
            "provider": "xai",
            "base_url": "https://api.x.ai/v1"
        }

    async def health_check(self) -> bool:
        """
        Perform a simple health check to verify xAI API connectivity.

        Returns:
            True if API is accessible, False otherwise
        """
        try:
            # Simple test call
            test_messages = [
                {"role": "user", "content": "Hello, please respond with a simple JSON: {\"status\": \"ok\"}"}
            ]

            result, _ = await self._call_api(
                model=self.solver_model,
                messages=test_messages,
                temperature=0.0
            )

            if result and "ok" in result.lower():
                print(f"✅ xAI API health check passed for {self.solver_model}")
                return True
            else:
                print(f"⚠️ xAI API health check returned unexpected response")
                return False

        except Exception as e:
            print(f"❌ xAI API health check failed: {str(e)}")
            return False

    async def estimate_cost(self,
                            num_problems: int,
                            avg_problem_length: int = 1000,
                            avg_solution_length: int = 2000) -> Dict[str, float]:
        """
        Estimate the cost for processing a given number of problems with xAI models.

        Args:
            num_problems: Number of problems to process
            avg_problem_length: Average length of problem statements in characters
            avg_solution_length: Average length of solutions in characters

        Returns:
            Dictionary with cost estimates
        """
        # Rough token estimates (1 token ≈ 4 characters for English)
        tokens_per_solve = (avg_problem_length + avg_solution_length) // 4
        tokens_per_grade = (avg_problem_length + avg_solution_length * 2) // 4

        # xAI pricing (update with actual pricing when available)
        # These are estimates based on similar model pricing
        pricing = {
            "grok-3": {"input": 0.01, "output": 0.03},  # per 1K tokens (estimated)
            "grok-2": {"input": 0.005, "output": 0.015},  # per 1K tokens (estimated)
        }

        def get_model_cost(model: str, input_tokens: int, output_tokens: int) -> float:
            # Unknown models fall back to grok-3 pricing.
            rates = pricing.get(model, pricing["grok-3"])
            input_cost = (input_tokens / 1000) * rates["input"]
            output_cost = (output_tokens / 1000) * rates["output"]
            return input_cost + output_cost

        # Calculate costs
        solve_cost = get_model_cost(
            self.solver_model,
            tokens_per_solve * num_problems,
            tokens_per_solve * num_problems // 2  # Assume output is ~50% of input
        )

        grade_cost = get_model_cost(
            self.grader_model,
            tokens_per_grade * num_problems,
            tokens_per_grade * num_problems // 3  # Assume output is ~33% of input
        )

        total_cost = solve_cost + grade_cost

        # FIX: guard the per-problem average against num_problems == 0,
        # which previously raised ZeroDivisionError.
        cost_per_problem = total_cost / num_problems if num_problems else 0.0

        return {
            "solve_cost": round(solve_cost, 4),
            "grade_cost": round(grade_cost, 4),
            "total_cost": round(total_cost, 4),
            "cost_per_problem": round(cost_per_problem, 6),
            "currency": "USD",
            "note": "xAI pricing estimates - update with actual pricing"
        }
\ No newline at end of file |
