summaryrefslogtreecommitdiff
path: root/putnam-bench-anon/examples/openrouter_example.py
blob: bc75877fbe7c42e9259414829b62b0d04eb3ba51 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""
Example of using OpenRouter with putnam-bench to solve mathematical problems.

This example demonstrates:
1. Using different model combinations from different providers
2. Solving a real problem from the dataset
3. Comparing results across different models
"""

import asyncio
import json
import os
from loader import create_loader

async def solve_with_openrouter():
    """Solve a sample Putnam problem using several OpenRouter model pairings.

    Walks through three solver/grader model combinations, solving the
    same problem with each; for proof-type problems the solution is also
    graded. All results are reported via stdout; returns None.
    """
    # Fail fast if the API key is missing -- every request would fail anyway.
    if not os.getenv('OPENROUTER_API_KEY'):
        print("❌ Please set OPENROUTER_API_KEY environment variable")
        return
    
    # Load a sample problem from the bundled dataset.
    problem_file = "dataset/1938-A-1.json"
    if not os.path.exists(problem_file):
        print(f"❌ Problem file not found: {problem_file}")
        print("   Make sure you're running from the project root directory")
        return
    
    # Explicit encoding: the dataset is UTF-8; without it, json.load would
    # decode with the platform default (e.g. cp1252 on Windows) and could
    # mangle or reject non-ASCII problem statements.
    with open(problem_file, encoding="utf-8") as f:
        problem_data = json.load(f)
    
    # Truncate the statement for display; full text is sent to the solver.
    print(f"📚 Problem: {problem_data['problem_statement'][:100]}...")
    print(f"   Type: {problem_data['problem_type']}")
    print(f"   Year: {problem_data['year']}")
    
    # Solver/grader pairings to compare across providers.
    test_configs = [
        {
            "name": "OpenAI Only",
            "solver": "openai/gpt-4o-mini",
            "grader": "openai/gpt-4o"
        },
        {
            "name": "Mixed OpenAI/Anthropic",
            "solver": "openai/gpt-4o",
            "grader": "anthropic/claude-3-haiku"
        },
        {
            "name": "Google Gemini",
            "solver": "google/gemini-pro",
            "grader": "google/gemini-pro"
        }
    ]
    
    for config in test_configs:
        print(f"\n{'='*60}")
        print(f"🧪 Testing: {config['name']}")
        print(f"   Solver: {config['solver']}")
        print(f"   Grader: {config['grader']}")
        
        try:
            # Create a loader bound to this config's solver/grader pair.
            loader = create_loader(
                "openrouter",
                solver_model=config['solver'],
                grader_model=config['grader'],
                retries=3,
                timeout_base=120
            )
            
            # Solve the problem; `raw` is the unparsed model response.
            print("\n⏳ Solving problem...")
            solution, raw = await loader.solve_problem(problem_data['problem_statement'])
            
            if solution:
                print("✅ Solution found!")
                print(f"   Final answer: {solution.get('final_answer', 'N/A')}")
                
                # Only proof problems are graded; calculation problems have a
                # checkable final answer and skip the grader pass.
                if problem_data['problem_type'] == 'proof':
                    print("\n⏳ Grading solution...")
                    grade_result = await loader.grade_solution(
                        problem_data['problem_statement'],
                        solution['solution'],
                        problem_data.get('ground_truth_solution', ''),
                        problem_type='proof'
                    )
                    
                    if grade_result:
                        print(f"📊 Grade: {grade_result.get('score', 'N/A')}/10")
                        print(f"   Reasoning: {grade_result.get('reasoning', 'N/A')[:100]}...")
                else:
                    print("   (Calculation problem - grading skipped)")
            else:
                print("❌ Failed to get solution")
                
        except Exception as e:
            # Broad catch is deliberate: one failing config (auth, timeout,
            # provider error) should not abort the remaining comparisons.
            print(f"❌ Error: {type(e).__name__}: {e}")
            
    print(f"\n{'='*60}")
    print("✅ Example completed!")

async def list_recommended_models():
    """Print recommended solver/grader model pairings for common use cases."""

    print("\n📋 Recommended OpenRouter Model Combinations:\n")

    # Each entry: (use case, solver model, grader model, notes).
    picks = [
        ("Best Quality (Expensive)",
         "openai/gpt-4o",
         "anthropic/claude-3-opus",
         "Highest accuracy but most expensive"),
        ("Balanced Performance",
         "openai/gpt-4o-mini",
         "anthropic/claude-3-sonnet",
         "Good balance of cost and performance"),
        ("Budget Friendly",
         "openai/gpt-3.5-turbo",
         "google/gemini-pro",
         "Cheapest option, still decent quality"),
        ("Open Source Models",
         "meta-llama/llama-3-70b-instruct",
         "mistralai/mixtral-8x7b-instruct",
         "Using open-source models only"),
        ("Code-Focused",
         "deepseek/deepseek-coder",
         "meta-llama/codellama-70b-instruct",
         "Optimized for problems with code"),
    ]

    for use_case, solver, grader, notes in picks:
        print(f"🎯 {use_case}")
        print(f"   Solver: {solver}")
        print(f"   Grader: {grader}")
        print(f"   Notes: {notes}")
        print()

if __name__ == "__main__":
    print("๐Ÿš€ OpenRouter Example for Putnam Bench")
    
    # Run the example
    asyncio.run(solve_with_openrouter())
    
    # Show recommendations
    asyncio.run(list_recommended_models())