diff options
| author | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
| commit | 05704d0eb2fa59fe727652465b07db40bcb06c38 (patch) | |
| tree | 8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /putnam-bench-anon/examples/openrouter_example.py | |
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation
- Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM
- Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction
- Unicode -> bare-LaTeX cleaner + audit + spot-check
- Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'putnam-bench-anon/examples/openrouter_example.py')
| -rw-r--r-- | putnam-bench-anon/examples/openrouter_example.py | 157 |
1 file changed, 157 insertions, 0 deletions
#!/usr/bin/env python3
"""
Example of using OpenRouter with putnam-bench to solve mathematical problems.

This example demonstrates:
1. Using different model combinations from different providers
2. Solving a real problem from the dataset
3. Comparing results across different models
"""

import asyncio
import json
import os


async def solve_with_openrouter():
    """Solve a sample Putnam problem via OpenRouter with several model pairings.

    Reads ``dataset/1938-A-1.json`` relative to the current working directory,
    then, for each of three solver/grader model combinations, solves the
    problem and (for proof-type problems) grades the solution.  Returns early
    with a printed hint when the API key or the dataset file is missing.
    """
    # Check API key first so the error message is actionable.
    if not os.getenv('OPENROUTER_API_KEY'):
        print("❌ Please set OPENROUTER_API_KEY environment variable")
        return

    # Load a sample problem.
    problem_file = "dataset/1938-A-1.json"
    if not os.path.exists(problem_file):
        print(f"❌ Problem file not found: {problem_file}")
        print("   Make sure you're running from the project root directory")
        return

    # Deferred project import: placed after the guard clauses so the guards
    # (and the standalone recommendations listing below) work even when the
    # `loader` package is not importable.
    from loader import create_loader

    # JSON is defined over Unicode text; be explicit about the encoding.
    with open(problem_file, encoding="utf-8") as f:
        problem_data = json.load(f)

    print(f"📝 Problem: {problem_data['problem_statement'][:100]}...")
    print(f"   Type: {problem_data['problem_type']}")
    print(f"   Year: {problem_data['year']}")

    # Test with different model combinations.
    test_configs = [
        {
            "name": "OpenAI Only",
            "solver": "openai/gpt-4o-mini",
            "grader": "openai/gpt-4o"
        },
        {
            "name": "Mixed OpenAI/Anthropic",
            "solver": "openai/gpt-4o",
            "grader": "anthropic/claude-3-haiku"
        },
        {
            "name": "Google Gemini",
            "solver": "google/gemini-pro",
            "grader": "google/gemini-pro"
        }
    ]

    for config in test_configs:
        print(f"\n{'='*60}")
        print(f"🧪 Testing: {config['name']}")
        print(f"   Solver: {config['solver']}")
        print(f"   Grader: {config['grader']}")

        try:
            # Create loader with specific models.
            loader = create_loader(
                "openrouter",
                solver_model=config['solver'],
                grader_model=config['grader'],
                retries=3,
                timeout_base=120
            )

            # Solve the problem.
            print("\n⏳ Solving problem...")
            solution, raw = await loader.solve_problem(
                problem_data['problem_statement']
            )

            if solution:
                print("✅ Solution found!")
                print(f"   Final answer: {solution.get('final_answer', 'N/A')}")

                # Grade the solution (only meaningful for proof problems).
                if problem_data['problem_type'] == 'proof':
                    print("\n⏳ Grading solution...")
                    grade_result = await loader.grade_solution(
                        problem_data['problem_statement'],
                        solution['solution'],
                        problem_data.get('ground_truth_solution', ''),
                        problem_type='proof'
                    )

                    if grade_result:
                        print(f"📊 Grade: {grade_result.get('score', 'N/A')}/10")
                        print(f"   Reasoning: "
                              f"{grade_result.get('reasoning', 'N/A')[:100]}...")
                else:
                    print("   (Calculation problem - grading skipped)")
            else:
                print("❌ Failed to get solution")

        except Exception as e:
            # Broad catch is deliberate in this demo: report one model
            # combination's failure and continue with the next one.
            print(f"❌ Error: {type(e).__name__}: {e}")

    print(f"\n{'='*60}")
    print("✅ Example completed!")


async def list_recommended_models():
    """Print recommended OpenRouter solver/grader pairings per use case.

    Purely informational: no network access, no project dependencies.
    Kept async so both entry points are invoked uniformly via asyncio.run().
    """
    print("\n📋 Recommended OpenRouter Model Combinations:\n")

    recommendations = [
        {
            "use_case": "Best Quality (Expensive)",
            "solver": "openai/gpt-4o",
            "grader": "anthropic/claude-3-opus",
            "notes": "Highest accuracy but most expensive"
        },
        {
            "use_case": "Balanced Performance",
            "solver": "openai/gpt-4o-mini",
            "grader": "anthropic/claude-3-sonnet",
            "notes": "Good balance of cost and performance"
        },
        {
            "use_case": "Budget Friendly",
            "solver": "openai/gpt-3.5-turbo",
            "grader": "google/gemini-pro",
            "notes": "Cheapest option, still decent quality"
        },
        {
            "use_case": "Open Source Models",
            "solver": "meta-llama/llama-3-70b-instruct",
            "grader": "mistralai/mixtral-8x7b-instruct",
            "notes": "Using open-source models only"
        },
        {
            "use_case": "Code-Focused",
            "solver": "deepseek/deepseek-coder",
            "grader": "meta-llama/codellama-70b-instruct",
            "notes": "Optimized for problems with code"
        }
    ]

    for rec in recommendations:
        print(f"🎯 {rec['use_case']}")
        print(f"   Solver: {rec['solver']}")
        print(f"   Grader: {rec['grader']}")
        print(f"   Notes: {rec['notes']}")
        print()


if __name__ == "__main__":
    print("🚀 OpenRouter Example for Putnam Bench")

    # Run the example.
    asyncio.run(solve_with_openrouter())

    # Show recommendations.
    asyncio.run(list_recommended_models())
