1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
|
"""Async API caller for rescue experiment.
Supports OpenAI, Anthropic, Google. All callers return a unified dict:
{"status": "success"|"failed", "content": str, "error": str|None}
Concurrency is controlled per-provider via asyncio.Semaphore so we don't
saturate rate limits in any one provider.
"""
from __future__ import annotations
import asyncio
import json
import os
import random
from typing import Optional
# ---------- Provider constants ----------
# Solver model short alias -> provider name mapping
SOLVER_PROVIDERS = {
    "gpt-4.1-mini": "openai",
    "gpt-4o-mini": "openai",
    "claude-sonnet-4": "anthropic",
    "gemini-2.5-flash": "google",
}
# API model strings (the canonical IDs to send)
API_MODEL_NAMES = {
    "gpt-4.1-mini": "gpt-4.1-mini",
    "gpt-4o-mini": "gpt-4o-mini",
    "claude-sonnet-4": "claude-sonnet-4-20250514",
    "gemini-2.5-flash": "gemini-2.5-flash",
}
# Grading is always done by one fixed OpenAI model.
GRADER_MODEL = "gpt-4o"
GRADER_PROVIDER = "openai"
# Max simultaneous in-flight requests per provider (enforced via _sem_for).
PER_PROVIDER_CONCURRENCY = {
    "openai": 500,
    "anthropic": 25, # 90k tok/min cap; 25 in flight keeps us comfortably under
    "google": 300,
}
# Retry/backoff knobs used by call_model.
DEFAULT_RETRIES = 6
DEFAULT_BASE_TIMEOUT = 300.0  # seconds; also used as the OpenAI HTTP read timeout
RATE_LIMIT_BACKOFF_SECONDS = 60.0 # min sleep on rate limit hits
# ---------- Solver / grader prompts (consistent with paper) ----------
# System prompt sent to every solver model (used by solve()).
SOLVER_SYSTEM_PROMPT = """You are an expert mathematician solving competition-level problems.
Provide detailed, step-by-step solutions with clear mathematical reasoning.
Requirements:
- Show all your work and intermediate steps
- Justify each major step of your reasoning
- Use proper mathematical notation
- Be thorough but concise
- State your final answer clearly
Solve the problem completely and rigorously."""
# Strict grader rubric, selected when problem_type == "proof" (see grade()).
PROOF_GRADER_SYSTEM_PROMPT = """You are an extremely strict mathematical grader evaluating competition-level PROOF problems.
GRADING STANDARDS (BE VERY STRICT):
- Mathematical rigor: Every step must be mathematically sound and justified
- Logical flow: The reasoning must be clear, complete, and logically connected
- Correctness: All calculations, algebraic manipulations, and conclusions must be correct
- Completeness: The solution must address all parts of the problem fully
- Precision: Mathematical statements must be precise and unambiguous
FAILING CRITERIA (Mark as INCORRECT if ANY of these apply):
- Any unjustified logical leap or gap in reasoning
- Any computational error, no matter how small
- Missing steps in critical parts of the argument
- Imprecise or ambiguous mathematical statements
- Incorrect final answer, even if approach is partially correct
- Circular reasoning or logical fallacies
- Misuse of mathematical theorems or definitions
BE EXTREMELY STRICT. Competition mathematics proofs require perfect precision."""
# Answer-focused rubric used for every non-proof problem type (see grade()).
CALCULATION_GRADER_SYSTEM_PROMPT = """You are a mathematical grader evaluating competition-level CALCULATION problems.
GRADING STANDARDS FOR CALCULATION PROBLEMS:
- Primary focus: Is the final answer correct?
- Secondary focus: Is the overall approach reasonable and mathematically sound?
- Computation: Allow minor computational slips if the method is correct and final answer is right
GRADING CRITERIA:
- CORRECT: Final answer is correct AND approach is fundamentally sound
- INCORRECT: Final answer is wrong OR approach is fundamentally flawed
For calculation problems, the final numerical answer is the most important criterion.
Minor intermediate errors are acceptable if they don't affect the final result."""
# User-message template for proof grading; .format() fills the three slots.
# Doubled braces {{...}} keep the JSON schema literal through .format().
PROOF_GRADER_USER_TEMPLATE = """Grade this PROOF solution with extreme strictness.
PROBLEM:
{problem_statement}
STUDENT SOLUTION:
{solution}
CORRECT REFERENCE SOLUTION:
{reference_solution}
Evaluate with maximum strictness. Every logical step must be perfect. Return JSON with:
{{"grade": "CORRECT" or "INCORRECT",
"detailed_feedback": "specific detailed analysis of what is right/wrong",
"major_issues": "list of significant mathematical errors or gaps",
"final_answer_correct": true or false,
"reasoning_rigor_score": 0-10 integer (10=perfect rigor, 0=severely flawed),
"overall_assessment": "comprehensive evaluation summary"}}"""
# User-message template for calculation grading; same JSON schema as the proof one.
CALCULATION_GRADER_USER_TEMPLATE = """Grade this CALCULATION solution with focus on final answer correctness.
PROBLEM:
{problem_statement}
STUDENT SOLUTION:
{solution}
CORRECT REFERENCE SOLUTION:
{reference_solution}
Focus primarily on whether the final answer is correct. Return JSON with:
{{"grade": "CORRECT" or "INCORRECT",
"detailed_feedback": "specific detailed analysis of what is right/wrong",
"major_issues": "list of significant mathematical errors or gaps",
"final_answer_correct": true or false,
"reasoning_rigor_score": 0-10 integer (10=perfect rigor, 0=severely flawed),
"overall_assessment": "comprehensive evaluation summary"}}"""
# ---------- Lazy client builders ----------
# Module-level singletons, created on first use so importing this module
# does not require the provider SDKs or API keys to be present.
_openai_client = None
_anthropic_client = None
_google_client = None
def _get_openai_client():
    """Return the shared AsyncOpenAI client, creating it on first use.

    Uses a custom httpx client so connection-pool limits and timeouts
    match the high per-provider concurrency this module runs at.
    """
    global _openai_client
    if _openai_client is not None:
        return _openai_client
    from openai import AsyncOpenAI
    import httpx
    pool = httpx.Limits(max_connections=2000, max_keepalive_connections=1000)
    timeouts = httpx.Timeout(timeout=DEFAULT_BASE_TIMEOUT, connect=30.0,
                             read=DEFAULT_BASE_TIMEOUT, write=30.0)
    _openai_client = AsyncOpenAI(
        http_client=httpx.AsyncClient(limits=pool, timeout=timeouts))
    return _openai_client
def _get_anthropic_client():
    """Return the shared AsyncAnthropic client, creating it on first use."""
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client
    from anthropic import AsyncAnthropic
    _anthropic_client = AsyncAnthropic()
    return _anthropic_client
def _get_google_client():
    """Return the shared google-genai client, creating it on first use.

    Requires the GOOGLE_API_KEY environment variable; raises KeyError if unset.
    """
    global _google_client
    if _google_client is not None:
        return _google_client
    from google import genai
    _google_client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    return _google_client
# ---------- Per-provider call functions ----------
async def _call_openai(model: str, system: str, user: str,
                       temperature: float, max_tokens: int = 16000) -> dict:
    """Call the OpenAI chat-completions API and return the unified result dict.

    Non-o-series models are forced into JSON mode via response_format;
    exceptions propagate to the caller (call_model handles retries).
    """
    client = _get_openai_client()
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    params = {"model": model, "messages": conversation, "max_tokens": max_tokens}
    lowered = model.lower()
    # o-series models force temperature=1 and don't accept max_tokens
    if "o1" in lowered or "o3" in lowered or "o4" in lowered:
        del params["max_tokens"]
        params["temperature"] = 1.0
    else:
        params["temperature"] = temperature
        params["response_format"] = {"type": "json_object"}
    resp = await client.chat.completions.create(**params)
    return {"status": "success",
            "content": resp.choices[0].message.content or "",
            "error": None}
async def _call_anthropic(model: str, system: str, user: str,
                          temperature: float, max_tokens: int = 16000) -> dict:
    """Call the Anthropic messages API and return the unified result dict.

    Concatenates the text of every text-bearing content block; exceptions
    propagate to the caller (call_model handles retries).
    """
    client = _get_anthropic_client()
    resp = await client.messages.create(
        model=model,
        system=system,
        messages=[{"role": "user", "content": user}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    blocks = resp.content or []
    text = "".join(b.text for b in blocks if hasattr(b, "text"))
    return {"status": "success", "content": text, "error": None}
async def _call_google(model: str, system: str, user: str,
                       temperature: float, max_tokens: int = 16000) -> dict:
    """Call the google-genai API and return the unified result dict.

    Requests application/json output; exceptions propagate to the caller
    (call_model handles retries).
    """
    from google.genai.types import GenerateContentConfig
    cfg = GenerateContentConfig(
        system_instruction=system,
        temperature=temperature,
        max_output_tokens=max_tokens,
        response_mime_type="application/json",
    )
    client = _get_google_client()
    resp = await client.aio.models.generate_content(
        model=model, contents=user, config=cfg,
    )
    return {"status": "success", "content": resp.text or "", "error": None}
# ---------- Unified caller with retries and per-provider semaphore ----------
# Lazily-created, shared per-provider semaphores keyed by provider name.
_provider_sems: dict = {}

def _sem_for(provider: str) -> asyncio.Semaphore:
    """Return the shared concurrency semaphore for *provider*, creating it
    (sized from PER_PROVIDER_CONCURRENCY) on first request."""
    sem = _provider_sems.get(provider)
    if sem is None:
        sem = asyncio.Semaphore(PER_PROVIDER_CONCURRENCY[provider])
        _provider_sems[provider] = sem
    return sem
async def call_model(model_short: str, system: str, user: str,
                     temperature: float = 0.0, max_tokens: int = 16000,
                     retries: int = DEFAULT_RETRIES) -> dict:
    """Call any supported model by short alias. Includes retries.

    Resolves the alias to a (provider, api_model) pair, acquires the
    provider's concurrency semaphore, then retries transient failures
    with jittered exponential backoff (a longer fixed backoff for
    rate-limit-style errors).

    Args:
        model_short: A SOLVER_PROVIDERS key, or GRADER_MODEL.
        system: System prompt text.
        user: User message text.
        temperature: Sampling temperature (o-series OpenAI models override it).
        max_tokens: Completion token cap passed to the provider.
        retries: Number of attempts before giving up.

    Returns:
        {"status": "success"|"failed", "content": str, "error": str|None}

    Raises:
        KeyError: If model_short is not a known alias.
    """
    if model_short == GRADER_MODEL:
        provider = GRADER_PROVIDER
        api_model = GRADER_MODEL
    else:
        provider = SOLVER_PROVIDERS[model_short]
        api_model = API_MODEL_NAMES[model_short]
    sem = _sem_for(provider)
    # NOTE: the semaphore is held across retries (including backoff sleeps),
    # so a struggling provider sheds load instead of admitting new requests.
    async with sem:
        last_err = None
        for attempt in range(retries):
            try:
                if provider == "openai":
                    return await _call_openai(api_model, system, user, temperature, max_tokens)
                elif provider == "anthropic":
                    return await _call_anthropic(api_model, system, user, temperature, max_tokens)
                elif provider == "google":
                    return await _call_google(api_model, system, user, temperature, max_tokens)
                else:
                    return {"status": "failed", "content": "",
                            "error": f"unknown provider {provider}"}
            except Exception as e:
                last_err = e
                # BUGFIX: don't sleep after the final attempt — the original
                # waited a full backoff (up to ~70s on rate limits) before
                # returning the failure.
                if attempt == retries - 1:
                    break
                err_str = str(e).lower()
                # Longer backoff for rate-limit-style errors so the per-minute
                # window has time to refill.
                if "rate_limit" in err_str or "429" in err_str or "rate limit" in err_str:
                    await asyncio.sleep(RATE_LIMIT_BACKOFF_SECONDS + random.random() * 10)
                else:
                    await asyncio.sleep(min(2 ** attempt + random.random(), 30))
    if last_err is None:
        # retries <= 0: no attempt was ever made; report that instead of
        # the misleading "NoneType: None" the original produced.
        return {"status": "failed", "content": "",
                "error": "no attempts made (retries <= 0)"}
    return {"status": "failed", "content": "",
            "error": f"{type(last_err).__name__}: {str(last_err)[:300]}"}
# ---------- High-level helpers ----------
async def solve(model_short: str, problem_user_msg: str) -> dict:
    """Run the solver. The user message already contains problem + any prefix.

    Always samples at temperature 0 with the shared solver system prompt.
    """
    result = await call_model(model_short, SOLVER_SYSTEM_PROMPT,
                              problem_user_msg, temperature=0.0)
    return result
async def grade(problem_type: str, problem_statement: str,
                solution: str, reference_solution: str) -> dict:
    """Run the grader (gpt-4o).

    "proof" problems get the strict proof rubric; every other problem_type
    falls through to the calculation rubric.
    """
    if problem_type == "proof":
        grader_system, grader_template = (PROOF_GRADER_SYSTEM_PROMPT,
                                          PROOF_GRADER_USER_TEMPLATE)
    else:
        grader_system, grader_template = (CALCULATION_GRADER_SYSTEM_PROMPT,
                                          CALCULATION_GRADER_USER_TEMPLATE)
    filled = grader_template.format(problem_statement=problem_statement,
                                    solution=solution,
                                    reference_solution=reference_solution)
    return await call_model(GRADER_MODEL, grader_system, filled, temperature=0.0)
def parse_solution(content: str) -> dict:
    """Parse JSON {solution, final_answer} from model output, with tolerance.

    First tries the whole string as JSON, then falls back to the outermost
    brace-delimited substring. If nothing parses, the raw content is kept
    as the solution and _parse_error explains why.
    """
    if not content:
        return {"solution": "", "final_answer": "", "_parse_error": "empty"}

    def _fields(obj) -> dict:
        # Raises AttributeError for non-dict JSON (e.g. a bare list),
        # which the callers below treat as a parse failure.
        return {"solution": obj.get("solution", ""),
                "final_answer": obj.get("final_answer", ""),
                "_parse_error": None}

    try:
        return _fields(json.loads(content))
    except Exception:
        pass
    # Fall back: extract a JSON object substring.
    import re
    found = re.search(r"\{.*\}", content, re.DOTALL)
    if found is None:
        return {"solution": content, "final_answer": "",
                "_parse_error": "no JSON object found"}
    try:
        return _fields(json.loads(found.group(0)))
    except Exception as e:
        return {"solution": content, "final_answer": "",
                "_parse_error": f"json parse: {e}"}
def parse_grade(content: str) -> dict:
    """Parse JSON grade output into a normalized dict.

    Any grade other than an explicit CORRECT/INCORRECT fails closed to
    INCORRECT. Falls back to the outermost brace-delimited substring when
    the whole string is not valid JSON.
    """
    if not content:
        return {"grade": "INCORRECT", "_parse_error": "empty"}

    def _normalize(d) -> dict:
        # Normalize grade; unknown labels fail closed to INCORRECT.
        label = (d.get("grade") or "").strip().upper()
        return {
            "grade": label if label in ("CORRECT", "INCORRECT") else "INCORRECT",
            "final_answer_correct": d.get("final_answer_correct"),
            "detailed_feedback": d.get("detailed_feedback", ""),
            "_parse_error": None,
        }

    try:
        return _normalize(json.loads(content))
    except Exception:
        pass
    import re
    found = re.search(r"\{.*\}", content, re.DOTALL)
    if found is None:
        return {"grade": "INCORRECT", "_parse_error": "no JSON object found"}
    try:
        return _normalize(json.loads(found.group(0)))
    except Exception as e:
        return {"grade": "INCORRECT", "_parse_error": f"json parse: {e}"}
# ---------- Standalone health check ----------
async def _health_check():
    """Smoke-test one solver per provider plus the grader, printing results."""
    print("Running health checks ...")
    probe = 'Reply with JSON {"status": "ok"} only.'
    solver_aliases = ["gpt-4o-mini", "claude-sonnet-4", "gemini-2.5-flash"]
    for short in solver_aliases:
        r = await call_model(short, "You are a test. Reply only the requested JSON.",
                             probe, temperature=0.0, max_tokens=200, retries=2)
        print(f" {short}: {r['status']} - {r['content'][:200]!r} err={r['error']}")
    # Grader
    r = await call_model(GRADER_MODEL, "You are a test.", probe, temperature=0.0,
                         max_tokens=200, retries=2)
    print(f" {GRADER_MODEL} (grader): {r['status']} - {r['content'][:200]!r} err={r['error']}")
if __name__ == "__main__":
    # Manual smoke test: run this file directly with provider API keys set.
    asyncio.run(_health_check())
|