summaryrefslogtreecommitdiff
path: root/putnam-bench-anon/calibrate_to_o3.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
commit05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /putnam-bench-anon/calibrate_to_o3.py
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation - Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM - Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction - Unicode -> bare-LaTeX cleaner + audit + spot-check - Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'putnam-bench-anon/calibrate_to_o3.py')
-rw-r--r--putnam-bench-anon/calibrate_to_o3.py335
1 files changed, 335 insertions, 0 deletions
diff --git a/putnam-bench-anon/calibrate_to_o3.py b/putnam-bench-anon/calibrate_to_o3.py
new file mode 100644
index 0000000..d2373cc
--- /dev/null
+++ b/putnam-bench-anon/calibrate_to_o3.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""
+calibrate_to_o3.py – End-to-end pipeline that
+1. ingests existing o4-mini grading results for multiple models,
+2. draws a budget-constrained stratified sample,
+3. (optionally) re-grades those samples with o3 to obtain gold labels,
+4. learns per-stratum error rates and calibrates all o4 labels to the o3 scale,
+5. outputs required artefacts:
+ – sample_list.csv
+ – o3_raw.parquet (only when --run-o3)
+ – calibrated_o3_scores.csv
+
+Run:
+ python calibrate_to_o3.py # stop after sampling only
+ python calibrate_to_o3.py --run-o3 # also call o3 re-grader
+
+"""
+
+from __future__ import annotations
+import argparse
+import asyncio
+import json
+import logging
+import math
+import random
+from pathlib import Path
+from typing import Dict, List, Tuple, Any
+
+import numpy as np
+import pandas as pd
+from scipy.stats import norm
+
+# Third-party library used by --run-o3 mode
+try:
+ from loader.openai_client import OpenAIModelLoader # type: ignore
+except ModuleNotFoundError:
+ OpenAIModelLoader = None # graceful degradation when running sampling-only mode
+
+###############################################################################
+# Constants – adjust here if the budget or cost model ever changes
+###############################################################################
###############################################################################
# Constants – adjust here if the budget or cost model ever changes
###############################################################################
COST_PER_RECORD = 0.154  # USD per o3 grading request
BUDGET_MAX = 800.0  # USD hard cap
N_MAX = math.floor(BUDGET_MAX / COST_PER_RECORD)  # 5194 with default params
SEED = 42  # RNG seed so the stratified sample is reproducible across runs
MIN_PER_LAYER = 10  # floor on samples drawn per (type, diff, o4_score) stratum

############################# Logging setup ###################################
# Configures the root logger once at import time; all pipeline stages log
# through the module-level LOGGER below.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-8s │ %(message)s",
    datefmt="%H:%M:%S",
)
LOGGER = logging.getLogger("calibrate")
+
+###############################################################################
+# Utility functions
+###############################################################################
+
def wilson_ci(k: float, n: int, conf: float = 0.95) -> Tuple[float, float]:
    """Wilson score confidence interval for a binomial proportion.

    ``k`` may be fractional (e.g. calibrated expected successes) out of ``n``
    trials.  Returns the ``(low, high)`` bounds clipped to ``[0, 1]``; an
    empty sample (``n == 0``) yields ``(0.0, 0.0)``.
    """
    if n == 0:
        return 0.0, 0.0
    # Two-sided critical value for the requested confidence level.
    z = norm.ppf(1 - (1 - conf) / 2)
    z2 = z * z
    p = k / n
    shrink = 1 + z2 / n
    midpoint = (p + z2 / (2 * n)) / shrink
    spread = z * math.sqrt((p * (1 - p) + z2 / (4 * n)) / n) / shrink
    low, high = midpoint - spread, midpoint + spread
    return max(0.0, low), min(1.0, high)
+
+
def parse_diff(index: str) -> int:
    """Pull the trailing difficulty digit (1-6) from an index like ``2024-B-6``.

    Returns ``-1`` when the last ``-``-separated segment is not an integer,
    letting callers filter unparseable indices out downstream.
    """
    tail = index.split("-")[-1]
    try:
        return int(tail)
    except (ValueError, IndexError):
        return -1
+
+###############################################################################
+# 1 Load meta-data from dataset/*.json – mapping from problem index to (type,diff)
+###############################################################################
+
def load_dataset_metadata(dataset_dir: Path) -> Dict[str, Tuple[str, int]]:
    """Scan ``dataset_dir/*.json`` and map problem index -> (type, difficulty).

    Files that fail to parse are skipped with a warning; entries lacking an
    index or type, or whose difficulty digit cannot be extracted, are dropped
    silently.
    """
    mapping: Dict[str, Tuple[str, int]] = {}
    for path in sorted(dataset_dir.glob("*.json")):
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            index = payload.get("index")
            problem_type = payload.get("type")
            difficulty = parse_diff(index)
            if index and problem_type and difficulty != -1:
                mapping[index] = (problem_type, difficulty)
        except Exception as exc:
            LOGGER.warning(f"Failed to parse {path}: {exc}")
    LOGGER.info(f"Loaded metadata for {len(mapping):,} problems from dataset")
    return mapping
+
+###############################################################################
+# 2 Load all o4-mini result JSONs into one DataFrame
+###############################################################################
+
def load_o4_results(results_root: Path, meta: Dict[str, Tuple[str, int]]) -> pd.DataFrame:
    """Ingest every ``<model>/*original.json`` result file under *results_root*.

    Returns one row per (problem, model) pair with columns
    ``id, model_id, type, diff, o4_score, student_solution`` where
    ``o4_score`` is 1 iff the o4-mini grader marked the solution CORRECT.
    Problems without an entry in *meta* are dropped; unreadable files are
    skipped with a warning.
    """
    rows: List[Dict[str, Any]] = []
    model_dirs = [d for d in results_root.iterdir() if d.is_dir()]
    for model_dir in model_dirs:
        model_id = model_dir.name
        # consider only *_original.json for uniformity across models
        for fp in model_dir.glob("*original.json"):
            try:
                with fp.open("r", encoding="utf-8") as f:
                    res = json.load(f)
                for pr in res.get("problems", []):
                    idx = pr.get("index")
                    grade_info = pr.get("grade", {})
                    o4_score = int(grade_info.get("grade") == "CORRECT")
                    typ, diff = meta.get(idx, (None, None))
                    if typ is None:
                        continue  # skip problems without meta
                    rows.append({
                        "id": idx,
                        "model_id": model_id,
                        "type": typ,
                        "diff": diff,
                        "o4_score": o4_score,
                        # Extra field used by the optional o3 re-grading step.
                        "student_solution": pr.get("solve", {}).get("solution", ""),
                    })
            except Exception as e:
                LOGGER.warning(f"Failed to process {fp}: {e}")
    # Fix: an empty `rows` list would otherwise produce a column-less frame,
    # making df['model_id'] below raise KeyError. Explicit columns keep the
    # schema stable even with zero ingested rows.
    df = pd.DataFrame(
        rows,
        columns=["id", "model_id", "type", "diff", "o4_score", "student_solution"],
    )
    LOGGER.info(f"Ingested {len(df):,} problem-model pairs across {df['model_id'].nunique()} models")
    return df
+
+###############################################################################
+# 3 Stratified sampling under budget
+###############################################################################
+
def stratified_sample(df: pd.DataFrame) -> pd.DataFrame:
    """Draw a reproducible, budget-capped stratified sample of df's rows.

    Strata are (type, diff, o4_score) cells. Each stratum is allocated a
    target proportional to its share of df (scaled to N_MAX), rounded up and
    floored at MIN_PER_LAYER; if the total overshoots N_MAX, only the slack
    above the floor is shrunk proportionally. Sampling within each stratum is
    uniform without replacement, seeded by SEED.
    """
    rng = np.random.default_rng(SEED)
    group_cols = ["type", "diff", "o4_score"]

    # Compute desired sample sizes per layer
    layer_counts = df.groupby(group_cols, observed=True).size().rename("N_k")
    total_records = len(df)
    # Proportional allocation, rounded up, with a per-stratum floor.
    target_sizes = (
        (layer_counts / total_records * N_MAX).apply(np.ceil).astype(int).clip(lower=MIN_PER_LAYER)
    )

    # If the initial allocation exceeds budget, scale down proportionally (but keep >=MIN_PER_LAYER)
    total_target = target_sizes.sum()
    if total_target > N_MAX:
        LOGGER.info(
            f"Initial allocation {total_target} exceeds N_MAX={N_MAX}. Scaling down proportionally."
        )
        # Rescale only the portion above the floor so every stratum keeps at
        # least MIN_PER_LAYER.
        scaling = (N_MAX - MIN_PER_LAYER * target_sizes.size) / (
            total_target - MIN_PER_LAYER * target_sizes.size
        )
        # NOTE(review): if N_MAX < MIN_PER_LAYER * n_strata, scaling clamps to
        # 0 and the floors alone still exceed the budget — confirm acceptable.
        scaling = max(scaling, 0.0)
        target_sizes = (
            MIN_PER_LAYER
            + np.floor((target_sizes - MIN_PER_LAYER) * scaling).astype(int)
        )
    LOGGER.info(
        f"Final per-stratum sample sizes prepared (sum={target_sizes.sum()}) – within budget"
    )

    # Actual sampling
    samples = []
    for key, group in df.groupby(group_cols, observed=True):
        # Never request more rows than the stratum actually contains.
        n = min(target_sizes.get(key, MIN_PER_LAYER), len(group))
        if n <= 0:
            continue
        sample_idx = rng.choice(group.index.to_numpy(), size=n, replace=False)
        samples.append(df.loc[sample_idx])
    sample_df = pd.concat(samples, ignore_index=True)
    LOGGER.info(f"Sampled {len(sample_df):,} rows in total (<= {N_MAX})")
    return sample_df
+
+###############################################################################
+# 4 Async o3 re-grading helper
+###############################################################################
+
async def grade_with_o3(sample_df: pd.DataFrame, meta: Dict[str, Tuple[str, int]]) -> pd.Series:
    """Returns pd.Series of int o3_score aligned with sample_df.index.

    Re-grades every sampled (problem, solution) pair with the o3 model, at
    most 20 requests in flight at once. Scores: 1 = o3 judged CORRECT,
    0 = judged incorrect, -1 = sentinel for rows that could not be graded
    (missing dataset file/question or an API failure).

    Raises RuntimeError when the OpenAI loader dependency is unavailable.

    NOTE(review): the `meta` parameter is never used in this body — confirm
    whether it can be dropped or is kept for interface stability.
    """
    if OpenAIModelLoader is None:
        raise RuntimeError("OpenAIModelLoader not available. Install dependencies or run without --run-o3.")

    async with OpenAIModelLoader(solver_model="o3", grader_model="o3") as loader:

        async def grade_one(row) -> int:
            # Grade a single sampled row; any failure maps to the -1 sentinel.
            idx = row.id
            question = None
            reference_solution = None
            # load dataset file lazily when needed
            # NOTE(review): reads from the relative "dataset" directory — this
            # assumes the script runs from the repo root; confirm.
            dataset_file = Path("dataset") / f"{idx}.json"
            if dataset_file.exists():
                try:
                    with dataset_file.open("r", encoding="utf-8") as f:
                        data = json.load(f)
                        question = data.get("question", "")
                        reference_solution = data.get("solution", "")
                except Exception:
                    pass
            if not question:
                return -1  # cannot grade without the problem statement
            student_solution = row.student_solution or ""
            try:
                # NOTE(review): assumes grade_solution returns a
                # (grade_dict, extra) pair and accepts these keyword
                # arguments — confirm against OpenAIModelLoader.
                grade_result, _ = await loader.grade_solution(
                    question,
                    student_solution,
                    reference_solution,
                    problem_type="proof",
                    model="o3",
                )
                return int(grade_result.get("grade") == "CORRECT") if grade_result else -1
            except Exception as exc:
                LOGGER.warning(f"o3 grading failed for {idx}: {exc}")
                return -1

        # Cap concurrency at 20 outstanding grading requests.
        sem = asyncio.Semaphore(20)
        async def sem_grade(row):
            async with sem:
                return await grade_one(row)

        tasks = [asyncio.create_task(sem_grade(row)) for _, row in sample_df.iterrows()]
        o3_scores = await asyncio.gather(*tasks)
        return pd.Series(o3_scores, index=sample_df.index, name="o3_score")
+
+###############################################################################
+# 5 Calibration – compute per-stratum error rates and apply
+###############################################################################
+
def compute_error_rates(sample_df: pd.DataFrame) -> pd.DataFrame:
    """Estimate per-(type, diff) disagreement rates between o4 and o3 labels.

    For each stratum:
      p1 = P(o3 = 0 | o4 = 1)  -- o4 false-positive rate
      p0 = P(o3 = 1 | o4 = 0)  -- o4 false-negative rate

    Rows whose o3_score is not 0/1 (e.g. the -1 "could not grade" sentinel)
    are excluded from the denominators. A stratum with no usable observations
    on one side falls back to a 0.10 prior for that rate.

    Returns a DataFrame with columns [type, diff, p1, p0].
    """
    records: List[Dict[str, Any]] = []
    for (typ, diff), grp in sample_df.groupby(["type", "diff"], observed=True):
        # Keep only rows actually graded by o3 (drops the -1 sentinel).
        graded = grp[grp["o3_score"].isin([0, 1])]
        pos = graded[graded["o4_score"] == 1]
        neg = graded[graded["o4_score"] == 0]
        # NaN marks "no data"; replaced by the 0.10 prior below.
        p1 = float((pos["o3_score"] == 0).mean()) if len(pos) else np.nan
        p0 = float((neg["o3_score"] == 1).mean()) if len(neg) else np.nan
        records.append({"type": typ, "diff": diff, "p1": p1, "p0": p0})
    # Fix 1: explicit columns keep the frame well-formed even when a side
    # (p1 or p0) never occurred, which previously raised KeyError in fillna.
    # Fix 2: assign fillna results back instead of `inplace=True` on a
    # selected column — that is chained assignment, deprecated and a silent
    # no-op under pandas copy-on-write.
    errs = pd.DataFrame(records, columns=["type", "diff", "p1", "p0"])
    errs["p1"] = errs["p1"].fillna(0.10)
    errs["p0"] = errs["p0"].fillna(0.10)
    return errs
+
+
def apply_calibration(full_df: pd.DataFrame, err_df: pd.DataFrame) -> pd.Series:
    """Map raw o4 labels to calibrated o3-scale success probabilities.

    For each row: if o4 said CORRECT the estimate is ``1 - p1`` (one minus
    the stratum's false-positive rate), otherwise ``p0`` (the stratum's
    false-negative rate). Strata absent from *err_df* fall back to the 0.10
    prior for both rates.

    Returns a float Series named ``o3_est`` aligned with ``full_df.index``.
    """
    merged = full_df.merge(err_df, on=["type", "diff"], how="left")
    # Fix: assign the filled column back instead of calling fillna with
    # `inplace=True` on the selected Series — that is chained assignment,
    # deprecated and a silent no-op under pandas copy-on-write.
    merged["p1"] = merged["p1"].fillna(0.10)
    merged["p0"] = merged["p0"].fillna(0.10)
    est = np.where(merged.o4_score == 1, 1 - merged.p1, merged.p0)
    return pd.Series(est, index=full_df.index, name="o3_est")
+
+###############################################################################
+# 6 Main entry
+###############################################################################
+
def main():
    """CLI entry point: sample → (optional) o3 re-grade → calibrate → report.

    Always writes sample_list.csv to --output-dir. With --run-o3 it
    additionally produces o3_raw.parquet and calibrated_o3_scores.csv;
    without the flag it stops after sampling, incurring no API cost.
    """
    parser = argparse.ArgumentParser(description="Calibrate o4-mini results to o3 scale")
    parser.add_argument("--run-o3", action="store_true", help="Actually call o3 to grade the sampled pairs")
    parser.add_argument("--output-dir", default="calibration_out", help="Directory to store generated artefacts")
    args = parser.parse_args()

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1 Load meta and results
    # NOTE(review): "dataset" and "results" are relative paths — assumes the
    # script is launched from the repo root; confirm.
    meta = load_dataset_metadata(Path("dataset"))
    full_df = load_o4_results(Path("results"), meta)

    # 2 Sampling
    sample_df = stratified_sample(full_df)
    sample_df.to_csv(out_dir / "sample_list.csv", index=False)

    if args.run_o3:
        LOGGER.info("Starting o3 re-grading – this may incur cost!")
        start = asyncio.run(grade_with_o3(sample_df, meta))
        sample_df["o3_score"] = start
        sample_df.to_parquet(out_dir / "o3_raw.parquet", index=False)
        # NOTE(review): o3_score is an int Series containing -1 sentinels for
        # failed gradings, so notna() is True for every row — the cost
        # estimate counts failures too; confirm this matches actual billing.
        spent = sample_df["o3_score"].notna().sum() * COST_PER_RECORD
        LOGGER.info(f"o3 grading finished. Cost ≈ ${spent:.2f}")
    else:
        LOGGER.info("--run-o3 not provided; skipping API calls and downstream calibration")
        return  # exit early

    # 3 Calibration
    # NOTE(review): rows with o3_score == -1 flow into the error-rate
    # estimation — verify compute_error_rates handles the sentinel.
    err_df = compute_error_rates(sample_df)
    full_df["o3_est"] = apply_calibration(full_df, err_df)

    # 4 Aggregate per model
    agg_rows = []
    for model_id, grp in full_df.groupby("model_id", observed=True):
        mean_est = grp.o3_est.mean()
        n = len(grp)
        # Fractional "success count" feeds the Wilson interval (which accepts
        # non-integer k).
        k_hat = mean_est * n
        ci_low, ci_high = wilson_ci(k_hat, n)
        agg_rows.append({
            "model_id": model_id,
            "mean": mean_est,
            "ci_low": ci_low,
            "ci_high": ci_high,
        })
    agg_df = pd.DataFrame(agg_rows)
    agg_df.to_csv(out_dir / "calibrated_o3_scores.csv", index=False)
    LOGGER.info("Calibration finished. Artefacts saved to %s", out_dir)
+
+
if __name__ == "__main__":
    try:
        main()
    except Exception as fatal:
        # Surface the failure in the log, then re-raise so the process still
        # exits with a traceback and non-zero status.
        LOGGER.error("Fatal error: %s", fatal)
        raise