#!/usr/bin/env python3
"""
KV redo: Re-run slot discovery with o3 on no_slots problems, then evaluate all accepted.
"""
import json, asyncio, random, re, os
from datasets import load_dataset
from openai import AsyncOpenAI

# Shared async OpenAI client, constructed with no arguments
# (presumably picks up credentials from the environment — confirm).
client = AsyncOpenAI()
# Concurrency caps: api_call() takes SEM_O3 when model == "o3" and SEM_EVAL
# for every other model; evaluate_all() also uses SEM_EVAL.
# NOTE(review): these asyncio primitives are created at import time, before
# asyncio.run() starts its loop. On Python < 3.10 that can bind them to a
# different event loop — confirm the interpreter version used to run this.
SEM_O3 = asyncio.Semaphore(3)
SEM_EVAL = asyncio.Semaphore(40)
# Fixed seed so random.shuffle() in main() yields a reproducible ordering.
random.seed(42)

# Directory holding the previous kv_200 results and this script's outputs.
OUTPUT_DIR = '/home/yurenh2/gap/mini_gap_math_results/kv_200'
# Redo results file: read by main() to resume, rewritten by save().
REDO_FILE = os.path.join(OUTPUT_DIR, 'kv_redo.json')
# Serializes the append-and-write step inside save().
LOCK = asyncio.Lock()

# Prompts
# Each constant below is sent as the "system" message of one pipeline stage
# in process_one().

# Stage 1 (slot discovery): asks for a JSON object with a "mutable_slots" list.
SLOT_DISCOVERY_O3 = """You are a world-class mathematician. Given a math problem and its reference solution, find ALL numerical constants, coefficients, parameters, or specific values that could be changed to create a structurally equivalent but numerically different problem.

Be AGGRESSIVE in finding slots. Even if a value seems "natural" (like 2, 3, etc.), if changing it to another value would still yield a solvable problem with the same solution technique, list it.

Examples of mutable slots:
- Coefficients in equations (2x+3 → 5x+7)
- Exponents (x^3 → x^5)
- Bounds or limits (sum from 1 to 100 → sum from 1 to 200)
- Specific numbers in word problems
- Dimensions or sizes
- Modular bases

Return ONLY valid JSON:
{"mutable_slots": [{"value": "...", "role": "...", "constraints": "..."}, ...]}
If truly no slots exist (every constant is mathematically forced), return: {"mutable_slots": []}"""

# Stage 2 (back-synthesis): asks for a rewritten problem, its solution,
# its answer and the list of slot changes, all as one JSON object.
BACK_SYNTHESIS = """You are creating a mathematical variant. Given the original problem, solution, and mutable slots:
- Choose NEW values satisfying constraints
- Rewrite the full problem with new values
- Solve it completely step by step
- The solution MUST use the same mathematical technique

Return ONLY valid JSON:
{"new_problem": "...", "new_solution": "...", "new_answer": "...", "slot_changes": [{"original": "...", "new": "..."}]}"""

# Stage 3 (judging): each judge is asked for a "VERDICT:" line followed by a
# "REASON:" line; process_one() parses both out of the reply text.
VERIFY = """You are a rigorous mathematical verifier. Check:
1. Is the problem well-defined?
2. Is every solution step correct?
3. Does it reach the stated answer?

Reply EXACTLY: VERDICT: ACCEPT or VERDICT: REJECT
REASON: [explanation]"""

# Stage 3 (repair): a str.format() template with {problem}, {solution} and
# {reason} fields. The doubled braces are escapes that render as the literal
# braces of the JSON example.
REPAIR = """Fix this rejected variant.
Problem: {problem}
Solution: {solution}
Reason: {reason}
Return ONLY JSON: {{"new_problem": "...", "new_solution": "...", "new_answer": "..."}}"""

def extract_json(text):
    """Extract a JSON object from a model reply.

    Three candidates are tried in order and the first one that parses to a
    JSON object (a ``dict``) wins:

    1. the whole text,
    2. the contents of a fenced code block (optionally tagged ``json``),
    3. the span from the first ``{`` to the last ``}`` in the text.

    Args:
        text: Raw reply text; may be ``None`` or empty.

    Returns:
        The parsed ``dict``, or ``None`` when no candidate yields one.
        Valid JSON that is not an object (list, string, number, ...) is
        skipped, because the callers in this file access the result with
        ``.get``.
    """
    if not text:
        return None

    def _candidates():
        # Lazy on purpose: the regex searches only run when the cheaper
        # whole-text parse did not already produce an object.
        yield text
        fenced = re.search(r'```(?:json)?\s*(\{[\s\S]*?\})\s*```', text)
        if fenced:
            yield fenced.group(1)
        braced = re.search(r'\{[\s\S]*\}', text)
        if braced:
            yield braced.group()

    for candidate in _candidates():
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError. Catching only parse
            # failures lets KeyboardInterrupt / SystemExit propagate, which
            # a bare ``except:`` would have swallowed.
            continue
        if isinstance(parsed, dict):
            return parsed
    return None

async def api_call(messages, model="o3", max_tokens=4000):
    """Send one chat-completion request, retrying on failure.

    Args:
        messages: Chat messages passed through to the API unchanged.
        model: Model name. ``"o3"`` is throttled by ``SEM_O3`` and sends the
            token cap as ``max_completion_tokens``; any other model is
            throttled by ``SEM_EVAL`` and sends ``max_tokens`` together with
            ``temperature=0``.
        max_tokens: Token cap for the request.

    Returns:
        The content of the first choice, or ``None`` once all five attempts
        have failed.
    """
    is_o3 = model == "o3"
    sem = SEM_O3 if is_o3 else SEM_EVAL
    max_attempts = 5
    async with sem:
        # The semaphore slot stays held across the backoff sleeps, so a
        # failing request keeps occupying one concurrency slot while it waits.
        for attempt in range(max_attempts):
            try:
                kw = {"model": model, "messages": messages}
                if is_o3:
                    kw["max_completion_tokens"] = max_tokens
                else:
                    kw["max_tokens"] = max_tokens
                    kw["temperature"] = 0
                r = await client.chat.completions.create(**kw)
                return r.choices[0].message.content
            except Exception as e:
                # Report the failure instead of dropping it silently, so a
                # persistent problem is visible in the run log.
                print(f"[api_call] {model} attempt {attempt + 1}/{max_attempts} "
                      f"failed: {type(e).__name__}: {e}")
                if attempt == max_attempts - 1:
                    return None
                # Exponential backoff: 3s, 6s, 12s, 24s (capped at 60s).
                await asyncio.sleep(min(60, (2 ** attempt) * 3))
    return None

async def save(result, results_list):
    """Append ``result`` to ``results_list`` and persist the list to REDO_FILE.

    The list is written to a sibling temporary file first and then moved over
    REDO_FILE with ``os.replace``, so an interruption in the middle of a write
    cannot leave a truncated JSON file behind for the next resume.

    Args:
        result: JSON-serializable record for one processed problem.
        results_list: Shared list of records; mutated in place.
    """
    async with LOCK:
        results_list.append(result)
        tmp_path = REDO_FILE + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(results_list, f)
        os.replace(tmp_path, REDO_FILE)

def _judge_accepts(reply):
    """Return True when a verifier reply carries an ACCEPT verdict.

    An explicit ``VERDICT: ...`` marker decides the outcome when present; if
    the reply contains several markers, all of them must say ACCEPT. This
    keeps the word "reject" inside the free-text reason from overturning an
    accept. Replies without any marker fall back to the plain substring
    check (contains ACCEPT and no REJECT).
    """
    if not reply:
        return False
    verdicts = re.findall(r'VERDICT\W*(ACCEPT|REJECT)', reply, re.I)
    if verdicts:
        return all(v.upper() == 'ACCEPT' for v in verdicts)
    upper = reply.upper()
    return 'ACCEPT' in upper and 'REJECT' not in upper


async def process_one(problem, solution, idx, results_list):
    """Run slot discovery, back-synthesis and judging for one problem.

    Exactly one record is saved through ``save`` per call, with ``status``
    set to ``no_slots``, ``error``, ``accepted`` or ``rejected``.

    Args:
        problem: Original problem text.
        solution: Reference solution text.
        idx: Index of the problem; stored in the record and used in logs.
        results_list: Shared list of records, forwarded to ``save``.
    """
    # Stage 1: o3 slot discovery
    slot_text = await api_call(
        [{"role": "system", "content": SLOT_DISCOVERY_O3},
         {"role": "user", "content": f"Problem:\n{problem}\n\nSolution:\n{solution}"}],
        model="o3", max_tokens=2000)
    slots = extract_json(slot_text) if slot_text else None
    # Guard against replies that parse to something other than an object.
    mutable_slots = slots.get('mutable_slots') if isinstance(slots, dict) else None
    if not mutable_slots:
        await save({'status': 'no_slots', 'idx': idx}, results_list)
        print(f"[{idx}] no_slots (o3)")
        return

    n = len(mutable_slots)

    # Stage 2: o3 back-synthesis
    synth_text = await api_call(
        [{"role": "system", "content": BACK_SYNTHESIS},
         {"role": "user", "content": f"Original:\n{problem}\n\nSolution:\n{solution}\n\nSlots:\n{json.dumps(mutable_slots)}\n\nCreate variant."}],
        model="o3", max_tokens=6000)
    synth = extract_json(synth_text) if synth_text else None
    # A variant without a solution cannot be judged, so a missing
    # 'new_solution' is a synthesis failure rather than a KeyError.
    if (not isinstance(synth, dict)
            or not synth.get('new_problem')
            or not synth.get('new_solution')):
        await save({'status': 'error', 'idx': idx, 'reason': 'synthesis failed'}, results_list)
        print(f"[{idx}] synth_error")
        return

    new_p, new_s, new_a = synth['new_problem'], synth['new_solution'], synth.get('new_answer', '')

    # Stage 3: 3 judges + 3 repair rounds (round 0 judges the unrepaired variant)
    for rr in range(4):
        # NOTE(review): for "o3" api_call sends this cap as
        # max_completion_tokens; confirm 500 leaves room for a visible verdict.
        judges = await asyncio.gather(*[api_call(
            [{"role": "system", "content": VERIFY},
             {"role": "user", "content": f"Problem:\n{new_p}\n\nSolution:\n{new_s}"}],
            model="o3", max_tokens=500) for _ in range(3)])
        accepts = sum(1 for j in judges if _judge_accepts(j))
        if accepts == 3:
            await save({
                'status': 'accepted', 'idx': idx,
                'original_problem': problem, 'original_solution': solution,
                'kv_problem': new_p, 'kv_solution': new_s, 'kv_answer': new_a,
                'mutable_slots': mutable_slots,
                'slot_changes': synth.get('slot_changes', []),
                'repair_rounds': rr, 'n_slots': n,
            }, results_list)
            print(f"[{idx}] ACCEPTED (round {rr}, {n} slots)")
            return
        if rr < 3:
            # Collect the first line of each judge's REASON, truncated so the
            # repair prompt stays short.
            reasons = [re.search(r'REASON:\s*(.*)', j or '', re.I) for j in judges]
            reason_str = '; '.join(m.group(1)[:200] for m in reasons if m)[:500]
            fix = await api_call(
                [{"role": "system", "content": REPAIR.format(problem=new_p, solution=new_s, reason=reason_str)},
                 {"role": "user", "content": "Fix."}],
                model="o3", max_tokens=6000)
            fd = extract_json(fix) if fix else None
            if isinstance(fd, dict):
                # Keep the previous text whenever the repair omits a field or
                # returns it empty/null.
                new_p = fd.get('new_problem') or new_p
                new_s = fd.get('new_solution') or new_s
                new_a = fd.get('new_answer') or new_a

    await save({'status': 'rejected', 'idx': idx}, results_list)
    print(f"[{idx}] REJECTED")

def extract_boxed(text):
    r"""Return the contents of the last balanced ``\boxed{...}`` in ``text``.

    Braces are matched by depth, so nested groups such as
    ``\boxed{\frac{1}{2}}`` are returned whole. A ``\boxed{`` nested inside
    an already matched box is not reported separately.

    An opening ``\boxed{`` that is never closed is skipped and scanning
    resumes right after it, so it cannot hide later well-formed boxes.

    Args:
        text: Text to scan; may be ``None`` or empty.

    Returns:
        The stripped contents of the last balanced box, or ``None`` when
        there is none.
    """
    if not text:
        return None
    marker = '\\boxed{'
    last = None
    pos = 0
    while True:
        start = text.find(marker, pos)
        if start == -1:
            break
        body_start = start + len(marker)
        depth = 1
        j = body_start
        while j < len(text) and depth > 0:
            ch = text[j]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            j += 1
        if depth == 0:
            # j sits one past the closing brace of this box.
            last = text[body_start:j - 1].strip()
            pos = j
        else:
            # Unbalanced box: resume just past its marker rather than at the
            # end of the text, so later boxes are still found.
            pos = body_start
    return last

async def evaluate_all(all_accepted):
    """Measure solver accuracy on original problems versus their KV variants.

    For each of 'gpt-4o' and 'gpt-4o-mini', every original and variant
    problem is solved, the last boxed answer is extracted from the reply, and
    a 'gpt-4o' grader compares it with the reference answer.

    All requests go through ``api_call``, so they share its retry/backoff and
    its ``SEM_EVAL`` throttling. A request that still fails after the retries
    is counted as incorrect and reported in a warning line.

    Args:
        all_accepted: Records that provide 'original_problem',
            'original_solution' and 'kv_problem', and optionally 'kv_answer'
            and 'kv_solution'.

    Returns:
        Dict keyed by model name with 'orig' and 'kv' accuracy in percent,
        'delta' (kv - orig), 'n', and the raw correct counts 'orig_c' and
        'kv_c'. An empty dict when ``all_accepted`` is empty.
    """
    if not all_accepted:
        # Nothing to score; also avoids dividing by zero below.
        return {}

    async def solve(problem, model):
        return await api_call(
            [{"role": "system", "content": "Solve step by step. Final answer in \\boxed{}."},
             {"role": "user", "content": problem}],
            model=model, max_tokens=2048)

    async def grade(ref, stu):
        # A missing reference or student answer is scored as incorrect
        # without spending a grader call.
        if not (ref and stu):
            return False
        reply = await api_call(
            [{"role": "user", "content": f"Are these equivalent? Ref: {ref}\nStudent: {stu}\nCORRECT or INCORRECT."}],
            model="gpt-4o", max_tokens=10)
        if not reply:
            return False
        t = reply.upper()
        # 'INCORRECT' contains 'CORRECT', so rule it out first.
        return 'INCORRECT' not in t and 'CORRECT' in t

    results = {}
    for model in ['gpt-4o', 'gpt-4o-mini']:
        print(f"\nEval {len(all_accepted)} with {model}...")
        orig_sols = await asyncio.gather(*[solve(a['original_problem'], model) for a in all_accepted])
        kv_sols = await asyncio.gather(*[solve(a['kv_problem'], model) for a in all_accepted])

        missing = sum(1 for s in list(orig_sols) + list(kv_sols) if s is None)
        if missing:
            print(f"  WARNING: {missing} solve call(s) returned no text for {model}; counted as incorrect")

        orig_pairs, kv_pairs = [], []
        for a, orig_text, kv_text in zip(all_accepted, orig_sols, kv_sols):
            ro = extract_boxed(a['original_solution'])
            so = extract_boxed(orig_text)
            # Prefer the stored variant answer; fall back to the boxed value
            # in the variant's solution.
            rk = a.get('kv_answer') or extract_boxed(a.get('kv_solution', ''))
            sk = extract_boxed(kv_text)
            orig_pairs.append((ro, so))
            kv_pairs.append((rk, sk))

        # Grade concurrently; gather keeps results in input order.
        og = await asyncio.gather(*[grade(ref, stu) for ref, stu in orig_pairs])
        kg = await asyncio.gather(*[grade(ref, stu) for ref, stu in kv_pairs])

        oa = sum(og)/len(og)*100; ka = sum(kg)/len(kg)*100
        results[model] = {'orig': oa, 'kv': ka, 'delta': ka-oa, 'n': len(all_accepted),
                          'orig_c': sum(og), 'kv_c': sum(kg)}
        print(f"  {model}: orig={oa:.1f}% kv={ka:.1f}% Δ={ka-oa:+.1f}pp (n={len(all_accepted)})")
    return results

async def main():
    """Redo slot discovery for earlier no_slots problems, then evaluate.

    Steps:
      1. Rebuild the shuffled pool of Level 5 problems and keep the first 200.
      2. Read the previous kv_200 results to find the indices marked
         'no_slots' and the variants already accepted.
      3. Resume from REDO_FILE if present and run ``process_one`` on the
         remaining indices, eight at a time.
      4. Evaluate all accepted variants (kv_50 + kv_200 + redo) and write
         'kv_combined_final.json' into OUTPUT_DIR.
    """
    from collections import Counter  # only needed for the progress summary

    # Load all Level 5 problems (same seed as kv_math_200.py)
    subsets = ['algebra', 'number_theory', 'precalculus', 'intermediate_algebra', 'counting_and_probability', 'geometry']
    all_l5 = []
    for s in subsets:
        ds = load_dataset('EleutherAI/hendrycks_math', s, split='test')
        for item in ds:
            if item.get('level') == 'Level 5' and len(item.get('solution','')) > 50:
                item['subject'] = s; all_l5.append(item)
    # Order depends on the module-level random.seed(42).
    random.shuffle(all_l5)
    selected = all_l5[:200]
    print(f"Total pool: {len(selected)} Level 5 problems")

    # Load previous kv_200 results to find no_slots indices
    with open(os.path.join(OUTPUT_DIR, 'kv_generation.json')) as f:
        prev = json.load(f)
    no_slots_indices = [r['original_index'] for r in prev if r['status'] == 'no_slots']
    prev_accepted = [r for r in prev if r['status'] == 'accepted']
    print(f"Previous: {len(prev_accepted)} accepted, {len(no_slots_indices)} no_slots to redo with o3")

    # Also load kv_50 accepted
    kv50_file = '/home/yurenh2/gap/mini_gap_math_results/kv_50/kv_final_results.json'
    kv50_accepted = []
    if os.path.exists(kv50_file):
        with open(kv50_file) as f:
            kv50 = json.load(f)
        kv50_accepted = kv50.get('accepted_variants', [])
        print(f"kv_50 accepted: {len(kv50_accepted)}")

    # Resume redo progress
    redo_results = []
    done_indices = set()
    if os.path.exists(REDO_FILE):
        with open(REDO_FILE) as f:
            redo_results = json.load(f)
        # Any index with a saved record counts as done, whatever its status.
        done_indices = {r['idx'] for r in redo_results}
        print(f"Resuming redo: {len(redo_results)} done")

    remaining = [i for i in no_slots_indices if i not in done_indices]
    print(f"Remaining to redo: {len(remaining)}")

    # Process in batches of 8
    for batch_start in range(0, len(remaining), 8):
        batch = remaining[batch_start:batch_start+8]
        tasks = [process_one(selected[i]['problem'], selected[i]['solution'], i, redo_results) for i in batch]
        # return_exceptions=True keeps one crashing problem from aborting the
        # whole run. A failed index saves no record, so the next resume
        # retries it.
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)
        for i, outcome in zip(batch, outcomes):
            if isinstance(outcome, Exception):
                print(f"[{i}] FAILED: {type(outcome).__name__}: {outcome}")
            elif isinstance(outcome, BaseException):
                # Cancellation and similar signals must not be swallowed.
                raise outcome
        st = Counter(r['status'] for r in redo_results)
        print(f"--- Redo progress: {len(redo_results)}/{len(no_slots_indices)}, {dict(st)} ---")

    # Combine all accepted
    # NOTE(review): evaluate_all() reads 'original_problem',
    # 'original_solution' and 'kv_problem' from every record; the kv_50 and
    # kv_200 records are assumed to carry those keys — TODO confirm.
    redo_accepted = [r for r in redo_results if r['status'] == 'accepted']
    all_accepted = kv50_accepted + prev_accepted + redo_accepted
    print(f"\nTotal accepted: {len(all_accepted)} (kv50={len(kv50_accepted)}, kv200={len(prev_accepted)}, redo={len(redo_accepted)})")

    # Evaluate
    if all_accepted:
        eval_results = await evaluate_all(all_accepted)
        final = {
            'total_accepted': len(all_accepted),
            'sources': {'kv50': len(kv50_accepted), 'kv200': len(prev_accepted), 'redo': len(redo_accepted)},
            'evaluation': eval_results,
        }
        with open(os.path.join(OUTPUT_DIR, 'kv_combined_final.json'), 'w') as f:
            json.dump(final, f, indent=2)
        print(f"\n{'='*60}\nFINAL COMBINED RESULTS ({len(all_accepted)} KV variants)\n{'='*60}")
        for m, r in eval_results.items():
            print(f"  {m}: orig={r['orig']:.1f}% kv={r['kv']:.1f}% Δ={r['delta']:+.1f}pp (n={r['n']})")

if __name__ == "__main__":
    # Run the pipeline only when executed as a script, so importing this
    # module (e.g. to reuse extract_json or extract_boxed) does not start it.
    asyncio.run(main())