diff options
Diffstat (limited to 'scripts/audit_results.py')
| -rwxr-xr-x | scripts/audit_results.py | 645 |
1 files changed, 645 insertions, 0 deletions
diff --git a/scripts/audit_results.py b/scripts/audit_results.py new file mode 100755 index 0000000..53d04a1 --- /dev/null +++ b/scripts/audit_results.py @@ -0,0 +1,645 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import math +import statistics +import sys +from collections import defaultdict +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any + + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + + +HIGHER_BETTER = {"accuracy", "ap", "auc", "f1", "mrr", "rocauc"} +LOWER_BETTER = {"mae", "raw-mae", "rmse", "mae_sum"} + + +def metric_direction(metric: str) -> int: + metric = metric.lower() + if metric in HIGHER_BETTER: + return 1 + if metric in LOWER_BETTER or "mae" in metric or "rmse" in metric: + return -1 + return 1 + + +def mean(xs: list[float]) -> float: + return sum(xs) / len(xs) if xs else math.nan + + +def stdev(xs: list[float]) -> float: + return statistics.stdev(xs) if len(xs) > 1 else 0.0 + + +def median(xs: list[float]) -> float: + return statistics.median(xs) if xs else math.nan + + +def fmt(x: float | None, digits: int = 4) -> str: + if x is None: + return "" + if isinstance(x, float) and math.isnan(x): + return "" + return f"{x:.{digits}f}" + + +def read_json(path: Path) -> dict[str, Any] | None: + try: + with path.open() as f: + obj = json.load(f) + except (OSError, json.JSONDecodeError): + return None + return obj if isinstance(obj, dict) else None + + +def score_from_split(rep: dict[str, Any], split: str, metric: str) -> float | None: + value = rep.get(split, {}).get(metric) + return None if value is None else float(value) + + +def ogb_compute_label(rep: dict[str, Any]) -> str: + compute = str(rep["compute"]) + t = int(rep.get("T", -1)) + n_sup = int(rep.get("n_sup", -1)) + if compute == "classic" and t == 0 and n_sup == 1: + label = "classic" + elif compute == "classic" and t == 0: + label = f"view-only-T{t}-ns{n_sup}" + elif compute == "rrog-act": + target = str(rep.get("halt_target", "")) + if target == "loss": + target += f"{float(rep.get('halt_loss_threshold', 0.0) or 0.0):g}" + label = ( + f"rrog-act-{rep.get('act_train_mode', 'stream')}-T{rep.get('T')}-ns{rep.get('n_sup')}" + f"-hm{rep.get('halt_max_steps')}-min{rep.get('halt_min_steps')}" + f"-{target}-lq{float(rep.get('lam_q', 0.0) or 0.0):g}" + f"-hex{float(rep.get('halt_exploration_prob', 0.0) or 0.0):g}" + f"-qw{rep.get('q_warmup_epochs', 0)}" + ) + else: + label = f"{compute}-T{rep.get('T')}-ns{rep.get('n_sup')}" + ema = float(rep.get("ema", 0.0) or 0.0) + if ema > 0: + label += f"+ema{ema:g}" + return label + + +def ogb_compute_family(rep: dict[str, Any]) -> str: + compute = str(rep["compute"]) + t = int(rep.get("T", -1)) + n_sup = int(rep.get("n_sup", -1)) + if compute == "classic" and not (t == 0 and n_sup == 1): + return "view-only" if t == 0 else "classic-nonbaseline" + return compute + + +def zinc_compute_label(rep: dict[str, Any]) -> tuple[str, str]: + t = int(rep.get("T", -1)) + n_sup = int(rep.get("n_sup", -1)) + if rep.get("act"): + return "rrog-act", f"rrog-act-T{t}-ns{n_sup}" + if t == 0 and n_sup == 1: + return "classic", "classic" + if t == 0: + return "view-only", f"view-only-T{t}-ns{n_sup}" + label = f"fixed-rrog-T{t}-ns{n_sup}" + if rep.get("loss_mode") == "trace": + label += "+trace" + return "fixed-rrog", label + + +@dataclass(frozen=True) +class Record: + dataset: str + view: str + compute_family: str + compute_label: str + seed: int + metric: str + direction: int + val: float + test: float + epochs: int + hidden: int + T: int + n_sup: int + ep: int + source: str + adaptive_test: float | None = None + adaptive_steps: float | None = None + fixed_steps: float | None = None + + @property + def key(self) -> tuple[str, str, str, int]: + return (self.dataset, self.view, self.compute_label, self.seed) + + @property + def rank_key(self) -> tuple[int, int, int]: + return (self.epochs, self.hidden, self.ep) + + +def standard_record(path: Path, rep: dict[str, Any]) -> Record | None: + required = {"dataset", "view", "compute", "seed", "metric", "val", "test"} + if not required.issubset(rep): + return None + metric = str(rep["metric"]) + val = score_from_split(rep, "val", metric) + test = score_from_split(rep, "test", metric) + if val is None or test is None: + return None + adaptive_test = score_from_split(rep, "test_adaptive", metric) + return Record( + dataset=str(rep["dataset"]).lower(), + view=str(rep["view"]), + compute_family=ogb_compute_family(rep), + compute_label=ogb_compute_label(rep), + seed=int(rep["seed"]), + metric=metric, + direction=metric_direction(metric), + val=val, + test=test, + adaptive_test=adaptive_test, + adaptive_steps=none_or_float(rep.get("adaptive_steps")), + fixed_steps=none_or_float(rep.get("fixed_steps")), + epochs=int(rep.get("epochs", 0) or 0), + hidden=int(rep.get("hidden", 0) or 0), + T=int(rep.get("T", -1) or -1), + n_sup=int(rep.get("n_sup", -1) or -1), + ep=int(rep.get("ep", 0) or 0), + source=str(path), + ) + + +def zinc_record(path: Path, rep: dict[str, Any]) -> Record | None: + if rep.get("dataset") != "ZINC-cycle56": + return None + if rep.get("K") != 1 or rep.get("select") != "none" or float(rep.get("sigma", 0.0)) != 0.0: + return None + if "val_mae" not in rep or "test_mae" not in rep: + return None + family, label = zinc_compute_label(rep) + val = float(sum(rep["val_mae"])) + test = float(sum(rep["test_mae"])) + return Record( + dataset="zinc-cycle56", + view=str(rep.get("view", "gin")), + compute_family=family, + compute_label=label, + seed=int(rep["seed"]), + metric="mae_sum", + direction=-1, + val=val, + test=test, + epochs=int(rep.get("epochs", 0) or 0), + hidden=int(rep.get("hidden", 0) or 0), + T=int(rep.get("T", -1) or -1), + n_sup=int(rep.get("n_sup", -1) or -1), + ep=int(rep.get("ep", 0) or 0), + source=str(path), + ) + + +def none_or_float(value: Any) -> float | None: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def load_records(runs_dir: Path, min_epochs: int) -> tuple[list[Record], int]: + candidates: dict[tuple[str, str, str, int], Record] = {} + raw_count = 0 + for path in sorted(runs_dir.glob("*.json")): + rep = read_json(path) + if rep is None: + continue + raw_count += 1 + rec = standard_record(path, rep) or zinc_record(path, rep) + if rec is None or rec.epochs < min_epochs: + continue + old = candidates.get(rec.key) + if old is None or rec.rank_key > old.rank_key: + candidates[rec.key] = rec + return sorted(candidates.values(), key=lambda r: (r.dataset, r.view, r.compute_label, r.seed)), raw_count + + +def group_cells(records: list[Record]) -> dict[tuple[str, str, str], list[Record]]: + cells: dict[tuple[str, str, str], list[Record]] = defaultdict(list) + for rec in records: + cells[(rec.dataset, rec.view, rec.compute_label)].append(rec) + return dict(cells) + + +def summarize_records(records: list[Record]) -> dict[str, Any]: + vals = [r.val for r in records] + tests = [r.test for r in records] + adaptive = [r.adaptive_test for r in records if r.adaptive_test is not None] + steps = [r.adaptive_steps for r in records if r.adaptive_steps is not None] + first = records[0] + return { + "dataset": first.dataset, + "view": first.view, + "compute_family": first.compute_family, + "compute_label": first.compute_label, + "metric": first.metric, + "n": len(records), + "seeds": " ".join(str(r.seed) for r in sorted(records, key=lambda x: x.seed)), + "epochs_min": min(r.epochs for r in records), + "epochs_max": max(r.epochs for r in records), + "hidden": first.hidden, + "T": first.T, + "n_sup": first.n_sup, + "val_mean": mean(vals), + "val_std": stdev(vals), + "test_mean": mean(tests), + "test_std": stdev(tests), + "adaptive_test_mean": mean(adaptive) if adaptive else math.nan, + "adaptive_steps_mean": mean(steps) if steps else math.nan, + "sources": " ".join(r.source for r in sorted(records, key=lambda x: x.seed)), + } + + +def paired_deltas(records: list[Record]) -> tuple[list[dict[str, Any]], list[Record]]: + classic: dict[tuple[str, str, int], Record] = {} + for rec in records: + if rec.compute_label == "classic" and rec.compute_family == "classic": + classic[(rec.dataset, rec.view, rec.seed)] = rec + + rows = [] + unpaired = [] + for rec in records: + if rec.compute_label == "classic": + continue + base = classic.get((rec.dataset, rec.view, rec.seed)) + if base is None: + unpaired.append(rec) + continue + adaptive_delta = None + if rec.adaptive_test is not None: + adaptive_delta = rec.direction * (rec.adaptive_test - base.test) + rows.append({ + "dataset": rec.dataset, + "view": rec.view, + "compute_family": rec.compute_family, + "compute_label": rec.compute_label, + "seed": rec.seed, + "metric": rec.metric, + "direction": rec.direction, + "base_val": base.val, + "base_test": base.test, + "val": rec.val, + "test": rec.test, + "adaptive_test": rec.adaptive_test, + "val_delta": rec.direction * (rec.val - base.val), + "test_delta": rec.direction * (rec.test - base.test), + "adaptive_test_delta": adaptive_delta, + "adaptive_steps": rec.adaptive_steps, + "fixed_steps": rec.fixed_steps, + "epochs": rec.epochs, + "hidden": rec.hidden, + "T": rec.T, + "n_sup": rec.n_sup, + "source": rec.source, + "base_source": base.source, + }) + rows.sort(key=lambda r: (r["dataset"], r["view"], r["compute_label"], r["seed"])) + return rows, unpaired + + +def summarize_deltas(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + grouped: dict[tuple[str, str], list[dict[str, Any]]] = defaultdict(list) + for row in rows: + grouped[(row["compute_family"], row["compute_label"])].append(row) + + out = [] + for (family, label), group in sorted(grouped.items()): + test_d = [float(r["test_delta"]) for r in group] + val_d = [float(r["val_delta"]) for r in group] + adaptive_d = [ + float(r["adaptive_test_delta"]) + for r in group + if r["adaptive_test_delta"] is not None + ] + steps = [float(r["adaptive_steps"]) for r in group if r["adaptive_steps"] is not None] + out.append({ + "compute_family": family, + "compute_label": label, + "n": len(group), + "test_mean_delta": mean(test_d), + "test_median_delta": median(test_d), + "test_positive": sum(x > 0 for x in test_d), + "test_negative": sum(x < 0 for x in test_d), + "val_mean_delta": mean(val_d), + "val_positive": sum(x > 0 for x in val_d), + "adaptive_mean_delta": mean(adaptive_d) if adaptive_d else math.nan, + "adaptive_positive": sum(x > 0 for x in adaptive_d) if adaptive_d else "", + "adaptive_negative": sum(x < 0 for x in adaptive_d) if adaptive_d else "", + "adaptive_steps_mean": mean(steps) if steps else math.nan, + }) + return out + + +def expected_specs() -> list[tuple[str, str, str]]: + try: + from rrog.runspecs import RUN_SPECS + except Exception: + return [] + return sorted({(s.task.lower(), s.view, s.compute) for s in RUN_SPECS}) + + +def coverage_rows( + records: list[Record], + expected_seeds: set[int], + coverage_families: set[str], +) -> list[dict[str, Any]]: + by_family: dict[tuple[str, str, str], list[Record]] = defaultdict(list) + for rec in records: + by_family[(rec.dataset, rec.view, rec.compute_family)].append(rec) + + rows = [] + for dataset, view, family in expected_specs(): + if coverage_families and family not in coverage_families: + continue + present = by_family.get((dataset, view, family), []) + seeds = sorted({r.seed for r in present}) + labels = sorted({r.compute_label for r in present}) + missing_seeds = sorted(expected_seeds.difference(seeds)) if expected_seeds else [] + if not present: + status = "missing" + elif missing_seeds: + status = "missing-seeds" + else: + status = "ok" + rows.append({ + "dataset": dataset, + "view": view, + "compute_family": family, + "status": status, + "n_runs": len(present), + "seeds": " ".join(map(str, seeds)), + "missing_seeds": " ".join(map(str, missing_seeds)), + "labels": " | ".join(labels), + }) + return rows + + +def write_csv(path: Path, rows: list[dict[str, Any]], fields: list[str]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore") + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def md_table(headers: list[str], rows: list[list[str]]) -> str: + out = [ + "| " + " | ".join(headers) + " |", + "| " + " | ".join(["---"] * len(headers)) + " |", + ] + out.extend("| " + " | ".join(row) + " |" for row in rows) + return "\n".join(out) + + +def build_markdown( + records: list[Record], + raw_count: int, + cells: list[dict[str, Any]], + deltas: list[dict[str, Any]], + delta_summary: list[dict[str, Any]], + coverage: list[dict[str, Any]], + unpaired: list[Record], + digits: int, +) -> str: + missing = [r for r in coverage if r["status"] == "missing"] + missing_seeds = [r for r in coverage if r["status"] == "missing-seeds"] + negative = sorted(deltas, key=lambda r: float(r["test_delta"]))[:30] + adaptive_negative = sorted( + [r for r in deltas if r["adaptive_test_delta"] is not None], + key=lambda r: float(r["adaptive_test_delta"]), + )[:30] + val_pos_test_neg = [ + r for r in deltas + if float(r["val_delta"]) > 0 and float(r["test_delta"]) < 0 + ] + + lines = [ + "# Result Audit", + "", + f"Generated: {datetime.now().isoformat(timespec='seconds')}", + "", + "## Scope", + "", + f"- Raw JSON files scanned: {raw_count}", + f"- Deduplicated runs after min-epoch filter: {len(records)}", + f"- Aggregated cells: {len(cells)}", + f"- Paired non-classic deltas vs matching classic: {len(deltas)}", + f"- Unpaired non-classic runs: {len(unpaired)}", + f"- Missing expected cells: {len(missing)}", + f"- Cells missing expected seeds: {len(missing_seeds)}", + "", + "## Delta Summary", + "", + md_table( + [ + "family", "label", "n", "test mean", "test median", + "test +/-", "val mean", "adaptive mean", "steps", + ], + [ + [ + r["compute_family"], + r["compute_label"], + str(r["n"]), + fmt(r["test_mean_delta"], digits), + fmt(r["test_median_delta"], digits), + f"{r['test_positive']}/{r['test_negative']}", + fmt(r["val_mean_delta"], digits), + fmt(r["adaptive_mean_delta"], digits), + fmt(r["adaptive_steps_mean"], 2), + ] + for r in delta_summary + ], + ), + "", + "## Worst Fixed-Test Deltas", + "", + md_table( + ["dataset", "view", "label", "seed", "metric", "val delta", "test delta", "source"], + [ + [ + r["dataset"], + r["view"], + r["compute_label"], + str(r["seed"]), + r["metric"], + fmt(r["val_delta"], digits), + fmt(r["test_delta"], digits), + Path(r["source"]).name, + ] + for r in negative + ], + ), + "", + ] + if adaptive_negative: + lines.extend([ + "## Worst Adaptive-Test Deltas", + "", + md_table( + ["dataset", "view", "label", "seed", "metric", "adaptive delta", "steps", "source"], + [ + [ + r["dataset"], + r["view"], + r["compute_label"], + str(r["seed"]), + r["metric"], + fmt(r["adaptive_test_delta"], digits), + fmt(r["adaptive_steps"], 2), + Path(r["source"]).name, + ] + for r in adaptive_negative + ], + ), + "", + ]) + + if val_pos_test_neg: + lines.extend([ + "## Val-Positive Test-Negative Cases", + "", + md_table( + ["dataset", "view", "label", "seed", "val delta", "test delta"], + [ + [ + r["dataset"], + r["view"], + r["compute_label"], + str(r["seed"]), + fmt(r["val_delta"], digits), + fmt(r["test_delta"], digits), + ] + for r in sorted(val_pos_test_neg, key=lambda x: float(x["test_delta"]))[:30] + ], + ), + "", + ]) + + if missing[:40]: + lines.extend([ + "## First Missing Expected Cells", + "", + md_table( + ["dataset", "view", "family"], + [[r["dataset"], r["view"], r["compute_family"]] for r in missing[:40]], + ), + "", + ]) + + if unpaired[:40]: + lines.extend([ + "## First Unpaired Non-Classic Runs", + "", + md_table( + ["dataset", "view", "label", "seed", "source"], + [ + [r.dataset, r.view, r.compute_label, str(r.seed), Path(r.source).name] + for r in unpaired[:40] + ], + ), + "", + ]) + + lines.extend([ + "## Files", + "", + "- `analysis/result_cells.csv`: one row per deduplicated dataset/backbone/compute cell.", + "- `analysis/paired_deltas.csv`: seed-paired deltas against matching classic baselines.", + "- `analysis/delta_summary.csv`: aggregate delta statistics by compute label.", + "- `analysis/coverage.csv`: expected matrix coverage from `rrog.runspecs`.", + "", + ]) + return "\n".join(lines) + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--runs-dir", default="runs") + ap.add_argument("--out-dir", default="analysis") + ap.add_argument("--min-epochs", type=int, default=10) + ap.add_argument("--expected-seeds", default="0") + ap.add_argument("--coverage-families", default="classic fixed-rrog rrog-act") + ap.add_argument("--digits", type=int, default=4) + args = ap.parse_args() + + runs_dir = Path(args.runs_dir) + out_dir = Path(args.out_dir) + expected_seeds = { + int(x) + for part in args.expected_seeds.replace(",", " ").split() + for x in [part.strip()] + if x + } + coverage_families = { + x + for part in args.coverage_families.replace(",", " ").split() + for x in [part.strip()] + if x + } + + records, raw_count = load_records(runs_dir, args.min_epochs) + grouped = group_cells(records) + cells = [summarize_records(recs) for _, recs in sorted(grouped.items())] + deltas, unpaired = paired_deltas(records) + delta_summary = summarize_deltas(deltas) + coverage = coverage_rows(records, expected_seeds, coverage_families) + + write_csv(out_dir / "result_cells.csv", cells, [ + "dataset", "view", "compute_family", "compute_label", "metric", "n", "seeds", + "epochs_min", "epochs_max", "hidden", "T", "n_sup", "val_mean", "val_std", + "test_mean", "test_std", "adaptive_test_mean", "adaptive_steps_mean", "sources", + ]) + write_csv(out_dir / "paired_deltas.csv", deltas, [ + "dataset", "view", "compute_family", "compute_label", "seed", "metric", "direction", + "base_val", "base_test", "val", "test", "adaptive_test", "val_delta", "test_delta", + "adaptive_test_delta", "adaptive_steps", "fixed_steps", "epochs", "hidden", "T", + "n_sup", "source", "base_source", + ]) + write_csv(out_dir / "delta_summary.csv", delta_summary, [ + "compute_family", "compute_label", "n", "test_mean_delta", "test_median_delta", + "test_positive", "test_negative", "val_mean_delta", "val_positive", + "adaptive_mean_delta", "adaptive_positive", "adaptive_negative", "adaptive_steps_mean", + ]) + write_csv(out_dir / "coverage.csv", coverage, [ + "dataset", "view", "compute_family", "status", "n_runs", "seeds", "missing_seeds", + "labels", + ]) + (out_dir / "result_audit.md").write_text( + build_markdown( + records, + raw_count, + cells, + deltas, + delta_summary, + coverage, + unpaired, + args.digits, + ) + ) + print(f"wrote {out_dir / 'result_audit.md'}") + print(f"records={len(records)} cells={len(cells)} paired_deltas={len(deltas)}") + + +if __name__ == "__main__": + main() |
