#!/usr/bin/env python3
"""Generate SUMMARY.md from scores.csv."""

import csv
import statistics
from collections import defaultdict
from pathlib import Path
from typing import Optional

CSV_PATH = Path(__file__).parent / "scores.csv"
SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md"

# Benchmarks reported as table columns, in column order. The values must match
# the "benchmark" column of the CSV exactly (comparison is case-sensitive).
BENCHMARKS = ("mmlu", "gsm8k", "humaneval")

# Spellings of the "correct" column that count as a correct answer.
_TRUTHY = ("true", "1", "yes")

# Placeholder shown in a table cell when there is no data for it.
_NO_DATA = "—"


def load_scores(path: Optional[Path] = None) -> list[dict]:
    """Read the scores CSV and normalise the typed columns.

    Args:
        path: CSV file to read. Defaults to ``CSV_PATH``.

    Returns:
        One dict per CSV row. ``"correct"`` is converted to ``bool`` and
        ``"latency_ms"`` to ``float`` (``0.0`` when absent or empty); every
        other column is left as read by ``csv.DictReader``.

    Raises:
        KeyError: If the CSV has no ``correct`` column.
        ValueError: If a ``latency_ms`` value is not numeric.
    """
    csv_path = CSV_PATH if path is None else Path(path)
    rows = []
    # newline="" is what the csv module requires for correct handling of
    # quoted fields containing line breaks; utf-8-sig reads plain UTF-8 and
    # also strips a BOM that would otherwise corrupt the first header name.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            # DictReader fills columns missing from a short row with None;
            # treat that like an empty value (i.e. not correct).
            answer = str(row["correct"] or "").strip().lower()
            row["correct"] = answer in _TRUTHY
            row["latency_ms"] = float(row.get("latency_ms") or 0)
            rows.append(row)
    return rows


def _table_row(model: str, cells: list[str]) -> str:
    """Format one markdown table row with the model name in the first column."""
    return f"| {model} | " + " | ".join(cells) + " |"


def _overall_section(rows: list[dict], models: list[str]) -> list[str]:
    """Build the per-benchmark accuracy table and the best-model line."""
    tally = defaultdict(lambda: [0, 0])  # (model, benchmark) -> [correct, total]
    for r in rows:
        counts = tally[(r["model"], r["benchmark"])]
        counts[1] += 1
        if r["correct"]:
            counts[0] += 1

    lines = [
        "## Overall Scores\n",
        "| Model | MMLU (%) | GSM8K (%) | HumanEval (%) | Avg (%) |",
        "|-------|---------|---------|--------------|---------|",
    ]
    model_avgs = []
    for model in models:
        cells = []
        pcts = []
        for bench in BENCHMARKS:
            # .get() rather than indexing so the defaultdict is not populated
            # with empty entries for benchmarks the model never ran.
            counts = tally.get((model, bench))
            if counts is None:
                cells.append(_NO_DATA)
                continue
            correct, total = counts  # total >= 1 for any key that exists
            pct = correct / total * 100
            cells.append(f"{pct:.1f}")
            pcts.append(pct)
        # The average covers only the benchmarks the model actually has.
        avg = sum(pcts) / len(pcts) if pcts else 0
        model_avgs.append((model, avg))
        cells.append(f"{avg:.1f}")
        lines.append(_table_row(model, cells))

    # max() returns the first maximal entry, so ties go to the model that
    # sorts first by name.
    best_model, best_avg = max(model_avgs, key=lambda pair: pair[1])
    lines.append(f"\n**Best overall: {best_model}** ({best_avg:.1f}% avg)\n")
    return lines


def _category_section(rows: list[dict], models: list[str]) -> list[str]:
    """Build the MMLU per-category table; empty list if no MMLU categories."""
    tally = defaultdict(lambda: [0, 0])  # (model, category) -> [correct, total]
    for r in rows:
        if r["benchmark"] == "mmlu" and r.get("category"):
            counts = tally[(r["model"], r["category"])]
            counts[1] += 1
            if r["correct"]:
                counts[0] += 1
    if not tally:
        return []

    # Columns come from MMLU rows only, so categories that belong to other
    # benchmarks do not show up as all-empty columns.
    categories = sorted({category for _, category in tally})
    titles = [c.replace("_", " ").title() for c in categories]
    lines = [
        "\n## MMLU Per-Category Breakdown\n",
        "| Model | " + " | ".join(titles) + " |",
        "|-------" + "|-------" * len(categories) + "|",
    ]
    for model in models:
        cells = []
        for category in categories:
            counts = tally.get((model, category))
            if counts is None:
                cells.append(_NO_DATA)
            else:
                cells.append(f"{counts[0]}/{counts[1]}")
        lines.append(_table_row(model, cells))
    return lines


def _latency_section(rows: list[dict], models: list[str]) -> list[str]:
    """Build the median-latency table from rows with a positive latency."""
    # Group once instead of rescanning every row for each table cell.
    samples = defaultdict(list)  # (model, benchmark) -> [latency_ms, ...]
    for r in rows:
        # A latency of 0 is how load_scores() encodes "not recorded".
        if r["latency_ms"] > 0:
            samples[(r["model"], r["benchmark"])].append(r["latency_ms"])

    lines = [
        "\n## Median Latency (ms)\n",
        "| Model | MMLU | GSM8K | HumanEval |",
        "|-------|------|-------|-----------|",
    ]
    for model in models:
        cells = []
        for bench in BENCHMARKS:
            latencies = samples.get((model, bench))
            if latencies:
                # statistics.median averages the two middle values for an
                # even-sized sample instead of picking the upper one.
                cells.append(f"{statistics.median(latencies):.0f}")
            else:
                cells.append(_NO_DATA)
        lines.append(_table_row(model, cells))
    return lines


def main(
    csv_path: Optional[Path] = None,
    summary_path: Optional[Path] = None,
) -> None:
    """Render the markdown summary, write it to disk and echo it to stdout.

    Args:
        csv_path: Scores CSV to read. Defaults to ``CSV_PATH``.
        summary_path: Markdown file to write. Defaults to ``SUMMARY_PATH``.

    When the CSV contains no data rows a notice is printed and no file is
    written.
    """
    csv_path = CSV_PATH if csv_path is None else Path(csv_path)
    summary_path = SUMMARY_PATH if summary_path is None else Path(summary_path)

    rows = load_scores(csv_path)
    if not rows:
        print(f"No data in {csv_path.name}")
        return

    models = sorted({r["model"] for r in rows})

    lines = ["# Eval Results\n"]
    lines.extend(_overall_section(rows, models))
    lines.extend(_category_section(rows, models))
    lines.extend(_latency_section(rows, models))

    summary = "\n".join(lines) + "\n"
    # The report contains non-ASCII characters (the em-dash placeholder), so
    # the encoding must not depend on the platform locale.
    summary_path.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {summary_path}")


if __name__ == "__main__":
    main()