Files
llama-sidecar/eval/analyze.py
indifferentketchup fe7f36ae98 llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with
LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port),
deterministic hash-keyed sidecar reuse. Windows service support via
schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx
decoupled child lifetime.

Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM
in JSON config, -fa → --flash-attn on default, child process exit after
one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED,
context.Background for child lifetime, background reaper goroutine).

bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks
automation to sam-desktop. Per-GGUF production flags from llama-swap
config with --ctx-size 32768 override.

eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) +
A/B model comparison (14 agent-typed prompts × 8 models). All scripts
resumable at individual question level.

94 Go tests, race detector clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00

126 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""Generate SUMMARY.md from scores.csv."""
import csv
import statistics
from collections import defaultdict
from pathlib import Path
# The scores input and the generated report both sit in the same directory
# as this script file.
CSV_PATH = Path(__file__).with_name("scores.csv")
SUMMARY_PATH = Path(__file__).with_name("SUMMARY.md")
def load_scores(path: "Path | str | None" = None) -> list[dict]:
    """Load benchmark result rows from a scores CSV file.

    Args:
        path: CSV file to read. When omitted, the module-level ``CSV_PATH``
            is used, so the original zero-argument call keeps working.

    Returns:
        One dict per CSV data row, keyed by the header names. ``correct`` is
        converted to a bool (true for "true", "1" or "yes", ignoring case and
        surrounding whitespace) and ``latency_ms`` to a float (0.0 when the
        column is absent or empty). All other columns are left as read.

    Raises:
        KeyError: if the CSV has no ``correct`` column.
        ValueError: if a non-empty ``latency_ms`` value is not a number.
    """
    if path is None:
        path = CSV_PATH
    rows = []
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise become part of the first header name. newline="" is what the
    # csv module asks for so that quoted fields containing line breaks are
    # parsed correctly.
    with open(path, encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills fields missing from a short row with None;
            # such a row is counted as not correct instead of crashing.
            verdict = (row["correct"] or "").strip().lower()
            row["correct"] = verdict in ("true", "1", "yes")
            row["latency_ms"] = float(row.get("latency_ms", 0) or 0)
            rows.append(row)
    return rows
def main() -> None:
    """Generate the Markdown summary from the scores CSV.

    Loads the rows via ``load_scores()``. If there are none, prints a notice
    and returns without writing anything. Otherwise the report is written to
    ``SUMMARY_PATH`` and echoed to stdout, followed by the output path.
    """
    rows = load_scores()
    if not rows:
        print("No data in scores.csv")
        return
    summary = _build_summary(rows)
    # Explicit encoding so the report does not depend on the platform locale.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {SUMMARY_PATH}")


def _table_row(label: str, cells: list[str]) -> str:
    """Format one Markdown table row with *label* as the first column."""
    return f"| {label} | " + " | ".join(cells) + " |"


def _build_summary(rows: list[dict]) -> str:
    """Render the Markdown report text for a non-empty list of score rows.

    Each row must provide ``model``, ``benchmark``, a truthy/falsy
    ``correct`` and a numeric ``latency_ms``; ``category`` is optional and
    only used for rows whose benchmark is "mmlu".

    The report has three parts: an accuracy table with a per-model average
    over the benchmarks that model has data for, an MMLU per-category
    correct/total table (omitted when no MMLU row has a category), and a
    median-latency table that ignores non-positive latencies.
    """
    benchmarks = ["mmlu", "gsm8k", "humaneval"]
    models = sorted({r["model"] for r in rows})

    # One pass over the rows collects everything the three tables need.
    scores = defaultdict(lambda: [0, 0])      # (model, bench) -> [correct, total]
    cat_scores = defaultdict(lambda: [0, 0])  # (model, category) -> [correct, total]
    latencies = defaultdict(list)             # (model, bench) -> positive latencies
    for r in rows:
        model, bench = r["model"], r["benchmark"]
        hit = 1 if r["correct"] else 0
        tally = scores[(model, bench)]
        tally[0] += hit
        tally[1] += 1
        if bench == "mmlu" and r.get("category"):
            cat_tally = cat_scores[(model, r["category"])]
            cat_tally[0] += hit
            cat_tally[1] += 1
        if r["latency_ms"] > 0:
            latencies[(model, bench)].append(r["latency_ms"])

    # Only categories that actually occur on MMLU rows get a column, so rows
    # of other benchmarks cannot add columns that would always be empty.
    categories = sorted({category for _, category in cat_scores})

    lines = ["# Eval Results\n"]

    # Main table
    lines.append("## Overall Scores\n")
    lines.append("| Model | MMLU (%) | GSM8K (%) | HumanEval (%) | Avg (%) |")
    lines.append("|-------|---------|---------|--------------|---------|")
    model_avgs = []
    for model in models:
        cells = []
        pcts = []
        for bench in benchmarks:
            # .get() avoids creating empty entries in the defaultdict.
            tally = scores.get((model, bench))
            if tally is None:
                cells.append("")
                continue
            pct = tally[0] / tally[1] * 100
            cells.append(f"{pct:.1f}")
            pcts.append(pct)
        avg = sum(pcts) / len(pcts) if pcts else 0
        model_avgs.append((model, avg))
        cells.append(f"{avg:.1f}")
        lines.append(_table_row(model, cells))

    # max() returns the first maximal entry, so ties go to the model that
    # sorts first by name.
    best_model, best_avg = max(model_avgs, key=lambda pair: pair[1])
    lines.append(f"\n**Best overall: {best_model}** ({best_avg:.1f}% avg)\n")

    # MMLU category breakdown
    if categories:
        lines.append("\n## MMLU Per-Category Breakdown\n")
        titles = [c.replace("_", " ").title() for c in categories]
        lines.append(_table_row("Model", titles))
        lines.append("|-------" + "|-------" * len(categories) + "|")
        for model in models:
            cells = []
            for cat in categories:
                tally = cat_scores.get((model, cat))
                cells.append(f"{tally[0]}/{tally[1]}" if tally else "")
            lines.append(_table_row(model, cells))

    # Latency summary
    lines.append("\n## Median Latency (ms)\n")
    lines.append("| Model | MMLU | GSM8K | HumanEval |")
    lines.append("|-------|------|-------|-----------|")
    for model in models:
        cells = []
        for bench in benchmarks:
            lats = latencies.get((model, bench))
            # statistics.median averages the two middle values for an
            # even-sized sample instead of picking the upper one.
            cells.append(f"{statistics.median(lats):.0f}" if lats else "")
        lines.append(_table_row(model, cells))

    return "\n".join(lines) + "\n"
# Run the report generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()