llama-sidecar v0.1.0: daemon + benchmarks + eval suite

Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00
parent babbb4f39b
commit fe7f36ae98
39 changed files with 4228 additions and 0 deletions
--- a/eval/analyze.py
+++ b/eval/analyze.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""Generate SUMMARY.md from scores.csv."""
+
+import csv
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+CSV_PATH = Path(__file__).parent / "scores.csv"  # input read by load_scores(); lives next to this script
+SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md"  # report written by main(); lives next to this script
+
+
+def load_scores() -> list[dict]:
+    """Read ``CSV_PATH`` and return one normalized dict per result row.
+
+    Every CSV column is kept as the string ``csv.DictReader`` produced, except:
+
+    * ``correct`` becomes a ``bool``: true for ``true`` / ``1`` / ``yes``
+      (case-insensitive, surrounding whitespace ignored); anything else,
+      including a value missing from a short row, is false.
+    * ``latency_ms`` becomes a ``float``; a missing, empty, or unparseable
+      value becomes ``0.0``.
+
+    Raises:
+        OSError: if the CSV file cannot be opened.
+        KeyError: if the file has no ``correct`` column at all.
+    """
+    rows = []
+    # utf-8-sig drops a leading byte-order mark, which would otherwise become
+    # part of the first column name; errors="replace" keeps a stray byte in any
+    # field from aborting the whole report; newline="" is what the csv module
+    # requires to handle quoted fields that contain line breaks.
+    with open(CSV_PATH, encoding="utf-8-sig", errors="replace", newline="") as f:
+        for row in csv.DictReader(f):
+            # A row shorter than the header yields None for the missing
+            # fields, so guard before calling string methods on the value.
+            verdict = (row["correct"] or "").strip().lower()
+            row["correct"] = verdict in ("true", "1", "yes")
+            try:
+                row["latency_ms"] = float(row.get("latency_ms") or 0)
+            except ValueError:
+                # One malformed latency must not discard every other result.
+                row["latency_ms"] = 0.0
+            rows.append(row)
+    return rows
+
+
+def _percent(correct: int, total: int) -> float:
+    """Return ``correct / total`` as a percentage, or 0.0 when ``total`` is 0."""
+    return correct / total * 100 if total else 0.0
+
+
+def _table_row(model: str, cells: list[str]) -> str:
+    """Format one Markdown table row with the model name in the first column."""
+    return f"| {model} | " + " | ".join(cells) + " |"
+
+
+def _overall_section(
+    models: list[str],
+    benchmarks: list[str],
+    scores: dict[tuple[str, str], list[int]],
+) -> list[str]:
+    """Build the per-benchmark accuracy table plus the "Best overall" line.
+
+    ``scores`` maps ``(model, benchmark)`` to ``[correct, total]``.
+    """
+    lines = [
+        "## Overall Scores\n",
+        "| Model | MMLU (%) | GSM8K (%) | HumanEval (%) | Avg (%) |",
+        "|-------|---------|---------|--------------|---------|",
+    ]
+    model_avgs = []
+    for model in models:
+        cells = []
+        pcts = []
+        for bench in benchmarks:
+            tally = scores.get((model, bench))
+            if tally is None:
+                cells.append("—")
+                continue
+            pct = _percent(*tally)
+            cells.append(f"{pct:.1f}")
+            pcts.append(pct)
+        # NOTE: the average only covers benchmarks this model actually ran, so
+        # models with different benchmark coverage are not strictly comparable.
+        avg = sum(pcts) / len(pcts) if pcts else 0.0
+        model_avgs.append((model, avg))
+        cells.append(f"{avg:.1f}")
+        lines.append(_table_row(model, cells))
+
+    # max() keeps the first of several tied entries; models arrive sorted by
+    # name, so ties resolve to the alphabetically first model.
+    best_model, best_avg = max(model_avgs, key=lambda pair: pair[1])
+    lines.append(f"\n**Best overall: {best_model}** ({best_avg:.1f}% avg)\n")
+    return lines
+
+
+def _category_section(
+    models: list[str],
+    cat_scores: dict[tuple[str, str], list[int]],
+) -> list[str]:
+    """Build the MMLU per-category table, or nothing if no row had a category.
+
+    ``cat_scores`` maps ``(model, category)`` to ``[correct, total]``.
+    """
+    # Columns come from the MMLU tallies themselves, so a category that only
+    # appears on another benchmark's rows cannot add an all-empty column.
+    categories = sorted({cat for _, cat in cat_scores})
+    if not categories:
+        return []
+
+    lines = [
+        "\n## MMLU Per-Category Breakdown\n",
+        "| Model | " + " | ".join(c.replace("_", " ").title() for c in categories) + " |",
+        "|-------" + "|-------" * len(categories) + "|",
+    ]
+    for model in models:
+        cells = []
+        for cat in categories:
+            tally = cat_scores.get((model, cat))
+            cells.append("—" if tally is None else f"{tally[0]}/{tally[1]}")
+        lines.append(_table_row(model, cells))
+    return lines
+
+
+def _latency_section(
+    models: list[str],
+    benchmarks: list[str],
+    latencies: dict[tuple[str, str], list[float]],
+) -> list[str]:
+    """Build the median-latency table.
+
+    ``latencies`` maps ``(model, benchmark)`` to the positive latency samples.
+    """
+    lines = [
+        "\n## Median Latency (ms)\n",
+        "| Model | MMLU | GSM8K | HumanEval |",
+        "|-------|------|-------|-----------|",
+    ]
+    for model in models:
+        cells = []
+        for bench in benchmarks:
+            samples = latencies.get((model, bench))
+            if samples:
+                # statistics.median averages the two middle values when the
+                # sample count is even instead of picking the upper one.
+                cells.append(f"{statistics.median(samples):.0f}")
+            else:
+                cells.append("—")
+        lines.append(_table_row(model, cells))
+    return lines
+
+
+def main() -> None:
+    """Aggregate scores.csv into a Markdown report; write and print it.
+
+    The report contains per-benchmark accuracy with a per-model average, an
+    MMLU per-category breakdown (only when category data exists), and median
+    latency per model and benchmark. It is written to ``SUMMARY_PATH`` as
+    UTF-8 and echoed to stdout. With no data rows, a notice is printed and no
+    file is written.
+    """
+    rows = load_scores()
+    if not rows:
+        print("No data in scores.csv")
+        return
+
+    models = sorted({r["model"] for r in rows})
+    benchmarks = ["mmlu", "gsm8k", "humaneval"]
+
+    # One pass over the rows collects everything the three sections need.
+    scores = defaultdict(lambda: [0, 0])  # (model, bench) -> [correct, total]
+    cat_scores = defaultdict(lambda: [0, 0])  # (model, category) -> [correct, total]
+    latencies = defaultdict(list)  # (model, bench) -> positive latency samples
+    for r in rows:
+        model, bench = r["model"], r["benchmark"]
+        tallies = [scores[(model, bench)]]
+        if bench == "mmlu" and r.get("category"):
+            tallies.append(cat_scores[(model, r["category"])])
+        for tally in tallies:
+            tally[1] += 1
+            if r["correct"]:
+                tally[0] += 1
+        # A latency of 0 means "not recorded", so it is left out of the median.
+        if r["latency_ms"] > 0:
+            latencies[(model, bench)].append(r["latency_ms"])
+
+    lines = ["# Eval Results\n"]
+    lines += _overall_section(models, benchmarks, scores)
+    lines += _category_section(models, cat_scores)
+    lines += _latency_section(models, benchmarks, latencies)
+
+    summary = "\n".join(lines) + "\n"
+    # The report contains non-ASCII characters ("—"); pin the encoding so the
+    # file is valid UTF-8 regardless of the platform's locale default.
+    SUMMARY_PATH.write_text(summary, encoding="utf-8")
+    print(summary)
+    print(f"\nWritten to: {SUMMARY_PATH}")
+
+
+if __name__ == "__main__":  # build the report only when run as a script, not on import
+    main()