Files
llama-sidecar/benchmarks/3d/analyze.py
indifferentketchup fe7f36ae98 llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with
LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port),
deterministic hash-keyed sidecar reuse. Windows service support via
schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx
decoupled child lifetime.

Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM
in JSON config, -fa → --flash-attn on default, child process exit after
one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED,
context.Background for child lifetime, background reaper goroutine).

bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks
automation to sam-desktop. Per-GGUF production flags from llama-swap
config with --ctx-size 32768 override.

eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) +
A/B model comparison (14 agent-typed prompts × 8 models). All scripts
resumable at individual question level.

94 Go tests, race detector clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00

110 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""Analyze MTP n_max sweep results and produce summary.md."""
import json
from pathlib import Path
# Input and output files sit in the same directory as this script:
# results.json is the sweep data to analyze, summary.md is the report.
RESULTS_PATH = Path(__file__).with_name("results.json")
SUMMARY_PATH = Path(__file__).with_name("summary.md")
def load_results() -> list[dict]:
    """Load the sweep measurements from ``RESULTS_PATH`` and keep the usable ones.

    A measurement is kept only when its ``eval_tok_s`` is present and not
    null and its ``error`` is absent or null.

    Returns:
        The surviving measurement dicts, in file order.

    Raises:
        FileNotFoundError: if the results file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
        ValueError: if the top-level JSON value is not a list.
    """
    # Decode explicitly instead of relying on the locale default encoding.
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which
    # json.loads would otherwise reject.
    data = json.loads(RESULTS_PATH.read_text(encoding="utf-8-sig"))
    if not isinstance(data, list):
        raise ValueError(
            f"{RESULTS_PATH}: expected a JSON list of measurements, "
            f"got {type(data).__name__}"
        )
    return [
        r
        for r in data
        if r.get("eval_tok_s") is not None and r.get("error") is None
    ]
def _accept_suffix(rows: list[dict]) -> str:
    """Return `` (accept NN%)`` for the pooled draft acceptance of *rows*, or ``""``.

    Only rows with a truthy ``draft_n`` contribute. The rate is total
    accepted draft tokens over total drafted tokens across those rows.
    """
    drafted = [r for r in rows if r.get("draft_n")]
    # `or 0` also covers a key that is present but null in the JSON, which
    # would otherwise make sum() raise TypeError.
    total_draft = sum(r.get("draft_n") or 0 for r in drafted)
    total_accepted = sum(r.get("draft_n_accepted") or 0 for r in drafted)
    if total_draft <= 0:
        return ""
    accept_pct = (total_accepted / total_draft) * 100
    return f" (accept {accept_pct:.0f}%)"


def _model_section(model: str, model_rows: list[dict]) -> tuple:
    """Render the Markdown section (heading, table, verdict) for one model.

    Returns:
        ``(lines, recommendation)``. ``recommendation`` is
        ``(model, best_n, best_avg, improvement_pct)`` when a positive
        ``n_max == 0`` baseline exists, otherwise ``None``.
    """
    n_max_values = sorted({r["n_max"] for r in model_rows})
    prompt_names = sorted({r["prompt"] for r in model_rows})

    # Index once instead of rescanning model_rows for every table cell.
    # setdefault keeps the first measurement per (n_max, prompt); later
    # duplicates are ignored for the throughput cells.
    first_seen: dict = {}
    rows_by_n: dict = {}
    for r in model_rows:
        first_seen.setdefault((r["n_max"], r["prompt"]), r)
        rows_by_n.setdefault(r["n_max"], []).append(r)

    lines = [f"\n## {model}\n"]
    lines.append(
        "| n_max | "
        + " | ".join(f"{p} tok/s" for p in prompt_names)
        + " | avg tok/s | vs n_max=0 |"
    )
    lines.append(
        "|-------|"
        + "|".join("-" * (len(p) + 7) for p in prompt_names)
        + "|-----------|------------|"
    )

    baseline_avg = None
    best_avg = 0
    best_n = 0
    # n_max_values is ascending, so n == 0 (the baseline) is seen before any
    # speculative setting that is compared against it.
    for n in n_max_values:
        cells = []
        vals = []
        for p in prompt_names:
            hit = first_seen.get((n, p))
            if hit is None:
                # No measurement for this prompt at this n_max: blank cell,
                # and the prompt is left out of the average.
                cells.append("")
                continue
            v = hit["eval_tok_s"]
            cells.append(f"{v:.1f}")
            vals.append(v)
        avg = sum(vals) / len(vals) if vals else 0

        if n == 0:
            baseline_avg = avg
            delta = "baseline"
        elif baseline_avg is not None and baseline_avg > 0:
            pct = ((avg - baseline_avg) / baseline_avg) * 100
            delta = f"{pct:+.1f}%"
        else:
            delta = ""

        # Strict ">" keeps the smallest n_max when averages tie.
        if avg > best_avg:
            best_avg = avg
            best_n = n

        draft_info = _accept_suffix(rows_by_n[n])
        lines.append(
            f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |"
        )

    if baseline_avg is not None and baseline_avg > 0 and best_avg > 0:
        improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
        lines.append(
            f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n"
        )
        return lines, (model, best_n, best_avg, improvement)

    lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")
    return lines, None


def _recommendation_lines(recommendations: list[tuple]) -> list[str]:
    """Render the closing table of suggested flags, one row per recommendation."""
    lines = [
        "\n---\n",
        "## Recommended `llama_extra_args` per model\n",
        "| Model | n_max | avg tok/s | vs baseline | suggested flags |",
        "|-------|-------|-----------|-------------|-----------------|",
    ]
    for model, n, avg, imp in recommendations:
        if n > 0:
            flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
        else:
            flags = "_(none — MTP not beneficial)_"
        lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")
    # Trailing empty entry so the joined text ends with a newline.
    lines.append("")
    return lines


def main() -> None:
    """Build the Markdown sweep report, write it to ``SUMMARY_PATH``, and print it.

    For every model (sorted by name) a table lists the throughput per prompt
    for each ``n_max``, the row average, and the change relative to the
    ``n_max == 0`` row. Models with a positive baseline also appear in the
    final recommendations table. If ``load_results()`` returns nothing, a
    notice is printed and no file is written.
    """
    rows = load_results()
    if not rows:
        print("No valid results found.")
        return

    # Group once by model, preserving the input order within each model.
    rows_by_model: dict = {}
    for r in rows:
        rows_by_model.setdefault(r["model"], []).append(r)
    models = sorted(rows_by_model)

    lines = [
        "# MTP n_max Sweep Results\n",
        f"**{len(rows)} valid measurements across {len(models)} models.**\n",
    ]
    recommendations = []
    for model in models:
        section, recommendation = _model_section(model, rows_by_model[model])
        lines.extend(section)
        if recommendation is not None:
            recommendations.append(recommendation)
    lines.extend(_recommendation_lines(recommendations))

    summary = "\n".join(lines)
    # The report contains a non-ASCII em dash; write UTF-8 explicitly rather
    # than whatever the platform's locale encoding happens to be.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {SUMMARY_PATH}")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()