llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
215
bench/analyze.py
Normal file
215
bench/analyze.py
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyze MTP on/off benchmark results → CSV + SUMMARY.md + recommendations."""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
|
||||
# All inputs and outputs live next to this script.
_HERE = Path(__file__).parent

RESULTS_DIR = _HERE / "results"
CSV_PATH = _HERE / "results.csv"
SUMMARY_PATH = _HERE / "SUMMARY.md"
RECO_PATH = _HERE / "llama-swap-recommendations.md"

# Result files are named <stem>__mtp-<on|off>__len<N>__run<N>.json; the
# non-greedy stem lets the GGUF name itself contain underscores.
FNAME_RE = re.compile(
    r"^(?P<stem>.+?)__mtp-(?P<mtp>on|off)__len(?P<len>\d+)__run(?P<run>\d+)\.json$"
)
|
||||
|
||||
|
||||
def parse_result(path: Path) -> dict | None:
    """Parse one benchmark result file into a flat row.

    The GGUF stem, MTP state, prompt length and run number are taken from the
    file name (see ``FNAME_RE``); throughput and draft counters are read from
    the ``timings`` object of the JSON payload.

    Args:
        path: Result file named ``<stem>__mtp-<on|off>__len<N>__run<N>.json``.

    Returns:
        A dict with the keys ``gguf``, ``mtp``, ``prompt_len``, ``run``,
        ``prompt_tps``, ``predicted_tps``, ``cache_n``, ``draft_n``,
        ``accepted_n`` and ``total_ms``; timing fields absent from the payload
        are ``None`` (``total_ms`` counts them as 0). ``None`` is returned when
        the name does not match, the file cannot be read or decoded, or the
        payload is not a JSON object.
    """
    m = FNAME_RE.match(path.name)
    if m is None:
        return None

    try:
        # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
        # json.loads rejects when it is handed a str.
        data = json.loads(path.read_text(encoding="utf-8-sig"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None

    # A top-level list/scalar, or "timings": null, must not crash the loader.
    if not isinstance(data, dict):
        return None
    timings = data.get("timings")
    if not isinstance(timings, dict):
        timings = {}

    return {
        "gguf": m.group("stem"),
        "mtp": m.group("mtp"),
        "prompt_len": int(m.group("len")),
        "run": int(m.group("run")),
        "prompt_tps": timings.get("prompt_per_second"),
        "predicted_tps": timings.get("predicted_per_second"),
        "cache_n": timings.get("cache_n"),
        "draft_n": timings.get("draft_n"),
        "accepted_n": timings.get("draft_n_accepted"),
        # Missing or null durations contribute 0 ms.
        "total_ms": (timings.get("prompt_ms") or 0) + (timings.get("predicted_ms") or 0),
    }
|
||||
|
||||
|
||||
def load_all() -> list[dict]:
    """Collect every parseable result row under ``RESULTS_DIR``.

    ``*.json`` files are visited in sorted path order; any file that
    ``parse_result`` rejects is skipped.
    """
    parsed = (parse_result(p) for p in sorted(RESULTS_DIR.glob("*.json")))
    return [row for row in parsed if row]
|
||||
|
||||
|
||||
def write_csv(rows: list[dict]) -> None:
    """Write all rows to ``CSV_PATH`` under a header line and print the count."""
    columns = [
        "gguf", "mtp", "prompt_len", "run", "prompt_tps", "predicted_tps",
        "cache_n", "draft_n", "accepted_n", "total_ms",
    ]
    # newline="" lets the csv module control line endings itself.
    with open(CSV_PATH, "w", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
    print(f"Wrote {len(rows)} rows to {CSV_PATH}")
|
||||
|
||||
|
||||
def median_of(values: list[float]) -> float:
    """Return the median of *values*, or 0.0 when there are none."""
    if not values:
        return 0.0
    return statistics.median(values)
|
||||
|
||||
|
||||
def _steady_state_cells(rows: list[dict]) -> dict[tuple, list[dict]]:
    """Group non-warmup rows (run >= 2) by ``(gguf, mtp, prompt_len)``."""
    cells: dict[tuple, list[dict]] = {}
    for r in rows:
        if r["run"] >= 2:
            cells.setdefault((r["gguf"], r["mtp"], r["prompt_len"]), []).append(r)
    return cells


def _acceptance_cell(on_rows: list[dict]) -> str:
    """Format the pooled draft acceptance rate, or "—" when nothing was drafted."""
    drafted = accepted = 0
    for r in on_rows:
        if r.get("draft_n"):
            drafted += r["draft_n"]
            # accepted_n is present but None when the payload lacked it.
            accepted += r.get("accepted_n") or 0
    return f"{(accepted / drafted * 100):.0f}%" if drafted > 0 else "—"


def write_summary(rows: list[dict]) -> None:
    """Write ``SUMMARY_PATH``: per-GGUF MTP on/off tables plus a verdict each.

    For every GGUF and prompt length the median ``predicted_tps`` of the
    non-warmup runs (run >= 2) is reported for MTP off and on, together with
    the relative change and the pooled draft acceptance rate. A cell with no
    samples is rendered as "—" instead of a fabricated 0.0.

    Verdict per GGUF:
        * ``KEEP MTP`` – MTP is at least 10% faster at some prompt length.
        * ``DROP MTP`` – on/off could be compared but no length gained 10%.
        * ``INSUFFICIENT DATA`` – no prompt length had both on and off samples.

    The finished markdown is also printed to stdout.

    Args:
        rows: Rows as produced by ``parse_result``.
    """
    ggufs = sorted({r["gguf"] for r in rows})
    lens = sorted({r["prompt_len"] for r in rows})
    # Index once instead of rescanning every row per GGUF and prompt length.
    cells = _steady_state_cells(rows)

    lines = ["# MTP On/Off Benchmark Results\n"]
    lines.append(f"**{len(rows)} measurements across {len(ggufs)} GGUFs.**\n")
    lines.append("Runs 2 & 3 used for median (run 1 = warmup, discarded).\n")

    header_parts = ["prompt_len", "MTP-off tok/s", "MTP-on tok/s", "delta %", "accept %"]
    table_head = [
        "| " + " | ".join(header_parts) + " |",
        "|" + "|".join("---" for _ in header_parts) + "|",
    ]

    verdicts = []
    for gguf in ggufs:
        lines.append(f"\n## {gguf}\n")
        lines.extend(table_head)

        compared = False
        any_above_10 = False
        for pl in lens:
            on_rows = cells.get((gguf, "on", pl), [])
            off_vals = [r["predicted_tps"] for r in cells.get((gguf, "off", pl), [])
                        if r["predicted_tps"] is not None]
            on_vals = [r["predicted_tps"] for r in on_rows
                       if r["predicted_tps"] is not None]

            off_med = median_of(off_vals)
            on_med = median_of(on_vals)
            off_cell = f"{off_med:.1f}" if off_vals else "—"
            on_cell = f"{on_med:.1f}" if on_vals else "—"

            delta_cell = "—"
            if off_vals and on_vals and off_med > 0:
                compared = True
                delta = ((on_med - off_med) / off_med) * 100
                delta_cell = f"{delta:+.1f}%"
                # Only a speed-up argues for keeping MTP; a slowdown of the
                # same magnitude must not.
                if delta >= 10:
                    any_above_10 = True

            lines.append(
                f"| {pl} | {off_cell} | {on_cell} | {delta_cell} | {_acceptance_cell(on_rows)} |"
            )

        if not compared:
            verdict = "INSUFFICIENT DATA"
        elif any_above_10:
            verdict = "KEEP MTP"
        else:
            verdict = "DROP MTP"
        verdicts.append((gguf, verdict))
        lines.append(f"\n**Verdict: {verdict}**\n")

    lines.append("\n---\n")
    lines.append("## Verdict Summary\n")
    lines.append("| GGUF | Verdict |")
    lines.append("|------|---------|")
    for gguf, verdict in verdicts:
        lines.append(f"| {gguf} | {verdict} |")

    summary = "\n".join(lines) + "\n"
    # Explicit encoding: the text contains "—", which not every locale
    # default encoding can represent.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(f"Wrote {SUMMARY_PATH}")
    print(summary)
|
||||
|
||||
|
||||
def write_recommendations(rows: list[dict]) -> None:
    """Write ``RECO_PATH``: a commented per-model diff of MTP flags.

    For each GGUF the median ``predicted_tps`` of all non-warmup runs
    (run >= 2), pooled across prompt lengths, is compared between MTP on and
    off. MTP is recommended when it is at least 10% faster. The result is
    rendered as ``+`` / ``-`` lines for the MTP flags relative to the
    hard-coded current state in ``currently_mtp``. A model lacking samples
    for either state gets an explicit "insufficient data" note and no change.

    Args:
        rows: Rows as produced by ``parse_result``.
    """
    ggufs = sorted({r["gguf"] for r in rows})

    lines = ["# llama-swap Config Recommendations\n"]
    lines.append("Based on MTP on/off benchmark results.\n")
    lines.append("**Read-only reference** — do NOT edit D:\\llama-swap\\config.yaml directly.\n")
    lines.append("```yaml")
    lines.append("# Commented diff against current config.yaml")
    lines.append("# Lines starting with + should be added, - should be removed")
    lines.append("")

    # GGUF file stem -> model id shown in the recommendations.
    model_map = {
        "Qwen3.6-35B-A3B-MXFP4_MOE": "qwen3.6-35b-a3b-mxfp4",
        "Qwen3.6-27B-Q6_K": "qwen3.6-27b-mtp",
        "Qwopus3.5-4B-v3-MTP-Q8_0": "qwopus3.5-4b-mtp",
        "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": "qwen3.5-9b-deepseek-v4-mtp",
        "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": "qwopus3.6-35b-a3b-v1-mtp",
        "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": "qwopus3.6-35b-a3b-mxfp4-mtp",
        "Qwopus3.6-27B-v2-MTP-Q6_K": "qwopus3.6-27b-v2-mtp",
        "Qwopus3.5-9B-Coder-MTP-Q8_0": "qwopus3.5-9b-coder-mtp",
    }

    # Whether each GGUF is treated as running with MTP today; GGUFs not
    # listed here are treated as MTP off.
    currently_mtp = {
        "Qwen3.6-35B-A3B-MXFP4_MOE": False,
        "Qwen3.6-27B-Q6_K": True,
        "Qwopus3.5-4B-v3-MTP-Q8_0": True,
        "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": True,
        "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": True,
        "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": True,
        "Qwopus3.6-27B-v2-MTP-Q6_K": True,
        "Qwopus3.5-9B-Coder-MTP-Q8_0": True,
    }

    # Pool the non-warmup throughput samples once, keyed by (gguf, mtp).
    pooled: dict[tuple, list[float]] = {}
    for r in rows:
        if r["run"] >= 2 and r["predicted_tps"] is not None:
            pooled.setdefault((r["gguf"], r["mtp"]), []).append(r["predicted_tps"])

    for gguf in ggufs:
        model_id = model_map.get(gguf, gguf)
        is_mtp_now = currently_mtp.get(gguf, False)
        now = "on" if is_mtp_now else "off"

        off_vals = pooled.get((gguf, "off"), [])
        on_vals = pooled.get((gguf, "on"), [])

        # Without samples on both sides the delta would be a fabricated 0% or
        # -100%; never recommend a config change from that.
        if not off_vals or not on_vals:
            lines.append(f" # {model_id}: MTP {now} → ? (insufficient data)")
            lines.append(" # (no change)")
            lines.append("")
            continue

        off_med = median_of(off_vals)
        on_med = median_of(on_vals)
        delta = ((on_med - off_med) / off_med * 100) if off_med > 0 else 0

        # NOTE(review): this pools all prompt lengths into one median, while
        # write_summary judges each prompt length separately — confirm which
        # rule is intended; the two can disagree.
        should_mtp = delta >= 10
        lines.append(f" # {model_id}: MTP {now} → {'on' if should_mtp else 'off'} (delta {delta:+.1f}%)")

        if should_mtp and not is_mtp_now:
            lines.append(" # + --spec-type draft-mtp --spec-draft-n-max 2")
        elif not should_mtp and is_mtp_now:
            lines.append(" # - --spec-type draft-mtp --spec-draft-n-max 2")
        else:
            lines.append(" # (no change)")
        lines.append("")

    lines.append("```\n")
    reco = "\n".join(lines)
    # Explicit encoding: the text contains "→" and "—", which a cp1252 locale
    # default cannot encode ("→" raises UnicodeEncodeError there).
    RECO_PATH.write_text(reco, encoding="utf-8")
    print(f"Wrote {RECO_PATH}")
|
||||
|
||||
|
||||
def main() -> None:
    """Load all results and emit the CSV, the summary and the recommendations.

    Prints a notice and writes nothing when no result rows were loaded.
    """
    rows = load_all()
    if rows:
        # Order matters only for the console output sequence.
        for emit in (write_csv, write_summary, write_recommendations):
            emit(rows)
    else:
        print("No results found in", RESULTS_DIR)
|
||||
|
||||
|
||||
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user