llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts are resumable at the individual-question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
109
benchmarks/3d/analyze.py
Normal file
109
benchmarks/3d/analyze.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyze MTP n_max sweep results and produce summary.md."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Input (raw sweep measurements) and output (rendered Markdown report) both
# live in the same directory as this script.
RESULTS_PATH = Path(__file__).with_name("results.json")
SUMMARY_PATH = Path(__file__).with_name("summary.md")
|
||||
|
||||
|
||||
def load_results(path: "Path | None" = None) -> list[dict]:
    """Load sweep measurements and keep only the usable ones.

    Args:
        path: JSON file to read. When omitted, ``RESULTS_PATH`` is used.

    Returns:
        The entries whose ``eval_tok_s`` is not null and whose ``error`` is
        absent or null, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    source = RESULTS_PATH if path is None else path
    # "utf-8-sig" decodes plain UTF-8 and additionally strips a leading BOM.
    # Without it the decoding depends on the platform locale, and json.loads
    # refuses text that still starts with a BOM.
    data = json.loads(Path(source).read_text(encoding="utf-8-sig"))
    return [
        entry
        for entry in data
        if entry.get("eval_tok_s") is not None and entry.get("error") is None
    ]
|
||||
|
||||
|
||||
def _acceptance_suffix(rows_for_n: list[dict]) -> str:
    """Return ``" (accept NN%)"`` for one n_max setting, or ``""`` if no draft stats."""
    # Only rows that actually report a non-zero draft count contribute.
    draft_rows = [r for r in rows_for_n if r.get("draft_n")]
    # ``or 0`` treats an explicit JSON null like a missing key, so a null
    # ``draft_n_accepted`` no longer makes ``sum`` raise TypeError.
    total_draft = sum(r.get("draft_n") or 0 for r in draft_rows)
    total_accepted = sum(r.get("draft_n_accepted") or 0 for r in draft_rows)
    if total_draft > 0:
        accept_pct = (total_accepted / total_draft) * 100
        return f" (accept {accept_pct:.0f}%)"
    return ""


def _render_model_section(model: str, model_rows: list[dict]) -> tuple:
    """Render the Markdown section for one model.

    Args:
        model: Model name used as the section heading.
        model_rows: Measurements for this model; each needs the keys
            ``n_max``, ``prompt`` and ``eval_tok_s`` and may carry
            ``draft_n`` / ``draft_n_accepted``.

    Returns:
        ``(lines, recommendation)`` where ``lines`` is the list of Markdown
        lines and ``recommendation`` is ``(model, best_n, best_avg,
        improvement_pct)``, or ``None`` when there is no positive
        ``n_max == 0`` baseline to compare against.
    """
    n_max_values = sorted({r["n_max"] for r in model_rows})
    prompt_names = sorted({r["prompt"] for r in model_rows})

    # Index the rows once instead of rescanning them for every table cell.
    # ``setdefault`` keeps the first measurement seen per (n_max, prompt).
    first_by_cell: dict = {}
    rows_by_n: dict = {}
    for r in model_rows:
        first_by_cell.setdefault((r["n_max"], r["prompt"]), r)
        rows_by_n.setdefault(r["n_max"], []).append(r)

    header = "| n_max | " + " | ".join(f"{p} tok/s" for p in prompt_names) + " | avg tok/s | vs n_max=0 |"
    sep = "|-------|" + "|".join("-" * (len(p) + 7) for p in prompt_names) + "|-----------|------------|"
    lines = [f"\n## {model}\n", header, sep]

    baseline_avg = None
    best_avg = 0
    best_n = 0

    # n_max values are visited in ascending order, so a 0 baseline (if
    # present) is recorded before any positive setting is compared to it.
    for n in n_max_values:
        cells = []
        vals = []
        for p in prompt_names:
            hit = first_by_cell.get((n, p))
            if hit is None:
                cells.append("—")
                continue
            v = hit["eval_tok_s"]
            cells.append(f"{v:.1f}")
            vals.append(v)

        # The average covers only the prompts measured at this n_max.
        avg = sum(vals) / len(vals) if vals else 0

        if n == 0:
            baseline_avg = avg
            delta = "baseline"
        elif baseline_avg and baseline_avg > 0:
            pct = ((avg - baseline_avg) / baseline_avg) * 100
            delta = f"{pct:+.1f}%"
        else:
            delta = "—"

        # Strict ">" keeps the smallest n_max on ties.
        if avg > best_avg:
            best_avg = avg
            best_n = n

        draft_info = _acceptance_suffix(rows_by_n.get(n, []))
        lines.append(f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |")

    if baseline_avg and baseline_avg > 0 and best_avg > 0:
        improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
        lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n")
        return lines, (model, best_n, best_avg, improvement)

    lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")
    return lines, None


def _render_recommendations(recommendations: list) -> list[str]:
    """Render the per-model table of suggested ``llama_extra_args`` flags."""
    lines = [
        "\n---\n",
        "## Recommended `llama_extra_args` per model\n",
        "| Model | n_max | avg tok/s | vs baseline | suggested flags |",
        "|-------|-------|-----------|-------------|-----------------|",
    ]
    for model, n, avg, imp in recommendations:
        if n > 0:
            flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
        else:
            flags = "_(none — MTP not beneficial)_"
        lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")
    return lines


def _build_summary(rows: list[dict]) -> str:
    """Build the full Markdown report text from the valid measurement rows."""
    rows_by_model: dict = {}
    for r in rows:
        rows_by_model.setdefault(r["model"], []).append(r)
    models = sorted(rows_by_model)

    lines = [
        "# MTP n_max Sweep Results\n",
        f"**{len(rows)} valid measurements across {len(models)} models.**\n",
    ]

    recommendations = []
    for model in models:
        section, recommendation = _render_model_section(model, rows_by_model[model])
        lines.extend(section)
        if recommendation is not None:
            recommendations.append(recommendation)

    lines.extend(_render_recommendations(recommendations))
    lines.append("")
    return "\n".join(lines)


def main() -> None:
    """Analyze the sweep results, write ``SUMMARY_PATH`` and print the report.

    Prints a notice and returns without writing anything when
    ``load_results()`` yields no valid rows.
    """
    rows = load_results()
    if not rows:
        print("No valid results found.")
        return

    summary = _build_summary(rows)
    # The report contains non-ASCII characters (em dashes); write it as UTF-8
    # explicitly so the file does not depend on the platform locale.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {SUMMARY_PATH}")
|
||||
|
||||
|
||||
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user