Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx-decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
110 lines
3.9 KiB
Python
110 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Analyze MTP n_max sweep results and produce summary.md."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Input and output files are siblings of this script: each path is built by
# swapping the script's own file name for the target file name.
RESULTS_PATH = Path(__file__).with_name("results.json")
SUMMARY_PATH = Path(__file__).with_name("summary.md")
|
|
|
|
|
|
def load_results(path: "Path | str | None" = None) -> list[dict]:
    """Load sweep measurements and keep only the usable ones.

    Args:
        path: JSON file to read. Defaults to ``RESULTS_PATH`` when omitted.

    Returns:
        The entries whose ``eval_tok_s`` is present (not ``None``) and whose
        ``error`` field is absent or ``None``, in file order.

    Raises:
        FileNotFoundError: the results file does not exist.
        json.JSONDecodeError: the file is not valid JSON.
    """
    source = RESULTS_PATH if path is None else Path(path)
    # JSON is UTF-8 by specification, so do not depend on the locale encoding.
    # "utf-8-sig" additionally strips a leading BOM (commonly emitted by
    # Windows tooling), which json.loads would otherwise reject.
    data = json.loads(source.read_text(encoding="utf-8-sig"))
    return [
        entry
        for entry in data
        if entry.get("eval_tok_s") is not None and entry.get("error") is None
    ]
|
|
|
|
|
|
def main(rows: "list[dict] | None" = None, summary_path: "Path | str | None" = None) -> None:
    """Render the sweep results as Markdown, write the file, and echo it.

    Args:
        rows: Measurement dicts to summarise. Each needs ``model``, ``n_max``,
            ``prompt`` and ``eval_tok_s``; ``draft_n`` / ``draft_n_accepted``
            are optional. Defaults to ``load_results()`` when omitted.
        summary_path: Destination Markdown file. Defaults to ``SUMMARY_PATH``.

    Prints a notice and returns without writing anything when there are no rows.
    """
    if rows is None:
        rows = load_results()
    if not rows:
        print("No valid results found.")
        return

    out_path = SUMMARY_PATH if summary_path is None else Path(summary_path)
    summary = _build_summary(rows)
    # The report contains non-ASCII characters ("—"), so pin the encoding
    # instead of relying on the platform's locale default.
    out_path.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {out_path}")


def _build_summary(rows: list[dict]) -> str:
    """Return the full Markdown report: one table per model plus recommendations."""
    # Group once instead of re-scanning every row for every model.
    by_model: dict = {}
    for r in rows:
        by_model.setdefault(r["model"], []).append(r)
    models = sorted(by_model)

    lines = [
        "# MTP n_max Sweep Results\n",
        f"**{len(rows)} valid measurements across {len(models)} models.**\n",
    ]

    recommendations = []
    for model in models:
        section, recommendation = _render_model(model, by_model[model])
        lines.extend(section)
        if recommendation is not None:
            recommendations.append(recommendation)

    # Recommendations section (only models that had an n_max=0 baseline).
    lines.append("\n---\n")
    lines.append("## Recommended `llama_extra_args` per model\n")
    lines.append("| Model | n_max | avg tok/s | vs baseline | suggested flags |")
    lines.append("|-------|-------|-----------|-------------|-----------------|")
    for model, n, avg, imp in recommendations:
        if n > 0:
            flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
        else:
            flags = "_(none — MTP not beneficial)_"
        lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")

    lines.append("")
    return "\n".join(lines)


def _render_model(model, model_rows: list[dict]):
    """Render one model's table.

    Returns ``(markdown_lines, recommendation)`` where ``recommendation`` is
    ``(model, best_n, best_avg, improvement_pct)``, or ``None`` when the model
    has no positive n_max=0 baseline to compare against.
    """
    n_max_values = sorted({r["n_max"] for r in model_rows})
    prompt_names = sorted({r["prompt"] for r in model_rows})

    # Index rows once. For a repeated (n_max, prompt) pair the first
    # measurement in input order is the one reported.
    cell_value: dict = {}
    rows_by_n: dict = {}
    for r in model_rows:
        cell_value.setdefault((r["n_max"], r["prompt"]), r["eval_tok_s"])
        rows_by_n.setdefault(r["n_max"], []).append(r)

    out = [f"\n## {model}\n"]
    out.append("| n_max | " + " | ".join(f"{p} tok/s" for p in prompt_names) + " | avg tok/s | vs n_max=0 |")
    out.append("|-------|" + "|".join("-" * (len(p) + 7) for p in prompt_names) + "|-----------|------------|")

    baseline_avg = None
    best_avg = 0
    best_n = None  # None until a measured n_max is seen, so an unmeasured 0 is never reported

    for n in n_max_values:
        cells = []
        vals = []
        for p in prompt_names:
            if (n, p) in cell_value:
                v = cell_value[(n, p)]
                cells.append(f"{v:.1f}")
                vals.append(v)
            else:
                cells.append("—")

        avg = sum(vals) / len(vals) if vals else 0
        # n_max_values is sorted, so n == 0 (when present) is visited first and
        # the baseline is available for every later row.
        if n == 0:
            baseline_avg = avg
            delta = "baseline"
        elif baseline_avg and baseline_avg > 0:
            pct = ((avg - baseline_avg) / baseline_avg) * 100
            delta = f"{pct:+.1f}%"
        else:
            delta = "—"

        # Strict ">" keeps the smallest n_max on ties.
        if best_n is None or avg > best_avg:
            best_avg = avg
            best_n = n

        draft_info = _accept_suffix(rows_by_n[n])
        out.append(f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |")

    if baseline_avg and baseline_avg > 0 and best_avg > 0:
        improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
        out.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n")
        return out, (model, best_n, best_avg, improvement)

    out.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")
    return out, None


def _accept_suffix(rows_at_n: list[dict]) -> str:
    """Return ``" (accept NN%)"`` pooled over rows with draft stats, else ``""``."""
    total_draft = 0
    total_accepted = 0
    for r in rows_at_n:
        if r.get("draft_n"):
            total_draft += r["draft_n"]
            # The key may be present with a null value; count that as zero
            # accepted tokens rather than crashing on None + int.
            total_accepted += r.get("draft_n_accepted") or 0
    if total_draft > 0:
        accept_pct = (total_accepted / total_draft) * 100
        return f" (accept {accept_pct:.0f}%)"
    return ""
|
|
|
|
|
|
# Script entry point: run the analysis only when executed directly, not on import.
if __name__ == "__main__":
    # main() returns None, so this exits with status 0 after a successful run.
    raise SystemExit(main())
|