llama-sidecar v0.1.0: daemon + benchmarks + eval suite

Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00
parent babbb4f39b
commit fe7f36ae98
39 changed files with 4228 additions and 0 deletions
--- a/benchmarks/3d/analyze.py
+++ b/benchmarks/3d/analyze.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Analyze MTP n_max sweep results and produce summary.md."""
+
+import json
+from pathlib import Path
+
+# Input: JSON results file expected next to this script.
+RESULTS_PATH = Path(__file__).parent / "results.json"
+# Output: Markdown summary written next to this script by main().
+SUMMARY_PATH = Path(__file__).parent / "summary.md"
+
+
+def load_results() -> list[dict]:
+    """Load the usable measurement rows from ``RESULTS_PATH``.
+
+    A row is usable when its ``eval_tok_s`` is not null and it carries no
+    ``error`` value.
+
+    Returns:
+        The usable rows in file order, or an empty list when the results
+        file does not exist yet.
+
+    Raises:
+        json.JSONDecodeError: If the results file is not valid JSON.
+    """
+    try:
+        raw = RESULTS_PATH.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        # Nothing has been recorded yet: let the caller report "no results"
+        # instead of dying with a traceback.
+        return []
+    data = json.loads(raw)
+    return [
+        r for r in data
+        if r.get("eval_tok_s") is not None and r.get("error") is None
+    ]
+
+
+def main() -> None:
+    """Summarize the sweep results as Markdown; write and print the summary.
+
+    Per model, emits a table with one row per ``n_max`` and one column per
+    prompt (eval tok/s), the row average, the change versus the ``n_max=0``
+    row and, when draft statistics are present, the draft acceptance rate.
+    The ``n_max`` with the highest average is reported as optimal.  Models
+    with a non-zero ``n_max=0`` baseline are also listed in a closing table
+    of suggested flags.  The result is written to ``SUMMARY_PATH``.
+    """
+    rows = load_results()
+    if not rows:
+        print("No valid results found.")
+        return
+
+    models = sorted(set(r["model"] for r in rows))
+    lines = ["# MTP n_max Sweep Results\n"]
+    lines.append(f"**{len(rows)} valid measurements across {len(models)} models.**\n")
+
+    # Entries are (model, best n_max, best avg tok/s, % change vs baseline).
+    recommendations = []
+
+    for model in models:
+        model_rows = [r for r in rows if r["model"] == model]
+        n_max_values = sorted(set(r["n_max"] for r in model_rows))
+        prompt_names = sorted(set(r["prompt"] for r in model_rows))
+
+        lines.append(f"\n## {model}\n")
+
+        header = "| n_max | " + " | ".join(f"{p} tok/s" for p in prompt_names) + " | avg tok/s | vs n_max=0 |"
+        sep = "|-------|" + "|".join("-" * (len(p) + 7) for p in prompt_names) + "|-----------|------------|"
+        lines.append(header)
+        lines.append(sep)
+
+        baseline_avg = None
+        best_avg = 0
+        best_n = 0
+
+        # n_max_values is sorted, so n == 0 (when present) is visited first
+        # and its average is available as the baseline for the later rows.
+        for n in n_max_values:
+            cells = []
+            vals = []
+            for p in prompt_names:
+                matching = [r for r in model_rows if r["n_max"] == n and r["prompt"] == p]
+                if matching:
+                    # If a combination was recorded more than once, the first
+                    # row in file order is the one reported.
+                    v = matching[0]["eval_tok_s"]
+                    cells.append(f"{v:.1f}")
+                    vals.append(v)
+                else:
+                    cells.append("—")
+
+            avg = sum(vals) / len(vals) if vals else 0
+            if n == 0:
+                baseline_avg = avg
+                delta = "baseline"
+            elif baseline_avg and baseline_avg > 0:
+                pct = ((avg - baseline_avg) / baseline_avg) * 100
+                delta = f"{pct:+.1f}%"
+            else:
+                delta = "—"
+
+            if avg > best_avg:
+                best_avg = avg
+                best_n = n
+
+            draft_info = ""
+            draft_rows = [r for r in model_rows if r["n_max"] == n and r.get("draft_n")]
+            if draft_rows:
+                # A key can be present with a null value, in which case
+                # ``.get(key, 0)`` returns None and sum() raises TypeError;
+                # ``or 0`` covers both the missing and the null case.
+                total_draft = sum(r.get("draft_n") or 0 for r in draft_rows)
+                total_accepted = sum(r.get("draft_n_accepted") or 0 for r in draft_rows)
+                if total_draft > 0:
+                    accept_pct = (total_accepted / total_draft) * 100
+                    draft_info = f" (accept {accept_pct:.0f}%)"
+
+            row_str = f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |"
+            lines.append(row_str)
+
+        if baseline_avg and baseline_avg > 0 and best_avg > 0:
+            improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
+            lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n")
+            recommendations.append((model, best_n, best_avg, improvement))
+        else:
+            # No usable baseline: report the best row but make no
+            # recommendation for this model.
+            lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")
+
+    # Recommendations section
+    lines.append("\n---\n")
+    lines.append("## Recommended `llama_extra_args` per model\n")
+    lines.append("| Model | n_max | avg tok/s | vs baseline | suggested flags |")
+    lines.append("|-------|-------|-----------|-------------|-----------------|")
+    for model, n, avg, imp in recommendations:
+        if n > 0:
+            flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
+        else:
+            flags = "_(none — MTP not beneficial)_"
+        lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")
+
+    lines.append("")
+    summary = "\n".join(lines)
+    # The summary contains non-ASCII characters ("—"), so pin the encoding
+    # rather than depending on the platform default.
+    SUMMARY_PATH.write_text(summary, encoding="utf-8")
+    print(summary)
+    print(f"\nWritten to: {SUMMARY_PATH}")
+
+
+# Run the analysis only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/benchmarks/3d/run_sweep.py
+++ b/benchmarks/3d/run_sweep.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""MTP n_max sweep across MTP-capable models via llama-sidecar.
+
+Usage:
+    python3 run_sweep.py             # full sweep
+    python3 run_sweep.py --dry-run   # print matrix, no API calls
+    python3 run_sweep.py --limit 1   # run first combo only (smoke)
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.request import Request, urlopen
+from urllib.error import URLError, HTTPError
+
+# Base URL of the sidecar; the SIDECAR_URL environment variable overrides
+# the built-in default address.
+SIDECAR_URL = os.environ.get("SIDECAR_URL", "http://100.101.41.16:8402")
+# Results are accumulated in a JSON file next to this script.
+RESULTS_PATH = Path(__file__).parent / "results.json"
+
+# Sweep matrix: (model id, n_max values to try).  Each (model, n_max) pair
+# is one combo; n_max == 0 produces no MTP flags (see build_flags).
+MATRIX = [
+    ("qwen3.6-35b-a3b-mxfp4", [0, 1, 2, 3]),
+    ("qwen3.6-27b-mtp",        [0, 1, 2, 3, 4]),
+    ("qwopus3.6-27b-v2-mtp",   [0, 2]),
+    ("qwopus3.5-9b-coder-mtp", [0, 2]),
+]
+
+# Benchmark prompts keyed by name.  "content" is sent as the single user
+# message and "max_tokens" as the completion limit of the request.
+PROMPTS = {
+    "short": {
+        "content": "Reply with exactly five words: a haiku-like greeting.",
+        "max_tokens": 100,
+    },
+    "medium": {
+        "content": (
+            "Explain how multi-token prediction speculative decoding works in transformer "
+            "inference. Cover: 1) the draft model role, 2) the verification mechanism, "
+            "3) acceptance rate dynamics, 4) why MoE models gain less than dense models. "
+            "Aim for 400-500 words."
+        ),
+        "max_tokens": 700,
+    },
+    "long": {
+        "content": (
+            "Write a complete Python implementation of a simple HTTP server that "
+            "accepts POST requests on /v1/chat/completions, validates JSON bodies "
+            "against a basic OpenAI schema, logs each request to stdout in JSON "
+            "format, and returns a hardcoded streaming response. Include error "
+            "handling for malformed JSON, missing required fields, and unsupported "
+            "methods. Add docstrings and type hints throughout. Show full file."
+        ),
+        "max_tokens": 2500,
+    },
+}
+
+
+def build_flags(n_max: int) -> str:
+    """Build the flag string for one ``n_max`` value of the sweep.
+
+    ``--repeat-penalty 1.0`` is always present; the ``--spec-type`` and
+    ``--spec-draft-n-max`` flags are put in front of it only when ``n_max``
+    is positive.
+    """
+    base = "--repeat-penalty 1.0"
+    mtp = f"--spec-type draft-mtp --spec-draft-n-max {n_max} " if n_max > 0 else ""
+    return mtp + base
+
+
+def sidecar_request(method: str, path: str, body: dict | None = None,
+                    headers: dict | None = None, timeout: int = 180) -> dict | None:
+    """Send one JSON request to the sidecar and return the decoded reply.
+
+    Transport and decoding problems are reported as a dict with an
+    ``"error"`` key instead of being raised, so that one failed call cannot
+    abort a long-running sweep.
+
+    Args:
+        method: HTTP method, e.g. ``"GET"``, ``"POST"`` or ``"DELETE"``.
+        path: Path appended to ``SIDECAR_URL``.
+        body: JSON-serialisable payload; ``None`` sends no request body.
+        headers: Extra headers merged over the default ``Content-Type``.
+        timeout: Timeout in seconds handed to ``urlopen``.
+
+    Returns:
+        The decoded JSON reply (a list for endpoints that answer with a JSON
+        array), the decoded JSON error body of an HTTP error status, an
+        ``{"error": ...}`` dict describing the failure, or ``None`` when the
+        reply body is empty or JSON ``null``.
+    """
+    url = f"{SIDECAR_URL}{path}"
+    # ``is not None`` so that an explicit empty payload ({}) is still sent.
+    data = json.dumps(body).encode() if body is not None else None
+    hdrs = {"Content-Type": "application/json"}
+    if headers:
+        hdrs.update(headers)
+    req = Request(url, data=data, headers=hdrs, method=method)
+    try:
+        with urlopen(req, timeout=timeout) as resp:
+            raw = resp.read()
+    except HTTPError as e:
+        body_text = e.read().decode(errors="replace")
+        try:
+            return json.loads(body_text)
+        except json.JSONDecodeError:
+            return {"error": f"HTTP {e.code}", "body": body_text[:500]}
+    except URLError as e:
+        return {"error": str(e)}
+    except OSError as e:
+        # Timeouts while waiting for or reading the response, and dropped
+        # connections, surface as plain OSError subclasses rather than
+        # URLError, so they need their own handler.
+        return {"error": f"{type(e).__name__}: {e}"}
+
+    if not raw.strip():
+        # Successful status without a body: nothing to decode.
+        return None
+    try:
+        return json.loads(raw)
+    except ValueError:
+        # Covers json.JSONDecodeError and undecodable bytes.
+        return {"error": "invalid JSON in response",
+                "body": raw.decode(errors="replace")[:500]}
+
+
+def send_completion(model: str, flags: str, prompt: str, max_tokens: int) -> dict:
+    """POST one non-streaming chat completion and time the round trip.
+
+    ``flags`` is sent in the ``X-Agent-Flags`` header and ``model`` in
+    ``X-Model-Id``.  The returned dict is the sidecar's reply, or a stand-in
+    error dict when there is no reply, with the elapsed wall time added
+    under ``wall_clock_ms``.
+    """
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "stream": False,
+    }
+    extra_headers = {"X-Agent-Flags": flags, "X-Model-Id": model}
+
+    started = time.perf_counter()
+    reply = sidecar_request(
+        "POST",
+        "/v1/chat/completions",
+        body=payload,
+        headers=extra_headers,
+    )
+    elapsed_ms = (time.perf_counter() - started) * 1000
+
+    if reply is None:
+        return {"error": "no response", "wall_clock_ms": elapsed_ms}
+    reply["wall_clock_ms"] = elapsed_ms
+    return reply
+
+
+def extract_metrics(resp: dict, model: str, n_max: int, prompt_name: str) -> dict:
+    """Flatten one completion reply into a result row.
+
+    Token counts come from the reply's ``usage`` object and throughput and
+    draft statistics from its ``timings`` object; fields that are absent
+    become ``None``.  The sidecar listing is queried as well, and the hash
+    and port of the first entry whose ``model_id`` equals ``model`` are
+    recorded (empty string / 0 when there is no such entry).
+    """
+    timings = resp.get("timings", {})
+    usage = resp.get("usage", {})
+
+    listing = sidecar_request("GET", "/sidecars") or []
+    entries = listing if isinstance(listing, list) else []
+    owner = next((s for s in entries if s.get("model_id") == model), None)
+    if owner is None:
+        sidecar_hash, sidecar_port = "", 0
+    else:
+        sidecar_hash = owner.get("hash", "")
+        sidecar_port = owner.get("port", 0)
+
+    row = {
+        "model": model,
+        "n_max": n_max,
+        "prompt": prompt_name,
+        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
+        "completion_tokens": usage.get("completion_tokens"),
+        "prompt_tokens": usage.get("prompt_tokens"),
+        "eval_tok_s": timings.get("predicted_per_second"),
+        "prompt_tok_s": timings.get("prompt_per_second"),
+        "eval_ms": timings.get("predicted_ms"),
+        "prompt_ms": timings.get("prompt_ms"),
+        "draft_n": timings.get("draft_n"),
+        "draft_n_accepted": timings.get("draft_n_accepted"),
+        "wall_clock_ms": resp.get("wall_clock_ms"),
+        "sidecar_hash": sidecar_hash,
+        "sidecar_port": sidecar_port,
+        "error": resp.get("error"),
+    }
+    return row
+
+
+def append_result(row: dict) -> None:
+    """Append ``row`` to the JSON array stored at ``RESULTS_PATH``.
+
+    The new content is written to a temporary sibling file which is then
+    renamed over the results file, so an interrupted run does not leave a
+    half-written results file behind.  An existing file that cannot be read
+    as a JSON list is moved aside to ``<name>.corrupt`` (with a warning on
+    stderr) instead of being silently overwritten.
+    """
+    results: list = []
+    if RESULTS_PATH.exists():
+        problem = None
+        try:
+            loaded = json.loads(RESULTS_PATH.read_text())
+        except (ValueError, OSError) as exc:
+            # ValueError covers json.JSONDecodeError and decoding errors.
+            loaded, problem = None, str(exc)
+        if isinstance(loaded, list):
+            results = loaded
+        else:
+            if problem is None:
+                problem = "top-level JSON value is not a list"
+            backup = RESULTS_PATH.with_name(RESULTS_PATH.name + ".corrupt")
+            print(f"warning: {RESULTS_PATH} is unusable ({problem}); "
+                  f"moving it to {backup}", file=sys.stderr)
+            try:
+                RESULTS_PATH.replace(backup)
+            except OSError as exc:
+                # Still best effort: recording the new row matters more than
+                # preserving the unreadable file.
+                print(f"warning: could not move {RESULTS_PATH}: {exc}",
+                      file=sys.stderr)
+    results.append(row)
+    tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
+    tmp_path.write_text(json.dumps(results, indent=2) + "\n")
+    tmp_path.replace(RESULTS_PATH)
+
+
+def evict_all_sidecars() -> None:
+    """Issue a DELETE for every listed sidecar that has a non-empty hash.
+
+    Does nothing when the ``/sidecars`` listing is not a list (for example
+    when the request returned an error dict).
+    """
+    listing = sidecar_request("GET", "/sidecars")
+    if isinstance(listing, list):
+        for entry in listing:
+            digest = entry.get("hash", "")
+            if not digest:
+                continue
+            sidecar_request("DELETE", f"/sidecars/{digest}")
+
+
+def run_combo(model: str, n_max: int, combo_idx: int, total_combos: int,
+              prompt_names: list[str]) -> None:
+    """Benchmark one (model, n_max) combo over the given prompts.
+
+    Each prompt is sent twice: an unrecorded warmup call, then, after a
+    2 second pause, the call whose metrics are appended to the results
+    file.  When all prompts are done every sidecar is evicted and the
+    function sleeps 5 seconds before returning.
+    """
+    flags = build_flags(n_max)
+    rule = "=" * 60
+    print(f"\n{rule}")
+    print(f"[{combo_idx}/{total_combos}] {model} n_max={n_max}")
+    print(f"  flags: {flags}")
+    print(rule)
+
+    for pname in prompt_names:
+        spec = PROMPTS[pname]
+
+        # Warmup call: its response is discarded.
+        print(f"  {pname}: warmup...", end="", flush=True)
+        send_completion(model, flags, spec["content"], spec["max_tokens"])
+        print(" done.", flush=True)
+        time.sleep(2)
+
+        # Recorded call: persisted before anything is reported.
+        print(f"  {pname}: recording...", end="", flush=True)
+        reply = send_completion(model, flags, spec["content"], spec["max_tokens"])
+        row = extract_metrics(reply, model, n_max, pname)
+        append_result(row)
+
+        failure = row.get("error")
+        rate = row.get("eval_tok_s")
+        if failure:
+            status = f" ERROR: {failure}"
+        elif rate:
+            drafted = row.get("draft_n")
+            suffix = f" draft_n={drafted}" if drafted else ""
+            status = f" {rate:.1f} tok/s{suffix}"
+        else:
+            status = " (no timings in response)"
+        print(status)
+
+    # Evict the sidecars to free VRAM before the next combo.
+    evict_all_sidecars()
+    print("  evicted sidecars, sleeping 5s for VRAM release...")
+    time.sleep(5)
+
+
+def dry_run() -> None:
+    """Print the sweep matrix and the planned call count; make no API calls.
+
+    The prompt count is taken from ``PROMPTS`` so the printed total stays
+    correct when prompts are added or removed.
+    """
+    combos = [(model, n) for model, ns in MATRIX for n in ns]
+    n_prompts = len(PROMPTS)
+    calls_per_prompt = 2  # one warmup call plus one recorded call
+    total_calls = len(combos) * n_prompts * calls_per_prompt
+    print(f"Dry run: {len(combos)} combos × {n_prompts} prompts × {calls_per_prompt} calls = {total_calls} API calls")
+    print("Estimated runtime: 60-90 minutes\n")
+    for i, (model, n_max) in enumerate(combos, 1):
+        flags = build_flags(n_max)
+        print(f"  [{i}/{len(combos)}] {model} n_max={n_max}")
+        print(f"    flags: {flags}")
+        for pname, p in PROMPTS.items():
+            print(f"    {pname}: max_tokens={p['max_tokens']}")
+    print(f"\nResults would be written to: {RESULTS_PATH}")
+
+
+def main() -> None:
+    """Command-line entry point for the sweep.
+
+    ``--dry-run`` prints the matrix and returns.  Otherwise the sidecar's
+    ``/health`` endpoint must report status ``ok`` (else exit code 1), the
+    existing sidecars are evicted, and every combo of ``MATRIX`` (or only
+    the first ``--limit`` combos when that is positive) is run in order.
+    """
+    parser = argparse.ArgumentParser(description="MTP n_max sweep benchmark")
+    parser.add_argument("--dry-run", action="store_true", help="Print matrix without running")
+    parser.add_argument("--limit", type=int, default=0, help="Run only first N combos")
+    args = parser.parse_args()
+
+    if args.dry_run:
+        dry_run()
+        return
+
+    # Refuse to start against a sidecar that does not report itself healthy.
+    health = sidecar_request("GET", "/health")
+    is_healthy = bool(health) and health.get("status") == "ok"
+    if not is_healthy:
+        print(f"Sidecar unhealthy: {health}", file=sys.stderr)
+        sys.exit(1)
+    print(f"Sidecar healthy: {health}")
+
+    # Start from a clean slate: drop whatever sidecars already exist.
+    evict_all_sidecars()
+
+    combos = []
+    for model, ns in MATRIX:
+        for n in ns:
+            combos.append((model, n))
+    if args.limit > 0:
+        combos = combos[:args.limit]
+    prompt_names = list(PROMPTS)
+    total = len(combos)
+
+    t_start = time.perf_counter()
+    for idx, (model, n_max) in enumerate(combos, start=1):
+        run_combo(model, n_max, idx, total, prompt_names)
+    minutes = (time.perf_counter() - t_start) / 60
+
+    print(f"\nSweep complete. {total} combos in {minutes:.1f} minutes.")
+    print(f"Results: {RESULTS_PATH}")
+
+
+# Run the sweep only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()