llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
109
benchmarks/3d/analyze.py
Normal file
109
benchmarks/3d/analyze.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyze MTP n_max sweep results and produce summary.md."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
RESULTS_PATH = Path(__file__).parent / "results.json"
|
||||
SUMMARY_PATH = Path(__file__).parent / "summary.md"
|
||||
|
||||
|
||||
def load_results() -> list[dict]:
    """Read RESULTS_PATH and return only the rows usable for analysis.

    A row is kept when its ``eval_tok_s`` value is not None and its
    ``error`` value is None (or absent).
    """
    usable = []
    for record in json.loads(RESULTS_PATH.read_text()):
        if record.get("eval_tok_s") is None:
            continue
        if record.get("error") is not None:
            continue
        usable.append(record)
    return usable
|
||||
|
||||
|
||||
def _accept_suffix(rows: list[dict]) -> str:
    """Return ``" (accept NN%)"`` for rows reporting draft tokens, else ``""``."""
    drafted = [r for r in rows if r.get("draft_n")]
    total_draft = sum(r["draft_n"] for r in drafted)
    # ``draft_n_accepted`` may be stored as an explicit null; ``get(key, 0)``
    # would return None in that case and make ``sum`` raise TypeError.
    total_accepted = sum(r.get("draft_n_accepted") or 0 for r in drafted)
    if total_draft > 0:
        accept_pct = (total_accepted / total_draft) * 100
        return f" (accept {accept_pct:.0f}%)"
    return ""


def _model_section(model: str, model_rows: list[dict]) -> tuple:
    """Render the markdown section for one model.

    Returns ``(lines, recommendation)``.  ``recommendation`` is the tuple
    ``(model, best_n, best_avg, improvement_pct)`` when a positive n_max=0
    baseline average exists, otherwise None.
    """
    n_max_values = sorted({r["n_max"] for r in model_rows})
    prompt_names = sorted({r["prompt"] for r in model_rows})

    out = [f"\n## {model}\n"]
    out.append("| n_max | " + " | ".join(f"{p} tok/s" for p in prompt_names) + " | avg tok/s | vs n_max=0 |")
    out.append("|-------|" + "|".join("-" * (len(p) + 7) for p in prompt_names) + "|-----------|------------|")

    # Index the rows once instead of rescanning the list for every cell.
    # setdefault keeps the earliest row per (n_max, prompt), i.e. the same
    # row a first-match scan over model_rows would pick.
    by_cell: dict = {}
    by_n: dict = {}
    for r in model_rows:
        by_cell.setdefault((r["n_max"], r["prompt"]), r)
        by_n.setdefault(r["n_max"], []).append(r)

    baseline_avg = None
    best_avg = 0
    best_n = 0

    for n in n_max_values:
        cells = []
        vals = []
        for p in prompt_names:
            hit = by_cell.get((n, p))
            if hit is None:
                cells.append("—")
                continue
            v = hit["eval_tok_s"]
            cells.append(f"{v:.1f}")
            vals.append(v)

        # The average covers only the prompts actually measured for this n_max.
        avg = sum(vals) / len(vals) if vals else 0
        if n == 0:
            baseline_avg = avg
            delta = "baseline"
        elif baseline_avg and baseline_avg > 0:
            pct = ((avg - baseline_avg) / baseline_avg) * 100
            delta = f"{pct:+.1f}%"
        else:
            delta = "—"

        if avg > best_avg:
            best_avg = avg
            best_n = n

        draft_info = _accept_suffix(by_n[n])
        out.append(f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |")

    if baseline_avg and baseline_avg > 0 and best_avg > 0:
        improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
        out.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n")
        return out, (model, best_n, best_avg, improvement)

    out.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")
    return out, None


def main() -> None:
    """Build the markdown sweep summary, write it to SUMMARY_PATH and print it.

    For every model the report holds one table row per n_max value (one
    tok/s column per prompt, the average, and the change against n_max=0),
    followed by the best n_max.  A closing table lists suggested flags for
    every model that had a usable baseline.  Prints a notice and returns
    without writing anything when there are no valid rows.
    """
    rows = load_results()
    if not rows:
        print("No valid results found.")
        return

    models = sorted({r["model"] for r in rows})
    lines = ["# MTP n_max Sweep Results\n"]
    lines.append(f"**{len(rows)} valid measurements across {len(models)} models.**\n")

    recommendations = []
    for model in models:
        section, rec = _model_section(model, [r for r in rows if r["model"] == model])
        lines.extend(section)
        if rec is not None:
            recommendations.append(rec)

    # Recommendations section
    lines.append("\n---\n")
    lines.append("## Recommended `llama_extra_args` per model\n")
    lines.append("| Model | n_max | avg tok/s | vs baseline | suggested flags |")
    lines.append("|-------|-------|-----------|-------------|-----------------|")
    for model, n, avg, imp in recommendations:
        if n > 0:
            flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
        else:
            flags = "_(none — MTP not beneficial)_"
        lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")

    lines.append("")
    summary = "\n".join(lines)
    # The report contains em dashes; pin UTF-8 so the file's encoding does
    # not depend on the platform's locale default.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {SUMMARY_PATH}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
248
benchmarks/3d/run_sweep.py
Normal file
248
benchmarks/3d/run_sweep.py
Normal file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python3
|
||||
"""MTP n_max sweep across MTP-capable models via llama-sidecar.
|
||||
|
||||
Usage:
|
||||
python3 run_sweep.py # full sweep
|
||||
python3 run_sweep.py --dry-run # print matrix, no API calls
|
||||
python3 run_sweep.py --limit 1 # run first combo only (smoke)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError, HTTPError
|
||||
|
||||
SIDECAR_URL = os.environ.get("SIDECAR_URL", "http://100.101.41.16:8402")
|
||||
RESULTS_PATH = Path(__file__).parent / "results.json"
|
||||
|
||||
MATRIX = [
|
||||
("qwen3.6-35b-a3b-mxfp4", [0, 1, 2, 3]),
|
||||
("qwen3.6-27b-mtp", [0, 1, 2, 3, 4]),
|
||||
("qwopus3.6-27b-v2-mtp", [0, 2]),
|
||||
("qwopus3.5-9b-coder-mtp", [0, 2]),
|
||||
]
|
||||
|
||||
PROMPTS = {
|
||||
"short": {
|
||||
"content": "Reply with exactly five words: a haiku-like greeting.",
|
||||
"max_tokens": 100,
|
||||
},
|
||||
"medium": {
|
||||
"content": (
|
||||
"Explain how multi-token prediction speculative decoding works in transformer "
|
||||
"inference. Cover: 1) the draft model role, 2) the verification mechanism, "
|
||||
"3) acceptance rate dynamics, 4) why MoE models gain less than dense models. "
|
||||
"Aim for 400-500 words."
|
||||
),
|
||||
"max_tokens": 700,
|
||||
},
|
||||
"long": {
|
||||
"content": (
|
||||
"Write a complete Python implementation of a simple HTTP server that "
|
||||
"accepts POST requests on /v1/chat/completions, validates JSON bodies "
|
||||
"against a basic OpenAI schema, logs each request to stdout in JSON "
|
||||
"format, and returns a hardcoded streaming response. Include error "
|
||||
"handling for malformed JSON, missing required fields, and unsupported "
|
||||
"methods. Add docstrings and type hints throughout. Show full file."
|
||||
),
|
||||
"max_tokens": 2500,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def build_flags(n_max: int) -> str:
    """Return the flag string for one sweep point.

    The repeat-penalty setting is always present; the MTP speculative
    decoding flags are prepended only when ``n_max`` is positive.
    """
    parts = []
    if n_max > 0:
        parts += ["--spec-type", "draft-mtp", "--spec-draft-n-max", str(n_max)]
    parts += ["--repeat-penalty", "1.0"]
    return " ".join(parts)
|
||||
|
||||
|
||||
def sidecar_request(method: str, path: str, body: dict | None = None,
                    headers: dict | None = None, timeout: int = 180) -> dict | None:
    """Send one JSON request to the sidecar and return the decoded reply.

    Args:
        method: HTTP method name.
        path: Path appended to SIDECAR_URL.
        body: Optional JSON payload; only ``None`` means "no request body".
        headers: Extra headers merged over the default Content-Type.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON value of the response.  For an HTTP error status
        the decoded error body is returned, or ``{"error", "body"}`` when
        that body is not JSON.  Transport failures and undecodable success
        bodies are reported as a dict with an ``"error"`` key instead of
        raising, so one failed call does not abort the caller.
    """
    # Local import: http.client is not among this file's top-level imports.
    from http.client import HTTPException

    url = f"{SIDECAR_URL}{path}"
    # An empty dict is still a valid JSON payload, so test against None.
    data = json.dumps(body).encode() if body is not None else None
    hdrs = {"Content-Type": "application/json"}
    if headers:
        hdrs.update(headers)
    req = Request(url, data=data, headers=hdrs, method=method)

    try:
        with urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except HTTPError as e:
        body_text = e.read().decode(errors="replace")
        try:
            return json.loads(body_text)
        except json.JSONDecodeError:
            return {"error": f"HTTP {e.code}", "body": body_text[:500]}
    except (OSError, HTTPException) as e:
        # URLError is an OSError subclass.  Timeouts and connection resets
        # that occur while waiting for or reading the response are raised as
        # plain OSError subclasses rather than URLError, so catch the base.
        return {"error": str(e) or type(e).__name__}

    try:
        return json.loads(raw)
    except ValueError:
        # Covers JSONDecodeError and UnicodeDecodeError on a non-JSON body.
        return {"error": "invalid JSON in response",
                "body": raw[:500].decode(errors="replace")}
|
||||
|
||||
|
||||
def send_completion(model: str, flags: str, prompt: str, max_tokens: int) -> dict:
    """POST one non-streaming chat completion and time it.

    Args:
        model: Sent both as the request's ``model`` and as ``X-Model-Id``.
        flags: Flag string sent in the ``X-Agent-Flags`` header.
        prompt: Content of the single user message.
        max_tokens: Generation cap forwarded in the request body.

    Returns:
        The response dict with ``wall_clock_ms`` (elapsed time of the call
        in milliseconds) added.  When the reply is missing or is not a JSON
        object, a dict with ``error`` and ``wall_clock_ms`` is returned
        instead.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "stream": False,
    }
    headers = {
        "X-Agent-Flags": flags,
        "X-Model-Id": model,
    }
    t0 = time.perf_counter()
    resp = sidecar_request("POST", "/v1/chat/completions", body=body, headers=headers)
    wall_ms = (time.perf_counter() - t0) * 1000
    if resp is None:
        return {"error": "no response", "wall_clock_ms": wall_ms}
    if not isinstance(resp, dict):
        # A JSON list/string/number cannot take the timing key below and
        # would raise TypeError; report it as a failed call instead.
        return {
            "error": f"unexpected response type: {type(resp).__name__}",
            "wall_clock_ms": wall_ms,
        }
    resp["wall_clock_ms"] = wall_ms
    return resp
|
||||
|
||||
|
||||
def extract_metrics(resp: dict, model: str, n_max: int, prompt_name: str) -> dict:
    """Flatten one completion response into a result row.

    Reads throughput and draft counters from ``resp["timings"]``, token
    counts from ``resp["usage"]``, and queries ``GET /sidecars`` to record
    the hash and port of the first listed sidecar whose ``model_id``
    equals ``model``.

    Args:
        resp: Response dict as returned by ``send_completion``.
        model: Model identifier for the row and for the sidecar lookup.
        n_max: Sweep value stored in the row.
        prompt_name: Prompt label stored in the row.

    Returns:
        A flat dict; metrics absent from the response are stored as None,
        and the sidecar hash/port default to ``""`` / ``0``.
    """
    # ``or {}`` also covers a key that is present but null, which
    # ``get(key, {})`` would return as None and then fail on ``.get``.
    timings = resp.get("timings") or {}
    usage = resp.get("usage") or {}

    sidecars = sidecar_request("GET", "/sidecars") or []
    sidecar_hash = ""
    sidecar_port = 0
    if isinstance(sidecars, list):
        for s in sidecars:
            # Skip malformed entries instead of raising on ``.get``.
            if isinstance(s, dict) and s.get("model_id") == model:
                sidecar_hash = s.get("hash", "")
                sidecar_port = s.get("port", 0)
                break

    return {
        "model": model,
        "n_max": n_max,
        "prompt": prompt_name,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "completion_tokens": usage.get("completion_tokens"),
        "prompt_tokens": usage.get("prompt_tokens"),
        "eval_tok_s": timings.get("predicted_per_second"),
        "prompt_tok_s": timings.get("prompt_per_second"),
        "eval_ms": timings.get("predicted_ms"),
        "prompt_ms": timings.get("prompt_ms"),
        "draft_n": timings.get("draft_n"),
        "draft_n_accepted": timings.get("draft_n_accepted"),
        "wall_clock_ms": resp.get("wall_clock_ms"),
        "sidecar_hash": sidecar_hash,
        "sidecar_port": sidecar_port,
        "error": resp.get("error"),
    }
|
||||
|
||||
|
||||
def append_result(row: dict) -> None:
    """Append ``row`` to the JSON list stored at RESULTS_PATH.

    The file is rewritten in full on every call.  If the existing file
    cannot be read or does not hold a JSON list, it is moved aside to a
    ``.corrupt-<timestamp>`` sibling (with a warning on stderr) so earlier
    measurements are not silently discarded.  The new content is written
    to a temporary sibling and swapped in with ``os.replace`` so an
    interrupted write cannot leave a truncated results file behind.
    """
    results = []
    if RESULTS_PATH.exists():
        problem = None
        loaded = None
        try:
            loaded = json.loads(RESULTS_PATH.read_text())
        except (ValueError, OSError) as e:
            # ValueError covers JSONDecodeError and undecodable bytes.
            problem = str(e)
        else:
            if not isinstance(loaded, list):
                problem = f"expected a JSON list, got {type(loaded).__name__}"

        if problem is None:
            results = loaded
        else:
            stamp = time.strftime("%Y%m%dT%H%M%S")
            backup = RESULTS_PATH.with_name(f"{RESULTS_PATH.name}.corrupt-{stamp}")
            try:
                os.replace(RESULTS_PATH, backup)
            except OSError as move_err:
                # Stay best-effort: keep recording even if the backup fails.
                print(f"warning: {RESULTS_PATH} is unusable ({problem}) and could "
                      f"not be preserved ({move_err}); starting a new results list",
                      file=sys.stderr)
            else:
                print(f"warning: {RESULTS_PATH} is unusable ({problem}); "
                      f"moved to {backup}", file=sys.stderr)

    results.append(row)
    tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
    tmp_path.write_text(json.dumps(results, indent=2) + "\n")
    os.replace(tmp_path, RESULTS_PATH)
|
||||
|
||||
|
||||
def evict_all_sidecars() -> None:
    """Issue a DELETE for every sidecar listed by ``GET /sidecars``.

    Does nothing when the listing is not a list.  Entries with an empty or
    missing ``hash`` are skipped.
    """
    listing = sidecar_request("GET", "/sidecars")
    if isinstance(listing, list):
        for entry in listing:
            digest = entry.get("hash", "")
            if not digest:
                continue
            sidecar_request("DELETE", f"/sidecars/{digest}")
|
||||
|
||||
|
||||
def run_combo(model: str, n_max: int, combo_idx: int, total_combos: int,
              prompt_names: list[str]) -> None:
    """Benchmark one (model, n_max) combination over the given prompts.

    For each prompt: send an unrecorded warmup request, pause two seconds,
    send the measured request, append its metrics row to the results file
    and print a one-line outcome.  Once all prompts are done, every
    sidecar is evicted and the function sleeps five seconds.
    """
    flags = build_flags(n_max)
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"[{combo_idx}/{total_combos}] {model} n_max={n_max}")
    print(f" flags: {flags}")
    print(banner)

    for pname in prompt_names:
        spec = PROMPTS[pname]
        content = spec["content"]
        budget = spec["max_tokens"]

        # Warmup pass: response is discarded.
        print(f" {pname}: warmup...", end="", flush=True)
        send_completion(model, flags, content, budget)
        print(" done.", flush=True)
        time.sleep(2)

        # Measured pass: this one is persisted.
        print(f" {pname}: recording...", end="", flush=True)
        row = extract_metrics(
            send_completion(model, flags, content, budget), model, n_max, pname
        )
        append_result(row)

        failure = row.get("error")
        speed = row.get("eval_tok_s")
        drafted = row.get("draft_n")
        if failure:
            print(f" ERROR: {failure}")
        elif speed:
            suffix = f" draft_n={drafted}" if drafted else ""
            print(f" {speed:.1f} tok/s{suffix}")
        else:
            print(" (no timings in response)")

    # Drop all sidecars so the next combo starts with free VRAM.
    evict_all_sidecars()
    print(" evicted sidecars, sleeping 5s for VRAM release...")
    time.sleep(5)
|
||||
|
||||
|
||||
def dry_run() -> None:
    """Print the sweep plan without contacting the sidecar.

    Lists every (model, n_max) combination from MATRIX with its flag
    string and each prompt's ``max_tokens``, plus the total number of API
    calls the sweep would make.
    """
    combos = [(model, n) for model, ns in MATRIX for n in ns]
    # Derive the totals from PROMPTS so the summary stays correct when the
    # prompt table changes.  Two calls per prompt: warmup + recorded run.
    calls_per_prompt = 2
    prompt_count = len(PROMPTS)
    total_calls = len(combos) * prompt_count * calls_per_prompt
    print(f"Dry run: {len(combos)} combos × {prompt_count} prompts × "
          f"{calls_per_prompt} calls = {total_calls} API calls")
    print("Estimated runtime: 60-90 minutes\n")
    for i, (model, n_max) in enumerate(combos, 1):
        flags = build_flags(n_max)
        print(f" [{i}/{len(combos)}] {model} n_max={n_max}")
        print(f" flags: {flags}")
        for pname, p in PROMPTS.items():
            print(f" {pname}: max_tokens={p['max_tokens']}")
    print(f"\nResults would be written to: {RESULTS_PATH}")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the sweep.

    ``--dry-run`` prints the plan and returns.  Otherwise the sidecar's
    ``/health`` endpoint must report status ``ok`` (the process exits with
    code 1 if not), existing sidecars are evicted, and every combination
    from MATRIX is run in order, truncated to the first ``--limit``
    combinations when that value is positive.
    """
    cli = argparse.ArgumentParser(description="MTP n_max sweep benchmark")
    cli.add_argument("--dry-run", action="store_true", help="Print matrix without running")
    cli.add_argument("--limit", type=int, default=0, help="Run only first N combos")
    opts = cli.parse_args()

    if opts.dry_run:
        dry_run()
        return

    # Refuse to start against a sidecar that does not report itself healthy.
    health = sidecar_request("GET", "/health")
    healthy = bool(health) and health.get("status") == "ok"
    if not healthy:
        print(f"Sidecar unhealthy: {health}", file=sys.stderr)
        sys.exit(1)
    print(f"Sidecar healthy: {health}")

    # Start from a clean slate.
    evict_all_sidecars()

    combos = []
    for model, ns in MATRIX:
        combos.extend((model, n) for n in ns)
    if opts.limit > 0:
        combos = combos[:opts.limit]
    prompt_names = list(PROMPTS)

    started = time.perf_counter()
    total = len(combos)
    for position, (model, n_max) in enumerate(combos, start=1):
        run_combo(model, n_max, position, total, prompt_names)
    elapsed = time.perf_counter() - started

    print(f"\nSweep complete. {len(combos)} combos in {elapsed/60:.1f} minutes.")
    print(f"Results: {RESULTS_PATH}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user