llama-sidecar v0.1.0: daemon + benchmarks + eval suite

Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and child lifetime decoupled from the request context. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED_PROCESS, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00
parent babbb4f39b
commit fe7f36ae98
39 changed files with 4228 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,18 @@
+bin/
+*.exe
+eval/.venv/
+eval/results/
+eval/scores.csv
+eval/SUMMARY.md
+eval/eval.log
+eval/ab/results/
+eval/ab/COMPARE.md
+eval/ab/timing.csv
+eval/ab/run.log
+bench/results/
+bench/SUMMARY.md
+bench/results.csv
+bench/llama-swap-recommendations.md
+internal/pool/*.bak-*
+internal/pool/sidecar_windows.go.bak-*
+__pycache__/
--- a/19
+++ b/19
@@ -0,0 +1,19 @@
+.PHONY: build build-windows test test-integration lint
+
+# Go toolchain to invoke. Defaults to the snap-installed binary; override on
+# other machines with `make GO=go <target>` or a GO environment variable.
+GO ?= /snap/go/current/bin/go
+
+# Build the daemon for the host platform.
+build:
+	$(GO) build -o bin/llama-sidecar ./cmd/llama-sidecar
+
+# Cross-compile the daemon for Windows/amd64.
+build-windows:
+	GOOS=windows GOARCH=amd64 $(GO) build -o bin/llama-sidecar.exe ./cmd/llama-sidecar
+
+# Unit tests.
+test:
+	$(GO) test ./internal/...
+
+# Tests guarded by the `integration` build tag.
+test-integration:
+	$(GO) test -tags=integration ./internal/...
+
+# Static checks. `gofmt -l` exits 0 even when it lists files, so fail the
+# target explicitly when anything is reported as unformatted.
+lint:
+	$(GO) vet ./...
+	@unformatted="$$(gofmt -l .)"; \
+	if [ -n "$$unformatted" ]; then \
+		echo "$$unformatted"; \
+		exit 1; \
+	fi
--- a/README.md
+++ b/README.md
@@ -0,0 +1,77 @@
+# llama-sidecar
+
+Per-agent llama-server process pool daemon. It runs on sam-desktop alongside llama-swap and spawns or reuses llama-server processes keyed on a hash of (modelID, flags).
+
+## License
+
+AGPL-3.0-only.
+
+The validator package (`internal/validator/`) is ported from [Unsloth Studio](https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/llama_server_args.py) (AGPL-3.0). BooCode's TypeScript port (`apps/server/src/services/inference/llama-args-validator.ts`) is the sibling — update both when upstream changes.
+
+## Build
+
+```bash
+# Linux (development)
+make build
+
+# Windows AMD64 (production target — cross-compile from Linux)
+make build-windows
+
+# Copy to sam-desktop
+# scp bin/llama-sidecar.exe sam-desktop:C:\llama-sidecar\
+```
+
+## Configuration
+
+All configuration is done through environment variables; the daemon takes no CLI flags:
+
+| Variable | Required | Default | Description |
+|----------|----------|---------|-------------|
+| `LLAMA_SERVER_BIN` | yes | — | Path to llama-server.exe |
+| `MODEL_DIR_MAP_FILE` | yes | — | JSON file mapping model IDs to GGUF paths |
+| `LLAMA_SIDECAR_BIND` | no | `127.0.0.1:8402` | Listen address |
+| `PORT_RANGE` | no | `8500-8599` | Port range for sidecar processes |
+| `MAX_SIDECARS` | no | `2` | Max concurrent sidecar processes |
+| `LOG_LEVEL` | no | `info` | Log level (debug, info, warn, error) |
+| `BASE_ARGS` | no | `["-ngl","999","-c","32768","--flash-attn","on","--no-mmap"]` | JSON array of base llama-server args |
+| `HEALTH_TIMEOUT_SECONDS` | no | `60` | Max wait for sidecar health check |
+| `HEALTH_INTERVAL_SECONDS` | no | `30` | Background health check interval |
+
+## API
+
+### `GET /health`
+
+Returns daemon status.
+
+### `GET /sidecars`
+
+Returns list of active sidecar processes.
+
+### `DELETE /sidecars/{hash}`
+
+Kill and remove a sidecar process.
+
+### `POST /v1/chat/completions`
+
+OpenAI-compatible proxy. Routes to a sidecar process based on model + flags.
+
+Headers:
+- `X-Agent-Flags: --top-k 20 --cache-type-k q8_0` (optional)
+- `X-Model-Id: qwen3.6-35b-a3b-mxfp4` (optional, overrides body.model)
+
+## Test
+
+```bash
+make test                  # unit tests
+make test-integration      # requires real llama-server + GGUF
+make lint                  # vet + gofmt
+```
+
+## NSSM Service
+
+The daemon is pre-configured on sam-desktop as the NSSM service `llama-sidecar`. Start, stop, or query it with:
+```
+C:\Tools\nssm\nssm.exe start llama-sidecar
+C:\Tools\nssm\nssm.exe stop llama-sidecar
+C:\Tools\nssm\nssm.exe status llama-sidecar
+```
--- a/bench/analyze.py
+++ b/bench/analyze.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""Analyze MTP on/off benchmark results → CSV + SUMMARY.md + recommendations."""
+
+import csv
+import json
+import os
+import re
+import statistics
+from pathlib import Path
+
+RESULTS_DIR = Path(__file__).parent / "results"
+CSV_PATH = Path(__file__).parent / "results.csv"
+SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md"
+RECO_PATH = Path(__file__).parent / "llama-swap-recommendations.md"
+
+FNAME_RE = re.compile(
+    r"^(?P<stem>.+?)__mtp-(?P<mtp>on|off)__len(?P<len>\d+)__run(?P<run>\d+)\.json$"
+)
+
+
+def parse_result(path: Path) -> dict | None:
+    m = FNAME_RE.match(path.name)
+    if not m:
+        return None
+    try:
+        data = json.loads(path.read_text())
+    except (json.JSONDecodeError, OSError):
+        return None
+    t = data.get("timings", {})
+    return {
+        "gguf": m.group("stem"),
+        "mtp": m.group("mtp"),
+        "prompt_len": int(m.group("len")),
+        "run": int(m.group("run")),
+        "prompt_tps": t.get("prompt_per_second"),
+        "predicted_tps": t.get("predicted_per_second"),
+        "cache_n": t.get("cache_n"),
+        "draft_n": t.get("draft_n"),
+        "accepted_n": t.get("draft_n_accepted"),
+        "total_ms": (t.get("prompt_ms", 0) or 0) + (t.get("predicted_ms", 0) or 0),
+    }
+
+
+def load_all() -> list[dict]:
+    rows = []
+    for f in sorted(RESULTS_DIR.glob("*.json")):
+        r = parse_result(f)
+        if r:
+            rows.append(r)
+    return rows
+
+
+def write_csv(rows: list[dict]) -> None:
+    fields = ["gguf", "mtp", "prompt_len", "run", "prompt_tps", "predicted_tps",
+              "cache_n", "draft_n", "accepted_n", "total_ms"]
+    with open(CSV_PATH, "w", newline="") as f:
+        w = csv.DictWriter(f, fieldnames=fields)
+        w.writeheader()
+        w.writerows(rows)
+    print(f"Wrote {len(rows)} rows to {CSV_PATH}")
+
+
+def median_of(values: list[float]) -> float:
+    return statistics.median(values) if values else 0.0
+
+
+def write_summary(rows: list[dict]) -> None:
+    ggufs = sorted(set(r["gguf"] for r in rows))
+    lens = sorted(set(r["prompt_len"] for r in rows))
+    lines = ["# MTP On/Off Benchmark Results\n"]
+    lines.append(f"**{len(rows)} measurements across {len(ggufs)} GGUFs.**\n")
+    lines.append(f"Runs 2 & 3 used for median (run 1 = warmup, discarded).\n")
+
+    verdicts = []
+
+    for gguf in ggufs:
+        lines.append(f"\n## {gguf}\n")
+        header_parts = ["prompt_len"]
+        for state in ["off", "on"]:
+            header_parts.append(f"MTP-{state} tok/s")
+        header_parts.extend(["delta %", "accept %"])
+        lines.append("| " + " | ".join(header_parts) + " |")
+        lines.append("|" + "|".join("---" for _ in header_parts) + "|")
+
+        any_above_10 = False
+        for pl in lens:
+            off_vals = [r["predicted_tps"] for r in rows
+                        if r["gguf"] == gguf and r["mtp"] == "off"
+                        and r["prompt_len"] == pl and r["run"] >= 2
+                        and r["predicted_tps"] is not None]
+            on_vals = [r["predicted_tps"] for r in rows
+                       if r["gguf"] == gguf and r["mtp"] == "on"
+                       and r["prompt_len"] == pl and r["run"] >= 2
+                       and r["predicted_tps"] is not None]
+
+            off_med = median_of(off_vals)
+            on_med = median_of(on_vals)
+
+            if off_med > 0:
+                delta = ((on_med - off_med) / off_med) * 100
+            else:
+                delta = 0.0
+
+            if abs(delta) >= 10:
+                any_above_10 = True
+
+            draft_rows = [r for r in rows
+                          if r["gguf"] == gguf and r["mtp"] == "on"
+                          and r["prompt_len"] == pl and r["run"] >= 2
+                          and r.get("draft_n")]
+            total_draft = sum(r.get("draft_n", 0) for r in draft_rows)
+            total_accepted = sum(r.get("accepted_n", 0) for r in draft_rows)
+            accept_pct = f"{(total_accepted / total_draft * 100):.0f}%" if total_draft > 0 else "—"
+
+            lines.append(
+                f"| {pl} | {off_med:.1f} | {on_med:.1f} | {delta:+.1f}% | {accept_pct} |"
+            )
+
+        if any_above_10:
+            verdict = "KEEP MTP"
+        else:
+            verdict = "DROP MTP"
+        verdicts.append((gguf, verdict))
+        lines.append(f"\n**Verdict: {verdict}**\n")
+
+    lines.append("\n---\n")
+    lines.append("## Verdict Summary\n")
+    lines.append("| GGUF | Verdict |")
+    lines.append("|------|---------|")
+    for gguf, verdict in verdicts:
+        lines.append(f"| {gguf} | {verdict} |")
+
+    summary = "\n".join(lines) + "\n"
+    SUMMARY_PATH.write_text(summary)
+    print(f"Wrote {SUMMARY_PATH}")
+    print(summary)
+
+
+def write_recommendations(rows: list[dict]) -> None:
+    ggufs = sorted(set(r["gguf"] for r in rows))
+    lens = sorted(set(r["prompt_len"] for r in rows))
+
+    lines = ["# llama-swap Config Recommendations\n"]
+    lines.append("Based on MTP on/off benchmark results.\n")
+    lines.append("**Read-only reference** — do NOT edit D:\\llama-swap\\config.yaml directly.\n")
+    lines.append("```yaml")
+    lines.append("# Commented diff against current config.yaml")
+    lines.append("# Lines starting with + should be added, - should be removed")
+    lines.append("")
+
+    model_map = {
+        "Qwen3.6-35B-A3B-MXFP4_MOE": "qwen3.6-35b-a3b-mxfp4",
+        "Qwen3.6-27B-Q6_K": "qwen3.6-27b-mtp",
+        "Qwopus3.5-4B-v3-MTP-Q8_0": "qwopus3.5-4b-mtp",
+        "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": "qwen3.5-9b-deepseek-v4-mtp",
+        "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": "qwopus3.6-35b-a3b-v1-mtp",
+        "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": "qwopus3.6-35b-a3b-mxfp4-mtp",
+        "Qwopus3.6-27B-v2-MTP-Q6_K": "qwopus3.6-27b-v2-mtp",
+        "Qwopus3.5-9B-Coder-MTP-Q8_0": "qwopus3.5-9b-coder-mtp",
+    }
+
+    currently_mtp = {
+        "Qwen3.6-35B-A3B-MXFP4_MOE": False,
+        "Qwen3.6-27B-Q6_K": True,
+        "Qwopus3.5-4B-v3-MTP-Q8_0": True,
+        "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": True,
+        "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": True,
+        "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": True,
+        "Qwopus3.6-27B-v2-MTP-Q6_K": True,
+        "Qwopus3.5-9B-Coder-MTP-Q8_0": True,
+    }
+
+    for gguf in ggufs:
+        model_id = model_map.get(gguf, gguf)
+        is_mtp_now = currently_mtp.get(gguf, False)
+
+        off_vals = [r["predicted_tps"] for r in rows
+                    if r["gguf"] == gguf and r["mtp"] == "off" and r["run"] >= 2
+                    and r["predicted_tps"] is not None]
+        on_vals = [r["predicted_tps"] for r in rows
+                   if r["gguf"] == gguf and r["mtp"] == "on" and r["run"] >= 2
+                   and r["predicted_tps"] is not None]
+        off_med = median_of(off_vals)
+        on_med = median_of(on_vals)
+        delta = ((on_med - off_med) / off_med * 100) if off_med > 0 else 0
+
+        should_mtp = delta >= 10
+        lines.append(f"  # {model_id}: MTP {'on' if is_mtp_now else 'off'} → {'on' if should_mtp else 'off'} (delta {delta:+.1f}%)")
+
+        if should_mtp and not is_mtp_now:
+            lines.append(f"  # + --spec-type draft-mtp --spec-draft-n-max 2")
+        elif not should_mtp and is_mtp_now:
+            lines.append(f"  # - --spec-type draft-mtp --spec-draft-n-max 2")
+        else:
+            lines.append(f"  # (no change)")
+        lines.append("")
+
+    lines.append("```\n")
+    reco = "\n".join(lines)
+    RECO_PATH.write_text(reco)
+    print(f"Wrote {RECO_PATH}")
+
+
+def main() -> None:
+    rows = load_all()
+    if not rows:
+        print("No results found in", RESULTS_DIR)
+        return
+    write_csv(rows)
+    write_summary(rows)
+    write_recommendations(rows)
+
+
+if __name__ == "__main__":
+    main()
--- a/bench/bench.sh
+++ b/bench/bench.sh
@@ -0,0 +1,192 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ENDPOINT="http://100.101.41.16:8650"
+SSH_HOST="samki@100.101.41.16"
+TASK_NAME="bench_llama"
+BAT_PATH='%TEMP%\bench_run.bat'
+RESULTS_DIR="$(cd "$(dirname "$0")" && pwd)/results"
+PROMPTS_DIR="$(cd "$(dirname "$0")" && pwd)/prompts"
+MAX_TOKENS=200
+HEALTH_TIMEOUT=120
+LLAMA_BIN='D:\llama-server\llama-server.exe'
+
+mkdir -p "$RESULTS_DIR"
+
+# ── Config matrix: STEM|MTP_STATE|FULL_ARGS ───────────────────────────
+
+CONFIGS=(
+'Qwen3.6-35B-A3B-MXFP4_MOE|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwen3.6-35B-A3B-MXFP4_MOE|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwen3.6-27B-Q6_K|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwen3.6-27B-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.5-4B-v3-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.5-4B-v3-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.6-27B-v2-MTP-Q6_K|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.6-27B-v2-MTP-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.5-9B-Coder-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+
+'Qwopus3.5-9B-Coder-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
+)
+
+PROMPT_LENS=(256 1024 4096)
+
+# ── Helper functions ──────────────────────────────────────────────────
+
+# Stop whatever is listening on the benchmark port on the remote host and
+# delete the scheduled task used to launch it. Every remote command is
+# best-effort: stderr is discarded and failures are ignored, so the function
+# can be called when nothing is running.
+kill_bench_server() {
+  local pids
+  # Run a cmd.exe `for /f` loop remotely that prints column 5 of each netstat
+  # line matching ":8650" and "LISTENING"; those values are then used as PIDs.
+  # NOTE(review): the port is hard-coded here rather than derived from
+  # ENDPOINT, and findstr matches substrings, so ":86500" would match too.
+  pids=$(ssh "$SSH_HOST" 'for /f "tokens=5" %a in ('"'"'netstat -aon ^| findstr :8650 ^| findstr LISTENING'"'"') do @echo %a' 2>/dev/null || true)
+  for pid in $pids; do
+    # Skip empty fields and PID 0.
+    if [ -n "$pid" ] && [ "$pid" != "0" ]; then
+      ssh "$SSH_HOST" "taskkill /F /PID $pid" 2>/dev/null || true
+    fi
+  done
+  ssh "$SSH_HOST" "schtasks /Delete /TN ${TASK_NAME} /F" 2>/dev/null || true
+  # Presumably gives the killed process time to exit and release the port
+  # before the next start; confirm the delay is sufficient.
+  sleep 3
+}
+
+# Launch llama-server on the remote host through a scheduled task.
+#   $1  full llama-server argument string
+# NOTE(review): stderr of every ssh call is discarded and the script runs
+# under `set -e`, so a failing step aborts the sweep without any message.
+start_bench_server() {
+  local args="$1"
+  # Write a batch file, then run it via schtasks
+  ssh "$SSH_HOST" "echo ${LLAMA_BIN} ${args} > ${BAT_PATH}" 2>/dev/null
+  # /F overwrites an existing task of the same name; the task is started
+  # explicitly by the /Run call below.
+  ssh "$SSH_HOST" "schtasks /Create /TN ${TASK_NAME} /TR ${BAT_PATH} /SC ONCE /ST 00:00 /F /RL HIGHEST" 2>/dev/null
+  ssh "$SSH_HOST" "schtasks /Run /TN ${TASK_NAME}" 2>/dev/null
+}
+
+poll_health() {
+  local elapsed=0
+  while [ $elapsed -lt $HEALTH_TIMEOUT ]; do
+    if curl -sf "${ENDPOINT}/health" >/dev/null 2>&1; then
+      echo "  health OK (${elapsed}s)"
+      return 0
+    fi
+    sleep 3
+    elapsed=$((elapsed + 3))
+    if [ $((elapsed % 15)) -eq 0 ]; then
+      echo "  waiting... (${elapsed}s)"
+    fi
+  done
+  echo "  HEALTH TIMEOUT after ${HEALTH_TIMEOUT}s"
+  return 1
+}
+
+# POST one chat-completion request built from a prompt file.
+#   $1  path of the prompt text file
+#   $2  path the response body is written to
+# Returns 0 on HTTP 200; otherwise prints a short reason and returns 1.
+send_request() {
+  local prompt_file="$1"
+  local output_file="$2"
+  local body
+  # The path and token limit are passed as arguments instead of being spliced
+  # into the Python source, so a quote or backslash in the path cannot break
+  # the script. The prompt is read as UTF-8 regardless of the locale.
+  body=$(python3 -c '
+import json
+import sys
+
+with open(sys.argv[1], encoding="utf-8") as fh:
+    prompt = fh.read()
+print(json.dumps({
+    "messages": [{"role": "user", "content": prompt}],
+    "max_tokens": int(sys.argv[2]),
+    "temperature": 0,
+    "seed": 42,
+    "stream": False
+}))
+' "$prompt_file" "$MAX_TOKENS") || {
+    echo "could not build request body"
+    return 1
+  }
+  local http_code
+  # curl still prints a status code (000) when the transfer itself fails, so
+  # its exit status is ignored and the code check below reports the failure.
+  http_code=$(curl -s -w '%{http_code}' -o "$output_file" \
+    --max-time 300 \
+    -X POST "${ENDPOINT}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d "$body" 2>/dev/null) || true
+  if [ "$http_code" != "200" ]; then
+    echo "HTTP ${http_code}"
+    return 1
+  fi
+  return 0
+}
+
+# Print a one-line throughput summary for a saved response.
+#   $1  path of a response JSON file
+# Prints "(parse error)" when the Python helper fails for any reason.
+print_metrics() {
+  # The path is passed as an argument rather than spliced into the Python
+  # source, so quotes or backslashes in it cannot break the script.
+  python3 -c '
+import json
+import sys
+
+with open(sys.argv[1], encoding="utf-8") as fh:
+    d = json.load(fh)
+t = d.get("timings", {})
+ptps = t.get("prompt_per_second", 0)
+etps = t.get("predicted_per_second", 0)
+dn = t.get("draft_n", "")
+da = t.get("draft_n_accepted", "")
+draft = ""
+if dn != "":
+    draft = f"  draft={da}/{dn}"
+print(f"prompt={ptps:.1f}  eval={etps:.1f} tok/s{draft}")
+' "$1" 2>/dev/null || echo "(parse error)"
+}
+
+# ── Main ──────────────────────────────────────────────────────────────
+
+total=${#CONFIGS[@]}
+echo "================================================================"
+echo "  MTP ON/OFF BENCHMARK SWEEP"
+echo "  ${total} configs x 3 prompts x 3 runs"
+echo "  Endpoint: ${ENDPOINT}"
+echo "================================================================"
+
+t_start=$(date +%s)
+config_idx=0
+
+for config_entry in "${CONFIGS[@]}"; do
+  config_idx=$((config_idx + 1))
+  IFS='|' read -r stem mtp_state args <<< "$config_entry"
+
+  echo ""
+  echo "================================================================"
+  echo "  [${config_idx}/${total}] ${stem}  MTP=${mtp_state}"
+  echo "================================================================"
+
+  kill_bench_server
+  echo "  Starting llama-server..."
+  start_bench_server "$args"
+
+  if ! poll_health; then
+    echo "  SKIPPING"
+    kill_bench_server
+    continue
+  fi
+
+  for len in "${PROMPT_LENS[@]}"; do
+    prompt_file="${PROMPTS_DIR}/p${len}.txt"
+    [ -f "$prompt_file" ] || { echo "  Missing p${len}.txt"; continue; }
+    echo "  -- p${len} --"
+    for run in 1 2 3; do
+      outfile="${RESULTS_DIR}/${stem}__mtp-${mtp_state}__len${len}__run${run}.json"
+      printf "    run %d: " "$run"
+      if send_request "$prompt_file" "$outfile"; then
+        print_metrics "$outfile"
+      fi
+      sleep 1
+    done
+  done
+
+  echo "  Killing..."
+  kill_bench_server
+done
+
+t_end=$(date +%s)
+elapsed=$(( t_end - t_start ))
+echo ""
+echo "================================================================"
+echo "  SWEEP COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s"
+echo "  Run: python3 $(dirname "$0")/analyze.py"
+echo "================================================================"
--- a/bench/prompts/p1024.txt
+++ b/bench/prompts/p1024.txt
@@ -0,0 +1,67 @@
+You will rejoice to hear that no disaster has accompanied the
+commencement of an enterprise which you have regarded with such evil
+forebodings. I arrived here yesterday, and my first task is to assure
+my dear sister of my welfare and increasing confidence in the success
+of my undertaking.
+
+I am already far north of London, and as I walk in the streets of
+Petersburgh, I feel a cold northern breeze play upon my cheeks, which
+braces my nerves and fills me with delight. Do you understand this
+feeling? This breeze, which has travelled from the regions towards
+which I am advancing, gives me a foretaste of those icy climes.
+Inspirited by this wind of promise, my daydreams become more fervent
+and vivid. I try in vain to be persuaded that the pole is the seat of
+frost and desolation; it ever presents itself to my imagination as the
+region of beauty and delight. There, Margaret, the sun is for ever
+visible, its broad disk just skirting the horizon and diffusing a
+perpetual splendour. There—for with your leave, my sister, I will put
+some trust in preceding navigators—there snow and frost are banished;
+and, sailing over a calm sea, we may be wafted to a land surpassing in
+wonders and in beauty every region hitherto discovered on the habitable
+globe. Its productions and features may be without example, as the
+phenomena of the heavenly bodies undoubtedly are in those undiscovered
+solitudes. What may not be expected in a country of eternal light? I
+may there discover the wondrous power which attracts the needle and may
+regulate a thousand celestial observations that require only this
+voyage to render their seeming eccentricities consistent for ever. I
+shall satiate my ardent curiosity with the sight of a part of the world
+never before visited, and may tread a land never before imprinted by
+the foot of man. These are my enticements, and they are sufficient to
+conquer all fear of danger or death and to induce me to commence this
+laborious voyage with the joy a child feels when he embarks in a little
+boat, with his holiday mates, on an expedition of discovery up his
+native river. But supposing all these conjectures to be false, you
+cannot contest the inestimable benefit which I shall confer on all
+mankind, to the last generation, by discovering a passage near the pole
+to those countries, to reach which at present so many months are
+requisite; or by ascertaining the secret of the magnet, which, if at
+all possible, can only be effected by an undertaking such as mine.
+
+These reflections have dispelled the agitation with which I began my
+letter, and I feel my heart glow with an enthusiasm which elevates me
+to heaven, for nothing contributes so much to tranquillise the mind as
+a steady purpose—a point on which the soul may fix its intellectual
+eye. This expedition has been the favourite dream of my early years. I
+have read with ardour the accounts of the various voyages which have
+been made in the prospect of arriving at the North Pacific Ocean
+through the seas which surround the pole. You may remember that a
+history of all the voyages made for purposes of discovery composed the
+whole of our good Uncle Thomas’ library. My education was neglected,
+yet I was passionately fond of reading. These volumes were my study
+day and night, and my familiarity with them increased that regret which
+I had felt, as a child, on learning that my father’s dying injunction
+had forbidden my uncle to allow me to embark in a seafaring life.
+
+These visions faded when I perused, for the first time, those poets
+whose effusions entranced my soul and lifted it to heaven. I also
+became a poet and for one year lived in a paradise of my own creation;
+I imagined that I also might obtain a niche in the temple where the
+names of Homer and Shakespeare are consecrated. You are well
+acquainted with my failure and how heavily I bore the disappointment.
+But just at that time I inherited the fortune of my cousin, and my
+thoughts were turned into the channel of their earlier bent.
+
+Six years have passed since I resolved on my present undertaking. I
+can, even now, remember the hour from which I dedicated myself to this
+great enterprise. I commenced by inuring my body to hardship.
+Continue this passage in exactly 200 tokens of prose.
--- a/bench/prompts/p256.txt
+++ b/bench/prompts/p256.txt
@@ -0,0 +1,18 @@
+You will rejoice to hear that no disaster has accompanied the
+commencement of an enterprise which you have regarded with such evil
+forebodings. I arrived here yesterday, and my first task is to assure
+my dear sister of my welfare and increasing confidence in the success
+of my undertaking.
+
+I am already far north of London, and as I walk in the streets of
+Petersburgh, I feel a cold northern breeze play upon my cheeks, which
+braces my nerves and fills me with delight. Do you understand this
+feeling? This breeze, which has travelled from the regions towards
+which I am advancing, gives me a foretaste of those icy climes.
+Inspirited by this wind of promise, my daydreams become more fervent
+and vivid. I try in vain to be persuaded that the pole is the seat of
+frost and desolation; it ever presents itself to my imagination as the
+region of beauty and delight. There, Margaret, the sun is for ever
+visible, its broad disk just skirting the horizon and diffusing a
+perpetual splendour.
+Continue this passage in exactly 200 tokens of prose.
--- a/bench/prompts/p4096.txt
+++ b/bench/prompts/p4096.txt
@@ -0,0 +1,319 @@
+You will rejoice to hear that no disaster has accompanied the
+commencement of an enterprise which you have regarded with such evil
+forebodings. I arrived here yesterday, and my first task is to assure
+my dear sister of my welfare and increasing confidence in the success
+of my undertaking.
+
+I am already far north of London, and as I walk in the streets of
+Petersburgh, I feel a cold northern breeze play upon my cheeks, which
+braces my nerves and fills me with delight. Do you understand this
+feeling? This breeze, which has travelled from the regions towards
+which I am advancing, gives me a foretaste of those icy climes.
+Inspirited by this wind of promise, my daydreams become more fervent
+and vivid. I try in vain to be persuaded that the pole is the seat of
+frost and desolation; it ever presents itself to my imagination as the
+region of beauty and delight. There, Margaret, the sun is for ever
+visible, its broad disk just skirting the horizon and diffusing a
+perpetual splendour. There—for with your leave, my sister, I will put
+some trust in preceding navigators—there snow and frost are banished;
+and, sailing over a calm sea, we may be wafted to a land surpassing in
+wonders and in beauty every region hitherto discovered on the habitable
+globe. Its productions and features may be without example, as the
+phenomena of the heavenly bodies undoubtedly are in those undiscovered
+solitudes. What may not be expected in a country of eternal light? I
+may there discover the wondrous power which attracts the needle and may
+regulate a thousand celestial observations that require only this
+voyage to render their seeming eccentricities consistent for ever. I
+shall satiate my ardent curiosity with the sight of a part of the world
+never before visited, and may tread a land never before imprinted by
+the foot of man. These are my enticements, and they are sufficient to
+conquer all fear of danger or death and to induce me to commence this
+laborious voyage with the joy a child feels when he embarks in a little
+boat, with his holiday mates, on an expedition of discovery up his
+native river. But supposing all these conjectures to be false, you
+cannot contest the inestimable benefit which I shall confer on all
+mankind, to the last generation, by discovering a passage near the pole
+to those countries, to reach which at present so many months are
+requisite; or by ascertaining the secret of the magnet, which, if at
+all possible, can only be effected by an undertaking such as mine.
+
+These reflections have dispelled the agitation with which I began my
+letter, and I feel my heart glow with an enthusiasm which elevates me
+to heaven, for nothing contributes so much to tranquillise the mind as
+a steady purpose—a point on which the soul may fix its intellectual
+eye. This expedition has been the favourite dream of my early years. I
+have read with ardour the accounts of the various voyages which have
+been made in the prospect of arriving at the North Pacific Ocean
+through the seas which surround the pole. You may remember that a
+history of all the voyages made for purposes of discovery composed the
+whole of our good Uncle Thomas’ library. My education was neglected,
+yet I was passionately fond of reading. These volumes were my study
+day and night, and my familiarity with them increased that regret which
+I had felt, as a child, on learning that my father’s dying injunction
+had forbidden my uncle to allow me to embark in a seafaring life.
+
+These visions faded when I perused, for the first time, those poets
+whose effusions entranced my soul and lifted it to heaven. I also
+became a poet and for one year lived in a paradise of my own creation;
+I imagined that I also might obtain a niche in the temple where the
+names of Homer and Shakespeare are consecrated. You are well
+acquainted with my failure and how heavily I bore the disappointment.
+But just at that time I inherited the fortune of my cousin, and my
+thoughts were turned into the channel of their earlier bent.
+
+Six years have passed since I resolved on my present undertaking. I
+can, even now, remember the hour from which I dedicated myself to this
+great enterprise. I commenced by inuring my body to hardship. I
+accompanied the whale-fishers on several expeditions to the North Sea;
+I voluntarily endured cold, famine, thirst, and want of sleep; I often
+worked harder than the common sailors during the day and devoted my
+nights to the study of mathematics, the theory of medicine, and those
+branches of physical science from which a naval adventurer might derive
+the greatest practical advantage. Twice I actually hired myself as an
+under-mate in a Greenland whaler, and acquitted myself to admiration. I
+must own I felt a little proud when my captain offered me the second
+dignity in the vessel and entreated me to remain with the greatest
+earnestness, so valuable did he consider my services.
+
+And now, dear Margaret, do I not deserve to accomplish some great purpose?
+My life might have been passed in ease and luxury, but I preferred glory to
+every enticement that wealth placed in my path. Oh, that some encouraging
+voice would answer in the affirmative! My courage and my resolution is
+firm; but my hopes fluctuate, and my spirits are often depressed. I am
+about to proceed on a long and difficult voyage, the emergencies of which
+will demand all my fortitude: I am required not only to raise the spirits
+of others, but sometimes to sustain my own, when theirs are failing.
+
+This is the most favourable period for travelling in Russia. They fly
+quickly over the snow in their sledges; the motion is pleasant, and, in
+my opinion, far more agreeable than that of an English stagecoach. The
+cold is not excessive, if you are wrapped in furs—a dress which I have
+already adopted, for there is a great difference between walking the
+deck and remaining seated motionless for hours, when no exercise
+prevents the blood from actually freezing in your veins. I have no
+ambition to lose my life on the post-road between St. Petersburgh and
+Archangel.
+
+I shall depart for the latter town in a fortnight or three weeks; and my
+intention is to hire a ship there, which can easily be done by paying the
+insurance for the owner, and to engage as many sailors as I think necessary
+among those who are accustomed to the whale-fishing. I do not intend to
+sail until the month of June; and when shall I return? Ah, dear sister, how
+can I answer this question? If I succeed, many, many months, perhaps years,
+will pass before you and I may meet. If I fail, you will see me again soon,
+or never.
+
+Farewell, my dear, excellent Margaret. Heaven shower down blessings on you,
+and save me, that I may again and again testify my gratitude for all your
+love and kindness.
+
+Your affectionate brother,
+
+R. Walton
+
+
+
+
+Letter 2
+
+_To Mrs. Saville, England._
+
+Archangel, 28th March, 17—.
+
+
+How slowly the time passes here, encompassed as I am by frost and snow!
+Yet a second step is taken towards my enterprise. I have hired a
+vessel and am occupied in collecting my sailors; those whom I have
+already engaged appear to be men on whom I can depend and are certainly
+possessed of dauntless courage.
+
+But I have one want which I have never yet been able to satisfy, and the
+absence of the object of which I now feel as a most severe evil, I have no
+friend, Margaret: when I am glowing with the enthusiasm of success, there
+will be none to participate my joy; if I am assailed by disappointment, no
+one will endeavour to sustain me in dejection. I shall commit my thoughts
+to paper, it is true; but that is a poor medium for the communication of
+feeling. I desire the company of a man who could sympathise with me, whose
+eyes would reply to mine. You may deem me romantic, my dear sister, but I
+bitterly feel the want of a friend. I have no one near me, gentle yet
+courageous, possessed of a cultivated as well as of a capacious mind, whose
+tastes are like my own, to approve or amend my plans. How would such a
+friend repair the faults of your poor brother! I am too ardent in execution
+and too impatient of difficulties. But it is a still greater evil to me
+that I am self-educated: for the first fourteen years of my life I ran wild
+on a common and read nothing but our Uncle Thomas’ books of voyages.
+At that age I became acquainted with the celebrated poets of our own
+country; but it was only when it had ceased to be in my power to derive its
+most important benefits from such a conviction that I perceived the
+necessity of becoming acquainted with more languages than that of my native
+country. Now I am twenty-eight and am in reality more illiterate than many
+schoolboys of fifteen. It is true that I have thought more and that my
+daydreams are more extended and magnificent, but they want (as the painters
+call it) _keeping;_ and I greatly need a friend who would have sense
+enough not to despise me as romantic, and affection enough for me to
+endeavour to regulate my mind.
+
+Well, these are useless complaints; I shall certainly find no friend on the
+wide ocean, nor even here in Archangel, among merchants and seamen. Yet
+some feelings, unallied to the dross of human nature, beat even in these
+rugged bosoms. My lieutenant, for instance, is a man of wonderful courage
+and enterprise; he is madly desirous of glory, or rather, to word my phrase
+more characteristically, of advancement in his profession. He is an
+Englishman, and in the midst of national and professional prejudices,
+unsoftened by cultivation, retains some of the noblest endowments of
+humanity. I first became acquainted with him on board a whale vessel;
+finding that he was unemployed in this city, I easily engaged him to assist
+in my enterprise.
+
+The master is a person of an excellent disposition and is remarkable in the
+ship for his gentleness and the mildness of his discipline. This
+circumstance, added to his well-known integrity and dauntless courage, made
+me very desirous to engage him. A youth passed in solitude, my best years
+spent under your gentle and feminine fosterage, has so refined the
+groundwork of my character that I cannot overcome an intense distaste to
+the usual brutality exercised on board ship: I have never believed it to be
+necessary, and when I heard of a mariner equally noted for his kindliness
+of heart and the respect and obedience paid to him by his crew, I felt
+myself peculiarly fortunate in being able to secure his services. I heard
+of him first in rather a romantic manner, from a lady who owes to him the
+happiness of her life. This, briefly, is his story. Some years ago he loved
+a young Russian lady of moderate fortune, and having amassed a considerable
+sum in prize-money, the father of the girl consented to the match. He saw
+his mistress once before the destined ceremony; but she was bathed in
+tears, and throwing herself at his feet, entreated him to spare her,
+confessing at the same time that she loved another, but that he was poor,
+and that her father would never consent to the union. My generous friend
+reassured the suppliant, and on being informed of the name of her lover,
+instantly abandoned his pursuit. He had already bought a farm with his
+money, on which he had designed to pass the remainder of his life; but he
+bestowed the whole on his rival, together with the remains of his
+prize-money to purchase stock, and then himself solicited the young
+woman’s father to consent to her marriage with her lover. But the old
+man decidedly refused, thinking himself bound in honour to my friend, who,
+when he found the father inexorable, quitted his country, nor returned
+until he heard that his former mistress was married according to her
+inclinations. “What a noble fellow!” you will exclaim. He is
+so; but then he is wholly uneducated: he is as silent as a Turk, and a kind
+of ignorant carelessness attends him, which, while it renders his conduct
+the more astonishing, detracts from the interest and sympathy which
+otherwise he would command.
+
+Yet do not suppose, because I complain a little or because I can
+conceive a consolation for my toils which I may never know, that I am
+wavering in my resolutions. Those are as fixed as fate, and my voyage
+is only now delayed until the weather shall permit my embarkation. The
+winter has been dreadfully severe, but the spring promises well, and it
+is considered as a remarkably early season, so that perhaps I may sail
+sooner than I expected. I shall do nothing rashly: you know me
+sufficiently to confide in my prudence and considerateness whenever the
+safety of others is committed to my care.
+
+I cannot describe to you my sensations on the near prospect of my
+undertaking. It is impossible to communicate to you a conception of
+the trembling sensation, half pleasurable and half fearful, with which
+I am preparing to depart. I am going to unexplored regions, to “the
+land of mist and snow,” but I shall kill no albatross; therefore do not
+be alarmed for my safety or if I should come back to you as worn and
+woeful as the “Ancient Mariner.” You will smile at my allusion, but I
+will disclose a secret. I have often attributed my attachment to, my
+passionate enthusiasm for, the dangerous mysteries of ocean to that
+production of the most imaginative of modern poets. There is something
+at work in my soul which I do not understand. I am practically
+industrious—painstaking, a workman to execute with perseverance and
+labour—but besides this there is a love for the marvellous, a belief
+in the marvellous, intertwined in all my projects, which hurries me out
+of the common pathways of men, even to the wild sea and unvisited
+regions I am about to explore.
+
+But to return to dearer considerations. Shall I meet you again, after
+having traversed immense seas, and returned by the most southern cape of
+Africa or America? I dare not expect such success, yet I cannot bear to
+look on the reverse of the picture. Continue for the present to write to
+me by every opportunity: I may receive your letters on some occasions when
+I need them most to support my spirits. I love you very tenderly.
+Remember me with affection, should you never hear from me again.
+
+Your affectionate brother,
+ Robert Walton
+
+
+
+
+Letter 3
+
+_To Mrs. Saville, England._
+
+July 7th, 17—.
+
+
+My dear Sister,
+
+I write a few lines in haste to say that I am safe—and well advanced
+on my voyage. This letter will reach England by a merchantman now on
+its homeward voyage from Archangel; more fortunate than I, who may not
+see my native land, perhaps, for many years. I am, however, in good
+spirits: my men are bold and apparently firm of purpose, nor do the
+floating sheets of ice that continually pass us, indicating the dangers
+of the region towards which we are advancing, appear to dismay them. We
+have already reached a very high latitude; but it is the height of
+summer, and although not so warm as in England, the southern gales,
+which blow us speedily towards those shores which I so ardently desire
+to attain, breathe a degree of renovating warmth which I had not
+expected.
+
+No incidents have hitherto befallen us that would make a figure in a
+letter. One or two stiff gales and the springing of a leak are
+accidents which experienced navigators scarcely remember to record, and
+I shall be well content if nothing worse happen to us during our voyage.
+
+Adieu, my dear Margaret. Be assured that for my own sake, as well as
+yours, I will not rashly encounter danger. I will be cool,
+persevering, and prudent.
+
+But success _shall_ crown my endeavours. Wherefore not? Thus far I
+have gone, tracing a secure way over the pathless seas, the very stars
+themselves being witnesses and testimonies of my triumph. Why not
+still proceed over the untamed yet obedient element? What can stop the
+determined heart and resolved will of man?
+
+My swelling heart involuntarily pours itself out thus. But I must
+finish. Heaven bless my beloved sister!
+
+R.W.
+
+
+
+
+Letter 4
+
+
+_To Mrs. Saville, England._
+
+August 5th, 17—.
+
+So strange an accident has happened to us that I cannot forbear
+recording it, although it is very probable that you will see me before
+these papers can come into your possession.
+
+Last Monday (July 31st) we were nearly surrounded by ice, which closed
+in the ship on all sides, scarcely leaving her the sea-room in which
+she floated. Our situation was somewhat dangerous, especially as we
+were compassed round by a very thick fog. We accordingly lay to,
+hoping that some change would take place in the atmosphere and weather.
+
+About two o’clock the mist cleared away, and we beheld, stretched out
+in every direction, vast and irregular plains of ice, which seemed to
+have no end. Some of my comrades groaned, and my own mind began to
+grow watchful with anxious thoughts, when a strange sight suddenly
+attracted our attention and diverted our solicitude from our own
+situation. We perceived a low carriage, fixed on a sledge and drawn by
+dogs, pass on towards the north, at the distance of half a mile; a
+being which had the shape of a man, but apparently of gigantic stature,
+sat in the sledge and guided the dogs. We watched the rapid progress
+of the traveller with our telescopes until he was lost among the
+distant inequalities of the ice.
+
+This appearance excited our unqualified wonder. We were, as we believed,
+many hundred miles from any land; but this apparition seemed to denote that
+it was not, in reality, so distant as we had supposed.
+Continue this passage in exactly 200 tokens of prose.
--- a/benchmarks/3d/analyze.py
+++ b/benchmarks/3d/analyze.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Analyze MTP n_max sweep results and produce summary.md."""
+
+import json
+from pathlib import Path
+
+# Sweep measurements consumed by load_() below: a JSON array of row dicts,
+# expected next to this script.
+RESULTS_PATH = Path(__file__).parent / "results.json"
+# Destination of the Markdown report written by main(), also next to this script.
+SUMMARY_PATH = Path(__file__).parent / "summary.md"
+
+
+def load_results() -> list[dict]:
+    data = json.loads(RESULTS_PATH.read_text())
+    return [r for r in data if r.get("eval_tok_s") is not None and r.get("error") is None]
+
+
+def main() -> None:
+    rows = load_results()
+    if not rows:
+        print("No valid results found.")
+        return
+
+    models = sorted(set(r["model"] for r in rows))
+    lines = ["# MTP n_max Sweep Results\n"]
+    lines.append(f"**{len(rows)} valid measurements across {len(models)} models.**\n")
+
+    recommendations = []
+
+    for model in models:
+        model_rows = [r for r in rows if r["model"] == model]
+        n_max_values = sorted(set(r["n_max"] for r in model_rows))
+        prompt_names = sorted(set(r["prompt"] for r in model_rows))
+
+        lines.append(f"\n## {model}\n")
+
+        header = "| n_max | " + " | ".join(f"{p} tok/s" for p in prompt_names) + " | avg tok/s | vs n_max=0 |"
+        sep = "|-------|" + "|".join("-" * (len(p) + 7) for p in prompt_names) + "|-----------|------------|"
+        lines.append(header)
+        lines.append(sep)
+
+        baseline_avg = None
+        best_avg = 0
+        best_n = 0
+
+        for n in n_max_values:
+            cells = []
+            vals = []
+            for p in prompt_names:
+                matching = [r for r in model_rows if r["n_max"] == n and r["prompt"] == p]
+                if matching:
+                    v = matching[0]["eval_tok_s"]
+                    cells.append(f"{v:.1f}")
+                    vals.append(v)
+                else:
+                    cells.append("—")
+
+            avg = sum(vals) / len(vals) if vals else 0
+            if n == 0:
+                baseline_avg = avg
+                delta = "baseline"
+            elif baseline_avg and baseline_avg > 0:
+                pct = ((avg - baseline_avg) / baseline_avg) * 100
+                delta = f"{pct:+.1f}%"
+            else:
+                delta = "—"
+
+            if avg > best_avg:
+                best_avg = avg
+                best_n = n
+
+            draft_info = ""
+            draft_rows = [r for r in model_rows if r["n_max"] == n and r.get("draft_n")]
+            if draft_rows:
+                total_draft = sum(r.get("draft_n", 0) for r in draft_rows)
+                total_accepted = sum(r.get("draft_n_accepted", 0) for r in draft_rows)
+                if total_draft > 0:
+                    accept_pct = (total_accepted / total_draft) * 100
+                    draft_info = f" (accept {accept_pct:.0f}%)"
+
+            row_str = f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |"
+            lines.append(row_str)
+
+        if baseline_avg and baseline_avg > 0 and best_avg > 0:
+            improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
+            lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n")
+            recommendations.append((model, best_n, best_avg, improvement))
+        else:
+            lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")
+
+    # Recommendations section
+    lines.append("\n---\n")
+    lines.append("## Recommended `llama_extra_args` per model\n")
+    lines.append("| Model | n_max | avg tok/s | vs baseline | suggested flags |")
+    lines.append("|-------|-------|-----------|-------------|-----------------|")
+    for model, n, avg, imp in recommendations:
+        if n > 0:
+            flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
+        else:
+            flags = "_(none — MTP not beneficial)_"
+        lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")
+
+    lines.append("")
+    summary = "\n".join(lines)
+    SUMMARY_PATH.write_text(summary)
+    print(summary)
+    print(f"\nWritten to: {SUMMARY_PATH}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/3d/run_sweep.py
+++ b/benchmarks/3d/run_sweep.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""MTP n_max sweep across MTP-capable models via llama-sidecar.
+
+Usage:
+    python3 run_sweep.py             # full sweep
+    python3 run_sweep.py --dry-run   # print matrix, no API calls
+    python3 run_sweep.py --limit 1   # run first combo only (smoke)
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.request import Request, urlopen
+from urllib.error import URLError, HTTPError
+
+# Base URL of the sidecar daemon; override with the SIDECAR_URL environment variable.
+SIDECAR_URL = os.environ.get("SIDECAR_URL", "http://100.101.41.16:8402")
+# Measurements are accumulated in this JSON file next to the script.
+RESULTS_PATH = Path(__file__).parent / "results.json"
+
+# Sweep matrix: (model id, n_max values to try). n_max == 0 sends no MTP
+# flags (see build_flags).
+MATRIX = [
+    ("qwen3.6-35b-a3b-mxfp4", [0, 1, 2, 3]),
+    ("qwen3.6-27b-mtp",        [0, 1, 2, 3, 4]),
+    ("qwopus3.6-27b-v2-mtp",   [0, 2]),
+    ("qwopus3.5-9b-coder-mtp", [0, 2]),
+]
+
+# Benchmark prompts keyed by name; each entry holds the user message
+# ("content") and the completion budget ("max_tokens") sent with it.
+PROMPTS = {
+    "short": {
+        "content": "Reply with exactly five words: a haiku-like greeting.",
+        "max_tokens": 100,
+    },
+    "medium": {
+        "content": (
+            "Explain how multi-token prediction speculative decoding works in transformer "
+            "inference. Cover: 1) the draft model role, 2) the verification mechanism, "
+            "3) acceptance rate dynamics, 4) why MoE models gain less than dense models. "
+            "Aim for 400-500 words."
+        ),
+        "max_tokens": 700,
+    },
+    "long": {
+        "content": (
+            "Write a complete Python implementation of a simple HTTP server that "
+            "accepts POST requests on /v1/chat/completions, validates JSON bodies "
+            "against a basic OpenAI schema, logs each request to stdout in JSON "
+            "format, and returns a hardcoded streaming response. Include error "
+            "handling for malformed JSON, missing required fields, and unsupported "
+            "methods. Add docstrings and type hints throughout. Show full file."
+        ),
+        "max_tokens": 2500,
+    },
+}
+
+
+def build_flags(n_max: int) -> str:
+    if n_max > 0:
+        return f"--spec-type draft-mtp --spec-draft-n-max {n_max} --repeat-penalty 1.0"
+    return "--repeat-penalty 1.0"
+
+
+def sidecar_request(method: str, path: str, body: dict | None = None,
+                    headers: dict | None = None, timeout: int = 180) -> dict | None:
+    url = f"{SIDECAR_URL}{path}"
+    data = json.dumps(body).encode() if body else None
+    hdrs = {"Content-Type": "application/json"}
+    if headers:
+        hdrs.update(headers)
+    req = Request(url, data=data, headers=hdrs, method=method)
+    try:
+        with urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read())
+    except HTTPError as e:
+        body_text = e.read().decode(errors="replace")
+        try:
+            return json.loads(body_text)
+        except json.JSONDecodeError:
+            return {"error": f"HTTP {e.code}", "body": body_text[:500]}
+    except URLError as e:
+        return {"error": str(e)}
+
+
+def send_completion(model: str, flags: str, prompt: str, max_tokens: int) -> dict:
+    body = {
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "stream": False,
+    }
+    headers = {
+        "X-Agent-Flags": flags,
+        "X-Model-Id": model,
+    }
+    t0 = time.perf_counter()
+    resp = sidecar_request("POST", "/v1/chat/completions", body=body, headers=headers)
+    wall_ms = (time.perf_counter() - t0) * 1000
+    if resp is None:
+        return {"error": "no response", "wall_clock_ms": wall_ms}
+    resp["wall_clock_ms"] = wall_ms
+    return resp
+
+
+def extract_metrics(resp: dict, model: str, n_max: int, prompt_name: str) -> dict:
+    timings = resp.get("timings", {})
+    usage = resp.get("usage", {})
+    sidecars = sidecar_request("GET", "/sidecars") or []
+    sidecar_hash = ""
+    sidecar_port = 0
+    if isinstance(sidecars, list):
+        for s in sidecars:
+            if s.get("model_id") == model:
+                sidecar_hash = s.get("hash", "")
+                sidecar_port = s.get("port", 0)
+                break
+
+    return {
+        "model": model,
+        "n_max": n_max,
+        "prompt": prompt_name,
+        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
+        "completion_tokens": usage.get("completion_tokens"),
+        "prompt_tokens": usage.get("prompt_tokens"),
+        "eval_tok_s": timings.get("predicted_per_second"),
+        "prompt_tok_s": timings.get("prompt_per_second"),
+        "eval_ms": timings.get("predicted_ms"),
+        "prompt_ms": timings.get("prompt_ms"),
+        "draft_n": timings.get("draft_n"),
+        "draft_n_accepted": timings.get("draft_n_accepted"),
+        "wall_clock_ms": resp.get("wall_clock_ms"),
+        "sidecar_hash": sidecar_hash,
+        "sidecar_port": sidecar_port,
+        "error": resp.get("error"),
+    }
+
+
+def append_result(row: dict) -> None:
+    results = []
+    if RESULTS_PATH.exists():
+        try:
+            results = json.loads(RESULTS_PATH.read_text())
+        except (json.JSONDecodeError, OSError):
+            pass
+    results.append(row)
+    RESULTS_PATH.write_text(json.dumps(results, indent=2) + "\n")
+
+
+def evict_all_sidecars() -> None:
+    sidecars = sidecar_request("GET", "/sidecars")
+    if not isinstance(sidecars, list):
+        return
+    for s in sidecars:
+        h = s.get("hash", "")
+        if h:
+            sidecar_request("DELETE", f"/sidecars/{h}")
+
+
+def run_combo(model: str, n_max: int, combo_idx: int, total_combos: int,
+              prompt_names: list[str]) -> None:
+    flags = build_flags(n_max)
+    label = f"[{combo_idx}/{total_combos}] {model} n_max={n_max}"
+    print(f"\n{'='*60}")
+    print(f"{label}")
+    print(f"  flags: {flags}")
+    print(f"{'='*60}")
+
+    for pname in prompt_names:
+        p = PROMPTS[pname]
+        # Warmup
+        print(f"  {pname}: warmup...", end="", flush=True)
+        send_completion(model, flags, p["content"], p["max_tokens"])
+        print(" done.", flush=True)
+        time.sleep(2)
+
+        # Record
+        print(f"  {pname}: recording...", end="", flush=True)
+        resp = send_completion(model, flags, p["content"], p["max_tokens"])
+        row = extract_metrics(resp, model, n_max, pname)
+        append_result(row)
+
+        tok_s = row.get("eval_tok_s")
+        draft = row.get("draft_n")
+        err = row.get("error")
+        if err:
+            print(f" ERROR: {err}")
+        elif tok_s:
+            draft_str = f" draft_n={draft}" if draft else ""
+            print(f" {tok_s:.1f} tok/s{draft_str}")
+        else:
+            print(" (no timings in response)")
+
+    # Evict this sidecar to free VRAM
+    evict_all_sidecars()
+    print(f"  evicted sidecars, sleeping 5s for VRAM release...")
+    time.sleep(5)
+
+
+def dry_run() -> None:
+    combos = [(model, n) for model, ns in MATRIX for n in ns]
+    print(f"Dry run: {len(combos)} combos × 3 prompts × 2 calls = {len(combos)*6} API calls")
+    print(f"Estimated runtime: 60-90 minutes\n")
+    for i, (model, n_max) in enumerate(combos, 1):
+        flags = build_flags(n_max)
+        print(f"  [{i}/{len(combos)}] {model} n_max={n_max}")
+        print(f"    flags: {flags}")
+        for pname in PROMPTS:
+            p = PROMPTS[pname]
+            print(f"    {pname}: max_tokens={p['max_tokens']}")
+    print(f"\nResults would be written to: {RESULTS_PATH}")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="MTP n_max sweep benchmark")
+    parser.add_argument("--dry-run", action="store_true", help="Print matrix without running")
+    parser.add_argument("--limit", type=int, default=0, help="Run only first N combos")
+    args = parser.parse_args()
+
+    if args.dry_run:
+        dry_run()
+        return
+
+    # Check sidecar health
+    health = sidecar_request("GET", "/health")
+    if not health or health.get("status") != "ok":
+        print(f"Sidecar unhealthy: {health}", file=sys.stderr)
+        sys.exit(1)
+    print(f"Sidecar healthy: {health}")
+
+    # Clear existing sidecars
+    evict_all_sidecars()
+
+    combos = [(model, n) for model, ns in MATRIX for n in ns]
+    if args.limit > 0:
+        combos = combos[:args.limit]
+    prompt_names = list(PROMPTS.keys())
+
+    t_start = time.perf_counter()
+    for i, (model, n_max) in enumerate(combos, 1):
+        run_combo(model, n_max, i, len(combos), prompt_names)
+
+    elapsed = time.perf_counter() - t_start
+    print(f"\nSweep complete. {len(combos)} combos in {elapsed/60:.1f} minutes.")
+    print(f"Results: {RESULTS_PATH}")
+
+
+if __name__ == "__main__":
+    main()
--- a/cmd/llama-sidecar/main.go
+++ b/cmd/llama-sidecar/main.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"os"
+	"time"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/config"
+	"github.com/indifferentketchup/llama-sidecar/internal/pool"
+	"github.com/indifferentketchup/llama-sidecar/internal/server"
+	"github.com/indifferentketchup/llama-sidecar/internal/winsvc"
+)
+
+// main loads the configuration, builds the sidecar pool and HTTP server,
+// serves in a background goroutine, and registers a shutdown handler that
+// first drains the HTTP server and then shuts down the pool.
+func main() {
+	cfg, err := config.Load()
+	if err != nil {
+		// The logger is not initialised yet, so report on stderr directly.
+		fmt.Fprintf(os.Stderr, "config error: %v\n", err)
+		os.Exit(1)
+	}
+
+	initLogger(cfg.LogLevel)
+	slog.Info("starting llama-sidecar",
+		"bind", cfg.Bind,
+		"max_sidecars", cfg.MaxSidecars,
+		"port_range", fmt.Sprintf("%d-%d", cfg.PortRangeLo, cfg.PortRangeHi),
+		"models", len(cfg.ModelDirMap),
+		"base_args", cfg.BaseArgs,
+	)
+
+	startedAt := time.Now()
+	spawner := &pool.RealSpawner{}
+	p := pool.New(cfg, spawner)
+	srv := server.New(cfg, p, startedAt)
+
+	go func() {
+		slog.Info("listening", "addr", cfg.Bind)
+		// http.ErrServerClosed is the normal result of srv.Shutdown and is
+		// not treated as a failure.
+		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			slog.Error("server error", "err", err)
+			// os.Exit terminates immediately, so the shutdown handler below
+			// (and therefore p.Shutdown) does not run on this path.
+			os.Exit(1)
+		}
+	}()
+
+	// NOTE(review): presumably RegisterShutdownHandler blocks until a
+	// shutdown is requested; otherwise main would return right away —
+	// confirm in internal/winsvc.
+	winsvc.RegisterShutdownHandler(context.Background(), func(ctx context.Context) error {
+		slog.Info("draining HTTP server")
+		// Give in-flight HTTP requests up to 10s to finish.
+		drainCtx, drainCancel := context.WithTimeout(ctx, 10*time.Second)
+		defer drainCancel()
+		if err := srv.Shutdown(drainCtx); err != nil {
+			// A failed drain is logged only; pool shutdown still proceeds.
+			slog.Error("HTTP drain failed", "err", err)
+		}
+		slog.Info("shutting down sidecar pool")
+		// The pool gets its own 30s budget, derived from ctx rather than
+		// from the (possibly expired) drain context.
+		poolCtx, poolCancel := context.WithTimeout(ctx, 30*time.Second)
+		defer poolCancel()
+		return p.Shutdown(poolCtx)
+	})
+}
+
+func initLogger(level string) {
+	var lvl slog.Level
+	switch level {
+	case "debug":
+		lvl = slog.LevelDebug
+	case "warn":
+		lvl = slog.LevelWarn
+	case "error":
+		lvl = slog.LevelError
+	default:
+		lvl = slog.LevelInfo
+	}
+	handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: lvl})
+	slog.SetDefault(slog.New(handler))
+}
--- a/eval/ab/prompts.json
+++ b/eval/ab/prompts.json
@@ -0,0 +1,72 @@
+[
+  {
+    "id": "review-1",
+    "agent": "Code Reviewer",
+    "prompt": "Review the `buildHeadPayload` function in `apps/server/src/services/compaction.ts`. It was recently patched in v1.13.6 to embed `reasoning_parts` as a `<reasoning>...</reasoning>` prose prefix on the assistant content for tool-bearing turns. Check: does the current implementation handle the case where `reasoning_parts` is an empty array? Does it handle turns that have both reasoning_parts AND non-empty text content (not just tool calls)? Cite file:line for any issues."
+  },
+  {
+    "id": "review-2",
+    "agent": "Code Reviewer",
+    "prompt": "Review the path guard layer in `apps/coder/services/path_guard.ts`. It enforces per-project scoping with a blanket `/opt:rw` mount and policy at the tool layer. Check for: symlink traversal (does it resolve symlinks before checking?), double-encoding attacks on path components, race conditions between check and use (TOCTOU), and whether `extraRoots` from `request_read_access` grants could be abused to escape the project scope. Cite file:line."
+  },
+  {
+    "id": "debug-1",
+    "agent": "Debugger",
+    "prompt": "Bug report: after a long BooCode chat session (~40 messages), the compaction trigger fires but the resulting summary is empty — the assistant message with `summary=true` has blank content. The `ctx_max` is correctly fetched from `/upstream/<model>/props` (verified in logs). The `needs_compaction` flag is being set. But the summary inference returns an empty string. This started happening after the v1.13.7 compaction trigger change that lowered the threshold to `floor(0.85 * ctx_max)`. Diagnose: what code path could produce an empty summary, and what would you check first?"
+  },
+  {
+    "id": "debug-2",
+    "agent": "Debugger",
+    "prompt": "Bug report: BooTerm terminal pane shows garbled output past column 66 on initial open, but corrects itself after manually resizing the browser window. The `stty size` inside the terminal reports `82 66` even though the pane is visually ~132 columns wide. tmux `list-windows` confirms the session was created at 66 columns. This only happens when opening a terminal pane via the split-pane button, not when opening it as the sole pane. Diagnose the root cause in `apps/web/src/components/panes/TerminalPane.tsx`."
+  },
+  {
+    "id": "refactor-1",
+    "agent": "Refactorer",
+    "prompt": "The `streamCompletion` function in `apps/server/src/services/provider.ts` has grown to handle: AI SDK v6 streaming, XML fallback parsing for qwen3.6 tool-call emissions, abort signal handling (the explicit `if (signal?.aborted) throw` patch), reasoning-delta counting, and usage extraction. It's now ~200 lines. Propose a refactor that separates concerns without breaking the streaming contract. The function must remain a single entry point for callers."
+  },
+  {
+    "id": "refactor-2",
+    "agent": "Refactorer",
+    "prompt": "The WebSocket frame publishing in BooCode went through two batches (v1.13.12 + v1.13.13) that converted ~80 publish sites to typed `publishFrame`/`publishUserFrame` wrappers with Zod validation. The schemas are duplicated byte-identical between `apps/server/src/types/ws-frames.ts` and `apps/web/src/api/ws-frames.ts` with a parity test. Propose a refactor to share the schema definition from a single source instead of maintaining the duplication + parity test."
+  },
+  {
+    "id": "architect-1",
+    "agent": "Architect",
+    "prompt": "Design the system-prompt prefix cache for BooCode. Context: `buildSystemPromptWithFingerprint` already computes a SHA-256 of the assembled prefix and logs drift. The prefix is rebuilt on every inference turn from: project settings, agent instructions (AGENTS.md), skills, session-level overrides, and web_search_enabled flag. Most of these don't change between turns in the same session. Design a cache that avoids rebuilding+rehashing on every turn. Consider: process-memory vs DB-backed, invalidation strategy, cache key shape, and whether the fingerprint can serve as the cache key itself."
+  },
+  {
+    "id": "architect-2",
+    "agent": "Architect",
+    "prompt": "Design the v2.5 task model integration with BooCoder's ACP dispatch. Context: v2.5.0-task-model just shipped a `tasks` table and lightweight task model services. BooCoder dispatches external agents (opencode, goose, claude) via ACP or PTY. Design how a task created in BooChat should flow through to a BooCoder dispatch: task creation → agent selection → ACP session → status updates back to the task row → completion. Consider: which fields from the task row map to ACP session params, how task status syncs with the agent's exit code, and how the UI surfaces progress."
+  },
+  {
+    "id": "security-1",
+    "agent": "Security Auditor",
+    "prompt": "Audit the `web_fetch` tool implementation in BooCode. It fetches arbitrary URLs on behalf of the LLM agent. Check for: SSRF against internal Tailscale IPs (100.x.x.x), DNS rebinding, redirect following to internal hosts, response size limits, content-type validation, and whether the `url_guard.ts` layer covers all cases. The tool is gated by `session.web_search_enabled` but once enabled, the URL is user-agent-controlled (the LLM decides what to fetch)."
+  },
+  {
+    "id": "security-2",
+    "agent": "Security Auditor",
+    "prompt": "Audit the `request_read_access` tool and `allowed_read_paths` grant mechanism (v1.13.17). When an agent needs to read files outside its project scope, it calls `request_read_access(path)` which triggers an `ask_user_input` elicitation for approval. On approval, the path is added to `allowed_read_paths` for that session, and `pathGuard` is extended with `extraRoots`. Check: can the agent request a path like `/etc/shadow` or `/opt/boocode/.env`? Is the grant scoped to the session or persistent? Can the path be a symlink that resolves to a sensitive location after the grant?"
+  },
+  {
+    "id": "prompt-1",
+    "agent": "Prompt Builder",
+    "prompt": "Write a Claude Code dispatch prompt for: adding a new BooCode agent called 'Documenter' to AGENTS.md. The agent should read source files and produce inline JSDoc/TSDoc comments. It should use the read-only tool set. Temperature 0.4, steps 10. The prompt should include pre-flight checks, the exact file to modify, backup instructions, and verification steps."
+  },
+  {
+    "id": "prompt-2",
+    "agent": "Prompt Builder",
+    "prompt": "Write an OpenCode dispatch prompt for: fixing the codecontext sidecar to handle projects with more than 10,000 files without OOMing. The fork is at /opt/forks/codecontext/. The agent should investigate the memory profile of the graph analysis pass, identify the allocation hotspot, and propose a streaming or chunked alternative. Include #careful hashtag, backup rules, and stop conditions."
+  },
+  {
+    "id": "recon-1",
+    "agent": "Recon",
+    "prompt": "Map the BooCode monorepo at /opt/boocode/. I need: top-level directory structure, the three apps and their roles, how they share the database, the Docker container topology, and the key service files in apps/server/src/services/. Identify the data flow from a user message in BooChat through to the LLM inference call and back."
+  },
+  {
+    "id": "recon-2",
+    "agent": "Recon",
+    "prompt": "Map the codecontext fork at /opt/forks/codecontext/. I need: the MCP tool surface (what tools are exposed), the parser architecture (how tree-sitter grammars are registered), the graph analysis pipeline (how dependencies and call graphs are built), and the codesight-merge additions (blast radius, hot files, routes, middleware). Identify the main entry points and the caching layer."
+  }
+]
--- a/eval/ab/run.sh
+++ b/eval/ab/run.sh
@@ -0,0 +1,242 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ENDPOINT="http://100.101.41.16:8401/v1"
+PROMPTS_FILE="${SCRIPT_DIR}/prompts.json"
+RESULTS_DIR="${SCRIPT_DIR}/results"
+COMPARE_FILE="${SCRIPT_DIR}/COMPARE.md"
+TIMING_FILE="${SCRIPT_DIR}/timing.csv"
+
+MODELS=(
+  qwen3.6-35b-a3b-mxfp4
+  qwen3-coder-30b-apex
+  qwen3.6-27b-mtp
+  qwopus3.5-4b-mtp
+  qwen3.5-9b-deepseek-v4-mtp
+  qwopus3.6-35b-a3b-v1
+  qwopus3.6-27b-v2-mtp
+  qwopus3.5-9b-coder-mtp
+)
+
+mkdir -p "$RESULTS_DIR"
+
+# ── Parse prompts ─────────────────────────────────────────────────────
+
+PROMPT_COUNT=$(python3 -c "import json; print(len(json.load(open('${PROMPTS_FILE}'))))")
+TOTAL=$((PROMPT_COUNT * ${#MODELS[@]}))
+EST_MIN=$(( TOTAL * 30 / 60 ))
+
+echo "================================================================"
+echo "  A/B MODEL COMPARISON"
+echo "  ${PROMPT_COUNT} prompts × ${#MODELS[@]} models = ${TOTAL} requests"
+echo "  Estimated runtime: ~${EST_MIN} minutes"
+echo "  Endpoint: ${ENDPOINT}"
+echo "================================================================"
+echo ""
+
+# ── Main loop: models (outer) × prompts (inner) ──────────────────────
+# One model load per model, all prompts answered, then swap.
+
+t_start=$(date +%s)
+done_count=0
+
+for model in "${MODELS[@]}"; do
+  echo ""
+  echo "================================================================"
+  echo "  MODEL: ${model}"
+  echo "================================================================"
+
+  # Warmup: load the model with a trivial request
+  all_cached=true
+  for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
+    PID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
+    if [ ! -f "${RESULTS_DIR}/${PID}/${model}.json" ] || [ ! -s "${RESULTS_DIR}/${PID}/${model}.json" ]; then
+      all_cached=false
+      break
+    fi
+  done
+
+  if [ "$all_cached" = "true" ]; then
+    echo "  All ${PROMPT_COUNT} prompts cached, skipping model"
+    for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
+      done_count=$((done_count + 1))
+    done
+    continue
+  fi
+
+  echo "  Warming up..."
+  curl -s -X POST "${ENDPOINT}/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d "{\"model\":\"${model}\",\"messages\":[{\"role\":\"user\",\"content\":\"Say OK.\"}],\"max_tokens\":10,\"temperature\":0}" \
+    --max-time 300 > /dev/null 2>&1
+  echo "  Warm."
+
+  for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
+    PROMPT_ID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
+    AGENT=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['agent'])")
+
+    mkdir -p "${RESULTS_DIR}/${PROMPT_ID}"
+    OUT_JSON="${RESULTS_DIR}/${PROMPT_ID}/${model}.json"
+    OUT_MD="${RESULTS_DIR}/${PROMPT_ID}/${model}.md"
+
+    # Resume: skip if already done
+    if [ -f "$OUT_JSON" ] && [ -s "$OUT_JSON" ]; then
+      done_count=$((done_count + 1))
+      echo "  [${PROMPT_ID}] cached (${done_count}/${TOTAL})"
+      continue
+    fi
+
+    BODY=$(python3 -c "
+import json
+p = json.load(open('${PROMPTS_FILE}'))[${pidx}]
+print(json.dumps({
+    'model': '${model}',
+    'messages': [{'role': 'user', 'content': p['prompt']}],
+    'temperature': 0.6,
+    'max_tokens': 2048,
+    'seed': 42,
+    'stream': False
+}))
+")
+
+    SUCCESS=0
+    for attempt in 1 2; do
+      HTTP_CODE=$(curl -s -w '%{http_code}' -o "$OUT_JSON" \
+        --max-time 300 \
+        -X POST "${ENDPOINT}/chat/completions" \
+        -H "Content-Type: application/json" \
+        -d "$BODY" 2>/dev/null)
+
+      if [ "$HTTP_CODE" = "200" ]; then
+        SUCCESS=1
+        break
+      else
+        if [ "$attempt" = "1" ]; then
+          echo "  [${PROMPT_ID}] HTTP ${HTTP_CODE}, retrying in 10s..."
+          sleep 10
+        else
+          echo "ERROR: HTTP ${HTTP_CODE}" > "$OUT_MD"
+          echo "  [${PROMPT_ID}] FAILED (HTTP ${HTTP_CODE})"
+        fi
+      fi
+    done
+
+    if [ "$SUCCESS" = "1" ]; then
+      python3 -c "
+import json
+d = json.load(open('${OUT_JSON}'))
+msg = d.get('choices', [{}])[0].get('message', {})
+content = msg.get('content', '') or ''
+reasoning = msg.get('reasoning_content', '') or ''
+out = ''
+if reasoning:
+    out += '<think>\n' + reasoning + '\n</think>\n\n'
+out += content
+open('${OUT_MD}', 'w').write(out)
+" 2>/dev/null
+      done_count=$((done_count + 1))
+      METRICS=$(python3 -c "
+import json
+d = json.load(open('${OUT_JSON}'))
+t = d.get('timings', {})
+tps = t.get('predicted_per_second', 0)
+tok = d.get('usage', {}).get('completion_tokens', 0)
+print(f'{tps:.1f}tok/s {tok}tok')
+" 2>/dev/null || echo "?")
+      echo "  [${PROMPT_ID}] done (${METRICS}) [${done_count}/${TOTAL}]"
+    fi
+
+    sleep 2
+  done
+done
+
+# ── Generate COMPARE.md ──────────────────────────────────────────────
+
+echo ""
+echo "Generating COMPARE.md..."
+
+MODELS_JSON=$(printf '%s\n' "${MODELS[@]}" | python3 -c "import json,sys; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))")
+
+python3 -c "
+import json
+from pathlib import Path
+
+prompts = json.load(open('${PROMPTS_FILE}'))
+results_dir = Path('${RESULTS_DIR}')
+models = json.loads('${MODELS_JSON}')
+
+lines = ['# A/B Model Comparison\n']
+
+timing_rows = []
+
+for p in prompts:
+    pid = p['id']
+    agent = p['agent']
+    short = p['prompt'][:80]
+    lines.append(f'## [{pid}] {agent}\n')
+    lines.append(f'> {short}...\n')
+
+    for model in models:
+        md_path = results_dir / pid / f'{model}.md'
+        json_path = results_dir / pid / f'{model}.json'
+        lines.append(f'### {model}\n')
+        if md_path.exists():
+            content = md_path.read_text().strip()
+            lines.append(f'{content}\n')
+        else:
+            lines.append('*(no response)*\n')
+
+        if json_path.exists():
+            try:
+                d = json.loads(json_path.read_text())
+                t = d.get('timings', {})
+                u = d.get('usage', {})
+                timing_rows.append({
+                    'prompt_id': pid,
+                    'model_id': model,
+                    'prompt_tps': t.get('prompt_per_second', 0),
+                    'predicted_tps': t.get('predicted_per_second', 0),
+                    'total_tokens': u.get('total_tokens', 0),
+                    'latency_ms': round((t.get('prompt_ms', 0) or 0) + (t.get('predicted_ms', 0) or 0), 1),
+                })
+            except:
+                pass
+    lines.append('---\n')
+
+# Timing table
+lines.append('## Timing Summary\n')
+pids = list(dict.fromkeys(r['prompt_id'] for r in timing_rows))
+lines.append('| prompt | ' + ' | '.join(models) + ' |')
+lines.append('|--------' + '|------' * len(models) + '|')
+for pid in pids:
+    cells = []
+    for model in models:
+        match = [r for r in timing_rows if r['prompt_id'] == pid and r['model_id'] == model]
+        if match:
+            cells.append(f\"{match[0]['predicted_tps']:.0f}\")
+        else:
+            cells.append('—')
+    lines.append(f'| {pid} | ' + ' | '.join(cells) + ' |')
+
+Path('${COMPARE_FILE}').write_text('\n'.join(lines) + '\n')
+print(f'Wrote ${COMPARE_FILE}')
+
+# timing.csv
+import csv
+with open('${TIMING_FILE}', 'w', newline='') as f:
+    w = csv.DictWriter(f, fieldnames=['prompt_id', 'model_id', 'prompt_tps', 'predicted_tps', 'total_tokens', 'latency_ms'])
+    w.writeheader()
+    w.writerows(timing_rows)
+print(f'Wrote ${TIMING_FILE}')
+"
+
+t_end=$(date +%s)
+elapsed=$(( t_end - t_start ))
+echo ""
+echo "================================================================"
+echo "  COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s"
+echo "  Results: ${RESULTS_DIR}/"
+echo "  Compare: ${COMPARE_FILE}"
+echo "  Timing:  ${TIMING_FILE}"
+echo "================================================================"
--- a/eval/analyze.py
+++ b/eval/analyze.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""Generate SUMMARY.md from scores.csv."""
+
+import csv
+from collections import defaultdict
+from pathlib import Path
+
+# Input: per-question score rows, read by load_scores(); lives next to this script.
+CSV_PATH = Path(__file__).parent / "scores.csv"
+# Output: Markdown report written by main(); lives next to this script.
+SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md"
+
+
+def load_scores() -> list[dict]:
+    rows = []
+    with open(CSV_PATH) as f:
+        for row in csv.DictReader(f):
+            row["correct"] = row["correct"].lower() in ("true", "1", "yes")
+            row["latency_ms"] = float(row.get("latency_ms", 0) or 0)
+            rows.append(row)
+    return rows
+
+
+def main() -> None:
+    rows = load_scores()
+    if not rows:
+        print("No data in scores.csv")
+        return
+
+    models = sorted(set(r["model"] for r in rows))
+    benchmarks = ["mmlu", "gsm8k", "humaneval"]
+
+    # Compute scores
+    scores = {}  # (model, bench) -> (correct, total)
+    for r in rows:
+        key = (r["model"], r["benchmark"])
+        if key not in scores:
+            scores[key] = [0, 0]
+        scores[key][1] += 1
+        if r["correct"]:
+            scores[key][0] += 1
+
+    # MMLU per-category
+    cat_scores = defaultdict(lambda: [0, 0])
+    for r in rows:
+        if r["benchmark"] == "mmlu" and r.get("category"):
+            key = (r["model"], r["category"])
+            cat_scores[key][1] += 1
+            if r["correct"]:
+                cat_scores[key][0] += 1
+
+    categories = sorted(set(r.get("category", "") for r in rows if r.get("category")))
+
+    lines = ["# Eval Results\n"]
+
+    # Main table
+    lines.append("## Overall Scores\n")
+    header = "| Model | MMLU (%) | GSM8K (%) | HumanEval (%) | Avg (%) |"
+    sep = "|-------|---------|---------|--------------|---------|"
+    lines.append(header)
+    lines.append(sep)
+
+    model_avgs = []
+    for model in models:
+        cells = []
+        pcts = []
+        for bench in benchmarks:
+            key = (model, bench)
+            if key in scores:
+                c, t = scores[key]
+                pct = c / t * 100 if t > 0 else 0
+                cells.append(f"{pct:.1f}")
+                pcts.append(pct)
+            else:
+                cells.append("—")
+        avg = sum(pcts) / len(pcts) if pcts else 0
+        model_avgs.append((model, avg))
+        cells.append(f"{avg:.1f}")
+        lines.append(f"| {model} | " + " | ".join(cells) + " |")
+
+    # Sort summary
+    model_avgs.sort(key=lambda x: -x[1])
+    lines.append(f"\n**Best overall: {model_avgs[0][0]}** ({model_avgs[0][1]:.1f}% avg)\n")
+
+    # MMLU category breakdown
+    if categories:
+        lines.append("\n## MMLU Per-Category Breakdown\n")
+        header = "| Model | " + " | ".join(c.replace("_", " ").title() for c in categories) + " |"
+        sep = "|-------" + "|-------" * len(categories) + "|"
+        lines.append(header)
+        lines.append(sep)
+        for model in models:
+            cells = []
+            for cat in categories:
+                key = (model, cat)
+                if key in cat_scores:
+                    c, t = cat_scores[key]
+                    cells.append(f"{c}/{t}")
+                else:
+                    cells.append("—")
+            lines.append(f"| {model} | " + " | ".join(cells) + " |")
+
+    # Latency summary
+    lines.append("\n## Median Latency (ms)\n")
+    lines.append("| Model | MMLU | GSM8K | HumanEval |")
+    lines.append("|-------|------|-------|-----------|")
+    for model in models:
+        cells = []
+        for bench in benchmarks:
+            lats = sorted([r["latency_ms"] for r in rows
+                          if r["model"] == model and r["benchmark"] == bench
+                          and r["latency_ms"] > 0])
+            if lats:
+                med = lats[len(lats)//2]
+                cells.append(f"{med:.0f}")
+            else:
+                cells.append("—")
+        lines.append(f"| {model} | " + " | ".join(cells) + " |")
+
+    summary = "\n".join(lines) + "\n"
+    SUMMARY_PATH.write_text(summary)
+    print(summary)
+    print(f"\nWritten to: {SUMMARY_PATH}")
+
+
+# Script entry point: regenerate SUMMARY.md when the file is run directly.
+if __name__ == "__main__":
+    main()
--- a/eval/gsm8k.py
+++ b/eval/gsm8k.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""GSM8K 50-question subset benchmark (seed=42)."""
+
+import json
+import os
+import random
+import re
+import sys
+import time
+from pathlib import Path
+
+from datasets import load_dataset
+from openai import OpenAI
+from tqdm import tqdm
+
+# Base URL handed to the OpenAI client; override with LLAMA_SWAP_URL.
+ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
+# Raw responses are stored under results/<model>/gsm8k/ next to this script.
+RESULTS_DIR = Path(__file__).parent / "results"
+# Sent as max_tokens on every completion request.
+MAX_TOKENS = 512
+# Seeds both the question shuffle and the completion requests.
+SEED = 42
+# Sent as temperature on every completion request.
+TEMPERATURE = 0
+# Number of shuffled test questions kept for the subset.
+N_QUESTIONS = 50
+
+
+def load_questions() -> list[dict]:
+    rng = random.Random(SEED)
+    ds = load_dataset("openai/gsm8k", "main", split="test", trust_remote_code=True)
+    indices = list(range(len(ds)))
+    rng.shuffle(indices)
+    questions = []
+    for idx in indices[:N_QUESTIONS]:
+        row = ds[idx]
+        answer_text = row["answer"]
+        # GSM8K answer format: "#### <number>" at end
+        match = re.search(r"####\s*([0-9,.-]+)", answer_text)
+        expected = int(match.group(1).replace(",", "")) if match else 0
+        questions.append({
+            "id": f"gsm8k_{idx}",
+            "question": row["question"],
+            "expected": expected,
+        })
+    return questions
+
+
+def format_prompt(q: dict) -> str:
+    return (
+        "Solve this problem step by step, then on the final line write "
+        "'ANSWER: <number>'.\n\n" + q["question"]
+    )
+
+
+def parse_answer(text: str) -> int | None:
+    matches = re.findall(r"ANSWER:\s*([0-9,.-]+)", text, re.IGNORECASE)
+    if matches:
+        try:
+            return int(matches[-1].replace(",", ""))
+        except ValueError:
+            return None
+    # Fallback: last number in the response
+    nums = re.findall(r"-?\d[\d,]*", text)
+    if nums:
+        try:
+            return int(nums[-1].replace(",", ""))
+        except ValueError:
+            return None
+    return None
+
+
+def run_gsm8k(model: str, client: OpenAI, questions: list[dict]) -> list[dict]:
+    model_dir = RESULTS_DIR / model / "gsm8k"
+    model_dir.mkdir(parents=True, exist_ok=True)
+
+    results = []
+    correct = 0
+    total = 0
+
+    skipped = 0
+    for i, q in enumerate(tqdm(questions, desc=f"  GSM8K {model}", file=sys.stderr)):
+        expected = q["expected"]
+        out_path = model_dir / f"{q['id']}.json"
+
+        if out_path.exists():
+            try:
+                cached = json.loads(out_path.read_text())
+                raw = ""
+                if "choices" in cached:
+                    msg = cached["choices"][0].get("message", {})
+                    raw = msg.get("content", "") or msg.get("reasoning_content", "") or ""
+                parsed = parse_answer(raw)
+                is_correct = parsed is not None and parsed == expected
+                if is_correct:
+                    correct += 1
+                total += 1
+                results.append({
+                    "model": model, "benchmark": "gsm8k", "question_id": q["id"],
+                    "correct": is_correct, "raw_answer": raw[:200],
+                    "parsed_answer": str(parsed) if parsed is not None else "",
+                    "expected": str(expected), "latency_ms": 0,
+                })
+                skipped += 1
+                continue
+            except (json.JSONDecodeError, KeyError):
+                pass
+
+        prompt = format_prompt(q)
+        t0 = time.time()
+        resp_json = None
+        for attempt in range(2):
+            try:
+                resp = client.chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=MAX_TOKENS,
+                    temperature=TEMPERATURE,
+                    seed=SEED,
+                )
+                resp_json = resp.model_dump()
+                break
+            except Exception as e:
+                if attempt == 0:
+                    time.sleep(5)
+                else:
+                    resp_json = {"error": str(e)}
+        latency = (time.time() - t0) * 1000
+
+        raw = ""
+        if resp_json and "choices" in resp_json:
+            msg = resp_json["choices"][0].get("message", {})
+            raw = msg.get("content", "") or msg.get("reasoning_content", "") or ""
+
+        parsed = parse_answer(raw)
+        is_correct = parsed is not None and parsed == expected
+        if is_correct:
+            correct += 1
+        total += 1
+
+        out_path.write_text(json.dumps(resp_json, indent=2, default=str))
+
+        results.append({
+            "model": model,
+            "benchmark": "gsm8k",
+            "question_id": q["id"],
+            "correct": is_correct,
+            "raw_answer": raw[:200],
+            "parsed_answer": str(parsed) if parsed is not None else "",
+            "expected": str(expected),
+            "latency_ms": round(latency, 1),
+        })
+
+        if (i + 1) % 10 == 0:
+            print(f"  [{model}] GSM8K {i+1}/{len(questions)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)
+
+    if skipped:
+        print(f"  [{model}] GSM8K resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
+    print(f"  [{model}] GSM8K FINAL: {correct}/{total} ({correct/total*100:.1f}%)", file=sys.stderr)
+    return results
+
+
+# CLI entry point: the optional first argument is the model name. Result rows
+# are printed to stdout as one JSON object per line.
+if __name__ == "__main__":
+    model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4"
+    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
+    questions = load_questions()
+    results = run_gsm8k(model, client, questions)
+    for r in results:
+        print(json.dumps(r))
--- a/eval/humaneval.py
+++ b/eval/humaneval.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""HumanEval benchmark — 164 problems with sandboxed execution."""
+
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import textwrap
+import time
+from pathlib import Path
+
+from datasets import load_dataset
+from openai import OpenAI
+from tqdm import tqdm
+
+# Base URL handed to the OpenAI client; override with LLAMA_SWAP_URL.
+ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
+# Per-problem results are stored under results/<model>/humaneval/ next to this script.
+RESULTS_DIR = Path(__file__).parent / "results"
+# Sent as max_tokens on every completion request.
+MAX_TOKENS = 1024
+# Sent as seed on every completion request.
+SEED = 42
+# Sent as temperature on every completion request.
+TEMPERATURE = 0
+# Timeout, in seconds, passed to subprocess.run for each generated program.
+EXEC_TIMEOUT = 30
+
+
+def load_problems() -> list[dict]:
+    ds = load_dataset("openai/openai_humaneval", split="test", trust_remote_code=True)
+    problems = []
+    for row in ds:
+        problems.append({
+            "id": row["task_id"],
+            "prompt": row["prompt"],
+            "canonical": row["canonical_solution"],
+            "test": row["test"],
+            "entry_point": row["entry_point"],
+        })
+    return problems
+
+
+def extract_code(response: str, prompt: str) -> str:
+    # Try to find a code block
+    blocks = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
+    if blocks:
+        code = blocks[0]
+        # If the code block contains the function signature, use it directly
+        if "def " in code:
+            return code
+        # Otherwise prepend the prompt (function signature)
+        return prompt + code
+
+    # No code block — try to extract everything from the first def onwards
+    lines = response.split("\n")
+    in_code = False
+    code_lines = []
+    for line in lines:
+        if line.strip().startswith("def ") or in_code:
+            in_code = True
+            code_lines.append(line)
+        elif in_code and line.strip() == "":
+            code_lines.append(line)
+
+    if code_lines:
+        return "\n".join(code_lines)
+
+    # Last resort: prepend prompt to raw response
+    return prompt + response
+
+
+def run_test(code: str, test_code: str, entry_point: str) -> tuple[bool, str]:
+    full = code + "\n\n" + test_code + f"\n\ncheck({entry_point})\n"
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".py", dir="/tmp", delete=False
+    ) as f:
+        f.write(full)
+        f.flush()
+        fpath = f.name
+
+    try:
+        # Sandboxed execution: restrict to /tmp, limited PATH
+        env = {"PATH": "/usr/bin:/usr/local/bin", "HOME": "/tmp"}
+        result = subprocess.run(
+            [sys.executable, fpath],
+            capture_output=True, text=True,
+            timeout=EXEC_TIMEOUT,
+            cwd="/tmp",
+            env=env,
+        )
+        passed = result.returncode == 0
+        output = result.stderr[:500] if result.stderr else result.stdout[:500]
+        return passed, output
+    except subprocess.TimeoutExpired:
+        return False, "TIMEOUT"
+    except Exception as e:
+        return False, str(e)[:500]
+    finally:
+        try:
+            os.unlink(fpath)
+        except OSError:
+            pass
+
+
+def run_humaneval(model: str, client: OpenAI, problems: list[dict]) -> list[dict]:
+    model_dir = RESULTS_DIR / model / "humaneval"
+    model_dir.mkdir(parents=True, exist_ok=True)
+
+    results = []
+    correct = 0
+    total = 0
+
+    skipped = 0
+    for i, p in enumerate(tqdm(problems, desc=f"  HumanEval {model}", file=sys.stderr)):
+        out_path = model_dir / f"{p['id'].replace('/', '_')}.json"
+
+        if out_path.exists():
+            try:
+                cached = json.loads(out_path.read_text())
+                passed = cached.get("passed", False)
+                if passed:
+                    correct += 1
+                total += 1
+                results.append({
+                    "model": model, "benchmark": "humaneval",
+                    "question_id": p["id"], "correct": passed,
+                    "raw_answer": "", "parsed_answer": "pass" if passed else "fail",
+                    "expected": "pass", "latency_ms": 0,
+                })
+                skipped += 1
+                continue
+            except (json.JSONDecodeError, KeyError):
+                pass
+
+        t0 = time.time()
+        resp_json = None
+        for attempt in range(2):
+            try:
+                resp = client.chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", "content": (
+                        "Complete the following Python function. "
+                        "Return ONLY the complete function implementation.\n\n"
+                        + p["prompt"]
+                    )}],
+                    max_tokens=MAX_TOKENS,
+                    temperature=TEMPERATURE,
+                    seed=SEED,
+                )
+                resp_json = resp.model_dump()
+                break
+            except Exception as e:
+                if attempt == 0:
+                    time.sleep(5)
+                else:
+                    resp_json = {"error": str(e)}
+        latency = (time.time() - t0) * 1000
+
+        raw = ""
+        if resp_json and "choices" in resp_json:
+            msg = resp_json["choices"][0].get("message", {})
+            raw = msg.get("content", "") or msg.get("reasoning_content", "") or ""
+
+        code = extract_code(raw, p["prompt"])
+        passed, exec_output = run_test(code, p["test"], p["entry_point"])
+        if passed:
+            correct += 1
+        total += 1
+
+        out_path.write_text(json.dumps({
+            "response": resp_json,
+            "extracted_code": code[:2000],
+            "passed": passed,
+            "exec_output": exec_output,
+        }, indent=2, default=str))
+
+        results.append({
+            "model": model,
+            "benchmark": "humaneval",
+            "question_id": p["id"],
+            "correct": passed,
+            "raw_answer": raw[:200],
+            "parsed_answer": "pass" if passed else "fail",
+            "expected": "pass",
+            "latency_ms": round(latency, 1),
+        })
+
+        if (i + 1) % 10 == 0:
+            print(f"  [{model}] HumanEval {i+1}/{len(problems)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)
+
+    if skipped:
+        print(f"  [{model}] HumanEval resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
+    print(f"  [{model}] HumanEval FINAL: {correct}/{total} ({correct/total*100:.1f}%)", file=sys.stderr)
+    return results
+
+
+# CLI entry point: the optional first argument is the model name. Result rows
+# are printed to stdout as one JSON object per line.
+if __name__ == "__main__":
+    model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4"
+    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
+    problems = load_problems()
+    results = run_humaneval(model, client, problems)
+    for r in results:
+        print(json.dumps(r))
--- a/eval/mmlu.py
+++ b/eval/mmlu.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""MMLU 100-question subset benchmark (20 per category, seed=42)."""
+
+import json
+import os
+import random
+import re
+import sys
+import time
+from pathlib import Path
+
+from datasets import load_dataset
+from openai import OpenAI
+from tqdm import tqdm
+
+# Base URL of the OpenAI-compatible endpoint; override with LLAMA_SWAP_URL.
+ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
+# Per-model response files are written below this directory, next to this script.
+RESULTS_DIR = Path(__file__).parent / "results"
+# Sent as max_tokens with every completion request.
+MAX_TOKENS = 512
+# Seeds both the question sampler and the per-request `seed` parameter.
+SEED = 42
+# Sent as the sampling temperature with every completion request.
+TEMPERATURE = 0
+
+# MMLU subject names, passed to load_dataset as the dataset configuration.
+CATEGORIES = [
+    "high_school_mathematics",
+    "college_computer_science",
+    "professional_medicine",
+    "formal_logic",
+    "miscellaneous",
+]
+# Number of questions sampled from each category.
+PER_CATEGORY = 20
+
+# Option letters; the position in this list matches the dataset's answer index.
+CHOICES = ["A", "B", "C", "D"]
+
+
+def load_questions() -> list[dict]:
+    rng = random.Random(SEED)
+    questions = []
+    for cat in CATEGORIES:
+        ds = load_dataset("cais/mmlu", cat, split="test", trust_remote_code=True)
+        indices = list(range(len(ds)))
+        rng.shuffle(indices)
+        for idx in indices[:PER_CATEGORY]:
+            row = ds[idx]
+            questions.append({
+                "id": f"{cat}_{idx}",
+                "category": cat,
+                "question": row["question"],
+                "choices": row["choices"],
+                "answer_idx": row["answer"],
+            })
+    return questions
+
+
+def format_prompt(q: dict) -> str:
+    lines = [f"Question: {q['question']}"]
+    for i, choice in enumerate(q["choices"]):
+        lines.append(f"{CHOICES[i]}) {choice}")
+    lines.append("Answer with a single letter: ")
+    return "\n".join(lines)
+
+
+def parse_answer(text: str) -> str | None:
+    for ch in text.strip():
+        if ch.upper() in CHOICES:
+            return ch.upper()
+    return None
+
+
+def run_mmlu(model: str, client: OpenAI, questions: list[dict]) -> list[dict]:
+    model_dir = RESULTS_DIR / model / "mmlu"
+    model_dir.mkdir(parents=True, exist_ok=True)
+
+    results = []
+    correct = 0
+    total = 0
+
+    skipped = 0
+    for i, q in enumerate(tqdm(questions, desc=f"  MMLU {model}", file=sys.stderr)):
+        expected = CHOICES[q["answer_idx"]]
+        out_path = model_dir / f"{q['id']}.json"
+
+        # Resume: skip if result file exists
+        if out_path.exists():
+            try:
+                cached = json.loads(out_path.read_text())
+                raw = ""
+                if "choices" in cached:
+                    msg = cached["choices"][0].get("message", {})
+                    raw = msg.get("content", "") or msg.get("reasoning_content", "") or ""
+                parsed = parse_answer(raw)
+                is_correct = parsed == expected
+                if is_correct:
+                    correct += 1
+                total += 1
+                results.append({
+                    "model": model, "benchmark": "mmlu", "question_id": q["id"],
+                    "category": q["category"], "correct": is_correct,
+                    "raw_answer": raw[:200], "parsed_answer": parsed or "",
+                    "expected": expected, "latency_ms": 0,
+                })
+                skipped += 1
+                continue
+            except (json.JSONDecodeError, KeyError):
+                pass
+
+        prompt = format_prompt(q)
+        t0 = time.time()
+        resp_json = None
+        for attempt in range(2):
+            try:
+                resp = client.chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=MAX_TOKENS,
+                    temperature=TEMPERATURE,
+                    seed=SEED,
+                )
+                resp_json = resp.model_dump()
+                break
+            except Exception as e:
+                if attempt == 0:
+                    time.sleep(5)
+                else:
+                    resp_json = {"error": str(e)}
+        latency = (time.time() - t0) * 1000
+
+        raw = ""
+        if resp_json and "choices" in resp_json:
+            msg = resp_json["choices"][0].get("message", {})
+            raw = msg.get("content", "") or msg.get("reasoning_content", "") or ""
+
+        parsed = parse_answer(raw)
+        is_correct = parsed == expected
+        if is_correct:
+            correct += 1
+        total += 1
+
+        out_path.write_text(json.dumps(resp_json, indent=2, default=str))
+
+        results.append({
+            "model": model,
+            "benchmark": "mmlu",
+            "question_id": q["id"],
+            "category": q["category"],
+            "correct": is_correct,
+            "raw_answer": raw[:200],
+            "parsed_answer": parsed or "",
+            "expected": expected,
+            "latency_ms": round(latency, 1),
+        })
+
+        if (i + 1) % 10 == 0:
+            print(f"  [{model}] MMLU {i+1}/{len(questions)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)
+
+    if skipped:
+        print(f"  [{model}] MMLU resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
+    print(f"  [{model}] MMLU FINAL: {correct}/{total} ({correct/total*100:.1f}%)", file=sys.stderr)
+    return results
+
+
+if __name__ == "__main__":
+    model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4"
+    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
+    questions = load_questions()
+    results = run_mmlu(model, client, questions)
+    for r in results:
+        print(json.dumps(r))
--- a/eval/run_all.py
+++ b/eval/run_all.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""Orchestrate MMLU, GSM8K, HumanEval across all models."""
+
+import csv
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+from openai import OpenAI
+
+# Base URL of the OpenAI-compatible endpoint; override with LLAMA_SWAP_URL.
+ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
+# Directory created by main() before any benchmark runs.
+RESULTS_DIR = Path(__file__).parent / "results"
+# Combined per-question results; rewritten in full by write_csv().
+CSV_PATH = Path(__file__).parent / "scores.csv"
+
+# Model names to evaluate, in run order; each is passed as the request's `model`.
+MODELS = [
+    "qwen3.6-35b-a3b-mxfp4",
+    "qwen3-coder-30b-apex",
+    "qwen3.6-27b-mtp",
+    "qwopus3.5-4b-mtp",
+    "qwen3.5-9b-deepseek-v4-mtp",
+    "qwopus3.6-35b-a3b-v1",
+    "qwopus3.6-27b-v2-mtp",
+    "qwopus3.5-9b-coder-mtp",
+]
+
+
+def warmup_model(client: OpenAI, model: str) -> bool:
+    print(f"\n{'='*60}", file=sys.stderr)
+    print(f"  Loading model: {model}", file=sys.stderr)
+    print(f"{'='*60}", file=sys.stderr)
+    for attempt in range(3):
+        try:
+            resp = client.chat.completions.create(
+                model=model,
+                messages=[{"role": "user", "content": "Say OK."}],
+                max_tokens=10,
+                temperature=0,
+            )
+            print(f"  Warmup OK", file=sys.stderr)
+            return True
+        except Exception as e:
+            print(f"  Warmup attempt {attempt+1} failed: {e}", file=sys.stderr)
+            time.sleep(10)
+    print(f"  WARNING: warmup failed for {model}, continuing anyway", file=sys.stderr)
+    return False
+
+
+def run_benchmark(module_name: str, model: str, client: OpenAI) -> list[dict]:
+    if module_name == "mmlu":
+        from mmlu import load_questions, run_mmlu
+        questions = load_questions()
+        return run_mmlu(model, client, questions)
+    elif module_name == "gsm8k":
+        from gsm8k import load_questions, run_gsm8k
+        questions = load_questions()
+        return run_gsm8k(model, client, questions)
+    elif module_name == "humaneval":
+        from humaneval import load_problems, run_humaneval
+        problems = load_problems()
+        return run_humaneval(model, client, problems)
+    else:
+        raise ValueError(f"Unknown benchmark: {module_name}")
+
+
+def main() -> None:
+    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
+
+    # Check connectivity
+    try:
+        client.models.list()
+        print("Connected to llama-swap", file=sys.stderr)
+    except Exception as e:
+        print(f"Cannot connect to {ENDPOINT}: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+    all_results: list[dict] = []
+    benchmarks = ["mmlu", "gsm8k", "humaneval"]
+
+    t_start = time.time()
+
+    for model in MODELS:
+        warmup_model(client, model)
+
+        for bench in benchmarks:
+            print(f"\n  --- {model} / {bench} ---", file=sys.stderr)
+            try:
+                results = run_benchmark(bench, model, client)
+                all_results.extend(results)
+                write_csv(all_results)
+            except Exception as e:
+                print(f"  ERROR in {model}/{bench}: {e}", file=sys.stderr)
+
+    elapsed = time.time() - t_start
+    print(f"\nAll benchmarks complete in {elapsed/60:.0f} minutes", file=sys.stderr)
+    print(f"Results: {CSV_PATH}", file=sys.stderr)
+
+
+def write_csv(results: list[dict]) -> None:
+    if not results:
+        return
+    fields = ["model", "benchmark", "question_id", "correct", "raw_answer",
+              "parsed_answer", "expected", "latency_ms"]
+    # Also include category if present (MMLU)
+    if any("category" in r for r in results):
+        fields.insert(3, "category")
+
+    with open(CSV_PATH, "w", newline="") as f:
+        w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
+        w.writeheader()
+        w.writerows(results)
+
+
+# Run the sweep only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/eval/run_all.sh
+++ b/eval/run_all.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
+VENV="${EVAL_DIR}/.venv/bin/python3"
+
+cd "$EVAL_DIR"
+
+echo "Starting eval sweep at $(date)"
+echo "Using venv: ${VENV}"
+echo ""
+
+$VENV run_all.py 2>&1 | tee eval.log
+
+echo ""
+echo "Generating summary..."
+$VENV analyze.py
+
+echo ""
+echo "Done at $(date)"
--- a/go.mod
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/indifferentketchup/llama-sidecar
+
+go 1.26.3
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -0,0 +1,139 @@
+package config
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+)
+
+// utf8BOM is the UTF-8 byte order mark; it is trimmed from JSON input before
+// decoding.
+var utf8BOM = []byte{0xEF, 0xBB, 0xBF}
+
+// Config holds the daemon settings that Load reads from the environment:
+// the listen address (Bind), the llama-server binary path, the model ID to
+// model path map, the inclusive port range and sidecar limit for the pool,
+// the log level, the base arguments given to every llama-server, and the
+// health-check timeout and interval in seconds.
+type Config struct {
+	Bind                  string
+	LlamaServerBin        string
+	ModelDirMap           map[string]string
+	PortRangeLo           int
+	PortRangeHi           int
+	MaxSidecars           int
+	LogLevel              string
+	BaseArgs              []string
+	HealthTimeoutSeconds  int
+	HealthIntervalSeconds int
+}
+
+// Load builds a Config from environment variables.
+//
+// LLAMA_SERVER_BIN (path to an existing llama-server binary) and
+// MODEL_DIR_MAP_FILE (a JSON object mapping model IDs to paths) are required.
+// Optional, with defaults: LLAMA_SIDECAR_BIND ("127.0.0.1:8402"), LOG_LEVEL
+// ("info"), MAX_SIDECARS (2), HEALTH_TIMEOUT_SECONDS (60),
+// HEALTH_INTERVAL_SECONDS (30), PORT_RANGE ("8500-8599") and BASE_ARGS (a JSON
+// array of strings that replaces the built-in default arguments). Integer
+// variables that do not parse fall back to their defaults.
+//
+// It returns an error when a required variable is missing, a file cannot be
+// read or decoded, MAX_SIDECARS is below 1, or the port range is malformed or
+// too small for MAX_SIDECARS.
+func Load() (*Config, error) {
+	bin := os.Getenv("LLAMA_SERVER_BIN")
+	if bin == "" {
+		return nil, fmt.Errorf("LLAMA_SERVER_BIN is required")
+	}
+	info, err := os.Stat(bin)
+	if err != nil {
+		return nil, fmt.Errorf("LLAMA_SERVER_BIN %q: %w", bin, err)
+	}
+	if info.IsDir() {
+		return nil, fmt.Errorf("LLAMA_SERVER_BIN %q is a directory", bin)
+	}
+
+	mapFile := os.Getenv("MODEL_DIR_MAP_FILE")
+	if mapFile == "" {
+		return nil, fmt.Errorf("MODEL_DIR_MAP_FILE is required")
+	}
+	modelMap, err := loadModelMap(mapFile)
+	if err != nil {
+		return nil, fmt.Errorf("MODEL_DIR_MAP_FILE: %w", err)
+	}
+
+	bind := envOr("LLAMA_SIDECAR_BIND", "127.0.0.1:8402")
+	logLevel := envOr("LOG_LEVEL", "info")
+	maxSidecars := envIntOr("MAX_SIDECARS", 2)
+	healthTimeout := envIntOr("HEALTH_TIMEOUT_SECONDS", 60)
+	healthInterval := envIntOr("HEALTH_INTERVAL_SECONDS", 30)
+
+	// A pool that may hold no sidecars can never serve a request, so reject
+	// the setting here rather than failing on every acquire.
+	if maxSidecars < 1 {
+		return nil, fmt.Errorf("MAX_SIDECARS must be at least 1, got %d", maxSidecars)
+	}
+
+	lo, hi, err := parsePortRange(envOr("PORT_RANGE", "8500-8599"))
+	if err != nil {
+		return nil, fmt.Errorf("PORT_RANGE: %w", err)
+	}
+	if hi-lo+1 < maxSidecars {
+		return nil, fmt.Errorf("PORT_RANGE %d-%d has %d ports but MAX_SIDECARS is %d", lo, hi, hi-lo+1, maxSidecars)
+	}
+
+	baseArgs := defaultBaseArgs()
+	if env := os.Getenv("BASE_ARGS"); env != "" {
+		var parsed []string
+		// Tolerate a byte order mark in front of the JSON array.
+		envBytes := bytes.TrimPrefix([]byte(env), utf8BOM)
+		if err := json.Unmarshal(envBytes, &parsed); err != nil {
+			return nil, fmt.Errorf("BASE_ARGS: invalid JSON array: %w", err)
+		}
+		baseArgs = parsed
+	}
+
+	return &Config{
+		Bind:                  bind,
+		LlamaServerBin:        bin,
+		ModelDirMap:           modelMap,
+		PortRangeLo:           lo,
+		PortRangeHi:           hi,
+		MaxSidecars:           maxSidecars,
+		LogLevel:              logLevel,
+		BaseArgs:              baseArgs,
+		HealthTimeoutSeconds:  healthTimeout,
+		HealthIntervalSeconds: healthInterval,
+	}, nil
+}
+
+// defaultBaseArgs returns the llama-server arguments used when BASE_ARGS is
+// not set. A fresh slice is returned on every call.
+func defaultBaseArgs() []string {
+	args := make([]string, 0, 7)
+	args = append(args, "-ngl", "999")
+	args = append(args, "-c", "32768")
+	args = append(args, "--flash-attn", "on")
+	args = append(args, "--no-mmap")
+	return args
+}
+
+// loadModelMap reads the JSON object at path into a string-to-string map.
+// A leading UTF-8 byte order mark is ignored. An unreadable file, invalid
+// JSON, or a map without entries is an error.
+func loadModelMap(path string) (map[string]string, error) {
+	raw, readErr := os.ReadFile(path)
+	if readErr != nil {
+		return nil, readErr
+	}
+
+	var models map[string]string
+	decodeErr := json.Unmarshal(bytes.TrimPrefix(raw, utf8BOM), &models)
+	switch {
+	case decodeErr != nil:
+		return nil, fmt.Errorf("invalid JSON: %w", decodeErr)
+	case len(models) == 0:
+		return nil, fmt.Errorf("model map is empty")
+	}
+	return models, nil
+}
+
+// parsePortRange parses a "lo-hi" port range such as "8500-8599" and returns
+// both bounds. Whitespace around either number is ignored. It returns an
+// error when the separator is missing, a bound is not an integer, hi is not
+// greater than lo, or the range leaves the valid TCP port space 1-65535.
+func parsePortRange(s string) (int, int, error) {
+	loStr, hiStr, found := strings.Cut(s, "-")
+	if !found {
+		return 0, 0, fmt.Errorf("expected lo-hi format, got %q", s)
+	}
+	lo, err := strconv.Atoi(strings.TrimSpace(loStr))
+	if err != nil {
+		return 0, 0, fmt.Errorf("invalid lo port: %w", err)
+	}
+	hi, err := strconv.Atoi(strings.TrimSpace(hiStr))
+	if err != nil {
+		return 0, 0, fmt.Errorf("invalid hi port: %w", err)
+	}
+	if hi <= lo {
+		return 0, 0, fmt.Errorf("hi (%d) must be > lo (%d)", hi, lo)
+	}
+	// With hi > lo established, checking the two outer bounds covers both.
+	if lo < 1 || hi > 65535 {
+		return 0, 0, fmt.Errorf("ports must be within 1-65535, got %d-%d", lo, hi)
+	}
+	return lo, hi, nil
+}
+
+// envOr returns the value of the environment variable key, or fallback when
+// the variable is unset or empty.
+func envOr(key, fallback string) string {
+	value := os.Getenv(key)
+	if value == "" {
+		return fallback
+	}
+	return value
+}
+
+// envIntOr returns the environment variable key parsed as an int. It returns
+// fallback when the variable is unset, empty, or not a valid integer.
+func envIntOr(key string, fallback int) int {
+	if raw := os.Getenv(key); raw != "" {
+		if parsed, err := strconv.Atoi(raw); err == nil {
+			return parsed
+		}
+	}
+	return fallback
+}
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -0,0 +1,79 @@
+package config
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestLoad_MissingRequired checks that Load fails when the required
+// variables are absent. Load treats an empty value like an unset one, so
+// t.Setenv is used to blank them: unlike os.Unsetenv it restores the previous
+// values when the test ends and cannot leak into other tests.
+func TestLoad_MissingRequired(t *testing.T) {
+	t.Setenv("LLAMA_SERVER_BIN", "")
+	t.Setenv("MODEL_DIR_MAP_FILE", "")
+	_, err := Load()
+	if err == nil {
+		t.Fatal("expected error for missing LLAMA_SERVER_BIN")
+	}
+}
+
+func TestParsePortRange(t *testing.T) {
+	lo, hi, err := parsePortRange("8500-8599")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if lo != 8500 || hi != 8599 {
+		t.Fatalf("got %d-%d", lo, hi)
+	}
+}
+
+// TestParsePortRange_Bad checks that malformed and inverted ranges are
+// rejected.
+func TestParsePortRange_Bad(t *testing.T) {
+	cases := []struct {
+		input   string
+		failMsg string
+	}{
+		{input: "abc", failMsg: "expected error"},
+		{input: "100-50", failMsg: "expected error for hi <= lo"},
+	}
+	for _, tc := range cases {
+		if _, _, err := parsePortRange(tc.input); err == nil {
+			t.Fatal(tc.failMsg)
+		}
+	}
+}
+
+// TestLoadModelMap_BOM checks that a UTF-8 byte order mark in front of the
+// JSON does not break parsing.
+func TestLoadModelMap_BOM(t *testing.T) {
+	payload := []byte("\xEF\xBB\xBF" + `{"test-model": "/fake/path.gguf"}`)
+	mapPath := filepath.Join(t.TempDir(), "model_map.json")
+	if err := os.WriteFile(mapPath, payload, 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := loadModelMap(mapPath)
+	if err != nil {
+		t.Fatalf("BOM-prefixed JSON should parse: %v", err)
+	}
+	if modelPath := got["test-model"]; modelPath != "/fake/path.gguf" {
+		t.Fatalf("unexpected map: %v", got)
+	}
+}
+
+// TestDefaultBaseArgs_FlashAttn checks that the defaults contain the
+// two-token form "--flash-attn on".
+func TestDefaultBaseArgs_FlashAttn(t *testing.T) {
+	args := defaultBaseArgs()
+	// Stop one short of the end so args[i+1] is always in range.
+	for i := 0; i+1 < len(args); i++ {
+		if args[i] == "--flash-attn" && args[i+1] == "on" {
+			return
+		}
+	}
+	t.Fatal("expected --flash-attn on in default args")
+}
+
+// TestDefaultBaseArgs checks that the defaults are non-empty and include
+// --no-mmap.
+func TestDefaultBaseArgs(t *testing.T) {
+	args := defaultBaseArgs()
+	if len(args) == 0 {
+		t.Fatal("expected non-empty default args")
+	}
+	for _, arg := range args {
+		if arg == "--no-mmap" {
+			return
+		}
+	}
+	t.Fatal("expected --no-mmap in default args")
+}
--- a/internal/pool/hash.go
+++ b/internal/pool/hash.go
@@ -0,0 +1,53 @@
+package pool
+
+import (
+	"crypto/sha256"
+	"fmt"
+	"sort"
+	"strings"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/validator"
+)
+
+// Hash computes a deterministic hash for a (modelID, flags) pair and returns
+// it as 16 hex characters (the first 8 bytes of a SHA-256 digest).
+//
+// Flags are grouped into key/value pairs: "--k=v" is split at the first '=',
+// a flag followed by a non-flag token takes that token as its value, and any
+// other flag gets an empty value. Tokens that validator.FlagName does not
+// recognise as a flag and that do not follow one are ignored.
+//
+// The order of distinct flags does not affect the result. Occurrences of the
+// same flag keep their relative order, so reordering repeats of one flag
+// yields a different hash.
+func Hash(modelID string, flags []string) string {
+	type pair struct {
+		key, val string
+	}
+
+	var pairs []pair
+	i := 0
+	for i < len(flags) {
+		tok := flags[i]
+		key := validator.FlagName(tok)
+		if key == "" {
+			i++
+			continue
+		}
+		// NOTE(review): the "--k=v" form keys on the raw token prefix while
+		// the two-token form keys on FlagName's result; confirm they agree.
+		if idx := strings.IndexByte(tok, '='); idx >= 0 {
+			pairs = append(pairs, pair{key: tok[:idx], val: tok[idx+1:]})
+			i++
+		} else if i+1 < len(flags) && validator.FlagName(flags[i+1]) == "" {
+			pairs = append(pairs, pair{key: key, val: flags[i+1]})
+			i += 2
+		} else {
+			pairs = append(pairs, pair{key: key, val: ""})
+			i++
+		}
+	}
+
+	// The sort must be stable: sort.Slice gives no ordering guarantee for
+	// equal keys, which would let repeated flags serialize in an order that
+	// depends on sort internals rather than on the arguments.
+	sort.SliceStable(pairs, func(a, b int) bool {
+		return pairs[a].key < pairs[b].key
+	})
+
+	// Control characters separate key/value, pairs, and the model ID so that
+	// ordinary flag text cannot imitate a field boundary.
+	parts := make([]string, 0, len(pairs))
+	for _, p := range pairs {
+		parts = append(parts, p.key+"\x1f"+p.val)
+	}
+	serialized := strings.Join(parts, "\x1e")
+	input := modelID + "\x1d" + serialized
+
+	sum := sha256.Sum256([]byte(input))
+	return fmt.Sprintf("%x", sum[:8])
+}
--- a/internal/pool/hash_test.go
+++ b/internal/pool/hash_test.go
@@ -0,0 +1,53 @@
+package pool
+
+import (
+	"math/rand"
+	"testing"
+)
+
+// TestHash_OrderIndependence checks that shuffling whole flag/value pairs
+// leaves the hash unchanged.
+func TestHash_OrderIndependence(t *testing.T) {
+	base := []string{"--a", "1", "--b", "2", "--c", "3"}
+	want := Hash("foo", base)
+
+	for round := 0; round < 5; round++ {
+		// Group tokens two by two so a shuffle never separates a flag from
+		// its value.
+		var groups [][2]string
+		for j := 0; j < len(base); j += 2 {
+			groups = append(groups, [2]string{base[j], base[j+1]})
+		}
+		rand.Shuffle(len(groups), func(a, b int) { groups[a], groups[b] = groups[b], groups[a] })
+
+		var flat []string
+		for _, g := range groups {
+			flat = append(flat, g[0], g[1])
+		}
+		if got := Hash("foo", flat); got != want {
+			t.Errorf("iteration %d: hash %s != %s for order %v", round, got, want, flat)
+		}
+	}
+}
+
+// TestHash_SeparatorCollision checks that a separator byte inside a flag name
+// does not make it hash like the same name without that byte.
+func TestHash_SeparatorCollision(t *testing.T) {
+	withSep := Hash("foo", []string{"--a\x1eb", "1"})
+	plain := Hash("foo", []string{"--ab", "1"})
+	if withSep == plain {
+		t.Error("separator collision: hashes should differ")
+	}
+}
+
+// TestHash_Length checks that the hash is 16 hex characters long.
+func TestHash_Length(t *testing.T) {
+	got := Hash("model", []string{"--top-k", "20"})
+	if n := len(got); n != 16 {
+		t.Errorf("expected 16 hex chars, got %d: %s", n, got)
+	}
+}
+
+// TestHash_DifferentModels checks that the model ID is part of the hash.
+func TestHash_DifferentModels(t *testing.T) {
+	flags := []string{"--top-k", "20"}
+	if Hash("model-a", flags) == Hash("model-b", flags) {
+		t.Error("different models should produce different hashes")
+	}
+}
--- a/internal/pool/pool.go
+++ b/internal/pool/pool.go
@@ -0,0 +1,188 @@
+package pool
+
+import (
+	"container/list"
+	"context"
+	"fmt"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/config"
+	"github.com/indifferentketchup/llama-sidecar/internal/validator"
+)
+
+// SidecarInfo is the JSON-serializable snapshot of one sidecar as returned by
+// Pool.List. LastUsed is converted from the sidecar's stored nanosecond
+// timestamp; Healthy is the health flag at the time of the call.
+type SidecarInfo struct {
+	Hash      string    `json:"hash"`
+	ModelID   string    `json:"model_id"`
+	Flags     []string  `json:"flags"`
+	Port      int       `json:"port"`
+	Pid       int       `json:"pid"`
+	StartedAt time.Time `json:"started_at"`
+	LastUsed  time.Time `json:"last_used"`
+	Healthy   bool      `json:"healthy"`
+}
+
+// Pool tracks running sidecars keyed by their (model, flags) hash.
+//
+// mu guards sidecars, lru and lruIdx. lru holds hashes with the most recently
+// used at the front; the back is the eviction candidate. lruIdx maps a hash
+// to its element in lru. ports hands out listen ports and spawner starts and
+// kills the sidecar processes.
+type Pool struct {
+	mu       sync.Mutex
+	cfg      *config.Config
+	sidecars map[string]*Sidecar
+	lru      *list.List
+	lruIdx   map[string]*list.Element
+	ports    *PortAllocator
+	spawner  Spawner
+}
+
+// New returns an empty Pool that spawns sidecars through spawner and draws
+// ports from the range configured in cfg.
+func New(cfg *config.Config, spawner Spawner) *Pool {
+	p := new(Pool)
+	p.cfg = cfg
+	p.spawner = spawner
+	p.sidecars = make(map[string]*Sidecar)
+	p.lru = list.New()
+	p.lruIdx = make(map[string]*list.Element)
+	p.ports = NewPortAllocator(cfg.PortRangeLo, cfg.PortRangeHi)
+	return p
+}
+
+// Acquire returns a healthy sidecar for (modelID, flags), spawning one when
+// no matching sidecar exists.
+//
+// The flags are validated and the model looked up first; either failing is an
+// error. A healthy sidecar with the same hash is reused and marked most
+// recently used; an unhealthy one is removed and replaced. When the pool is
+// full, the least recently used sidecar is evicted to make room.
+//
+// The pool lock is released while the spawner runs, so callers are not
+// blocked for the duration of a model load. Concurrent callers may therefore
+// spawn the same hash twice; once the lock is retaken, a caller that finds a
+// healthy sidecar already registered kills its own duplicate, returns its
+// port, and hands back the registered one. Capacity is re-checked for the
+// same reason.
+func (p *Pool) Acquire(ctx context.Context, modelID string, flags []string) (*Sidecar, error) {
+	if _, err := validator.ValidateExtraArgs(flags); err != nil {
+		return nil, fmt.Errorf("validation: %w", err)
+	}
+
+	modelPath, ok := p.cfg.ModelDirMap[modelID]
+	if !ok {
+		return nil, fmt.Errorf("unknown model: %s", modelID)
+	}
+
+	hash := Hash(modelID, flags)
+
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// reuse returns the registered sidecar for hash if it is healthy, after
+	// refreshing its LRU position. An unhealthy one is removed. Lock held.
+	reuse := func() *Sidecar {
+		s, ok := p.sidecars[hash]
+		if !ok {
+			return nil
+		}
+		if !s.Healthy() {
+			p.removeLocked(hash)
+			return nil
+		}
+		if el, ok := p.lruIdx[hash]; ok {
+			p.lru.MoveToFront(el)
+		}
+		s.TouchLastUsed()
+		return s
+	}
+
+	if s := reuse(); s != nil {
+		return s, nil
+	}
+
+	if len(p.sidecars) >= p.cfg.MaxSidecars {
+		if err := p.evictLRULocked(); err != nil {
+			return nil, fmt.Errorf("eviction failed: %w", err)
+		}
+	}
+
+	port, err := p.ports.Allocate()
+	if err != nil {
+		return nil, fmt.Errorf("port allocation: %w", err)
+	}
+
+	p.mu.Unlock()
+	s, err := p.spawner.Spawn(ctx, p.cfg, modelID, modelPath, flags, port, hash)
+	p.mu.Lock()
+
+	if err != nil {
+		p.ports.Release(port)
+		return nil, fmt.Errorf("spawn: %w", err)
+	}
+
+	// discard tears down the sidecar spawned above when it cannot be kept.
+	discard := func() {
+		if killErr := p.spawner.Kill(s); killErr != nil {
+			slog.Error("kill failed for discarded sidecar", "hash", hash, "err", killErr)
+		}
+		p.ports.Release(port)
+	}
+
+	// Another caller may have registered the same hash while the lock was
+	// released. Keeping both would orphan one process and its port.
+	if existing := reuse(); existing != nil {
+		discard()
+		return existing, nil
+	}
+
+	// Other hashes may also have been added meanwhile; restore the limit.
+	for len(p.sidecars) >= p.cfg.MaxSidecars {
+		if evictErr := p.evictLRULocked(); evictErr != nil {
+			discard()
+			return nil, fmt.Errorf("eviction failed: %w", evictErr)
+		}
+	}
+
+	p.sidecars[hash] = s
+	el := p.lru.PushFront(hash)
+	p.lruIdx[hash] = el
+	return s, nil
+}
+
+func (p *Pool) List() []SidecarInfo {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	out := make([]SidecarInfo, 0, len(p.sidecars))
+	for _, s := range p.sidecars {
+		out = append(out, SidecarInfo{
+			Hash:      s.Hash,
+			ModelID:   s.ModelID,
+			Flags:     s.Flags,
+			Port:      s.Port,
+			Pid:       s.Pid,
+			StartedAt: s.StartedAt,
+			LastUsed:  time.Unix(0, s.LastUsed.Load()),
+			Healthy:   s.Healthy(),
+		})
+	}
+	return out
+}
+
+// Remove kills the sidecar registered under hash and frees its port. It
+// returns an error when no such sidecar exists.
+func (p *Pool) Remove(hash string) error {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	if _, found := p.sidecars[hash]; found {
+		return p.removeLocked(hash)
+	}
+	return fmt.Errorf("sidecar %s not found", hash)
+}
+
+// Shutdown kills every sidecar that is in the pool when it is called, one
+// goroutine per sidecar, and waits for the kills to finish or for ctx to be
+// done, whichever comes first. When ctx ends first it returns ctx.Err() while
+// the kill goroutines keep running. Kill failures are logged, not returned.
+// Sidecars are not removed from the pool's maps and their ports are not
+// released here.
+func (p *Pool) Shutdown(ctx context.Context) error {
+	// Snapshot the hashes under the lock so the kills can run without it.
+	p.mu.Lock()
+	hashes := make([]string, 0, len(p.sidecars))
+	for h := range p.sidecars {
+		hashes = append(hashes, h)
+	}
+	p.mu.Unlock()
+
+	var wg sync.WaitGroup
+	for _, h := range hashes {
+		wg.Add(1)
+		go func(hash string) {
+			defer wg.Done()
+			// Look the sidecar up again: it may have been removed since the
+			// snapshot was taken.
+			p.mu.Lock()
+			s, ok := p.sidecars[hash]
+			p.mu.Unlock()
+			if !ok {
+				return
+			}
+			if err := p.spawner.Kill(s); err != nil {
+				slog.Error("shutdown kill failed", "hash", hash, "err", err)
+			}
+		}(h)
+	}
+
+	// Turn the WaitGroup into a channel so it can be selected against ctx.
+	done := make(chan struct{})
+	go func() { wg.Wait(); close(done) }()
+	select {
+	case <-done:
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+	slog.Info("pool shutdown complete", "count", len(hashes))
+	return nil
+}
+
+// removeLocked drops the sidecar registered under hash from the pool and the
+// LRU list, kills it, and releases its port. An unknown hash is a no-op. A
+// kill failure is logged; the return value is always nil. The caller must
+// hold p.mu.
+func (p *Pool) removeLocked(hash string) error {
+	victim, present := p.sidecars[hash]
+	if present {
+		delete(p.sidecars, hash)
+		if entry, tracked := p.lruIdx[hash]; tracked {
+			p.lru.Remove(entry)
+			delete(p.lruIdx, hash)
+		}
+		if killErr := p.spawner.Kill(victim); killErr != nil {
+			slog.Error("kill failed during remove", "hash", hash, "err", killErr)
+		}
+		p.ports.Release(victim.Port)
+	}
+	return nil
+}
+
+// evictLRULocked removes the least recently used sidecar, i.e. the hash at
+// the back of the LRU list. It returns an error when the list is empty. The
+// caller must hold p.mu.
+func (p *Pool) evictLRULocked() error {
+	oldest := p.lru.Back()
+	if oldest != nil {
+		hash := oldest.Value.(string)
+		slog.Info("evicting LRU sidecar", "hash", hash)
+		return p.removeLocked(hash)
+	}
+	return fmt.Errorf("pool full but LRU empty")
+}
--- a/internal/pool/pool_test.go
+++ b/internal/pool/pool_test.go
@@ -0,0 +1,151 @@
+package pool
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/config"
+)
+
+// fakeSpawner is a Spawner test double that starts no processes and counts
+// how often Spawn and Kill are called.
+type fakeSpawner struct {
+	spawnCount atomic.Int32
+	killCount  atomic.Int32
+}
+
+// Spawn records the call and returns an in-memory Sidecar that is already
+// marked healthy; it never fails.
+func (f *fakeSpawner) Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error) {
+	f.spawnCount.Add(1)
+
+	sc := new(Sidecar)
+	sc.Hash = hash
+	sc.ModelID = modelID
+	sc.ModelPath = modelPath
+	sc.Flags = flags
+	sc.Port = port
+	sc.Pid = 99999
+	sc.StartedAt = time.Now()
+	sc.stderr = newRingBuffer(8)
+	sc.cancel = func() {}
+	sc.healthy.Store(true)
+	sc.LastUsed.Store(time.Now().UnixNano())
+	return sc, nil
+}
+
+// Kill records the call and reports success; the sidecar is not touched.
+func (f *fakeSpawner) Kill(_ *Sidecar) error {
+	f.killCount.Add(1)
+	return nil
+}
+
+// testConfig returns a fresh Config with two fake models, ten ports, and
+// room for two sidecars.
+func testConfig() *config.Config {
+	models := map[string]string{
+		"model-a": "/fake/model-a.gguf",
+		"model-b": "/fake/model-b.gguf",
+	}
+	cfg := &config.Config{
+		Bind:           "127.0.0.1:0",
+		LlamaServerBin: "/fake/llama-server",
+		ModelDirMap:    models,
+		BaseArgs:       []string{"-ngl", "999"},
+	}
+	cfg.PortRangeLo, cfg.PortRangeHi = 8500, 8509
+	cfg.MaxSidecars = 2
+	cfg.HealthTimeoutSeconds = 60
+	return cfg
+}
+
+// TestPool_AcquireSameKey checks that acquiring the same model and flags
+// twice reuses one sidecar instead of spawning a second.
+func TestPool_AcquireSameKey(t *testing.T) {
+	spawner := &fakeSpawner{}
+	p := New(testConfig(), spawner)
+	ctx := context.Background()
+
+	first, err := p.Acquire(ctx, "model-a", []string{"--top-k", "20"})
+	if err != nil {
+		t.Fatal(err)
+	}
+	second, err := p.Acquire(ctx, "model-a", []string{"--top-k", "20"})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if first.Hash != second.Hash {
+		t.Fatalf("expected same sidecar, got different hashes: %s vs %s", first.Hash, second.Hash)
+	}
+	if spawns := spawner.spawnCount.Load(); spawns != 1 {
+		t.Fatalf("expected 1 spawn, got %d", spawns)
+	}
+}
+
+// TestPool_EvictLRU checks that a pool with room for one sidecar evicts the
+// first when a second, different one is acquired.
+func TestPool_EvictLRU(t *testing.T) {
+	cfg := testConfig()
+	cfg.MaxSidecars = 1
+	spawner := &fakeSpawner{}
+	p := New(cfg, spawner)
+	ctx := context.Background()
+
+	requests := []struct {
+		model string
+		topK  string
+	}{
+		{model: "model-a", topK: "20"},
+		{model: "model-b", topK: "40"},
+	}
+	for _, req := range requests {
+		if _, err := p.Acquire(ctx, req.model, []string{"--top-k", req.topK}); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	if spawns := spawner.spawnCount.Load(); spawns != 2 {
+		t.Fatalf("expected 2 spawns, got %d", spawns)
+	}
+	if kills := spawner.killCount.Load(); kills != 1 {
+		t.Fatalf("expected 1 kill (eviction), got %d", kills)
+	}
+
+	live := p.List()
+	if len(live) != 1 {
+		t.Fatalf("expected 1 sidecar, got %d", len(live))
+	}
+	if got := live[0].ModelID; got != "model-b" {
+		t.Fatalf("expected model-b, got %s", got)
+	}
+}
+
+// TestPool_ValidatorReject checks that Acquire refuses a --model flag.
+func TestPool_ValidatorReject(t *testing.T) {
+	p := New(testConfig(), &fakeSpawner{})
+	forbidden := []string{"--model", "evil.gguf"}
+	if _, err := p.Acquire(context.Background(), "model-a", forbidden); err == nil {
+		t.Fatal("expected validation error")
+	}
+}
+
+// TestPool_UnknownModel checks that Acquire fails for a model ID that is not
+// in the model map.
+func TestPool_UnknownModel(t *testing.T) {
+	p := New(testConfig(), &fakeSpawner{})
+	if _, err := p.Acquire(context.Background(), "nonexistent", nil); err == nil {
+		t.Fatal("expected unknown model error")
+	}
+}
+
+// TestPool_ConcurrentAcquire has ten goroutines acquire the same key fifty
+// times each and checks that the pool ends up with a single sidecar.
+func TestPool_ConcurrentAcquire(t *testing.T) {
+	const (
+		workers  = 10
+		attempts = 50
+	)
+	cfg := testConfig()
+	cfg.MaxSidecars = 10
+	cfg.PortRangeHi = 8599
+	p := New(cfg, &fakeSpawner{})
+	ctx := context.Background()
+
+	var wg sync.WaitGroup
+	wg.Add(workers)
+	for w := 0; w < workers; w++ {
+		go func() {
+			defer wg.Done()
+			for n := 0; n < attempts; n++ {
+				// Errors are irrelevant here; only the final pool size is checked.
+				_, _ = p.Acquire(ctx, "model-a", []string{"--top-k", "20"})
+			}
+		}()
+	}
+	wg.Wait()
+
+	if live := p.List(); len(live) != 1 {
+		t.Fatalf("expected 1 sidecar (same key), got %d", len(live))
+	}
+}
--- a/internal/pool/ports.go
+++ b/internal/pool/ports.go
@@ -0,0 +1,28 @@
+package pool
+
+import "fmt"
+
+// PortAllocator hands out ports from a fixed range. Free ports sit in a
+// buffered channel: Allocate receives one, Release sends it back.
+type PortAllocator struct {
+	ports chan int
+}
+
+// NewPortAllocator returns an allocator holding every port in [lo, hi],
+// initially handed out in ascending order. An inverted range (hi < lo) yields
+// an empty allocator rather than a panic from a negative channel capacity.
+func NewPortAllocator(lo, hi int) *PortAllocator {
+	size := hi - lo + 1
+	if size < 0 {
+		size = 0
+	}
+	ch := make(chan int, size)
+	for p := lo; p <= hi; p++ {
+		ch <- p
+	}
+	return &PortAllocator{ports: ch}
+}
+
+func (pa *PortAllocator) Allocate() (int, error) {
+	select {
+	case p := <-pa.ports:
+		return p, nil
+	default:
+		return 0, fmt.Errorf("port allocator exhausted")
+	}
+}
+
+// Release returns port to the allocator. It never blocks: if the allocator
+// already holds its full set of ports (for example after a double release),
+// the extra port is dropped. A blocking send here would hang the caller
+// forever, and the pool calls Release while holding its mutex.
+func (pa *PortAllocator) Release(port int) {
+	select {
+	case pa.ports <- port:
+	default:
+		// Allocator is full; this release is surplus and is discarded.
+	}
+}
--- a/internal/pool/ports_test.go
+++ b/internal/pool/ports_test.go
@@ -0,0 +1,74 @@
+package pool
+
+import (
+	"sync"
+	"testing"
+)
+
+func TestPortAllocator_AllocateRelease(t *testing.T) {
+	pa := NewPortAllocator(8500, 8502)
+	p1, err := pa.Allocate()
+	if err != nil {
+		t.Fatal(err)
+	}
+	p2, err := pa.Allocate()
+	if err != nil {
+		t.Fatal(err)
+	}
+	p3, err := pa.Allocate()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// All three ports should be distinct
+	if p1 == p2 || p2 == p3 || p1 == p3 {
+		t.Fatalf("expected distinct ports: %d, %d, %d", p1, p2, p3)
+	}
+
+	// Exhausted
+	_, err = pa.Allocate()
+	if err == nil {
+		t.Fatal("expected error when exhausted")
+	}
+
+	// Release and re-allocate
+	pa.Release(p2)
+	p4, err := pa.Allocate()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if p4 != p2 {
+		t.Fatalf("expected released port %d, got %d", p2, p4)
+	}
+}
+
+// TestPortAllocator_Concurrent allocates all 100 ports from 100 goroutines
+// and checks that every port is handed out exactly once.
+func TestPortAllocator_Concurrent(t *testing.T) {
+	const total = 100
+	pa := NewPortAllocator(8500, 8599)
+	allocated := make(chan int, total)
+
+	var wg sync.WaitGroup
+	wg.Add(total)
+	for i := 0; i < total; i++ {
+		go func() {
+			defer wg.Done()
+			if port, err := pa.Allocate(); err == nil {
+				allocated <- port
+			}
+		}()
+	}
+	wg.Wait()
+	close(allocated)
+
+	seen := make(map[int]bool)
+	for port := range allocated {
+		if seen[port] {
+			t.Fatalf("duplicate port %d", port)
+		}
+		seen[port] = true
+	}
+	if len(seen) != 100 {
+		t.Fatalf("expected 100 ports, got %d", len(seen))
+	}
+}
--- a/internal/pool/sidecar.go
+++ b/internal/pool/sidecar.go
@@ -0,0 +1,313 @@
+package pool
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/config"
+	"github.com/indifferentketchup/llama-sidecar/internal/validator"
+)
+
+// Sidecar describes one running llama-server child process together with the
+// handles needed to supervise and stop it.
+type Sidecar struct {
+	// Identity and launch parameters, as passed to Spawn.
+	Hash      string
+	ModelID   string
+	ModelPath string
+	Flags     []string
+	Port      int
+
+	// Pid and StartedAt are recorded right after the process starts.
+	Pid       int
+	StartedAt time.Time
+
+	// LastUsed holds a UnixNano timestamp; see TouchLastUsed.
+	LastUsed atomic.Int64
+	// healthy is read through Healthy.
+	healthy atomic.Bool
+
+	cmd    *exec.Cmd
+	cancel context.CancelFunc // cancels the context the child command runs under
+	done   chan error         // receives the cmd.Wait result, then is closed
+	stderr *ringBuffer        // tail of the child's stderr
+
+	stopMon context.CancelFunc // stops the health monitor; nil until it is started
+
+	// Files handed to the child; closed by Kill.
+	stdinFile  *os.File
+	stdoutR    *os.File
+	stdoutFile *os.File
+}
+
+// Healthy reports the current value of the sidecar's health flag. The flag is
+// an atomic, so this is safe to call from any goroutine.
+func (s *Sidecar) Healthy() bool {
+	return s.healthy.Load()
+}
+
+// TouchLastUsed records the current time (as UnixNano) in LastUsed.
+func (s *Sidecar) TouchLastUsed() {
+	s.LastUsed.Store(time.Now().UnixNano())
+}
+
+// LastStderr returns the retained tail of the child's stderr, lines joined
+// with "\n". It returns "" for a Sidecar that has no stderr buffer (for
+// example one constructed by a Spawner other than RealSpawner) instead of
+// dereferencing a nil buffer.
+func (s *Sidecar) LastStderr() string {
+	if s.stderr == nil {
+		return ""
+	}
+	return s.stderr.String()
+}
+
+// Spawner abstracts sidecar creation for testing.
+//
+// Spawn starts a sidecar for the given model, flags and port and returns it,
+// or an error if it could not be started. Kill stops a sidecar previously
+// returned by Spawn.
+type Spawner interface {
+	Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error)
+	Kill(s *Sidecar) error
+}
+
+// RealSpawner is the Spawner that launches actual child processes.
+type RealSpawner struct{}
+
+// Spawn starts cfg.LlamaServerBin for modelPath on port and blocks until the
+// child answers GET /health with 200, the child exits, or
+// cfg.HealthTimeoutSeconds elapses.
+//
+// The child and its health monitor run under context.Background rather than
+// ctx, so they are not torn down when the caller's context ends; they are
+// stopped only through Kill. On every error path the files opened here are
+// closed and the child, if it was started, is stopped.
+func (rs *RealSpawner) Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error) {
+	args := buildArgs(cfg.BaseArgs, modelPath, port, flags)
+	// ctx is intentionally unused: the child's lifetime is decoupled from it.
+	_ = ctx
+	childCtx, cancel := context.WithCancel(context.Background())
+	cmd := exec.CommandContext(childCtx, cfg.LlamaServerBin, args...)
+	setPlatformAttrs(cmd)
+
+	devNull, err := os.Open(os.DevNull)
+	if err != nil {
+		cancel()
+		return nil, fmt.Errorf("open devnull: %w", err)
+	}
+	cmd.Stdin = devNull
+
+	stderr := newRingBuffer(64)
+	// Log prefix uses at most the first 8 characters of the hash; a shorter
+	// hash is used whole instead of panicking on the slice expression.
+	shortHash := hash
+	if len(shortHash) > 8 {
+		shortHash = shortHash[:8]
+	}
+	prefix := fmt.Sprintf("[sidecar:%s:%d] ", shortHash, port)
+	cmd.Stderr = io.MultiWriter(stderr, &prefixWriter{prefix: prefix})
+	stdoutR, stdoutW, err := os.Pipe()
+	if err != nil {
+		cancel()
+		devNull.Close()
+		return nil, fmt.Errorf("stdout pipe: %w", err)
+	}
+	// Drain the child's stdout so it can never block on a full pipe.
+	go io.Copy(io.Discard, stdoutR)
+	cmd.Stdout = stdoutW
+
+	slog.Info("spawning sidecar", "hash", hash, "model", modelID, "port", port, "args", strings.Join(args, " "))
+	if err := cmd.Start(); err != nil {
+		cancel()
+		devNull.Close()
+		// Closing the write end first lets the drain goroutine see EOF and exit.
+		stdoutW.Close()
+		stdoutR.Close()
+		return nil, fmt.Errorf("spawn failed: %w", err)
+	}
+
+	s := &Sidecar{
+		Hash:       hash,
+		ModelID:    modelID,
+		ModelPath:  modelPath,
+		Flags:      flags,
+		Port:       port,
+		Pid:        cmd.Process.Pid,
+		StartedAt:  time.Now(),
+		cmd:        cmd,
+		cancel:     cancel,
+		done:       make(chan error, 1),
+		stderr:     stderr,
+		stdinFile:  devNull,
+		stdoutR:    stdoutR,
+		stdoutFile: stdoutW,
+	}
+	s.LastUsed.Store(time.Now().UnixNano())
+
+	// Reaper: records the exit, marks the sidecar unhealthy and signals done.
+	go func() {
+		err := cmd.Wait()
+		s.healthy.Store(false)
+		exitCode := -1
+		if cmd.ProcessState != nil {
+			exitCode = cmd.ProcessState.ExitCode()
+		}
+		slog.Error("sidecar child exited",
+			"hash", hash,
+			"port", port,
+			"pid", s.Pid,
+			"exit_code", exitCode,
+			"wait_err", fmt.Sprintf("%v", err),
+			"uptime", time.Since(s.StartedAt).Round(time.Millisecond),
+			"stderr_tail", stderr.String(),
+		)
+		s.done <- err
+		close(s.done)
+	}()
+
+	// Wait for health. Each probe is time-boxed so a stalled connection cannot
+	// hold the loop past the overall deadline.
+	healthURL := fmt.Sprintf("http://127.0.0.1:%d/health", port)
+	probe := &http.Client{Timeout: 2 * time.Second}
+	deadline := time.Now().Add(time.Duration(cfg.HealthTimeoutSeconds) * time.Second)
+	for time.Now().Before(deadline) {
+		resp, err := probe.Get(healthURL)
+		if err == nil {
+			resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				s.healthy.Store(true)
+				slog.Info("sidecar healthy", "hash", hash, "port", port, "elapsed", time.Since(s.StartedAt).Round(time.Millisecond))
+				// The monitor must outlive the caller's ctx; Kill stops it via stopMon.
+				monCtx, monCancel := context.WithCancel(context.Background())
+				s.stopMon = monCancel
+				go s.healthMonitor(monCtx, cfg.HealthIntervalSeconds)
+				return s, nil
+			}
+		}
+		select {
+		case <-s.done:
+			// The child died before becoming healthy: stop polling right away and
+			// release the handles (Kill returns immediately because done is closed).
+			_ = rs.Kill(s)
+			return nil, fmt.Errorf("sidecar process exited during health check, last stderr: %s", s.stderr.LastLine())
+		case <-time.After(500 * time.Millisecond):
+		}
+	}
+
+	_ = rs.Kill(s)
+	return nil, fmt.Errorf("health check timed out after %ds, last stderr: %s", cfg.HealthTimeoutSeconds, s.stderr.LastLine())
+}
+
+// Kill stops the sidecar's health monitor (if one was started), cancels the
+// child's context and waits for the process to be reaped. If the process has
+// not been reaped within five seconds it is killed explicitly and waited for
+// again. Finally the stdin/stdout files held for the child are closed.
+// Kill always returns nil.
+func (rs *RealSpawner) Kill(s *Sidecar) error {
+	if stop := s.stopMon; stop != nil {
+		stop()
+	}
+	s.cancel()
+
+	grace := time.After(5 * time.Second)
+	select {
+	case <-s.done:
+	case <-grace:
+		if proc := s.cmd.Process; proc != nil {
+			_ = proc.Kill()
+		}
+		<-s.done
+	}
+
+	// Same close order as the fields are released individually: stdin, stdout
+	// write end, stdout read end.
+	for _, f := range []*os.File{s.stdinFile, s.stdoutFile, s.stdoutR} {
+		if f != nil {
+			f.Close()
+		}
+	}
+	slog.Info("sidecar killed", "hash", s.Hash, "port", s.Port)
+	return nil
+}
+
+func (s *Sidecar) healthMonitor(ctx context.Context, intervalSec int) {
+	ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+	defer ticker.Stop()
+	failures := 0
+	url := fmt.Sprintf("http://127.0.0.1:%d/health", s.Port)
+	client := &http.Client{Timeout: 5 * time.Second}
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			resp, err := client.Get(url)
+			if err != nil || resp.StatusCode != 200 {
+				if resp != nil {
+					resp.Body.Close()
+				}
+				failures++
+				if failures >= 3 {
+					slog.Warn("sidecar unhealthy, marking for eviction", "hash", s.Hash, "port", s.Port)
+					s.healthy.Store(false)
+					return
+				}
+			} else {
+				resp.Body.Close()
+				failures = 0
+			}
+		}
+	}
+}
+
+// buildArgs assembles the child's command line: the base args that the user
+// did not override, then --model and --port, then the user's flags last.
+func buildArgs(baseArgs []string, modelPath string, port int, userFlags []string) []string {
+	kept := dedupFlags(baseArgs, userFlags)
+	out := make([]string, 0, len(kept)+len(userFlags)+4)
+	out = append(out, kept...)
+	out = append(out, "--model", modelPath, "--port", strconv.Itoa(port))
+	return append(out, userFlags...)
+}
+
+// dedupFlags returns autoArgs without any flag whose name also appears in
+// userArgs, so that only the user's occurrence of that flag reaches the
+// command line. A dropped flag takes its value with it: either the inline
+// "=value" part, or the following token when that token is not itself a flag.
+func dedupFlags(autoArgs, userArgs []string) []string {
+	overridden := make(map[string]bool)
+	for _, tok := range userArgs {
+		name := validator.FlagName(tok)
+		if name == "" {
+			continue
+		}
+		overridden[name] = true
+	}
+
+	kept := make([]string, 0, len(autoArgs))
+	for i := 0; i < len(autoArgs); i++ {
+		tok := autoArgs[i]
+		name := validator.FlagName(tok)
+		if name == "" || !overridden[name] {
+			kept = append(kept, tok)
+			continue
+		}
+		// tok is dropped. With --key=value the value is part of tok itself.
+		if strings.Contains(tok, "=") {
+			continue
+		}
+		// Otherwise also drop a separate value token, if one follows.
+		if i+1 < len(autoArgs) && validator.FlagName(autoArgs[i+1]) == "" {
+			i++
+		}
+	}
+	return kept
+}
+
+// ringBufferMaxPartial bounds how many bytes of a not-yet-terminated line are
+// held back before they are stored as a line of their own.
+const ringBufferMaxPartial = 8192
+
+// ringBuffer is an io.Writer that keeps the last max non-empty lines written
+// to it. It is safe for concurrent use.
+//
+// Writes are reassembled into lines across calls: bytes after the final
+// newline of one Write are held in partial and joined with the next Write, so
+// a line that arrives in several chunks is stored once instead of as
+// fragments.
+type ringBuffer struct {
+	mu      sync.Mutex
+	lines   []string
+	max     int
+	partial string // current line, not yet terminated by '\n'
+}
+
+// newRingBuffer returns a buffer retaining up to max lines. A max below 1 is
+// treated as 1 so that the first Write cannot panic.
+func newRingBuffer(max int) *ringBuffer {
+	if max < 1 {
+		max = 1
+	}
+	return &ringBuffer{lines: make([]string, 0, max), max: max}
+}
+
+// Write appends the complete lines contained in p (together with any bytes
+// held back from earlier calls) and always reports len(p) bytes written.
+// Trailing '\r' is stripped and empty lines are skipped.
+func (rb *ringBuffer) Write(p []byte) (int, error) {
+	rb.mu.Lock()
+	defer rb.mu.Unlock()
+
+	data := rb.partial + string(p)
+	for {
+		line, rest, found := strings.Cut(data, "\n")
+		if !found {
+			break
+		}
+		rb.push(line)
+		data = rest
+	}
+	// Do not let a stream without newlines grow the pending line forever.
+	if len(data) > ringBufferMaxPartial {
+		rb.push(data)
+		data = ""
+	}
+	rb.partial = data
+	return len(p), nil
+}
+
+// push stores one line, evicting the oldest when the buffer is full.
+// The caller must hold rb.mu.
+func (rb *ringBuffer) push(line string) {
+	line = strings.TrimRight(line, "\r\n")
+	if line == "" {
+		return
+	}
+	limit := rb.max
+	if limit < 1 {
+		limit = 1
+	}
+	if len(rb.lines) >= limit {
+		// Shift down in place so the backing array is reused.
+		n := copy(rb.lines, rb.lines[1:])
+		rb.lines = rb.lines[:n]
+	}
+	rb.lines = append(rb.lines, line)
+}
+
+// pending returns the unterminated current line without its trailing '\r'.
+// The caller must hold rb.mu.
+func (rb *ringBuffer) pending() string {
+	return strings.TrimRight(rb.partial, "\r")
+}
+
+// String returns the retained lines joined with "\n", followed by the current
+// unterminated line if there is one.
+func (rb *ringBuffer) String() string {
+	rb.mu.Lock()
+	defer rb.mu.Unlock()
+	joined := strings.Join(rb.lines, "\n")
+	tail := rb.pending()
+	switch {
+	case tail == "":
+		return joined
+	case joined == "":
+		return tail
+	default:
+		return joined + "\n" + tail
+	}
+}
+
+// LastLine returns the most recent line: the current unterminated line if
+// there is one, otherwise the newest stored line, or "" when nothing has been
+// written.
+func (rb *ringBuffer) LastLine() string {
+	rb.mu.Lock()
+	defer rb.mu.Unlock()
+	if tail := rb.pending(); tail != "" {
+		return tail
+	}
+	if len(rb.lines) == 0 {
+		return ""
+	}
+	return rb.lines[len(rb.lines)-1]
+}
+
+// prefixWriter is an io.Writer that copies complete lines to os.Stderr, each
+// preceded by prefix. Bytes after the last newline stay in buf until a later
+// Write completes the line.
+type prefixWriter struct {
+	prefix string
+	buf    bytes.Buffer
+}
+
+// Write buffers p, emits every newline-terminated line now available, and
+// always reports len(p) bytes written.
+func (pw *prefixWriter) Write(p []byte) (int, error) {
+	pw.buf.Write(p)
+	for {
+		end := bytes.IndexByte(pw.buf.Bytes(), '\n')
+		if end < 0 {
+			// No complete line left; keep the remainder buffered.
+			return len(p), nil
+		}
+		line := string(pw.buf.Next(end + 1))
+		fmt.Fprint(os.Stderr, pw.prefix+line)
+	}
+}
--- a/internal/pool/sidecar_test.go
+++ b/internal/pool/sidecar_test.go
@@ -0,0 +1,96 @@
+package pool
+
+import (
+	"reflect"
+	"testing"
+)
+
+// TestBuildArgs_PreservesNonOverlapping checks that base flags the user did
+// not supply survive, that the user's flags are present, and that --model and
+// --port are injected.
+func TestBuildArgs_PreservesNonOverlapping(t *testing.T) {
+	got := buildArgs(
+		[]string{"-ngl", "999", "-c", "32768", "--flash-attn", "on", "--no-mmap"},
+		"/model.gguf",
+		8500,
+		[]string{"--top-k", "20"},
+	)
+
+	checks := []struct {
+		seq []string
+		msg string
+	}{
+		// -c 32768 must survive (user didn't supply -c)
+		{[]string{"-c", "32768"}, "-c 32768 missing from args: %v"},
+		// --top-k 20 must be present (user flag)
+		{[]string{"--top-k", "20"}, "--top-k 20 missing from args: %v"},
+		// --model and --port injected
+		{[]string{"--model", "/model.gguf"}, "--model missing: %v"},
+		{[]string{"--port", "8500"}, "--port missing: %v"},
+	}
+	for _, c := range checks {
+		if !containsSeq(got, c.seq...) {
+			t.Errorf(c.msg, got)
+		}
+	}
+}
+
+// TestBuildArgs_UserOverridesBase checks that a user-supplied -c replaces the
+// base -c: exactly one -c remains and it does not carry the base value.
+func TestBuildArgs_UserOverridesBase(t *testing.T) {
+	got := buildArgs([]string{"-ngl", "999", "-c", "32768"}, "/model.gguf", 8500, []string{"-c", "131072"})
+
+	// base -c should be dropped, user -c should be present
+	seen := 0
+	for i := 0; i+1 < len(got); i++ {
+		if got[i] != "-c" {
+			continue
+		}
+		seen++
+		if got[i+1] == "32768" {
+			t.Errorf("base -c 32768 should have been deduped: %v", got)
+		}
+	}
+	if seen != 1 {
+		t.Errorf("expected exactly 1 -c flag, got %d in %v", seen, got)
+	}
+}
+
+// TestBuildArgs_NoUserFlags checks that base flags pass through untouched
+// when the user supplies no flags at all.
+func TestBuildArgs_NoUserFlags(t *testing.T) {
+	got := buildArgs([]string{"-ngl", "999", "-c", "32768", "--no-mmap"}, "/model.gguf", 8500, nil)
+
+	if hasCtx := containsSeq(got, "-c", "32768"); !hasCtx {
+		t.Errorf("-c 32768 missing when no user flags: %v", got)
+	}
+	if hasNoMmap := containsSeq(got, "--no-mmap"); !hasNoMmap {
+		t.Errorf("--no-mmap missing: %v", got)
+	}
+}
+
+// TestDedupFlags_Mixed checks that only the flag the user overrides (and its
+// value) is removed from the automatic args.
+func TestDedupFlags_Mixed(t *testing.T) {
+	want := []string{"-c", "32768", "--no-mmap"}
+	got := dedupFlags(
+		[]string{"--top-k", "40", "-c", "32768", "--no-mmap"},
+		[]string{"--top-k", "20"},
+	)
+	if reflect.DeepEqual(got, want) {
+		return
+	}
+	t.Errorf("dedupFlags = %v, want %v", got, want)
+}
+
+// TestDedupFlags_EqualsForm checks that an automatic --key=value token is
+// removed when the user supplies the same flag in two-token form.
+func TestDedupFlags_EqualsForm(t *testing.T) {
+	want := []string{"--no-mmap"}
+	got := dedupFlags(
+		[]string{"--ctx-size=4096", "--no-mmap"},
+		[]string{"--ctx-size", "8192"},
+	)
+	if reflect.DeepEqual(got, want) {
+		return
+	}
+	t.Errorf("dedupFlags = %v, want %v", got, want)
+}
+
+// containsSeq reports whether seq occurs in args as a contiguous run.
+// An empty seq is contained in any args.
+func containsSeq(args []string, seq ...string) bool {
+	lastStart := len(args) - len(seq)
+scan:
+	for start := 0; start <= lastStart; start++ {
+		for offset, want := range seq {
+			if args[start+offset] != want {
+				continue scan
+			}
+		}
+		return true
+	}
+	return false
+}
--- a/internal/pool/sidecar_unix.go
+++ b/internal/pool/sidecar_unix.go
@@ -0,0 +1,7 @@
+//go:build !windows
+
+package pool
+
+import "os/exec"
+
+// setPlatformAttrs is a no-op on non-Windows builds (see the build tag above).
+func setPlatformAttrs(_ *exec.Cmd) {}
--- a/internal/pool/sidecar_windows.go
+++ b/internal/pool/sidecar_windows.go
@@ -0,0 +1,15 @@
+//go:build windows
+
+package pool
+
+import (
+	"os/exec"
+	"syscall"
+)
+
+func setPlatformAttrs(cmd *exec.Cmd) {
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		HideWindow:    true,
+		CreationFlags: 0x00000008 | 0x00000200, // DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
+	}
+}
--- a/internal/server/admin.go
+++ b/internal/server/admin.go
@@ -0,0 +1,42 @@
+package server
+
+import (
+	"net/http"
+	"time"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/config"
+	"github.com/indifferentketchup/llama-sidecar/internal/pool"
+)
+
+// healthHandler returns a handler that reports daemon status as JSON: a fixed
+// "ok" status, the number of sidecars currently listed by the pool, the
+// configured maximum, and whole seconds elapsed since startedAt.
+func healthHandler(p *pool.Pool, cfg *config.Config, startedAt time.Time) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		running := len(p.List())
+		uptime := time.Since(startedAt)
+		payload := map[string]any{
+			"status":         "ok",
+			"sidecars":       running,
+			"max":            cfg.MaxSidecars,
+			"uptime_seconds": int(uptime.Seconds()),
+		}
+		writeJSON(w, http.StatusOK, payload)
+	}
+}
+
+// listSidecarsHandler returns a handler that writes the pool's current
+// sidecar list as JSON with status 200.
+func listSidecarsHandler(p *pool.Pool) http.HandlerFunc {
+	list := func(w http.ResponseWriter, r *http.Request) {
+		sidecars := p.List()
+		writeJSON(w, http.StatusOK, sidecars)
+	}
+	return list
+}
+
+// deleteSidecarHandler returns a handler that removes the sidecar named by
+// the {hash} path value. It answers 400 when the hash is empty, 404 with the
+// pool's error message when removal fails, and 200 otherwise.
+func deleteSidecarHandler(p *pool.Pool) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		hash := r.PathValue("hash")
+		if len(hash) == 0 {
+			writeJSON(w, http.StatusBadRequest, map[string]string{"error": "hash required"})
+			return
+		}
+
+		err := p.Remove(hash)
+		if err != nil {
+			writeJSON(w, http.StatusNotFound, map[string]string{"error": err.Error()})
+			return
+		}
+
+		writeJSON(w, http.StatusOK, map[string]string{"status": "removed"})
+	}
+}
--- a/internal/server/proxy.go
+++ b/internal/server/proxy.go
@@ -0,0 +1,111 @@
+package server
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"net/http/httputil"
+	"net/url"
+	"strings"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/pool"
+)
+
+// shellUnsafe deletes the characters that are not allowed in a flags string.
+// parseFlags uses it as a detector: if deleting changes the input, the input
+// contained at least one of them.
+var shellUnsafe = strings.NewReplacer(
+	"`", "",
+	"$", "",
+	"|", "",
+	";", "",
+	"&", "",
+	"\n", "",
+)
+
+// parseFlags splits a raw flags string into whitespace-separated tokens.
+// It returns an error, and no tokens, if raw contains any character that
+// shellUnsafe would remove.
+func parseFlags(raw string) ([]string, error) {
+	if shellUnsafe.Replace(raw) != raw {
+		return nil, fmt.Errorf("flags contain unsafe characters")
+	}
+	trimmed := strings.TrimSpace(raw)
+	return splitArgs(trimmed), nil
+}
+
+// splitArgs splits s on runs of whitespace. The empty string yields nil.
+func splitArgs(s string) []string {
+	if len(s) > 0 {
+		return strings.Fields(s)
+	}
+	return nil
+}
+
+// proxyHandler returns a handler that forwards an OpenAI-style request to the
+// sidecar serving the requested model and flags.
+//
+// Flags come from the X-Agent-Flags header. The model comes from the
+// X-Model-Id header or, when that is absent, from the "model" field of the
+// JSON body. To read that field the body is buffered, up to 1 MiB; a larger
+// body is rejected with 413 rather than being truncated and then reported as
+// "model not specified". Requests that carry X-Model-Id are not buffered and
+// are therefore not subject to that limit.
+//
+// Acquire errors are mapped by message: "validation:" to 400, "unknown
+// model:" to 404, "port allocation:" to 503, anything else to 500. Upstream
+// failures are answered with 502 and the sidecar's recent stderr.
+func proxyHandler(p *pool.Pool) http.HandlerFunc {
+	// maxSniffBytes bounds how much of the body is buffered to find "model".
+	const maxSniffBytes = 1 << 20
+
+	return func(w http.ResponseWriter, r *http.Request) {
+		flagsRaw := r.Header.Get("X-Agent-Flags")
+		var flags []string
+		if flagsRaw != "" {
+			var err error
+			flags, err = parseFlags(flagsRaw)
+			if err != nil {
+				writeJSON(w, http.StatusBadRequest, map[string]string{
+					"error": err.Error(),
+				})
+				return
+			}
+		}
+
+		modelID := r.Header.Get("X-Model-Id")
+		if modelID == "" {
+			// Read one byte past the limit so an oversized body can be told
+			// apart from one that is exactly at the limit.
+			body, err := io.ReadAll(io.LimitReader(r.Body, maxSniffBytes+1))
+			if err != nil {
+				writeJSON(w, http.StatusBadRequest, map[string]string{"error": "failed to read body"})
+				return
+			}
+			if len(body) > maxSniffBytes {
+				writeJSON(w, http.StatusRequestEntityTooLarge, map[string]string{
+					"error": "request body exceeds 1 MiB; send the model in the X-Model-Id header for larger requests",
+				})
+				return
+			}
+			var req struct {
+				Model string `json:"model"`
+			}
+			if err := json.Unmarshal(body, &req); err == nil && req.Model != "" {
+				modelID = req.Model
+			}
+			// Put the consumed bytes back so the proxy forwards the full body.
+			r.Body = io.NopCloser(strings.NewReader(string(body)))
+			r.ContentLength = int64(len(body))
+		}
+		if modelID == "" {
+			writeJSON(w, http.StatusBadRequest, map[string]string{"error": "model not specified (X-Model-Id header or body.model)"})
+			return
+		}
+
+		sidecar, err := p.Acquire(r.Context(), modelID, flags)
+		if err != nil {
+			errMsg := err.Error()
+			status := http.StatusInternalServerError
+			switch {
+			case strings.Contains(errMsg, "validation:"):
+				status = http.StatusBadRequest
+			case strings.Contains(errMsg, "unknown model:"):
+				status = http.StatusNotFound
+			case strings.Contains(errMsg, "port allocation:"):
+				status = http.StatusServiceUnavailable
+			}
+			writeJSON(w, status, map[string]string{"error": errMsg})
+			return
+		}
+
+		target := &url.URL{
+			Scheme: "http",
+			Host:   fmt.Sprintf("127.0.0.1:%d", sidecar.Port),
+		}
+		proxy := httputil.NewSingleHostReverseProxy(target)
+		proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, err error) {
+			slog.Error("upstream error", "hash", sidecar.Hash, "port", sidecar.Port, "err", err)
+			writeJSON(rw, http.StatusBadGateway, map[string]any{
+				"error":        "upstream unavailable",
+				"error_detail": err.Error(),
+				"sidecar_hash": sidecar.Hash,
+				"sidecar_port": sidecar.Port,
+				"last_stderr":  sidecar.LastStderr(),
+			})
+		}
+
+		sidecar.TouchLastUsed()
+		proxy.ServeHTTP(w, r)
+	}
+}
+
+// writeJSON sends v as a JSON response with the given status code.
+// The status line has already been written by the time encoding runs, so an
+// encoding or write failure cannot be reported to the client; it is logged
+// instead of being silently dropped.
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	if err := json.NewEncoder(w).Encode(v); err != nil {
+		slog.Error("write json response", "status", status, "err", err)
+	}
+}
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -0,0 +1,56 @@
+package server
+
+import (
+	"log/slog"
+	"net/http"
+	"time"
+
+	"github.com/indifferentketchup/llama-sidecar/internal/config"
+	"github.com/indifferentketchup/llama-sidecar/internal/pool"
+)
+
+// New builds the daemon's HTTP server, listening on cfg.Bind, with the admin
+// routes (/health, /sidecars) and the two proxied completion routes, all
+// wrapped in the request logger.
+//
+// ReadHeaderTimeout is set so a client that never finishes sending its
+// request headers cannot hold a connection open indefinitely. No read or
+// write timeout is applied to bodies, so long-running completion responses
+// are not cut off.
+func New(cfg *config.Config, p *pool.Pool, startedAt time.Time) *http.Server {
+	mux := http.NewServeMux()
+	mux.HandleFunc("GET /health", healthHandler(p, cfg, startedAt))
+	mux.HandleFunc("GET /sidecars", listSidecarsHandler(p))
+	mux.HandleFunc("DELETE /sidecars/{hash}", deleteSidecarHandler(p))
+	mux.HandleFunc("POST /v1/chat/completions", proxyHandler(p))
+	mux.HandleFunc("POST /v1/completions", proxyHandler(p))
+
+	handler := requestLogger(mux)
+
+	return &http.Server{
+		Addr:              cfg.Bind,
+		Handler:           handler,
+		ReadHeaderTimeout: 10 * time.Second,
+	}
+}
+
+// requestLogger wraps next so that every request is logged after it has been
+// served, with its method, path, response status and duration in
+// milliseconds. The status defaults to 200 when the handler never calls
+// WriteHeader.
+func requestLogger(next http.Handler) http.Handler {
+	logged := func(w http.ResponseWriter, r *http.Request) {
+		began := time.Now()
+		rec := &statusRecorder{ResponseWriter: w, status: 200}
+		next.ServeHTTP(rec, r)
+		elapsed := time.Since(began)
+		slog.Info("request",
+			"method", r.Method,
+			"path", r.URL.Path,
+			"status", rec.status,
+			"duration_ms", elapsed.Milliseconds(),
+		)
+	}
+	return http.HandlerFunc(logged)
+}
+
+// statusRecorder is an http.ResponseWriter wrapper that remembers the status
+// code passed to WriteHeader.
+type statusRecorder struct {
+	http.ResponseWriter
+	status int
+}
+
+// WriteHeader records code and forwards it to the wrapped writer.
+func (rec *statusRecorder) WriteHeader(code int) {
+	rec.status = code
+	rec.ResponseWriter.WriteHeader(code)
+}
+
+// Flush forwards to the wrapped writer when it implements http.Flusher and
+// does nothing otherwise.
+func (rec *statusRecorder) Flush() {
+	flusher, canFlush := rec.ResponseWriter.(http.Flusher)
+	if !canFlush {
+		return
+	}
+	flusher.Flush()
+}
--- a/internal/validator/validator.go
+++ b/internal/validator/validator.go
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
+// Ported from studio/backend/core/inference/llama_server_args.py.
+// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/llama_server_args.py
+
+package validator
+
+import (
+	"fmt"
+	"strings"
+)
+
+// denylistGroups lists the llama-server flags that callers may not pass as
+// extra args. Each inner slice holds the spellings of one flag (short and
+// long forms, or the positive and --no- forms); init flattens all of them
+// into denylist.
+var denylistGroups = [][]string{
+	// Model identity
+	{"-m", "--model"},
+	{"-mu", "--model-url"},
+	{"-dr", "--docker-repo"},
+	{"-hf", "-hfr", "--hf-repo"},
+	{"-hff", "--hf-file"},
+	{"-hfv", "-hfrv", "--hf-repo-v"},
+	{"-hffv", "--hf-file-v"},
+	{"-hft", "--hf-token"},
+	{"-mm", "--mmproj"},
+	{"-mmu", "--mmproj-url"},
+	// Networking
+	{"--host"},
+	{"--port"},
+	{"--path"},
+	{"--api-prefix"},
+	{"--reuse-port"},
+	// Auth / TLS
+	{"--api-key"},
+	{"--api-key-file"},
+	{"--ssl-key-file"},
+	{"--ssl-cert-file"},
+	// Server UI / multi-model
+	{"--webui", "--no-webui"},
+	{"--ui", "--no-ui"},
+	{"--ui-config"},
+	{"--ui-config-file"},
+	{"--ui-mcp-proxy", "--no-ui-mcp-proxy"},
+	{"--models-dir"},
+	{"--models-preset"},
+	{"--models-max"},
+	{"--models-autoload", "--no-models-autoload"},
+}
+
+// denylist is the flattened form of denylistGroups: every managed flag
+// spelling maps to true. It is populated once by init.
+var denylist map[string]bool
+
+// init flattens denylistGroups into denylist.
+func init() {
+	total := 0
+	for _, aliases := range denylistGroups {
+		total += len(aliases)
+	}
+	denylist = make(map[string]bool, total)
+	for _, aliases := range denylistGroups {
+		for i := range aliases {
+			denylist[aliases[i]] = true
+		}
+	}
+}
+
+// FlagName extracts the flag name from a CLI token and returns "" when the
+// token is not a flag. A token is a flag when it starts with '-', is neither
+// "-" nor "--", and its second character is not a digit or '.', so negative
+// numbers such as -1 or -0.5 (as in "--seed -1") count as values. For
+// --key=value tokens only the part before the first '=' is returned.
+func FlagName(token string) string {
+	switch {
+	case token == "-", token == "--":
+		return ""
+	case !strings.HasPrefix(token, "-"):
+		return ""
+	}
+	// token starts with '-' and is not "-", so it has at least two bytes.
+	if second := token[1]; second == '.' || (second >= '0' && second <= '9') {
+		return ""
+	}
+	name, _, _ := strings.Cut(token, "=")
+	return name
+}
+
+// ValidateExtraArgs checks user-supplied llama-server args against the
+// denylist of managed flags. On success it returns a copy of args (nil for an
+// empty input). If any token names a managed flag it returns an error naming
+// the first such flag and no args.
+func ValidateExtraArgs(args []string) ([]string, error) {
+	if len(args) == 0 {
+		return nil, nil
+	}
+	for _, tok := range args {
+		if name := FlagName(tok); name != "" && denylist[name] {
+			return nil, fmt.Errorf("llama-server flag '%s' is managed and cannot be passed as an extra arg", name)
+		}
+	}
+	accepted := make([]string, len(args))
+	copy(accepted, args)
+	return accepted, nil
+}
+
+// IsManagedFlag returns true if flag is a managed llama-server flag.
+// The lookup is an exact match against the denylist, so flag must be the bare
+// flag name (for example "--model", not "--model=foo").
+func IsManagedFlag(flag string) bool {
+	return denylist[flag]
+}
+
+// The four sets below are the flag groups that StripShadowingFlags removes.
+
+// contextFlags are the context-size flags.
+var contextFlags = setOf("-c", "--ctx-size")
+
+// cacheFlags are the cache-type flags.
+var cacheFlags = setOf("-ctk", "--cache-type-k", "-ctv", "--cache-type-v")
+
+// specFlags are the --spec-* and --draft-* flags.
+var specFlags = setOf(
+	"--spec-default", "--spec-type", "--spec-ngram-size-n", "--spec-ngram-size",
+	"--draft-min", "--draft-max",
+	"--spec-draft-n-max", "--spec-draft-n-min", "--spec-draft-p-min", "--spec-draft-p-split",
+	"--spec-ngram-mod-n-match", "--spec-ngram-mod-n-min", "--spec-ngram-mod-n-max",
+)
+
+// templateFlags are the chat-template and jinja flags.
+var templateFlags = setOf(
+	"--chat-template", "--chat-template-file", "--chat-template-kwargs",
+	"--jinja", "--no-jinja",
+)
+
+// booleanShadowingFlags are the shadowing flags that StripShadowingFlags
+// removes without also removing the token that follows them.
+var booleanShadowingFlags = setOf("--spec-default", "--jinja", "--no-jinja")
+
+// setOf builds a membership set: each of vals maps to true. With no
+// arguments it returns an empty, non-nil map.
+func setOf(vals ...string) map[string]bool {
+	set := make(map[string]bool, len(vals))
+	for i := range vals {
+		set[vals[i]] = true
+	}
+	return set
+}
+
+// StripShadowingFlags returns args without the flags in contextFlags,
+// cacheFlags, specFlags and templateFlags. A removed flag takes its value
+// with it: the inline "=value" part, or the following token when that token
+// is not itself a flag. Flags in booleanShadowingFlags never consume a
+// following token.
+func StripShadowingFlags(args []string) []string {
+	shadowing := make(map[string]bool)
+	for _, group := range []map[string]bool{contextFlags, cacheFlags, specFlags, templateFlags} {
+		for name, member := range group {
+			shadowing[name] = member
+		}
+	}
+
+	kept := make([]string, 0, len(args))
+	for i := 0; i < len(args); i++ {
+		tok := args[i]
+		name := FlagName(tok)
+		if name == "" || !shadowing[name] {
+			kept = append(kept, tok)
+			continue
+		}
+		// tok is dropped; decide whether the next token is its value.
+		standalone := booleanShadowingFlags[name] || strings.Contains(tok, "=")
+		if !standalone && i+1 < len(args) && FlagName(args[i+1]) == "" {
+			i++
+		}
+	}
+	return kept
+}
--- a/internal/validator/validator_test.go
+++ b/internal/validator/validator_test.go
@@ -0,0 +1,150 @@
+package validator
+
+import (
+	"testing"
+)
+
+// TestValidateExtraArgs_DenyList checks that every managed flag spelling is
+// rejected when passed on its own.
+func TestValidateExtraArgs_DenyList(t *testing.T) {
+	groups := [][]string{
+		{"-m", "--model"},
+		{"-mu", "--model-url"},
+		{"-dr", "--docker-repo"},
+		{"-hf", "-hfr", "--hf-repo"},
+		{"-hff", "--hf-file"},
+		{"-hfv", "-hfrv", "--hf-repo-v"},
+		{"-hffv", "--hf-file-v"},
+		{"-hft", "--hf-token"},
+		{"-mm", "--mmproj"},
+		{"-mmu", "--mmproj-url"},
+		{"--host", "--port", "--path", "--api-prefix", "--reuse-port"},
+		{"--api-key", "--api-key-file"},
+		{"--ssl-key-file", "--ssl-cert-file"},
+		{"--webui", "--no-webui", "--ui", "--no-ui"},
+		{"--ui-config", "--ui-config-file"},
+		{"--ui-mcp-proxy", "--no-ui-mcp-proxy"},
+		{"--models-dir", "--models-preset", "--models-max"},
+		{"--models-autoload", "--no-models-autoload"},
+	}
+	for _, group := range groups {
+		for _, flag := range group {
+			flag := flag
+			t.Run(flag, func(t *testing.T) {
+				if _, err := ValidateExtraArgs([]string{flag}); err == nil {
+					t.Fatalf("expected error for %s", flag)
+				}
+			})
+		}
+	}
+}
+
+// TestValidateExtraArgs_SafeFlags checks that unmanaged flags are accepted
+// and returned unchanged.
+func TestValidateExtraArgs_SafeFlags(t *testing.T) {
+	for _, flag := range []string{
+		"-c", "--ctx-size", "-ngl", "--gpu-layers",
+		"--top-k", "--cache-type-k", "--jinja", "--no-jinja",
+		"--spec-draft-n-max", "-fa", "--flash-attn",
+		"-t", "--threads", "-np", "--parallel", "--no-mmap",
+	} {
+		flag := flag
+		t.Run(flag, func(t *testing.T) {
+			out, err := ValidateExtraArgs([]string{flag})
+			if err != nil {
+				t.Fatalf("unexpected error for %s: %v", flag, err)
+			}
+			if len(out) == 1 && out[0] == flag {
+				return
+			}
+			t.Fatalf("expected [%s], got %v", flag, out)
+		})
+	}
+}
+
+func TestValidateExtraArgs_FlagEqualsValue(t *testing.T) {
+	_, err := ValidateExtraArgs([]string{"--model=evil.gguf"})
+	if err == nil {
+		t.Fatal("expected error for --model=evil.gguf")
+	}
+	out, err := ValidateExtraArgs([]string{"--ctx-size=4096"})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(out) != 1 || out[0] != "--ctx-size=4096" {
+		t.Fatalf("expected [--ctx-size=4096], got %v", out)
+	}
+}
+
+// TestValidateExtraArgs_NegativeNumber checks that a negative numeric value
+// following a flag is kept as a token of its own.
+func TestValidateExtraArgs_NegativeNumber(t *testing.T) {
+	input := []string{"--seed", "-1"}
+	out, err := ValidateExtraArgs(input)
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case len(out) != 2:
+		t.Fatalf("expected 2 tokens, got %d", len(out))
+	}
+}
+
+// TestValidateExtraArgs_Empty checks that a nil input yields a nil result
+// and no error.
+func TestValidateExtraArgs_Empty(t *testing.T) {
+	out, err := ValidateExtraArgs(nil)
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case out != nil:
+		t.Fatalf("expected nil, got %v", out)
+	}
+}
+
+// TestIsManagedFlag spot-checks IsManagedFlag with two managed spellings and
+// one unmanaged flag.
+func TestIsManagedFlag(t *testing.T) {
+	checks := []struct {
+		flag    string
+		managed bool
+		failure string
+	}{
+		{"--model", true, "--model should be managed"},
+		{"-m", true, "-m should be managed"},
+		{"-c", false, "-c should not be managed"},
+	}
+	for _, c := range checks {
+		if IsManagedFlag(c.flag) != c.managed {
+			t.Fatal(c.failure)
+		}
+	}
+}
+
+// TestFlagName checks FlagName against flags, --key=value tokens, negative
+// numbers, bare dashes and plain words.
+func TestFlagName(t *testing.T) {
+	// Each pair is {input, expected name}.
+	pairs := [][2]string{
+		{"--model=foo", "--model"},
+		{"-c", "-c"},
+		{"--top-k", "--top-k"},
+		{"-1", ""},
+		{"-0.5", ""},
+		{"-", ""},
+		{"--", ""},
+		{"hello", ""},
+	}
+	for _, pair := range pairs {
+		in, want := pair[0], pair[1]
+		if got := FlagName(in); got != want {
+			t.Errorf("FlagName(%q) = %q, want %q", in, got, want)
+		}
+	}
+}
+
+// TestStripShadowingFlags covers the four removal cases: a flag with a
+// separate value, flags that must be retained, a boolean flag, and the
+// --key=value form.
+func TestStripShadowingFlags(t *testing.T) {
+	cases := []struct {
+		name string
+		in   []string
+		ok   func(out []string) bool
+	}{
+		{
+			name: "strips context flag with value",
+			in:   []string{"-c", "4096", "--top-k", "40"},
+			ok: func(out []string) bool {
+				return len(out) == 2 && out[0] == "--top-k" && out[1] == "40"
+			},
+		},
+		{
+			name: "retains non-shadowing flags",
+			in:   []string{"--top-k", "40", "--top-p", "0.95"},
+			ok:   func(out []string) bool { return len(out) == 4 },
+		},
+		{
+			name: "strips boolean jinja flag",
+			in:   []string{"--jinja", "--top-k", "40"},
+			ok: func(out []string) bool {
+				return len(out) == 2 && out[0] == "--top-k"
+			},
+		},
+		{
+			name: "strips equals form",
+			in:   []string{"--ctx-size=4096"},
+			ok:   func(out []string) bool { return len(out) == 0 },
+		},
+	}
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			if out := StripShadowingFlags(tc.in); !tc.ok(out) {
+				t.Fatalf("got %v", out)
+			}
+		})
+	}
+}
--- a/internal/winsvc/winsvc_unix.go
+++ b/internal/winsvc/winsvc_unix.go
@@ -0,0 +1,26 @@
+//go:build !windows
+
+package winsvc
+
+import (
+	"context"
+	"log/slog"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+)
+
+// RegisterShutdownHandler blocks until the process receives SIGTERM or
+// SIGINT, then runs shutdownFunc with a context derived from ctx that expires
+// after 30 seconds, and finally terminates the process: exit status 1 if
+// shutdownFunc returned an error, 0 otherwise. It never returns.
+func RegisterShutdownHandler(ctx context.Context, shutdownFunc func(context.Context) error) {
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT)
+	<-sigCh
+	slog.Info("shutdown signal received")
+
+	shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	err := shutdownFunc(shutdownCtx)
+	// Called explicitly: os.Exit below does not run deferred functions.
+	cancel()
+	if err != nil {
+		slog.Error("shutdown error", "err", err)
+		os.Exit(1)
+	}
+	os.Exit(0)
+}
--- a/internal/winsvc/winsvc_windows.go
+++ b/internal/winsvc/winsvc_windows.go
@@ -0,0 +1,25 @@
+//go:build windows
+
+package winsvc
+
+import (
+	"context"
+	"log/slog"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+)
+
+// RegisterShutdownHandler blocks until the process receives os.Interrupt
+// (Ctrl-C / Ctrl-Break) or syscall.SIGTERM, then runs shutdownFunc with a
+// context derived from ctx that expires after 30 seconds, and finally
+// terminates the process: exit status 1 if shutdownFunc returned an error,
+// 0 otherwise. It never returns.
+//
+// SIGTERM is registered because, on Windows, os/signal reports console
+// close, logoff and shutdown events as syscall.SIGTERM; without it those
+// events would end the process without running shutdownFunc.
+func RegisterShutdownHandler(ctx context.Context, shutdownFunc func(context.Context) error) {
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
+	<-sigCh
+	slog.Info("shutdown signal received")
+
+	shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	err := shutdownFunc(shutdownCtx)
+	// Called explicitly: os.Exit below does not run deferred functions.
+	cancel()
+	if err != nil {
+		slog.Error("shutdown error", "err", err)
+		os.Exit(1)
+	}
+	os.Exit(0)
+}