llama-sidecar v0.1.0: daemon + benchmarks + eval suite

Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00
parent babbb4f39b
commit fe7f36ae98
39 changed files with 4228 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,18 @@
 bin/
 *.exe
 eval/.venv/
 eval/results/
 eval/scores.csv
 eval/SUMMARY.md
 eval/eval.log
 eval/ab/results/
 eval/ab/COMPARE.md
 eval/ab/timing.csv
 eval/ab/run.log
 bench/results/
 bench/SUMMARY.md
 bench/results.csv
 bench/llama-swap-recommendations.md
 internal/pool/*.bak-*
 internal/pool/sidecar_windows.go.bak-*
 __pycache__/
--- a/19
+++ b/19
@@ -0,0 +1,19 @@
 # Build, test, and lint targets for the llama-sidecar daemon.
 .PHONY: build build-windows test test-integration lint
 # Go toolchain binary; override per invocation with `make GO=/path/to/go <target>`.
 GO = /snap/go/current/bin/go
 # Native build for local development.
 build:
 	$(GO) build -o bin/llama-sidecar ./cmd/llama-sidecar
 # Cross-compile the Windows AMD64 binary.
 build-windows:
 	GOOS=windows GOARCH=amd64 $(GO) build -o bin/llama-sidecar.exe ./cmd/llama-sidecar
 # Unit tests.
 test:
 	$(GO) test ./internal/...
 # Tests guarded by the `integration` build tag.
 test-integration:
 	$(GO) test -tags=integration ./internal/...
 # `gofmt -l` exits 0 even when it lists files, so fail explicitly when the
 # list is non-empty; otherwise formatting drift would never break the target.
 lint:
 	$(GO) vet ./...
 	@unformatted="$$(gofmt -l .)"; if [ -n "$$unformatted" ]; then echo "gofmt needed on:"; echo "$$unformatted"; exit 1; fi
--- a/README.md
+++ b/README.md
@@ -0,0 +1,77 @@
 # llama-sidecar
 Per-agent llama-server process pool daemon. Runs on sam-desktop alongside llama-swap. Spawns or reuses llama-server processes keyed on (modelID, flags) hash.
 ## License
 AGPL-3.0-only.
 The validator package (`internal/validator/`) is ported from [Unsloth Studio](https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/llama_server_args.py) (AGPL-3.0). BooCode's TypeScript port (`apps/server/src/services/inference/llama-args-validator.ts`) is the sibling — update both when upstream changes.
 ## Build
 ```bash
 # Linux (development)
 make build
 # Windows AMD64 (production target — cross-compile from Linux)
 make build-windows
 # Copy to sam-desktop
 # scp bin/llama-sidecar.exe sam-desktop:C:\llama-sidecar\
 ```
 ## Configuration
 All configuration is done through environment variables; the daemon takes no CLI flags:
 | Variable | Required | Default | Description |
 |----------|----------|---------|-------------|
 | `LLAMA_SERVER_BIN` | yes | — | Path to llama-server.exe |
 | `MODEL_DIR_MAP_FILE` | yes | — | JSON file mapping model IDs to GGUF paths |
 | `LLAMA_SIDECAR_BIND` | no | `127.0.0.1:8402` | Listen address |
 | `PORT_RANGE` | no | `8500-8599` | Port range for sidecar processes |
 | `MAX_SIDECARS` | no | `2` | Max concurrent sidecar processes |
 | `LOG_LEVEL` | no | `info` | Log level (debug, info, warn, error) |
 | `BASE_ARGS` | no | `["-ngl","999","-c","32768","--flash-attn","on","--no-mmap"]` | JSON array of base llama-server args |
 | `HEALTH_TIMEOUT_SECONDS` | no | `60` | Max wait for sidecar health check |
 | `HEALTH_INTERVAL_SECONDS` | no | `30` | Background health check interval |
 ## API
 ### `GET /health`
 Returns daemon status.
 ### `GET /sidecars`
 Returns list of active sidecar processes.
 ### `DELETE /sidecars/{hash}`
 Kill and remove a sidecar process.
 ### `POST /v1/chat/completions`
 OpenAI-compatible proxy. Routes to a sidecar process based on model + flags.
 Headers:
 - `X-Agent-Flags: --top-k 20 --cache-type-k q8_0` (optional)
 - `X-Model-Id: qwen3.6-35b-a3b-mxfp4` (optional, overrides body.model)
 ## Test
 ```bash
 make test                  # unit tests
 make test-integration      # requires real llama-server + GGUF
 make lint                  # vet + gofmt
 ```
 ## NSSM Service
 The daemon is pre-configured on sam-desktop as the NSSM service `llama-sidecar`. Start, stop, or query it with:
 ```
 C:\Tools\nssm\nssm.exe start llama-sidecar
 C:\Tools\nssm\nssm.exe stop llama-sidecar
 C:\Tools\nssm\nssm.exe status llama-sidecar
 ```
--- a/bench/analyze.py
+++ b/bench/analyze.py
@@ -0,0 +1,215 @@
 #!/usr/bin/env python3
 """Analyze MTP on/off benchmark results → CSV + SUMMARY.md + recommendations."""
 import csv
 import json
 import os
 import re
 import statistics
 from pathlib import Path
 RESULTS_DIR = Path(__file__).parent / "results"
 CSV_PATH = Path(__file__).parent / "results.csv"
 SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md"
 RECO_PATH = Path(__file__).parent / "llama-swap-recommendations.md"
 FNAME_RE = re.compile(
    r"^(?P<stem>.+?)__mtp-(?P<mtp>on|off)__len(?P<len>\d+)__run(?P<run>\d+)\.json$"
 )
 def parse_result(path: Path) -> dict | None:
    m = FNAME_RE.match(path.name)
    if not m:
        return None
    try:
        data = json.loads(path.read_text())
    except (json.JSONDecodeError, OSError):
        return None
    t = data.get("timings", {})
    return {
        "gguf": m.group("stem"),
        "mtp": m.group("mtp"),
        "prompt_len": int(m.group("len")),
        "run": int(m.group("run")),
        "prompt_tps": t.get("prompt_per_second"),
        "predicted_tps": t.get("predicted_per_second"),
        "cache_n": t.get("cache_n"),
        "draft_n": t.get("draft_n"),
        "accepted_n": t.get("draft_n_accepted"),
        "total_ms": (t.get("prompt_ms", 0) or 0) + (t.get("predicted_ms", 0) or 0),
    }
 def load_all() -> list[dict]:
    """Parse every ``*.json`` file in RESULTS_DIR, in sorted path order.

    Returns:
        The rows returned by ``parse_result``, with falsy results (files it
        rejected) left out.
    """
    parsed = (parse_result(candidate) for candidate in sorted(RESULTS_DIR.glob("*.json")))
    return [row for row in parsed if row]
 def write_csv(rows: list[dict]) -> None:
    """Dump all rows to CSV_PATH with a header line, then report the count.

    Args:
        rows: Row dicts whose keys are exactly the column names listed below.
    """
    columns = [
        "gguf", "mtp", "prompt_len", "run", "prompt_tps", "predicted_tps",
        "cache_n", "draft_n", "accepted_n", "total_ms",
    ]
    with open(CSV_PATH, "w", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
    print(f"Wrote {len(rows)} rows to {CSV_PATH}")
 def median_of(values: list[float]) -> float:
    """Return the median of ``values``, or 0.0 when the list is empty."""
    if not values:
        return 0.0
    return statistics.median(values)
 def write_summary(rows: list[dict]) -> None:
    """Write the per-GGUF MTP on/off comparison to SUMMARY_PATH and print it.

    For every GGUF and prompt length the table shows the median
    ``predicted_tps`` with MTP off and on (runs >= 2 only; run 1 is treated as
    warmup), the relative change, and the draft acceptance rate. A GGUF gets
    the verdict "KEEP MTP" only when MTP-on is at least 10% faster than MTP-off
    for at least one prompt length; a slowdown, or missing MTP-on data, never
    counts in MTP's favour.

    Args:
        rows: Row dicts with the keys produced by ``parse_result``.
    """
    def measured(gguf: str, state: str, prompt_len: int) -> list[dict]:
        # Non-warmup rows belonging to one table cell.
        return [r for r in rows
                if r["gguf"] == gguf and r["mtp"] == state
                and r["prompt_len"] == prompt_len and r["run"] >= 2]

    def median_tps(cell_rows: list[dict]) -> float:
        # Median generation speed; rows without a measurement are ignored.
        return median_of([r["predicted_tps"] for r in cell_rows
                          if r["predicted_tps"] is not None])

    ggufs = sorted(set(r["gguf"] for r in rows))
    lens = sorted(set(r["prompt_len"] for r in rows))
    header_parts = ["prompt_len", "MTP-off tok/s", "MTP-on tok/s", "delta %", "accept %"]

    lines = ["# MTP On/Off Benchmark Results\n"]
    lines.append(f"**{len(rows)} measurements across {len(ggufs)} GGUFs.**\n")
    lines.append("Runs 2 & 3 used for median (run 1 = warmup, discarded).\n")
    verdicts = []
    for gguf in ggufs:
        lines.append(f"\n## {gguf}\n")
        lines.append("| " + " | ".join(header_parts) + " |")
        lines.append("|" + "|".join("---" for _ in header_parts) + "|")
        mtp_helps = False
        for pl in lens:
            on_rows = measured(gguf, "on", pl)
            off_med = median_tps(measured(gguf, "off", pl))
            on_med = median_tps(on_rows)
            if off_med > 0:
                delta = ((on_med - off_med) / off_med) * 100
            else:
                delta = 0.0
            # Signed comparison: only a speed-up of 10% or more justifies MTP.
            if delta >= 10:
                mtp_helps = True
            draft_rows = [r for r in on_rows if r.get("draft_n")]
            total_draft = sum(r["draft_n"] for r in draft_rows)
            # accepted_n can be None even when draft_n is set; count it as 0.
            total_accepted = sum((r.get("accepted_n") or 0) for r in draft_rows)
            accept_pct = f"{(total_accepted / total_draft * 100):.0f}%" if total_draft > 0 else "—"
            lines.append(
                f"| {pl} | {off_med:.1f} | {on_med:.1f} | {delta:+.1f}% | {accept_pct} |"
            )
        verdict = "KEEP MTP" if mtp_helps else "DROP MTP"
        verdicts.append((gguf, verdict))
        lines.append(f"\n**Verdict: {verdict}**\n")
    lines.append("\n---\n")
    lines.append("## Verdict Summary\n")
    lines.append("| GGUF | Verdict |")
    lines.append("|------|---------|")
    for gguf, verdict in verdicts:
        lines.append(f"| {gguf} | {verdict} |")
    summary = "\n".join(lines) + "\n"
    # The table may contain an em dash; do not depend on the locale encoding.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(f"Wrote {SUMMARY_PATH}")
    print(summary)
 def write_recommendations(rows: list[dict]) -> None:
    """Write a commented llama-swap config diff to RECO_PATH.

    For each GGUF the median ``predicted_tps`` of all non-warmup runs
    (run >= 2, pooled across prompt lengths) is compared between MTP on and
    off. MTP is recommended when it is at least 10% faster; the output says
    whether the MTP flags should be added, removed, or left as they are
    relative to the hard-coded ``currently_mtp`` table.

    Args:
        rows: Row dicts with the keys produced by ``parse_result``.
    """
    def pooled_median(gguf: str, state: str) -> float:
        # Median generation speed over every measured run of one GGUF/state.
        return median_of([r["predicted_tps"] for r in rows
                          if r["gguf"] == gguf and r["mtp"] == state and r["run"] >= 2
                          and r["predicted_tps"] is not None])

    ggufs = sorted(set(r["gguf"] for r in rows))
    lines = ["# llama-swap Config Recommendations\n"]
    lines.append("Based on MTP on/off benchmark results.\n")
    lines.append("**Read-only reference** — do NOT edit D:\\llama-swap\\config.yaml directly.\n")
    lines.append("```yaml")
    lines.append("# Commented diff against current config.yaml")
    lines.append("# Lines starting with + should be added, - should be removed")
    lines.append("")
    # GGUF file stem -> model ID shown in the output; unknown stems are shown as-is.
    model_map = {
        "Qwen3.6-35B-A3B-MXFP4_MOE": "qwen3.6-35b-a3b-mxfp4",
        "Qwen3.6-27B-Q6_K": "qwen3.6-27b-mtp",
        "Qwopus3.5-4B-v3-MTP-Q8_0": "qwopus3.5-4b-mtp",
        "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": "qwen3.5-9b-deepseek-v4-mtp",
        "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": "qwopus3.6-35b-a3b-v1-mtp",
        "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": "qwopus3.6-35b-a3b-mxfp4-mtp",
        "Qwopus3.6-27B-v2-MTP-Q6_K": "qwopus3.6-27b-v2-mtp",
        "Qwopus3.5-9B-Coder-MTP-Q8_0": "qwopus3.5-9b-coder-mtp",
    }
    # Whether each GGUF is treated as having MTP enabled today; unknown stems count as off.
    currently_mtp = {
        "Qwen3.6-35B-A3B-MXFP4_MOE": False,
        "Qwen3.6-27B-Q6_K": True,
        "Qwopus3.5-4B-v3-MTP-Q8_0": True,
        "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": True,
        "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": True,
        "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": True,
        "Qwopus3.6-27B-v2-MTP-Q6_K": True,
        "Qwopus3.5-9B-Coder-MTP-Q8_0": True,
    }
    for gguf in ggufs:
        model_id = model_map.get(gguf, gguf)
        is_mtp_now = currently_mtp.get(gguf, False)
        off_med = pooled_median(gguf, "off")
        on_med = pooled_median(gguf, "on")
        delta = ((on_med - off_med) / off_med * 100) if off_med > 0 else 0
        should_mtp = delta >= 10
        lines.append(f"  # {model_id}: MTP {'on' if is_mtp_now else 'off'} → {'on' if should_mtp else 'off'} (delta {delta:+.1f}%)")
        if should_mtp and not is_mtp_now:
            lines.append("  # + --spec-type draft-mtp --spec-draft-n-max 2")
        elif not should_mtp and is_mtp_now:
            lines.append("  # - --spec-type draft-mtp --spec-draft-n-max 2")
        else:
            lines.append("  # (no change)")
        lines.append("")
    lines.append("```\n")
    reco = "\n".join(lines)
    # The text contains non-ASCII characters; do not depend on the locale encoding.
    RECO_PATH.write_text(reco, encoding="utf-8")
    print(f"Wrote {RECO_PATH}")
 def main() -> None:
    """Load all results and regenerate the CSV, summary and recommendations.

    When no rows are loaded, only a notice naming RESULTS_DIR is printed and
    nothing is written.
    """
    rows = load_all()
    if rows:
        write_csv(rows)
        write_summary(rows)
        write_recommendations(rows)
    else:
        print("No results found in", RESULTS_DIR)
 if __name__ == "__main__":
    main()
--- a/bench/bench.sh
+++ b/bench/bench.sh
@@ -0,0 +1,192 @@
 #!/usr/bin/env bash
 # Abort on any failing command, unset variable, or failing pipeline stage.
 set -euo pipefail
 # Base URL of the llama-server under test (health polling and chat completions).
 ENDPOINT="http://100.101.41.16:8650"
 # Host that every helper below reaches over ssh.
 SSH_HOST="samki@100.101.41.16"
 # Name of the scheduled task used to launch the server on that host.
 TASK_NAME="bench_llama"
 # Batch file written on the remote host; single quotes keep %TEMP% unexpanded locally.
 BAT_PATH='%TEMP%\bench_run.bat'
 # Result and prompt directories, resolved relative to this script's location.
 RESULTS_DIR="$(cd "$(dirname "$0")" && pwd)/results"
 PROMPTS_DIR="$(cd "$(dirname "$0")" && pwd)/prompts"
 # max_tokens sent with every request.
 MAX_TOKENS=200
 # Seconds poll_health waits for the server before giving up.
 HEALTH_TIMEOUT=120
 # Remote path of the llama-server executable placed at the start of the batch file.
 LLAMA_BIN='D:\llama-server\llama-server.exe'
 mkdir -p "$RESULTS_DIR"
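 # ── Config matrix: STEM|MTP_STATE|FULL_ARGS ───────────────────────────
 # One entry per (GGUF, MTP state). Fields are '|'-separated: the GGUF stem used
 # in result file names, the MTP state (on/off), and the full llama-server
 # argument string. Each on/off pair is identical except that the "on" entry
 # adds `--spec-type draft-mtp --spec-draft-n-max 2`.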
 CONFIGS=(
 'Qwen3.6-35B-A3B-MXFP4_MOE|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwen3.6-35B-A3B-MXFP4_MOE|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwen3.6-27B-Q6_K|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwen3.6-27B-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.5-4B-v3-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.5-4B-v3-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.6-27B-v2-MTP-Q6_K|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.6-27B-v2-MTP-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.5-9B-Coder-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 'Qwopus3.5-9B-Coder-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
 )
 PROMPT_LENS=(256 1024 4096)
 # ── Helper functions ──────────────────────────────────────────────────
 # Stop whatever is listening on port 8650 on the remote host and delete the
 # scheduled task. Every step is best-effort: errors are silenced and ignored.
 # NOTE(review): the port is hard-coded here rather than derived from ENDPOINT.
 kill_bench_server() {
  local pids
  # Remote cmd.exe loop: echo the 5th token of each netstat line that mentions
  # :8650 and LISTENING. The '"'"' sequences embed literal single quotes in the
  # remote command; the result is passed to taskkill as a PID below.
  pids=$(ssh "$SSH_HOST" 'for /f "tokens=5" %a in ('"'"'netstat -aon ^| findstr :8650 ^| findstr LISTENING'"'"') do @echo %a' 2>/dev/null || true)
  for pid in $pids; do
    # Skip empty tokens and PID 0.
    if [ -n "$pid" ] && [ "$pid" != "0" ]; then
      ssh "$SSH_HOST" "taskkill /F /PID $pid" 2>/dev/null || true
    fi
  done
  # Remove the scheduled task so the next start can recreate it.
  ssh "$SSH_HOST" "schtasks /Delete /TN ${TASK_NAME} /F" 2>/dev/null || true
  # Fixed pause before the caller continues.
  sleep 3
 }
 # Launch llama-server on the remote host with the given argument string.
 #   $1  full llama-server argument string (third field of a CONFIGS entry)
 # NOTE(review): stderr is discarded and none of the ssh calls is guarded, so
 # under `set -e` a failing step ends the whole script without a message.
 start_bench_server() {
  local args="$1"
  # Write a batch file, then run it via schtasks
  ssh "$SSH_HOST" "echo ${LLAMA_BIN} ${args} > ${BAT_PATH}" 2>/dev/null
  # (Re)create the task pointing at the batch file; /F overwrites an existing one.
  ssh "$SSH_HOST" "schtasks /Create /TN ${TASK_NAME} /TR ${BAT_PATH} /SC ONCE /ST 00:00 /F /RL HIGHEST" 2>/dev/null
  # Trigger the task immediately instead of waiting for its scheduled time.
  ssh "$SSH_HOST" "schtasks /Run /TN ${TASK_NAME}" 2>/dev/null
 }
 # Poll ${ENDPOINT}/health every 3 seconds until it answers successfully or
 # HEALTH_TIMEOUT seconds have been counted. Prints progress every 15 seconds.
 # Returns 0 once healthy, 1 on timeout.
 poll_health() {
  local waited=0
  until [ "$waited" -ge "$HEALTH_TIMEOUT" ]; do
    if curl -sf "${ENDPOINT}/health" >/dev/null 2>&1; then
      echo "  health OK (${waited}s)"
      return 0
    fi
    sleep 3
    waited=$((waited + 3))
    if (( waited % 15 == 0 )); then
      echo "  waiting... (${waited}s)"
    fi
  done
  echo "  HEALTH TIMEOUT after ${HEALTH_TIMEOUT}s"
  return 1
 }
 # POST one prompt to the chat-completions endpoint and save the response body.
 #   $1  path of the prompt text file
 #   $2  path the response body is written to
 # Returns 0 on HTTP 200; otherwise prints "HTTP <code>" and returns 1.
 send_request() {
  local prompt_file="$1"
  local output_file="$2"
  local body
  # The prompt path and token limit are handed to Python as argv instead of
  # being spliced into the program text, so quotes or backslashes in the path
  # cannot break (or inject into) the generated Python source.
  body=$(python3 -c 'import json, sys; print(json.dumps({"messages": [{"role": "user", "content": open(sys.argv[1]).read()}], "max_tokens": int(sys.argv[2]), "temperature": 0, "seed": 42, "stream": False}))' "$prompt_file" "$MAX_TOKENS")
  local http_code
  # -o stores the body in the output file; -w prints only the status code,
  # which is what ends up in http_code.
  http_code=$(curl -s -w '%{http_code}' -o "$output_file" \
    --max-time 300 \
    -X POST "${ENDPOINT}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d "$body" 2>/dev/null)
  if [ "$http_code" != "200" ]; then
    echo "HTTP ${http_code}"
    return 1
  fi
  return 0
 }
 # Print a one-line throughput summary for a saved response.
 #   $1  path of a response JSON file containing a "timings" object
 # Output: "prompt=<tok/s>  eval=<tok/s> tok/s", plus "  draft=<accepted>/<drafted>"
 # when the timings include draft_n. Prints "(parse error)" if Python fails.
 print_metrics() {
  # The file path is passed as argv rather than interpolated into the Python
  # source, so a quote or backslash in the path cannot corrupt the program.
  python3 -c 'import json, sys; t = json.load(open(sys.argv[1])).get("timings", {}); ptps = t.get("prompt_per_second", 0); etps = t.get("predicted_per_second", 0); dn = t.get("draft_n", ""); da = t.get("draft_n_accepted", ""); draft = (f"  draft={da}/{dn}" if dn != "" else ""); print(f"prompt={ptps:.1f}  eval={etps:.1f} tok/s{draft}")' "$1" 2>/dev/null || echo "(parse error)"
 }
 # ── Main ──────────────────────────────────────────────────────────────
 # Sweep driver: for each CONFIGS entry, restart the remote server with that
 # entry's arguments, run every prompt length three times, then stop the server.
 total=${#CONFIGS[@]}
 echo "================================================================"
 echo "  MTP ON/OFF BENCHMARK SWEEP"
 echo "  ${total} configs x 3 prompts x 3 runs"
 echo "  Endpoint: ${ENDPOINT}"
 echo "================================================================"
 # Wall-clock start, used for the elapsed-time line at the end.
 t_start=$(date +%s)
 config_idx=0
 for config_entry in "${CONFIGS[@]}"; do
  config_idx=$((config_idx + 1))
  # Split "STEM|MTP_STATE|FULL_ARGS"; the last variable receives the remainder.
  IFS='|' read -r stem mtp_state args <<< "$config_entry"
  echo ""
  echo "================================================================"
  echo "  [${config_idx}/${total}] ${stem}  MTP=${mtp_state}"
  echo "================================================================"
  # Clear any server left on the benchmark port before starting this config.
  kill_bench_server
  echo "  Starting llama-server..."
  start_bench_server "$args"
  # If the server never reports healthy, clean up and move to the next config.
  if ! poll_health; then
    echo "  SKIPPING"
    kill_bench_server
    continue
  fi
  for len in "${PROMPT_LENS[@]}"; do
    prompt_file="${PROMPTS_DIR}/p${len}.txt"
    # A missing prompt file skips only this prompt length.
    [ -f "$prompt_file" ] || { echo "  Missing p${len}.txt"; continue; }
    echo "  -- p${len} --"
    for run in 1 2 3; do
      # The file name encodes stem, MTP state, prompt length and run number.
      outfile="${RESULTS_DIR}/${stem}__mtp-${mtp_state}__len${len}__run${run}.json"
      printf "    run %d: " "$run"
      # On failure send_request has already printed the HTTP code; just go on.
      if send_request "$prompt_file" "$outfile"; then
        print_metrics "$outfile"
      fi
      sleep 1
    done
  done
  echo "  Killing..."
  kill_bench_server
 done
 t_end=$(date +%s)
 elapsed=$(( t_end - t_start ))
 echo ""
 echo "================================================================"
 echo "  SWEEP COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s"
 echo "  Run: python3 $(dirname "$0")/analyze.py"
 echo "================================================================"
--- a/bench/prompts/p1024.txt
+++ b/bench/prompts/p1024.txt
@@ -0,0 +1,67 @@
 You will rejoice to hear that no disaster has accompanied the
 commencement of an enterprise which you have regarded with such evil
 forebodings. I arrived here yesterday, and my first task is to assure
 my dear sister of my welfare and increasing confidence in the success
 of my undertaking.
 I am already far north of London, and as I walk in the streets of
 Petersburgh, I feel a cold northern breeze play upon my cheeks, which
 braces my nerves and fills me with delight. Do you understand this
 feeling? This breeze, which has travelled from the regions towards
 which I am advancing, gives me a foretaste of those icy climes.
 Inspirited by this wind of promise, my daydreams become more fervent
 and vivid. I try in vain to be persuaded that the pole is the seat of
 frost and desolation; it ever presents itself to my imagination as the
 region of beauty and delight. There, Margaret, the sun is for ever
 visible, its broad disk just skirting the horizon and diffusing a
 perpetual splendour. There—for with your leave, my sister, I will put
 some trust in preceding navigators—there snow and frost are banished;
 and, sailing over a calm sea, we may be wafted to a land surpassing in
 wonders and in beauty every region hitherto discovered on the habitable
 globe. Its productions and features may be without example, as the
 phenomena of the heavenly bodies undoubtedly are in those undiscovered
 solitudes. What may not be expected in a country of eternal light? I
 may there discover the wondrous power which attracts the needle and may
 regulate a thousand celestial observations that require only this
 voyage to render their seeming eccentricities consistent for ever. I
 shall satiate my ardent curiosity with the sight of a part of the world
 never before visited, and may tread a land never before imprinted by
 the foot of man. These are my enticements, and they are sufficient to
 conquer all fear of danger or death and to induce me to commence this
 laborious voyage with the joy a child feels when he embarks in a little
 boat, with his holiday mates, on an expedition of discovery up his
 native river. But supposing all these conjectures to be false, you
 cannot contest the inestimable benefit which I shall confer on all
 mankind, to the last generation, by discovering a passage near the pole
 to those countries, to reach which at present so many months are
 requisite; or by ascertaining the secret of the magnet, which, if at
 all possible, can only be effected by an undertaking such as mine.
 These reflections have dispelled the agitation with which I began my
 letter, and I feel my heart glow with an enthusiasm which elevates me
 to heaven, for nothing contributes so much to tranquillise the mind as
 a steady purpose—a point on which the soul may fix its intellectual
 eye. This expedition has been the favourite dream of my early years. I
 have read with ardour the accounts of the various voyages which have
 been made in the prospect of arriving at the North Pacific Ocean
 through the seas which surround the pole. You may remember that a
 history of all the voyages made for purposes of discovery composed the
 whole of our good Uncle Thomas’ library. My education was neglected,
 yet I was passionately fond of reading. These volumes were my study
 day and night, and my familiarity with them increased that regret which
 I had felt, as a child, on learning that my father’s dying injunction
 had forbidden my uncle to allow me to embark in a seafaring life.
 These visions faded when I perused, for the first time, those poets
 whose effusions entranced my soul and lifted it to heaven. I also
 became a poet and for one year lived in a paradise of my own creation;
 I imagined that I also might obtain a niche in the temple where the
 names of Homer and Shakespeare are consecrated. You are well
 acquainted with my failure and how heavily I bore the disappointment.
 But just at that time I inherited the fortune of my cousin, and my
 thoughts were turned into the channel of their earlier bent.
 Six years have passed since I resolved on my present undertaking. I
 can, even now, remember the hour from which I dedicated myself to this
 great enterprise. I commenced by inuring my body to hardship.
 Continue this passage in exactly 200 tokens of prose.
--- a/bench/prompts/p256.txt
+++ b/bench/prompts/p256.txt
@@ -0,0 +1,18 @@
 You will rejoice to hear that no disaster has accompanied the
 commencement of an enterprise which you have regarded with such evil
 forebodings. I arrived here yesterday, and my first task is to assure
 my dear sister of my welfare and increasing confidence in the success
 of my undertaking.
 I am already far north of London, and as I walk in the streets of
 Petersburgh, I feel a cold northern breeze play upon my cheeks, which
 braces my nerves and fills me with delight. Do you understand this
 feeling? This breeze, which has travelled from the regions towards
 which I am advancing, gives me a foretaste of those icy climes.
 Inspirited by this wind of promise, my daydreams become more fervent
 and vivid. I try in vain to be persuaded that the pole is the seat of
 frost and desolation; it ever presents itself to my imagination as the
 region of beauty and delight. There, Margaret, the sun is for ever
 visible, its broad disk just skirting the horizon and diffusing a
 perpetual splendour.
 Continue this passage in exactly 200 tokens of prose.
--- a/bench/prompts/p4096.txt
+++ b/bench/prompts/p4096.txt
@@ -0,0 +1,319 @@
 You will rejoice to hear that no disaster has accompanied the
 commencement of an enterprise which you have regarded with such evil
 forebodings. I arrived here yesterday, and my first task is to assure
 my dear sister of my welfare and increasing confidence in the success
 of my undertaking.
 I am already far north of London, and as I walk in the streets of
 Petersburgh, I feel a cold northern breeze play upon my cheeks, which
 braces my nerves and fills me with delight. Do you understand this
 feeling? This breeze, which has travelled from the regions towards
 which I am advancing, gives me a foretaste of those icy climes.
 Inspirited by this wind of promise, my daydreams become more fervent
 and vivid. I try in vain to be persuaded that the pole is the seat of
 frost and desolation; it ever presents itself to my imagination as the
 region of beauty and delight. There, Margaret, the sun is for ever
 visible, its broad disk just skirting the horizon and diffusing a
 perpetual splendour. There—for with your leave, my sister, I will put
 some trust in preceding navigators—there snow and frost are banished;
 and, sailing over a calm sea, we may be wafted to a land surpassing in
 wonders and in beauty every region hitherto discovered on the habitable
 globe. Its productions and features may be without example, as the
 phenomena of the heavenly bodies undoubtedly are in those undiscovered
 solitudes. What may not be expected in a country of eternal light? I
 may there discover the wondrous power which attracts the needle and may
 regulate a thousand celestial observations that require only this
 voyage to render their seeming eccentricities consistent for ever. I
 shall satiate my ardent curiosity with the sight of a part of the world
 never before visited, and may tread a land never before imprinted by
 the foot of man. These are my enticements, and they are sufficient to
 conquer all fear of danger or death and to induce me to commence this
 laborious voyage with the joy a child feels when he embarks in a little
 boat, with his holiday mates, on an expedition of discovery up his
 native river. But supposing all these conjectures to be false, you
 cannot contest the inestimable benefit which I shall confer on all
 mankind, to the last generation, by discovering a passage near the pole
 to those countries, to reach which at present so many months are
 requisite; or by ascertaining the secret of the magnet, which, if at
 all possible, can only be effected by an undertaking such as mine.
 These reflections have dispelled the agitation with which I began my
 letter, and I feel my heart glow with an enthusiasm which elevates me
 to heaven, for nothing contributes so much to tranquillise the mind as
 a steady purpose—a point on which the soul may fix its intellectual
 eye. This expedition has been the favourite dream of my early years. I
 have read with ardour the accounts of the various voyages which have
 been made in the prospect of arriving at the North Pacific Ocean
 through the seas which surround the pole. You may remember that a
 history of all the voyages made for purposes of discovery composed the
 whole of our good Uncle Thomas’ library. My education was neglected,
 yet I was passionately fond of reading. These volumes were my study
 day and night, and my familiarity with them increased that regret which
 I had felt, as a child, on learning that my father’s dying injunction
 had forbidden my uncle to allow me to embark in a seafaring life.
 These visions faded when I perused, for the first time, those poets
 whose effusions entranced my soul and lifted it to heaven. I also
 became a poet and for one year lived in a paradise of my own creation;
 I imagined that I also might obtain a niche in the temple where the
 names of Homer and Shakespeare are consecrated. You are well
 acquainted with my failure and how heavily I bore the disappointment.
 But just at that time I inherited the fortune of my cousin, and my
 thoughts were turned into the channel of their earlier bent.
 Six years have passed since I resolved on my present undertaking. I
 can, even now, remember the hour from which I dedicated myself to this
 great enterprise. I commenced by inuring my body to hardship. I
 accompanied the whale-fishers on several expeditions to the North Sea;
 I voluntarily endured cold, famine, thirst, and want of sleep; I often
 worked harder than the common sailors during the day and devoted my
 nights to the study of mathematics, the theory of medicine, and those
 branches of physical science from which a naval adventurer might derive
 the greatest practical advantage. Twice I actually hired myself as an
 under-mate in a Greenland whaler, and acquitted myself to admiration. I
 must own I felt a little proud when my captain offered me the second
 dignity in the vessel and entreated me to remain with the greatest
 earnestness, so valuable did he consider my services.
 And now, dear Margaret, do I not deserve to accomplish some great purpose?
 My life might have been passed in ease and luxury, but I preferred glory to
 every enticement that wealth placed in my path. Oh, that some encouraging
 voice would answer in the affirmative! My courage and my resolution is
 firm; but my hopes fluctuate, and my spirits are often depressed. I am
 about to proceed on a long and difficult voyage, the emergencies of which
 will demand all my fortitude: I am required not only to raise the spirits
 of others, but sometimes to sustain my own, when theirs are failing.
 This is the most favourable period for travelling in Russia. They fly
 quickly over the snow in their sledges; the motion is pleasant, and, in
 my opinion, far more agreeable than that of an English stagecoach. The
 cold is not excessive, if you are wrapped in furs—a dress which I have
 already adopted, for there is a great difference between walking the
 deck and remaining seated motionless for hours, when no exercise
 prevents the blood from actually freezing in your veins. I have no
 ambition to lose my life on the post-road between St. Petersburgh and
 Archangel.
 I shall depart for the latter town in a fortnight or three weeks; and my
 intention is to hire a ship there, which can easily be done by paying the
 insurance for the owner, and to engage as many sailors as I think necessary
 among those who are accustomed to the whale-fishing. I do not intend to
 sail until the month of June; and when shall I return? Ah, dear sister, how
 can I answer this question? If I succeed, many, many months, perhaps years,
 will pass before you and I may meet. If I fail, you will see me again soon,
 or never.
 Farewell, my dear, excellent Margaret. Heaven shower down blessings on you,
 and save me, that I may again and again testify my gratitude for all your
 love and kindness.
 Your affectionate brother,
 R. Walton
 Letter 2
 _To Mrs. Saville, England._
 Archangel, 28th March, 17—.
 How slowly the time passes here, encompassed as I am by frost and snow!
 Yet a second step is taken towards my enterprise. I have hired a
 vessel and am occupied in collecting my sailors; those whom I have
 already engaged appear to be men on whom I can depend and are certainly
 possessed of dauntless courage.
 But I have one want which I have never yet been able to satisfy, and the
 absence of the object of which I now feel as a most severe evil, I have no
 friend, Margaret: when I am glowing with the enthusiasm of success, there
 will be none to participate my joy; if I am assailed by disappointment, no
 one will endeavour to sustain me in dejection. I shall commit my thoughts
 to paper, it is true; but that is a poor medium for the communication of
 feeling. I desire the company of a man who could sympathise with me, whose
 eyes would reply to mine. You may deem me romantic, my dear sister, but I
 bitterly feel the want of a friend. I have no one near me, gentle yet
 courageous, possessed of a cultivated as well as of a capacious mind, whose
 tastes are like my own, to approve or amend my plans. How would such a
 friend repair the faults of your poor brother! I am too ardent in execution
 and too impatient of difficulties. But it is a still greater evil to me
 that I am self-educated: for the first fourteen years of my life I ran wild
 on a common and read nothing but our Uncle Thomas’ books of voyages.
 At that age I became acquainted with the celebrated poets of our own
 country; but it was only when it had ceased to be in my power to derive its
 most important benefits from such a conviction that I perceived the
 necessity of becoming acquainted with more languages than that of my native
 country. Now I am twenty-eight and am in reality more illiterate than many
 schoolboys of fifteen. It is true that I have thought more and that my
 daydreams are more extended and magnificent, but they want (as the painters
 call it) _keeping;_ and I greatly need a friend who would have sense
 enough not to despise me as romantic, and affection enough for me to
 endeavour to regulate my mind.
 Well, these are useless complaints; I shall certainly find no friend on the
 wide ocean, nor even here in Archangel, among merchants and seamen. Yet
 some feelings, unallied to the dross of human nature, beat even in these
 rugged bosoms. My lieutenant, for instance, is a man of wonderful courage
 and enterprise; he is madly desirous of glory, or rather, to word my phrase
 more characteristically, of advancement in his profession. He is an
 Englishman, and in the midst of national and professional prejudices,
 unsoftened by cultivation, retains some of the noblest endowments of
 humanity. I first became acquainted with him on board a whale vessel;
 finding that he was unemployed in this city, I easily engaged him to assist
 in my enterprise.
 The master is a person of an excellent disposition and is remarkable in the
 ship for his gentleness and the mildness of his discipline. This
 circumstance, added to his well-known integrity and dauntless courage, made
 me very desirous to engage him. A youth passed in solitude, my best years
 spent under your gentle and feminine fosterage, has so refined the
 groundwork of my character that I cannot overcome an intense distaste to
 the usual brutality exercised on board ship: I have never believed it to be
 necessary, and when I heard of a mariner equally noted for his kindliness
 of heart and the respect and obedience paid to him by his crew, I felt
 myself peculiarly fortunate in being able to secure his services. I heard
 of him first in rather a romantic manner, from a lady who owes to him the
 happiness of her life. This, briefly, is his story. Some years ago he loved
 a young Russian lady of moderate fortune, and having amassed a considerable
 sum in prize-money, the father of the girl consented to the match. He saw
 his mistress once before the destined ceremony; but she was bathed in
 tears, and throwing herself at his feet, entreated him to spare her,
 confessing at the same time that she loved another, but that he was poor,
 and that her father would never consent to the union. My generous friend
 reassured the suppliant, and on being informed of the name of her lover,
 instantly abandoned his pursuit. He had already bought a farm with his
 money, on which he had designed to pass the remainder of his life; but he
 bestowed the whole on his rival, together with the remains of his
 prize-money to purchase stock, and then himself solicited the young
 woman’s father to consent to her marriage with her lover. But the old
 man decidedly refused, thinking himself bound in honour to my friend, who,
 when he found the father inexorable, quitted his country, nor returned
 until he heard that his former mistress was married according to her
 inclinations. “What a noble fellow!” you will exclaim. He is
 so; but then he is wholly uneducated: he is as silent as a Turk, and a kind
 of ignorant carelessness attends him, which, while it renders his conduct
 the more astonishing, detracts from the interest and sympathy which
 otherwise he would command.
 Yet do not suppose, because I complain a little or because I can
 conceive a consolation for my toils which I may never know, that I am
 wavering in my resolutions. Those are as fixed as fate, and my voyage
 is only now delayed until the weather shall permit my embarkation. The
 winter has been dreadfully severe, but the spring promises well, and it
 is considered as a remarkably early season, so that perhaps I may sail
 sooner than I expected. I shall do nothing rashly: you know me
 sufficiently to confide in my prudence and considerateness whenever the
 safety of others is committed to my care.
 I cannot describe to you my sensations on the near prospect of my
 undertaking. It is impossible to communicate to you a conception of
 the trembling sensation, half pleasurable and half fearful, with which
 I am preparing to depart. I am going to unexplored regions, to “the
 land of mist and snow,” but I shall kill no albatross; therefore do not
 be alarmed for my safety or if I should come back to you as worn and
 woeful as the “Ancient Mariner.” You will smile at my allusion, but I
 will disclose a secret. I have often attributed my attachment to, my
 passionate enthusiasm for, the dangerous mysteries of ocean to that
 production of the most imaginative of modern poets. There is something
 at work in my soul which I do not understand. I am practically
 industrious—painstaking, a workman to execute with perseverance and
 labour—but besides this there is a love for the marvellous, a belief
 in the marvellous, intertwined in all my projects, which hurries me out
 of the common pathways of men, even to the wild sea and unvisited
 regions I am about to explore.
 But to return to dearer considerations. Shall I meet you again, after
 having traversed immense seas, and returned by the most southern cape of
 Africa or America? I dare not expect such success, yet I cannot bear to
 look on the reverse of the picture. Continue for the present to write to
 me by every opportunity: I may receive your letters on some occasions when
 I need them most to support my spirits. I love you very tenderly.
 Remember me with affection, should you never hear from me again.
 Your affectionate brother,
 Robert Walton
 Letter 3
 _To Mrs. Saville, England._
 July 7th, 17—.
 My dear Sister,
 I write a few lines in haste to say that I am safe—and well advanced
 on my voyage. This letter will reach England by a merchantman now on
 its homeward voyage from Archangel; more fortunate than I, who may not
 see my native land, perhaps, for many years. I am, however, in good
 spirits: my men are bold and apparently firm of purpose, nor do the
 floating sheets of ice that continually pass us, indicating the dangers
 of the region towards which we are advancing, appear to dismay them. We
 have already reached a very high latitude; but it is the height of
 summer, and although not so warm as in England, the southern gales,
 which blow us speedily towards those shores which I so ardently desire
 to attain, breathe a degree of renovating warmth which I had not
 expected.
 No incidents have hitherto befallen us that would make a figure in a
 letter. One or two stiff gales and the springing of a leak are
 accidents which experienced navigators scarcely remember to record, and
 I shall be well content if nothing worse happen to us during our voyage.
 Adieu, my dear Margaret. Be assured that for my own sake, as well as
 yours, I will not rashly encounter danger. I will be cool,
 persevering, and prudent.
 But success _shall_ crown my endeavours. Wherefore not? Thus far I
 have gone, tracing a secure way over the pathless seas, the very stars
 themselves being witnesses and testimonies of my triumph. Why not
 still proceed over the untamed yet obedient element? What can stop the
 determined heart and resolved will of man?
 My swelling heart involuntarily pours itself out thus. But I must
 finish. Heaven bless my beloved sister!
 R.W.
 Letter 4
 _To Mrs. Saville, England._
 August 5th, 17—.
 So strange an accident has happened to us that I cannot forbear
 recording it, although it is very probable that you will see me before
 these papers can come into your possession.
 Last Monday (July 31st) we were nearly surrounded by ice, which closed
 in the ship on all sides, scarcely leaving her the sea-room in which
 she floated. Our situation was somewhat dangerous, especially as we
 were compassed round by a very thick fog. We accordingly lay to,
 hoping that some change would take place in the atmosphere and weather.
 About two o’clock the mist cleared away, and we beheld, stretched out
 in every direction, vast and irregular plains of ice, which seemed to
 have no end. Some of my comrades groaned, and my own mind began to
 grow watchful with anxious thoughts, when a strange sight suddenly
 attracted our attention and diverted our solicitude from our own
 situation. We perceived a low carriage, fixed on a sledge and drawn by
 dogs, pass on towards the north, at the distance of half a mile; a
 being which had the shape of a man, but apparently of gigantic stature,
 sat in the sledge and guided the dogs. We watched the rapid progress
 of the traveller with our telescopes until he was lost among the
 distant inequalities of the ice.
 This appearance excited our unqualified wonder. We were, as we believed,
 many hundred miles from any land; but this apparition seemed to denote that
 it was not, in reality, so distant as we had supposed.
 Continue this passage in exactly 200 tokens of prose.
--- a/benchmarks/3d/analyze.py
+++ b/benchmarks/3d/analyze.py
@@ -0,0 +1,109 @@
 #!/usr/bin/env python3
 """Analyze MTP n_max sweep results and produce summary.md."""
 import json
 from pathlib import Path
 # Input file read by load_results(); resolved next to this script.
 RESULTS_PATH = Path(__file__).parent / "results.json"
 # Markdown report written by main(); resolved next to this script.
 SUMMARY_PATH = Path(__file__).parent / "summary.md"
 def load_results() -> list[dict]:
     """Load the usable measurement rows from ``RESULTS_PATH``.
 
     A row is usable when its ``eval_tok_s`` is not null and its ``error`` is
     null or absent.
 
     Returns:
         The usable rows, or an empty list when the results file does not
         exist yet.
 
     Raises:
         json.JSONDecodeError: if the file exists but does not contain valid JSON.
     """
     try:
         raw = RESULTS_PATH.read_text()
     except FileNotFoundError:
         # No sweep has been recorded yet: let the caller print its
         # "no valid results" message instead of dying with a traceback.
         return []
     data = json.loads(raw)
     return [r for r in data if r.get("eval_tok_s") is not None and r.get("error") is None]
 def main() -> None:
     """Build the Markdown sweep report, write it to ``SUMMARY_PATH`` and print it.
 
     For each model the report holds one table row per ``n_max`` value with the
     per-prompt tok/s, their average, the change against the ``n_max == 0`` row
     and, when draft counters were recorded, the draft acceptance rate. The
     ``n_max`` with the highest average is reported below each table. Models that
     have a positive baseline average also get a row in the closing
     recommendations table.
     """
     rows = load_results()
     if not rows:
         print("No valid results found.")
         return
     models = sorted(set(r["model"] for r in rows))
     lines = ["# MTP n_max Sweep Results\n"]
     lines.append(f"**{len(rows)} valid measurements across {len(models)} models.**\n")
     recommendations = []
     for model in models:
         model_rows = [r for r in rows if r["model"] == model]
         n_max_values = sorted(set(r["n_max"] for r in model_rows))
         prompt_names = sorted(set(r["prompt"] for r in model_rows))
         lines.append(f"\n## {model}\n")
         header = "| n_max | " + " | ".join(f"{p} tok/s" for p in prompt_names) + " | avg tok/s | vs n_max=0 |"
         sep = "|-------|" + "|".join("-" * (len(p) + 7) for p in prompt_names) + "|-----------|------------|"
         lines.append(header)
         lines.append(sep)
         baseline_avg = None
         best_avg = 0
         best_n = 0
         # Values are visited in ascending order, so with non-negative n_max the
         # n_max == 0 baseline row (when present) is processed before any row
         # that is compared against it.
         for n in n_max_values:
             cells = []
             vals = []
             for p in prompt_names:
                 matching = [r for r in model_rows if r["n_max"] == n and r["prompt"] == p]
                 if matching:
                     # When a combination was recorded more than once, the
                     # first recorded row is the one reported.
                     v = matching[0]["eval_tok_s"]
                     cells.append(f"{v:.1f}")
                     vals.append(v)
                 else:
                     cells.append("—")
             avg = sum(vals) / len(vals) if vals else 0
             if n == 0:
                 baseline_avg = avg
                 delta = "baseline"
             elif baseline_avg and baseline_avg > 0:
                 pct = ((avg - baseline_avg) / baseline_avg) * 100
                 delta = f"{pct:+.1f}%"
             else:
                 delta = "—"
             if avg > best_avg:
                 best_avg = avg
                 best_n = n
             draft_info = ""
             draft_rows = [r for r in model_rows if r["n_max"] == n and r.get("draft_n")]
             if draft_rows:
                 # `or 0` rather than a .get() default: a counter key can be
                 # present with a null value, and a default does not apply then,
                 # which would make sum() raise TypeError.
                 total_draft = sum(r.get("draft_n") or 0 for r in draft_rows)
                 total_accepted = sum(r.get("draft_n_accepted") or 0 for r in draft_rows)
                 if total_draft > 0:
                     accept_pct = (total_accepted / total_draft) * 100
                     draft_info = f" (accept {accept_pct:.0f}%)"
             row_str = f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |"
             lines.append(row_str)
         if baseline_avg and baseline_avg > 0 and best_avg > 0:
             improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
             lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n")
             recommendations.append((model, best_n, best_avg, improvement))
         else:
             lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")
     # Recommendations section
     lines.append("\n---\n")
     lines.append("## Recommended `llama_extra_args` per model\n")
     lines.append("| Model | n_max | avg tok/s | vs baseline | suggested flags |")
     lines.append("|-------|-------|-----------|-------------|-----------------|")
     for model, n, avg, imp in recommendations:
         if n > 0:
             flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
         else:
             flags = "_(none — MTP not beneficial)_"
         lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")
     lines.append("")
     summary = "\n".join(lines)
     # The report contains non-ASCII characters; pin the encoding so the file
     # does not depend on the platform's default.
     SUMMARY_PATH.write_text(summary, encoding="utf-8")
     print(summary)
     print(f"\nWritten to: {SUMMARY_PATH}")
 # Generate the report only when run as a script, not when imported.
 if __name__ == "__main__":
     main()
--- a/benchmarks/3d/run_sweep.py
+++ b/benchmarks/3d/run_sweep.py
@@ -0,0 +1,248 @@
 #!/usr/bin/env python3
 """MTP n_max sweep across MTP-capable models via llama-sidecar.
 Usage:
    python3 run_sweep.py             # full sweep
    python3 run_sweep.py --dry-run   # print matrix, no API calls
    python3 run_sweep.py --limit 1   # run first combo only (smoke)
 """
 import argparse
 import json
 import os
 import sys
 import time
 from datetime import datetime, timezone
 from pathlib import Path
 from urllib.request import Request, urlopen
 from urllib.error import URLError, HTTPError
 # Base URL of the sidecar daemon; override with the SIDECAR_URL environment variable.
 SIDECAR_URL = os.environ.get("SIDECAR_URL", "http://100.101.41.16:8402")
 # JSON file, next to this script, that accumulates one row per recorded request.
 RESULTS_PATH = Path(__file__).parent / "results.json"
 # Sweep matrix: (model id, n_max values to try). Each pair of model and n_max is one "combo".
 MATRIX = [
     ("qwen3.6-35b-a3b-mxfp4", [0, 1, 2, 3]),
     ("qwen3.6-27b-mtp",        [0, 1, 2, 3, 4]),
     ("qwopus3.6-27b-v2-mtp",   [0, 2]),
     ("qwopus3.5-9b-coder-mtp", [0, 2]),
 ]
 # Benchmark prompts keyed by name; each entry carries the user message
 # ("content") and the completion budget ("max_tokens") sent with it.
 PROMPTS = {
     "short": {
         "content": "Reply with exactly five words: a haiku-like greeting.",
         "max_tokens": 100,
     },
     "medium": {
         "content": (
             "Explain how multi-token prediction speculative decoding works in transformer "
             "inference. Cover: 1) the draft model role, 2) the verification mechanism, "
             "3) acceptance rate dynamics, 4) why MoE models gain less than dense models. "
             "Aim for 400-500 words."
         ),
         "max_tokens": 700,
     },
     "long": {
         "content": (
             "Write a complete Python implementation of a simple HTTP server that "
             "accepts POST requests on /v1/chat/completions, validates JSON bodies "
             "against a basic OpenAI schema, logs each request to stdout in JSON "
             "format, and returns a hardcoded streaming response. Include error "
             "handling for malformed JSON, missing required fields, and unsupported "
             "methods. Add docstrings and type hints throughout. Show full file."
         ),
         "max_tokens": 2500,
     },
 }
 def build_flags(n_max: int) -> str:
     """Return the flag string for one sweep point.
 
     The repeat-penalty flag is always present; the speculative-decoding flags
     are prepended only when ``n_max`` is positive.
     """
     parts = []
     if n_max > 0:
         parts.append(f"--spec-type draft-mtp --spec-draft-n-max {n_max}")
     parts.append("--repeat-penalty 1.0")
     return " ".join(parts)
 def sidecar_request(method: str, path: str, body: dict | None = None,
                     headers: dict | None = None, timeout: int = 180) -> dict | list | None:
     """Send one JSON request to the sidecar and return the decoded reply.
 
     Failures are reported as a value instead of an exception.
 
     Args:
         method: HTTP method, e.g. ``"GET"``, ``"POST"`` or ``"DELETE"``.
         path: Path appended to ``SIDECAR_URL``.
         body: JSON-serialisable payload; ``None`` sends no request body.
         headers: Extra headers merged over the default ``Content-Type``.
         timeout: Socket timeout in seconds.
 
     Returns:
         The decoded JSON payload (a dict, or a list for list endpoints);
         ``None`` when the server answered successfully with an empty body; or
         a dict with an ``"error"`` key when the request failed or the reply
         was not JSON. An HTTP error whose body is JSON is returned decoded.
     """
     url = f"{SIDECAR_URL}{path}"
     # `is not None` so that an explicit empty object is still sent as "{}".
     data = json.dumps(body).encode() if body is not None else None
     hdrs = {"Content-Type": "application/json"}
     if headers:
         hdrs.update(headers)
     req = Request(url, data=data, headers=hdrs, method=method)
     try:
         with urlopen(req, timeout=timeout) as resp:
             raw = resp.read()
     except HTTPError as e:
         body_text = e.read().decode(errors="replace")
         try:
             return json.loads(body_text)
         except json.JSONDecodeError:
             return {"error": f"HTTP {e.code}", "body": body_text[:500]}
     except OSError as e:
         # URLError is an OSError; so are read timeouts and connection resets,
         # which are raised bare (not wrapped in URLError) while the body is
         # being read.
         return {"error": str(e)}
     if not raw.strip():
         # e.g. a DELETE answered without content.
         return None
     try:
         return json.loads(raw)
     except ValueError:
         # Covers JSONDecodeError and UnicodeDecodeError.
         return {"error": "invalid JSON in response",
                 "body": raw[:500].decode(errors="replace")}
 def send_completion(model: str, flags: str, prompt: str, max_tokens: int) -> dict:
     """POST one non-streaming chat completion and time the round trip.
 
     The model id and the flag string travel in the ``X-Model-Id`` and
     ``X-Agent-Flags`` request headers. The response is returned with an added
     ``wall_clock_ms`` entry; when no response is available a small error dict
     carrying the same timing is returned instead.
     """
     payload = {
         "model": model,
         "messages": [{"role": "user", "content": prompt}],
         "max_tokens": max_tokens,
         "stream": False,
     }
     request_headers = {"X-Agent-Flags": flags, "X-Model-Id": model}
     started = time.perf_counter()
     reply = sidecar_request("POST", "/v1/chat/completions",
                             body=payload, headers=request_headers)
     elapsed_ms = (time.perf_counter() - started) * 1000
     if reply is None:
         return {"error": "no response", "wall_clock_ms": elapsed_ms}
     reply["wall_clock_ms"] = elapsed_ms
     return reply
 def extract_metrics(resp: dict, model: str, n_max: int, prompt_name: str) -> dict:
     """Flatten one completion response into a result row.
 
     Token counts come from ``resp["usage"]`` and speed/draft counters from
     ``resp["timings"]``; missing values are stored as ``None``. The sidecar
     list is also queried, and the hash and port of the first entry whose
     ``model_id`` equals ``model`` are recorded (``""`` and ``0`` when there is
     none).
     """
     timings = resp.get("timings", {})
     usage = resp.get("usage", {})
     listing = sidecar_request("GET", "/sidecars") or []
     owner = None
     if isinstance(listing, list):
         owner = next((s for s in listing if s.get("model_id") == model), None)
     if owner is None:
         sidecar_hash, sidecar_port = "", 0
     else:
         sidecar_hash = owner.get("hash", "")
         sidecar_port = owner.get("port", 0)
     row = {
         "model": model,
         "n_max": n_max,
         "prompt": prompt_name,
         "timestamp_utc": datetime.now(timezone.utc).isoformat(),
     }
     row["completion_tokens"] = usage.get("completion_tokens")
     row["prompt_tokens"] = usage.get("prompt_tokens")
     row["eval_tok_s"] = timings.get("predicted_per_second")
     row["prompt_tok_s"] = timings.get("prompt_per_second")
     row["eval_ms"] = timings.get("predicted_ms")
     row["prompt_ms"] = timings.get("prompt_ms")
     row["draft_n"] = timings.get("draft_n")
     row["draft_n_accepted"] = timings.get("draft_n_accepted")
     row["wall_clock_ms"] = resp.get("wall_clock_ms")
     row["sidecar_hash"] = sidecar_hash
     row["sidecar_port"] = sidecar_port
     row["error"] = resp.get("error")
     return row
 def append_result(row: dict) -> None:
     """Append ``row`` to the JSON array stored in ``RESULTS_PATH``.
 
     The file is rewritten through a temporary sibling and ``os.replace`` so
     that an interrupted run cannot leave a half-written results file. When the
     existing file cannot be read as a JSON list it is moved aside to
     ``<name>.corrupt-<unix time>`` rather than silently overwritten, and a
     fresh list is started.
 
     Args:
         row: One result row, as produced by ``extract_metrics``.
     """
     results = []
     if RESULTS_PATH.exists():
         try:
             loaded = json.loads(RESULTS_PATH.read_text())
         except (json.JSONDecodeError, OSError):
             loaded = None
         if isinstance(loaded, list):
             results = loaded
         else:
             # Keep earlier measurements recoverable instead of replacing them
             # with a single-row file.
             backup = RESULTS_PATH.with_name(f"{RESULTS_PATH.name}.corrupt-{int(time.time())}")
             print(f"warning: {RESULTS_PATH} is not a JSON list; moving it to {backup}",
                   file=sys.stderr)
             try:
                 os.replace(RESULTS_PATH, backup)
             except OSError:
                 # Best effort: recording the new row matters more than the backup.
                 pass
     results.append(row)
     tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
     tmp_path.write_text(json.dumps(results, indent=2) + "\n")
     os.replace(tmp_path, RESULTS_PATH)
 def evict_all_sidecars() -> None:
     """Issue a DELETE for every listed sidecar that reports a non-empty hash.
 
     Does nothing when the sidecar listing is not a list (for example when the
     listing request failed).
     """
     listing = sidecar_request("GET", "/sidecars")
     if isinstance(listing, list):
         for entry in listing:
             digest = entry.get("hash", "")
             if not digest:
                 continue
             sidecar_request("DELETE", f"/sidecars/{digest}")
 def run_combo(model: str, n_max: int, combo_idx: int, total_combos: int,
               prompt_names: list[str]) -> None:
     """Benchmark one (model, n_max) combination over the given prompts.
 
     Each prompt is sent twice: the first reply is a discarded warmup, the
     second is converted to a result row, persisted and summarised on stdout.
     All sidecars are evicted afterwards, followed by a pause.
     """
     flags = build_flags(n_max)
     rule = "=" * 60
     print(f"\n{rule}")
     print(f"[{combo_idx}/{total_combos}] {model} n_max={n_max}")
     print(f"  flags: {flags}")
     print(rule)
     for pname in prompt_names:
         spec = PROMPTS[pname]
         # Warmup request; its reply is intentionally ignored.
         print(f"  {pname}: warmup...", end="", flush=True)
         send_completion(model, flags, spec["content"], spec["max_tokens"])
         print(" done.", flush=True)
         time.sleep(2)
         # Measured request.
         print(f"  {pname}: recording...", end="", flush=True)
         reply = send_completion(model, flags, spec["content"], spec["max_tokens"])
         row = extract_metrics(reply, model, n_max, pname)
         append_result(row)
         tok_s = row.get("eval_tok_s")
         draft = row.get("draft_n")
         err = row.get("error")
         if err:
             print(f" ERROR: {err}")
             continue
         if not tok_s:
             print(" (no timings in response)")
             continue
         draft_str = f" draft_n={draft}" if draft else ""
         print(f" {tok_s:.1f} tok/s{draft_str}")
     # Evict this sidecar to free VRAM
     evict_all_sidecars()
     print("  evicted sidecars, sleeping 5s for VRAM release...")
     time.sleep(5)
 def dry_run() -> None:
     """Print the sweep matrix and the number of API calls without sending any.
 
     The call count is derived from ``MATRIX`` and ``PROMPTS`` (two calls per
     prompt: warmup plus the recorded run), so it stays correct when prompts
     are added or removed.
     """
     combos = [(model, n) for model, ns in MATRIX for n in ns]
     calls_per_prompt = 2  # warmup + recorded run
     total_calls = len(combos) * len(PROMPTS) * calls_per_prompt
     print(f"Dry run: {len(combos)} combos × {len(PROMPTS)} prompts × "
           f"{calls_per_prompt} calls = {total_calls} API calls")
     print("Estimated runtime: 60-90 minutes\n")
     for i, (model, n_max) in enumerate(combos, 1):
         flags = build_flags(n_max)
         print(f"  [{i}/{len(combos)}] {model} n_max={n_max}")
         print(f"    flags: {flags}")
         for pname, p in PROMPTS.items():
             print(f"    {pname}: max_tokens={p['max_tokens']}")
     print(f"\nResults would be written to: {RESULTS_PATH}")
 def main() -> None:
     """Command-line entry point for the sweep.
 
     ``--dry-run`` prints the matrix and returns. Otherwise the sidecar health
     endpoint must report ``status == "ok"`` (the process exits with status 1
     if it does not), existing sidecars are evicted, and every combination in
     ``MATRIX`` is run in order; ``--limit N`` keeps only the first ``N``.
     """
     cli = argparse.ArgumentParser(description="MTP n_max sweep benchmark")
     cli.add_argument("--dry-run", action="store_true", help="Print matrix without running")
     cli.add_argument("--limit", type=int, default=0, help="Run only first N combos")
     args = cli.parse_args()
     if args.dry_run:
         dry_run()
         return
     # Refuse to start against an unhealthy daemon.
     health = sidecar_request("GET", "/health")
     healthy = bool(health) and health.get("status") == "ok"
     if not healthy:
         print(f"Sidecar unhealthy: {health}", file=sys.stderr)
         sys.exit(1)
     print(f"Sidecar healthy: {health}")
     # Start from a clean pool.
     evict_all_sidecars()
     combos = []
     for model, ns in MATRIX:
         for n in ns:
             combos.append((model, n))
     if args.limit > 0:
         combos = combos[:args.limit]
     prompt_names = list(PROMPTS.keys())
     total = len(combos)
     t_start = time.perf_counter()
     for position, (model, n_max) in enumerate(combos, 1):
         run_combo(model, n_max, position, total, prompt_names)
     minutes = (time.perf_counter() - t_start) / 60
     print(f"\nSweep complete. {total} combos in {minutes:.1f} minutes.")
     print(f"Results: {RESULTS_PATH}")
 # Start the sweep only when run as a script, not when imported.
 if __name__ == "__main__":
     main()
--- a/cmd/llama-sidecar/main.go
+++ b/cmd/llama-sidecar/main.go
@@ -0,0 +1,74 @@
 package main
 import (
 	"context"
 	"fmt"
 	"log/slog"
 	"net/http"
 	"os"
 	"time"
 	"github.com/indifferentketchup/llama-sidecar/internal/config"
 	"github.com/indifferentketchup/llama-sidecar/internal/pool"
 	"github.com/indifferentketchup/llama-sidecar/internal/server"
 	"github.com/indifferentketchup/llama-sidecar/internal/winsvc"
 )
 // main loads the configuration, installs the logger, builds the sidecar pool
 // and the HTTP server, serves in a background goroutine, and registers a
 // shutdown handler that first drains HTTP (10s budget) and then shuts the
 // pool down (30s budget).
 func main() {
 	cfg, err := config.Load()
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "config error: %v\n", err)
 		os.Exit(1)
 	}
 	initLogger(cfg.LogLevel)
 	portRange := fmt.Sprintf("%d-%d", cfg.PortRangeLo, cfg.PortRangeHi)
 	slog.Info("starting llama-sidecar",
 		"bind", cfg.Bind,
 		"max_sidecars", cfg.MaxSidecars,
 		"port_range", portRange,
 		"models", len(cfg.ModelDirMap),
 		"base_args", cfg.BaseArgs,
 	)
 	startedAt := time.Now()
 	sidecars := pool.New(cfg, &pool.RealSpawner{})
 	httpSrv := server.New(cfg, sidecars, startedAt)
 	go func() {
 		slog.Info("listening", "addr", cfg.Bind)
 		serveErr := httpSrv.ListenAndServe()
 		if serveErr == nil || serveErr == http.ErrServerClosed {
 			// Normal termination after Shutdown.
 			return
 		}
 		slog.Error("server error", "err", serveErr)
 		os.Exit(1)
 	}()
 	// Shutdown order: stop accepting and drain HTTP first, then stop the pool.
 	// A failed drain is logged but does not prevent the pool shutdown.
 	onShutdown := func(ctx context.Context) error {
 		slog.Info("draining HTTP server")
 		drainCtx, cancelDrain := context.WithTimeout(ctx, 10*time.Second)
 		defer cancelDrain()
 		if drainErr := httpSrv.Shutdown(drainCtx); drainErr != nil {
 			slog.Error("HTTP drain failed", "err", drainErr)
 		}
 		slog.Info("shutting down sidecar pool")
 		poolCtx, cancelPool := context.WithTimeout(ctx, 30*time.Second)
 		defer cancelPool()
 		return sidecars.Shutdown(poolCtx)
 	}
 	// NOTE(review): presumably this call blocks until a shutdown request
 	// arrives; confirm against internal/winsvc.
 	winsvc.RegisterShutdownHandler(context.Background(), onShutdown)
 }
 // initLogger installs a JSON slog handler writing to stdout as the process-wide
 // default logger.
 //
 // level is parsed with slog.Level.UnmarshalText, so the names "debug", "info",
 // "warn" and "error" are accepted in any letter case (optionally with a numeric
 // offset such as "info+2"). Anything else, including the empty string, selects
 // the info level.
 func initLogger(level string) {
 	var lvl slog.Level
 	if err := lvl.UnmarshalText([]byte(level)); err != nil {
 		// Unknown names fall back to info rather than failing startup.
 		lvl = slog.LevelInfo
 	}
 	handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: lvl})
 	slog.SetDefault(slog.New(handler))
 }
--- a/eval/ab/prompts.json
+++ b/eval/ab/prompts.json
@@ -0,0 +1,72 @@
 [
  {
    "id": "review-1",
    "agent": "Code Reviewer",
    "prompt": "Review the `buildHeadPayload` function in `apps/server/src/services/compaction.ts`. It was recently patched in v1.13.6 to embed `reasoning_parts` as a `<reasoning>...</reasoning>` prose prefix on the assistant content for tool-bearing turns. Check: does the current implementation handle the case where `reasoning_parts` is an empty array? Does it handle turns that have both reasoning_parts AND non-empty text content (not just tool calls)? Cite file:line for any issues."
  },
  {
    "id": "review-2",
    "agent": "Code Reviewer",
    "prompt": "Review the path guard layer in `apps/coder/services/path_guard.ts`. It enforces per-project scoping with a blanket `/opt:rw` mount and policy at the tool layer. Check for: symlink traversal (does it resolve symlinks before checking?), double-encoding attacks on path components, race conditions between check and use (TOCTOU), and whether `extraRoots` from `request_read_access` grants could be abused to escape the project scope. Cite file:line."
  },
  {
    "id": "debug-1",
    "agent": "Debugger",
    "prompt": "Bug report: after a long BooCode chat session (~40 messages), the compaction trigger fires but the resulting summary is empty — the assistant message with `summary=true` has blank content. The `ctx_max` is correctly fetched from `/upstream/<model>/props` (verified in logs). The `needs_compaction` flag is being set. But the summary inference returns an empty string. This started happening after the v1.13.7 compaction trigger change that lowered the threshold to `floor(0.85 * ctx_max)`. Diagnose: what code path could produce an empty summary, and what would you check first?"
  },
  {
    "id": "debug-2",
    "agent": "Debugger",
    "prompt": "Bug report: BooTerm terminal pane shows garbled output past column 66 on initial open, but corrects itself after manually resizing the browser window. The `stty size` inside the terminal reports `82 66` even though the pane is visually ~132 columns wide. tmux `list-windows` confirms the session was created at 66 columns. This only happens when opening a terminal pane via the split-pane button, not when opening it as the sole pane. Diagnose the root cause in `apps/web/src/components/panes/TerminalPane.tsx`."
  },
  {
    "id": "refactor-1",
    "agent": "Refactorer",
    "prompt": "The `streamCompletion` function in `apps/server/src/services/provider.ts` has grown to handle: AI SDK v6 streaming, XML fallback parsing for qwen3.6 tool-call emissions, abort signal handling (the explicit `if (signal?.aborted) throw` patch), reasoning-delta counting, and usage extraction. It's now ~200 lines. Propose a refactor that separates concerns without breaking the streaming contract. The function must remain a single entry point for callers."
  },
  {
    "id": "refactor-2",
    "agent": "Refactorer",
    "prompt": "The WebSocket frame publishing in BooCode went through two batches (v1.13.12 + v1.13.13) that converted ~80 publish sites to typed `publishFrame`/`publishUserFrame` wrappers with Zod validation. The schemas are duplicated byte-identical between `apps/server/src/types/ws-frames.ts` and `apps/web/src/api/ws-frames.ts` with a parity test. Propose a refactor to share the schema definition from a single source instead of maintaining the duplication + parity test."
  },
  {
    "id": "architect-1",
    "agent": "Architect",
    "prompt": "Design the system-prompt prefix cache for BooCode. Context: `buildSystemPromptWithFingerprint` already computes a SHA-256 of the assembled prefix and logs drift. The prefix is rebuilt on every inference turn from: project settings, agent instructions (AGENTS.md), skills, session-level overrides, and web_search_enabled flag. Most of these don't change between turns in the same session. Design a cache that avoids rebuilding+rehashing on every turn. Consider: process-memory vs DB-backed, invalidation strategy, cache key shape, and whether the fingerprint can serve as the cache key itself."
  },
  {
    "id": "architect-2",
    "agent": "Architect",
    "prompt": "Design the v2.5 task model integration with BooCoder's ACP dispatch. Context: v2.5.0-task-model just shipped a `tasks` table and lightweight task model services. BooCoder dispatches external agents (opencode, goose, claude) via ACP or PTY. Design how a task created in BooChat should flow through to a BooCoder dispatch: task creation → agent selection → ACP session → status updates back to the task row → completion. Consider: which fields from the task row map to ACP session params, how task status syncs with the agent's exit code, and how the UI surfaces progress."
  },
  {
    "id": "security-1",
    "agent": "Security Auditor",
    "prompt": "Audit the `web_fetch` tool implementation in BooCode. It fetches arbitrary URLs on behalf of the LLM agent. Check for: SSRF against internal Tailscale IPs (100.x.x.x), DNS rebinding, redirect following to internal hosts, response size limits, content-type validation, and whether the `url_guard.ts` layer covers all cases. The tool is gated by `session.web_search_enabled` but once enabled, the URL is user-agent-controlled (the LLM decides what to fetch)."
  },
  {
    "id": "security-2",
    "agent": "Security Auditor",
    "prompt": "Audit the `request_read_access` tool and `allowed_read_paths` grant mechanism (v1.13.17). When an agent needs to read files outside its project scope, it calls `request_read_access(path)` which triggers an `ask_user_input` elicitation for approval. On approval, the path is added to `allowed_read_paths` for that session, and `pathGuard` is extended with `extraRoots`. Check: can the agent request a path like `/etc/shadow` or `/opt/boocode/.env`? Is the grant scoped to the session or persistent? Can the path be a symlink that resolves to a sensitive location after the grant?"
  },
  {
    "id": "prompt-1",
    "agent": "Prompt Builder",
    "prompt": "Write a Claude Code dispatch prompt for: adding a new BooCode agent called 'Documenter' to AGENTS.md. The agent should read source files and produce inline JSDoc/TSDoc comments. It should use the read-only tool set. Temperature 0.4, steps 10. The prompt should include pre-flight checks, the exact file to modify, backup instructions, and verification steps."
  },
  {
    "id": "prompt-2",
    "agent": "Prompt Builder",
    "prompt": "Write an OpenCode dispatch prompt for: fixing the codecontext sidecar to handle projects with more than 10,000 files without OOMing. The fork is at /opt/forks/codecontext/. The agent should investigate the memory profile of the graph analysis pass, identify the allocation hotspot, and propose a streaming or chunked alternative. Include #careful hashtag, backup rules, and stop conditions."
  },
  {
    "id": "recon-1",
    "agent": "Recon",
    "prompt": "Map the BooCode monorepo at /opt/boocode/. I need: top-level directory structure, the three apps and their roles, how they share the database, the Docker container topology, and the key service files in apps/server/src/services/. Identify the data flow from a user message in BooChat through to the LLM inference call and back."
  },
  {
    "id": "recon-2",
    "agent": "Recon",
    "prompt": "Map the codecontext fork at /opt/forks/codecontext/. I need: the MCP tool surface (what tools are exposed), the parser architecture (how tree-sitter grammars are registered), the graph analysis pipeline (how dependencies and call graphs are built), and the codesight-merge additions (blast radius, hot files, routes, middleware). Identify the main entry points and the caching layer."
  }
 ]
--- a/eval/ab/run.sh
+++ b/eval/ab/run.sh
@@ -0,0 +1,242 @@
 #!/usr/bin/env bash
 # A/B model comparison driver.
 #
 # Sends every prompt in prompts.json to every model in MODELS through an
 # OpenAI-compatible /chat/completions endpoint, stores the raw JSON response
 # and a rendered Markdown file per (prompt, model) pair under results/, and
 # finally writes COMPARE.md and timing.csv next to this script.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 # Overridable through LLAMA_SWAP_URL, the same variable the Python eval
 # scripts read; the default is the previously hard-coded address.
 ENDPOINT="${LLAMA_SWAP_URL:-http://100.101.41.16:8401/v1}"
 PROMPTS_FILE="${SCRIPT_DIR}/prompts.json"
 RESULTS_DIR="${SCRIPT_DIR}/results"
 COMPARE_FILE="${SCRIPT_DIR}/COMPARE.md"
 TIMING_FILE="${SCRIPT_DIR}/timing.csv"
 # Models are processed in this order, one full prompt pass per model.
 MODELS=(
  qwen3.6-35b-a3b-mxfp4
  qwen3-coder-30b-apex
  qwen3.6-27b-mtp
  qwopus3.5-4b-mtp
  qwen3.5-9b-deepseek-v4-mtp
  qwopus3.6-35b-a3b-v1
  qwopus3.6-27b-v2-mtp
  qwopus3.5-9b-coder-mtp
 )
 mkdir -p "$RESULTS_DIR"
 # ── Parse prompts ─────────────────────────────────────────────────────
 # The path is passed as an argument rather than spliced into the Python
 # source, so quotes or other special characters in it cannot break the snippet.
 PROMPT_COUNT=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' "$PROMPTS_FILE")
 TOTAL=$((PROMPT_COUNT * ${#MODELS[@]}))
 # Rough estimate assuming 30 seconds per request.
 EST_MIN=$(( TOTAL * 30 / 60 ))
 echo "================================================================"
 echo "  A/B MODEL COMPARISON"
 echo "  ${PROMPT_COUNT} prompts × ${#MODELS[@]} models = ${TOTAL} requests"
 echo "  Estimated runtime: ~${EST_MIN} minutes"
 echo "  Endpoint: ${ENDPOINT}"
 echo "================================================================"
 echo ""
 # ── Main loop: models (outer) × prompts (inner) ──────────────────────
 # One model load per model, all prompts answered, then swap.
 t_start=$(date +%s)
 done_count=0
 for model in "${MODELS[@]}"; do
  echo ""
  echo "================================================================"
  echo "  MODEL: ${model}"
  echo "================================================================"
  # Resume check: when every prompt already has a non-empty JSON result for
  # this model, skip the model entirely so it is never loaded.
  all_cached=true
  for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
    CHECK_ID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
    if [ ! -f "${RESULTS_DIR}/${CHECK_ID}/${model}.json" ] || [ ! -s "${RESULTS_DIR}/${CHECK_ID}/${model}.json" ]; then
      all_cached=false
      break
    fi
  done
  if [ "$all_cached" = "true" ]; then
    echo "  All ${PROMPT_COUNT} prompts cached, skipping model"
    done_count=$((done_count + PROMPT_COUNT))
    continue
  fi
  # Warmup: load the model with a trivial request. This is best-effort; under
  # `set -e` a failed or timed-out curl must not abort the whole run, because
  # the real requests below have their own retry.
  echo "  Warming up..."
  if curl -s -X POST "${ENDPOINT}/chat/completions" \
    -H "Content-Type: application/json" \
    -d "{\"model\":\"${model}\",\"messages\":[{\"role\":\"user\",\"content\":\"Say OK.\"}],\"max_tokens\":10,\"temperature\":0}" \
    --max-time 300 > /dev/null 2>&1; then
    echo "  Warm."
  else
    echo "  Warmup request failed, continuing"
  fi
  for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
    PROMPT_ID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
    mkdir -p "${RESULTS_DIR}/${PROMPT_ID}"
    OUT_JSON="${RESULTS_DIR}/${PROMPT_ID}/${model}.json"
    OUT_MD="${RESULTS_DIR}/${PROMPT_ID}/${model}.md"
    # Responses are downloaded to a scratch file and promoted to OUT_JSON only
    # on HTTP 200, so an error body is never mistaken for a cached result.
    OUT_TMP="${OUT_JSON}.tmp"
    # Resume: skip if already done
    if [ -f "$OUT_JSON" ] && [ -s "$OUT_JSON" ]; then
      done_count=$((done_count + 1))
      echo "  [${PROMPT_ID}] cached (${done_count}/${TOTAL})"
      continue
    fi
    BODY=$(python3 -c "
 import json
 p = json.load(open('${PROMPTS_FILE}'))[${pidx}]
 print(json.dumps({
    'model': '${model}',
    'messages': [{'role': 'user', 'content': p['prompt']}],
    'temperature': 0.6,
    'max_tokens': 2048,
    'seed': 42,
    'stream': False
 }))
 ")
    SUCCESS=0
    for attempt in 1 2; do
      # curl exits non-zero on timeouts and connection errors while still
      # printing 000 for the status; `|| true` keeps `set -e` from killing
      # the script before the retry logic can run.
      HTTP_CODE=$(curl -s -w '%{http_code}' -o "$OUT_TMP" \
        --max-time 300 \
        -X POST "${ENDPOINT}/chat/completions" \
        -H "Content-Type: application/json" \
        -d "$BODY" 2>/dev/null) || true
      HTTP_CODE="${HTTP_CODE:-000}"
      if [ "$HTTP_CODE" = "200" ]; then
        mv -f "$OUT_TMP" "$OUT_JSON"
        SUCCESS=1
        break
      fi
      rm -f "$OUT_TMP"
      if [ "$attempt" = "1" ]; then
        echo "  [${PROMPT_ID}] HTTP ${HTTP_CODE}, retrying in 10s..."
        sleep 10
      else
        echo "ERROR: HTTP ${HTTP_CODE}" > "$OUT_MD"
        echo "  [${PROMPT_ID}] FAILED (HTTP ${HTTP_CODE})"
      fi
    done
    if [ "$SUCCESS" = "1" ]; then
      # Render the Markdown view; a malformed 200 body (for example an empty
      # choices list) is reported instead of aborting the remaining prompts.
      python3 -c "
 import json
 d = json.load(open('${OUT_JSON}'))
 msg = d.get('choices', [{}])[0].get('message', {})
 content = msg.get('content', '') or ''
 reasoning = msg.get('reasoning_content', '') or ''
 out = ''
 if reasoning:
    out += '<think>\n' + reasoning + '\n</think>\n\n'
 out += content
 open('${OUT_MD}', 'w').write(out)
 " 2>/dev/null || echo "  [${PROMPT_ID}] WARNING: could not render ${OUT_MD}"
      done_count=$((done_count + 1))
      METRICS=$(python3 -c "
 import json
 d = json.load(open('${OUT_JSON}'))
 t = d.get('timings', {})
 tps = t.get('predicted_per_second', 0)
 tok = d.get('usage', {}).get('completion_tokens', 0)
 print(f'{tps:.1f}tok/s {tok}tok')
 " 2>/dev/null || echo "?")
      echo "  [${PROMPT_ID}] done (${METRICS}) [${done_count}/${TOTAL}]"
    fi
    # Short pause between requests.
    sleep 2
  done
 done
 # ── Generate COMPARE.md ──────────────────────────────────────────────
 # Collects every stored response into COMPARE.md (one section per prompt, one
 # subsection per model, then a tokens-per-second table) and writes the raw
 # timing numbers to timing.csv.
 echo ""
 echo "Generating COMPARE.md..."
 MODELS_JSON=$(printf '%s\n' "${MODELS[@]}" | python3 -c "import json,sys; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))")
 python3 -c "
 import csv
 import json
 import sys
 from pathlib import Path
 prompts = json.load(open('${PROMPTS_FILE}'))
 results_dir = Path('${RESULTS_DIR}')
 models = json.loads('${MODELS_JSON}')
 lines = ['# A/B Model Comparison\n']
 timing_rows = []
 for p in prompts:
    pid = p['id']
    agent = p['agent']
    short = p['prompt'][:80]
    lines.append(f'## [{pid}] {agent}\n')
    lines.append(f'> {short}...\n')
    for model in models:
        md_path = results_dir / pid / f'{model}.md'
        json_path = results_dir / pid / f'{model}.json'
        lines.append(f'### {model}\n')
        if md_path.exists():
            content = md_path.read_text().strip()
            lines.append(f'{content}\n')
        else:
            lines.append('*(no response)*\n')
        if json_path.exists():
            # An unreadable or oddly shaped response only loses its timing
            # row; it is reported on stderr rather than silently dropped.
            try:
                d = json.loads(json_path.read_text())
                t = d.get('timings', {})
                u = d.get('usage', {})
                timing_rows.append({
                    'prompt_id': pid,
                    'model_id': model,
                    'prompt_tps': t.get('prompt_per_second', 0) or 0,
                    'predicted_tps': t.get('predicted_per_second', 0) or 0,
                    'total_tokens': u.get('total_tokens', 0),
                    'latency_ms': round((t.get('prompt_ms', 0) or 0) + (t.get('predicted_ms', 0) or 0), 1),
                })
            except Exception as exc:
                print(f'warning: no timings for {json_path}: {exc}', file=sys.stderr)
    lines.append('---\n')
 # Timing table: predicted tokens per second, one row per prompt
 lines.append('## Timing Summary\n')
 tps_by_cell = {}
 for r in timing_rows:
    tps_by_cell.setdefault((r['prompt_id'], r['model_id']), r['predicted_tps'])
 pids = list(dict.fromkeys(r['prompt_id'] for r in timing_rows))
 lines.append('| prompt | ' + ' | '.join(models) + ' |')
 lines.append('|--------' + '|------' * len(models) + '|')
 for pid in pids:
    cells = []
    for model in models:
        tps = tps_by_cell.get((pid, model))
        cells.append('—' if tps is None else f'{tps:.0f}')
    lines.append(f'| {pid} | ' + ' | '.join(cells) + ' |')
 Path('${COMPARE_FILE}').write_text('\n'.join(lines) + '\n')
 print(f'Wrote ${COMPARE_FILE}')
 # timing.csv
 with open('${TIMING_FILE}', 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames=['prompt_id', 'model_id', 'prompt_tps', 'predicted_tps', 'total_tokens', 'latency_ms'])
    w.writeheader()
    w.writerows(timing_rows)
 print(f'Wrote ${TIMING_FILE}')
 "
 # Wall-clock duration since the main loop started (t_start).
 t_end=$(date +%s)
 elapsed=$(( t_end - t_start ))
 echo ""
 echo "================================================================"
 echo "  COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s"
 echo "  Results: ${RESULTS_DIR}/"
 echo "  Compare: ${COMPARE_FILE}"
 echo "  Timing:  ${TIMING_FILE}"
 echo "================================================================"
--- a/eval/analyze.py
+++ b/eval/analyze.py
@@ -0,0 +1,125 @@
 #!/usr/bin/env python3
 """Generate SUMMARY.md from scores.csv."""
 import csv
 from collections import defaultdict
 from pathlib import Path
 # Input read by load_scores(): scores.csv next to this script.
 CSV_PATH = Path(__file__).parent / "scores.csv"
 # Output written by main(): the generated Markdown report.
 SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md"
 def load_scores() -> list[dict]:
    """Read CSV_PATH and return its rows as dicts with normalised fields.

    Each row keeps the CSV's columns, except that ``correct`` becomes a bool
    (true for "true", "1" or "yes", case-insensitive) and ``latency_ms``
    becomes a float (0.0 when missing or empty).

    Returns:
        The list of rows, or an empty list when CSV_PATH does not exist.
    """
    if not CSV_PATH.exists():
        return []
    rows = []
    # newline="" lets the csv module handle newlines embedded in quoted fields.
    with open(CSV_PATH, newline="") as f:
        for row in csv.DictReader(f):
            # DictReader yields None for columns missing from a short row.
            flag = (row.get("correct") or "").strip().lower()
            row["correct"] = flag in ("true", "1", "yes")
            row["latency_ms"] = float(row.get("latency_ms", 0) or 0)
            rows.append(row)
    return rows
 def main() -> None:
    """Build the Markdown report from the loaded scores and write SUMMARY_PATH.

    The report contains an overall accuracy table (per benchmark plus the mean
    of the benchmarks a model actually has rows for), the best model by that
    mean, a per-category breakdown for MMLU rows that carry a category, and
    the median latency per model and benchmark. Rows with a latency of 0 are
    left out of the median. The report is also printed to stdout. When there
    are no rows a short message is printed and nothing is written.
    """
    # Local import: statistics is not among this file's module-level imports.
    from statistics import median
    rows = load_scores()
    if not rows:
        print("No data in scores.csv")
        return
    models = sorted(set(r["model"] for r in rows))
    benchmarks = ["mmlu", "gsm8k", "humaneval"]
    # Compute scores: (model, benchmark) -> [correct, total]
    scores = defaultdict(lambda: [0, 0])
    for r in rows:
        tally = scores[(r["model"], r["benchmark"])]
        tally[1] += 1
        if r["correct"]:
            tally[0] += 1
    # MMLU per-category: (model, category) -> [correct, total]
    cat_scores = defaultdict(lambda: [0, 0])
    for r in rows:
        if r["benchmark"] == "mmlu" and r.get("category"):
            tally = cat_scores[(r["model"], r["category"])]
            tally[1] += 1
            if r["correct"]:
                tally[0] += 1
    categories = sorted(set(r.get("category", "") for r in rows if r.get("category")))
    lines = ["# Eval Results\n"]
    # Main table
    lines.append("## Overall Scores\n")
    header = "| Model | MMLU (%) | GSM8K (%) | HumanEval (%) | Avg (%) |"
    sep = "|-------|---------|---------|--------------|---------|"
    lines.append(header)
    lines.append(sep)
    model_avgs = []
    for model in models:
        cells = []
        pcts = []
        for bench in benchmarks:
            key = (model, bench)
            # Membership test first so the defaultdict is not grown by lookups.
            if key in scores:
                c, t = scores[key]
                pct = c / t * 100 if t > 0 else 0
                cells.append(f"{pct:.1f}")
                pcts.append(pct)
            else:
                cells.append("—")
        avg = sum(pcts) / len(pcts) if pcts else 0
        model_avgs.append((model, avg))
        cells.append(f"{avg:.1f}")
        lines.append(f"| {model} | " + " | ".join(cells) + " |")
    # Sort summary: highest average first
    model_avgs.sort(key=lambda x: -x[1])
    lines.append(f"\n**Best overall: {model_avgs[0][0]}** ({model_avgs[0][1]:.1f}% avg)\n")
    # MMLU category breakdown
    if categories:
        lines.append("\n## MMLU Per-Category Breakdown\n")
        header = "| Model | " + " | ".join(c.replace("_", " ").title() for c in categories) + " |"
        sep = "|-------" + "|-------" * len(categories) + "|"
        lines.append(header)
        lines.append(sep)
        for model in models:
            cells = []
            for cat in categories:
                key = (model, cat)
                if key in cat_scores:
                    c, t = cat_scores[key]
                    cells.append(f"{c}/{t}")
                else:
                    cells.append("—")
            lines.append(f"| {model} | " + " | ".join(cells) + " |")
    # Latency summary
    lines.append("\n## Median Latency (ms)\n")
    lines.append("| Model | MMLU | GSM8K | HumanEval |")
    lines.append("|-------|------|-------|-----------|")
    for model in models:
        cells = []
        for bench in benchmarks:
            lats = [r["latency_ms"] for r in rows
                    if r["model"] == model and r["benchmark"] == bench
                    and r["latency_ms"] > 0]
            if lats:
                # True median: averages the two middle values for even counts.
                cells.append(f"{median(lats):.0f}")
            else:
                cells.append("—")
        lines.append(f"| {model} | " + " | ".join(cells) + " |")
    summary = "\n".join(lines) + "\n"
    # The report contains non-ASCII dashes; do not depend on the locale encoding.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {SUMMARY_PATH}")
 # Script entry point: regenerate the summary when run directly.
 if __name__ == "__main__":
    main()
--- a/eval/gsm8k.py
+++ b/eval/gsm8k.py
@@ -0,0 +1,164 @@
 #!/usr/bin/env python3
 """GSM8K 50-question subset benchmark (seed=42)."""
 import json
 import os
 import random
 import re
 import sys
 import time
 from pathlib import Path
 from datasets import load_dataset
 from openai import OpenAI
 from tqdm import tqdm
 # Base URL of the OpenAI-compatible API; override with LLAMA_SWAP_URL.
 ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
 # Raw responses are stored under results/<model>/gsm8k/ (see run_gsm8k).
 RESULTS_DIR = Path(__file__).parent / "results"
 # Completion token limit sent with each request.
 MAX_TOKENS = 512
 # Seeds both the question sampling and the request's `seed` parameter.
 SEED = 42
 # Sampling temperature sent with each request.
 TEMPERATURE = 0
 # Number of test questions sampled by load_questions().
 N_QUESTIONS = 50
 def load_questions() -> list[dict]:
    """Sample N_QUESTIONS questions from the GSM8K test split.

    The row order is shuffled with a random.Random seeded by SEED, so the
    same subset is selected on every run.

    Returns:
        A list of dicts with keys ``id`` ("gsm8k_<row index>"), ``question``
        and ``expected`` (the integer after "####" in the reference answer,
        or 0 when no such marker is found).
    """
    shuffler = random.Random(SEED)
    dataset = load_dataset("openai/gsm8k", "main", split="test", trust_remote_code=True)
    order = list(range(len(dataset)))
    shuffler.shuffle(order)
    def _as_question(idx: int) -> dict:
        row = dataset[idx]
        # GSM8K answer format: "#### <number>" at end
        marker = re.search(r"####\s*([0-9,.-]+)", row["answer"])
        if marker:
            expected = int(marker.group(1).replace(",", ""))
        else:
            expected = 0
        return {
            "id": f"gsm8k_{idx}",
            "question": row["question"],
            "expected": expected,
        }
    return [_as_question(idx) for idx in order[:N_QUESTIONS]]
 def format_prompt(q: dict) -> str:
    """Return the user prompt for one question.

    The prompt is a fixed instruction (solve step by step, finish with an
    'ANSWER: <number>' line), a blank line, and then ``q["question"]``.
    """
    instruction = (
        "Solve this problem step by step, then on the final line write "
        "'ANSWER: <number>'."
    )
    return "\n\n".join((instruction, q["question"]))
 def parse_answer(text: str) -> int | None:
    matches = re.findall(r"ANSWER:\s*([0-9,.-]+)", text, re.IGNORECASE)
    if matches:
        try:
            return int(matches[-1].replace(",", ""))
        except ValueError:
            return None
    # Fallback: last number in the response
    nums = re.findall(r"-?\d[\d,]*", text)
    if nums:
        try:
            return int(nums[-1].replace(",", ""))
        except ValueError:
            return None
    return None
 def _gsm8k_response_text(resp_json) -> str:
    """Return the assistant text of a chat-completion dict, or "" if absent.

    Message ``content`` is preferred; ``reasoning_content`` is the fallback.
    Anything that is not a dict with a non-empty ``choices`` list yields "".
    """
    if not isinstance(resp_json, dict):
        return ""
    choices = resp_json.get("choices") or []
    if not choices:
        return ""
    msg = choices[0].get("message") or {}
    return msg.get("content", "") or msg.get("reasoning_content", "") or ""
 def run_gsm8k(model: str, client: OpenAI, questions: list[dict]) -> list[dict]:
    """Run the GSM8K questions against ``model`` and score the answers.

    Each response is stored as JSON under RESULTS_DIR/<model>/gsm8k/. On a
    later run a stored response that contains ``choices`` is re-scored
    without calling the API (latency reported as 0). A stored error or an
    unreadable file is not treated as a result: the question is asked again.
    A request is attempted twice, five seconds apart, before its error is
    recorded. Progress and the final score are printed to stderr.

    Args:
        model: Model name sent with the request and used for the result path.
        client: OpenAI client used for chat completions.
        questions: Question dicts as produced by load_questions().

    Returns:
        One result dict per question with the keys model, benchmark,
        question_id, correct, raw_answer (first 200 characters),
        parsed_answer, expected and latency_ms.
    """
    model_dir = RESULTS_DIR / model / "gsm8k"
    model_dir.mkdir(parents=True, exist_ok=True)
    results = []
    correct = 0
    total = 0
    skipped = 0
    for i, q in enumerate(tqdm(questions, desc=f"  GSM8K {model}", file=sys.stderr)):
        expected = q["expected"]
        out_path = model_dir / f"{q['id']}.json"
        resp_json = None
        if out_path.exists():
            try:
                cached = json.loads(out_path.read_text())
            except json.JSONDecodeError:
                cached = None
            # Only a real completion counts as cached; a stored error is retried.
            if isinstance(cached, dict) and cached.get("choices"):
                resp_json = cached
        from_cache = resp_json is not None
        latency_ms = 0
        if not from_cache:
            prompt = format_prompt(q)
            t0 = time.time()
            for attempt in range(2):
                try:
                    resp = client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        max_tokens=MAX_TOKENS,
                        temperature=TEMPERATURE,
                        seed=SEED,
                    )
                    resp_json = resp.model_dump()
                    break
                except Exception as e:
                    if attempt == 0:
                        time.sleep(5)
                    else:
                        # Kept on disk for diagnosis; not reused on resume.
                        resp_json = {"error": str(e)}
            latency_ms = round((time.time() - t0) * 1000, 1)
            out_path.write_text(json.dumps(resp_json, indent=2, default=str))
        raw = _gsm8k_response_text(resp_json)
        parsed = parse_answer(raw)
        is_correct = parsed is not None and parsed == expected
        if is_correct:
            correct += 1
        total += 1
        results.append({
            "model": model,
            "benchmark": "gsm8k",
            "question_id": q["id"],
            "correct": is_correct,
            "raw_answer": raw[:200],
            "parsed_answer": str(parsed) if parsed is not None else "",
            "expected": str(expected),
            "latency_ms": latency_ms,
        })
        if from_cache:
            skipped += 1
        elif (i + 1) % 10 == 0:
            print(f"  [{model}] GSM8K {i+1}/{len(questions)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)
    if skipped:
        print(f"  [{model}] GSM8K resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
    # Guard against an empty question list.
    final_pct = correct / total * 100 if total else 0.0
    print(f"  [{model}] GSM8K FINAL: {correct}/{total} ({final_pct:.1f}%)", file=sys.stderr)
    return results
 # CLI entry point: the optional first argument is the model name. Results are
 # printed to stdout as one JSON object per line.
 if __name__ == "__main__":
    model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4"
    # The API key is a placeholder string.
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
    questions = load_questions()
    results = run_gsm8k(model, client, questions)
    for r in results:
        print(json.dumps(r))
--- a/eval/humaneval.py
+++ b/eval/humaneval.py
@@ -0,0 +1,201 @@
 #!/usr/bin/env python3
 """HumanEval benchmark — 164 problems with sandboxed execution."""
 import json
 import os
 import re
 import subprocess
 import sys
 import tempfile
 import textwrap
 import time
 from pathlib import Path
 from datasets import load_dataset
 from openai import OpenAI
 from tqdm import tqdm
 # Base URL of the OpenAI-compatible API; override with LLAMA_SWAP_URL.
 ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
 # Per-problem results are stored under results/<model>/humaneval/ (see run_humaneval).
 RESULTS_DIR = Path(__file__).parent / "results"
 # Completion token limit sent with each request.
 MAX_TOKENS = 1024
 # Sent as the request's `seed` parameter.
 SEED = 42
 # Sampling temperature sent with each request.
 TEMPERATURE = 0
 # Seconds a generated program may run before run_test() reports TIMEOUT.
 EXEC_TIMEOUT = 30
 def load_problems() -> list[dict]:
    """Load the HumanEval test split as a list of plain dicts.

    Returns:
        One dict per dataset row with the keys ``id`` (the task_id),
        ``prompt``, ``canonical`` (the canonical_solution), ``test`` and
        ``entry_point``.
    """
    dataset = load_dataset("openai/openai_humaneval", split="test", trust_remote_code=True)
    # Output key -> dataset column.
    columns = {
        "id": "task_id",
        "prompt": "prompt",
        "canonical": "canonical_solution",
        "test": "test",
        "entry_point": "entry_point",
    }
    return [{key: row[col] for key, col in columns.items()} for row in dataset]
 def extract_code(response: str, prompt: str) -> str:
    """Turn a model response into a runnable Python source string.

    Strategy, in order:
      1. Use a fenced code block (any info string, e.g. python, python3, py).
         The first block containing "def " is preferred, otherwise the first
         block. A block with a "def " is used as the full implementation; a
         block without one is treated as a function body and appended to
         ``prompt``.
      2. Without a fence, take everything from the first line that starts
         with "def " to the end of the response.
      3. Otherwise append the raw response to ``prompt``.

    When the model supplies the whole function (cases 1 and 2), any import
    lines from ``prompt`` that the code lacks are prepended, so names used in
    the signature (such as typing.List) stay defined.

    Args:
        response: Raw model output.
        prompt: The problem prompt (imports, signature, docstring).

    Returns:
        Source text to run against the problem's tests.
    """
    def _with_prompt_imports(code: str) -> str:
        missing = [
            line for line in prompt.splitlines()
            if line.startswith(("import ", "from ")) and line.strip() not in code
        ]
        return "\n".join(missing + [code]) if missing else code
    # Try to find a code block; the info string after the fence is optional.
    blocks = re.findall(r"```[^\n`]*\n(.*?)```", response, re.DOTALL)
    if blocks:
        code = next((block for block in blocks if "def " in block), blocks[0])
        # If the code block contains the function signature, use it directly
        if "def " in code:
            return _with_prompt_imports(code)
        # Otherwise prepend the prompt (function signature)
        return prompt + code
    # No code block — take everything from the first def onwards
    lines = response.split("\n")
    start = next(
        (i for i, line in enumerate(lines) if line.strip().startswith("def ")),
        None,
    )
    if start is not None:
        return _with_prompt_imports("\n".join(lines[start:]))
    # Last resort: prepend prompt to raw response
    return prompt + response
 def run_test(code: str, test_code: str, entry_point: str) -> tuple[bool, str]:
    """Run candidate code against a problem's tests in a child interpreter.

    The candidate code, the test code and a final ``check(<entry_point>)``
    call are written to a temporary .py file in /tmp, which is executed with
    the current Python executable. The child runs with /tmp as working
    directory, a two-variable environment (PATH and HOME) and a limit of
    EXEC_TIMEOUT seconds. NOTE(review): this restricts the environment only;
    the visible code applies no filesystem or network isolation.

    Args:
        code: Candidate implementation source.
        test_code: Test source that defines ``check``.
        entry_point: Name of the function passed to ``check``.

    Returns:
        ``(passed, output)``: ``passed`` is True when the child exits with
        status 0; ``output`` is the first 500 characters of stderr, or of
        stdout when stderr is empty, "TIMEOUT" on timeout, or the exception
        text if the child could not be run.
    """
    program = "".join([code, "\n\n", test_code, f"\n\ncheck({entry_point})\n"])
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", dir="/tmp", delete=False
    ) as handle:
        handle.write(program)
        handle.flush()
        script_path = handle.name
    # Restrict the child to /tmp and a limited PATH.
    child_env = {"PATH": "/usr/bin:/usr/local/bin", "HOME": "/tmp"}
    try:
        proc = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            timeout=EXEC_TIMEOUT,
            cwd="/tmp",
            env=child_env,
        )
        return proc.returncode == 0, (proc.stderr or proc.stdout)[:500]
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    except Exception as exc:
        return False, str(exc)[:500]
    finally:
        # The file was created with delete=False, so remove it here.
        try:
            os.unlink(script_path)
        except OSError:
            pass
 def _humaneval_response_text(resp_json) -> str:
    """Return the assistant text of a chat-completion dict, or "" if absent.

    Message ``content`` is preferred; ``reasoning_content`` is the fallback.
    Anything that is not a dict with a non-empty ``choices`` list yields "".
    """
    if not isinstance(resp_json, dict):
        return ""
    choices = resp_json.get("choices") or []
    if not choices:
        return ""
    msg = choices[0].get("message") or {}
    return msg.get("content", "") or msg.get("reasoning_content", "") or ""
 def run_humaneval(model: str, client: OpenAI, problems: list[dict]) -> list[dict]:
    """Run the HumanEval problems against ``model`` and execute the tests.

    For each problem the response, the extracted code, the pass flag and the
    execution output are stored as JSON under RESULTS_DIR/<model>/humaneval/.
    On a later run a stored entry whose response contains ``choices`` is
    reused as-is (latency reported as 0). An entry that recorded a request
    error, or an unreadable file, is not treated as a result: the problem is
    run again. A request is attempted twice, five seconds apart, before its
    error is recorded. Progress and the final score are printed to stderr.

    Args:
        model: Model name sent with the request and used for the result path.
        client: OpenAI client used for chat completions.
        problems: Problem dicts as produced by load_problems().

    Returns:
        One result dict per problem with the keys model, benchmark,
        question_id, correct, raw_answer, parsed_answer ("pass"/"fail"),
        expected and latency_ms.
    """
    model_dir = RESULTS_DIR / model / "humaneval"
    model_dir.mkdir(parents=True, exist_ok=True)
    results = []
    correct = 0
    total = 0
    skipped = 0
    for i, p in enumerate(tqdm(problems, desc=f"  HumanEval {model}", file=sys.stderr)):
        out_path = model_dir / f"{p['id'].replace('/', '_')}.json"
        if out_path.exists():
            try:
                cached = json.loads(out_path.read_text())
            except json.JSONDecodeError:
                cached = None
            stored = cached.get("response") if isinstance(cached, dict) else None
            # Only a real completion counts as cached; a stored error is retried.
            if isinstance(stored, dict) and stored.get("choices"):
                passed = cached.get("passed", False)
                if passed:
                    correct += 1
                total += 1
                results.append({
                    "model": model, "benchmark": "humaneval",
                    "question_id": p["id"], "correct": passed,
                    "raw_answer": "", "parsed_answer": "pass" if passed else "fail",
                    "expected": "pass", "latency_ms": 0,
                })
                skipped += 1
                continue
        t0 = time.time()
        resp_json = None
        for attempt in range(2):
            try:
                resp = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": (
                        "Complete the following Python function. "
                        "Return ONLY the complete function implementation.\n\n"
                        + p["prompt"]
                    )}],
                    max_tokens=MAX_TOKENS,
                    temperature=TEMPERATURE,
                    seed=SEED,
                )
                resp_json = resp.model_dump()
                break
            except Exception as e:
                if attempt == 0:
                    time.sleep(5)
                else:
                    # Kept on disk for diagnosis; not reused on resume.
                    resp_json = {"error": str(e)}
        latency = (time.time() - t0) * 1000
        raw = _humaneval_response_text(resp_json)
        code = extract_code(raw, p["prompt"])
        passed, exec_output = run_test(code, p["test"], p["entry_point"])
        if passed:
            correct += 1
        total += 1
        out_path.write_text(json.dumps({
            "response": resp_json,
            "extracted_code": code[:2000],
            "passed": passed,
            "exec_output": exec_output,
        }, indent=2, default=str))
        results.append({
            "model": model,
            "benchmark": "humaneval",
            "question_id": p["id"],
            "correct": passed,
            "raw_answer": raw[:200],
            "parsed_answer": "pass" if passed else "fail",
            "expected": "pass",
            "latency_ms": round(latency, 1),
        })
        if (i + 1) % 10 == 0:
            print(f"  [{model}] HumanEval {i+1}/{len(problems)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)
    if skipped:
        print(f"  [{model}] HumanEval resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
    # Guard against an empty problem list.
    final_pct = correct / total * 100 if total else 0.0
    print(f"  [{model}] HumanEval FINAL: {correct}/{total} ({final_pct:.1f}%)", file=sys.stderr)
    return results
 # CLI entry point: the optional first argument is the model name. Results are
 # printed to stdout as one JSON object per line.
 if __name__ == "__main__":
    model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4"
    # The API key is a placeholder string.
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
    problems = load_problems()
    results = run_humaneval(model, client, problems)
    for r in results:
        print(json.dumps(r))
--- a/eval/mmlu.py
+++ b/eval/mmlu.py
@@ -0,0 +1,166 @@
 #!/usr/bin/env python3
 """MMLU 100-question subset benchmark (20 per category, seed=42)."""
 import json
 import os
 import random
 import re
 import sys
 import time
 from pathlib import Path
 from datasets import load_dataset
 from openai import OpenAI
 from tqdm import tqdm
 # Base URL of the OpenAI-compatible endpoint; override with the LLAMA_SWAP_URL env var.
 ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
 # Root of the per-model result tree; raw responses land in results/<model>/mmlu/.
 RESULTS_DIR = Path(__file__).parent / "results"
 # Completion token cap sent as max_tokens with every request.
 MAX_TOKENS = 512
 # Seeds both the question-sampling RNG and the per-request `seed` parameter.
 SEED = 42
 # Sampling temperature sent with every request.
 TEMPERATURE = 0
 # MMLU subject names, passed to load_dataset as the dataset config.
 CATEGORIES = [
    "high_school_mathematics",
    "college_computer_science",
    "professional_medicine",
    "formal_logic",
    "miscellaneous",
 ]
 # Number of questions sampled from each category.
 PER_CATEGORY = 20
 # Answer letters, indexed by the dataset's integer answer field.
 CHOICES = ["A", "B", "C", "D"]
 def load_questions() -> list[dict]:
    """Sample PER_CATEGORY test questions from every category in CATEGORIES.

    One ``random.Random(SEED)`` instance is shared across all categories, so
    the selection is reproducible as long as CATEGORIES keeps its order.

    Returns:
        A list of dicts with keys ``id``, ``category``, ``question``,
        ``choices`` and ``answer_idx``.
    """
    sampler = random.Random(SEED)
    picked = []
    for category in CATEGORIES:
        dataset = load_dataset("cais/mmlu", category, split="test", trust_remote_code=True)
        order = list(range(len(dataset)))
        sampler.shuffle(order)
        # Keep the first PER_CATEGORY rows of the shuffled order.
        for row_idx in order[:PER_CATEGORY]:
            record = dataset[row_idx]
            picked.append(dict(
                id=f"{category}_{row_idx}",
                category=category,
                question=record["question"],
                choices=record["choices"],
                answer_idx=record["answer"],
            ))
    return picked
 def format_prompt(q: dict) -> str:
    """Render a question dict as a lettered multiple-choice prompt.

    Args:
        q: Question with a ``question`` string and a ``choices`` sequence.

    Returns:
        The question line, one ``X) choice`` line per option, and a closing
        instruction line, joined with newlines.
    """
    options = [f"{CHOICES[pos]}) {option}" for pos, option in enumerate(q["choices"])]
    return "\n".join([f"Question: {q['question']}", *options, "Answer with a single letter: "])
 def parse_answer(text: str) -> str | None:
    for ch in text.strip():
        if ch.upper() in CHOICES:
            return ch.upper()
    return None
 def _response_text(resp_json) -> str:
    """Return the assistant text of a chat-completion dict, or "" if it has none."""
    if not isinstance(resp_json, dict):
        return ""
    choices = resp_json.get("choices")
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        return ""
    msg = choices[0].get("message") or {}
    # Fall back to reasoning_content when the visible content is empty.
    return msg.get("content", "") or msg.get("reasoning_content", "") or ""
 def run_mmlu(model: str, client: OpenAI, questions: list[dict]) -> list[dict]:
    """Run the MMLU questions against ``model`` and score the answers.

    Each raw response is stored as ``RESULTS_DIR/<model>/mmlu/<id>.json``.
    On a later run, a stored response that contains at least one choice is
    re-scored without calling the endpoint (latency reported as 0). Stored
    files that are unreadable, malformed, or hold only an error record are
    ignored and the question is requested again, so a transient failure is
    not counted as a wrong answer forever.

    Args:
        model: Model name sent to the endpoint and used as the result dir.
        client: OpenAI-compatible client.
        questions: Question dicts as produced by ``load_questions``.

    Returns:
        One result dict per question (model, benchmark, question_id,
        category, correct, raw_answer, parsed_answer, expected, latency_ms).
    """
    model_dir = RESULTS_DIR / model / "mmlu"
    model_dir.mkdir(parents=True, exist_ok=True)
    results = []
    correct = 0
    total = 0
    skipped = 0
    for i, q in enumerate(tqdm(questions, desc=f"  MMLU {model}", file=sys.stderr)):
        expected = CHOICES[q["answer_idx"]]
        out_path = model_dir / f"{q['id']}.json"
        # Resume: reuse a stored response only if it is a real completion.
        cached = None
        if out_path.exists():
            try:
                cached = json.loads(out_path.read_text())
            except (json.JSONDecodeError, OSError):
                cached = None
        from_cache = (
            isinstance(cached, dict)
            and isinstance(cached.get("choices"), list)
            and bool(cached["choices"])
        )
        if from_cache:
            resp_json = cached
            latency_ms = 0
            skipped += 1
        else:
            prompt = format_prompt(q)
            t0 = time.time()
            resp_json = None
            # One retry after a short pause; the second failure is recorded.
            for attempt in range(2):
                try:
                    resp = client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        max_tokens=MAX_TOKENS,
                        temperature=TEMPERATURE,
                        seed=SEED,
                    )
                    resp_json = resp.model_dump()
                    break
                except Exception as e:
                    if attempt == 0:
                        time.sleep(5)
                    else:
                        resp_json = {"error": str(e)}
            latency_ms = round((time.time() - t0) * 1000, 1)
            out_path.write_text(json.dumps(resp_json, indent=2, default=str))
        raw = _response_text(resp_json)
        parsed = parse_answer(raw)
        is_correct = parsed == expected
        if is_correct:
            correct += 1
        total += 1
        results.append({
            "model": model,
            "benchmark": "mmlu",
            "question_id": q["id"],
            "category": q["category"],
            "correct": is_correct,
            "raw_answer": raw[:200],
            "parsed_answer": parsed or "",
            "expected": expected,
            "latency_ms": latency_ms,
        })
        # Progress lines are only printed for freshly requested questions.
        if not from_cache and (i + 1) % 10 == 0:
            print(f"  [{model}] MMLU {i+1}/{len(questions)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)
    if skipped:
        print(f"  [{model}] MMLU resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
    # Guard the percentage so an empty question list does not divide by zero.
    pct = correct / total * 100 if total else 0.0
    print(f"  [{model}] MMLU FINAL: {correct}/{total} ({pct:.1f}%)", file=sys.stderr)
    return results
 if __name__ == "__main__":
    # Standalone entry point: evaluate a single model on the MMLU subset and
    # emit one JSON record per question on stdout (progress goes to stderr).
    cli_args = sys.argv[1:]
    # The model name is the first CLI argument, with a built-in default.
    model = cli_args[0] if cli_args else "qwen3.6-35b-a3b-mxfp4"
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
    for record in run_mmlu(model, client, load_questions()):
        print(json.dumps(record))
--- a/eval/run_all.py
+++ b/eval/run_all.py
@@ -0,0 +1,117 @@
 #!/usr/bin/env python3
 """Orchestrate MMLU, GSM8K, HumanEval across all models."""
 import csv
 import json
 import os
 import sys
 import time
 from pathlib import Path
 from openai import OpenAI
 # Base URL of the OpenAI-compatible endpoint; override with the LLAMA_SWAP_URL env var.
 ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
 # Directory created by main() before the sweep starts.
 RESULTS_DIR = Path(__file__).parent / "results"
 # Aggregated per-question scores; rewritten in full after every benchmark.
 CSV_PATH = Path(__file__).parent / "scores.csv"
 # Model names evaluated by the sweep, in run order.
 MODELS = [
    "qwen3.6-35b-a3b-mxfp4",
    "qwen3-coder-30b-apex",
    "qwen3.6-27b-mtp",
    "qwopus3.5-4b-mtp",
    "qwen3.5-9b-deepseek-v4-mtp",
    "qwopus3.6-35b-a3b-v1",
    "qwopus3.6-27b-v2-mtp",
    "qwopus3.5-9b-coder-mtp",
 ]
 def warmup_model(client: OpenAI, model: str) -> bool:
    """Send a tiny request so the backend loads ``model`` before benchmarking.

    Up to three attempts are made, pausing 10 seconds between them. No pause
    follows the final failed attempt, since nothing is retried after it.

    Args:
        client: OpenAI-compatible client.
        model: Model name to load.

    Returns:
        True if any attempt succeeded, False otherwise. Failure is reported
        on stderr but is not fatal; callers may continue regardless.
    """
    max_attempts = 3
    banner = "=" * 60
    print(f"\n{banner}", file=sys.stderr)
    print(f"  Loading model: {model}", file=sys.stderr)
    print(banner, file=sys.stderr)
    for attempt in range(1, max_attempts + 1):
        try:
            # The reply itself is irrelevant; only success matters.
            client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Say OK."}],
                max_tokens=10,
                temperature=0,
            )
        except Exception as e:
            print(f"  Warmup attempt {attempt} failed: {e}", file=sys.stderr)
            if attempt < max_attempts:
                time.sleep(10)
        else:
            print("  Warmup OK", file=sys.stderr)
            return True
    print(f"  WARNING: warmup failed for {model}, continuing anyway", file=sys.stderr)
    return False
 def run_benchmark(module_name: str, model: str, client: OpenAI) -> list[dict]:
    """Load the data for one benchmark and run it against ``model``.

    Args:
        module_name: One of "mmlu", "gsm8k" or "humaneval".
        model: Model name forwarded to the benchmark runner.
        client: OpenAI-compatible client forwarded to the benchmark runner.

    Returns:
        The list of per-question result dicts produced by the benchmark.

    Raises:
        ValueError: If ``module_name`` is not a known benchmark.
    """
    if module_name not in ("mmlu", "gsm8k", "humaneval"):
        raise ValueError(f"Unknown benchmark: {module_name}")
    # Benchmark modules are imported lazily, only for the one selected.
    if module_name == "mmlu":
        import mmlu
        return mmlu.run_mmlu(model, client, mmlu.load_questions())
    if module_name == "gsm8k":
        import gsm8k
        return gsm8k.run_gsm8k(model, client, gsm8k.load_questions())
    import humaneval
    return humaneval.run_humaneval(model, client, humaneval.load_problems())
 def main() -> None:
    """Run every benchmark for every model in MODELS and write the score CSV.

    Exits with status 1 if the endpoint cannot be reached. A failure inside
    one benchmark is reported on stderr and the sweep continues with the next.
    """
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
    # Fail fast when the endpoint is unreachable.
    try:
        client.models.list()
    except Exception as e:
        print(f"Cannot connect to {ENDPOINT}: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        print("Connected to llama-swap", file=sys.stderr)
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    collected: list[dict] = []
    started = time.time()
    for model in MODELS:
        # The warmup result is ignored on purpose; benchmarks run regardless.
        warmup_model(client, model)
        for bench in ("mmlu", "gsm8k", "humaneval"):
            print(f"\n  --- {model} / {bench} ---", file=sys.stderr)
            try:
                collected.extend(run_benchmark(bench, model, client))
                # Rewrite the CSV after each benchmark so partial sweeps are kept.
                write_csv(collected)
            except Exception as e:
                print(f"  ERROR in {model}/{bench}: {e}", file=sys.stderr)
    minutes = (time.time() - started) / 60
    print(f"\nAll benchmarks complete in {minutes:.0f} minutes", file=sys.stderr)
    print(f"Results: {CSV_PATH}", file=sys.stderr)
 def write_csv(results: list[dict]) -> None:
    """Write all result rows to CSV_PATH, replacing any previous file.

    The file is always encoded as UTF-8: ``raw_answer`` holds arbitrary model
    output, which could otherwise fail to encode under a non-UTF-8 locale.

    Args:
        results: Per-question result dicts. Keys outside the column list are
            ignored; a row without ``category`` gets an empty cell for it.
            Nothing is written when the list is empty.
    """
    if not results:
        return
    fields = ["model", "benchmark", "question_id", "correct", "raw_answer",
              "parsed_answer", "expected", "latency_ms"]
    # Only MMLU rows carry a category; add the column when any row has one.
    if any("category" in r for r in results):
        fields.insert(3, "category")
    with open(CSV_PATH, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        w.writeheader()
        w.writerows(results)
 # Run the full sweep only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/eval/run_all.sh
+++ b/eval/run_all.sh
@@ -0,0 +1,20 @@
 #!/usr/bin/env bash
 # Run the full eval sweep with the project venv, then generate the summary.
 # All output of the sweep is mirrored into eval.log next to this script.
 set -euo pipefail
 EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
 VENV="${EVAL_DIR}/.venv/bin/python3"
 cd "$EVAL_DIR"
 # Fail early with a clear message instead of a "command not found" later.
 if [[ ! -x "$VENV" ]]; then
 	echo "error: venv interpreter not found at ${VENV}" >&2
 	exit 1
 fi
 echo "Starting eval sweep at $(date)"
 echo "Using venv: ${VENV}"
 echo ""
 # Quote the interpreter path so a checkout directory with spaces still works.
 "$VENV" run_all.py 2>&1 | tee eval.log
 echo ""
 echo "Generating summary..."
 "$VENV" analyze.py
 echo ""
 echo "Done at $(date)"
--- a/go.mod
+++ b/go.mod
@@ -0,0 +1,3 @@
 module github.com/indifferentketchup/llama-sidecar
 go 1.26.3
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -0,0 +1,139 @@
 package config
 import (
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
 )
 // utf8BOM is the UTF-8 byte order mark; it is trimmed from JSON inputs
 // before they are decoded.
 var utf8BOM = []byte{0xEF, 0xBB, 0xBF}
 // Config holds the daemon settings that Load reads from the environment.
 type Config struct {
 	// Bind is the listen address (LLAMA_SIDECAR_BIND).
 	Bind                  string
 	// LlamaServerBin is the path of the llama-server binary (LLAMA_SERVER_BIN).
 	LlamaServerBin        string
 	// ModelDirMap maps a model ID to its model file path (MODEL_DIR_MAP_FILE).
 	ModelDirMap           map[string]string
 	// PortRangeLo and PortRangeHi bound the sidecar port range (PORT_RANGE).
 	PortRangeLo           int
 	PortRangeHi           int
 	// MaxSidecars caps the number of pooled sidecars (MAX_SIDECARS).
 	MaxSidecars           int
 	// LogLevel is the configured log level name (LOG_LEVEL).
 	LogLevel              string
 	// BaseArgs are the arguments applied to every sidecar (BASE_ARGS or defaults).
 	BaseArgs              []string
 	// HealthTimeoutSeconds comes from HEALTH_TIMEOUT_SECONDS.
 	HealthTimeoutSeconds  int
 	// HealthIntervalSeconds comes from HEALTH_INTERVAL_SECONDS.
 	HealthIntervalSeconds int
 }
 // Load builds the daemon configuration from environment variables.
 //
 // LLAMA_SERVER_BIN (which must exist on disk) and MODEL_DIR_MAP_FILE are
 // required. LLAMA_SIDECAR_BIND, LOG_LEVEL, MAX_SIDECARS,
 // HEALTH_TIMEOUT_SECONDS, HEALTH_INTERVAL_SECONDS, PORT_RANGE and BASE_ARGS
 // (a JSON array of strings) are optional and fall back to built-in defaults.
 //
 // An error is returned when a required variable is missing, a value cannot
 // be parsed, MAX_SIDECARS is below 1, or PORT_RANGE holds fewer ports than
 // MAX_SIDECARS.
 func Load() (*Config, error) {
 	bin := os.Getenv("LLAMA_SERVER_BIN")
 	if bin == "" {
 		return nil, fmt.Errorf("LLAMA_SERVER_BIN is required")
 	}
 	if _, err := os.Stat(bin); err != nil {
 		return nil, fmt.Errorf("LLAMA_SERVER_BIN %q: %w", bin, err)
 	}
 	mapFile := os.Getenv("MODEL_DIR_MAP_FILE")
 	if mapFile == "" {
 		return nil, fmt.Errorf("MODEL_DIR_MAP_FILE is required")
 	}
 	modelMap, err := loadModelMap(mapFile)
 	if err != nil {
 		return nil, fmt.Errorf("MODEL_DIR_MAP_FILE: %w", err)
 	}
 	bind := envOr("LLAMA_SIDECAR_BIND", "127.0.0.1:8402")
 	logLevel := envOr("LOG_LEVEL", "info")
 	maxSidecars := envIntOr("MAX_SIDECARS", 2)
 	// A pool with no capacity can never hand out a sidecar: it would be
 	// "full" while having nothing to evict. Reject that at startup.
 	if maxSidecars < 1 {
 		return nil, fmt.Errorf("MAX_SIDECARS must be at least 1, got %d", maxSidecars)
 	}
 	healthTimeout := envIntOr("HEALTH_TIMEOUT_SECONDS", 60)
 	healthInterval := envIntOr("HEALTH_INTERVAL_SECONDS", 30)
 	lo, hi, err := parsePortRange(envOr("PORT_RANGE", "8500-8599"))
 	if err != nil {
 		return nil, fmt.Errorf("PORT_RANGE: %w", err)
 	}
 	if hi-lo+1 < maxSidecars {
 		return nil, fmt.Errorf("PORT_RANGE %d-%d has %d ports but MAX_SIDECARS is %d", lo, hi, hi-lo+1, maxSidecars)
 	}
 	baseArgs := defaultBaseArgs()
 	if env := os.Getenv("BASE_ARGS"); env != "" {
 		var parsed []string
 		// Tolerate a UTF-8 BOM at the start of the value.
 		envBytes := bytes.TrimPrefix([]byte(env), utf8BOM)
 		if err := json.Unmarshal(envBytes, &parsed); err != nil {
 			return nil, fmt.Errorf("BASE_ARGS: invalid JSON array: %w", err)
 		}
 		baseArgs = parsed
 	}
 	return &Config{
 		Bind:                  bind,
 		LlamaServerBin:        bin,
 		ModelDirMap:           modelMap,
 		PortRangeLo:           lo,
 		PortRangeHi:           hi,
 		MaxSidecars:           maxSidecars,
 		LogLevel:              logLevel,
 		BaseArgs:              baseArgs,
 		HealthTimeoutSeconds:  healthTimeout,
 		HealthIntervalSeconds: healthInterval,
 	}, nil
 }
 // defaultBaseArgs returns the llama-server arguments used when BASE_ARGS is
 // not set. A fresh slice is returned on every call.
 func defaultBaseArgs() []string {
 	return []string{"-ngl", "999", "-c", "32768", "--flash-attn", "on", "--no-mmap"}
 }
 // loadModelMap reads the JSON object at path into a string-to-string map.
 // A leading UTF-8 BOM is skipped. An unreadable file, invalid JSON, or an
 // empty map is reported as an error.
 func loadModelMap(path string) (map[string]string, error) {
 	raw, readErr := os.ReadFile(path)
 	if readErr != nil {
 		return nil, readErr
 	}
 	var models map[string]string
 	if decodeErr := json.Unmarshal(bytes.TrimPrefix(raw, utf8BOM), &models); decodeErr != nil {
 		return nil, fmt.Errorf("invalid JSON: %w", decodeErr)
 	}
 	if len(models) > 0 {
 		return models, nil
 	}
 	return nil, fmt.Errorf("model map is empty")
 }
 // parsePortRange parses a "lo-hi" port range such as "8500-8599".
 // Whitespace around either number is ignored. Both ends must be valid TCP
 // ports (1-65535) and hi must be strictly greater than lo.
 func parsePortRange(s string) (int, int, error) {
 	loStr, hiStr, ok := strings.Cut(s, "-")
 	if !ok {
 		return 0, 0, fmt.Errorf("expected lo-hi format, got %q", s)
 	}
 	lo, err := strconv.Atoi(strings.TrimSpace(loStr))
 	if err != nil {
 		return 0, 0, fmt.Errorf("invalid lo port: %w", err)
 	}
 	hi, err := strconv.Atoi(strings.TrimSpace(hiStr))
 	if err != nil {
 		return 0, 0, fmt.Errorf("invalid hi port: %w", err)
 	}
 	if hi <= lo {
 		return 0, 0, fmt.Errorf("hi (%d) must be > lo (%d)", hi, lo)
 	}
 	// Catch out-of-range ports here instead of when a sidecar fails to bind.
 	if lo < 1 || hi > 65535 {
 		return 0, 0, fmt.Errorf("ports must be within 1-65535, got %d-%d", lo, hi)
 	}
 	return lo, hi, nil
 }
 // envOr returns the value of the environment variable key, or fallback
 // when the variable is unset or empty.
 func envOr(key, fallback string) string {
 	value := os.Getenv(key)
 	if value == "" {
 		return fallback
 	}
 	return value
 }
 // envIntOr returns the integer value of the environment variable key, or
 // fallback when the variable is unset, empty, or not a valid integer.
 // Surrounding whitespace is ignored. An invalid value is reported on stderr
 // so that a typo does not silently revert the setting to its default.
 func envIntOr(key string, fallback int) int {
 	raw := strings.TrimSpace(os.Getenv(key))
 	if raw == "" {
 		return fallback
 	}
 	n, err := strconv.Atoi(raw)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "config: ignoring %s=%q (not an integer), using default %d\n", key, raw, fallback)
 		return fallback
 	}
 	return n
 }
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -0,0 +1,79 @@
 package config
 import (
 	"os"
 	"path/filepath"
 	"testing"
 )
 func TestLoad_MissingRequired(t *testing.T) {
 	os.Unsetenv("LLAMA_SERVER_BIN")
 	os.Unsetenv("MODEL_DIR_MAP_FILE")
 	_, err := Load()
 	if err == nil {
 		t.Fatal("expected error for missing LLAMA_SERVER_BIN")
 	}
 }
 // TestParsePortRange checks that a well-formed range yields both bounds.
 func TestParsePortRange(t *testing.T) {
 	const wantLo, wantHi = 8500, 8599
 	gotLo, gotHi, err := parsePortRange("8500-8599")
 	if err != nil {
 		t.Fatal(err)
 	}
 	if gotLo != wantLo || gotHi != wantHi {
 		t.Fatalf("got %d-%d", gotLo, gotHi)
 	}
 }
 // TestParsePortRange_Bad checks that malformed and inverted ranges are rejected.
 func TestParsePortRange_Bad(t *testing.T) {
 	cases := []struct {
 		input   string
 		failMsg string
 	}{
 		{"abc", "expected error"},
 		{"100-50", "expected error for hi <= lo"},
 	}
 	for _, tc := range cases {
 		if _, _, err := parsePortRange(tc.input); err == nil {
 			t.Fatal(tc.failMsg)
 		}
 	}
 }
 // TestLoadModelMap_BOM checks that a model map file starting with a UTF-8
 // byte order mark is still parsed.
 func TestLoadModelMap_BOM(t *testing.T) {
 	path := filepath.Join(t.TempDir(), "model_map.json")
 	payload := []byte("\xEF\xBB\xBF" + `{"test-model": "/fake/path.gguf"}`)
 	if err := os.WriteFile(path, payload, 0644); err != nil {
 		t.Fatal(err)
 	}
 	models, err := loadModelMap(path)
 	if err != nil {
 		t.Fatalf("BOM-prefixed JSON should parse: %v", err)
 	}
 	if got := models["test-model"]; got != "/fake/path.gguf" {
 		t.Fatalf("unexpected map: %v", models)
 	}
 }
 // TestDefaultBaseArgs_FlashAttn checks that the defaults contain the
 // adjacent pair "--flash-attn" "on".
 func TestDefaultBaseArgs_FlashAttn(t *testing.T) {
 	args := defaultBaseArgs()
 	// Stop one short of the end: the flag needs a following value.
 	for i := 0; i+1 < len(args); i++ {
 		if args[i] == "--flash-attn" && args[i+1] == "on" {
 			return
 		}
 	}
 	t.Fatal("expected --flash-attn on in default args")
 }
 // TestDefaultBaseArgs checks that the defaults are non-empty and include
 // --no-mmap.
 func TestDefaultBaseArgs(t *testing.T) {
 	args := defaultBaseArgs()
 	if len(args) == 0 {
 		t.Fatal("expected non-empty default args")
 	}
 	hasNoMmap := false
 	for i := range args {
 		if args[i] == "--no-mmap" {
 			hasNoMmap = true
 			break
 		}
 	}
 	if !hasNoMmap {
 		t.Fatal("expected --no-mmap in default args")
 	}
 }
--- a/internal/pool/hash.go
+++ b/internal/pool/hash.go
@@ -0,0 +1,53 @@
 package pool
 import (
 	"crypto/sha256"
 	"fmt"
 	"sort"
 	"strings"
 	"github.com/indifferentketchup/llama-sidecar/internal/validator"
 )
 // Hash returns a 16-hex-character identifier for a (modelID, flags) pair.
 //
 // The flag tokens are grouped into name/value entries, the entries are
 // sorted by name, and the serialized result is hashed with SHA-256 together
 // with the model ID, so the order in which flags are passed does not matter.
 // Tokens that validator.FlagName does not recognise as a flag are skipped
 // unless they are consumed as the value of the preceding flag.
 func Hash(modelID string, flags []string) string {
 	type entry struct {
 		name, value string
 	}
 	entries := make([]entry, 0, len(flags))
 	for i := 0; i < len(flags); {
 		tok := flags[i]
 		name := validator.FlagName(tok)
 		eq := strings.IndexByte(tok, '=')
 		switch {
 		case name == "":
 			// Not a flag: nothing to record.
 			i++
 		case eq >= 0:
 			// "--flag=value" form: split at the first '='.
 			entries = append(entries, entry{name: tok[:eq], value: tok[eq+1:]})
 			i++
 		case i+1 < len(flags) && validator.FlagName(flags[i+1]) == "":
 			// "--flag value" form: the next token is the value.
 			entries = append(entries, entry{name: name, value: flags[i+1]})
 			i += 2
 		default:
 			// Bare switch without a value.
 			entries = append(entries, entry{name: name, value: ""})
 			i++
 		}
 	}
 	sort.Slice(entries, func(a, b int) bool {
 		return entries[a].name < entries[b].name
 	})
 	// Layout: modelID 0x1d name 0x1f value [0x1e name 0x1f value ...]
 	var sb strings.Builder
 	sb.WriteString(modelID)
 	sb.WriteString("\x1d")
 	for n, e := range entries {
 		if n > 0 {
 			sb.WriteString("\x1e")
 		}
 		sb.WriteString(e.name)
 		sb.WriteString("\x1f")
 		sb.WriteString(e.value)
 	}
 	digest := sha256.Sum256([]byte(sb.String()))
 	// Only the first 8 bytes of the digest are kept.
 	return fmt.Sprintf("%x", digest[:8])
 }
--- a/internal/pool/hash_test.go
+++ b/internal/pool/hash_test.go
@@ -0,0 +1,53 @@
 package pool
 import (
 	"math/rand"
 	"testing"
 )
 // TestHash_OrderIndependence checks that shuffling flag/value pairs does
 // not change the hash.
 func TestHash_OrderIndependence(t *testing.T) {
 	base := []string{"--a", "1", "--b", "2", "--c", "3"}
 	want := Hash("foo", base)
 	for round := 0; round < 5; round++ {
 		// Group the tokens into flag/value pairs so each pair moves as a unit.
 		groups := make([][2]string, 0, len(base)/2)
 		for j := 0; j < len(base); j += 2 {
 			groups = append(groups, [2]string{base[j], base[j+1]})
 		}
 		rand.Shuffle(len(groups), func(a, b int) { groups[a], groups[b] = groups[b], groups[a] })
 		var flat []string
 		for _, g := range groups {
 			flat = append(flat, g[0], g[1])
 		}
 		if got := Hash("foo", flat); got != want {
 			t.Errorf("iteration %d: hash %s != %s for order %v", round, got, want, flat)
 		}
 	}
 }
 // TestHash_SeparatorCollision checks that a flag name containing the 0x1e
 // separator byte does not hash like the same name without it.
 func TestHash_SeparatorCollision(t *testing.T) {
 	withSep := Hash("foo", []string{"--a\x1eb", "1"})
 	withoutSep := Hash("foo", []string{"--ab", "1"})
 	if withSep == withoutSep {
 		t.Error("separator collision: hashes should differ")
 	}
 }
 // TestHash_Length checks that the hash is 16 hex characters long.
 func TestHash_Length(t *testing.T) {
 	got := Hash("model", []string{"--top-k", "20"})
 	if n := len(got); n != 16 {
 		t.Errorf("expected 16 hex chars, got %d: %s", n, got)
 	}
 }
 // TestHash_DifferentModels checks that the model ID is part of the hash.
 func TestHash_DifferentModels(t *testing.T) {
 	flags := []string{"--top-k", "20"}
 	hashA := Hash("model-a", flags)
 	hashB := Hash("model-b", flags)
 	if hashA == hashB {
 		t.Error("different models should produce different hashes")
 	}
 }
--- a/internal/pool/pool.go
+++ b/internal/pool/pool.go
@@ -0,0 +1,188 @@
 package pool
 import (
 	"container/list"
 	"context"
 	"fmt"
 	"log/slog"
 	"sync"
 	"time"
 	"github.com/indifferentketchup/llama-sidecar/internal/config"
 	"github.com/indifferentketchup/llama-sidecar/internal/validator"
 )
 // SidecarInfo is the JSON-serialisable snapshot of one pooled sidecar, as
 // returned by Pool.List.
 type SidecarInfo struct {
 	// Hash is the pool key of the sidecar.
 	Hash      string    `json:"hash"`
 	ModelID   string    `json:"model_id"`
 	Flags     []string  `json:"flags"`
 	Port      int       `json:"port"`
 	Pid       int       `json:"pid"`
 	StartedAt time.Time `json:"started_at"`
 	// LastUsed is decoded from the sidecar's Unix-nanosecond timestamp.
 	LastUsed  time.Time `json:"last_used"`
 	// Healthy is the sidecar's health flag at the time of the snapshot.
 	Healthy   bool      `json:"healthy"`
 }
 // Pool manages sidecars keyed by their (model, flags) hash, evicting the
 // least recently used one when MaxSidecars is reached.
 type Pool struct {
 	// mu guards sidecars, lru and lruIdx.
 	mu       sync.Mutex
 	cfg      *config.Config
 	// sidecars maps a hash to its registered sidecar.
 	sidecars map[string]*Sidecar
 	// lru holds hashes, most recently used at the front.
 	lru      *list.List
 	// lruIdx maps a hash to its element in lru.
 	lruIdx   map[string]*list.Element
 	ports    *PortAllocator
 	spawner  Spawner
 }
 // New returns an empty Pool that uses spawner to start and stop sidecars
 // and allocates ports from the range configured in cfg.
 func New(cfg *config.Config, spawner Spawner) *Pool {
 	p := &Pool{cfg: cfg, spawner: spawner}
 	p.sidecars = make(map[string]*Sidecar)
 	p.lru = list.New()
 	p.lruIdx = make(map[string]*list.Element)
 	p.ports = NewPortAllocator(cfg.PortRangeLo, cfg.PortRangeHi)
 	return p
 }
 // Acquire returns a healthy sidecar for (modelID, flags), spawning one if
 // none is registered under that key.
 //
 // The flags are validated and the model is looked up first. A registered
 // but unhealthy sidecar is removed and replaced. When the pool is full the
 // least recently used sidecar is evicted before spawning.
 //
 // The pool mutex is released while the spawner runs, so another caller may
 // register a sidecar for the same key in the meantime. In that case the
 // freshly spawned duplicate is killed, its port is returned, and the
 // already registered sidecar is handed out. Previously the duplicate
 // overwrote the registration, leaking the first process and its port and
 // leaving a stale element in the LRU list.
 //
 // NOTE(review): concurrent acquires for different keys can still push the
 // pool above MaxSidecars while their spawns are in flight.
 func (p *Pool) Acquire(ctx context.Context, modelID string, flags []string) (*Sidecar, error) {
 	if _, err := validator.ValidateExtraArgs(flags); err != nil {
 		return nil, fmt.Errorf("validation: %w", err)
 	}
 	modelPath, ok := p.cfg.ModelDirMap[modelID]
 	if !ok {
 		return nil, fmt.Errorf("unknown model: %s", modelID)
 	}
 	hash := Hash(modelID, flags)
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	// reuse returns the healthy sidecar registered under hash after marking
 	// it most recently used, or nil if there is none. An unhealthy entry is
 	// removed. The caller must hold p.mu.
 	reuse := func() *Sidecar {
 		s, ok := p.sidecars[hash]
 		if !ok {
 			return nil
 		}
 		if !s.Healthy() {
 			p.removeLocked(hash)
 			return nil
 		}
 		if el, ok := p.lruIdx[hash]; ok {
 			p.lru.MoveToFront(el)
 		}
 		s.TouchLastUsed()
 		return s
 	}
 	if s := reuse(); s != nil {
 		return s, nil
 	}
 	if len(p.sidecars) >= p.cfg.MaxSidecars {
 		if err := p.evictLRULocked(); err != nil {
 			return nil, fmt.Errorf("eviction failed: %w", err)
 		}
 	}
 	port, err := p.ports.Allocate()
 	if err != nil {
 		return nil, fmt.Errorf("port allocation: %w", err)
 	}
 	// Spawning is slow; do not hold the pool lock while it runs.
 	p.mu.Unlock()
 	s, err := p.spawner.Spawn(ctx, p.cfg, modelID, modelPath, flags, port, hash)
 	p.mu.Lock()
 	if err != nil {
 		p.ports.Release(port)
 		return nil, fmt.Errorf("spawn: %w", err)
 	}
 	// Another caller may have registered the same key while the lock was
 	// released; keep the registered sidecar and discard the duplicate.
 	if winner := reuse(); winner != nil {
 		if kerr := p.spawner.Kill(s); kerr != nil {
 			slog.Error("kill failed for duplicate sidecar", "hash", hash, "err", kerr)
 		}
 		p.ports.Release(port)
 		return winner, nil
 	}
 	p.sidecars[hash] = s
 	el := p.lru.PushFront(hash)
 	p.lruIdx[hash] = el
 	return s, nil
 }
 // List returns a snapshot of every registered sidecar. The order of the
 // result follows map iteration and is therefore unspecified.
 func (p *Pool) List() []SidecarInfo {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	infos := make([]SidecarInfo, 0, len(p.sidecars))
 	for _, sc := range p.sidecars {
 		info := SidecarInfo{
 			Hash:      sc.Hash,
 			ModelID:   sc.ModelID,
 			Flags:     sc.Flags,
 			Port:      sc.Port,
 			Pid:       sc.Pid,
 			StartedAt: sc.StartedAt,
 			// LastUsed is stored as Unix nanoseconds.
 			LastUsed: time.Unix(0, sc.LastUsed.Load()),
 			Healthy:  sc.Healthy(),
 		}
 		infos = append(infos, info)
 	}
 	return infos
 }
 // Remove kills and unregisters the sidecar stored under hash. It returns
 // an error if no sidecar is registered under that hash.
 func (p *Pool) Remove(hash string) error {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	if _, found := p.sidecars[hash]; found {
 		return p.removeLocked(hash)
 	}
 	return fmt.Errorf("sidecar %s not found", hash)
 }
 // Shutdown kills every sidecar that is registered when it is called, in
 // parallel, and waits for the kills to finish or for ctx to be done,
 // whichever comes first. It returns ctx.Err() in the latter case.
 //
 // The sidecars are killed but not unregistered: p.sidecars, the LRU list
 // and the port allocator are left untouched.
 func (p *Pool) Shutdown(ctx context.Context) error {
 	// Snapshot the keys so the lock is not held while killing.
 	p.mu.Lock()
 	hashes := make([]string, 0, len(p.sidecars))
 	for h := range p.sidecars {
 		hashes = append(hashes, h)
 	}
 	p.mu.Unlock()
 	var wg sync.WaitGroup
 	for _, h := range hashes {
 		wg.Add(1)
 		go func(hash string) {
 			defer wg.Done()
 			// Look the sidecar up again: it may have been removed since the snapshot.
 			p.mu.Lock()
 			s, ok := p.sidecars[hash]
 			p.mu.Unlock()
 			if !ok {
 				return
 			}
 			if err := p.spawner.Kill(s); err != nil {
 				slog.Error("shutdown kill failed", "hash", hash, "err", err)
 			}
 		}(h)
 	}
 	// Turn the WaitGroup into a channel so it can be raced against ctx.
 	done := make(chan struct{})
 	go func() { wg.Wait(); close(done) }()
 	select {
 	case <-done:
 	case <-ctx.Done():
 		// The kill goroutines keep running; only the wait is abandoned.
 		return ctx.Err()
 	}
 	slog.Info("pool shutdown complete", "count", len(hashes))
 	return nil
 }
 // removeLocked unregisters the sidecar stored under hash, kills it and
 // returns its port to the allocator. An unknown hash is a no-op. A kill
 // failure is logged, not returned, so the result is always nil.
 // The caller must hold p.mu.
 func (p *Pool) removeLocked(hash string) error {
 	victim, found := p.sidecars[hash]
 	if !found {
 		return nil
 	}
 	// Drop the bookkeeping first so the entry is gone even if the kill fails.
 	delete(p.sidecars, hash)
 	if node, tracked := p.lruIdx[hash]; tracked {
 		p.lru.Remove(node)
 		delete(p.lruIdx, hash)
 	}
 	killErr := p.spawner.Kill(victim)
 	if killErr != nil {
 		slog.Error("kill failed during remove", "hash", hash, "err", killErr)
 	}
 	p.ports.Release(victim.Port)
 	return nil
 }
 // evictLRULocked removes the least recently used sidecar, i.e. the one at
 // the back of the LRU list. It fails if the list is empty.
 // The caller must hold p.mu.
 func (p *Pool) evictLRULocked() error {
 	oldest := p.lru.Back()
 	if oldest == nil {
 		return fmt.Errorf("pool full but LRU empty")
 	}
 	victim := oldest.Value.(string)
 	slog.Info("evicting LRU sidecar", "hash", victim)
 	return p.removeLocked(victim)
 }
--- a/internal/pool/pool_test.go
+++ b/internal/pool/pool_test.go
@@ -0,0 +1,151 @@
 package pool
 import (
 	"context"
 	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
 	"github.com/indifferentketchup/llama-sidecar/internal/config"
 )
 // fakeSpawner is a Spawner that starts no processes. It only counts the
 // Spawn and Kill calls it receives.
 type fakeSpawner struct {
 	spawnCount atomic.Int32
 	killCount  atomic.Int32
 }
 // Spawn records the call and returns a sidecar that is already healthy.
 func (f *fakeSpawner) Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error) {
 	f.spawnCount.Add(1)
 	sc := &Sidecar{
 		Hash:      hash,
 		Port:      port,
 		Pid:       99999,
 		ModelID:   modelID,
 		ModelPath: modelPath,
 		Flags:     flags,
 		StartedAt: time.Now(),
 		cancel:    func() {},
 		stderr:    newRingBuffer(8),
 	}
 	sc.healthy.Store(true)
 	sc.LastUsed.Store(time.Now().UnixNano())
 	return sc, nil
 }
 // Kill records the call and always succeeds.
 func (f *fakeSpawner) Kill(s *Sidecar) error {
 	f.killCount.Add(1)
 	return nil
 }
 // testConfig returns a fresh configuration with two fake models, ten
 // ports and room for two sidecars.
 func testConfig() *config.Config {
 	models := map[string]string{
 		"model-a": "/fake/model-a.gguf",
 		"model-b": "/fake/model-b.gguf",
 	}
 	cfg := &config.Config{
 		Bind:                 "127.0.0.1:0",
 		LlamaServerBin:       "/fake/llama-server",
 		ModelDirMap:          models,
 		PortRangeLo:          8500,
 		PortRangeHi:          8509,
 		MaxSidecars:          2,
 		BaseArgs:             []string{"-ngl", "999"},
 		HealthTimeoutSeconds: 60,
 	}
 	return cfg
 }
 // TestPool_AcquireSameKey checks that acquiring the same (model, flags)
 // twice reuses the sidecar instead of spawning a second one.
 func TestPool_AcquireSameKey(t *testing.T) {
 	spawner := &fakeSpawner{}
 	p := New(testConfig(), spawner)
 	ctx := context.Background()
 	// Each call gets its own slice, as independent requests would.
 	newFlags := func() []string { return []string{"--top-k", "20"} }
 	first, err := p.Acquire(ctx, "model-a", newFlags())
 	if err != nil {
 		t.Fatal(err)
 	}
 	second, err := p.Acquire(ctx, "model-a", newFlags())
 	if err != nil {
 		t.Fatal(err)
 	}
 	if first.Hash != second.Hash {
 		t.Fatalf("expected same sidecar, got different hashes: %s vs %s", first.Hash, second.Hash)
 	}
 	if spawns := spawner.spawnCount.Load(); spawns != 1 {
 		t.Fatalf("expected 1 spawn, got %d", spawns)
 	}
 }
 // TestPool_EvictLRU checks that, with room for one sidecar, acquiring a
 // second key evicts the first.
 func TestPool_EvictLRU(t *testing.T) {
 	cfg := testConfig()
 	cfg.MaxSidecars = 1
 	spawner := &fakeSpawner{}
 	p := New(cfg, spawner)
 	ctx := context.Background()
 	if _, err := p.Acquire(ctx, "model-a", []string{"--top-k", "20"}); err != nil {
 		t.Fatal(err)
 	}
 	if _, err := p.Acquire(ctx, "model-b", []string{"--top-k", "40"}); err != nil {
 		t.Fatal(err)
 	}
 	if spawns := spawner.spawnCount.Load(); spawns != 2 {
 		t.Fatalf("expected 2 spawns, got %d", spawns)
 	}
 	if kills := spawner.killCount.Load(); kills != 1 {
 		t.Fatalf("expected 1 kill (eviction), got %d", kills)
 	}
 	remaining := p.List()
 	if len(remaining) != 1 {
 		t.Fatalf("expected 1 sidecar, got %d", len(remaining))
 	}
 	if got := remaining[0].ModelID; got != "model-b" {
 		t.Fatalf("expected model-b, got %s", got)
 	}
 }
 func TestPool_ValidatorReject(t *testing.T) {
 	fs := &fakeSpawner{}
 	p := New(testConfig(), fs)
 	_, err := p.Acquire(context.Background(), "model-a", []string{"--model", "evil.gguf"})
 	if err == nil {
 		t.Fatal("expected validation error")
 	}
 }
 func TestPool_UnknownModel(t *testing.T) {
 	fs := &fakeSpawner{}
 	p := New(testConfig(), fs)
 	_, err := p.Acquire(context.Background(), "nonexistent", nil)
 	if err == nil {
 		t.Fatal("expected unknown model error")
 	}
 }
 // TestPool_ConcurrentAcquire has ten goroutines acquire the same key fifty
 // times each and checks that exactly one sidecar ends up registered.
 func TestPool_ConcurrentAcquire(t *testing.T) {
 	const (
 		workers = 10
 		rounds  = 50
 	)
 	cfg := testConfig()
 	cfg.MaxSidecars = 10
 	cfg.PortRangeHi = 8599
 	p := New(cfg, &fakeSpawner{})
 	ctx := context.Background()
 	var wg sync.WaitGroup
 	wg.Add(workers)
 	for w := 0; w < workers; w++ {
 		go func() {
 			defer wg.Done()
 			for r := 0; r < rounds; r++ {
 				// Errors are ignored here; only the final pool state is checked.
 				_, _ = p.Acquire(ctx, "model-a", []string{"--top-k", "20"})
 			}
 		}()
 	}
 	wg.Wait()
 	if registered := p.List(); len(registered) != 1 {
 		t.Fatalf("expected 1 sidecar (same key), got %d", len(registered))
 	}
 }
--- a/internal/pool/ports.go
+++ b/internal/pool/ports.go
@@ -0,0 +1,28 @@
 package pool
 import "fmt"
 // PortAllocator hands out ports from a fixed inclusive range. The free
 // ports live in a buffered channel sized to the range, so Allocate and
 // Release are safe for concurrent use and never block.
 type PortAllocator struct {
 	ports chan int
 }
 // NewPortAllocator returns an allocator holding every port in [lo, hi].
 // An inverted range (hi < lo) yields an allocator with no ports instead of
 // panicking on a negative channel size.
 func NewPortAllocator(lo, hi int) *PortAllocator {
 	size := hi - lo + 1
 	if size < 0 {
 		size = 0
 	}
 	ch := make(chan int, size)
 	for p := lo; p <= hi; p++ {
 		ch <- p
 	}
 	return &PortAllocator{ports: ch}
 }
 // Allocate returns a free port, or an error when none is left.
 func (pa *PortAllocator) Allocate() (int, error) {
 	select {
 	case p := <-pa.ports:
 		return p, nil
 	default:
 		return 0, fmt.Errorf("port allocator exhausted")
 	}
 }
 // Release returns port to the allocator. If the allocator already holds
 // its full range (a port was released twice, for example) the port is
 // dropped: blocking here would stall the caller forever, and callers may
 // be holding a lock.
 func (pa *PortAllocator) Release(port int) {
 	select {
 	case pa.ports <- port:
 	default:
 	}
 }
--- a/internal/pool/ports_test.go
+++ b/internal/pool/ports_test.go
@@ -0,0 +1,74 @@
 package pool
 import (
 	"sync"
 	"testing"
 )
 // TestPortAllocator_AllocateRelease drains a three-port allocator, checks
 // that it reports exhaustion, and checks that a released port is handed
 // out again.
 func TestPortAllocator_AllocateRelease(t *testing.T) {
 	pa := NewPortAllocator(8500, 8502)
 	var got [3]int
 	for i := range got {
 		port, err := pa.Allocate()
 		if err != nil {
 			t.Fatal(err)
 		}
 		got[i] = port
 	}
 	p1, p2, p3 := got[0], got[1], got[2]
 	// All three ports should be distinct
 	if p1 == p2 || p2 == p3 || p1 == p3 {
 		t.Fatalf("expected distinct ports: %d, %d, %d", p1, p2, p3)
 	}
 	// Exhausted
 	if _, err := pa.Allocate(); err == nil {
 		t.Fatal("expected error when exhausted")
 	}
 	// Release and re-allocate
 	pa.Release(p2)
 	again, err := pa.Allocate()
 	if err != nil {
 		t.Fatal(err)
 	}
 	if again != p2 {
 		t.Fatalf("expected released port %d, got %d", p2, again)
 	}
 }
 // TestPortAllocator_Concurrent allocates all 100 ports from 100 goroutines
 // and checks that every port is handed out exactly once.
 func TestPortAllocator_Concurrent(t *testing.T) {
 	const total = 100
 	pa := NewPortAllocator(8500, 8599)
 	allocated := make(chan int, total)
 	var wg sync.WaitGroup
 	wg.Add(total)
 	for n := 0; n < total; n++ {
 		go func() {
 			defer wg.Done()
 			if port, err := pa.Allocate(); err == nil {
 				allocated <- port
 			}
 		}()
 	}
 	wg.Wait()
 	close(allocated)
 	seen := make(map[int]bool)
 	for port := range allocated {
 		if seen[port] {
 			t.Fatalf("duplicate port %d", port)
 		}
 		seen[port] = true
 	}
 	if len(seen) != total {
 		t.Fatalf("expected 100 ports, got %d", len(seen))
 	}
 }
--- a/internal/pool/sidecar.go
+++ b/internal/pool/sidecar.go
@@ -0,0 +1,313 @@
 package pool
 import (
 	"bytes"
 	"context"
 	"fmt"
 	"io"
 	"log/slog"
 	"net/http"
 	"os"
 	"os/exec"
 	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
 	"github.com/indifferentketchup/llama-sidecar/internal/config"
 	"github.com/indifferentketchup/llama-sidecar/internal/validator"
 )
 // Sidecar describes one llama-server child process started by a Spawner.
 type Sidecar struct {
 	// Identity and launch parameters, stored as they were passed to Spawn.
 	Hash      string
 	ModelID   string
 	ModelPath string
 	Flags     []string
 	Port      int
 	// Pid is the child's process ID, captured right after the process started.
 	Pid       int
 	StartedAt time.Time
 	// LastUsed is a UnixNano timestamp; see TouchLastUsed.
 	LastUsed  atomic.Int64
 	// healthy is set once /health answers 200 and cleared when the child
 	// exits or the health monitor gives up.
 	healthy   atomic.Bool
 	cmd       *exec.Cmd
 	// cancel cancels the context the child command was created with.
 	cancel    context.CancelFunc
 	// done receives the result of cmd.Wait and is then closed.
 	done      chan error
 	// stderr retains the most recent stderr lines of the child.
 	stderr    *ringBuffer
 	// stopMon stops the background health monitor; it stays nil until the
 	// sidecar has passed its first health check.
 	stopMon   context.CancelFunc
 	// stdinFile is the os.DevNull handle used as the child's stdin; stdoutR
 	// and stdoutFile are the read and write ends of the child's stdout pipe.
 	stdinFile  *os.File
 	stdoutR    *os.File
 	stdoutFile *os.File
 }
 // Healthy reports the current value of the sidecar's health flag.
 func (s *Sidecar) Healthy() bool {
 	ok := s.healthy.Load()
 	return ok
 }
 // TouchLastUsed records the current time (UnixNano) as the last-use stamp.
 func (s *Sidecar) TouchLastUsed() {
 	now := time.Now()
 	s.LastUsed.Store(now.UnixNano())
 }
 // LastStderr returns the retained tail of the child's stderr as
 // newline-joined lines. A Sidecar without a stderr buffer (for example one
 // not created by RealSpawner.Spawn) yields "" instead of a nil dereference.
 func (s *Sidecar) LastStderr() string {
 	if s.stderr == nil {
 		return ""
 	}
 	return s.stderr.String()
 }
 // Spawner abstracts sidecar creation for testing.
 //
 // Spawn starts a sidecar for the given model, flags, port and hash and
 // returns its handle; Kill stops a sidecar previously returned by Spawn.
 type Spawner interface {
 	Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error)
 	Kill(s *Sidecar) error
 }
 // RealSpawner is the Spawner implementation that launches actual
 // llama-server child processes. It carries no state.
 type RealSpawner struct{}
 // Spawn starts a llama-server child for modelPath on port and blocks until
 // its /health endpoint answers 200, the child exits, or
 // cfg.HealthTimeoutSeconds elapses.
 //
 // The child and its health monitor are deliberately NOT tied to ctx: they
 // run on a background-derived context that only Kill cancels, so a sidecar
 // outlives the call (typically an HTTP request) that triggered its creation.
 //
 // On success the returned Sidecar is marked healthy and a health monitor
 // goroutine is running. On any failure the child is stopped, all file
 // handles opened here are closed, and a nil Sidecar is returned.
 func (rs *RealSpawner) Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error) {
 	// ctx is intentionally unused; see the doc comment.
 	_ = ctx
 	args := buildArgs(cfg.BaseArgs, modelPath, port, flags)
 	childCtx, cancel := context.WithCancel(context.Background())
 	cmd := exec.CommandContext(childCtx, cfg.LlamaServerBin, args...)
 	setPlatformAttrs(cmd)
 	devNull, err := os.Open(os.DevNull)
 	if err != nil {
 		cancel()
 		return nil, fmt.Errorf("open devnull: %w", err)
 	}
 	cmd.Stdin = devNull
 	stderr := newRingBuffer(64)
 	// Only the first 8 characters of the hash go into the log prefix; guard
 	// the slice so a shorter hash cannot panic.
 	shortHash := hash
 	if len(shortHash) > 8 {
 		shortHash = shortHash[:8]
 	}
 	prefix := fmt.Sprintf("[sidecar:%s:%d] ", shortHash, port)
 	cmd.Stderr = io.MultiWriter(stderr, &prefixWriter{prefix: prefix})
 	stdoutR, stdoutW, err := os.Pipe()
 	if err != nil {
 		cancel()
 		devNull.Close()
 		return nil, fmt.Errorf("stdout pipe: %w", err)
 	}
 	// closeFiles releases every handle opened above. Closing the pipe also
 	// ends the drain goroutine started below.
 	closeFiles := func() {
 		devNull.Close()
 		stdoutW.Close()
 		stdoutR.Close()
 	}
 	// Drain the child's stdout so it never blocks on a full pipe.
 	go io.Copy(io.Discard, stdoutR)
 	cmd.Stdout = stdoutW
 	slog.Info("spawning sidecar", "hash", hash, "model", modelID, "port", port, "args", strings.Join(args, " "))
 	if err := cmd.Start(); err != nil {
 		cancel()
 		closeFiles()
 		return nil, fmt.Errorf("spawn failed: %w", err)
 	}
 	s := &Sidecar{
 		Hash:       hash,
 		ModelID:    modelID,
 		ModelPath:  modelPath,
 		Flags:      flags,
 		Port:       port,
 		Pid:        cmd.Process.Pid,
 		StartedAt:  time.Now(),
 		cmd:        cmd,
 		cancel:     cancel,
 		done:       make(chan error, 1),
 		stderr:     stderr,
 		stdinFile:  devNull,
 		stdoutR:    stdoutR,
 		stdoutFile: stdoutW,
 	}
 	s.LastUsed.Store(time.Now().UnixNano())
 	// Reaper: waits for the child, logs its exit and publishes the result on
 	// s.done, which is then closed so any number of readers can observe it.
 	go func() {
 		err := cmd.Wait()
 		s.healthy.Store(false)
 		exitCode := -1
 		if cmd.ProcessState != nil {
 			exitCode = cmd.ProcessState.ExitCode()
 		}
 		slog.Error("sidecar child exited",
 			"hash", hash,
 			"port", port,
 			"pid", s.Pid,
 			"exit_code", exitCode,
 			"wait_err", fmt.Sprintf("%v", err),
 			"uptime", time.Since(s.StartedAt).Round(time.Millisecond),
 			"stderr_tail", stderr.String(),
 		)
 		s.done <- err
 		close(s.done)
 	}()
 	// Wait for health. The client timeout keeps a hung endpoint from
 	// stalling the loop far past the deadline.
 	healthURL := fmt.Sprintf("http://127.0.0.1:%d/health", port)
 	client := &http.Client{Timeout: 5 * time.Second}
 	deadline := time.Now().Add(time.Duration(cfg.HealthTimeoutSeconds) * time.Second)
 	for time.Now().Before(deadline) {
 		resp, err := client.Get(healthURL)
 		if err == nil {
 			resp.Body.Close()
 			if resp.StatusCode == http.StatusOK {
 				s.healthy.Store(true)
 				slog.Info("sidecar healthy", "hash", hash, "port", port, "elapsed", time.Since(s.StartedAt).Round(time.Millisecond))
 				// Tie the monitor to the child's lifetime, not to the
 				// caller's context, so it keeps running after Spawn returns.
 				monCtx, monCancel := context.WithCancel(childCtx)
 				s.stopMon = monCancel
 				go s.healthMonitor(monCtx, cfg.HealthIntervalSeconds)
 				return s, nil
 			}
 		}
 		select {
 		case <-s.done:
 			// The child died before becoming healthy; fail fast instead of
 			// polling a dead port until the deadline.
 			cancel()
 			closeFiles()
 			return nil, fmt.Errorf("sidecar process exited during health check")
 		case <-time.After(500 * time.Millisecond):
 		}
 	}
 	_ = rs.Kill(s)
 	return nil, fmt.Errorf("health check timed out after %ds, last stderr: %s", cfg.HealthTimeoutSeconds, s.stderr.LastLine())
 }
 // Kill stops the sidecar's health monitor (if any), cancels the child's
 // context and waits for the process to be reaped. If the child has not
 // exited after five seconds it is killed explicitly and waited for again.
 // The stdin and stdout handles are closed afterwards. Kill always returns nil.
 func (rs *RealSpawner) Kill(s *Sidecar) error {
 	if stop := s.stopMon; stop != nil {
 		stop()
 	}
 	s.cancel()
 	grace := time.NewTimer(5 * time.Second)
 	defer grace.Stop()
 	select {
 	case <-s.done:
 	case <-grace.C:
 		if proc := s.cmd.Process; proc != nil {
 			_ = proc.Kill()
 		}
 		<-s.done
 	}
 	for _, f := range []*os.File{s.stdinFile, s.stdoutFile, s.stdoutR} {
 		if f != nil {
 			f.Close()
 		}
 	}
 	slog.Info("sidecar killed", "hash", s.Hash, "port", s.Port)
 	return nil
 }
 // healthMonitor polls the sidecar's /health endpoint every intervalSec
 // seconds until ctx is cancelled. After three consecutive failed probes
 // (transport error or non-200 status) it clears the healthy flag and exits;
 // any successful probe resets the failure count.
 //
 // A non-positive intervalSec disables monitoring: time.NewTicker would
 // panic on it, which would take the whole process down from this goroutine.
 func (s *Sidecar) healthMonitor(ctx context.Context, intervalSec int) {
 	if intervalSec <= 0 {
 		slog.Warn("sidecar health monitor disabled: non-positive interval", "hash", s.Hash, "port", s.Port, "interval_seconds", intervalSec)
 		return
 	}
 	const maxFailures = 3
 	ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
 	defer ticker.Stop()
 	failures := 0
 	url := fmt.Sprintf("http://127.0.0.1:%d/health", s.Port)
 	client := &http.Client{Timeout: 5 * time.Second}
 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
 			resp, err := client.Get(url)
 			ok := err == nil && resp.StatusCode == http.StatusOK
 			if resp != nil {
 				resp.Body.Close()
 			}
 			if ok {
 				failures = 0
 				continue
 			}
 			failures++
 			if failures >= maxFailures {
 				slog.Warn("sidecar unhealthy, marking for eviction", "hash", s.Hash, "port", s.Port)
 				s.healthy.Store(false)
 				return
 			}
 		}
 	}
 }
 // buildArgs assembles the child's argument list: the base args that the
 // user did not override, then --model and --port, then the user's flags.
 func buildArgs(baseArgs []string, modelPath string, port int, userFlags []string) []string {
 	kept := dedupFlags(baseArgs, userFlags)
 	managed := []string{"--model", modelPath, "--port", strconv.Itoa(port)}
 	out := make([]string, 0, len(kept)+len(userFlags)+4)
 	out = append(out, kept...)
 	out = append(out, managed...)
 	return append(out, userFlags...)
 }
 // dedupFlags returns autoArgs without any flag whose name also appears in
 // userArgs, so that the user's value is the only one on the command line.
 // A dropped flag takes its value with it: either the "=value" part of the
 // same token or the following token when that token is not itself a flag.
 func dedupFlags(autoArgs, userArgs []string) []string {
 	overridden := make(map[string]bool)
 	for _, arg := range userArgs {
 		if flag := validator.FlagName(arg); flag != "" {
 			overridden[flag] = true
 		}
 	}
 	kept := make([]string, 0, len(autoArgs))
 	for idx := 0; idx < len(autoArgs); {
 		cur := autoArgs[idx]
 		flag := validator.FlagName(cur)
 		if flag == "" || !overridden[flag] {
 			kept = append(kept, cur)
 			idx++
 			continue
 		}
 		// Overridden flag: skip it, plus a separate value token if present.
 		skip := 1
 		if !strings.Contains(cur, "=") && idx+1 < len(autoArgs) && validator.FlagName(autoArgs[idx+1]) == "" {
 			skip = 2
 		}
 		idx += skip
 	}
 	return kept
 }
 // ringBuffer is an io.Writer that retains the last max complete lines
 // written to it. It is safe for concurrent use.
 //
 // Input arrives in arbitrary chunks, so a line may be split across Write
 // calls. The unterminated tail of the last chunk is kept in partial and
 // joined with the next chunk instead of being stored as a separate line.
 type ringBuffer struct {
 	mu      sync.Mutex
 	lines   []string
 	max     int
 	partial string
 }
 // newRingBuffer returns a buffer holding up to max lines. A max below 1 is
 // raised to 1 so that eviction never slices an empty buffer.
 func newRingBuffer(max int) *ringBuffer {
 	if max < 1 {
 		max = 1
 	}
 	return &ringBuffer{lines: make([]string, 0, max), max: max}
 }
 // Write splits p into lines and stores the non-empty ones, evicting the
 // oldest line once max is reached. It always reports len(p) bytes written.
 func (rb *ringBuffer) Write(p []byte) (int, error) {
 	// Bound the carried-over fragment so newline-free output cannot grow
 	// the buffer without limit.
 	const maxPartialBytes = 64 << 10
 	rb.mu.Lock()
 	defer rb.mu.Unlock()
 	data := rb.partial + string(p)
 	for {
 		line, rest, found := strings.Cut(data, "\n")
 		if !found {
 			break
 		}
 		rb.push(line)
 		data = rest
 	}
 	if len(data) > maxPartialBytes {
 		rb.push(data)
 		data = ""
 	}
 	rb.partial = data
 	return len(p), nil
 }
 // push appends one line, dropping empty lines and evicting the oldest
 // entry when the buffer is full. The caller must hold rb.mu.
 func (rb *ringBuffer) push(line string) {
 	line = strings.TrimRight(line, "\r\n")
 	if line == "" {
 		return
 	}
 	if len(rb.lines) >= rb.max {
 		// Shift in place so the backing array stays at its fixed size.
 		copy(rb.lines, rb.lines[1:])
 		rb.lines = rb.lines[:len(rb.lines)-1]
 	}
 	rb.lines = append(rb.lines, line)
 }
 // pending returns the not-yet-terminated trailing fragment, if any. The
 // caller must hold rb.mu.
 func (rb *ringBuffer) pending() string {
 	return strings.TrimRight(rb.partial, "\r\n")
 }
 // String returns the retained lines joined by "\n", followed by the
 // pending unterminated fragment when there is one.
 func (rb *ringBuffer) String() string {
 	rb.mu.Lock()
 	defer rb.mu.Unlock()
 	joined := strings.Join(rb.lines, "\n")
 	tail := rb.pending()
 	switch {
 	case tail == "":
 		return joined
 	case joined == "":
 		return tail
 	default:
 		return joined + "\n" + tail
 	}
 }
 // LastLine returns the most recent output: the pending fragment if there
 // is one, otherwise the newest complete line, or "" when nothing was written.
 func (rb *ringBuffer) LastLine() string {
 	rb.mu.Lock()
 	defer rb.mu.Unlock()
 	if tail := rb.pending(); tail != "" {
 		return tail
 	}
 	if len(rb.lines) == 0 {
 		return ""
 	}
 	return rb.lines[len(rb.lines)-1]
 }
 // prefixWriter is an io.Writer that copies complete lines to os.Stderr,
 // each preceded by prefix. An unterminated tail stays in buf until a later
 // Write completes the line.
 type prefixWriter struct {
 	prefix string
 	buf    bytes.Buffer
 }
 // Write buffers p, emits every complete line now available and reports
 // len(p) bytes written.
 func (pw *prefixWriter) Write(p []byte) (int, error) {
 	pw.buf.Write(p)
 	for {
 		end := bytes.IndexByte(pw.buf.Bytes(), '\n')
 		if end < 0 {
 			break
 		}
 		line := string(pw.buf.Next(end + 1))
 		fmt.Fprint(os.Stderr, pw.prefix+line)
 	}
 	return len(p), nil
 }
--- a/internal/pool/sidecar_test.go
+++ b/internal/pool/sidecar_test.go
@@ -0,0 +1,96 @@
 package pool
 import (
 	"reflect"
 	"testing"
 )
 // TestBuildArgs_PreservesNonOverlapping checks that base args the user did
 // not override survive alongside the user flags and the injected
 // --model/--port pair.
 func TestBuildArgs_PreservesNonOverlapping(t *testing.T) {
 	base := []string{"-ngl", "999", "-c", "32768", "--flash-attn", "on", "--no-mmap"}
 	user := []string{"--top-k", "20"}
 	got := buildArgs(base, "/model.gguf", 8500, user)
 	checks := []struct {
 		seq []string
 		msg string
 	}{
 		// -c 32768 must survive (user didn't supply -c)
 		{[]string{"-c", "32768"}, "-c 32768 missing from args: %v"},
 		// --top-k 20 must be present (user flag)
 		{[]string{"--top-k", "20"}, "--top-k 20 missing from args: %v"},
 		// --model and --port injected
 		{[]string{"--model", "/model.gguf"}, "--model missing: %v"},
 		{[]string{"--port", "8500"}, "--port missing: %v"},
 	}
 	for _, c := range checks {
 		if !containsSeq(got, c.seq...) {
 			t.Errorf(c.msg, got)
 		}
 	}
 }
 // TestBuildArgs_UserOverridesBase checks that a user-supplied -c replaces
 // the base -c so that exactly one -c remains.
 func TestBuildArgs_UserOverridesBase(t *testing.T) {
 	base := []string{"-ngl", "999", "-c", "32768"}
 	user := []string{"-c", "131072"}
 	got := buildArgs(base, "/model.gguf", 8500, user)
 	// base -c should be dropped, user -c should be present
 	seen := 0
 	for i := 0; i+1 < len(got); i++ {
 		if got[i] != "-c" {
 			continue
 		}
 		seen++
 		if got[i+1] == "32768" {
 			t.Errorf("base -c 32768 should have been deduped: %v", got)
 		}
 	}
 	if seen != 1 {
 		t.Errorf("expected exactly 1 -c flag, got %d in %v", seen, got)
 	}
 }
 // TestBuildArgs_NoUserFlags checks that base args pass through untouched
 // when the user supplies no flags at all.
 func TestBuildArgs_NoUserFlags(t *testing.T) {
 	var noFlags []string
 	got := buildArgs([]string{"-ngl", "999", "-c", "32768", "--no-mmap"}, "/model.gguf", 8500, noFlags)
 	if found := containsSeq(got, "-c", "32768"); !found {
 		t.Errorf("-c 32768 missing when no user flags: %v", got)
 	}
 	if found := containsSeq(got, "--no-mmap"); !found {
 		t.Errorf("--no-mmap missing: %v", got)
 	}
 }
 // TestDedupFlags_Mixed checks that only the overridden flag and its value
 // are removed from the automatic args.
 func TestDedupFlags_Mixed(t *testing.T) {
 	got := dedupFlags(
 		[]string{"--top-k", "40", "-c", "32768", "--no-mmap"},
 		[]string{"--top-k", "20"},
 	)
 	want := []string{"-c", "32768", "--no-mmap"}
 	if same := reflect.DeepEqual(got, want); !same {
 		t.Errorf("dedupFlags = %v, want %v", got, want)
 	}
 }
 // TestDedupFlags_EqualsForm checks that an automatic --flag=value token is
 // dropped when the user supplies the same flag in two-token form.
 func TestDedupFlags_EqualsForm(t *testing.T) {
 	got := dedupFlags(
 		[]string{"--ctx-size=4096", "--no-mmap"},
 		[]string{"--ctx-size", "8192"},
 	)
 	want := []string{"--no-mmap"}
 	if same := reflect.DeepEqual(got, want); !same {
 		t.Errorf("dedupFlags = %v, want %v", got, want)
 	}
 }
 // containsSeq reports whether seq occurs as a contiguous run inside args.
 // An empty seq is always found.
 func containsSeq(args []string, seq ...string) bool {
 	last := len(args) - len(seq)
 	for start := 0; start <= last; start++ {
 		matched := 0
 		for matched < len(seq) && args[start+matched] == seq[matched] {
 			matched++
 		}
 		if matched == len(seq) {
 			return true
 		}
 	}
 	return false
 }
--- a/internal/pool/sidecar_unix.go
+++ b/internal/pool/sidecar_unix.go
@@ -0,0 +1,7 @@
 //go:build !windows
 package pool
 import "os/exec"
 // setPlatformAttrs is the non-Windows variant: it leaves cmd untouched.
 func setPlatformAttrs(cmd *exec.Cmd) {
 	_ = cmd
 }
--- a/internal/pool/sidecar_windows.go
+++ b/internal/pool/sidecar_windows.go
@@ -0,0 +1,15 @@
 //go:build windows
 package pool
 import (
 	"os/exec"
 	"syscall"
 )
 func setPlatformAttrs(cmd *exec.Cmd) {
 	cmd.SysProcAttr = &syscall.SysProcAttr{
 		HideWindow:    true,
 		CreationFlags: 0x00000008 | 0x00000200, // DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
 	}
 }
--- a/internal/server/admin.go
+++ b/internal/server/admin.go
@@ -0,0 +1,42 @@
 package server
 import (
 	"net/http"
 	"time"
 	"github.com/indifferentketchup/llama-sidecar/internal/config"
 	"github.com/indifferentketchup/llama-sidecar/internal/pool"
 )
 // healthHandler returns a handler that reports daemon status as JSON: a
 // fixed "ok" status, the number of sidecars listed by the pool, the
 // configured maximum, and the uptime in whole seconds since startedAt.
 func healthHandler(p *pool.Pool, cfg *config.Config, startedAt time.Time) http.HandlerFunc {
 	return func(w http.ResponseWriter, _ *http.Request) {
 		active := len(p.List())
 		payload := map[string]any{
 			"status":         "ok",
 			"sidecars":       active,
 			"max":            cfg.MaxSidecars,
 			"uptime_seconds": int(time.Since(startedAt).Seconds()),
 		}
 		writeJSON(w, http.StatusOK, payload)
 	}
 }
 // listSidecarsHandler returns a handler that writes the pool's sidecar
 // listing as JSON with status 200.
 func listSidecarsHandler(p *pool.Pool) http.HandlerFunc {
 	return func(w http.ResponseWriter, _ *http.Request) {
 		all := p.List()
 		writeJSON(w, http.StatusOK, all)
 	}
 }
 // deleteSidecarHandler returns a handler that removes the sidecar named by
 // the {hash} path value. It answers 400 when the hash is empty, 404 with
 // the pool's error text when removal fails, and 200 otherwise.
 func deleteSidecarHandler(p *pool.Pool) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		hash := r.PathValue("hash")
 		if hash == "" {
 			writeJSON(w, http.StatusBadRequest, map[string]string{"error": "hash required"})
 			return
 		}
 		err := p.Remove(hash)
 		if err != nil {
 			writeJSON(w, http.StatusNotFound, map[string]string{"error": err.Error()})
 			return
 		}
 		writeJSON(w, http.StatusOK, map[string]string{"status": "removed"})
 	}
 }
--- a/internal/server/proxy.go
+++ b/internal/server/proxy.go
@@ -0,0 +1,111 @@
 package server
 import (
 	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
 	"net/http"
 	"net/http/httputil"
 	"net/url"
 	"strings"
 	"github.com/indifferentketchup/llama-sidecar/internal/pool"
 )
 // shellUnsafe deletes shell metacharacters (backtick, $, |, ;, &) and
 // newlines. parseFlags uses it purely as a detector: input that changes
 // under this replacer contains at least one of those characters.
 var shellUnsafe = strings.NewReplacer(
 	"`", "",
 	"$", "",
 	"|", "",
 	";", "",
 	"&", "",
 	"\n", "",
 )
 // parseFlags splits a raw flag string into whitespace-separated tokens.
 // It rejects input containing any character removed by shellUnsafe.
 func parseFlags(raw string) ([]string, error) {
 	if stripped := shellUnsafe.Replace(raw); stripped != raw {
 		return nil, fmt.Errorf("flags contain unsafe characters")
 	}
 	trimmed := strings.TrimSpace(raw)
 	return splitArgs(trimmed), nil
 }
 // splitArgs splits s on whitespace; the empty string yields a nil slice.
 func splitArgs(s string) []string {
 	if len(s) == 0 {
 		return nil
 	}
 	fields := strings.Fields(s)
 	return fields
 }
 // proxyHandler returns the handler behind the completion endpoints. For
 // each request it:
 //
 //  1. parses optional per-agent flags from the X-Agent-Flags header;
 //  2. resolves the model from X-Model-Id, falling back to the "model" field
 //     of the JSON body;
 //  3. acquires a matching sidecar from the pool;
 //  4. reverse-proxies the request to that sidecar on 127.0.0.1.
 //
 // Pool errors are mapped to HTTP statuses by their message: "validation:"
 // to 400, "unknown model:" to 404, "port allocation:" to 503, anything else
 // to 500. Upstream failures are answered with 502 and diagnostic details.
 func proxyHandler(p *pool.Pool) http.HandlerFunc {
 	// maxInspectBytes bounds how much of the body is buffered when the model
 	// has to be read from it.
 	const maxInspectBytes = 1 << 20
 	return func(w http.ResponseWriter, r *http.Request) {
 		var flags []string
 		if flagsRaw := r.Header.Get("X-Agent-Flags"); flagsRaw != "" {
 			parsed, err := parseFlags(flagsRaw)
 			if err != nil {
 				writeJSON(w, http.StatusBadRequest, map[string]string{
 					"error": err.Error(),
 				})
 				return
 			}
 			flags = parsed
 		}
 		modelID := r.Header.Get("X-Model-Id")
 		if modelID == "" {
 			// Read one byte past the limit so that an oversized body is
 			// detected instead of being silently truncated, which would
 			// surface as a misleading "model not specified" error.
 			body, err := io.ReadAll(io.LimitReader(r.Body, maxInspectBytes+1))
 			if err != nil {
 				writeJSON(w, http.StatusBadRequest, map[string]string{"error": "failed to read body"})
 				return
 			}
 			if len(body) > maxInspectBytes {
 				writeJSON(w, http.StatusRequestEntityTooLarge, map[string]string{
 					"error": "request body too large to inspect for model; set the X-Model-Id header",
 				})
 				return
 			}
 			var req struct {
 				Model string `json:"model"`
 			}
 			if err := json.Unmarshal(body, &req); err == nil && req.Model != "" {
 				modelID = req.Model
 			}
 			// The body was consumed above; restore it for the upstream call.
 			r.Body = io.NopCloser(strings.NewReader(string(body)))
 			r.ContentLength = int64(len(body))
 		}
 		if modelID == "" {
 			writeJSON(w, http.StatusBadRequest, map[string]string{"error": "model not specified (X-Model-Id header or body.model)"})
 			return
 		}
 		sidecar, err := p.Acquire(r.Context(), modelID, flags)
 		if err != nil {
 			errMsg := err.Error()
 			status := http.StatusInternalServerError
 			switch {
 			case strings.Contains(errMsg, "validation:"):
 				status = http.StatusBadRequest
 			case strings.Contains(errMsg, "unknown model:"):
 				status = http.StatusNotFound
 			case strings.Contains(errMsg, "port allocation:"):
 				status = http.StatusServiceUnavailable
 			}
 			writeJSON(w, status, map[string]string{"error": errMsg})
 			return
 		}
 		target := &url.URL{
 			Scheme: "http",
 			Host:   fmt.Sprintf("127.0.0.1:%d", sidecar.Port),
 		}
 		proxy := httputil.NewSingleHostReverseProxy(target)
 		proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, err error) {
 			slog.Error("upstream error", "hash", sidecar.Hash, "port", sidecar.Port, "err", err)
 			writeJSON(rw, http.StatusBadGateway, map[string]any{
 				"error":        "upstream unavailable",
 				"error_detail": err.Error(),
 				"sidecar_hash": sidecar.Hash,
 				"sidecar_port": sidecar.Port,
 				"last_stderr":  sidecar.LastStderr(),
 			})
 		}
 		sidecar.TouchLastUsed()
 		proxy.ServeHTTP(w, r)
 	}
 }
 // writeJSON sends v as a JSON response with the given status code.
 // The status line is already on the wire by the time encoding runs, so an
 // encoding or write failure cannot be reported to the client; it is logged
 // instead of being silently dropped.
 func writeJSON(w http.ResponseWriter, status int, v any) {
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(status)
 	if err := json.NewEncoder(w).Encode(v); err != nil {
 		slog.Error("write JSON response", "status", status, "err", err)
 	}
 }
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -0,0 +1,56 @@
 package server
 import (
 	"log/slog"
 	"net/http"
 	"time"
 	"github.com/indifferentketchup/llama-sidecar/internal/config"
 	"github.com/indifferentketchup/llama-sidecar/internal/pool"
 )
 // New builds the daemon's HTTP server: the health and sidecar admin
 // endpoints plus the two completion endpoints, all wrapped in the request
 // logger and bound to cfg.Bind.
 func New(cfg *config.Config, p *pool.Pool, startedAt time.Time) *http.Server {
 	mux := http.NewServeMux()
 	mux.HandleFunc("GET /health", healthHandler(p, cfg, startedAt))
 	mux.HandleFunc("GET /sidecars", listSidecarsHandler(p))
 	mux.HandleFunc("DELETE /sidecars/{hash}", deleteSidecarHandler(p))
 	mux.HandleFunc("POST /v1/chat/completions", proxyHandler(p))
 	mux.HandleFunc("POST /v1/completions", proxyHandler(p))
 	handler := requestLogger(mux)
 	return &http.Server{
 		Addr:    cfg.Bind,
 		Handler: handler,
 		// Bound the time a client may take to send request headers so idle
 		// connections cannot be held open indefinitely. ReadTimeout and
 		// WriteTimeout are left unset because they would also cap the
 		// duration of proxied completion responses.
 		ReadHeaderTimeout: 10 * time.Second,
 	}
 }
 // requestLogger wraps next and logs method, path, response status and
 // duration in milliseconds once each request has been served.
 func requestLogger(next http.Handler) http.Handler {
 	logged := func(w http.ResponseWriter, r *http.Request) {
 		began := time.Now()
 		// Default to 200: that is the status when a handler never calls
 		// WriteHeader explicitly.
 		rec := &statusRecorder{ResponseWriter: w, status: 200}
 		next.ServeHTTP(rec, r)
 		slog.Info("request",
 			"method", r.Method,
 			"path", r.URL.Path,
 			"status", rec.status,
 			"duration_ms", time.Since(began).Milliseconds(),
 		)
 	}
 	return http.HandlerFunc(logged)
 }
 // statusRecorder wraps an http.ResponseWriter and remembers the status
 // code passed to WriteHeader.
 type statusRecorder struct {
 	http.ResponseWriter
 	status int
 }
 // WriteHeader records code and forwards it to the wrapped writer.
 func (sr *statusRecorder) WriteHeader(code int) {
 	sr.status = code
 	sr.ResponseWriter.WriteHeader(code)
 }
 // Flush forwards to the wrapped writer when it implements http.Flusher
 // and does nothing otherwise.
 func (sr *statusRecorder) Flush() {
 	flusher, ok := sr.ResponseWriter.(http.Flusher)
 	if !ok {
 		return
 	}
 	flusher.Flush()
 }
--- a/internal/validator/validator.go
+++ b/internal/validator/validator.go
@@ -0,0 +1,156 @@
 // SPDX-License-Identifier: AGPL-3.0-only
 // Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
 // Ported from studio/backend/core/inference/llama_server_args.py.
 // Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/llama_server_args.py
 package validator
 import (
 	"fmt"
 	"strings"
 )
 // denylistGroups lists the llama-server flags that callers may not pass as
 // extra args. Each inner slice groups the spellings of one setting; init
 // flattens all of them into denylist.
 var denylistGroups = [][]string{
 	// Model identity
 	{"-m", "--model"},
 	{"-mu", "--model-url"},
 	{"-dr", "--docker-repo"},
 	{"-hf", "-hfr", "--hf-repo"},
 	{"-hff", "--hf-file"},
 	{"-hfv", "-hfrv", "--hf-repo-v"},
 	{"-hffv", "--hf-file-v"},
 	{"-hft", "--hf-token"},
 	{"-mm", "--mmproj"},
 	{"-mmu", "--mmproj-url"},
 	// Networking
 	{"--host"},
 	{"--port"},
 	{"--path"},
 	{"--api-prefix"},
 	{"--reuse-port"},
 	// Auth / TLS
 	{"--api-key"},
 	{"--api-key-file"},
 	{"--ssl-key-file"},
 	{"--ssl-cert-file"},
 	// Server UI / multi-model
 	{"--webui", "--no-webui"},
 	{"--ui", "--no-ui"},
 	{"--ui-config"},
 	{"--ui-config-file"},
 	{"--ui-mcp-proxy", "--no-ui-mcp-proxy"},
 	{"--models-dir"},
 	{"--models-preset"},
 	{"--models-max"},
 	{"--models-autoload", "--no-models-autoload"},
 }
 // denylist is the flattened lookup set of every flag in denylistGroups.
 var denylist map[string]bool
 // init populates denylist from denylistGroups.
 func init() {
 	flat := make(map[string]bool)
 	for i := range denylistGroups {
 		for _, name := range denylistGroups[i] {
 			flat[name] = true
 		}
 	}
 	denylist = flat
 }
 // FlagName extracts the flag name from a CLI token and returns "" when the
 // token is not a flag. A "--key=value" token yields "--key". Tokens that
 // look like negative numbers ("-1", "-0.5", "-.5") count as values, as do
 // the bare "-" and "--".
 func FlagName(token string) string {
 	switch {
 	case !strings.HasPrefix(token, "-"), token == "-", token == "--":
 		return ""
 	}
 	// The token starts with "-" and is longer than one byte, so index 1 is
 	// always in range here.
 	if c := token[1]; c == '.' || (c >= '0' && c <= '9') {
 		return ""
 	}
 	name, _, _ := strings.Cut(token, "=")
 	return name
 }
 // ValidateExtraArgs checks user-supplied llama-server args against the
 // denylist. It returns a copy of args when every token is allowed, nil for
 // empty input, and an error naming the first managed flag otherwise.
 func ValidateExtraArgs(args []string) ([]string, error) {
 	if len(args) == 0 {
 		return nil, nil
 	}
 	for _, tok := range args {
 		if name := FlagName(tok); name != "" && denylist[name] {
 			return nil, fmt.Errorf("llama-server flag '%s' is managed and cannot be passed as an extra arg", name)
 		}
 	}
 	accepted := make([]string, len(args))
 	copy(accepted, args)
 	return accepted, nil
 }
 // IsManagedFlag reports whether flag is on the managed-flag denylist.
 func IsManagedFlag(flag string) bool {
 	managed := denylist[flag]
 	return managed
 }
 // The sets below are the flag groups that StripShadowingFlags removes.
 // contextFlags: context-size flags.
 var contextFlags = setOf("-c", "--ctx-size")
 // cacheFlags: cache-type flags.
 var cacheFlags = setOf("-ctk", "--cache-type-k", "-ctv", "--cache-type-v")
 // specFlags: --spec-* and --draft-* flags.
 var specFlags = setOf(
 	"--spec-default", "--spec-type", "--spec-ngram-size-n", "--spec-ngram-size",
 	"--draft-min", "--draft-max",
 	"--spec-draft-n-max", "--spec-draft-n-min", "--spec-draft-p-min", "--spec-draft-p-split",
 	"--spec-ngram-mod-n-match", "--spec-ngram-mod-n-min", "--spec-ngram-mod-n-max",
 )
 // templateFlags: chat-template and jinja flags.
 var templateFlags = setOf(
 	"--chat-template", "--chat-template-file", "--chat-template-kwargs",
 	"--jinja", "--no-jinja",
 )
 // booleanShadowingFlags are the shadowing flags that StripShadowingFlags
 // removes on their own, without also consuming the following token.
 var booleanShadowingFlags = setOf("--spec-default", "--jinja", "--no-jinja")
 // setOf builds a membership set with every given value mapped to true.
 func setOf(vals ...string) map[string]bool {
 	set := make(map[string]bool, len(vals))
 	for i := 0; i < len(vals); i++ {
 		set[vals[i]] = true
 	}
 	return set
 }
 // StripShadowingFlags returns args without the flags from the context,
 // cache, spec and template groups. A stripped flag takes its value with it:
 // boolean flags and "--flag=value" tokens are removed alone, any other flag
 // also drops the following token when that token is not itself a flag.
 func StripShadowingFlags(args []string) []string {
 	shadowing := make(map[string]bool)
 	for _, group := range []map[string]bool{contextFlags, cacheFlags, specFlags, templateFlags} {
 		for name, on := range group {
 			shadowing[name] = on
 		}
 	}
 	kept := make([]string, 0, len(args))
 	for pos := 0; pos < len(args); {
 		tok := args[pos]
 		name := FlagName(tok)
 		if name == "" || !shadowing[name] {
 			kept = append(kept, tok)
 			pos++
 			continue
 		}
 		standalone := booleanShadowingFlags[name] || strings.Contains(tok, "=")
 		if !standalone && pos+1 < len(args) && FlagName(args[pos+1]) == "" {
 			pos += 2
 		} else {
 			pos++
 		}
 	}
 	return kept
 }
--- a/internal/validator/validator_test.go
+++ b/internal/validator/validator_test.go
@@ -0,0 +1,150 @@
 package validator
 import (
 	"testing"
 )
 // TestValidateExtraArgs_DenyList checks that every managed flag is
 // rejected when passed on its own.
 func TestValidateExtraArgs_DenyList(t *testing.T) {
 	denied := []string{
 		"-m", "--model",
 		"-mu", "--model-url",
 		"-dr", "--docker-repo",
 		"-hf", "-hfr", "--hf-repo",
 		"-hff", "--hf-file",
 		"-hfv", "-hfrv", "--hf-repo-v",
 		"-hffv", "--hf-file-v",
 		"-hft", "--hf-token",
 		"-mm", "--mmproj",
 		"-mmu", "--mmproj-url",
 		"--host", "--port", "--path", "--api-prefix", "--reuse-port",
 		"--api-key", "--api-key-file",
 		"--ssl-key-file", "--ssl-cert-file",
 		"--webui", "--no-webui", "--ui", "--no-ui",
 		"--ui-config", "--ui-config-file",
 		"--ui-mcp-proxy", "--no-ui-mcp-proxy",
 		"--models-dir", "--models-preset", "--models-max",
 		"--models-autoload", "--no-models-autoload",
 	}
 	for i := range denied {
 		flag := denied[i]
 		t.Run(flag, func(t *testing.T) {
 			if _, err := ValidateExtraArgs([]string{flag}); err == nil {
 				t.Fatalf("expected error for %s", flag)
 			}
 		})
 	}
 }
 // TestValidateExtraArgs_SafeFlags checks that unmanaged flags pass
 // validation and come back unchanged.
 func TestValidateExtraArgs_SafeFlags(t *testing.T) {
 	safe := []string{
 		"-c", "--ctx-size", "-ngl", "--gpu-layers",
 		"--top-k", "--cache-type-k", "--jinja", "--no-jinja",
 		"--spec-draft-n-max", "-fa", "--flash-attn",
 		"-t", "--threads", "-np", "--parallel", "--no-mmap",
 	}
 	for i := range safe {
 		flag := safe[i]
 		t.Run(flag, func(t *testing.T) {
 			out, err := ValidateExtraArgs([]string{flag})
 			if err != nil {
 				t.Fatalf("unexpected error for %s: %v", flag, err)
 			}
 			if unchanged := len(out) == 1 && out[0] == flag; !unchanged {
 				t.Fatalf("expected [%s], got %v", flag, out)
 			}
 		})
 	}
 }
 // TestValidateExtraArgs_FlagEqualsValue checks the --flag=value form: a
 // managed flag is still rejected and an allowed one is returned as-is.
 func TestValidateExtraArgs_FlagEqualsValue(t *testing.T) {
 	if _, err := ValidateExtraArgs([]string{"--model=evil.gguf"}); err == nil {
 		t.Fatal("expected error for --model=evil.gguf")
 	}
 	out, err := ValidateExtraArgs([]string{"--ctx-size=4096"})
 	if err != nil {
 		t.Fatal(err)
 	}
 	if unchanged := len(out) == 1 && out[0] == "--ctx-size=4096"; !unchanged {
 		t.Fatalf("expected [--ctx-size=4096], got %v", out)
 	}
 }
 // TestValidateExtraArgs_NegativeNumber checks that a negative numeric
 // value such as "-1" is accepted as a value rather than read as a flag.
 func TestValidateExtraArgs_NegativeNumber(t *testing.T) {
 	in := []string{"--seed", "-1"}
 	out, err := ValidateExtraArgs(in)
 	if err != nil {
 		t.Fatal(err)
 	}
 	if n := len(out); n != 2 {
 		t.Fatalf("expected 2 tokens, got %d", n)
 	}
 }
 // TestValidateExtraArgs_Empty checks that nil input yields a nil result
 // and no error.
 func TestValidateExtraArgs_Empty(t *testing.T) {
 	var none []string
 	out, err := ValidateExtraArgs(none)
 	switch {
 	case err != nil:
 		t.Fatal(err)
 	case out != nil:
 		t.Fatalf("expected nil, got %v", out)
 	}
 }
 // TestIsManagedFlag spot-checks denylist membership for managed and
 // unmanaged flags.
 func TestIsManagedFlag(t *testing.T) {
 	cases := []struct {
 		flag    string
 		managed bool
 		msg     string
 	}{
 		{"--model", true, "--model should be managed"},
 		{"-m", true, "-m should be managed"},
 		{"-c", false, "-c should not be managed"},
 	}
 	for _, c := range cases {
 		if IsManagedFlag(c.flag) != c.managed {
 			t.Fatal(c.msg)
 		}
 	}
 }
 // TestFlagName covers flag tokens, the equals form, numeric values and
 // non-flag tokens.
 func TestFlagName(t *testing.T) {
 	cases := []struct {
 		token, name string
 	}{
 		{"--model=foo", "--model"},
 		{"-c", "-c"},
 		{"--top-k", "--top-k"},
 		{"-1", ""},
 		{"-0.5", ""},
 		{"-", ""},
 		{"--", ""},
 		{"hello", ""},
 	}
 	for i := range cases {
 		c := cases[i]
 		if got := FlagName(c.token); got != c.name {
 			t.Errorf("FlagName(%q) = %q, want %q", c.token, got, c.name)
 		}
 	}
 }
 // TestStripShadowingFlags exercises stripping of value-taking, boolean and
 // equals-form shadowing flags, and retention of everything else.
 func TestStripShadowingFlags(t *testing.T) {
 	t.Run("strips context flag with value", func(t *testing.T) {
 		in := []string{"-c", "4096", "--top-k", "40"}
 		out := StripShadowingFlags(in)
 		if ok := len(out) == 2 && out[0] == "--top-k" && out[1] == "40"; !ok {
 			t.Fatalf("got %v", out)
 		}
 	})
 	t.Run("retains non-shadowing flags", func(t *testing.T) {
 		in := []string{"--top-k", "40", "--top-p", "0.95"}
 		out := StripShadowingFlags(in)
 		if ok := len(out) == 4; !ok {
 			t.Fatalf("got %v", out)
 		}
 	})
 	t.Run("strips boolean jinja flag", func(t *testing.T) {
 		in := []string{"--jinja", "--top-k", "40"}
 		out := StripShadowingFlags(in)
 		if ok := len(out) == 2 && out[0] == "--top-k"; !ok {
 			t.Fatalf("got %v", out)
 		}
 	})
 	t.Run("strips equals form", func(t *testing.T) {
 		in := []string{"--ctx-size=4096"}
 		out := StripShadowingFlags(in)
 		if ok := len(out) == 0; !ok {
 			t.Fatalf("got %v", out)
 		}
 	})
 }
--- a/internal/winsvc/winsvc_unix.go
+++ b/internal/winsvc/winsvc_unix.go
@@ -0,0 +1,26 @@
 //go:build !windows
 package winsvc
 import (
 	"context"
 	"log/slog"
 	"os"
 	"os/signal"
 	"syscall"
 	"time"
 )
 // RegisterShutdownHandler blocks the calling goroutine until SIGTERM or
 // SIGINT arrives, then runs shutdownFunc with a 30-second timeout derived
 // from ctx and terminates the process: exit code 0 on success, 1 when
 // shutdownFunc returns an error. It never returns.
 func RegisterShutdownHandler(ctx context.Context, shutdownFunc func(context.Context) error) {
 	signals := make(chan os.Signal, 1)
 	signal.Notify(signals, syscall.SIGTERM, syscall.SIGINT)
 	<-signals
 	slog.Info("shutdown signal received")
 	shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
 	defer cancel()
 	err := shutdownFunc(shutdownCtx)
 	if err == nil {
 		os.Exit(0)
 	}
 	slog.Error("shutdown error", "err", err)
 	os.Exit(1)
 }
--- a/internal/winsvc/winsvc_windows.go
+++ b/internal/winsvc/winsvc_windows.go
@@ -0,0 +1,25 @@
 //go:build windows
 package winsvc
 import (
 	"context"
 	"log/slog"
 	"os"
 	"os/signal"
 	"time"
 )
 // RegisterShutdownHandler blocks the calling goroutine until an interrupt
 // signal arrives, then runs shutdownFunc with a 30-second timeout derived
 // from ctx and terminates the process: exit code 0 on success, 1 when
 // shutdownFunc returns an error. It never returns.
 func RegisterShutdownHandler(ctx context.Context, shutdownFunc func(context.Context) error) {
 	signals := make(chan os.Signal, 1)
 	signal.Notify(signals, os.Interrupt)
 	<-signals
 	slog.Info("shutdown signal received")
 	shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
 	defer cancel()
 	err := shutdownFunc(shutdownCtx)
 	if err == nil {
 		os.Exit(0)
 	}
 	slog.Error("shutdown error", "err", err)
 	os.Exit(1)
 }
		`@@ -0,0 +1,3 @@`
							`module github.com/indifferentketchup/llama-sidecar`

							`go 1.26.3`