From fe7f36ae98c6c04c1bdf761146f41a58e2391d4b Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Thu, 28 May 2026 01:55:13 +0000 Subject: [PATCH] llama-sidecar v0.1.0: daemon + benchmarks + eval suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 18 ++ Makefile | 19 ++ README.md | 77 +++++++ bench/analyze.py | 215 ++++++++++++++++++ bench/bench.sh | 192 ++++++++++++++++ bench/prompts/p1024.txt | 67 ++++++ bench/prompts/p256.txt | 18 ++ bench/prompts/p4096.txt | 319 +++++++++++++++++++++++++++ benchmarks/3d/analyze.py | 109 +++++++++ benchmarks/3d/run_sweep.py | 248 +++++++++++++++++++++ cmd/llama-sidecar/main.go | 74 +++++++ eval/ab/prompts.json | 72 ++++++ eval/ab/run.sh | 242 ++++++++++++++++++++ eval/analyze.py | 125 +++++++++++ eval/gsm8k.py | 164 ++++++++++++++ eval/humaneval.py | 201 +++++++++++++++++ eval/mmlu.py | 166 ++++++++++++++ eval/run_all.py | 117 ++++++++++ eval/run_all.sh | 20 ++ go.mod | 3 + internal/config/config.go | 139 ++++++++++++ internal/config/config_test.go | 79 +++++++ internal/pool/hash.go | 53 +++++ internal/pool/hash_test.go | 53 +++++ internal/pool/pool.go | 188 ++++++++++++++++ internal/pool/pool_test.go | 151 +++++++++++++ internal/pool/ports.go | 28 +++ internal/pool/ports_test.go | 74 +++++++ internal/pool/sidecar.go | 313 ++++++++++++++++++++++++++ internal/pool/sidecar_test.go | 96 ++++++++ internal/pool/sidecar_unix.go | 7 + internal/pool/sidecar_windows.go | 15 ++ internal/server/admin.go | 42 ++++ internal/server/proxy.go | 111 ++++++++++ internal/server/server.go | 56 +++++ internal/validator/validator.go | 156 +++++++++++++ internal/validator/validator_test.go | 150 +++++++++++++ internal/winsvc/winsvc_unix.go | 26 +++ internal/winsvc/winsvc_windows.go | 25 +++ 39 files changed, 4228 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README.md create mode 100644 bench/analyze.py create mode 100755 bench/bench.sh create mode 100644 bench/prompts/p1024.txt create mode 100644 bench/prompts/p256.txt create mode 100644 bench/prompts/p4096.txt create mode 100644 benchmarks/3d/analyze.py create mode 100644 benchmarks/3d/run_sweep.py create mode 100644 
cmd/llama-sidecar/main.go create mode 100644 eval/ab/prompts.json create mode 100755 eval/ab/run.sh create mode 100644 eval/analyze.py create mode 100644 eval/gsm8k.py create mode 100644 eval/humaneval.py create mode 100644 eval/mmlu.py create mode 100644 eval/run_all.py create mode 100755 eval/run_all.sh create mode 100644 go.mod create mode 100644 internal/config/config.go create mode 100644 internal/config/config_test.go create mode 100644 internal/pool/hash.go create mode 100644 internal/pool/hash_test.go create mode 100644 internal/pool/pool.go create mode 100644 internal/pool/pool_test.go create mode 100644 internal/pool/ports.go create mode 100644 internal/pool/ports_test.go create mode 100644 internal/pool/sidecar.go create mode 100644 internal/pool/sidecar_test.go create mode 100644 internal/pool/sidecar_unix.go create mode 100644 internal/pool/sidecar_windows.go create mode 100644 internal/server/admin.go create mode 100644 internal/server/proxy.go create mode 100644 internal/server/server.go create mode 100644 internal/validator/validator.go create mode 100644 internal/validator/validator_test.go create mode 100644 internal/winsvc/winsvc_unix.go create mode 100644 internal/winsvc/winsvc_windows.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..926425a --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +bin/ +*.exe +eval/.venv/ +eval/results/ +eval/scores.csv +eval/SUMMARY.md +eval/eval.log +eval/ab/results/ +eval/ab/COMPARE.md +eval/ab/timing.csv +eval/ab/run.log +bench/results/ +bench/SUMMARY.md +bench/results.csv +bench/llama-swap-recommendations.md +internal/pool/*.bak-* +internal/pool/sidecar_windows.go.bak-* +__pycache__/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d4b73bc --- /dev/null +++ b/Makefile @@ -0,0 +1,19 @@ +.PHONY: build build-windows test test-integration lint + +GO = /snap/go/current/bin/go + +build: + $(GO) build -o bin/llama-sidecar ./cmd/llama-sidecar + +build-windows: + GOOS=windows 
GOARCH=amd64 $(GO) build -o bin/llama-sidecar.exe ./cmd/llama-sidecar + +test: + $(GO) test ./internal/... + +test-integration: + $(GO) test -tags=integration ./internal/... + +lint: + $(GO) vet ./... + gofmt -l . diff --git a/README.md b/README.md new file mode 100644 index 0000000..cc71150 --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# llama-sidecar + +Per-agent llama-server process pool daemon. Runs on sam-desktop alongside llama-swap. Spawns or reuses llama-server processes keyed on (modelID, flags) hash. + +## License + +AGPL-3.0-only. + +The validator package (`internal/validator/`) is ported from [Unsloth Studio](https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/llama_server_args.py) (AGPL-3.0). BooCode's TypeScript port (`apps/server/src/services/inference/llama-args-validator.ts`) is the sibling — update both when upstream changes. + +## Build + +```bash +# Linux (development) +make build + +# Windows AMD64 (production target — cross-compile from Linux) +make build-windows + +# Copy to sam-desktop +# scp bin/llama-sidecar.exe sam-desktop:C:\llama-sidecar\ +``` + +## Configuration + +All via environment variables (no CLI flags): + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `LLAMA_SERVER_BIN` | yes | — | Path to llama-server.exe | +| `MODEL_DIR_MAP_FILE` | yes | — | JSON file mapping model IDs to GGUF paths | +| `LLAMA_SIDECAR_BIND` | no | `127.0.0.1:8402` | Listen address | +| `PORT_RANGE` | no | `8500-8599` | Port range for sidecar processes | +| `MAX_SIDECARS` | no | `2` | Max concurrent sidecar processes | +| `LOG_LEVEL` | no | `info` | Log level (debug, info, warn, error) | +| `BASE_ARGS` | no | `["-ngl","999","-c","32768","--flash-attn","on","--no-mmap"]` | JSON array of base llama-server args | +| `HEALTH_TIMEOUT_SECONDS` | no | `60` | Max wait for sidecar health check | +| `HEALTH_INTERVAL_SECONDS` | no | `30` | Background health check interval | + +## API + +### 
`GET /health` + +Returns daemon status. + +### `GET /sidecars` + +Returns list of active sidecar processes. + +### `DELETE /sidecars/{hash}` + +Kill and remove a sidecar process. + +### `POST /v1/chat/completions` + +OpenAI-compatible proxy. Routes to a sidecar process based on model + flags. + +Headers: +- `X-Agent-Flags: --top-k 20 --cache-type-k q8_0` (optional) +- `X-Model-Id: qwen3.6-35b-a3b-mxfp4` (optional, overrides body.model) + +## Test + +```bash +make test # unit tests +make test-integration # requires real llama-server + GGUF +make lint # vet + gofmt +``` + +## NSSM Service + +Pre-configured on sam-desktop as `llama-sidecar`. Start/stop via: +``` +C:\Tools\nssm\nssm.exe start llama-sidecar +C:\Tools\nssm\nssm.exe stop llama-sidecar +C:\Tools\nssm\nssm.exe status llama-sidecar +``` diff --git a/bench/analyze.py b/bench/analyze.py new file mode 100644 index 0000000..5ddf99c --- /dev/null +++ b/bench/analyze.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +"""Analyze MTP on/off benchmark results → CSV + SUMMARY.md + recommendations.""" + +import csv +import json +import os +import re +import statistics +from pathlib import Path + +RESULTS_DIR = Path(__file__).parent / "results" +CSV_PATH = Path(__file__).parent / "results.csv" +SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md" +RECO_PATH = Path(__file__).parent / "llama-swap-recommendations.md" + +FNAME_RE = re.compile( + r"^(?P<stem>.+?)__mtp-(?P<mtp>on|off)__len(?P<len>\d+)__run(?P<run>\d+)\.json$" +) + + +def parse_result(path: Path) -> dict | None: + m = FNAME_RE.match(path.name) + if not m: + return None + try: + data = json.loads(path.read_text()) + except (json.JSONDecodeError, OSError): + return None + t = data.get("timings", {}) + return { + "gguf": m.group("stem"), + "mtp": m.group("mtp"), + "prompt_len": int(m.group("len")), + "run": int(m.group("run")), + "prompt_tps": t.get("prompt_per_second"), + "predicted_tps": t.get("predicted_per_second"), + "cache_n": t.get("cache_n"), + "draft_n": t.get("draft_n"), + 
"accepted_n": t.get("draft_n_accepted"), + "total_ms": (t.get("prompt_ms", 0) or 0) + (t.get("predicted_ms", 0) or 0), + } + + +def load_all() -> list[dict]: + rows = [] + for f in sorted(RESULTS_DIR.glob("*.json")): + r = parse_result(f) + if r: + rows.append(r) + return rows + + +def write_csv(rows: list[dict]) -> None: + fields = ["gguf", "mtp", "prompt_len", "run", "prompt_tps", "predicted_tps", + "cache_n", "draft_n", "accepted_n", "total_ms"] + with open(CSV_PATH, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=fields) + w.writeheader() + w.writerows(rows) + print(f"Wrote {len(rows)} rows to {CSV_PATH}") + + +def median_of(values: list[float]) -> float: + return statistics.median(values) if values else 0.0 + + +def write_summary(rows: list[dict]) -> None: + ggufs = sorted(set(r["gguf"] for r in rows)) + lens = sorted(set(r["prompt_len"] for r in rows)) + lines = ["# MTP On/Off Benchmark Results\n"] + lines.append(f"**{len(rows)} measurements across {len(ggufs)} GGUFs.**\n") + lines.append(f"Runs 2 & 3 used for median (run 1 = warmup, discarded).\n") + + verdicts = [] + + for gguf in ggufs: + lines.append(f"\n## {gguf}\n") + header_parts = ["prompt_len"] + for state in ["off", "on"]: + header_parts.append(f"MTP-{state} tok/s") + header_parts.extend(["delta %", "accept %"]) + lines.append("| " + " | ".join(header_parts) + " |") + lines.append("|" + "|".join("---" for _ in header_parts) + "|") + + any_above_10 = False + for pl in lens: + off_vals = [r["predicted_tps"] for r in rows + if r["gguf"] == gguf and r["mtp"] == "off" + and r["prompt_len"] == pl and r["run"] >= 2 + and r["predicted_tps"] is not None] + on_vals = [r["predicted_tps"] for r in rows + if r["gguf"] == gguf and r["mtp"] == "on" + and r["prompt_len"] == pl and r["run"] >= 2 + and r["predicted_tps"] is not None] + + off_med = median_of(off_vals) + on_med = median_of(on_vals) + + if off_med > 0: + delta = ((on_med - off_med) / off_med) * 100 + else: + delta = 0.0 + + if abs(delta) >= 
10: + any_above_10 = True + + draft_rows = [r for r in rows + if r["gguf"] == gguf and r["mtp"] == "on" + and r["prompt_len"] == pl and r["run"] >= 2 + and r.get("draft_n")] + total_draft = sum(r.get("draft_n", 0) for r in draft_rows) + total_accepted = sum(r.get("accepted_n", 0) for r in draft_rows) + accept_pct = f"{(total_accepted / total_draft * 100):.0f}%" if total_draft > 0 else "—" + + lines.append( + f"| {pl} | {off_med:.1f} | {on_med:.1f} | {delta:+.1f}% | {accept_pct} |" + ) + + if any_above_10: + verdict = "KEEP MTP" + else: + verdict = "DROP MTP" + verdicts.append((gguf, verdict)) + lines.append(f"\n**Verdict: {verdict}**\n") + + lines.append("\n---\n") + lines.append("## Verdict Summary\n") + lines.append("| GGUF | Verdict |") + lines.append("|------|---------|") + for gguf, verdict in verdicts: + lines.append(f"| {gguf} | {verdict} |") + + summary = "\n".join(lines) + "\n" + SUMMARY_PATH.write_text(summary) + print(f"Wrote {SUMMARY_PATH}") + print(summary) + + +def write_recommendations(rows: list[dict]) -> None: + ggufs = sorted(set(r["gguf"] for r in rows)) + lens = sorted(set(r["prompt_len"] for r in rows)) + + lines = ["# llama-swap Config Recommendations\n"] + lines.append("Based on MTP on/off benchmark results.\n") + lines.append("**Read-only reference** — do NOT edit D:\\llama-swap\\config.yaml directly.\n") + lines.append("```yaml") + lines.append("# Commented diff against current config.yaml") + lines.append("# Lines starting with + should be added, - should be removed") + lines.append("") + + model_map = { + "Qwen3.6-35B-A3B-MXFP4_MOE": "qwen3.6-35b-a3b-mxfp4", + "Qwen3.6-27B-Q6_K": "qwen3.6-27b-mtp", + "Qwopus3.5-4B-v3-MTP-Q8_0": "qwopus3.5-4b-mtp", + "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": "qwen3.5-9b-deepseek-v4-mtp", + "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": "qwopus3.6-35b-a3b-v1-mtp", + "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": "qwopus3.6-35b-a3b-mxfp4-mtp", + "Qwopus3.6-27B-v2-MTP-Q6_K": "qwopus3.6-27b-v2-mtp", + 
"Qwopus3.5-9B-Coder-MTP-Q8_0": "qwopus3.5-9b-coder-mtp", + } + + currently_mtp = { + "Qwen3.6-35B-A3B-MXFP4_MOE": False, + "Qwen3.6-27B-Q6_K": True, + "Qwopus3.5-4B-v3-MTP-Q8_0": True, + "Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0": True, + "Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M": True, + "Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16": True, + "Qwopus3.6-27B-v2-MTP-Q6_K": True, + "Qwopus3.5-9B-Coder-MTP-Q8_0": True, + } + + for gguf in ggufs: + model_id = model_map.get(gguf, gguf) + is_mtp_now = currently_mtp.get(gguf, False) + + off_vals = [r["predicted_tps"] for r in rows + if r["gguf"] == gguf and r["mtp"] == "off" and r["run"] >= 2 + and r["predicted_tps"] is not None] + on_vals = [r["predicted_tps"] for r in rows + if r["gguf"] == gguf and r["mtp"] == "on" and r["run"] >= 2 + and r["predicted_tps"] is not None] + off_med = median_of(off_vals) + on_med = median_of(on_vals) + delta = ((on_med - off_med) / off_med * 100) if off_med > 0 else 0 + + should_mtp = delta >= 10 + lines.append(f" # {model_id}: MTP {'on' if is_mtp_now else 'off'} → {'on' if should_mtp else 'off'} (delta {delta:+.1f}%)") + + if should_mtp and not is_mtp_now: + lines.append(f" # + --spec-type draft-mtp --spec-draft-n-max 2") + elif not should_mtp and is_mtp_now: + lines.append(f" # - --spec-type draft-mtp --spec-draft-n-max 2") + else: + lines.append(f" # (no change)") + lines.append("") + + lines.append("```\n") + reco = "\n".join(lines) + RECO_PATH.write_text(reco) + print(f"Wrote {RECO_PATH}") + + +def main() -> None: + rows = load_all() + if not rows: + print("No results found in", RESULTS_DIR) + return + write_csv(rows) + write_summary(rows) + write_recommendations(rows) + + +if __name__ == "__main__": + main() diff --git a/bench/bench.sh b/bench/bench.sh new file mode 100755 index 0000000..3fe0afb --- /dev/null +++ b/bench/bench.sh @@ -0,0 +1,192 @@ +#!/usr/bin/env bash +set -euo pipefail + +ENDPOINT="http://100.101.41.16:8650" +SSH_HOST="samki@100.101.41.16" +TASK_NAME="bench_llama" 
+BAT_PATH='%TEMP%\bench_run.bat' +RESULTS_DIR="$(cd "$(dirname "$0")" && pwd)/results" +PROMPTS_DIR="$(cd "$(dirname "$0")" && pwd)/prompts" +MAX_TOKENS=200 +HEALTH_TIMEOUT=120 +LLAMA_BIN='D:\llama-server\llama-server.exe' + +mkdir -p "$RESULTS_DIR" + +# ── Config matrix: STEM|MTP_STATE|FULL_ARGS ─────────────────────────── + +CONFIGS=( +'Qwen3.6-35B-A3B-MXFP4_MOE|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwen3.6-35B-A3B-MXFP4_MOE|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwen3.6-27B-Q6_K|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwen3.6-27B-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja 
--chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.5-4B-v3-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.5-4B-v3-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + 
+'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.6-27B-v2-MTP-Q6_K|off|--host 0.0.0.0 --port 8650 -m 
D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.6-27B-v2-MTP-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.5-9B-Coder-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' + +'Qwopus3.5-9B-Coder-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0' +) + +PROMPT_LENS=(256 1024 4096) + +# ── Helper functions ────────────────────────────────────────────────── + +kill_bench_server() { + local pids + pids=$(ssh "$SSH_HOST" 'for /f "tokens=5" %a in ('"'"'netstat -aon ^| findstr :8650 ^| findstr LISTENING'"'"') do @echo %a' 2>/dev/null || true) 
+ for pid in $pids; do + if [ -n "$pid" ] && [ "$pid" != "0" ]; then + ssh "$SSH_HOST" "taskkill /F /PID $pid" 2>/dev/null || true + fi + done + ssh "$SSH_HOST" "schtasks /Delete /TN ${TASK_NAME} /F" 2>/dev/null || true + sleep 3 +} + +start_bench_server() { + local args="$1" + # Write a batch file, then run it via schtasks + ssh "$SSH_HOST" "echo ${LLAMA_BIN} ${args} > ${BAT_PATH}" 2>/dev/null + ssh "$SSH_HOST" "schtasks /Create /TN ${TASK_NAME} /TR ${BAT_PATH} /SC ONCE /ST 00:00 /F /RL HIGHEST" 2>/dev/null + ssh "$SSH_HOST" "schtasks /Run /TN ${TASK_NAME}" 2>/dev/null +} + +poll_health() { + local elapsed=0 + while [ $elapsed -lt $HEALTH_TIMEOUT ]; do + if curl -sf "${ENDPOINT}/health" >/dev/null 2>&1; then + echo " health OK (${elapsed}s)" + return 0 + fi + sleep 3 + elapsed=$((elapsed + 3)) + if [ $((elapsed % 15)) -eq 0 ]; then + echo " waiting... (${elapsed}s)" + fi + done + echo " HEALTH TIMEOUT after ${HEALTH_TIMEOUT}s" + return 1 +} + +send_request() { + local prompt_file="$1" + local output_file="$2" + local body + body=$(python3 -c " +import json +prompt = open('${prompt_file}').read() +print(json.dumps({ + 'messages': [{'role': 'user', 'content': prompt}], + 'max_tokens': ${MAX_TOKENS}, + 'temperature': 0, + 'seed': 42, + 'stream': False +})) +") + local http_code + http_code=$(curl -s -w '%{http_code}' -o "$output_file" \ + --max-time 300 \ + -X POST "${ENDPOINT}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "$body" 2>/dev/null) + if [ "$http_code" != "200" ]; then + echo "HTTP ${http_code}" + return 1 + fi + return 0 +} + +print_metrics() { + python3 -c " +import json +d = json.load(open('${1}')) +t = d.get('timings', {}) +ptps = t.get('prompt_per_second', 0) +etps = t.get('predicted_per_second', 0) +dn = t.get('draft_n', '') +da = t.get('draft_n_accepted', '') +draft = '' +if dn != '': + draft = f' draft={da}/{dn}' +print(f'prompt={ptps:.1f} eval={etps:.1f} tok/s{draft}') +" 2>/dev/null || echo "(parse error)" +} + +# ── Main 
────────────────────────────────────────────────────────────── + +total=${#CONFIGS[@]} +echo "================================================================" +echo " MTP ON/OFF BENCHMARK SWEEP" +echo " ${total} configs x 3 prompts x 3 runs" +echo " Endpoint: ${ENDPOINT}" +echo "================================================================" + +t_start=$(date +%s) +config_idx=0 + +for config_entry in "${CONFIGS[@]}"; do + config_idx=$((config_idx + 1)) + IFS='|' read -r stem mtp_state args <<< "$config_entry" + + echo "" + echo "================================================================" + echo " [${config_idx}/${total}] ${stem} MTP=${mtp_state}" + echo "================================================================" + + kill_bench_server + echo " Starting llama-server..." + start_bench_server "$args" + + if ! poll_health; then + echo " SKIPPING" + kill_bench_server + continue + fi + + for len in "${PROMPT_LENS[@]}"; do + prompt_file="${PROMPTS_DIR}/p${len}.txt" + [ -f "$prompt_file" ] || { echo " Missing p${len}.txt"; continue; } + echo " -- p${len} --" + for run in 1 2 3; do + outfile="${RESULTS_DIR}/${stem}__mtp-${mtp_state}__len${len}__run${run}.json" + printf " run %d: " "$run" + if send_request "$prompt_file" "$outfile"; then + print_metrics "$outfile" + fi + sleep 1 + done + done + + echo " Killing..." 
+ kill_bench_server +done + +t_end=$(date +%s) +elapsed=$(( t_end - t_start )) +echo "" +echo "================================================================" +echo " SWEEP COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s" +echo " Run: python3 $(dirname "$0")/analyze.py" +echo "================================================================" diff --git a/bench/prompts/p1024.txt b/bench/prompts/p1024.txt new file mode 100644 index 0000000..ecee490 --- /dev/null +++ b/bench/prompts/p1024.txt @@ -0,0 +1,67 @@ +You will rejoice to hear that no disaster has accompanied the +commencement of an enterprise which you have regarded with such evil +forebodings. I arrived here yesterday, and my first task is to assure +my dear sister of my welfare and increasing confidence in the success +of my undertaking. + +I am already far north of London, and as I walk in the streets of +Petersburgh, I feel a cold northern breeze play upon my cheeks, which +braces my nerves and fills me with delight. Do you understand this +feeling? This breeze, which has travelled from the regions towards +which I am advancing, gives me a foretaste of those icy climes. +Inspirited by this wind of promise, my daydreams become more fervent +and vivid. I try in vain to be persuaded that the pole is the seat of +frost and desolation; it ever presents itself to my imagination as the +region of beauty and delight. There, Margaret, the sun is for ever +visible, its broad disk just skirting the horizon and diffusing a +perpetual splendour. There—for with your leave, my sister, I will put +some trust in preceding navigators—there snow and frost are banished; +and, sailing over a calm sea, we may be wafted to a land surpassing in +wonders and in beauty every region hitherto discovered on the habitable +globe. Its productions and features may be without example, as the +phenomena of the heavenly bodies undoubtedly are in those undiscovered +solitudes. What may not be expected in a country of eternal light? 
I +may there discover the wondrous power which attracts the needle and may +regulate a thousand celestial observations that require only this +voyage to render their seeming eccentricities consistent for ever. I +shall satiate my ardent curiosity with the sight of a part of the world +never before visited, and may tread a land never before imprinted by +the foot of man. These are my enticements, and they are sufficient to +conquer all fear of danger or death and to induce me to commence this +laborious voyage with the joy a child feels when he embarks in a little +boat, with his holiday mates, on an expedition of discovery up his +native river. But supposing all these conjectures to be false, you +cannot contest the inestimable benefit which I shall confer on all +mankind, to the last generation, by discovering a passage near the pole +to those countries, to reach which at present so many months are +requisite; or by ascertaining the secret of the magnet, which, if at +all possible, can only be effected by an undertaking such as mine. + +These reflections have dispelled the agitation with which I began my +letter, and I feel my heart glow with an enthusiasm which elevates me +to heaven, for nothing contributes so much to tranquillise the mind as +a steady purpose—a point on which the soul may fix its intellectual +eye. This expedition has been the favourite dream of my early years. I +have read with ardour the accounts of the various voyages which have +been made in the prospect of arriving at the North Pacific Ocean +through the seas which surround the pole. You may remember that a +history of all the voyages made for purposes of discovery composed the +whole of our good Uncle Thomas’ library. My education was neglected, +yet I was passionately fond of reading. 
These volumes were my study +day and night, and my familiarity with them increased that regret which +I had felt, as a child, on learning that my father’s dying injunction +had forbidden my uncle to allow me to embark in a seafaring life. + +These visions faded when I perused, for the first time, those poets +whose effusions entranced my soul and lifted it to heaven. I also +became a poet and for one year lived in a paradise of my own creation; +I imagined that I also might obtain a niche in the temple where the +names of Homer and Shakespeare are consecrated. You are well +acquainted with my failure and how heavily I bore the disappointment. +But just at that time I inherited the fortune of my cousin, and my +thoughts were turned into the channel of their earlier bent. + +Six years have passed since I resolved on my present undertaking. I +can, even now, remember the hour from which I dedicated myself to this +great enterprise. I commenced by inuring my body to hardship. +Continue this passage in exactly 200 tokens of prose. diff --git a/bench/prompts/p256.txt b/bench/prompts/p256.txt new file mode 100644 index 0000000..1abc1b4 --- /dev/null +++ b/bench/prompts/p256.txt @@ -0,0 +1,18 @@ +You will rejoice to hear that no disaster has accompanied the +commencement of an enterprise which you have regarded with such evil +forebodings. I arrived here yesterday, and my first task is to assure +my dear sister of my welfare and increasing confidence in the success +of my undertaking. + +I am already far north of London, and as I walk in the streets of +Petersburgh, I feel a cold northern breeze play upon my cheeks, which +braces my nerves and fills me with delight. Do you understand this +feeling? This breeze, which has travelled from the regions towards +which I am advancing, gives me a foretaste of those icy climes. +Inspirited by this wind of promise, my daydreams become more fervent +and vivid. 
I try in vain to be persuaded that the pole is the seat of +frost and desolation; it ever presents itself to my imagination as the +region of beauty and delight. There, Margaret, the sun is for ever +visible, its broad disk just skirting the horizon and diffusing a +perpetual splendour. +Continue this passage in exactly 200 tokens of prose. diff --git a/bench/prompts/p4096.txt b/bench/prompts/p4096.txt new file mode 100644 index 0000000..8beec85 --- /dev/null +++ b/bench/prompts/p4096.txt @@ -0,0 +1,319 @@ +You will rejoice to hear that no disaster has accompanied the +commencement of an enterprise which you have regarded with such evil +forebodings. I arrived here yesterday, and my first task is to assure +my dear sister of my welfare and increasing confidence in the success +of my undertaking. + +I am already far north of London, and as I walk in the streets of +Petersburgh, I feel a cold northern breeze play upon my cheeks, which +braces my nerves and fills me with delight. Do you understand this +feeling? This breeze, which has travelled from the regions towards +which I am advancing, gives me a foretaste of those icy climes. +Inspirited by this wind of promise, my daydreams become more fervent +and vivid. I try in vain to be persuaded that the pole is the seat of +frost and desolation; it ever presents itself to my imagination as the +region of beauty and delight. There, Margaret, the sun is for ever +visible, its broad disk just skirting the horizon and diffusing a +perpetual splendour. There—for with your leave, my sister, I will put +some trust in preceding navigators—there snow and frost are banished; +and, sailing over a calm sea, we may be wafted to a land surpassing in +wonders and in beauty every region hitherto discovered on the habitable +globe. Its productions and features may be without example, as the +phenomena of the heavenly bodies undoubtedly are in those undiscovered +solitudes. What may not be expected in a country of eternal light? 
I +may there discover the wondrous power which attracts the needle and may +regulate a thousand celestial observations that require only this +voyage to render their seeming eccentricities consistent for ever. I +shall satiate my ardent curiosity with the sight of a part of the world +never before visited, and may tread a land never before imprinted by +the foot of man. These are my enticements, and they are sufficient to +conquer all fear of danger or death and to induce me to commence this +laborious voyage with the joy a child feels when he embarks in a little +boat, with his holiday mates, on an expedition of discovery up his +native river. But supposing all these conjectures to be false, you +cannot contest the inestimable benefit which I shall confer on all +mankind, to the last generation, by discovering a passage near the pole +to those countries, to reach which at present so many months are +requisite; or by ascertaining the secret of the magnet, which, if at +all possible, can only be effected by an undertaking such as mine. + +These reflections have dispelled the agitation with which I began my +letter, and I feel my heart glow with an enthusiasm which elevates me +to heaven, for nothing contributes so much to tranquillise the mind as +a steady purpose—a point on which the soul may fix its intellectual +eye. This expedition has been the favourite dream of my early years. I +have read with ardour the accounts of the various voyages which have +been made in the prospect of arriving at the North Pacific Ocean +through the seas which surround the pole. You may remember that a +history of all the voyages made for purposes of discovery composed the +whole of our good Uncle Thomas’ library. My education was neglected, +yet I was passionately fond of reading. 
These volumes were my study +day and night, and my familiarity with them increased that regret which +I had felt, as a child, on learning that my father’s dying injunction +had forbidden my uncle to allow me to embark in a seafaring life. + +These visions faded when I perused, for the first time, those poets +whose effusions entranced my soul and lifted it to heaven. I also +became a poet and for one year lived in a paradise of my own creation; +I imagined that I also might obtain a niche in the temple where the +names of Homer and Shakespeare are consecrated. You are well +acquainted with my failure and how heavily I bore the disappointment. +But just at that time I inherited the fortune of my cousin, and my +thoughts were turned into the channel of their earlier bent. + +Six years have passed since I resolved on my present undertaking. I +can, even now, remember the hour from which I dedicated myself to this +great enterprise. I commenced by inuring my body to hardship. I +accompanied the whale-fishers on several expeditions to the North Sea; +I voluntarily endured cold, famine, thirst, and want of sleep; I often +worked harder than the common sailors during the day and devoted my +nights to the study of mathematics, the theory of medicine, and those +branches of physical science from which a naval adventurer might derive +the greatest practical advantage. Twice I actually hired myself as an +under-mate in a Greenland whaler, and acquitted myself to admiration. I +must own I felt a little proud when my captain offered me the second +dignity in the vessel and entreated me to remain with the greatest +earnestness, so valuable did he consider my services. + +And now, dear Margaret, do I not deserve to accomplish some great purpose? +My life might have been passed in ease and luxury, but I preferred glory to +every enticement that wealth placed in my path. Oh, that some encouraging +voice would answer in the affirmative! 
My courage and my resolution is +firm; but my hopes fluctuate, and my spirits are often depressed. I am +about to proceed on a long and difficult voyage, the emergencies of which +will demand all my fortitude: I am required not only to raise the spirits +of others, but sometimes to sustain my own, when theirs are failing. + +This is the most favourable period for travelling in Russia. They fly +quickly over the snow in their sledges; the motion is pleasant, and, in +my opinion, far more agreeable than that of an English stagecoach. The +cold is not excessive, if you are wrapped in furs—a dress which I have +already adopted, for there is a great difference between walking the +deck and remaining seated motionless for hours, when no exercise +prevents the blood from actually freezing in your veins. I have no +ambition to lose my life on the post-road between St. Petersburgh and +Archangel. + +I shall depart for the latter town in a fortnight or three weeks; and my +intention is to hire a ship there, which can easily be done by paying the +insurance for the owner, and to engage as many sailors as I think necessary +among those who are accustomed to the whale-fishing. I do not intend to +sail until the month of June; and when shall I return? Ah, dear sister, how +can I answer this question? If I succeed, many, many months, perhaps years, +will pass before you and I may meet. If I fail, you will see me again soon, +or never. + +Farewell, my dear, excellent Margaret. Heaven shower down blessings on you, +and save me, that I may again and again testify my gratitude for all your +love and kindness. + +Your affectionate brother, + +R. Walton + + + + +Letter 2 + +_To Mrs. Saville, England._ + +Archangel, 28th March, 17—. + + +How slowly the time passes here, encompassed as I am by frost and snow! +Yet a second step is taken towards my enterprise. 
I have hired a +vessel and am occupied in collecting my sailors; those whom I have +already engaged appear to be men on whom I can depend and are certainly +possessed of dauntless courage. + +But I have one want which I have never yet been able to satisfy, and the +absence of the object of which I now feel as a most severe evil, I have no +friend, Margaret: when I am glowing with the enthusiasm of success, there +will be none to participate my joy; if I am assailed by disappointment, no +one will endeavour to sustain me in dejection. I shall commit my thoughts +to paper, it is true; but that is a poor medium for the communication of +feeling. I desire the company of a man who could sympathise with me, whose +eyes would reply to mine. You may deem me romantic, my dear sister, but I +bitterly feel the want of a friend. I have no one near me, gentle yet +courageous, possessed of a cultivated as well as of a capacious mind, whose +tastes are like my own, to approve or amend my plans. How would such a +friend repair the faults of your poor brother! I am too ardent in execution +and too impatient of difficulties. But it is a still greater evil to me +that I am self-educated: for the first fourteen years of my life I ran wild +on a common and read nothing but our Uncle Thomas’ books of voyages. +At that age I became acquainted with the celebrated poets of our own +country; but it was only when it had ceased to be in my power to derive its +most important benefits from such a conviction that I perceived the +necessity of becoming acquainted with more languages than that of my native +country. Now I am twenty-eight and am in reality more illiterate than many +schoolboys of fifteen. It is true that I have thought more and that my +daydreams are more extended and magnificent, but they want (as the painters +call it) _keeping;_ and I greatly need a friend who would have sense +enough not to despise me as romantic, and affection enough for me to +endeavour to regulate my mind. 
+ +Well, these are useless complaints; I shall certainly find no friend on the +wide ocean, nor even here in Archangel, among merchants and seamen. Yet +some feelings, unallied to the dross of human nature, beat even in these +rugged bosoms. My lieutenant, for instance, is a man of wonderful courage +and enterprise; he is madly desirous of glory, or rather, to word my phrase +more characteristically, of advancement in his profession. He is an +Englishman, and in the midst of national and professional prejudices, +unsoftened by cultivation, retains some of the noblest endowments of +humanity. I first became acquainted with him on board a whale vessel; +finding that he was unemployed in this city, I easily engaged him to assist +in my enterprise. + +The master is a person of an excellent disposition and is remarkable in the +ship for his gentleness and the mildness of his discipline. This +circumstance, added to his well-known integrity and dauntless courage, made +me very desirous to engage him. A youth passed in solitude, my best years +spent under your gentle and feminine fosterage, has so refined the +groundwork of my character that I cannot overcome an intense distaste to +the usual brutality exercised on board ship: I have never believed it to be +necessary, and when I heard of a mariner equally noted for his kindliness +of heart and the respect and obedience paid to him by his crew, I felt +myself peculiarly fortunate in being able to secure his services. I heard +of him first in rather a romantic manner, from a lady who owes to him the +happiness of her life. This, briefly, is his story. Some years ago he loved +a young Russian lady of moderate fortune, and having amassed a considerable +sum in prize-money, the father of the girl consented to the match. 
He saw +his mistress once before the destined ceremony; but she was bathed in +tears, and throwing herself at his feet, entreated him to spare her, +confessing at the same time that she loved another, but that he was poor, +and that her father would never consent to the union. My generous friend +reassured the suppliant, and on being informed of the name of her lover, +instantly abandoned his pursuit. He had already bought a farm with his +money, on which he had designed to pass the remainder of his life; but he +bestowed the whole on his rival, together with the remains of his +prize-money to purchase stock, and then himself solicited the young +woman’s father to consent to her marriage with her lover. But the old +man decidedly refused, thinking himself bound in honour to my friend, who, +when he found the father inexorable, quitted his country, nor returned +until he heard that his former mistress was married according to her +inclinations. “What a noble fellow!” you will exclaim. He is +so; but then he is wholly uneducated: he is as silent as a Turk, and a kind +of ignorant carelessness attends him, which, while it renders his conduct +the more astonishing, detracts from the interest and sympathy which +otherwise he would command. + +Yet do not suppose, because I complain a little or because I can +conceive a consolation for my toils which I may never know, that I am +wavering in my resolutions. Those are as fixed as fate, and my voyage +is only now delayed until the weather shall permit my embarkation. The +winter has been dreadfully severe, but the spring promises well, and it +is considered as a remarkably early season, so that perhaps I may sail +sooner than I expected. I shall do nothing rashly: you know me +sufficiently to confide in my prudence and considerateness whenever the +safety of others is committed to my care. + +I cannot describe to you my sensations on the near prospect of my +undertaking. 
It is impossible to communicate to you a conception of +the trembling sensation, half pleasurable and half fearful, with which +I am preparing to depart. I am going to unexplored regions, to “the +land of mist and snow,” but I shall kill no albatross; therefore do not +be alarmed for my safety or if I should come back to you as worn and +woeful as the “Ancient Mariner.” You will smile at my allusion, but I +will disclose a secret. I have often attributed my attachment to, my +passionate enthusiasm for, the dangerous mysteries of ocean to that +production of the most imaginative of modern poets. There is something +at work in my soul which I do not understand. I am practically +industrious—painstaking, a workman to execute with perseverance and +labour—but besides this there is a love for the marvellous, a belief +in the marvellous, intertwined in all my projects, which hurries me out +of the common pathways of men, even to the wild sea and unvisited +regions I am about to explore. + +But to return to dearer considerations. Shall I meet you again, after +having traversed immense seas, and returned by the most southern cape of +Africa or America? I dare not expect such success, yet I cannot bear to +look on the reverse of the picture. Continue for the present to write to +me by every opportunity: I may receive your letters on some occasions when +I need them most to support my spirits. I love you very tenderly. +Remember me with affection, should you never hear from me again. + +Your affectionate brother, + Robert Walton + + + + +Letter 3 + +_To Mrs. Saville, England._ + +July 7th, 17—. + + +My dear Sister, + +I write a few lines in haste to say that I am safe—and well advanced +on my voyage. This letter will reach England by a merchantman now on +its homeward voyage from Archangel; more fortunate than I, who may not +see my native land, perhaps, for many years. 
I am, however, in good +spirits: my men are bold and apparently firm of purpose, nor do the +floating sheets of ice that continually pass us, indicating the dangers +of the region towards which we are advancing, appear to dismay them. We +have already reached a very high latitude; but it is the height of +summer, and although not so warm as in England, the southern gales, +which blow us speedily towards those shores which I so ardently desire +to attain, breathe a degree of renovating warmth which I had not +expected. + +No incidents have hitherto befallen us that would make a figure in a +letter. One or two stiff gales and the springing of a leak are +accidents which experienced navigators scarcely remember to record, and +I shall be well content if nothing worse happen to us during our voyage. + +Adieu, my dear Margaret. Be assured that for my own sake, as well as +yours, I will not rashly encounter danger. I will be cool, +persevering, and prudent. + +But success _shall_ crown my endeavours. Wherefore not? Thus far I +have gone, tracing a secure way over the pathless seas, the very stars +themselves being witnesses and testimonies of my triumph. Why not +still proceed over the untamed yet obedient element? What can stop the +determined heart and resolved will of man? + +My swelling heart involuntarily pours itself out thus. But I must +finish. Heaven bless my beloved sister! + +R.W. + + + + +Letter 4 + + +_To Mrs. Saville, England._ + +August 5th, 17—. + +So strange an accident has happened to us that I cannot forbear +recording it, although it is very probable that you will see me before +these papers can come into your possession. + +Last Monday (July 31st) we were nearly surrounded by ice, which closed +in the ship on all sides, scarcely leaving her the sea-room in which +she floated. Our situation was somewhat dangerous, especially as we +were compassed round by a very thick fog. 
We accordingly lay to, +hoping that some change would take place in the atmosphere and weather. + +About two o’clock the mist cleared away, and we beheld, stretched out +in every direction, vast and irregular plains of ice, which seemed to +have no end. Some of my comrades groaned, and my own mind began to +grow watchful with anxious thoughts, when a strange sight suddenly +attracted our attention and diverted our solicitude from our own +situation. We perceived a low carriage, fixed on a sledge and drawn by +dogs, pass on towards the north, at the distance of half a mile; a +being which had the shape of a man, but apparently of gigantic stature, +sat in the sledge and guided the dogs. We watched the rapid progress +of the traveller with our telescopes until he was lost among the +distant inequalities of the ice. + +This appearance excited our unqualified wonder. We were, as we believed, +many hundred miles from any land; but this apparition seemed to denote that +it was not, in reality, so distant as we had supposed. +Continue this passage in exactly 200 tokens of prose. 
#!/usr/bin/env python3
"""Analyze MTP n_max sweep results and produce summary.md."""

import json
from pathlib import Path

RESULTS_PATH = Path(__file__).parent / "results.json"
SUMMARY_PATH = Path(__file__).parent / "summary.md"


def load_results() -> list[dict]:
    """Return the usable measurement rows from RESULTS_PATH.

    A row is usable when it has a non-null ``eval_tok_s`` and a null (or
    absent) ``error``. A missing results file yields an empty list so that
    ``main`` can report "no results" instead of raising.
    """
    if not RESULTS_PATH.exists():
        return []
    data = json.loads(RESULTS_PATH.read_text(encoding="utf-8"))
    return [r for r in data if r.get("eval_tok_s") is not None and r.get("error") is None]


def main() -> None:
    """Render a per-model Markdown summary to SUMMARY_PATH and stdout.

    For every model one table is emitted with a row per ``n_max`` value and a
    column per prompt, followed by the best ``n_max`` by average tok/s. Models
    that have an ``n_max == 0`` baseline are also listed in the final
    recommendations table.
    """
    rows = load_results()
    if not rows:
        print("No valid results found.")
        return

    models = sorted(set(r["model"] for r in rows))
    lines = ["# MTP n_max Sweep Results\n"]
    lines.append(f"**{len(rows)} valid measurements across {len(models)} models.**\n")

    recommendations = []

    for model in models:
        model_rows = [r for r in rows if r["model"] == model]
        n_max_values = sorted(set(r["n_max"] for r in model_rows))
        prompt_names = sorted(set(r["prompt"] for r in model_rows))

        lines.append(f"\n## {model}\n")

        header = "| n_max | " + " | ".join(f"{p} tok/s" for p in prompt_names) + " | avg tok/s | vs n_max=0 |"
        sep = "|-------|" + "|".join("-" * (len(p) + 7) for p in prompt_names) + "|-----------|------------|"
        lines.append(header)
        lines.append(sep)

        baseline_avg = None
        best_avg = 0
        best_n = 0

        for n in n_max_values:
            cells = []
            vals = []
            for p in prompt_names:
                matching = [r for r in model_rows if r["n_max"] == n and r["prompt"] == p]
                if matching:
                    # Only the first recorded measurement for a cell is used.
                    v = matching[0]["eval_tok_s"]
                    cells.append(f"{v:.1f}")
                    vals.append(v)
                else:
                    cells.append("—")

            avg = sum(vals) / len(vals) if vals else 0
            if n == 0:
                # n_max values are sorted, so the baseline (if any) comes first.
                baseline_avg = avg
                delta = "baseline"
            elif baseline_avg and baseline_avg > 0:
                pct = ((avg - baseline_avg) / baseline_avg) * 100
                delta = f"{pct:+.1f}%"
            else:
                delta = "—"

            if avg > best_avg:
                best_avg = avg
                best_n = n

            draft_info = ""
            # Rows can carry the key "draft_n_accepted" with a null value, so
            # a .get() default is not enough: require both counters to be
            # present before computing an acceptance rate.
            draft_rows = [
                r for r in model_rows
                if r["n_max"] == n
                and r.get("draft_n")
                and r.get("draft_n_accepted") is not None
            ]
            if draft_rows:
                total_draft = sum(r["draft_n"] for r in draft_rows)
                total_accepted = sum(r["draft_n_accepted"] for r in draft_rows)
                if total_draft > 0:
                    accept_pct = (total_accepted / total_draft) * 100
                    draft_info = f" (accept {accept_pct:.0f}%)"

            row_str = f"| {n} | " + " | ".join(cells) + f" | {avg:.1f} | {delta}{draft_info} |"
            lines.append(row_str)

        if baseline_avg and baseline_avg > 0 and best_avg > 0:
            improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
            lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s, {improvement:+.1f}% vs baseline)\n")
            recommendations.append((model, best_n, best_avg, improvement))
        else:
            lines.append(f"\n**Optimal n_max: {best_n}** (avg {best_avg:.1f} tok/s)\n")

    # Recommendations section
    lines.append("\n---\n")
    lines.append("## Recommended `llama_extra_args` per model\n")
    lines.append("| Model | n_max | avg tok/s | vs baseline | suggested flags |")
    lines.append("|-------|-------|-----------|-------------|-----------------|")
    for model, n, avg, imp in recommendations:
        if n > 0:
            flags = f'`["--spec-type", "draft-mtp", "--spec-draft-n-max", "{n}"]`'
        else:
            flags = "_(none — MTP not beneficial)_"
        lines.append(f"| {model} | {n} | {avg:.1f} | {imp:+.1f}% | {flags} |")

    lines.append("")
    summary = "\n".join(lines)
    # The summary contains non-ASCII dashes; pin the encoding so the output
    # does not depend on the platform default.
    SUMMARY_PATH.write_text(summary, encoding="utf-8")
    print(summary)
    print(f"\nWritten to: {SUMMARY_PATH}")


if __name__ == "__main__":
    main()
+ +Usage: + python3 run_sweep.py # full sweep + python3 run_sweep.py --dry-run # print matrix, no API calls + python3 run_sweep.py --limit 1 # run first combo only (smoke) +""" + +import argparse +import json +import os +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from urllib.request import Request, urlopen +from urllib.error import URLError, HTTPError + +SIDECAR_URL = os.environ.get("SIDECAR_URL", "http://100.101.41.16:8402") +RESULTS_PATH = Path(__file__).parent / "results.json" + +MATRIX = [ + ("qwen3.6-35b-a3b-mxfp4", [0, 1, 2, 3]), + ("qwen3.6-27b-mtp", [0, 1, 2, 3, 4]), + ("qwopus3.6-27b-v2-mtp", [0, 2]), + ("qwopus3.5-9b-coder-mtp", [0, 2]), +] + +PROMPTS = { + "short": { + "content": "Reply with exactly five words: a haiku-like greeting.", + "max_tokens": 100, + }, + "medium": { + "content": ( + "Explain how multi-token prediction speculative decoding works in transformer " + "inference. Cover: 1) the draft model role, 2) the verification mechanism, " + "3) acceptance rate dynamics, 4) why MoE models gain less than dense models. " + "Aim for 400-500 words." + ), + "max_tokens": 700, + }, + "long": { + "content": ( + "Write a complete Python implementation of a simple HTTP server that " + "accepts POST requests on /v1/chat/completions, validates JSON bodies " + "against a basic OpenAI schema, logs each request to stdout in JSON " + "format, and returns a hardcoded streaming response. Include error " + "handling for malformed JSON, missing required fields, and unsupported " + "methods. Add docstrings and type hints throughout. Show full file." 
+ ), + "max_tokens": 2500, + }, +} + + +def build_flags(n_max: int) -> str: + if n_max > 0: + return f"--spec-type draft-mtp --spec-draft-n-max {n_max} --repeat-penalty 1.0" + return "--repeat-penalty 1.0" + + +def sidecar_request(method: str, path: str, body: dict | None = None, + headers: dict | None = None, timeout: int = 180) -> dict | None: + url = f"{SIDECAR_URL}{path}" + data = json.dumps(body).encode() if body else None + hdrs = {"Content-Type": "application/json"} + if headers: + hdrs.update(headers) + req = Request(url, data=data, headers=hdrs, method=method) + try: + with urlopen(req, timeout=timeout) as resp: + return json.loads(resp.read()) + except HTTPError as e: + body_text = e.read().decode(errors="replace") + try: + return json.loads(body_text) + except json.JSONDecodeError: + return {"error": f"HTTP {e.code}", "body": body_text[:500]} + except URLError as e: + return {"error": str(e)} + + +def send_completion(model: str, flags: str, prompt: str, max_tokens: int) -> dict: + body = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": max_tokens, + "stream": False, + } + headers = { + "X-Agent-Flags": flags, + "X-Model-Id": model, + } + t0 = time.perf_counter() + resp = sidecar_request("POST", "/v1/chat/completions", body=body, headers=headers) + wall_ms = (time.perf_counter() - t0) * 1000 + if resp is None: + return {"error": "no response", "wall_clock_ms": wall_ms} + resp["wall_clock_ms"] = wall_ms + return resp + + +def extract_metrics(resp: dict, model: str, n_max: int, prompt_name: str) -> dict: + timings = resp.get("timings", {}) + usage = resp.get("usage", {}) + sidecars = sidecar_request("GET", "/sidecars") or [] + sidecar_hash = "" + sidecar_port = 0 + if isinstance(sidecars, list): + for s in sidecars: + if s.get("model_id") == model: + sidecar_hash = s.get("hash", "") + sidecar_port = s.get("port", 0) + break + + return { + "model": model, + "n_max": n_max, + "prompt": prompt_name, + "timestamp_utc": 
datetime.now(timezone.utc).isoformat(), + "completion_tokens": usage.get("completion_tokens"), + "prompt_tokens": usage.get("prompt_tokens"), + "eval_tok_s": timings.get("predicted_per_second"), + "prompt_tok_s": timings.get("prompt_per_second"), + "eval_ms": timings.get("predicted_ms"), + "prompt_ms": timings.get("prompt_ms"), + "draft_n": timings.get("draft_n"), + "draft_n_accepted": timings.get("draft_n_accepted"), + "wall_clock_ms": resp.get("wall_clock_ms"), + "sidecar_hash": sidecar_hash, + "sidecar_port": sidecar_port, + "error": resp.get("error"), + } + + +def append_result(row: dict) -> None: + results = [] + if RESULTS_PATH.exists(): + try: + results = json.loads(RESULTS_PATH.read_text()) + except (json.JSONDecodeError, OSError): + pass + results.append(row) + RESULTS_PATH.write_text(json.dumps(results, indent=2) + "\n") + + +def evict_all_sidecars() -> None: + sidecars = sidecar_request("GET", "/sidecars") + if not isinstance(sidecars, list): + return + for s in sidecars: + h = s.get("hash", "") + if h: + sidecar_request("DELETE", f"/sidecars/{h}") + + +def run_combo(model: str, n_max: int, combo_idx: int, total_combos: int, + prompt_names: list[str]) -> None: + flags = build_flags(n_max) + label = f"[{combo_idx}/{total_combos}] {model} n_max={n_max}" + print(f"\n{'='*60}") + print(f"{label}") + print(f" flags: {flags}") + print(f"{'='*60}") + + for pname in prompt_names: + p = PROMPTS[pname] + # Warmup + print(f" {pname}: warmup...", end="", flush=True) + send_completion(model, flags, p["content"], p["max_tokens"]) + print(" done.", flush=True) + time.sleep(2) + + # Record + print(f" {pname}: recording...", end="", flush=True) + resp = send_completion(model, flags, p["content"], p["max_tokens"]) + row = extract_metrics(resp, model, n_max, pname) + append_result(row) + + tok_s = row.get("eval_tok_s") + draft = row.get("draft_n") + err = row.get("error") + if err: + print(f" ERROR: {err}") + elif tok_s: + draft_str = f" draft_n={draft}" if draft else "" + 
print(f" {tok_s:.1f} tok/s{draft_str}") + else: + print(" (no timings in response)") + + # Evict this sidecar to free VRAM + evict_all_sidecars() + print(f" evicted sidecars, sleeping 5s for VRAM release...") + time.sleep(5) + + +def dry_run() -> None: + combos = [(model, n) for model, ns in MATRIX for n in ns] + print(f"Dry run: {len(combos)} combos × 3 prompts × 2 calls = {len(combos)*6} API calls") + print(f"Estimated runtime: 60-90 minutes\n") + for i, (model, n_max) in enumerate(combos, 1): + flags = build_flags(n_max) + print(f" [{i}/{len(combos)}] {model} n_max={n_max}") + print(f" flags: {flags}") + for pname in PROMPTS: + p = PROMPTS[pname] + print(f" {pname}: max_tokens={p['max_tokens']}") + print(f"\nResults would be written to: {RESULTS_PATH}") + + +def main() -> None: + parser = argparse.ArgumentParser(description="MTP n_max sweep benchmark") + parser.add_argument("--dry-run", action="store_true", help="Print matrix without running") + parser.add_argument("--limit", type=int, default=0, help="Run only first N combos") + args = parser.parse_args() + + if args.dry_run: + dry_run() + return + + # Check sidecar health + health = sidecar_request("GET", "/health") + if not health or health.get("status") != "ok": + print(f"Sidecar unhealthy: {health}", file=sys.stderr) + sys.exit(1) + print(f"Sidecar healthy: {health}") + + # Clear existing sidecars + evict_all_sidecars() + + combos = [(model, n) for model, ns in MATRIX for n in ns] + if args.limit > 0: + combos = combos[:args.limit] + prompt_names = list(PROMPTS.keys()) + + t_start = time.perf_counter() + for i, (model, n_max) in enumerate(combos, 1): + run_combo(model, n_max, i, len(combos), prompt_names) + + elapsed = time.perf_counter() - t_start + print(f"\nSweep complete. 
{len(combos)} combos in {elapsed/60:.1f} minutes.") + print(f"Results: {RESULTS_PATH}") + + +if __name__ == "__main__": + main() diff --git a/cmd/llama-sidecar/main.go b/cmd/llama-sidecar/main.go new file mode 100644 index 0000000..aecf797 --- /dev/null +++ b/cmd/llama-sidecar/main.go @@ -0,0 +1,74 @@ +package main + +import ( + "context" + "fmt" + "log/slog" + "net/http" + "os" + "time" + + "github.com/indifferentketchup/llama-sidecar/internal/config" + "github.com/indifferentketchup/llama-sidecar/internal/pool" + "github.com/indifferentketchup/llama-sidecar/internal/server" + "github.com/indifferentketchup/llama-sidecar/internal/winsvc" +) + +func main() { + cfg, err := config.Load() + if err != nil { + fmt.Fprintf(os.Stderr, "config error: %v\n", err) + os.Exit(1) + } + + initLogger(cfg.LogLevel) + slog.Info("starting llama-sidecar", + "bind", cfg.Bind, + "max_sidecars", cfg.MaxSidecars, + "port_range", fmt.Sprintf("%d-%d", cfg.PortRangeLo, cfg.PortRangeHi), + "models", len(cfg.ModelDirMap), + "base_args", cfg.BaseArgs, + ) + + startedAt := time.Now() + spawner := &pool.RealSpawner{} + p := pool.New(cfg, spawner) + srv := server.New(cfg, p, startedAt) + + go func() { + slog.Info("listening", "addr", cfg.Bind) + if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + slog.Error("server error", "err", err) + os.Exit(1) + } + }() + + winsvc.RegisterShutdownHandler(context.Background(), func(ctx context.Context) error { + slog.Info("draining HTTP server") + drainCtx, drainCancel := context.WithTimeout(ctx, 10*time.Second) + defer drainCancel() + if err := srv.Shutdown(drainCtx); err != nil { + slog.Error("HTTP drain failed", "err", err) + } + slog.Info("shutting down sidecar pool") + poolCtx, poolCancel := context.WithTimeout(ctx, 30*time.Second) + defer poolCancel() + return p.Shutdown(poolCtx) + }) +} + +func initLogger(level string) { + var lvl slog.Level + switch level { + case "debug": + lvl = slog.LevelDebug + case "warn": + lvl = 
// initLogger installs a JSON slog handler writing to stdout as the
// process-wide default logger, filtered at the given level.
//
// level is parsed with slog.Level.UnmarshalText, so the names "debug",
// "info", "warn" and "error" are accepted in any letter case (for example
// "DEBUG" from an environment variable), optionally followed by a numeric
// offset such as "warn+2". An empty or unrecognised value selects info; an
// unrecognised non-empty value is also reported through the new logger so
// that a misconfigured level is visible instead of silently ignored.
func initLogger(level string) {
	var lvl slog.Level
	parseErr := lvl.UnmarshalText([]byte(level))
	if parseErr != nil {
		lvl = slog.LevelInfo
	}

	handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: lvl})
	slog.SetDefault(slog.New(handler))

	if parseErr != nil && level != "" {
		slog.Warn("unknown log level, using info", "level", level)
	}
}
Diagnose: what code path could produce an empty summary, and what would you check first?" + }, + { + "id": "debug-2", + "agent": "Debugger", + "prompt": "Bug report: BooTerm terminal pane shows garbled output past column 66 on initial open, but corrects itself after manually resizing the browser window. The `stty size` inside the terminal reports `82 66` even though the pane is visually ~132 columns wide. tmux `list-windows` confirms the session was created at 66 columns. This only happens when opening a terminal pane via the split-pane button, not when opening it as the sole pane. Diagnose the root cause in `apps/web/src/components/panes/TerminalPane.tsx`." + }, + { + "id": "refactor-1", + "agent": "Refactorer", + "prompt": "The `streamCompletion` function in `apps/server/src/services/provider.ts` has grown to handle: AI SDK v6 streaming, XML fallback parsing for qwen3.6 tool-call emissions, abort signal handling (the explicit `if (signal?.aborted) throw` patch), reasoning-delta counting, and usage extraction. It's now ~200 lines. Propose a refactor that separates concerns without breaking the streaming contract. The function must remain a single entry point for callers." + }, + { + "id": "refactor-2", + "agent": "Refactorer", + "prompt": "The WebSocket frame publishing in BooCode went through two batches (v1.13.12 + v1.13.13) that converted ~80 publish sites to typed `publishFrame`/`publishUserFrame` wrappers with Zod validation. The schemas are duplicated byte-identical between `apps/server/src/types/ws-frames.ts` and `apps/web/src/api/ws-frames.ts` with a parity test. Propose a refactor to share the schema definition from a single source instead of maintaining the duplication + parity test." + }, + { + "id": "architect-1", + "agent": "Architect", + "prompt": "Design the system-prompt prefix cache for BooCode. Context: `buildSystemPromptWithFingerprint` already computes a SHA-256 of the assembled prefix and logs drift. 
The prefix is rebuilt on every inference turn from: project settings, agent instructions (AGENTS.md), skills, session-level overrides, and web_search_enabled flag. Most of these don't change between turns in the same session. Design a cache that avoids rebuilding+rehashing on every turn. Consider: process-memory vs DB-backed, invalidation strategy, cache key shape, and whether the fingerprint can serve as the cache key itself." + }, + { + "id": "architect-2", + "agent": "Architect", + "prompt": "Design the v2.5 task model integration with BooCoder's ACP dispatch. Context: v2.5.0-task-model just shipped a `tasks` table and lightweight task model services. BooCoder dispatches external agents (opencode, goose, claude) via ACP or PTY. Design how a task created in BooChat should flow through to a BooCoder dispatch: task creation → agent selection → ACP session → status updates back to the task row → completion. Consider: which fields from the task row map to ACP session params, how task status syncs with the agent's exit code, and how the UI surfaces progress." + }, + { + "id": "security-1", + "agent": "Security Auditor", + "prompt": "Audit the `web_fetch` tool implementation in BooCode. It fetches arbitrary URLs on behalf of the LLM agent. Check for: SSRF against internal Tailscale IPs (100.x.x.x), DNS rebinding, redirect following to internal hosts, response size limits, content-type validation, and whether the `url_guard.ts` layer covers all cases. The tool is gated by `session.web_search_enabled` but once enabled, the URL is user-agent-controlled (the LLM decides what to fetch)." + }, + { + "id": "security-2", + "agent": "Security Auditor", + "prompt": "Audit the `request_read_access` tool and `allowed_read_paths` grant mechanism (v1.13.17). When an agent needs to read files outside its project scope, it calls `request_read_access(path)` which triggers an `ask_user_input` elicitation for approval. 
On approval, the path is added to `allowed_read_paths` for that session, and `pathGuard` is extended with `extraRoots`. Check: can the agent request a path like `/etc/shadow` or `/opt/boocode/.env`? Is the grant scoped to the session or persistent? Can the path be a symlink that resolves to a sensitive location after the grant?" + }, + { + "id": "prompt-1", + "agent": "Prompt Builder", + "prompt": "Write a Claude Code dispatch prompt for: adding a new BooCode agent called 'Documenter' to AGENTS.md. The agent should read source files and produce inline JSDoc/TSDoc comments. It should use the read-only tool set. Temperature 0.4, steps 10. The prompt should include pre-flight checks, the exact file to modify, backup instructions, and verification steps." + }, + { + "id": "prompt-2", + "agent": "Prompt Builder", + "prompt": "Write an OpenCode dispatch prompt for: fixing the codecontext sidecar to handle projects with more than 10,000 files without OOMing. The fork is at /opt/forks/codecontext/. The agent should investigate the memory profile of the graph analysis pass, identify the allocation hotspot, and propose a streaming or chunked alternative. Include #careful hashtag, backup rules, and stop conditions." + }, + { + "id": "recon-1", + "agent": "Recon", + "prompt": "Map the BooCode monorepo at /opt/boocode/. I need: top-level directory structure, the three apps and their roles, how they share the database, the Docker container topology, and the key service files in apps/server/src/services/. Identify the data flow from a user message in BooChat through to the LLM inference call and back." + }, + { + "id": "recon-2", + "agent": "Recon", + "prompt": "Map the codecontext fork at /opt/forks/codecontext/. 
I need: the MCP tool surface (what tools are exposed), the parser architecture (how tree-sitter grammars are registered), the graph analysis pipeline (how dependencies and call graphs are built), and the codesight-merge additions (blast radius, hot files, routes, middleware). Identify the main entry points and the caching layer." + } +] diff --git a/eval/ab/run.sh b/eval/ab/run.sh new file mode 100755 index 0000000..b7a2c1d --- /dev/null +++ b/eval/ab/run.sh @@ -0,0 +1,242 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ENDPOINT="http://100.101.41.16:8401/v1" +PROMPTS_FILE="${SCRIPT_DIR}/prompts.json" +RESULTS_DIR="${SCRIPT_DIR}/results" +COMPARE_FILE="${SCRIPT_DIR}/COMPARE.md" +TIMING_FILE="${SCRIPT_DIR}/timing.csv" + +MODELS=( + qwen3.6-35b-a3b-mxfp4 + qwen3-coder-30b-apex + qwen3.6-27b-mtp + qwopus3.5-4b-mtp + qwen3.5-9b-deepseek-v4-mtp + qwopus3.6-35b-a3b-v1 + qwopus3.6-27b-v2-mtp + qwopus3.5-9b-coder-mtp +) + +mkdir -p "$RESULTS_DIR" + +# ── Parse prompts ───────────────────────────────────────────────────── + +PROMPT_COUNT=$(python3 -c "import json; print(len(json.load(open('${PROMPTS_FILE}'))))") +TOTAL=$((PROMPT_COUNT * ${#MODELS[@]})) +EST_MIN=$(( TOTAL * 30 / 60 )) + +echo "================================================================" +echo " A/B MODEL COMPARISON" +echo " ${PROMPT_COUNT} prompts × ${#MODELS[@]} models = ${TOTAL} requests" +echo " Estimated runtime: ~${EST_MIN} minutes" +echo " Endpoint: ${ENDPOINT}" +echo "================================================================" +echo "" + +# ── Main loop: models (outer) × prompts (inner) ────────────────────── +# One model load per model, all prompts answered, then swap. 
+ +t_start=$(date +%s) +done_count=0 + +for model in "${MODELS[@]}"; do + echo "" + echo "================================================================" + echo " MODEL: ${model}" + echo "================================================================" + + # Warmup: load the model with a trivial request + all_cached=true + for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do + PID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])") + if [ ! -f "${RESULTS_DIR}/${PID}/${model}.json" ] || [ ! -s "${RESULTS_DIR}/${PID}/${model}.json" ]; then + all_cached=false + break + fi + done + + if [ "$all_cached" = "true" ]; then + echo " All ${PROMPT_COUNT} prompts cached, skipping model" + for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do + done_count=$((done_count + 1)) + done + continue + fi + + echo " Warming up..." + curl -s -X POST "${ENDPOINT}/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"model\":\"${model}\",\"messages\":[{\"role\":\"user\",\"content\":\"Say OK.\"}],\"max_tokens\":10,\"temperature\":0}" \ + --max-time 300 > /dev/null 2>&1 + echo " Warm." 
+ + for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do + PROMPT_ID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])") + AGENT=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['agent'])") + + mkdir -p "${RESULTS_DIR}/${PROMPT_ID}" + OUT_JSON="${RESULTS_DIR}/${PROMPT_ID}/${model}.json" + OUT_MD="${RESULTS_DIR}/${PROMPT_ID}/${model}.md" + + # Resume: skip if already done + if [ -f "$OUT_JSON" ] && [ -s "$OUT_JSON" ]; then + done_count=$((done_count + 1)) + echo " [${PROMPT_ID}] cached (${done_count}/${TOTAL})" + continue + fi + + BODY=$(python3 -c " +import json +p = json.load(open('${PROMPTS_FILE}'))[${pidx}] +print(json.dumps({ + 'model': '${model}', + 'messages': [{'role': 'user', 'content': p['prompt']}], + 'temperature': 0.6, + 'max_tokens': 2048, + 'seed': 42, + 'stream': False +})) +") + + SUCCESS=0 + for attempt in 1 2; do + HTTP_CODE=$(curl -s -w '%{http_code}' -o "$OUT_JSON" \ + --max-time 300 \ + -X POST "${ENDPOINT}/chat/completions" \ + -H "Content-Type: application/json" \ + -d "$BODY" 2>/dev/null) + + if [ "$HTTP_CODE" = "200" ]; then + SUCCESS=1 + break + else + if [ "$attempt" = "1" ]; then + echo " [${PROMPT_ID}] HTTP ${HTTP_CODE}, retrying in 10s..." 
+ sleep 10 + else + echo "ERROR: HTTP ${HTTP_CODE}" > "$OUT_MD" + echo " [${PROMPT_ID}] FAILED (HTTP ${HTTP_CODE})" + fi + fi + done + + if [ "$SUCCESS" = "1" ]; then + python3 -c " +import json +d = json.load(open('${OUT_JSON}')) +msg = d.get('choices', [{}])[0].get('message', {}) +content = msg.get('content', '') or '' +reasoning = msg.get('reasoning_content', '') or '' +out = '' +if reasoning: + out += '\n' + reasoning + '\n\n\n' +out += content +open('${OUT_MD}', 'w').write(out) +" 2>/dev/null + done_count=$((done_count + 1)) + METRICS=$(python3 -c " +import json +d = json.load(open('${OUT_JSON}')) +t = d.get('timings', {}) +tps = t.get('predicted_per_second', 0) +tok = d.get('usage', {}).get('completion_tokens', 0) +print(f'{tps:.1f}tok/s {tok}tok') +" 2>/dev/null || echo "?") + echo " [${PROMPT_ID}] done (${METRICS}) [${done_count}/${TOTAL}]" + fi + + sleep 2 + done +done + +# ── Generate COMPARE.md ────────────────────────────────────────────── + +echo "" +echo "Generating COMPARE.md..." 
+ +MODELS_JSON=$(printf '%s\n' "${MODELS[@]}" | python3 -c "import json,sys; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))") + +python3 -c " +import json +from pathlib import Path + +prompts = json.load(open('${PROMPTS_FILE}')) +results_dir = Path('${RESULTS_DIR}') +models = json.loads('${MODELS_JSON}') + +lines = ['# A/B Model Comparison\n'] + +timing_rows = [] + +for p in prompts: + pid = p['id'] + agent = p['agent'] + short = p['prompt'][:80] + lines.append(f'## [{pid}] {agent}\n') + lines.append(f'> {short}...\n') + + for model in models: + md_path = results_dir / pid / f'{model}.md' + json_path = results_dir / pid / f'{model}.json' + lines.append(f'### {model}\n') + if md_path.exists(): + content = md_path.read_text().strip() + lines.append(f'{content}\n') + else: + lines.append('*(no response)*\n') + + if json_path.exists(): + try: + d = json.loads(json_path.read_text()) + t = d.get('timings', {}) + u = d.get('usage', {}) + timing_rows.append({ + 'prompt_id': pid, + 'model_id': model, + 'prompt_tps': t.get('prompt_per_second', 0), + 'predicted_tps': t.get('predicted_per_second', 0), + 'total_tokens': u.get('total_tokens', 0), + 'latency_ms': round((t.get('prompt_ms', 0) or 0) + (t.get('predicted_ms', 0) or 0), 1), + }) + except: + pass + lines.append('---\n') + +# Timing table +lines.append('## Timing Summary\n') +pids = list(dict.fromkeys(r['prompt_id'] for r in timing_rows)) +lines.append('| prompt | ' + ' | '.join(models) + ' |') +lines.append('|--------' + '|------' * len(models) + '|') +for pid in pids: + cells = [] + for model in models: + match = [r for r in timing_rows if r['prompt_id'] == pid and r['model_id'] == model] + if match: + cells.append(f\"{match[0]['predicted_tps']:.0f}\") + else: + cells.append('—') + lines.append(f'| {pid} | ' + ' | '.join(cells) + ' |') + +Path('${COMPARE_FILE}').write_text('\n'.join(lines) + '\n') +print(f'Wrote ${COMPARE_FILE}') + +# timing.csv +import csv +with open('${TIMING_FILE}', 'w', newline='') 
as f: + w = csv.DictWriter(f, fieldnames=['prompt_id', 'model_id', 'prompt_tps', 'predicted_tps', 'total_tokens', 'latency_ms']) + w.writeheader() + w.writerows(timing_rows) +print(f'Wrote ${TIMING_FILE}') +" + +t_end=$(date +%s) +elapsed=$(( t_end - t_start )) +echo "" +echo "================================================================" +echo " COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s" +echo " Results: ${RESULTS_DIR}/" +echo " Compare: ${COMPARE_FILE}" +echo " Timing: ${TIMING_FILE}" +echo "================================================================" diff --git a/eval/analyze.py b/eval/analyze.py new file mode 100644 index 0000000..e65c827 --- /dev/null +++ b/eval/analyze.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +"""Generate SUMMARY.md from scores.csv.""" + +import csv +from collections import defaultdict +from pathlib import Path + +CSV_PATH = Path(__file__).parent / "scores.csv" +SUMMARY_PATH = Path(__file__).parent / "SUMMARY.md" + + +def load_scores() -> list[dict]: + rows = [] + with open(CSV_PATH) as f: + for row in csv.DictReader(f): + row["correct"] = row["correct"].lower() in ("true", "1", "yes") + row["latency_ms"] = float(row.get("latency_ms", 0) or 0) + rows.append(row) + return rows + + +def main() -> None: + rows = load_scores() + if not rows: + print("No data in scores.csv") + return + + models = sorted(set(r["model"] for r in rows)) + benchmarks = ["mmlu", "gsm8k", "humaneval"] + + # Compute scores + scores = {} # (model, bench) -> (correct, total) + for r in rows: + key = (r["model"], r["benchmark"]) + if key not in scores: + scores[key] = [0, 0] + scores[key][1] += 1 + if r["correct"]: + scores[key][0] += 1 + + # MMLU per-category + cat_scores = defaultdict(lambda: [0, 0]) + for r in rows: + if r["benchmark"] == "mmlu" and r.get("category"): + key = (r["model"], r["category"]) + cat_scores[key][1] += 1 + if r["correct"]: + cat_scores[key][0] += 1 + + categories = sorted(set(r.get("category", "") for r in rows if 
r.get("category"))) + + lines = ["# Eval Results\n"] + + # Main table + lines.append("## Overall Scores\n") + header = "| Model | MMLU (%) | GSM8K (%) | HumanEval (%) | Avg (%) |" + sep = "|-------|---------|---------|--------------|---------|" + lines.append(header) + lines.append(sep) + + model_avgs = [] + for model in models: + cells = [] + pcts = [] + for bench in benchmarks: + key = (model, bench) + if key in scores: + c, t = scores[key] + pct = c / t * 100 if t > 0 else 0 + cells.append(f"{pct:.1f}") + pcts.append(pct) + else: + cells.append("—") + avg = sum(pcts) / len(pcts) if pcts else 0 + model_avgs.append((model, avg)) + cells.append(f"{avg:.1f}") + lines.append(f"| {model} | " + " | ".join(cells) + " |") + + # Sort summary + model_avgs.sort(key=lambda x: -x[1]) + lines.append(f"\n**Best overall: {model_avgs[0][0]}** ({model_avgs[0][1]:.1f}% avg)\n") + + # MMLU category breakdown + if categories: + lines.append("\n## MMLU Per-Category Breakdown\n") + header = "| Model | " + " | ".join(c.replace("_", " ").title() for c in categories) + " |" + sep = "|-------" + "|-------" * len(categories) + "|" + lines.append(header) + lines.append(sep) + for model in models: + cells = [] + for cat in categories: + key = (model, cat) + if key in cat_scores: + c, t = cat_scores[key] + cells.append(f"{c}/{t}") + else: + cells.append("—") + lines.append(f"| {model} | " + " | ".join(cells) + " |") + + # Latency summary + lines.append("\n## Median Latency (ms)\n") + lines.append("| Model | MMLU | GSM8K | HumanEval |") + lines.append("|-------|------|-------|-----------|") + for model in models: + cells = [] + for bench in benchmarks: + lats = sorted([r["latency_ms"] for r in rows + if r["model"] == model and r["benchmark"] == bench + and r["latency_ms"] > 0]) + if lats: + med = lats[len(lats)//2] + cells.append(f"{med:.0f}") + else: + cells.append("—") + lines.append(f"| {model} | " + " | ".join(cells) + " |") + + summary = "\n".join(lines) + "\n" + 
SUMMARY_PATH.write_text(summary) + print(summary) + print(f"\nWritten to: {SUMMARY_PATH}") + + +if __name__ == "__main__": + main() diff --git a/eval/gsm8k.py b/eval/gsm8k.py new file mode 100644 index 0000000..b4e0288 --- /dev/null +++ b/eval/gsm8k.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +"""GSM8K 50-question subset benchmark (seed=42).""" + +import json +import os +import random +import re +import sys +import time +from pathlib import Path + +from datasets import load_dataset +from openai import OpenAI +from tqdm import tqdm + +ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1") +RESULTS_DIR = Path(__file__).parent / "results" +MAX_TOKENS = 512 +SEED = 42 +TEMPERATURE = 0 +N_QUESTIONS = 50 + + +def load_questions() -> list[dict]: + rng = random.Random(SEED) + ds = load_dataset("openai/gsm8k", "main", split="test", trust_remote_code=True) + indices = list(range(len(ds))) + rng.shuffle(indices) + questions = [] + for idx in indices[:N_QUESTIONS]: + row = ds[idx] + answer_text = row["answer"] + # GSM8K answer format: "#### " at end + match = re.search(r"####\s*([0-9,.-]+)", answer_text) + expected = int(match.group(1).replace(",", "")) if match else 0 + questions.append({ + "id": f"gsm8k_{idx}", + "question": row["question"], + "expected": expected, + }) + return questions + + +def format_prompt(q: dict) -> str: + return ( + "Solve this problem step by step, then on the final line write " + "'ANSWER: '.\n\n" + q["question"] + ) + + +def parse_answer(text: str) -> int | None: + matches = re.findall(r"ANSWER:\s*([0-9,.-]+)", text, re.IGNORECASE) + if matches: + try: + return int(matches[-1].replace(",", "")) + except ValueError: + return None + # Fallback: last number in the response + nums = re.findall(r"-?\d[\d,]*", text) + if nums: + try: + return int(nums[-1].replace(",", "")) + except ValueError: + return None + return None + + +def run_gsm8k(model: str, client: OpenAI, questions: list[dict]) -> list[dict]: + model_dir = RESULTS_DIR / 
model / "gsm8k" + model_dir.mkdir(parents=True, exist_ok=True) + + results = [] + correct = 0 + total = 0 + + skipped = 0 + for i, q in enumerate(tqdm(questions, desc=f" GSM8K {model}", file=sys.stderr)): + expected = q["expected"] + out_path = model_dir / f"{q['id']}.json" + + if out_path.exists(): + try: + cached = json.loads(out_path.read_text()) + raw = "" + if "choices" in cached: + msg = cached["choices"][0].get("message", {}) + raw = msg.get("content", "") or msg.get("reasoning_content", "") or "" + parsed = parse_answer(raw) + is_correct = parsed is not None and parsed == expected + if is_correct: + correct += 1 + total += 1 + results.append({ + "model": model, "benchmark": "gsm8k", "question_id": q["id"], + "correct": is_correct, "raw_answer": raw[:200], + "parsed_answer": str(parsed) if parsed is not None else "", + "expected": str(expected), "latency_ms": 0, + }) + skipped += 1 + continue + except (json.JSONDecodeError, KeyError): + pass + + prompt = format_prompt(q) + t0 = time.time() + resp_json = None + for attempt in range(2): + try: + resp = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=MAX_TOKENS, + temperature=TEMPERATURE, + seed=SEED, + ) + resp_json = resp.model_dump() + break + except Exception as e: + if attempt == 0: + time.sleep(5) + else: + resp_json = {"error": str(e)} + latency = (time.time() - t0) * 1000 + + raw = "" + if resp_json and "choices" in resp_json: + msg = resp_json["choices"][0].get("message", {}) + raw = msg.get("content", "") or msg.get("reasoning_content", "") or "" + + parsed = parse_answer(raw) + is_correct = parsed is not None and parsed == expected + if is_correct: + correct += 1 + total += 1 + + out_path.write_text(json.dumps(resp_json, indent=2, default=str)) + + results.append({ + "model": model, + "benchmark": "gsm8k", + "question_id": q["id"], + "correct": is_correct, + "raw_answer": raw[:200], + "parsed_answer": str(parsed) if parsed is not None else 
"", + "expected": str(expected), + "latency_ms": round(latency, 1), + }) + + if (i + 1) % 10 == 0: + print(f" [{model}] GSM8K {i+1}/{len(questions)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr) + + if skipped: + print(f" [{model}] GSM8K resumed: {skipped} cached, {total-skipped} new", file=sys.stderr) + print(f" [{model}] GSM8K FINAL: {correct}/{total} ({correct/total*100:.1f}%)", file=sys.stderr) + return results + + +if __name__ == "__main__": + model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4" + client = OpenAI(base_url=ENDPOINT, api_key="dummy") + questions = load_questions() + results = run_gsm8k(model, client, questions) + for r in results: + print(json.dumps(r)) diff --git a/eval/humaneval.py b/eval/humaneval.py new file mode 100644 index 0000000..490baa5 --- /dev/null +++ b/eval/humaneval.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +"""HumanEval benchmark — 164 problems with sandboxed execution.""" + +import json +import os +import re +import subprocess +import sys +import tempfile +import textwrap +import time +from pathlib import Path + +from datasets import load_dataset +from openai import OpenAI +from tqdm import tqdm + +ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1") +RESULTS_DIR = Path(__file__).parent / "results" +MAX_TOKENS = 1024 +SEED = 42 +TEMPERATURE = 0 +EXEC_TIMEOUT = 30 + + +def load_problems() -> list[dict]: + ds = load_dataset("openai/openai_humaneval", split="test", trust_remote_code=True) + problems = [] + for row in ds: + problems.append({ + "id": row["task_id"], + "prompt": row["prompt"], + "canonical": row["canonical_solution"], + "test": row["test"], + "entry_point": row["entry_point"], + }) + return problems + + +def extract_code(response: str, prompt: str) -> str: + # Try to find a code block + blocks = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL) + if blocks: + code = blocks[0] + # If the code block contains the function signature, use it directly 
+ if "def " in code: + return code + # Otherwise prepend the prompt (function signature) + return prompt + code + + # No code block — try to extract everything from the first def onwards + lines = response.split("\n") + in_code = False + code_lines = [] + for line in lines: + if line.strip().startswith("def ") or in_code: + in_code = True + code_lines.append(line) + elif in_code and line.strip() == "": + code_lines.append(line) + + if code_lines: + return "\n".join(code_lines) + + # Last resort: prepend prompt to raw response + return prompt + response + + +def run_test(code: str, test_code: str, entry_point: str) -> tuple[bool, str]: + full = code + "\n\n" + test_code + f"\n\ncheck({entry_point})\n" + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", dir="/tmp", delete=False + ) as f: + f.write(full) + f.flush() + fpath = f.name + + try: + # Sandboxed execution: restrict to /tmp, limited PATH + env = {"PATH": "/usr/bin:/usr/local/bin", "HOME": "/tmp"} + result = subprocess.run( + [sys.executable, fpath], + capture_output=True, text=True, + timeout=EXEC_TIMEOUT, + cwd="/tmp", + env=env, + ) + passed = result.returncode == 0 + output = result.stderr[:500] if result.stderr else result.stdout[:500] + return passed, output + except subprocess.TimeoutExpired: + return False, "TIMEOUT" + except Exception as e: + return False, str(e)[:500] + finally: + try: + os.unlink(fpath) + except OSError: + pass + + +def run_humaneval(model: str, client: OpenAI, problems: list[dict]) -> list[dict]: + model_dir = RESULTS_DIR / model / "humaneval" + model_dir.mkdir(parents=True, exist_ok=True) + + results = [] + correct = 0 + total = 0 + + skipped = 0 + for i, p in enumerate(tqdm(problems, desc=f" HumanEval {model}", file=sys.stderr)): + out_path = model_dir / f"{p['id'].replace('/', '_')}.json" + + if out_path.exists(): + try: + cached = json.loads(out_path.read_text()) + passed = cached.get("passed", False) + if passed: + correct += 1 + total += 1 + results.append({ + 
"model": model, "benchmark": "humaneval", + "question_id": p["id"], "correct": passed, + "raw_answer": "", "parsed_answer": "pass" if passed else "fail", + "expected": "pass", "latency_ms": 0, + }) + skipped += 1 + continue + except (json.JSONDecodeError, KeyError): + pass + + t0 = time.time() + resp_json = None + for attempt in range(2): + try: + resp = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": ( + "Complete the following Python function. " + "Return ONLY the complete function implementation.\n\n" + + p["prompt"] + )}], + max_tokens=MAX_TOKENS, + temperature=TEMPERATURE, + seed=SEED, + ) + resp_json = resp.model_dump() + break + except Exception as e: + if attempt == 0: + time.sleep(5) + else: + resp_json = {"error": str(e)} + latency = (time.time() - t0) * 1000 + + raw = "" + if resp_json and "choices" in resp_json: + msg = resp_json["choices"][0].get("message", {}) + raw = msg.get("content", "") or msg.get("reasoning_content", "") or "" + + code = extract_code(raw, p["prompt"]) + passed, exec_output = run_test(code, p["test"], p["entry_point"]) + if passed: + correct += 1 + total += 1 + + out_path.write_text(json.dumps({ + "response": resp_json, + "extracted_code": code[:2000], + "passed": passed, + "exec_output": exec_output, + }, indent=2, default=str)) + + results.append({ + "model": model, + "benchmark": "humaneval", + "question_id": p["id"], + "correct": passed, + "raw_answer": raw[:200], + "parsed_answer": "pass" if passed else "fail", + "expected": "pass", + "latency_ms": round(latency, 1), + }) + + if (i + 1) % 10 == 0: + print(f" [{model}] HumanEval {i+1}/{len(problems)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr) + + if skipped: + print(f" [{model}] HumanEval resumed: {skipped} cached, {total-skipped} new", file=sys.stderr) + print(f" [{model}] HumanEval FINAL: {correct}/{total} ({correct/total*100:.1f}%)", file=sys.stderr) + return results + + +if __name__ == "__main__": + model = 
sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4" + client = OpenAI(base_url=ENDPOINT, api_key="dummy") + problems = load_problems() + results = run_humaneval(model, client, problems) + for r in results: + print(json.dumps(r)) diff --git a/eval/mmlu.py b/eval/mmlu.py new file mode 100644 index 0000000..92a4f11 --- /dev/null +++ b/eval/mmlu.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""MMLU 100-question subset benchmark (20 per category, seed=42).""" + +import json +import os +import random +import re +import sys +import time +from pathlib import Path + +from datasets import load_dataset +from openai import OpenAI +from tqdm import tqdm + +ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1") +RESULTS_DIR = Path(__file__).parent / "results" +MAX_TOKENS = 512 +SEED = 42 +TEMPERATURE = 0 + +CATEGORIES = [ + "high_school_mathematics", + "college_computer_science", + "professional_medicine", + "formal_logic", + "miscellaneous", +] +PER_CATEGORY = 20 + +CHOICES = ["A", "B", "C", "D"] + + +def load_questions() -> list[dict]: + rng = random.Random(SEED) + questions = [] + for cat in CATEGORIES: + ds = load_dataset("cais/mmlu", cat, split="test", trust_remote_code=True) + indices = list(range(len(ds))) + rng.shuffle(indices) + for idx in indices[:PER_CATEGORY]: + row = ds[idx] + questions.append({ + "id": f"{cat}_{idx}", + "category": cat, + "question": row["question"], + "choices": row["choices"], + "answer_idx": row["answer"], + }) + return questions + + +def format_prompt(q: dict) -> str: + lines = [f"Question: {q['question']}"] + for i, choice in enumerate(q["choices"]): + lines.append(f"{CHOICES[i]}) {choice}") + lines.append("Answer with a single letter: ") + return "\n".join(lines) + + +def parse_answer(text: str) -> str | None: + for ch in text.strip(): + if ch.upper() in CHOICES: + return ch.upper() + return None + + +def run_mmlu(model: str, client: OpenAI, questions: list[dict]) -> list[dict]: + model_dir = RESULTS_DIR / model 
/ "mmlu" + model_dir.mkdir(parents=True, exist_ok=True) + + results = [] + correct = 0 + total = 0 + + skipped = 0 + for i, q in enumerate(tqdm(questions, desc=f" MMLU {model}", file=sys.stderr)): + expected = CHOICES[q["answer_idx"]] + out_path = model_dir / f"{q['id']}.json" + + # Resume: skip if result file exists + if out_path.exists(): + try: + cached = json.loads(out_path.read_text()) + raw = "" + if "choices" in cached: + msg = cached["choices"][0].get("message", {}) + raw = msg.get("content", "") or msg.get("reasoning_content", "") or "" + parsed = parse_answer(raw) + is_correct = parsed == expected + if is_correct: + correct += 1 + total += 1 + results.append({ + "model": model, "benchmark": "mmlu", "question_id": q["id"], + "category": q["category"], "correct": is_correct, + "raw_answer": raw[:200], "parsed_answer": parsed or "", + "expected": expected, "latency_ms": 0, + }) + skipped += 1 + continue + except (json.JSONDecodeError, KeyError): + pass + + prompt = format_prompt(q) + t0 = time.time() + resp_json = None + for attempt in range(2): + try: + resp = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=MAX_TOKENS, + temperature=TEMPERATURE, + seed=SEED, + ) + resp_json = resp.model_dump() + break + except Exception as e: + if attempt == 0: + time.sleep(5) + else: + resp_json = {"error": str(e)} + latency = (time.time() - t0) * 1000 + + raw = "" + if resp_json and "choices" in resp_json: + msg = resp_json["choices"][0].get("message", {}) + raw = msg.get("content", "") or msg.get("reasoning_content", "") or "" + + parsed = parse_answer(raw) + is_correct = parsed == expected + if is_correct: + correct += 1 + total += 1 + + out_path.write_text(json.dumps(resp_json, indent=2, default=str)) + + results.append({ + "model": model, + "benchmark": "mmlu", + "question_id": q["id"], + "category": q["category"], + "correct": is_correct, + "raw_answer": raw[:200], + "parsed_answer": parsed or "", + 
"expected": expected, + "latency_ms": round(latency, 1), + }) + + if (i + 1) % 10 == 0: + print(f" [{model}] MMLU {i+1}/{len(questions)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr) + + if skipped: + print(f" [{model}] MMLU resumed: {skipped} cached, {total-skipped} new", file=sys.stderr) + print(f" [{model}] MMLU FINAL: {correct}/{total} ({correct/total*100:.1f}%)", file=sys.stderr) + return results + + +if __name__ == "__main__": + model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4" + client = OpenAI(base_url=ENDPOINT, api_key="dummy") + questions = load_questions() + results = run_mmlu(model, client, questions) + for r in results: + print(json.dumps(r)) diff --git a/eval/run_all.py b/eval/run_all.py new file mode 100644 index 0000000..1d44d1b --- /dev/null +++ b/eval/run_all.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Orchestrate MMLU, GSM8K, HumanEval across all models.""" + +import csv +import json +import os +import sys +import time +from pathlib import Path + +from openai import OpenAI + +ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1") +RESULTS_DIR = Path(__file__).parent / "results" +CSV_PATH = Path(__file__).parent / "scores.csv" + +MODELS = [ + "qwen3.6-35b-a3b-mxfp4", + "qwen3-coder-30b-apex", + "qwen3.6-27b-mtp", + "qwopus3.5-4b-mtp", + "qwen3.5-9b-deepseek-v4-mtp", + "qwopus3.6-35b-a3b-v1", + "qwopus3.6-27b-v2-mtp", + "qwopus3.5-9b-coder-mtp", +] + + +def warmup_model(client: OpenAI, model: str) -> bool: + print(f"\n{'='*60}", file=sys.stderr) + print(f" Loading model: {model}", file=sys.stderr) + print(f"{'='*60}", file=sys.stderr) + for attempt in range(3): + try: + resp = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "Say OK."}], + max_tokens=10, + temperature=0, + ) + print(f" Warmup OK", file=sys.stderr) + return True + except Exception as e: + print(f" Warmup attempt {attempt+1} failed: {e}", file=sys.stderr) + time.sleep(10) + 
print(f" WARNING: warmup failed for {model}, continuing anyway", file=sys.stderr) + return False + + +def run_benchmark(module_name: str, model: str, client: OpenAI) -> list[dict]: + if module_name == "mmlu": + from mmlu import load_questions, run_mmlu + questions = load_questions() + return run_mmlu(model, client, questions) + elif module_name == "gsm8k": + from gsm8k import load_questions, run_gsm8k + questions = load_questions() + return run_gsm8k(model, client, questions) + elif module_name == "humaneval": + from humaneval import load_problems, run_humaneval + problems = load_problems() + return run_humaneval(model, client, problems) + else: + raise ValueError(f"Unknown benchmark: {module_name}") + + +def main() -> None: + client = OpenAI(base_url=ENDPOINT, api_key="dummy") + + # Check connectivity + try: + client.models.list() + print("Connected to llama-swap", file=sys.stderr) + except Exception as e: + print(f"Cannot connect to {ENDPOINT}: {e}", file=sys.stderr) + sys.exit(1) + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + all_results: list[dict] = [] + benchmarks = ["mmlu", "gsm8k", "humaneval"] + + t_start = time.time() + + for model in MODELS: + warmup_model(client, model) + + for bench in benchmarks: + print(f"\n --- {model} / {bench} ---", file=sys.stderr) + try: + results = run_benchmark(bench, model, client) + all_results.extend(results) + write_csv(all_results) + except Exception as e: + print(f" ERROR in {model}/{bench}: {e}", file=sys.stderr) + + elapsed = time.time() - t_start + print(f"\nAll benchmarks complete in {elapsed/60:.0f} minutes", file=sys.stderr) + print(f"Results: {CSV_PATH}", file=sys.stderr) + + +def write_csv(results: list[dict]) -> None: + if not results: + return + fields = ["model", "benchmark", "question_id", "correct", "raw_answer", + "parsed_answer", "expected", "latency_ms"] + # Also include category if present (MMLU) + if any("category" in r for r in results): + fields.insert(3, "category") + + with open(CSV_PATH, "w", 
newline="") as f: + w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore") + w.writeheader() + w.writerows(results) + + +if __name__ == "__main__": + main() diff --git a/eval/run_all.sh b/eval/run_all.sh new file mode 100755 index 0000000..d30822d --- /dev/null +++ b/eval/run_all.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -euo pipefail + +EVAL_DIR="$(cd "$(dirname "$0")" && pwd)" +VENV="${EVAL_DIR}/.venv/bin/python3" + +cd "$EVAL_DIR" + +echo "Starting eval sweep at $(date)" +echo "Using venv: ${VENV}" +echo "" + +$VENV run_all.py 2>&1 | tee eval.log + +echo "" +echo "Generating summary..." +$VENV analyze.py + +echo "" +echo "Done at $(date)" diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..2206a9c --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/indifferentketchup/llama-sidecar + +go 1.26.3 diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..a96a9f4 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,139 @@ +package config + +import ( + "bytes" + "encoding/json" + "fmt" + "os" + "strconv" + "strings" +) + +var utf8BOM = []byte{0xEF, 0xBB, 0xBF} + +type Config struct { + Bind string + LlamaServerBin string + ModelDirMap map[string]string + PortRangeLo int + PortRangeHi int + MaxSidecars int + LogLevel string + BaseArgs []string + HealthTimeoutSeconds int + HealthIntervalSeconds int +} + +func Load() (*Config, error) { + bin := os.Getenv("LLAMA_SERVER_BIN") + if bin == "" { + return nil, fmt.Errorf("LLAMA_SERVER_BIN is required") + } + if _, err := os.Stat(bin); err != nil { + return nil, fmt.Errorf("LLAMA_SERVER_BIN %q: %w", bin, err) + } + + mapFile := os.Getenv("MODEL_DIR_MAP_FILE") + if mapFile == "" { + return nil, fmt.Errorf("MODEL_DIR_MAP_FILE is required") + } + modelMap, err := loadModelMap(mapFile) + if err != nil { + return nil, fmt.Errorf("MODEL_DIR_MAP_FILE: %w", err) + } + + bind := envOr("LLAMA_SIDECAR_BIND", "127.0.0.1:8402") + logLevel := 
// parsePortRange parses a port range written as "lo-hi", e.g. "8500-8599".
//
// Whitespace around either bound is ignored. Both bounds must be usable TCP
// port numbers (1-65535) and hi must be strictly greater than lo. On any
// error both returned ports are zero.
func parsePortRange(s string) (int, int, error) {
	const (
		minPort = 1
		maxPort = 65535
	)

	// Split on the first '-' only; anything after it belongs to the hi bound
	// and is rejected by Atoi if it is not a plain integer.
	loStr, hiStr, ok := strings.Cut(s, "-")
	if !ok {
		return 0, 0, fmt.Errorf("expected lo-hi format, got %q", s)
	}
	lo, err := strconv.Atoi(strings.TrimSpace(loStr))
	if err != nil {
		return 0, 0, fmt.Errorf("invalid lo port: %w", err)
	}
	hi, err := strconv.Atoi(strings.TrimSpace(hiStr))
	if err != nil {
		return 0, 0, fmt.Errorf("invalid hi port: %w", err)
	}

	// Fail at config-load time instead of at the first spawn: a value outside
	// 1-65535 can never be bound by a sidecar.
	if lo < minPort || lo > maxPort {
		return 0, 0, fmt.Errorf("lo port %d out of range %d-%d", lo, minPort, maxPort)
	}
	if hi < minPort || hi > maxPort {
		return 0, 0, fmt.Errorf("hi port %d out of range %d-%d", hi, minPort, maxPort)
	}
	if hi <= lo {
		return 0, 0, fmt.Errorf("hi (%d) must be > lo (%d)", hi, lo)
	}
	return lo, hi, nil
}
+ if err := os.WriteFile(path, content, 0644); err != nil { + t.Fatal(err) + } + m, err := loadModelMap(path) + if err != nil { + t.Fatalf("BOM-prefixed JSON should parse: %v", err) + } + if m["test-model"] != "/fake/path.gguf" { + t.Fatalf("unexpected map: %v", m) + } +} + +func TestDefaultBaseArgs_FlashAttn(t *testing.T) { + args := defaultBaseArgs() + for i, a := range args { + if a == "--flash-attn" && i+1 < len(args) && args[i+1] == "on" { + return + } + } + t.Fatal("expected --flash-attn on in default args") +} + +func TestDefaultBaseArgs(t *testing.T) { + args := defaultBaseArgs() + if len(args) == 0 { + t.Fatal("expected non-empty default args") + } + found := false + for _, a := range args { + if a == "--no-mmap" { + found = true + } + } + if !found { + t.Fatal("expected --no-mmap in default args") + } +} diff --git a/internal/pool/hash.go b/internal/pool/hash.go new file mode 100644 index 0000000..fc025bb --- /dev/null +++ b/internal/pool/hash.go @@ -0,0 +1,53 @@ +package pool + +import ( + "crypto/sha256" + "fmt" + "sort" + "strings" + + "github.com/indifferentketchup/llama-sidecar/internal/validator" +) + +// Hash computes a deterministic hash for a (modelID, flags) pair. +// Flag order does not affect the result. 
+func Hash(modelID string, flags []string) string { + type pair struct { + key, val string + } + + var pairs []pair + i := 0 + for i < len(flags) { + tok := flags[i] + key := validator.FlagName(tok) + if key == "" { + i++ + continue + } + if idx := strings.IndexByte(tok, '='); idx >= 0 { + pairs = append(pairs, pair{key: tok[:idx], val: tok[idx+1:]}) + i++ + } else if i+1 < len(flags) && validator.FlagName(flags[i+1]) == "" { + pairs = append(pairs, pair{key: key, val: flags[i+1]}) + i += 2 + } else { + pairs = append(pairs, pair{key: key, val: ""}) + i++ + } + } + + sort.Slice(pairs, func(a, b int) bool { + return pairs[a].key < pairs[b].key + }) + + var parts []string + for _, p := range pairs { + parts = append(parts, p.key+"\x1f"+p.val) + } + serialized := strings.Join(parts, "\x1e") + input := modelID + "\x1d" + serialized + + sum := sha256.Sum256([]byte(input)) + return fmt.Sprintf("%x", sum[:8]) +} diff --git a/internal/pool/hash_test.go b/internal/pool/hash_test.go new file mode 100644 index 0000000..5d95964 --- /dev/null +++ b/internal/pool/hash_test.go @@ -0,0 +1,53 @@ +package pool + +import ( + "math/rand" + "testing" +) + +func TestHash_OrderIndependence(t *testing.T) { + flags1 := []string{"--a", "1", "--b", "2", "--c", "3"} + h1 := Hash("foo", flags1) + + for i := 0; i < 5; i++ { + shuffled := make([]string, len(flags1)) + copy(shuffled, flags1) + // Shuffle pairs (each pair is 2 tokens) + pairs := make([][2]string, 0) + for j := 0; j < len(shuffled); j += 2 { + pairs = append(pairs, [2]string{shuffled[j], shuffled[j+1]}) + } + rand.Shuffle(len(pairs), func(a, b int) { pairs[a], pairs[b] = pairs[b], pairs[a] }) + var flat []string + for _, p := range pairs { + flat = append(flat, p[0], p[1]) + } + h := Hash("foo", flat) + if h != h1 { + t.Errorf("iteration %d: hash %s != %s for order %v", i, h, h1, flat) + } + } +} + +func TestHash_SeparatorCollision(t *testing.T) { + h1 := Hash("foo", []string{"--a\x1eb", "1"}) + h2 := Hash("foo", []string{"--ab", 
"1"}) + if h1 == h2 { + t.Error("separator collision: hashes should differ") + } +} + +func TestHash_Length(t *testing.T) { + h := Hash("model", []string{"--top-k", "20"}) + if len(h) != 16 { + t.Errorf("expected 16 hex chars, got %d: %s", len(h), h) + } +} + +func TestHash_DifferentModels(t *testing.T) { + h1 := Hash("model-a", []string{"--top-k", "20"}) + h2 := Hash("model-b", []string{"--top-k", "20"}) + if h1 == h2 { + t.Error("different models should produce different hashes") + } +} diff --git a/internal/pool/pool.go b/internal/pool/pool.go new file mode 100644 index 0000000..a8bbd73 --- /dev/null +++ b/internal/pool/pool.go @@ -0,0 +1,188 @@ +package pool + +import ( + "container/list" + "context" + "fmt" + "log/slog" + "sync" + "time" + + "github.com/indifferentketchup/llama-sidecar/internal/config" + "github.com/indifferentketchup/llama-sidecar/internal/validator" +) + +type SidecarInfo struct { + Hash string `json:"hash"` + ModelID string `json:"model_id"` + Flags []string `json:"flags"` + Port int `json:"port"` + Pid int `json:"pid"` + StartedAt time.Time `json:"started_at"` + LastUsed time.Time `json:"last_used"` + Healthy bool `json:"healthy"` +} + +type Pool struct { + mu sync.Mutex + cfg *config.Config + sidecars map[string]*Sidecar + lru *list.List + lruIdx map[string]*list.Element + ports *PortAllocator + spawner Spawner +} + +func New(cfg *config.Config, spawner Spawner) *Pool { + return &Pool{ + cfg: cfg, + sidecars: make(map[string]*Sidecar), + lru: list.New(), + lruIdx: make(map[string]*list.Element), + ports: NewPortAllocator(cfg.PortRangeLo, cfg.PortRangeHi), + spawner: spawner, + } +} + +func (p *Pool) Acquire(ctx context.Context, modelID string, flags []string) (*Sidecar, error) { + if _, err := validator.ValidateExtraArgs(flags); err != nil { + return nil, fmt.Errorf("validation: %w", err) + } + + modelPath, ok := p.cfg.ModelDirMap[modelID] + if !ok { + return nil, fmt.Errorf("unknown model: %s", modelID) + } + + hash := Hash(modelID, 
flags) + + p.mu.Lock() + defer p.mu.Unlock() + + if s, ok := p.sidecars[hash]; ok { + if s.Healthy() { + if el, ok := p.lruIdx[hash]; ok { + p.lru.MoveToFront(el) + } + s.TouchLastUsed() + return s, nil + } + p.removeLocked(hash) + } + + if len(p.sidecars) >= p.cfg.MaxSidecars { + if err := p.evictLRULocked(); err != nil { + return nil, fmt.Errorf("eviction failed: %w", err) + } + } + + port, err := p.ports.Allocate() + if err != nil { + return nil, fmt.Errorf("port allocation: %w", err) + } + + p.mu.Unlock() + s, err := p.spawner.Spawn(ctx, p.cfg, modelID, modelPath, flags, port, hash) + p.mu.Lock() + + if err != nil { + p.ports.Release(port) + return nil, fmt.Errorf("spawn: %w", err) + } + + p.sidecars[hash] = s + el := p.lru.PushFront(hash) + p.lruIdx[hash] = el + return s, nil +} + +func (p *Pool) List() []SidecarInfo { + p.mu.Lock() + defer p.mu.Unlock() + out := make([]SidecarInfo, 0, len(p.sidecars)) + for _, s := range p.sidecars { + out = append(out, SidecarInfo{ + Hash: s.Hash, + ModelID: s.ModelID, + Flags: s.Flags, + Port: s.Port, + Pid: s.Pid, + StartedAt: s.StartedAt, + LastUsed: time.Unix(0, s.LastUsed.Load()), + Healthy: s.Healthy(), + }) + } + return out +} + +func (p *Pool) Remove(hash string) error { + p.mu.Lock() + defer p.mu.Unlock() + if _, ok := p.sidecars[hash]; !ok { + return fmt.Errorf("sidecar %s not found", hash) + } + return p.removeLocked(hash) +} + +func (p *Pool) Shutdown(ctx context.Context) error { + p.mu.Lock() + hashes := make([]string, 0, len(p.sidecars)) + for h := range p.sidecars { + hashes = append(hashes, h) + } + p.mu.Unlock() + + var wg sync.WaitGroup + for _, h := range hashes { + wg.Add(1) + go func(hash string) { + defer wg.Done() + p.mu.Lock() + s, ok := p.sidecars[hash] + p.mu.Unlock() + if !ok { + return + } + if err := p.spawner.Kill(s); err != nil { + slog.Error("shutdown kill failed", "hash", hash, "err", err) + } + }(h) + } + + done := make(chan struct{}) + go func() { wg.Wait(); close(done) }() + select { + 
case <-done: + case <-ctx.Done(): + return ctx.Err() + } + slog.Info("pool shutdown complete", "count", len(hashes)) + return nil +} + +func (p *Pool) removeLocked(hash string) error { + s, ok := p.sidecars[hash] + if !ok { + return nil + } + delete(p.sidecars, hash) + if el, ok := p.lruIdx[hash]; ok { + p.lru.Remove(el) + delete(p.lruIdx, hash) + } + if err := p.spawner.Kill(s); err != nil { + slog.Error("kill failed during remove", "hash", hash, "err", err) + } + p.ports.Release(s.Port) + return nil +} + +func (p *Pool) evictLRULocked() error { + back := p.lru.Back() + if back == nil { + return fmt.Errorf("pool full but LRU empty") + } + hash := back.Value.(string) + slog.Info("evicting LRU sidecar", "hash", hash) + return p.removeLocked(hash) +} diff --git a/internal/pool/pool_test.go b/internal/pool/pool_test.go new file mode 100644 index 0000000..b3aec99 --- /dev/null +++ b/internal/pool/pool_test.go @@ -0,0 +1,151 @@ +package pool + +import ( + "context" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/indifferentketchup/llama-sidecar/internal/config" +) + +type fakeSpawner struct { + spawnCount atomic.Int32 + killCount atomic.Int32 +} + +func (f *fakeSpawner) Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error) { + f.spawnCount.Add(1) + s := &Sidecar{ + Hash: hash, + ModelID: modelID, + ModelPath: modelPath, + Flags: flags, + Port: port, + Pid: 99999, + StartedAt: time.Now(), + stderr: newRingBuffer(8), + cancel: func() {}, + } + s.healthy.Store(true) + s.LastUsed.Store(time.Now().UnixNano()) + return s, nil +} + +func (f *fakeSpawner) Kill(s *Sidecar) error { + f.killCount.Add(1) + return nil +} + +func testConfig() *config.Config { + return &config.Config{ + Bind: "127.0.0.1:0", + LlamaServerBin: "/fake/llama-server", + ModelDirMap: map[string]string{ + "model-a": "/fake/model-a.gguf", + "model-b": "/fake/model-b.gguf", + }, + PortRangeLo: 8500, + PortRangeHi: 8509, 
+ MaxSidecars: 2, + BaseArgs: []string{"-ngl", "999"}, + HealthTimeoutSeconds: 60, + } +} + +func TestPool_AcquireSameKey(t *testing.T) { + fs := &fakeSpawner{} + p := New(testConfig(), fs) + ctx := context.Background() + + s1, err := p.Acquire(ctx, "model-a", []string{"--top-k", "20"}) + if err != nil { + t.Fatal(err) + } + s2, err := p.Acquire(ctx, "model-a", []string{"--top-k", "20"}) + if err != nil { + t.Fatal(err) + } + if s1.Hash != s2.Hash { + t.Fatalf("expected same sidecar, got different hashes: %s vs %s", s1.Hash, s2.Hash) + } + if fs.spawnCount.Load() != 1 { + t.Fatalf("expected 1 spawn, got %d", fs.spawnCount.Load()) + } +} + +func TestPool_EvictLRU(t *testing.T) { + cfg := testConfig() + cfg.MaxSidecars = 1 + fs := &fakeSpawner{} + p := New(cfg, fs) + ctx := context.Background() + + _, err := p.Acquire(ctx, "model-a", []string{"--top-k", "20"}) + if err != nil { + t.Fatal(err) + } + _, err = p.Acquire(ctx, "model-b", []string{"--top-k", "40"}) + if err != nil { + t.Fatal(err) + } + + if fs.spawnCount.Load() != 2 { + t.Fatalf("expected 2 spawns, got %d", fs.spawnCount.Load()) + } + if fs.killCount.Load() != 1 { + t.Fatalf("expected 1 kill (eviction), got %d", fs.killCount.Load()) + } + list := p.List() + if len(list) != 1 { + t.Fatalf("expected 1 sidecar, got %d", len(list)) + } + if list[0].ModelID != "model-b" { + t.Fatalf("expected model-b, got %s", list[0].ModelID) + } +} + +func TestPool_ValidatorReject(t *testing.T) { + fs := &fakeSpawner{} + p := New(testConfig(), fs) + _, err := p.Acquire(context.Background(), "model-a", []string{"--model", "evil.gguf"}) + if err == nil { + t.Fatal("expected validation error") + } +} + +func TestPool_UnknownModel(t *testing.T) { + fs := &fakeSpawner{} + p := New(testConfig(), fs) + _, err := p.Acquire(context.Background(), "nonexistent", nil) + if err == nil { + t.Fatal("expected unknown model error") + } +} + +func TestPool_ConcurrentAcquire(t *testing.T) { + cfg := testConfig() + cfg.MaxSidecars = 10 + 
// PortAllocator hands out TCP ports from a fixed inclusive range [lo, hi].
// Ports are returned in FIFO order: a released port goes to the back of the
// queue. All methods are safe for concurrent use.
type PortAllocator struct {
	ports  chan int
	lo, hi int
}

// NewPortAllocator creates an allocator holding every port in [lo, hi].
// An inverted range (hi < lo) yields an allocator that is always exhausted
// instead of panicking on a negative channel capacity.
func NewPortAllocator(lo, hi int) *PortAllocator {
	size := hi - lo + 1
	if size < 0 {
		size = 0
	}
	ch := make(chan int, size)
	for p := lo; p <= hi; p++ {
		ch <- p
	}
	return &PortAllocator{ports: ch, lo: lo, hi: hi}
}

// Allocate returns the next free port, or an error when none is available.
// It never blocks.
func (pa *PortAllocator) Allocate() (int, error) {
	select {
	case p := <-pa.ports:
		return p, nil
	default:
		return 0, fmt.Errorf("port allocator exhausted")
	}
}

// Release returns port to the allocator. It never blocks: a port outside the
// managed range is ignored, and a release that would overflow the pool (which
// can only be a double release) is dropped. Callers release ports while
// holding the pool mutex, so a blocking send here would wedge the whole pool.
//
// NOTE: a double release while other ports are still checked out is not
// detected and would queue the same port twice.
func (pa *PortAllocator) Release(port int) {
	if port < pa.lo || port > pa.hi {
		return
	}
	select {
	case pa.ports <- port:
	default:
	}
}
t.Fatalf("expected released port %d, got %d", p2, p4) + } +} + +func TestPortAllocator_Concurrent(t *testing.T) { + pa := NewPortAllocator(8500, 8599) + var wg sync.WaitGroup + allocated := make(chan int, 100) + + for i := 0; i < 100; i++ { + wg.Add(1) + go func() { + defer wg.Done() + p, err := pa.Allocate() + if err != nil { + return + } + allocated <- p + }() + } + wg.Wait() + close(allocated) + + seen := make(map[int]bool) + for p := range allocated { + if seen[p] { + t.Fatalf("duplicate port %d", p) + } + seen[p] = true + } + if len(seen) != 100 { + t.Fatalf("expected 100 ports, got %d", len(seen)) + } +} diff --git a/internal/pool/sidecar.go b/internal/pool/sidecar.go new file mode 100644 index 0000000..885ec36 --- /dev/null +++ b/internal/pool/sidecar.go @@ -0,0 +1,313 @@ +package pool + +import ( + "bytes" + "context" + "fmt" + "io" + "log/slog" + "net/http" + "os" + "os/exec" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/indifferentketchup/llama-sidecar/internal/config" + "github.com/indifferentketchup/llama-sidecar/internal/validator" +) + +type Sidecar struct { + Hash string + ModelID string + ModelPath string + Flags []string + Port int + Pid int + StartedAt time.Time + LastUsed atomic.Int64 + healthy atomic.Bool + cmd *exec.Cmd + cancel context.CancelFunc + done chan error + stderr *ringBuffer + stopMon context.CancelFunc + stdinFile *os.File + stdoutR *os.File + stdoutFile *os.File +} + +func (s *Sidecar) Healthy() bool { + return s.healthy.Load() +} + +func (s *Sidecar) TouchLastUsed() { + s.LastUsed.Store(time.Now().UnixNano()) +} + +func (s *Sidecar) LastStderr() string { + return s.stderr.String() +} + +// Spawner abstracts sidecar creation for testing. 
+type Spawner interface { + Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error) + Kill(s *Sidecar) error +} + +type RealSpawner struct{} + +func (rs *RealSpawner) Spawn(ctx context.Context, cfg *config.Config, modelID, modelPath string, flags []string, port int, hash string) (*Sidecar, error) { + args := buildArgs(cfg.BaseArgs, modelPath, port, flags) + _ = ctx + childCtx, cancel := context.WithCancel(context.Background()) + cmd := exec.CommandContext(childCtx, cfg.LlamaServerBin, args...) + setPlatformAttrs(cmd) + + devNull, err := os.Open(os.DevNull) + if err != nil { + cancel() + return nil, fmt.Errorf("open devnull: %w", err) + } + cmd.Stdin = devNull + + stderr := newRingBuffer(64) + prefix := fmt.Sprintf("[sidecar:%s:%d] ", hash[:8], port) + cmd.Stderr = io.MultiWriter(stderr, &prefixWriter{prefix: prefix}) + stdoutR, stdoutW, err := os.Pipe() + if err != nil { + cancel() + devNull.Close() + return nil, fmt.Errorf("stdout pipe: %w", err) + } + go io.Copy(io.Discard, stdoutR) + cmd.Stdout = stdoutW + + slog.Info("spawning sidecar", "hash", hash, "model", modelID, "port", port, "args", strings.Join(args, " ")) + if err := cmd.Start(); err != nil { + cancel() + return nil, fmt.Errorf("spawn failed: %w", err) + } + + s := &Sidecar{ + Hash: hash, + ModelID: modelID, + ModelPath: modelPath, + Flags: flags, + Port: port, + Pid: cmd.Process.Pid, + StartedAt: time.Now(), + cmd: cmd, + cancel: cancel, + done: make(chan error, 1), + stderr: stderr, + stdinFile: devNull, + stdoutR: stdoutR, + stdoutFile: stdoutW, + } + s.LastUsed.Store(time.Now().UnixNano()) + + go func() { + err := cmd.Wait() + s.healthy.Store(false) + exitCode := -1 + if cmd.ProcessState != nil { + exitCode = cmd.ProcessState.ExitCode() + } + slog.Error("sidecar child exited", + "hash", hash, + "port", port, + "pid", s.Pid, + "exit_code", exitCode, + "wait_err", fmt.Sprintf("%v", err), + "uptime", 
time.Since(s.StartedAt).Round(time.Millisecond), + "stderr_tail", stderr.String(), + ) + s.done <- err + close(s.done) + }() + + // Wait for health + healthURL := fmt.Sprintf("http://127.0.0.1:%d/health", port) + deadline := time.Now().Add(time.Duration(cfg.HealthTimeoutSeconds) * time.Second) + for time.Now().Before(deadline) { + resp, err := http.Get(healthURL) + if err == nil { + resp.Body.Close() + if resp.StatusCode == 200 { + s.healthy.Store(true) + slog.Info("sidecar healthy", "hash", hash, "port", port, "elapsed", time.Since(s.StartedAt).Round(time.Millisecond)) + monCtx, monCancel := context.WithCancel(ctx) + s.stopMon = monCancel + go s.healthMonitor(monCtx, cfg.HealthIntervalSeconds) + return s, nil + } + } + select { + case <-childCtx.Done(): + return nil, fmt.Errorf("sidecar process exited during health check") + case <-time.After(500 * time.Millisecond): + } + } + + _ = rs.Kill(s) + return nil, fmt.Errorf("health check timed out after %ds, last stderr: %s", cfg.HealthTimeoutSeconds, s.stderr.LastLine()) +} + +func (rs *RealSpawner) Kill(s *Sidecar) error { + if s.stopMon != nil { + s.stopMon() + } + s.cancel() + select { + case <-s.done: + case <-time.After(5 * time.Second): + if s.cmd.Process != nil { + _ = s.cmd.Process.Kill() + } + <-s.done + } + if s.stdinFile != nil { + s.stdinFile.Close() + } + if s.stdoutFile != nil { + s.stdoutFile.Close() + } + if s.stdoutR != nil { + s.stdoutR.Close() + } + slog.Info("sidecar killed", "hash", s.Hash, "port", s.Port) + return nil +} + +func (s *Sidecar) healthMonitor(ctx context.Context, intervalSec int) { + ticker := time.NewTicker(time.Duration(intervalSec) * time.Second) + defer ticker.Stop() + failures := 0 + url := fmt.Sprintf("http://127.0.0.1:%d/health", s.Port) + client := &http.Client{Timeout: 5 * time.Second} + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + resp, err := client.Get(url) + if err != nil || resp.StatusCode != 200 { + if resp != nil { + resp.Body.Close() + } + 
failures++ + if failures >= 3 { + slog.Warn("sidecar unhealthy, marking for eviction", "hash", s.Hash, "port", s.Port) + s.healthy.Store(false) + return + } + } else { + resp.Body.Close() + failures = 0 + } + } + } +} + +func buildArgs(baseArgs []string, modelPath string, port int, userFlags []string) []string { + deduped := dedupFlags(baseArgs, userFlags) + args := make([]string, 0, len(deduped)+len(userFlags)+4) + args = append(args, deduped...) + args = append(args, "--model", modelPath) + args = append(args, "--port", strconv.Itoa(port)) + args = append(args, userFlags...) + return args +} + +// dedupFlags removes from autoArgs any flag that the user also supplied, +// so the user's value wins via llama.cpp's last-wins CLI parsing. +func dedupFlags(autoArgs, userArgs []string) []string { + userNames := make(map[string]bool) + for _, tok := range userArgs { + if name := validator.FlagName(tok); name != "" { + userNames[name] = true + } + } + out := make([]string, 0, len(autoArgs)) + i := 0 + for i < len(autoArgs) { + tok := autoArgs[i] + name := validator.FlagName(tok) + if name == "" || !userNames[name] { + out = append(out, tok) + i++ + continue + } + if strings.Contains(tok, "=") { + i++ + } else if i+1 < len(autoArgs) && validator.FlagName(autoArgs[i+1]) == "" { + i += 2 + } else { + i++ + } + } + return out +} + +// Ring buffer for last N lines of stderr +type ringBuffer struct { + mu sync.Mutex + lines []string + max int +} + +func newRingBuffer(max int) *ringBuffer { + return &ringBuffer{lines: make([]string, 0, max), max: max} +} + +func (rb *ringBuffer) Write(p []byte) (int, error) { + rb.mu.Lock() + defer rb.mu.Unlock() + for _, line := range strings.Split(string(p), "\n") { + line = strings.TrimRight(line, "\r\n") + if line == "" { + continue + } + if len(rb.lines) >= rb.max { + rb.lines = rb.lines[1:] + } + rb.lines = append(rb.lines, line) + } + return len(p), nil +} + +func (rb *ringBuffer) String() string { + rb.mu.Lock() + defer rb.mu.Unlock() + 
return strings.Join(rb.lines, "\n") +} + +func (rb *ringBuffer) LastLine() string { + rb.mu.Lock() + defer rb.mu.Unlock() + if len(rb.lines) == 0 { + return "" + } + return rb.lines[len(rb.lines)-1] +} + +type prefixWriter struct { + prefix string + buf bytes.Buffer +} + +func (pw *prefixWriter) Write(p []byte) (int, error) { + pw.buf.Write(p) + for { + line, err := pw.buf.ReadString('\n') + if err != nil { + pw.buf.WriteString(line) + break + } + fmt.Fprint(os.Stderr, pw.prefix+line) + } + return len(p), nil +} diff --git a/internal/pool/sidecar_test.go b/internal/pool/sidecar_test.go new file mode 100644 index 0000000..58e1758 --- /dev/null +++ b/internal/pool/sidecar_test.go @@ -0,0 +1,96 @@ +package pool + +import ( + "reflect" + "testing" +) + +func TestBuildArgs_PreservesNonOverlapping(t *testing.T) { + base := []string{"-ngl", "999", "-c", "32768", "--flash-attn", "on", "--no-mmap"} + user := []string{"--top-k", "20"} + got := buildArgs(base, "/model.gguf", 8500, user) + + // -c 32768 must survive (user didn't supply -c) + if !containsSeq(got, "-c", "32768") { + t.Errorf("-c 32768 missing from args: %v", got) + } + // --top-k 20 must be present (user flag) + if !containsSeq(got, "--top-k", "20") { + t.Errorf("--top-k 20 missing from args: %v", got) + } + // --model and --port injected + if !containsSeq(got, "--model", "/model.gguf") { + t.Errorf("--model missing: %v", got) + } + if !containsSeq(got, "--port", "8500") { + t.Errorf("--port missing: %v", got) + } +} + +func TestBuildArgs_UserOverridesBase(t *testing.T) { + base := []string{"-ngl", "999", "-c", "32768"} + user := []string{"-c", "131072"} + got := buildArgs(base, "/model.gguf", 8500, user) + + // base -c should be dropped, user -c should be present + count := 0 + for i, tok := range got { + if tok == "-c" && i+1 < len(got) { + count++ + if got[i+1] == "32768" { + t.Errorf("base -c 32768 should have been deduped: %v", got) + } + } + } + if count != 1 { + t.Errorf("expected exactly 1 -c flag, got 
%d in %v", count, got) + } +} + +func TestBuildArgs_NoUserFlags(t *testing.T) { + base := []string{"-ngl", "999", "-c", "32768", "--no-mmap"} + got := buildArgs(base, "/model.gguf", 8500, nil) + + if !containsSeq(got, "-c", "32768") { + t.Errorf("-c 32768 missing when no user flags: %v", got) + } + if !containsSeq(got, "--no-mmap") { + t.Errorf("--no-mmap missing: %v", got) + } +} + +func TestDedupFlags_Mixed(t *testing.T) { + auto := []string{"--top-k", "40", "-c", "32768", "--no-mmap"} + user := []string{"--top-k", "20"} + got := dedupFlags(auto, user) + want := []string{"-c", "32768", "--no-mmap"} + if !reflect.DeepEqual(got, want) { + t.Errorf("dedupFlags = %v, want %v", got, want) + } +} + +func TestDedupFlags_EqualsForm(t *testing.T) { + auto := []string{"--ctx-size=4096", "--no-mmap"} + user := []string{"--ctx-size", "8192"} + got := dedupFlags(auto, user) + want := []string{"--no-mmap"} + if !reflect.DeepEqual(got, want) { + t.Errorf("dedupFlags = %v, want %v", got, want) + } +} + +func containsSeq(args []string, seq ...string) bool { + for i := 0; i <= len(args)-len(seq); i++ { + match := true + for j, s := range seq { + if args[i+j] != s { + match = false + break + } + } + if match { + return true + } + } + return false +} diff --git a/internal/pool/sidecar_unix.go b/internal/pool/sidecar_unix.go new file mode 100644 index 0000000..3c596df --- /dev/null +++ b/internal/pool/sidecar_unix.go @@ -0,0 +1,7 @@ +//go:build !windows + +package pool + +import "os/exec" + +func setPlatformAttrs(_ *exec.Cmd) {} diff --git a/internal/pool/sidecar_windows.go b/internal/pool/sidecar_windows.go new file mode 100644 index 0000000..6d38275 --- /dev/null +++ b/internal/pool/sidecar_windows.go @@ -0,0 +1,15 @@ +//go:build windows + +package pool + +import ( + "os/exec" + "syscall" +) + +func setPlatformAttrs(cmd *exec.Cmd) { + cmd.SysProcAttr = &syscall.SysProcAttr{ + HideWindow: true, + CreationFlags: 0x00000008 | 0x00000200, // DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP 
+ } +} diff --git a/internal/server/admin.go b/internal/server/admin.go new file mode 100644 index 0000000..882ff29 --- /dev/null +++ b/internal/server/admin.go @@ -0,0 +1,42 @@ +package server + +import ( + "net/http" + "time" + + "github.com/indifferentketchup/llama-sidecar/internal/config" + "github.com/indifferentketchup/llama-sidecar/internal/pool" +) + +func healthHandler(p *pool.Pool, cfg *config.Config, startedAt time.Time) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + sidecars := p.List() + writeJSON(w, http.StatusOK, map[string]any{ + "status": "ok", + "sidecars": len(sidecars), + "max": cfg.MaxSidecars, + "uptime_seconds": int(time.Since(startedAt).Seconds()), + }) + } +} + +func listSidecarsHandler(p *pool.Pool) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, http.StatusOK, p.List()) + } +} + +func deleteSidecarHandler(p *pool.Pool) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + hash := r.PathValue("hash") + if hash == "" { + writeJSON(w, http.StatusBadRequest, map[string]string{"error": "hash required"}) + return + } + if err := p.Remove(hash); err != nil { + writeJSON(w, http.StatusNotFound, map[string]string{"error": err.Error()}) + return + } + writeJSON(w, http.StatusOK, map[string]string{"status": "removed"}) + } +} diff --git a/internal/server/proxy.go b/internal/server/proxy.go new file mode 100644 index 0000000..1ff64bf --- /dev/null +++ b/internal/server/proxy.go @@ -0,0 +1,111 @@ +package server + +import ( + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "net/http/httputil" + "net/url" + "strings" + + "github.com/indifferentketchup/llama-sidecar/internal/pool" +) + +var shellUnsafe = strings.NewReplacer( + "`", "", "$", "", "|", "", ";", "", "&", "", "\n", "", +) + +func parseFlags(raw string) ([]string, error) { + cleaned := shellUnsafe.Replace(raw) + if cleaned != raw { + return nil, fmt.Errorf("flags contain unsafe 
characters") + } + return splitArgs(strings.TrimSpace(raw)), nil +} + +func splitArgs(s string) []string { + if s == "" { + return nil + } + return strings.Fields(s) +} + +func proxyHandler(p *pool.Pool) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + flagsRaw := r.Header.Get("X-Agent-Flags") + var flags []string + if flagsRaw != "" { + var err error + flags, err = parseFlags(flagsRaw) + if err != nil { + writeJSON(w, http.StatusBadRequest, map[string]string{ + "error": err.Error(), + }) + return + } + } + + modelID := r.Header.Get("X-Model-Id") + if modelID == "" { + body, err := io.ReadAll(io.LimitReader(r.Body, 1<<20)) + if err != nil { + writeJSON(w, http.StatusBadRequest, map[string]string{"error": "failed to read body"}) + return + } + var req struct { + Model string `json:"model"` + } + if err := json.Unmarshal(body, &req); err == nil && req.Model != "" { + modelID = req.Model + } + r.Body = io.NopCloser(strings.NewReader(string(body))) + r.ContentLength = int64(len(body)) + } + if modelID == "" { + writeJSON(w, http.StatusBadRequest, map[string]string{"error": "model not specified (X-Model-Id header or body.model)"}) + return + } + + sidecar, err := p.Acquire(r.Context(), modelID, flags) + if err != nil { + errMsg := err.Error() + status := http.StatusInternalServerError + if strings.Contains(errMsg, "validation:") { + status = http.StatusBadRequest + } else if strings.Contains(errMsg, "unknown model:") { + status = http.StatusNotFound + } else if strings.Contains(errMsg, "port allocation:") { + status = http.StatusServiceUnavailable + } + writeJSON(w, status, map[string]string{"error": errMsg}) + return + } + + target := &url.URL{ + Scheme: "http", + Host: fmt.Sprintf("127.0.0.1:%d", sidecar.Port), + } + proxy := httputil.NewSingleHostReverseProxy(target) + proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, err error) { + slog.Error("upstream error", "hash", sidecar.Hash, "port", sidecar.Port, "err", err) + 
writeJSON(rw, http.StatusBadGateway, map[string]any{ + "error": "upstream unavailable", + "error_detail": err.Error(), + "sidecar_hash": sidecar.Hash, + "sidecar_port": sidecar.Port, + "last_stderr": sidecar.LastStderr(), + }) + } + + sidecar.TouchLastUsed() + proxy.ServeHTTP(w, r) + } +} + +func writeJSON(w http.ResponseWriter, status int, v any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + json.NewEncoder(w).Encode(v) +} diff --git a/internal/server/server.go b/internal/server/server.go new file mode 100644 index 0000000..db72f00 --- /dev/null +++ b/internal/server/server.go @@ -0,0 +1,56 @@ +package server + +import ( + "log/slog" + "net/http" + "time" + + "github.com/indifferentketchup/llama-sidecar/internal/config" + "github.com/indifferentketchup/llama-sidecar/internal/pool" +) + +func New(cfg *config.Config, p *pool.Pool, startedAt time.Time) *http.Server { + mux := http.NewServeMux() + mux.HandleFunc("GET /health", healthHandler(p, cfg, startedAt)) + mux.HandleFunc("GET /sidecars", listSidecarsHandler(p)) + mux.HandleFunc("DELETE /sidecars/{hash}", deleteSidecarHandler(p)) + mux.HandleFunc("POST /v1/chat/completions", proxyHandler(p)) + mux.HandleFunc("POST /v1/completions", proxyHandler(p)) + + handler := requestLogger(mux) + + return &http.Server{ + Addr: cfg.Bind, + Handler: handler, + } +} + +func requestLogger(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + rw := &statusRecorder{ResponseWriter: w, status: 200} + next.ServeHTTP(rw, r) + slog.Info("request", + "method", r.Method, + "path", r.URL.Path, + "status", rw.status, + "duration_ms", time.Since(start).Milliseconds(), + ) + }) +} + +type statusRecorder struct { + http.ResponseWriter + status int +} + +func (sr *statusRecorder) WriteHeader(code int) { + sr.status = code + sr.ResponseWriter.WriteHeader(code) +} + +func (sr *statusRecorder) Flush() { + if f, ok := 
sr.ResponseWriter.(http.Flusher); ok { + f.Flush() + } +} diff --git a/internal/validator/validator.go b/internal/validator/validator.go new file mode 100644 index 0000000..94a4e89 --- /dev/null +++ b/internal/validator/validator.go @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: AGPL-3.0-only +// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. +// Ported from studio/backend/core/inference/llama_server_args.py. +// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/llama_server_args.py + +package validator + +import ( + "fmt" + "strings" +) + +var denylistGroups = [][]string{ + // Model identity + {"-m", "--model"}, + {"-mu", "--model-url"}, + {"-dr", "--docker-repo"}, + {"-hf", "-hfr", "--hf-repo"}, + {"-hff", "--hf-file"}, + {"-hfv", "-hfrv", "--hf-repo-v"}, + {"-hffv", "--hf-file-v"}, + {"-hft", "--hf-token"}, + {"-mm", "--mmproj"}, + {"-mmu", "--mmproj-url"}, + // Networking + {"--host"}, + {"--port"}, + {"--path"}, + {"--api-prefix"}, + {"--reuse-port"}, + // Auth / TLS + {"--api-key"}, + {"--api-key-file"}, + {"--ssl-key-file"}, + {"--ssl-cert-file"}, + // Server UI / multi-model + {"--webui", "--no-webui"}, + {"--ui", "--no-ui"}, + {"--ui-config"}, + {"--ui-config-file"}, + {"--ui-mcp-proxy", "--no-ui-mcp-proxy"}, + {"--models-dir"}, + {"--models-preset"}, + {"--models-max"}, + {"--models-autoload", "--no-models-autoload"}, +} + +var denylist map[string]bool + +func init() { + denylist = make(map[string]bool) + for _, group := range denylistGroups { + for _, flag := range group { + denylist[flag] = true + } + } +} + +// FlagName returns the flag name for a CLI token, or "" if it isn't a flag. +// Peels --key=value to the bare --key. Numeric values like -1 or -0.5 +// (e.g. --seed -1) are treated as values, not flags. 
+func FlagName(token string) string { + if !strings.HasPrefix(token, "-") || token == "-" || token == "--" { + return "" + } + if len(token) >= 2 && (token[1] >= '0' && token[1] <= '9' || token[1] == '.') { + return "" + } + if idx := strings.IndexByte(token, '='); idx >= 0 { + return token[:idx] + } + return token +} + +// ValidateExtraArgs validates user-supplied llama-server args. Returns the +// args as a flat slice. Returns an error with the offending flag if any +// token resolves to a managed flag. +func ValidateExtraArgs(args []string) ([]string, error) { + if len(args) == 0 { + return nil, nil + } + out := make([]string, 0, len(args)) + for _, raw := range args { + flag := FlagName(raw) + if flag != "" && denylist[flag] { + return nil, fmt.Errorf("llama-server flag '%s' is managed and cannot be passed as an extra arg", flag) + } + out = append(out, raw) + } + return out, nil +} + +// IsManagedFlag returns true if flag is a managed llama-server flag. +func IsManagedFlag(flag string) bool { + return denylist[flag] +} + +var contextFlags = setOf("-c", "--ctx-size") +var cacheFlags = setOf("-ctk", "--cache-type-k", "-ctv", "--cache-type-v") +var specFlags = setOf( + "--spec-default", "--spec-type", "--spec-ngram-size-n", "--spec-ngram-size", + "--draft-min", "--draft-max", + "--spec-draft-n-max", "--spec-draft-n-min", "--spec-draft-p-min", "--spec-draft-p-split", + "--spec-ngram-mod-n-match", "--spec-ngram-mod-n-min", "--spec-ngram-mod-n-max", +) +var templateFlags = setOf( + "--chat-template", "--chat-template-file", "--chat-template-kwargs", + "--jinja", "--no-jinja", +) +var booleanShadowingFlags = setOf("--spec-default", "--jinja", "--no-jinja") + +func setOf(vals ...string) map[string]bool { + m := make(map[string]bool, len(vals)) + for _, v := range vals { + m[v] = true + } + return m +} + +// StripShadowingFlags removes flags that shadow first-class settings from +// the arg list. By default all shadowing groups are stripped. 
+func StripShadowingFlags(args []string) []string { + shadowing := make(map[string]bool) + for k, v := range contextFlags { + shadowing[k] = v + } + for k, v := range cacheFlags { + shadowing[k] = v + } + for k, v := range specFlags { + shadowing[k] = v + } + for k, v := range templateFlags { + shadowing[k] = v + } + + out := make([]string, 0, len(args)) + i, n := 0, len(args) + for i < n { + tok := args[i] + flag := FlagName(tok) + if flag == "" || !shadowing[flag] { + out = append(out, tok) + i++ + continue + } + if booleanShadowingFlags[flag] || strings.Contains(tok, "=") { + i++ + } else if i+1 < n && FlagName(args[i+1]) == "" { + i += 2 + } else { + i++ + } + } + return out +} diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go new file mode 100644 index 0000000..60c5078 --- /dev/null +++ b/internal/validator/validator_test.go @@ -0,0 +1,150 @@ +package validator + +import ( + "testing" +) + +func TestValidateExtraArgs_DenyList(t *testing.T) { + denied := []string{ + "-m", "--model", + "-mu", "--model-url", + "-dr", "--docker-repo", + "-hf", "-hfr", "--hf-repo", + "-hff", "--hf-file", + "-hfv", "-hfrv", "--hf-repo-v", + "-hffv", "--hf-file-v", + "-hft", "--hf-token", + "-mm", "--mmproj", + "-mmu", "--mmproj-url", + "--host", "--port", "--path", "--api-prefix", "--reuse-port", + "--api-key", "--api-key-file", + "--ssl-key-file", "--ssl-cert-file", + "--webui", "--no-webui", "--ui", "--no-ui", + "--ui-config", "--ui-config-file", + "--ui-mcp-proxy", "--no-ui-mcp-proxy", + "--models-dir", "--models-preset", "--models-max", + "--models-autoload", "--no-models-autoload", + } + for _, flag := range denied { + t.Run(flag, func(t *testing.T) { + _, err := ValidateExtraArgs([]string{flag}) + if err == nil { + t.Fatalf("expected error for %s", flag) + } + }) + } +} + +func TestValidateExtraArgs_SafeFlags(t *testing.T) { + safe := []string{ + "-c", "--ctx-size", "-ngl", "--gpu-layers", + "--top-k", "--cache-type-k", "--jinja", 
"--no-jinja", + "--spec-draft-n-max", "-fa", "--flash-attn", + "-t", "--threads", "-np", "--parallel", "--no-mmap", + } + for _, flag := range safe { + t.Run(flag, func(t *testing.T) { + out, err := ValidateExtraArgs([]string{flag}) + if err != nil { + t.Fatalf("unexpected error for %s: %v", flag, err) + } + if len(out) != 1 || out[0] != flag { + t.Fatalf("expected [%s], got %v", flag, out) + } + }) + } +} + +func TestValidateExtraArgs_FlagEqualsValue(t *testing.T) { + _, err := ValidateExtraArgs([]string{"--model=evil.gguf"}) + if err == nil { + t.Fatal("expected error for --model=evil.gguf") + } + out, err := ValidateExtraArgs([]string{"--ctx-size=4096"}) + if err != nil { + t.Fatal(err) + } + if len(out) != 1 || out[0] != "--ctx-size=4096" { + t.Fatalf("expected [--ctx-size=4096], got %v", out) + } +} + +func TestValidateExtraArgs_NegativeNumber(t *testing.T) { + out, err := ValidateExtraArgs([]string{"--seed", "-1"}) + if err != nil { + t.Fatal(err) + } + if len(out) != 2 { + t.Fatalf("expected 2 tokens, got %d", len(out)) + } +} + +func TestValidateExtraArgs_Empty(t *testing.T) { + out, err := ValidateExtraArgs(nil) + if err != nil { + t.Fatal(err) + } + if out != nil { + t.Fatalf("expected nil, got %v", out) + } +} + +func TestIsManagedFlag(t *testing.T) { + if !IsManagedFlag("--model") { + t.Fatal("--model should be managed") + } + if !IsManagedFlag("-m") { + t.Fatal("-m should be managed") + } + if IsManagedFlag("-c") { + t.Fatal("-c should not be managed") + } +} + +func TestFlagName(t *testing.T) { + tests := []struct { + in, want string + }{ + {"--model=foo", "--model"}, + {"-c", "-c"}, + {"--top-k", "--top-k"}, + {"-1", ""}, + {"-0.5", ""}, + {"-", ""}, + {"--", ""}, + {"hello", ""}, + } + for _, tt := range tests { + got := FlagName(tt.in) + if got != tt.want { + t.Errorf("FlagName(%q) = %q, want %q", tt.in, got, tt.want) + } + } +} + +func TestStripShadowingFlags(t *testing.T) { + t.Run("strips context flag with value", func(t *testing.T) { + out := 
StripShadowingFlags([]string{"-c", "4096", "--top-k", "40"}) + if len(out) != 2 || out[0] != "--top-k" || out[1] != "40" { + t.Fatalf("got %v", out) + } + }) + t.Run("retains non-shadowing flags", func(t *testing.T) { + out := StripShadowingFlags([]string{"--top-k", "40", "--top-p", "0.95"}) + if len(out) != 4 { + t.Fatalf("got %v", out) + } + }) + t.Run("strips boolean jinja flag", func(t *testing.T) { + out := StripShadowingFlags([]string{"--jinja", "--top-k", "40"}) + if len(out) != 2 || out[0] != "--top-k" { + t.Fatalf("got %v", out) + } + }) + t.Run("strips equals form", func(t *testing.T) { + out := StripShadowingFlags([]string{"--ctx-size=4096"}) + if len(out) != 0 { + t.Fatalf("got %v", out) + } + }) +} diff --git a/internal/winsvc/winsvc_unix.go b/internal/winsvc/winsvc_unix.go new file mode 100644 index 0000000..7564510 --- /dev/null +++ b/internal/winsvc/winsvc_unix.go @@ -0,0 +1,26 @@ +//go:build !windows + +package winsvc + +import ( + "context" + "log/slog" + "os" + "os/signal" + "syscall" + "time" +) + +func RegisterShutdownHandler(ctx context.Context, shutdownFunc func(context.Context) error) { + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT) + <-sigCh + slog.Info("shutdown signal received") + shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + if err := shutdownFunc(shutdownCtx); err != nil { + slog.Error("shutdown error", "err", err) + os.Exit(1) + } + os.Exit(0) +} diff --git a/internal/winsvc/winsvc_windows.go b/internal/winsvc/winsvc_windows.go new file mode 100644 index 0000000..5ebf2dc --- /dev/null +++ b/internal/winsvc/winsvc_windows.go @@ -0,0 +1,25 @@ +//go:build windows + +package winsvc + +import ( + "context" + "log/slog" + "os" + "os/signal" + "time" +) + +func RegisterShutdownHandler(ctx context.Context, shutdownFunc func(context.Context) error) { + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, os.Interrupt) + <-sigCh + slog.Info("shutdown signal 
received") + shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + if err := shutdownFunc(shutdownCtx); err != nil { + slog.Error("shutdown error", "err", err) + os.Exit(1) + } + os.Exit(0) +}