Files
llama-sidecar/eval/run_all.py
indifferentketchup fe7f36ae98 llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with
LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port),
deterministic hash-keyed sidecar reuse. Windows service support via
schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx
decoupled child lifetime.

Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM
in JSON config, -fa → --flash-attn on default, child process exit after
one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED,
context.Background for child lifetime, background reaper goroutine).

bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks
automation to sam-desktop. Per-GGUF production flags from llama-swap
config with --ctx-size 32768 override.

eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) +
A/B model comparison (14 agent-typed prompts × 8 models). All scripts
resumable at individual question level.

94 Go tests, race detector clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00

118 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""Orchestrate MMLU, GSM8K, HumanEval across all models."""
import csv
import json
import os
import sys
import time
from pathlib import Path
from openai import OpenAI
# Base URL of the OpenAI-compatible API; the LLAMA_SWAP_URL environment
# variable overrides the built-in default.
ENDPOINT = os.getenv("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")

# Output locations, both placed next to this script.
RESULTS_DIR = Path(__file__).with_name("results")
CSV_PATH = Path(__file__).with_name("scores.csv")

# Model identifiers to evaluate, in the order they are run.
MODELS = [
    "qwen3.6-35b-a3b-mxfp4",
    "qwen3-coder-30b-apex",
    "qwen3.6-27b-mtp",
    "qwopus3.5-4b-mtp",
    "qwen3.5-9b-deepseek-v4-mtp",
    "qwopus3.6-35b-a3b-v1",
    "qwopus3.6-27b-v2-mtp",
    "qwopus3.5-9b-coder-mtp",
]
def warmup_model(
    client: OpenAI,
    model: str,
    *,
    attempts: int = 3,
    retry_delay: float = 10.0,
) -> bool:
    """Send a tiny chat request to ``model`` until one succeeds.

    The request ("Say OK.", 10 tokens, temperature 0) is only a probe and
    its reply is discarded. Presumably the first request is what makes the
    server load the model (TODO confirm against the serving setup), so a
    failure is reported on stderr but never raised.

    Args:
        client: OpenAI-compatible client pointed at the serving endpoint.
        model: Model identifier to probe.
        attempts: Maximum number of requests to try.
        retry_delay: Seconds to wait between two failed attempts.

    Returns:
        True as soon as one request succeeds, False if every attempt raised.
    """
    banner = "=" * 60
    print(f"\n{banner}", file=sys.stderr)
    print(f" Loading model: {model}", file=sys.stderr)
    print(banner, file=sys.stderr)

    for attempt in range(1, attempts + 1):
        try:
            client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Say OK."}],
                max_tokens=10,
                temperature=0,
            )
        except Exception as e:  # best-effort probe: any failure means "retry"
            print(f" Warmup attempt {attempt} failed: {e}", file=sys.stderr)
            # Waiting only makes sense when another attempt follows; the
            # last failure falls straight through to the warning below.
            if attempt < attempts:
                time.sleep(retry_delay)
        else:
            print(" Warmup OK", file=sys.stderr)
            return True

    print(f" WARNING: warmup failed for {model}, continuing anyway", file=sys.stderr)
    return False
def run_benchmark(module_name: str, model: str, client: OpenAI) -> list[dict]:
    """Run the benchmark called ``module_name`` for ``model``.

    ``module_name`` must be ``"mmlu"``, ``"gsm8k"`` or ``"humaneval"``. The
    matching sibling module is imported only once it has been selected; its
    loader is called with no arguments and whatever it returns is passed to
    the module's runner together with ``model`` and ``client``. The runner's
    return value is handed back unchanged.

    Raises:
        ValueError: if ``module_name`` is not one of the three names above.
    """
    # Reject unknown names up front so the branches below stay flat.
    if module_name not in ("mmlu", "gsm8k", "humaneval"):
        raise ValueError(f"Unknown benchmark: {module_name}")

    if module_name == "mmlu":
        from mmlu import load_questions as load_items, run_mmlu as run_suite
        return run_suite(model, client, load_items())

    if module_name == "gsm8k":
        from gsm8k import load_questions as load_items, run_gsm8k as run_suite
        return run_suite(model, client, load_items())

    # Only "humaneval" is left at this point.
    from humaneval import load_problems as load_items, run_humaneval as run_suite
    return run_suite(model, client, load_items())
def main() -> None:
    """Run every benchmark against every model, refreshing the CSV as it goes.

    Exits with status 1 if the endpoint cannot list its models. Otherwise
    each model is warmed up and then run through the mmlu, gsm8k and
    humaneval benchmarks in turn. After each benchmark that finishes, the
    rows gathered so far are written out again via ``write_csv``. A
    benchmark that raises is reported on stderr and skipped; the run goes on.
    """

    def log(message: str) -> None:
        # All progress output goes to stderr.
        print(message, file=sys.stderr)

    client = OpenAI(base_url=ENDPOINT, api_key="dummy")

    # Check connectivity
    try:
        client.models.list()
        log("Connected to llama-swap")
    except Exception as e:
        log(f"Cannot connect to {ENDPOINT}: {e}")
        sys.exit(1)

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    collected: list[dict] = []
    suites = ("mmlu", "gsm8k", "humaneval")
    started = time.time()

    for model in MODELS:
        # The warmup result is not checked; benchmarks run regardless.
        warmup_model(client, model)
        for bench in suites:
            log(f"\n --- {model} / {bench} ---")
            try:
                collected += run_benchmark(bench, model, client)
                write_csv(collected)
            except Exception as e:
                log(f" ERROR in {model}/{bench}: {e}")

    elapsed = time.time() - started
    log(f"\nAll benchmarks complete in {elapsed/60:.0f} minutes")
    log(f"Results: {CSV_PATH}")
def write_csv(results: list[dict], path: "Path | None" = None) -> None:
    """Write benchmark result rows to a CSV file, replacing any previous one.

    Columns are model, benchmark, question_id, correct, raw_answer,
    parsed_answer, expected and latency_ms. A ``category`` column is added
    after ``question_id`` when at least one row carries that key. Keys not
    in the column list are ignored, and missing ones are left empty.

    The file is always encoded as UTF-8 so that non-ASCII text in answers
    cannot fail on platforms whose default encoding is narrower. Rows are
    first written to a sibling ``<name>.tmp`` file that then replaces the
    target, so a failed write leaves the previous CSV intact.

    Args:
        results: Row dictionaries to write. An empty list is a no-op and
            leaves any existing file untouched.
        path: Destination file. Defaults to the module-level ``CSV_PATH``.
    """
    if not results:
        return

    fields = ["model", "benchmark", "question_id", "correct", "raw_answer",
              "parsed_answer", "expected", "latency_ms"]
    # Also include category if present (MMLU); it sits right after question_id.
    if any("category" in row for row in results):
        fields.insert(fields.index("correct"), "category")

    target = CSV_PATH if path is None else Path(path)
    scratch = target.with_name(target.name + ".tmp")
    try:
        with open(scratch, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(results)
        # Swap in the complete file in one step.
        os.replace(scratch, target)
    finally:
        # After a successful replace this is a no-op; after a failure it
        # removes the partial scratch file.
        scratch.unlink(missing_ok=True)
# Run the full benchmark sweep only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()