Files
llama-sidecar/eval/mmlu.py
indifferentketchup fe7f36ae98 llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with
LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port),
deterministic hash-keyed sidecar reuse. Windows service support via
schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx
decoupled child lifetime.

Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM
in JSON config, -fa → --flash-attn on default, child process exit after
one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED,
context.Background for child lifetime, background reaper goroutine).

bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks
automation to sam-desktop. Per-GGUF production flags from llama-swap
config with --ctx-size 32768 override.

eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) +
A/B model comparison (14 agent-typed prompts × 8 models). All scripts
resumable at individual question level.

94 Go tests, race detector clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00

167 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""MMLU 100-question subset benchmark (20 per category, seed=42)."""
import json
import os
import random
import re
import sys
import time
from pathlib import Path
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm
# --- Benchmark configuration -------------------------------------------------

# Base URL of the OpenAI-compatible server; LLAMA_SWAP_URL overrides the default.
ENDPOINT = os.getenv("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")

# Raw per-question responses are stored under <this file's directory>/results.
RESULTS_DIR = Path(__file__).parent.joinpath("results")

# Sampling / generation settings passed with every completion request.
SEED = 42
TEMPERATURE = 0
MAX_TOKENS = 512

# Answer letters, in the same order as each question's choice list.
CHOICES = list("ABCD")

# MMLU subject configs to sample from, and how many questions to take from each.
PER_CATEGORY = 20
CATEGORIES = [
    "high_school_mathematics",
    "college_computer_science",
    "professional_medicine",
    "formal_logic",
    "miscellaneous",
]
def load_questions() -> list[dict]:
    """Draw a reproducible sample of PER_CATEGORY test questions per category.

    One ``random.Random(SEED)`` instance is shared across all categories, so
    the indices picked for a category depend on the order of CATEGORIES as
    well as on SEED.

    Returns:
        A list of dicts with keys ``id`` (``"<category>_<row index>"``),
        ``category``, ``question``, ``choices`` and ``answer_idx`` (the
        dataset's ``answer`` field, used elsewhere as an index into CHOICES).
    """
    sampler = random.Random(SEED)
    sampled: list[dict] = []
    for category in CATEGORIES:
        # NOTE(review): trust_remote_code=True permits running code shipped
        # with the dataset repo; confirm it is still required for cais/mmlu.
        dataset = load_dataset(
            "cais/mmlu", category, split="test", trust_remote_code=True
        )
        order = list(range(len(dataset)))
        sampler.shuffle(order)
        picked = order[:PER_CATEGORY]
        sampled.extend(
            {
                "id": f"{category}_{pos}",
                "category": category,
                "question": row["question"],
                "choices": row["choices"],
                "answer_idx": row["answer"],
            }
            for pos, row in ((p, dataset[p]) for p in picked)
        )
    return sampled
def format_prompt(q: dict) -> str:
    """Render a question dict as the plain-text prompt sent to the model.

    The prompt is the question line, one ``"<letter>) <choice>"`` line per
    entry of ``q["choices"]`` (letters taken from CHOICES by position), and a
    closing instruction asking for a single letter.
    """
    header = f"Question: {q['question']}"
    options = [f"{CHOICES[pos]}) {text}" for pos, text in enumerate(q["choices"])]
    return "\n".join([header, *options, "Answer with a single letter: "])
def parse_answer(text: str) -> str | None:
for ch in text.strip():
if ch.upper() in CHOICES:
return ch.upper()
return None
def _extract_answer_text(payload) -> str:
    """Return the assistant text of a chat-completion dict, or "" if absent.

    Prefers ``message.content`` and falls back to ``message.reasoning_content``.
    Tolerates malformed payloads (non-dict, empty ``choices``, null message).
    """
    if not isinstance(payload, dict):
        return ""
    choices = payload.get("choices")
    if not isinstance(choices, list) or not choices:
        return ""
    first = choices[0]
    message = first.get("message") if isinstance(first, dict) else None
    if not isinstance(message, dict):
        return ""
    return message.get("content") or message.get("reasoning_content") or ""


def _load_cached_response(path: Path) -> dict | None:
    """Load a stored completion, or None if it is missing, corrupt or an error.

    Files holding an ``{"error": ...}`` payload (no ``choices``) are treated as
    not cached so that a transient failure is retried on the next run.
    """
    try:
        cached = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):  # ValueError covers JSON and decode errors
        return None
    if not isinstance(cached, dict) or not cached.get("choices"):
        return None
    return cached


def run_mmlu(model: str, client: OpenAI, questions: list[dict]) -> list[dict]:
    """Run the MMLU questions against *model*, resuming from stored responses.

    Each raw response is written to ``RESULTS_DIR/<model>/mmlu/<id>.json``.
    On later runs a question whose file holds a usable completion is scored
    from that file without contacting the server (``latency_ms`` is 0).

    Args:
        model: Model name sent in the request and used as results sub-directory.
        client: OpenAI-compatible client used for chat completions.
        questions: Question dicts with ``id``, ``category``, ``question``,
            ``choices`` and ``answer_idx`` keys.

    Returns:
        One result dict per question with keys ``model``, ``benchmark``,
        ``question_id``, ``category``, ``correct``, ``raw_answer`` (first 200
        characters), ``parsed_answer``, ``expected`` and ``latency_ms``.
    """
    model_dir = RESULTS_DIR / model / "mmlu"
    model_dir.mkdir(parents=True, exist_ok=True)

    results: list[dict] = []
    correct = 0
    skipped = 0

    def record(q: dict, raw: str, expected: str, latency_ms) -> bool:
        """Score one answer, append its result row and report correctness."""
        parsed = parse_answer(raw)
        is_correct = parsed == expected
        results.append({
            "model": model,
            "benchmark": "mmlu",
            "question_id": q["id"],
            "category": q["category"],
            "correct": is_correct,
            "raw_answer": raw[:200],
            "parsed_answer": parsed or "",
            "expected": expected,
            "latency_ms": latency_ms,
        })
        return is_correct

    for i, q in enumerate(tqdm(questions, desc=f" MMLU {model}", file=sys.stderr)):
        expected = CHOICES[q["answer_idx"]]
        out_path = model_dir / f"{q['id']}.json"

        # Resume: reuse a stored completion unless it is missing or unusable.
        cached = _load_cached_response(out_path) if out_path.exists() else None
        if cached is not None:
            correct += record(q, _extract_answer_text(cached), expected, 0)
            skipped += 1
            continue

        prompt = format_prompt(q)
        resp_json = None
        latency = 0.0
        for attempt in range(2):
            # Time each attempt separately so the retry back-off is not
            # reported as model latency.
            t0 = time.perf_counter()
            try:
                resp = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=MAX_TOKENS,
                    temperature=TEMPERATURE,
                    seed=SEED,
                )
                resp_json = resp.model_dump()
                latency = (time.perf_counter() - t0) * 1000
                break
            except Exception as e:  # best-effort: one retry, then record the error
                latency = (time.perf_counter() - t0) * 1000
                if attempt == 0:
                    time.sleep(5)
                else:
                    resp_json = {"error": str(e)}

        # Error payloads are still written for debugging; they are ignored
        # (and the question retried) by the resume logic above.
        out_path.write_text(
            json.dumps(resp_json, indent=2, default=str), encoding="utf-8"
        )
        correct += record(q, _extract_answer_text(resp_json), expected, round(latency, 1))

        if (i + 1) % 10 == 0:
            total = len(results)
            print(
                f" [{model}] MMLU {i+1}/{len(questions)}: "
                f"{correct}/{total} ({correct/total*100:.0f}%)",
                file=sys.stderr,
            )

    total = len(results)
    if skipped:
        print(f" [{model}] MMLU resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
    # Guard against an empty question list.
    accuracy = correct / total * 100 if total else 0.0
    print(f" [{model}] MMLU FINAL: {correct}/{total} ({accuracy:.1f}%)", file=sys.stderr)
    return results
def main() -> None:
    """Run the benchmark for the model named on the command line.

    The first CLI argument selects the model (a built-in default is used when
    absent). Each result dict is printed to stdout as one JSON line.
    """
    cli_args = sys.argv[1:]
    model_name = cli_args[0] if cli_args else "qwen3.6-35b-a3b-mxfp4"
    api = OpenAI(base_url=ENDPOINT, api_key="dummy")
    for row in run_mmlu(model_name, api, load_questions()):
        print(json.dumps(row))


if __name__ == "__main__":
    main()