Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
118 lines
3.6 KiB
Python
118 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""Orchestrate MMLU, GSM8K, HumanEval across all models."""
|
|
|
|
import csv
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from openai import OpenAI
|
|
|
|
# Base URL of the OpenAI-compatible endpoint. The LLAMA_SWAP_URL environment
# variable, when set, replaces the built-in default.
ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")

# Output locations, both resolved relative to the directory holding this script.
_SCRIPT_DIR = Path(__file__).parent
RESULTS_DIR = _SCRIPT_DIR / "results"
CSV_PATH = _SCRIPT_DIR / "scores.csv"

# Model identifiers to benchmark; main() visits them in this order.
MODELS = [
    "qwen3.6-35b-a3b-mxfp4",
    "qwen3-coder-30b-apex",
    "qwen3.6-27b-mtp",
    "qwopus3.5-4b-mtp",
    "qwen3.5-9b-deepseek-v4-mtp",
    "qwopus3.6-35b-a3b-v1",
    "qwopus3.6-27b-v2-mtp",
    "qwopus3.5-9b-coder-mtp",
]
|
|
|
|
|
|
def warmup_model(
    client: OpenAI,
    model: str,
    *,
    attempts: int = 3,
    delay: float = 10,
) -> bool:
    """Send a tiny chat completion to *model* before benchmarking it.

    A short "Say OK." request is issued up to *attempts* times. Presumably
    this makes the server load the model ahead of the timed benchmark
    requests -- confirm against the llama-swap setup.

    Args:
        client: OpenAI-compatible client used to issue the request.
        model: Model identifier passed through to the completion call.
        attempts: Maximum number of warmup requests to try.
        delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        True as soon as one request succeeds, False if every attempt failed.
        Failures are reported on stderr and never raised to the caller.
    """
    banner = "=" * 60
    print(f"\n{banner}", file=sys.stderr)
    print(f" Loading model: {model}", file=sys.stderr)
    print(banner, file=sys.stderr)

    for attempt in range(1, attempts + 1):
        try:
            # Only success/failure matters; the response content is unused.
            client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Say OK."}],
                max_tokens=10,
                temperature=0,
            )
        except Exception as e:
            # Best-effort by design: any failure is logged and retried.
            print(f" Warmup attempt {attempt} failed: {e}", file=sys.stderr)
            # No point waiting after the last attempt; we are about to give up.
            if attempt < attempts:
                time.sleep(delay)
        else:
            print(" Warmup OK", file=sys.stderr)
            return True

    print(f" WARNING: warmup failed for {model}, continuing anyway", file=sys.stderr)
    return False
|
|
|
|
|
|
def run_benchmark(module_name: str, model: str, client: OpenAI) -> list[dict]:
    """Run one named benchmark against *model* and return the runner's result.

    The benchmark module is imported only after its name has matched, so a
    call does not import the modules of the other benchmarks.

    Args:
        module_name: One of "mmlu", "gsm8k" or "humaneval".
        model: Model identifier forwarded to the benchmark runner.
        client: OpenAI-compatible client forwarded to the benchmark runner.

    Returns:
        Whatever the selected runner returns (annotated as a list of dicts).

    Raises:
        ValueError: If *module_name* is not one of the known benchmarks.
    """
    # Pick the loader/runner pair first; the call itself is shared below.
    if module_name == "mmlu":
        from mmlu import load_questions as load_items, run_mmlu as runner
    elif module_name == "gsm8k":
        from gsm8k import load_questions as load_items, run_gsm8k as runner
    elif module_name == "humaneval":
        from humaneval import load_problems as load_items, run_humaneval as runner
    else:
        raise ValueError(f"Unknown benchmark: {module_name}")

    items = load_items()
    return runner(model, client, items)
|
|
|
|
|
|
def main() -> None:
    """Run every benchmark against every model in MODELS and write the CSV.

    Steps, in order:
      1. Build a client for ENDPOINT and list models as a connectivity check;
         on failure the error is printed and the process exits with status 1.
      2. Create RESULTS_DIR if it does not exist.
      3. For each model: warm it up (the warmup result is not checked), then
         run mmlu, gsm8k and humaneval. After each benchmark that completes,
         the accumulated rows are rewritten to the CSV via write_csv.
         A benchmark that raises is reported on stderr and skipped.
      4. Print the total elapsed time in minutes and the CSV path.
    """
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")

    # Check connectivity before doing any work.
    try:
        client.models.list()
        print("Connected to llama-swap", file=sys.stderr)
    except Exception as e:
        print(f"Cannot connect to {ENDPOINT}: {e}", file=sys.stderr)
        sys.exit(1)

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    collected: list[dict] = []
    suite = ("mmlu", "gsm8k", "humaneval")

    def run_and_record(model: str, bench: str) -> None:
        # One model/benchmark pair; a failure here must not stop the sweep.
        print(f"\n --- {model} / {bench} ---", file=sys.stderr)
        try:
            rows = run_benchmark(bench, model, client)
            collected.extend(rows)
            # Rewrite the whole CSV so finished work is on disk right away.
            write_csv(collected)
        except Exception as e:
            print(f" ERROR in {model}/{bench}: {e}", file=sys.stderr)

    started = time.time()

    for model in MODELS:
        warmup_model(client, model)
        for bench in suite:
            run_and_record(model, bench)

    elapsed = time.time() - started
    print(f"\nAll benchmarks complete in {elapsed/60:.0f} minutes", file=sys.stderr)
    print(f"Results: {CSV_PATH}", file=sys.stderr)
|
|
|
|
|
|
def write_csv(results: list[dict]) -> None:
    """Write *results* to CSV_PATH, replacing any existing file.

    Does nothing when *results* is empty, so an existing CSV is left alone.
    The column set is fixed; a "category" column is inserted after
    "question_id" when at least one row carries that key. Keys outside the
    column set are dropped, and rows lacking a column get an empty cell
    (csv.DictWriter's default fill value).

    The file is always written as UTF-8: model output stored in "raw_answer"
    can contain arbitrary Unicode, and the platform default encoding could
    otherwise fail on it or produce files that differ between machines.

    Args:
        results: Result rows, one dict per answered question.
    """
    if not results:
        return

    fields = ["model", "benchmark", "question_id", "correct", "raw_answer",
              "parsed_answer", "expected", "latency_ms"]
    # Also include category if present (MMLU).
    if any("category" in row for row in results):
        fields.insert(3, "category")

    # newline="" lets the csv module control line endings itself.
    with open(CSV_PATH, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(results)
|
|
|
|
|
|
# Run the benchmark sweep only when this file is executed as a script;
# importing it as a module defines the names without starting a run.
if __name__ == "__main__":
    main()
|