llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
117
eval/run_all.py
Normal file
117
eval/run_all.py
Normal file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Orchestrate MMLU, GSM8K, HumanEval across all models."""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Base URL handed to the OpenAI client; set LLAMA_SWAP_URL to override it.
ENDPOINT = os.getenv("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")

# Output locations, both resolved next to this script.
RESULTS_DIR = Path(__file__).with_name("results")
CSV_PATH = Path(__file__).with_name("scores.csv")

# Model identifiers to evaluate, in the order they are run.
MODELS = [
    "qwen3.6-35b-a3b-mxfp4",
    "qwen3-coder-30b-apex",
    "qwen3.6-27b-mtp",
    "qwopus3.5-4b-mtp",
    "qwen3.5-9b-deepseek-v4-mtp",
    "qwopus3.6-35b-a3b-v1",
    "qwopus3.6-27b-v2-mtp",
    "qwopus3.5-9b-coder-mtp",
]
|
||||
|
||||
|
||||
def warmup_model(
    client: OpenAI,
    model: str,
    *,
    attempts: int = 3,
    retry_delay: float = 10.0,
) -> bool:
    """Send a tiny chat completion for ``model`` before benchmarking it.

    A banner announcing the model is printed to stderr, then a short
    "Say OK." request is issued. Any exception counts as a failed attempt
    and is reported on stderr; the request is retried up to ``attempts``
    times in total.

    Args:
        client: OpenAI-compatible client used to issue the request.
        model: Model identifier passed through to the endpoint.
        attempts: Maximum number of warmup requests to try.
        retry_delay: Seconds to wait between failed attempts. No wait
            happens after the final attempt, since nothing follows it.

    Returns:
        True as soon as one request succeeds, False if every attempt failed.
        Failure is only a warning; the caller decides whether to go on.
    """
    banner = "=" * 60
    print(f"\n{banner}", file=sys.stderr)
    print(f" Loading model: {model}", file=sys.stderr)
    print(banner, file=sys.stderr)

    for attempt in range(1, attempts + 1):
        try:
            # Only success/failure matters here; the response is discarded.
            client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Say OK."}],
                max_tokens=10,
                temperature=0,
            )
        except Exception as e:
            # Deliberately broad: warmup is best-effort and must never abort
            # the whole evaluation run.
            print(f" Warmup attempt {attempt} failed: {e}", file=sys.stderr)
            if attempt < attempts:
                time.sleep(retry_delay)
        else:
            print(" Warmup OK", file=sys.stderr)
            return True

    print(f" WARNING: warmup failed for {model}, continuing anyway", file=sys.stderr)
    return False
|
||||
|
||||
|
||||
def run_benchmark(module_name: str, model: str, client: OpenAI) -> list[dict]:
    """Run the benchmark named ``module_name`` against ``model``.

    The matching benchmark module is imported lazily inside its branch, so
    only the selected suite is imported. Its dataset is loaded first and
    then handed to the suite's runner together with ``model`` and ``client``.

    Args:
        module_name: One of "mmlu", "gsm8k" or "humaneval".
        model: Model identifier forwarded to the runner.
        client: OpenAI-compatible client forwarded to the runner.

    Returns:
        Whatever the selected runner returns (annotated as a list of dicts).

    Raises:
        ValueError: If ``module_name`` is not a known benchmark.
    """
    # Reject unknown names up front so the happy paths below stay flat.
    if module_name not in ("mmlu", "gsm8k", "humaneval"):
        raise ValueError(f"Unknown benchmark: {module_name}")

    if module_name == "mmlu":
        from mmlu import load_questions as load_dataset, run_mmlu as run_suite
    elif module_name == "gsm8k":
        from gsm8k import load_questions as load_dataset, run_gsm8k as run_suite
    else:
        from humaneval import load_problems as load_dataset, run_humaneval as run_suite

    dataset = load_dataset()
    return run_suite(model, client, dataset)
|
||||
|
||||
|
||||
def main() -> None:
    """Run every benchmark against every model and keep the CSV up to date.

    Steps:
      1. Build a client for ``ENDPOINT`` and verify it answers a model
         listing; exit with status 1 if it does not.
      2. For each entry in ``MODELS``: warm the model up, then run the MMLU,
         GSM8K and HumanEval suites in that order.
      3. After each suite that finishes, append its rows to the accumulated
         results and rewrite the CSV, so partial progress is on disk.

    A failing suite is reported on stderr and skipped; it does not stop the
    remaining suites or models. All progress output goes to stderr.
    """
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")

    # Check connectivity
    try:
        client.models.list()
        print("Connected to llama-swap", file=sys.stderr)
    except Exception as e:
        print(f"Cannot connect to {ENDPOINT}: {e}", file=sys.stderr)
        sys.exit(1)

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    suites = ("mmlu", "gsm8k", "humaneval")
    collected: list[dict] = []
    started = time.time()

    for model in MODELS:
        # The warmup outcome is intentionally ignored: a failed warmup only
        # produces a warning and the suites are still attempted.
        warmup_model(client, model)

        for suite in suites:
            print(f"\n --- {model} / {suite} ---", file=sys.stderr)
            try:
                rows = run_benchmark(suite, model, client)
                collected.extend(rows)
                write_csv(collected)
            except Exception as e:
                print(f" ERROR in {model}/{suite}: {e}", file=sys.stderr)

    minutes = (time.time() - started) / 60
    print(f"\nAll benchmarks complete in {minutes:.0f} minutes", file=sys.stderr)
    print(f"Results: {CSV_PATH}", file=sys.stderr)
|
||||
|
||||
|
||||
def write_csv(results: list[dict], path: "Path | None" = None) -> None:
    """Write all result rows to a CSV file, replacing any previous contents.

    Args:
        results: Result rows as dicts. Keys outside the known column set are
            ignored, and missing keys are left empty by ``csv.DictWriter``.
        path: Destination file. Defaults to the module-level ``CSV_PATH``.

    Nothing is written (and an existing file is left untouched) when
    ``results`` is empty.
    """
    if not results:
        return

    fields = [
        "model", "benchmark", "question_id", "correct", "raw_answer",
        "parsed_answer", "expected", "latency_ms",
    ]
    # Also include category if present (MMLU)
    if any("category" in row for row in results):
        fields.insert(3, "category")

    target = CSV_PATH if path is None else path
    # Explicit UTF-8: raw model output can contain arbitrary Unicode, and the
    # locale default encoding (e.g. cp1252 on Windows) would raise
    # UnicodeEncodeError on such rows.
    with open(target, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(results)
|
||||
|
||||
|
||||
# Script entry point: run the full sweep only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user