#!/usr/bin/env python3
"""Orchestrate MMLU, GSM8K, HumanEval across all models."""

import csv
import json
import os
import sys
import time
from pathlib import Path

from openai import OpenAI

# OpenAI-compatible endpoint; override with the LLAMA_SWAP_URL environment variable.
ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
RESULTS_DIR = Path(__file__).parent / "results"
CSV_PATH = Path(__file__).parent / "scores.csv"

MODELS = [
    "qwen3.6-35b-a3b-mxfp4",
    "qwen3-coder-30b-apex",
    "qwen3.6-27b-mtp",
    "qwopus3.5-4b-mtp",
    "qwen3.5-9b-deepseek-v4-mtp",
    "qwopus3.6-35b-a3b-v1",
    "qwopus3.6-27b-v2-mtp",
    "qwopus3.5-9b-coder-mtp",
]

# Warmup retry policy: number of attempts and the pause between them.
WARMUP_ATTEMPTS = 3
WARMUP_RETRY_DELAY_S = 10


def warmup_model(client: OpenAI, model: str) -> bool:
    """Send a tiny chat completion to *model* and report whether it succeeded.

    The request is retried up to ``WARMUP_ATTEMPTS`` times, pausing
    ``WARMUP_RETRY_DELAY_S`` seconds between attempts. Progress and failures
    are logged to stderr.

    Args:
        client: OpenAI-compatible client pointed at the serving endpoint.
        model: Name of the model to request.

    Returns:
        True as soon as one request succeeds, False if every attempt failed.
        A failure is only a warning; nothing is raised.
    """
    banner = "=" * 60
    print(f"\n{banner}", file=sys.stderr)
    print(f" Loading model: {model}", file=sys.stderr)
    print(banner, file=sys.stderr)

    for attempt in range(1, WARMUP_ATTEMPTS + 1):
        try:
            client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Say OK."}],
                max_tokens=10,
                temperature=0,
            )
        except Exception as e:
            print(f" Warmup attempt {attempt} failed: {e}", file=sys.stderr)
            # Only wait when another attempt follows; sleeping after the last
            # failure would just delay the run.
            if attempt < WARMUP_ATTEMPTS:
                time.sleep(WARMUP_RETRY_DELAY_S)
        else:
            print(" Warmup OK", file=sys.stderr)
            return True

    print(f" WARNING: warmup failed for {model}, continuing anyway", file=sys.stderr)
    return False


def run_benchmark(module_name: str, model: str, client: OpenAI) -> list[dict]:
    """Load the dataset for one benchmark and run it against *model*.

    The benchmark modules are imported lazily, so only the requested one is
    loaded.

    Args:
        module_name: One of ``"mmlu"``, ``"gsm8k"`` or ``"humaneval"``.
        model: Name of the model to evaluate.
        client: OpenAI-compatible client used to query the model.

    Returns:
        The result rows produced by the benchmark's runner function.

    Raises:
        ValueError: If *module_name* is not a known benchmark.
    """
    if module_name == "mmlu":
        from mmlu import load_questions, run_mmlu

        questions = load_questions()
        return run_mmlu(model, client, questions)
    elif module_name == "gsm8k":
        from gsm8k import load_questions, run_gsm8k

        questions = load_questions()
        return run_gsm8k(model, client, questions)
    elif module_name == "humaneval":
        from humaneval import load_problems, run_humaneval

        problems = load_problems()
        return run_humaneval(model, client, problems)
    else:
        raise ValueError(f"Unknown benchmark: {module_name}")


def main() -> None:
    """Run every benchmark against every model in ``MODELS``.

    Exits with status 1 if the endpoint cannot be reached. Otherwise each
    model is warmed up and then evaluated on each benchmark in turn. The CSV
    is rewritten after every successful benchmark so partial results survive
    an interrupted run. A failing benchmark is logged and skipped, and all
    failures are summarised on stderr at the end.
    """
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")

    # Check connectivity
    try:
        client.models.list()
    except Exception as e:
        print(f"Cannot connect to {ENDPOINT}: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        print("Connected to llama-swap", file=sys.stderr)

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    all_results: list[dict] = []
    failures: list[str] = []
    benchmarks = ["mmlu", "gsm8k", "humaneval"]
    t_start = time.time()

    for model in MODELS:
        # A failed warmup is only a warning; the benchmarks are still attempted.
        warmup_model(client, model)
        for bench in benchmarks:
            print(f"\n --- {model} / {bench} ---", file=sys.stderr)
            try:
                results = run_benchmark(bench, model, client)
                all_results.extend(results)
                write_csv(all_results)
            except Exception as e:
                # Best effort: one broken model/benchmark pair must not abort
                # the remaining runs, but it is remembered for the summary.
                print(f" ERROR in {model}/{bench}: {e}", file=sys.stderr)
                failures.append(f"{model}/{bench}")

    elapsed = time.time() - t_start
    print(f"\nAll benchmarks complete in {elapsed/60:.0f} minutes", file=sys.stderr)
    if failures:
        print(
            f"{len(failures)} benchmark run(s) failed: {', '.join(failures)}",
            file=sys.stderr,
        )
    print(f"Results: {CSV_PATH}", file=sys.stderr)


def write_csv(results: list[dict]) -> None:
    """Write all accumulated result rows to ``CSV_PATH``.

    Nothing is written when *results* is empty. A ``category`` column is added
    after ``question_id`` when at least one row carries that key. Keys outside
    the column list are ignored.

    The rows are written to a temporary sibling file that then replaces
    ``CSV_PATH``, so an interrupted write cannot truncate an existing CSV.

    Args:
        results: Result rows, one dict per evaluated question.
    """
    if not results:
        return

    fields = [
        "model",
        "benchmark",
        "question_id",
        "correct",
        "raw_answer",
        "parsed_answer",
        "expected",
        "latency_ms",
    ]
    # Also include category if present (MMLU)
    if any("category" in r for r in results):
        fields.insert(3, "category")

    tmp_path = CSV_PATH.with_name(CSV_PATH.name + ".tmp")
    # Explicit UTF-8: raw model answers may contain characters that the
    # locale's default encoding cannot represent.
    with open(tmp_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        w.writeheader()
        w.writerows(results)
    os.replace(tmp_path, CSV_PATH)


if __name__ == "__main__":
    main()