llama-sidecar v0.1.0: daemon + benchmarks + eval suite

Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00
parent babbb4f39b
commit fe7f36ae98
39 changed files with 4228 additions and 0 deletions
--- a/eval/humaneval.py
+++ b/eval/humaneval.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""HumanEval benchmark — 164 problems with sandboxed execution."""
+
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import textwrap
+import time
+from pathlib import Path
+
+from datasets import load_dataset
+from openai import OpenAI
+from tqdm import tqdm
+
+# Base URL handed to the OpenAI client; the LLAMA_SWAP_URL environment
+# variable overrides the built-in default.
+ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
+# Root of the per-model result cache: a "results" directory next to this file.
+RESULTS_DIR = Path(__file__).parent / "results"
+# Passed as max_tokens on every chat-completion request.
+MAX_TOKENS = 1024
+# Passed as seed on every chat-completion request.
+SEED = 42
+# Passed as temperature on every chat-completion request.
+TEMPERATURE = 0
+# Timeout, in seconds, for the subprocess that executes one candidate solution.
+EXEC_TIMEOUT = 30
+
+
+def load_problems() -> list[dict]:
+    ds = load_dataset("openai/openai_humaneval", split="test", trust_remote_code=True)
+    problems = []
+    for row in ds:
+        problems.append({
+            "id": row["task_id"],
+            "prompt": row["prompt"],
+            "canonical": row["canonical_solution"],
+            "test": row["test"],
+            "entry_point": row["entry_point"],
+        })
+    return problems
+
+
+def extract_code(response: str, prompt: str) -> str:
+    # Try to find a code block
+    blocks = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
+    if blocks:
+        code = blocks[0]
+        # If the code block contains the function signature, use it directly
+        if "def " in code:
+            return code
+        # Otherwise prepend the prompt (function signature)
+        return prompt + code
+
+    # No code block — try to extract everything from the first def onwards
+    lines = response.split("\n")
+    in_code = False
+    code_lines = []
+    for line in lines:
+        if line.strip().startswith("def ") or in_code:
+            in_code = True
+            code_lines.append(line)
+        elif in_code and line.strip() == "":
+            code_lines.append(line)
+
+    if code_lines:
+        return "\n".join(code_lines)
+
+    # Last resort: prepend prompt to raw response
+    return prompt + response
+
+
+def run_test(code: str, test_code: str, entry_point: str) -> tuple[bool, str]:
+    full = code + "\n\n" + test_code + f"\n\ncheck({entry_point})\n"
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".py", dir="/tmp", delete=False
+    ) as f:
+        f.write(full)
+        f.flush()
+        fpath = f.name
+
+    try:
+        # Sandboxed execution: restrict to /tmp, limited PATH
+        env = {"PATH": "/usr/bin:/usr/local/bin", "HOME": "/tmp"}
+        result = subprocess.run(
+            [sys.executable, fpath],
+            capture_output=True, text=True,
+            timeout=EXEC_TIMEOUT,
+            cwd="/tmp",
+            env=env,
+        )
+        passed = result.returncode == 0
+        output = result.stderr[:500] if result.stderr else result.stdout[:500]
+        return passed, output
+    except subprocess.TimeoutExpired:
+        return False, "TIMEOUT"
+    except Exception as e:
+        return False, str(e)[:500]
+    finally:
+        try:
+            os.unlink(fpath)
+        except OSError:
+            pass
+
+
+def run_humaneval(model: str, client: OpenAI, problems: list[dict]) -> list[dict]:
+    model_dir = RESULTS_DIR / model / "humaneval"
+    model_dir.mkdir(parents=True, exist_ok=True)
+
+    results = []
+    correct = 0
+    total = 0
+
+    skipped = 0
+    for i, p in enumerate(tqdm(problems, desc=f"  HumanEval {model}", file=sys.stderr)):
+        out_path = model_dir / f"{p['id'].replace('/', '_')}.json"
+
+        if out_path.exists():
+            try:
+                cached = json.loads(out_path.read_text())
+                passed = cached.get("passed", False)
+                if passed:
+                    correct += 1
+                total += 1
+                results.append({
+                    "model": model, "benchmark": "humaneval",
+                    "question_id": p["id"], "correct": passed,
+                    "raw_answer": "", "parsed_answer": "pass" if passed else "fail",
+                    "expected": "pass", "latency_ms": 0,
+                })
+                skipped += 1
+                continue
+            except (json.JSONDecodeError, KeyError):
+                pass
+
+        t0 = time.time()
+        resp_json = None
+        for attempt in range(2):
+            try:
+                resp = client.chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", "content": (
+                        "Complete the following Python function. "
+                        "Return ONLY the complete function implementation.\n\n"
+                        + p["prompt"]
+                    )}],
+                    max_tokens=MAX_TOKENS,
+                    temperature=TEMPERATURE,
+                    seed=SEED,
+                )
+                resp_json = resp.model_dump()
+                break
+            except Exception as e:
+                if attempt == 0:
+                    time.sleep(5)
+                else:
+                    resp_json = {"error": str(e)}
+        latency = (time.time() - t0) * 1000
+
+        raw = ""
+        if resp_json and "choices" in resp_json:
+            msg = resp_json["choices"][0].get("message", {})
+            raw = msg.get("content", "") or msg.get("reasoning_content", "") or ""
+
+        code = extract_code(raw, p["prompt"])
+        passed, exec_output = run_test(code, p["test"], p["entry_point"])
+        if passed:
+            correct += 1
+        total += 1
+
+        out_path.write_text(json.dumps({
+            "response": resp_json,
+            "extracted_code": code[:2000],
+            "passed": passed,
+            "exec_output": exec_output,
+        }, indent=2, default=str))
+
+        results.append({
+            "model": model,
+            "benchmark": "humaneval",
+            "question_id": p["id"],
+            "correct": passed,
+            "raw_answer": raw[:200],
+            "parsed_answer": "pass" if passed else "fail",
+            "expected": "pass",
+            "latency_ms": round(latency, 1),
+        })
+
+        if (i + 1) % 10 == 0:
+            print(f"  [{model}] HumanEval {i+1}/{len(problems)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)
+
+    if skipped:
+        print(f"  [{model}] HumanEval resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
+    print(f"  [{model}] HumanEval FINAL: {correct}/{total} ({correct/total*100:.1f}%)", file=sys.stderr)
+    return results
+
+
+if __name__ == "__main__":
+    model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4"
+    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
+    problems = load_problems()
+    results = run_humaneval(model, client, problems)
+    for r in results:
+        print(json.dumps(r))