llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
201
eval/humaneval.py
Normal file
201
eval/humaneval.py
Normal file
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HumanEval benchmark — 164 problems with sandboxed execution."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import textwrap
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from datasets import load_dataset
|
||||
from openai import OpenAI
|
||||
from tqdm import tqdm
|
||||
|
||||
# Base URL of the OpenAI-compatible API; the LLAMA_SWAP_URL environment
# variable overrides the built-in default.
ENDPOINT = os.getenv("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
# Root directory for per-question result files, located next to this script.
RESULTS_DIR = Path(__file__).with_name("results")
# Generation settings sent with every completion request.
MAX_TOKENS = 1024
SEED = 42
TEMPERATURE = 0
# Seconds a generated solution may run before it is reported as a timeout.
EXEC_TIMEOUT = 30
|
||||
|
||||
|
||||
def load_problems() -> list[dict]:
    """Load the HumanEval test split as a list of plain dicts.

    Returns:
        One dict per dataset row with the keys ``id`` (from ``task_id``),
        ``prompt``, ``canonical`` (from ``canonical_solution``), ``test``
        and ``entry_point``.
    """
    # trust_remote_code is intentionally NOT passed: it opts in to executing
    # loader code shipped in the dataset repository on this machine.
    # NOTE(review): assumes the hub copy of openai/openai_humaneval is served
    # as plain data files (no loading script) — confirm with the installed
    # `datasets` version.
    ds = load_dataset("openai/openai_humaneval", split="test")
    return [
        {
            "id": row["task_id"],
            "prompt": row["prompt"],
            "canonical": row["canonical_solution"],
            "test": row["test"],
            "entry_point": row["entry_point"],
        }
        for row in ds
    ]
|
||||
|
||||
|
||||
def _with_prompt_imports(code: str, prompt: str) -> str:
    """Prepend the prompt's leading import lines that *code* does not already contain."""
    present = {line.strip() for line in code.split("\n")}
    missing = []
    for line in prompt.split("\n"):
        # Only the preamble before the first definition is scanned, so text
        # inside docstrings is never mistaken for an import statement.
        if line.startswith(("def ", "class ", "@")):
            break
        if line.startswith(("import ", "from ")) and line.strip() not in present:
            missing.append(line.strip())
    if not missing:
        return code
    return "\n".join(missing) + "\n" + code


def extract_code(response: str, prompt: str) -> str:
    """Turn a raw model reply into a runnable solution for *prompt*.

    Strategy, in order:

    1. Use the first fenced code block (```` ``` ```` or ```` ```python ````).
       If it contains a ``def`` it is treated as a full implementation;
       otherwise it is treated as a function body and appended to *prompt*.
    2. Without a fenced block, keep everything from the first line whose
       stripped text starts with ``def`` through the end of the reply.
    3. Otherwise append the whole reply to *prompt*.

    When the reply supplies its own ``def`` (cases 1 and 2), import lines
    from the top of *prompt* that the reply omitted are prepended, so that
    names used in the signature (e.g. ``List``) still resolve.

    Args:
        response: Text returned by the model.
        prompt: The problem prompt (imports, signature and docstring).

    Returns:
        Python source intended to define the requested function.
    """
    fenced = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
    if fenced:
        first = fenced[0]
        if "def " not in first:
            # Body-only answer: the prompt provides imports and signature.
            return prompt + first
        return _with_prompt_imports(first, prompt)

    # No code block: take everything from the first def onwards.
    lines = response.split("\n")
    for idx, line in enumerate(lines):
        if line.strip().startswith("def "):
            return _with_prompt_imports("\n".join(lines[idx:]), prompt)

    # Last resort: treat the raw reply as the function body.
    return prompt + response
|
||||
|
||||
|
||||
def run_test(code: str, test_code: str, entry_point: str,
             timeout: "float | None" = None) -> tuple[bool, str]:
    """Run a candidate solution against its test code in a child interpreter.

    The program ``code + test_code + check(entry_point)`` is written to a
    temporary ``.py`` file and executed with ``sys.executable``.

    NOTE: this is not an OS-level sandbox. The child only gets a minimal
    environment (``PATH`` and ``HOME``) and the temp directory as its working
    directory; nothing here restricts file or network access.

    Args:
        code: Source that should define ``entry_point``.
        test_code: Source that defines ``check``.
        entry_point: Name of the function passed to ``check``.
        timeout: Seconds to allow the child to run; defaults to
            ``EXEC_TIMEOUT`` when ``None``.

    Returns:
        ``(passed, output)``. ``passed`` is True when the child exits with
        status 0. ``output`` is the last 500 characters of stderr (or of
        stdout when stderr is empty), ``"TIMEOUT"`` on timeout, or the
        message of any other exception raised while preparing/running.
    """
    if timeout is None:
        timeout = EXEC_TIMEOUT
    program = code + "\n\n" + test_code + f"\n\ncheck({entry_point})\n"
    workdir = tempfile.gettempdir()

    fpath = None
    try:
        # Written as UTF-8 because that is how Python decodes source files by
        # default. Writing inside the try means an unencodable character in
        # model output is reported as a failure instead of aborting the run.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", dir=workdir, delete=False, encoding="utf-8"
        ) as f:
            fpath = f.name
            f.write(program)

        env = {"PATH": "/usr/bin:/usr/local/bin", "HOME": workdir}
        result = subprocess.run(
            [sys.executable, fpath],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",  # undecodable child output must not fail the test
            timeout=timeout,
            cwd=workdir,
            env=env,
        )
        passed = result.returncode == 0
        # Keep the tail: a traceback ends with the actual exception line.
        output = (result.stderr or result.stdout)[-500:]
        return passed, output
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    except Exception as e:
        return False, str(e)[:500]
    finally:
        if fpath is not None:
            try:
                os.unlink(fpath)
            except OSError:
                pass
|
||||
|
||||
|
||||
def _load_cached_verdict(path: Path) -> "bool | None":
    """Return the pass/fail verdict stored at *path*, or None if the problem must be (re)run."""
    if not path.exists():
        return None
    try:
        cached = json.loads(path.read_text())
    except (OSError, ValueError):
        # Unreadable or corrupt cache file: evaluate the problem again.
        return None
    if not isinstance(cached, dict):
        return None
    response = cached.get("response")
    if isinstance(response, dict) and "error" in response and "choices" not in response:
        # The previous run never got an answer from the server; do not freeze
        # that transient failure into the score — retry on resume.
        return None
    return bool(cached.get("passed", False))


def _request_completion(client: OpenAI, model: str, prompt: str) -> dict:
    """Request one completion, retrying once after 5 s; return the dumped response or ``{"error": ...}``."""
    last_error = None
    for attempt in range(2):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": (
                    "Complete the following Python function. "
                    "Return ONLY the complete function implementation.\n\n"
                    + prompt
                )}],
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                seed=SEED,
            )
            return resp.model_dump()
        except Exception as e:
            last_error = e
            if attempt == 0:
                time.sleep(5)
    return {"error": str(last_error)}


def _message_text(resp_json: dict) -> str:
    """Return the first choice's ``content`` (or ``reasoning_content``), or ``""`` if absent."""
    choices = resp_json.get("choices") or []
    if not choices:
        return ""
    msg = choices[0].get("message") or {}
    return msg.get("content") or msg.get("reasoning_content") or ""


def _percent(correct: int, total: int) -> float:
    """Return ``correct / total`` as a percentage, or 0.0 when nothing was evaluated."""
    return correct / total * 100 if total else 0.0


def run_humaneval(model: str, client: OpenAI, problems: list[dict]) -> list[dict]:
    """Evaluate *model* on *problems*, resuming from per-problem result files.

    For each problem a JSON file is kept under
    ``RESULTS_DIR / model / "humaneval"``. A problem with a usable cached
    file is not re-queried; otherwise the model is asked for a completion,
    the extracted code is executed against the problem's tests, and the
    outcome is written to that file. Cached files that only recorded an API
    error are ignored so the problem is retried.

    Progress and the final score are printed to stderr.

    Args:
        model: Model name sent to the API and used as the results sub-directory.
        client: OpenAI-compatible client used for chat completions.
        problems: Dicts with at least ``id``, ``prompt``, ``test`` and
            ``entry_point`` keys.

    Returns:
        One result dict per problem with the keys ``model``, ``benchmark``,
        ``question_id``, ``correct``, ``raw_answer``, ``parsed_answer``,
        ``expected`` and ``latency_ms``. Cached problems report an empty
        ``raw_answer`` and ``latency_ms`` of 0.
    """
    model_dir = RESULTS_DIR / model / "humaneval"
    model_dir.mkdir(parents=True, exist_ok=True)

    results = []
    correct = 0
    total = 0
    skipped = 0

    for i, p in enumerate(tqdm(problems, desc=f" HumanEval {model}", file=sys.stderr)):
        out_path = model_dir / f"{p['id'].replace('/', '_')}.json"

        cached_passed = _load_cached_verdict(out_path)
        if cached_passed is not None:
            if cached_passed:
                correct += 1
            total += 1
            skipped += 1
            results.append({
                "model": model, "benchmark": "humaneval",
                "question_id": p["id"], "correct": cached_passed,
                "raw_answer": "", "parsed_answer": "pass" if cached_passed else "fail",
                "expected": "pass", "latency_ms": 0,
            })
            continue

        t0 = time.time()
        resp_json = _request_completion(client, model, p["prompt"])
        latency = (time.time() - t0) * 1000

        raw = _message_text(resp_json)
        code = extract_code(raw, p["prompt"])
        passed, exec_output = run_test(code, p["test"], p["entry_point"])
        if passed:
            correct += 1
        total += 1

        out_path.write_text(json.dumps({
            "response": resp_json,
            "extracted_code": code[:2000],
            "passed": passed,
            "exec_output": exec_output,
        }, indent=2, default=str))

        results.append({
            "model": model,
            "benchmark": "humaneval",
            "question_id": p["id"],
            "correct": passed,
            "raw_answer": raw[:200],
            "parsed_answer": "pass" if passed else "fail",
            "expected": "pass",
            "latency_ms": round(latency, 1),
        })

        if (i + 1) % 10 == 0:
            print(f" [{model}] HumanEval {i+1}/{len(problems)} — {correct}/{total} ({_percent(correct, total):.0f}%)", file=sys.stderr)

    if skipped:
        print(f" [{model}] HumanEval resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
    # _percent guards the empty-problem-list case (total == 0).
    print(f" [{model}] HumanEval FINAL: {correct}/{total} ({_percent(correct, total):.1f}%)", file=sys.stderr)
    return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Usage: humaneval.py [MODEL] — the model name is optional.
    cli_args = sys.argv[1:]
    model = cli_args[0] if cli_args else "qwen3.6-35b-a3b-mxfp4"
    client = OpenAI(api_key="dummy", base_url=ENDPOINT)
    problems = load_problems()
    results = run_humaneval(model, client, problems)
    # Emit one JSON object per line on stdout (progress goes to stderr).
    for record in results:
        print(json.dumps(record))
|
||||
Reference in New Issue
Block a user