#!/usr/bin/env python3
"""HumanEval benchmark — 164 problems with sandboxed execution.

Each problem prompt is sent to an OpenAI-compatible chat endpoint, the code is
extracted from the reply, and the dataset's test harness is run against it in
a child Python process.  Per-problem results are cached as JSON under
``results/<model>/humaneval/`` so an interrupted run can be resumed.

NOTE: the "sandbox" is only a scrubbed environment plus ``cwd=/tmp`` (see
``run_test``); generated code still runs with the invoking user's privileges.
"""

import json
import os
import re
import subprocess
import sys
import tempfile
import textwrap
import time
import warnings
from pathlib import Path
from typing import Optional

from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm

ENDPOINT = os.environ.get("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")
RESULTS_DIR = Path(__file__).parent / "results"
MAX_TOKENS = 1024
SEED = 42
TEMPERATURE = 0
EXEC_TIMEOUT = 30

# Opening fence may be bare or tagged python/python3/py (any case), and may be
# followed by trailing blanks or a CRLF line ending.
_CODE_FENCE_RE = re.compile(
    r"```(?:python3?|py)?[ \t]*\r?\n(.*?)```", re.DOTALL | re.IGNORECASE
)


def load_problems() -> list[dict]:
    """Load the HumanEval test split as a list of plain dicts.

    Returns:
        One dict per dataset row with the keys ``id``, ``prompt``,
        ``canonical``, ``test`` and ``entry_point`` (copied from the row's
        ``task_id``, ``prompt``, ``canonical_solution``, ``test`` and
        ``entry_point`` fields).
    """
    # NOTE(review): trust_remote_code=True presumably allows the dataset repo
    # to run its own loading script — confirm it is still required.
    ds = load_dataset("openai/openai_humaneval", split="test", trust_remote_code=True)
    return [
        {
            "id": row["task_id"],
            "prompt": row["prompt"],
            "canonical": row["canonical_solution"],
            "test": row["test"],
            "entry_point": row["entry_point"],
        }
        for row in ds
    ]


def _compiles(source: str) -> bool:
    """Return True if *source* is syntactically valid as a standalone module."""
    with warnings.catch_warnings():
        # Docstrings with stray backslashes would otherwise emit SyntaxWarning.
        warnings.simplefilter("ignore")
        try:
            compile(source, "<prompt>", "exec")
        except (SyntaxError, ValueError):
            return False
    return True


def _with_prompt_context(prompt: str, code: str) -> str:
    """Prefix *code* (which restates the function) with the prompt when safe.

    The prompt may carry imports or helper definitions that the model's answer
    relies on but does not repeat.  Prepending it keeps those names available;
    the later definition in *code* overrides the prompt's stub.  If the prompt
    is not valid Python on its own, *code* is returned unchanged.
    """
    if _compiles(prompt):
        return prompt + "\n" + code
    return code


def extract_code(response: str, prompt: str) -> str:
    """Turn a raw model reply into source text to execute.

    Strategy, in order:
      1. First fenced code block.  If it contains ``def ``, it is treated as a
         full definition (with the prompt prepended for its imports/helpers);
         otherwise it is treated as a function body and appended to the prompt.
      2. No fence: everything from the first line starting with ``def `` to
         the end of the reply (trailing prose is not stripped).
      3. Otherwise the raw reply is appended to the prompt.

    Args:
        response: Text returned by the model.
        prompt: The problem prompt (signature and docstring).

    Returns:
        Source code to pass to ``run_test``.
    """
    blocks = _CODE_FENCE_RE.findall(response)
    if blocks:
        code = blocks[0]
        if "def " in code:
            return _with_prompt_context(prompt, code)
        # Body-only answer: it continues the prompt's function definition.
        return prompt + code

    lines = response.split("\n")
    for start, line in enumerate(lines):
        if line.strip().startswith("def "):
            return _with_prompt_context(prompt, "\n".join(lines[start:]))

    # Last resort: treat the whole reply as the function body.
    return prompt + response


def run_test(code: str, test_code: str, entry_point: str) -> tuple[bool, str]:
    """Execute *code* against the problem's test harness in a child process.

    The program run is ``code``, then ``test_code``, then a call to
    ``check(<entry_point>)``.  Isolation is limited to a minimal environment
    (``PATH``/``HOME`` only), ``cwd=/tmp`` and a wall-clock timeout — this is
    NOT a security sandbox.

    Args:
        code: Candidate solution source.
        test_code: Test source that defines ``check``.
        entry_point: Name of the function passed to ``check``.

    Returns:
        ``(passed, output)`` where ``passed`` is True iff the child exited
        with status 0, and ``output`` is the last 500 characters of stderr
        (or stdout if stderr is empty), ``"TIMEOUT"``, or an error message.
    """
    full = code + "\n\n" + test_code + f"\n\ncheck({entry_point})\n"
    fd, fpath = tempfile.mkstemp(suffix=".py", dir="/tmp")
    try:
        # Python reads source files as UTF-8, so write UTF-8 regardless of the
        # parent's locale.
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(full)

        env = {"PATH": "/usr/bin:/usr/local/bin", "HOME": "/tmp"}
        result = subprocess.run(
            [sys.executable, fpath],
            capture_output=True,
            text=True,
            # Undecodable child output must not turn a passing run into an
            # exception (and therefore a reported failure).
            encoding="utf-8",
            errors="replace",
            timeout=EXEC_TIMEOUT,
            cwd="/tmp",
            env=env,
        )
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    except Exception as e:  # boundary: any launch/write failure counts as a fail
        return False, str(e)[:500]
    finally:
        try:
            os.unlink(fpath)
        except OSError:
            pass

    # Keep the tail: the exception type and message are at the end of a traceback.
    output = result.stderr or result.stdout
    return result.returncode == 0, output[-500:]


def _result_row(
    model: str, question_id: str, passed: bool, raw_answer: str, latency_ms: float
) -> dict:
    """Build one result record in the shape emitted on stdout."""
    return {
        "model": model,
        "benchmark": "humaneval",
        "question_id": question_id,
        "correct": passed,
        "raw_answer": raw_answer,
        "parsed_answer": "pass" if passed else "fail",
        "expected": "pass",
        "latency_ms": latency_ms,
    }


def _load_cached_verdict(out_path: Path) -> Optional[bool]:
    """Return the cached pass/fail for *out_path*, or None if it must be (re)run.

    None is returned when the file is missing, unreadable, not a JSON object,
    or records a failed API request (an ``error`` response without
    ``choices``) — a transport failure is not a verdict on the model.
    """
    try:
        cached = json.loads(out_path.read_text())
    except (OSError, ValueError):  # missing/unreadable file or invalid JSON
        return None
    if not isinstance(cached, dict):
        return None
    response = cached.get("response")
    if isinstance(response, dict) and "error" in response and "choices" not in response:
        return None
    return bool(cached.get("passed", False))


def _request_completion(model: str, client: OpenAI, prompt: str) -> dict:
    """Ask the model to complete *prompt*; retry once after 5 s on any error.

    Returns:
        The dumped response dict, or ``{"error": <message>}`` if both attempts
        raised.
    """
    messages = [{"role": "user", "content": (
        "Complete the following Python function. "
        "Return ONLY the complete function implementation.\n\n"
        + prompt
    )}]
    last_error = None
    for attempt in range(2):
        if attempt:
            time.sleep(5)
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                seed=SEED,
            )
            return resp.model_dump()
        except Exception as e:  # boundary: any client/transport failure
            last_error = e
    return {"error": str(last_error)}


def _response_text(resp_json: dict) -> str:
    """Extract the reply text from a dumped chat response ('' if absent).

    Falls back to ``reasoning_content`` when ``content`` is empty.
    """
    choices = resp_json.get("choices") or []
    if not choices:
        return ""
    msg = choices[0].get("message") or {}
    return msg.get("content") or msg.get("reasoning_content") or ""


def run_humaneval(model: str, client: OpenAI, problems: list[dict]) -> list[dict]:
    """Run every problem through *model* and return one result row per problem.

    Results are cached per problem under ``RESULTS_DIR/<model>/humaneval``;
    problems with a usable cache entry are not re-queried.  Progress and the
    final score are printed to stderr.

    Args:
        model: Model name sent to the endpoint and used as the cache directory.
        client: OpenAI-compatible client.
        problems: Problem dicts as produced by ``load_problems``.

    Returns:
        A list of result dicts (see ``_result_row``), in problem order.
    """
    model_dir = RESULTS_DIR / model / "humaneval"
    model_dir.mkdir(parents=True, exist_ok=True)

    results = []
    correct = 0
    total = 0
    skipped = 0

    for i, p in enumerate(tqdm(problems, desc=f" HumanEval {model}", file=sys.stderr)):
        out_path = model_dir / f"{p['id'].replace('/', '_')}.json"

        cached_passed = _load_cached_verdict(out_path)
        if cached_passed is not None:
            correct += cached_passed
            total += 1
            skipped += 1
            results.append(_result_row(model, p["id"], cached_passed, "", 0))
            continue

        # Monotonic clock: latency must not be affected by wall-clock jumps.
        t0 = time.perf_counter()
        resp_json = _request_completion(model, client, p["prompt"])
        latency = (time.perf_counter() - t0) * 1000

        raw = _response_text(resp_json)
        code = extract_code(raw, p["prompt"])
        passed, exec_output = run_test(code, p["test"], p["entry_point"])

        correct += passed
        total += 1

        out_path.write_text(json.dumps({
            "response": resp_json,
            "extracted_code": code[:2000],
            "passed": passed,
            "exec_output": exec_output,
        }, indent=2, default=str))

        results.append(_result_row(model, p["id"], passed, raw[:200], round(latency, 1)))

        if (i + 1) % 10 == 0:
            print(f" [{model}] HumanEval {i+1}/{len(problems)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)

    if skipped:
        print(f" [{model}] HumanEval resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
    # Guard the empty-problem-list case instead of dividing by zero.
    pct = correct / total * 100 if total else 0.0
    print(f" [{model}] HumanEval FINAL: {correct}/{total} ({pct:.1f}%)", file=sys.stderr)
    return results


def main() -> None:
    """Run the benchmark for the model named in argv[1] and print JSON lines."""
    model = sys.argv[1] if len(sys.argv) > 1 else "qwen3.6-35b-a3b-mxfp4"
    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
    problems = load_problems()
    results = run_humaneval(model, client, problems)
    for r in results:
        print(json.dumps(r))


if __name__ == "__main__":
    main()