llama-sidecar/eval/humaneval.py

#!/usr/bin/env python3
"""HumanEval benchmark — 164 problems with sandboxed execution."""

import json
import os
import re
import subprocess
import sys
import tempfile
import textwrap
import time
from pathlib import Path

from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm

# Base URL of the OpenAI-compatible server; the LLAMA_SWAP_URL environment
# variable overrides the built-in default.
ENDPOINT = os.getenv("LLAMA_SWAP_URL", "http://100.101.41.16:8401/v1")

# Per-problem result files are stored in a "results" directory beside this file.
RESULTS_DIR = Path(__file__).parent.joinpath("results")

# Generation settings sent with every chat-completion request.
MAX_TOKENS = 1024
SEED = 42
TEMPERATURE = 0

# Seconds a generated solution may run before it is reported as a timeout.
EXEC_TIMEOUT = 30


def load_problems() -> list[dict]:
    """Load the HumanEval test split and return one plain dict per task.

    Each returned dict has the keys ``id``, ``prompt``, ``canonical``,
    ``test`` and ``entry_point``, copied from the dataset columns
    ``task_id``, ``prompt``, ``canonical_solution``, ``test`` and
    ``entry_point`` respectively.
    """
    dataset = load_dataset("openai/openai_humaneval", split="test", trust_remote_code=True)
    # Output key -> dataset column; insertion order fixes the key order of each dict.
    column_for_key = {
        "id": "task_id",
        "prompt": "prompt",
        "canonical": "canonical_solution",
        "test": "test",
        "entry_point": "entry_point",
    }
    return [
        {key: record[column] for key, column in column_for_key.items()}
        for record in dataset
    ]


# A fenced block with any info string after the opening fence (```python,
# ```py, ```python3, or a bare ```), capturing the text up to the next fence.
_FENCE_RE = re.compile(r"```[^\n`]*\n(.*?)```", re.DOTALL)

# Column-0 line starts that may still be code once a ``def`` has been seen.
# The closing brackets cover multi-line signatures such as ``) -> int:``.
_TOPLEVEL_CODE_PREFIXES = (
    "def ", "async def ", "class ", "@", "#", "import ", "from ", ")", "]", "}",
)


def _compiles(source: str) -> bool:
    """Return True if *source* is syntactically valid Python.

    The source is only compiled, never executed.
    """
    try:
        compile(source, "<response>", "exec", dont_inherit=True)
    except (SyntaxError, ValueError):
        return False
    return True


def extract_code(response: str, prompt: str) -> str:
    """Extract runnable Python source from a model response.

    Strategy, in order:

    1. If the response contains a fenced code block, use the first one. When
       that block contains ``"def "`` it is returned as-is; otherwise it is
       treated as a function body and appended to *prompt*.
    2. Otherwise take everything from the first line that starts with
       ``def`` (ignoring leading whitespace). If that text is valid Python it
       is returned unchanged; if not, it is cut at the first unindented,
       non-blank line that does not look like code, which drops trailing
       prose or a stray fence.
    3. If there is no ``def`` at all, return *prompt* + *response*.

    Args:
        response: Raw text produced by the model.
        prompt: The problem prompt (function signature and docstring).

    Returns:
        The source text to execute against the problem's tests.
    """
    blocks = _FENCE_RE.findall(response)
    if blocks:
        code = blocks[0]
        # A block that defines a function is assumed to be self-contained.
        if "def " in code:
            return code
        # Body-only completion: prepend the prompt (function signature).
        return prompt + code

    lines = response.split("\n")
    start = next(
        (i for i, line in enumerate(lines) if line.strip().startswith("def ")),
        None,
    )
    if start is None:
        # Last resort: prepend prompt to raw response.
        return prompt + response

    tail = lines[start:]
    candidate = "\n".join(tail)
    if _compiles(candidate):
        # Already valid Python: keep it whole so nothing the function may
        # depend on (e.g. a later top-level constant) is lost.
        return candidate

    kept = []
    for line in tail:
        is_blank = not line.strip()
        is_indented = line[:1].isspace()
        if not (is_blank or is_indented or line.startswith(_TOPLEVEL_CODE_PREFIXES)):
            break  # first column-0 line that is not code: stop here
        kept.append(line)
    return "\n".join(kept)


def run_test(
    code: str,
    test_code: str,
    entry_point: str,
    *,
    timeout: "float | None" = None,
) -> tuple[bool, str]:
    """Run *code* against *test_code* in a child Python process.

    The program executed is ``code``, then ``test_code``, then a call
    ``check(<entry_point>)``. It is written to a temporary ``.py`` file under
    ``/tmp`` and run with the current interpreter, with ``/tmp`` as working
    directory and an environment containing only ``PATH`` and ``HOME``.

    NOTE: this is process separation, not a sandbox. Nothing here restricts
    the child's filesystem or network access, so generated code runs with
    the privileges of the calling user.

    Args:
        code: Candidate solution source.
        test_code: Source that defines ``check(candidate)``.
        entry_point: Name of the function passed to ``check``.
        timeout: Seconds to wait for the child; defaults to ``EXEC_TIMEOUT``.

    Returns:
        ``(passed, output)``. ``passed`` is True when the child exits with
        status 0. ``output`` is the first 500 characters of stderr (or of
        stdout when stderr is empty), ``"TIMEOUT"`` on timeout, or the first
        500 characters of the exception text for any other failure.
    """
    if timeout is None:
        timeout = EXEC_TIMEOUT
    full = code + "\n\n" + test_code + f"\n\ncheck({entry_point})\n"

    # Python source files are read as UTF-8 by default, so write UTF-8
    # explicitly instead of relying on the locale encoding.
    script = tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", dir="/tmp", delete=False, encoding="utf-8"
    )
    fpath = script.name

    try:
        # Writing happens inside the try so that a failed write is reported
        # like any other failure and the file is still removed.
        with script:
            script.write(full)

        # Minimal environment: limited PATH, HOME pointed at /tmp.
        env = {"PATH": "/usr/bin:/usr/local/bin", "HOME": "/tmp"}
        result = subprocess.run(
            [sys.executable, fpath],
            capture_output=True, text=True,
            # Decode leniently so odd bytes in the child's output cannot turn
            # a finished run into a UnicodeDecodeError.
            encoding="utf-8", errors="replace",
            timeout=timeout,
            cwd="/tmp",
            env=env,
        )
        passed = result.returncode == 0
        output = result.stderr[:500] if result.stderr else result.stdout[:500]
        return passed, output
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    except Exception as e:
        return False, str(e)[:500]
    finally:
        try:
            os.unlink(fpath)
        except OSError:
            pass

def _load_cached_verdict(out_path: Path) -> "bool | None":
    """Return the pass/fail verdict cached at *out_path*, or None to re-run.

    None is returned when the file is missing or unreadable, is not a JSON
    object, or records only an API error (no ``choices``), so that a failed
    request is retried on resume instead of counting as a model failure.
    """
    try:
        cached = json.loads(out_path.read_text())
    except (OSError, ValueError):  # ValueError covers JSON and decoding errors
        return None
    if not isinstance(cached, dict):
        return None
    response = cached.get("response")
    if isinstance(response, dict) and "error" in response and "choices" not in response:
        return None
    return bool(cached.get("passed", False))


def _request_completion(client: OpenAI, model: str, prompt: str) -> dict:
    """Request one completion, retrying once after a 5 second pause.

    Returns the dumped response, or ``{"error": <message>}`` if both
    attempts raise.
    """
    last_error = ""
    for attempt in range(2):
        if attempt:
            time.sleep(5)
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": (
                    "Complete the following Python function. "
                    "Return ONLY the complete function implementation.\n\n"
                    + prompt
                )}],
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                seed=SEED,
            )
            return resp.model_dump()
        except Exception as e:  # boundary with the remote service
            last_error = str(e)
    return {"error": last_error}


def _response_text(resp_json: dict) -> str:
    """Return the first choice's text, or "" when the response has none.

    Falls back to ``reasoning_content`` when ``content`` is empty.
    """
    choices = resp_json.get("choices") or []
    if not choices:
        return ""
    msg = choices[0].get("message") or {}
    return msg.get("content") or msg.get("reasoning_content") or ""


def _result_row(model: str, question_id: str, passed: bool,
                raw_answer: str, latency_ms: float) -> dict:
    """Build one result record in the shape returned by ``run_humaneval``."""
    return {
        "model": model,
        "benchmark": "humaneval",
        "question_id": question_id,
        "correct": passed,
        "raw_answer": raw_answer,
        "parsed_answer": "pass" if passed else "fail",
        "expected": "pass",
        "latency_ms": latency_ms,
    }


def run_humaneval(model: str, client: OpenAI, problems: list[dict]) -> list[dict]:
    """Evaluate *model* on *problems* and return one result record per problem.

    For each problem the model is queried, code is extracted from the reply
    and run against the problem's tests, and a JSON file with the response,
    extracted code and verdict is written under
    ``RESULTS_DIR / model / "humaneval"``. Problems that already have a
    usable result file are not re-run; their record has an empty
    ``raw_answer`` and ``latency_ms`` of 0. Progress and the final score are
    printed to stderr.

    Args:
        model: Model name sent to the endpoint; also the results sub-directory.
        client: OpenAI-compatible client used for the requests.
        problems: Problem dicts with ``id``, ``prompt``, ``test`` and
            ``entry_point`` keys.

    Returns:
        A list of result dicts, in the order of *problems*.
    """
    model_dir = RESULTS_DIR / model / "humaneval"
    model_dir.mkdir(parents=True, exist_ok=True)

    results = []
    correct = 0
    skipped = 0

    progress = tqdm(problems, desc=f"  HumanEval {model}", file=sys.stderr)
    for done, p in enumerate(progress, start=1):
        out_path = model_dir / f"{p['id'].replace('/', '_')}.json"

        cached = _load_cached_verdict(out_path)
        if cached is not None:
            correct += cached
            skipped += 1
            results.append(_result_row(model, p["id"], cached, "", 0))
            continue

        # perf_counter is monotonic, so the latency cannot be skewed by
        # wall-clock adjustments; it includes the retry pause, if any.
        t0 = time.perf_counter()
        resp_json = _request_completion(client, model, p["prompt"])
        latency = (time.perf_counter() - t0) * 1000

        raw = _response_text(resp_json)
        code = extract_code(raw, p["prompt"])
        passed, exec_output = run_test(code, p["test"], p["entry_point"])
        if passed:
            correct += 1

        out_path.write_text(json.dumps({
            "response": resp_json,
            "extracted_code": code[:2000],
            "passed": passed,
            "exec_output": exec_output,
        }, indent=2, default=str))

        results.append(
            _result_row(model, p["id"], passed, raw[:200], round(latency, 1))
        )

        if done % 10 == 0:
            total = len(results)
            print(f"  [{model}] HumanEval {done}/{len(problems)} — {correct}/{total} ({correct/total*100:.0f}%)", file=sys.stderr)

    total = len(results)
    if skipped:
        print(f"  [{model}] HumanEval resumed: {skipped} cached, {total-skipped} new", file=sys.stderr)
    # Guard the percentage so an empty problem list does not divide by zero.
    accuracy = correct / total * 100 if total else 0.0
    print(f"  [{model}] HumanEval FINAL: {correct}/{total} ({accuracy:.1f}%)", file=sys.stderr)
    return results


if __name__ == "__main__":
    # The first command-line argument selects the model; otherwise the
    # default name below is used.
    cli_args = sys.argv[1:]
    model = cli_args[0] if cli_args else "qwen3.6-35b-a3b-mxfp4"

    client = OpenAI(base_url=ENDPOINT, api_key="dummy")
    problems = load_problems()

    # Emit one JSON object per line on stdout.
    for row in run_humaneval(model, client, problems):
        print(json.dumps(row))