Files
llama-sidecar/eval/ab/run.sh
indifferentketchup fe7f36ae98 llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with
LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port),
deterministic hash-keyed sidecar reuse. Windows service support via
schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx
decoupled child lifetime.

Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM
in JSON config, -fa → --flash-attn on default, child process exit after
one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED,
context.Background for child lifetime, background reaper goroutine).

bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks
automation to sam-desktop. Per-GGUF production flags from llama-swap
config with --ctx-size 32768 override.

eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) +
A/B model comparison (14 agent-typed prompts × 8 models). All scripts
resumable at individual question level.

94 Go tests, race detector clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-28 01:55:13 +00:00

243 lines
7.7 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ENDPOINT="http://100.101.41.16:8401/v1"
PROMPTS_FILE="${SCRIPT_DIR}/prompts.json"
RESULTS_DIR="${SCRIPT_DIR}/results"
COMPARE_FILE="${SCRIPT_DIR}/COMPARE.md"
TIMING_FILE="${SCRIPT_DIR}/timing.csv"
MODELS=(
qwen3.6-35b-a3b-mxfp4
qwen3-coder-30b-apex
qwen3.6-27b-mtp
qwopus3.5-4b-mtp
qwen3.5-9b-deepseek-v4-mtp
qwopus3.6-35b-a3b-v1
qwopus3.6-27b-v2-mtp
qwopus3.5-9b-coder-mtp
)
mkdir -p "$RESULTS_DIR"
# ── Parse prompts ─────────────────────────────────────────────────────
PROMPT_COUNT=$(python3 -c "import json; print(len(json.load(open('${PROMPTS_FILE}'))))")
TOTAL=$((PROMPT_COUNT * ${#MODELS[@]}))
EST_MIN=$(( TOTAL * 30 / 60 ))
echo "================================================================"
echo " A/B MODEL COMPARISON"
echo " ${PROMPT_COUNT} prompts × ${#MODELS[@]} models = ${TOTAL} requests"
echo " Estimated runtime: ~${EST_MIN} minutes"
echo " Endpoint: ${ENDPOINT}"
echo "================================================================"
echo ""
# ── Main loop: models (outer) × prompts (inner) ──────────────────────
# One model load per model, all prompts answered, then swap.
t_start=$(date +%s)
done_count=0
for model in "${MODELS[@]}"; do
echo ""
echo "================================================================"
echo " MODEL: ${model}"
echo "================================================================"
# Warmup: load the model with a trivial request
all_cached=true
for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
PID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
if [ ! -f "${RESULTS_DIR}/${PID}/${model}.json" ] || [ ! -s "${RESULTS_DIR}/${PID}/${model}.json" ]; then
all_cached=false
break
fi
done
if [ "$all_cached" = "true" ]; then
echo " All ${PROMPT_COUNT} prompts cached, skipping model"
for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
done_count=$((done_count + 1))
done
continue
fi
echo " Warming up..."
curl -s -X POST "${ENDPOINT}/chat/completions" \
-H "Content-Type: application/json" \
-d "{\"model\":\"${model}\",\"messages\":[{\"role\":\"user\",\"content\":\"Say OK.\"}],\"max_tokens\":10,\"temperature\":0}" \
--max-time 300 > /dev/null 2>&1
echo " Warm."
for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
PROMPT_ID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
AGENT=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['agent'])")
mkdir -p "${RESULTS_DIR}/${PROMPT_ID}"
OUT_JSON="${RESULTS_DIR}/${PROMPT_ID}/${model}.json"
OUT_MD="${RESULTS_DIR}/${PROMPT_ID}/${model}.md"
# Resume: skip if already done
if [ -f "$OUT_JSON" ] && [ -s "$OUT_JSON" ]; then
done_count=$((done_count + 1))
echo " [${PROMPT_ID}] cached (${done_count}/${TOTAL})"
continue
fi
BODY=$(python3 -c "
import json
p = json.load(open('${PROMPTS_FILE}'))[${pidx}]
print(json.dumps({
'model': '${model}',
'messages': [{'role': 'user', 'content': p['prompt']}],
'temperature': 0.6,
'max_tokens': 2048,
'seed': 42,
'stream': False
}))
")
SUCCESS=0
for attempt in 1 2; do
HTTP_CODE=$(curl -s -w '%{http_code}' -o "$OUT_JSON" \
--max-time 300 \
-X POST "${ENDPOINT}/chat/completions" \
-H "Content-Type: application/json" \
-d "$BODY" 2>/dev/null)
if [ "$HTTP_CODE" = "200" ]; then
SUCCESS=1
break
else
if [ "$attempt" = "1" ]; then
echo " [${PROMPT_ID}] HTTP ${HTTP_CODE}, retrying in 10s..."
sleep 10
else
echo "ERROR: HTTP ${HTTP_CODE}" > "$OUT_MD"
echo " [${PROMPT_ID}] FAILED (HTTP ${HTTP_CODE})"
fi
fi
done
if [ "$SUCCESS" = "1" ]; then
python3 -c "
import json
d = json.load(open('${OUT_JSON}'))
msg = d.get('choices', [{}])[0].get('message', {})
content = msg.get('content', '') or ''
reasoning = msg.get('reasoning_content', '') or ''
out = ''
if reasoning:
out += '<think>\n' + reasoning + '\n</think>\n\n'
out += content
open('${OUT_MD}', 'w').write(out)
" 2>/dev/null
done_count=$((done_count + 1))
METRICS=$(python3 -c "
import json
d = json.load(open('${OUT_JSON}'))
t = d.get('timings', {})
tps = t.get('predicted_per_second', 0)
tok = d.get('usage', {}).get('completion_tokens', 0)
print(f'{tps:.1f}tok/s {tok}tok')
" 2>/dev/null || echo "?")
echo " [${PROMPT_ID}] done (${METRICS}) [${done_count}/${TOTAL}]"
fi
sleep 2
done
done
# ── Generate COMPARE.md ──────────────────────────────────────────────
echo ""
echo "Generating COMPARE.md..."
MODELS_JSON=$(printf '%s\n' "${MODELS[@]}" | python3 -c "import json,sys; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))")
python3 -c "
import json
from pathlib import Path
prompts = json.load(open('${PROMPTS_FILE}'))
results_dir = Path('${RESULTS_DIR}')
models = json.loads('${MODELS_JSON}')
lines = ['# A/B Model Comparison\n']
timing_rows = []
for p in prompts:
pid = p['id']
agent = p['agent']
short = p['prompt'][:80]
lines.append(f'## [{pid}] {agent}\n')
lines.append(f'> {short}...\n')
for model in models:
md_path = results_dir / pid / f'{model}.md'
json_path = results_dir / pid / f'{model}.json'
lines.append(f'### {model}\n')
if md_path.exists():
content = md_path.read_text().strip()
lines.append(f'{content}\n')
else:
lines.append('*(no response)*\n')
if json_path.exists():
try:
d = json.loads(json_path.read_text())
t = d.get('timings', {})
u = d.get('usage', {})
timing_rows.append({
'prompt_id': pid,
'model_id': model,
'prompt_tps': t.get('prompt_per_second', 0),
'predicted_tps': t.get('predicted_per_second', 0),
'total_tokens': u.get('total_tokens', 0),
'latency_ms': round((t.get('prompt_ms', 0) or 0) + (t.get('predicted_ms', 0) or 0), 1),
})
except:
pass
lines.append('---\n')
# Timing table
lines.append('## Timing Summary\n')
pids = list(dict.fromkeys(r['prompt_id'] for r in timing_rows))
lines.append('| prompt | ' + ' | '.join(models) + ' |')
lines.append('|--------' + '|------' * len(models) + '|')
for pid in pids:
cells = []
for model in models:
match = [r for r in timing_rows if r['prompt_id'] == pid and r['model_id'] == model]
if match:
cells.append(f\"{match[0]['predicted_tps']:.0f}\")
else:
cells.append('—')
lines.append(f'| {pid} | ' + ' | '.join(cells) + ' |')
Path('${COMPARE_FILE}').write_text('\n'.join(lines) + '\n')
print(f'Wrote ${COMPARE_FILE}')
# timing.csv
import csv
with open('${TIMING_FILE}', 'w', newline='') as f:
w = csv.DictWriter(f, fieldnames=['prompt_id', 'model_id', 'prompt_tps', 'predicted_tps', 'total_tokens', 'latency_ms'])
w.writeheader()
w.writerows(timing_rows)
print(f'Wrote ${TIMING_FILE}')
"
t_end=$(date +%s)
elapsed=$(( t_end - t_start ))
echo ""
echo "================================================================"
echo " COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s"
echo " Results: ${RESULTS_DIR}/"
echo " Compare: ${COMPARE_FILE}"
echo " Timing: ${TIMING_FILE}"
echo "================================================================"