llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
242
eval/ab/run.sh
Executable file
242
eval/ab/run.sh
Executable file
@@ -0,0 +1,242 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
ENDPOINT="http://100.101.41.16:8401/v1"
|
||||
PROMPTS_FILE="${SCRIPT_DIR}/prompts.json"
|
||||
RESULTS_DIR="${SCRIPT_DIR}/results"
|
||||
COMPARE_FILE="${SCRIPT_DIR}/COMPARE.md"
|
||||
TIMING_FILE="${SCRIPT_DIR}/timing.csv"
|
||||
|
||||
MODELS=(
|
||||
qwen3.6-35b-a3b-mxfp4
|
||||
qwen3-coder-30b-apex
|
||||
qwen3.6-27b-mtp
|
||||
qwopus3.5-4b-mtp
|
||||
qwen3.5-9b-deepseek-v4-mtp
|
||||
qwopus3.6-35b-a3b-v1
|
||||
qwopus3.6-27b-v2-mtp
|
||||
qwopus3.5-9b-coder-mtp
|
||||
)
|
||||
|
||||
mkdir -p "$RESULTS_DIR"
|
||||
|
||||
# ── Parse prompts ─────────────────────────────────────────────────────
|
||||
|
||||
PROMPT_COUNT=$(python3 -c "import json; print(len(json.load(open('${PROMPTS_FILE}'))))")
|
||||
TOTAL=$((PROMPT_COUNT * ${#MODELS[@]}))
|
||||
EST_MIN=$(( TOTAL * 30 / 60 ))
|
||||
|
||||
echo "================================================================"
|
||||
echo " A/B MODEL COMPARISON"
|
||||
echo " ${PROMPT_COUNT} prompts × ${#MODELS[@]} models = ${TOTAL} requests"
|
||||
echo " Estimated runtime: ~${EST_MIN} minutes"
|
||||
echo " Endpoint: ${ENDPOINT}"
|
||||
echo "================================================================"
|
||||
echo ""
|
||||
|
||||
# ── Main loop: models (outer) × prompts (inner) ──────────────────────
|
||||
# One model load per model, all prompts answered, then swap.
|
||||
|
||||
t_start=$(date +%s)
|
||||
done_count=0
|
||||
|
||||
for model in "${MODELS[@]}"; do
|
||||
echo ""
|
||||
echo "================================================================"
|
||||
echo " MODEL: ${model}"
|
||||
echo "================================================================"
|
||||
|
||||
# Warmup: load the model with a trivial request
|
||||
all_cached=true
|
||||
for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
|
||||
PID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
|
||||
if [ ! -f "${RESULTS_DIR}/${PID}/${model}.json" ] || [ ! -s "${RESULTS_DIR}/${PID}/${model}.json" ]; then
|
||||
all_cached=false
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$all_cached" = "true" ]; then
|
||||
echo " All ${PROMPT_COUNT} prompts cached, skipping model"
|
||||
for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
|
||||
done_count=$((done_count + 1))
|
||||
done
|
||||
continue
|
||||
fi
|
||||
|
||||
echo " Warming up..."
|
||||
curl -s -X POST "${ENDPOINT}/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"model\":\"${model}\",\"messages\":[{\"role\":\"user\",\"content\":\"Say OK.\"}],\"max_tokens\":10,\"temperature\":0}" \
|
||||
--max-time 300 > /dev/null 2>&1
|
||||
echo " Warm."
|
||||
|
||||
for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do
|
||||
PROMPT_ID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])")
|
||||
AGENT=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['agent'])")
|
||||
|
||||
mkdir -p "${RESULTS_DIR}/${PROMPT_ID}"
|
||||
OUT_JSON="${RESULTS_DIR}/${PROMPT_ID}/${model}.json"
|
||||
OUT_MD="${RESULTS_DIR}/${PROMPT_ID}/${model}.md"
|
||||
|
||||
# Resume: skip if already done
|
||||
if [ -f "$OUT_JSON" ] && [ -s "$OUT_JSON" ]; then
|
||||
done_count=$((done_count + 1))
|
||||
echo " [${PROMPT_ID}] cached (${done_count}/${TOTAL})"
|
||||
continue
|
||||
fi
|
||||
|
||||
BODY=$(python3 -c "
|
||||
import json
|
||||
p = json.load(open('${PROMPTS_FILE}'))[${pidx}]
|
||||
print(json.dumps({
|
||||
'model': '${model}',
|
||||
'messages': [{'role': 'user', 'content': p['prompt']}],
|
||||
'temperature': 0.6,
|
||||
'max_tokens': 2048,
|
||||
'seed': 42,
|
||||
'stream': False
|
||||
}))
|
||||
")
|
||||
|
||||
SUCCESS=0
|
||||
for attempt in 1 2; do
|
||||
HTTP_CODE=$(curl -s -w '%{http_code}' -o "$OUT_JSON" \
|
||||
--max-time 300 \
|
||||
-X POST "${ENDPOINT}/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$BODY" 2>/dev/null)
|
||||
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
SUCCESS=1
|
||||
break
|
||||
else
|
||||
if [ "$attempt" = "1" ]; then
|
||||
echo " [${PROMPT_ID}] HTTP ${HTTP_CODE}, retrying in 10s..."
|
||||
sleep 10
|
||||
else
|
||||
echo "ERROR: HTTP ${HTTP_CODE}" > "$OUT_MD"
|
||||
echo " [${PROMPT_ID}] FAILED (HTTP ${HTTP_CODE})"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$SUCCESS" = "1" ]; then
|
||||
python3 -c "
|
||||
import json
|
||||
d = json.load(open('${OUT_JSON}'))
|
||||
msg = d.get('choices', [{}])[0].get('message', {})
|
||||
content = msg.get('content', '') or ''
|
||||
reasoning = msg.get('reasoning_content', '') or ''
|
||||
out = ''
|
||||
if reasoning:
|
||||
out += '<think>\n' + reasoning + '\n</think>\n\n'
|
||||
out += content
|
||||
open('${OUT_MD}', 'w').write(out)
|
||||
" 2>/dev/null
|
||||
done_count=$((done_count + 1))
|
||||
METRICS=$(python3 -c "
|
||||
import json
|
||||
d = json.load(open('${OUT_JSON}'))
|
||||
t = d.get('timings', {})
|
||||
tps = t.get('predicted_per_second', 0)
|
||||
tok = d.get('usage', {}).get('completion_tokens', 0)
|
||||
print(f'{tps:.1f}tok/s {tok}tok')
|
||||
" 2>/dev/null || echo "?")
|
||||
echo " [${PROMPT_ID}] done (${METRICS}) [${done_count}/${TOTAL}]"
|
||||
fi
|
||||
|
||||
sleep 2
|
||||
done
|
||||
done
|
||||
|
||||
# ── Generate COMPARE.md ──────────────────────────────────────────────
|
||||
|
||||
echo ""
|
||||
echo "Generating COMPARE.md..."
|
||||
|
||||
MODELS_JSON=$(printf '%s\n' "${MODELS[@]}" | python3 -c "import json,sys; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))")
|
||||
|
||||
python3 -c "
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
prompts = json.load(open('${PROMPTS_FILE}'))
|
||||
results_dir = Path('${RESULTS_DIR}')
|
||||
models = json.loads('${MODELS_JSON}')
|
||||
|
||||
lines = ['# A/B Model Comparison\n']
|
||||
|
||||
timing_rows = []
|
||||
|
||||
for p in prompts:
|
||||
pid = p['id']
|
||||
agent = p['agent']
|
||||
short = p['prompt'][:80]
|
||||
lines.append(f'## [{pid}] {agent}\n')
|
||||
lines.append(f'> {short}...\n')
|
||||
|
||||
for model in models:
|
||||
md_path = results_dir / pid / f'{model}.md'
|
||||
json_path = results_dir / pid / f'{model}.json'
|
||||
lines.append(f'### {model}\n')
|
||||
if md_path.exists():
|
||||
content = md_path.read_text().strip()
|
||||
lines.append(f'{content}\n')
|
||||
else:
|
||||
lines.append('*(no response)*\n')
|
||||
|
||||
if json_path.exists():
|
||||
try:
|
||||
d = json.loads(json_path.read_text())
|
||||
t = d.get('timings', {})
|
||||
u = d.get('usage', {})
|
||||
timing_rows.append({
|
||||
'prompt_id': pid,
|
||||
'model_id': model,
|
||||
'prompt_tps': t.get('prompt_per_second', 0),
|
||||
'predicted_tps': t.get('predicted_per_second', 0),
|
||||
'total_tokens': u.get('total_tokens', 0),
|
||||
'latency_ms': round((t.get('prompt_ms', 0) or 0) + (t.get('predicted_ms', 0) or 0), 1),
|
||||
})
|
||||
except:
|
||||
pass
|
||||
lines.append('---\n')
|
||||
|
||||
# Timing table
|
||||
lines.append('## Timing Summary\n')
|
||||
pids = list(dict.fromkeys(r['prompt_id'] for r in timing_rows))
|
||||
lines.append('| prompt | ' + ' | '.join(models) + ' |')
|
||||
lines.append('|--------' + '|------' * len(models) + '|')
|
||||
for pid in pids:
|
||||
cells = []
|
||||
for model in models:
|
||||
match = [r for r in timing_rows if r['prompt_id'] == pid and r['model_id'] == model]
|
||||
if match:
|
||||
cells.append(f\"{match[0]['predicted_tps']:.0f}\")
|
||||
else:
|
||||
cells.append('—')
|
||||
lines.append(f'| {pid} | ' + ' | '.join(cells) + ' |')
|
||||
|
||||
Path('${COMPARE_FILE}').write_text('\n'.join(lines) + '\n')
|
||||
print(f'Wrote ${COMPARE_FILE}')
|
||||
|
||||
# timing.csv
|
||||
import csv
|
||||
with open('${TIMING_FILE}', 'w', newline='') as f:
|
||||
w = csv.DictWriter(f, fieldnames=['prompt_id', 'model_id', 'prompt_tps', 'predicted_tps', 'total_tokens', 'latency_ms'])
|
||||
w.writeheader()
|
||||
w.writerows(timing_rows)
|
||||
print(f'Wrote ${TIMING_FILE}')
|
||||
"
|
||||
|
||||
t_end=$(date +%s)
|
||||
elapsed=$(( t_end - t_start ))
|
||||
echo ""
|
||||
echo "================================================================"
|
||||
echo " COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s"
|
||||
echo " Results: ${RESULTS_DIR}/"
|
||||
echo " Compare: ${COMPARE_FILE}"
|
||||
echo " Timing: ${TIMING_FILE}"
|
||||
echo "================================================================"
|
||||
Reference in New Issue
Block a user