#!/usr/bin/env bash set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" ENDPOINT="http://100.101.41.16:8401/v1" PROMPTS_FILE="${SCRIPT_DIR}/prompts.json" RESULTS_DIR="${SCRIPT_DIR}/results" COMPARE_FILE="${SCRIPT_DIR}/COMPARE.md" TIMING_FILE="${SCRIPT_DIR}/timing.csv" MODELS=( qwen3.6-35b-a3b-mxfp4 qwen3-coder-30b-apex qwen3.6-27b-mtp qwopus3.5-4b-mtp qwen3.5-9b-deepseek-v4-mtp qwopus3.6-35b-a3b-v1 qwopus3.6-27b-v2-mtp qwopus3.5-9b-coder-mtp ) mkdir -p "$RESULTS_DIR" # ── Parse prompts ───────────────────────────────────────────────────── PROMPT_COUNT=$(python3 -c "import json; print(len(json.load(open('${PROMPTS_FILE}'))))") TOTAL=$((PROMPT_COUNT * ${#MODELS[@]})) EST_MIN=$(( TOTAL * 30 / 60 )) echo "================================================================" echo " A/B MODEL COMPARISON" echo " ${PROMPT_COUNT} prompts × ${#MODELS[@]} models = ${TOTAL} requests" echo " Estimated runtime: ~${EST_MIN} minutes" echo " Endpoint: ${ENDPOINT}" echo "================================================================" echo "" # ── Main loop: models (outer) × prompts (inner) ────────────────────── # One model load per model, all prompts answered, then swap. t_start=$(date +%s) done_count=0 for model in "${MODELS[@]}"; do echo "" echo "================================================================" echo " MODEL: ${model}" echo "================================================================" # Warmup: load the model with a trivial request all_cached=true for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do PID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])") if [ ! -f "${RESULTS_DIR}/${PID}/${model}.json" ] || [ ! -s "${RESULTS_DIR}/${PID}/${model}.json" ]; then all_cached=false break fi done if [ "$all_cached" = "true" ]; then echo " All ${PROMPT_COUNT} prompts cached, skipping model" for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do done_count=$((done_count + 1)) done continue fi echo " Warming up..." curl -s -X POST "${ENDPOINT}/chat/completions" \ -H "Content-Type: application/json" \ -d "{\"model\":\"${model}\",\"messages\":[{\"role\":\"user\",\"content\":\"Say OK.\"}],\"max_tokens\":10,\"temperature\":0}" \ --max-time 300 > /dev/null 2>&1 echo " Warm." for pidx in $(seq 0 $((PROMPT_COUNT - 1))); do PROMPT_ID=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['id'])") AGENT=$(python3 -c "import json; print(json.load(open('${PROMPTS_FILE}'))[${pidx}]['agent'])") mkdir -p "${RESULTS_DIR}/${PROMPT_ID}" OUT_JSON="${RESULTS_DIR}/${PROMPT_ID}/${model}.json" OUT_MD="${RESULTS_DIR}/${PROMPT_ID}/${model}.md" # Resume: skip if already done if [ -f "$OUT_JSON" ] && [ -s "$OUT_JSON" ]; then done_count=$((done_count + 1)) echo " [${PROMPT_ID}] cached (${done_count}/${TOTAL})" continue fi BODY=$(python3 -c " import json p = json.load(open('${PROMPTS_FILE}'))[${pidx}] print(json.dumps({ 'model': '${model}', 'messages': [{'role': 'user', 'content': p['prompt']}], 'temperature': 0.6, 'max_tokens': 2048, 'seed': 42, 'stream': False })) ") SUCCESS=0 for attempt in 1 2; do HTTP_CODE=$(curl -s -w '%{http_code}' -o "$OUT_JSON" \ --max-time 300 \ -X POST "${ENDPOINT}/chat/completions" \ -H "Content-Type: application/json" \ -d "$BODY" 2>/dev/null) if [ "$HTTP_CODE" = "200" ]; then SUCCESS=1 break else if [ "$attempt" = "1" ]; then echo " [${PROMPT_ID}] HTTP ${HTTP_CODE}, retrying in 10s..." sleep 10 else echo "ERROR: HTTP ${HTTP_CODE}" > "$OUT_MD" echo " [${PROMPT_ID}] FAILED (HTTP ${HTTP_CODE})" fi fi done if [ "$SUCCESS" = "1" ]; then python3 -c " import json d = json.load(open('${OUT_JSON}')) msg = d.get('choices', [{}])[0].get('message', {}) content = msg.get('content', '') or '' reasoning = msg.get('reasoning_content', '') or '' out = '' if reasoning: out += '\n' + reasoning + '\n\n\n' out += content open('${OUT_MD}', 'w').write(out) " 2>/dev/null done_count=$((done_count + 1)) METRICS=$(python3 -c " import json d = json.load(open('${OUT_JSON}')) t = d.get('timings', {}) tps = t.get('predicted_per_second', 0) tok = d.get('usage', {}).get('completion_tokens', 0) print(f'{tps:.1f}tok/s {tok}tok') " 2>/dev/null || echo "?") echo " [${PROMPT_ID}] done (${METRICS}) [${done_count}/${TOTAL}]" fi sleep 2 done done # ── Generate COMPARE.md ────────────────────────────────────────────── echo "" echo "Generating COMPARE.md..." MODELS_JSON=$(printf '%s\n' "${MODELS[@]}" | python3 -c "import json,sys; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))") python3 -c " import json from pathlib import Path prompts = json.load(open('${PROMPTS_FILE}')) results_dir = Path('${RESULTS_DIR}') models = json.loads('${MODELS_JSON}') lines = ['# A/B Model Comparison\n'] timing_rows = [] for p in prompts: pid = p['id'] agent = p['agent'] short = p['prompt'][:80] lines.append(f'## [{pid}] {agent}\n') lines.append(f'> {short}...\n') for model in models: md_path = results_dir / pid / f'{model}.md' json_path = results_dir / pid / f'{model}.json' lines.append(f'### {model}\n') if md_path.exists(): content = md_path.read_text().strip() lines.append(f'{content}\n') else: lines.append('*(no response)*\n') if json_path.exists(): try: d = json.loads(json_path.read_text()) t = d.get('timings', {}) u = d.get('usage', {}) timing_rows.append({ 'prompt_id': pid, 'model_id': model, 'prompt_tps': t.get('prompt_per_second', 0), 'predicted_tps': t.get('predicted_per_second', 0), 'total_tokens': u.get('total_tokens', 0), 'latency_ms': round((t.get('prompt_ms', 0) or 0) + (t.get('predicted_ms', 0) or 0), 1), }) except: pass lines.append('---\n') # Timing table lines.append('## Timing Summary\n') pids = list(dict.fromkeys(r['prompt_id'] for r in timing_rows)) lines.append('| prompt | ' + ' | '.join(models) + ' |') lines.append('|--------' + '|------' * len(models) + '|') for pid in pids: cells = [] for model in models: match = [r for r in timing_rows if r['prompt_id'] == pid and r['model_id'] == model] if match: cells.append(f\"{match[0]['predicted_tps']:.0f}\") else: cells.append('—') lines.append(f'| {pid} | ' + ' | '.join(cells) + ' |') Path('${COMPARE_FILE}').write_text('\n'.join(lines) + '\n') print(f'Wrote ${COMPARE_FILE}') # timing.csv import csv with open('${TIMING_FILE}', 'w', newline='') as f: w = csv.DictWriter(f, fieldnames=['prompt_id', 'model_id', 'prompt_tps', 'predicted_tps', 'total_tokens', 'latency_ms']) w.writeheader() w.writerows(timing_rows) print(f'Wrote ${TIMING_FILE}') " t_end=$(date +%s) elapsed=$(( t_end - t_start )) echo "" echo "================================================================" echo " COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s" echo " Results: ${RESULTS_DIR}/" echo " Compare: ${COMPARE_FILE}" echo " Timing: ${TIMING_FILE}" echo "================================================================"