llama-sidecar/bench/bench.sh

#!/usr/bin/env bash
set -euo pipefail

ENDPOINT="http://100.101.41.16:8650"
SSH_HOST="samki@100.101.41.16"
TASK_NAME="bench_llama"
BAT_PATH='%TEMP%\bench_run.bat'
RESULTS_DIR="$(cd "$(dirname "$0")" && pwd)/results"
PROMPTS_DIR="$(cd "$(dirname "$0")" && pwd)/prompts"
MAX_TOKENS=200
HEALTH_TIMEOUT=120
LLAMA_BIN='D:\llama-server\llama-server.exe'

mkdir -p "$RESULTS_DIR"

# ── Config matrix: STEM|MTP_STATE|FULL_ARGS ───────────────────────────

CONFIGS=(
'Qwen3.6-35B-A3B-MXFP4_MOE|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwen3.6-35B-A3B-MXFP4_MOE|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-35B-A3B-MXFP4_MOE.gguf --mmproj D:\models\Qwen3.6-35B-A3B-MXFP4_MOE\mmproj.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwen3.6-27B-Q6_K|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwen3.6-27B-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.6-27B-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.5-4B-v3-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.5-4B-v3-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-4B-v3-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwen3.5-9B-DeepSeek-V4-Flash-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-35B-A3B-v1-MTP-MXFP4_MOE_BF16.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 2048 --parallel 1 --batch-size 4096 --ubatch-size 1024 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.6-27B-v2-MTP-Q6_K|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.6-27B-v2-MTP-Q6_K|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.6-27B-v2-MTP-Q6_K.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q4_0 --cache-type-v q4_0 --jinja --chat-template-file D:\models\qwen3.6.jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.5-9B-Coder-MTP-Q8_0|off|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'

'Qwopus3.5-9B-Coder-MTP-Q8_0|on|--host 0.0.0.0 --port 8650 -m D:\models\Qwopus3.5-9B-Coder-MTP-Q8_0.gguf -ngl 99 --ctx-size 32768 --flash-attn on --cont-batching --cache-type-k q8_0 --cache-type-v q8_0 --jinja --keep -1 --cache-reuse 1024 --parallel 1 --batch-size 2048 --ubatch-size 512 --threads 8 --no-mmap --mlock --seed 42 --spec-type draft-mtp --spec-draft-n-max 2 --temp 0.4 --top-p 0.8 --top-k 20 --min-p 0.0 --repeat-penalty 1.0'
)

PROMPT_LENS=(256 1024 4096)

# ── Helper functions ──────────────────────────────────────────────────

kill_bench_server() {
  local pids
  pids=$(ssh "$SSH_HOST" 'for /f "tokens=5" %a in ('"'"'netstat -aon ^| findstr :8650 ^| findstr LISTENING'"'"') do @echo %a' 2>/dev/null || true)
  for pid in $pids; do
    if [ -n "$pid" ] && [ "$pid" != "0" ]; then
      ssh "$SSH_HOST" "taskkill /F /PID $pid" 2>/dev/null || true
    fi
  done
  ssh "$SSH_HOST" "schtasks /Delete /TN ${TASK_NAME} /F" 2>/dev/null || true
  sleep 3
}

start_bench_server() {
  local args="$1"
  # Write a batch file, then run it via schtasks
  ssh "$SSH_HOST" "echo ${LLAMA_BIN} ${args} > ${BAT_PATH}" 2>/dev/null
  ssh "$SSH_HOST" "schtasks /Create /TN ${TASK_NAME} /TR ${BAT_PATH} /SC ONCE /ST 00:00 /F /RL HIGHEST" 2>/dev/null
  ssh "$SSH_HOST" "schtasks /Run /TN ${TASK_NAME}" 2>/dev/null
}

poll_health() {
  local elapsed=0
  while [ $elapsed -lt $HEALTH_TIMEOUT ]; do
    if curl -sf "${ENDPOINT}/health" >/dev/null 2>&1; then
      echo "  health OK (${elapsed}s)"
      return 0
    fi
    sleep 3
    elapsed=$((elapsed + 3))
    if [ $((elapsed % 15)) -eq 0 ]; then
      echo "  waiting... (${elapsed}s)"
    fi
  done
  echo "  HEALTH TIMEOUT after ${HEALTH_TIMEOUT}s"
  return 1
}

send_request() {
  local prompt_file="$1"
  local output_file="$2"
  local body
  body=$(python3 -c "
import json
prompt = open('${prompt_file}').read()
print(json.dumps({
    'messages': [{'role': 'user', 'content': prompt}],
    'max_tokens': ${MAX_TOKENS},
    'temperature': 0,
    'seed': 42,
    'stream': False
}))
")
  local http_code
  http_code=$(curl -s -w '%{http_code}' -o "$output_file" \
    --max-time 300 \
    -X POST "${ENDPOINT}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d "$body" 2>/dev/null)
  if [ "$http_code" != "200" ]; then
    echo "HTTP ${http_code}"
    return 1
  fi
  return 0
}

print_metrics() {
  python3 -c "
import json
d = json.load(open('${1}'))
t = d.get('timings', {})
ptps = t.get('prompt_per_second', 0)
etps = t.get('predicted_per_second', 0)
dn = t.get('draft_n', '')
da = t.get('draft_n_accepted', '')
draft = ''
if dn != '':
    draft = f'  draft={da}/{dn}'
print(f'prompt={ptps:.1f}  eval={etps:.1f} tok/s{draft}')
" 2>/dev/null || echo "(parse error)"
}

# ── Main ──────────────────────────────────────────────────────────────

total=${#CONFIGS[@]}
echo "================================================================"
echo "  MTP ON/OFF BENCHMARK SWEEP"
echo "  ${total} configs x 3 prompts x 3 runs"
echo "  Endpoint: ${ENDPOINT}"
echo "================================================================"

t_start=$(date +%s)
config_idx=0

for config_entry in "${CONFIGS[@]}"; do
  config_idx=$((config_idx + 1))
  IFS='|' read -r stem mtp_state args <<< "$config_entry"

  echo ""
  echo "================================================================"
  echo "  [${config_idx}/${total}] ${stem}  MTP=${mtp_state}"
  echo "================================================================"

  kill_bench_server
  echo "  Starting llama-server..."
  start_bench_server "$args"

  if ! poll_health; then
    echo "  SKIPPING"
    kill_bench_server
    continue
  fi

  for len in "${PROMPT_LENS[@]}"; do
    prompt_file="${PROMPTS_DIR}/p${len}.txt"
    [ -f "$prompt_file" ] || { echo "  Missing p${len}.txt"; continue; }
    echo "  -- p${len} --"
    for run in 1 2 3; do
      outfile="${RESULTS_DIR}/${stem}__mtp-${mtp_state}__len${len}__run${run}.json"
      printf "    run %d: " "$run"
      if send_request "$prompt_file" "$outfile"; then
        print_metrics "$outfile"
      fi
      sleep 1
    done
  done

  echo "  Killing..."
  kill_bench_server
done

t_end=$(date +%s)
elapsed=$(( t_end - t_start ))
echo ""
echo "================================================================"
echo "  SWEEP COMPLETE in $(( elapsed / 60 ))m $(( elapsed % 60 ))s"
echo "  Run: python3 $(dirname "$0")/analyze.py"
echo "================================================================"