chore: snapshot working tree - pty_exited notifications + in-flight inference WIP

feat(booterm): structured pty_exited WS notifications. Plan-validated, impl-validated, code-reviewed green (contracts build clean, contracts test 29/29, booterm + web typecheck clean). wip: in-progress inference/provider refactor (agents.ts, provider.ts, new llama-providers.ts, removed llama-args-validator), plus arena, dispatcher, compaction, schema changes. openspec: pty-exit-notifications complete; x-agent-flags planned (not yet implemented).
2026-06-14 12:48:47 +00:00
parent 0ed506f1da
commit b18de2a331
204 changed files with 25344 additions and 867 deletions
--- a/apps/control/src/config.ts
+++ b/apps/control/src/config.ts
@@ -0,0 +1,29 @@
+import { z } from 'zod';
+
+const schema = z.object({ // env-var schema; loadConfig() parses process.env against it
+  NODE_ENV: z.enum(['development', 'production']).default('production'),
+  PORT: z.coerce.number().default(9503), // listen port handed to app.listen()
+  HOST: z.string().default('100.114.205.53'), // NOTE(review): default is a fixed non-loopback IPv4 address — confirm it is meant to ship as the default
+  DATABASE_URL: z.string(), // required: the only key without a default or .optional()
+  LOG_LEVEL: z.enum(['fatal', 'error', 'warn', 'info', 'debug', 'trace']).default('info'), // becomes the Fastify logger level
+  RETENTION_RAW_HOURS: z.coerce.number().default(48), // presumably consumed via buildRetentionConfig(config) — confirm
+  RETENTION_ROLLUP_DAYS: z.coerce.number().default(90), // presumably consumed via buildRetentionConfig(config) — confirm
+  CAPTURE_SIZE_KB: z.coerce.number().default(256), // size limit passed to trimCapture() for each captured request
+  CAPTURE_BUDGET_MB: z.coerce.number().default(50),
+  LLAMA_PROVIDERS_PATH: z.string().optional(), // passed to loadLlamaProviders() together with LLAMA_SWAP_URL
+  LLAMA_SWAP_URL: z.string().default('http://localhost:8080'),
+  // P9.1: path to the llama-swap config-schema.json (fork). Defaults to the
+  // copy bundled under dist/data; override to point at the live fork schema.
+  LLAMA_CONFIG_SCHEMA_PATH: z.string().optional(),
+});
+
+export type Config = z.infer<typeof schema>;
+
+/** Validate process.env against `schema` and return the typed result. */
+export function loadConfig(): Config {
+  const parsed = schema.safeParse(process.env);
+  if (parsed.success) return parsed.data;
+  // Invalid environment: report the zod message and stop the process with exit code 1.
+  console.error('Invalid env:', parsed.error.message);
+  process.exit(1);
+}
--- a/apps/control/src/db.ts
+++ b/apps/control/src/db.ts
@@ -0,0 +1,67 @@
+import postgres from 'postgres';
+import { readFile } from 'node:fs/promises';
+import { fileURLToPath } from 'node:url';
+import { dirname, resolve } from 'node:path';
+import type { Config } from './config.js';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+export type Sql = ReturnType<typeof postgres>;
+
+let sqlInstance: Sql | null = null;
+
+/** Return the process-wide postgres client, creating it from `config.DATABASE_URL` on first use. */
+export function getSql(config: Config): Sql {
+  if (sqlInstance === null) {
+    sqlInstance = postgres(config.DATABASE_URL, {
+      max: 10, idle_timeout: 30, connect_timeout: 10,
+      onnotice: () => {}, // notices are handed to a no-op callback
+    });
+  }
+  return sqlInstance;
+}
+
+/**
+ * Wait until `tableName` exists in the `public` schema, polling information_schema.tables.
+ * The pause between polls doubles for every full second elapsed (100ms base, 2000ms cap).
+ * Throws a timeout Error once `timeoutMs` has passed (original note: systemd Restart=on-failure retries).
+ */
+export async function waitForTable(sql: Sql, tableName: string, timeoutMs: number): Promise<void> {
+  const startedAt = Date.now();
+  const elapsedMs = (): number => Date.now() - startedAt;
+  for (;;) {
+    const found = await sql<{ table_name: string }[]>`
+      SELECT table_name FROM information_schema.tables
+      WHERE table_schema = 'public' AND table_name = ${tableName}
+    `;
+    if (found.length !== 0) return;
+    if (elapsedMs() >= timeoutMs) {
+      throw new Error(`timeout waiting for table '${tableName}' after ${timeoutMs}ms`);
+    }
+    const pauseMs = Math.min(2000, 100 * 2 ** Math.floor(elapsedMs() / 1000));
+    await new Promise((wake) => setTimeout(wake, pauseMs));
+  }
+}
+
+/** Read `schema.sql` from this module's directory and hand its full text to `sql.unsafe`. */
+export async function applySchema(sql: Sql): Promise<void> {
+  const ddl = await readFile(resolve(__dirname, 'schema.sql'), 'utf8');
+  await sql.unsafe(ddl);
+}
+
+/**
+ * Liveness probe: true when `SELECT 1` succeeds, false when it throws (error discarded).
+ */
+export async function pingDb(sql: Sql): Promise<boolean> {
+  let reachable = true;
+  try { await sql`SELECT 1`; } catch { reachable = false; }
+  return reachable;
+}
+
+/** End the cached client (with `timeout: 5`) and clear it, so a later getSql() builds a new one. */
+export async function closeDb(): Promise<void> {
+  if (sqlInstance === null) return;
+  await sqlInstance.end({ timeout: 5 });
+  sqlInstance = null;
+}
--- a/apps/control/src/index.ts
+++ b/apps/control/src/index.ts
@@ -0,0 +1,624 @@
+import Fastify from 'fastify';
+import fastifyWebsocket from '@fastify/websocket';
+import { loadConfig } from './config.js';
+import { getSql, applySchema, pingDb, waitForTable } from './db.js';
+import type { FleetState, HostState } from './services/fleet-state.js';
+import { createFleetState, ensureHostState, stampLastSeen, incrementSeq } from './services/fleet-state.js';
+import { registerControlWebSocket } from './routes/ws.js';
+import type { LlamaSweepSSEEvent, MetricsEntry } from './services/fleet-connector.js';
+import { startFleetConnector } from './services/fleet-connector.js';
+import { buildRetentionConfig, runRollup, pruneRawSamples, pruneActivity, pruneModelEvents, trimCapture, parseCaptureJson } from './services/retention.js';
+import { detectGap } from './services/reconcile.js';
+import { jsonbObject } from './services/jsonb.js';
+import { ActionQueue } from './services/action-queue.js';
+import { LogRelay } from './services/log-relay.js';
+import { registerActionRoutes } from './routes/actions.js';
+import { registerCaptureRoutes } from './routes/captures.js';
+import { registerBenchRoutes, setBenchApp } from './routes/bench.js';
+import { registerPlaygroundRoutes } from './routes/playground.js';
+import { registerEvalRoutes } from './routes/evals.js';
+import { registerRoutingRoutes } from './routes/routing.js';
+import { registerReportRoutes, startReportScheduler } from './routes/reports.js';
+import { registerGatewayRoutes } from './routes/gateway.js';
+import { registerPolicyRoutes } from './routes/policies.js';
+import { registerSshConfigRoutes } from './routes/ssh-config.js';
+import { loadLlamaProviders, getLlamaProviders, resolveProviderBaseUrl } from './services/llama-providers.js';
+
+// ─── delta emitter (B3 fix) ─────────────────────────────────────────────────
+
+/** Receives every delta handed to `publish`. */
+export type DeltaCallback = (delta: unknown) => void;
+/** In-process fan-out; `subscribe` returns the function that removes the callback again. */
+export type DeltaEmitter = {
+  subscribe(cb: DeltaCallback): () => void;
+  publish(delta: unknown): void;
+};
+
+/** Build an emitter over a Set (a callback registered twice is held once); a throwing subscriber is ignored. */
+export function createDeltaEmitter(): DeltaEmitter {
+  const subscribers = new Set<DeltaCallback>();
+  const subscribe = (cb: DeltaCallback): (() => void) => {
+    subscribers.add(cb);
+    return () => { subscribers.delete(cb); };
+  };
+  const publish = (delta: unknown): void => {
+    subscribers.forEach((notify) => { try { notify(delta); } catch { /* ignore emitter errors */ } });
+  };
+  return { subscribe, publish };
+}
+
+// ─── metrics entry field-name mapper ─────────────────────────────────────────
+// Real /api/metrics shape has nested tokens and different field names:
+//   {id, timestamp, model, req_path, resp_status_code, tokens:{...}, duration_ms, has_capture}
+// Map to the column names used in control_requests.
+
+/** control_requests column values for one request, keyed by column name. */
+interface MappedMetricsEntry {
+  id: number;
+  ts: string;
+  model: string;
+  req_path: string;
+  status_code: number;
+  duration_ms: number;
+  cache_tokens: number;
+  input_tokens: number;
+  output_tokens: number;
+  prompt_tps: number;
+  gen_tps: number;
+  has_capture: boolean;
+  source: string | null; // P4: NULL for ring data — ActivityLogEntry does not carry request headers
+}
+
+/**
+ * Flatten one /api/metrics entry: hoist the nested `tokens` counters and rename fields to column names.
+ */
+function mapMetricsEntry(entry: MetricsEntry): MappedMetricsEntry {
+  const { id, model, req_path, duration_ms, has_capture, tokens } = entry;
+  return {
+    id, ts: entry.timestamp, model, req_path,
+    status_code: entry.resp_status_code,
+    duration_ms,
+    cache_tokens: tokens.cache_tokens,
+    input_tokens: tokens.input_tokens,
+    output_tokens: tokens.output_tokens,
+    prompt_tps: tokens.prompt_per_second,
+    gen_tps: tokens.tokens_per_second,
+    has_capture,
+    source: null, // P4: always NULL here (no request headers available)
+  };
+}
+
+// ─── SSE event handlers (B5 fix: await onEvent; B2 fix: incrementSeq) ───────
+/**
+ * Apply one llama-swap SSE event for `providerId`: refresh the in-memory host state, persist model
+ * transitions and request rows, and publish the matching deltas via `emitter`. Rejects if an insert fails.
+ */
+export async function handleLlamaSweepEvent(
+  fleet: FleetState,
+  sql: ReturnType<typeof getSql>,
+  config: ReturnType<typeof loadConfig>,
+  providerId: string,
+  emitter: DeltaEmitter,
+  event: LlamaSweepSSEEvent,
+  logRelay: LogRelay | null = null,
+): Promise<void> {
+  const state = ensureHostState(fleet, providerId);
+  stampLastSeen(state);
+
+  switch (event.type) {
+    case 'modelStatus': {
+      // Real payload: FULL-FLEET array of {id, state, ...} (fork apiModel).
+      // Derive transitions by diffing against current state; persist only changes.
+      state.liveness = 'connected';
+      const changed: Array<{ model: string; state: string }> = [];
+      for (const m of event.data) {
+        const prev = state.models.get(m.id);
+        if (!prev || prev.state !== m.state) {
+          changed.push({ model: m.id, state: m.state });
+        }
+        state.models.set(m.id, {
+          model: m.id,
+          state: m.state,
+          ts: new Date(),
+          ttlDeadline: prev?.ttlDeadline ?? null,
+          inflight: prev?.inflight ?? 0,
+        });
+      }
+      if (changed.length === 0) break;
+      const seq = incrementSeq(state);
+      for (const c of changed) {
+        await sql`
+          INSERT INTO control_model_events (provider_id, model, state, ts, detail)
+          VALUES (${providerId}, ${c.model}, ${c.state}, clock_timestamp(), ${sql.json({} as never)})
+          ON CONFLICT (provider_id, model, state, ts) DO NOTHING
+        `;
+      }
+      // Publish delta to WS subscribers (B3 fix).
+      emitter.publish({
+        type: 'control_fleet' as const,
+        seq,
+        hosts: [{
+          providerId: state.providerId,
+          liveness: state.liveness,
+          lastSeenAt: state.lastSeenAt?.toISOString() ?? null,
+          seq: state.seq,
+          models: Array.from(state.models.values()).map((m) => ({
+            model: m.model,
+            state: m.state,
+            ts: m.ts.toISOString(),
+            ttlDeadline: m.ttlDeadline?.toISOString() ?? null,
+            inflight: m.inflight,
+          })),
+        }],
+      });
+      break;
+    }
+    case 'logData': {
+      // Logs are relay-only; no persistence by default.
+      const source = event.data.source as 'proxy' | 'upstream' | 'model';
+      // Real payload field is 'data' (fork sendLogData), may contain multiple lines.
+      const text = event.data.data;
+      logRelay?.append(providerId, source, text);
+      const seq = incrementSeq(state);
+      emitter.publish({
+        type: 'control_log' as const,
+        seq,
+        providerId,
+        source,
+        line: text,
+      });
+      break;
+    }
+    case 'metrics': {
+      // Real payload: BARE array of ActivityLogEntry (fork sendMetrics).
+      const entries = event.data;
+      incrementSeq(state);
+      // Reconcile before the capture pass; a failure is logged and ingestion below still runs.
+      await handleReconcile(fleet, sql, config, providerId, emitter, entries).catch((err: unknown) => {
+        const msg = err instanceof Error ? err.message : String(err);
+        console.warn({ providerId, err: msg }, 'fleet: reconcile failed');
+      });
+      // Persist each entry with its capture, then publish an activity delta for it.
+      for (const entry of entries) {
+        const captureTrimmed = entry.capture ? trimCapture(entry.capture, config.CAPTURE_SIZE_KB) : null;
+        const captureObj = captureTrimmed ? parseCaptureJson(captureTrimmed) : null;
+        const mapped = mapMetricsEntry(entry);
+        // handleReconcile already inserted this row without capture; backfill it on conflict instead of DO NOTHING.
+        await sql`
+          INSERT INTO control_requests (provider_id, swap_entry_id, ts, model, req_path, status_code, duration_ms, cache_tokens, input_tokens, output_tokens, prompt_tps, gen_tps, has_capture, capture, source)
+          VALUES (${providerId}, ${mapped.id}, ${mapped.ts}, ${mapped.model}, ${mapped.req_path}, ${mapped.status_code}, ${mapped.duration_ms}, ${mapped.cache_tokens}, ${mapped.input_tokens}, ${mapped.output_tokens}, ${mapped.prompt_tps}, ${mapped.gen_tps}, ${mapped.has_capture}, ${captureObj ? sql.json(captureObj as never) : sql`NULL::jsonb`}, ${mapped.source})
+          ON CONFLICT (provider_id, swap_entry_id, ts) DO UPDATE SET capture = EXCLUDED.capture WHERE control_requests.capture IS NULL AND EXCLUDED.capture IS NOT NULL
+        `;
+        emitter.publish({
+          type: 'control_activity' as const,
+          seq: state.seq,
+          providerId,
+          entry: {
+            id: mapped.id,
+            ts: mapped.ts,
+            model: mapped.model,
+            reqPath: mapped.req_path,
+            statusCode: mapped.status_code,
+            durationMs: mapped.duration_ms,
+          },
+        });
+      }
+      break;
+    }
+    case 'inflight': {
+      // Real payload: {total} -- host-level total (fork sendInFlight); the fork
+      // does not publish per-model inflight over SSE.
+      state.inflightTotal = event.data.total;
+      break;
+    }
+  }
+}
+
+// ─── reconcile handler (B7 fix: called from metrics event) ───────────────────
+/** Ingest a batch of ring entries for `providerId`: mark the host connected, record a 'gap_suspected'
+ *  event when detectGap() flags one, then insert every entry. Always resolves true; `config`/`emitter` are unused here. */
+async function handleReconcile(
+  fleet: FleetState,
+  sql: ReturnType<typeof getSql>,
+  config: ReturnType<typeof loadConfig>,
+  providerId: string,
+  emitter: DeltaEmitter,
+  metrics: MetricsEntry[],
+): Promise<boolean> {
+  const state = ensureHostState(fleet, providerId);
+  stampLastSeen(state);
+  state.liveness = 'connected';
+
+  // Gap check: if the oldest incoming entry is newer than the newest persisted row, the ring wrapped past our tail.
+  const entries = metrics ?? [];
+  const oldestReconcileTs = entries.length > 0
+    ? entries[entries.length - 1]!.timestamp // NOTE(review): treats the LAST element as the oldest (newest-first batch) — confirm
+    : null;
+
+  if (oldestReconcileTs) {
+    const newestPersisted = await sql<{ ts: string }[]>`
+      SELECT ts FROM control_requests
+      WHERE provider_id = ${providerId}
+      ORDER BY ts DESC LIMIT 1
+    `;
+
+    if (newestPersisted.length > 0) {
+      const newestRow = newestPersisted[0]!;
+      if (detectGap(oldestReconcileTs, newestRow.ts)) {
+        await sql`
+          INSERT INTO control_model_events (provider_id, model, state, ts, detail)
+          VALUES (${providerId}, '*', 'gap_suspected', clock_timestamp(), ${sql.json({
+            oldestReconcile: oldestReconcileTs,
+            newestPersisted: newestRow.ts,
+          } as never)})
+          ON CONFLICT (provider_id, model, state, ts) DO NOTHING
+        `;
+      }
+    }
+  }
+
+  // Ingest reconcile entries (dedup via UNIQUE constraint).
+  for (const entry of entries) {
+    const mapped = mapMetricsEntry(entry); // note: no `capture` column is written by this insert
+    await sql`
+        INSERT INTO control_requests (provider_id, swap_entry_id, ts, model, req_path, status_code, duration_ms, cache_tokens, input_tokens, output_tokens, prompt_tps, gen_tps, has_capture, source)
+        VALUES (${providerId}, ${mapped.id}, ${mapped.ts}, ${mapped.model}, ${mapped.req_path}, ${mapped.status_code}, ${mapped.duration_ms}, ${mapped.cache_tokens}, ${mapped.input_tokens}, ${mapped.output_tokens}, ${mapped.prompt_tps}, ${mapped.gen_tps}, ${mapped.has_capture}, ${mapped.source})
+        ON CONFLICT (provider_id, swap_entry_id, ts) DO NOTHING
+      `;
+  }
+
+  return true; // the only return value; failures surface as rejections from the awaited queries
+}
+
+// ─── perf poller (A7 fix: add timeout; A8 fix: log errors) ───────────────────
+/**
+ * Poll `${baseUrl}/api/performance` (with `after=` set to the newest stored sample ts), insert each returned
+ * timestamp into control_perf_samples and publish it. DB and fetch failures are logged instead of rejecting.
+ */
+async function pollPerformance(
+  sql: ReturnType<typeof getSql>,
+  config: ReturnType<typeof loadConfig>,
+  providerId: string,
+  baseUrl: string,
+  fleet: FleetState,
+  emitter: DeltaEmitter,
+): Promise<void> {
+  const state = ensureHostState(fleet, providerId);
+  try {
+    // Recover watermark from MAX(ts) per provider (inside the try so a DB error cannot escape the timer tick).
+    const watermark = await sql<{ ts: string | null }[]>`
+      SELECT MAX(ts) AS ts FROM control_perf_samples WHERE provider_id = ${providerId}
+    `;
+
+    // porsager returns timestamptz as a Date object; interpolating it raw yields
+    // Date.toString() ("Thu Jun 12 2026 ...") which llama-swap rejects with 400.
+    const afterParam = watermark[0]?.ts
+      ? `?after=${encodeURIComponent(new Date(watermark[0].ts).toISOString())}`
+      : '';
+    const url = `${baseUrl}/api/performance${afterParam}`;
+
+    // A7 fix: bound the request with a 10s timeout signal.
+    const res = await fetch(url, { signal: AbortSignal.timeout(10_000) });
+    if (!res.ok) { console.warn({ providerId, status: res.status }, 'fleet: perf poll non-OK response'); return; }
+
+    // Real shape: { gpu_stats: GpuStat[], sys_stats: SysStat[] }
+    const data = await res.json() as { gpu_stats?: unknown[]; sys_stats?: unknown[] } | null;
+    if (!data) return;
+
+    // Pair gpu_stats and sys_stats by timestamp.
+    const gpuMap = new Map<string, unknown>();
+    for (const g of data.gpu_stats ?? []) {
+      const gpu = g as { timestamp?: string };
+      if (gpu.timestamp) {
+        gpuMap.set(gpu.timestamp, g);
+      }
+    }
+
+    const sysMap = new Map<string, unknown>();
+    for (const s of data.sys_stats ?? []) {
+      const sys = s as { timestamp?: string };
+      if (sys.timestamp) {
+        sysMap.set(sys.timestamp, s);
+      }
+    }
+
+    // Collect all unique timestamps.
+    const allTimestamps = new Set([...gpuMap.keys(), ...sysMap.keys()]);
+    if (allTimestamps.size === 0) return;
+
+    stampLastSeen(state);
+
+    for (const ts of allTimestamps) {
+      const gpu = gpuMap.get(ts) ?? null;
+      const sys = sysMap.get(ts) ?? null;
+
+      await sql`
+        INSERT INTO control_perf_samples (provider_id, ts, gpu, sys)
+        VALUES (${providerId}, ${ts}, ${sql.json(gpu as never)}, ${sql.json(sys as never)})
+        ON CONFLICT (provider_id, ts) DO NOTHING
+      `;
+
+      const seq = incrementSeq(state);
+      emitter.publish({
+        type: 'control_perf' as const,
+        seq,
+        providerId,
+        ts,
+        gpu,
+        sys,
+      });
+    }
+  } catch (err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    console.warn({ providerId, err: msg }, 'fleet: perf poll failed');
+  }
+}
+
+// ─── fleet-state rebuild from DB (A1/F2 fix) ─────────────────────────────────
+/**
+ * Seed in-memory fleet state from persisted model events, requests and perf samples; hosts with model events start 'down'.
+ */
+async function rebuildFleetFromDB(fleet: FleetState, sql: ReturnType<typeof getSql>): Promise<void> {
+  // Query control_model_events for latest model state per provider.
+  // B3: ORDER BY ASC so iteration processes oldest first; Map.set() overwrites
+  // with the latest state for each model, so the newest event wins.
+  const modelEvents = await sql<{ provider_id: string; model: string; state: string; ts: string; detail: string }[]>`
+    SELECT provider_id, model, state, ts, detail
+    FROM control_model_events
+    WHERE ts IN (
+      SELECT MAX(ts) FROM control_model_events
+      GROUP BY provider_id, model, state
+    )
+    ORDER BY ts ASC
+  `;
+
+  for (const row of modelEvents) {
+    const state = ensureHostState(fleet, row.provider_id);
+    state.liveness = 'down';
+    stampLastSeen(state);
+    if (row.model === '*') continue; // 'gap_suspected' marker rows use model '*'; they are not real models
+    // row.detail is jsonb (porsager returns it parsed); jsonbObject tolerates a parsed object or a JSON string.
+    const detail: unknown = jsonbObject(row.detail);
+    // B4: derive ttlDeadline from the event timestamp plus detail.ttl seconds, so the deadline
+    // reflects when the event was recorded rather than when this rebuild runs.
+    const ttl = (detail as { ttl?: number })?.ttl;
+    const eventTs = new Date(row.ts).getTime();
+    const ttlDeadline = ttl ? new Date(eventTs + ttl * 1000) : null;
+    state.models.set(row.model, {
+      model: row.model,
+      state: row.state,
+      ts: new Date(row.ts),
+      ttlDeadline,
+      inflight: 0,
+    });
+  }
+
+  // Query control_requests for last activity.
+  const lastRequests = await sql<{ provider_id: string; ts: string }[]>`
+    SELECT provider_id, ts FROM control_requests
+    WHERE ts IN (
+      SELECT MAX(ts) FROM control_requests GROUP BY provider_id
+    )
+    ORDER BY ts DESC
+  `;
+
+  for (const row of lastRequests) {
+    const state = ensureHostState(fleet, row.provider_id);
+    stampLastSeen(state);
+  }
+
+  // Query control_perf_samples for latest perf sample.
+  const lastPerf = await sql<{ provider_id: string; ts: string }[]>`
+    SELECT provider_id, ts FROM control_perf_samples
+    WHERE ts IN (
+      SELECT MAX(ts) FROM control_perf_samples GROUP BY provider_id
+    )
+    ORDER BY ts DESC
+  `;
+
+  for (const row of lastPerf) {
+    const state = ensureHostState(fleet, row.provider_id);
+    stampLastSeen(state);
+  }
+}
+
+// ─── main ───────────────────────────────────────────────────────────────────
+/** Boot BooControl: load config, apply the schema, register routes, rebuild fleet state, start one
+ *  connector per enabled provider plus the perf/retention timers, then listen. Timer failures are
+ *  logged instead of surfacing as unhandled promise rejections. */
+async function main() {
+  const config = loadConfig();
+  const app = Fastify({ logger: { level: config.LOG_LEVEL } });
+
+  app.removeContentTypeParser(['application/json']);
+  app.addContentTypeParser('application/json', { parseAs: 'string' }, (_req: unknown, body: unknown, done: (err: Error | null, body: unknown) => void) => {
+    const str = (body as string) ?? '';
+    if (str.trim().length === 0) {
+      done(null, {});
+      return;
+    }
+    try {
+      done(null, JSON.parse(str));
+    } catch (err) {
+      done(err as Error, undefined);
+    }
+  });
+
+  const sql = getSql(config);
+
+  // Startup ordering guard: wait for server-owned tables before applying schema.
+  await waitForTable(sql, 'sessions', 30_000);
+  await applySchema(sql);
+  app.log.info('database schema applied');
+
+  // Register WebSocket endpoint.
+  const fleet = createFleetState();
+  const emitter = createDeltaEmitter();
+
+  // P2: Action queue + log relay
+  const actionQueue = new ActionQueue();
+  const logRelay = new LogRelay();
+  registerControlWebSocket(app, fleet, emitter, logRelay);
+  registerActionRoutes(app, actionQueue, fleet, emitter);
+  registerCaptureRoutes(app, sql);
+  setBenchApp(app.log);
+  registerBenchRoutes(app, sql, fleet, emitter);
+  registerPlaygroundRoutes(app);
+  registerEvalRoutes(app, sql, fleet, emitter);
+  registerRoutingRoutes(app, sql, fleet);
+  registerReportRoutes(app, sql);
+  registerGatewayRoutes(app, sql, fleet, emitter);
+  registerPolicyRoutes(app, sql);
+  registerSshConfigRoutes(app, sql, config, fleet, emitter);
+
+  // Health endpoint.
+  app.get('/api/health', async (_req: unknown, reply: import('fastify').FastifyReply) => {
+    const dbOk = await pingDb(sql);
+    const status = dbOk ? 200 : 503;
+    return reply.status(status).send({
+      ok: dbOk,
+      db: dbOk,
+    });
+  });
+
+  // Rebuild fleet state from DB on startup (A1/F2 fix).
+  await rebuildFleetFromDB(fleet, sql).catch((err) => {
+    app.log.warn({ err: (err as Error).message }, 'fleet: rebuild from DB failed');
+  });
+
+  // Load the provider registry — baseUrl comes from the registry, never from ssh_host.
+  const registry = loadLlamaProviders(config.LLAMA_PROVIDERS_PATH, config.LLAMA_SWAP_URL);
+  app.log.info({ count: registry.providers.length }, 'fleet: provider registry loaded');
+
+  // P7.2: the auto:* gateway is itself a registry entry (kind boocontrol-gateway)
+  // so BooChat adopts it as a provider. BooControl must NOT treat it as a fleet
+  // host — it has no llama-swap SSE/perf surface and its baseUrl points back at
+  // this service. Filter it out of every fleet operation.
+  const fleetProviders = registry.providers.filter((p) => p.kind !== 'boocontrol-gateway');
+
+  // JOIN registry providers with control_hosts for the enabled flag.
+  // Insert a control_hosts row ON CONFLICT DO NOTHING for any registry provider
+  // missing one, so the fleet state has a row to key off.
+  const enabledHosts = await sql<{ provider_id: string; enabled: boolean }[]>`
+    SELECT provider_id, enabled FROM control_hosts
+    WHERE provider_id = ANY(${fleetProviders.map((p) => p.id)}::text[])
+  `;
+  const enabledMap = new Map<string, boolean>();
+  for (const row of enabledHosts) enabledMap.set(row.provider_id, row.enabled);
+
+  // Seed missing control_hosts rows so the registry is the source of truth.
+  for (const provider of fleetProviders) {
+    if (!enabledMap.has(provider.id)) {
+      await sql`
+        INSERT INTO control_hosts (provider_id, enabled)
+        VALUES (${provider.id}, true)
+        ON CONFLICT (provider_id) DO NOTHING
+      `;
+      enabledMap.set(provider.id, true);
+    }
+  }
+
+  const abortControllers = new Map<string, AbortController>();
+
+  for (const provider of fleetProviders) {
+    if (!(enabledMap.get(provider.id) ?? true)) continue;
+
+    const baseUrl = provider.baseUrl;
+
+    // P2: Register host with action queue
+    actionQueue.registerHost(provider.id, {
+      baseUrl,
+      isLivenessUp: () => {
+        const hs = fleet.hosts.get(provider.id);
+        return hs?.liveness !== 'down';
+      },
+      isInflightRequests: () => {
+        // Host-level total from the SSE inflight event (per-model is not published).
+        return fleet.hosts.get(provider.id)?.inflightTotal ?? 0;
+      },
+      log: app.log,
+    });
+
+    const abort = startFleetConnector(provider.id, baseUrl, {
+      isUp: () => true,
+      sql,
+      log: app.log,
+      onEvent: (pid, event) => handleLlamaSweepEvent(fleet, sql, config, pid, emitter, event, logRelay),
+      onReconcile: (pid, metrics) => handleReconcile(fleet, sql, config, pid, emitter, metrics),
+      onReconnectGiveUp: async (pid) => {
+        const state = ensureHostState(fleet, pid);
+        state.liveness = 'down';
+      },
+      sleep: (ms) => new Promise((r) => setTimeout(r, ms)),
+    });
+    abortControllers.set(provider.id, abort);
+  }
+
+  // Perf poller: 5s interval per enabled provider — baseUrl from registry.
+  const pollTimer = setInterval(async () => {
+    try {
+      for (const provider of fleetProviders) {
+        if (!(enabledMap.get(provider.id) ?? true)) continue;
+        await pollPerformance(sql, config, provider.id, provider.baseUrl, fleet, emitter);
+      }
+    } catch (err) { app.log.warn({ err: err instanceof Error ? err.message : String(err) }, 'fleet: perf poll tick failed'); }
+  }, 5_000);
+
+  // Retention job: daily timer — iterate registry providers; a failing tick is logged, never left unhandled.
+  const retentionConfig = buildRetentionConfig(config);
+  const retentionTimer = setInterval(async () => {
+    try {
+      for (const provider of fleetProviders) {
+        if (!(enabledMap.get(provider.id) ?? true)) continue;
+        await runRollup(sql, provider.id, retentionConfig.rawHours);
+        await pruneRawSamples(sql, provider.id, retentionConfig.rawHours);
+        await pruneActivity(sql, retentionConfig.rawHours);
+        await pruneModelEvents(sql, retentionConfig.rollupDays * 24);
+      }
+    } catch (err) { app.log.error({ err: err instanceof Error ? err.message : String(err) }, 'retention: job failed'); }
+  }, 24 * 3600_000); // daily
+
+  // P6.2: Report digest scheduler (catch-up on boot, then hourly).
+  const stopReportScheduler = startReportScheduler(sql, app.log);
+
+  app.addHook('onClose', async () => {
+    clearInterval(pollTimer);
+    clearInterval(retentionTimer);
+    stopReportScheduler();
+    for (const abort of abortControllers.values()) {
+      abort.abort();
+    }
+  });
+
+  // Graceful shutdown.
+  const shutdown = async () => {
+    app.log.info('shutting down');
+    await app.close();
+    await sql.end({ timeout: 5 });
+    process.exit(0);
+  };
+  process.on('SIGTERM', shutdown);
+  process.on('SIGINT', shutdown);
+
+  await app.listen({ port: config.PORT, host: config.HOST });
+  app.log.info(`BooControl listening on ${config.HOST}:${config.PORT}`);
+}
+
+// P2 exports for tests
+export { ActionQueue } from './services/action-queue.js';
+export { LogRelay } from './services/log-relay.js';
+
+// P3 exports for tests
+export { runSingleBenchRequest, parseLlamaTimings, computeAggregates } from './services/bench-engine.js';
+export { computeRegressionFlag } from './services/bench-engine.js';
+
+// P5 exports for tests
+export { loadEvalSuitesFromData } from './services/eval-suites.js';
+export { runCodeEval } from './services/sandbox-runner.js';
+
+if (!process.env.VITEST) { // auto-start is skipped whenever the VITEST env var is set
+  main().catch((err) => {
+    console.error('fatal:', err); // startup failed: report it, then exit non-zero
+    process.exit(1);
+  });
+}
--- a/apps/control/src/routes/actions.ts
+++ b/apps/control/src/routes/actions.ts
@@ -0,0 +1,108 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import { randomUUID } from 'node:crypto';
+import type { ActionQueue } from '../services/action-queue.js';
+import type { FleetState } from '../services/fleet-state.js';
+import type { DeltaEmitter } from '../index.js';
+
+/**
+ * Register action submission routes.
+ * Malformed submit bodies (null body, non-string type/providerId/model) are answered with 400.
+ * POST /api/action/submit — enqueue a warm or unload action
+ * GET  /api/action/queue/:providerId — get current queue state
+ */
+export function registerActionRoutes(
+  app: FastifyInstance,
+  actionQueue: ActionQueue,
+  fleet: FleetState,
+  emitter: DeltaEmitter,
+): void {
+  app.post('/api/action/submit', async (req: FastifyRequest, reply: FastifyReply) => {
+    // The JSON parser can yield null (body `null`); fall back to {} so field reads cannot throw.
+    const body = (req.body ?? {}) as Record<string, unknown>;
+    const type = typeof body.type === 'string' ? body.type : '';
+    const providerId = typeof body.providerId === 'string' ? body.providerId : '';
+    const model = typeof body.model === 'string' ? body.model : undefined;
+    const confirmed = body.confirmed === true;
+
+    if (!['warm', 'unload'].includes(type)) {
+      return reply.status(400).send({ error: 'type must be warm or unload' });
+    }
+    if (!providerId) {
+      return reply.status(400).send({ error: 'providerId is required' });
+    }
+    if (body.model != null && typeof body.model !== 'string') {
+      return reply.status(400).send({ error: 'model must be a string' });
+    }
+
+    // Check host liveness
+    const hostState = fleet.hosts.get(providerId);
+    if (!hostState || hostState.liveness === 'down') {
+      return reply.status(409).send({ error: 'host offline' });
+    }
+
+    const action = {
+      actionId: randomUUID(),
+      type: type as 'warm' | 'unload',
+      providerId,
+      model,
+      confirmed,
+      createdAt: new Date(),
+    };
+
+    const result = actionQueue.submit(action);
+
+    if (!result.ok) {
+      if (result.requiresConfirmation) {
+        return reply.status(409).send({ error: result.error, requiresConfirmation: true });
+      }
+      if (result.pending) {
+        return reply.status(429).send({
+          error: result.error,
+          pending: result.pending,
+        });
+      }
+      return reply.status(409).send({ error: result.error });
+    }
+
+    // Publish action queued event
+    emitter.publish({
+      type: 'control_job' as const,
+      seq: hostState.seq,
+      jobType: 'action' as const,
+      jobId: action.actionId,
+      status: 'queued' as const,
+      detail: {
+        actionType: action.type,
+        providerId: action.providerId,
+        model: action.model ?? null,
+      },
+    });
+
+    return reply.status(202).send({
+      actionId: action.actionId,
+      status: 'queued',
+    });
+  });
+
+  app.get('/api/action/queue/:providerId', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { providerId } = req.params as { providerId: string };
+    const state = actionQueue.getState(providerId);
+
+    if (!state) {
+      return reply.status(404).send({ error: 'host not found' });
+    }
+
+    return reply.send({
+      providerId,
+      depth: state.queue.length,
+      running: state.running,
+      entries: state.queue.map((e) => ({
+        actionId: e.action.actionId,
+        type: e.action.type,
+        model: e.action.model ?? null,
+        status: e.status,
+        error: e.error ?? null,
+        enqueuedAt: e.enqueuedAt.toISOString(),
+      })),
+    });
+  });
+}
--- a/apps/control/src/routes/bench.ts
+++ b/apps/control/src/routes/bench.ts
@@ -0,0 +1,492 @@
+import { randomUUID } from 'node:crypto';
+import type { FastifyBaseLogger, FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Sql } from '../db.js';
+import type { FleetState } from '../services/fleet-state.js';
+import type { DeltaEmitter } from '../index.js';
+import { acquireHostAccess } from '../services/host-access.js';
+import type { BenchSuite, BenchRunProgress } from '../services/bench-engine.js';
+import { runBenchSuite } from '../services/bench-engine.js';
+import { resolveProviderBaseUrl } from '../services/llama-providers.js';
+import { jsonbNumberArray, jsonbObject } from '../services/jsonb.js';
+
+/**
+ * Register bench routes.
+ *
+ * POST /api/bench/suite        — create a suite definition
+ * GET  /api/bench/suites       — list suites
+ * GET  /api/bench/suites/:id   — get suite
+ * POST /api/bench/run          — start a bench run (gated through acquireHostAccess)
+ * GET  /api/bench/runs         — list runs
+ * GET  /api/bench/runs/:id     — get run + samples
+ * GET  /api/bench/baselines    — get baselines per (provider_id, model)
+ */
+export function registerBenchRoutes(
+  app: FastifyInstance,
+  sql: Sql,
+  fleet: FleetState,
+  emitter: DeltaEmitter,
+): void {
+  // ─── suite CRUD ──────────────────────────────────────────────────────────
+
+  // POST /api/bench/suite — create a suite, or overwrite the one with the same id.
+  // Body: { id?, name, providerId, model, promptTokens[], genTokens[], concurrency[],
+  //         repetitions? (default 1), metadata? }. Responds 201 with { id }, or 400
+  // when a required field is missing or has the wrong shape.
+  app.post('/api/bench/suite', async (req: FastifyRequest, reply: FastifyReply) => {
+    // req.body may be undefined when no JSON payload was sent; fall back to {} so
+    // the validation below answers 400 instead of throwing on property access.
+    const body = (req.body ?? {}) as Record<string, unknown>;
+    const suiteId = body.id as string | undefined;
+    const { name, providerId, model, promptTokens, genTokens, concurrency } = body;
+    const repetitions = body.repetitions ?? 1;
+    const metadata = body.metadata as Record<string, unknown> | undefined;
+
+    const isNonEmptyString = (v: unknown): v is string => typeof v === 'string' && v.length > 0;
+    // A bare `.length` check would also accept strings; require a real array of finite numbers.
+    const isNumberList = (v: unknown): v is number[] =>
+      Array.isArray(v) && v.length > 0 && v.every((n) => typeof n === 'number' && Number.isFinite(n));
+
+    if (!isNonEmptyString(name) || !isNonEmptyString(providerId) || !isNonEmptyString(model)) {
+      return reply.status(400).send({ error: 'name, providerId, and model are required' });
+    }
+    if (!isNumberList(promptTokens) || !isNumberList(genTokens) || !isNumberList(concurrency)) {
+      return reply.status(400).send({ error: 'promptTokens, genTokens, and concurrency must each be a non-empty array of numbers' });
+    }
+    if (typeof repetitions !== 'number' || !Number.isInteger(repetitions) || repetitions < 1) {
+      return reply.status(400).send({ error: 'repetitions must be a positive integer' });
+    }
+
+    // Caller-supplied id makes this an upsert; otherwise mint a fresh one.
+    const id = suiteId ?? randomUUID();
+    await sql`
+      INSERT INTO bench_suites (id, name, provider_id, model, prompt_tokens, gen_tokens, concurrency, repetitions, metadata)
+      VALUES (${id}, ${name}, ${providerId}, ${model}, ${sql.json(promptTokens as never)}, ${sql.json(genTokens as never)}, ${sql.json(concurrency as never)}, ${repetitions}, ${metadata ? sql.json(metadata as never) : sql`NULL::jsonb`})
+      ON CONFLICT (id) DO UPDATE SET
+        name = EXCLUDED.name,
+        provider_id = EXCLUDED.provider_id,
+        model = EXCLUDED.model,
+        prompt_tokens = EXCLUDED.prompt_tokens,
+        gen_tokens = EXCLUDED.gen_tokens,
+        concurrency = EXCLUDED.concurrency,
+        repetitions = EXCLUDED.repetitions,
+        metadata = EXCLUDED.metadata
+    `;
+
+    return reply.status(201).send({ id });
+  });
+
+  // GET /api/bench/suites — every stored suite, newest first.
+  app.get('/api/bench/suites', async (_req: FastifyRequest, reply: FastifyReply) => {
+    // Row shape exactly as selected below.
+    type SuiteRow = {
+      id: string;
+      name: string;
+      provider_id: string;
+      model: string;
+      prompt_tokens: string;
+      gen_tokens: string;
+      concurrency: string;
+      repetitions: number;
+      metadata: string | null;
+      created_at: string;
+    };
+
+    const rows = await sql<SuiteRow[]>`
+      SELECT id, name, provider_id, model, prompt_tokens, gen_tokens, concurrency, repetitions, metadata, created_at
+      FROM bench_suites
+      ORDER BY created_at DESC
+    `;
+
+    // snake_case row → camelCase API object; jsonb columns go through the jsonb helpers.
+    const toDto = (row: SuiteRow) => ({
+      id: row.id,
+      name: row.name,
+      providerId: row.provider_id,
+      model: row.model,
+      promptTokens: jsonbNumberArray(row.prompt_tokens),
+      genTokens: jsonbNumberArray(row.gen_tokens),
+      concurrency: jsonbNumberArray(row.concurrency),
+      repetitions: row.repetitions,
+      metadata: jsonbObject(row.metadata) ?? undefined,
+      createdAt: row.created_at,
+    });
+
+    return reply.send({ suites: rows.map((row) => toDto(row)) });
+  });
+
+  // GET /api/bench/suites/:id — a single suite, or 404 when the id is unknown.
+  app.get('/api/bench/suites/:id', async (req: FastifyRequest, reply: FastifyReply) => {
+    type SuiteRow = {
+      id: string;
+      name: string;
+      provider_id: string;
+      model: string;
+      prompt_tokens: string;
+      gen_tokens: string;
+      concurrency: string;
+      repetitions: number;
+      metadata: string | null;
+      created_at: string;
+    };
+
+    const { id } = req.params as { id: string };
+    const [suite] = await sql<SuiteRow[]>`
+      SELECT id, name, provider_id, model, prompt_tokens, gen_tokens, concurrency, repetitions, metadata, created_at
+      FROM bench_suites WHERE id = ${id}
+    `;
+
+    if (!suite) {
+      return reply.status(404).send({ error: 'suite not found' });
+    }
+
+    const payload = {
+      id: suite.id,
+      name: suite.name,
+      providerId: suite.provider_id,
+      model: suite.model,
+      promptTokens: jsonbNumberArray(suite.prompt_tokens),
+      genTokens: jsonbNumberArray(suite.gen_tokens),
+      concurrency: jsonbNumberArray(suite.concurrency),
+      repetitions: suite.repetitions,
+      metadata: jsonbObject(suite.metadata) ?? undefined,
+      createdAt: suite.created_at,
+    };
+    return reply.send(payload);
+  });
+
+  // ─── run launcher (P3.3: safety gates + P3.4: acquireHostAccess) ─────────
+
+  // POST /api/bench/run — launch a stored suite in the background.
+  // Body: { suiteId, temperature? (default 0.7), topP? (default 0.9) }.
+  // Responds 202 right away; the run itself continues in runBenchAsync.
+  // Errors: 400 missing suiteId / no base URL, 404 unknown suite, 409 host access denied.
+  app.post('/api/bench/run', async (req: FastifyRequest, reply: FastifyReply) => {
+    // req.body may be undefined when no JSON payload was sent; fall back to {} so
+    // the suiteId check below answers 400 instead of throwing.
+    const body = (req.body ?? {}) as Record<string, unknown>;
+    const suiteId = body.suiteId as string;
+    const temperature = (body.temperature as number) ?? 0.7;
+    const topP = (body.topP as number) ?? 0.9;
+
+    if (!suiteId) {
+      return reply.status(400).send({ error: 'suiteId is required' });
+    }
+
+    // Load suite.
+    const suiteRows = await sql<{
+      id: string;
+      name: string;
+      provider_id: string;
+      model: string;
+      prompt_tokens: string;
+      gen_tokens: string;
+      concurrency: string;
+      repetitions: number;
+      metadata: string | null;
+    }[]>`
+      SELECT id, name, provider_id, model, prompt_tokens, gen_tokens, concurrency, repetitions, metadata
+      FROM bench_suites WHERE id = ${suiteId}
+    `;
+
+    const s = suiteRows[0];
+    if (!s) {
+      return reply.status(404).send({ error: 'suite not found' });
+    }
+
+    const suite: BenchSuite = {
+      id: s.id,
+      name: s.name,
+      providerId: s.provider_id,
+      model: s.model,
+      promptTokens: jsonbNumberArray(s.prompt_tokens),
+      genTokens: jsonbNumberArray(s.gen_tokens),
+      concurrency: jsonbNumberArray(s.concurrency),
+      repetitions: s.repetitions,
+      metadata: jsonbObject(s.metadata) ?? undefined,
+    };
+
+    // Resolve the base URL from the registry BEFORE asking for host access, so a
+    // configuration error is rejected without ever holding a grant.
+    const baseUrl = resolveBaseUrl(suite.providerId);
+    if (!baseUrl) {
+      return reply.status(400).send({ error: `no base URL configured for provider ${suite.providerId}` });
+    }
+
+    // P3.3: Safety check — inflight traffic on the target host. The result is only
+    // reported back to the caller; it does not block the run here.
+    const hostState = fleet.hosts.get(suite.providerId);
+    const recentTraffic = checkRecentTraffic(hostState);
+
+    // P3.4: Gate through acquireHostAccess seam.
+    const grant = await acquireHostAccess(suite.providerId, 'bench');
+    if (!grant.ok) {
+      return reply.status(409).send({
+        error: 'host access denied',
+        reason: grant.reason,
+      });
+    }
+
+    // Sequence number stamped on emitted job events; 0 when the host is unknown.
+    const seq = hostState?.seq ?? 0;
+
+    // Run the bench suite asynchronously (non-blocking HTTP response).
+    // NOTE(review): runBenchAsync looks up an existing 'running' bench_runs row for
+    // this suite before it starts; nothing in this handler creates one — confirm
+    // where that row is expected to come from.
+    void runBenchAsync(
+      { suite, baseUrl, temperature, topP },
+      sql,
+      emitter,
+      seq,
+      suite.providerId,
+    );
+
+    return reply.status(202).send({
+      status: 'queued',
+      suiteId: suite.id,
+      recentTraffic,
+    });
+  });
+
+  // ─── runs listing ────────────────────────────────────────────────────────
+
+  // GET /api/bench/runs[?suiteId=…] — runs newest first. With suiteId: every run of
+  // that suite; without: the 100 most recent runs across all suites.
+  app.get('/api/bench/runs', async (req: FastifyRequest, reply: FastifyReply) => {
+    type BenchRunRow = {
+      id: string;
+      suite_id: string;
+      job_type: string;
+      status: string;
+      started_at: string | null;
+      finished_at: string | null;
+      total_samples: number;
+      completed_samples: number;
+      concurrent_foreign_requests: number;
+      regression_flag: string | null;
+      aggregate: string | null;
+      error: string | null;
+      created_at: string;
+    };
+
+    const { suiteId } = req.query as Record<string, string | undefined>;
+
+    const runs: BenchRunRow[] = suiteId
+      ? await sql`
+        SELECT id, suite_id, job_type, status, started_at, finished_at, total_samples, completed_samples, concurrent_foreign_requests, regression_flag, aggregate, error, created_at
+        FROM bench_runs WHERE suite_id = ${suiteId}
+        ORDER BY created_at DESC
+      `
+      : await sql`
+        SELECT id, suite_id, job_type, status, started_at, finished_at, total_samples, completed_samples, concurrent_foreign_requests, regression_flag, aggregate, error, created_at
+        FROM bench_runs
+        ORDER BY created_at DESC
+        LIMIT 100
+      `;
+
+    // snake_case row → camelCase API object.
+    const toDto = (run: BenchRunRow) => ({
+      id: run.id,
+      suiteId: run.suite_id,
+      jobType: run.job_type,
+      status: run.status,
+      startedAt: run.started_at,
+      finishedAt: run.finished_at,
+      totalSamples: run.total_samples,
+      completedSamples: run.completed_samples,
+      concurrentForeignRequests: run.concurrent_foreign_requests,
+      regressionFlag: run.regression_flag,
+      aggregate: jsonbObject(run.aggregate),
+      error: run.error,
+      createdAt: run.created_at,
+    });
+
+    return reply.send({ runs: runs.map((run) => toDto(run)) });
+  });
+
+  // GET /api/bench/runs/:id — one run plus all of its samples; 404 when the run is unknown.
+  app.get('/api/bench/runs/:id', async (req: FastifyRequest, reply: FastifyReply) => {
+    type RunRow = {
+      id: string;
+      suite_id: string;
+      job_type: string;
+      status: string;
+      started_at: string | null;
+      finished_at: string | null;
+      total_samples: number;
+      completed_samples: number;
+      concurrent_foreign_requests: number;
+      regression_flag: string | null;
+      aggregate: string | null;
+      error: string | null;
+      created_at: string;
+    };
+    type SampleRow = {
+      id: number;
+      prompt_tokens: number;
+      gen_tokens: number;
+      concurrency: number;
+      repetition: number;
+      ttft_ms: number | null;
+      total_ms: number | null;
+      prompt_tps: number | null;
+      gen_tps: number | null;
+      cache_n: number | null;
+      error: string | null;
+    };
+
+    const { id } = req.params as { id: string };
+
+    const [run] = await sql<RunRow[]>`
+      SELECT id, suite_id, job_type, status, started_at, finished_at, total_samples, completed_samples, concurrent_foreign_requests, regression_flag, aggregate, error, created_at
+      FROM bench_runs WHERE id = ${id}
+    `;
+
+    if (!run) {
+      return reply.status(404).send({ error: 'run not found' });
+    }
+
+    // Samples are only fetched once the run is known to exist.
+    const sampleRows = await sql<SampleRow[]>`
+      SELECT id, prompt_tokens, gen_tokens, concurrency, repetition, ttft_ms, total_ms, prompt_tps, gen_tps, cache_n, error
+      FROM bench_samples WHERE run_id = ${id}
+      ORDER BY prompt_tokens, gen_tokens, concurrency, repetition
+    `;
+
+    const runDto = {
+      id: run.id,
+      suiteId: run.suite_id,
+      jobType: run.job_type,
+      status: run.status,
+      startedAt: run.started_at,
+      finishedAt: run.finished_at,
+      totalSamples: run.total_samples,
+      completedSamples: run.completed_samples,
+      concurrentForeignRequests: run.concurrent_foreign_requests,
+      regressionFlag: run.regression_flag,
+      aggregate: jsonbObject(run.aggregate),
+      error: run.error,
+      createdAt: run.created_at,
+    };
+
+    const sampleDtos = sampleRows.map((sample) => ({
+      id: sample.id,
+      promptTokens: sample.prompt_tokens,
+      genTokens: sample.gen_tokens,
+      concurrency: sample.concurrency,
+      repetition: sample.repetition,
+      ttftMs: sample.ttft_ms,
+      totalMs: sample.total_ms,
+      promptTps: sample.prompt_tps,
+      genTps: sample.gen_tps,
+      cacheN: sample.cache_n,
+      error: sample.error,
+    }));
+
+    return reply.send({ run: runDto, samples: sampleDtos });
+  });
+
+  // ─── baselines ───────────────────────────────────────────────────────────
+
+  // GET /api/bench/baselines — all baseline rows, ordered by provider then model.
+  app.get('/api/bench/baselines', async (_req: FastifyRequest, reply: FastifyReply) => {
+    type BaselineRow = {
+      provider_id: string;
+      model: string;
+      run_id: string;
+      aggregate: string;
+      created_at: string;
+    };
+
+    const baselineRows = await sql<BaselineRow[]>`
+      SELECT provider_id, model, run_id, aggregate, created_at
+      FROM bench_baselines
+      ORDER BY provider_id, model
+    `;
+
+    const baselines = baselineRows.map((row) => ({
+      providerId: row.provider_id,
+      model: row.model,
+      runId: row.run_id,
+      aggregate: jsonbObject(row.aggregate),
+      createdAt: row.created_at,
+    }));
+
+    return reply.send({ baselines });
+  });
+}
+
+/**
+ * P3.3: Sum the inflight request counters of every model on a host (used for
+ * takeover confirmation).
+ *
+ * @param hostState - Host entry holding a per-model map of inflight counts, or
+ *   undefined when the host is not tracked.
+ * @returns The summed inflight count and whether it is greater than zero; an
+ *   untracked host reports zero traffic.
+ */
+function checkRecentTraffic(hostState: { models: Map<string, { inflight: number }> } | undefined): { hasRecentTraffic: boolean; inflightCount: number } {
+  const inflightCount = hostState
+    ? Array.from(hostState.models.values()).reduce((sum, model) => sum + model.inflight, 0)
+    : 0;
+
+  return { hasRecentTraffic: inflightCount > 0, inflightCount };
+}
+
+/**
+ * Resolve the base URL for a provider from the loaded registry.
+ * baseUrl comes from LlamaProvider.baseUrl, never from ssh_host.
+ *
+ * Thin local alias: forwards straight to resolveProviderBaseUrl.
+ *
+ * @param providerId - Provider to look up.
+ * @returns The provider's base URL, or null (presumably when the registry has
+ *   no entry or no URL for it — confirm in llama-providers).
+ */
+function resolveBaseUrl(providerId: string): string | null {
+  return resolveProviderBaseUrl(providerId);
+}
+
+/**
+ * Async bench runner: fire-and-forget, records concurrent_foreign_requests.
+ * A6: sources from activity stream during [started_at, finished_at] window,
+ * minus the bench's own samples count.
+ *
+ * The caller does not await this promise (`void runBenchAsync(...)`), so every
+ * failure is handled here: nothing is allowed to escape as an unhandled rejection.
+ *
+ * @param params - Suite plus dispatch settings, handed unchanged to runBenchSuite.
+ * @param sql - Database handle.
+ * @param emitter - Publishes control_job events.
+ * @param seq - Sequence number stamped on events emitted for this run.
+ * @param providerId - Provider whose control_requests are counted as foreign traffic.
+ */
+async function runBenchAsync(
+  params: { suite: BenchSuite; baseUrl: string; temperature?: number; topP?: number },
+  sql: Sql,
+  emitter: DeltaEmitter,
+  seq: number,
+  providerId: string,
+): Promise<void> {
+  const { suite } = params;
+
+  // Thrown values are not guaranteed to be Error instances.
+  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err));
+
+  // Find the latest running run for this suite.
+  // NOTE(review): this lookup happens before runBenchSuite is invoked and nothing
+  // in this file inserts the row — confirm which component creates it.
+  let runId: string;
+  try {
+    const latestRun = await sql<{ id: string; started_at: string | null }[]>`
+      SELECT id, started_at FROM bench_runs
+      WHERE suite_id = ${suite.id} AND status = 'running'
+      ORDER BY created_at DESC LIMIT 1
+    `;
+    const latest = latestRun[0];
+    if (!latest) {
+      benchLogger?.error?.({}, 'bench: no running run found');
+      return;
+    }
+    runId = latest.id;
+  } catch (err) {
+    // Without a run id there is no row to mark as failed; log and stop.
+    benchLogger?.error?.({ err: describeError(err) }, 'bench: run lookup failed');
+    return;
+  }
+
+  const progressHandler = (_progress: BenchRunProgress) => {
+    // Progress is published via emitter in runBenchSuite.
+  };
+
+  try {
+    await runBenchSuite(params, sql, emitter, seq, progressHandler);
+
+    // A6: Record concurrent_foreign_requests from activity stream during run window.
+    // Count control_requests for this provider in [started_at, finished_at],
+    // minus the bench's own sample count.
+    const runData = await sql<{ started_at: string | null; finished_at: string | null; completed_samples: number }[]>`
+      SELECT started_at, finished_at, completed_samples FROM bench_runs WHERE id = ${runId}
+    `;
+    const rd = runData[0];
+
+    // Skip the bookkeeping when the row is gone or the window is incomplete.
+    if (rd?.started_at && rd.finished_at) {
+      const foreignCount = await sql<{ count: number }[]>`
+        SELECT COUNT(*)::INT AS count FROM control_requests
+        WHERE provider_id = ${providerId}
+        AND ts >= ${rd.started_at}::timestamptz
+        AND ts <= ${rd.finished_at}::timestamptz
+      `;
+      const totalForeign = (foreignCount[0]?.count ?? 0) - rd.completed_samples;
+      // Clamp at zero: the subtraction can go negative.
+      await sql`
+        UPDATE bench_runs SET concurrent_foreign_requests = ${Math.max(0, totalForeign)}
+        WHERE id = ${runId}
+      `;
+    }
+  } catch (err) {
+    const msg = describeError(err);
+    benchLogger?.error?.({ err: msg }, 'bench: run failed');
+
+    // Best effort: a database outage may be the very reason we are here, so a
+    // failing status update must not reject this un-awaited promise.
+    try {
+      await sql`
+        UPDATE bench_runs
+        SET status = 'failed', finished_at = clock_timestamp(), error = ${msg}
+        WHERE id = ${runId}
+      `;
+    } catch (updateErr) {
+      benchLogger?.error?.({ err: describeError(updateErr) }, 'bench: failed to record run failure');
+    }
+
+    emitter.publish({
+      type: 'control_job' as const,
+      seq,
+      jobType: 'bench' as const,
+      jobId: runId,
+      status: 'failed' as const,
+      detail: { error: msg },
+    });
+  }
+}
+
+/**
+ * Logger used by the async bench runner. Stays undefined until setBenchApp is
+ * called, which is why the runner logs through optional chaining.
+ */
+let benchLogger: FastifyBaseLogger | undefined;
+
+/**
+ * Set the Fastify logger for the async bench runner.
+ *
+ * @param logger - Logger stored in the module-level benchLogger.
+ */
+export function setBenchApp(logger: FastifyBaseLogger): void {
+  benchLogger = logger;
+}
--- a/apps/control/src/routes/captures.ts
+++ b/apps/control/src/routes/captures.ts
@@ -0,0 +1,52 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Sql } from '../db.js';
+import { fetchCapture, persistCapture } from '../services/capture-fetch.js';
+
+/**
+ * Register capture inspection routes.
+ *
+ * GET /api/capture/:providerId/:swapEntryId — fetch capture from host, persist trimmed copy
+ *
+ * Responses: 400 when swapEntryId is not a plain non-negative integer, 404 when
+ * the provider has no host row / ssh_host or the fetch reports failure, otherwise
+ * the fetched capture. A failed persist is logged and does not fail the request.
+ *
+ * @param app - Fastify instance to register the route on.
+ * @param sql - Database handle used for the host lookup and for persisting.
+ */
+export function registerCaptureRoutes(
+  app: FastifyInstance,
+  sql: Sql,
+): void {
+  app.get(
+    '/api/capture/:providerId/:swapEntryId',
+    async (req: FastifyRequest, reply: FastifyReply) => {
+      const params = req.params as { providerId: string; swapEntryId: string };
+
+      // parseInt would silently accept partially numeric input ("12abc" → 12) and
+      // signed values; require digits only and stay within the safe integer range.
+      const swapEntryId = /^\d+$/.test(params.swapEntryId) ? Number(params.swapEntryId) : Number.NaN;
+
+      if (!Number.isSafeInteger(swapEntryId)) {
+        return reply.status(400).send({ error: 'invalid swapEntryId' });
+      }
+
+      // Resolve host URL from control_hosts
+      const hosts = await sql<{ ssh_host: string }[]>`
+        SELECT ssh_host FROM control_hosts WHERE provider_id = ${params.providerId}
+      `;
+
+      const sshHost = hosts[0]?.ssh_host;
+      if (!sshHost) {
+        return reply.status(404).send({ error: 'host not found or no SSH host configured' });
+      }
+
+      // NOTE(review): port 8401 is hard-coded here — confirm it is the same on every host.
+      const baseUrl = `http://${sshHost}:8401`;
+
+      const result = await fetchCapture(baseUrl, params.providerId, swapEntryId);
+
+      if (!result.ok) {
+        return reply.status(404).send({ error: result.error });
+      }
+
+      // Persist trimmed copy
+      try {
+        await persistCapture(sql, result.capture!);
+      } catch (err) {
+        // Persistence failure is non-fatal — still return the capture
+        app.log.warn({ err: (err as Error).message }, 'capture: persist failed');
+      }
+
+      return reply.send(result.capture);
+    },
+  );
+}
--- a/apps/control/src/routes/evals.ts
+++ b/apps/control/src/routes/evals.ts
@@ -0,0 +1,366 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Sql } from '../db.js';
+import type { DeltaEmitter } from '../index.js';
+import type { FleetState } from '../services/fleet-state.js';
+import {
+  listEvalSuites,
+  getEvalSuite,
+  upsertEvalSuite,
+  listEvalRuns,
+  getEvalResults,
+  seedEvalSuites,
+} from '../services/eval-suites.js';
+import { jsonbArray, jsonbObject } from '../services/jsonb.js';
+
+/**
+ * Register eval routes.
+ *
+ * POST /api/eval/suite        — create/update an eval suite
+ * GET  /api/eval/suites       — list suites
+ * GET  /api/eval/suites/:id   — get suite
+ * POST /api/eval/seed         — seed suites from data/ YAML
+ * POST /api/eval/run          — start an eval run
+ * GET  /api/eval/runs         — list runs
+ * GET  /api/eval/runs/:id     — get run + results
+ * GET  /api/eval/leaderboard  — per (provider_id, model) aggregate scores
+ */
+export function registerEvalRoutes(
+  app: FastifyInstance,
+  sql: Sql,
+  fleet: FleetState,
+  emitter: DeltaEmitter,
+): void {
+  // Seed suites from data/ YAML on startup (idempotent). A seeding failure is only
+  // logged as a warning, so it never prevents the server from becoming ready.
+  app.addHook('onReady', async () => {
+    const warnSeedFailure = (err: unknown): void => {
+      app.log.warn({ err: (err as Error).message }, 'eval: seed failed');
+    };
+    await seedEvalSuites(sql).catch(warnSeedFailure);
+  });
+
+  // ─── suite CRUD ──────────────────────────────────────────────────────────
+
+  // POST /api/eval/suite — create or update an eval suite.
+  // Body: { id?, name, kind ('chat' | 'code'), tasks[], judgeModel?, metadata? }.
+  // Responds 201 with the id returned by upsertEvalSuite, or 400 on invalid input.
+  app.post('/api/eval/suite', async (req: FastifyRequest, reply: FastifyReply) => {
+    // req.body may be undefined when no JSON payload was sent; fall back to {} so
+    // the validation below answers 400 instead of throwing on property access.
+    const body = (req.body ?? {}) as Record<string, unknown>;
+    const id = (body.id as string) ?? null;
+    const { name, kind, tasks } = body;
+    const judgeModel = (body.judgeModel as string) ?? null;
+    const metadata = body.metadata as Record<string, unknown> | undefined;
+
+    // Array.isArray: a bare `.length` check would also let a string through as "tasks".
+    if (typeof name !== 'string' || !name || !kind || !Array.isArray(tasks) || tasks.length === 0) {
+      return reply.status(400).send({ error: 'name, kind, and tasks are required' });
+    }
+    // The cast to 'chat' | 'code' used to be unchecked; enforce it at runtime.
+    if (kind !== 'chat' && kind !== 'code') {
+      return reply.status(400).send({ error: "kind must be 'chat' or 'code'" });
+    }
+
+    const suiteId = await upsertEvalSuite(sql, id, name, kind, tasks, judgeModel, metadata);
+    return reply.status(201).send({ id: suiteId });
+  });
+
+  // GET /api/eval/suites — all suites as returned by listEvalSuites, in camelCase.
+  app.get('/api/eval/suites', async (_req: FastifyRequest, reply: FastifyReply) => {
+    const storedSuites = await listEvalSuites(sql);
+
+    const suites = storedSuites.map((row) => {
+      // jsonb columns are decoded through the jsonb helpers.
+      const tasks = jsonbArray(row.tasks);
+      const metadata = jsonbObject(row.metadata) ?? undefined;
+      return {
+        id: row.id,
+        name: row.name,
+        kind: row.kind,
+        version: row.version,
+        tasks,
+        judgeModel: row.judge_model,
+        judgeModelVersion: row.judge_model_version,
+        metadata,
+        createdAt: row.created_at,
+      };
+    });
+
+    return reply.send({ suites });
+  });
+
+  // GET /api/eval/suites/:id — a single suite, or 404 when getEvalSuite finds nothing.
+  app.get('/api/eval/suites/:id', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id: requestedId } = req.params as { id: string };
+    const found = await getEvalSuite(sql, requestedId);
+
+    if (found) {
+      const payload = {
+        id: found.id,
+        name: found.name,
+        kind: found.kind,
+        version: found.version,
+        tasks: jsonbArray(found.tasks),
+        judgeModel: found.judge_model,
+        judgeModelVersion: found.judge_model_version,
+        metadata: jsonbObject(found.metadata) ?? undefined,
+        createdAt: found.created_at,
+      };
+      return reply.send(payload);
+    }
+
+    return reply.status(404).send({ error: 'suite not found' });
+  });
+
+  // ─── seed from data/ ─────────────────────────────────────────────────────
+
+  // POST /api/eval/seed — run the suite seeding on demand. Unlike the onReady hook,
+  // a seeding error is not caught here and propagates to Fastify.
+  app.post('/api/eval/seed', async (_req: FastifyRequest, reply: FastifyReply) => {
+    await seedEvalSuites(sql);
+    const ack = { ok: true };
+    return reply.send(ack);
+  });
+
+  // ─── run launcher ────────────────────────────────────────────────────────
+
+  // POST /api/eval/run — start an eval run in the background.
+  // Body: { suiteId, providerId, model, quant? }. Responds 202 immediately; the run
+  // continues in runEvalAsync. Errors: 400 missing/invalid fields, 404 unknown suite.
+  app.post('/api/eval/run', async (req: FastifyRequest, reply: FastifyReply) => {
+    // req.body may be undefined when no JSON payload was sent; fall back to {} so
+    // the validation below answers 400 instead of throwing on property access.
+    const body = (req.body ?? {}) as Record<string, unknown>;
+    const { suiteId, providerId, model } = body;
+    const quant = (body.quant as string) ?? null;
+
+    const isNonEmptyString = (v: unknown): v is string => typeof v === 'string' && v.length > 0;
+
+    if (!isNonEmptyString(suiteId) || !isNonEmptyString(providerId) || !isNonEmptyString(model)) {
+      return reply.status(400).send({ error: 'suiteId, providerId, and model are required' });
+    }
+
+    const suite = await getEvalSuite(sql, suiteId);
+    if (!suite) {
+      return reply.status(404).send({ error: 'suite not found' });
+    }
+
+    const tasks = jsonbArray(suite.tasks);
+    const judgeModel = suite.judge_model;
+    // Sequence number stamped on emitted job events; 0 when the host is unknown.
+    const seq = fleet.hosts.get(providerId)?.seq ?? 0;
+
+    // Start the eval run asynchronously.
+    void runEvalAsync(
+      { suiteId, providerId, model, quant, tasks, judgeModel },
+      sql,
+      emitter,
+      seq,
+      app.log,
+    );
+
+    return reply.status(202).send({ status: 'queued', suiteId, providerId, model });
+  });
+
+  // ─── runs listing ────────────────────────────────────────────────────────
+
+  // GET /api/eval/runs[?suiteId=…&providerId=…] — runs from listEvalRuns, in camelCase.
+  app.get('/api/eval/runs', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { suiteId, providerId } = req.query as Record<string, string | undefined>;
+    const storedRuns = await listEvalRuns(sql, suiteId, providerId);
+
+    const runs = storedRuns.map((row) => ({
+      id: row.id,
+      suiteId: row.suite_id,
+      jobType: row.job_type,
+      providerId: row.provider_id,
+      model: row.model,
+      quant: row.quant,
+      status: row.status,
+      judgeModel: row.judge_model,
+      startedAt: row.started_at,
+      finishedAt: row.finished_at,
+      totalTasks: row.total_tasks,
+      completedTasks: row.completed_tasks,
+      aggregate: jsonbObject(row.aggregate),
+      error: row.error,
+      createdAt: row.created_at,
+    }));
+
+    return reply.send({ runs });
+  });
+
+  // GET /api/eval/runs/:id — one run plus its per-task results; 404 when unknown.
+  app.get('/api/eval/runs/:id', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+
+    // Look the run up by primary key instead of loading every run through
+    // listEvalRuns and scanning in memory: constant work per request, and the
+    // lookup cannot miss a run that a listing would not return.
+    const runRows = await sql<{
+      id: string;
+      suite_id: string;
+      job_type: string;
+      provider_id: string;
+      model: string;
+      quant: string | null;
+      status: string;
+      judge_model: string | null;
+      started_at: string | null;
+      finished_at: string | null;
+      total_tasks: number;
+      completed_tasks: number;
+      aggregate: string | null;
+      error: string | null;
+      created_at: string;
+    }[]>`
+      SELECT id, suite_id, job_type, provider_id, model, quant, status, judge_model, started_at, finished_at, total_tasks, completed_tasks, aggregate, error, created_at
+      FROM eval_runs WHERE id = ${id}
+    `;
+
+    const run = runRows[0];
+    if (!run) {
+      return reply.status(404).send({ error: 'run not found' });
+    }
+
+    const results = await getEvalResults(sql, id);
+
+    return reply.send({
+      run: {
+        id: run.id,
+        suiteId: run.suite_id,
+        jobType: run.job_type,
+        providerId: run.provider_id,
+        model: run.model,
+        quant: run.quant,
+        status: run.status,
+        judgeModel: run.judge_model,
+        startedAt: run.started_at,
+        finishedAt: run.finished_at,
+        totalTasks: run.total_tasks,
+        completedTasks: run.completed_tasks,
+        aggregate: jsonbObject(run.aggregate),
+        error: run.error,
+        createdAt: run.created_at,
+      },
+      results: results.map((r) => ({
+        id: r.id,
+        taskId: r.task_id,
+        taskIndex: r.task_index,
+        score: r.score,
+        maxScore: r.max_score,
+        rationale: r.rationale,
+        sandboxExitCode: r.sandbox_exit_code,
+        sandboxStderr: r.sandbox_stderr,
+        sandboxStdout: r.sandbox_stdout,
+        executionMs: r.execution_ms,
+        error: r.error,
+      })),
+    });
+  });
+
+  // ─── leaderboard ─────────────────────────────────────────────────────────
+
+  // GET /api/eval/leaderboard[?kind=chat|code] — average score of completed runs per
+  // (provider_id, model, quant, suite kind), best average first, NULL averages last.
+  app.get('/api/eval/leaderboard', async (req: FastifyRequest, reply: FastifyReply) => {
+    const query = req.query as Record<string, string | undefined>;
+    const kind = query.kind as 'chat' | 'code' | undefined;
+
+    // Aggregate scores per (provider_id, model) from completed eval_runs.
+    // COUNT(...) is a bigint in Postgres, which the driver hands back as a string;
+    // the ::INT cast makes run_count a real number, matching its declared type
+    // (same approach as the COUNT(*)::INT in the bench runner).
+    const rows = await sql<{
+      provider_id: string;
+      model: string;
+      quant: string | null;
+      suite_kind: string;
+      avg_score: number;
+      run_count: number;
+      latest_run_at: string;
+    }[]>`
+      SELECT
+        er.provider_id,
+        er.model,
+        er.quant,
+        es.kind AS suite_kind,
+        AVG(CASE WHEN er.aggregate IS NOT NULL THEN (er.aggregate::jsonb ->> 'avgScore')::float ELSE NULL END) AS avg_score,
+        COUNT(DISTINCT er.id)::INT AS run_count,
+        MAX(er.finished_at) AS latest_run_at
+      FROM eval_runs er
+      JOIN eval_suites es ON er.suite_id = es.id
+      WHERE er.status = 'completed'
+        ${kind ? sql`AND es.kind = ${kind}` : sql`AND 1=1`}
+      GROUP BY er.provider_id, er.model, er.quant, es.kind
+      ORDER BY avg_score DESC NULLS LAST
+    `;
+
+    return reply.send({
+      leaderboard: rows.map((r) => ({
+        providerId: r.provider_id,
+        model: r.model,
+        quant: r.quant,
+        suiteKind: r.suite_kind,
+        avgScore: r.avg_score,
+        runCount: r.run_count,
+        latestRunAt: r.latest_run_at,
+      })),
+    });
+  });
+}
+
+/**
+ * Fold per-task result rows into the aggregate stored on eval_runs.aggregate.
+ *
+ * Rows without a score are ignored. A scored row counts as passed when
+ * score / max_score >= 0.7, or unconditionally when max_score is null or 0.
+ * Score and max_score are always read from the SAME row.
+ *
+ * @returns null when no row carries a score, otherwise the aggregate object.
+ */
+function summarizeEvalScores(
+  results: ReadonlyArray<{ score: number | null; max_score: number | null }>,
+  totalTasks: number,
+): { avgScore: number; totalTasks: number; passedTasks: number } | null {
+  const scored = results.filter(
+    (r): r is { score: number; max_score: number | null } => r.score != null,
+  );
+  if (scored.length === 0) {
+    return null;
+  }
+
+  const avgScore = scored.reduce((sum, r) => sum + r.score, 0) / scored.length;
+  const passedTasks = scored.filter((r) => (r.max_score ? r.score / r.max_score >= 0.7 : true)).length;
+
+  return { avgScore, totalTasks, passedTasks };
+}
+
+/**
+ * Async eval runner: fire-and-forget.
+ * Delegates to judge runner (chat) or sandbox runner (code).
+ *
+ * Inserts the eval_runs row as 'running', executes the tasks, then stores the
+ * final status and aggregate and publishes matching control_job events. All
+ * failures are handled here because the caller does not await the promise.
+ *
+ * @param params - Suite/provider/model selection plus the decoded task list.
+ * @param sql - Database handle.
+ * @param emitter - Publishes control_job events.
+ * @param seq - Sequence number stamped on emitted events.
+ * @param logger - Logger for failures (also handed to the judge runner).
+ */
+async function runEvalAsync(
+  params: {
+    suiteId: string;
+    providerId: string;
+    model: string;
+    quant: string | null;
+    tasks: unknown[];
+    judgeModel: string | null;
+  },
+  sql: Sql,
+  emitter: DeltaEmitter,
+  seq: number,
+  logger: import('fastify').FastifyBaseLogger,
+): Promise<void> {
+  const { suiteId, providerId, model, quant, tasks, judgeModel } = params;
+  const runId = `eval_${Date.now()}_${crypto.randomUUID().slice(0, 8)}`;
+
+  try {
+    await sql`
+      INSERT INTO eval_runs (id, suite_id, job_type, provider_id, model, quant, status, judge_model, started_at, total_tasks)
+      VALUES (${runId}, ${suiteId}, 'eval', ${providerId}, ${model}, ${quant}, 'running', ${judgeModel}, clock_timestamp(), ${tasks.length})
+    `;
+
+    emitter.publish({
+      type: 'control_job' as const,
+      seq,
+      jobType: 'eval' as const,
+      jobId: runId,
+      status: 'running' as const,
+      detail: { suiteId, providerId, model, totalTasks: tasks.length },
+    });
+
+    // The suite is treated as a code suite when its first task carries test_code.
+    const firstTask = tasks[0] as Record<string, unknown> | undefined;
+    const isCodeSuite = !!(firstTask && firstTask.test_code);
+
+    let completed = 0;
+    let error: string | null = null;
+
+    // Import runners dynamically to avoid circular deps.
+    if (isCodeSuite) {
+      const { runCodeEval } = await import('../services/sandbox-runner.js');
+      const result = await runCodeEval(
+        { runId, providerId, model, tasks: tasks as Array<Record<string, unknown>>, quant },
+        sql,
+        emitter,
+        seq,
+        (progress) => {
+          completed = progress.completedTasks;
+        },
+      );
+      if (result.error) error = result.error;
+    } else {
+      const { runJudgeEval } = await import('../services/judge-runner.js');
+      const result = await runJudgeEval(
+        { runId, providerId, model, tasks: tasks as Array<Record<string, unknown>>, judgeModel, quant },
+        sql,
+        emitter,
+        seq,
+        logger,
+        (progress) => {
+          completed = progress.completedTasks;
+        },
+      );
+      if (result.error) error = result.error;
+    }
+
+    // Compute aggregate.
+    const results = await sql<{ score: number | null; max_score: number | null }[]>`
+      SELECT score, max_score FROM eval_results WHERE run_id = ${runId}
+    `;
+    const aggregate = summarizeEvalScores(results, tasks.length);
+    const avgScore = aggregate ? aggregate.avgScore : null;
+
+    await sql`
+      UPDATE eval_runs
+      SET status = ${error ? 'failed' : 'completed'},
+          finished_at = clock_timestamp(),
+          completed_tasks = ${completed},
+          aggregate = ${aggregate ? sql.json(aggregate as never) : sql`NULL::jsonb`},
+          error = ${error}
+      WHERE id = ${runId}
+    `;
+
+    emitter.publish({
+      type: 'control_job' as const,
+      seq,
+      jobType: 'eval' as const,
+      jobId: runId,
+      status: error ? 'failed' as const : 'completed' as const,
+      detail: { avgScore, error },
+    });
+  } catch (err) {
+    // Thrown values are not guaranteed to be Error instances.
+    const msg = err instanceof Error ? err.message : String(err);
+    logger.error({ err: msg }, 'eval: run failed');
+
+    // Best effort: the database may be the reason we got here. Log instead of
+    // swallowing silently, and never let this reject the un-awaited promise.
+    await sql`
+      UPDATE eval_runs
+      SET status = 'failed', finished_at = clock_timestamp(), error = ${msg}
+      WHERE id = ${runId}
+    `.catch((updateErr: unknown) => {
+      logger.warn(
+        { err: updateErr instanceof Error ? updateErr.message : String(updateErr) },
+        'eval: failed to record run failure',
+      );
+    });
+
+    emitter.publish({
+      type: 'control_job' as const,
+      seq,
+      jobType: 'eval' as const,
+      jobId: runId,
+      status: 'failed' as const,
+      detail: { error: msg },
+    });
+  }
+}
--- a/apps/control/src/routes/gateway.ts
+++ b/apps/control/src/routes/gateway.ts
@@ -0,0 +1,205 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Sql } from '../db.js';
+import type { FleetState } from '../services/fleet-state.js';
+import type { DeltaEmitter } from '../index.js';
+import {
+  VIRTUAL_MODELS,
+  resolveCandidates,
+  splitComposite,
+} from '../services/gateway.js';
+import { resolveProviderBaseUrl } from '../services/llama-providers.js';
+
+/**
+ * P7.1: OpenAI-compatible auto:* gateway.
+ *
+ * BooChat reaches this server directly (registry baseUrl), NOT through the
+ * /api/control proxy, so streaming works end to end. Endpoints mirror the
+ * llama-swap wire surface BooChat's provider adapter expects:
+ *
+ *   GET  /v1/models                — advertise the virtual models
+ *   POST /v1/chat/completions      — resolve a policy, dispatch with failover
+ *   GET  /upstream/:model/props    — props for getModelContext (best candidate)
+ *
+ * Every dispatch forwards X-Boo-Source to the chosen target so attribution
+ * survives the extra hop, and is recorded in route_dispatch_log.
+ *
+ * Failover is only possible until the first byte is written to the client.
+ * Once a streaming response has started, an upstream failure ends the stream
+ * instead of retrying on another candidate.
+ *
+ * @param app      Fastify instance the routes are registered on.
+ * @param sql      Database handle used to resolve candidates and log dispatches.
+ * @param fleet    Fleet state handed to resolveCandidates.
+ * @param _emitter Currently unused by these routes.
+ */
+export function registerGatewayRoutes(
+  app: FastifyInstance,
+  sql: Sql,
+  fleet: FleetState,
+  _emitter: DeltaEmitter,
+): void {
+  // ─── model catalog ───────────────────────────────────────────────────────
+
+  app.get('/v1/models', async (_req: FastifyRequest, reply: FastifyReply) => {
+    return reply.send({
+      object: 'list',
+      data: VIRTUAL_MODELS.map((id) => ({
+        id,
+        object: 'model',
+        created: 0,
+        owned_by: 'boocontrol-gateway',
+      })),
+    });
+  });
+
+  // ─── props (for getModelContext) ─────────────────────────────────────────
+  // Resolve candidates and proxy the first healthy candidate's props so the
+  // caller can read default_generation_settings.n_ctx.
+
+  app.get('/upstream/:model/props', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { model } = req.params as { model: string };
+    const { candidates } = await resolveCandidates(sql, fleet, model);
+
+    for (const compositeId of candidates) {
+      const split = splitComposite(compositeId);
+      if (!split) continue;
+      const baseUrl = resolveProviderBaseUrl(split.providerId);
+      if (!baseUrl) continue;
+      try {
+        const url = `${baseUrl.replace(/\/+$/, '')}/upstream/${encodeURIComponent(split.model)}/props`;
+        const res = await fetch(url, { signal: AbortSignal.timeout(5_000) });
+        if (!res.ok) {
+          // Discard the unread error body so the upstream connection is released.
+          await res.body?.cancel();
+          continue;
+        }
+        const body = await res.json();
+        return reply.send(body);
+      } catch {
+        continue;
+      }
+    }
+    return reply.status(503).send({ error: 'no healthy candidate for virtual model', model });
+  });
+
+  // ─── chat completions (dispatch with failover) ───────────────────────────
+
+  app.post('/v1/chat/completions', async (req: FastifyRequest, reply: FastifyReply) => {
+    const body = req.body as Record<string, unknown>;
+    const requestedModel = body?.model as string | undefined;
+    if (!requestedModel) {
+      return reply.status(400).send({ error: { message: 'model is required' } });
+    }
+
+    const source = (req.headers['x-boo-source'] as string | undefined) ?? null;
+    const stream = body.stream === true;
+    const { virtualModel, candidates } = await resolveCandidates(sql, fleet, requestedModel);
+
+    if (candidates.length === 0) {
+      await logDispatch(sql, { virtualModel, chosen: null, tried: [], status: 'no_candidates', source, error: 'no healthy candidates', durationMs: 0 });
+      return reply.status(503).send({
+        error: { message: `routing gateway: no healthy candidate for ${virtualModel}`, type: 'gateway_error' },
+      });
+    }
+
+    const tried: string[] = [];
+    const startedAt = Date.now();
+
+    // Records the successful choice; called exactly once per request.
+    const logDispatched = (chosen: string): Promise<void> =>
+      logDispatch(sql, {
+        virtualModel,
+        chosen,
+        tried,
+        status: 'dispatched',
+        source,
+        error: null,
+        durationMs: Date.now() - startedAt,
+      });
+
+    for (const compositeId of candidates) {
+      const split = splitComposite(compositeId);
+      if (!split) continue;
+      const baseUrl = resolveProviderBaseUrl(split.providerId);
+      if (!baseUrl) continue;
+      tried.push(compositeId);
+
+      const upstreamHeaders: Record<string, string> = { 'Content-Type': 'application/json' };
+      if (source) upstreamHeaders['X-Boo-Source'] = source;
+
+      const upstreamBody = JSON.stringify({ ...body, model: split.model });
+
+      try {
+        const res = await fetch(`${baseUrl.replace(/\/+$/, '')}/v1/chat/completions`, {
+          method: 'POST',
+          headers: upstreamHeaders,
+          body: upstreamBody,
+          signal: AbortSignal.timeout(300_000),
+        });
+
+        if (!res.ok) {
+          // HTTP error before body — eligible for failover to the next candidate.
+          // Cancel the unread body so the upstream connection is released.
+          await res.body?.cancel();
+          continue;
+        }
+
+        if (!stream) {
+          // Non-streaming: read the JSON first so an unreadable body fails over
+          // to the next candidate without a misleading 'dispatched' log row.
+          const json = await res.json();
+          await logDispatched(compositeId);
+          return reply.send(json);
+        }
+
+        // Streaming: this candidate is chosen. Nothing below may fail over,
+        // because bytes are about to be written to the client.
+        await logDispatched(compositeId);
+
+        // Headers set through reply.header() are only emitted by reply.send();
+        // since the raw response is written directly, pass them to writeHead.
+        reply.hijack();
+        reply.raw.writeHead(200, {
+          'Content-Type': 'text/event-stream',
+          'Cache-Control': 'no-cache',
+          Connection: 'keep-alive',
+        });
+        const reader = res.body?.getReader();
+        if (!reader) {
+          reply.raw.end();
+          return;
+        }
+        // Stop pulling from upstream once the client connection goes away.
+        reply.raw.on('close', () => {
+          void reader.cancel().catch(() => {});
+        });
+        try {
+          while (true) {
+            const { done, value } = await reader.read();
+            if (done) break;
+            // Forward the bytes untouched; no text decoding is needed to proxy.
+            reply.raw.write(value);
+          }
+        } catch (err) {
+          // Upstream broke mid-stream. The response has already started, so
+          // the only option is to end it; retrying would corrupt the stream.
+          req.log.warn(
+            { err: err instanceof Error ? err.message : String(err), target: compositeId },
+            'gateway: upstream stream aborted',
+          );
+        } finally {
+          reply.raw.end();
+        }
+        return;
+      } catch {
+        // Connection error — failover to the next candidate.
+        continue;
+      }
+    }
+
+    // All candidates exhausted.
+    await logDispatch(sql, {
+      virtualModel,
+      chosen: null,
+      tried,
+      status: 'failed',
+      source,
+      error: 'all candidates failed',
+      durationMs: Date.now() - startedAt,
+    });
+    return reply.status(502).send({
+      error: { message: `routing gateway: all candidates failed for ${virtualModel}`, type: 'gateway_error' },
+    });
+  });
+}
+
+/**
+ * Best-effort insert of one gateway dispatch record into route_dispatch_log.
+ *
+ * The chosen composite id (when present) is split into provider id and model
+ * for their own columns; the list of tried candidates is stored as JSON.
+ * Insert failures are swallowed on purpose: logging must never break dispatch.
+ */
+async function logDispatch(
+  sql: Sql,
+  entry: {
+    virtualModel: string;
+    chosen: string | null;
+    tried: string[];
+    status: string;
+    source: string | null;
+    error: string | null;
+    durationMs: number;
+  },
+): Promise<void> {
+  const { virtualModel, chosen, tried, status, source, error, durationMs } = entry;
+  const target = chosen ? splitComposite(chosen) : null;
+  const chosenProviderId = target?.providerId ?? null;
+  const chosenModel = target?.model ?? null;
+
+  const insert = sql`
+    INSERT INTO route_dispatch_log (virtual_model, chosen_provider_id, chosen_model, candidates_tried, status, source, error, duration_ms)
+    VALUES (
+      ${virtualModel},
+      ${chosenProviderId},
+      ${chosenModel},
+      ${sql.json(tried as never)},
+      ${status},
+      ${source},
+      ${error},
+      ${durationMs}
+    )
+  `;
+  await insert.catch(() => { /* logging must never break dispatch */ });
+}
--- a/apps/control/src/routes/playground.ts
+++ b/apps/control/src/routes/playground.ts
@@ -0,0 +1,235 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import { getLlamaProviders, resolveProviderBaseUrl } from '../services/llama-providers.js';
+
+/**
+ * Playground routes: model select, param controls, streaming chat.
+ *
+ * GET  /api/playground/models       — list available models from providers
+ * POST /api/playground/chat         — streaming chat against a model
+ * POST /api/playground/chat-ab      — side-by-side A/B compare
+ *
+ * Both chat routes answer with Server-Sent Events written directly to the raw
+ * response. Each stream (or each A/B lane) is terminated by exactly one
+ * terminator event, regardless of whether upstream sent its own `[DONE]`.
+ *
+ * @param app Fastify instance the routes are registered on.
+ */
+export function registerPlaygroundRoutes(
+  app: FastifyInstance,
+): void {
+  // ─── model catalog ───────────────────────────────────────────────────────
+
+  app.get('/api/playground/models', async (_req: FastifyRequest, reply: FastifyReply) => {
+    // Query every provider in the loaded registry concurrently. allSettled
+    // turns a failed/timed-out provider into a rejected entry that is skipped.
+    const settled = await Promise.allSettled(
+      getLlamaProviders().providers.map(async (p) => {
+        const res = await fetch(upstreamUrl(p.baseUrl, '/v1/models'), {
+          signal: AbortSignal.timeout(5_000),
+        });
+        if (!res.ok) {
+          // Discard the unread error body so the connection is released.
+          await res.body?.cancel();
+          return null;
+        }
+        const data = await res.json() as { data?: Array<{ id: string }> };
+        return {
+          providerId: p.id,
+          models: data?.data?.map((m) => m.id) ?? [],
+        };
+      }),
+    );
+
+    const models: Array<{ providerId: string; models: string[] }> = [];
+    for (const outcome of settled) {
+      if (outcome.status === 'fulfilled' && outcome.value) {
+        models.push(outcome.value);
+      }
+    }
+
+    return reply.send({ models });
+  });
+
+  // ─── streaming chat ──────────────────────────────────────────────────────
+
+  app.post('/api/playground/chat', async (req: FastifyRequest, reply: FastifyReply) => {
+    // A missing body must produce the 400 below, not a TypeError.
+    const body = (req.body as Record<string, unknown> | null | undefined) ?? {};
+    const providerId = body.providerId as string;
+    const model = body.model as string;
+    const messages = body.messages as Array<{ role: string; content: string }>;
+    const temperature = (body.temperature as number) ?? 0.7;
+    const topP = (body.topP as number) ?? 0.9;
+    const maxTokens = (body.maxTokens as number) ?? 1024;
+
+    if (!providerId || !model || !messages?.length) {
+      return reply.status(400).send({ error: 'providerId, model, and messages are required' });
+    }
+
+    const baseUrl = resolveProviderBaseUrl(providerId);
+    if (!baseUrl) {
+      return reply.status(400).send({ error: `unknown provider: ${providerId}` });
+    }
+
+    // Stream the response back to the client via SSE. Headers set through
+    // reply.header() are only emitted by reply.send(), so they are passed to
+    // writeHead directly.
+    reply.hijack();
+    reply.raw.writeHead(200, SSE_HEADERS);
+
+    try {
+      const res = await fetch(upstreamUrl(baseUrl, '/v1/chat/completions'), {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          model,
+          messages,
+          temperature,
+          top_p: topP,
+          max_tokens: maxTokens,
+          stream: true,
+        }),
+        signal: AbortSignal.timeout(120_000),
+      });
+
+      if (!res.ok) {
+        const errBody = await res.text().catch(() => '');
+        reply.raw.write(`data: ${JSON.stringify({ error: `Request failed: ${res.status} ${errBody.slice(0, 200)}` })}\n\n`);
+        return;
+      }
+
+      if (!res.body) {
+        reply.raw.write('data: {"error": "No response body"}\n\n');
+        return;
+      }
+
+      await forEachSseLine(res, (line) => {
+        // Upstream's terminator is swallowed here and emitted once below, so
+        // the client never sees a duplicate [DONE].
+        if (line === 'data: [DONE]') return;
+        // N3: pass through the raw SSE line from upstream as-is.
+        // If it already has 'data: ' prefix, don't double-prefix.
+        const payload = line.startsWith('data: ') ? line : `data: ${line}`;
+        reply.raw.write(`${payload}\n\n`);
+      });
+
+      reply.raw.write('data: [DONE]\n\n');
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      reply.raw.write(`data: ${JSON.stringify({ error: msg })}\n\n`);
+    } finally {
+      reply.raw.end();
+    }
+  });
+
+  // ─── A/B compare ─────────────────────────────────────────────────────────
+
+  app.post('/api/playground/chat-ab', async (req: FastifyRequest, reply: FastifyReply) => {
+    // A missing body must produce the 400 below, not a TypeError.
+    const body = (req.body as Record<string, unknown> | null | undefined) ?? {};
+    const providerIdA = body.providerIdA as string;
+    const modelA = body.modelA as string;
+    const providerIdB = body.providerIdB as string;
+    const modelB = body.modelB as string;
+    const messages = body.messages as Array<{ role: string; content: string }>;
+    const temperature = (body.temperature as number) ?? 0.7;
+    const topP = (body.topP as number) ?? 0.9;
+    const maxTokens = (body.maxTokens as number) ?? 1024;
+
+    if (!providerIdA || !modelA || !providerIdB || !modelB || !messages?.length) {
+      return reply.status(400).send({ error: 'Both models and messages are required' });
+    }
+
+    const baseUrlA = resolveProviderBaseUrl(providerIdA);
+    const baseUrlB = resolveProviderBaseUrl(providerIdB);
+
+    if (!baseUrlA || !baseUrlB) {
+      return reply.status(400).send({ error: 'One or both providers unknown' });
+    }
+
+    // Stream both responses via SSE with lane identifiers.
+    reply.hijack();
+    reply.raw.writeHead(200, SSE_HEADERS);
+
+    const emit = (event: Record<string, unknown>): void => {
+      reply.raw.write(`data: ${JSON.stringify(event)}\n\n`);
+    };
+
+    // Streams one lane. Every outcome produces a terminal event for the lane:
+    // either { lane, done: true } or { lane, error }.
+    const streamModel = async (lane: 'A' | 'B', baseUrl: string, model: string): Promise<void> => {
+      try {
+        const res = await fetch(upstreamUrl(baseUrl, '/v1/chat/completions'), {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            model,
+            messages,
+            temperature,
+            top_p: topP,
+            max_tokens: maxTokens,
+            stream: true,
+          }),
+          signal: AbortSignal.timeout(120_000),
+        });
+
+        if (!res.ok) {
+          const errBody = await res.text().catch(() => '');
+          emit({ lane, error: `Request failed: ${res.status} ${errBody.slice(0, 200)}` });
+          return;
+        }
+
+        if (!res.body) {
+          emit({ lane, error: 'No response body' });
+          return;
+        }
+
+        await forEachSseLine(res, (line) => {
+          // The lane terminator is emitted once below.
+          if (line === 'data: [DONE]') return;
+          // N3: strip 'data: ' prefix from upstream before re-wrapping with lane info.
+          emit({ lane, raw: line.startsWith('data: ') ? line.slice(6) : line });
+        });
+
+        emit({ lane, done: true });
+      } catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        emit({ lane, error: msg });
+      }
+    };
+
+    // Run both streams concurrently; always close the response afterwards.
+    try {
+      await Promise.all([
+        streamModel('A', baseUrlA, modelA),
+        streamModel('B', baseUrlB, modelB),
+      ]);
+    } finally {
+      reply.raw.end();
+    }
+  });
+}
+
+/** Response type of the global fetch, without depending on a specific lib. */
+type UpstreamResponse = Awaited<ReturnType<typeof fetch>>;
+
+/** Headers written on the raw response when starting an SSE stream. */
+const SSE_HEADERS = {
+  'Content-Type': 'text/event-stream',
+  'Cache-Control': 'no-cache',
+  Connection: 'keep-alive',
+};
+
+/** Joins a provider base URL and an absolute path, tolerating trailing slashes on the base. */
+function upstreamUrl(baseUrl: string, path: string): string {
+  return `${baseUrl.replace(/\/+$/, '')}${path}`;
+}
+
+/**
+ * Reads an upstream streaming response and calls `onLine` once per non-empty,
+ * trimmed line. A final line that is not newline-terminated is delivered too.
+ * Does nothing when the response has no body.
+ */
+async function forEachSseLine(res: UpstreamResponse, onLine: (line: string) => void): Promise<void> {
+  const reader = res.body?.getReader();
+  if (!reader) return;
+
+  const decoder = new TextDecoder();
+  let buffer = '';
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    buffer += decoder.decode(value, { stream: true });
+    const lines = buffer.split('\n');
+    // The last element is an incomplete line (or ''); keep it for the next chunk.
+    buffer = lines.pop() ?? '';
+
+    for (const line of lines) {
+      const trimmed = line.trim();
+      if (trimmed) onLine(trimmed);
+    }
+  }
+
+  // Flush bytes still held by the decoder plus any unterminated last line.
+  const tail = (buffer + decoder.decode()).trim();
+  if (tail) onLine(tail);
+}
--- a/apps/control/src/routes/policies.ts
+++ b/apps/control/src/routes/policies.ts
@@ -0,0 +1,136 @@
+import { randomUUID } from 'node:crypto';
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Sql } from '../db.js';
+import { VIRTUAL_MODELS } from '../services/gateway.js';
+import { jsonbStringArray } from '../services/jsonb.js';
+
+/**
+ * P7.4: Route policy CRUD + dispatch log.
+ *
+ * GET    /api/policies              — list policies
+ * POST   /api/policies             — create/update a policy (upsert by virtual_model)
+ * DELETE /api/policies/:id          — delete a policy
+ * GET    /api/policies/dispatch-log — recent gateway dispatches
+ * GET    /api/policies/virtual-models — the available virtual model tokens
+ *
+ * @param app Fastify instance the routes are registered on.
+ * @param sql Database handle used for route_policies and route_dispatch_log.
+ */
+export function registerPolicyRoutes(app: FastifyInstance, sql: Sql): void {
+  app.get('/api/policies/virtual-models', async (_req: FastifyRequest, reply: FastifyReply) => {
+    return reply.send({ virtualModels: VIRTUAL_MODELS });
+  });
+
+  app.get('/api/policies', async (_req: FastifyRequest, reply: FastifyReply) => {
+    const rows = await sql<{
+      id: string;
+      name: string;
+      virtual_model: string;
+      // Typed as unknown and normalised through safeParseArray, matching how
+      // DispatchLogRow.candidates_tried is handled.
+      candidates: unknown;
+      fallback: string | null;
+      enabled: boolean;
+      created_at: string;
+      updated_at: string;
+    }[]>`
+      SELECT id, name, virtual_model, candidates, fallback, enabled, created_at, updated_at
+      FROM route_policies
+      ORDER BY virtual_model
+    `;
+    return reply.send({
+      policies: rows.map((r) => ({
+        id: r.id,
+        name: r.name,
+        virtualModel: r.virtual_model,
+        candidates: safeParseArray(r.candidates),
+        fallback: r.fallback,
+        enabled: r.enabled,
+        createdAt: r.created_at,
+        updatedAt: r.updated_at,
+      })),
+    });
+  });
+
+  app.post('/api/policies', async (req: FastifyRequest, reply: FastifyReply) => {
+    // A missing body must produce the 400 below, not a TypeError.
+    const body = (req.body as Record<string, unknown> | null | undefined) ?? {};
+    const id = (body.id as string) ?? randomUUID();
+    const name = body.name as string;
+    const virtualModel = body.virtualModel as string;
+    const candidates = body.candidates as unknown;
+    const fallback = (body.fallback as string) ?? null;
+    const enabled = body.enabled !== false;
+
+    if (!name || !virtualModel) {
+      return reply.status(400).send({ error: 'name and virtualModel are required' });
+    }
+    if (!(VIRTUAL_MODELS as readonly string[]).includes(virtualModel)) {
+      return reply.status(400).send({ error: `virtualModel must be one of ${VIRTUAL_MODELS.join(', ')}` });
+    }
+    // Non-array input becomes an empty list; non-string entries are dropped.
+    const candidateList = Array.isArray(candidates)
+      ? candidates.filter((c): c is string => typeof c === 'string')
+      : [];
+
+    // Upsert by virtual_model (UNIQUE) so there is one policy per virtual model.
+    // On conflict the existing row keeps its original id, so the id reported
+    // to the caller comes from RETURNING rather than from the value generated
+    // above (which would not exist in the table).
+    const upserted = await sql<{ id: string }[]>`
+      INSERT INTO route_policies (id, name, virtual_model, candidates, fallback, enabled, updated_at)
+      VALUES (${id}, ${name}, ${virtualModel}, ${sql.json(candidateList as never)}, ${fallback}, ${enabled}, clock_timestamp())
+      ON CONFLICT (virtual_model) DO UPDATE SET
+        name = EXCLUDED.name,
+        candidates = EXCLUDED.candidates,
+        fallback = EXCLUDED.fallback,
+        enabled = EXCLUDED.enabled,
+        updated_at = clock_timestamp()
+      RETURNING id
+    `;
+    return reply.status(201).send({ id: upserted[0]?.id ?? id });
+  });
+
+  app.delete('/api/policies/:id', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+    await sql`DELETE FROM route_policies WHERE id = ${id}`;
+    return reply.send({ ok: true });
+  });
+
+  app.get('/api/policies/dispatch-log', async (req: FastifyRequest, reply: FastifyReply) => {
+    const query = req.query as Record<string, string | undefined>;
+    const virtualModel = query.virtualModel;
+
+    // Optional filter as a query fragment, so the SELECT is written only once.
+    const filter = virtualModel ? sql`WHERE virtual_model = ${virtualModel}` : sql``;
+    const rows = await sql<DispatchLogRow[]>`
+      SELECT id, ts, virtual_model, chosen_provider_id, chosen_model, candidates_tried, status, source, error, duration_ms
+      FROM route_dispatch_log ${filter}
+      ORDER BY ts DESC LIMIT 200
+    `;
+
+    return reply.send({
+      dispatches: rows.map((r) => ({
+        id: r.id,
+        ts: r.ts,
+        virtualModel: r.virtual_model,
+        chosenProviderId: r.chosen_provider_id,
+        chosenModel: r.chosen_model,
+        candidatesTried: safeParseArray(r.candidates_tried),
+        status: r.status,
+        source: r.source,
+        error: r.error,
+        durationMs: r.duration_ms,
+      })),
+    });
+  });
+}
+
+/**
+ * Row shape selected from route_dispatch_log by the dispatch-log endpoint.
+ * Field names mirror the selected column names; the endpoint maps them to
+ * camelCase before responding.
+ */
+interface DispatchLogRow {
+  id: number;
+  ts: string;
+  virtual_model: string;
+  chosen_provider_id: string | null;
+  chosen_model: string | null;
+  // Left as unknown on purpose: it is normalised with safeParseArray before
+  // being returned to the client.
+  candidates_tried: unknown;
+  status: string;
+  source: string | null;
+  error: string | null;
+  duration_ms: number | null;
+}
+
+// jsonb columns come back already parsed from the postgres driver (porsager);
+// jsonbStringArray tolerates both forms, so it is reused here under a local
+// name for the `candidates` and `candidates_tried` columns.
+const safeParseArray = jsonbStringArray;
--- a/apps/control/src/routes/reports.ts
+++ b/apps/control/src/routes/reports.ts
@@ -0,0 +1,122 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply, FastifyBaseLogger } from 'fastify';
+import type { Sql } from '../db.js';
+import { generateReport, runReportSchedulerTick } from '../services/reports.js';
+import { jsonbObject } from '../services/jsonb.js';
+
+/**
+ * P6.2: Reports tab API + scheduled digest.
+ *
+ * GET  /api/reports            — list generated reports (newest first)
+ * GET  /api/reports/:id        — single report (markdown + stats)
+ * POST /api/reports/generate   — manually trigger a digest now
+ * GET  /api/reports/schedule   — current schedule meta
+ * POST /api/reports/schedule   — update schedule meta {interval, enabled}
+ *
+ * @param app Fastify instance the routes are registered on.
+ * @param sql Database handle used for control_reports and control_schedule_meta.
+ */
+export function registerReportRoutes(app: FastifyInstance, sql: Sql): void {
+  app.get('/api/reports', async (_req: FastifyRequest, reply: FastifyReply) => {
+    const rows = await sql<{
+      id: string;
+      kind: string;
+      interval: string;
+      period_start: string;
+      period_end: string;
+      created_at: string;
+    }[]>`
+      SELECT id, kind, interval, period_start, period_end, created_at
+      FROM control_reports
+      ORDER BY created_at DESC
+      LIMIT 100
+    `;
+    return reply.send({
+      reports: rows.map((r) => ({
+        id: r.id,
+        kind: r.kind,
+        interval: r.interval,
+        periodStart: r.period_start,
+        periodEnd: r.period_end,
+        createdAt: r.created_at,
+      })),
+    });
+  });
+
+  app.get('/api/reports/:id', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+    const [report] = await sql<{
+      id: string;
+      kind: string;
+      interval: string;
+      period_start: string;
+      period_end: string;
+      markdown: string;
+      stats: unknown;
+      created_at: string;
+    }[]>`
+      SELECT id, kind, interval, period_start, period_end, markdown, stats, created_at
+      FROM control_reports WHERE id = ${id}
+    `;
+    if (!report) {
+      return reply.status(404).send({ error: 'report not found' });
+    }
+    return reply.send({
+      id: report.id,
+      kind: report.kind,
+      interval: report.interval,
+      periodStart: report.period_start,
+      periodEnd: report.period_end,
+      markdown: report.markdown,
+      stats: jsonbObject(report.stats),
+      createdAt: report.created_at,
+    });
+  });
+
+  app.post('/api/reports/generate', async (req: FastifyRequest, reply: FastifyReply) => {
+    const body = (req.body as Record<string, unknown>) ?? {};
+    // Anything other than 'weekly' falls back to a daily digest.
+    const interval = body.interval === 'weekly' ? 'weekly' : 'daily';
+    const id = await generateReport(sql, interval);
+    return reply.status(201).send({ id });
+  });
+
+  app.get('/api/reports/schedule', async (_req: FastifyRequest, reply: FastifyReply) => {
+    const rows = await sql<{ interval: string; enabled: boolean; last_run_at: string | null }[]>`
+      SELECT interval, enabled, last_run_at FROM control_schedule_meta WHERE name = 'report-digest'
+    `;
+    // Defaults are reported when the 'report-digest' row does not exist.
+    const m = rows[0];
+    return reply.send({
+      interval: m?.interval ?? 'daily',
+      enabled: m?.enabled ?? true,
+      lastRunAt: m?.last_run_at ?? null,
+    });
+  });
+
+  app.post('/api/reports/schedule', async (req: FastifyRequest, reply: FastifyReply) => {
+    const body = (req.body as Record<string, unknown>) ?? {};
+    // Anything other than 'weekly' is stored as 'daily'; `enabled` is true
+    // unless the caller sends an explicit false.
+    const interval = body.interval === 'weekly' ? 'weekly' : 'daily';
+    const enabled = body.enabled !== false;
+    const result = await sql`
+      UPDATE control_schedule_meta
+      SET interval = ${interval}, enabled = ${enabled}
+      WHERE name = 'report-digest'
+    `;
+    // An UPDATE that matches no row changes nothing; report that instead of
+    // echoing back settings that were never persisted.
+    if (result.count === 0) {
+      return reply.status(404).send({ error: 'report schedule not found' });
+    }
+    return reply.send({ interval, enabled });
+  });
+}
+
+/**
+ * Launches the in-process report scheduler.
+ *
+ * One tick runs right away (catch-up on boot) and another every hour after
+ * that. A failing tick is logged as a warning and never propagates.
+ *
+ * @param sql Database handle passed to runReportSchedulerTick.
+ * @param log Logger used for the "digest generated" and failure messages.
+ * @returns A function that cancels the hourly timer (for onClose).
+ */
+export function startReportScheduler(sql: Sql, log: FastifyBaseLogger): () => void {
+  async function runOnce(): Promise<void> {
+    try {
+      const outcome = await runReportSchedulerTick(sql);
+      if (!outcome.ran) return;
+      log.info({ reportId: outcome.reportId }, 'reports: digest generated');
+    } catch (err) {
+      log.warn({ err: (err as Error).message }, 'reports: scheduler tick failed');
+    }
+  }
+
+  // Catch-up on boot.
+  void runOnce();
+
+  const hourly = setInterval(runOnce, 3600_000); // hourly
+  return function stop(): void {
+    clearInterval(hourly);
+  };
+}
--- a/apps/control/src/routes/routing.ts
+++ b/apps/control/src/routes/routing.ts
@@ -0,0 +1,32 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Sql } from '../db.js';
+import type { FleetState } from '../services/fleet-state.js';
+import { computeRoutingScores, BADGE_LABELS } from '../services/routing-scores.js';
+
+/**
+ * P6.1: Advisory routing scores.
+ *
+ * GET /api/routing/scores — per (provider_id, model) advisory scores + badges.
+ *   Surfaced as model-picker badges in BooChat. Advisory only; no enforcement.
+ *
+ * The response carries the full score list, a compositeId -> badge-kinds map
+ * restricted to entries that have at least one badge, and the badge labels.
+ */
+export function registerRoutingRoutes(
+  app: FastifyInstance,
+  sql: Sql,
+  fleet: FleetState,
+): void {
+  app.get('/api/routing/scores', async (_req: FastifyRequest, reply: FastifyReply) => {
+    const scores = await computeRoutingScores(sql, fleet);
+
+    // compositeId -> badge kinds, so the picker can look badges up cheaply.
+    const badges = scores.reduce<Record<string, string[]>>((byComposite, entry) => {
+      if (entry.badges.length > 0) {
+        byComposite[entry.compositeId] = entry.badges;
+      }
+      return byComposite;
+    }, {});
+
+    return reply.send({ scores, badges, badgeLabels: BADGE_LABELS });
+  });
+}
--- a/apps/control/src/routes/ssh-config.ts
+++ b/apps/control/src/routes/ssh-config.ts
@@ -0,0 +1,262 @@
+import { readFileSync } from 'node:fs';
+import { randomUUID } from 'node:crypto';
+import { fileURLToPath } from 'node:url';
+import { dirname, resolve } from 'node:path';
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Sql } from '../db.js';
+import type { Config } from '../config.js';
+import type { FleetState } from '../services/fleet-state.js';
+import type { DeltaEmitter } from '../index.js';
+import { resolveProviderBaseUrl } from '../services/llama-providers.js';
+import {
+  validateLlamaConfig,
+  computeDiff,
+  readRemoteConfig,
+  applyRemoteConfig,
+  sshExec,
+  type SshTarget,
+  type SshExec,
+  type SshMode,
+} from '../services/ssh-config.js';
+import { runModelPull, validateRepoId } from '../services/model-pull.js';
+
+/**
+ * P9.1: SSH config editor for llama-swap hosts.
+ *
+ * GET   /api/hosts                       — list control_hosts with SSH config status
+ * PATCH /api/hosts/:id                    — replace ssh_host/ssh_user/ssh_key_path/config_path/restart_cmd/ssh_mode
+ * GET   /api/hosts/:id/config             — SSH read the remote config
+ * POST  /api/hosts/:id/config/validate    — validate a candidate config (no host touch)
+ * POST  /api/hosts/:id/config/diff        — diff a candidate vs the live remote config
+ * POST  /api/hosts/:id/config/apply       — validate -> backup -> write -> restart -> health-wait
+ * POST  /api/hosts/:id/pull               — pull a HuggingFace model (non-blocking job)
+ *
+ * `exec` is injectable for tests; production uses the real `sshExec` (spawn ssh).
+ */
+export function registerSshConfigRoutes(
+  app: FastifyInstance,
+  sql: Sql,
+  config: Config,
+  fleet: FleetState,
+  emitter: DeltaEmitter,
+  exec: SshExec = sshExec,
+): void {
+  const schema = loadConfigSchema(config); // null when no schema file could be loaded; validate/apply then answer 500
+
+  app.get('/api/hosts', async (_req: FastifyRequest, reply: FastifyReply) => {
+    const rows = await sql<HostRow[]>`
+      SELECT provider_id, ssh_host, ssh_user, ssh_key_path, config_path, restart_cmd, ssh_mode, os, gpu_label, enabled
+      FROM control_hosts ORDER BY provider_id
+    `;
+    return reply.send({
+      hosts: rows.map((r) => ({
+        providerId: r.provider_id,
+        sshHost: r.ssh_host,
+        sshUser: r.ssh_user,
+        sshKeyPath: r.ssh_key_path,
+        configPath: r.config_path,
+        restartCmd: r.restart_cmd,
+        sshMode: hostMode(r), // report the same normalised mode the config/pull routes act on
+        os: r.os,
+        gpuLabel: r.gpu_label,
+        enabled: r.enabled,
+        sshConfigured: !!(r.ssh_host && r.ssh_user && r.ssh_key_path && r.config_path),
+      })),
+    });
+  });
+
+  app.patch('/api/hosts/:id', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+    const body = (req.body as Record<string, unknown>) ?? {};
+    const textFields = ['sshHost', 'sshUser', 'sshKeyPath', 'configPath', 'restartCmd'] as const;
+    const invalid = textFields.find((key) => body[key] != null && typeof body[key] !== 'string');
+    if (invalid) return reply.status(400).send({ error: `${invalid} must be a string or null` });
+    // Full replace: a field omitted from the body is stored as NULL, and sshMode falls back to 'shell'.
+    const sshHost = (body.sshHost as string) ?? null;
+    const sshUser = (body.sshUser as string) ?? null;
+    const sshKeyPath = (body.sshKeyPath as string) ?? null;
+    const configPath = (body.configPath as string) ?? null;
+    const restartCmd = (body.restartCmd as string) ?? null;
+    const sshMode: SshMode = body.sshMode === 'wrapper' ? 'wrapper' : 'shell';
+
+    const rows = await sql`
+      UPDATE control_hosts
+      SET ssh_host = ${sshHost}, ssh_user = ${sshUser}, ssh_key_path = ${sshKeyPath},
+          config_path = ${configPath}, restart_cmd = ${restartCmd}, ssh_mode = ${sshMode}
+      WHERE provider_id = ${id}
+      RETURNING provider_id
+    `;
+    if (rows.length === 0) return reply.status(404).send({ error: 'host not found' });
+    return reply.send({ ok: true });
+  });
+
+  app.get('/api/hosts/:id/config', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+    const host = await loadHost(sql, id);
+    if (!host) return reply.status(404).send({ error: 'host not found' });
+    const target = sshTargetOf(host);
+    if (!target || !host.config_path) {
+      return reply.status(400).send({ error: 'host has no SSH config configured (set ssh_host/ssh_user/ssh_key_path/config_path first)' });
+    }
+    try {
+      const content = await readRemoteConfig(target, host.config_path, exec, hostMode(host));
+      return reply.send({ configPath: host.config_path, content });
+    } catch (err) {
+      return reply.status(502).send({ error: (err as Error).message }); // remote read failed
+    }
+  });
+
+  app.post('/api/hosts/:id/config/validate', async (req: FastifyRequest, reply: FastifyReply) => {
+    const body = (req.body as Record<string, unknown>) ?? {};
+    const content = body.content as string;
+    if (typeof content !== 'string') {
+      return reply.status(400).send({ error: 'content (string) is required' });
+    }
+    if (!schema) {
+      return reply.status(500).send({ error: 'config schema not available on this host' });
+    }
+    const result = validateLlamaConfig(content, schema); // local check only; :id is not looked up
+    return reply.send({ valid: result.valid, errors: result.errors });
+  });
+
+  app.post('/api/hosts/:id/config/diff', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+    const body = (req.body as Record<string, unknown>) ?? {};
+    const content = body.content as string;
+    if (typeof content !== 'string') {
+      return reply.status(400).send({ error: 'content (string) is required' });
+    }
+    const host = await loadHost(sql, id);
+    if (!host) return reply.status(404).send({ error: 'host not found' });
+    const target = sshTargetOf(host);
+    if (!target || !host.config_path) {
+      return reply.status(400).send({ error: 'host has no SSH config configured' });
+    }
+    try {
+      const current = await readRemoteConfig(target, host.config_path, exec, hostMode(host));
+      return reply.send({ diff: computeDiff(current, content) });
+    } catch (err) {
+      return reply.status(502).send({ error: (err as Error).message }); // remote read failed
+    }
+  });
+
+  app.post('/api/hosts/:id/config/apply', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+    const body = (req.body as Record<string, unknown>) ?? {};
+    const content = body.content as string;
+    const confirm = body.confirm === true; // only a literal `true` counts as confirmation
+    if (typeof content !== 'string') {
+      return reply.status(400).send({ error: 'content (string) is required' });
+    }
+    if (!confirm) {
+      return reply.status(409).send({ error: 'apply requires confirmation', requiresConfirmation: true });
+    }
+    if (!schema) {
+      return reply.status(500).send({ error: 'config schema not available on this host' });
+    }
+    const host = await loadHost(sql, id);
+    if (!host) return reply.status(404).send({ error: 'host not found' });
+    const target = sshTargetOf(host);
+    const mode = hostMode(host);
+    // restart_cmd is only used in shell mode; in wrapper mode the wrapper's
+    // `restart` verb hardcodes the service, so restart_cmd is not required.
+    if (!target || !host.config_path || (mode === 'shell' && !host.restart_cmd)) {
+      return reply.status(400).send({ error: 'host needs ssh_host/ssh_user/ssh_key_path/config_path (+ restart_cmd in shell mode) set first' });
+    }
+    const baseUrl = resolveProviderBaseUrl(id);
+    if (!baseUrl) {
+      return reply.status(400).send({ error: `no base URL in registry for provider ${id}` });
+    }
+
+    const result = await applyRemoteConfig({
+      target,
+      configPath: host.config_path,
+      restartCmd: host.restart_cmd ?? '',
+      newConfig: content,
+      schema,
+      baseUrl,
+      exec,
+      mode,
+    });
+
+    const status = result.ok ? 200 : (result.step === 'validate' ? 400 : 502); // bad candidate -> 400, any later step -> 502
+    return reply.status(status).send(result);
+  });
+
+  // ─── model pull (non-blocking job) ─────────────────────────────────────────
+  app.post('/api/hosts/:id/pull', async (req: FastifyRequest, reply: FastifyReply) => {
+    const { id } = req.params as { id: string };
+    const body = (req.body as Record<string, unknown>) ?? {};
+    const repo = body.repo as string;
+    const modelsDir = typeof body.modelsDir === 'string' ? body.modelsDir : undefined; // non-strings count as missing
+
+    if (typeof repo !== 'string' || !validateRepoId(repo)) {
+      return reply.status(400).send({ error: 'repo must be a valid HuggingFace id (org/name)' });
+    }
+    const host = await loadHost(sql, id);
+    if (!host) return reply.status(404).send({ error: 'host not found' });
+    const target = sshTargetOf(host);
+    if (!target) return reply.status(400).send({ error: 'host has no SSH configured' });
+    const mode = hostMode(host);
+    if (mode === 'shell' && !modelsDir) {
+      return reply.status(400).send({ error: 'shell-mode host requires a modelsDir in the request body' });
+    }
+
+    const jobId = `pull_${Date.now()}_${randomUUID().slice(0, 8)}`;
+    const seq = fleet.hosts.get(id)?.seq ?? 0;
+    // Fire and forget; progress streams over control_job frames. A rejection is logged, never left unhandled.
+    void runModelPull({ jobId, target, repo, mode, modelsDir }, exec, emitter, seq)
+      .catch((err: unknown) => app.log.warn({ err, jobId }, 'model pull: job rejected'));
+    return reply.status(202).send({ status: 'queued', jobId, repo });
+  });
+}
+
+function hostMode({ ssh_mode: rawMode }: HostRow): SshMode {
+  return rawMode !== 'wrapper' ? 'shell' : 'wrapper'; // anything but 'wrapper' (null included) means shell
+}
+
+interface HostRow { // one control_hosts row, with the column names used by the SELECTs in this file
+  provider_id: string;
+  ssh_host: string | null; // ssh_host, ssh_user and ssh_key_path must all be set before sshTargetOf() yields a target
+  ssh_user: string | null;
+  ssh_key_path: string | null;
+  config_path: string | null; // required by the config read/diff/apply routes
+  restart_cmd: string | null; // required by config apply in shell mode only
+  ssh_mode: string | null; // hostMode() treats every value other than 'wrapper' as 'shell'
+  os: string | null;
+  gpu_label: string | null;
+  enabled: boolean;
+}
+
+async function loadHost(sql: Sql, id: string): Promise<HostRow | null> { // null when no row matches
+  const [match] = await sql<HostRow[]>`
+    SELECT provider_id, ssh_host, ssh_user, ssh_key_path, config_path, restart_cmd, ssh_mode, os, gpu_label, enabled
+    FROM control_hosts WHERE provider_id = ${id}
+  `;
+  return match ?? null;
+}
+
+function sshTargetOf(host: HostRow): SshTarget | null {
+  const { ssh_host: address, ssh_user: user, ssh_key_path: keyPath } = host; // all three must be non-empty
+  return address && user && keyPath ? { host: address, user, keyPath } : null;
+}
+
+/**
+ * Load the llama-swap config JSON schema. LLAMA_CONFIG_SCHEMA_PATH is tried
+ * first when set; the bundled copy is the fallback. Read/parse errors are
+ * swallowed: the result is null when no candidate yields parseable JSON.
+ */
+function loadConfigSchema(config: Config): object | null {
+  // dist/routes/ssh-config.js -> dist/data/config-schema.json
+  const bundled = resolve(dirname(fileURLToPath(import.meta.url)), '../data/config-schema.json');
+  const preferred = config.LLAMA_CONFIG_SCHEMA_PATH ?? bundled;
+  const candidates = preferred === bundled ? [bundled] : [preferred, bundled];
+  for (const candidate of candidates) {
+    try {
+      return JSON.parse(readFileSync(candidate, 'utf8'));
+    } catch {
+      // unreadable or not JSON: try the next candidate, if any
+    }
+  }
+  return null;
+}
--- a/apps/control/src/routes/ws.ts
+++ b/apps/control/src/routes/ws.ts
@@ -0,0 +1,109 @@
+import type { FastifyInstance } from 'fastify';
+import WebSocket from 'ws';
+import type { FleetState, HostState } from '../services/fleet-state.js';
+import type { DeltaEmitter } from '../index.js';
+import type { LogRelay } from '../services/log-relay.js';
+
+/**
+ * WS endpoint: /api/ws/control
+ * On join: send a `control_fleet` snapshot carrying current fleet state + seqs.
+ * B6: After snapshot, replay in-memory log tail for late joiners.
+ * On delta: forward seq-stamped deltas to subscribers; a `ping` frame goes out every 30 s.
+ * On close/error: stop the heartbeat and unsubscribe, exactly once per socket.
+ *
+ * Client rule: buffer pre-snapshot deltas, replay after snapshot applying only
+ * seq > snapshot_seq. On service restart, rebuild fleet state from DB before
+ * serving snapshots.
+ */
+export function registerControlWebSocket(
+  app: FastifyInstance,
+  fleet: FleetState,
+  emitter: DeltaEmitter,
+  logRelay: LogRelay | null = null,
+): void {
+  app.get('/api/ws/control', { websocket: true }, (socket, _req) => {
+    // No awaits below: snapshot, tail replay and subscribe happen in one synchronous pass.
+    const snapshot = buildSnapshot(fleet);
+
+    // B4 fix: send snapshot at top level matching ControlFleetFrame Zod schema.
+    const maxSeq = snapshot.hosts.reduce((max, h) => Math.max(max, h.seq), 0); // highest per-host seq, 0 for an empty fleet
+    socket.send(JSON.stringify({
+      type: 'control_fleet' as const,
+      seq: maxSeq,
+      hosts: snapshot.hosts,
+    }));
+
+    // B6: Replay in-memory log tail for late joiners.
+    if (logRelay && socket.readyState === WebSocket.OPEN) {
+      const tails = logRelay.getAllTails();
+      for (const entry of tails) {
+        socket.send(JSON.stringify({
+          type: 'control_log' as const,
+          seq: maxSeq, // tail lines don't carry per-host seq; use snapshot seq
+          providerId: entry.providerId,
+          source: entry.source,
+          line: entry.line,
+        }));
+      }
+    }
+
+    // B3 fix: subscribe to delta emitter so WS clients receive live updates.
+    const unsub = emitter.subscribe((delta: unknown) => {
+      if (socket.readyState === WebSocket.OPEN) {
+        socket.send(JSON.stringify(delta));
+      }
+    });
+
+    const heartbeat = setInterval(() => {
+      if (socket.readyState !== WebSocket.OPEN) {
+        clearInterval(heartbeat); // socket is gone: stop pinging even if no close event arrived
+        return;
+      }
+      socket.send(JSON.stringify({ type: 'ping' as const }));
+    }, 30_000);
+
+    let tornDown = false;
+    const teardown = (): void => {
+      if (tornDown) return; // 'error' is normally followed by 'close': release resources only once
+      tornDown = true;
+      clearInterval(heartbeat);
+      unsub();
+    };
+    socket.on('close', teardown);
+    socket.on('error', teardown);
+  });
+}
+
+/**
+ * Convert the in-memory fleet state into the plain-JSON snapshot shape, with dates as ISO strings.
+ * On restart, this is rebuilt from DB before serving snapshots.
+ */
+function buildSnapshot(fleet: FleetState): { hosts: Array<{
+  providerId: string;
+  liveness: 'connected' | 'reconnecting' | 'down';
+  lastSeenAt: string | null;
+  seq: number;
+  models: Array<{
+    model: string;
+    state: string;
+    ts: string;
+    ttlDeadline: string | null;
+    inflight: number;
+  }>;
+}> } {
+  return {
+    hosts: Array.from(fleet.hosts.values(), (host) => ({
+      providerId: host.providerId,
+      liveness: host.liveness,
+      lastSeenAt: host.lastSeenAt ? host.lastSeenAt.toISOString() : null,
+      seq: host.seq,
+      models: Array.from(host.models.values(), ({ model, state, ts, ttlDeadline, inflight }) => ({
+        model,
+        state,
+        ts: ts.toISOString(),
+        ttlDeadline: ttlDeadline ? ttlDeadline.toISOString() : null,
+        inflight,
+      })),
+    })),
+  };
+}
--- a/apps/control/src/schema.sql
+++ b/apps/control/src/schema.sql
@@ -0,0 +1,291 @@
+-- P1: BooControl schema -- read-only fleet cockpit tables.
+-- Applied on startup by apps/control/src/db.ts:applySchema().
+-- Lives in the same 'boochat' database as BooChat's tables.
+
+-- Host registry: one row per llama-swap instance, keyed by provider id (see `enabled`).
+CREATE TABLE IF NOT EXISTS control_hosts (
+  provider_id TEXT PRIMARY KEY,
+  ssh_host TEXT,
+  ssh_user TEXT,
+  ssh_key_path TEXT,
+  config_path TEXT,
+  restart_cmd TEXT,  -- per the ssh_mode note below, raw commands apply to 'shell' mode
+  os TEXT,
+  gpu_label TEXT,
+  enabled BOOLEAN NOT NULL DEFAULT true
+);
+
+-- P9 verb-mode: per-host SSH command mode. 'shell' = raw commands (default,
+-- backward compatible); 'wrapper' = fixed verbs for a forced-command-locked key.
+ALTER TABLE control_hosts ADD COLUMN IF NOT EXISTS ssh_mode TEXT NOT NULL DEFAULT 'shell';
+
+-- Seed display metadata only; the SSH/config columns are left NULL here (filled in from P9 on).
+INSERT INTO control_hosts (provider_id, os, gpu_label)
+VALUES
+  ('sam-desktop', 'Windows', 'RTX 5090 32GB'),
+  ('embedding', 'Linux', 'P104-100 8GB')
+ON CONFLICT (provider_id) DO NOTHING;  -- re-applying the schema never overwrites an existing row
+
+-- Request log: one row per entry ingested from the llama-swap /api/metrics ring.
+CREATE TABLE IF NOT EXISTS control_requests (
+  id BIGSERIAL PRIMARY KEY,
+  provider_id TEXT NOT NULL,
+  swap_entry_id INT NOT NULL,
+  ts TIMESTAMPTZ NOT NULL,
+  model TEXT,
+  req_path TEXT,
+  status_code INT,
+  duration_ms INT,
+  cache_tokens INT,
+  input_tokens INT,
+  output_tokens INT,
+  prompt_tps REAL,
+  gen_tps REAL,
+  has_capture BOOLEAN NOT NULL DEFAULT false,
+  capture JSONB,
+  UNIQUE (provider_id, swap_entry_id, ts)  -- presumably the ingest dedupe key; confirm against the writer
+);
+
+-- P4: Per-consumer attribution column. Added via idempotent ALTER so existing
+-- DBs pick it up on next restart. See design §7 "Implementation notes" for the
+-- llama-swap ActivityLogEntry discrepancy.
+ALTER TABLE control_requests ADD COLUMN IF NOT EXISTS source TEXT;
+
+CREATE INDEX IF NOT EXISTS idx_control_requests_provider_ts
+  ON control_requests (provider_id, ts DESC);  -- newest-first lookups per host
+
+-- Raw performance samples from llama-swap /api/performance, stored as JSONB per host and timestamp.
+CREATE TABLE IF NOT EXISTS control_perf_samples (
+  provider_id TEXT NOT NULL,
+  ts TIMESTAMPTZ NOT NULL,
+  gpu JSONB,
+  sys JSONB,
+  UNIQUE (provider_id, ts)  -- at most one sample per host per timestamp
+);
+
+CREATE INDEX IF NOT EXISTS idx_control_perf_samples_provider_ts
+  ON control_perf_samples (provider_id, ts DESC);  -- newest-first lookups per host
+
+-- 5-minute rollup aggregates of the perf samples, one row per host and bucket.
+CREATE TABLE IF NOT EXISTS control_perf_rollup_5m (
+  provider_id TEXT NOT NULL,
+  bucket TIMESTAMPTZ NOT NULL,  -- presumably the start of the 5-minute window; confirm against the rollup job
+  gpu_agg JSONB,
+  sys_agg JSONB,
+  UNIQUE (provider_id, bucket)
+);
+
+-- Model state transitions + gap events, one row per (host, model, state, timestamp).
+CREATE TABLE IF NOT EXISTS control_model_events (
+  provider_id TEXT NOT NULL,
+  model TEXT NOT NULL,
+  state TEXT NOT NULL,
+  ts TIMESTAMPTZ NOT NULL,
+  detail JSONB,
+  UNIQUE (provider_id, model, state, ts)  -- an identical event cannot be stored twice
+);
+
+CREATE INDEX IF NOT EXISTS idx_control_model_events_provider_ts
+  ON control_model_events (provider_id, ts DESC);  -- newest-first lookups per host
+
+-- P3: Bench engine tables -- additive schema change.
+
+-- Suite definitions: grid of prompt_tokens x gen_tokens x concurrency x repetitions.
+CREATE TABLE IF NOT EXISTS bench_suites (
+  id TEXT PRIMARY KEY,
+  name TEXT NOT NULL,
+  provider_id TEXT NOT NULL,
+  model TEXT NOT NULL,
+  prompt_tokens INT[] NOT NULL,  -- grid axis (array of values)
+  gen_tokens INT[] NOT NULL,     -- grid axis (array of values)
+  concurrency INT[] NOT NULL,    -- grid axis (array of values)
+  repetitions INT NOT NULL DEFAULT 1,  -- a single count, not an array
+  metadata JSONB,
+  created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
+);
+
+-- Individual bench runs (one per suite execution); every run references its suite.
+CREATE TABLE IF NOT EXISTS bench_runs (
+  id TEXT PRIMARY KEY,
+  suite_id TEXT NOT NULL REFERENCES bench_suites(id),
+  job_type TEXT NOT NULL DEFAULT 'bench',
+  status TEXT NOT NULL DEFAULT 'queued',  -- free-form text; no CHECK constraint limits the values
+  started_at TIMESTAMPTZ,   -- NULL until set by the runner
+  finished_at TIMESTAMPTZ,
+  total_samples INT NOT NULL DEFAULT 0,
+  completed_samples INT NOT NULL DEFAULT 0,
+  concurrent_foreign_requests INT NOT NULL DEFAULT 0,
+  temperature REAL,
+  top_p REAL,
+  aggregate JSONB,
+  regression_flag TEXT,
+  error TEXT,
+  created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
+);
+
+CREATE INDEX IF NOT EXISTS idx_bench_runs_suite_id
+  ON bench_runs (suite_id);
+
+CREATE INDEX IF NOT EXISTS idx_bench_runs_status
+  ON bench_runs (status);
+
+-- Raw per-request samples from a bench run; each row records the grid cell it was measured at.
+CREATE TABLE IF NOT EXISTS bench_samples (
+  id BIGSERIAL PRIMARY KEY,
+  run_id TEXT NOT NULL REFERENCES bench_runs(id),
+  prompt_tokens INT NOT NULL,
+  gen_tokens INT NOT NULL,
+  concurrency INT NOT NULL,
+  repetition INT NOT NULL,
+  ttft_ms REAL,   -- measurement columns are nullable
+  total_ms REAL,
+  prompt_tps REAL,
+  gen_tps REAL,
+  cache_n INT,
+  error TEXT
+);
+
+CREATE INDEX IF NOT EXISTS idx_bench_samples_run_id
+  ON bench_samples (run_id);
+
+-- P3: Baseline aggregates, at most one row per (provider_id, model).
+-- First completed run seeds the baseline; subsequent runs compare against it.
+CREATE TABLE IF NOT EXISTS bench_baselines (
+  provider_id TEXT NOT NULL,
+  model TEXT NOT NULL,
+  aggregate JSONB NOT NULL,
+  run_id TEXT NOT NULL,  -- NOTE(review): no foreign key to bench_runs(id), unlike bench_samples.run_id
+  created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
+  PRIMARY KEY (provider_id, model)
+);
+
+-- P5: Quality evals + sandbox tables.
+
+-- Eval suite definitions: kind (chat|code), tasks JSONB, judge_model; versioned by (name, version).
+CREATE TABLE IF NOT EXISTS eval_suites (
+  id TEXT PRIMARY KEY,
+  name TEXT NOT NULL,
+  kind TEXT NOT NULL,  -- free-form text; no CHECK constraint enforces chat|code
+  version INT NOT NULL DEFAULT 1,
+  tasks JSONB NOT NULL,
+  judge_model TEXT,
+  judge_model_version TEXT,
+  metadata JSONB,
+  UNIQUE (name, version),  -- a suite name can recur only under a different version
+  created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
+);
+
+CREATE INDEX IF NOT EXISTS idx_eval_suites_kind
+  ON eval_suites (kind);
+
+-- Individual eval runs (one per suite execution against a model); every run references its suite.
+CREATE TABLE IF NOT EXISTS eval_runs (
+  id TEXT PRIMARY KEY,
+  suite_id TEXT NOT NULL REFERENCES eval_suites(id),
+  job_type TEXT NOT NULL DEFAULT 'eval',
+  provider_id TEXT NOT NULL,
+  model TEXT NOT NULL,
+  quant TEXT,
+  status TEXT NOT NULL DEFAULT 'queued',  -- free-form text; no CHECK constraint limits the values
+  judge_model TEXT,          -- also present on eval_suites; presumably copied per run, confirm against the runner
+  judge_model_version TEXT,
+  started_at TIMESTAMPTZ,
+  finished_at TIMESTAMPTZ,
+  total_tasks INT NOT NULL DEFAULT 0,
+  completed_tasks INT NOT NULL DEFAULT 0,
+  aggregate JSONB,
+  error TEXT,
+  created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
+);
+
+CREATE INDEX IF NOT EXISTS idx_eval_runs_suite_id
+  ON eval_runs (suite_id);
+
+CREATE INDEX IF NOT EXISTS idx_eval_runs_status
+  ON eval_runs (status);
+
+CREATE INDEX IF NOT EXISTS idx_eval_runs_provider_model
+  ON eval_runs (provider_id, model);
+
+-- Per-task eval results: score, judge rationale, sandbox exit info. Each row references its run.
+CREATE TABLE IF NOT EXISTS eval_results (
+  id BIGSERIAL PRIMARY KEY,
+  run_id TEXT NOT NULL REFERENCES eval_runs(id),
+  task_id TEXT NOT NULL,
+  task_index INT NOT NULL,
+  score REAL,      -- nullable, as are all result columns below
+  max_score REAL,
+  rationale TEXT,
+  sandbox_exit_code INT,
+  sandbox_stderr TEXT,
+  sandbox_stdout TEXT,
+  execution_ms INT,
+  error TEXT
+);
+
+CREATE INDEX IF NOT EXISTS idx_eval_results_run_id
+  ON eval_results (run_id);
+
+-- P6.2: Generated fleet reports (markdown digest + JSONB stats), one row per report.
+CREATE TABLE IF NOT EXISTS control_reports (
+  id TEXT PRIMARY KEY,
+  kind TEXT NOT NULL DEFAULT 'digest',
+  interval TEXT NOT NULL DEFAULT 'daily',
+  period_start TIMESTAMPTZ NOT NULL,  -- the time window the report covers
+  period_end TIMESTAMPTZ NOT NULL,
+  markdown TEXT NOT NULL,
+  stats JSONB,
+  created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
+);
+
+CREATE INDEX IF NOT EXISTS idx_control_reports_created
+  ON control_reports (created_at DESC);  -- newest-first listing
+
+-- P6.2: Scheduler metadata for the in-process report timer. One row per schedule,
+-- keyed by name; last_run_at drives catch-up-on-boot (same pattern as retention).
+CREATE TABLE IF NOT EXISTS control_schedule_meta (
+  name TEXT PRIMARY KEY,
+  interval TEXT NOT NULL DEFAULT 'daily',
+  enabled BOOLEAN NOT NULL DEFAULT true,
+  last_run_at TIMESTAMPTZ  -- NULL until a run has been recorded
+);
+
+INSERT INTO control_schedule_meta (name, interval, enabled)
+VALUES ('report-digest', 'daily', true)
+ON CONFLICT (name) DO NOTHING;  -- re-applying the schema keeps an existing row's settings
+
+-- P7.1: Routing policies for the auto:* gateway. `virtual_model` names the virtual
+-- model a policy serves (e.g. 'auto:code'); `candidates` is an ordered list of
+-- composite ids ('provider/model'); `fallback` is the last-resort composite id.
+CREATE TABLE IF NOT EXISTS route_policies (
+  id TEXT PRIMARY KEY,
+  name TEXT NOT NULL,
+  virtual_model TEXT NOT NULL,
+  candidates JSONB NOT NULL,
+  fallback TEXT,
+  enabled BOOLEAN NOT NULL DEFAULT true,
+  created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
+  updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),  -- default only; no trigger here refreshes it on UPDATE
+  UNIQUE (virtual_model)  -- at most one policy per virtual model
+);
+
+-- P7.1/P7.4: Per-dispatch log for the gateway. One row per resolved completion
+-- routed through a virtual model, recording the chosen target + outcome.
+CREATE TABLE IF NOT EXISTS route_dispatch_log (
+  id BIGSERIAL PRIMARY KEY,
+  ts TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
+  virtual_model TEXT NOT NULL,  -- plain text; no foreign key to route_policies
+  chosen_provider_id TEXT,  -- the chosen target columns are nullable
+  chosen_model TEXT,
+  candidates_tried JSONB,
+  status TEXT NOT NULL,
+  source TEXT,
+  error TEXT,
+  duration_ms INT
+);
+
+CREATE INDEX IF NOT EXISTS idx_route_dispatch_log_ts
+  ON route_dispatch_log (ts DESC);  -- newest-first across all virtual models
+
+CREATE INDEX IF NOT EXISTS idx_route_dispatch_log_virtual
+  ON route_dispatch_log (virtual_model, ts DESC);  -- newest-first per virtual model
--- a/apps/control/src/services/tests/action-queue.test.ts
+++ b/apps/control/src/services/tests/action-queue.test.ts
@@ -0,0 +1,194 @@
+import { describe, it, expect, beforeEach } from 'vitest';
+import { ActionQueue } from '../action-queue.js';
+import type { ActionQueueDeps, QueuedAction } from '../action-queue.js';
+
+describe('ActionQueue', () => {
+  let queue: ActionQueue;
+  let deps: ActionQueueDeps;
+
+  beforeEach(() => { // fresh queue plus permissive default deps, registered as 'host1', for every test
+    queue = new ActionQueue();
+    deps = {
+      baseUrl: 'http://test-host:8401',
+      isLivenessUp: () => true, // host reported as up
+      isInflightRequests: () => 0, // nothing in flight
+      log: { // no-op logger stub
+        error: () => {},
+        warn: () => {},
+        info: () => {},
+        debug: () => {},
+        trace: () => {},
+        fatal: () => {},
+        child: () => deps.log, // child loggers reuse the same stub
+      } as any,
+    };
+    queue.registerHost('host1', deps);
+  });
+
+  describe('submit', () => { // synchronous accept/reject decisions made by submit()
+    it('rejects submission when host is down', () => {
+      const downQueue = new ActionQueue();
+      const downDeps: ActionQueueDeps = {
+        ...deps,
+        isLivenessUp: () => false, // the only difference from the default deps
+      };
+      downQueue.registerHost('down-host', downDeps);
+
+      const result = downQueue.submit({
+        actionId: 'a1',
+        type: 'warm',
+        providerId: 'down-host',
+        confirmed: false,
+        createdAt: new Date(),
+      });
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) { // narrows the result before reading its error fields
+        expect(result.error).toBe('host offline');
+      }
+    });
+
+    it('rejects submission when queue is full (depth 4)', () => {
+      // Fill the queue to capacity: four submissions must all be accepted
+      for (let i = 0; i < 4; i++) {
+        const result = queue.submit({
+          actionId: `fill-${i}`,
+          type: 'warm',
+          providerId: 'host1',
+          model: 'model1',
+          confirmed: false,
+          createdAt: new Date(),
+        });
+        expect(result.ok).toBe(true);
+      }
+
+      // 5th submission should be rejected, and the rejection lists the 4 pending entries
+      const result = queue.submit({
+        actionId: 'overflow',
+        type: 'warm',
+        providerId: 'host1',
+        model: 'model1',
+        confirmed: false,
+        createdAt: new Date(),
+      });
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error).toContain('queue full');
+        expect(result.pending).toHaveLength(4);
+      }
+    });
+
+    it('returns 409 with requiresConfirmation for unload during inflight', () => { // NOTE(review): no HTTP status is asserted here, only the result object
+      const inflightDeps: ActionQueueDeps = {
+        ...deps,
+        isInflightRequests: () => 5, // host reports requests in flight
+      };
+      const inflightQueue = new ActionQueue();
+      inflightQueue.registerHost('busy-host', inflightDeps);
+
+      const result = inflightQueue.submit({
+        actionId: 'unload-1',
+        type: 'unload',
+        providerId: 'busy-host',
+        confirmed: false,
+        createdAt: new Date(),
+      });
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error).toBe('bench in progress'); // NOTE(review): wording mentions a bench although the stub only reports inflight requests; confirm
+        expect(result.requiresConfirmation).toBe(true);
+      }
+    });
+
+    it('allows confirmed unload during inflight', () => {
+      const inflightDeps: ActionQueueDeps = {
+        ...deps,
+        isInflightRequests: () => 5, // same busy host as the previous test
+      };
+      const inflightQueue = new ActionQueue();
+      inflightQueue.registerHost('busy-host', inflightDeps);
+
+      const result = inflightQueue.submit({
+        actionId: 'unload-confirmed',
+        type: 'unload',
+        providerId: 'busy-host',
+        confirmed: true, // the only difference from the rejected case above
+        createdAt: new Date(),
+      });
+
+      expect(result.ok).toBe(true);
+    });
+
+    it('accepts a warm action when queue has capacity', () => { // happy path with the default deps
+      const result = queue.submit({
+        actionId: 'warm-1',
+        type: 'warm',
+        providerId: 'host1',
+        model: 'llama3',
+        confirmed: false,
+        createdAt: new Date(),
+      });
+
+      expect(result.ok).toBe(true);
+    });
+  });
+
+  describe('getState', () => { // read-only view of a host's queue
+    it('returns null for unknown host', () => {
+      expect(queue.getState('unknown')).toBeNull(); // 'unknown' was never registered
+    });
+
+    it('returns state with entries after submission', () => {
+      queue.submit({
+        actionId: 'test-1',
+        type: 'warm',
+        providerId: 'host1',
+        model: 'llama3',
+        confirmed: false,
+        createdAt: new Date(),
+      });
+
+      const state = queue.getState('host1');
+      expect(state).not.toBeNull();
+      expect(state!.queue.length).toBe(1);
+      expect(state!.queue[0].action.actionId).toBe('test-1');
+      // Either status is accepted: processNext starts asynchronously, so the entry may already be 'running'
+      expect(['pending', 'running']).toContain(state!.queue[0].status);
+    });
+  });
+
+  describe('processNext (stale action skip)', () => {
+    it('skips an action when host goes down during processing', async () => { // NOTE(review): the skip itself is never asserted, see below
+      let livenessUp = true;
+      const dynamicDeps: ActionQueueDeps = {
+        ...deps,
+        isLivenessUp: () => livenessUp, // reads the flag on every call so the test can flip it
+      };
+      const dynamicQueue = new ActionQueue();
+      dynamicQueue.registerHost('flaky-host', dynamicDeps);
+
+      // Submit an action while the host is still up
+      dynamicQueue.submit({
+        actionId: 'stale-1',
+        type: 'warm',
+        providerId: 'flaky-host',
+        model: 'llama3',
+        confirmed: false,
+        createdAt: new Date(),
+      });
+
+      // Turn host down before processing
+      livenessUp = false;
+
+      // Nothing below awaits the queue processor, so this test only checks that
+      // the entry is still present in the queue state right after submit.
+      // NOTE(review): the test is async but contains no await; the skip is not observed.
+      const state = dynamicQueue.getState('flaky-host');
+      expect(state).not.toBeNull();
+      expect(state!.queue.length).toBe(1);
+      // No status assertion: a 'skipped' status would have to be set by processNext, which is not awaited here
+    });
+  });
+});
--- a/apps/control/src/services/tests/bench-engine.test.ts
+++ b/apps/control/src/services/tests/bench-engine.test.ts
@@ -0,0 +1,300 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { parseLlamaTimings, computeAggregates, runSingleBenchRequest } from '../../index.js';
+import { computeRegressionFlag } from '../bench-engine.js';
+import { createFleetState, ensureHostState } from '../fleet-state.js';
+import { createDeltaEmitter } from '../../index.js';
+import type { Sql } from '../../db.js';
+import type { Config } from '../../config.js';
+import type { BenchSuite } from '../bench-engine.js';
+
+// ─── parseLlamaTimings tests ────────────────────────────────────────────────
+
+// Covers SSE-chunk parsing: timing extraction plus the inputs that must yield null.
+describe('parseLlamaTimings', () => {
+  it('parses timings from a standard llama.cpp chunk', () => {
+    const sse = 'data: {"choices":[],"timings":{"prompt_per_second":150,"predicted_per_second":80,"cache_n":50}}';
+    const parsed = parseLlamaTimings(sse);
+    expect(parsed).not.toBeNull();
+    const { promptPerSecond, predictedPerSecond, cacheN } = parsed!;
+    expect(promptPerSecond).toBe(150);
+    expect(predictedPerSecond).toBe(80);
+    expect(cacheN).toBe(50);
+  });
+
+  it('parses timings without data: prefix', () => {
+    const bareJson = '{"timings":{"prompt_per_second":200,"predicted_per_second":100,"cache_n":0}}';
+    const parsed = parseLlamaTimings(bareJson);
+    expect(parsed).not.toBeNull();
+    expect(parsed!.promptPerSecond).toBe(200);
+  });
+
+  it('returns null for [DONE] chunk', () => {
+    const terminator = 'data: [DONE]';
+    expect(parseLlamaTimings(terminator)).toBeNull();
+  });
+
+  it('returns null for chunk without timings', () => {
+    const contentOnly = 'data: {"choices":[{"delta":{"content":"hello"}}]}';
+    expect(parseLlamaTimings(contentOnly)).toBeNull();
+  });
+
+  it('returns null for malformed JSON', () => {
+    const garbage = 'data: not-json';
+    expect(parseLlamaTimings(garbage)).toBeNull();
+  });
+});
+
+// ─── computeAggregates tests ────────────────────────────────────────────────
+
+describe('computeAggregates', () => {
+  // Builds one loosely-typed sample carrying just the four fields these tests set.
+  const sampleOf = (
+    ttftMs: number | null,
+    genTps: number | null,
+    promptTps: number | null,
+    error: string | null = null,
+  ) => ({ ttftMs, genTps, promptTps, error }) as any;
+
+  // n samples on a linear ramp: ttft 100,200,... / gen 50,100,... / prompt 100,200,...
+  const ramp = (n: number) =>
+    Array.from({ length: n }, (_, i) => sampleOf((i + 1) * 100, (i + 1) * 50, (i + 1) * 100));
+
+  it('returns nulls for empty samples', () => {
+    const { totalSamples, avgTtftMs, avgGenTps } = computeAggregates([]);
+    expect(totalSamples).toBe(0);
+    expect(avgTtftMs).toBeNull();
+    expect(avgGenTps).toBeNull();
+  });
+
+  it('computes averages correctly', () => {
+    const agg = computeAggregates(ramp(3));
+    expect(agg.avgTtftMs).toBe(200);
+    expect(agg.avgGenTps).toBe(100);
+    expect(agg.avgPromptTps).toBe(200);
+    expect(agg.totalSamples).toBe(3);
+    expect(agg.errorSamples).toBe(0);
+  });
+
+  it('computes median correctly for odd count', () => {
+    const agg = computeAggregates(ramp(3));
+    expect(agg.medianTtftMs).toBe(200);
+    expect(agg.medianGenTps).toBe(100);
+  });
+
+  it('computes median correctly for even count', () => {
+    // Four samples: the expected medians are the midpoints of the two middle values.
+    const agg = computeAggregates(ramp(4));
+    expect(agg.medianTtftMs).toBe(250);
+    expect(agg.medianGenTps).toBe(125);
+  });
+
+  it('computes p95 TTFT', () => {
+    // TTFT values 10, 20, ..., 200 with constant throughput figures.
+    const twenty: any[] = [];
+    for (let step = 1; step <= 20; step++) {
+      twenty.push(sampleOf(step * 10, 50, 100));
+    }
+    const agg = computeAggregates(twenty);
+    // Precision -1 accepts any result less than 5 away from 190.
+    expect(agg.p95TtftMs).toBeCloseTo(190, -1);
+  });
+
+  it('filters out null values', () => {
+    // One clean sample plus one failed request whose metrics are all null.
+    const good = sampleOf(100, 50, 100);
+    const failed = sampleOf(null, null, null, 'timeout');
+    const agg = computeAggregates([good, failed]);
+    expect(agg.avgTtftMs).toBe(100);
+    expect(agg.errorSamples).toBe(1);
+  });
+});
+
+// ─── bench runner pipeline test (mock fetch + real functions) ────────────────
+
+// Exercises runSingleBenchRequest against a stubbed global fetch.
+describe('bench runner pipeline', () => {
+  // NOTE(review): mockSql / executedQueries are prepared for every test, but no
+  // test in this suite reads them yet.
+  let mockSql: Sql;
+  let executedQueries: Array<{ query: string; values: unknown[] }>;
+
+  beforeEach(() => {
+    executedQueries = [];
+    // Tagged-template stand-in for the sql client: records the interpolated query
+    // text and always resolves to an empty row set.
+    mockSql = Object.assign(
+      (strings: TemplateStringsArray, ...values: unknown[]) => {
+        const query = strings.reduce((acc: string, s: string, i: number) => acc + s + (values[i] ?? ''), '');
+        executedQueries.push({ query, values });
+        return Promise.resolve([]);
+      },
+      {
+        json: (v: unknown) => v,
+        unsafe: async (q: string) => { executedQueries.push({ query: q, values: [] }); return []; },
+      },
+    ) as unknown as Sql;
+
+    // A function returned from beforeEach runs as per-test teardown, so the fetch
+    // spy is removed even when an assertion throws before the end of a test.
+    return () => { vi.restoreAllMocks(); };
+  });
+
+  it('runSingleBenchRequest captures TTFT and timings on successful stream', async () => {
+    // Two content deltas, one timings frame, then the stream terminator.
+    const fakeStream = createFakeStreamResponse([
+      'data: {"choices":[{"delta":{"content":"H"}}]}',
+      'data: {"choices":[{"delta":{"content":"ello"}}]}',
+      'data: {"choices":[],"timings":{"prompt_per_second":150,"predicted_per_second":80,"cache_n":10}}',
+      'data: [DONE]',
+    ]);
+
+    vi.spyOn(global, 'fetch').mockResolvedValueOnce(fakeStream);
+
+    const sample = await runSingleBenchRequest(
+      'http://localhost:8401',
+      'test-model',
+      10,
+      20,
+      0,
+      0.7,
+      0.9,
+    );
+
+    expect(sample.error).toBeNull();
+    expect(sample.ttftMs).toBeGreaterThanOrEqual(0);
+    expect(sample.ttftMs).toBeLessThan(5000);
+    expect(sample.totalMs).toBeGreaterThanOrEqual(0);
+    expect(sample.promptTps).toBe(150);
+    expect(sample.genTps).toBe(80);
+    expect(sample.cacheN).toBe(10);
+    expect(sample.promptTokens).toBe(10);
+    expect(sample.genTokens).toBe(20);
+    expect(sample.repetition).toBe(0);
+  });
+
+  it('runSingleBenchRequest captures error on HTTP failure', async () => {
+    // Minimal non-OK response; only the members set here are provided.
+    vi.spyOn(global, 'fetch').mockResolvedValueOnce({
+      ok: false,
+      status: 500,
+      text: async () => 'Internal Server Error',
+    } as Response);
+
+    const sample = await runSingleBenchRequest(
+      'http://localhost:8401',
+      'test-model',
+      10,
+      20,
+      0,
+    );
+
+    expect(sample.error).toContain('500');
+    expect(sample.ttftMs).toBeNull();
+  });
+
+  it('runSingleBenchRequest captures error on fetch exception', async () => {
+    vi.spyOn(global, 'fetch').mockRejectedValueOnce(new Error('ECONNREFUSED'));
+
+    const sample = await runSingleBenchRequest(
+      'http://localhost:8401',
+      'test-model',
+      10,
+      20,
+      0,
+    );
+
+    expect(sample.error).toContain('ECONNREFUSED');
+  });
+});
+
+// ─── helper: create a fake streaming Response ────────────────────────────────
+
+// Builds a 200 text/event-stream Response that emits each entry of `lines` as
+// its own chunk (followed by a blank line) and closes once all are sent.
+function createFakeStreamResponse(lines: string[]): Response {
+  const utf8 = new TextEncoder();
+  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
+  let cursor = 0;
+
+  const body = new ReadableStream({
+    async pull(controller) {
+      if (cursor < lines.length) {
+        const frame = `${lines[cursor]!}\n\n`;
+        controller.enqueue(utf8.encode(frame));
+        cursor += 1;
+        // Brief pause after each chunk to simulate network latency for TTFT.
+        await pause(5);
+      } else {
+        controller.close();
+      }
+    },
+  });
+  const headers = { 'Content-Type': 'text/event-stream' };
+  return new Response(body, { status: 200, headers });
+}
+
+// ─── computeRegressionFlag tests (A1) ────────────────────────────────────────
+
+describe('computeRegressionFlag', () => {
+  // Aggregates for a run consisting of one error-free sample.
+  const singleRun = (ttftMs: number, genTps: number, promptTps: number) =>
+    computeAggregates([{ ttftMs, genTps, promptTps, error: null } as any]);
+
+  // Serialized one-sample baseline; only the generation throughput varies per test.
+  const storedBaseline = (avgGenTps: number) =>
+    JSON.stringify({ avgGenTps, avgTtftMs: 100, totalSamples: 1 });
+
+  it('returns baseline for first run (no baseline)', () => {
+    const flag = computeRegressionFlag(singleRun(100, 80, 150), undefined);
+    expect(flag).toBe('baseline');
+  });
+
+  it('returns regression when gen tok/s drops below -10%', () => {
+    // 70 tok/s measured against 100 tok/s stored.
+    const flag = computeRegressionFlag(singleRun(200, 70, 100), storedBaseline(100));
+    expect(flag).toBe('regression');
+  });
+
+  it('returns improvement when gen tok/s rises above +5%', () => {
+    // 120 tok/s measured against 100 tok/s stored.
+    const flag = computeRegressionFlag(singleRun(80, 120, 200), storedBaseline(100));
+    expect(flag).toBe('improvement');
+  });
+
+  it('returns baseline when within threshold', () => {
+    // 98 tok/s measured against 100 tok/s stored.
+    const flag = computeRegressionFlag(singleRun(100, 98, 150), storedBaseline(100));
+    expect(flag).toBe('baseline');
+  });
+
+  it('returns null for divide-by-zero (N5: baseline avgGenTps is 0)', () => {
+    const flag = computeRegressionFlag(singleRun(100, 50, 100), storedBaseline(0));
+    expect(flag).toBeNull();
+  });
+
+  it('returns null for null current avgGenTps', () => {
+    // An empty run has no generation throughput to compare.
+    const emptyRun = computeAggregates([]);
+    const baselineJson = JSON.stringify({ avgGenTps: 100 });
+    expect(computeRegressionFlag(emptyRun, baselineJson)).toBeNull();
+  });
+
+  it('returns null for malformed baseline JSON', () => {
+    const flag = computeRegressionFlag(singleRun(100, 80, 150), 'not-json');
+    expect(flag).toBeNull();
+  });
+});
--- a/apps/control/src/services/tests/capture-fetch.test.ts
+++ b/apps/control/src/services/tests/capture-fetch.test.ts
@@ -0,0 +1,60 @@
+import { describe, it, expect } from 'vitest';
+import { parseCapture } from '../capture-fetch.js';
+
+describe('parseCapture', () => {
+  // Encodes text as base64, the form in which these tests supply capture bodies.
+  const b64 = (text: string) => Buffer.from(text).toString('base64');
+  const CAPTURED_AT = '2024-01-01T00:00:00Z';
+
+  it('trims response body when total exceeds 256KB cap', () => {
+    // 100,000 request bytes plus 300,000 response bytes exceed the 256 KiB cap.
+    const capture = parseCapture({
+      request_headers: { 'Content-Type': 'application/json' },
+      response_headers: {},
+      request_body: b64('x'.repeat(100_000)),
+      response_body: b64('y'.repeat(300_000)),
+      timestamp: CAPTURED_AT,
+      model: 'test-model',
+      duration_ms: 100,
+    }, 'host1', 1);
+
+    expect(capture.responseBody).toContain('[truncated: capture exceeds 256KB cap]');
+    // 100 bytes of slack on top of the cap leave room for the truncation marker.
+    const combined = capture.requestBody + capture.responseBody;
+    expect(Buffer.byteLength(combined)).toBeLessThanOrEqual(256 * 1024 + 100);
+  });
+
+  it('does not trim when under cap', () => {
+    const requestText = 'small request';
+    const responseText = 'small response';
+    const capture = parseCapture({
+      request_headers: {},
+      response_headers: {},
+      request_body: b64(requestText),
+      response_body: b64(responseText),
+      timestamp: CAPTURED_AT,
+      model: 'test-model',
+      duration_ms: 50,
+    }, 'host1', 2);
+
+    expect(capture.requestBody).toBe(requestText);
+    expect(capture.responseBody).toBe(responseText);
+    expect(capture.responseBody).not.toContain('[truncated');
+  });
+
+  it('handles missing base64 bodies gracefully', () => {
+    // Only a timestamp is supplied; both bodies are expected to come back empty.
+    const capture = parseCapture({ timestamp: CAPTURED_AT }, 'host1', 3);
+
+    expect(capture.requestBody).toBe('');
+    expect(capture.responseBody).toBe('');
+  });
+
+  it('decodes base64 (invalid base64 produces binary, not raw string)', () => {
+    // Buffer.from(str, 'base64') does not throw on invalid base64; it decodes
+    // what it can. Per the original note, parseCapture's catch path therefore
+    // rarely triggers.
+    // NOTE(review): both bodies below are valid base64, so the invalid-input case
+    // named in the title is not exercised here.
+    const requestText = 'valid json';
+    const responseText = '{"result": true}';
+    const capture = parseCapture({
+      request_body: b64(requestText),
+      response_body: b64(responseText),
+      timestamp: CAPTURED_AT,
+    }, 'host1', 4);
+
+    expect(capture.requestBody).toBe(requestText);
+    expect(capture.responseBody).toBe(responseText);
+  });
+});
--- a/apps/control/src/services/tests/eval-suites.test.ts
+++ b/apps/control/src/services/tests/eval-suites.test.ts
@@ -0,0 +1,50 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { loadEvalSuitesFromData } from '../../index.js';
+
+// ─── loadEvalSuitesFromData tests ───────────────────────────────────────────
+
+describe('loadEvalSuitesFromData', () => {
+  // Loads all suites and picks one by id (undefined when it is absent).
+  const suiteById = (id: string) => loadEvalSuitesFromData().find((suite) => suite.id === id);
+
+  it('loads suites from data/ YAML files', () => {
+    const loaded = loadEvalSuitesFromData();
+    expect(loaded.length).toBeGreaterThanOrEqual(4);
+
+    const loadedIds = loaded.map(({ id }) => id);
+    for (const expectedId of ['agent-coding', 'chat-quality', 'long-context-retrieval', 'utility-calls']) {
+      expect(loadedIds).toContain(expectedId);
+    }
+  });
+
+  it('loads code suite with correct structure', () => {
+    const codeSuite = suiteById('agent-coding');
+    expect(codeSuite).toBeDefined();
+    expect(codeSuite!.kind).toBe('code');
+    expect(codeSuite!.tasks.length).toBeGreaterThan(0);
+
+    // Only the first task is inspected for the expected fields.
+    const firstTask = codeSuite!.tasks[0] as Record<string, unknown>;
+    for (const field of ['id', 'prompt', 'test_code', 'expected_output']) {
+      expect(firstTask[field]).toBeDefined();
+    }
+    expect(firstTask.language).toBe('typescript');
+  });
+
+  it('loads chat suite with rubric structure', () => {
+    const chatSuite = suiteById('chat-quality');
+    expect(chatSuite).toBeDefined();
+    expect(chatSuite!.kind).toBe('chat');
+
+    const firstTask = chatSuite!.tasks[0] as Record<string, unknown>;
+    const rubric = firstTask.rubric as Record<string, unknown>;
+    expect(rubric).toBeDefined();
+    expect(rubric.max_score).toBeGreaterThan(0);
+  });
+
+  it('handles missing data/ directory gracefully', () => {
+    // NOTE(review): per the original note the loader catches errors and returns
+    // an empty array, but that path would need an fs mock. This only checks
+    // that an array comes back.
+    const result = loadEvalSuitesFromData();
+    expect(Array.isArray(result)).toBe(true);
+  });
+});
--- a/apps/control/src/services/tests/fleet-connector.test.ts
+++ b/apps/control/src/services/tests/fleet-connector.test.ts
@@ -0,0 +1,82 @@
+import { describe, it, expect } from 'vitest';
+import { addJitter, reconnectDecision, DEFAULT_RECONNECT_POLICY } from '../fleet-connector.js';
+
+// addJitter(delay) is expected to stay within [delay, 1.5 * delay].
+describe('addJitter', () => {
+  const BASE_DELAY_MS = 1000;
+
+  it('returns a value >= the input delay', () => {
+    expect(addJitter(BASE_DELAY_MS)).toBeGreaterThanOrEqual(BASE_DELAY_MS);
+  });
+
+  it('returns a value <= 1.5x the input delay', () => {
+    expect(addJitter(BASE_DELAY_MS)).toBeLessThanOrEqual(1500);
+  });
+
+  it('0ms delay stays 0ms', () => {
+    const jittered = addJitter(0);
+    expect(jittered).toBe(0);
+  });
+
+  it('returns different values on repeated calls (stochastic)', () => {
+    // Twenty draws collapsing to a single value would mean no randomness at all.
+    const draws = Array.from({ length: 20 }, () => addJitter(BASE_DELAY_MS));
+    expect(new Set<number>(draws).size).toBeGreaterThan(1);
+  });
+});
+
+describe('reconnectDecision', () => {
+  const defaults = DEFAULT_RECONNECT_POLICY;
+
+  // Asserts a 'reconnect' decision whose delay lies in [floorMs, ceilingMs].
+  const expectReconnectBetween = (
+    decision: ReturnType<typeof reconnectDecision>,
+    floorMs: number,
+    ceilingMs: number,
+  ) => {
+    expect(decision.action).toBe('reconnect');
+    expect(decision.delayMs).toBeGreaterThanOrEqual(floorMs);
+    expect(decision.delayMs).toBeLessThanOrEqual(ceilingMs);
+  };
+
+  it('first failure returns baseMs with jitter', () => {
+    expectReconnectBetween(reconnectDecision(1), defaults.baseMs, defaults.baseMs * 1.5);
+  });
+
+  it('exponential growth: failure 2 returns 2x baseMs with jitter', () => {
+    expectReconnectBetween(reconnectDecision(2), defaults.baseMs * 2, defaults.baseMs * 3);
+  });
+
+  it('exponential growth: failure 3 returns 4x baseMs with jitter', () => {
+    expectReconnectBetween(reconnectDecision(3), defaults.baseMs * 4, defaults.baseMs * 6);
+  });
+
+  it('capped at maxMs with jitter', () => {
+    expectReconnectBetween(reconnectDecision(6), defaults.maxMs, defaults.maxMs * 1.5);
+  });
+
+  it('gives up after maxAttempts', () => {
+    const pastLimit = defaults.maxAttempts + 1;
+    expect(reconnectDecision(pastLimit)).toEqual({ action: 'give-up' });
+  });
+
+  it('custom policy works with jitter', () => {
+    const policy = { baseMs: 500, maxMs: 5000, maxAttempts: 3 };
+    // [failure count, minimum delay, maximum delay] for each allowed attempt.
+    const expectations: Array<[number, number, number]> = [
+      [1, 500, 750],
+      [2, 1000, 1500],
+      [3, 2000, 3000],
+    ];
+    for (const [failures, floorMs, ceilingMs] of expectations) {
+      expectReconnectBetween(reconnectDecision(failures, policy), floorMs, ceilingMs);
+    }
+
+    // One failure past maxAttempts ends the retries.
+    expect(reconnectDecision(4, policy)).toEqual({ action: 'give-up' });
+  });
+});
--- a/apps/control/src/services/tests/fleet-state.test.ts
+++ b/apps/control/src/services/tests/fleet-state.test.ts
@@ -0,0 +1,42 @@
+import { describe, it, expect } from 'vitest';
+import { createFleetState, ensureHostState, stampLastSeen } from '../fleet-state.js';
+
+describe('createFleetState', () => {
+  it('creates an empty fleet', () => {
+    // A freshly created fleet tracks no hosts yet.
+    const { hosts } = createFleetState();
+    expect(hosts.size).toBe(0);
+  });
+});
+
+describe('ensureHostState', () => {
+  const HOST_ID = 'test-host';
+  // Ensures the host on a brand-new, empty fleet and returns its state.
+  const freshHost = () => ensureHostState(createFleetState(), HOST_ID);
+
+  it('creates a new host state if none exists', () => {
+    const host = freshHost();
+    expect(host.providerId).toBe(HOST_ID);
+    expect(host.liveness).toBe('down');
+    expect(host.lastSeenAt).toBeNull();
+    expect(host.seq).toBe(0);
+    expect(host.models.size).toBe(0);
+  });
+
+  it('returns existing host state', () => {
+    // Two lookups on the same fleet must hand back the very same object.
+    const fleet = createFleetState();
+    const first = ensureHostState(fleet, HOST_ID);
+    const second = ensureHostState(fleet, HOST_ID);
+    expect(first).toBe(second);
+  });
+
+  it('seq is 0 on first call', () => {
+    expect(freshHost().seq).toBe(0);
+  });
+
+  it('stamps lastSeenAt on connection', () => {
+    const host = freshHost();
+    expect(host.lastSeenAt).toBeNull();
+    stampLastSeen(host);
+    expect(host.lastSeenAt).not.toBeNull();
+  });
+});
--- a/apps/control/src/services/tests/gateway.test.ts
+++ b/apps/control/src/services/tests/gateway.test.ts
@@ -0,0 +1,92 @@
+import { describe, it, expect } from 'vitest';
+import {
+  isGatewayVirtualModel,
+  parseVirtualModel,
+  orderCandidates,
+  splitComposite,
+} from '../gateway.js';
+import type { ModelScore } from '../routing-scores.js';
+
+// Test factory: a healthy ModelScore with no metrics recorded. Provider and model
+// are derived from the "provider/model" composite id; fields in `partial` win.
+function score(compositeId: string, partial: Partial<ModelScore> = {}): ModelScore {
+  // Split on the first slash only, so a model name may itself contain slashes.
+  const slash = compositeId.indexOf('/');
+  const hasSlash = slash !== -1;
+  const defaults: ModelScore = {
+    compositeId,
+    providerId: hasSlash ? compositeId.slice(0, slash) : compositeId,
+    model: hasSlash ? compositeId.slice(slash + 1) : '',
+    codeScore: null,
+    chatScore: null,
+    evalScore: null,
+    avgGenTps: null,
+    avgLatencyMs: null,
+    sampleCount: 0,
+    healthy: true,
+    badges: [],
+  };
+  return { ...defaults, ...partial };
+}
+
+describe('isGatewayVirtualModel', () => {
+  it('matches auto and auto:* tokens', () => {
+    for (const token of ['auto', 'auto:code', 'auto:fast']) {
+      expect(isGatewayVirtualModel(token)).toBe(true);
+    }
+  });
+  it('does not match ordinary models', () => {
+    // 'autobahn' merely starts with "auto" and must not count as a virtual model.
+    for (const name of ['qwopus-35b', 'autobahn']) {
+      expect(isGatewayVirtualModel(name)).toBe(false);
+    }
+  });
+});
+
+describe('parseVirtualModel', () => {
+  it('strips a gateway provider prefix', () => {
+    const parsed = parseVirtualModel('auto/auto:code');
+    expect(parsed).toBe('auto:code');
+  });
+  it('passes a bare virtual model through', () => {
+    // Without a provider prefix the token comes back unchanged.
+    const bare = 'auto:fast';
+    expect(parseVirtualModel(bare)).toBe(bare);
+  });
+});
+
+describe('splitComposite', () => {
+  it('splits provider/model', () => {
+    const parts = splitComposite('sam-desktop/qwopus-35b');
+    expect(parts).toEqual({ providerId: 'sam-desktop', model: 'qwopus-35b' });
+  });
+  it('returns null for a bare id', () => {
+    // No provider segment in the id, so there is nothing to split.
+    const parts = splitComposite('qwopus-35b');
+    expect(parts).toBeNull();
+  });
+});
+
+describe('orderCandidates', () => {
+  it('orders auto:code by code score among healthy hosts', () => {
+    const pool = [
+      score('a/m1', { codeScore: 0.6 }),
+      score('a/m2', { codeScore: 0.9 }),
+      // Unhealthy despite a mid-range score: must not appear in the ranking.
+      score('a/m3', { codeScore: 0.7, healthy: false }),
+    ];
+    const ranking = orderCandidates('auto:code', null, pool);
+    expect(ranking).toEqual(['a/m2', 'a/m1']);
+  });
+
+  it('orders auto:fast by throughput', () => {
+    const pool = [score('a/slow', { avgGenTps: 10 }), score('a/fast', { avgGenTps: 50 })];
+    const ranking = orderCandidates('auto:fast', null, pool);
+    expect(ranking).toEqual(['a/fast', 'a/slow']);
+  });
+
+  it('honors an explicit policy order and appends the fallback', () => {
+    const pool = ['a/m1', 'a/m2', 'a/fb'].map((id) => score(id));
+    const policy = { candidates: ['a/m2', 'a/m1'], fallback: 'a/fb' };
+    const ranking = orderCandidates('auto:code', policy, pool);
+    expect(ranking).toEqual(['a/m2', 'a/m1', 'a/fb']);
+  });
+
+  it('drops policy candidates whose host is unhealthy', () => {
+    const pool = [score('a/m1', { healthy: false }), score('a/m2', { healthy: true })];
+    const policy = { candidates: ['a/m1', 'a/m2'], fallback: null };
+    const ranking = orderCandidates('auto:code', policy, pool);
+    expect(ranking).toEqual(['a/m2']);
+  });
+
+  it('keeps a never-seen policy candidate (unknown health) for dispatch to try', () => {
+    // 'a/never-seen' has no score entry at all, yet it must stay in the ranking.
+    const pool = [score('a/known', { healthy: true })];
+    const policy = { candidates: ['a/never-seen', 'a/known'], fallback: null };
+    const ranking = orderCandidates('auto:code', policy, pool);
+    expect(ranking).toEqual(['a/never-seen', 'a/known']);
+  });
+});
--- a/apps/control/src/services/tests/jsonb.test.ts
+++ b/apps/control/src/services/tests/jsonb.test.ts
@@ -0,0 +1,60 @@
+import { describe, it, expect } from 'vitest';
+import { jsonbStringArray, jsonbArray, jsonbNumberArray, jsonbObject } from '../jsonb.js';
+
+describe('jsonbStringArray', () => {
+  it('passes through an already-parsed array (porsager behavior)', () => {
+    const parsed = jsonbStringArray(['a', 'b']);
+    expect(parsed).toEqual(['a', 'b']);
+  });
+  it('parses a JSON string array', () => {
+    const parsed = jsonbStringArray('["a","b"]');
+    expect(parsed).toEqual(['a', 'b']);
+  });
+  it('filters non-strings out of a parsed array', () => {
+    const mixed = ['a', 1, null, 'b'];
+    expect(jsonbStringArray(mixed)).toEqual(['a', 'b']);
+  });
+  it('returns [] for null / invalid', () => {
+    // null, unparseable text and a plain object all map to an empty list.
+    const rejected: unknown[] = [null, 'not json', {}];
+    for (const input of rejected) {
+      expect(jsonbStringArray(input)).toEqual([]);
+    }
+  });
+});
+
+describe('jsonbArray', () => {
+  it('passes through an already-parsed array of objects (eval tasks)', () => {
+    const parsed = jsonbArray([{ id: 't1' }]);
+    expect(parsed).toEqual([{ id: 't1' }]);
+  });
+  it('parses a JSON string array', () => {
+    const parsed = jsonbArray('[{"id":"t1"}]');
+    expect(parsed).toEqual([{ id: 't1' }]);
+  });
+  it('returns [] for null / invalid / non-array', () => {
+    // null, unparseable text and a plain object all map to an empty list.
+    const rejected: unknown[] = [null, 'nope', {}];
+    for (const input of rejected) {
+      expect(jsonbArray(input)).toEqual([]);
+    }
+  });
+});
+
+describe('jsonbNumberArray', () => {
+  it('passes through an already-parsed number array (bench token grids)', () => {
+    const parsed = jsonbNumberArray([128, 512]);
+    expect(parsed).toEqual([128, 512]);
+  });
+  it('parses a JSON string array and filters non-numbers', () => {
+    // The string element "x" is dropped; the numbers keep their order.
+    const parsed = jsonbNumberArray('[128,"x",512]');
+    expect(parsed).toEqual([128, 512]);
+  });
+  it('returns [] for null / invalid', () => {
+    const rejected: unknown[] = [null, 'nope'];
+    for (const input of rejected) {
+      expect(jsonbNumberArray(input)).toEqual([]);
+    }
+  });
+});
+
+describe('jsonbObject', () => {
+  it('passes through an already-parsed object', () => {
+    const parsed = jsonbObject({ a: 1 });
+    expect(parsed).toEqual({ a: 1 });
+  });
+  it('parses a JSON string object', () => {
+    const parsed = jsonbObject('{"a":1}');
+    expect(parsed).toEqual({ a: 1 });
+  });
+  it('returns null for arrays, null, and invalid', () => {
+    // Arrays are rejected even though they are objects at runtime.
+    const rejected: unknown[] = [[1, 2], null, 'nope'];
+    for (const input of rejected) {
+      expect(jsonbObject(input)).toBeNull();
+    }
+  });
+});
--- a/apps/control/src/services/tests/judge-runner.test.ts
+++ b/apps/control/src/services/tests/judge-runner.test.ts
@@ -0,0 +1,55 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+// ─── Judge runner tests (mock sql + real functions) ─────────────────────────
+
+describe('judge runner', () => {
+  // Start every test without spies left over from a previous one.
+  beforeEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  // Smoke test: the module loads and exposes runJudgeEval as a function.
+  // NOTE(review): the title does not describe what is asserted here.
+  it('runJudgeError', async () => {
+    const mod = await import('../judge-runner.js');
+    expect(typeof mod.runJudgeEval).toBe('function');
+  });
+
+  // generateResponse is internal to the module, so this goes through the public
+  // runJudgeEval entry point with a provider id that is not configured.
+  it('generateResponse rejects on bad URL', async () => {
+    const { runJudgeEval } = await import('../judge-runner.js');
+
+    // Callable sql stub that resolves to no rows. `tag` is attached through
+    // Object.assign because assigning `mockSql.tag = ...` directly on a vi.fn()
+    // Mock is rejected by the type checker (the Mock type has no such property).
+    const mockSql = Object.assign(vi.fn().mockResolvedValue([]), {
+      tag: vi.fn().mockReturnValue({ SQL: '' }),
+    });
+
+    const mockEmitter = {
+      publish: vi.fn(),
+    };
+
+    const mockLogger = {
+      info: vi.fn(),
+      warn: vi.fn(),
+      error: vi.fn(),
+    };
+
+    const progressHandler = vi.fn();
+
+    // Per the original note, resolveProviderBaseUrl returns null for an unknown
+    // provider, so the run is expected to come back with an error result.
+    const result = await runJudgeEval(
+      {
+        runId: 'test_run',
+        providerId: 'nonexistent-provider',
+        model: 'test-model',
+        quant: null,
+        tasks: [],
+        judgeModel: null,
+      },
+      mockSql as unknown as import('../../db.js').Sql,
+      mockEmitter as unknown as import('../../index.js').DeltaEmitter,
+      0,
+      mockLogger as unknown as import('fastify').FastifyBaseLogger,
+      progressHandler,
+    );
+
+    expect(result.error).toContain('no base URL');
+  });
+});
--- a/apps/control/src/services/tests/liveness.test.ts
+++ b/apps/control/src/services/tests/liveness.test.ts
@@ -0,0 +1,102 @@
+import { describe, it, expect } from 'vitest';
+import type { HostState } from '../fleet-state.js';
+
+type Liveness = 'connected' | 'reconnecting' | 'down';
+
+// Test-local model of the liveness state machine: the next state is determined
+// solely by the event; `current` is accepted but not consulted.
+function transitionLiveness(current: Liveness, event: 'connect' | 'disconnect' | 'reconnect_attempt' | 'reconnect_success'): Liveness {
+  const nextByEvent: Record<typeof event, Liveness> = {
+    connect: 'connected',
+    disconnect: 'down',
+    reconnect_attempt: 'reconnecting',
+    reconnect_success: 'connected',
+  };
+  return nextByEvent[event];
+}
+
+describe('liveness state machine', () => {
+  // Fresh HostState fixture: id 'test', seq 0, no models. The caller picks the
+  // starting liveness and, optionally, a last-seen timestamp.
+  const hostIn = (
+    liveness: HostState['liveness'],
+    lastSeenAt: HostState['lastSeenAt'] = null,
+  ): HostState => ({
+    providerId: 'test',
+    liveness,
+    lastSeenAt,
+    seq: 0,
+    models: new Map(),
+  });
+
+  it('starts as down', () => {
+    const host = hostIn('down');
+    expect(host.liveness).toBe('down');
+  });
+
+  it('connect -> connected', () => {
+    const host = hostIn('down');
+    host.liveness = transitionLiveness(host.liveness, 'connect');
+    expect(host.liveness).toBe('connected');
+  });
+
+  it('connected -> down on disconnect', () => {
+    const host = hostIn('connected', new Date());
+    host.liveness = transitionLiveness(host.liveness, 'disconnect');
+    expect(host.liveness).toBe('down');
+  });
+
+  it('down -> reconnecting on reconnect attempt', () => {
+    const host = hostIn('down');
+    host.liveness = transitionLiveness(host.liveness, 'reconnect_attempt');
+    expect(host.liveness).toBe('reconnecting');
+  });
+
+  it('reconnecting -> connected on reconnect success', () => {
+    const host = hostIn('reconnecting');
+    host.liveness = transitionLiveness(host.liveness, 'reconnect_success');
+    expect(host.liveness).toBe('connected');
+  });
+
+  it('connected -> reconnecting on reconnect attempt', () => {
+    const host = hostIn('connected', new Date());
+    host.liveness = transitionLiveness(host.liveness, 'reconnect_attempt');
+    expect(host.liveness).toBe('reconnecting');
+  });
+
+  it('reconnecting -> down on reconnect failure', () => {
+    // A failed reconnect is modelled with the same 'disconnect' event.
+    const host = hostIn('reconnecting');
+    host.liveness = transitionLiveness(host.liveness, 'disconnect');
+    expect(host.liveness).toBe('down');
+  });
+});
--- a/apps/control/src/services/tests/llama-providers.test.ts
+++ b/apps/control/src/services/tests/llama-providers.test.ts
@@ -0,0 +1,115 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { writeFileSync, unlinkSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { loadLlamaProviders, getLlamaProviders, resolveProviderBaseUrl } from '../llama-providers.js';
+
+function loadFixture( // Despite the name, this WRITES a providers JSON fixture to a random temp file and returns its path.
+  providers: Array<{ id: string; label: string; baseUrl: string; kind?: string }>,
+): string {
+  const file = {
+    defaultProvider: providers[0]!.id, // first entry becomes the default; throws if the array is empty
+    providers: providers.map((p) => ({ ...p, kind: p.kind ?? 'llama-swap' })), // kind falls back to 'llama-swap' when omitted
+  };
+  const path = join(tmpdir(), `llama-providers-test-${Math.random().toString(36).slice(2)}.json`);
+  writeFileSync(path, JSON.stringify(file), 'utf8');
+  return path; // the file is not removed here; callers unlink it themselves
+}
+
+describe('loadLlamaProviders', () => { // Covers loadLlamaProviders(path, legacyUrl): a valid file plus three fallback-to-legacy cases.
+  afterEach(() => {
+    vi.restoreAllMocks(); // restores console spies even when a test fails before its own mockRestore()
+  });
+
+  it('loads a valid providers file', () => {
+    const path = loadFixture([
+      { id: 'sam-desktop', label: 'Sam Desktop', baseUrl: 'http://100.101.41.16:8401' },
+      { id: 'embedding', label: 'Embedding', baseUrl: 'http://100.90.172.55:8411' },
+    ]);
+
+    const result = loadLlamaProviders(path, 'http://legacy.test:8080');
+
+    expect(result.providers).toHaveLength(2); // both file entries are returned, in file order
+    expect(result.providers[0]!.id).toBe('sam-desktop');
+    expect(result.providers[0]!.baseUrl).toBe('http://100.101.41.16:8401');
+    expect(result.providers[1]!.id).toBe('embedding');
+    expect(result.providers[1]!.baseUrl).toBe('http://100.90.172.55:8411');
+
+    unlinkSync(path); // NOTE(review): not reached if an assertion above throws, so the temp file is left behind
+  });
+
+  it('falls back to legacy when file is missing', () => {
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); // mutes console.warn; presumably the loader warns here - not asserted
+
+    const result = loadLlamaProviders('/nonexistent/path.json', 'http://legacy.test:8080');
+
+    expect(result.providers).toHaveLength(1); // expected fallback: one provider 'llama-swap' at the legacy URL
+    expect(result.providers[0]!.id).toBe('llama-swap');
+    expect(result.providers[0]!.baseUrl).toBe('http://legacy.test:8080');
+
+    warnSpy.mockRestore();
+  });
+
+  it('falls back to legacy when path is undefined', () => {
+    const result = loadLlamaProviders(undefined, 'http://legacy.test:8080');
+
+    expect(result.providers).toHaveLength(1);
+    expect(result.providers[0]!.id).toBe('llama-swap');
+    expect(result.providers[0]!.baseUrl).toBe('http://legacy.test:8080');
+  });
+
+  it('falls back to legacy when JSON is invalid', () => {
+    const path = join(tmpdir(), `llama-providers-bad-${Math.random().toString(36).slice(2)}.json`);
+    writeFileSync(path, '{not valid json', 'utf8'); // deliberately unparseable content
+    const errorSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); // mutes console.error; presumably the loader logs the parse error - not asserted
+
+    const result = loadLlamaProviders(path, 'http://legacy.test:8080');
+
+    expect(result.providers).toHaveLength(1);
+    expect(result.providers[0]!.id).toBe('llama-swap'); // baseUrl is not checked in this case
+
+    errorSpy.mockRestore();
+    unlinkSync(path); // NOTE(review): not reached if an assertion above throws
+  });
+});
+
+describe('getLlamaProviders', () => { // Covers reading back what loadLlamaProviders cached, and the never-loaded fallback.
+  it('returns cached result after load', () => {
+    loadLlamaProviders(undefined, 'http://test.example:9999');
+    const cached = getLlamaProviders();
+    expect(cached.providers[0]!.baseUrl).toBe('http://test.example:9999');
+  });
+
+  it('returns legacy fallback when nothing loaded', async () => {
+    // Earlier tests already filled the static import's cache, so re-import a fresh module copy that has never loaded.
+    vi.resetModules();
+    const result = (await import('../llama-providers.js')).getLlamaProviders();
+    expect(result).toBeDefined();
+    expect(result.providers.length).toBeGreaterThanOrEqual(1);
+  });
+});
+
+describe('resolveProviderBaseUrl', () => { // Covers id -> baseUrl lookup against whatever loadLlamaProviders loaded last.
+  it('resolves baseUrl for a known provider', () => {
+    loadLlamaProviders(undefined, 'http://test.example:9999'); // undefined path -> legacy provider 'llama-swap' at this URL
+    expect(resolveProviderBaseUrl('llama-swap')).toBe('http://test.example:9999');
+  });
+
+  it('returns null for unknown provider', () => {
+    loadLlamaProviders(undefined, 'http://test.example:9999');
+    expect(resolveProviderBaseUrl('nonexistent')).toBeNull(); // unknown ids yield null rather than throwing
+  });
+
+  it('resolves correct URLs for both seeded providers', () => {
+    const path = loadFixture([
+      { id: 'sam-desktop', label: 'Sam Desktop', baseUrl: 'http://100.101.41.16:8401' },
+      { id: 'embedding', label: 'Embedding', baseUrl: 'http://100.90.172.55:8411' },
+    ]);
+    loadLlamaProviders(path, 'http://legacy.test:8080');
+
+    expect(resolveProviderBaseUrl('sam-desktop')).toBe('http://100.101.41.16:8401');
+    expect(resolveProviderBaseUrl('embedding')).toBe('http://100.90.172.55:8411');
+
+    unlinkSync(path); // NOTE(review): not reached if an assertion above throws
+  });
+});
--- a/apps/control/src/services/tests/log-relay.test.ts
+++ b/apps/control/src/services/tests/log-relay.test.ts
@@ -0,0 +1,63 @@
+import { describe, it, expect, beforeEach } from 'vitest';
+import { LogRelay } from '../log-relay.js';
+
+describe('LogRelay', () => { // Covers append/getTail/getAllTails/getSources on a fresh LogRelay per test.
+  let relay: LogRelay;
+
+  beforeEach(() => {
+    relay = new LogRelay(); // new instance per test so tails never leak between cases
+  });
+
+  it('appends log lines to per-host tail', () => {
+    relay.append('host1', 'proxy', 'connection established'); // arguments: host id, source, line text
+    relay.append('host1', 'upstream', 'request completed');
+
+    const tail = relay.getTail('host1');
+    expect(tail).toHaveLength(2);
+    expect(tail[0].source).toBe('proxy'); // insertion order is preserved
+    expect(tail[1].source).toBe('upstream');
+  });
+
+  it('trims tail to MAX_LOG_LINES (2000)', () => {
+    for (let i = 0; i < 2500; i++) {
+      relay.append('host1', 'proxy', `line ${i}`);
+    }
+
+    const tail = relay.getTail('host1');
+    expect(tail.length).toBe(2000);
+    expect(tail[0].line).toBe('line 500'); // the oldest 500 of the 2500 lines are dropped
+    expect(tail[tail.length - 1].line).toBe('line 2499');
+  });
+
+  it('returns empty array for unknown host', () => {
+    expect(relay.getTail('unknown')).toEqual([]);
+  });
+
+  it('getAllTails returns lines from all hosts', () => {
+    relay.append('host1', 'proxy', 'line1');
+    relay.append('host2', 'upstream', 'line2');
+
+    const all = relay.getAllTails();
+    expect(all).toHaveLength(2);
+    expect(all.map((l) => l.providerId)).toContain('host1'); // each line carries the host it was appended under as providerId
+    expect(all.map((l) => l.providerId)).toContain('host2');
+  });
+
+  it('getSources returns unique source values', () => {
+    relay.append('host1', 'proxy', 'line1');
+    relay.append('host1', 'upstream', 'line2');
+    relay.append('host2', 'model', 'line3');
+
+    const sources = relay.getSources();
+    expect(sources).toContain('proxy');
+    expect(sources).toContain('upstream');
+    expect(sources).toContain('model');
+    expect(sources.length).toBe(3); // sources are collected across hosts; NOTE(review): no source repeats here, so de-duplication itself is not exercised
+  });
+
+  it('timestamps are set on each line', () => {
+    relay.append('host1', 'proxy', 'test');
+    const tail = relay.getTail('host1');
+    expect(tail[0].ts).toBeInstanceOf(Date);
+  });
+});
--- a/apps/control/src/services/tests/model-pull.test.ts
+++ b/apps/control/src/services/tests/model-pull.test.ts
@@ -0,0 +1,83 @@
+import { describe, it, expect } from 'vitest';
+import { validateRepoId, buildPullCommand, runModelPull } from '../model-pull.js';
+import type { SshExec, ExecResult } from '../ssh-config.js';
+import type { DeltaEmitter } from '../../index.js';
+
+describe('validateRepoId', () => { // Covers the org/name repo id format check used before issuing a pull.
+  it('accepts org/name', () => {
+    expect(validateRepoId('Qwen/Qwen3.5-9B')).toBe(true); // dots and dashes are allowed
+    expect(validateRepoId('lmstudio-community/model.gguf-q4')).toBe(true);
+  });
+  it('rejects traversal, spaces, metacharacters, and bare names', () => {
+    expect(validateRepoId('../etc/passwd')).toBe(false); // path traversal
+    expect(validateRepoId('a/b; rm -rf /')).toBe(false); // shell metacharacters
+    expect(validateRepoId('a b/c')).toBe(false); // whitespace
+    expect(validateRepoId('justname')).toBe(false); // missing org segment
+    expect(validateRepoId('a/b/c')).toBe(false); // more than one slash
+  });
+});
+
+describe('buildPullCommand', () => { // Covers the command string produced for each pull mode.
+  it('wrapper mode emits the pull verb', () => {
+    expect(buildPullCommand('wrapper', 'Qwen/Q3')).toBe('pull Qwen/Q3');
+  });
+  it('shell mode emits huggingface-cli into a sanitized local dir', () => {
+    expect(buildPullCommand('shell', 'Qwen/Q3', '/home/u/models/')).toBe(
+      "huggingface-cli download Qwen/Q3 --local-dir '/home/u/models/Qwen__Q3'", // '/' in the repo id becomes '__' in the single-quoted target dir
+    );
+  });
+});
+
+function emitterSpy(): { emitter: DeltaEmitter; frames: Record<string, unknown>[] } { // DeltaEmitter stub that records every published frame.
+  const frames: Record<string, unknown>[] = [];
+  const emitter: DeltaEmitter = {
+    subscribe: () => () => {}, // subscribing does nothing and returns a no-op function
+    publish: (d) => { frames.push(d as Record<string, unknown>); }, // frames are captured in publish order
+  };
+  return { emitter, frames };
+}
+
+function execReturning(result: ExecResult): { exec: SshExec; calls: string[] } { // SshExec stub: records each command and always resolves to `result`.
+  const calls: string[] = [];
+  const exec: SshExec = async (_t, command) => { calls.push(command); return result; }; // the target argument is ignored
+  return { exec, calls };
+}
+
+const target = { host: 'h', user: 'u', keyPath: '/k' }; // placeholder SSH target; the stubbed exec never reads it
+
+describe('runModelPull', () => { // Covers validation, command dispatch and control_job frame emission, using stubbed exec and emitter.
+  it('rejects an invalid repo id before issuing any command', async () => {
+    const { emitter, frames } = emitterSpy();
+    const { exec, calls } = execReturning({ code: 0, stdout: '', stderr: '' });
+    const r = await runModelPull({ jobId: 'j1', target, repo: '../x', mode: 'wrapper' }, exec, emitter);
+    expect(r.ok).toBe(false);
+    expect(calls).toHaveLength(0); // validation fails before exec is ever invoked
+    expect(frames[frames.length - 1]).toMatchObject({ type: 'control_job', status: 'failed' }); // last frame reports the failure
+  });
+
+  it('runs the wrapper pull verb and emits running then completed', async () => {
+    const { emitter, frames } = emitterSpy();
+    const { exec, calls } = execReturning({ code: 0, stdout: 'done', stderr: '' });
+    const r = await runModelPull({ jobId: 'j2', target, repo: 'Qwen/Q3', mode: 'wrapper' }, exec, emitter);
+    expect(r.ok).toBe(true);
+    expect(calls).toEqual(['pull Qwen/Q3']); // exactly one command, matching buildPullCommand('wrapper', ...)
+    expect(frames.map((f) => f.status)).toEqual(['running', 'completed']);
+    expect(frames.every((f) => (f.detail as { kind?: string }).kind === 'pull')).toBe(true); // every frame is tagged detail.kind === 'pull'
+  });
+
+  it('reports a non-zero exit as failed', async () => {
+    const { emitter, frames } = emitterSpy();
+    const { exec } = execReturning({ code: 1, stdout: '', stderr: 'no such repo' });
+    const r = await runModelPull({ jobId: 'j3', target, repo: 'Qwen/Q3', mode: 'wrapper' }, exec, emitter);
+    expect(r.ok).toBe(false);
+    expect(frames[frames.length - 1]).toMatchObject({ status: 'failed' });
+  });
+
+  it('shell mode without a models dir fails fast', async () => {
+    const { emitter } = emitterSpy();
+    const { exec, calls } = execReturning({ code: 0, stdout: '', stderr: '' });
+    const r = await runModelPull({ jobId: 'j4', target, repo: 'Qwen/Q3', mode: 'shell' }, exec, emitter); // no models dir is supplied in the request
+    expect(r.ok).toBe(false);
+    expect(calls).toHaveLength(0); // nothing is executed remotely
+  });
+});
--- a/apps/control/src/services/tests/pipeline.test.ts
+++ b/apps/control/src/services/tests/pipeline.test.ts
@@ -0,0 +1,337 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { parseSseLine } from '../fleet-connector.js';
+import type { LlamaSweepSSEEvent, MetricsEntry, ModelStatusEntry } from '../fleet-connector.js';
+import { createFleetState, ensureHostState, incrementSeq } from '../fleet-state.js';
+import { createDeltaEmitter, handleLlamaSweepEvent } from '../../index.js';
+import type { DeltaEmitter } from '../../index.js';
+import type { Sql } from '../../db.js';
+import type { Config } from '../../config.js';
+
+// ─── SSE parser tests (REAL wire shapes from apigroup.go) ────────────────────
+// Real format: event:message / data:{"type":"<TYPE>","data":"<ESCAPED JSON>"}
+
+describe('parseSseLine (real wire shapes)', () => { // Covers parsing of single SSE lines into { type, data } or null.
+  it('parses double-encoded modelStatus (real full-fleet array payload)', () => {
+    const inner = JSON.stringify([
+      { id: 'llama3', name: '', description: '', state: 'ready', unlisted: false, peerID: '' },
+    ]);
+    const outer = JSON.stringify({ type: 'modelStatus', data: inner }); // data is itself a JSON string, hence "double-encoded"
+    const result = parseSseLine(`data: ${outer}`);
+    expect(result).not.toBeNull();
+    expect(result!.type).toBe('modelStatus');
+    expect(result!.data).toEqual([ // the inner string is expected back as a decoded array
+      { id: 'llama3', name: '', description: '', state: 'ready', unlisted: false, peerID: '' },
+    ]);
+  });
+
+  it('ignores event: lines (always event:message)', () => {
+    expect(parseSseLine('event:message')).toBeNull();
+  });
+
+  it('returns null for data: with missing inner data field', () => {
+    expect(parseSseLine('data:{"type":"modelStatus"}')).toBeNull(); // note: no space after "data:" in this input
+  });
+
+  it('returns null for empty line', () => {
+    expect(parseSseLine('')).toBeNull();
+    expect(parseSseLine('   ')).toBeNull(); // whitespace-only counts as empty
+  });
+
+  it('returns null for malformed JSON', () => {
+    expect(parseSseLine('data: not-json')).toBeNull(); // parse failures yield null instead of throwing
+  });
+});
+
+// ─── Pipeline integration test (real functions) ──────────────────────────────
+
+
+function apiModel(id: string, state: string): ModelStatusEntry { // Builds a ModelStatusEntry with the given id/state.
+  return { id, name: '', description: '', state, unlisted: false, peerID: '' }; // all other fields are fixed filler values
+}
+
+describe('SSE pipeline: parse -> handleLlamaSweepEvent -> emit deltas', () => { // Drives handleLlamaSweepEvent with a stub Sql and a real delta emitter.
+  let mockSql: Sql;
+  let mockConfig: Config;
+  let executedQueries: string[];
+
+  beforeEach(() => {
+    executedQueries = [];
+    mockSql = Object.assign( // tagged-template stub: records the rendered query text and resolves to []
+      (strings: TemplateStringsArray, ...values: unknown[]) => {
+        const query = strings.reduce((acc: string, s: string, i: number) => acc + s + (values[i] ?? ''), ''); // null/undefined values render as ''
+        executedQueries.push(query);
+        return Promise.resolve([]);
+      },
+      {
+        json: (v: unknown) => v, // pass-through, so json values are interpolated as-is
+        unsafe: async (q: string) => { executedQueries.push(q); return []; },
+      },
+    ) as unknown as Sql;
+
+    mockConfig = {
+      NODE_ENV: 'production',
+      PORT: 9503,
+      HOST: '127.0.0.1',
+      DATABASE_URL: 'postgres://test',
+      LOG_LEVEL: 'info',
+      RETENTION_RAW_HOURS: 48,
+      RETENTION_ROLLUP_DAYS: 90,
+      CAPTURE_SIZE_KB: 256,
+      CAPTURE_BUDGET_MB: 50,
+    } as unknown as Config; // cast needed: the LLAMA_* keys (e.g. LLAMA_SWAP_URL) are omitted
+  });
+
+  it('processes modelStatus SSE event and emits delta with seq=1', async () => {
+    const fleet = createFleetState();
+    const emitter = createDeltaEmitter();
+    const deltas: unknown[] = [];
+    emitter.subscribe((d) => deltas.push(d)); // capture every emitted delta
+
+    const event: LlamaSweepSSEEvent = {
+      type: 'modelStatus',
+      data: [apiModel('llama3', 'ready')],
+    };
+
+    await handleLlamaSweepEvent(fleet, mockSql, mockConfig, 'host1', emitter, event);
+
+    // Assert: delta was emitted
+    expect(deltas).toHaveLength(1);
+    const delta = deltas[0] as { type: string; seq: number; hosts: Array<{ seq: number; models: Array<{ model: string; state: string }> }> };
+    expect(delta.type).toBe('control_fleet');
+    expect(delta.seq).toBe(1); // first event on a fresh fleet
+    expect(delta.hosts[0].seq).toBe(1);
+    expect(delta.hosts[0].models[0].model).toBe('llama3');
+    expect(delta.hosts[0].models[0].state).toBe('ready');
+
+    // Assert: SQL INSERT was called
+    expect(executedQueries.length).toBe(1);
+    expect(executedQueries[0]).toContain('control_model_events');
+    expect(executedQueries[0]).toContain('llama3'); // works because the stub inlines interpolated values into the text
+  });
+
+  it('increments seq monotonically across multiple events', async () => {
+    const fleet = createFleetState();
+    const emitter = createDeltaEmitter();
+    const deltas: unknown[] = [];
+    emitter.subscribe((d) => deltas.push(d));
+
+    for (let i = 0; i < 3; i++) {
+      // Each snapshot adds a new model -> a transition -> a delta.
+      await handleLlamaSweepEvent(fleet, mockSql, mockConfig, 'host1', emitter, {
+        type: 'modelStatus',
+        data: [apiModel(`model${i}`, 'ready')],
+      });
+    }
+
+    expect(deltas).toHaveLength(3);
+    const seqs = deltas.map((d) => (d as { seq: number }).seq);
+    expect(seqs).toEqual([1, 2, 3]);
+  });
+
+  it('processes metrics event with multiple entries and emits activity deltas', async () => {
+    const fleet = createFleetState();
+    const emitter = createDeltaEmitter();
+    const deltas: unknown[] = [];
+    emitter.subscribe((d) => deltas.push(d));
+
+    const metricsEvent: LlamaSweepSSEEvent = {
+      type: 'metrics',
+      data: [
+          {
+            id: 1,
+            timestamp: '2024-01-01T00:00:00Z',
+            model: 'llama3',
+            req_path: '/v1/chat/completions',
+            resp_status_code: 200,
+            duration_ms: 1500,
+            tokens: {
+              cache_tokens: 100,
+              input_tokens: 50,
+              output_tokens: 200,
+              prompt_per_second: 30,
+              tokens_per_second: 50,
+            },
+            has_capture: false,
+          },
+          {
+            id: 2,
+            timestamp: '2024-01-01T00:01:00Z',
+            model: 'llama3',
+            req_path: '/v1/chat/completions',
+            resp_status_code: 200,
+            duration_ms: 1200,
+            tokens: {
+              cache_tokens: 0,
+              input_tokens: 100,
+              output_tokens: 300,
+              prompt_per_second: 25,
+              tokens_per_second: 45,
+            },
+            has_capture: false,
+          },
+      ],
+    };
+
+    await handleLlamaSweepEvent(fleet, mockSql, mockConfig, 'host1', emitter, metricsEvent);
+
+    // handleReconcile is called first (gap detection), followed by the per-entry INSERTs.
+    // NOTE(review): the exact query count depends on reconcile, so only a lower bound of 2 is asserted.
+    expect(executedQueries.length).toBeGreaterThanOrEqual(2);
+
+    // Activity deltas (2 entries)
+    const activityDeltas = deltas.filter((d) => (d as { type: string }).type === 'control_activity');
+    expect(activityDeltas).toHaveLength(2);
+
+    const d1 = activityDeltas[0] as { entry: { id: number } };
+    const d2 = activityDeltas[1] as { entry: { id: number } };
+    expect(d1.entry.id).toBe(1); // deltas keep the order of the incoming entries
+    expect(d2.entry.id).toBe(2);
+  });
+
+  it('snapshot seq is max of all host seqs', () => {
+    const fleet = createFleetState();
+
+    const host1 = ensureHostState(fleet, 'host1');
+    incrementSeq(host1);
+    incrementSeq(host1); // host1 ends at seq 2
+
+    const host2 = ensureHostState(fleet, 'host2');
+    incrementSeq(host2);
+    incrementSeq(host2);
+    incrementSeq(host2); // host2 ends at seq 3
+
+    const hosts = Array.from(fleet.hosts.values()).map((h) => ({
+      providerId: h.providerId,
+      seq: h.seq,
+    }));
+    const snapshotMaxSeq = hosts.reduce((max: number, h: { seq: number }) => Math.max(max, h.seq), 0); // NOTE(review): max is computed here in the test, not by the production snapshot code
+    expect(snapshotMaxSeq).toBe(3);
+  });
+});
+
+// ─── 2-host delta merge test (B9): suite is defined after the P4 suite below ─
+
+// ─── P4: source column mapping ──────────────────────────────────────────────
+
+describe('P4: source column in metrics ingest', () => { // Checks that the metrics INSERT text mentions the source column.
+  let mockSql: Sql;
+  let mockConfig: Config;
+  let executedQueries: string[];
+
+  beforeEach(() => {
+    executedQueries = [];
+    mockSql = Object.assign( // tagged-template stub: records the rendered query text and resolves to []
+      (strings: TemplateStringsArray, ...values: unknown[]) => {
+        const query = strings.reduce((acc: string, s: string, i: number) => acc + s + (values[i] ?? ''), ''); // null/undefined values render as ''
+        executedQueries.push(query);
+        return Promise.resolve([]);
+      },
+      {
+        json: (v: unknown) => v,
+        unsafe: async (q: string) => { executedQueries.push(q); return []; },
+      },
+    ) as unknown as Sql;
+
+    mockConfig = {
+      NODE_ENV: 'production',
+      PORT: 9503,
+      HOST: '127.0.0.1',
+      DATABASE_URL: 'postgres://test',
+      LOG_LEVEL: 'info',
+      RETENTION_RAW_HOURS: 48,
+      RETENTION_ROLLUP_DAYS: 90,
+      CAPTURE_SIZE_KB: 256,
+      CAPTURE_BUDGET_MB: 50,
+    } as unknown as Config; // cast needed: the LLAMA_* keys (e.g. LLAMA_SWAP_URL) are omitted
+  });
+
+  it('maps source as NULL for ring data (ActivityLogEntry has no headers)', async () => {
+    const fleet = createFleetState();
+    const emitter = createDeltaEmitter();
+    const deltas: unknown[] = []; // collected but not asserted on in this test
+    emitter.subscribe((d) => deltas.push(d));
+
+    const metricsEvent: LlamaSweepSSEEvent = {
+      type: 'metrics',
+      data: [
+        {
+          id: 1,
+          timestamp: '2024-01-01T00:00:00Z',
+          model: 'llama3',
+          req_path: '/v1/chat/completions',
+          resp_status_code: 200,
+          duration_ms: 1500,
+          tokens: {
+            cache_tokens: 100,
+            input_tokens: 50,
+            output_tokens: 200,
+            prompt_per_second: 30,
+            tokens_per_second: 50,
+          },
+          has_capture: false,
+        },
+      ],
+    };
+
+    await handleLlamaSweepEvent(fleet, mockSql, mockConfig, 'host1', emitter, metricsEvent);
+
+    // The INSERT query should include the source column
+    const insertQueries = executedQueries.filter((q) => q.includes('control_requests'));
+    expect(insertQueries.length).toBeGreaterThanOrEqual(2);
+    // The SSE handler INSERT (second one) includes source; reconcile INSERT (first) does not
+    expect(insertQueries[1]).toContain('source'); // NOTE(review): only the column name is checked; the NULL value from the title is not asserted
+  });
+});
+
+describe('2-host delta merge (B9)', () => { // NOTE(review): both cases run a local copy of the merge loop, not the real useControlStream code.
+  it('delta for host2 does not wipe host1 from the hosts array', () => {
+    // Simulate the merge logic from useControlStream.tsx
+    const hosts = [
+      { providerId: 'host1', liveness: 'connected' as const, lastSeenAt: '', seq: 5, models: [] },
+      { providerId: 'host2', liveness: 'connected' as const, lastSeenAt: '', seq: 3, models: [] },
+    ];
+
+    // Delta arrives for host2 only
+    const deltaHosts = [
+      { providerId: 'host2', liveness: 'connected' as const, lastSeenAt: '', seq: 4, models: [] },
+    ];
+
+    const merged = [...hosts]; // shallow copy so the original array is left untouched
+    for (const dh of deltaHosts) {
+      const idx = merged.findIndex((h) => h.providerId === dh.providerId);
+      if (idx >= 0) {
+        merged[idx] = dh; // known host: replace in place
+      } else {
+        merged.push(dh); // unknown host: append
+      }
+    }
+
+    expect(merged).toHaveLength(2);
+    expect(merged.find((h) => h.providerId === 'host1')).toBeDefined();
+    expect(merged.find((h) => h.providerId === 'host2')!.seq).toBe(4); // host2 took the delta's seq
+    expect(merged.find((h) => h.providerId === 'host1')!.seq).toBe(5); // host1 is unchanged
+  });
+
+  it('new host is appended when not in existing array', () => {
+    const hosts = [
+      { providerId: 'host1', liveness: 'connected' as const, lastSeenAt: '', seq: 5, models: [] },
+    ];
+
+    const deltaHosts = [
+      { providerId: 'host3', liveness: 'connected' as const, lastSeenAt: '', seq: 1, models: [] },
+    ];
+
+    const merged = [...hosts];
+    for (const dh of deltaHosts) { // same merge loop as in the previous test
+      const idx = merged.findIndex((h) => h.providerId === dh.providerId);
+      if (idx >= 0) {
+        merged[idx] = dh;
+      } else {
+        merged.push(dh);
+      }
+    }
+
+    expect(merged).toHaveLength(2);
+    expect(merged.map((h) => h.providerId)).toEqual(['host1', 'host3']); // new host lands after the existing ones
+  });
+});
--- a/apps/control/src/services/tests/reconcile.test.ts
+++ b/apps/control/src/services/tests/reconcile.test.ts
@@ -0,0 +1,34 @@
+import { describe, it, expect } from 'vitest';
+import { detectGap } from '../reconcile.js';
+
+describe('detectGap', () => { // Covers detectGap(oldestReconcile, newestPersisted) per the test titles: true only when the first is strictly later.
+  it('detects gap when oldest reconcile is newer than newest persisted', () => {
+    expect(detectGap('2024-01-02T00:00:00Z', '2024-01-01T00:00:00Z')).toBe(true);
+  });
+
+  it('does not detect gap when overlap exists', () => {
+    expect(detectGap('2024-01-01T00:00:00Z', '2024-01-02T00:00:00Z')).toBe(false);
+  });
+
+  it('does not detect gap when timestamps are equal', () => {
+    expect(detectGap('2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z')).toBe(false); // equality counts as overlap
+  });
+
+  it('returns false when oldest reconcile is null', () => {
+    expect(detectGap(null, '2024-01-01T00:00:00Z')).toBe(false);
+  });
+
+  it('returns false when newest persisted is null', () => {
+    expect(detectGap('2024-01-01T00:00:00Z', null)).toBe(false);
+  });
+
+  it('returns false when both are null', () => {
+    expect(detectGap(null, null)).toBe(false);
+  });
+
+  it('handles timezone offsets correctly', () => {
+    // 2024-01-01T12:00:00Z == 2024-01-01T14:00:00+02:00
+    expect(detectGap('2024-01-01T12:00:00Z', '2024-01-01T14:00:00+02:00')).toBe(false); // same instant -> no gap
+    expect(detectGap('2024-01-01T13:00:00Z', '2024-01-01T14:00:00+02:00')).toBe(true); // one hour later than the persisted instant
+  });
+});
--- a/apps/control/src/services/tests/reports.test.ts
+++ b/apps/control/src/services/tests/reports.test.ts
@@ -0,0 +1,66 @@
+import { describe, it, expect } from 'vitest';
+import { renderReportMarkdown, isReportDue, type ReportStats } from '../reports.js';
+
+function makeStats(partial: Partial<ReportStats> = {}): ReportStats { // Builds a complete ReportStats fixture; `partial` overrides any default.
+  return {
+    periodStart: '2026-06-11T00:00:00.000Z',
+    periodEnd: '2026-06-12T00:00:00.000Z',
+    interval: 'daily',
+    totalRequests: 100,
+    priorRequests: 50, // half of totalRequests, so the default trend is +100%
+    totalInputTokens: 1000,
+    totalOutputTokens: 2000,
+    bySource: [{ source: 'boochat', requests: 80, inputTokens: 800, outputTokens: 1600 }],
+    byProvider: [{ providerId: 'sam-desktop', requests: 100, swaps: 4 }],
+    leaderboard: [{ providerId: 'sam-desktop', model: 'qwopus-35b', kind: 'code', avgScore: 0.82 }],
+    regressions: [], // empty by default; tests add entries as needed
+    ...partial, // spread last so caller-supplied fields win
+  };
+}
+
+describe('renderReportMarkdown', () => { // Checks for expected substrings in the rendered markdown report.
+  it('renders usage with a trend vs the prior period', () => {
+    const md = renderReportMarkdown(makeStats());
+    expect(md).toContain('# Fleet daily report');
+    expect(md).toContain('Requests: 100 (+100% vs prior period)'); // 100 requests vs 50 in the prior period
+    expect(md).toContain('| boochat | 80 |'); // by-source table row
+    expect(md).toContain('| sam-desktop | 100 | 4 |'); // by-provider table row: requests and swaps
+    expect(md).toContain('No speed regressions flagged this period.'); // shown when regressions is empty
+  });
+
+  it('renders regression anomalies when present', () => {
+    const md = renderReportMarkdown(makeStats({
+      regressions: [{ providerId: 'sam-desktop', model: 'qwopus-35b', avgGenTps: 42.5 }],
+    }));
+    expect(md).toContain('Regression: sam-desktop/qwopus-35b');
+    expect(md).toContain('42.5 tok/s');
+  });
+
+  it('handles a zero prior period without dividing by zero', () => {
+    const md = renderReportMarkdown(makeStats({ totalRequests: 5, priorRequests: 0 }));
+    expect(md).toContain('Requests: 5 (new vs prior period)'); // 'new' replaces the percentage when the prior count is 0
+  });
+});
+
+describe('isReportDue', () => { // Covers isReportDue(lastRun, interval, now) against a fixed `now`.
+  const now = new Date('2026-06-12T12:00:00.000Z'); // fixed reference time keeps the cases deterministic
+
+  it('is due when never run', () => {
+    expect(isReportDue(null, 'daily', now)).toBe(true); // null lastRun means the report has never been produced
+  });
+
+  it('is not due within the interval', () => {
+    const lastRun = new Date('2026-06-12T06:00:00.000Z'); // 6h ago
+    expect(isReportDue(lastRun, 'daily', now)).toBe(false);
+  });
+
+  it('is due once the interval has elapsed', () => {
+    const lastRun = new Date('2026-06-11T06:00:00.000Z'); // 30h ago
+    expect(isReportDue(lastRun, 'daily', now)).toBe(true);
+  });
+
+  it('uses a 7-day window for weekly', () => {
+    const lastRun = new Date('2026-06-09T12:00:00.000Z'); // 3 days ago
+    expect(isReportDue(lastRun, 'weekly', now)).toBe(false); // NOTE(review): only the not-yet-due side of the weekly window is covered
+  });
+});
--- a/apps/control/src/services/tests/retention.test.ts
+++ b/apps/control/src/services/tests/retention.test.ts
@@ -0,0 +1,68 @@
+import { describe, it, expect } from 'vitest';
+import { trimCapture, parseCaptureJson } from '../retention.js';
+
+describe('trimCapture', () => { // Covers trimCapture(capture, sizeKB): null passthrough, within-cap passthrough, over-cap trimming.
+  it('returns null for null input', () => {
+    expect(trimCapture(null, 256)).toBeNull();
+  });
+
+  it('returns unchanged capture when within cap', () => {
+    const capture = JSON.stringify({ data: 'x'.repeat(100) });
+    const result = trimCapture(capture, 256);
+    expect(result).toBe(capture); // same string value comes back
+  });
+
+  it('trims capture when over cap', () => {
+    const capture = JSON.stringify({ data: 'x'.repeat(300_000) }); // ~300k ASCII chars, above the 256 KB cap
+    const result = trimCapture(capture, 256);
+    expect(result).not.toBe(capture);
+    expect(result!.length).toBeLessThan(capture.length);
+  });
+
+  it('trims to roughly the cap size', () => {
+    const capture = JSON.stringify({ data: 'x'.repeat(1_000_000) }); // ~1M ASCII chars
+    const result = trimCapture(capture, 256);
+    // trimCapture is expected to cut at sizeKB * 1024; .length counts chars, which equals bytes for this ASCII payload
+    const expectedLength = Math.floor(256 * 1024);
+    expect(result!.length).toBeLessThanOrEqual(expectedLength);
+  });
+});
+
+describe('parseCaptureJson', () => { // Covers parseCaptureJson: object on valid JSON, null on null or invalid input.
+  it('parses valid JSON string into object', () => {
+    const input = JSON.stringify({ requestHeaders: {}, requestBody: '{}', responseHeaders: {}, responseBody: '{}' });
+    const result = parseCaptureJson(input);
+    expect(result).toEqual({ requestHeaders: {}, requestBody: '{}', responseHeaders: {}, responseBody: '{}' });
+  });
+
+  it('returns null for null input', () => {
+    expect(parseCaptureJson(null)).toBeNull();
+  });
+
+  it('returns null for invalid JSON', () => {
+    expect(parseCaptureJson('not json')).toBeNull(); // parse failures yield null instead of throwing
+  });
+
+  it('B7: trimmed capture produces a JSONB-ready object, not a string', () => {
+    // Simulate the pipeline: trim -> parse -> ready for sql.json()
+    // A capture within the cap parses cleanly to an object for sql.json()
+    const withinCap = JSON.stringify({ requestHeaders: {}, requestBody: '{}', responseBody: '{}' });
+    const parsed = parseCaptureJson(withinCap); // NOTE(review): trimCapture is not called here despite the "trim -> parse" description
+    expect(typeof parsed).toBe('object');
+    expect(parsed).not.toBeNull();
+    // sql.json() expects an object/array; a string would double-serialize
+    expect(Array.isArray(parsed) || typeof parsed === 'object').toBe(true);
+  });
+
+  it('B7: oversized capture trims to invalid JSON -> parseCaptureJson returns null -> stored as NULL', () => {
+    // trimCapture slices by byte count, which produces invalid JSON for large captures.
+    // parseCaptureJson returns null for invalid JSON, and the insert stores NULL::jsonb.
+    // This is acceptable: a truncated capture is not useful anyway.
+    const raw = JSON.stringify({ data: 'x'.repeat(300_000) });
+    const trimmed = trimCapture(raw, 256);
+    expect(trimmed).not.toBeNull();
+    const parsed = parseCaptureJson(trimmed!);
+    // Trimmed capture is invalid JSON (sliced mid-object), so parse returns null
+    expect(parsed).toBeNull();
+  });
+});
--- a/apps/control/src/services/tests/routing-scores.test.ts
+++ b/apps/control/src/services/tests/routing-scores.test.ts
@@ -0,0 +1,57 @@
+import { describe, it, expect } from 'vitest';
+import { assignBadges, type ModelScore } from '../routing-scores.js';
+
+function makeScore(partial: Partial<ModelScore> & { compositeId: string }): ModelScore { // Builds a ModelScore fixture from a 'provider/model' compositeId.
+  return {
+    providerId: partial.compositeId.split('/')[0]!, // text before the first '/'
+    model: partial.compositeId.split('/').slice(1).join('/'), // everything after the first '/', later slashes preserved
+    codeScore: null,
+    chatScore: null,
+    evalScore: null,
+    avgGenTps: null,
+    avgLatencyMs: null,
+    sampleCount: 0,
+    healthy: true, // fixtures are healthy unless a test overrides this
+    badges: [],
+    ...partial, // spread last so caller-supplied fields (including compositeId) win
+  };
+}
+
+describe('assignBadges', () => { // Covers badge assignment; assignBadges is read back via the mutated input array.
+  it('awards best-code to the highest healthy code score', () => {
+    const scores = [
+      makeScore({ compositeId: 'a/m1', codeScore: 0.7 }),
+      makeScore({ compositeId: 'a/m2', codeScore: 0.9 }),
+      makeScore({ compositeId: 'a/m3', codeScore: 0.5 }),
+    ];
+    assignBadges(scores); // return value is unused; badges are read from `scores` afterwards
+    expect(scores.find((s) => s.compositeId === 'a/m2')!.badges).toContain('best-code');
+    expect(scores.find((s) => s.compositeId === 'a/m1')!.badges).not.toContain('best-code');
+  });
+
+  it('excludes unhealthy hosts from winning any badge', () => {
+    const scores = [
+      makeScore({ compositeId: 'a/m1', codeScore: 0.95, healthy: false }),
+      makeScore({ compositeId: 'a/m2', codeScore: 0.6, healthy: true }),
+    ];
+    assignBadges(scores);
+    expect(scores.find((s) => s.compositeId === 'a/m1')!.badges).toHaveLength(0); // higher score, but unhealthy
+    expect(scores.find((s) => s.compositeId === 'a/m2')!.badges).toContain('best-code');
+  });
+
+  it('awards best-fast by throughput independently of eval scores', () => {
+    const scores = [
+      makeScore({ compositeId: 'a/slow', codeScore: 0.9, avgGenTps: 10 }),
+      makeScore({ compositeId: 'a/fast', codeScore: 0.4, avgGenTps: 80 }),
+    ];
+    assignBadges(scores);
+    expect(scores.find((s) => s.compositeId === 'a/fast')!.badges).toContain('best-fast'); // highest avgGenTps
+    expect(scores.find((s) => s.compositeId === 'a/slow')!.badges).toContain('best-code'); // highest codeScore
+  });
+
+  it('awards nothing for a category when no model has that metric', () => {
+    const scores = [makeScore({ compositeId: 'a/m1', avgGenTps: 20 })]; // all score fields stay null
+    assignBadges(scores);
+    expect(scores[0]!.badges).toEqual(['best-fast']); // only the throughput badge is awarded
+  });
+});
--- a/apps/control/src/services/tests/sandbox-runner.test.ts
+++ b/apps/control/src/services/tests/sandbox-runner.test.ts
@@ -0,0 +1,130 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+
+// ─── Sandbox lifecycle tests (orchestration patterns in isolation; nothing here mocks or spawns docker) ───
+
+describe('sandbox runner lifecycle', () => {
+  beforeEach(() => {
+    vi.restoreAllMocks(); // no test below installs a mock or spy
+  });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('runCodeEval is importable', async () => {
+    const mod = await import('../sandbox-runner.js');
+    expect(typeof mod.runCodeEval).toBe('function');
+  });
+
+  it('bounded fan-out via Promise.allSettled', async () => {
+    // NOTE(review): only the first `concurrency` tasks are started; the remaining six never run.
+    const tasks = Array.from({ length: 10 }, (_, i) => ({ id: `task_${i}` }));
+    const concurrency = 4;
+    const executionOrder: number[] = []; // filled in but never asserted on
+    const activeCount: number[] = []; // in-flight count sampled as each task starts
+    let currentlyActive = 0;
+
+    const results = await Promise.allSettled(
+      tasks.slice(0, concurrency).map(async (task, idx) => {
+        currentlyActive++;
+        activeCount.push(currentlyActive);
+        await new Promise((r) => setTimeout(r, 10 + idx * 5));
+        executionOrder.push(idx);
+        currentlyActive--;
+        return { taskId: task.id, idx };
+      }),
+    );
+
+    // All should fulfill.
+    expect(results.filter((r) => r.status === 'fulfilled').length).toBe(concurrency);
+    // Holds by construction: at most `concurrency` tasks were ever started.
+    expect(Math.max(...activeCount)).toBeLessThanOrEqual(concurrency);
+  });
+
+  it('per-task finally cleanup runs on error', async () => {
+    const cleanupCalls: string[] = [];
+
+    const tasks = [
+      { id: 'task_ok' },
+      { id: 'task_fail' },
+      { id: 'task_ok2' },
+    ];
+
+    const results = await Promise.allSettled(
+      tasks.map(async (task) => {
+        try {
+          if (task.id === 'task_fail') {
+            throw new Error('simulated failure');
+          }
+          return { ok: true };
+        } finally {
+          cleanupCalls.push(task.id);
+        }
+      }),
+    );
+
+    // All cleanup calls should run, even for the failed task.
+    expect(cleanupCalls).toContain('task_ok');
+    expect(cleanupCalls).toContain('task_fail');
+    expect(cleanupCalls).toContain('task_ok2');
+
+    // One rejection, two fulfillments.
+    expect(results.filter((r) => r.status === 'fulfilled').length).toBe(2);
+    expect(results.filter((r) => r.status === 'rejected').length).toBe(1);
+  });
+
+  it('kill-on-timeout pattern', async () => {
+    // Spawns a real `sleep 300` child process and SIGKILLs it from a 100 ms timer.
+    const { spawn } = await import('node:child_process');
+    const child = spawn('sleep', ['300']);
+    const timeoutHandle = setTimeout(() => {
+      child.kill('SIGKILL');
+    }, 100);
+
+    await new Promise<void>((resolve) => {
+      child.on('close', () => {
+        clearTimeout(timeoutHandle);
+        resolve();
+      });
+    });
+
+    // SIGKILL gives signal, not exit code; only the `killed` flag is asserted here.
+    expect(child.killed).toBe(true);
+  });
+
+  it('allSettled isolation: one failure does not abort others', async () => {
+    const completed: string[] = [];
+
+    const results = await Promise.allSettled([
+      (async () => {
+        await new Promise((r) => setTimeout(r, 50));
+        completed.push('task1');
+        return 'ok1';
+      })(),
+      (async () => {
+        await new Promise((r) => setTimeout(r, 20)); // rejects before the two 50 ms tasks finish
+        throw new Error('fail');
+      })(),
+      (async () => {
+        await new Promise((r) => setTimeout(r, 50));
+        completed.push('task3');
+        return 'ok3';
+      })(),
+    ]);
+
+    // Both successful tasks completed despite the failure.
+    expect(completed).toContain('task1');
+    expect(completed).toContain('task3');
+
+    expect(results[0].status).toBe('fulfilled');
+    expect(results[1].status).toBe('rejected');
+    expect(results[2].status).toBe('fulfilled');
+  });
+
+  it('pruneOrphanContainers handles missing docker gracefully', async () => {
+    // NOTE(review): pruneOrphanContainers itself is not exercised here; this only re-checks that
+    // the module imports and exposes runCodeEval (the same assertion as the first test).
+    const mod = await import('../sandbox-runner.js');
+    expect(typeof mod.runCodeEval).toBe('function');
+  });
+});
--- a/apps/control/src/services/tests/seq-logic.test.ts
+++ b/apps/control/src/services/tests/seq-logic.test.ts
@@ -0,0 +1,106 @@
+import { describe, it, expect } from 'vitest';
+
+// Seq logic test: verify the buffer-then-filter rule.
+// Client buffers pre-snapshot deltas, discards seq <= snapshot_seq per-host.
+
+interface Delta { // incremental fleet message as modelled by these tests
+  type: 'control_fleet';
+  seq: number; // seq of the message itself
+  hosts: Array<{ providerId: string; seq: number }>; // per-host entries, each with its own seq
+}
+
+interface Snapshot { // full fleet message; same shape as Delta
+  type: 'control_fleet';
+  seq: number; // seq of the message itself (the tests below use 0)
+  hosts: Array<{ providerId: string; seq: number }>; // applySnapshot records each host's seq
+}
+
+/**
+ * Decide whether a buffered delta carries anything newer than the snapshot.
+ *
+ * Every host entry is compared against the snapshot seq recorded for that
+ * same host (0 when the host was not part of the snapshot), using the
+ * entry's own per-host seq so the comparison is like-for-like with what
+ * applySnapshot stores.
+ *
+ * @param delta - the buffered delta message
+ * @param snapshotSeqs - per-host seq recorded from the last snapshot
+ * @returns true when at least one host entry is strictly newer than the
+ *   snapshot; false for stale deltas and for deltas without host entries
+ */
+function applyDelta(delta: Delta, snapshotSeqs: Map<string, number>): boolean {
+  return delta.hosts.some((host) => host.seq > (snapshotSeqs.get(host.providerId) ?? 0));
+}
+
+/** Record, for every host listed in the snapshot, the seq that host was captured at. */
+function applySnapshot(snapshot: Snapshot, snapshotSeqs: Map<string, number>): void {
+  snapshot.hosts.forEach(({ providerId, seq }) => {
+    snapshotSeqs.set(providerId, seq);
+  });
+}
+
+describe('seq logic: buffer-then-filter', () => {
+  // Single-host delta whose message seq and host seq carry the same value.
+  const deltaFor = (providerId: string, seq: number): Delta => ({
+    type: 'control_fleet',
+    seq,
+    hosts: [{ providerId, seq }],
+  });
+
+  // Snapshot message (seq 0) carrying the given per-host seqs.
+  const snapshotOf = (hosts: Snapshot['hosts']): Snapshot => ({
+    type: 'control_fleet',
+    seq: 0,
+    hosts,
+  });
+
+  it('applies delta when seq > snapshot seq', () => {
+    const known = new Map([['host1', 5]]);
+    expect(applyDelta(deltaFor('host1', 10), known)).toBe(true);
+  });
+
+  it('discards delta when seq <= snapshot seq', () => {
+    const known = new Map([['host1', 10]]);
+    expect(applyDelta(deltaFor('host1', 5), known)).toBe(false);
+  });
+
+  it('discards delta when seq equals snapshot seq', () => {
+    const known = new Map([['host1', 10]]);
+    expect(applyDelta(deltaFor('host1', 10), known)).toBe(false);
+  });
+
+  it('updates snapshot seqs on snapshot apply', () => {
+    const known = new Map<string, number>();
+    const snapshot = snapshotOf([
+      { providerId: 'host1', seq: 100 },
+      { providerId: 'host2', seq: 50 },
+    ]);
+
+    applySnapshot(snapshot, known);
+
+    expect(known.get('host1')).toBe(100);
+    expect(known.get('host2')).toBe(50);
+  });
+
+  it('handles missing snapshot seq (treats as 0)', () => {
+    const known = new Map<string, number>();
+    // No snapshot was applied, so the baseline is 0 and seq 1 is newer.
+    expect(applyDelta(deltaFor('host1', 1), known)).toBe(true);
+  });
+
+  it('discards out-of-order delta after snapshot', () => {
+    // The snapshot lands at seq 10 first; a late delta at seq 5 follows.
+    const known = new Map<string, number>();
+    applySnapshot(snapshotOf([{ providerId: 'host1', seq: 10 }]), known);
+
+    expect(applyDelta(deltaFor('host1', 5), known)).toBe(false);
+  });
+});
--- a/apps/control/src/services/tests/ssh-config.test.ts
+++ b/apps/control/src/services/tests/ssh-config.test.ts
@@ -0,0 +1,234 @@
+import { describe, it, expect } from 'vitest';
+import {
+  validateLlamaConfig,
+  computeDiff,
+  backupFilename,
+  applyRemoteConfig,
+  healthWait,
+  type SshExec,
+  type ExecResult,
+} from '../ssh-config.js';
+
+// A minimal subset of the llama-swap config schema sufficient for these tests:
+// top-level object with a required non-empty `models` object.
+const SCHEMA = {
+  type: 'object',
+  required: ['models'],
+  properties: {
+    models: {
+      type: 'object',
+      minProperties: 1, // at least one model entry
+      additionalProperties: { // every model entry is an object; `cmd`, when present, is a string
+        type: 'object',
+        properties: { cmd: { type: 'string' } },
+      },
+    },
+  },
+} as const;
+
+const VALID_YAML = `models:\n  m1:\n    cmd: "llama-server -m m1.gguf"\n`; // one model ("m1") with a cmd string
+
+describe('validateLlamaConfig', () => {
+  it('accepts a valid config', () => {
+    const outcome = validateLlamaConfig(VALID_YAML, SCHEMA);
+
+    expect(outcome.valid).toBe(true);
+    expect(outcome.errors).toEqual([]);
+  });
+
+  it('rejects broken YAML with a parse error', () => {
+    // Unterminated quote plus a stray mapping key: not parseable as YAML.
+    const brokenYaml = 'models:\n  m1:\n   cmd: "x\n  : :';
+    const outcome = validateLlamaConfig(brokenYaml, SCHEMA);
+
+    expect(outcome.valid).toBe(false);
+    expect(outcome.errors[0]).toMatch(/YAML parse error/);
+  });
+
+  it('rejects a config missing required models', () => {
+    const withoutModels = 'healthCheckTimeout: 30\n';
+    const outcome = validateLlamaConfig(withoutModels, SCHEMA);
+
+    expect(outcome.valid).toBe(false);
+    expect(outcome.errors.join(' ')).toMatch(/models/);
+  });
+
+  it('rejects a non-mapping document', () => {
+    const yamlList = '- just\n- a\n- list\n';
+    const outcome = validateLlamaConfig(yamlList, SCHEMA);
+
+    expect(outcome.valid).toBe(false);
+  });
+});
+
+describe('computeDiff', () => {
+  it('returns empty for identical text', () => {
+    const text = 'a\nb\n';
+    expect(computeDiff(text, text)).toBe('');
+  });
+
+  it('marks changed lines with -/+', () => {
+    // Only the middle line differs between the two inputs.
+    const before = 'a\nb\nc\n';
+    const after = 'a\nX\nc\n';
+    const diff = computeDiff(before, after);
+
+    expect(diff).toContain('- b');
+    expect(diff).toContain('+ X');
+  });
+});
+
+describe('backupFilename', () => {
+  it('produces a timestamped path', () => {
+    // The expected suffix carries second precision; the input's milliseconds do not appear.
+    const stamp = new Date('2026-06-12T03:04:05.678Z');
+    const backupPath = backupFilename('/etc/llama/config.yaml', stamp);
+
+    expect(backupPath).toBe('/etc/llama/config.yaml.bak-20260612T030405Z');
+  });
+});
+
+// ─── apply pipeline failure paths ────────────────────────────────────────────
+
+/**
+ * Build a fake SshExec driven by substring patterns.
+ *
+ * Each invocation is appended to `calls`. The result comes from the first
+ * handler (in insertion order) whose key is contained in the command; a
+ * command matching no handler succeeds with empty output.
+ */
+function makeExec(handlers: Record<string, ExecResult>): { exec: SshExec; calls: string[] } {
+  const calls: string[] = [];
+
+  const exec: SshExec = async (_target, command) => {
+    calls.push(command);
+    const match = Object.entries(handlers).find(([pattern]) => command.includes(pattern));
+    return match ? match[1] : { code: 0, stdout: '', stderr: '' };
+  };
+
+  return { exec, calls };
+}
+
+const target = { host: 'h', user: 'u', keyPath: '/k' }; // placeholder SSH target handed to applyRemoteConfig
+const okFetcher = (async () => new Response('{}', { status: 200 })) as unknown as typeof fetch; // fetch stub: always resolves to 200 with body '{}'
+
+describe('applyRemoteConfig', () => {
+  it('aborts at validate for an invalid config and never touches the host', async () => {
+    const { exec, calls } = makeExec({}); // no handlers: any command would succeed and be recorded
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'restart', newConfig: 'not: valid: yaml: here:::',
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: okFetcher,
+    });
+    expect(r.ok).toBe(false);
+    expect(r.step).toBe('validate');
+    expect(calls).toHaveLength(0); // nothing was sent to the host
+  });
+
+  it('aborts at validate when the host config is unreadable', async () => {
+    const { exec } = makeExec({ "cat '": { code: 1, stdout: '', stderr: 'no such file' } }); // reading the current config fails
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'restart', newConfig: VALID_YAML,
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: okFetcher,
+    });
+    expect(r.ok).toBe(false);
+    expect(r.step).toBe('validate');
+    expect(r.error).toMatch(/read current failed/);
+  });
+
+  it('backs up BEFORE write and aborts on write failure (backup retained)', async () => {
+    const { exec, calls } = makeExec({
+      "cat '": { code: 0, stdout: 'models:\n  old: {}\n', stderr: '' }, // read current
+      'cp ': { code: 0, stdout: '', stderr: '' },                      // backup
+      'cat >': { code: 1, stdout: '', stderr: 'disk full' },           // write fails
+    });
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'restart', newConfig: VALID_YAML,
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: okFetcher,
+      now: new Date('2026-06-12T00:00:00Z'), // fixed clock so the backup name is deterministic
+    });
+    expect(r.ok).toBe(false);
+    expect(r.step).toBe('write');
+    expect(r.backupPath).toBe('/c.yaml.bak-20260612T000000Z');
+    // backup (cp) must precede write (cat >)
+    const cpIdx = calls.findIndex((c) => c.startsWith('cp '));
+    const writeIdx = calls.findIndex((c) => c.startsWith('cat >'));
+    expect(cpIdx).toBeGreaterThanOrEqual(0);
+    expect(writeIdx).toBeGreaterThan(cpIdx);
+  });
+
+  it('aborts at restart on restart failure', async () => {
+    const { exec } = makeExec({
+      "cat '": { code: 0, stdout: 'models:\n  old: {}\n', stderr: '' },
+      'cp ': { code: 0, stdout: '', stderr: '' },
+      'cat >': { code: 0, stdout: '', stderr: '' },
+      restart: { code: 1, stdout: '', stderr: 'service not found' }, // substring match: also hits 'restart-svc'
+    });
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'restart-svc', newConfig: VALID_YAML,
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: okFetcher,
+    });
+    expect(r.ok).toBe(false);
+    expect(r.step).toBe('restart');
+  });
+
+  it('aborts at health when the service never comes back', async () => {
+    const { exec } = makeExec({
+      "cat '": { code: 0, stdout: 'models:\n  old: {}\n', stderr: '' },
+      'cp ': { code: 0, stdout: '', stderr: '' },
+      'cat >': { code: 0, stdout: '', stderr: '' },
+      'restart-svc': { code: 0, stdout: '', stderr: '' },
+    });
+    const downFetcher = (async () => { throw new Error('refused'); }) as unknown as typeof fetch; // every health probe rejects
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'restart-svc', newConfig: VALID_YAML,
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: downFetcher,
+      healthAttempts: 2, healthDelayMs: 1, // keep the retry loop short
+    });
+    expect(r.ok).toBe(false);
+    expect(r.step).toBe('health');
+  });
+
+  it('succeeds through the full pipeline', async () => {
+    const { exec } = makeExec({
+      "cat '": { code: 0, stdout: 'models:\n  old: {}\n', stderr: '' },
+      'cp ': { code: 0, stdout: '', stderr: '' },
+      'cat >': { code: 0, stdout: '', stderr: '' },
+      'restart-svc': { code: 0, stdout: '', stderr: '' },
+    });
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'restart-svc', newConfig: VALID_YAML,
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: okFetcher,
+      healthAttempts: 1, healthDelayMs: 1,
+    });
+    expect(r.ok).toBe(true);
+    expect(r.step).toBe('done');
+    expect(r.backupPath).toBeDefined();
+  });
+});
+
+describe('healthWait', () => {
+  it('returns true on first OK', async () => {
+    const healthy = await healthWait('http://h', okFetcher, 3, 1);
+
+    expect(healthy).toBe(true);
+  });
+
+  it('returns false after exhausting attempts', async () => {
+    // Every probe answers 503, so both attempts are used up.
+    const unavailableFetcher = (async () => new Response('', { status: 503 })) as unknown as typeof fetch;
+    const healthy = await healthWait('http://h', unavailableFetcher, 2, 1);
+
+    expect(healthy).toBe(false);
+  });
+});
+
+// ─── wrapper mode (forced-command verbs) ─────────────────────────────────────
+
+describe('applyRemoteConfig wrapper mode', () => {
+  it('sends verbs (not raw shell) and reads the backup path from the backup verb', async () => {
+    const { exec, calls } = makeExec({
+      read: { code: 0, stdout: 'models:\n  old: {}\n', stderr: '' },
+      backup: { code: 0, stdout: '/c.yaml.bak-WRAP\n', stderr: '' }, // stdout carries the backup path plus a trailing newline
+      write: { code: 0, stdout: '', stderr: '' },
+      restart: { code: 0, stdout: '', stderr: '' },
+    });
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'ignored-in-wrapper', newConfig: VALID_YAML,
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: okFetcher, mode: 'wrapper',
+      healthAttempts: 1, healthDelayMs: 1,
+    });
+    expect(r.ok).toBe(true);
+    // backup path comes from the wrapper's stdout, not a client-computed name
+    expect(r.backupPath).toBe('/c.yaml.bak-WRAP'); // the trailing newline is expected to be stripped
+    // verbs only — no cat/cp/cat > shell commands
+    expect(calls).toEqual(['read', 'backup', 'write', 'restart']);
+    expect(calls.some((c) => c.includes('cat') || c.includes('cp '))).toBe(false);
+  });
+
+  it('aborts at write when the wrapper write verb fails (backup retained)', async () => {
+    const { exec } = makeExec({
+      read: { code: 0, stdout: 'old\n', stderr: '' },
+      backup: { code: 0, stdout: '/c.yaml.bak-WRAP\n', stderr: '' },
+      write: { code: 1, stdout: '', stderr: 'denied' }, // the write verb is the failing step
+    });
+    const r = await applyRemoteConfig({
+      target, configPath: '/c.yaml', restartCmd: 'x', newConfig: VALID_YAML,
+      schema: SCHEMA, baseUrl: 'http://h', exec, fetcher: okFetcher, mode: 'wrapper',
+    });
+    expect(r.ok).toBe(false);
+    expect(r.step).toBe('write');
+    expect(r.backupPath).toBe('/c.yaml.bak-WRAP'); // still reported even though the write failed
+  });
+});
--- a/apps/control/src/services/action-queue.ts
+++ b/apps/control/src/services/action-queue.ts
@@ -0,0 +1,236 @@
+/**
+ * Per-host FIFO action queue.
+ *
+ * All host-mutating actions (warm, unload) from BooControl serialize through
+ * a single FIFO queue per provider_id. Queue discipline:
+ *
+ * - Submissions rejected immediately while host liveness is 'down'
+ * - Queue depth capped at 4; reject-on-full includes pending queue contents
+ * - Each action re-checks liveness on dequeue and skips if stale
+ * - Unload-during-bench returns 409 {error: 'bench in progress', requiresConfirmation: true}
+ *
+ * Pattern: arena-runner.ts advanceChain promise-chain + read-fresh-state-or-skip.
+ */
+
+import type { FastifyBaseLogger } from 'fastify';
+
+export type ActionType = 'warm' | 'unload';
+
+export interface QueuedAction {
+  actionId: string; // caller-supplied id; included in the log entry when the action fails
+  type: ActionType;
+  providerId: string; // selects the per-host queue and its registered dependencies
+  model?: string; // for warm: target model; for unload: specific model or undefined for all
+  confirmed: boolean; // true if client confirmed takeover
+  createdAt: Date;
+}
+
+export interface ActionQueueEntry {
+  action: QueuedAction;
+  status: 'pending' | 'running' | 'completed' | 'failed' | 'skipped';
+  error?: string; // reason text, set for 'failed' and 'skipped' entries
+  enqueuedAt: Date; // set by submit() when the entry is queued
+}
+
+export interface ActionQueueState {
+  queue: ActionQueueEntry[]; // FIFO; the head stays in place while it is being executed
+  running: boolean; // true while the head entry is being processed
+}
+
+export interface ActionQueueDeps {
+  baseUrl: string; // prefix for the warm/unload HTTP requests
+  isLivenessUp: () => boolean; // consulted on submit and again when an entry is dequeued
+  isInflightRequests: () => number; // in-flight request count; > 0 makes an unconfirmed unload require confirmation
+  log: FastifyBaseLogger;
+}
+
+const MAX_QUEUE_DEPTH = 4; // submit() rejects once the queue holds this many entries (the running one included)
+
+/**
+ * Per-host FIFO queue for host-mutating actions (warm, unload).
+ *
+ * Each registered host has its own queue; at most one action per host runs
+ * at a time and the next one starts only after the current one has settled.
+ */
+export class ActionQueue {
+  /** Per-host queue state, keyed by provider id. */
+  private queues: Map<string, ActionQueueState> = new Map();
+  /** Per-host dependencies supplied through registerHost(), keyed by provider id. */
+  private depsMap: Map<string, ActionQueueDeps> = new Map();
+  /** Stale-warm guard installed through setModelLoadedCheck(); null until wired. */
+  private modelLoadedCheck: ((providerId: string, model: string | undefined) => boolean) | null = null;
+
+  /**
+   * Register (or re-register) a host.
+   * Re-registering replaces the stored dependencies and keeps an existing queue.
+   */
+  registerHost(providerId: string, deps: ActionQueueDeps): void {
+    this.depsMap.set(providerId, deps);
+    if (!this.queues.has(providerId)) {
+      this.queues.set(providerId, { queue: [], running: false });
+    }
+  }
+
+  /**
+   * Submit an action to the per-host queue.
+   *
+   * Checks run in this order: unknown host, host offline, queue not
+   * initialized, bench in progress (unconfirmed unload while requests are in
+   * flight), queue full. An accepted action is appended and the processor is
+   * kicked.
+   *
+   * @returns `{ ok: true }` when queued; otherwise `{ ok: false, error }`,
+   *   with `pending` listing the queued actions when the queue is full and
+   *   `requiresConfirmation: true` when an unload needs client confirmation.
+   */
+  submit(action: QueuedAction): { ok: true } | { ok: false; error: string; pending?: QueuedAction[]; requiresConfirmation?: boolean } {
+    const deps = this.depsMap.get(action.providerId);
+    if (!deps) {
+      return { ok: false, error: `unknown host: ${action.providerId}` };
+    }
+
+    // Reject if host is down
+    if (!deps.isLivenessUp()) {
+      return { ok: false, error: 'host offline' };
+    }
+
+    const state = this.queues.get(action.providerId);
+    if (!state) {
+      return { ok: false, error: `queue not initialized for ${action.providerId}` };
+    }
+
+    // An unconfirmed unload must not interrupt in-flight requests.
+    if (action.type === 'unload' && !action.confirmed && deps.isInflightRequests() > 0) {
+      return {
+        ok: false,
+        error: 'bench in progress',
+        requiresConfirmation: true,
+      };
+    }
+
+    // Depth cap. The running entry stays at the head of the queue until it
+    // settles, so it counts towards the cap.
+    if (state.queue.length >= MAX_QUEUE_DEPTH) {
+      return {
+        ok: false,
+        error: `queue full (${state.queue.length}/${MAX_QUEUE_DEPTH})`,
+        pending: state.queue.map((e) => e.action),
+      };
+    }
+
+    state.queue.push({
+      action,
+      status: 'pending',
+      enqueuedAt: new Date(),
+    });
+
+    // Kick the processor; it is a no-op when an action is already running.
+    void this.processNext(action.providerId, deps);
+    return { ok: true };
+  }
+
+  /**
+   * Get the current queue state for a host, or null for an unregistered host.
+   */
+  getState(providerId: string): ActionQueueState | null {
+    return this.queues.get(providerId) ?? null;
+  }
+
+  /**
+   * Process the head of the queue for a host, then chain to the next entry.
+   *
+   * The head is skipped (not executed) when the host is no longer live or
+   * when a warm targets a model the installed check reports as loaded. A
+   * failure is recorded on the entry and logged; it never stops the chain.
+   */
+  private async processNext(providerId: string, deps: ActionQueueDeps): Promise<void> {
+    const state = this.queues.get(providerId);
+    if (!state || state.running || state.queue.length === 0) return;
+
+    state.running = true;
+    const entry = state.queue[0];
+    if (!entry) {
+      state.running = false;
+      return;
+    }
+
+    entry.status = 'running';
+
+    try {
+      if (!deps.isLivenessUp()) {
+        // Re-check liveness on dequeue — skip stale actions
+        entry.status = 'skipped';
+        entry.error = 'host went down during queue wait';
+      } else if (entry.action.type === 'warm' && this.isModelAlreadyLoaded(providerId, entry.action.model)) {
+        // Stale warm: the model got loaded while the action was waiting.
+        entry.status = 'skipped';
+        entry.error = 'model already loaded';
+      } else {
+        await this.executeAction(entry.action, deps);
+        entry.status = 'completed';
+      }
+    } catch (err) {
+      entry.status = 'failed';
+      // A rejection is not guaranteed to be an Error instance.
+      entry.error = err instanceof Error ? err.message : String(err);
+      deps.log.error({ actionId: entry.action.actionId, err: entry.error }, 'action: failed');
+    }
+
+    state.queue.shift();
+    state.running = false;
+    void this.processNext(providerId, deps);
+  }
+
+  /**
+   * Perform the HTTP call for one action against the host's base URL.
+   *
+   * @throws Error when a warm has no model, when the host answers with a
+   *   non-2xx status (the message carries the status and up to 200 characters
+   *   of the body), or when the action type is not supported.
+   */
+  private async executeAction(action: QueuedAction, deps: ActionQueueDeps): Promise<void> {
+    const baseUrl = deps.baseUrl;
+
+    switch (action.type) {
+      case 'warm': {
+        // 1-token POST /v1/chat/completions with bare wire ID
+        if (!action.model) {
+          throw new Error('warm action requires model');
+        }
+        const res = await fetch(`${baseUrl}/v1/chat/completions`, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            model: action.model,
+            // The chat-completions endpoint takes `messages`, not `prompt`.
+            messages: [{ role: 'user', content: '.' }],
+            max_tokens: 1,
+            stream: false,
+          }),
+          signal: AbortSignal.timeout(60_000),
+        });
+        if (!res.ok) {
+          const body = await res.text().catch(() => '');
+          throw new Error(`warm failed: ${res.status} ${body.slice(0, 200)}`);
+        }
+        break;
+      }
+
+      case 'unload': {
+        // With a model: unload just that model; without: unload everything.
+        const url = action.model
+          ? `${baseUrl}/api/models/unload/${encodeURIComponent(action.model)}`
+          : `${baseUrl}/api/models/unload`;
+        const res = await fetch(url, {
+          method: 'POST',
+          signal: AbortSignal.timeout(30_000),
+        });
+        if (!res.ok) {
+          const body = await res.text().catch(() => '');
+          throw new Error(`unload failed: ${res.status} ${body.slice(0, 200)}`);
+        }
+        break;
+      }
+
+      default: {
+        // Exhaustiveness guard: fails to compile if ActionType gains a member.
+        const unsupported: never = action.type;
+        throw new Error(`unsupported action type: ${String(unsupported)}`);
+      }
+    }
+  }
+
+  /**
+   * Check if a model is already loaded on the host (stale-action guard).
+   * Reports false until a check has been installed via setModelLoadedCheck().
+   */
+  private isModelAlreadyLoaded(providerId: string, model: string | undefined): boolean {
+    return this.modelLoadedCheck?.(providerId, model) ?? false;
+  }
+
+  /**
+   * Set the model-loaded check callback (wired from index.ts).
+   * The callback is consulted for every warm action right before it runs.
+   */
+  setModelLoadedCheck(fn: (providerId: string, model: string | undefined) => boolean): void {
+    this.modelLoadedCheck = fn;
+  }
+}
--- a/apps/control/src/services/bench-engine.ts
+++ b/apps/control/src/services/bench-engine.ts
@@ -0,0 +1,517 @@
+/**
+ * Bench engine: speed benchmark runner.
+ *
+ * Suite = grid of (prompt_tokens x gen_tokens x concurrency) x repetitions.
+ * TTFT measured client-side at first stream delta.
+ * llama.cpp timings parsed from final stream chunk.
+ * Bounded fan-out via Promise.allSettled at suite-declared concurrency.
+ * Warmup excluded from results.
+ */
+
+import type { Sql } from '../db.js';
+import type { DeltaEmitter } from '../index.js';
+import { jsonbObject } from './jsonb.js';
+
+// ─── types ──────────────────────────────────────────────────────────────────
+
+export interface BenchSuite {
+  id: string; // also the prefix of every run id created for this suite
+  name: string;
+  providerId: string;
+  model: string; // model name sent with each completion request
+  promptTokens: number[]; // grid axis: prompt sizes, in tokens
+  genTokens: number[]; // grid axis: max_tokens per request
+  concurrency: number[]; // grid axis: concurrency levels
+  repetitions: number; // samples per grid cell
+  temperature?: number; // takes precedence over BenchRunParams.temperature
+  topP?: number; // takes precedence over BenchRunParams.topP
+  metadata?: Record<string, unknown>;
+}
+
+export interface BenchRunParams {
+  suite: BenchSuite;
+  baseUrl: string; // prefix of the /v1/chat/completions endpoint
+  temperature?: number; // used only when the suite sets none; final fallback is 0.7
+  topP?: number; // used only when the suite sets none; final fallback is 0.9
+}
+
+export interface BenchTimings {
+  promptPerSecond: number; // from timings.prompt_per_second
+  predictedPerSecond: number; // from timings.predicted_per_second
+  cacheN: number; // from timings.cache_n
+}
+
+export interface BenchSample {
+  promptTokens: number;
+  genTokens: number;
+  concurrency: number; // runSingleBenchRequest always writes 1; the fan-out caller is expected to overwrite it
+  repetition: number;
+  ttftMs: number | null; // ms from request start to the first streamed line; null if the request failed
+  totalMs: number | null; // ms from request start until the stream ended; null if the request failed
+  promptTps: number | null; // from the parsed timings; null when none were seen
+  genTps: number | null; // from the parsed timings; null when none were seen
+  cacheN: number | null; // from the parsed timings; null when none were seen
+  error: string | null; // failure message; null on success
+}
+
+// ─── stream parser ──────────────────────────────────────────────────────────
+
+/**
+ * Parse llama.cpp timings from one chunk of a streaming response.
+ *
+ * Accepts either a bare JSON document or a server-sent-event data line
+ * ("data: {...}" as well as "data:{...}"; surrounding whitespace is ignored).
+ * The timings are expected as a top-level object:
+ *   { "timings": { "prompt_per_second": N, "predicted_per_second": N, "cache_n": N } }
+ *
+ * @param chunk - one line/chunk of the stream
+ * @returns the parsed timings, with 0 for any field that is missing or not
+ *   a number; an all-zero placeholder when the chunk has a `usage` object
+ *   but no `timings`; null for "[DONE]", invalid JSON, non-object JSON, or
+ *   a chunk carrying neither field
+ */
+export function parseLlamaTimings(chunk: string): BenchTimings | null {
+  // Anything that is not a number is reported as 0 instead of leaking through.
+  const numberOrZero = (value: unknown): number => (typeof value === 'number' ? value : 0);
+
+  try {
+    // The SSE format makes the space after "data:" optional; JSON.parse
+    // tolerates the leading whitespace that may remain after the slice.
+    const trimmed = chunk.trim();
+    const jsonStr = trimmed.startsWith('data:') ? trimmed.slice(5) : trimmed;
+    if (jsonStr.trim() === '[DONE]') return null;
+
+    const parsed: unknown = JSON.parse(jsonStr);
+    if (typeof parsed !== 'object' || parsed === null) return null;
+    const { timings, usage } = parsed as { timings?: unknown; usage?: unknown };
+
+    // Try the timings object first (llama.cpp standard)
+    if (timings) {
+      const fields = timings as Record<string, unknown>;
+      return {
+        promptPerSecond: numberOrZero(fields.prompt_per_second),
+        predictedPerSecond: numberOrZero(fields.predicted_per_second),
+        cacheN: numberOrZero(fields.cache_n),
+      };
+    }
+
+    // Fallback kept for compatibility: a usage-only chunk yields zeros.
+    // NOTE(review): nothing is read from `usage`, so these zeros mean
+    // "unknown", not "zero throughput" — confirm callers treat them that way.
+    if (usage) {
+      return {
+        promptPerSecond: 0,
+        predictedPerSecond: 0,
+        cacheN: 0,
+      };
+    }
+
+    return null;
+  } catch {
+    // Malformed JSON is expected for partial or non-JSON lines.
+    return null;
+  }
+}
+
+// ─── single request runner ──────────────────────────────────────────────────
+
+/**
+ * Run a single bench request: stream a completion, capture TTFT, parse timings.
+ *
+ * Never rejects: any failure (HTTP error, timeout, missing body, stream
+ * error) is reported through `sample.error`, and the measurement fields
+ * stay null in that case.
+ *
+ * @param baseUrl - prefix of the /v1/chat/completions endpoint
+ * @param model - model name sent in the request body
+ * @param promptTokens - approximate prompt size, in tokens
+ * @param genTokens - max_tokens for the completion
+ * @param repetition - repetition index, copied into the sample
+ * @param temperature - sampling temperature (default 0.7)
+ * @param topP - nucleus sampling parameter (default 0.9)
+ * @returns the sample; `concurrency` is always 1 and is meant to be
+ *   overwritten by the fan-out caller
+ */
+export async function runSingleBenchRequest(
+  baseUrl: string,
+  model: string,
+  promptTokens: number,
+  genTokens: number,
+  repetition: number,
+  temperature: number = 0.7,
+  topP: number = 0.9,
+): Promise<BenchSample> {
+  const sample: BenchSample = {
+    promptTokens,
+    genTokens,
+    concurrency: 1, // set by the fan-out caller
+    repetition,
+    ttftMs: null,
+    totalMs: null,
+    promptTps: null,
+    genTps: null,
+    cacheN: null,
+    error: null,
+  };
+
+  // Generate a deterministic prompt of the target length.
+  const prompt = generatePrompt(promptTokens);
+
+  // SSE lines that carry no completion payload: comments/keep-alives (":")
+  // and the event/id/retry fields. They must not count as the first token.
+  const sseNonData = /^(?::|event:|id:|retry:)/;
+
+  const startTime = Date.now();
+  let firstDeltaTime: number | null = null;
+  let timings: BenchTimings | null = null;
+
+  try {
+    const res = await fetch(`${baseUrl}/v1/chat/completions`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        model,
+        messages: [{ role: 'user', content: prompt }],
+        temperature,
+        top_p: topP,
+        max_tokens: genTokens,
+        stream: true,
+      }),
+      signal: AbortSignal.timeout(120_000),
+    });
+
+    if (!res.ok) {
+      const errBody = await res.text().catch(() => '');
+      throw new Error(`bench request failed: ${res.status} ${errBody.slice(0, 200)}`);
+    }
+
+    const reader = res.body?.getReader();
+    if (!reader) {
+      throw new Error('no response body');
+    }
+
+    const decoder = new TextDecoder();
+    let buffer = '';
+    let streamEnded = false;
+
+    while (!streamEnded) {
+      const { done, value } = await reader.read();
+      streamEnded = done;
+
+      // On the final pass flush the decoder and treat the leftover partial
+      // line as complete, so a last chunk without a trailing newline (which
+      // may hold the timings) is not lost.
+      buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });
+      const lines = buffer.split('\n');
+      buffer = done ? '' : (lines.pop() ?? '');
+
+      for (const line of lines) {
+        const trimmed = line.trim();
+        if (!trimmed || sseNonData.test(trimmed)) continue;
+
+        const payload = trimmed.startsWith('data:') ? trimmed.slice(5).trim() : trimmed;
+        if (payload === '' || payload === '[DONE]') continue;
+
+        // TTFT: capture at first delta
+        if (firstDeltaTime === null) {
+          firstDeltaTime = Date.now();
+        }
+
+        // Timings arrive near the end of the stream; the last chunk that
+        // yields a result wins.
+        const t = parseLlamaTimings(trimmed);
+        if (t) {
+          timings = t;
+        }
+      }
+    }
+
+    sample.ttftMs = firstDeltaTime !== null ? firstDeltaTime - startTime : null;
+    sample.totalMs = Date.now() - startTime;
+
+    if (timings) {
+      sample.promptTps = timings.promptPerSecond;
+      sample.genTps = timings.predictedPerSecond;
+      sample.cacheN = timings.cacheN;
+    }
+  } catch (err) {
+    // A rejection is not guaranteed to be an Error instance.
+    sample.error = err instanceof Error ? err.message : String(err);
+  }
+
+  return sample;
+}
+
+/**
+ * Build a deterministic prompt sized for roughly `targetTokens` tokens.
+ *
+ * The size is estimated at 4 characters per token: a fixed English sentence
+ * is repeated and the result is cut to exactly `targetTokens * 4`
+ * characters (empty when that is zero or negative).
+ */
+function generatePrompt(targetTokens: number): string {
+  const charsPerToken = 4;
+  const sentence = 'The quick brown fox jumps over the lazy dog. ';
+  const wantedChars = targetTokens * charsPerToken;
+  // Enough whole sentences to cover the wanted length; never a negative count.
+  const copies = Math.max(0, Math.ceil(wantedChars / sentence.length));
+  return sentence.repeat(copies).slice(0, wantedChars);
+}
+
+// ─── bench runner ───────────────────────────────────────────────────────────
+
+export interface BenchRunProgress { // payload type of runBenchSuite's onProgress callback
+  jobId: string;
+  totalSamples: number;
+  completedSamples: number;
+  currentPromptTokens: number; // presumably the grid cell currently being run — confirm against the caller
+  currentGenTokens: number;
+  currentConcurrency: number;
+  currentRepetition: number;
+}
+
+/**
+ * Run a full bench suite over the grid of every
+ * (promptTokens x genTokens x concurrency x repetition) combination.
+ *
+ * Flow: insert a `bench_runs` row, publish a `running` event, send one
+ * discarded warmup request per unique (promptTokens, genTokens) cell, drain
+ * the grid in batches bounded by each cell's declared concurrency
+ * (Promise.allSettled per batch), persist the samples, compute the aggregate
+ * and regression flag, refresh the stored baseline, mark the run `completed`
+ * and publish the final event.
+ *
+ * @param params Suite definition plus target base URL; sampling parameters
+ *   resolve as suite value, then params value, then built-in default.
+ * @param sql Database handle used for run, sample and baseline rows.
+ * @param emitter Receives `control_job` events for start, progress and completion.
+ * @param seq Sequence number stamped on every published event.
+ * @param onProgress Invoked once per settled request with the grid
+ *   coordinates of that request.
+ */
+export async function runBenchSuite(
+  params: BenchRunParams,
+  sql: Sql,
+  emitter: DeltaEmitter,
+  seq: number,
+  onProgress: (progress: BenchRunProgress) => void,
+): Promise<void> {
+  const { suite, baseUrl } = params;
+
+  // A4: suite-defined sampling params with fallback defaults.
+  const temperature = suite.temperature ?? params.temperature ?? 0.7;
+  const topP = suite.topP ?? params.topP ?? 0.9;
+  const jobId = suite.id;
+
+  // Build the full grid of combinations.
+  const grid: Array<{
+    promptTokens: number;
+    genTokens: number;
+    concurrency: number;
+    repetition: number;
+  }> = [];
+
+  for (const pt of suite.promptTokens) {
+    for (const gt of suite.genTokens) {
+      for (const conc of suite.concurrency) {
+        for (let rep = 0; rep < suite.repetitions; rep++) {
+          grid.push({ promptTokens: pt, genTokens: gt, concurrency: conc, repetition: rep });
+        }
+      }
+    }
+  }
+
+  const totalSamples = grid.length;
+
+  // Persist the run record with jobType (A2) and sampling params (A4).
+  const runId = `${jobId}_${Date.now()}`;
+  await sql`
+    INSERT INTO bench_runs (id, suite_id, job_type, status, started_at, total_samples, temperature, top_p)
+    VALUES (${runId}, ${suite.id}, 'bench', 'running', clock_timestamp(), ${totalSamples}, ${temperature}, ${topP})
+  `;
+
+  // Publish run started.
+  emitter.publish({
+    type: 'control_job' as const,
+    seq,
+    jobType: 'bench' as const,
+    jobId: runId,
+    status: 'running' as const,
+    detail: {
+      suiteId: suite.id,
+      providerId: suite.providerId,
+      model: suite.model,
+      totalSamples,
+    },
+  });
+
+  // A5: Warmup pass — 1 request per unique (promptTokens, genTokens) cell, discarded.
+  const warmupCells = new Map<string, { promptTokens: number; genTokens: number }>();
+  for (const { promptTokens, genTokens } of grid) {
+    warmupCells.set(`${promptTokens}_${genTokens}`, { promptTokens, genTokens });
+  }
+  await Promise.allSettled(
+    Array.from(warmupCells.values(), (cell) =>
+      runSingleBenchRequest(baseUrl, suite.model, cell.promptTokens, cell.genTokens, 0, temperature, topP),
+    ),
+  );
+
+  let completed = 0;
+  const samples: BenchSample[] = [];
+
+  // Group by (promptTokens, genTokens, concurrency). Each group holds that
+  // cell's repetitions and is drained in batches of at most `concurrency`.
+  const groups = new Map<string, typeof grid>();
+  for (const item of grid) {
+    const key = `${item.promptTokens}_${item.genTokens}_${item.concurrency}`;
+    const group = groups.get(key);
+    if (group) {
+      group.push(item);
+    } else {
+      groups.set(key, [item]);
+    }
+  }
+
+  for (const group of groups.values()) {
+    const concurrency = group[0]!.concurrency;
+    // Clamp to at least 1: a zero, negative or NaN concurrency would give a
+    // batch step of 0 (or less) and the batch loop below would never finish.
+    const batchSize = Math.max(1, Math.min(Math.floor(concurrency) || 1, group.length));
+
+    // Process in batches of 'concurrency' size using Promise.allSettled.
+    for (let batchStart = 0; batchStart < group.length; batchStart += batchSize) {
+      const batch = group.slice(batchStart, batchStart + batchSize);
+
+      const promises = batch.map(async (item) => {
+        const sample = await runSingleBenchRequest(
+          baseUrl,
+          suite.model,
+          item.promptTokens,
+          item.genTokens,
+          item.repetition,
+          temperature,
+          topP,
+        );
+        sample.concurrency = item.concurrency;
+        return sample;
+      });
+
+      const results = await Promise.allSettled(promises);
+      for (const [index, result] of results.entries()) {
+        if (result.status === 'fulfilled') {
+          samples.push(result.value);
+        }
+        completed++;
+
+        // Progress callback: report the request this result belongs to.
+        // allSettled preserves input order, so index maps back into batch.
+        const current = batch[index]!;
+        onProgress({
+          jobId: runId,
+          totalSamples,
+          completedSamples: completed,
+          currentPromptTokens: current.promptTokens,
+          currentGenTokens: current.genTokens,
+          currentConcurrency: current.concurrency,
+          currentRepetition: current.repetition,
+        });
+
+        // Publish progress
+        emitter.publish({
+          type: 'control_job' as const,
+          seq,
+          jobType: 'bench' as const,
+          jobId: runId,
+          status: 'running' as const,
+          detail: {
+            completedSamples: completed,
+            totalSamples,
+            percent: Math.round((completed / totalSamples) * 100),
+          },
+        });
+      }
+    }
+  }
+
+  // Persist all samples.
+  for (const s of samples) {
+    await sql`
+      INSERT INTO bench_samples (run_id, prompt_tokens, gen_tokens, concurrency, repetition, ttft_ms, total_ms, prompt_tps, gen_tps, cache_n, error)
+      VALUES (${runId}, ${s.promptTokens}, ${s.genTokens}, ${s.concurrency}, ${s.repetition}, ${s.ttftMs ?? null}, ${s.totalMs ?? null}, ${s.promptTps ?? null}, ${s.genTps ?? null}, ${s.cacheN ?? null}, ${s.error ?? null})
+    `;
+  }
+
+  // Compute aggregates. Metrics come from successful samples only, but the
+  // counters must describe every collected sample: computing both from the
+  // filtered list would always report zero errors.
+  const validSamples = samples.filter((s) => !s.error && s.genTps != null);
+  const aggregate: BenchAggregate = {
+    ...computeAggregates(validSamples),
+    totalSamples: samples.length,
+    errorSamples: samples.filter((s) => s.error).length,
+  };
+
+  // A1: Baseline persistence + regression flag.
+  // Compare against the stored baseline; the first run seeds it.
+  const baselineRows = await sql<{ aggregate: string }[]>`
+    SELECT aggregate FROM bench_baselines
+    WHERE provider_id = ${suite.providerId} AND model = ${suite.model}
+  `;
+
+  const regressionFlag = computeRegressionFlag(aggregate, baselineRows[0]?.aggregate);
+
+  // Upsert baseline. Skipped when the run produced no usable throughput, so a
+  // fully failed run cannot replace a good baseline with nulls.
+  if (aggregate.avgGenTps != null && aggregate.avgGenTps > 0) {
+    await sql`
+      INSERT INTO bench_baselines (provider_id, model, aggregate, run_id)
+      VALUES (${suite.providerId}, ${suite.model}, ${sql.json(aggregate as never)}, ${runId})
+      ON CONFLICT (provider_id, model) DO UPDATE SET
+        aggregate = EXCLUDED.aggregate,
+        run_id = EXCLUDED.run_id,
+        created_at = clock_timestamp()
+    `;
+  }
+
+  // Update run record with regression flag.
+  await sql`
+    UPDATE bench_runs
+    SET status = 'completed', finished_at = clock_timestamp(), completed_samples = ${completed},
+        aggregate = ${sql.json(aggregate as never)}, regression_flag = ${regressionFlag}
+    WHERE id = ${runId}
+  `;
+
+  // Publish completion.
+  emitter.publish({
+    type: 'control_job' as const,
+    seq,
+    jobType: 'bench' as const,
+    jobId: runId,
+    status: 'completed' as const,
+    detail: { ...aggregate, regressionFlag },
+  });
+}
+
+/**
+ * A1: Classify the current run's average generation throughput against the
+ * stored baseline.
+ *
+ * The relative change in avg gen tok/s decides the flag: below -10% is a
+ * regression, above +5% is an improvement, anything in between is 'baseline'.
+ * N5: a missing or zero baseline throughput yields null instead of dividing.
+ *
+ * @returns null when the current run has no throughput or the baseline is
+ *   unusable; 'baseline' when no baseline value was supplied (seed run).
+ */
+export function computeRegressionFlag(
+  current: BenchAggregate,
+  // Raw bench_baselines.aggregate value. porsager hands back jsonb as an
+  // already-parsed object while tests pass a JSON string; jsonbObject accepts
+  // either. A falsy value means there is no baseline row yet.
+  baselineJson: unknown,
+): 'baseline' | 'regression' | 'improvement' | null {
+  const currentTps = current.avgGenTps;
+  if (!currentTps) return null;
+  if (!baselineJson) return 'baseline';
+
+  const parsed = jsonbObject(baselineJson) as BenchAggregate | null;
+  const baselineTps = parsed ? parsed.avgGenTps : null;
+  // Falsy covers an unparsable baseline as well as null/0 throughput.
+  if (!baselineTps) return null;
+
+  const change = (currentTps - baselineTps) / baselineTps;
+  if (change > 0.05) return 'improvement';
+  return change < -0.1 ? 'regression' : 'baseline';
+}
+
+/**
+ * Summary statistics over a set of bench samples, as produced by
+ * `computeAggregates`. A metric is null when no sample carried a value for it.
+ */
+export interface BenchAggregate {
+  avgTtftMs: number | null;
+  medianTtftMs: number | null;
+  avgGenTps: number | null;
+  medianGenTps: number | null;
+  avgPromptTps: number | null;
+  medianPromptTps: number | null;
+  /** Number of samples the aggregate was computed over. */
+  totalSamples: number;
+  /** Number of those samples that carry an error. */
+  errorSamples: number;
+  /** 95th-percentile time to first token (nearest-rank on the sorted values). */
+  p95TtftMs: number | null;
+}
+
+/**
+ * Reduce a list of bench samples to summary statistics.
+ *
+ * Null/undefined metric values are ignored per metric, so a sample may
+ * contribute to one statistic and not another. An empty input yields null
+ * for every metric and zero for both counters.
+ */
+export function computeAggregates(samples: BenchSample[]): BenchAggregate {
+  // Drop missing values and sort ascending (numeric, not lexicographic).
+  const sortedPresent = (values: Array<number | null | undefined>): number[] => {
+    const present: number[] = [];
+    for (const value of values) {
+      if (value != null) present.push(value);
+    }
+    return present.sort((a, b) => a - b);
+  };
+
+  const mean = (sorted: number[]): number | null => {
+    if (sorted.length === 0) return null;
+    let sum = 0;
+    for (const value of sorted) sum += value;
+    return sum / sorted.length;
+  };
+
+  const middle = (sorted: number[]): number | null => {
+    const count = sorted.length;
+    if (count === 0) return null;
+    const half = Math.floor(count / 2);
+    if (count % 2 === 1) return sorted[half]!;
+    return (sorted[half - 1]! + sorted[half]!) / 2;
+  };
+
+  // Nearest-rank percentile: ceil(0.95 * n) is the 1-based rank.
+  const percentile95 = (sorted: number[]): number | null => {
+    if (sorted.length === 0) return null;
+    const rank = Math.ceil(sorted.length * 0.95) - 1;
+    return sorted[Math.max(0, rank)] ?? null;
+  };
+
+  const ttftValues = sortedPresent(samples.map((s) => s.ttftMs));
+  const genValues = sortedPresent(samples.map((s) => s.genTps));
+  const promptValues = sortedPresent(samples.map((s) => s.promptTps));
+
+  let failed = 0;
+  for (const s of samples) {
+    if (s.error) failed += 1;
+  }
+
+  return {
+    avgTtftMs: mean(ttftValues),
+    medianTtftMs: middle(ttftValues),
+    avgGenTps: mean(genValues),
+    medianGenTps: middle(genValues),
+    avgPromptTps: mean(promptValues),
+    medianPromptTps: middle(promptValues),
+    totalSamples: samples.length,
+    errorSamples: failed,
+    p95TtftMs: percentile95(ttftValues),
+  };
+}
--- a/apps/control/src/services/capture-fetch.ts
+++ b/apps/control/src/services/capture-fetch.ts
@@ -0,0 +1,142 @@
+/**
+ * Capture fetch: GET /api/captures/:id on llama-swap host, decode base64,
+ * persist trimmed copy (256KB cap app-enforced), render with shiki JSON.
+ *
+ * The 256KB cap is application-enforced in the fetch handler, not a DB constraint.
+ * Total budget: 50MB default, configurable via CAPTURE_BUDGET_MB env var.
+ */
+
+import type { Sql } from '../db.js';
+
+/** Size cap applied to a parsed capture before it is returned (see parseCapture). */
+const MAX_CAPTURE_BYTES = 256 * 1024; // 256KB
+
+/** Structured request/response capture as produced by `parseCapture`. */
+export interface CaptureData {
+  /** The swap entry id the capture was requested with. */
+  id: number;
+  providerId: string;
+  /** Taken from the raw `timestamp` or `ts` field; falls back to the parse time (ISO string). */
+  timestamp: string;
+  /** Raw `model` field, or an empty string when absent. */
+  model: string;
+  requestHeaders: Record<string, string>;
+  /** Decoded request body text. */
+  requestBody: string;
+  responseHeaders: Record<string, string>;
+  /** Decoded response body text, possibly trimmed to respect the size cap. */
+  responseBody: string;
+  /** Raw `duration_ms` field, or 0 when absent. */
+  durationMs: number;
+  /** UTF-8 byte length of request body plus response body after trimming. */
+  sizeBytes: number;
+}
+
+/**
+ * Outcome of `fetchCapture`: `capture` is set when `ok` is true, `error`
+ * (a human-readable message) when it is false.
+ */
+export interface CaptureFetchResult {
+  ok: boolean;
+  capture?: CaptureData;
+  error?: string;
+}
+
+/**
+ * Retrieve one capture from a llama-swap host.
+ *
+ * Issues `GET {baseUrl}/api/captures/{swapEntryId}` with a 10 second timeout
+ * and parses the JSON reply via `parseCapture`. Never rejects for HTTP,
+ * network, timeout or parse failures: those are reported through
+ * `{ ok: false, error }`.
+ */
+export async function fetchCapture(
+  baseUrl: string,
+  providerId: string,
+  swapEntryId: number,
+): Promise<CaptureFetchResult> {
+  const endpoint = `${baseUrl}/api/captures/${swapEntryId}`;
+  try {
+    const response = await fetch(endpoint, { signal: AbortSignal.timeout(10_000) });
+
+    if (response.ok) {
+      const payload = await response.json() as Record<string, unknown>;
+      const capture = parseCapture(payload, providerId, swapEntryId);
+      return { ok: true, capture };
+    }
+
+    // 404 gets a dedicated message; every other status is reported by number.
+    const error = response.status === 404
+      ? 'capture not found on host'
+      : `fetch failed: ${response.status}`;
+    return { ok: false, error };
+  } catch (err) {
+    return { ok: false, error: (err as Error).message ?? String(err) };
+  }
+}
+
+/**
+ * Decode one capture body field.
+ *
+ * `Buffer.from(text, 'base64')` never throws; it silently skips characters
+ * outside the alphabet. The value is therefore treated as base64 only when it
+ * round-trips through decode/encode unchanged (canonical padded base64);
+ * anything else is returned as-is.
+ */
+function decodeCaptureBody(value: unknown): string {
+  if (value == null || value === '') return '';
+  if (typeof value !== 'string') return JSON.stringify(value) ?? '';
+  const decoded = Buffer.from(value, 'base64');
+  return decoded.toString('base64') === value ? decoded.toString('utf8') : value;
+}
+
+/** Cut `text` to at most `maxBytes` UTF-8 bytes without splitting a code point. */
+function truncateUtf8(text: string, maxBytes: number): string {
+  const bytes = Buffer.from(text, 'utf8');
+  if (bytes.length <= maxBytes) return text;
+  let end = Math.max(0, Math.floor(maxBytes));
+  // bytes[end] is the first excluded byte. While it is a continuation byte
+  // (10xxxxxx) the cut falls inside a character, so step back to its lead byte.
+  while (end > 0 && (bytes[end]! & 0xc0) === 0x80) end--;
+  return bytes.subarray(0, end).toString('utf8');
+}
+
+/**
+ * Parse raw capture data from llama-swap into our structured format.
+ *
+ * Bodies are base64-decoded when they are valid base64 and kept verbatim
+ * otherwise. The combined UTF-8 size of both bodies, including truncation
+ * markers, is held to `maxBytes`: the response body is trimmed first, and the
+ * request body is trimmed too if it alone exceeds the budget.
+ *
+ * @param raw Decoded JSON object returned by the capture endpoint.
+ * @param providerId Provider the capture belongs to.
+ * @param swapEntryId Entry id used as the capture id.
+ * @param maxBytes Size cap in bytes; defaults to the module-wide 256KB cap.
+ *   Caps smaller than two truncation markers cannot be honoured exactly.
+ * @returns The structured capture, with `sizeBytes` measured after trimming.
+ */
+export function parseCapture(
+  raw: Record<string, unknown>,
+  providerId: string,
+  swapEntryId: number,
+  maxBytes: number = MAX_CAPTURE_BYTES,
+): CaptureData {
+  const requestHeaders = (raw.request_headers ?? raw.headers ?? {}) as Record<string, string>;
+  const responseHeaders = (raw.response_headers ?? {}) as Record<string, string>;
+
+  let requestBody = decodeCaptureBody(raw.request_body);
+  let responseBody = decodeCaptureBody(raw.response_body);
+
+  // Enforce the cap in bytes (string length counts UTF-16 units, which
+  // undercounts multi-byte text) and budget for the marker itself.
+  const marker = `\n\n[truncated: capture exceeds ${Math.round(maxBytes / 1024)}KB cap]`;
+  const markerBytes = Buffer.byteLength(marker);
+  const requestBytes = Buffer.byteLength(requestBody);
+  if (requestBytes + Buffer.byteLength(responseBody) > maxBytes) {
+    // Trim the response body first (usually the largest component).
+    responseBody = truncateUtf8(responseBody, maxBytes - requestBytes - markerBytes) + marker;
+
+    // A request body that alone blows the budget has to be trimmed as well.
+    const requestBudget = maxBytes - Buffer.byteLength(responseBody);
+    if (requestBytes > requestBudget) {
+      requestBody = truncateUtf8(requestBody, requestBudget - markerBytes) + marker;
+    }
+  }
+
+  const sizeBytes = Buffer.byteLength(requestBody + responseBody);
+
+  return {
+    id: swapEntryId,
+    providerId,
+    timestamp: (raw.timestamp ?? raw.ts ?? new Date().toISOString()) as string,
+    model: (raw.model ?? '') as string,
+    requestHeaders,
+    requestBody,
+    responseHeaders,
+    responseBody,
+    durationMs: (raw.duration_ms ?? 0) as number,
+    sizeBytes,
+  };
+}
+
+/**
+ * Store a (trimmed) capture in the control_requests table.
+ *
+ * Upserts on (provider_id, swap_entry_id, ts): an existing row only gets its
+ * capture column replaced.
+ */
+export async function persistCapture(
+  sql: Sql,
+  capture: CaptureData,
+): Promise<void> {
+  const { requestHeaders, requestBody, responseHeaders, responseBody, durationMs } = capture;
+
+  // sql.json must receive the OBJECT. Handing it a pre-stringified value
+  // would store a JSON string inside the JSONB column (double serialization).
+  const payload = { requestHeaders, requestBody, responseHeaders, responseBody, durationMs };
+
+  await sql`
+    INSERT INTO control_requests (provider_id, swap_entry_id, ts, model, capture)
+    VALUES (${capture.providerId}, ${capture.id}, ${capture.timestamp}, ${capture.model}, ${sql.json(payload as never)})
+    ON CONFLICT (provider_id, swap_entry_id, ts) DO UPDATE SET
+      capture = EXCLUDED.capture
+  `;
+}
--- a/apps/control/src/services/eval-suites.ts
+++ b/apps/control/src/services/eval-suites.ts
@@ -0,0 +1,409 @@
+import { randomUUID } from 'node:crypto';
+import { readFileSync, readdirSync } from 'node:fs';
+import { resolve, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { load as loadYaml } from 'js-yaml';
+import type { Sql } from '../db.js';
+
+// ES modules have no __filename/__dirname globals; derive them from import.meta.url.
+// NOTE(review): __dirname is not referenced anywhere else in this file.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+// ─── types ──────────────────────────────────────────────────────────────────
+
+/**
+ * Code-evaluation task as read from a suite YAML file. The loader classifies
+ * a task as a code task when it has `test_code` and no `rubric`.
+ */
+export interface CodeTask {
+  id: string;
+  prompt: string;
+  test_code: string;
+  expected_output: string;
+  language: string;
+}
+
+/** One scoring criterion of a chat-task rubric. */
+export interface RubricCriterion {
+  /** Criterion name (the rubric key when the rubric is written as a keyed map). */
+  criterion: string;
+  description: string;
+  /** Relative weight; may be rescaled so the weights sum to the rubric's max_score. */
+  weight: number;
+}
+
+/**
+ * Chat-evaluation task as read from a suite YAML file. The loader classifies
+ * a task as a chat task when it has a `rubric`.
+ */
+export interface ChatTask {
+  id: string;
+  prompt: string;
+  prompt_template?: string;
+  context_generator?: string;
+  rubric: {
+    criteria: RubricCriterion[];
+    /** Defaults to 7 when the YAML does not specify it. */
+    max_score: number;
+  };
+}
+
+/** An eval suite as loaded from a YAML file in the data directory. */
+export interface EvalSuiteData {
+  id: string;
+  name: string;
+  kind: 'chat' | 'code';
+  /** Defaults to 1 when the YAML does not specify it. */
+  version: number;
+  description?: string;
+  judge_model: string | null;
+  /** Code tasks first, then chat tasks (the loader regroups them). */
+  tasks: (CodeTask | ChatTask)[];
+}
+
+/**
+ * Row shape selected from the eval_suites table.
+ * NOTE(review): `tasks` and `metadata` are written with sql.json (JSON
+ * columns); the driver may return them already parsed rather than as strings
+ * — confirm before relying on the `string` typing.
+ */
+export interface EvalSuiteRow {
+  id: string;
+  name: string;
+  kind: string;
+  version: number;
+  tasks: string;
+  judge_model: string | null;
+  judge_model_version: string | null;
+  metadata: string | null;
+  created_at: string;
+}
+
+// ─── YAML loader ────────────────────────────────────────────────────────────
+
+/** Directory scanned for suite YAML files: `../../data` relative to this module's directory. */
+const DATA_DIR = resolve(dirname(__filename), '../../data');
+
+/**
+ * Parse one suite YAML file.
+ *
+ * @returns The suite, or null when the document is empty or has no task list.
+ * @throws When the file cannot be read or parsed, or the suite lacks a string
+ *   `id` or `name`.
+ */
+function parseSuiteFile(path: string): EvalSuiteData | null {
+  const parsed = loadYaml(readFileSync(path, 'utf8')) as Record<string, unknown> | null | undefined;
+  // An empty document parses to null/undefined and a scalar is not a suite.
+  if (!parsed || typeof parsed !== 'object') return null;
+
+  const tasks = parsed.tasks;
+  if (!Array.isArray(tasks)) return null;
+
+  // A suite without id/name cannot be stored; fail here with a clear message
+  // instead of later, inside the database driver.
+  if (typeof parsed.id !== 'string' || !parsed.id || typeof parsed.name !== 'string' || !parsed.name) {
+    throw new Error('suite is missing a string id or name');
+  }
+
+  const chatTasks: ChatTask[] = [];
+  const codeTasks: CodeTask[] = [];
+
+  for (const task of tasks as unknown[]) {
+    // Skip empty list items (`- ` in YAML parses to null).
+    if (!task || typeof task !== 'object') continue;
+    const t = task as Record<string, unknown>;
+    if (t.rubric) {
+      const rubric = t.rubric as Record<string, unknown>;
+      chatTasks.push({
+        id: t.id as string,
+        prompt: t.prompt as string,
+        prompt_template: (t.prompt_template as string) ?? undefined,
+        context_generator: (t.context_generator as string) ?? undefined,
+        rubric: {
+          criteria: normalizeCriteria(rubric),
+          max_score: (rubric.max_score as number) ?? 7,
+        },
+      });
+    } else if (t.test_code) {
+      codeTasks.push({
+        id: t.id as string,
+        prompt: t.prompt as string,
+        test_code: t.test_code as string,
+        expected_output: t.expected_output as string,
+        language: t.language as string,
+      });
+    }
+  }
+
+  return {
+    id: parsed.id,
+    name: parsed.name,
+    kind: parsed.kind as 'chat' | 'code',
+    version: (parsed.version as number) ?? 1,
+    description: (parsed.description as string) ?? undefined,
+    judge_model: (parsed.judge_model as string) ?? null,
+    tasks: [...codeTasks, ...chatTasks],
+  };
+}
+
+/**
+ * Load all eval suite YAML files (`suite-*.yaml`) from the data/ directory.
+ *
+ * Best effort: a file that cannot be read or parsed is logged and skipped
+ * without affecting the other files, and an unreadable directory yields an
+ * empty list. Never throws.
+ */
+export function loadEvalSuitesFromData(): EvalSuiteData[] {
+  let files: string[];
+  try {
+    files = readdirSync(DATA_DIR).filter((f) => f.startsWith('suite-') && f.endsWith('.yaml'));
+  } catch (err) {
+    console.warn({ err: (err as Error).message }, 'eval: failed to load suites from data/');
+    return [];
+  }
+
+  const suites: EvalSuiteData[] = [];
+  for (const file of files) {
+    // Isolate failures per file so one malformed suite cannot hide the rest.
+    try {
+      const suite = parseSuiteFile(resolve(DATA_DIR, file));
+      if (suite) suites.push(suite);
+    } catch (err) {
+      console.warn({ file, err: (err as Error).message }, 'eval: failed to load suite file');
+    }
+  }
+  return suites;
+}
+
+/**
+ * Normalize a YAML rubric into a flat criterion list.
+ *
+ * Accepted shapes:
+ *  - `criteria: [...]` — entries with a criterion name and a weight are
+ *    returned as-is (no rescaling);
+ *  - a keyed map `{ name: { description, weight } }` — weights are rescaled
+ *    to sum to `max_score` when that is given;
+ *  - a keyed map of plain values `{ name: "text" }` — used only when no keyed
+ *    entry has both weight and description; every entry gets weight 1.
+ *
+ * Empty entries (YAML `name:` with no value, or null list items) are
+ * tolerated instead of throwing.
+ */
+function normalizeCriteria(rubric: Record<string, unknown>): RubricCriterion[] {
+  const criteria = rubric.criteria as RubricCriterion[] | undefined;
+  if (criteria && Array.isArray(criteria)) {
+    return criteria.filter((c) => Boolean(c && c.criterion && c.weight));
+  }
+
+  const maxScore = rubric.max_score as number | undefined;
+  const entries = Object.entries(rubric).filter(([key]) => key !== 'max_score' && key !== 'criteria');
+
+  const result: RubricCriterion[] = [];
+  let totalWeight = 0;
+  for (const [key, val] of entries) {
+    // A key with no value parses to null; treat it as an entry with no fields.
+    const entry = (val ?? {}) as { criterion?: string; description?: string; weight?: number };
+    if (entry.weight && entry.description) {
+      result.push({ criterion: key, description: entry.description, weight: entry.weight });
+      totalWeight += entry.weight;
+    }
+  }
+
+  // Fallback: no structured entries, so read each key as "name: description".
+  if (result.length === 0) {
+    for (const [key, val] of entries) {
+      result.push({ criterion: key, description: val == null ? '' : String(val), weight: 1 });
+    }
+  }
+
+  // Rescale structured weights to sum to max_score, rounded to one decimal.
+  if (maxScore && totalWeight > 0) {
+    const scale = maxScore / totalWeight;
+    for (const c of result) {
+      c.weight = Math.round(c.weight * scale * 10) / 10;
+    }
+  }
+  return result;
+}
+
+// ─── DB operations ──────────────────────────────────────────────────────────
+
+/**
+ * Insert every suite found in the data/ YAML files into eval_suites.
+ * Idempotent: rows whose id already exists are left untouched
+ * (INSERT ... ON CONFLICT DO NOTHING).
+ */
+export async function seedEvalSuites(sql: Sql): Promise<void> {
+  for (const suite of loadEvalSuitesFromData()) {
+    // Only the description is kept as metadata; no description means SQL NULL.
+    const metadata = suite.description
+      ? sql.json({ description: suite.description } as never)
+      : sql`NULL::jsonb`;
+
+    await sql`
+      INSERT INTO eval_suites (id, name, kind, version, tasks, judge_model, judge_model_version, metadata)
+      VALUES (
+        ${suite.id},
+        ${suite.name},
+        ${suite.kind},
+        ${suite.version},
+        ${sql.json(suite.tasks as never)},
+        ${suite.judge_model},
+        NULL,
+        ${metadata}
+      )
+      ON CONFLICT (id) DO NOTHING
+    `;
+  }
+}
+
+/**
+ * Fetch every eval suite, newest first (ordered by created_at descending).
+ */
+export async function listEvalSuites(sql: Sql): Promise<EvalSuiteRow[]> {
+  const suites = await sql<EvalSuiteRow[]>`
+    SELECT id, name, kind, version, tasks, judge_model, judge_model_version, metadata, created_at
+    FROM eval_suites
+    ORDER BY created_at DESC
+  `;
+  return suites;
+}
+
+/**
+ * Look up one eval suite by its id.
+ *
+ * @returns The matching row, or null when no suite has that id.
+ */
+export async function getEvalSuite(sql: Sql, id: string): Promise<EvalSuiteRow | null> {
+  const [suite] = await sql<EvalSuiteRow[]>`
+    SELECT id, name, kind, version, tasks, judge_model, judge_model_version, metadata, created_at
+    FROM eval_suites WHERE id = ${id}
+  `;
+  return suite ?? null;
+}
+
+/**
+ * Create or update an eval suite.
+ *
+ * A new suite is stored with version 1. Updating an existing suite replaces
+ * its name, kind, tasks, judge model and metadata and bumps the version by
+ * one. The bump happens inside the upsert statement itself, so concurrent
+ * updates cannot both read the same version and write the same successor.
+ *
+ * @param id Suite id to update, or null to create a suite under a fresh UUID.
+ * @param metadata Optional metadata object; stored as NULL when omitted.
+ * @returns The id of the created or updated suite.
+ */
+export async function upsertEvalSuite(
+  sql: Sql,
+  id: string | null,
+  name: string,
+  kind: 'chat' | 'code',
+  tasks: unknown[],
+  judgeModel: string | null,
+  metadata?: Record<string, unknown>,
+): Promise<string> {
+  const suiteId = id ?? randomUUID();
+
+  await sql`
+    INSERT INTO eval_suites (id, name, kind, version, tasks, judge_model, judge_model_version, metadata)
+    VALUES (
+      ${suiteId},
+      ${name},
+      ${kind},
+      1,
+      ${sql.json(tasks as never)},
+      ${judgeModel},
+      NULL,
+      ${metadata ? sql.json(metadata as never) : sql`NULL::jsonb`}
+    )
+    ON CONFLICT (id) DO UPDATE SET
+      name = EXCLUDED.name,
+      kind = EXCLUDED.kind,
+      version = eval_suites.version + 1,
+      tasks = EXCLUDED.tasks,
+      judge_model = EXCLUDED.judge_model,
+      metadata = EXCLUDED.metadata
+  `;
+  return suiteId;
+}
+
+/**
+ * Insert a new eval run in status 'running' and return its generated id.
+ *
+ * The id has the form `eval_<epoch ms>_<8 hex chars of a random UUID>`;
+ * started_at is stamped by the database.
+ */
+export async function createEvalRun(
+  sql: Sql,
+  suiteId: string,
+  providerId: string,
+  model: string,
+  quant: string | null,
+  judgeModel: string | null,
+  judgeModelVersion: string | null,
+  totalTasks: number,
+): Promise<string> {
+  const createdMs = Date.now();
+  const suffix = randomUUID().slice(0, 8);
+  const runId = `eval_${createdMs}_${suffix}`;
+
+  await sql`
+    INSERT INTO eval_runs (id, suite_id, job_type, provider_id, model, quant, status, judge_model, judge_model_version, started_at, total_tasks)
+    VALUES (
+      ${runId}, ${suiteId}, 'eval', ${providerId}, ${model}, ${quant},
+      'running', ${judgeModel}, ${judgeModelVersion},
+      clock_timestamp(), ${totalTasks}
+    )
+  `;
+  return runId;
+}
+
+/**
+ * Record a single eval result.
+ *
+ * Inserts one eval_results row for the given run and task. Every nullable
+ * argument is written as-is, so callers pass null for fields that do not
+ * apply to the task.
+ */
+export async function recordEvalResult(
+  sql: Sql,
+  runId: string,
+  taskId: string,
+  taskIndex: number,
+  score: number | null,
+  maxScore: number | null,
+  rationale: string | null,
+  sandboxExitCode: number | null,
+  sandboxStderr: string | null,
+  sandboxStdout: string | null,
+  executionMs: number | null,
+  error: string | null,
+): Promise<void> {
+  await sql`
+    INSERT INTO eval_results (run_id, task_id, task_index, score, max_score, rationale, sandbox_exit_code, sandbox_stderr, sandbox_stdout, execution_ms, error)
+    VALUES (
+      ${runId}, ${taskId}, ${taskIndex}, ${score}, ${maxScore},
+      ${rationale}, ${sandboxExitCode}, ${sandboxStderr}, ${sandboxStdout},
+      ${executionMs}, ${error}
+    )
+  `;
+}
+
+/**
+ * Finalize an eval run: stamp finished_at and store the completed-task
+ * count, aggregate and error.
+ *
+ * The status becomes 'failed' when an error message is given and
+ * 'completed' otherwise. A null aggregate is stored as SQL NULL.
+ */
+export async function completeEvalRun(
+  sql: Sql,
+  runId: string,
+  completedTasks: number,
+  aggregate: Record<string, unknown> | null,
+  error: string | null,
+): Promise<void> {
+  const status = error ? 'failed' : 'completed';
+  const aggregateValue = aggregate ? sql.json(aggregate as never) : sql`NULL::jsonb`;
+
+  await sql`
+    UPDATE eval_runs
+    SET status = ${status},
+        finished_at = clock_timestamp(),
+        completed_tasks = ${completedTasks},
+        aggregate = ${aggregateValue},
+        error = ${error}
+    WHERE id = ${runId}
+  `;
+}
+
+/**
+ * Row shape selected from the eval_runs table by listEvalRuns.
+ * NOTE(review): `aggregate` is written with sql.json; the driver may return
+ * it already parsed rather than as a string — confirm before relying on it.
+ */
+type EvalRunRow = {
+  id: string;
+  suite_id: string;
+  job_type: string;
+  provider_id: string;
+  model: string;
+  quant: string | null;
+  status: string;
+  judge_model: string | null;
+  started_at: string | null;
+  finished_at: string | null;
+  total_tasks: number;
+  completed_tasks: number;
+  aggregate: string | null;
+  error: string | null;
+  created_at: string;
+};
+
+/**
+ * List eval runs, newest first, capped at 200 rows.
+ *
+ * @param suiteId When given (non-empty), only runs of that suite are returned.
+ * @param providerId When given (non-empty), only runs against that provider.
+ */
+export async function listEvalRuns(
+  sql: Sql,
+  suiteId?: string,
+  providerId?: string,
+): Promise<EvalRunRow[]> {
+  // Optional filters are composed as query fragments; an empty fragment
+  // contributes nothing, and values stay bound parameters.
+  const suiteFilter = suiteId ? sql`AND suite_id = ${suiteId}` : sql``;
+  const providerFilter = providerId ? sql`AND provider_id = ${providerId}` : sql``;
+
+  return await sql<EvalRunRow[]>`
+    SELECT id, suite_id, job_type, provider_id, model, quant, status, judge_model,
+      started_at, finished_at, total_tasks, completed_tasks, aggregate, error, created_at
+    FROM eval_runs
+    WHERE 1=1 ${suiteFilter} ${providerFilter}
+    ORDER BY created_at DESC LIMIT 200
+  `;
+}
+
+/** Row shape selected from the eval_results table by getEvalResults. */
+type EvalResultRow = {
+  id: number;
+  task_id: string;
+  task_index: number;
+  score: number | null;
+  max_score: number | null;
+  rationale: string | null;
+  sandbox_exit_code: number | null;
+  sandbox_stderr: string | null;
+  sandbox_stdout: string | null;
+  execution_ms: number | null;
+  error: string | null;
+};
+
+/**
+ * Fetch all recorded results of one eval run, ordered by task index.
+ */
+export async function getEvalResults(
+  sql: Sql,
+  runId: string,
+): Promise<EvalResultRow[]> {
+  const results = await sql<EvalResultRow[]>`
+    SELECT id, task_id, task_index, score, max_score, rationale,
+      sandbox_exit_code, sandbox_stderr, sandbox_stdout, execution_ms, error
+    FROM eval_results WHERE run_id = ${runId}
+    ORDER BY task_index
+  `;
+  return results;
+}
--- a/apps/control/src/services/fleet-connector.ts
+++ b/apps/control/src/services/fleet-connector.ts
@@ -0,0 +1,264 @@
+/**
+ * Fleet connector: SSE client consuming llama-swap /api/events per enabled host.
+ *
+ * Ports the opencode-sse.ts reconnectDecision pattern (exponential backoff +
+ * circuit-breaker) with one critical addition: **jitter**. The source pattern
+ * has NO jitter, which causes thundering-herd reconnections across N hosts.
+ *
+ * Jitter: random 0-50% of computed delay. Pure function for testability.
+ *
+ * Event parsing is NEW code — llama-swap's SSE envelope (modelStatus | logData |
+ * metrics | inflight) differs from the opencode SDK's Event type.
+ */
+
+import type { FastifyBaseLogger } from 'fastify';
+import type { Sql } from '../db.js';
+
+// ─── jitter (pure) ──────────────────────────────────────────────────────────
+
+/**
+ * Stretch a delay by a random extra of 0-50%.
+ *
+ * @returns A value in [delayMs, 1.5 * delayMs) for a non-negative delay.
+ */
+export function addJitter(delayMs: number): number {
+  return delayMs + delayMs * Math.random() * 0.5;
+}
+
+// ─── reconnect backoff ──────────────────────────────────────────────────────
+
+/** Tuning knobs for the exponential reconnect backoff (see reconnectDecision). */
+export interface ReconnectPolicy {
+  /** Delay before jitter for the first failure; doubles with each further failure. */
+  baseMs: number;
+  /** Upper bound applied to the delay before jitter is added. */
+  maxMs: number;
+  /** Failure count beyond which the connector gives up. */
+  maxAttempts: number;
+}
+
+/** Policy used when the caller supplies none: 1s base, 30s cap, give up after the 6th failure. */
+export const DEFAULT_RECONNECT_POLICY: ReconnectPolicy = {
+  baseMs: 1_000,
+  maxMs: 30_000,
+  maxAttempts: 6,
+};
+
+/** Outcome of reconnectDecision: wait `delayMs` and retry, or stop retrying. */
+export type ReconnectDecision =
+  | { action: 'reconnect'; delayMs: number }
+  | { action: 'give-up' };
+
+/**
+ * Decide what to do after `failures` failed connection attempts.
+ *
+ * Gives up once the count exceeds `policy.maxAttempts`. Otherwise the delay
+ * is `baseMs * 2^(failures - 1)`, capped at `maxMs`, with jitter added on top
+ * of the capped value.
+ */
+export function reconnectDecision(
+  failures: number,
+  policy: ReconnectPolicy = DEFAULT_RECONNECT_POLICY,
+): ReconnectDecision {
+  const { baseMs, maxMs, maxAttempts } = policy;
+  if (failures > maxAttempts) {
+    return { action: 'give-up' };
+  }
+  const uncapped = baseMs * 2 ** (failures - 1);
+  const delayMs = addJitter(Math.min(maxMs, uncapped));
+  return { action: 'reconnect', delayMs };
+}
+
+// ─── llama-swap SSE envelope types ──────────────────────────────────────────
+// Real wire shape (apigroup.go):
+//   event:message
+//   data:{"type":"modelStatus|logData|metrics|inflight","data":"<ESCAPED JSON STRING>"}
+// The SSE event name is ALWAYS 'message'. The discriminator is the outer JSON's
+// .type field. The payload is DOUBLE-ENCODED: JSON.parse(data) gives {type, data:string},
+// then JSON.parse(that.data) gives the actual payload.
+
+// Per-type payload shapes, verified against the fork source
+// (/opt/forks/llama-swap/internal/server/apigroup.go sendModels/sendLogData/
+// sendMetrics/sendInFlight, apiModel struct at :20):
+//   modelStatus -> []apiModel        (FULL-FLEET snapshot array, not a single transition)
+//   logData     -> {source, data}    (field is 'data', not 'line')
+//   metrics     -> []ActivityLogEntry (BARE array, tokens nested)
+//   inflight    -> {total}           (host-level total, NOT per-model)
+/** Fully decoded SSE event, discriminated on `type`; `data` is the inner payload after both JSON parses. */
+export type LlamaSweepSSEEvent =
+  | { type: 'modelStatus'; data: ModelStatusEntry[] }
+  | { type: 'logData'; data: LogData }
+  | { type: 'metrics'; data: MetricsEntry[] }
+  | { type: 'inflight'; data: InflightData };
+
+/**
+ * One entry of the modelStatus full-fleet array (fork apiModel struct).
+ * Field meanings are defined by the llama-swap fork and are not interpreted
+ * in this file.
+ */
+export interface ModelStatusEntry {
+  id: string;
+  name: string;
+  description: string;
+  state: string;
+  unlisted: boolean;
+  peerID: string;
+  aliases?: string[];
+}
+
+/** Payload of a `logData` event. The text lives in `data` (not `line`). */
+export interface LogData {
+  source: string;
+  data: string;
+}
+
+// Real /api/metrics shape: bare JSON array of entries with NESTED tokens.
+// {id, timestamp, model, req_path, resp_status_code, tokens:{...}, duration_ms, has_capture}
+// NOTE: ActivityLogEntry does NOT carry request headers or source field.
+// Headers exist only in ReqRespCapture (fetched on-demand via /api/captures/:id).
+// See design §7 "Implementation notes" for the discrepancy.
+/** One activity-log entry as carried by a `metrics` event (token counters are nested under `tokens`). */
+export interface MetricsEntry {
+  id: number;
+  timestamp: string;
+  model: string;
+  req_path: string;
+  resp_status_code: number;
+  tokens: {
+    cache_tokens: number;
+    input_tokens: number;
+    output_tokens: number;
+    prompt_per_second: number;
+    tokens_per_second: number;
+  };
+  duration_ms: number;
+  has_capture: boolean;
+  capture?: string;
+}
+
+/** Payload of an `inflight` event: a host-level total, not a per-model count. */
+export interface InflightData {
+  total: number;
+}
+
+// ─── the loop ───────────────────────────────────────────────────────────────
+
+/** Collaborators injected into the fleet connector loop. */
+export interface FleetConnectorDeps {
+  /** Polled by the loop; returning false stops it. */
+  isUp: () => boolean;
+  // NOTE(review): `sql` and `onReconcile` are not referenced by the connector
+  // code in this file — presumably used by callers; confirm.
+  sql: Sql;
+  log: FastifyBaseLogger;
+  /** Called (and awaited) for every decoded event; a thrown error is logged and the stream continues. */
+  onEvent: (providerId: string, event: LlamaSweepSSEEvent) => void | Promise<void>;
+  onReconcile: (providerId: string, metrics: MetricsEntry[]) => Promise<boolean>;
+  /** Called once when the reconnect policy gives up, just before the loop exits. */
+  onReconnectGiveUp: (providerId: string) => Promise<void>;
+  /** Delay function; defaults to a setTimeout-based sleep. Injectable for tests. */
+  sleep?: (ms: number) => Promise<void>;
+  /** Backoff policy; defaults to DEFAULT_RECONNECT_POLICY. */
+  policy?: ReconnectPolicy;
+}
+
+/** Resolve after `ms` milliseconds (setTimeout-based). */
+function defaultSleep(ms: number): Promise<void> {
+  return new Promise<void>((wake) => {
+    setTimeout(wake, ms);
+  });
+}
+
+/**
+ * Parse one llama-swap SSE line.
+ *
+ * Real wire shape (apigroup.go):
+ *   event:message
+ *   data:{"type":"modelStatus","data":"<ESCAPED JSON STRING>"}
+ *
+ * The SSE event name is always 'message' and is ignored. The discriminator is
+ * the outer JSON's `.type` field. The payload is DOUBLE-ENCODED:
+ * JSON.parse(data) gives {type, data:string}, then JSON.parse(that.data)
+ * gives the actual payload.
+ *
+ * Never throws: anything that is not a well-formed data line yields null,
+ * including JSON that parses to null or a primitive. The `type` value is not
+ * checked against the known event names; callers should ignore types they do
+ * not handle.
+ *
+ * @returns The fully decoded event, or null for non-data or malformed lines.
+ */
+export function parseSseLine(line: string): LlamaSweepSSEEvent | null {
+  const trimmed = line.trim();
+  // Only "data:" lines carry payload; blank lines, "event:" lines and any
+  // other SSE field are skipped.
+  if (!trimmed.startsWith('data:')) return null;
+
+  const dataStr = trimmed.slice(5).trimStart();
+  if (!dataStr) return null;
+
+  // First JSON parse: { type: "modelStatus", data: "<escaped json>" }
+  let outer: unknown;
+  try {
+    outer = JSON.parse(dataStr);
+  } catch {
+    return null;
+  }
+
+  // JSON.parse can legitimately return null or a primitive ("data: null");
+  // reading a property off null would throw and look like a dropped stream.
+  if (typeof outer !== 'object' || outer === null) return null;
+
+  const { type, data } = outer as { type?: unknown; data?: unknown };
+  if (typeof type !== 'string' || !type || typeof data !== 'string' || !data) {
+    return null;
+  }
+
+  // Second JSON parse: the actual payload (double-encoded string).
+  let inner: unknown;
+  try {
+    inner = JSON.parse(data);
+  } catch {
+    return null;
+  }
+
+  return { type, data: inner } as LlamaSweepSSEEvent;
+}
+
+/**
+ * Start the connector loop for one host in the background.
+ *
+ * The loop runs detached. If it ever rejects (for example because a callback
+ * such as onReconnectGiveUp throws), the failure is logged here so it cannot
+ * surface as an unhandled promise rejection.
+ *
+ * @returns The AbortController that stops the loop when aborted.
+ */
+export function startFleetConnector(providerId: string, baseUrl: string, deps: FleetConnectorDeps): AbortController {
+  const abort = new AbortController();
+  void runFleetConnector(providerId, baseUrl, abort, deps).catch((err: unknown) => {
+    deps.log.error(
+      { providerId, err: err instanceof Error ? err.message : String(err) },
+      'fleet: connector loop crashed',
+    );
+  });
+  return abort;
+}
+
+/**
+ * Connector loop for one host: stream `${baseUrl}/api/events`, hand each
+ * decoded event to `deps.onEvent`, and reconnect with backoff on failure.
+ *
+ * The loop ends when `deps.isUp()` returns false, when the abort signal
+ * fires, or when the reconnect policy gives up (after calling
+ * `deps.onReconnectGiveUp`).
+ *
+ * @param providerId Host identifier passed through to callbacks and logs.
+ * @param baseUrl Base URL of the llama-swap host.
+ * @param abort Controller whose signal cancels the fetch and stops the loop.
+ * @param deps Injected collaborators (logger, callbacks, sleep, policy).
+ */
+export async function runFleetConnector(
+  providerId: string,
+  baseUrl: string,
+  abort: AbortController,
+  deps: FleetConnectorDeps,
+): Promise<void> {
+  const signal = abort.signal;
+  const sleep = deps.sleep ?? defaultSleep;
+  const policy = deps.policy ?? DEFAULT_RECONNECT_POLICY;
+  // Failures counted since the last cleanly ended stream.
+  // NOTE(review): the counter is reset only after a clean stream end, not on
+  // a successful connect, so streams that connect and later drop with an
+  // error keep adding up toward give-up — confirm this is intended.
+  let failures = 0;
+
+  while (deps.isUp() && !signal.aborted) {
+    const url = `${baseUrl}/api/events`;
+    try {
+      const res = await fetch(url, { signal });
+      if (!res.ok) {
+        throw new Error(`SSE connect failed: ${res.status} ${res.statusText}`);
+      }
+
+      const reader = res.body?.getReader();
+      if (!reader) throw new Error('no response body');
+
+      const decoder = new TextDecoder();
+      // Holds the trailing partial line between chunks.
+      let buffer = '';
+
+      while (!signal.aborted) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        buffer += decoder.decode(value, { stream: true });
+
+        // Everything before the last newline is complete; keep the remainder.
+        const lines = buffer.split('\n');
+        buffer = lines.pop() ?? '';
+
+        for (const line of lines) {
+          if (signal.aborted) break;
+          const event = parseSseLine(line);
+          if (!event) continue;
+
+          // A failing handler must not tear down the stream: log and continue.
+          try {
+            await Promise.resolve(deps.onEvent(providerId, event));
+          } catch (err) {
+            deps.log.error({ providerId, err: (err as Error).message }, 'fleet: onEvent failed');
+          }
+        }
+      }
+
+      // Clean stream end — healthy reconnect at base delay (pre-hardening).
+      failures = 0;
+      if (deps.isUp() && !signal.aborted) {
+        await sleep(policy.baseMs);
+      }
+    } catch (err) {
+      // Errors caused by shutdown or abort are expected; leave quietly.
+      if (!deps.isUp() || signal.aborted) break;
+      failures += 1;
+      const decision = reconnectDecision(failures, policy);
+      deps.log.warn(
+        { providerId, failures, action: decision.action, err: (err as Error).message },
+        'fleet: SSE error; reconnecting',
+      );
+      if (decision.action === 'give-up') {
+        deps.log.warn({ providerId, failures }, 'fleet: SSE reconnect gave up (circuit breaker)');
+        await deps.onReconnectGiveUp(providerId);
+        break;
+      }
+      await sleep(decision.delayMs);
+    }
+  }
+}
--- a/apps/control/src/services/fleet-state.ts
+++ b/apps/control/src/services/fleet-state.ts
@@ -0,0 +1,89 @@
+/** Configuration record for a single host. */
+export interface HostConfig {
+  /** Provider id naming the host. */
+  providerId: string;
+  /** Base URL of the host — presumably its llama-swap endpoint; confirm against the caller. */
+  baseUrl: string;
+  /** Whether the host is enabled. */
+  enabled: boolean;
+}
+
+/** In-memory view of the fleet: one HostState per host, keyed by provider id. */
+export interface FleetState {
+  hosts: Map<string, HostState>;
+}
+
+/** Mutable per-host record stored in FleetState.hosts. */
+export interface HostState {
+  /** Provider id; the same value is the host's key in FleetState.hosts. */
+  providerId: string;
+  /** Connection status of the host; records created by ensureHostState() start as 'down'. */
+  liveness: 'connected' | 'reconnecting' | 'down';
+  /** Time of the last stampLastSeen() call, or null if the host was never stamped. */
+  lastSeenAt: Date | null;
+  /** Counter advanced by incrementSeq(); starts at 0. */
+  seq: number;
+  /** Host-level inflight total (the fork's SSE publishes only a total, not per-model). */
+  inflightTotal: number;
+  /** Per-model state — presumably keyed by model name; confirm against the writer. */
+  models: Map<string, ModelState>;
+}
+
+/** State tracked for one model on a host (the value type of HostState.models). */
+export interface ModelState {
+  /** Model identifier. */
+  model: string;
+  /** Model state as a free-form string; the set of values is not defined in this file. */
+  state: string;
+  /** Timestamp attached to this state — presumably when it was observed; TODO confirm. */
+  ts: Date;
+  /** TTL deadline for the model, or null when there is none. */
+  ttlDeadline: Date | null;
+  /** Per-model inflight count. */
+  inflight: number;
+}
+
+/**
+ * Plain-data snapshot shape: the host/model fields of HostState/ModelState with
+ * Dates carried as strings and Maps as arrays, plus optional request and
+ * perf-sample lists. Note that HostState.inflightTotal has no counterpart here.
+ */
+export interface SnapshotData {
+  hosts: Array<{
+    providerId: string;
+    liveness: 'connected' | 'reconnecting' | 'down';
+    // String form of HostState.lastSeenAt — presumably ISO-8601; confirm against the producer.
+    lastSeenAt: string | null;
+    seq: number;
+    models: Array<{
+      model: string;
+      state: string;
+      ts: string;
+      ttlDeadline: string | null;
+      inflight: number;
+    }>;
+  }>;
+  // Optional list of request records; nullable fields may be absent on a record.
+  requests?: Array<{
+    id: number;
+    providerId: string;
+    ts: string;
+    model: string | null;
+    reqPath: string | null;
+    statusCode: number | null;
+    durationMs: number | null;
+  }>;
+  // Optional list of perf samples; gpu/sys payloads are opaque at this layer.
+  perfSamples?: Array<{
+    providerId: string;
+    ts: string;
+    gpu: unknown;
+    sys: unknown;
+  }>;
+}
+
+// ─── helpers for tests ──────────────────────────────────────────────────────
+
+/** Build an empty fleet: a fresh, independent host map with no entries. */
+export function createFleetState(): FleetState {
+  const hosts = new Map<string, HostState>();
+  return { hosts };
+}
+
+/**
+ * Return the HostState registered for `providerId`, creating and registering a
+ * default one ('down', never seen, seq 0, no inflight, no models) when the host
+ * is not in the fleet yet.
+ */
+export function ensureHostState(fleet: FleetState, providerId: string): HostState {
+  const existing = fleet.hosts.get(providerId);
+  if (existing) return existing;
+
+  const fresh: HostState = {
+    providerId,
+    liveness: 'down',
+    lastSeenAt: null,
+    seq: 0,
+    inflightTotal: 0,
+    models: new Map(),
+  };
+  fleet.hosts.set(providerId, fresh);
+  return fresh;
+}
+
+/** Record "now" as the host's last-seen time (mutates `state`). */
+export function stampLastSeen(state: HostState): void {
+  const now = new Date();
+  state.lastSeenAt = now;
+}
+
+/** Advance the host's sequence counter by one and return the new value. */
+export function incrementSeq(state: HostState): number {
+  const next = state.seq + 1;
+  state.seq = next;
+  return next;
+}
--- a/apps/control/src/services/gateway.ts
+++ b/apps/control/src/services/gateway.ts
@@ -0,0 +1,140 @@
+/**
+ * P7.1: auto:* gateway candidate resolution.
+ *
+ * The gateway exposes OpenAI-compatible virtual models. A completion against
+ * `auto:code` (etc.) is resolved to an ordered list of concrete candidate
+ * composite ids ('provider/model'), then dispatched with failover.
+ *
+ * Ordering source:
+ *   - An explicit route_policy for the virtual model (admin-curated candidates).
+ *   - Otherwise, advisory routing scores ranked by the category metric.
+ *
+ * Health filtering (only connected hosts are eligible) is applied last so a
+ * curated policy never dispatches to a down host.
+ *
+ * Pure helpers (orderCandidates, parseVirtualModel) are unit-tested; the DB
+ * read lives in resolveCandidates().
+ */
+
+import type { Sql } from '../db.js';
+import type { FleetState } from './fleet-state.js';
+import { computeRoutingScores, type ModelScore } from './routing-scores.js';
+import { jsonbStringArray } from './jsonb.js';
+
+/**
+ * Virtual model ids the gateway names explicitly. Note that
+ * isGatewayVirtualModel() accepts any 'auto:*' id, not only the ones listed here.
+ */
+export const VIRTUAL_MODELS = ['auto', 'auto:code', 'auto:fast', 'auto:cheap'] as const;
+/** Union of the literal ids in VIRTUAL_MODELS. */
+export type VirtualModel = (typeof VIRTUAL_MODELS)[number];
+
+/** True for the bare 'auto' id and for any id carrying the 'auto:' prefix. */
+export function isGatewayVirtualModel(id: string): boolean {
+  if (id === 'auto') return true;
+  return id.startsWith('auto:');
+}
+
+/**
+ * Normalize a model id to the bare virtual model token by dropping everything
+ * up to and including the first '/'. The picker may prepend the gateway
+ * provider id ('auto'), so 'auto/auto:code' becomes 'auto:code'. Ids without a
+ * '/' are returned unchanged.
+ */
+export function parseVirtualModel(modelId: string): string {
+  const sep = modelId.indexOf('/');
+  if (sep < 0) return modelId;
+  return modelId.substring(sep + 1);
+}
+
+/** Row shape read from the route_policies table (see resolveCandidates). */
+export interface RoutePolicyRow {
+  /** Virtual model token this policy applies to (matched against the parsed model id). */
+  virtual_model: string;
+  candidates: unknown; // jsonb: porsager returns a parsed array (see jsonb.ts)
+  /** Composite id appended after the curated candidates, or null for none. */
+  fallback: string | null;
+  /** Only rows with enabled = true are selected by resolveCandidates. */
+  enabled: boolean;
+}
+
+/**
+ * Order concrete candidates for a virtual model. Pure.
+ *
+ * With an explicit policy, the curated candidate list defines the order, the
+ * fallback is appended last, and repeated ids collapse to their first
+ * occurrence so failover never retries the same target twice. A curated id is
+ * dropped only when it appears in `scores` and no entry for it is healthy; ids
+ * absent from `scores` are kept (health unknown — let dispatch try).
+ *
+ * Without a policy, candidates are the healthy entries of `scores` that have a
+ * non-null category metric, sorted by that metric, highest first:
+ *   - 'auto:code'               -> codeScore
+ *   - 'auto:fast', 'auto:cheap' -> avgGenTps
+ *   - anything else ('auto')    -> evalScore, else avgGenTps / 1000
+ *
+ * @param virtualModel bare virtual model token (e.g. 'auto:code')
+ * @param policy curated candidates plus optional fallback, or null to derive from scores
+ * @param scores advisory scores; also the only source of health information
+ * @returns ordered composite ids ('provider/model')
+ */
+export function orderCandidates(
+  virtualModel: string,
+  policy: { candidates: string[]; fallback: string | null } | null,
+  scores: ModelScore[],
+): string[] {
+  if (policy) {
+    // Index the scores once so the curated filter is O(candidates + scores)
+    // instead of scanning `scores` for every candidate.
+    const known = new Set<string>();
+    const healthy = new Set<string>();
+    for (const s of scores) {
+      known.add(s.compositeId);
+      if (s.healthy) healthy.add(s.compositeId);
+    }
+
+    const curated = policy.fallback
+      ? [...policy.candidates, policy.fallback]
+      : policy.candidates;
+
+    // A Set iterates in first-insertion order, so this dedupes without
+    // reordering (and keeps the fallback out if it is already curated).
+    return [...new Set(curated)].filter((id) => !known.has(id) || healthy.has(id));
+  }
+
+  // Derive from advisory scores by category metric.
+  const metric = (s: ModelScore): number | null => {
+    switch (virtualModel) {
+      case 'auto:code':
+        return s.codeScore;
+      case 'auto:fast':
+      case 'auto:cheap':
+        return s.avgGenTps;
+      case 'auto':
+      default:
+        // Overall: prefer eval score, then throughput.
+        return s.evalScore ?? (s.avgGenTps != null ? s.avgGenTps / 1000 : null);
+    }
+  };
+
+  // Evaluate the metric once per entry rather than on every comparison.
+  const ranked: Array<{ id: string; value: number }> = [];
+  for (const s of scores) {
+    if (!s.healthy) continue;
+    const value = metric(s);
+    if (value != null) ranked.push({ id: s.compositeId, value });
+  }
+  ranked.sort((a, b) => b.value - a.value);
+  return ranked.map((r) => r.id);
+}
+
+/** Result of resolveCandidates(). */
+export interface ResolvedCandidates {
+  /** Bare virtual model token after parseVirtualModel(). */
+  virtualModel: string;
+  /** Ordered composite ids produced by orderCandidates(). */
+  candidates: string[];
+  /** Name of the route policy that was applied, or null when none matched. */
+  policyName: string | null;
+}
+
+/**
+ * Resolve the ordered candidate list for a virtual model.
+ *
+ * Reads at most one enabled route policy for the parsed virtual model, computes
+ * the advisory routing scores for the fleet, and hands both to
+ * orderCandidates(). `policyName` is the matched policy's name, or null when no
+ * enabled policy exists (candidates are then derived from scores alone).
+ */
+export async function resolveCandidates(
+  sql: Sql,
+  fleet: FleetState,
+  modelId: string,
+): Promise<ResolvedCandidates> {
+  const virtualModel = parseVirtualModel(modelId);
+
+  const policyRows = await sql<(RoutePolicyRow & { name: string })[]>`
+    SELECT name, virtual_model, candidates, fallback, enabled
+    FROM route_policies
+    WHERE virtual_model = ${virtualModel} AND enabled = true
+    LIMIT 1
+  `;
+
+  const scores = await computeRoutingScores(sql, fleet);
+
+  // LIMIT 1 above: there is either exactly one matching row or none.
+  const row = policyRows.length > 0 ? policyRows[0]! : null;
+  const policy = row
+    ? { candidates: jsonbStringArray(row.candidates as unknown), fallback: row.fallback }
+    : null;
+
+  return {
+    virtualModel,
+    candidates: orderCandidates(virtualModel, policy, scores),
+    policyName: row ? row.name : null,
+  };
+}
+
+/**
+ * Split a composite id 'provider/model' at its first '/'.
+ * Returns null when there is no '/' or the provider part would be empty.
+ */
+export function splitComposite(compositeId: string): { providerId: string; model: string } | null {
+  const sep = compositeId.indexOf('/');
+  return sep > 0
+    ? { providerId: compositeId.substring(0, sep), model: compositeId.substring(sep + 1) }
+    : null;
+}
--- a/apps/control/src/services/host-access.ts
+++ b/apps/control/src/services/host-access.ts
@@ -0,0 +1,19 @@
+/**
+ * Host-access seam: acquire exclusive access to a host for a purpose.
+ *
+ * V1 body: no-op returning {ok: true}. This is the P8 seam — P8 swaps the
+ * body for a DB lease without touching the bench engine.
+ */
+
+/** Outcome of a host-access request. */
+export interface HostGrant {
+  /** True when access was granted. */
+  ok: boolean;
+  /** Optional explanation — presumably set when access is refused; the V1 body never sets it. */
+  reason?: string;
+}
+
+/**
+ * Request exclusive access to a host for a purpose.
+ *
+ * V1 is a no-op seam: neither argument is consulted and access is always
+ * granted.
+ */
+export async function acquireHostAccess(
+  providerId: string,
+  purpose: string,
+): Promise<HostGrant> {
+  const grant: HostGrant = { ok: true };
+  return grant;
+}
--- a/apps/control/src/services/jsonb.ts
+++ b/apps/control/src/services/jsonb.ts
@@ -0,0 +1,41 @@
+/**
+ * JSONB read helpers.
+ *
+ * porsager/postgres returns `jsonb` columns already parsed into JS values (an
+ * object/array), NOT a JSON string. Calling JSON.parse on that throws
+ * ("[object Object] is not valid JSON"). These helpers accept either shape so a
+ * read works whether the driver parsed the column or handed back a string.
+ */
+
+/**
+ * Coerce a JSONB column value to a string array.
+ * Accepts an already-parsed value or a JSON string; non-string elements are
+ * dropped, and anything that is not (or does not parse to) an array yields [].
+ */
+export function jsonbStringArray(value: unknown): string[] {
+  let decoded: unknown = value;
+  if (typeof value === 'string') {
+    try {
+      decoded = JSON.parse(value);
+    } catch {
+      return [];
+    }
+  }
+  if (!Array.isArray(decoded)) return [];
+
+  const strings: string[] = [];
+  for (const item of decoded) {
+    if (typeof item === 'string') strings.push(item);
+  }
+  return strings;
+}
+
+/**
+ * Coerce a JSONB column value to an array (elements untyped).
+ * Accepts an already-parsed value or a JSON string; anything that is not (or
+ * does not parse to) an array yields [].
+ */
+export function jsonbArray(value: unknown): unknown[] {
+  let decoded: unknown = value;
+  if (typeof value === 'string') {
+    try {
+      decoded = JSON.parse(value);
+    } catch {
+      return [];
+    }
+  }
+  if (Array.isArray(decoded)) return decoded;
+  return [];
+}
+
+/** Coerce a JSONB column value to a number array; non-number elements are dropped. */
+export function jsonbNumberArray(value: unknown): number[] {
+  const numbers: number[] = [];
+  for (const item of jsonbArray(value)) {
+    if (typeof item === 'number') numbers.push(item);
+  }
+  return numbers;
+}
+
+/**
+ * Coerce a JSONB column value to a plain object, or null.
+ * Accepts an already-parsed value or a JSON string; arrays, primitives,
+ * null/undefined and unparseable strings all yield null.
+ */
+export function jsonbObject(value: unknown): Record<string, unknown> | null {
+  const isPlainObject = (candidate: unknown): candidate is Record<string, unknown> =>
+    typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate);
+
+  if (value == null) return null;
+  if (typeof value !== 'string') return isPlainObject(value) ? value : null;
+
+  let decoded: unknown;
+  try {
+    decoded = JSON.parse(value);
+  } catch {
+    return null;
+  }
+  return isPlainObject(decoded) ? decoded : null;
+}
--- a/apps/control/src/services/judge-runner.ts
+++ b/apps/control/src/services/judge-runner.ts
@@ -0,0 +1,288 @@
+import type { Sql } from '../db.js';
+import type { DeltaEmitter } from '../index.js';
+import { recordEvalResult, completeEvalRun } from './eval-suites.js';
+import { resolveProviderBaseUrl } from './llama-providers.js';
+
+// ─── types ──────────────────────────────────────────────────────────────────
+
+/** Inputs for one judge-based eval run (see runJudgeEval). */
+export interface JudgeEvalParams {
+  /** Eval run id; results and job frames are recorded against it. */
+  runId: string;
+  /** Provider whose base URL receives the requests. */
+  providerId: string;
+  /** Target model under evaluation. */
+  model: string;
+  /** Quantization label, if any; runJudgeEval does not use its value. */
+  quant: string | null;
+  /** Task records; runJudgeEval reads `id`, `prompt` and `rubric` from each. */
+  tasks: Array<Record<string, unknown>>;
+  /** Judge model id, or null to use the default judge model. */
+  judgeModel: string | null;
+}
+
+/** Progress payload handed to the onProgress callback of runJudgeEval. */
+export interface JudgeProgress {
+  /** Number of tasks processed so far; failed tasks are counted too. */
+  completedTasks: number;
+}
+
+/** Outcome of runJudgeEval. */
+export interface JudgeResult {
+  /** Run-level error message, or null when the run had no run-level error. */
+  error: string | null;
+}
+
+// ─── judge runner ───────────────────────────────────────────────────────────
+
+/**
+ * Run a judge-based eval (chat quality, rubric scoring) over `params.tasks`.
+ *
+ * For each task, in order:
+ *   1. the target `model` answers `task.prompt` (generateResponse);
+ *   2. the answer is scored — by the judge model against `task.rubric` when a
+ *      rubric is present, otherwise 1/1 for a non-empty answer and 0/1 for an
+ *      empty one;
+ *   3. the outcome is persisted with recordEvalResult, `onProgress` is called,
+ *      and (on success only) a `control_job` "running" frame is published.
+ *
+ * A task that throws is recorded with null scores plus the error message and
+ * still counts toward `completedTasks`; it does not stop the run.
+ *
+ * The judge model is `params.judgeModel` or, when null, the default from
+ * resolveDefaultJudgeModel(). Target and judge requests are both sent to the
+ * target provider's base URL, with temperature 0 and the
+ * `X-Boo-Source: control-eval` header.
+ *
+ * @param params run id, provider, target model, tasks and optional judge model
+ * @param sql database handle passed through to the eval-suites helpers
+ * @param emitter sink for `control_job` progress frames
+ * @param seq sequence number stamped on every published frame
+ * @param logger run-level logger
+ * @param onProgress invoked after every task, successful or not
+ * @returns `{ error }` — non-null only when the provider has no base URL (in
+ *   that case completeEvalRun is also called with the error); per-task
+ *   failures leave it null.
+ */
+export async function runJudgeEval(
+  params: JudgeEvalParams,
+  sql: Sql,
+  emitter: DeltaEmitter,
+  seq: number,
+  logger: import('fastify').FastifyBaseLogger,
+  onProgress: (progress: JudgeProgress) => void,
+): Promise<JudgeResult> {
+  const { runId, providerId, model, tasks, judgeModel } = params;
+
+  // Resolve the target model's base URL.
+  const baseUrl = resolveProviderBaseUrl(providerId);
+  if (!baseUrl) {
+    const err = `no base URL for provider ${providerId}`;
+    await completeEvalRun(sql, runId, 0, null, err).catch(() => {});
+    return { error: err };
+  }
+
+  // Determine judge model: explicit choice -> default judge model.
+  const judgeModelId = judgeModel ?? resolveDefaultJudgeModel();
+
+  logger.info(
+    { runId, judgeModel: judgeModelId, targetModel: model, taskCount: tasks.length },
+    'eval: judge run started',
+  );
+
+  let completedTasks = 0;
+
+  for (let i = 0; i < tasks.length; i++) {
+    const task = tasks[i];
+    if (!task) continue;
+    const taskId = (task.id as string) ?? `task_${i}`;
+    const prompt = (task.prompt as string) ?? '';
+    const rubric = (task.rubric as { criteria: Array<{ criterion: string; description: string; weight: number }>; max_score: number }) ?? null;
+
+    const startTime = Date.now();
+
+    try {
+      // Generate the response from the target model.
+      const response = await generateResponse(baseUrl, model, prompt);
+
+      // Score the response.
+      let score: number | null = null;
+      let maxScore: number | null = null;
+      let rationale: string | null = null;
+
+      if (rubric) {
+        const scoring = await scoreWithRubric(
+          baseUrl,
+          judgeModelId,
+          prompt,
+          response,
+          rubric,
+        );
+        score = scoring.score;
+        maxScore = scoring.maxScore;
+        rationale = scoring.rationale;
+      } else {
+        // Simple pass/fail for tasks without rubric.
+        const answered = response.trim().length > 0;
+        score = answered ? 1 : 0;
+        maxScore = 1;
+        rationale = answered ? 'Response generated' : 'Empty response';
+      }
+
+      const executionMs = Date.now() - startTime;
+
+      await recordEvalResult(
+        sql,
+        runId,
+        taskId,
+        i,
+        score,
+        maxScore,
+        rationale,
+        null,
+        null,
+        null,
+        executionMs,
+        null,
+      );
+
+      completedTasks++;
+      onProgress({ completedTasks });
+
+      emitter.publish({
+        type: 'control_job' as const,
+        seq,
+        jobType: 'eval' as const,
+        jobId: runId,
+        status: 'running' as const,
+        detail: {
+          completedTasks,
+          totalTasks: tasks.length,
+          taskId,
+          score,
+        },
+      });
+    } catch (err) {
+      // Optional chaining keeps this handler from throwing itself when the
+      // thrown value is null/undefined, which would abort the whole run.
+      const msg = (err as Error | null | undefined)?.message ?? String(err);
+      logger.warn({ taskId, err: msg }, 'eval: judge task failed');
+
+      // Best-effort: a failure to persist the failure must not stop the run.
+      await recordEvalResult(
+        sql,
+        runId,
+        taskId,
+        i,
+        null,
+        null,
+        null,
+        null,
+        null,
+        null,
+        Date.now() - startTime,
+        msg,
+      ).catch(() => {});
+
+      completedTasks++;
+      onProgress({ completedTasks });
+    }
+  }
+
+  // Per-task failures are recorded on their result rows, not as a run error.
+  return { error: null };
+}
+
+/**
+ * Ask the target model for a completion of `prompt` via the OpenAI-compatible
+ * `/v1/chat/completions` endpoint under `baseUrl`.
+ *
+ * The request is a single user message at temperature 0 (design S8: generation
+ * must be as reproducible as judging, audit B1), capped at 2048 tokens, tagged
+ * `X-Boo-Source: control-eval`, and aborted after 120 s.
+ *
+ * @returns the first choice's message content, or '' when there is none
+ * @throws Error with the status and the first 200 characters of the body on a non-2xx response
+ */
+async function generateResponse(
+  baseUrl: string,
+  model: string,
+  prompt: string,
+): Promise<string> {
+  const endpoint = `${baseUrl}/v1/chat/completions`;
+  const payload = {
+    model,
+    messages: [{ role: 'user', content: prompt }],
+    temperature: 0,
+    max_tokens: 2048,
+  };
+
+  const res = await fetch(endpoint, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'X-Boo-Source': 'control-eval',
+    },
+    body: JSON.stringify(payload),
+    signal: AbortSignal.timeout(120_000),
+  });
+
+  if (res.ok) {
+    const data = (await res.json()) as { choices?: Array<{ message?: { content?: string } }> };
+    const firstChoice = data.choices?.[0];
+    return firstChoice?.message?.content ?? '';
+  }
+
+  // Reading the error body is best-effort; fall back to an empty detail.
+  const detail = await res.text().catch(() => '');
+  throw new Error(`model response failed: ${res.status} ${detail.slice(0, 200)}`);
+}
+
+/**
+ * Score a response against a rubric using an LLM as judge.
+ *
+ * Sends one chat completion to `judgeModelId` under `baseUrl` (temperature 0,
+ * JSON response format, 120 s timeout) and reads `weighted_score` and
+ * `overall_rationale` from the reply. The reply may be bare JSON or JSON inside
+ * a markdown code fence.
+ *
+ * Parsing is lenient: a reply that cannot be parsed at all, that is not a JSON
+ * object, or whose `weighted_score` is missing or not a finite number scores 0.
+ * A missing or non-string rationale becomes 'No rationale provided'. The score
+ * is clamped to the range [0, rubric.max_score].
+ *
+ * @returns the clamped score, the rubric's max score, and the judge's rationale
+ * @throws Error with the status and the first 200 characters of the body on a non-2xx response
+ */
+async function scoreWithRubric(
+  baseUrl: string,
+  judgeModelId: string,
+  prompt: string,
+  response: string,
+  rubric: { criteria: Array<{ criterion: string; description: string; weight: number }>; max_score: number },
+): Promise<{ score: number; maxScore: number; rationale: string }> {
+  const criteriaText = rubric.criteria
+    .map((c, i) => `${i + 1}. **${c.criterion}** (weight: ${c.weight}): ${c.description}`)
+    .join('\n');
+
+  const judgePrompt = `You are an evaluation judge. Score the following response against the given prompt using the rubric criteria.
+
+**Prompt:**
+${prompt}
+
+**Response:**
+${response}
+
+**Rubric Criteria (score each 0-3, then compute weighted average):**
+${criteriaText}
+
+**Max Score:** ${rubric.max_score}
+
+Return your evaluation in JSON format:
+{
+  "criterion_scores": {
+    "criterion_name": { "score": 0-3, "rationale": "explanation" }
+  },
+  "weighted_score": <number>,
+  "overall_rationale": "<summary>"
+}`;
+
+  const res = await fetch(`${baseUrl}/v1/chat/completions`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'X-Boo-Source': 'control-eval',
+    },
+    body: JSON.stringify({
+      model: judgeModelId,
+      messages: [{ role: 'user', content: judgePrompt }],
+      temperature: 0,
+      max_tokens: 1024,
+      response_format: { type: 'json_object' },
+    }),
+    signal: AbortSignal.timeout(120_000),
+  });
+
+  if (!res.ok) {
+    const body = await res.text().catch(() => '');
+    throw new Error(`judge failed: ${res.status} ${body.slice(0, 200)}`);
+  }
+
+  const data = await res.json() as { choices?: Array<{ message?: { content?: string } }> };
+  const content = data.choices?.[0]?.message?.content ?? '{}';
+
+  // JSON.parse never yields undefined, so undefined can signal "did not parse".
+  const tryParse = (text: string): unknown => {
+    try {
+      return JSON.parse(text);
+    } catch {
+      return undefined;
+    }
+  };
+
+  let verdict = tryParse(content);
+  if (verdict === undefined) {
+    // Fallback: try to extract JSON from markdown code blocks.
+    const fenced = content.match(/```(?:json)?\s*([\s\S]*?)```/);
+    if (fenced && fenced[1]) verdict = tryParse(fenced[1]);
+  }
+
+  // Anything other than a JSON object (null, array items aside, primitives,
+  // unparseable text) is treated as an empty verdict.
+  const fields: { weighted_score?: unknown; overall_rationale?: unknown } =
+    typeof verdict === 'object' && verdict !== null ? verdict : {};
+
+  // Judges sometimes quote numbers; accept numeric strings, reject the rest.
+  const rawScore =
+    typeof fields.weighted_score === 'string' && fields.weighted_score.trim() !== ''
+      ? Number(fields.weighted_score)
+      : fields.weighted_score;
+  const score = typeof rawScore === 'number' && Number.isFinite(rawScore) ? rawScore : 0;
+  const rationale =
+    typeof fields.overall_rationale === 'string'
+      ? fields.overall_rationale
+      : 'No rationale provided';
+
+  return {
+    score: Math.min(Math.max(score, 0), rubric.max_score),
+    maxScore: rubric.max_score,
+    rationale,
+  };
+}
+
+/**
+ * Resolve the default judge model: the EVAL_JUDGE_MODEL environment variable
+ * when it is set, otherwise 'qwen2.5-72b-instruct'.
+ */
+function resolveDefaultJudgeModel(): string {
+  const { EVAL_JUDGE_MODEL: configured } = process.env;
+  return configured ?? 'qwen2.5-72b-instruct';
+}
--- a/apps/control/src/services/llama-providers.ts
+++ b/apps/control/src/services/llama-providers.ts
@@ -0,0 +1,101 @@
+/**
+ * Local provider registry loader (control-side).
+ *
+ * Reads the shared llama-providers config file at startup and caches the
+ * parsed result. When the file is absent or invalid, synthesizes a single
+ * legacy provider from LLAMA_SWAP_URL so the service starts with only
+ * legacy env vars (D-1).
+ *
+ * Schema and pure helpers live in @boocode/contracts/llama-providers.
+ * File I/O stays app-local per D-1.
+ */
+import { readFileSync } from 'node:fs';
+import {
+  LlamaProvidersFileSchema,
+  type LlamaProvidersFile,
+  type LlamaProvider,
+} from '@boocode/contracts/llama-providers';
+
+export type { LlamaProvidersFile, LlamaProvider };
+
+/**
+ * Synthesize the legacy registry: a single 'llama-swap' provider at
+ * `llamaSwapUrl`, which is also the default provider.
+ */
+function buildLegacyProvider(llamaSwapUrl: string): LlamaProvidersFile {
+  const legacyId = 'llama-swap';
+  const legacy: LlamaProvider = {
+    id: legacyId,
+    label: 'llama-swap',
+    baseUrl: llamaSwapUrl,
+    kind: 'llama-swap',
+  };
+  return { defaultProvider: legacyId, providers: [legacy] };
+}
+
+// Module-level cache of the most recently loaded registry (null until first load).
+let cached: LlamaProvidersFile | null = null;
+
+/**
+ * Load (or re-load) the local provider config and cache the result.
+ *
+ * Never throws on bad input: when no path is given, the file cannot be read,
+ * the JSON is invalid, or schema validation fails, the legacy single-provider
+ * registry built from `llamaSwapUrl` is cached and returned instead.
+ */
+export function loadLlamaProviders(
+  providersPath: string | undefined,
+  llamaSwapUrl: string,
+): LlamaProvidersFile {
+  // Every failure path ends the same way: cache and return the legacy shape.
+  const useLegacy = (): LlamaProvidersFile => {
+    const legacy = buildLegacyProvider(llamaSwapUrl);
+    cached = legacy;
+    return legacy;
+  };
+
+  if (!providersPath) return useLegacy();
+
+  let raw: string;
+  try {
+    raw = readFileSync(providersPath, 'utf8');
+  } catch {
+    console.warn(
+      `llama-providers: file not found at ${providersPath} -- falling back to legacy single-provider`,
+    );
+    return useLegacy();
+  }
+
+  let json: unknown;
+  try {
+    json = JSON.parse(raw);
+  } catch (err) {
+    console.error(
+      `llama-providers: invalid JSON in ${providersPath} -- falling back to legacy single-provider`,
+      err,
+    );
+    return useLegacy();
+  }
+
+  const parsed = LlamaProvidersFileSchema.safeParse(json);
+  if (parsed.success) {
+    cached = parsed.data;
+    return parsed.data;
+  }
+
+  console.error(
+    `llama-providers: schema validation failed for ${providersPath} -- falling back to legacy single-provider`,
+    parsed.error.flatten(),
+  );
+  return useLegacy();
+}
+
+/**
+ * The cached provider config. When nothing has been loaded yet, returns a
+ * legacy fallback pointing at 'http://localhost:8080' (built per call, not cached).
+ */
+export function getLlamaProviders(): LlamaProvidersFile {
+  if (cached != null) return cached;
+  return buildLegacyProvider('http://localhost:8080');
+}
+
+/**
+ * Look up a provider's baseUrl by id in the cached registry.
+ * Returns null when no provider has that id (or the first match has no baseUrl).
+ */
+export function resolveProviderBaseUrl(providerId: string): string | null {
+  const { providers } = getLlamaProviders();
+  for (const candidate of providers) {
+    // First match wins, mirroring Array.prototype.find.
+    if (candidate.id === providerId) return candidate.baseUrl ?? null;
+  }
+  return null;
+}
--- a/apps/control/src/services/log-relay.ts
+++ b/apps/control/src/services/log-relay.ts
@@ -0,0 +1,67 @@
+/**
+ * Log relay: in-memory tail buffer per host for logData SSE events.
+ *
+ * - 2k-line tail per host for late joiners
+ * - Relays /api/events logData into control_log frames
+ * - Source filter: proxy | upstream | model
+ */
+
+/** Maximum number of lines retained per host. */
+const MAX_LOG_LINES = 2000;
+
+/** One relayed log line. */
+export interface LogLine {
+  providerId: string;
+  source: 'proxy' | 'upstream' | 'model';
+  line: string;
+  /** Time the line was appended to the relay. */
+  ts: Date;
+}
+
+/** In-memory, per-host tail buffers of relayed log lines. */
+export class LogRelay {
+  private tails: Map<string, LogLine[]> = new Map();
+
+  /**
+   * Append a log line to the host's tail, stamping it with the current time and
+   * discarding the oldest lines once the tail exceeds MAX_LOG_LINES.
+   */
+  append(providerId: string, source: 'proxy' | 'upstream' | 'model', line: string): void {
+    const known = this.tails.get(providerId);
+    const tail = known ?? [];
+    if (!known) this.tails.set(providerId, tail);
+
+    tail.push({ providerId, source, line, ts: new Date() });
+
+    // Drop the overflow from the front, in place.
+    const overflow = tail.length - MAX_LOG_LINES;
+    if (overflow > 0) tail.splice(0, overflow);
+  }
+
+  /**
+   * The tail buffer for a host (for late joiners). Returns the live internal
+   * array for a known host and a fresh empty array otherwise.
+   */
+  getTail(providerId: string): LogLine[] {
+    const tail = this.tails.get(providerId);
+    return tail === undefined ? [] : tail;
+  }
+
+  /**
+   * All tails concatenated host by host (for snapshot-on-join).
+   */
+  getAllTails(): LogLine[] {
+    let merged: LogLine[] = [];
+    this.tails.forEach((tail) => {
+      merged = merged.concat(tail);
+    });
+    return merged;
+  }
+
+  /**
+   * Distinct source values across all buffered lines, in first-seen order.
+   */
+  getSources(): string[] {
+    const seen = new Set<string>();
+    this.tails.forEach((tail) => {
+      tail.forEach((entry) => seen.add(entry.source));
+    });
+    return [...seen];
+  }
+}
--- a/apps/control/src/services/model-pull.ts
+++ b/apps/control/src/services/model-pull.ts
@@ -0,0 +1,105 @@
+/**
+ * P9 model pull: download a HuggingFace repo onto a host into its models dir.
+ *
+ * Non-blocking job (fire-and-forget like bench/eval), progress over the existing
+ * control_job frame (jobType 'action', detail.kind = 'pull'). The repo id is
+ * validated server-side as defense in depth on top of the wrapper's own check,
+ * then passed as a single token (never interpolated into a shell string in
+ * wrapper mode; in shell mode it is the only argument and is regex-clean).
+ */
+
+import type { DeltaEmitter } from '../index.js';
+import type { SshExec, SshTarget, SshMode } from './ssh-config.js';
+
+/**
+ * HF repo id pattern: `org/name`. Each segment must begin with an alphanumeric
+ * (HF's own rule) and continue with alphanumerics, '.', '_' or '-'. Requiring
+ * an alphanumeric first character also rejects '.' and '..' traversal segments
+ * (e.g. `../x`). Exactly one slash; no spaces or shell metacharacters.
+ */
+export const REPO_ID_RE = /^[A-Za-z0-9][A-Za-z0-9._-]*\/[A-Za-z0-9][A-Za-z0-9._-]*$/;
+
+/** True when `repo` is a well-formed HF repo id according to REPO_ID_RE. */
+export function validateRepoId(repo: string): boolean {
+  const matched = REPO_ID_RE.exec(repo);
+  return matched !== null;
+}
+
+/**
+ * Build the pull command for a host. Pure helper for testing.
+ * - wrapper mode: the `pull <repo>` verb (wrapper hardcodes the models dir).
+ * - shell mode: a direct `huggingface-cli download` into <modelsDir>/<repo__>,
+ *   where trailing slashes of modelsDir are dropped and every '/' in the repo
+ *   id becomes '__'.
+ *
+ * The local directory is emitted as a single-quoted shell word; any single
+ * quote inside it is escaped so the path cannot terminate the quoting. `repo`
+ * is inserted as-is — callers must validate it first (see validateRepoId).
+ *
+ * @param mode 'wrapper' selects the wrapper verb; any other mode builds the shell command
+ * @param repo HF repo id (`org/name`)
+ * @param modelsDir models directory; only used in shell mode
+ */
+export function buildPullCommand(mode: SshMode, repo: string, modelsDir?: string): string {
+  if (mode === 'wrapper') return `pull ${repo}`;
+  const dir = (modelsDir ?? '').replace(/\/+$/, '');
+  const local = `${dir}/${repo.replace(/\//g, '__')}`;
+  // POSIX single-quote escaping: close the quote, emit \', reopen the quote.
+  const quoted = `'${local.replace(/'/g, () => `'\\''`)}'`;
+  return `huggingface-cli download ${repo} --local-dir ${quoted}`;
+}
+
+/** Inputs for one model pull job (see runModelPull). */
+export interface PullParams {
+  /** Job id stamped on every control_job frame published for this pull. */
+  jobId: string;
+  /** SSH target handed to the exec function. */
+  target: SshTarget;
+  /** HF repo id (`org/name`); validated again inside runModelPull. */
+  repo: string;
+  /** Selects the wrapper verb or the direct shell command (see buildPullCommand). */
+  mode: SshMode;
+  modelsDir?: string; // required for shell mode
+}
+
+/** Outcome of runModelPull. */
+export interface PullResult {
+  /** True when the pull command exited with code 0. */
+  ok: boolean;
+  /** Failure description; set whenever `ok` is false. */
+  error?: string;
+}
+
+/**
+ * Run a model pull as a control_job (jobType 'action', detail.kind 'pull').
+ *
+ * Publishes a 'failed' frame and returns early when the repo id is invalid or
+ * shell mode has no models directory. Otherwise publishes 'running', executes
+ * the pull command over `exec`, and finishes with 'completed' (carrying the
+ * last 500 characters of stdout) or 'failed' (non-zero exit with the first 500
+ * characters of stderr, or the thrown error's message).
+ *
+ * Resolves when the pull finishes; callers invoke it fire-and-forget so the
+ * HTTP response can return 202 immediately. Never rejects on exec failure.
+ */
+export async function runModelPull(
+  params: PullParams,
+  exec: SshExec,
+  emitter: DeltaEmitter,
+  seq: number = 0,
+): Promise<PullResult> {
+  const { jobId, target, repo, mode, modelsDir } = params;
+
+  // Publish a 'failed' frame for this job and build the matching result.
+  const fail = (error: string): PullResult => {
+    emitter.publish({
+      type: 'control_job' as const, seq, jobType: 'action' as const, jobId,
+      status: 'failed' as const, detail: { kind: 'pull', repo, error },
+    });
+    return { ok: false, error };
+  };
+
+  if (!validateRepoId(repo)) return fail('invalid repo id');
+  if (mode === 'shell' && !modelsDir) return fail('shell mode requires a models directory');
+
+  emitter.publish({
+    type: 'control_job' as const, seq, jobType: 'action' as const, jobId,
+    status: 'running' as const, detail: { kind: 'pull', repo },
+  });
+
+  let res: Awaited<ReturnType<SshExec>>;
+  try {
+    res = await exec(target, buildPullCommand(mode, repo, modelsDir));
+  } catch (err) {
+    return fail((err as Error).message ?? String(err));
+  }
+
+  if (res.code !== 0) {
+    return fail(`pull failed (exit ${res.code}): ${res.stderr.slice(0, 500)}`);
+  }
+
+  try {
+    emitter.publish({
+      type: 'control_job' as const, seq, jobType: 'action' as const, jobId,
+      status: 'completed' as const, detail: { kind: 'pull', repo, output: res.stdout.slice(-500) },
+    });
+  } catch (err) {
+    // Same handling the original gave to a throw while reporting completion.
+    return fail((err as Error).message ?? String(err));
+  }
+  return { ok: true };
+}
--- a/apps/control/src/services/reconcile.ts
+++ b/apps/control/src/services/reconcile.ts
@@ -0,0 +1,12 @@
+/**
+ * Reconcile gap detection.
+ *
+ * A gap exists when the oldest entry of a reconcile fetch is strictly newer
+ * than the newest entry already persisted for that provider: the ring wrapped
+ * past our tail. A missing timestamp on either side, or one that does not
+ * parse as a date, reports no gap.
+ */
+export function detectGap(
+  oldestReconcileTs: string | null,
+  newestPersistedTs: string | null,
+): boolean {
+  if (!oldestReconcileTs) return false;
+  if (!newestPersistedTs) return false;
+
+  const oldestFetched = new Date(oldestReconcileTs).getTime();
+  const newestStored = new Date(newestPersistedTs).getTime();
+  // NaN (unparseable input) compares false, so bad timestamps never flag a gap.
+  return oldestFetched > newestStored;
+}
--- a/apps/control/src/services/reports.ts
+++ b/apps/control/src/services/reports.ts
@@ -0,0 +1,299 @@
+/**
+ * P6.2: Scheduled fleet digest reports.
+ *
+ * Same in-process timer pattern as the retention job (design §3/§6): an hourly
+ * tick reads control_schedule_meta.last_run_at and runs the digest when due,
+ * so a boot after a missed window catches up immediately. No cron dependency,
+ * no new scheduler abstraction.
+ *
+ * The report gathers usage, trends vs the prior period, swap counts, the eval
+ * leaderboard, and bench regression anomalies, renders a markdown digest, and
+ * persists both the markdown and the structured stats to control_reports.
+ */
+
+import type { Sql } from '../db.js';
+
+/** Supported digest cadences; selects a 24-hour or a 7-day report window. */
+export type ReportInterval = 'daily' | 'weekly';
+
+/**
+ * Structured stats behind one digest, as produced by gatherReportStats() and
+ * persisted next to the rendered markdown in control_reports.
+ */
+export interface ReportStats {
+  /** Inclusive start of the report window (ISO-8601 string). */
+  periodStart: string;
+  /** Exclusive end of the report window (ISO-8601 string). */
+  periodEnd: string;
+  interval: ReportInterval;
+  /** Number of control_requests rows inside the window. */
+  totalRequests: number;
+  /** Number of control_requests rows in the equally long window just before it. */
+  priorRequests: number;
+  totalInputTokens: number;
+  totalOutputTokens: number;
+  /** Per-source usage, busiest source first; a NULL source is reported as '(unattributed)'. */
+  bySource: Array<{ source: string; requests: number; inputTokens: number; outputTokens: number }>;
+  /** Per-provider request and swap counts, sorted by provider id. */
+  byProvider: Array<{ providerId: string; requests: number; swaps: number }>;
+  /** Latest completed eval score per (provider, model, suite kind), best first, at most 20 rows. */
+  leaderboard: Array<{ providerId: string; model: string; kind: string; avgScore: number | null }>;
+  /** Bench runs flagged 'regression' that finished inside the window, newest first. */
+  regressions: Array<{ providerId: string; model: string; avgGenTps: number | null }>;
+}
+
+/** Length of one report period in hours: 168 for weekly, 24 otherwise. */
+function intervalHours(interval: ReportInterval): number {
+  if (interval === 'weekly') {
+    return 7 * 24;
+  }
+  return 24;
+}
+
+/**
+ * Gather the structured stats for a report window. Pure read; no writes.
+ *
+ * The window is `[now - period, now)` where the period is 24 hours (daily) or
+ * 7 days (weekly); the "prior" window is the equally long span just before it.
+ *
+ * Token totals are cast to float8 rather than int: SUM() over integer columns
+ * yields bigint, and a cast to int raises "integer out of range" as soon as a
+ * window exceeds 2^31-1 tokens, which would abort the whole report. float8 is
+ * still delivered as a JS number and is exact up to 2^53.
+ *
+ * @param sql - database client
+ * @param interval - report cadence, selects the window length
+ * @param now - exclusive end of the report window
+ * @returns the gathered stats; counts default to 0 when no rows match
+ */
+export async function gatherReportStats(
+  sql: Sql,
+  interval: ReportInterval,
+  now: Date,
+): Promise<ReportStats> {
+  const hours = intervalHours(interval);
+  const periodEnd = now;
+  const periodStart = new Date(now.getTime() - hours * 3600_000);
+  const priorStart = new Date(periodStart.getTime() - hours * 3600_000);
+
+  const startIso = periodStart.toISOString();
+  const endIso = periodEnd.toISOString();
+  const priorIso = priorStart.toISOString();
+
+  // Fleet-wide totals for the window.
+  const totals = await sql<{ requests: number; in_tokens: number; out_tokens: number }[]>`
+    SELECT COUNT(*)::int AS requests,
+           COALESCE(SUM(input_tokens), 0)::float8 AS in_tokens,
+           COALESCE(SUM(output_tokens), 0)::float8 AS out_tokens
+    FROM control_requests
+    WHERE ts >= ${startIso} AND ts < ${endIso}
+  `;
+
+  // Request count of the preceding window, used for the trend percentage.
+  const prior = await sql<{ requests: number }[]>`
+    SELECT COUNT(*)::int AS requests
+    FROM control_requests
+    WHERE ts >= ${priorIso} AND ts < ${startIso}
+  `;
+
+  const bySource = await sql<{ source: string | null; requests: number; in_tokens: number; out_tokens: number }[]>`
+    SELECT source,
+           COUNT(*)::int AS requests,
+           COALESCE(SUM(input_tokens), 0)::float8 AS in_tokens,
+           COALESCE(SUM(output_tokens), 0)::float8 AS out_tokens
+    FROM control_requests
+    WHERE ts >= ${startIso} AND ts < ${endIso}
+    GROUP BY source
+    ORDER BY requests DESC
+  `;
+
+  const byProviderReqs = await sql<{ provider_id: string; requests: number }[]>`
+    SELECT provider_id, COUNT(*)::int AS requests
+    FROM control_requests
+    WHERE ts >= ${startIso} AND ts < ${endIso}
+    GROUP BY provider_id
+  `;
+
+  // Swap counts: a model entering 'ready' / 'starting' marks a load/swap.
+  const swaps = await sql<{ provider_id: string; swaps: number }[]>`
+    SELECT provider_id, COUNT(*)::int AS swaps
+    FROM control_model_events
+    WHERE ts >= ${startIso} AND ts < ${endIso}
+      AND state IN ('ready', 'starting')
+    GROUP BY provider_id
+  `;
+
+  // Merge request and swap counts: a provider appears if it has either.
+  const swapMap = new Map<string, number>();
+  for (const r of swaps) swapMap.set(r.provider_id, r.swaps);
+  const providerIds = new Set<string>([
+    ...byProviderReqs.map((r) => r.provider_id),
+    ...swaps.map((r) => r.provider_id),
+  ]);
+  const reqMap = new Map<string, number>();
+  for (const r of byProviderReqs) reqMap.set(r.provider_id, r.requests);
+
+  const byProvider = Array.from(providerIds)
+    .sort()
+    .map((providerId) => ({
+      providerId,
+      requests: reqMap.get(providerId) ?? 0,
+      swaps: swapMap.get(providerId) ?? 0,
+    }));
+
+  // Leaderboard: latest completed eval avgScore per (provider, model, kind).
+  const leaderboard = await sql<{ provider_id: string; model: string; kind: string; avg_score: number | null }[]>`
+    SELECT er.provider_id, er.model, es.kind,
+           (er.aggregate::jsonb ->> 'avgScore')::float AS avg_score
+    FROM eval_runs er
+    JOIN eval_suites es ON er.suite_id = es.id
+    WHERE er.status = 'completed' AND er.aggregate IS NOT NULL
+      AND er.finished_at = (
+        SELECT MAX(er2.finished_at) FROM eval_runs er2
+        JOIN eval_suites es2 ON er2.suite_id = es2.id
+        WHERE er2.provider_id = er.provider_id AND er2.model = er.model
+          AND es2.kind = es.kind AND er2.status = 'completed'
+      )
+    ORDER BY avg_score DESC NULLS LAST
+    LIMIT 20
+  `;
+
+  // Regression anomalies: bench runs flagged 'regression' in the window.
+  const regressions = await sql<{ provider_id: string; model: string; avg_gen_tps: number | null }[]>`
+    SELECT bs.provider_id, bs.model,
+           (br.aggregate::jsonb ->> 'avgGenTps')::float AS avg_gen_tps
+    FROM bench_runs br
+    JOIN bench_suites bs ON br.suite_id = bs.id
+    WHERE br.regression_flag = 'regression'
+      AND br.finished_at >= ${startIso} AND br.finished_at < ${endIso}
+    ORDER BY br.finished_at DESC
+  `;
+
+  return {
+    periodStart: startIso,
+    periodEnd: endIso,
+    interval,
+    totalRequests: totals[0]?.requests ?? 0,
+    priorRequests: prior[0]?.requests ?? 0,
+    totalInputTokens: totals[0]?.in_tokens ?? 0,
+    totalOutputTokens: totals[0]?.out_tokens ?? 0,
+    bySource: bySource.map((r) => ({
+      // Requests without a source tag are grouped under a readable label.
+      source: r.source ?? '(unattributed)',
+      requests: r.requests,
+      inputTokens: r.in_tokens,
+      outputTokens: r.out_tokens,
+    })),
+    byProvider,
+    leaderboard: leaderboard.map((r) => ({
+      providerId: r.provider_id,
+      model: r.model,
+      kind: r.kind,
+      avgScore: r.avg_score,
+    })),
+    regressions: regressions.map((r) => ({
+      providerId: r.provider_id,
+      model: r.model,
+      avgGenTps: r.avg_gen_tps,
+    })),
+  };
+}
+
+/**
+ * Render a markdown digest from gathered stats. Pure — unit-testable.
+ *
+ * Sections: title and period, usage totals with a trend against the prior
+ * period, then optional "By source", "By host" and "Leaderboard" tables
+ * (each omitted when empty), and an always-present "Anomalies" section.
+ *
+ * @param stats - stats gathered for one report window
+ * @returns the markdown document, lines joined with '\n'
+ */
+export function renderReportMarkdown(stats: ReportStats): string {
+  const lines: string[] = [];
+
+  // Trend against the prior period; 'new' when there is no baseline to compare.
+  const pct = (cur: number, prev: number): string => {
+    if (prev === 0) return cur === 0 ? '0%' : 'new';
+    const d = ((cur - prev) / prev) * 100;
+    return `${d >= 0 ? '+' : ''}${d.toFixed(0)}%`;
+  };
+
+  // Table cells hold free-form text (request source tags, host and model ids).
+  // A raw `|` or a line break would split the row, so neutralise both.
+  const cell = (value: string): string =>
+    value.replace(/\|/g, '\\|').replace(/\s*[\r\n]+\s*/g, ' ');
+
+  lines.push(`# Fleet ${stats.interval} report`);
+  lines.push('');
+  lines.push(`Period: ${stats.periodStart} to ${stats.periodEnd}`);
+  lines.push('');
+
+  lines.push('## Usage');
+  lines.push('');
+  lines.push(`- Requests: ${stats.totalRequests} (${pct(stats.totalRequests, stats.priorRequests)} vs prior period)`);
+  lines.push(`- Input tokens: ${stats.totalInputTokens}`);
+  lines.push(`- Output tokens: ${stats.totalOutputTokens}`);
+  lines.push('');
+
+  if (stats.bySource.length > 0) {
+    lines.push('## By source');
+    lines.push('');
+    lines.push('| Source | Requests | Input tok | Output tok |');
+    lines.push('| --- | ---: | ---: | ---: |');
+    for (const s of stats.bySource) {
+      lines.push(`| ${cell(s.source)} | ${s.requests} | ${s.inputTokens} | ${s.outputTokens} |`);
+    }
+    lines.push('');
+  }
+
+  if (stats.byProvider.length > 0) {
+    lines.push('## By host');
+    lines.push('');
+    lines.push('| Host | Requests | Swaps |');
+    lines.push('| --- | ---: | ---: |');
+    for (const p of stats.byProvider) {
+      lines.push(`| ${cell(p.providerId)} | ${p.requests} | ${p.swaps} |`);
+    }
+    lines.push('');
+  }
+
+  if (stats.leaderboard.length > 0) {
+    lines.push('## Leaderboard');
+    lines.push('');
+    lines.push('| Model | Kind | Score |');
+    lines.push('| --- | --- | ---: |');
+    for (const l of stats.leaderboard) {
+      const score = l.avgScore != null ? l.avgScore.toFixed(3) : 'n/a';
+      lines.push(`| ${cell(l.providerId)}/${cell(l.model)} | ${cell(l.kind)} | ${score} |`);
+    }
+    lines.push('');
+  }
+
+  lines.push('## Anomalies');
+  lines.push('');
+  if (stats.regressions.length === 0) {
+    lines.push('No speed regressions flagged this period.');
+  } else {
+    for (const r of stats.regressions) {
+      lines.push(`- Regression: ${r.providerId}/${r.model} (avg gen ${r.avgGenTps != null ? r.avgGenTps.toFixed(1) : 'n/a'} tok/s)`);
+    }
+  }
+  lines.push('');
+
+  return lines.join('\n');
+}
+
+/**
+ * Build the digest for `interval` ending at `now` and store it.
+ *
+ * The report id is derived from the timestamp and the interval, and the insert
+ * ignores conflicts on that id, so repeating a call with the same `now` and
+ * interval does not create a second row.
+ *
+ * @returns the report id (returned whether or not a row was inserted)
+ */
+export async function generateReport(
+  sql: Sql,
+  interval: ReportInterval,
+  now: Date = new Date(),
+): Promise<string> {
+  const id = `report_${now.getTime()}_${interval}`;
+
+  const stats = await gatherReportStats(sql, interval, now);
+  const markdown = renderReportMarkdown(stats);
+  const { periodStart, periodEnd } = stats;
+  const statsJson = sql.json(stats as never);
+
+  await sql`
+    INSERT INTO control_reports (id, kind, interval, period_start, period_end, markdown, stats)
+    VALUES (${id}, 'digest', ${interval}, ${periodStart}, ${periodEnd}, ${markdown}, ${statsJson})
+    ON CONFLICT (id) DO NOTHING
+  `;
+
+  return id;
+}
+
+/**
+ * Pure scheduling predicate, kept separate so it can be tested without a DB.
+ *
+ * @param lastRunAt - when the digest last ran, or null if it never has
+ * @param interval - report cadence
+ * @param now - current time
+ * @returns true when there is no previous run, or when at least one full
+ *   interval has elapsed since it
+ */
+export function isReportDue(
+  lastRunAt: Date | null,
+  interval: ReportInterval,
+  now: Date,
+): boolean {
+  if (!lastRunAt) {
+    return true;
+  }
+  const periodMs = intervalHours(interval) * 3600_000;
+  const sinceLastRunMs = now.getTime() - lastRunAt.getTime();
+  return sinceLastRunMs >= periodMs;
+}
+
+/**
+ * One scheduler tick for the 'report-digest' schedule.
+ *
+ * Reads the schedule row from control_schedule_meta; when the row exists, is
+ * enabled and a report is due, generates the digest and then stamps
+ * last_run_at with `now`. Calling this once at startup and then hourly gives
+ * catch-up-on-boot behaviour.
+ *
+ * @returns `{ ran: false }` when nothing was generated, otherwise
+ *   `{ ran: true, reportId }`
+ */
+export async function runReportSchedulerTick(
+  sql: Sql,
+  now: Date = new Date(),
+): Promise<{ ran: boolean; reportId?: string }> {
+  const scheduleRows = await sql<{ interval: string; enabled: boolean; last_run_at: string | null }[]>`
+    SELECT interval, enabled, last_run_at
+    FROM control_schedule_meta WHERE name = 'report-digest'
+  `;
+  const schedule = scheduleRows[0];
+  if (!schedule || !schedule.enabled) {
+    return { ran: false };
+  }
+
+  // Anything other than 'weekly' falls back to the daily cadence.
+  const interval: ReportInterval = schedule.interval === 'weekly' ? 'weekly' : 'daily';
+  const previousRun = schedule.last_run_at ? new Date(schedule.last_run_at) : null;
+  if (!isReportDue(previousRun, interval, now)) {
+    return { ran: false };
+  }
+
+  const reportId = await generateReport(sql, interval, now);
+  const ranAtIso = now.toISOString();
+  await sql`
+    UPDATE control_schedule_meta SET last_run_at = ${ranAtIso}
+    WHERE name = 'report-digest'
+  `;
+  return { ran: true, reportId };
+}
--- a/apps/control/src/services/retention.ts
+++ b/apps/control/src/services/retention.ts
@@ -0,0 +1,159 @@
+/**
+ * Retention job: daily in-process timer that rolls up raw perf samples and
+ * prunes old data.
+ *
+ * Crash-safe by construction:
+ * 1. Rollup is an idempotent upsert (INSERT ... ON CONFLICT DO UPDATE).
+ * 2. Delete raw only AFTER covering buckets are committed.
+ * 3. Chunked deletes: pruning removes rows in bounded batches (one statement
+ *    per batch), never in one large delete.
+ */
+
+import type { Sql } from '../db.js';
+import type { Config } from '../config.js';
+
+/**
+ * Retention settings in the shape the retention job consumes; populated from
+ * the environment-backed Config by buildRetentionConfig().
+ */
+export interface RetentionConfig {
+  /** From RETENTION_RAW_HOURS. */
+  rawHours: number;
+  /** From RETENTION_ROLLUP_DAYS. */
+  rollupDays: number;
+  /** From CAPTURE_SIZE_KB. */
+  captureSizeKB: number;
+  /** From CAPTURE_BUDGET_MB. */
+  captureBudgetMB: number;
+}
+
+/** Project the retention-related fields of the app Config into a RetentionConfig. */
+export function buildRetentionConfig(cfg: Config): RetentionConfig {
+  const {
+    RETENTION_RAW_HOURS: rawHours,
+    RETENTION_ROLLUP_DAYS: rollupDays,
+    CAPTURE_SIZE_KB: captureSizeKB,
+    CAPTURE_BUDGET_MB: captureBudgetMB,
+  } = cfg;
+  return { rawHours, rollupDays, captureSizeKB, captureBudgetMB };
+}
+
+/**
+ * Roll up raw perf samples of one provider into 5-minute buckets.
+ *
+ * Covers samples from the last `hours` hours. Idempotent: each bucket is
+ * recomputed from the raw rows and upserted, so re-running the same window
+ * overwrites rather than double-counts.
+ *
+ * Buckets are derived by flooring the epoch to a multiple of 300 seconds.
+ * `date_trunc` cannot do this: it only accepts single unit names ('minute',
+ * 'hour', ...) and rejects '5 minutes' at runtime.
+ *
+ * @param sql - database client
+ * @param providerId - provider whose samples are rolled up
+ * @param hours - how far back from now to roll up
+ */
+export async function runRollup(sql: Sql, providerId: string, hours: number): Promise<void> {
+  const cutoff = new Date(Date.now() - hours * 3600_000);
+  const buckets = await sql<{ bucket: Date }[]>`
+    SELECT to_timestamp((floor(extract(epoch FROM ts) / 300) * 300)::double precision) AS bucket
+    FROM control_perf_samples
+    WHERE provider_id = ${providerId}
+      AND ts >= ${cutoff.toISOString()}
+    GROUP BY bucket
+    ORDER BY bucket
+  `;
+
+  for (const { bucket } of buckets) {
+    const bucketStart = new Date(bucket);
+    const bucketEnd = new Date(bucketStart.getTime() + 5 * 60_000);
+
+    // Idempotent upsert: re-run recomputes the same buckets, never double-counts.
+    await sql`
+      INSERT INTO control_perf_rollup_5m (provider_id, bucket, gpu_agg, sys_agg)
+      SELECT
+        ${providerId},
+        ${bucketStart.toISOString()},
+        jsonb_agg(DISTINCT jsonb_build_object('ts', ts, 'gpu', gpu)) AS gpu_agg,
+        jsonb_agg(DISTINCT jsonb_build_object('ts', ts, 'sys', sys)) AS sys_agg
+      FROM control_perf_samples
+      WHERE provider_id = ${providerId}
+        AND ts >= ${bucketStart.toISOString()}
+        AND ts < ${bucketEnd.toISOString()}
+      GROUP BY provider_id
+      ON CONFLICT (provider_id, bucket) DO UPDATE SET
+        gpu_agg = EXCLUDED.gpu_agg,
+        sys_agg = EXCLUDED.sys_agg
+    `;
+  }
+}
+
+/**
+ * Prune raw perf samples of one provider that are older than the retention
+ * window.
+ *
+ * Rows are deleted oldest-first in batches of roughly `chunkSize`, one DELETE
+ * statement per batch, so no single statement holds locks for long. The batch
+ * boundary is computed inside the database: timestamps never round-trip
+ * through a JavaScript `Date` (millisecond precision), which could otherwise
+ * fail to match microsecond-precision rows, delete nothing, and loop forever.
+ *
+ * @param sql - database client
+ * @param providerId - provider whose samples are pruned
+ * @param hours - retention window; rows older than now - hours are removed
+ */
+export async function pruneRawSamples(sql: Sql, providerId: string, hours: number): Promise<void> {
+  const cutoffIso = new Date(Date.now() - hours * 3600_000).toISOString();
+  const chunkSize = 1000;
+
+  while (true) {
+    // Delete everything up to the newest timestamp of the oldest chunk. Rows
+    // sharing that boundary timestamp go too, so a batch may slightly exceed
+    // chunkSize, but every non-empty pass is guaranteed to make progress.
+    const deleted = await sql<{ deleted: number }[]>`
+      DELETE FROM control_perf_samples
+      WHERE provider_id = ${providerId}
+        AND ts < ${cutoffIso}
+        AND ts <= (
+          SELECT MAX(ts) FROM (
+            SELECT ts FROM control_perf_samples
+            WHERE provider_id = ${providerId}
+              AND ts < ${cutoffIso}
+            ORDER BY ts ASC
+            LIMIT ${chunkSize}
+          ) AS oldest_chunk
+        )
+      RETURNING 1 AS deleted
+    `;
+    if (deleted.length === 0) break;
+  }
+}
+
+/**
+ * Prune activity (control_requests) older than the retention window.
+ *
+ * Rows are deleted oldest-first in batches of roughly `chunkSize`, one DELETE
+ * statement per batch, to avoid long lock hold times. The batch boundary is
+ * computed inside the database: timestamps never round-trip through a
+ * JavaScript `Date` (millisecond precision), which could otherwise fail to
+ * match microsecond-precision rows, delete nothing, and loop forever.
+ *
+ * @param sql - database client
+ * @param hours - retention window; rows older than now - hours are removed
+ */
+export async function pruneActivity(sql: Sql, hours: number): Promise<void> {
+  const cutoffIso = new Date(Date.now() - hours * 3600_000).toISOString();
+  const chunkSize = 1000;
+
+  while (true) {
+    // Delete everything up to the newest timestamp of the oldest chunk. Rows
+    // sharing that boundary timestamp go too, so a batch may slightly exceed
+    // chunkSize, but every non-empty pass is guaranteed to make progress.
+    const deleted = await sql<{ deleted: number }[]>`
+      DELETE FROM control_requests
+      WHERE ts < ${cutoffIso}
+        AND ts <= (
+          SELECT MAX(ts) FROM (
+            SELECT ts FROM control_requests
+            WHERE ts < ${cutoffIso}
+            ORDER BY ts ASC
+            LIMIT ${chunkSize}
+          ) AS oldest_chunk
+        )
+      RETURNING 1 AS deleted
+    `;
+    if (deleted.length === 0) break;
+  }
+}
+
+/**
+ * Prune model events (control_model_events) older than the retention window.
+ *
+ * Rows are deleted oldest-first in batches of roughly `chunkSize`, one DELETE
+ * statement per batch, to avoid long lock hold times. The batch boundary is
+ * computed inside the database: timestamps never round-trip through a
+ * JavaScript `Date` (millisecond precision), which could otherwise fail to
+ * match microsecond-precision rows, delete nothing, and loop forever.
+ *
+ * @param sql - database client
+ * @param hours - retention window; rows older than now - hours are removed
+ */
+export async function pruneModelEvents(sql: Sql, hours: number): Promise<void> {
+  const cutoffIso = new Date(Date.now() - hours * 3600_000).toISOString();
+  const chunkSize = 1000;
+
+  while (true) {
+    // Delete everything up to the newest timestamp of the oldest chunk. Rows
+    // sharing that boundary timestamp go too, so a batch may slightly exceed
+    // chunkSize, but every non-empty pass is guaranteed to make progress.
+    const deleted = await sql<{ deleted: number }[]>`
+      DELETE FROM control_model_events
+      WHERE ts < ${cutoffIso}
+        AND ts <= (
+          SELECT MAX(ts) FROM (
+            SELECT ts FROM control_model_events
+            WHERE ts < ${cutoffIso}
+            ORDER BY ts ASC
+            LIMIT ${chunkSize}
+          ) AS oldest_chunk
+        )
+      RETURNING 1 AS deleted
+    `;
+    if (deleted.length === 0) break;
+  }
+}
+
+/**
+ * Trim a capture string so its UTF-8 encoding fits the configured size cap.
+ *
+ * The cap is in bytes, so truncation is done on the encoded bytes and backed
+ * up to a character boundary; slicing by string index would let multi-byte
+ * text exceed the cap. A truncated capture is generally no longer valid JSON.
+ *
+ * @param captureJson - serialized capture, or null
+ * @param sizeKB - cap in KiB; zero, negative or non-finite values mean "no room"
+ * @returns null for null/empty input, the input unchanged when it already
+ *   fits, otherwise a prefix of it that is at most `sizeKB * 1024` bytes
+ */
+export function trimCapture(captureJson: string | null, sizeKB: number): string | null {
+  if (!captureJson) return null;
+
+  const requestedBytes = Math.floor(sizeKB * 1024);
+  const capBytes = Number.isFinite(requestedBytes) ? Math.max(0, requestedBytes) : 0;
+
+  const encoded = Buffer.from(captureJson, 'utf8');
+  if (encoded.length <= capBytes) return captureJson;
+
+  // encoded[end] is the first byte that would be dropped. If it is a UTF-8
+  // continuation byte (10xxxxxx) the cut is inside a character: back up to
+  // that character's lead byte so no partial character is emitted.
+  let end = capBytes;
+  while (end > 0 && ((encoded[end] ?? 0) & 0xc0) === 0x80) end--;
+  return encoded.toString('utf8', 0, end);
+}
+
+/**
+ * Turn a serialized capture into a value suitable for sql.json().
+ *
+ * @param captureJson - JSON text, or null
+ * @returns the parsed value, or null when the input is null, empty, or not
+ *   parseable as JSON
+ */
+export function parseCaptureJson(captureJson: string | null): Record<string, unknown> | null {
+  if (!captureJson) {
+    return null;
+  }
+  let parsed: Record<string, unknown> | null;
+  try {
+    parsed = JSON.parse(captureJson) as Record<string, unknown>;
+  } catch {
+    parsed = null;
+  }
+  return parsed;
+}
--- a/apps/control/src/services/routing-scores.ts
+++ b/apps/control/src/services/routing-scores.ts
@@ -0,0 +1,194 @@
+/**
+ * P6.1: Advisory routing scores.
+ *
+ * Combines three signals per (provider_id, model) into an advisory score and
+ * a set of category badges surfaced in the BooChat model picker:
+ *   - eval results   (eval_runs.aggregate.avgScore, split by suite kind)
+ *   - live latency   (control_requests gen_tps + duration over a recent window)
+ *   - host health    (fleet liveness — an unhealthy host can win no badge)
+ *
+ * Advisory only: this never enforces routing. It powers display badges
+ * ("best code model right now") and the P7 gateway candidate ordering.
+ *
+ * The pure scoring/badge helpers are extracted for unit testing per the
+ * turn-guard.ts pattern; the DB read lives in computeRoutingScores().
+ */
+
+import type { Sql } from '../db.js';
+import type { FleetState } from './fleet-state.js';
+
+/** Recent-activity window for live latency signals. */
+const LIVE_WINDOW_HOURS = 24;
+
+/** Advisory score record for one (provider, model) pair. */
+export interface ModelScore {
+  /** Composite picker id: `${providerId}/${model}` (matches /api/models). */
+  compositeId: string;
+  providerId: string;
+  model: string;
+  /** Avg score (0..1) from completed code-suite eval runs, or null. */
+  codeScore: number | null;
+  /** Avg score (0..1) from completed chat-suite eval runs, or null. */
+  chatScore: number | null;
+  /** Best eval score across kinds, or null when never evaluated. */
+  evalScore: number | null;
+  /** Avg gen tok/s over the live window, or null when no recent traffic. */
+  avgGenTps: number | null;
+  /** Avg request duration (ms) over the live window, or null. */
+  avgLatencyMs: number | null;
+  /** Recent request count in the live window. */
+  sampleCount: number;
+  /** Whether the owning host is currently connected. */
+  healthy: boolean;
+  /** Category badges this model currently wins. */
+  badges: BadgeKind[];
+}
+
+/** Badge categories; assignBadges() awards each to at most one model. */
+export type BadgeKind = 'best-code' | 'best-chat' | 'best-fast';
+
+/** Human-readable label for each badge kind. */
+export const BADGE_LABELS: Record<BadgeKind, string> = {
+  'best-code': 'Best code model now',
+  'best-chat': 'Best chat model now',
+  'best-fast': 'Fastest model now',
+};
+
+/** Row shape of the eval query in computeRoutingScores(). */
+interface EvalRow {
+  provider_id: string;
+  model: string;
+  /** eval_suites.kind; only 'code' and 'chat' feed a score. */
+  suite_kind: string;
+  /** aggregate.avgScore of the run, or null when absent. */
+  avg_score: number | null;
+}
+
+/** Row shape of the live-traffic query in computeRoutingScores(). */
+interface LatencyRow {
+  provider_id: string;
+  model: string;
+  /** Average of gen_tps over rows where it is > 0; null when there are none. */
+  avg_gen_tps: number | null;
+  /** Average of duration_ms over rows where it is > 0; null when there are none. */
+  avg_duration_ms: number | null;
+  /** Count of all rows in the group, including those without timing data. */
+  sample_count: number;
+}
+
+/**
+ * Award category badges in place: for each category, the healthy model with
+ * the strictly highest non-null metric gets the badge pushed onto `badges`.
+ *
+ * Unhealthy models never win. When several models share the top value the
+ * first one in `scores` keeps the badge, so callers should pass a
+ * deterministically ordered array. A category in which no healthy model has a
+ * value awards nothing.
+ */
+export function assignBadges(scores: ModelScore[]): void {
+  const candidates = scores.filter((s) => s.healthy);
+
+  const categories: Array<[BadgeKind, (s: ModelScore) => number | null]> = [
+    ['best-code', (s) => s.codeScore],
+    ['best-chat', (s) => s.chatScore],
+    ['best-fast', (s) => s.avgGenTps],
+  ];
+
+  for (const [badge, metric] of categories) {
+    let winner: ModelScore | null = null;
+    let top = -Infinity;
+    for (const candidate of candidates) {
+      const value = metric(candidate);
+      // Strict comparison: an equal value never displaces an earlier winner.
+      if (value != null && value > top) {
+        top = value;
+        winner = candidate;
+      }
+    }
+    if (winner !== null) {
+      winner.badges.push(badge);
+    }
+  }
+}
+
+/**
+ * Build the advisory score list: one entry per (provider_id, model) that has
+ * a completed eval run or traffic inside the live window.
+ *
+ * Each entry merges eval scores, live throughput/latency and the owning
+ * host's liveness; badges are assigned last, on a list sorted by compositeId.
+ *
+ * @param sql - database client
+ * @param fleet - fleet state; a host counts as healthy only while 'connected'
+ * @returns scores sorted by compositeId, with badges populated
+ */
+export async function computeRoutingScores(
+  sql: Sql,
+  fleet: FleetState,
+): Promise<ModelScore[]> {
+  // Signal 1: eval quality. Only the most recently finished completed run per
+  // (provider, model, suite kind) counts, so a fresh run replaces stale numbers.
+  const evalRows = await sql<EvalRow[]>`
+    SELECT er.provider_id,
+           er.model,
+           es.kind AS suite_kind,
+           (er.aggregate::jsonb ->> 'avgScore')::float AS avg_score
+    FROM eval_runs er
+    JOIN eval_suites es ON er.suite_id = es.id
+    WHERE er.status = 'completed'
+      AND er.aggregate IS NOT NULL
+      AND er.finished_at = (
+        SELECT MAX(er2.finished_at)
+        FROM eval_runs er2
+        JOIN eval_suites es2 ON er2.suite_id = es2.id
+        WHERE er2.provider_id = er.provider_id
+          AND er2.model = er.model
+          AND es2.kind = es.kind
+          AND er2.status = 'completed'
+      )
+  `;
+
+  // Signal 2: live throughput and latency from recent requests.
+  const windowStartIso = new Date(Date.now() - LIVE_WINDOW_HOURS * 3600_000).toISOString();
+  const latencyRows = await sql<LatencyRow[]>`
+    SELECT provider_id,
+           model,
+           AVG(gen_tps) FILTER (WHERE gen_tps > 0) AS avg_gen_tps,
+           AVG(duration_ms) FILTER (WHERE duration_ms > 0) AS avg_duration_ms,
+           COUNT(*)::int AS sample_count
+    FROM control_requests
+    WHERE ts >= ${windowStartIso}
+      AND model IS NOT NULL
+    GROUP BY provider_id, model
+  `;
+
+  // Merge both signals into one record per compositeId.
+  const merged = new Map<string, ModelScore>();
+
+  const entryFor = (providerId: string, model: string): ModelScore => {
+    const compositeId = `${providerId}/${model}`;
+    const existing = merged.get(compositeId);
+    if (existing) {
+      return existing;
+    }
+    const created: ModelScore = {
+      compositeId,
+      providerId,
+      model,
+      codeScore: null,
+      chatScore: null,
+      evalScore: null,
+      avgGenTps: null,
+      avgLatencyMs: null,
+      sampleCount: 0,
+      // Signal 3: host health, read when the entry is first created.
+      healthy: fleet.hosts.get(providerId)?.liveness === 'connected',
+      badges: [],
+    };
+    merged.set(compositeId, created);
+    return created;
+  };
+
+  for (const { provider_id, model, suite_kind, avg_score } of evalRows) {
+    const entry = entryFor(provider_id, model);
+    switch (suite_kind) {
+      case 'code':
+        entry.codeScore = avg_score;
+        break;
+      case 'chat':
+        entry.chatScore = avg_score;
+        break;
+      default:
+        // Other suite kinds create the entry but contribute no score.
+        break;
+    }
+    const bestEval = Math.max(entry.codeScore ?? -Infinity, entry.chatScore ?? -Infinity);
+    entry.evalScore = bestEval > -Infinity ? bestEval : null;
+  }
+
+  for (const { provider_id, model, avg_gen_tps, avg_duration_ms, sample_count } of latencyRows) {
+    const entry = entryFor(provider_id, model);
+    entry.avgGenTps = avg_gen_tps;
+    entry.avgLatencyMs = avg_duration_ms;
+    entry.sampleCount = sample_count;
+  }
+
+  // Sort before awarding badges so that ties resolve the same way every time.
+  const scores = Array.from(merged.values()).sort((left, right) => {
+    if (left.compositeId < right.compositeId) return -1;
+    if (left.compositeId > right.compositeId) return 1;
+    return 0;
+  });
+
+  assignBadges(scores);
+  return scores;
+}
--- a/apps/control/src/services/sandbox-runner.ts
+++ b/apps/control/src/services/sandbox-runner.ts
@@ -0,0 +1,410 @@
+import { spawn, type ChildProcess } from 'node:child_process';
+import { randomUUID } from 'node:crypto';
+import type { Sql } from '../db.js';
+import type { DeltaEmitter } from '../index.js';
+import { recordEvalResult } from './eval-suites.js';
+import { resolveProviderBaseUrl } from './llama-providers.js';
+
+// ─── types ──────────────────────────────────────────────────────────────────
+
+/** Input for one code-eval run. */
+export interface SandboxEvalParams {
+  /** Eval run id; results and job events are recorded against it. */
+  runId: string;
+  /** Provider whose endpoint generates the code. */
+  providerId: string;
+  /** Model name sent in the chat-completions request. */
+  model: string;
+  quant: string | null;
+  /**
+   * Task records. runCodeEval() reads the keys `id`, `prompt`, `test_code`,
+   * `expected_output` and `language` from each one.
+   */
+  tasks: Array<Record<string, unknown>>;
+}
+
+/** Progress payload passed to the onProgress callback. */
+export interface SandboxProgress {
+  /** Number of tasks finished so far, whether they passed, failed or errored. */
+  completedTasks: number;
+}
+
+/** Outcome of a whole code-eval run. */
+export interface SandboxResult {
+  error: string | null;
+}
+
+/** Handle for a sandbox container, as consumed by cleanupContainer(). */
+export interface SandboxContainer {
+  /** Container name or id handed to `docker rm -f`. */
+  id: string;
+  /** The spawned child process. */
+  process: ChildProcess;
+  /** Pending kill timer, if any. */
+  timeoutHandle: NodeJS.Timeout | null;
+}
+
+// ─── hardening constants (LAW, not suggestions) ─────────────────────────────
+
+// Each value can be overridden through the environment variable of the same
+// name; the literal after `??` is the default.
+// NOTE(review): the Number(...) conversions are not validated, so a
+// non-numeric override yields NaN.
+/** Docker image the sandbox containers run. */
+const SANDBOX_IMAGE = process.env.SANDBOX_IMAGE ?? 'node:20-bookworm-slim';
+/** Value passed to `docker run --memory`. */
+const SANDBOX_MEMORY = process.env.SANDBOX_MEMORY ?? '512m';
+/** Value passed to `docker run --cpus`. */
+const SANDBOX_CPU = process.env.SANDBOX_CPU ?? '0.5';
+/** Value passed to `docker run --pids-limit`. */
+const SANDBOX_PIDS = process.env.SANDBOX_PIDS ?? '100';
+/** Per-execution time limit in milliseconds. */
+const SANDBOX_TIMEOUT_MS = Number(process.env.SANDBOX_TIMEOUT_MS ?? '30000');
+/** Number of tasks processed per batch in runCodeEval(). */
+const SANDBOX_CONCURRENCY = Number(process.env.SANDBOX_CONCURRENCY ?? '4');
+/** Label attached to every sandbox container so orphans can be found. */
+const SANDBOX_LABEL = 'boocontrol-eval';
+
+// ─── sandbox runner ─────────────────────────────────────────────────────────
+
+/**
+ * Run a code sandbox eval: each task generates code via LLM, executes in
+ * an ephemeral Docker container with hardening flags, and scores pass@1.
+ *
+ * HARDENING FLAGS (LAW):
+ * - --network none: NO network access
+ * - --user 1000:1000: non-root user
+ * - --memory, --cpus, --pids-limit: resource caps
+ * - --tmpfs /workspace:tmpfs workdir
+ * - --rm: auto-remove on exit
+ * - --label boocontrol-eval: orphan findability
+ * - --security-opt=no-new-privileges: no privilege escalation
+ * - --cap-drop=ALL: drop all capabilities
+ *
+ * NO volume mounts from the repo.
+ * NO docker socket inside containers.
+ *
+ * Bounded concurrency via Promise.allSettled.
+ * Per-task finally cleanup.
+ * Kill-on-timeout.
+ *
+ * Tasks run in batches of SANDBOX_CONCURRENCY; a batch must settle before the
+ * next one starts. Every task records a row through recordEvalResult(): score
+ * 1 or 0 on a completed execution, or a null score plus the error message when
+ * generation or execution throws.
+ *
+ * @param params - run id, target provider/model and the task list
+ * @param sql - database client handed to recordEvalResult()
+ * @param emitter - receives one 'control_job' event per successfully executed task
+ * @param seq - sequence number copied into every published event
+ * @param onProgress - called after every task, success or failure
+ * @returns the run-level result
+ */
+export async function runCodeEval(
+  params: SandboxEvalParams,
+  sql: Sql,
+  emitter: DeltaEmitter,
+  seq: number,
+  onProgress: (progress: SandboxProgress) => void,
+): Promise<SandboxResult> {
+  const { runId, tasks } = params;
+
+  // Orphan prune at engine start.
+  await pruneOrphanContainers();
+
+  let completedTasks = 0;
+  // NOTE(review): `error` is never assigned below, so the returned
+  // SandboxResult.error is always null, even when every task failed.
+  let error: string | null = null;
+
+  // Bounded concurrency: process tasks in batches.
+  // NOTE(review): batchSizes is filled but never read.
+  const batchSizes: number[] = [];
+  for (let i = 0; i < tasks.length; i += SANDBOX_CONCURRENCY) {
+    const batch = tasks.slice(i, i + SANDBOX_CONCURRENCY);
+    batchSizes.push(batch.length);
+
+    // Promise.allSettled: a single task failure never abandons in-flight containers.
+    const results = await Promise.allSettled(
+      batch.map(async (task, batchIdx) => {
+        const globalIdx = i + batchIdx;
+        // Task fields are untyped; missing ones fall back to defaults.
+        const taskId = (task.id as string) ?? `task_${globalIdx}`;
+        const prompt = (task.prompt as string) ?? '';
+        const testCode = (task.test_code as string) ?? '';
+        const expectedOutput = (task.expected_output as string) ?? '';
+        const language = (task.language as string) ?? 'typescript';
+
+        const startTime = Date.now();
+        // NOTE(review): `container` is never assigned, so the cleanup in the
+        // finally block below never runs; executeInSandbox() owns the process.
+        let container: SandboxContainer | null = null;
+
+        try {
+          // Generate code from LLM.
+          const generatedCode = await generateCode(params.providerId, params.model, prompt, language);
+
+          // Execute in sandbox.
+          const execResult = await executeInSandbox(generatedCode, testCode, language);
+
+          // Covers generation plus execution time.
+          const executionMs = Date.now() - startTime;
+
+          // pass@1 scoring: output matches expected.
+          const passed = normalizeOutput(execResult.stdout) === normalizeOutput(expectedOutput);
+          const score = passed ? 1 : 0;
+
+          await recordEvalResult(
+            sql,
+            runId,
+            taskId,
+            globalIdx,
+            score,
+            1,
+            passed ? 'Output matches expected' : `Expected: ${expectedOutput}, Got: ${execResult.stdout}`,
+            execResult.exitCode,
+            execResult.stderr,
+            execResult.stdout,
+            executionMs,
+            null,
+          );
+
+          emitter.publish({
+            type: 'control_job' as const,
+            seq,
+            jobType: 'eval' as const,
+            jobId: runId,
+            status: 'running' as const,
+            detail: {
+              taskId,
+              taskIndex: globalIdx,
+              passed,
+              score,
+            },
+          });
+
+          return { taskId, passed, score };
+        } catch (err) {
+          const msg = (err as Error).message ?? String(err);
+          const executionMs = Date.now() - startTime;
+
+          // Best effort: a failure to record the error row is swallowed so the
+          // task still settles as a normal failed result.
+          await recordEvalResult(
+            sql,
+            runId,
+            taskId,
+            globalIdx,
+            null,
+            1,
+            null,
+            null,
+            msg,
+            null,
+            executionMs,
+            msg,
+          ).catch(() => {});
+
+          return { taskId, passed: false, score: 0, error: msg };
+        } finally {
+          // Per-task finally cleanup: kill container + remove.
+          if (container) {
+            await cleanupContainer(container);
+          }
+          completedTasks++;
+          onProgress({ completedTasks });
+        }
+      }),
+    );
+
+    // Log batch results.
+    // Task errors are caught above, so a rejection here can only come from the
+    // finally block (for example a throwing onProgress callback).
+    for (const result of results) {
+      if (result.status === 'rejected') {
+        console.error('sandbox: batch task rejected:', result.reason);
+      }
+    }
+  }
+
+  return { error };
+}
+
+/**
+ * Ask the target model for a solution to one task.
+ *
+ * Sends a single non-streaming chat-completions request (temperature 0, at
+ * most 2048 tokens, 120 s timeout) to the provider's endpoint. If the reply
+ * contains a markdown code fence, only the contents of the first fence are
+ * kept.
+ *
+ * @returns the generated code, trimmed; empty when the reply has no content
+ * @throws when the provider has no base URL or the endpoint answers non-2xx
+ */
+async function generateCode(
+  providerId: string,
+  model: string,
+  prompt: string,
+  language: string,
+): Promise<string> {
+  const baseUrl = resolveProviderBaseUrlInternal(providerId);
+  if (!baseUrl) {
+    throw new Error(`no base URL for provider ${providerId}`);
+  }
+
+  const systemPrompt = `You are a code generator. Write ${language} code that solves the given task.
+Output ONLY the code, no explanations, no markdown fences. The code will be executed directly.`;
+
+  const requestBody = JSON.stringify({
+    model,
+    messages: [
+      { role: 'system', content: systemPrompt },
+      { role: 'user', content: prompt },
+    ],
+    temperature: 0,
+    max_tokens: 2048,
+  });
+
+  const response = await fetch(`${baseUrl}/v1/chat/completions`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'X-Boo-Source': 'control-eval',
+    },
+    body: requestBody,
+    signal: AbortSignal.timeout(120_000),
+  });
+
+  if (!response.ok) {
+    // The body is only used for the error message; failing to read it is fine.
+    const errorBody = await response.text().catch(() => '');
+    throw new Error(`code generation failed: ${response.status} ${errorBody.slice(0, 200)}`);
+  }
+
+  const payload = await response.json() as { choices?: Array<{ message?: { content?: string } }> };
+  const reply = payload.choices?.[0]?.message?.content ?? '';
+
+  // Models often wrap code in a fence despite the instructions; unwrap it.
+  const fenced = reply.match(/```[\w]*\n([\s\S]*?)```/);
+  const code = fenced && fenced[1] ? fenced[1] : reply;
+
+  return code.trim();
+}
+
+/**
+ * Execute generated code plus its test in a hardened, ephemeral Docker
+ * container and capture the result.
+ *
+ * Timeout handling uses one timer. When it fires, the docker CLI process is
+ * killed AND the container is force-removed by name: killing the client alone
+ * leaves the container running. The promise settles exactly once, the timer is
+ * cleared on every exit path, and captured output is capped so runaway
+ * untrusted code cannot exhaust this process's memory.
+ *
+ * @param generatedCode - code produced by the model
+ * @param testCode - test harness run against it
+ * @param language - selects the execution script
+ * @returns trimmed stdout/stderr and the docker exit code (null if signalled)
+ * @throws when docker cannot be spawned or the time limit is exceeded
+ */
+async function executeInSandbox(
+  generatedCode: string,
+  testCode: string,
+  language: string,
+): Promise<{ stdout: string; stderr: string; exitCode: number | null }> {
+  // Upper bound on captured characters per stream.
+  const MAX_CAPTURE_CHARS = 1024 * 1024;
+
+  const containerId = `eval_${randomUUID().slice(0, 12)}`;
+
+  // Build the combined script: generated code + test code.
+  const script = buildExecutionScript(generatedCode, testCode, language);
+
+  // SECURITY: Hardened Docker run command.
+  // --network none: NO network access.
+  // --user 1000:1000: non-root user.
+  // --memory, --cpus, --pids-limit: resource caps.
+  // --tmpfs /workspace: tmpfs workdir, no persistent storage.
+  // --rm: auto-remove on exit.
+  // --label boocontrol-eval: orphan findability.
+  // --security-opt=no-new-privileges: no privilege escalation.
+  // --cap-drop=ALL: drop all capabilities.
+  const dockerArgs = [
+    'run',
+    '--network', 'none',
+    '--user', '1000:1000',
+    '--memory', SANDBOX_MEMORY,
+    '--cpus', String(SANDBOX_CPU),
+    '--pids-limit', String(SANDBOX_PIDS),
+    '--tmpfs', '/workspace:rw,noexec,size=64m',
+    '--rm',
+    '--label', SANDBOX_LABEL,
+    '--security-opt', 'no-new-privileges',
+    '--cap-drop', 'ALL',
+    '--name', containerId,
+    '-e', 'NODE_ENV=production',
+    SANDBOX_IMAGE,
+    'sh', '-c', script,
+  ];
+
+  return new Promise((resolve, reject) => {
+    // No spawn `timeout` option: it would race the timer below and surface a
+    // timeout as an ordinary result with a null exit code.
+    const dockerProcess = spawn('docker', dockerArgs, {
+      env: { ...process.env },
+    });
+
+    let stdout = '';
+    let stderr = '';
+    let settled = false;
+
+    const append = (current: string, chunk: Buffer): string =>
+      current.length >= MAX_CAPTURE_CHARS
+        ? current
+        : (current + chunk.toString()).slice(0, MAX_CAPTURE_CHARS);
+
+    // Kill-on-timeout: stop the client and the container itself.
+    const timeoutHandle = setTimeout(() => {
+      if (settled) return;
+      settled = true;
+      dockerProcess.kill('SIGKILL');
+      // Best effort; --rm removes the container once it has stopped.
+      const remove = spawn('docker', ['rm', '-f', containerId], { stdio: 'ignore' });
+      remove.on('error', () => {});
+      reject(new Error(`sandbox execution timeout (${SANDBOX_TIMEOUT_MS}ms)`));
+    }, SANDBOX_TIMEOUT_MS);
+
+    dockerProcess.stdout.on('data', (chunk: Buffer) => {
+      stdout = append(stdout, chunk);
+    });
+
+    dockerProcess.stderr.on('data', (chunk: Buffer) => {
+      stderr = append(stderr, chunk);
+    });
+
+    dockerProcess.on('close', (code) => {
+      clearTimeout(timeoutHandle);
+      if (settled) return;
+      settled = true;
+      resolve({
+        stdout: stdout.trim(),
+        stderr: stderr.trim(),
+        exitCode: code,
+      });
+    });
+
+    dockerProcess.on('error', (err) => {
+      clearTimeout(timeoutHandle);
+      if (settled) return;
+      settled = true;
+      reject(new Error(`docker spawn failed: ${err.message}`));
+    });
+  });
+}
+
+/**
+ * Build the shell script run inside the sandbox: write the generated code and
+ * the test to files in /workspace, then run the test with stderr merged into
+ * stdout.
+ *
+ * Files are written with `printf '%s\n'` rather than `echo`: a POSIX `sh` may
+ * interpret backslash escapes in echo arguments (dash does), which would
+ * corrupt source containing sequences such as `\n` or `\t`.
+ *
+ * NOTE(review): the container runs with `--network none`, so
+ * `npx --yes tsx` presumably only works if the image already ships tsx — confirm.
+ *
+ * @param generatedCode - code produced by the model
+ * @param testCode - test harness to execute
+ * @param language - 'typescript' / 'javascript' run through tsx; anything
+ *   else is treated as a shell script
+ * @returns one `&&`-chained command line for `sh -c`
+ */
+function buildExecutionScript(
+  generatedCode: string,
+  testCode: string,
+  language: string,
+): string {
+  // Emit `content` plus a trailing newline into `fileName`, byte for byte.
+  const writeFile = (content: string, fileName: string): string =>
+    `printf '%s\\n' '${escapeShell(content)}' > ${fileName}`;
+
+  if (language === 'typescript' || language === 'javascript') {
+    return [
+      'cd /workspace',
+      writeFile(generatedCode, 'output.js'),
+      writeFile(testCode, 'test.js'),
+      'npx --yes tsx test.js 2>&1',
+    ].join(' && ');
+  }
+
+  // Fallback: generic shell execution.
+  return [
+    'cd /workspace',
+    writeFile(generatedCode, 'output.sh'),
+    writeFile(testCode, 'test.sh'),
+    'chmod +x output.sh test.sh',
+    'bash test.sh 2>&1',
+  ].join(' && ');
+}
+
+/**
+ * Escape a string for embedding inside a single-quoted shell word: every `'`
+ * becomes `'\''` (close the quote, add an escaped quote, reopen).
+ */
+function escapeShell(str: string): string {
+  return str.replace(/'/g, "'\\''");
+}
+
+/**
+ * Canonical form used when comparing program output: surrounding whitespace
+ * is removed and every internal run of whitespace becomes one space.
+ */
+function normalizeOutput(output: string): string {
+  const tokens = output.trim().split(/\s+/);
+  return tokens.join(' ');
+}
+
+/**
+ * Prune orphan containers from crashed runs.
+ *
+ * Lists running containers carrying the sandbox label and sends `docker kill`
+ * to all of them. Never rejects: if docker cannot be spawned the promise still
+ * resolves.
+ *
+ * NOTE(review): selection is by label only, so this would also kill the
+ * containers of another eval run that is in progress at the same time —
+ * confirm runs are serialized.
+ */
+async function pruneOrphanContainers(): Promise<void> {
+  return new Promise((resolve) => {
+    // `docker ps -q` prints one container id per line.
+    const pruneCmd = spawn('docker', ['ps', '-q', '--filter', `label=${SANDBOX_LABEL}`]);
+    let output = '';
+    pruneCmd.stdout.on('data', (chunk: Buffer) => { output += chunk.toString(); });
+    pruneCmd.on('close', async () => {
+      const containerIds = output.trim().split('\n').filter(Boolean);
+      if (containerIds.length > 0) {
+        console.log({ count: containerIds.length }, 'sandbox: pruning orphan containers');
+        const kill = spawn('docker', ['kill', ...containerIds]);
+        // Wait for the kill to finish; whether it succeeded is not checked.
+        await new Promise((r) => {
+          kill.on('close', r);
+          kill.on('error', r);
+        });
+      }
+      resolve();
+    });
+    // docker missing or not executable: nothing to prune, carry on.
+    pruneCmd.on('error', () => resolve());
+  });
+}
+
+/**
+ * Tear down one sandbox container: cancel its pending timeout, SIGKILL the
+ * child process if it has not exited yet, then `docker rm -f` the container.
+ * Never rejects.
+ */
+async function cleanupContainer(container: SandboxContainer): Promise<void> {
+  const { id, process: child, timeoutHandle } = container;
+
+  if (timeoutHandle) {
+    clearTimeout(timeoutHandle);
+  }
+
+  const stillRunning = child.exitCode === null;
+  if (stillRunning) {
+    child.kill('SIGKILL');
+  }
+
+  // --rm already removes the container on exit; the forced removal is a
+  // safety net and its outcome is deliberately ignored.
+  const forcedRemoval = new Promise<void>((resolve) => {
+    const rm = spawn('docker', ['rm', '-f', id]);
+    rm.on('close', resolve);
+    rm.on('error', resolve);
+  });
+  await forcedRemoval.catch(() => {});
+}
+
+/**
+ * Resolve a provider's base URL through llama-providers, never throwing.
+ *
+ * Uses the statically imported resolveProviderBaseUrl. The previous lazy
+ * `require('./llama-providers.js')` cannot work in an ES module: `require` is
+ * not defined there, the resulting ReferenceError was swallowed by the catch,
+ * and the function returned null for every provider.
+ *
+ * @param providerId - provider to look up
+ * @returns the base URL, or null when the lookup throws or yields no usable
+ *   (non-empty string) URL
+ */
+function resolveProviderBaseUrlInternal(providerId: string): string | null {
+  try {
+    const baseUrl: unknown = resolveProviderBaseUrl(providerId);
+    return typeof baseUrl === 'string' && baseUrl.length > 0 ? baseUrl : null;
+  } catch {
+    return null;
+  }
+}
--- a/apps/control/src/services/ssh-config.ts
+++ b/apps/control/src/services/ssh-config.ts
@@ -0,0 +1,361 @@
+/**
+ * P9.1: SSH config editor for llama-swap hosts.
+ *
+ * Pipeline (design §5, stackctl flow with the tests stackctl never had):
+ *   SFTP/SSH read -> schema-validated edit (config-schema.json from the fork)
+ *   -> diff preview -> timestamped backup -> write -> restart -> health-wait.
+ *
+ * SSH I/O is shelled out via `ssh` (matching the booterm precedent — no ssh2
+ * dependency, key from `secrets/`), injected as `SshExec` so every failure path
+ * is unit-testable without a live host. The pure helpers (validate, diff,
+ * backup filename) carry the logic and are tested directly.
+ */
+
+import { spawn } from 'node:child_process';
+import { createRequire } from 'node:module';
+import { load as loadYaml } from 'js-yaml';
+import type { ValidateFunction } from 'ajv';
+
+// ajv + ajv-formats are CJS. Under NodeNext ESM the default-import interop binds
+// the namespace, not the constructable class, so load them via createRequire to
+// get the real module.exports (class / plugin fn) at both type and runtime.
+//
+// `require` below is a module-local CommonJS loader anchored at this file's
+// URL. The `as typeof import(...).default` casts give the loaded values the
+// packages' default-export types.
+const require = createRequire(import.meta.url);
+const Ajv = require('ajv') as typeof import('ajv').default;
+const addFormats = require('ajv-formats') as typeof import('ajv-formats').default;
+
+// ─── host SSH target ─────────────────────────────────────────────────────────
+
+/** Connection details for one host reached over SSH. */
+export interface SshTarget {
+  /** Remote host; the default executor combines it with `user` as `user@host`. */
+  host: string;
+  /** Remote login name. */
+  user: string;
+  /** Local path of the identity file; the default executor passes it to `ssh -i`. */
+  keyPath: string;
+}
+
+/** Outcome of one remote command as reported by an `SshExec`. */
+export interface ExecResult {
+  /** Exit code; the ops helpers in this file treat any non-zero value as failure. */
+  code: number;
+  /** Text collected from the command's stdout. */
+  stdout: string;
+  /** Text collected from the command's stderr. */
+  stderr: string;
+}
+
+/**
+ * Injectable SSH executor. `stdin`, when present, is piped to the remote command.
+ * Callers in this file detect failure by a non-zero `code` in the result.
+ */
+export type SshExec = (target: SshTarget, command: string, stdin?: string) => Promise<ExecResult>;
+
+// ─── pure: schema validation ─────────────────────────────────────────────────
+
+/** Result of validating a config YAML string. */
+export interface ValidationResult {
+  /** True only when the YAML parsed and passed schema validation. */
+  valid: boolean;
+  /** Human-readable problems; empty when `valid` is true. */
+  errors: string[];
+  /** Parsed config object when YAML is syntactically valid. */
+  parsed?: unknown;
+}
+
+// Single-slot memo: the most recently compiled validator and the exact schema
+// object it was compiled from.
+let cachedValidator: ValidateFunction | null = null;
+let cachedSchemaRef: object | null = null;
+
+/**
+ * Return an Ajv validator for `schema`, compiling only when the schema object
+ * differs (by reference) from the one compiled last time.
+ */
+function getValidator(schema: object): ValidateFunction {
+  if (cachedValidator === null || cachedSchemaRef !== schema) {
+    const ajv = new Ajv({ allErrors: true, strict: false });
+    addFormats(ajv);
+    // Assign only after a successful compile so a throwing schema leaves the
+    // previous cache entry untouched.
+    cachedValidator = ajv.compile(schema);
+    cachedSchemaRef = schema;
+  }
+  return cachedValidator;
+}
+
+/**
+ * Validate a llama-swap config YAML string against the fork's
+ * config-schema.json. Pure — no I/O; the schema object is passed in.
+ *
+ * Checks run in order and stop at the first failure:
+ *   1. YAML syntax,
+ *   2. the document root must be a mapping (not a scalar, null, or sequence),
+ *   3. JSON-schema validation, with one message per violation.
+ *
+ * @param yamlText Raw YAML text of the candidate config.
+ * @param schema   JSON schema object used to obtain the validator.
+ * @returns `valid`, the `errors` list (empty when valid), and `parsed`
+ *          whenever the YAML itself could be parsed.
+ */
+export function validateLlamaConfig(yamlText: string, schema: object): ValidationResult {
+  let parsed: unknown;
+  try {
+    parsed = loadYaml(yamlText);
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : String(err);
+    return { valid: false, errors: [`YAML parse error: ${reason}`] };
+  }
+  // `typeof [] === 'object'`, so a top-level sequence needs its own check to be
+  // rejected as "not a mapping" regardless of how permissive the schema is.
+  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
+    return { valid: false, errors: ['config must be a YAML mapping'], parsed };
+  }
+
+  const validate = getValidator(schema);
+  if (validate(parsed)) return { valid: true, errors: [], parsed };
+
+  const errors = (validate.errors ?? []).map((e) => {
+    const path = e.instancePath || '(root)';
+    return `${path} ${e.message ?? 'invalid'}`;
+  });
+  // Never report a failure with an empty error list.
+  return { valid: false, errors: errors.length ? errors : ['schema validation failed'], parsed };
+}
+
+// ─── pure: unified-ish diff ──────────────────────────────────────────────────
+
+/**
+ * Build a compact preview diff of two texts.
+ *
+ * Lines shared at the start and at the end are dropped; whatever remains in
+ * the middle is emitted as one hunk: a header, every remaining old line
+ * prefixed with `- `, then every remaining new line prefixed with `+ `.
+ * Returns the empty string when the texts are identical. This is a preview
+ * aid, not a minimal-edit (Myers) diff.
+ */
+export function computeDiff(oldText: string, newText: string): string {
+  const before = oldText.split('\n');
+  const after = newText.split('\n');
+
+  // Length of the common leading run.
+  const shortest = Math.min(before.length, after.length);
+  let prefix = 0;
+  while (prefix < shortest && before[prefix] === after[prefix]) prefix += 1;
+
+  // Walk back over the common trailing run without crossing into the prefix.
+  let lastOld = before.length - 1;
+  let lastNew = after.length - 1;
+  while (lastOld >= prefix && lastNew >= prefix && before[lastOld] === after[lastNew]) {
+    lastOld -= 1;
+    lastNew -= 1;
+  }
+
+  const nothingChanged = lastOld < prefix && lastNew < prefix;
+  if (nothingChanged) return '';
+
+  const firstLine = prefix + 1;
+  const header = `@@ lines ${firstLine}..${lastOld + 1} -> ${firstLine}..${lastNew + 1} @@`;
+  const removed = before.slice(prefix, lastOld + 1).map((line) => `- ${line}`);
+  const added = after.slice(prefix, lastNew + 1).map((line) => `+ ${line}`);
+  return [header, ...removed, ...added].join('\n');
+}
+
+// ─── pure: backup filename ───────────────────────────────────────────────────
+
+/**
+ * Timestamped backup path: `<configPath>.bak-YYYYMMDDTHHMMSSZ`.
+ * The stamp is the UTC ISO-8601 form of `now` with the fractional seconds and
+ * the `-` / `:` separators removed.
+ */
+export function backupFilename(configPath: string, now: Date): string {
+  const isoSeconds = now.toISOString().replace(/\.\d+Z$/, 'Z');
+  const stamp = isoSeconds.split(/[-:]/).join('');
+  return [configPath, stamp].join('.bak-');
+}
+
+// ─── RemoteOps seam (shell vs wrapper) ───────────────────────────────────────
+//
+// 'shell' mode issues raw shell commands (P9.1 behavior). 'wrapper' mode issues
+// fixed verbs so the key can be bound to an authorized_keys forced command that
+// hardcodes the paths. Both drive the same apply pipeline.
+
+/** Which command dialect the remote ops use: raw shell commands or fixed wrapper verbs. */
+export type SshMode = 'shell' | 'wrapper';
+
+/** The four remote operations the apply pipeline needs, independent of mode. */
+export interface RemoteOps {
+  /** Fetch the current config text from the host. */
+  read(): Promise<string>;
+  /** Back up the current config before it is overwritten. */
+  backup(now: Date): Promise<string>;        // returns the backup path
+  /** Replace the config with `content`. */
+  write(content: string): Promise<void>;
+  /** Restart the service; the wrapper implementation ignores `restartCmd`. */
+  restart(restartCmd: string): Promise<void>;
+}
+
+/**
+ * Throw an Error describing a failed remote step: the step label, the exit
+ * code, and at most the first 300 characters of stderr.
+ */
+function fail(label: string, res: ExecResult): never {
+  const { code, stderr } = res;
+  const stderrExcerpt = stderr.slice(0, 300);
+  throw new Error(`${label} failed (exit ${code}): ${stderrExcerpt}`);
+}
+
+/**
+ * Raw-command ops (no wrapper on the host). Every operation is a plain shell
+ * command with the config path single-quoted; a non-zero exit code is turned
+ * into a thrown Error naming the operation.
+ */
+export function shellOps(target: SshTarget, configPath: string, exec: SshExec): RemoteOps {
+  const quotedConfig = shellQuote(configPath);
+
+  // Pass a successful result through; throw a labelled Error otherwise.
+  const check = (label: string, result: ExecResult): ExecResult => {
+    if (result.code !== 0) fail(label, result);
+    return result;
+  };
+
+  return {
+    read: async () => {
+      const result = check('read', await exec(target, `cat ${quotedConfig}`));
+      return result.stdout;
+    },
+    backup: async (now) => {
+      const backupPath = backupFilename(configPath, now);
+      check('backup', await exec(target, `cp ${quotedConfig} ${shellQuote(backupPath)}`));
+      return backupPath;
+    },
+    write: async (content) => {
+      // The new config travels on stdin and is redirected into the file remotely.
+      check('write', await exec(target, `cat > ${quotedConfig}`, content));
+    },
+    restart: async (restartCmd) => {
+      check('restart', await exec(target, restartCmd));
+    },
+  };
+}
+
+/**
+ * Verb ops for a forced-command-locked key. Only the fixed verbs `read`,
+ * `backup`, `write` and `restart` are sent; the wrapper on the host hardcodes
+ * the paths. The backup verb stamps the backup itself and prints its path on
+ * stdout, which is returned trimmed.
+ */
+export function wrapperOps(target: SshTarget, exec: SshExec): RemoteOps {
+  // Pass a successful result through; throw a labelled Error otherwise.
+  const check = (label: string, result: ExecResult): ExecResult => {
+    if (result.code !== 0) fail(label, result);
+    return result;
+  };
+
+  return {
+    read: async () => {
+      const result = check('read', await exec(target, 'read'));
+      return result.stdout;
+    },
+    backup: async () => {
+      const result = check('backup', await exec(target, 'backup'));
+      return result.stdout.trim();
+    },
+    write: async (content) => {
+      check('write', await exec(target, 'write', content));
+    },
+    restart: async () => {
+      check('restart', await exec(target, 'restart'));
+    },
+  };
+}
+
+/**
+ * Pick the RemoteOps implementation for `mode`: the verb-based wrapper ops for
+ * 'wrapper', the raw shell ops for anything else. `configPath` is only used by
+ * the shell implementation.
+ */
+export function makeRemoteOps(mode: SshMode, target: SshTarget, configPath: string, exec: SshExec): RemoteOps {
+  if (mode === 'wrapper') {
+    return wrapperOps(target, exec);
+  }
+  return shellOps(target, configPath, exec);
+}
+
+// ─── orchestration (injectable exec) ─────────────────────────────────────────
+
+/**
+ * Fetch the current config text from the host using the ops for `mode`
+ * (shell when omitted, for compatibility). Rejects when the read fails.
+ */
+export async function readRemoteConfig(
+  target: SshTarget,
+  configPath: string,
+  exec: SshExec,
+  mode: SshMode = 'shell',
+): Promise<string> {
+  const ops = makeRemoteOps(mode, target, configPath, exec);
+  return ops.read();
+}
+
+/** Outcome of one `applyRemoteConfig` run. */
+export interface ApplyResult {
+  /** True only when every step, including the health check, succeeded. */
+  ok: boolean;
+  /**
+   * The step that failed, or 'done' on success. A failure to read the current
+   * remote config is reported as 'validate'.
+   */
+  step: 'validate' | 'backup' | 'write' | 'restart' | 'health' | 'done';
+  /** Path of the backup taken before the write; set once the backup step succeeded. */
+  backupPath?: string;
+  /** Preview diff of current vs new config; set once the current config was read. */
+  diff?: string;
+  /** Failure description; absent on success. */
+  error?: string;
+}
+
+/** Inputs for `applyRemoteConfig`. */
+export interface ApplyOptions {
+  /** Host to apply the config to. */
+  target: SshTarget;
+  /** Remote path of the config file (used by the shell ops). */
+  configPath: string;
+  /** Command that restarts the service (used by the shell ops). */
+  restartCmd: string;
+  /** Full YAML text of the new config. */
+  newConfig: string;
+  /** JSON schema object the new config is validated against. */
+  schema: object;
+  /** Provider base URL; the health check polls `<baseUrl>/v1/models`. */
+  baseUrl: string;
+  /** SSH executor used for every remote command. */
+  exec: SshExec;
+  /** 'shell' (default) or 'wrapper'. */
+  mode?: SshMode;
+  /** Fetch implementation for the health check; defaults to the global `fetch`. */
+  fetcher?: typeof fetch;
+  /** Timestamp handed to the backup step; defaults to the time of the call. */
+  now?: Date;
+  /** Maximum number of health probes; defaults to 10. */
+  healthAttempts?: number;
+  /** Pause between health probes in milliseconds; defaults to 2000. */
+  healthDelayMs?: number;
+}
+
+/**
+ * Run the whole apply pipeline against one host and report the outcome.
+ *
+ * Order: local validation -> read current config (for the diff) -> timestamped
+ * backup -> write -> restart -> health-wait. The first failing step ends the
+ * run and is named in `step`; a failed read of the current config is reported
+ * under 'validate'. The backup is always taken before the write, so a failed
+ * write leaves the timestamped backup intact for manual recovery. `mode` only
+ * selects the wire commands (raw shell vs forced-command verbs); the sequence
+ * is the same.
+ */
+export async function applyRemoteConfig(opts: ApplyOptions): Promise<ApplyResult> {
+  const {
+    target,
+    configPath,
+    restartCmd,
+    newConfig,
+    schema,
+    baseUrl,
+    exec,
+    mode = 'shell',
+    fetcher = fetch,
+    now = new Date(),
+    healthAttempts = 10,
+    healthDelayMs = 2000,
+  } = opts;
+
+  const messageOf = (err: unknown): string => (err as Error).message;
+  const remote = makeRemoteOps(mode, target, configPath, exec);
+
+  // Step 1: validate locally; nothing has touched the host yet.
+  const verdict = validateLlamaConfig(newConfig, schema);
+  if (!verdict.valid) {
+    return { ok: false, step: 'validate', error: verdict.errors.join('; ') };
+  }
+
+  // Fetch the live config for the diff; an unreadable host stops here, before
+  // any backup or write is attempted.
+  let liveConfig: string;
+  try {
+    liveConfig = await remote.read();
+  } catch (err) {
+    return { ok: false, step: 'validate', error: `read current failed: ${messageOf(err)}` };
+  }
+  const diff = computeDiff(liveConfig, newConfig);
+
+  // Step 2: timestamped backup, strictly before the write.
+  let backupPath: string;
+  try {
+    backupPath = await remote.backup(now);
+  } catch (err) {
+    return { ok: false, step: 'backup', diff, error: messageOf(err) };
+  }
+
+  // Step 3: write the new config.
+  try {
+    await remote.write(newConfig);
+  } catch (err) {
+    return { ok: false, step: 'write', backupPath, diff, error: messageOf(err) };
+  }
+
+  // Step 4: restart the service.
+  try {
+    await remote.restart(restartCmd);
+  } catch (err) {
+    return { ok: false, step: 'restart', backupPath, diff, error: messageOf(err) };
+  }
+
+  // Step 5: poll the provider until it serves /v1/models.
+  const cameUp = await healthWait(baseUrl, fetcher, healthAttempts, healthDelayMs);
+  if (cameUp) {
+    return { ok: true, step: 'done', backupPath, diff };
+  }
+  return {
+    ok: false,
+    step: 'health',
+    backupPath,
+    diff,
+    error: 'health check did not pass after restart; backup retained',
+  };
+}
+
+/**
+ * Poll the provider's `/v1/models` until it responds OK or attempts run out.
+ *
+ * Each probe is aborted after 5 seconds. A thrown fetch (connection refused,
+ * timeout) and a non-OK status both count as "not up yet". The response body
+ * is never needed, so it is cancelled after every probe to release the
+ * underlying connection instead of leaving it to garbage collection.
+ *
+ * @param baseUrl  Provider base URL; trailing slashes are ignored.
+ * @param fetcher  Fetch implementation (injectable for tests).
+ * @param attempts Maximum number of probes; zero or less returns false at once.
+ * @param delayMs  Pause between probes (none after the last one).
+ * @returns true as soon as one probe answers with an OK status, else false.
+ */
+export async function healthWait(
+  baseUrl: string,
+  fetcher: typeof fetch,
+  attempts: number,
+  delayMs: number,
+): Promise<boolean> {
+  // Loop-invariant: build the probe URL once.
+  const url = `${baseUrl.replace(/\/+$/, '')}/v1/models`;
+
+  for (let i = 0; i < attempts; i++) {
+    try {
+      const res = await fetcher(url, { signal: AbortSignal.timeout(5_000) });
+      const ok = res.ok;
+      try {
+        // Optional chaining keeps body-less responses (and test doubles) working.
+        await res.body?.cancel();
+      } catch {
+        // Failing to discard the body must not change the probe's verdict.
+      }
+      if (ok) return true;
+    } catch {
+      // not up yet
+    }
+    if (i < attempts - 1) await sleep(delayMs);
+  }
+  return false;
+}
+
+/** Resolve after `ms` milliseconds (promise wrapper around setTimeout). */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((wake) => {
+    setTimeout(wake, ms);
+  });
+}
+
+// Minimal POSIX single-quote shell escape for the remote command string: wrap
+// the text in single quotes and rewrite each embedded ' as '\'' (close the
+// quote, emit an escaped quote, reopen the quote).
+function shellQuote(s: string): string {
+  const escaped = s.split("'").join("'\\''");
+  return "'" + escaped + "'";
+}
+
+// ─── real SSH executor (spawn) ───────────────────────────────────────────────
+
+/**
+ * Default SSH executor. Uses the system `ssh` with an explicit identity file and
+ * IdentitiesOnly so the agent's default key is never offered (the boocode Gitea
+ * lesson). BatchMode avoids interactive prompts hanging the service.
+ *
+ * Never rejects: a spawn failure resolves with code 127 and the error message
+ * appended to stderr; a process that closes without an exit code resolves
+ * with code 1.
+ *
+ * @param target  Host, user and identity file to connect with.
+ * @param command Remote command string, passed to ssh as a single argument.
+ * @param stdin   Optional text piped to the remote command's stdin.
+ */
+export const sshExec: SshExec = (target, command, stdin) => {
+  return new Promise<ExecResult>((resolve) => {
+    const args = [
+      '-i', target.keyPath,
+      '-o', 'IdentitiesOnly=yes',
+      '-o', 'BatchMode=yes',
+      '-o', 'StrictHostKeyChecking=accept-new',
+      '-o', 'ConnectTimeout=10',
+      `${target.user}@${target.host}`,
+      command,
+    ];
+    const child = spawn('ssh', args, { stdio: ['pipe', 'pipe', 'pipe'] });
+
+    // Decode inside the streams so a multi-byte UTF-8 character split across
+    // two chunks is reassembled instead of being corrupted by per-chunk
+    // toString().
+    child.stdout.setEncoding('utf8');
+    child.stderr.setEncoding('utf8');
+    let stdout = '';
+    let stderr = '';
+    child.stdout.on('data', (d: string) => { stdout += d; });
+    child.stderr.on('data', (d: string) => { stderr += d; });
+
+    child.on('error', (err) => resolve({ code: 127, stdout, stderr: `${stderr}${(err as Error).message}` }));
+    child.on('close', (code) => resolve({ code: code ?? 1, stdout, stderr }));
+
+    // If ssh exits before reading its stdin (auth failure, unreachable host),
+    // the pending write fails with EPIPE, emitted as 'error' on the stdin
+    // stream. Without a listener that is an uncaught exception; the failure is
+    // already reported through the exit code, so the event is ignored here.
+    child.stdin.on('error', () => {});
+    if (stdin !== undefined) {
+      child.stdin.write(stdin);
+    }
+    child.stdin.end();
+  });
+};