feat(server): inference state-graph + supervisor, memory tools, MCP client, schema, routes

- Add state-graph.ts: typed state machine for inference lifecycle - Add supervisor.ts: agent supervisor pattern for multi-agent coordination - Add export-formatter.ts: structured export formatting - Add manage_memory.ts: memory CRUD tool for agent persistence - Add get_wiki_article.ts: codecontext wiki article retrieval - Extend memory/index.ts: 3-tier memory (context/daily/core) - Extend MCP client: mcp-config.ts env-var substitution - Update schema.sql: agent_sessions, tasks, pending_changes extensions - Update API types: MessageMetadata, ErrorReason, AgentSessionConfig - Update routes: chats, messages, sessions — column renames and agent_session_id - Update inference: error handler, payload builder, stream phase, turn orchestrator
2026-06-08 03:48:47 +00:00
parent 4a6623112c
commit d360051329
19 changed files with 1546 additions and 230 deletions
--- a/apps/server/src/services/inference/error-handler.ts
+++ b/apps/server/src/services/inference/error-handler.ts
@@ -74,6 +74,7 @@ export async function handleAbortOrError(
      type: 'message_complete',
      message_id: assistantMessageId,
      chat_id: chatId,
+      ...(args.compareGroupId ? { compare_group_id: args.compareGroupId } : {}),
    });
    ctx.log.info({ sessionId, chatId, assistantMessageId }, 'inference cancelled');
  } else {
@@ -90,6 +91,7 @@ export async function handleAbortOrError(
      chat_id: chatId,
      error: errMsg,
      reason: 'llm_provider_error',
+      ...(args.compareGroupId ? { compare_group_id: args.compareGroupId } : {}),
    });
    ctx.log.error({ err, sessionId, assistantMessageId }, 'inference failed');
  }
@@ -125,6 +127,7 @@ export async function finalizeStreamedRow(
    cacheTokens?: number | null;
    reasoningTokens?: number | null;
    beforeComplete?: () => Promise<void>;
+    compareGroupId?: string;
  },
 ): Promise<void> {
  // v1.11.3: see executeToolPhase for the rationale.
@@ -158,6 +161,7 @@ export async function finalizeStreamedRow(
    started_at: opts.startedAt,
    finished_at: updated?.finished_at ?? null,
    model: opts.model,
+    ...(opts.compareGroupId ? { compare_group_id: opts.compareGroupId } : {}),
  });
 }

@@ -182,6 +186,7 @@ export async function finalizeEmpty(
    type: 'message_complete',
    message_id: assistantMessageId,
    chat_id: chatId,
+    ...(args.compareGroupId ? { compare_group_id: args.compareGroupId } : {}),
  });
 }

@@ -281,6 +286,7 @@ export async function finalizeCompletion(
    started_at: startedAt,
    finished_at: updated?.finished_at ?? null,
    model: session.model,
+    ...(args.compareGroupId ? { compare_group_id: args.compareGroupId } : {}),
  });
  ctx.log.info(
    {
--- a/apps/server/src/services/inference/index.ts
+++ b/apps/server/src/services/inference/index.ts
@@ -8,6 +8,7 @@ export {
  createInferenceRunner,
  MAX_STEPS,
  runInference,
+  runInferenceWithModel,
 } from './turn.js';
 // P5: the shared pipeline types moved from turn.ts to types.ts (breaking the
 // hub-and-leaf near-cycle). Re-exported here so the public surface is unchanged.
@@ -21,3 +22,4 @@ export type {
 export type { ToolPhaseResult } from './tool-phase.js';
 export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './sentinels.js';
 export { buildMessagesPayload } from './payload.js';
+export { runGraph, type GraphNodeType, type GraphState, type GraphResult } from './state-graph.js';
--- a/apps/server/src/services/inference/payload.ts
+++ b/apps/server/src/services/inference/payload.ts
@@ -194,6 +194,14 @@ export async function buildMessagesPayload(
      out.push(msg);
      continue;
    }
+    // TODO(vDeepSeek): when m has image attachments, use a content array
+    // with text + image parts (see multi-modal.ts:imageAttachmentsToParts).
+    // The AI SDK ModelMessage content shape supports:
+    //   content: [
+    //     { type: 'text', text: '...' },
+    //     { type: 'image', image: 'data:image/png;base64,...' }
+    //   ]
+    // The @ai-sdk/deepseek provider handles the image parts natively.
    out.push({ role: 'user', content: m.content });
  }
  return out;
@@ -206,7 +214,7 @@ export async function loadContext(
 ): Promise<{ session: Session; project: Project; history: Message[] } | null> {
  const sessionRows = await sql<Session[]>`
    SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at,
-           agent_id, web_search_enabled, allowed_read_paths
+           agent_id, web_search_enabled, allowed_read_paths, state_graph_enabled
    FROM sessions WHERE id = ${sessionId}
  `;
  if (sessionRows.length === 0) return null;
--- a/apps/server/src/services/inference/state-graph.ts
+++ b/apps/server/src/services/inference/state-graph.ts
@@ -0,0 +1,531 @@
+// P5: Optional declarative state graph engine for the inference turn loop.
+//
+// Replaces the procedural `while (stepNumber < effectiveCap)` in turn.ts
+// with a node-based execution model. Default OFF via
+// session.state_graph_enabled — zero behavior change when disabled.
+//
+// Nodes wrap EXISTING infrastructure (no new I/O patterns):
+//   PLAN      → top-of-loop gate, compaction, loadContext, buildMessagesPayload,
+//               executeStreamPhase
+//   CALL_TOOL → executeToolPhase
+//   OBSERVE   → process tool results, update loop locals
+//   REFLECT   → decidePostToolAction, sentinel insertion, mistake tracker
+//   SYNTHESIZE → terminal (graph loop exits)
+
+import type { Agent, Project, Session, ToolCall } from '../../types/api.js';
+import { resolveProjectRoot } from '../path_guard.js';
+import { rewriteSearchQuery } from '../task-search-rewrite.js';
+import * as compaction from '../compaction.js';
+import { decideStep, decidePostToolAction } from './step-decision.js';
+import {
+  recordStep,
+  MISTAKE_RECOVERY_NOTE,
+  type MistakeState,
+} from './mistake-tracker.js';
+import {
+  buildMessagesPayload,
+  loadContext,
+} from './payload.js';
+import { toDcpMessages, transformMessages, fromDcpMessages } from './dcp/index.js';
+import {
+  finalizeCompletion,
+  finalizeEmpty,
+  handleAbortOrError,
+} from './error-handler.js';
+import {
+  executeStreamPhase,
+} from './stream-phase.js';
+import { executeToolPhase, type ToolPhaseResult } from './tool-phase.js';
+import type {
+  InferenceContext,
+  StreamPhaseState,
+  StreamResult,
+  TurnArgs,
+} from './types.js';
+import {
+  runCapHitSummary,
+  runDoomLoopSummary,
+  insertMistakeRecoverySentinel,
+} from './sentinel-summaries.js';
+import { execFile } from 'node:child_process';
+import { readFileSync, existsSync } from 'node:fs';
+import { join } from 'node:path';
+
+const BUILD_TIMEOUT_MS = 60_000;
+const BUILD_OUTPUT_CAP = 8_000;
+
+async function detectAndRunBuild(
+  ctx: InferenceContext,
+  projectRoot: string,
+  sessionId: string,
+  chatId: string,
+  model: string,
+  existingNote: string | undefined,
+): Promise<string | undefined> {
+  if (!model.startsWith('deepseek-')) return undefined;
+  const pkgPath = join(projectRoot, 'package.json');
+  if (!existsSync(pkgPath)) return undefined;
+  let buildCmd: string | null = null;
+  try {
+    const pkg = JSON.parse(readFileSync(pkgPath, 'utf8')) as { scripts?: Record<string, string> };
+    if (pkg.scripts?.build) buildCmd = 'build';
+    else if (pkg.scripts?.compile) buildCmd = 'compile';
+    else if (pkg.scripts?.typecheck) buildCmd = 'typecheck';
+  } catch {
+    return undefined;
+  }
+  if (!buildCmd) return undefined;
+  const hasPnpm = existsSync(join(projectRoot, 'pnpm-lock.yaml'));
+  const hasYarn = existsSync(join(projectRoot, 'yarn.lock'));
+  const pm = hasPnpm ? 'pnpm' : hasYarn ? 'yarn' : 'npm';
+  try {
+    const out = await new Promise<string>((resolve, reject) => {
+      execFile(pm, ['run', buildCmd!], { cwd: projectRoot, timeout: BUILD_TIMEOUT_MS, maxBuffer: BUILD_OUTPUT_CAP * 2 },
+        (err, stdout, stderr) => {
+          if (err && (err as NodeJS.ErrnoException).code === 'ENOENT') {
+            resolve('');
+            return;
+          }
+          const merged = (stdout + '\n' + stderr).trim();
+          resolve(merged.slice(0, BUILD_OUTPUT_CAP));
+        },
+      );
+    });
+    if (!out) return undefined;
+    ctx.log.info({ sessionId, chatId, buildCmd, outputLen: out.length }, 'auto-fix: build failed');
+    const combined = existingNote
+      ? existingNote + '\n\n--- Build error ---\n' + out.slice(0, BUILD_OUTPUT_CAP - existingNote.length)
+      : '--- Build error ---\n' + out.slice(0, BUILD_OUTPUT_CAP);
+    return combined;
+  } catch {
+    return undefined;
+  }
+}
+
+// -- Types ----------------------------------------------------------------
+
+export type GraphNodeType = 'PLAN' | 'CALL_TOOL' | 'OBSERVE' | 'REFLECT' | 'SYNTHESIZE';
+
+export interface GraphState {
+  stepNumber: number;
+  toolsUsed: number;
+  recentToolCalls: ToolCall[];
+  assistantMessageId: string;
+  mistakeTracker: MistakeState;
+  pendingRecoveryNote?: string;
+  effectiveCap: number;
+  budget: number;
+  projectRoot: string;
+  iterSession?: Session;
+  iterProject?: Project;
+  streamResult?: StreamResult;
+  startedAt?: string | null;
+  toolPhaseResult?: ToolPhaseResult;
+  shouldStop: boolean;
+}
+
+interface GraphNode {
+  type: GraphNodeType;
+  edges: Array<{ to: GraphNodeType; condition: (state: GraphState) => boolean }>;
+  execute: (
+    ctx: InferenceContext,
+    args: TurnArgs,
+    state: GraphState,
+    agent: Agent | null,
+  ) => Promise<void>;
+}
+
+export interface GraphResult {
+  stepNumber: number;
+  assistantMessageId: string;
+  toolsUsed: number;
+  recentToolCalls: ToolCall[];
+  mistakeTracker: MistakeState;
+}
+
+// -- Default graph --------------------------------------------------------
+
+export function createDefaultGraph(): GraphNode[] {
+  return [
+    {
+      type: 'PLAN',
+      edges: [
+        { to: 'CALL_TOOL', condition: (s) => !!s.streamResult && s.streamResult.toolCalls.length > 0 },
+        { to: 'SYNTHESIZE', condition: () => true },
+      ],
+      execute: planNode,
+    },
+    {
+      type: 'CALL_TOOL',
+      edges: [
+        { to: 'OBSERVE', condition: () => true },
+      ],
+      execute: callToolNode,
+    },
+    {
+      type: 'OBSERVE',
+      edges: [
+        { to: 'REFLECT', condition: (s) => s.toolPhaseResult?.action === 'continue' },
+        { to: 'SYNTHESIZE', condition: () => true },
+      ],
+      execute: observeNode,
+    },
+    {
+      type: 'REFLECT',
+      edges: [
+        { to: 'PLAN', condition: (s) => s.stepNumber < s.effectiveCap },
+        { to: 'SYNTHESIZE', condition: () => true },
+      ],
+      execute: reflectNode,
+    },
+    {
+      type: 'SYNTHESIZE',
+      edges: [],
+      execute: async () => {},
+    },
+  ];
+}
+
+// -- Graph runner ---------------------------------------------------------
+
+export async function runGraph(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  extra: { effectiveCap: number; budget: number; agent: Agent | null; projectRoot: string },
+): Promise<GraphResult> {
+  const { effectiveCap, budget, agent } = extra;
+
+  const state: GraphState = {
+    stepNumber: 0,
+    toolsUsed: args.toolsUsed,
+    recentToolCalls: args.recentToolCalls,
+    assistantMessageId: args.assistantMessageId,
+    mistakeTracker: args.mistakeTracker,
+    pendingRecoveryNote: args.pendingRecoveryNote,
+    effectiveCap,
+    budget,
+    projectRoot: extra.projectRoot,
+    shouldStop: false,
+  };
+
+  const graph = createDefaultGraph();
+  let currentNode: GraphNodeType = 'PLAN';
+
+  while (currentNode !== 'SYNTHESIZE' && !state.shouldStop) {
+    const node = graph.find((n) => n.type === currentNode)!;
+    await node.execute(ctx, args, state, agent);
+    if (state.shouldStop) break;
+    const nextEdge = node.edges.find((e) => e.condition(state));
+    if (!nextEdge) break;
+    currentNode = nextEdge.to;
+  }
+
+  return {
+    stepNumber: state.stepNumber,
+    assistantMessageId: state.assistantMessageId,
+    toolsUsed: state.toolsUsed,
+    recentToolCalls: state.recentToolCalls,
+    mistakeTracker: state.mistakeTracker,
+  };
+}
+
+// -- PLAN node ------------------------------------------------------------
+// Top-of-loop gate → compaction → loadContext → DCP → buildPayload → stream
+
+async function planNode(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  state: GraphState,
+  agent: Agent | null,
+): Promise<void> {
+  const { sessionId, chatId, signal } = args;
+
+  // 1. Top-of-loop gate: doom-loop, then budget (pure decisions)
+  const decision = decideStep({
+    recentToolCalls: state.recentToolCalls,
+    toolsUsed: state.toolsUsed,
+    budget: state.budget,
+  });
+
+  if (decision.kind === 'doom') {
+    const loaded = await loadContext(ctx.sql, sessionId, chatId);
+    if (loaded) {
+      const dlSession = args.modelOverride ? { ...loaded.session, model: args.modelOverride } : loaded.session;
+      const iterArgs: TurnArgs = {
+        sessionId, chatId, assistantMessageId: state.assistantMessageId,
+        toolsUsed: state.toolsUsed, recentToolCalls: state.recentToolCalls,
+        mistakeTracker: state.mistakeTracker, signal,
+      };
+      await runDoomLoopSummary(ctx, iterArgs, dlSession, loaded.project, loaded.history, agent, decision.loop);
+    }
+    state.shouldStop = true;
+    return;
+  }
+
+  if (decision.kind === 'budget') {
+    const loaded = await loadContext(ctx.sql, sessionId, chatId);
+    if (loaded) {
+      const bhSession = args.modelOverride ? { ...loaded.session, model: args.modelOverride } : loaded.session;
+      const iterArgs: TurnArgs = {
+        sessionId, chatId, assistantMessageId: state.assistantMessageId,
+        toolsUsed: state.toolsUsed, recentToolCalls: state.recentToolCalls,
+        mistakeTracker: state.mistakeTracker, signal,
+      };
+      await runCapHitSummary(ctx, iterArgs, bhSession, loaded.project, loaded.history, agent, state.budget);
+    }
+    state.shouldStop = true;
+    return;
+  }
+
+  // decision.kind === 'stream' → proceed.
+
+  // 2. Compaction check
+  const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
+    SELECT needs_compaction FROM chats WHERE id = ${chatId}
+  `;
+  if (chatFlag[0]?.needs_compaction) {
+    try {
+      await compaction.process({
+        sql: ctx.sql, config: ctx.config, log: ctx.log,
+        broker: ctx.broker, chatId, hooks: ctx.hooks,
+      });
+    } catch (err) {
+      ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
+      await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
+    }
+  }
+
+  // 3. Load context (must re-load each iteration — new messages)
+  const loaded = await loadContext(ctx.sql, sessionId, chatId);
+  if (!loaded) {
+    ctx.log.warn({ sessionId }, 'inference: session or project missing mid-loop');
+    state.shouldStop = true;
+    return;
+  }
+  let { session: iterSession, project: iterProject, history } = loaded;
+  if (args.modelOverride) {
+    iterSession = { ...iterSession, model: args.modelOverride };
+  }
+  state.iterSession = iterSession;
+  state.iterProject = iterProject;
+  const projectRoot = await resolveProjectRoot(iterProject.path);
+  state.projectRoot = projectRoot;
+
+  // 4. DCP transform
+  try {
+    const dcpMsgs = toDcpMessages(history);
+    const { messages: pruned, stats } = transformMessages(chatId, dcpMsgs);
+    if (stats.removedCount > 0) {
+      ctx.log.info({ chatId, ...stats }, 'dcp: transform removed messages');
+      history = fromDcpMessages(pruned) as typeof history;
+    }
+  } catch (err) {
+    ctx.log.warn({ err: err instanceof Error ? err.message : String(err), chatId }, 'dcp: transform skipped');
+  }
+
+  // 5. Log step boundary
+  ctx.log.info(
+    { sessionId, chatId, step: state.stepNumber, assistantMessageId: state.assistantMessageId },
+    'step_start',
+  );
+
+  // 6. Build messages + stream phase
+  const messages = await buildMessagesPayload(iterSession, iterProject, history, agent, ctx.log);
+  const webToolsEnabled =
+    iterSession.web_search_enabled ?? iterProject.default_web_search_enabled ?? false;
+
+  if (state.stepNumber === 0 && webToolsEnabled && messages.length >= 2) {
+    const lastUserMsg = [...messages].reverse().find((m) => m.role === 'user');
+    if (lastUserMsg?.content) {
+      const hint = await rewriteSearchQuery(lastUserMsg.content);
+      if (hint && messages[0]?.role === 'system' && messages[0].content) {
+        messages[0].content += `\n\nThe user's search intent can be summarized as: "${hint}"`;
+      }
+    }
+  }
+
+  if (state.pendingRecoveryNote) {
+    messages.push({ role: 'system', content: state.pendingRecoveryNote });
+    state.pendingRecoveryNote = undefined;
+  }
+
+  // 7. Stream phase
+  const iterArgs: TurnArgs = {
+    sessionId, chatId, assistantMessageId: state.assistantMessageId,
+    toolsUsed: state.toolsUsed, recentToolCalls: state.recentToolCalls,
+    mistakeTracker: state.mistakeTracker, signal,
+  };
+  const streamState: StreamPhaseState = { accumulated: '', startedAt: null };
+  try {
+    const result = await executeStreamPhase(ctx, iterArgs, iterSession, messages, streamState, agent, webToolsEnabled);
+    state.streamResult = result;
+    state.startedAt = streamState.startedAt;
+
+    // Non-tool finish: Stop hook + finalize here (edge from PLAN → SYNTHESIZE
+    // will break the graph loop after this node returns).
+    if (result.toolCalls.length === 0) {
+      if (ctx.hooks) {
+        ctx.hooks.run('Stop', {
+          event: 'Stop',
+          session_id: sessionId,
+          chat_id: chatId,
+          last_assistant_text: result.content.slice(0, 500),
+          turn: state.stepNumber,
+        }).catch(() => {});
+      }
+      await finalizeCompletion(ctx, iterArgs, result, streamState.startedAt, iterSession);
+    }
+  } catch (err) {
+    await handleAbortOrError(ctx, iterArgs, streamState.accumulated, err);
+    state.shouldStop = true;
+  }
+}
+
+// -- CALL_TOOL node -------------------------------------------------------
+// Executes the tool phase and stores the result for OBSERVE.
+
+async function callToolNode(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  state: GraphState,
+  agent: Agent | null,
+): Promise<void> {
+  const { sessionId, chatId } = args;
+  const result = state.streamResult;
+  if (!result) {
+    ctx.log.warn({ sessionId }, 'state-graph: CALL_TOOL without stream result');
+    state.shouldStop = true;
+    return;
+  }
+  const session = state.iterSession;
+  if (!session) {
+    ctx.log.warn({ sessionId }, 'state-graph: CALL_TOOL without iterSession');
+    state.shouldStop = true;
+    return;
+  }
+
+  try {
+    state.toolPhaseResult = await executeToolPhase(
+      ctx, args, result, state.startedAt ?? null,
+      session, state.projectRoot, agent, state.stepNumber,
+    );
+  } catch (err) {
+    ctx.log.error({ err, sessionId, chatId, step: state.stepNumber }, 'tool phase threw unexpectedly');
+    state.shouldStop = true;
+  }
+}
+
+// -- OBSERVE node ---------------------------------------------------------
+// Processes tool results: updates loop locals, mistake tracking, build errors.
+
+async function observeNode(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  state: GraphState,
+  _agent: Agent | null,
+): Promise<void> {
+  const { sessionId, chatId } = args;
+  const tpr = state.toolPhaseResult;
+  if (!tpr) {
+    state.shouldStop = true;
+    return;
+  }
+
+  // Update loop locals (mirrors the existing while-loop post-tool logic)
+  state.toolsUsed += tpr.toolCallCount;
+  state.recentToolCalls = [...state.recentToolCalls, ...tpr.toolCalls];
+  state.stepNumber++;
+
+  // Fold tool outcomes into the mistake tracker
+  for (const o of tpr.outcomes) {
+    recordStep(state.mistakeTracker, o);
+  }
+
+  // Auto-fix: after write tools, attempt build and inject errors.
+  const WRITE_TOOLS = new Set(['edit_file', 'create_file', 'delete_file', 'apply_pending']);
+  const hasWriteTools = tpr.toolCalls.some((tc) => WRITE_TOOLS.has(tc.name));
+  if (hasWriteTools && state.iterSession) {
+    detectAndRunBuild(ctx, state.projectRoot, sessionId, chatId, state.iterSession.model, state.pendingRecoveryNote)
+      .then((buildError) => {
+        if (buildError) state.pendingRecoveryNote = buildError;
+      })
+      .catch(() => {});
+  }
+}
+
+// -- REFLECT node ---------------------------------------------------------
+// Post-tool decision: decidePostToolAction, nudge/escalate/continue handling.
+
+async function reflectNode(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  state: GraphState,
+  _agent: Agent | null,
+): Promise<void> {
+  const { sessionId, chatId, signal } = args;
+  const tpr = state.toolPhaseResult;
+  if (!tpr) {
+    state.shouldStop = true;
+    return;
+  }
+
+  const post = decidePostToolAction(tpr.action, state.mistakeTracker);
+
+  if (post === 'stop') {
+    state.shouldStop = true;
+    return;
+  }
+
+  if (post === 'nudge') {
+    state.pendingRecoveryNote = MISTAKE_RECOVERY_NOTE;
+    const failureKinds = [...state.mistakeTracker.run];
+    await insertMistakeRecoverySentinel(ctx, sessionId, chatId, {
+      failureKinds,
+      count: failureKinds.length,
+      escalated: false,
+      canContinue: true,
+    });
+    state.mistakeTracker.nudges += 1;
+    state.mistakeTracker.run = [];
+    ctx.log.info(
+      { sessionId, chatId, step: state.stepNumber, nudges: state.mistakeTracker.nudges, failureKinds },
+      'mistake_recovery nudge',
+    );
+    // Continue to next PLAN node — edges check step < cap.
+    if (state.assistantMessageId !== tpr.nextAssistantId && tpr.nextAssistantId) {
+      state.assistantMessageId = tpr.nextAssistantId;
+    }
+    return;
+  }
+
+  if (post === 'escalate') {
+    const failureKinds = [...state.mistakeTracker.run];
+    if (tpr.nextAssistantId) {
+      state.assistantMessageId = tpr.nextAssistantId;
+    }
+    const escalateArgs: TurnArgs = {
+      sessionId, chatId, assistantMessageId: state.assistantMessageId,
+      toolsUsed: state.toolsUsed, recentToolCalls: state.recentToolCalls,
+      mistakeTracker: state.mistakeTracker, signal,
+    };
+    await finalizeEmpty(ctx, escalateArgs);
+    await insertMistakeRecoverySentinel(ctx, sessionId, chatId, {
+      failureKinds,
+      count: failureKinds.length,
+      escalated: true,
+      canContinue: true,
+    });
+    ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() });
+    ctx.log.info(
+      { sessionId, chatId, step: state.stepNumber, failureKinds },
+      'mistake_recovery escalate — stopping turn',
+    );
+    state.shouldStop = true;
+    return;
+  }
+
+  // 'continue' — advance to next assistant message.
+  if (tpr.nextAssistantId) {
+    state.assistantMessageId = tpr.nextAssistantId;
+  }
+}
--- a/apps/server/src/services/inference/stream-phase.ts
+++ b/apps/server/src/services/inference/stream-phase.ts
@@ -56,6 +56,7 @@ export async function executeStreamPhase(
    message_id: assistantMessageId,
    chat_id: chatId,
    role: 'assistant',
+    ...(args.compareGroupId ? { compare_group_id: args.compareGroupId } : {}),
  });

  const flusher = createContentFlusher(ctx.sql, assistantMessageId, () => state.accumulated);
@@ -119,6 +120,7 @@ export async function executeStreamPhase(
          message_id: assistantMessageId,
          chat_id: chatId,
          content: delta,
+          ...(args.compareGroupId ? { compare_group_id: args.compareGroupId } : {}),
        });
        ctx.log.debug({ sessionId, delta }, 'inference delta');
        flusher.scheduleFlush();
--- a/apps/server/src/services/inference/supervisor.ts
+++ b/apps/server/src/services/inference/supervisor.ts
@@ -0,0 +1,75 @@
+// Supervisor agent: routes user requests to the best agent via a cheap LLM
+// classification call. Activated when session.agent_id === 'supervisor'.
+
+import type { Agent } from '../../types/api.js';
+import { taskModelCompletion } from '../task-model.js';
+
+export interface SupervisorRoute {
+  agent_id: string;
+  confidence: number;
+  reasoning: string;
+}
+
+const SUPERVISOR_SYSTEM_PROMPT = `You are a router. Given the user's request and the available agents, choose the best agent to handle the request.
+
+Rules:
+- Match the request to the agent whose description and toolset best fits the task.
+- For code review / bug finding requests → code-reviewer
+- For debugging / diagnosing failures → debugger
+- For refactoring / simplifying code → refactorer
+- For architecture / design / planning → architect or planner
+- For security audits → security-auditor
+- For building prompts for other agents → prompt-builder
+- For exploring / understanding unfamiliar code → recon
+- For implementing / writing code changes → builder
+- Respond with ONLY the agent id (e.g. "builder") or "none" if no agent fits.
+- Do not include any other text, punctuation, or explanation.`;
+
+const MAX_ROUTING_TOKENS = 30;
+
+/**
+ * Given the user's latest message and available agents, classifies which agent
+ * should handle this turn. Returns null to fall through to default (no agent).
+ */
+export async function resolveSupervisorTurn(
+  latestUserMessage: string,
+  agents: Agent[],
+  fallbackModel?: string,
+): Promise<SupervisorRoute | null> {
+  // Build agent listing — skip the supervisor itself to avoid self-routing.
+  const agentList = agents
+    .filter((a) => a.id !== 'supervisor')
+    .map((a) => `- ${a.id}: ${a.description} (${a.tools.length} tools)`)
+    .join('\n');
+
+  if (!agentList) {
+    return null;
+  }
+
+  const userPrompt = `Available agents:\n${agentList}\n\nUser request: ${latestUserMessage.slice(0, 2000)}`;
+
+  const response = await taskModelCompletion({
+    system: SUPERVISOR_SYSTEM_PROMPT,
+    user: userPrompt,
+    maxTokens: MAX_ROUTING_TOKENS,
+    temperature: 0.1,
+    fallbackModel,
+  });
+
+  const agentId = response.trim().toLowerCase();
+  if (!agentId || agentId === 'none') {
+    return null;
+  }
+
+  // Map back to a real agent to validate the id.
+  const matched = agents.find((a) => a.id === agentId);
+  if (!matched) {
+    return null;
+  }
+
+  return {
+    agent_id: matched.id,
+    confidence: 1,
+    reasoning: `supervisor routed to "${matched.name}" based on request classification`,
+  };
+}
--- a/apps/server/src/services/inference/turn.ts
+++ b/apps/server/src/services/inference/turn.ts
@@ -8,7 +8,7 @@ import type {
 import { resolveProjectRoot } from '../path_guard.js';
 import { maybeAutoNameChat } from '../auto_name.js';
 import { rewriteSearchQuery } from '../task-search-rewrite.js';
-import { getAgentById } from '../agents.js';
+import { getAgentById, getAgentsForProject } from '../agents.js';
 import * as compaction from '../compaction.js';
 import { resolveTurnConfig } from './turn-config.js';
 import { decideStep, decidePostToolAction } from './step-decision.js';
@@ -49,6 +49,8 @@ import {
  runStepCapSummary,
  insertMistakeRecoverySentinel,
 } from './sentinel-summaries.js';
+import { resolveSupervisorTurn } from './supervisor.js';
+import { runGraph } from './state-graph.js';

 // vWhale: auto-fix — detect build command from package.json, run it, return
 // error text for injection into next iteration. Best-effort, never throws.
@@ -149,10 +151,39 @@ export async function runAssistantTurn(
    ctx.log.warn({ sessionId }, 'inference: session or project missing');
    return;
  }
-  const { session, project } = initialLoaded;
-  const agent = session.agent_id
+  let { session, project, history: initialHistory } = initialLoaded;
+  if (args.modelOverride) {
+    session = { ...session, model: args.modelOverride };
+  }
+  let agent = session.agent_id
    ? await getAgentById(project.path, session.agent_id)
    : null;
+
+  // vSupervisor: if the session is set to supervisor mode, resolve the real
+  // agent via a cheap classification call. Falls through to default (no agent)
+  // if routing returns null.
+  if (agent?.id === 'supervisor') {
+    const { agents: availableAgents } = await getAgentsForProject(project.path);
+    const latestUser = [...initialHistory].reverse().find((m) => m.role === 'user');
+    const userMessage = latestUser?.content ?? '';
+    if (userMessage) {
+      const route = await resolveSupervisorTurn(userMessage, availableAgents, session.model ?? undefined);
+      if (route) {
+        ctx.log.info(
+          { sessionId, chatId, resolvedAgent: route.agent_id, reasoning: route.reasoning },
+          'supervisor: routed turn',
+        );
+        agent = await getAgentById(project.path, route.agent_id);
+      } else {
+        ctx.log.info({ sessionId, chatId }, 'supervisor: no agent matched, falling through to default');
+        agent = null;
+      }
+    } else {
+      ctx.log.info({ sessionId, chatId }, 'supervisor: no user message found, falling through to default');
+      agent = null;
+    }
+  }
+
  // P5: pure per-turn config (budget + cap math + text-only flag).
  const { effectiveCap, budget, isTextOnly } = resolveTurnConfig(agent);

@@ -162,7 +193,8 @@ export async function runAssistantTurn(
  if (isTextOnly) {
    const loaded = await loadContext(ctx.sql, sessionId, chatId);
    if (loaded) {
-      await runTextOnlyTurn(ctx, args, loaded.session, loaded.project, loaded.history, agent);
+      const txtSession = args.modelOverride ? { ...loaded.session, model: args.modelOverride } : loaded.session;
+      await runTextOnlyTurn(ctx, args, txtSession, loaded.project, loaded.history, agent);
    }
    return;
  }
@@ -178,228 +210,244 @@ export async function runAssistantTurn(
  const mistakeTracker = args.mistakeTracker;
  let pendingRecoveryNote: string | undefined = args.pendingRecoveryNote;

-  while (stepNumber < effectiveCap) {
-    // ---- top-of-loop gate: doom-loop, then budget (pure decision) ----
-    const decision = decideStep({ recentToolCalls, toolsUsed, budget });
-    if (decision.kind === 'doom') {
-      // Need fresh history for the summary.
-      const loaded = await loadContext(ctx.sql, sessionId, chatId);
-      if (loaded) {
-        const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
-        await runDoomLoopSummary(ctx, iterArgs, loaded.session, loaded.project, loaded.history, agent, decision.loop);
+  if (session.state_graph_enabled) {
+    // ---- optional state graph path ----
+    const gProjectRoot = await resolveProjectRoot(project.path);
+    const graphResult = await runGraph(ctx, args, { effectiveCap, budget, agent, projectRoot: gProjectRoot });
+    stepNumber = graphResult.stepNumber;
+    toolsUsed = graphResult.toolsUsed;
+    recentToolCalls = graphResult.recentToolCalls;
+    assistantMessageId = graphResult.assistantMessageId;
+    // mistakeTracker is the same object reference (mutated in place by the graph).
+  } else {
+    while (stepNumber < effectiveCap) {
+      // ---- top-of-loop gate: doom-loop, then budget (pure decision) ----
+      const decision = decideStep({ recentToolCalls, toolsUsed, budget });
+      if (decision.kind === 'doom') {
+        // Need fresh history for the summary.
+        const loaded = await loadContext(ctx.sql, sessionId, chatId);
+        if (loaded) {
+          const dlSession = args.modelOverride ? { ...loaded.session, model: args.modelOverride } : loaded.session;
+          const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
+          await runDoomLoopSummary(ctx, iterArgs, dlSession, loaded.project, loaded.history, agent, decision.loop);
+        }
+        break;
      }
-      break;
-    }
-    if (decision.kind === 'budget') {
-      const loaded = await loadContext(ctx.sql, sessionId, chatId);
-      if (loaded) {
-        const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
-        await runCapHitSummary(ctx, iterArgs, loaded.session, loaded.project, loaded.history, agent, budget);
+      if (decision.kind === 'budget') {
+        const loaded = await loadContext(ctx.sql, sessionId, chatId);
+        if (loaded) {
+          const bhSession = args.modelOverride ? { ...loaded.session, model: args.modelOverride } : loaded.session;
+          const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
+          await runCapHitSummary(ctx, iterArgs, bhSession, loaded.project, loaded.history, agent, budget);
+        }
+        break;
      }
-      break;
-    }
-    // decision.kind === 'stream' → proceed with compaction + stream + tools.
+      // decision.kind === 'stream' → proceed with compaction + stream + tools.

-    // ---- compaction check ----
-    // v1.11: if the prior turn flagged this chat for compaction, run it
-    // before loadContext so we read post-compaction history. Swallow
-    // failures and proceed with un-compacted history.
-    const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
-      SELECT needs_compaction FROM chats WHERE id = ${chatId}
-    `;
-    if (chatFlag[0]?.needs_compaction) {
-      try {
-        await compaction.process({
-          sql: ctx.sql,
-          config: ctx.config,
-          log: ctx.log,
-          broker: ctx.broker,
-          chatId,
-          hooks: ctx.hooks,
-        });
-      } catch (err) {
-        ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
-        await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
-      }
-    }
-
-    // ---- load context (must re-load each iteration — new messages since last step) ----
-    const loaded = await loadContext(ctx.sql, sessionId, chatId);
-    if (!loaded) {
-      ctx.log.warn({ sessionId }, 'inference: session or project missing mid-loop');
-      break;
-    }
-    let { session: iterSession, project: iterProject, history } = loaded;
-    const projectRoot = await resolveProjectRoot(iterProject.path);
-
-    try {
-      const dcpMsgs = toDcpMessages(history);
-      const { messages: pruned, stats } = transformMessages(chatId, dcpMsgs);
-      if (stats.removedCount > 0) {
-        ctx.log.info({ chatId, ...stats }, 'dcp: transform removed messages');
-        history = fromDcpMessages(pruned) as typeof history;
-      }
-    } catch (err) {
-      ctx.log.warn({ err: err instanceof Error ? err.message : String(err), chatId }, 'dcp: transform skipped');
-    }
-
-    // v1.14.0: log step boundary for instrumentation. step_start parts are in
-    // the schema CHECK but not emitted here — writing to the assistant message
-    // before the stream phase creates a sequence-0 collision with
-    // partsFromAssistantMessage. A WS frame or structured log is sufficient
-    // since the frontend doesn't render step boundaries in v1.14.
-    ctx.log.info({ sessionId, chatId, step: stepNumber, assistantMessageId }, 'step_start');
-
-    // ---- build messages + stream phase ----
-    const messages = await buildMessagesPayload(iterSession, iterProject, history, agent, ctx.log);
-    const webToolsEnabled =
-      iterSession.web_search_enabled ?? iterProject.default_web_search_enabled ?? false;
-
-    if (stepNumber === 0 && webToolsEnabled && messages.length >= 2) {
-      const lastUserMsg = [...messages].reverse().find((m) => m.role === 'user');
-      if (lastUserMsg?.content) {
-        const hint = await rewriteSearchQuery(lastUserMsg.content);
-        if (hint && messages[0]?.role === 'system' && messages[0].content) {
-          messages[0].content += `\n\nThe user's search intent can be summarized as: "${hint}"`;
+      // ---- compaction check ----
+      // v1.11: if the prior turn flagged this chat for compaction, run it
+      // before loadContext so we read post-compaction history. Swallow
+      // failures and proceed with un-compacted history.
+      const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
+        SELECT needs_compaction FROM chats WHERE id = ${chatId}
+      `;
+      if (chatFlag[0]?.needs_compaction) {
+        try {
+          await compaction.process({
+            sql: ctx.sql,
+            config: ctx.config,
+            log: ctx.log,
+            broker: ctx.broker,
+            chatId,
+            hooks: ctx.hooks,
+          });
+        } catch (err) {
+          ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
+          await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
        }
      }
-    }

-    // v#12 MistakeTracker: if the prior iteration's nudge fired, append the
-    // transient recovery note to THIS payload (consumed exactly once, then
-    // cleared). Never persisted — same lifecycle as the cap-hit/doom-loop
-    // summary notes, which live only inside the in-memory messages array.
-    if (pendingRecoveryNote) {
-      messages.push({ role: 'system', content: pendingRecoveryNote });
-      pendingRecoveryNote = undefined;
-    }
-
-    const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
-    const state: StreamPhaseState = { accumulated: '', startedAt: null };
-    let result: StreamResult;
-    try {
-      result = await executeStreamPhase(ctx, iterArgs, iterSession, messages, state, agent, webToolsEnabled);
-    } catch (err) {
-      await handleAbortOrError(ctx, iterArgs, state.accumulated, err);
-      break;
-    }
-
-    // ---- non-tool finish → finalize and exit ----
-    if (result.toolCalls.length === 0) {
-      // vWhale: Stop hook (best-effort, non-blocking).
-      if (ctx.hooks) {
-        ctx.hooks.run('Stop', {
-          event: 'Stop',
-          session_id: sessionId,
-          chat_id: chatId,
-          last_assistant_text: result.content.slice(0, 500),
-          turn: stepNumber,
-        }).catch(() => {});
+      // ---- load context (must re-load each iteration — new messages since last step) ----
+      const loaded = await loadContext(ctx.sql, sessionId, chatId);
+      if (!loaded) {
+        ctx.log.warn({ sessionId }, 'inference: session or project missing mid-loop');
+        break;
      }
-      await finalizeCompletion(ctx, iterArgs, result, state.startedAt, iterSession);
-      break;
-    }
+      let { session: iterSession, project: iterProject, history } = loaded;
+      if (args.modelOverride) {
+        iterSession = { ...iterSession, model: args.modelOverride };
+      }
+      const projectRoot = await resolveProjectRoot(iterProject.path);

-    // ---- steps: 0 edge case ----
-    // effectiveCap check above guarantees we're inside the loop, but this
-    // guard handles the theoretical case where the model emits tool calls
-    // on step 0 when effectiveCap would have been 0 (impossible since the
-    // while condition prevents entry, but kept for safety). If effectiveCap
-    // is 1 and we're on step 0, tool calls ARE executed — steps counts
-    // iterations, not post-first-stream.
+      try {
+        const dcpMsgs = toDcpMessages(history);
+        const { messages: pruned, stats } = transformMessages(chatId, dcpMsgs);
+        if (stats.removedCount > 0) {
+          ctx.log.info({ chatId, ...stats }, 'dcp: transform removed messages');
+          history = fromDcpMessages(pruned) as typeof history;
+        }
+      } catch (err) {
+        ctx.log.warn({ err: err instanceof Error ? err.message : String(err), chatId }, 'dcp: transform skipped');
+      }

-    // ---- tool phase ----
-    let toolPhaseResult: ToolPhaseResult;
-    try {
-      toolPhaseResult = await executeToolPhase(ctx, iterArgs, result, state.startedAt, iterSession, projectRoot, agent, stepNumber);
-    } catch (err) {
-      // Tool phase errors are unexpected (individual tool failures are
-      // caught inside executeToolPhase). Log and break.
-      ctx.log.error({ err, sessionId, chatId, step: stepNumber }, 'tool phase threw unexpectedly');
-      break;
-    }
+      // v1.14.0: log step boundary for instrumentation. step_start parts are in
+      // the schema CHECK but not emitted here — writing to the assistant message
+      // before the stream phase creates a sequence-0 collision with
+      // partsFromAssistantMessage. A WS frame or structured log is sufficient
+      // since the frontend doesn't render step boundaries in v1.14.
+      ctx.log.info({ sessionId, chatId, step: stepNumber, assistantMessageId }, 'step_start');

-    // ---- update loop locals ----
-    toolsUsed += toolPhaseResult.toolCallCount;
-    recentToolCalls = [...recentToolCalls, ...toolPhaseResult.toolCalls];
-    stepNumber++;
+      // ---- build messages + stream phase ----
+      const messages = await buildMessagesPayload(iterSession, iterProject, history, agent, ctx.log);
+      const webToolsEnabled =
+        iterSession.web_search_enabled ?? iterProject.default_web_search_enabled ?? false;

-    // v#12 MistakeTracker: fold this iteration's tool outcomes into the
-    // tracker, in order. recordStep mutates `mistakeTracker` in place (it is
-    // the same object referenced by args). A 'success' clears the streak.
-    for (const o of toolPhaseResult.outcomes) {
-      recordStep(mistakeTracker, o);
-    }
+      if (stepNumber === 0 && webToolsEnabled && messages.length >= 2) {
+        const lastUserMsg = [...messages].reverse().find((m) => m.role === 'user');
+        if (lastUserMsg?.content) {
+          const hint = await rewriteSearchQuery(lastUserMsg.content);
+          if (hint && messages[0]?.role === 'system' && messages[0].content) {
+            messages[0].content += `\n\nThe user's search intent can be summarized as: "${hint}"`;
+          }
+        }
+      }

-    // vWhale: auto-fix — after write tools, attempt build and inject errors.
-    const WRITE_TOOLS = new Set(['edit_file', 'create_file', 'delete_file', 'apply_pending']);
-    const hasWriteTools = toolPhaseResult.toolCalls.some((tc) => WRITE_TOOLS.has(tc.name));
-    if (hasWriteTools) {
-      detectAndRunBuild(ctx, projectRoot, sessionId, chatId, iterSession.model, pendingRecoveryNote)
-        .then((buildError) => {
-          if (buildError) pendingRecoveryNote = buildError;
-        })
-        .catch(() => {});
-    }
+      // v#12 MistakeTracker: if the prior iteration's nudge fired, append the
+      // transient recovery note to THIS payload (consumed exactly once, then
+      // cleared). Never persisted — same lifecycle as the cap-hit/doom-loop
+      // summary notes, which live only inside the in-memory messages array.
+      if (pendingRecoveryNote) {
+        messages.push({ role: 'system', content: pendingRecoveryNote });
+        pendingRecoveryNote = undefined;
+      }

-    // v#12 MistakeTracker: post-tool decision (pure). 'stop' = the tool phase
-    // returned a non-'continue' action ('paused' for user input, or
-    // 'synthesis_done') — neither a nudge nor an escalate would change the
-    // control flow, so the mistake check is skipped. On 'continue' the
-    // heterogeneous-failure pattern gates nudge/escalate/continue. Complements
-    // the doom-loop gate above, which only catches *identical* repeats.
-    const post = decidePostToolAction(toolPhaseResult.action, mistakeTracker);
-    if (post === 'stop') {
-      break;
-    }
-    if (post === 'nudge') {
-      // Soft intervention: inject model-facing recovery guidance into the NEXT
-      // step's payload, drop a UI sentinel, bump nudges, reset the streak, and
-      // continue. The note is consumed (and cleared) at the top of the next
-      // iteration's payload build.
-      pendingRecoveryNote = MISTAKE_RECOVERY_NOTE;
-      const failureKinds = [...mistakeTracker.run];
-      await insertMistakeRecoverySentinel(ctx, sessionId, chatId, {
-        failureKinds,
-        count: failureKinds.length,
-        escalated: false,
-        canContinue: true,
-      });
-      mistakeTracker.nudges += 1;
-      mistakeTracker.run = [];
-      ctx.log.info(
-        { sessionId, chatId, step: stepNumber, nudges: mistakeTracker.nudges, failureKinds },
-        'mistake_recovery nudge',
-      );
+      const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
+      const state: StreamPhaseState = { accumulated: '', startedAt: null };
+      let result: StreamResult;
+      try {
+        result = await executeStreamPhase(ctx, iterArgs, iterSession, messages, state, agent, webToolsEnabled);
+      } catch (err) {
+        await handleAbortOrError(ctx, iterArgs, state.accumulated, err);
+        break;
+      }
+
+      // ---- non-tool finish → finalize and exit ----
+      if (result.toolCalls.length === 0) {
+        // vWhale: Stop hook (best-effort, non-blocking).
+        if (ctx.hooks) {
+          ctx.hooks.run('Stop', {
+            event: 'Stop',
+            session_id: sessionId,
+            chat_id: chatId,
+            last_assistant_text: result.content.slice(0, 500),
+            turn: stepNumber,
+          }).catch(() => {});
+        }
+        await finalizeCompletion(ctx, iterArgs, result, state.startedAt, iterSession);
+        break;
+      }
+
+      // ---- steps: 0 edge case ----
+      // effectiveCap check above guarantees we're inside the loop, but this
+      // guard handles the theoretical case where the model emits tool calls
+      // on step 0 when effectiveCap would have been 0 (impossible since the
+      // while condition prevents entry, but kept for safety). If effectiveCap
+      // is 1 and we're on step 0, tool calls ARE executed — steps counts
+      // iterations, not post-first-stream.
+
+      // ---- tool phase ----
+      let toolPhaseResult: ToolPhaseResult;
+      try {
+        toolPhaseResult = await executeToolPhase(ctx, iterArgs, result, state.startedAt, iterSession, projectRoot, agent, stepNumber);
+      } catch (err) {
+        // Tool phase errors are unexpected (individual tool failures are
+        // caught inside executeToolPhase). Log and break.
+        ctx.log.error({ err, sessionId, chatId, step: stepNumber }, 'tool phase threw unexpectedly');
+        break;
+      }
+
+      // ---- update loop locals ----
+      toolsUsed += toolPhaseResult.toolCallCount;
+      recentToolCalls = [...recentToolCalls, ...toolPhaseResult.toolCalls];
+      stepNumber++;
+
+      // v#12 MistakeTracker: fold this iteration's tool outcomes into the
+      // tracker, in order. recordStep mutates `mistakeTracker` in place (it is
+      // the same object referenced by args). A 'success' clears the streak.
+      for (const o of toolPhaseResult.outcomes) {
+        recordStep(mistakeTracker, o);
+      }
+
+      // vWhale: auto-fix — after write tools, attempt build and inject errors.
+      const WRITE_TOOLS = new Set(['edit_file', 'create_file', 'delete_file', 'apply_pending']);
+      const hasWriteTools = toolPhaseResult.toolCalls.some((tc) => WRITE_TOOLS.has(tc.name));
+      if (hasWriteTools) {
+        detectAndRunBuild(ctx, projectRoot, sessionId, chatId, iterSession.model, pendingRecoveryNote)
+          .then((buildError) => {
+            if (buildError) pendingRecoveryNote = buildError;
+          })
+          .catch(() => {});
+      }
+
+      // v#12 MistakeTracker: post-tool decision (pure). 'stop' = the tool phase
+      // returned a non-'continue' action ('paused' for user input, or
+      // 'synthesis_done') — neither a nudge nor an escalate would change the
+      // control flow, so the mistake check is skipped. On 'continue' the
+      // heterogeneous-failure pattern gates nudge/escalate/continue. Complements
+      // the doom-loop gate above, which only catches *identical* repeats.
+      const post = decidePostToolAction(toolPhaseResult.action, mistakeTracker);
+      if (post === 'stop') {
+        break;
+      }
+      if (post === 'nudge') {
+        // Soft intervention: inject model-facing recovery guidance into the NEXT
+        // step's payload, drop a UI sentinel, bump nudges, reset the streak, and
+        // continue. The note is consumed (and cleared) at the top of the next
+        // iteration's payload build.
+        pendingRecoveryNote = MISTAKE_RECOVERY_NOTE;
+        const failureKinds = [...mistakeTracker.run];
+        await insertMistakeRecoverySentinel(ctx, sessionId, chatId, {
+          failureKinds,
+          count: failureKinds.length,
+          escalated: false,
+          canContinue: true,
+        });
+        mistakeTracker.nudges += 1;
+        mistakeTracker.run = [];
+        ctx.log.info(
+          { sessionId, chatId, step: stepNumber, nudges: mistakeTracker.nudges, failureKinds },
+          'mistake_recovery nudge',
+        );
+        assistantMessageId = toolPhaseResult.nextAssistantId!;
+        continue;
+      }
+      if (post === 'escalate') {
+        // The nudge didn't break the failure run — stop the turn (cap-hit-style)
+        // to avoid burning the whole step budget on heterogeneous failures. The
+        // next assistant row is still 'streaming'; finalize it as an empty
+        // complete row so the slot doesn't dangle, then drop the escalate
+        // sentinel.
+        const failureKinds = [...mistakeTracker.run];
+        assistantMessageId = toolPhaseResult.nextAssistantId!;
+        const escalateArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
+        await finalizeEmpty(ctx, escalateArgs);
+        await insertMistakeRecoverySentinel(ctx, sessionId, chatId, {
+          failureKinds,
+          count: failureKinds.length,
+          escalated: true,
+          canContinue: true,
+        });
+        ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() });
+        ctx.log.info(
+          { sessionId, chatId, step: stepNumber, failureKinds },
+          'mistake_recovery escalate — stopping turn',
+        );
+        break;
+      }
+
+      // 'continue' — advance to next assistant message.
      assistantMessageId = toolPhaseResult.nextAssistantId!;
-      continue;
    }
-    if (post === 'escalate') {
-      // The nudge didn't break the failure run — stop the turn (cap-hit-style)
-      // to avoid burning the whole step budget on heterogeneous failures. The
-      // next assistant row is still 'streaming'; finalize it as an empty
-      // complete row so the slot doesn't dangle, then drop the escalate
-      // sentinel.
-      const failureKinds = [...mistakeTracker.run];
-      assistantMessageId = toolPhaseResult.nextAssistantId!;
-      const escalateArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
-      await finalizeEmpty(ctx, escalateArgs);
-      await insertMistakeRecoverySentinel(ctx, sessionId, chatId, {
-        failureKinds,
-        count: failureKinds.length,
-        escalated: true,
-        canContinue: true,
-      });
-      ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() });
-      ctx.log.info(
-        { sessionId, chatId, step: stepNumber, failureKinds },
-        'mistake_recovery escalate — stopping turn',
-      );
-      break;
-    }
-
-    // 'continue' — advance to next assistant message.
-    assistantMessageId = toolPhaseResult.nextAssistantId!;
  }

  // vWhale: Stop hook at post-loop exit (best-effort, non-blocking).
@@ -438,8 +486,9 @@ export async function runAssistantTurn(
  if (stepNumber >= effectiveCap && effectiveCap < Infinity) {
    const loaded = await loadContext(ctx.sql, sessionId, chatId);
    if (loaded) {
+      const scSession = args.modelOverride ? { ...loaded.session, model: args.modelOverride } : loaded.session;
      const capArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, mistakeTracker, signal };
-      await runStepCapSummary(ctx, capArgs, loaded.session, loaded.project, loaded.history, agent, stepNumber, effectiveCap);
+      await runStepCapSummary(ctx, capArgs, scSession, loaded.project, loaded.history, agent, stepNumber, effectiveCap);
    }
  }
 }
@@ -510,6 +559,31 @@ export async function runInference(
  });
 }

+// v2.8-compare: run inference with a model override and compare group id.
+// Used by the compare endpoint to run the same message through N models in
+// parallel. Each call publishes frames scoped to its compare_group_id.
+export async function runInferenceWithModel(
+  ctx: InferenceContext,
+  sessionId: string,
+  chatId: string,
+  assistantMessageId: string,
+  modelOverride: string,
+  compareGroupId: string,
+  signal?: AbortSignal,
+): Promise<void> {
+  return runAssistantTurn(ctx, {
+    sessionId,
+    chatId,
+    assistantMessageId,
+    toolsUsed: 0,
+    recentToolCalls: [],
+    mistakeTracker: freshMistakeState(),
+    modelOverride,
+    compareGroupId,
+    signal,
+  });
+}
+
 // v1.8.2: cap-hit summary flow. Called instead of erroring when the loop
 // hits its budget. Reuses the in-flight assistant message slot to stream a
 // short wrap-up reply with the synthetic note prepended and tools disabled,
--- a/apps/server/src/services/inference/types.ts
+++ b/apps/server/src/services/inference/types.ts
@@ -52,7 +52,9 @@ export interface InferenceFrame {
    // arena frames
    | 'battle_started'
    | 'contestant_updated'
-    | 'battle_updated';
+    | 'battle_updated'
+    // inter-agent message
+    | 'agent_message';
  message_id?: string;
  message_ids?: string[];
  chat_id?: string;
@@ -103,6 +105,11 @@ export interface InferenceFrame {
  status?: string;
  run_status?: 'running' | 'completed' | 'failed' | 'cancelled';
  report?: string;
+  // v2.8-compare: groups messages belonging to the same compare operation.
+  compare_group_id?: string;
+  // inter-agent message
+  sender_step_id?: string;
+  channel?: string;
  // arena frames
  battle_id?: string;
  battle_type?: 'coding' | 'qa';
@@ -177,5 +184,10 @@ export interface TurnArgs {
  // Never persisted — mirrors how the cap-hit/doom-loop notes live only inside
  // the summary call's messages array.
  pendingRecoveryNote?: string;
+  // v2.8-compare: when set, overrides the session model for this single turn.
+  // Used by the compare endpoint to run the same message through N models.
+  modelOverride?: string;
+  // v2.8-compare: opaque group id that rides on every published frame.
+  compareGroupId?: string;
  signal: AbortSignal | undefined;
 }