v1.14.0-outer-loop: explicit while loop replaces inference recursion

Converts the ad-hoc executeToolPhase → runAssistantTurn recursion into an explicit while (stepNumber < effectiveCap) loop. A step is one stream-and-tool-execute iteration; the loop terminates on non-tool finish, step-cap hit, doom-loop, budget exhaustion, abort, or synthesis success. MAX_STEPS = 200 hard ceiling (4x old effective limit from budget). Per-agent steps: field in AGENTS.md frontmatter sets tighter caps (Refactorer: 5, Architect: 20, others: unset = bounded only by MAX_STEPS). Resolution: effectiveCap = Math.min(agent.steps ?? Infinity, MAX_STEPS). executeToolPhase no longer recurses — returns ToolPhaseResult struct (action: 'continue' | 'paused' | 'synthesis_done') so the caller decides whether to continue or break. steps: 0 handled as "no tool calls allowed" via runTextOnlyTurn (one text-only stream phase, tool calls ignored with warn log). Step-cap hits produce a sentinel summary (reuses cap_hit kind so CapHitSentinel.tsx renders without frontend changes; text distinguishes "Step limit reached" from "Tool budget exhausted"). Doom-loop check migrated to top of loop body — same predicate, same threshold (3), break instead of return. step_start parts are in the schema CHECK but not emitted as message_parts — writing before the stream phase creates a sequence-0 collision with partsFromAssistantMessage. Structured log line emitted instead. Adversarial review caught the collision pre-deploy. 332/332 server tests passing. No frontend changes. No schema changes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 20:29:21 +00:00
parent 211e903620
commit f4a97808ad
14 changed files with 756 additions and 204 deletions
--- a/apps/server/src/services/agents.ts
+++ b/apps/server/src/services/agents.ts
@@ -37,6 +37,10 @@ interface ParsedFrontmatter {
  // v1.8.2: optional per-agent tool-loop budget. Absent → inference resolves
  // from the agent's toolset at runtime.
  max_tool_calls?: number;
+  // v1.14.0: optional per-agent step cap. Absent → bounded only by MAX_STEPS
+  // (200) in the outer loop. Integer ≥ 0; steps: 0 means "no tool calls
+  // allowed" — the model responds text-only.
+  steps?: number;
 }

 function stripQuotes(s: string): string {
@@ -112,6 +116,21 @@ function parseFrontmatter(yaml: string): { data: ParsedFrontmatter; errors: stri
      } else {
        errors.push(`max_tool_calls must be an integer 1-100 (got "${valueRaw}")`);
      }
+    } else if (key === 'steps') {
+      // v1.14.0: per-agent step cap for the outer inference loop. Integer ≥ 0.
+      // steps: 0 means "no tool calls allowed" — model responds text-only.
+      // Negative integers are warned and ignored (MAX_STEPS ceiling applies); non-integer
+      // values push an error. NOTE(review): Number('') is 0 — confirm empty values can't reach here.
+      const n = Number(valueRaw);
+      if (Number.isInteger(n) && n >= 0) {
+        data.steps = n;
+      } else if (Number.isInteger(n)) {
+        console.warn(
+          `agents: steps ${n} is negative, ignoring (falling back to default)`,
+        );
+      } else {
+        errors.push(`steps must be a non-negative integer (got "${valueRaw}")`);
+      }
    }
    // Unknown keys silently ignored — forward-compat.
  }
@@ -204,6 +223,7 @@ function parseAgentSection(section: RawSection): Omit<Agent, 'source'> {
    tools: filteredTools,
    model: typeof fm.model === 'string' && fm.model.length > 0 ? fm.model : null,
    max_tool_calls: typeof fm.max_tool_calls === 'number' ? fm.max_tool_calls : null,
+    steps: typeof fm.steps === 'number' ? fm.steps : null,
  };
 }

--- a/apps/server/src/services/inference/index.ts
+++ b/apps/server/src/services/inference/index.ts
@@ -6,6 +6,7 @@

 export {
  createInferenceRunner,
+  MAX_STEPS,
  runAssistantTurn,
  runInference,
 } from './turn.js';
@@ -16,5 +17,6 @@ export type {
  StreamResult,
  TurnArgs,
 } from './turn.js';
+export type { ToolPhaseResult } from './tool-phase.js';
 export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './sentinels.js';
 export { buildMessagesPayload } from './payload.js';
--- a/apps/server/src/services/inference/sentinel-summaries.ts
+++ b/apps/server/src/services/inference/sentinel-summaries.ts
@@ -476,6 +476,202 @@ export async function runDoomLoopSummary(
  );
 }

+// v1.14.0: step-cap wrap-up, modeled on runCapHitSummary (in-flight-slot
+// reuse, tools-disabled streaming-summary call, post-finalize sentinel
+// insert + chat_status drop). Difference: the note text names the step
+// limit rather than the tool budget. The sentinel reuses metadata.kind =
+// 'cap_hit' so the frontend CapHitSentinel component renders it unchanged.
+// STEP_CAP_NOTE(steps, cap) builds the system message appended to the payload.
+const STEP_CAP_NOTE = (steps: number, cap: number) =>
+  `You've reached the step limit (${steps}/${cap} steps). Produce the best answer you can with what you have. Do not call more tools.`;
+
+export async function runStepCapSummary(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  session: Session,
+  project: Project,
+  history: Message[],
+  agent: Agent | null,
+  steps: number,
+  cap: number,
+): Promise<void> {
+  const { sessionId, chatId, assistantMessageId, signal } = args;
+
+  const messages = await buildMessagesPayload(session, project, history, agent, ctx.log);
+  messages.push({ role: 'system', content: STEP_CAP_NOTE(steps, cap) }); // wrap-up instruction goes last in the payload
+
+  const startedRow = await ctx.sql<{ started_at: string }[]>`
+    UPDATE messages
+    SET started_at = clock_timestamp()
+    WHERE id = ${assistantMessageId}
+    RETURNING started_at
+  `;
+  const startedAt = startedRow[0]?.started_at ?? null;
+
+  ctx.publish(sessionId, {
+    type: 'message_started',
+    message_id: assistantMessageId,
+    chat_id: chatId,
+    role: 'assistant',
+  });
+
+  let accumulated = '';
+  let pendingFlushTimer: NodeJS.Timeout | null = null;
+  let flushPromise: Promise<unknown> = Promise.resolve();
+  const flushNow = () => {
+    if (pendingFlushTimer) {
+      clearTimeout(pendingFlushTimer);
+      pendingFlushTimer = null;
+    }
+    const snapshot = accumulated; // captured now; the write is chained behind earlier flushes
+    flushPromise = flushPromise.then(() =>
+      ctx.sql`UPDATE messages SET content = ${snapshot} WHERE id = ${assistantMessageId}`
+    );
+  };
+  const scheduleFlush = () => {
+    if (pendingFlushTimer) return; // a flush is already pending; deltas coalesce into it
+    pendingFlushTimer = setTimeout(() => {
+      pendingFlushTimer = null;
+      flushNow();
+    }, DB_FLUSH_INTERVAL_MS);
+  };
+
+  let summaryOk = false;
+  let summarySoftCancelled = false;
+  let summaryError: string | null = null;
+  let result: StreamResult | null = null;
+  try {
+    result = await streamCompletion(
+      ctx,
+      session.model,
+      messages,
+      { tools: null, temperature: agent?.temperature }, // tools disabled for the wrap-up call
+      (delta) => {
+        accumulated += delta;
+        ctx.publish(sessionId, {
+          type: 'delta',
+          message_id: assistantMessageId,
+          chat_id: chatId,
+          content: delta,
+        });
+        scheduleFlush();
+      },
+      undefined,
+      signal,
+    );
+    summaryOk = true;
+  } catch (err) {
+    if (err instanceof Error && err.name === 'AbortError') {
+      summarySoftCancelled = true; // abort is treated as a cancel, not a failure
+    } else {
+      summaryError = err instanceof Error ? err.message : String(err);
+    }
+  } finally {
+    if (pendingFlushTimer) {
+      clearTimeout(pendingFlushTimer);
+      pendingFlushTimer = null;
+    }
+    await flushPromise; // drain queued content writes before the final status UPDATE
+  }
+
+  if (summaryOk && result) {
+    const mctx = await modelContext.getModelContext(session.model);
+    const nCtx = mctx?.n_ctx ?? null;
+    const [updated] = await ctx.sql<
+      { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
+    >`
+      UPDATE messages
+      SET content = ${result.content},
+          status = 'complete',
+          tokens_used = ${result.completionTokens},
+          ctx_used = ${result.promptTokens},
+          ctx_max = ${nCtx},
+          finished_at = clock_timestamp()
+      WHERE id = ${assistantMessageId}
+      RETURNING tokens_used, ctx_used, ctx_max, finished_at
+    `;
+    ctx.publish(sessionId, {
+      type: 'message_complete',
+      message_id: assistantMessageId,
+      chat_id: chatId,
+      tokens_used: updated?.tokens_used ?? null,
+      ctx_used: updated?.ctx_used ?? null,
+      ctx_max: updated?.ctx_max ?? null,
+      started_at: startedAt,
+      finished_at: updated?.finished_at ?? null,
+      model: session.model,
+    });
+  } else if (summarySoftCancelled) {
+    await ctx.sql`
+      UPDATE messages
+      SET content = ${accumulated},
+          status = 'cancelled',
+          finished_at = clock_timestamp()
+      WHERE id = ${assistantMessageId}
+    `;
+    ctx.publish(sessionId, {
+      type: 'message_complete',
+      message_id: assistantMessageId,
+      chat_id: chatId,
+    });
+  } else {
+    const errMeta: MessageMetadata = {
+      kind: 'error',
+      error_reason: 'summary_after_cap_failed',
+      error_text: summaryError ?? 'step-cap summary failed',
+    };
+    await ctx.sql`
+      UPDATE messages
+      SET content = ${accumulated},
+          status = 'failed',
+          finished_at = clock_timestamp(),
+          metadata = ${ctx.sql.json(errMeta as never)}
+      WHERE id = ${assistantMessageId}
+    `;
+    ctx.publish(sessionId, {
+      type: 'error',
+      message_id: assistantMessageId,
+      chat_id: chatId,
+      error: summaryError ?? 'step-cap summary failed',
+      reason: 'summary_after_cap_failed',
+    });
+  }
+
+  const [sessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
+    UPDATE sessions SET updated_at = clock_timestamp()
+    WHERE id = ${sessionId}
+    RETURNING project_id, name, updated_at
+  `;
+  ctx.publishUser({
+    type: 'session_updated',
+    session_id: sessionId,
+    project_id: sessRow!.project_id, // assumes the session row still exists — TODO confirm
+    name: sessRow!.name,
+    updated_at: sessRow!.updated_at,
+  });
+
+  // Reuse cap_hit sentinel so the frontend CapHitSentinel component renders
+  // it unchanged. NOTE(review): only `cap` is passed — confirm the sentinel text fits a step cap.
+  await insertCapHitSentinel(ctx, sessionId, chatId, agent, cap);
+
+  if (summaryOk || summarySoftCancelled) {
+    ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() });
+  } else {
+    ctx.publishUser({
+      type: 'chat_status',
+      chat_id: chatId,
+      status: 'error',
+      at: new Date().toISOString(),
+      reason: 'summary_after_cap_failed',
+    });
+  }
+
+  ctx.log.info(
+    { sessionId, chatId, assistantMessageId, steps, cap, summaryOk, summaryCancelled: summarySoftCancelled },
+    'inference step-cap summary finished',
+  );
+}
+
 async function insertDoomLoopSentinel(
  ctx: InferenceContext,
  sessionId: string,
--- a/apps/server/src/services/inference/tool-phase.ts
+++ b/apps/server/src/services/inference/tool-phase.ts
@@ -19,11 +19,6 @@ import type {
  StreamResult,
  TurnArgs,
 } from './turn.js';
-// v1.12.4: ESM value-import cycle. executeToolPhase recurses into
-// runAssistantTurn which lives in inference.ts. The cycle is safe because
-// the reference is read at call time (inside an async function body), not
-// at module top-level. Node + tsc resolve this cleanly.
-import { runAssistantTurn } from './turn.js';
 // v1.13.13: synthesis pipeline — replaces the immediate recursive turn when
 // any of this batch's tool calls is in SYNTHESIS_TOOLS. Falls through to
 // recursion on synthesis failure (timeout / model error). See module header
@@ -86,6 +81,16 @@ async function executeToolCall(
  }
 }

+// v1.14.0: return struct from executeToolPhase so the caller (the outer
+// while loop in turn.ts) can decide whether to continue, break, or handle
+// synthesis. Replaces the recursive call into runAssistantTurn.
+export interface ToolPhaseResult {
+  action: 'continue' | 'paused' | 'synthesis_done'; // what the outer loop should do next
+  toolCallCount: number; // number of tool calls in this batch (toolCalls.length)
+  toolCalls: ToolCall[]; // the batch's tool calls, taken from the stream result
+  nextAssistantId: string | null; // id of the freshly inserted 'streaming' assistant row; non-null only for 'continue'
+}
+
 export async function executeToolPhase(
  ctx: InferenceContext,
  args: TurnArgs,
@@ -93,8 +98,8 @@ export async function executeToolPhase(
  startedAt: string | null,
  session: Session,
  projectRoot: string
-): Promise<void> {
-  const { sessionId, chatId, assistantMessageId, toolsUsed, signal } = args;
+): Promise<ToolPhaseResult> {
+  const { sessionId, chatId, assistantMessageId } = args;
  const { content, toolCalls, promptTokens, completionTokens } = result;

  // v1.11.3: ctx_max comes from llama-swap /upstream/<model>/props, not the
@@ -296,7 +301,12 @@ export async function executeToolPhase(
      { sessionId, chatId, assistantMessageId },
      'inference paused awaiting user input',
    );
-    return;
+    return {
+      action: 'paused' as const,
+      toolCallCount: toolCalls.length,
+      toolCalls,
+      nextAssistantId: null,
+    };
  }

  // v1.13.13: synthesis-pipeline branch. When any of this batch's tool calls
@@ -328,30 +338,30 @@ export async function executeToolPhase(
      ...(typeof out?.truncated === 'boolean' ? { truncated: out.truncated } : {}),
      ...(typeof out?.outputPath === 'string' ? { outputPath: out.outputPath } : {}),
    });
-    if (ran) return;
+    if (ran) {
+      return {
+        action: 'synthesis_done' as const,
+        toolCallCount: toolCalls.length,
+        toolCalls,
+        nextAssistantId: null,
+      };
+    }
    // ran === false → synthesis failed (timeout / model error) → fall through
-    // to the standard recursive turn below. The synth message (if created)
+    // to the standard continue path below. The synth message (if created)
    // was already marked status='failed' inside runSynthesisPass.
  }

+  // v1.14.0: create the next assistant row and return a continue result.
+  // The caller (outer while loop in turn.ts) handles the iteration.
  const [nextAssistant] = await ctx.sql<{ id: string }[]>`
    INSERT INTO messages (session_id, chat_id, role, content, status, created_at)
    VALUES (${sessionId}, ${chatId}, 'assistant', '', 'streaming', clock_timestamp())
    RETURNING id
  `;
-  await runAssistantTurn(ctx, {
-    sessionId,
-    chatId,
-    assistantMessageId: nextAssistant!.id,
-    // v1.8.2: charge this turn's actual tool invocations against the budget.
-    // One assistant message can emit multiple tool_calls, so we add the run
-    // count, not 1. The next turn's budget check sees the cumulative total.
-    toolsUsed: toolsUsed + result.toolCalls.length,
-    // v1.11.6: append the just-executed tool calls to the per-turn history
-    // so the next runAssistantTurn's doom-loop check can see them. We don't
-    // cap the array length here — per-turn budgets keep it bounded
-    // (typically <30 entries), and slicing happens inside detectDoomLoop.
-    recentToolCalls: [...args.recentToolCalls, ...result.toolCalls],
-    signal,
-  });
+  return {
+    action: 'continue' as const,
+    toolCallCount: toolCalls.length,
+    toolCalls,
+    nextAssistantId: nextAssistant!.id,
+  };
 }
--- a/apps/server/src/services/inference/turn.ts
+++ b/apps/server/src/services/inference/turn.ts
@@ -16,11 +16,9 @@ import { resolveProjectRoot } from '../path_guard.js';
 import { maybeAutoNameChat } from '../auto_name.js';
 import { getAgentById } from '../agents.js';
 import * as compaction from '../compaction.js';
-import * as modelContext from '../model-context.js';
 import type { Broker } from '../broker.js';
 import { resolveToolBudget } from './budget.js';
 import {
-  DOOM_LOOP_THRESHOLD,
  detectDoomLoop,
 } from './sentinels.js';
 import {
@@ -33,15 +31,23 @@ import {
 } from './error-handler.js';
 import {
  executeStreamPhase,
-  streamCompletion,
 } from './stream-phase.js';
-import { executeToolPhase } from './tool-phase.js';
-import { DB_FLUSH_INTERVAL_MS, type StreamPhaseState } from './types.js';
+import { executeToolPhase, type ToolPhaseResult } from './tool-phase.js';
+import type { StreamPhaseState } from './types.js';
 import {
  runCapHitSummary,
  runDoomLoopSummary,
+  runStepCapSummary,
 } from './sentinel-summaries.js';

+// v1.14.0: hard ceiling on the number of stream-and-tool iterations per
+// user-message turn. Per-agent cap via agent.steps is the primary knob;
+// MAX_STEPS is the safety ceiling, applied as Math.min(agent.steps, MAX_STEPS).
+// Sized at 4x an assumed 50-tool-call budget ceiling — NOTE(review): confirm
+// that figure against resolveToolBudget. A 0-tool-call iteration ends the
+// turn via the non-tool finish path, so the tool budget normally fires first.
+export const MAX_STEPS = 200;
+
 // v1.12.4: re-exported so external callers (tests, future consumers) keep
 // importing from services/inference.js as the public surface.
 export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './sentinels.js';
@@ -145,75 +151,185 @@ export async function runAssistantTurn(
  ctx: InferenceContext,
  args: TurnArgs,
 ): Promise<void> {
-  const { sessionId, chatId } = args;
+  const { sessionId, chatId, signal } = args;

-  // v1.11: if the prior turn flagged this chat for compaction, run it first
-  // so loadContext below reads the post-compaction history. We swallow
-  // compaction failures (clearing the flag so we don't loop) and proceed
-  // with the un-compacted history — a slow turn that hits the model's
-  // hard limit is recoverable; a dead session is not.
-  const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
-    SELECT needs_compaction FROM chats WHERE id = ${chatId}
-  `;
-  if (chatFlag[0]?.needs_compaction) {
-    try {
-      await compaction.process({
-        sql: ctx.sql,
-        config: ctx.config,
-        log: ctx.log,
-        broker: ctx.broker,
-        chatId,
-      });
-    } catch (err) {
-      ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
-      await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
-    }
-  }
-
-  const loaded = await loadContext(ctx.sql, sessionId, chatId);
-  if (!loaded) {
+  // v1.14.0: resolve agent once at the top. The agent stays fixed for the
+  // duration of this user-message turn — PATCH agent_id mid-conversation
+  // takes effect on the next runInference, not mid-loop.
+  const initialLoaded = await loadContext(ctx.sql, sessionId, chatId);
+  if (!initialLoaded) {
    ctx.log.warn({ sessionId }, 'inference: session or project missing');
    return;
  }
-  const { session, project, history } = loaded;
-  const projectRoot = await resolveProjectRoot(project.path);
-  // Agent resolution is per-turn so PATCH agent_id mid-conversation takes
-  // effect on the next message. Unknown agent_id returns null silently —
-  // session falls back to base prompt + all tools + default temperature.
+  const { session, project } = initialLoaded;
  const agent = session.agent_id
    ? await getAgentById(project.path, session.agent_id)
    : null;
-
-  // v1.8.2: cap-hit replaces the older "tool loop depth exceeded" failure.
-  // When we've already burned the budget *before* this turn even runs, we
-  // skip straight to the summary flow — the in-flight assistant message slot
-  // gets reused for the wrap-up reply instead of being marked failed.
  const budget = resolveToolBudget(agent);
-  if (args.toolsUsed >= budget) {
-    await runCapHitSummary(ctx, args, session, project, history, agent, budget);
+
+  // v1.14.0: effectiveCap = min(agent.steps ?? Infinity, MAX_STEPS).
+  // steps: 0 means "no tool calls allowed" — the first stream phase runs
+  // but if it emits tool calls they are not executed (finalize as text-only).
+  const effectiveCap = Math.min(agent?.steps ?? Infinity, MAX_STEPS);
+
+  // steps: 0 special case — model responds text-only. The while loop would
+  // never enter (effectiveCap === 0), so we handle it explicitly before the
+  // loop. The model always gets at least one chance to respond with text.
+  if (effectiveCap === 0) {
+    const loaded = await loadContext(ctx.sql, sessionId, chatId);
+    if (loaded) {
+      await runTextOnlyTurn(ctx, args, loaded.session, loaded.project, loaded.history, agent);
+    }
    return;
  }

-  // v1.11.6: doom-loop guard. Detected BEFORE the budget cap (the model can
-  // burn through 3 identical calls long before the 15-call budget fires).
-  // Same in-flight-slot-reuse pattern as runCapHitSummary — wrap-up reply
-  // lands in args.assistantMessageId, then a doom_loop sentinel is inserted
-  // to make the abort visible in the chat history.
-  const loop = detectDoomLoop(args.recentToolCalls);
-  if (loop) {
-    await runDoomLoopSummary(ctx, args, session, project, history, agent, loop);
-    return;
+  let stepNumber = 0;
+  let toolsUsed = args.toolsUsed;
+  let recentToolCalls = args.recentToolCalls;
+  let assistantMessageId = args.assistantMessageId;
+
+  while (stepNumber < effectiveCap) {
+    // ---- doom-loop check (moved from top-of-function) ----
+    const loop = detectDoomLoop(recentToolCalls);
+    if (loop) {
+      // Need fresh history for the summary.
+      const loaded = await loadContext(ctx.sql, sessionId, chatId);
+      if (loaded) {
+        const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, signal };
+        await runDoomLoopSummary(ctx, iterArgs, loaded.session, loaded.project, loaded.history, agent, loop);
+      }
+      break;
+    }
+
+    // ---- budget check (moved from top-of-function) ----
+    if (toolsUsed >= budget) {
+      const loaded = await loadContext(ctx.sql, sessionId, chatId);
+      if (loaded) {
+        const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, signal };
+        await runCapHitSummary(ctx, iterArgs, loaded.session, loaded.project, loaded.history, agent, budget);
+      }
+      break;
+    }
+
+    // ---- compaction check ----
+    // v1.11: if the prior turn flagged this chat for compaction, run it
+    // before loadContext so we read post-compaction history. Swallow
+    // failures and proceed with un-compacted history.
+    const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
+      SELECT needs_compaction FROM chats WHERE id = ${chatId}
+    `;
+    if (chatFlag[0]?.needs_compaction) {
+      try {
+        await compaction.process({
+          sql: ctx.sql,
+          config: ctx.config,
+          log: ctx.log,
+          broker: ctx.broker,
+          chatId,
+        });
+      } catch (err) {
+        ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
+        await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
+      }
+    }
+
+    // ---- load context (must re-load each iteration — new messages since last step) ----
+    const loaded = await loadContext(ctx.sql, sessionId, chatId);
+    if (!loaded) {
+      ctx.log.warn({ sessionId }, 'inference: session or project missing mid-loop');
+      break;
+    }
+    const { session: iterSession, project: iterProject, history } = loaded;
+    const projectRoot = await resolveProjectRoot(iterProject.path);
+
+    // v1.14.0: log step boundary for instrumentation. step_start parts are in
+    // the schema CHECK but not emitted here — writing to the assistant message
+    // before the stream phase creates a sequence-0 collision with
+    // partsFromAssistantMessage. A WS frame or structured log is sufficient
+    // since the frontend doesn't render step boundaries in v1.14.
+    ctx.log.info({ sessionId, chatId, step: stepNumber, assistantMessageId }, 'step_start');
+
+    // ---- build messages + stream phase ----
+    const messages = await buildMessagesPayload(iterSession, iterProject, history, agent, ctx.log);
+    const webToolsEnabled =
+      iterSession.web_search_enabled ?? iterProject.default_web_search_enabled ?? false;
+
+    const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, signal };
+    const state: StreamPhaseState = { accumulated: '', startedAt: null };
+    let result: StreamResult;
+    try {
+      result = await executeStreamPhase(ctx, iterArgs, iterSession, messages, state, agent, webToolsEnabled);
+    } catch (err) {
+      await handleAbortOrError(ctx, iterArgs, state.accumulated, err);
+      break;
+    }
+
+    // ---- non-tool finish → finalize and exit ----
+    if (result.toolCalls.length === 0) {
+      await finalizeCompletion(ctx, iterArgs, result, state.startedAt, iterSession);
+      break;
+    }
+
+    // ---- note on steps: 0 ----
+    // No extra guard is needed here: effectiveCap === 0 is handled by
+    // runTextOnlyTurn before the loop, so tool calls reaching this point are
+    // handed to executeToolPhase. With effectiveCap 1 on step 0 they still
+    // run — steps counts iterations, not post-first-stream.
+
+    // ---- tool phase ----
+    let toolPhaseResult: ToolPhaseResult;
+    try {
+      toolPhaseResult = await executeToolPhase(ctx, iterArgs, result, state.startedAt, iterSession, projectRoot);
+    } catch (err) {
+      // Tool phase errors are unexpected (individual tool failures are
+      // caught inside executeToolPhase). Log and break.
+      // NOTE(review): nothing here finalizes the message or publishes chat_status.
+      ctx.log.error({ err, sessionId, chatId, step: stepNumber }, 'tool phase threw unexpectedly');
+      break;
+    }
+
+    // ---- update loop locals ----
+    toolsUsed += toolPhaseResult.toolCallCount;
+    recentToolCalls = [...recentToolCalls, ...toolPhaseResult.toolCalls];
+
+    if (toolPhaseResult.action !== 'continue') {
+      // 'paused' (user input) or 'synthesis_done' — stop the loop.
+      break;
+    }
+    // 'continue' — advance to next assistant message. The step is counted only
+    // here so a paused/synthesis final step cannot trigger the step-cap wrap-up.
+    assistantMessageId = toolPhaseResult.nextAssistantId!;
+    stepNumber++;
  }

+  // ---- post-loop: step-cap sentinel ----
+  // When the loop exits because stepNumber reached effectiveCap, the last
+  // iteration's tool phase returned 'continue' with a nextAssistantId that
+  // is still in 'streaming' status (unfilled). Use it for the wrap-up.
+  if (stepNumber >= effectiveCap && effectiveCap < Infinity) {
+    const loaded = await loadContext(ctx.sql, sessionId, chatId);
+    if (loaded) {
+      const capArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, signal };
+      await runStepCapSummary(ctx, capArgs, loaded.session, loaded.project, loaded.history, agent, stepNumber, effectiveCap);
+    }
+  }
+}
+
+// v1.14.0: special handling for steps: 0 — the model responds text-only.
+// The while loop never enters (effectiveCap === 0). We stream once with
+// no tools, finalize, and return. If the model emits tool calls despite
+// not being offered tools, they're ignored (finalize as text-only).
+async function runTextOnlyTurn(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  session: Session,
+  project: Project,
+  history: Message[],
+  agent: Agent | null,
+): Promise<void> {
  const messages = await buildMessagesPayload(session, project, history, agent, ctx.log);
-
-  // v1.11.8: resolve per-chat web-tools opt-in. Tri-state on the wire:
-  //   - session.web_search_enabled = null → inherit project default
-  //   - session.web_search_enabled = true/false → explicit
-  // Both web_search and web_fetch are gated by this single flag (the UI
-  // label is "Enable web search and fetch" — same store, both tools).
-  // Default is false unless explicitly opted in, matching the v1.9
-  // plumbing intent ("inert until Batch 8 ships the actual tools").
+  // Web tools are irrelevant when steps: 0 (no tool execution), but we
+  // still need to resolve the flag for executeStreamPhase's signature.
  const webToolsEnabled =
    session.web_search_enabled ?? project.default_web_search_enabled ?? false;

@@ -227,8 +343,12 @@ export async function runAssistantTurn(
  }

  if (result.toolCalls.length > 0) {
-    await executeToolPhase(ctx, args, result, state.startedAt, session, projectRoot);
-    return;
+    ctx.log.warn(
+      { chatId: args.chatId, toolCallCount: result.toolCalls.length },
+      'steps: 0 agent emitted tool calls; ignoring and finalizing as text-only',
+    );
+    // Override: strip tool calls so finalizeCompletion treats it as text-only.
+    result = { ...result, toolCalls: [] };
  }

  await finalizeCompletion(ctx, args, result, state.startedAt, session);
--- a/apps/server/src/types/api.ts
+++ b/apps/server/src/types/api.ts
@@ -106,6 +106,9 @@ export interface Agent {
  // agent's toolset (30 if all tools are read-only, 10 otherwise) or 15 for
  // raw chat with no agent.
  max_tool_calls: number | null;
+  // v1.14.0: per-agent step cap for the outer inference loop. null means
+  // bounded only by MAX_STEPS (200). 0 means "no tool calls allowed."
+  steps: number | null;
 }

 // One entry per malformed `## Name` block. Per-block errors don't fail the
--- a/apps/web/src/api/types.ts
+++ b/apps/web/src/api/types.ts
@@ -73,6 +73,9 @@ export interface Agent {
  // the agent's toolset (30 for all read-only, 10 otherwise) or 15 for raw
  // chat with no agent.
  max_tool_calls: number | null;
+  // v1.14.0: per-agent step cap for the outer inference loop. null means
+  // bounded only by MAX_STEPS (200). 0 means "no tool calls allowed."
+  steps: number | null;
 }

 export interface AgentParseError {