v1.11.6: doom-loop guard (3 identical tool calls aborts recursion)

This commit is contained in:
2026-05-20 20:28:45 +00:00
parent 4ec196273b
commit f92b0810c3
8 changed files with 1050 additions and 176 deletions

View File

@@ -0,0 +1,130 @@
import { describe, it, expect } from 'vitest';
import { DOOM_LOOP_THRESHOLD, detectDoomLoop } from '../inference.js';
import type { ToolCall } from '../../types/api.js';
// ---- fixture ----------------------------------------------------------------
// Fixture factory for ToolCall values. ToolCall requires an `id`, but the
// detector under test only compares name + JSON.stringify(args), so the id is
// just a running counter: every fixture gets a distinct id and no test can
// pass by accident through id-based equality.
let counter = 0;

function mkCall(name: string, args: Record<string, unknown> = {}): ToolCall {
  const id = `c${++counter}`;
  return { id, name, args };
}
// ---- below-threshold -------------------------------------------------------
describe('detectDoomLoop — below threshold', () => {
  it('returns null for an empty array', () => {
    const noCalls: ToolCall[] = [];
    expect(detectDoomLoop(noCalls)).toBeNull();
  });

  it('returns null when fewer than DOOM_LOOP_THRESHOLD calls exist', () => {
    // Two identical calls are one short of the three-call window, so the
    // detector has nothing to compare against even though both match.
    const pair = [
      mkCall('view_file', { path: 'a.ts' }),
      mkCall('view_file', { path: 'a.ts' }),
    ];
    expect(detectDoomLoop(pair)).toBeNull();
  });
});
// ---- positive detection ----------------------------------------------------
describe('detectDoomLoop — positive matches', () => {
  it('returns name + args when exactly DOOM_LOOP_THRESHOLD identical calls land', () => {
    // The factory callback builds a fresh args literal per call, so the three
    // entries are equal by value only.
    const calls = Array.from({ length: 3 }, () =>
      mkCall('grep', { pattern: 'TODO', path: 'src' }),
    );
    const hit = detectDoomLoop(calls);
    expect(hit).not.toBeNull();
    expect(hit!.name).toBe('grep');
    expect(hit!.args).toEqual({ pattern: 'TODO', path: 'src' });
  });

  it('matches sliding window — last DOOM_LOOP_THRESHOLD match even with earlier non-matching calls', () => {
    // One unrelated call followed by three identical ones: the trailing
    // window is uniform, so the guard must fire.
    const lead = mkCall('list_dir', { path: '/' });
    const repeated = Array.from({ length: 3 }, () => mkCall('view_file', { path: 'a.ts' }));
    const hit = detectDoomLoop([lead, ...repeated]);
    expect(hit).not.toBeNull();
    expect(hit!.name).toBe('view_file');
  });

  it('matches identical empty-args calls (defense against {} !== {} reference bug)', () => {
    // Three separate {} objects all serialize to '{}'; a detector relying on
    // reference equality would miss this.
    const pings = Array.from({ length: 3 }, () => mkCall('ping', {}));
    expect(detectDoomLoop(pings)).not.toBeNull();
  });

  it('matches calls with nested args of equal shape', () => {
    // Same nested payload emitted three times (each a distinct top-level
    // copy) is still a loop.
    const nested = { filter: { glob: '*.ts', case: 'sensitive' }, limit: 50 };
    const calls = Array.from({ length: 3 }, () => mkCall('find_files', { ...nested }));
    expect(detectDoomLoop(calls)).not.toBeNull();
  });
});
// ---- negative detection ----------------------------------------------------
describe('detectDoomLoop — negative cases', () => {
  it('returns null when 3 calls share name but differ in args', () => {
    const calls = ['a.ts', 'b.ts', 'c.ts'].map((path) => mkCall('view_file', { path }));
    expect(detectDoomLoop(calls)).toBeNull();
  });

  it('returns null when 3 calls share args but differ in name', () => {
    const calls = ['view_file', 'grep', 'list_dir'].map((tool) =>
      mkCall(tool, { path: 'a.ts' }),
    );
    expect(detectDoomLoop(calls)).toBeNull();
  });

  it('returns null when the FIRST three of four match but the latest differs', () => {
    // Sliding-window edge case: only the trailing DOOM_LOOP_THRESHOLD entries
    // are inspected, so an older streak is ignored once the model has moved
    // on to a different call.
    const streak = Array.from({ length: 3 }, () => mkCall('grep', { pattern: 'X' }));
    const latest = mkCall('view_file', { path: 'a.ts' });
    expect(detectDoomLoop([...streak, latest])).toBeNull();
  });

  it('returns null when args have same keys but different values', () => {
    const calls = ['src', 'src', 'apps'].map((path) =>
      mkCall('grep', { pattern: 'TODO', path }),
    );
    expect(detectDoomLoop(calls)).toBeNull();
  });
});
// ---- threshold contract ----------------------------------------------------
describe('DOOM_LOOP_THRESHOLD', () => {
  it('is a positive integer (the public contract — tests assume 3)', () => {
    // Only the shape of the constant is pinned here; the fixtures in the
    // other suites are the ones that hard-code three calls.
    const threshold = DOOM_LOOP_THRESHOLD;
    expect(threshold).toBeGreaterThan(0);
    expect(Number.isInteger(threshold)).toBe(true);
  });
});

View File

@@ -54,6 +54,36 @@ function resolveToolBudget(agent: Agent | null): number {
// Formats the instruction telling the model that its tool budget of `limit`
// calls is used up and that it should answer without calling more tools.
const CAP_HIT_SUMMARY_NOTE = (limit: number): string => {
  return `You've reached the tool budget (${limit} calls). Produce the best answer you can with what you have. Do not call more tools.`;
};
// v1.11.6: doom-loop guard. When the model calls the same tool with the
// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message
// turn, abort the recursion and run the same wrap-up summary path as the
// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in
// session/processor.ts). Threshold of 3 is the smallest value that doesn't
// false-positive on a model that retries once after a transient error.
export const DOOM_LOOP_THRESHOLD = 3;

// Synthetic instruction for the wrap-up call; `name` is the looping tool.
const DOOM_LOOP_NOTE = (name: string) =>
  `You called ${name} with the same arguments ${DOOM_LOOP_THRESHOLD} times in a row. Stop calling it. Produce the best answer you can with what you have.`;

// Serializes tool-call args into a key-order-independent comparison key.
// Plain JSON.stringify emits keys in insertion order, so { a: 1, b: 2 } and
// { b: 2, a: 1 } would produce different strings even though they describe
// the same call. The replacer rebuilds every non-array object with its keys
// sorted before it is written out; JSON.stringify then recurses into the
// rebuilt object, so nesting is covered too. Array element order is left
// alone because it is significant. Returns undefined when `args` itself is
// undefined (same as JSON.stringify), so two missing-args calls still match.
function canonicalArgsKey(args: unknown): string | undefined {
  return JSON.stringify(args, (_key: string, value: unknown) => {
    if (value === null || typeof value !== 'object' || Array.isArray(value)) {
      return value;
    }
    const entries = Object.entries(value as Record<string, unknown>);
    entries.sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
    // Object.fromEntries defines own data properties, so an own "__proto__"
    // key survives instead of being swallowed by a prototype assignment.
    return Object.fromEntries(entries);
  });
}

// Returns the name + args of the looping tool when the LAST
// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical: same name
// AND deep-equal args, where object key order is ignored at every depth.
// Returns null otherwise, including when fewer than DOOM_LOOP_THRESHOLD calls
// exist. The returned args are those of the first call in the matching
// window. Pure; exported for unit-test access.
export function detectDoomLoop(
  recentToolCalls: ToolCall[],
): { name: string; args: Record<string, unknown> } | null {
  if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null;

  const tail = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD);
  const ref = tail[0];
  if (ref === undefined) return null;

  const refKey = canonicalArgsKey(ref.args);
  const allIdentical = tail
    .slice(1)
    .every((tc) => tc.name === ref.name && canonicalArgsKey(tc.args) === refKey);

  return allIdentical ? { name: ref.name, args: ref.args } : null;
}
function isCapHitSentinel(m: Message): boolean {
return (
m.role === 'system' &&
@@ -63,6 +93,22 @@ function isCapHitSentinel(m: Message): boolean {
);
}
// v1.11.6: doom-loop counterpart of the cap-hit predicate. True only for a
// system message whose metadata is a non-null object tagged
// kind === 'doom_loop'. Such rows are UI-only: buildMessagesPayload drops
// them via isAnySentinel, so they are never sent to the LLM.
function isDoomLoopSentinel(m: Message): boolean {
  if (m.role !== 'system') return false;

  const meta = m.metadata;
  if (meta === null || typeof meta !== 'object') return false;

  return (meta as { kind?: unknown }).kind === 'doom_loop';
}
// True when `m` is either kind of sentinel row (cap-hit or doom-loop). The
// cap-hit predicate is tried first; the doom-loop predicate only runs when
// that one says no.
function isAnySentinel(m: Message): boolean {
  if (isCapHitSentinel(m)) return true;
  return isDoomLoopSentinel(m);
}
export interface InferenceFrame {
type:
| 'message_started'
@@ -203,11 +249,11 @@ export function buildMessagesPayload(
out.push({ role: 'system', content: m.content });
continue;
}
// v1.8.2: cap-hit sentinels are UI-only — never send them to the LLM. The
// synthetic "you've reached the tool budget" note lives only inside the
// summary call's messages array and is never persisted, so on Continue
// the model resumes with a clean context.
if (isCapHitSentinel(m)) continue;
// v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never
// send them to the LLM. The synthetic instruction note lives only inside
// the summary call's messages array and is never persisted, so on a
// follow-up turn the model resumes with a clean context.
if (isAnySentinel(m)) continue;
if (m.role === 'assistant' && m.status === 'streaming') continue;
if (m.role === 'assistant' && m.status === 'cancelled') continue;
if (m.role === 'tool') {
@@ -608,6 +654,11 @@ interface TurnArgs {
// resolved budget at the top of each turn. Replaces the older `depth`
// counter (which counted iterations, not invocations).
toolsUsed: number;
// v1.11.6: ordered tool calls executed in this user-message turn (across
// recursive runAssistantTurn invocations). Reset to [] at user-message
// boundaries by runInference, same as toolsUsed. Doom-loop check at the
// top of runAssistantTurn slices the last DOOM_LOOP_THRESHOLD entries.
recentToolCalls: ToolCall[];
signal: AbortSignal | undefined;
}
@@ -910,6 +961,11 @@ async function executeToolPhase(
// One assistant message can emit multiple tool_calls, so we add the run
// count, not 1. The next turn's budget check sees the cumulative total.
toolsUsed: toolsUsed + result.toolCalls.length,
// v1.11.6: append the just-executed tool calls to the per-turn history
// so the next runAssistantTurn's doom-loop check can see them. We don't
// cap the array length here — per-turn budgets keep it bounded
// (typically <30 entries), and slicing happens inside detectDoomLoop.
recentToolCalls: [...args.recentToolCalls, ...result.toolCalls],
signal,
});
}
@@ -1029,6 +1085,17 @@ async function runAssistantTurn(
return;
}
// v1.11.6: doom-loop guard. Detected BEFORE the budget cap (the model can
// burn through 3 identical calls long before the 15-call budget fires).
// Same in-flight-slot-reuse pattern as runCapHitSummary — wrap-up reply
// lands in args.assistantMessageId, then a doom_loop sentinel is inserted
// to make the abort visible in the chat history.
const loop = detectDoomLoop(args.recentToolCalls);
if (loop) {
await runDoomLoopSummary(ctx, args, session, project, history, agent, loop);
return;
}
const messages = buildMessagesPayload(session, project, history, agent);
const state: StreamPhaseState = { accumulated: '', startedAt: null };
@@ -1059,7 +1126,16 @@ export async function runInference(
// continue) starts with a clean budget. Tool-call accumulation across
// Continue invocations is what the hard ceiling guards against, not the
// per-call budget.
return runAssistantTurn(ctx, { sessionId, chatId, assistantMessageId, toolsUsed: 0, signal });
// v1.11.6: recentToolCalls also resets — doom-loop detection is scoped
// to a single user-message turn, so a Continue starts with no history.
return runAssistantTurn(ctx, {
sessionId,
chatId,
assistantMessageId,
toolsUsed: 0,
recentToolCalls: [],
signal,
});
}
// v1.8.2: cap-hit summary flow. Called instead of erroring when the loop
@@ -1318,6 +1394,250 @@ async function insertCapHitSentinel(
});
}
// v1.11.6: doom-loop wrap-up. Mirrors runCapHitSummary structurally — same
// in-flight-slot reuse, same tools-disabled streaming-summary call, same
// post-finalize sentinel insert + chat_status drop. Differences:
// - synthetic note text comes from DOOM_LOOP_NOTE (names the looping tool)
// - sentinel metadata is { kind: 'doom_loop', tool_name, args, threshold }
// and has no Continue affordance (manual retry would just re-loop)
// - there is NO dedicated failure reason: the failed-message metadata, the
// error frame and the chat_status frame all reuse
// 'summary_after_cap_failed' (see the note on the failure branch below)
// Kept as a clone rather than refactored into a shared helper because the
// two summary paths still differ in sentinel shape and note text; a third
// sentinel would justify factoring out runWrapUpSummary(opts).
//
// Flow: build the payload + synthetic note, stream one completion into the
// existing assistant message row (args.assistantMessageId), finalize that row
// as complete / cancelled / failed, bump the session's updated_at, insert the
// doom_loop sentinel, then publish the final chat_status.
//
// Parameters:
// ctx — sql handle, publish/publishUser emitters and logger
// args — turn arguments; sessionId, chatId, assistantMessageId and
// signal are the fields read here
// session — supplies the model name and feeds buildMessagesPayload
// project — passed through to buildMessagesPayload
// history — passed through to buildMessagesPayload
// agent — optional; only its temperature is read here
// loop — name + args of the looping tool, as returned by detectDoomLoop
async function runDoomLoopSummary(
ctx: InferenceContext,
args: TurnArgs,
session: Session,
project: Project,
history: Message[],
agent: Agent | null,
loop: { name: string; args: Record<string, unknown> },
): Promise<void> {
const { sessionId, chatId, assistantMessageId, signal } = args;
// The stop instruction is appended to this call's local payload only; this
// function never writes it to the messages table.
const messages = buildMessagesPayload(session, project, history, agent);
messages.push({ role: 'system', content: DOOM_LOOP_NOTE(loop.name) });
// Stamp the start time on the assistant row and keep the DB's value so the
// message_complete frame can echo it back.
const startedRow = await ctx.sql<{ started_at: string }[]>`
UPDATE messages
SET started_at = clock_timestamp()
WHERE id = ${assistantMessageId}
RETURNING started_at
`;
const startedAt = startedRow[0]?.started_at ?? null;
ctx.publish(sessionId, {
type: 'message_started',
message_id: assistantMessageId,
chat_id: chatId,
role: 'assistant',
});
// Streaming state. `accumulated` holds all text received so far;
// `flushPromise` is a chain of content UPDATEs so that writes reach the DB
// one after another in the order they were queued.
let accumulated = '';
let pendingFlushTimer: NodeJS.Timeout | null = null;
let flushPromise: Promise<unknown> = Promise.resolve();
// Cancels any pending timer and queues a write of the current text. The
// snapshot is taken now, not when the queued UPDATE eventually runs.
const flushNow = () => {
if (pendingFlushTimer) {
clearTimeout(pendingFlushTimer);
pendingFlushTimer = null;
}
const snapshot = accumulated;
flushPromise = flushPromise.then(() =>
ctx.sql`UPDATE messages SET content = ${snapshot} WHERE id = ${assistantMessageId}`
);
};
// Arms a single flush timer; further calls are no-ops while one is pending,
// so deltas arriving within DB_FLUSH_INTERVAL_MS share one write.
const scheduleFlush = () => {
if (pendingFlushTimer) return;
pendingFlushTimer = setTimeout(() => {
pendingFlushTimer = null;
flushNow();
}, DB_FLUSH_INTERVAL_MS);
};
// Exactly one of these three ends up set, and it selects the finalize
// branch below: ok → complete, soft-cancelled → cancelled, error → failed.
let summaryOk = false;
let summarySoftCancelled = false;
let summaryError: string | null = null;
let result: StreamResult | null = null;
try {
// tools: null — presumably this is what disables tool calling for the
// wrap-up call (the header calls it "tools-disabled"); confirm against
// streamCompletion.
result = await streamCompletion(
ctx,
session.model,
messages,
{ tools: null, temperature: agent?.temperature },
(delta) => {
// Per-delta: buffer, forward to subscribers immediately, persist lazily.
accumulated += delta;
ctx.publish(sessionId, {
type: 'delta',
message_id: assistantMessageId,
chat_id: chatId,
content: delta,
});
scheduleFlush();
},
signal,
);
summaryOk = true;
} catch (err) {
// An error named 'AbortError' is treated as a cancellation, not a failure.
if (err instanceof Error && err.name === 'AbortError') {
summarySoftCancelled = true;
} else {
summaryError = err instanceof Error ? err.message : String(err);
}
} finally {
// Drop any not-yet-fired flush and wait for queued writes to finish so
// they cannot land after the final UPDATE below. NOTE(review): if a queued
// flush rejected, this await rethrows and the finalize code is skipped.
if (pendingFlushTimer) {
clearTimeout(pendingFlushTimer);
pendingFlushTimer = null;
}
await flushPromise;
}
if (summaryOk && result) {
// Success: persist the final content and token/context accounting, then
// announce completion using the values the UPDATE returned.
const mctx = await modelContext.getModelContext(session.model);
const nCtx = mctx?.n_ctx ?? null;
const [updated] = await ctx.sql<
{ tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
>`
UPDATE messages
SET content = ${result.content},
status = 'complete',
tokens_used = ${result.completionTokens},
ctx_used = ${result.promptTokens},
ctx_max = ${nCtx},
finished_at = clock_timestamp()
WHERE id = ${assistantMessageId}
RETURNING tokens_used, ctx_used, ctx_max, finished_at
`;
ctx.publish(sessionId, {
type: 'message_complete',
message_id: assistantMessageId,
chat_id: chatId,
tokens_used: updated?.tokens_used ?? null,
ctx_used: updated?.ctx_used ?? null,
ctx_max: updated?.ctx_max ?? null,
started_at: startedAt,
finished_at: updated?.finished_at ?? null,
model: session.model,
});
} else if (summarySoftCancelled) {
// Cancelled: keep whatever text streamed before the abort and mark the
// row 'cancelled'. The completion frame carries no token accounting.
await ctx.sql`
UPDATE messages
SET content = ${accumulated},
status = 'cancelled',
finished_at = clock_timestamp()
WHERE id = ${assistantMessageId}
`;
ctx.publish(sessionId, {
type: 'message_complete',
message_id: assistantMessageId,
chat_id: chatId,
});
} else {
// Doom-loop summary failure reuses the existing summary_after_cap_failed
// error reason — the ErrorReason union is shared between sentinel paths
// and the UI surfaces a generic "summary failed" line for both. We don't
// add a new reason code because the user-visible failure mode is the
// same (model gave up mid-summary). Sentinel below still fires.
const errMeta: MessageMetadata = {
kind: 'error',
error_reason: 'summary_after_cap_failed',
error_text: summaryError ?? 'doom-loop summary failed',
};
await ctx.sql`
UPDATE messages
SET content = ${accumulated},
status = 'failed',
finished_at = clock_timestamp(),
metadata = ${ctx.sql.json(errMeta as never)}
WHERE id = ${assistantMessageId}
`;
ctx.publish(sessionId, {
type: 'error',
message_id: assistantMessageId,
chat_id: chatId,
error: summaryError ?? 'doom-loop summary failed',
reason: 'summary_after_cap_failed',
});
}
// Bump the session's updated_at and publish the new value.
// NOTE(review): the non-null assertions assume the session row still exists.
const [sessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
UPDATE sessions SET updated_at = clock_timestamp()
WHERE id = ${sessionId}
RETURNING project_id, name, updated_at
`;
ctx.publishUser({
type: 'session_updated',
session_id: sessionId,
project_id: sessRow!.project_id,
name: sessRow!.name,
updated_at: sessRow!.updated_at,
});
// The sentinel is inserted on every outcome (complete, cancelled, failed),
// and before chat_status so the row exists when the chat leaves streaming.
await insertDoomLoopSentinel(ctx, sessionId, chatId, loop);
// Success and cancellation both return the chat to 'idle'; only a real
// summary failure reports 'error'.
if (summaryOk || summarySoftCancelled) {
ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() });
} else {
ctx.publishUser({
type: 'chat_status',
chat_id: chatId,
status: 'error',
at: new Date().toISOString(),
reason: 'summary_after_cap_failed',
});
}
ctx.log.info(
{ sessionId, chatId, assistantMessageId, loopedTool: loop.name, summaryOk, summaryCancelled: summarySoftCancelled },
'inference doom-loop summary finished',
);
}
// Persists the doom-loop sentinel as a completed system message and then
// replays it to subscribers as message_started → delta → message_complete.
//
// Unlike the cap-hit sentinel there is no hard-ceiling / can-continue logic:
// continuing would hit the same loop again with the same tools available, so
// the user has to restate the question or switch agents instead.
//
// ctx — sql handle and publish emitter
// sessionId — session that owns the new row; also the publish channel
// chatId — chat the sentinel belongs to
// loop — name + args of the looping tool, stored in the row's metadata
async function insertDoomLoopSentinel(
  ctx: InferenceContext,
  sessionId: string,
  chatId: string,
  loop: { name: string; args: Record<string, unknown> },
): Promise<void> {
  const content = `Detected ${DOOM_LOOP_THRESHOLD} identical calls to ${loop.name}. Stopping the tool-call loop. Produce the best answer you can with what you have.`;
  const metadata: MessageMetadata = {
    kind: 'doom_loop',
    tool_name: loop.name,
    args: loop.args,
    threshold: DOOM_LOOP_THRESHOLD,
  };

  // The SQL text is kept flush-left so the statement sent to the driver is
  // unchanged.
  const inserted = await ctx.sql<{ id: string }[]>`
INSERT INTO messages (session_id, chat_id, role, content, status, created_at, metadata)
VALUES (${sessionId}, ${chatId}, 'system', ${content}, 'complete', clock_timestamp(), ${ctx.sql.json(metadata as never)})
RETURNING id
`;
  const messageId = inserted[0]!.id;

  // Standard frame sequence — same as cap-hit sentinel — so
  // useSessionStream's reducer appends the row via the existing path.
  const target = { message_id: messageId, chat_id: chatId };
  ctx.publish(sessionId, { type: 'message_started', ...target, role: 'system' });
  ctx.publish(sessionId, { type: 'delta', ...target, content });
  ctx.publish(sessionId, { type: 'message_complete', ...target, metadata });
}
interface InferenceRegistration {
controller: AbortController;
completed: Promise<void>;

View File

@@ -128,9 +128,11 @@ export type ErrorReason =
| 'tool_execution_failed'
| 'summary_after_cap_failed';
// v1.8.2: shapes stored in messages.metadata. Discriminated on `kind`.
// cap_hit — system sentinel emitted when tool budget is exhausted
// error — attached to a failed assistant message so UI can show reason
// v1.8.2 / v1.11.6: shapes stored in messages.metadata. Discriminated on `kind`.
// cap_hit — system sentinel emitted when tool budget is exhausted
// doom_loop — system sentinel emitted when the model called the same
// tool with the same args DOOM_LOOP_THRESHOLD times in a row
// error — attached to a failed assistant message so UI can show reason
export type MessageMetadata =
| {
kind: 'cap_hit';
@@ -139,6 +141,12 @@ export type MessageMetadata =
agent_name: string | null;
can_continue: boolean;
}
| {
kind: 'doom_loop';
tool_name: string;
args: Record<string, unknown>;
threshold: number;
}
| {
kind: 'error';
error_reason: ErrorReason;