diff --git a/apps/server/src/index.ts b/apps/server/src/index.ts index dd15a3a..1c49de5 100644 --- a/apps/server/src/index.ts +++ b/apps/server/src/index.ts @@ -16,7 +16,7 @@ import { registerWebSocket } from './routes/ws.js'; import { registerModelRoutes } from './routes/models.js'; import { registerAgentRoutes } from './routes/agents.js'; import { registerSkillsRoutes } from './routes/skills.js'; -import { createInferenceRunner } from './services/inference.js'; +import { createInferenceRunner } from './services/inference/index.js'; import { createBroker } from './services/broker.js'; import { listSkills } from './services/skills.js'; import * as compaction from './services/compaction.js'; diff --git a/apps/server/src/services/__tests__/doom-loop.test.ts b/apps/server/src/services/__tests__/doom-loop.test.ts index d51b6e0..6c53024 100644 --- a/apps/server/src/services/__tests__/doom-loop.test.ts +++ b/apps/server/src/services/__tests__/doom-loop.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { DOOM_LOOP_THRESHOLD, detectDoomLoop } from '../inference.js'; +import { DOOM_LOOP_THRESHOLD, detectDoomLoop } from '../inference/index.js'; import type { ToolCall } from '../../types/api.js'; // ---- fixture ---------------------------------------------------------------- diff --git a/apps/server/src/services/__tests__/inference.test.ts b/apps/server/src/services/__tests__/inference.test.ts index 1469c8d..10bde23 100644 --- a/apps/server/src/services/__tests__/inference.test.ts +++ b/apps/server/src/services/__tests__/inference.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { buildMessagesPayload } from '../inference.js'; +import { buildMessagesPayload } from '../inference/index.js'; import type { Message, MessageRole, diff --git a/apps/server/src/services/auto_name.ts b/apps/server/src/services/auto_name.ts index 341c60b..746f25b 100644 --- a/apps/server/src/services/auto_name.ts +++ b/apps/server/src/services/auto_name.ts @@ -1,4 +1,4 @@ -import type { InferenceContext } from './inference.js'; +import type { InferenceContext } from './inference/index.js'; const NAMING_SYSTEM_PROMPT = 'You name chat sessions. Reply directly with no thinking, reasoning, or explanation. Output ONLY the title, 4 words max, no quotes, no punctuation, no prefix like "Title:".'; diff --git a/apps/server/src/services/inference/error-handler.ts b/apps/server/src/services/inference/error-handler.ts index b98e202..6364ccc 100644 --- a/apps/server/src/services/inference/error-handler.ts +++ b/apps/server/src/services/inference/error-handler.ts @@ -1,7 +1,7 @@ import type { MessageMetadata, Session } from '../../types/api.js'; import * as modelContext from '../model-context.js'; import { maybeFlagForCompaction } from './payload.js'; -import type { InferenceContext, StreamResult, TurnArgs } from '../inference.js'; +import type { InferenceContext, StreamResult, TurnArgs } from './turn.js'; export async function handleAbortOrError( ctx: InferenceContext, diff --git a/apps/server/src/services/inference/index.ts b/apps/server/src/services/inference/index.ts new file mode 100644 index 0000000..8788a1b --- /dev/null +++ b/apps/server/src/services/inference/index.ts @@ -0,0 +1,20 @@ +// v1.12.4: re-export shim. Outside callers (apps/server/src/index.ts and the +// vitest inference tests) import from './services/inference/index.js'. The +// directory is now the public surface; turn.ts holds runAssistantTurn / +// runInference / createInferenceRunner while the other inference/*.ts files +// stay implementation-private. + +export { + createInferenceRunner, + runAssistantTurn, + runInference, +} from './turn.js'; +export type { + FramePublisher, + InferenceContext, + InferenceFrame, + StreamResult, + TurnArgs, +} from './turn.js'; +export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './sentinels.js'; +export { buildMessagesPayload } from './payload.js'; diff --git a/apps/server/src/services/inference/payload.ts b/apps/server/src/services/inference/payload.ts index ec0d9f5..a0efdd6 100644 --- a/apps/server/src/services/inference/payload.ts +++ b/apps/server/src/services/inference/payload.ts @@ -8,7 +8,7 @@ import type { import * as compaction from '../compaction.js'; import { buildSystemPrompt } from '../system-prompt.js'; import { isAnySentinel } from './sentinels.js'; -import type { InferenceContext } from '../inference.js'; +import type { InferenceContext } from './turn.js'; export interface OpenAiMessage { role: 'system' | 'user' | 'assistant' | 'tool'; diff --git a/apps/server/src/services/inference.ts b/apps/server/src/services/inference/sentinel-summaries.ts similarity index 58% rename from apps/server/src/services/inference.ts rename to apps/server/src/services/inference/sentinel-summaries.ts index 96d282b..49a30df 100644 --- a/apps/server/src/services/inference.ts +++ b/apps/server/src/services/inference/sentinel-summaries.ts @@ -1,47 +1,20 @@ -import type { FastifyBaseLogger } from 'fastify'; -import type { Sql } from '../db.js'; -import type { Config } from '../config.js'; import type { Agent, - ErrorReason, Message, MessageMetadata, Project, Session, - ToolCall, - UserStreamFrame, -} from '../types/api.js'; -import { ALL_TOOLS } from './tools.js'; -import { resolveProjectRoot } from './path_guard.js'; -import { maybeAutoNameChat } from './auto_name.js'; -import { getAgentById } from './agents.js'; -import * as compaction from './compaction.js'; -import * as modelContext from './model-context.js'; -import type { Broker } from './broker.js'; -import { resolveToolBudget } from './inference/budget.js'; -import { - DOOM_LOOP_THRESHOLD, - detectDoomLoop, -} from './inference/sentinels.js'; -import { - buildMessagesPayload, - loadContext, -} from './inference/payload.js'; -import { - finalizeCompletion, - handleAbortOrError, -} from './inference/error-handler.js'; -import { - executeStreamPhase, - streamCompletion, -} from './inference/stream-phase.js'; -import { executeToolPhase } from './inference/tool-phase.js'; -import { DB_FLUSH_INTERVAL_MS, type StreamPhaseState } from './inference/types.js'; - -// v1.12.4: re-exported so external callers (tests, future consumers) keep -// importing from services/inference.js as the public surface. -export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './inference/sentinels.js'; -export { buildMessagesPayload } from './inference/payload.js'; +} from '../../types/api.js'; +import * as modelContext from '../model-context.js'; +import { buildMessagesPayload } from './payload.js'; +import { DOOM_LOOP_THRESHOLD } from './sentinels.js'; +import { streamCompletion } from './stream-phase.js'; +import { DB_FLUSH_INTERVAL_MS } from './types.js'; +import type { + InferenceContext, + StreamResult, + TurnArgs, +} from './turn.js'; // Synthetic system note appended to the cap-hit summary call. Verbatim from // the v1.8.2 spec — do not paraphrase: the model is more reliable when the @@ -52,219 +25,7 @@ const CAP_HIT_SUMMARY_NOTE = (limit: number) => const DOOM_LOOP_NOTE = (name: string) => `You called ${name} with the same arguments ${DOOM_LOOP_THRESHOLD} times in a row. Stop calling it. Produce the best answer you can with what you have.`; -export interface InferenceFrame { - type: - | 'message_started' - | 'delta' - | 'tool_call' - | 'tool_result' - | 'message_complete' - | 'usage' - | 'messages_deleted' - | 'session_renamed' - | 'chat_renamed' - | 'error'; - message_id?: string; - message_ids?: string[]; - chat_id?: string; - tool_message_id?: string; - tool_call_id?: string; - // v1.8.2: 'system' added so cap-hit sentinel messages can announce themselves - // through the normal message_started → delta → message_complete sequence. - role?: 'assistant' | 'tool' | 'user' | 'system'; - content?: string; - tool_call?: ToolCall; - output?: unknown; - truncated?: boolean; - error?: string; - // v1.8.2: structured error reason. Set on `type: 'error'` so the UI can - // surface a specific message; `error` stays the human-readable text. - reason?: ErrorReason; - // v1.8.2: piggybacks on `message_complete` so static or terminally-resolved - // messages can carry their persisted metadata to the live stream without a - // refetch (sentinels carry { kind: 'cap_hit', ... }; failed messages carry - // { kind: 'error', ... }). - metadata?: MessageMetadata | null; - tokens_used?: number | null; - ctx_used?: number | null; - ctx_max?: number | null; - completion_tokens?: number | null; - started_at?: string | null; - finished_at?: string | null; - model?: string; - session_id?: string; - name?: string; -} - -export type FramePublisher = (sessionId: string, frame: InferenceFrame) => void; - -export interface InferenceContext { - sql: Sql; - config: Config; - log: FastifyBaseLogger; - publish: FramePublisher; - publishUser: (frame: UserStreamFrame) => void; - // v1.11: passed through so compaction.process can publish 'compacted' - // frames on the same session WS channel useSessionStream subscribes to. - // Compaction is the only path that needs the raw broker handle (regular - // inference goes through `publish`); keeping a separate field avoids - // tempting other code paths into bypassing the session-id binding. - broker: Broker; -} - -// v1.12.4: payload assembly extracted to ./inference/payload.ts (tests -// import buildMessagesPayload from this module, so a re-export below -// preserves the public surface). Stream + tool phases extracted to -// ./inference/stream-phase.ts and ./inference/tool-phase.ts. - -export interface StreamResult { - finishReason: string | null; - content: string; - toolCalls: ToolCall[]; - promptTokens: number | null; - completionTokens: number | null; -} - - -export interface TurnArgs { - sessionId: string; - chatId: string; - assistantMessageId: string; - // v1.8.2: cumulative tool calls executed this run. Compared against the - // resolved budget at the top of each turn. Replaces the older `depth` - // counter (which counted iterations, not invocations). - toolsUsed: number; - // v1.11.6: ordered tool calls executed in this user-message turn (across - // recursive runAssistantTurn invocations). Reset to [] at user-message - // boundaries by runInference, same as toolsUsed. Doom-loop check at the - // top of runAssistantTurn slices the last DOOM_LOOP_THRESHOLD entries. - recentToolCalls: ToolCall[]; - signal: AbortSignal | undefined; -} - - -export async function runAssistantTurn( - ctx: InferenceContext, - args: TurnArgs, -): Promise { - const { sessionId, chatId } = args; - - // v1.11: if the prior turn flagged this chat for compaction, run it first - // so loadContext below reads the post-compaction history. We swallow - // compaction failures (clearing the flag so we don't loop) and proceed - // with the un-compacted history — a slow turn that hits the model's - // hard limit is recoverable; a dead session is not. - const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>` - SELECT needs_compaction FROM chats WHERE id = ${chatId} - `; - if (chatFlag[0]?.needs_compaction) { - try { - await compaction.process({ - sql: ctx.sql, - config: ctx.config, - log: ctx.log, - broker: ctx.broker, - chatId, - }); - } catch (err) { - ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding'); - await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`; - } - } - - const loaded = await loadContext(ctx.sql, sessionId, chatId); - if (!loaded) { - ctx.log.warn({ sessionId }, 'inference: session or project missing'); - return; - } - const { session, project, history } = loaded; - const projectRoot = await resolveProjectRoot(project.path); - // Agent resolution is per-turn so PATCH agent_id mid-conversation takes - // effect on the next message. Unknown agent_id returns null silently — - // session falls back to base prompt + all tools + default temperature. - const agent = session.agent_id - ? await getAgentById(project.path, session.agent_id) - : null; - - // v1.8.2: cap-hit replaces the older "tool loop depth exceeded" failure. - // When we've already burned the budget *before* this turn even runs, we - // skip straight to the summary flow — the in-flight assistant message slot - // gets reused for the wrap-up reply instead of being marked failed. - const budget = resolveToolBudget(agent); - if (args.toolsUsed >= budget) { - await runCapHitSummary(ctx, args, session, project, history, agent, budget); - return; - } - - // v1.11.6: doom-loop guard. Detected BEFORE the budget cap (the model can - // burn through 3 identical calls long before the 15-call budget fires). - // Same in-flight-slot-reuse pattern as runCapHitSummary — wrap-up reply - // lands in args.assistantMessageId, then a doom_loop sentinel is inserted - // to make the abort visible in the chat history. - const loop = detectDoomLoop(args.recentToolCalls); - if (loop) { - await runDoomLoopSummary(ctx, args, session, project, history, agent, loop); - return; - } - - const messages = await buildMessagesPayload(session, project, history, agent); - - // v1.11.8: resolve per-chat web-tools opt-in. Tri-state on the wire: - // - session.web_search_enabled = null → inherit project default - // - session.web_search_enabled = true/false → explicit - // Both web_search and web_fetch are gated by this single flag (the UI - // label is "Enable web search and fetch" — same store, both tools). - // Default is false unless explicitly opted in, matching the v1.9 - // plumbing intent ("inert until Batch 8 ships the actual tools"). - const webToolsEnabled = - session.web_search_enabled ?? project.default_web_search_enabled ?? false; - - const state: StreamPhaseState = { accumulated: '', startedAt: null }; - let result: StreamResult; - try { - result = await executeStreamPhase(ctx, args, session, messages, state, agent, webToolsEnabled); - } catch (err) { - await handleAbortOrError(ctx, args, state.accumulated, err); - return; - } - - if (result.toolCalls.length > 0) { - await executeToolPhase(ctx, args, result, state.startedAt, session, projectRoot); - return; - } - - await finalizeCompletion(ctx, args, result, state.startedAt, session); -} - -export async function runInference( - ctx: InferenceContext, - sessionId: string, - chatId: string, - assistantMessageId: string, - signal?: AbortSignal -): Promise { - // v1.8.2: every fresh inference (initial send, regenerate, force_send, - // continue) starts with a clean budget. Tool-call accumulation across - // Continue invocations is what the hard ceiling guards against, not the - // per-call budget. - // v1.11.6: recentToolCalls also resets — doom-loop detection is scoped - // to a single user-message turn, so a Continue starts with no history. - return runAssistantTurn(ctx, { - sessionId, - chatId, - assistantMessageId, - toolsUsed: 0, - recentToolCalls: [], - signal, - }); -} - -// v1.8.2: cap-hit summary flow. Called instead of erroring when the loop -// hits its budget. Reuses the in-flight assistant message slot to stream a -// short wrap-up reply with the synthetic note prepended and tools disabled, -// then always inserts a cap_hit sentinel afterward (regardless of summary -// outcome) so the UI can show a Continue affordance. -async function runCapHitSummary( +export async function runCapHitSummary( ctx: InferenceContext, args: TurnArgs, session: Session, @@ -526,7 +287,7 @@ async function insertCapHitSentinel( // Kept as a clone rather than refactored into a shared helper because the // two summary paths still differ in error reason + sentinel shape; a third // sentinel would justify factoring out runWrapUpSummary(opts). -async function runDoomLoopSummary( +export async function runDoomLoopSummary( ctx: InferenceContext, args: TurnArgs, session: Session, @@ -760,69 +521,3 @@ async function insertDoomLoopSentinel( metadata, }); } - -interface InferenceRegistration { - controller: AbortController; - completed: Promise; -} - -export function createInferenceRunner( - ctx: Omit, - publishUserFn: (user: string, frame: UserStreamFrame) => void -) { - const registry = new Map(); - - return { - enqueue(sessionId: string, chatId: string, assistantMessageId: string, user: string) { - const callCtx: InferenceContext = { - ...ctx, - publishUser: (frame) => publishUserFn(user, frame), - // v1.11: broker comes in via ctx (set at registration time). Repeated - // here so the destructure carries it onto the per-call ctx without - // having to add it to every enqueue/cancel signature individually. - broker: ctx.broker, - }; - // v1.8 mobile-tabs: announce working before the async loop starts so - // every device subscribed to the user channel sees the amber dot. - callCtx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'streaming', at: new Date().toISOString() }); - const controller = new AbortController(); - let resolveCompleted!: () => void; - const completed = new Promise((res) => { resolveCompleted = res; }); - const registration: InferenceRegistration = { controller, completed }; - registry.set(chatId, registration); - void (async () => { - try { - await runInference(callCtx, sessionId, chatId, assistantMessageId, controller.signal); - setImmediate(() => { - void maybeAutoNameChat(callCtx, chatId, sessionId).catch((err: Error) => { - callCtx.log.warn({ err, chatId }, 'auto-name failed'); - }); - }); - } catch (err) { - callCtx.log.error({ err }, 'unhandled inference error'); - } finally { - resolveCompleted(); - // Only clear our own registration; a force-send may have replaced it. - if (registry.get(chatId) === registration) { - registry.delete(chatId); - } - } - })(); - }, - - async cancel(_sessionId: string, chatId: string): Promise { - const reg = registry.get(chatId); - if (!reg) return false; - reg.controller.abort(); - // Swallow — we just need to wait for the catch/finally to persist state. - await reg.completed.catch(() => {}); - return true; - }, - - hasActive(chatId: string): boolean { - return registry.has(chatId); - }, - }; -} - -export const _toolNames = ALL_TOOLS.map((t) => t.name); diff --git a/apps/server/src/services/inference/stream-phase.ts b/apps/server/src/services/inference/stream-phase.ts index c044f87..2e5092f 100644 --- a/apps/server/src/services/inference/stream-phase.ts +++ b/apps/server/src/services/inference/stream-phase.ts @@ -17,7 +17,7 @@ import type { InferenceContext, StreamResult, TurnArgs, -} from '../inference.js'; +} from './turn.js'; interface ChatCompletionDelta { role?: string; diff --git a/apps/server/src/services/inference/tool-phase.ts b/apps/server/src/services/inference/tool-phase.ts index 880d7c2..8fd94de 100644 --- a/apps/server/src/services/inference/tool-phase.ts +++ b/apps/server/src/services/inference/tool-phase.ts @@ -7,12 +7,12 @@ import type { InferenceContext, StreamResult, TurnArgs, -} from '../inference.js'; +} from './turn.js'; // v1.12.4: ESM value-import cycle. executeToolPhase recurses into // runAssistantTurn which lives in inference.ts. The cycle is safe because // the reference is read at call time (inside an async function body), not // at module top-level. Node + tsc resolve this cleanly. -import { runAssistantTurn } from '../inference.js'; +import { runAssistantTurn } from './turn.js'; async function executeToolCall( projectRoot: string, diff --git a/apps/server/src/services/inference/turn.ts b/apps/server/src/services/inference/turn.ts new file mode 100644 index 0000000..64b9c83 --- /dev/null +++ b/apps/server/src/services/inference/turn.ts @@ -0,0 +1,326 @@ +import type { FastifyBaseLogger } from 'fastify'; +import type { Sql } from '../../db.js'; +import type { Config } from '../../config.js'; +import type { + Agent, + ErrorReason, + Message, + MessageMetadata, + Project, + Session, + ToolCall, + UserStreamFrame, +} from '../../types/api.js'; +import { ALL_TOOLS } from '../tools.js'; +import { resolveProjectRoot } from '../path_guard.js'; +import { maybeAutoNameChat } from '../auto_name.js'; +import { getAgentById } from '../agents.js'; +import * as compaction from '../compaction.js'; +import * as modelContext from '../model-context.js'; +import type { Broker } from '../broker.js'; +import { resolveToolBudget } from './budget.js'; +import { + DOOM_LOOP_THRESHOLD, + detectDoomLoop, +} from './sentinels.js'; +import { + buildMessagesPayload, + loadContext, +} from './payload.js'; +import { + finalizeCompletion, + handleAbortOrError, +} from './error-handler.js'; +import { + executeStreamPhase, + streamCompletion, +} from './stream-phase.js'; +import { executeToolPhase } from './tool-phase.js'; +import { DB_FLUSH_INTERVAL_MS, type StreamPhaseState } from './types.js'; +import { + runCapHitSummary, + runDoomLoopSummary, +} from './sentinel-summaries.js'; + +// v1.12.4: re-exported so external callers (tests, future consumers) keep +// importing from services/inference.js as the public surface. +export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './sentinels.js'; +export { buildMessagesPayload } from './payload.js'; + +export interface InferenceFrame { + type: + | 'message_started' + | 'delta' + | 'tool_call' + | 'tool_result' + | 'message_complete' + | 'usage' + | 'messages_deleted' + | 'session_renamed' + | 'chat_renamed' + | 'error'; + message_id?: string; + message_ids?: string[]; + chat_id?: string; + tool_message_id?: string; + tool_call_id?: string; + // v1.8.2: 'system' added so cap-hit sentinel messages can announce themselves + // through the normal message_started → delta → message_complete sequence. + role?: 'assistant' | 'tool' | 'user' | 'system'; + content?: string; + tool_call?: ToolCall; + output?: unknown; + truncated?: boolean; + error?: string; + // v1.8.2: structured error reason. Set on `type: 'error'` so the UI can + // surface a specific message; `error` stays the human-readable text. + reason?: ErrorReason; + // v1.8.2: piggybacks on `message_complete` so static or terminally-resolved + // messages can carry their persisted metadata to the live stream without a + // refetch (sentinels carry { kind: 'cap_hit', ... }; failed messages carry + // { kind: 'error', ... }). + metadata?: MessageMetadata | null; + tokens_used?: number | null; + ctx_used?: number | null; + ctx_max?: number | null; + completion_tokens?: number | null; + started_at?: string | null; + finished_at?: string | null; + model?: string; + session_id?: string; + name?: string; +} + +export type FramePublisher = (sessionId: string, frame: InferenceFrame) => void; + +export interface InferenceContext { + sql: Sql; + config: Config; + log: FastifyBaseLogger; + publish: FramePublisher; + publishUser: (frame: UserStreamFrame) => void; + // v1.11: passed through so compaction.process can publish 'compacted' + // frames on the same session WS channel useSessionStream subscribes to. + // Compaction is the only path that needs the raw broker handle (regular + // inference goes through `publish`); keeping a separate field avoids + // tempting other code paths into bypassing the session-id binding. + broker: Broker; +} + +// v1.12.4: payload assembly extracted to ./inference/payload.ts (tests +// import buildMessagesPayload from this module, so a re-export below +// preserves the public surface). Stream + tool phases extracted to +// ./inference/stream-phase.ts and ./inference/tool-phase.ts. + +export interface StreamResult { + finishReason: string | null; + content: string; + toolCalls: ToolCall[]; + promptTokens: number | null; + completionTokens: number | null; +} + + +export interface TurnArgs { + sessionId: string; + chatId: string; + assistantMessageId: string; + // v1.8.2: cumulative tool calls executed this run. Compared against the + // resolved budget at the top of each turn. Replaces the older `depth` + // counter (which counted iterations, not invocations). + toolsUsed: number; + // v1.11.6: ordered tool calls executed in this user-message turn (across + // recursive runAssistantTurn invocations). Reset to [] at user-message + // boundaries by runInference, same as toolsUsed. Doom-loop check at the + // top of runAssistantTurn slices the last DOOM_LOOP_THRESHOLD entries. + recentToolCalls: ToolCall[]; + signal: AbortSignal | undefined; +} + + +export async function runAssistantTurn( + ctx: InferenceContext, + args: TurnArgs, +): Promise { + const { sessionId, chatId } = args; + + // v1.11: if the prior turn flagged this chat for compaction, run it first + // so loadContext below reads the post-compaction history. We swallow + // compaction failures (clearing the flag so we don't loop) and proceed + // with the un-compacted history — a slow turn that hits the model's + // hard limit is recoverable; a dead session is not. + const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>` + SELECT needs_compaction FROM chats WHERE id = ${chatId} + `; + if (chatFlag[0]?.needs_compaction) { + try { + await compaction.process({ + sql: ctx.sql, + config: ctx.config, + log: ctx.log, + broker: ctx.broker, + chatId, + }); + } catch (err) { + ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding'); + await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`; + } + } + + const loaded = await loadContext(ctx.sql, sessionId, chatId); + if (!loaded) { + ctx.log.warn({ sessionId }, 'inference: session or project missing'); + return; + } + const { session, project, history } = loaded; + const projectRoot = await resolveProjectRoot(project.path); + // Agent resolution is per-turn so PATCH agent_id mid-conversation takes + // effect on the next message. Unknown agent_id returns null silently — + // session falls back to base prompt + all tools + default temperature. + const agent = session.agent_id + ? await getAgentById(project.path, session.agent_id) + : null; + + // v1.8.2: cap-hit replaces the older "tool loop depth exceeded" failure. + // When we've already burned the budget *before* this turn even runs, we + // skip straight to the summary flow — the in-flight assistant message slot + // gets reused for the wrap-up reply instead of being marked failed. + const budget = resolveToolBudget(agent); + if (args.toolsUsed >= budget) { + await runCapHitSummary(ctx, args, session, project, history, agent, budget); + return; + } + + // v1.11.6: doom-loop guard. Detected BEFORE the budget cap (the model can + // burn through 3 identical calls long before the 15-call budget fires). + // Same in-flight-slot-reuse pattern as runCapHitSummary — wrap-up reply + // lands in args.assistantMessageId, then a doom_loop sentinel is inserted + // to make the abort visible in the chat history. + const loop = detectDoomLoop(args.recentToolCalls); + if (loop) { + await runDoomLoopSummary(ctx, args, session, project, history, agent, loop); + return; + } + + const messages = await buildMessagesPayload(session, project, history, agent); + + // v1.11.8: resolve per-chat web-tools opt-in. Tri-state on the wire: + // - session.web_search_enabled = null → inherit project default + // - session.web_search_enabled = true/false → explicit + // Both web_search and web_fetch are gated by this single flag (the UI + // label is "Enable web search and fetch" — same store, both tools). + // Default is false unless explicitly opted in, matching the v1.9 + // plumbing intent ("inert until Batch 8 ships the actual tools"). + const webToolsEnabled = + session.web_search_enabled ?? project.default_web_search_enabled ?? false; + + const state: StreamPhaseState = { accumulated: '', startedAt: null }; + let result: StreamResult; + try { + result = await executeStreamPhase(ctx, args, session, messages, state, agent, webToolsEnabled); + } catch (err) { + await handleAbortOrError(ctx, args, state.accumulated, err); + return; + } + + if (result.toolCalls.length > 0) { + await executeToolPhase(ctx, args, result, state.startedAt, session, projectRoot); + return; + } + + await finalizeCompletion(ctx, args, result, state.startedAt, session); +} + +export async function runInference( + ctx: InferenceContext, + sessionId: string, + chatId: string, + assistantMessageId: string, + signal?: AbortSignal +): Promise { + // v1.8.2: every fresh inference (initial send, regenerate, force_send, + // continue) starts with a clean budget. Tool-call accumulation across + // Continue invocations is what the hard ceiling guards against, not the + // per-call budget. + // v1.11.6: recentToolCalls also resets — doom-loop detection is scoped + // to a single user-message turn, so a Continue starts with no history. + return runAssistantTurn(ctx, { + sessionId, + chatId, + assistantMessageId, + toolsUsed: 0, + recentToolCalls: [], + signal, + }); +} + +// v1.8.2: cap-hit summary flow. Called instead of erroring when the loop +// hits its budget. Reuses the in-flight assistant message slot to stream a +// short wrap-up reply with the synthetic note prepended and tools disabled, +// then always inserts a cap_hit sentinel afterward (regardless of summary +// outcome) so the UI can show a Continue affordance. +interface InferenceRegistration { + controller: AbortController; + completed: Promise; +} + +export function createInferenceRunner( + ctx: Omit, + publishUserFn: (user: string, frame: UserStreamFrame) => void +) { + const registry = new Map(); + + return { + enqueue(sessionId: string, chatId: string, assistantMessageId: string, user: string) { + const callCtx: InferenceContext = { + ...ctx, + publishUser: (frame) => publishUserFn(user, frame), + // v1.11: broker comes in via ctx (set at registration time). Repeated + // here so the destructure carries it onto the per-call ctx without + // having to add it to every enqueue/cancel signature individually. + broker: ctx.broker, + }; + // v1.8 mobile-tabs: announce working before the async loop starts so + // every device subscribed to the user channel sees the amber dot. + callCtx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'streaming', at: new Date().toISOString() }); + const controller = new AbortController(); + let resolveCompleted!: () => void; + const completed = new Promise((res) => { resolveCompleted = res; }); + const registration: InferenceRegistration = { controller, completed }; + registry.set(chatId, registration); + void (async () => { + try { + await runInference(callCtx, sessionId, chatId, assistantMessageId, controller.signal); + setImmediate(() => { + void maybeAutoNameChat(callCtx, chatId, sessionId).catch((err: Error) => { + callCtx.log.warn({ err, chatId }, 'auto-name failed'); + }); + }); + } catch (err) { + callCtx.log.error({ err }, 'unhandled inference error'); + } finally { + resolveCompleted(); + // Only clear our own registration; a force-send may have replaced it. + if (registry.get(chatId) === registration) { + registry.delete(chatId); + } + } + })(); + }, + + async cancel(_sessionId: string, chatId: string): Promise { + const reg = registry.get(chatId); + if (!reg) return false; + reg.controller.abort(); + // Swallow — we just need to wait for the catch/finally to persist state. + await reg.completed.catch(() => {}); + return true; + }, + + hasActive(chatId: string): boolean { + return registry.has(chatId); + }, + }; +} + +export const _toolNames = ALL_TOOLS.map((t) => t.name);