diff --git a/apps/server/src/services/inference.ts b/apps/server/src/services/inference.ts index b9e6bec..8698d2e 100644 --- a/apps/server/src/services/inference.ts +++ b/apps/server/src/services/inference.ts @@ -13,7 +13,6 @@ import type { } from '../types/api.js'; import { ALL_TOOLS, - READ_ONLY_TOOL_NAMES, TOOLS_BY_NAME, toolJsonSchemas, type ToolJsonSchema, @@ -28,88 +27,34 @@ import type { Broker } from './broker.js'; // async (awaits the container-guidance loader) — buildMessagesPayload below // is therefore async too, and its three call sites in this file await it. import { buildSystemPrompt } from './system-prompt.js'; +import { resolveToolBudget } from './inference/budget.js'; +import { + DOOM_LOOP_THRESHOLD, + detectDoomLoop, + isAnySentinel, +} from './inference/sentinels.js'; +import { + XML_TOOL_CLOSE, + XML_TOOL_OPEN, + parseXmlToolCall, + partialXmlOpenerStart, +} from './inference/xml-parser.js'; + +// v1.12.4: re-exported so external callers (tests, future consumers) keep +// importing from services/inference.js as the public surface. +export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './inference/sentinels.js'; const DB_FLUSH_INTERVAL_MS = 500; -// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget. -// - Agent with explicit max_tool_calls: that value. -// - Agent with read-only-only tools: BUDGET_READ_ONLY (30). -// - Agent with any non-read-only tool: BUDGET_NON_READ_ONLY (10). -// - No agent (raw chat): BUDGET_NO_AGENT (15). -const BUDGET_READ_ONLY = 30; -const BUDGET_NON_READ_ONLY = 10; -const BUDGET_NO_AGENT = 15; - -const READ_ONLY_SET: ReadonlySet = new Set(READ_ONLY_TOOL_NAMES); - -function resolveToolBudget(agent: Agent | null): number { - if (agent?.max_tool_calls != null) return agent.max_tool_calls; - if (!agent) return BUDGET_NO_AGENT; - const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t)); - return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY; -} - // Synthetic system note appended to the cap-hit summary call. Verbatim from // the v1.8.2 spec — do not paraphrase: the model is more reliable when the // instruction is short, declarative, and identical across calls. const CAP_HIT_SUMMARY_NOTE = (limit: number) => `You've reached the tool budget (${limit} calls). Produce the best answer you can with what you have. Do not call more tools.`; -// v1.11.6: doom-loop guard. When the model calls the same tool with the -// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message -// turn, abort the recursion and run the same wrap-up summary path as the -// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in -// session/processor.ts). Threshold of 3 is the smallest value that doesn't -// false-positive on a model that retries once after a transient error. -export const DOOM_LOOP_THRESHOLD = 3; - const DOOM_LOOP_NOTE = (name: string) => `You called ${name} with the same arguments ${DOOM_LOOP_THRESHOLD} times in a row. Stop calling it. Produce the best answer you can with what you have.`; -// Returns the name + args of the looping tool when the LAST -// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical (same name -// AND deep-equal args via JSON.stringify). Returns null otherwise. -// Pure; exported for unit-test access. -export function detectDoomLoop( - recentToolCalls: ToolCall[], -): { name: string; args: Record } | null { - if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null; - const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD); - const ref = last[0]!; - const refArgs = JSON.stringify(ref.args); - for (let i = 1; i < last.length; i++) { - const tc = last[i]!; - if (tc.name !== ref.name) return null; - if (JSON.stringify(tc.args) !== refArgs) return null; - } - return { name: ref.name, args: ref.args }; -} - -function isCapHitSentinel(m: Message): boolean { - return ( - m.role === 'system' && - m.metadata !== null && - typeof m.metadata === 'object' && - (m.metadata as { kind?: unknown }).kind === 'cap_hit' - ); -} - -// v1.11.6: parallel predicate. Same UI-only semantics as cap-hit sentinels — -// never sent to the LLM (filtered by buildMessagesPayload through the -// isAnySentinel check below). -function isDoomLoopSentinel(m: Message): boolean { - return ( - m.role === 'system' && - m.metadata !== null && - typeof m.metadata === 'object' && - (m.metadata as { kind?: unknown }).kind === 'doom_loop' - ); -} - -function isAnySentinel(m: Message): boolean { - return isCapHitSentinel(m) || isDoomLoopSentinel(m); -} - export interface InferenceFrame { type: | 'message_started' @@ -391,55 +336,6 @@ interface StreamOptions { // streamCompletion buffers delta.content, extracts complete blocks, parses // them via parseXmlToolCall, and pushes synthetic entries into the existing // toolCallsBuffer alongside any native JSON-format tool calls. -const XML_TOOL_OPEN = ''; -const XML_TOOL_CLOSE = ''; - -function parseXmlToolCall( - block: string, -): { name: string; args: Record } | null { - const nameMatch = block.match(/]+)>/); - if (!nameMatch || !nameMatch[1]) return null; - const name = nameMatch[1].trim(); - if (!name) return null; - const args: Record = {}; - // Non-greedy body so each … pair is matched - // independently even when multiple appear in the same block. - const paramRe = /]+)>([\s\S]*?)<\/parameter>/g; - for (const m of block.matchAll(paramRe)) { - const key = (m[1] ?? '').trim(); - if (!key) continue; - const raw = (m[2] ?? '').trim(); - try { - args[key] = JSON.parse(raw); - } catch { - args[key] = raw; - } - } - return { name, args }; -} - -// Locate the first character that begins (or completely contains) an -// unfinished opener in `s`. Returns -1 when `s` can be flushed -// to the client in full without risking a partial tag leak. -// Case 1: a full `` opener with no matching closer — caller -// must keep everything from that index forward until the next -// chunk arrives with the closer. -// Case 2: `s` ends with a strict prefix of `` (e.g. ` pair before reaching this check. -function partialXmlOpenerStart(s: string): number { - const fullOpener = s.indexOf(XML_TOOL_OPEN); - if (fullOpener !== -1) return fullOpener; - const lastLt = s.lastIndexOf('<'); - if (lastLt === -1) return -1; - const suffix = s.slice(lastLt); - if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) { - return lastLt; - } - return -1; -} - async function streamCompletion( ctx: InferenceContext, model: string, diff --git a/apps/server/src/services/inference/budget.ts b/apps/server/src/services/inference/budget.ts new file mode 100644 index 0000000..01659ce --- /dev/null +++ b/apps/server/src/services/inference/budget.ts @@ -0,0 +1,20 @@ +import type { Agent } from '../../types/api.js'; +import { READ_ONLY_TOOL_NAMES } from '../tools.js'; + +// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget. +// - Agent with explicit max_tool_calls: that value. +// - Agent with read-only-only tools: BUDGET_READ_ONLY (30). +// - Agent with any non-read-only tool: BUDGET_NON_READ_ONLY (10). +// - No agent (raw chat): BUDGET_NO_AGENT (15). +export const BUDGET_READ_ONLY = 30; +export const BUDGET_NON_READ_ONLY = 10; +export const BUDGET_NO_AGENT = 15; + +const READ_ONLY_SET: ReadonlySet = new Set(READ_ONLY_TOOL_NAMES); + +export function resolveToolBudget(agent: Agent | null): number { + if (agent?.max_tool_calls != null) return agent.max_tool_calls; + if (!agent) return BUDGET_NO_AGENT; + const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t)); + return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY; +} diff --git a/apps/server/src/services/inference/sentinels.ts b/apps/server/src/services/inference/sentinels.ts new file mode 100644 index 0000000..3b84da5 --- /dev/null +++ b/apps/server/src/services/inference/sentinels.ts @@ -0,0 +1,53 @@ +import type { Message, ToolCall } from '../../types/api.js'; + +// v1.11.6: doom-loop guard. When the model calls the same tool with the +// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message +// turn, abort the recursion and run the same wrap-up summary path as the +// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in +// session/processor.ts). Threshold of 3 is the smallest value that doesn't +// false-positive on a model that retries once after a transient error. +export const DOOM_LOOP_THRESHOLD = 3; + +// Returns the name + args of the looping tool when the LAST +// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical (same name +// AND deep-equal args via JSON.stringify). Returns null otherwise. +// Pure; exported for unit-test access. +export function detectDoomLoop( + recentToolCalls: ToolCall[], +): { name: string; args: Record } | null { + if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null; + const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD); + const ref = last[0]!; + const refArgs = JSON.stringify(ref.args); + for (let i = 1; i < last.length; i++) { + const tc = last[i]!; + if (tc.name !== ref.name) return null; + if (JSON.stringify(tc.args) !== refArgs) return null; + } + return { name: ref.name, args: ref.args }; +} + +export function isCapHitSentinel(m: Message): boolean { + return ( + m.role === 'system' && + m.metadata !== null && + typeof m.metadata === 'object' && + (m.metadata as { kind?: unknown }).kind === 'cap_hit' + ); +} + +// v1.11.6: parallel predicate. Same UI-only semantics as cap-hit sentinels — +// never sent to the LLM (filtered by buildMessagesPayload through the +// isAnySentinel check below). +export function isDoomLoopSentinel(m: Message): boolean { + return ( + m.role === 'system' && + m.metadata !== null && + typeof m.metadata === 'object' && + (m.metadata as { kind?: unknown }).kind === 'doom_loop' + ); +} + +export function isAnySentinel(m: Message): boolean { + return isCapHitSentinel(m) || isDoomLoopSentinel(m); +} diff --git a/apps/server/src/services/inference/xml-parser.ts b/apps/server/src/services/inference/xml-parser.ts new file mode 100644 index 0000000..61f080b --- /dev/null +++ b/apps/server/src/services/inference/xml-parser.ts @@ -0,0 +1,53 @@ +// v1.10.5: XML-tag tool-call fallback. Some models emit +// value +// in plain content instead of using the OpenAI tool_calls JSON channel. +// The streaming loop in inference.ts extracts these blocks via these helpers. + +export const XML_TOOL_OPEN = ''; +export const XML_TOOL_CLOSE = ''; + +export function parseXmlToolCall( + block: string, +): { name: string; args: Record } | null { + const nameMatch = block.match(/]+)>/); + if (!nameMatch || !nameMatch[1]) return null; + const name = nameMatch[1].trim(); + if (!name) return null; + const args: Record = {}; + // Non-greedy body so each … pair is matched + // independently even when multiple appear in the same block. + const paramRe = /]+)>([\s\S]*?)<\/parameter>/g; + for (const m of block.matchAll(paramRe)) { + const key = (m[1] ?? '').trim(); + if (!key) continue; + const raw = (m[2] ?? '').trim(); + try { + args[key] = JSON.parse(raw); + } catch { + args[key] = raw; + } + } + return { name, args }; +} + +// Locate the first character that begins (or completely contains) an +// unfinished opener in `s`. Returns -1 when `s` can be flushed +// to the client in full without risking a partial tag leak. +// Case 1: a full `` opener with no matching closer — caller +// must keep everything from that index forward until the next +// chunk arrives with the closer. +// Case 2: `s` ends with a strict prefix of `` (e.g. ` pair before reaching this check. +export function partialXmlOpenerStart(s: string): number { + const fullOpener = s.indexOf(XML_TOOL_OPEN); + if (fullOpener !== -1) return fullOpener; + const lastLt = s.lastIndexOf('<'); + if (lastLt === -1) return -1; + const suffix = s.slice(lastLt); + if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) { + return lastLt; + } + return -1; +}