diff --git a/apps/server/src/services/inference.ts b/apps/server/src/services/inference.ts
index b9e6bec..8698d2e 100644
--- a/apps/server/src/services/inference.ts
+++ b/apps/server/src/services/inference.ts
@@ -13,7 +13,6 @@ import type {
 } from '../types/api.js';
 import {
   ALL_TOOLS,
-  READ_ONLY_TOOL_NAMES,
   TOOLS_BY_NAME,
   toolJsonSchemas,
   type ToolJsonSchema,
@@ -28,88 +27,34 @@ import type { Broker } from './broker.js';
 // async (awaits the container-guidance loader) — buildMessagesPayload below
 // is therefore async too, and its three call sites in this file await it.
 import { buildSystemPrompt } from './system-prompt.js';
+import { resolveToolBudget } from './inference/budget.js';
+import {
+  DOOM_LOOP_THRESHOLD,
+  detectDoomLoop,
+  isAnySentinel,
+} from './inference/sentinels.js';
+import {
+  XML_TOOL_CLOSE,
+  XML_TOOL_OPEN,
+  parseXmlToolCall,
+  partialXmlOpenerStart,
+} from './inference/xml-parser.js';
+
+// v1.12.4: re-exported so external callers (tests, future consumers) keep
+// importing from services/inference.js as the public surface.
+export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './inference/sentinels.js';
 
 const DB_FLUSH_INTERVAL_MS = 500;
 
-// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget.
-//   - Agent with explicit max_tool_calls: that value.
-//   - Agent with read-only-only tools:    BUDGET_READ_ONLY (30).
-//   - Agent with any non-read-only tool:  BUDGET_NON_READ_ONLY (10).
-//   - No agent (raw chat):                BUDGET_NO_AGENT (15).
-const BUDGET_READ_ONLY = 30;
-const BUDGET_NON_READ_ONLY = 10;
-const BUDGET_NO_AGENT = 15;
-
-const READ_ONLY_SET: ReadonlySet<string> = new Set(READ_ONLY_TOOL_NAMES);
-
-function resolveToolBudget(agent: Agent | null): number {
-  if (agent?.max_tool_calls != null) return agent.max_tool_calls;
-  if (!agent) return BUDGET_NO_AGENT;
-  const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t));
-  return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY;
-}
-
 // Synthetic system note appended to the cap-hit summary call. Verbatim from
 // the v1.8.2 spec — do not paraphrase: the model is more reliable when the
 // instruction is short, declarative, and identical across calls.
 const CAP_HIT_SUMMARY_NOTE = (limit: number) =>
   `You've reached the tool budget (${limit} calls). Produce the best answer you can with what you have. Do not call more tools.`;
 
-// v1.11.6: doom-loop guard. When the model calls the same tool with the
-// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message
-// turn, abort the recursion and run the same wrap-up summary path as the
-// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in
-// session/processor.ts). Threshold of 3 is the smallest value that doesn't
-// false-positive on a model that retries once after a transient error.
-export const DOOM_LOOP_THRESHOLD = 3;
-
 const DOOM_LOOP_NOTE = (name: string) =>
   `You called ${name} with the same arguments ${DOOM_LOOP_THRESHOLD} times in a row. Stop calling it. Produce the best answer you can with what you have.`;
 
-// Returns the name + args of the looping tool when the LAST
-// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical (same name
-// AND deep-equal args via JSON.stringify). Returns null otherwise.
-// Pure; exported for unit-test access.
-export function detectDoomLoop(
-  recentToolCalls: ToolCall[],
-): { name: string; args: Record<string, unknown> } | null {
-  if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null;
-  const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD);
-  const ref = last[0]!;
-  const refArgs = JSON.stringify(ref.args);
-  for (let i = 1; i < last.length; i++) {
-    const tc = last[i]!;
-    if (tc.name !== ref.name) return null;
-    if (JSON.stringify(tc.args) !== refArgs) return null;
-  }
-  return { name: ref.name, args: ref.args };
-}
-
-function isCapHitSentinel(m: Message): boolean {
-  return (
-    m.role === 'system' &&
-    m.metadata !== null &&
-    typeof m.metadata === 'object' &&
-    (m.metadata as { kind?: unknown }).kind === 'cap_hit'
-  );
-}
-
-// v1.11.6: parallel predicate. Same UI-only semantics as cap-hit sentinels —
-// never sent to the LLM (filtered by buildMessagesPayload through the
-// isAnySentinel check below).
-function isDoomLoopSentinel(m: Message): boolean {
-  return (
-    m.role === 'system' &&
-    m.metadata !== null &&
-    typeof m.metadata === 'object' &&
-    (m.metadata as { kind?: unknown }).kind === 'doom_loop'
-  );
-}
-
-function isAnySentinel(m: Message): boolean {
-  return isCapHitSentinel(m) || isDoomLoopSentinel(m);
-}
-
 export interface InferenceFrame {
   type:
     | 'message_started'
@@ -391,55 +336,6 @@ interface StreamOptions {
 // streamCompletion buffers delta.content, extracts complete blocks, parses
 // them via parseXmlToolCall, and pushes synthetic entries into the existing
 // toolCallsBuffer alongside any native JSON-format tool calls.
-const XML_TOOL_OPEN = '<tool_call>';
-const XML_TOOL_CLOSE = '</tool_call>';
-
-function parseXmlToolCall(
-  block: string,
-): { name: string; args: Record<string, unknown> } | null {
-  const nameMatch = block.match(/<function=([^>]+)>/);
-  if (!nameMatch || !nameMatch[1]) return null;
-  const name = nameMatch[1].trim();
-  if (!name) return null;
-  const args: Record<string, unknown> = {};
-  // Non-greedy body so each <parameter=…>…</parameter> pair is matched
-  // independently even when multiple appear in the same block.
-  const paramRe = /<parameter=([^>]+)>([\s\S]*?)<\/parameter>/g;
-  for (const m of block.matchAll(paramRe)) {
-    const key = (m[1] ?? '').trim();
-    if (!key) continue;
-    const raw = (m[2] ?? '').trim();
-    try {
-      args[key] = JSON.parse(raw);
-    } catch {
-      args[key] = raw;
-    }
-  }
-  return { name, args };
-}
-
-// Locate the first character that begins (or completely contains) an
-// unfinished <tool_call> opener in `s`. Returns -1 when `s` can be flushed
-// to the client in full without risking a partial tag leak.
-//   Case 1: a full `<tool_call>` opener with no matching closer — caller
-//           must keep everything from that index forward until the next
-//           chunk arrives with the closer.
-//   Case 2: `s` ends with a strict prefix of `<tool_call>` (e.g. `<tool_c`).
-//           Caller must keep just that suffix in the buffer.
-// Note: case 1 assumes the calling loop already extracted every complete
-// <tool_call>…</tool_call> pair before reaching this check.
-function partialXmlOpenerStart(s: string): number {
-  const fullOpener = s.indexOf(XML_TOOL_OPEN);
-  if (fullOpener !== -1) return fullOpener;
-  const lastLt = s.lastIndexOf('<');
-  if (lastLt === -1) return -1;
-  const suffix = s.slice(lastLt);
-  if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) {
-    return lastLt;
-  }
-  return -1;
-}
-
 async function streamCompletion(
   ctx: InferenceContext,
   model: string,
diff --git a/apps/server/src/services/inference/budget.ts b/apps/server/src/services/inference/budget.ts
new file mode 100644
index 0000000..01659ce
--- /dev/null
+++ b/apps/server/src/services/inference/budget.ts
@@ -0,0 +1,20 @@
+import type { Agent } from '../../types/api.js';
+import { READ_ONLY_TOOL_NAMES } from '../tools.js';
+
+// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget.
+//   - Agent with explicit max_tool_calls: that value.
+//   - Agent whose tools are all read-only: BUDGET_READ_ONLY (30).
+//   - Agent with any non-read-only tool:  BUDGET_NON_READ_ONLY (10).
+//   - No agent (raw chat):                BUDGET_NO_AGENT (15).
+export const BUDGET_READ_ONLY = 30;
+export const BUDGET_NON_READ_ONLY = 10;
+export const BUDGET_NO_AGENT = 15;
+
+const READ_ONLY_SET: ReadonlySet<string> = new Set(READ_ONLY_TOOL_NAMES); // built once at module load; resolveToolBudget tests membership against it
+
+export function resolveToolBudget(agent: Agent | null): number { // returns the tool-call budget for `agent`; null means no agent
+  if (agent?.max_tool_calls != null) return agent.max_tool_calls; // explicit per-agent value wins; it is returned as-is, with no range check here
+  if (!agent) return BUDGET_NO_AGENT; // no agent: fixed default
+  const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t)); // every() is also true for an empty tools list, so such an agent gets BUDGET_READ_ONLY
+  return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY; // a single tool outside READ_ONLY_SET selects the smaller budget
+}
diff --git a/apps/server/src/services/inference/sentinels.ts b/apps/server/src/services/inference/sentinels.ts
new file mode 100644
index 0000000..3b84da5
--- /dev/null
+++ b/apps/server/src/services/inference/sentinels.ts
@@ -0,0 +1,53 @@
+import type { Message, ToolCall } from '../../types/api.js';
+
+// v1.11.6: doom-loop guard. When the model calls the same tool with the
+// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message
+// turn, abort the recursion and run the same wrap-up summary path as the
+// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in
+// session/processor.ts). 3 is the smallest threshold that does not
+// false-positive on a model that retries once after a transient error.
+export const DOOM_LOOP_THRESHOLD = 3; // also the size of the trailing window that detectDoomLoop compares
+
+// Returns the name + args of the looping tool when the LAST
+// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` share one name and
+// serialize to the same JSON.stringify(args) text (key-order sensitive, so
+// not a true deep-equal). Returns null otherwise. Pure; exported for tests.
+export function detectDoomLoop(
+  recentToolCalls: ToolCall[],
+): { name: string; args: Record<string, unknown> } | null {
+  if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null; // too few calls to fill the comparison window
+  const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD); // trailing window only; earlier calls are ignored
+  const ref = last[0]!; // non-null: the length guard above guarantees a full window
+  const refArgs = JSON.stringify(ref.args); // serialized once and reused for each comparison
+  for (let i = 1; i < last.length; i++) {
+    const tc = last[i]!; // non-null: i < last.length
+    if (tc.name !== ref.name) return null;
+    if (JSON.stringify(tc.args) !== refArgs) return null;
+  }
+  return { name: ref.name, args: ref.args }; // args is the first window entry's own object, not a copy
+}
+
+export function isCapHitSentinel(m: Message): boolean { // true when `m` is a system message tagged kind 'cap_hit'
+  return (
+    m.role === 'system' &&
+    m.metadata !== null && // typeof null is 'object', so null must be ruled out before the typeof check
+    typeof m.metadata === 'object' &&
+    (m.metadata as { kind?: unknown }).kind === 'cap_hit' // cast only reads `kind`; the strict comparison does the actual validation
+  );
+}
+
+// v1.11.6: parallel predicate to isCapHitSentinel, matching kind 'doom_loop'.
+// Same UI-only semantics as cap-hit sentinels — never sent to the LLM
+// (buildMessagesPayload filters them out via the isAnySentinel check below).
+export function isDoomLoopSentinel(m: Message): boolean {
+  return (
+    m.role === 'system' &&
+    m.metadata !== null && // typeof null is 'object', so null must be ruled out before the typeof check
+    typeof m.metadata === 'object' &&
+    (m.metadata as { kind?: unknown }).kind === 'doom_loop' // cast only reads `kind`; the strict comparison does the actual validation
+  );
+}
+
+export function isAnySentinel(m: Message): boolean { // true when `m` is either kind of sentinel message
+  return isCapHitSentinel(m) || isDoomLoopSentinel(m); // a new sentinel kind needs its predicate added to this disjunction
+}
diff --git a/apps/server/src/services/inference/xml-parser.ts b/apps/server/src/services/inference/xml-parser.ts
new file mode 100644
index 0000000..61f080b
--- /dev/null
+++ b/apps/server/src/services/inference/xml-parser.ts
@@ -0,0 +1,53 @@
+// v1.10.5: XML-tag tool-call fallback. Some models emit
+// <tool_call><function=foo><parameter=key>value</parameter></function></tool_call>
+// in plain content instead of using the OpenAI tool_calls JSON channel.
+// The streaming loop in inference.ts extracts those blocks with the helpers below.
+
+export const XML_TOOL_OPEN = '<tool_call>'; // opening delimiter of one tool-call block
+export const XML_TOOL_CLOSE = '</tool_call>'; // closing delimiter of one tool-call block
+
+export function parseXmlToolCall( // parses one tool-call block into { name, args }; returns null when no usable function name is found
+  block: string,
+): { name: string; args: Record<string, unknown> } | null {
+  const nameMatch = block.match(/<function=([^>]+)>/); // first <function=NAME> tag in the block
+  if (!nameMatch || !nameMatch[1]) return null;
+  const name = nameMatch[1].trim();
+  if (!name) return null; // name was whitespace only
+  const args: Record<string, unknown> = {}; // NOTE(review): a `__proto__` parameter key would hit the prototype setter on this plain object — confirm whether that matters
+  // Non-greedy body so each <parameter=…>…</parameter> pair is matched
+  // independently even when multiple appear in the same block.
+  const paramRe = /<parameter=([^>]+)>([\s\S]*?)<\/parameter>/g;
+  for (const m of block.matchAll(paramRe)) {
+    const key = (m[1] ?? '').trim();
+    if (!key) continue; // skip parameters whose name is blank
+    const raw = (m[2] ?? '').trim();
+    try {
+      args[key] = JSON.parse(raw); // values that are valid JSON become numbers, booleans, arrays, objects, ...
+    } catch {
+      args[key] = raw; // anything else is kept as the trimmed string
+    }
+  }
+  return { name, args }; // a repeated parameter key keeps its last value
+}
+
+// Locate the index in `s` where an unfinished <tool_call> opener starts
+// (either a complete tag or a trailing partial one). Returns -1 when `s` can be flushed
+// to the client in full without risking a partial tag leak.
+//   Case 1: a full `<tool_call>` opener with no matching closer — caller
+//           must keep everything from that index forward until the next
+//           chunk arrives with the closer.
+//   Case 2: `s` ends with a strict prefix of `<tool_call>` (e.g. `<tool_c`).
+//           Caller must keep just that suffix in the buffer.
+// Note: case 1 assumes the calling loop already extracted every complete
+// <tool_call>…</tool_call> pair before reaching this check.
+export function partialXmlOpenerStart(s: string): number {
+  const fullOpener = s.indexOf(XML_TOOL_OPEN);
+  if (fullOpener !== -1) return fullOpener; // case 1: index of the first complete opener
+  const lastLt = s.lastIndexOf('<'); // a trailing partial tag contains exactly one '<', so it can only start at the last one
+  if (lastLt === -1) return -1; // no '<' at all: nothing to hold back
+  const suffix = s.slice(lastLt);
+  if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) { // case 2: strict prefix such as `<tool_c`
+    return lastLt;
+  }
+  return -1; // the trailing '<…' cannot grow into the opener, so it is safe to flush
+}