v1.12.4-rc3: extract stream-phase + tool-phase from inference.ts

- stream-phase.ts: streamCompletion, executeStreamPhase (plus sseLines, StreamOptions, ChatCompletionDelta/Chunk as private helpers) - tool-phase.ts: executeToolPhase + private executeToolCall - types.ts: shared StreamPhaseState + DB_FLUSH_INTERVAL_MS so the summary functions still in inference.ts can reference them without pulling from a phase file Cycle: executeToolPhase recurses into runAssistantTurn, which stays in inference.ts. Resolved by direct value back-edge — tool-phase.ts does `import { runAssistantTurn } from '../inference.js'` and runAssistantTurn is now exported. Safe because the dereference happens inside an async function body, after both modules have fully evaluated. No callback-through-args fallback needed. inference.ts shrinks from ~1401 to ~828 LoC. Final Dispatch D moves the sentinel summaries out and renames the residue to inference/turn.ts. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 22:28:23 +00:00
parent 8fa7b7fce9
commit c87df6981a
4 changed files with 619 additions and 586 deletions
--- a/apps/server/src/services/inference/stream-phase.ts
+++ b/apps/server/src/services/inference/stream-phase.ts
@@ -0,0 +1,380 @@
+import type {
+  Agent,
+  Session,
+  ToolCall,
+} from '../../types/api.js';
+import * as modelContext from '../model-context.js';
+import { toolJsonSchemas, type ToolJsonSchema } from '../tools.js';
+import type { OpenAiMessage } from './payload.js';
+import {
+  XML_TOOL_CLOSE,
+  XML_TOOL_OPEN,
+  parseXmlToolCall,
+  partialXmlOpenerStart,
+} from './xml-parser.js';
+import { DB_FLUSH_INTERVAL_MS, type StreamPhaseState } from './types.js';
+import type {
+  InferenceContext,
+  StreamResult,
+  TurnArgs,
+} from '../inference.js';
+
+interface ChatCompletionDelta {
+  role?: string;
+  content?: string | null;
+  tool_calls?: Array<{
+    index: number;
+    id?: string;
+    type?: 'function';
+    function?: { name?: string; arguments?: string };
+  }>;
+}
+
+interface ChatCompletionChunk {
+  choices?: Array<{
+    delta: ChatCompletionDelta;
+    finish_reason: string | null;
+  }>;
+  usage?: {
+    prompt_tokens?: number;
+    completion_tokens?: number;
+    total_tokens?: number;
+  };
+}
+
+interface StreamOptions {
+  // null = omit tools entirely (compact phase); [] = caller stripped all tools
+  // (rare; we still omit from the request body to avoid OpenAI 400).
+  tools: ToolJsonSchema[] | null;
+  temperature?: number;
+}
+
+async function* sseLines(stream: ReadableStream<Uint8Array>): AsyncGenerator<string> {
+  const reader = stream.getReader();
+  const decoder = new TextDecoder('utf-8');
+  let buffer = '';
+  try {
+    while (true) {
+      const { value, done } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      let idx;
+      while ((idx = buffer.indexOf('\n')) >= 0) {
+        const line = buffer.slice(0, idx).replace(/\r$/, '');
+        buffer = buffer.slice(idx + 1);
+        if (line.length === 0) continue;
+        yield line;
+      }
+    }
+    if (buffer.length > 0) yield buffer;
+  } finally {
+    reader.releaseLock();
+  }
+}
+
+// v1.10.5 Qwen-coder XML fallback. Some local models (notably qwen3-coder via
+// llama-swap) emit tool calls as inline XML inside delta.content rather than
+// the structured delta.tool_calls field. The XML shape is:
+//   <tool_call>
+//   <function=NAME>
+//   <parameter=KEY>
+//   VALUE
+//   </parameter>
+//   ...more parameters...
+//   </function>
+//   </tool_call>
+// Multiple <tool_call> blocks may appear back-to-back; they never nest.
+// streamCompletion buffers delta.content, extracts complete blocks, parses
+// them via parseXmlToolCall, and pushes synthetic entries into the existing
+// toolCallsBuffer alongside any native JSON-format tool calls.
+export async function streamCompletion(
+  ctx: InferenceContext,
+  model: string,
+  messages: OpenAiMessage[],
+  opts: StreamOptions,
+  onDelta: (content: string) => void,
+  onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
+  signal?: AbortSignal
+): Promise<StreamResult> {
+  const body: Record<string, unknown> = {
+    model,
+    messages,
+    stream: true,
+    stream_options: { include_usage: true },
+  };
+  if (opts.tools && opts.tools.length > 0) {
+    body['tools'] = opts.tools;
+    body['tool_choice'] = 'auto';
+  }
+  if (typeof opts.temperature === 'number') {
+    body['temperature'] = opts.temperature;
+  }
+
+  const res = await fetch(`${ctx.config.LLAMA_SWAP_URL}/v1/chat/completions`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(body),
+    signal,
+  });
+  if (!res.ok || !res.body) {
+    const text = await res.text().catch(() => '');
+    throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`);
+  }
+
+  let content = '';
+  // v1.10.5: holds delta.content bytes that may contain a partial XML tool
+  // call. Anything not part of a (possibly forming) <tool_call>…</tool_call>
+  // pair is flushed to content + onDelta as soon as we know it's safe.
+  let pendingBuffer = '';
+  let finishReason: string | null = null;
+  let promptTokens: number | null = null;
+  let completionTokens: number | null = null;
+  const toolCallsBuffer = new Map<number, { id: string; name: string; argsText: string }>();
+
+  for await (const line of sseLines(res.body)) {
+    if (!line.startsWith('data:')) continue;
+    const payload = line.slice(5).trim();
+    if (payload === '[DONE]') break;
+    let parsed: ChatCompletionChunk;
+    try {
+      parsed = JSON.parse(payload);
+    } catch {
+      continue;
+    }
+
+    if (parsed.usage) {
+      if (typeof parsed.usage.prompt_tokens === 'number') {
+        promptTokens = parsed.usage.prompt_tokens;
+      }
+      if (typeof parsed.usage.completion_tokens === 'number') {
+        completionTokens = parsed.usage.completion_tokens;
+      }
+      onUsage?.(promptTokens, completionTokens);
+    }
+    // v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
+    // streaming completion does NOT emit n_ctx in timings (verified
+    // empirically); the authoritative source is llama-swap's
+    // /upstream/<model>/props endpoint, fetched per-turn via
+    // model-context.getModelContext() at the finalization sites below.
+
+    const choice = parsed.choices?.[0];
+    if (!choice) continue;
+    const delta = choice.delta ?? {};
+    if (typeof delta.content === 'string' && delta.content.length > 0) {
+      // v1.10.5 XML fallback. Append, then extract any complete tool_call
+      // blocks before deciding what's safe to flush as visible content.
+      pendingBuffer += delta.content;
+      while (true) {
+        const startIdx = pendingBuffer.indexOf(XML_TOOL_OPEN);
+        if (startIdx === -1) break;
+        const closeIdx = pendingBuffer.indexOf(XML_TOOL_CLOSE, startIdx);
+        if (closeIdx === -1) break;
+        const blockEnd = closeIdx + XML_TOOL_CLOSE.length;
+        const block = pendingBuffer.slice(startIdx, blockEnd);
+        // Any text before the opener is plain content — flush it now.
+        if (startIdx > 0) {
+          const before = pendingBuffer.slice(0, startIdx);
+          content += before;
+          onDelta(before);
+        }
+        const parsedCall = parseXmlToolCall(block);
+        if (parsedCall) {
+          const synthIdx = toolCallsBuffer.size;
+          toolCallsBuffer.set(synthIdx, {
+            id: `xml_call_${synthIdx}`,
+            name: parsedCall.name,
+            argsText: JSON.stringify(parsedCall.args),
+          });
+        }
+        // If parsing failed we still drop the block — emitting unparseable
+        // XML to the chat would look worse than silently swallowing it.
+        pendingBuffer = pendingBuffer.slice(blockEnd);
+      }
+      // After all complete blocks are out, hold back any (partial or full)
+      // unclosed opener; flush the rest.
+      const partialIdx = partialXmlOpenerStart(pendingBuffer);
+      if (partialIdx >= 0) {
+        if (partialIdx > 0) {
+          const flush = pendingBuffer.slice(0, partialIdx);
+          content += flush;
+          onDelta(flush);
+        }
+        pendingBuffer = pendingBuffer.slice(partialIdx);
+      } else if (pendingBuffer.length > 0) {
+        content += pendingBuffer;
+        onDelta(pendingBuffer);
+        pendingBuffer = '';
+      }
+    }
+    if (Array.isArray(delta.tool_calls)) {
+      for (const tc of delta.tool_calls) {
+        const idx = tc.index;
+        const existing = toolCallsBuffer.get(idx) ?? { id: '', name: '', argsText: '' };
+        if (tc.id) existing.id = tc.id;
+        if (tc.function?.name) existing.name = tc.function.name;
+        if (typeof tc.function?.arguments === 'string') existing.argsText += tc.function.arguments;
+        toolCallsBuffer.set(idx, existing);
+      }
+    }
+    if (choice.finish_reason) finishReason = choice.finish_reason;
+  }
+
+  // v1.10.5: if the stream ended mid-XML (e.g. model truncated, no closer
+  // ever arrived), flush whatever was buffered as plain content so it isn't
+  // silently dropped. Better to show a stray `<tool_call>` than vanish text.
+  if (pendingBuffer.length > 0) {
+    content += pendingBuffer;
+    onDelta(pendingBuffer);
+    pendingBuffer = '';
+  }
+
+  const toolCalls: ToolCall[] = [];
+  for (const [, t] of [...toolCallsBuffer.entries()].sort(([a], [b]) => a - b)) {
+    let args: Record<string, unknown> = {};
+    if (t.argsText.length > 0) {
+      try {
+        args = JSON.parse(t.argsText);
+      } catch {
+        args = { _raw: t.argsText };
+      }
+    }
+    toolCalls.push({ id: t.id || `call_${toolCalls.length}`, name: t.name, args });
+  }
+
+  return { finishReason, content, toolCalls, promptTokens, completionTokens };
+}
+
+export async function executeStreamPhase(
+  ctx: InferenceContext,
+  args: TurnArgs,
+  session: Session,
+  messages: OpenAiMessage[],
+  state: StreamPhaseState,
+  agent: Agent | null,
+  // v1.11.8: when false, web_search and web_fetch are stripped from the
+  // tool list sent to the LLM, so the model can't even attempt them.
+  webToolsEnabled: boolean,
+): Promise<StreamResult> {
+  const { sessionId, chatId, assistantMessageId, signal } = args;
+
+  const startedRow = await ctx.sql<{ started_at: string }[]>`
+    UPDATE messages
+    SET started_at = clock_timestamp()
+    WHERE id = ${assistantMessageId}
+    RETURNING started_at
+  `;
+  state.startedAt = startedRow[0]?.started_at ?? null;
+
+  ctx.publish(sessionId, {
+    type: 'message_started',
+    message_id: assistantMessageId,
+    chat_id: chatId,
+    role: 'assistant',
+  });
+
+  let pendingFlushTimer: NodeJS.Timeout | null = null;
+  let flushPromise: Promise<unknown> = Promise.resolve();
+
+  const flushNow = () => {
+    if (pendingFlushTimer) {
+      clearTimeout(pendingFlushTimer);
+      pendingFlushTimer = null;
+    }
+    const snapshot = state.accumulated;
+    flushPromise = flushPromise.then(() =>
+      ctx.sql`UPDATE messages SET content = ${snapshot} WHERE id = ${assistantMessageId}`
+    );
+  };
+
+  const scheduleFlush = () => {
+    if (pendingFlushTimer) return;
+    pendingFlushTimer = setTimeout(() => {
+      pendingFlushTimer = null;
+      flushNow();
+    }, DB_FLUSH_INTERVAL_MS);
+  };
+
+  // Tool whitelist: if an agent is set, filter the global tool list to only the
+  // tool names it allows. Unknown names in agent.tools are dropped silently
+  // (handled here by intersection). When no agent: send all tools.
+  // v1.11.8: a second filter strips web_search + web_fetch unless the chat
+  // has them explicitly enabled. Counts as an opt-in security boundary: the
+  // model can't summon a tool that wasn't offered to it.
+  const WEB_TOOL_NAMES: ReadonlySet<string> = new Set(['web_search', 'web_fetch']);
+  const effectiveTools: ToolJsonSchema[] = (agent
+    ? toolJsonSchemas().filter((t) => agent.tools.includes(t.function.name))
+    : toolJsonSchemas()
+  ).filter((t) => webToolsEnabled || !WEB_TOOL_NAMES.has(t.function.name));
+  const effectiveTemperature = agent?.temperature;
+
+  // v1.12.2: ctx_max lookup is cached after the first hit per model, so this
+  // is a Map probe in steady state. We capture nCtx once at the top of the
+  // stream so the throttled usage publish doesn't refetch each tick.
+  const mctxForStream = await modelContext.getModelContext(session.model);
+  const nCtxForStream = mctxForStream?.n_ctx ?? null;
+
+  // v1.12.2: throttle live usage publishes to ~500ms. The model can land
+  // dozens of usage frames per second; without a throttle the WS turns into
+  // a firehose for a few KB savings on each render.
+  const USAGE_THROTTLE_MS = 500;
+  let lastUsageAt = 0;
+  let pendingUsage: { p: number | null; c: number | null } | null = null;
+  let usageTimer: NodeJS.Timeout | null = null;
+  const flushUsage = () => {
+    if (!pendingUsage) return;
+    const { p, c } = pendingUsage;
+    pendingUsage = null;
+    lastUsageAt = Date.now();
+    ctx.publish(sessionId, {
+      type: 'usage',
+      message_id: assistantMessageId,
+      chat_id: chatId,
+      completion_tokens: c,
+      ctx_used: p,
+      ctx_max: nCtxForStream,
+    });
+  };
+
+  try {
+    return await streamCompletion(
+      ctx,
+      session.model,
+      messages,
+      { tools: effectiveTools, temperature: effectiveTemperature },
+      (delta) => {
+        state.accumulated += delta;
+        ctx.publish(sessionId, {
+          type: 'delta',
+          message_id: assistantMessageId,
+          chat_id: chatId,
+          content: delta,
+        });
+        ctx.log.debug({ sessionId, delta }, 'inference delta');
+        scheduleFlush();
+      },
+      (prompt, completion) => {
+        pendingUsage = { p: prompt, c: completion };
+        const elapsed = Date.now() - lastUsageAt;
+        if (elapsed >= USAGE_THROTTLE_MS) {
+          flushUsage();
+        } else if (!usageTimer) {
+          usageTimer = setTimeout(() => {
+            usageTimer = null;
+            flushUsage();
+          }, USAGE_THROTTLE_MS - elapsed);
+        }
+      },
+      signal
+    );
+  } finally {
+    if (pendingFlushTimer) {
+      clearTimeout(pendingFlushTimer);
+      pendingFlushTimer = null;
+    }
+    if (usageTimer) {
+      clearTimeout(usageTimer);
+      usageTimer = null;
+    }
+    await flushPromise;
+  }
+}