boocode/apps/server/src/services/inference/stream-phase-adapter.ts

// P5 (SPLIT SKETCH): the generic AI-SDK adapter, split out of stream-phase.ts.
// This module is the v1.13.1-A streamText adapter and nothing else — it has NO
// SQL, broker, or BooCode persistence dependencies (its only `ctx` access is
// config + log), so it can be unit-tested without standing up a DB or broker.
// stream-phase.ts (the I/O layer) re-exports the public names below so existing
// importers (`./stream-phase.js`) are unchanged.

import type { FastifyBaseLogger } from 'fastify';
import type { Config } from '../../config.js';
import type { Agent, ToolCall } from '../../types/api.js';
import type { ToolJsonSchema } from '../tools.js';
import type { OpenAiMessage } from './payload.js';
import { extractToolCallBlocks } from './tool-call-parser.js';
import type { StreamResult } from './types.js';
import { upstreamModel } from './provider.js';
import {
  jsonSchema,
  streamText,
  tool,
  type JSONValue,
  type ModelMessage,
  type ToolCallRepairFunction,
} from 'ai';

// The slice of InferenceContext the adapter actually needs. Narrowing it here
// (instead of taking the full InferenceContext) keeps the adapter free of the
// SQL/broker/publish surface. InferenceContext structurally satisfies this, so
// callers pass their ctx unchanged.
export interface StreamAdapterContext {
  // Server configuration. streamCompletion passes it to upstreamModel() when
  // building the model handle for streamText.
  config: Config;
  // Logger for the adapter's own diagnostics: the malformed-tool-call warning
  // and the "captured reasoning" debug line.
  log: FastifyBaseLogger;
}

export interface StreamOptions {
  // null = omit tools entirely (compact phase); [] = caller stripped all tools
  // (rare; we still omit from the request body to avoid OpenAI 400).
  tools: ToolJsonSchema[] | null;
  // Forwarded as the streamText `temperature` setting, only when it is a number.
  temperature?: number;
  // Forwarded as the streamText `topP` setting, only when it is a number.
  top_p?: number | null;
  // top_k / min_p are NOT passed as streamText settings; like the llama.cpp
  // extensions below they travel via providerOptions.openaiCompatible.
  top_k?: number | null;
  min_p?: number | null;
  // Forwarded as the streamText `presencePenalty` setting, only when it is a number.
  presence_penalty?: number | null;
  // v2.6 sampling-streamjson-tokens (#11): llama.cpp sampler extensions. These
  // are NOT standard AI-SDK streamText options and are NOT serialized by the
  // openai-compatible provider's standardized-settings path (topK is even
  // explicitly dropped with an "unsupported feature: topK" warning). They reach
  // llama-server only via providerOptions.openaiCompatible (see buildSamplerProviderOptions).
  // null and undefined both mean "unset": only numeric values are sent.
  top_n_sigma?: number | null;
  dry_multiplier?: number | null;
  dry_base?: number | null;
  dry_allowed_length?: number | null;
  dry_penalty_last_n?: number | null;
}

// P5: the 10-field sampler-options literal that was copy-pasted at 4 sites
// (the three sentinel summaries + executeStreamPhase). Builds the StreamOptions
// sampler subset from an agent's frontmatter knobs. `temperature` is
// `agent?.temperature` (already number|undefined); the nullable fields strip
// null → undefined so they're omitted from the request body when unset. Keep
// this in lockstep with the StreamOptions sampler fields — a new sampler knob
// (the v2.7.3 dry_* family did this) is added here once instead of at 4 sites.
// Sampler-only view of StreamOptions: every field except `tools`.
export type SamplerOpts = Omit<StreamOptions, 'tools'>;

/**
 * Derives the sampler subset of StreamOptions from an agent's sampler knobs.
 *
 * @param agent - The agent whose knobs are read, or null when there is none.
 * @returns An object carrying all ten sampler keys. `temperature` is copied
 *   through as-is; every other knob is `undefined` when the agent is null or
 *   the knob is null/undefined, otherwise the agent's value (0 is preserved).
 */
export function samplerOptsFromAgent(agent: Agent | null): SamplerOpts {
  // null ("explicitly unset") and undefined ("absent") both collapse to
  // undefined; any real number, including 0, passes through untouched.
  const unsetToUndefined = (knob: number | null | undefined): number | undefined =>
    knob === null || knob === undefined ? undefined : knob;

  return {
    temperature: agent?.temperature,
    top_p: unsetToUndefined(agent?.top_p),
    top_k: unsetToUndefined(agent?.top_k),
    min_p: unsetToUndefined(agent?.min_p),
    presence_penalty: unsetToUndefined(agent?.presence_penalty),
    top_n_sigma: unsetToUndefined(agent?.top_n_sigma),
    dry_multiplier: unsetToUndefined(agent?.dry_multiplier),
    dry_base: unsetToUndefined(agent?.dry_base),
    dry_allowed_length: unsetToUndefined(agent?.dry_allowed_length),
    dry_penalty_last_n: unsetToUndefined(agent?.dry_penalty_last_n),
  };
}

// v2.6 #11: build the providerOptions.openaiCompatible extraBody object for the
// llama.cpp sampler extensions. @ai-sdk/openai-compatible (2.0.47) merges every
// non-reserved key under providerOptions.openaiCompatible straight into the
// chat-completion request body (see its getArgs: the Object.fromEntries spread
// filtered against openaiCompatibleLanguageModelChatOptions.shape). This is the
// ONLY working passthrough for these params:
//   - top_k / min_p were latently dropped before this: top_k was passed as the
//     AI-SDK `topK` setting which the openai-compatible provider rejects as
//     unsupported; min_p was never passed to streamText at all.
//   - top_n_sigma + the dry_* family have no AI-SDK equivalent.
// Keys use llama-server's snake_case body names so they land verbatim.
/**
 * Collects the llama.cpp sampler extensions that must ride
 * providerOptions.openaiCompatible rather than standard streamText settings.
 *
 * @param opts - Stream options; only the seven passthrough knobs are read.
 * @returns A snake_case body holding every passthrough knob whose value is a
 *   number, or `undefined` when none is set, so the caller can omit
 *   providerOptions entirely.
 */
function buildSamplerProviderOptions(opts: StreamOptions): Record<string, number> | undefined {
  // Listed in the order the keys should appear in the request body.
  const passthroughKeys = [
    'top_k',
    'min_p',
    'top_n_sigma',
    'dry_multiplier',
    'dry_base',
    'dry_allowed_length',
    'dry_penalty_last_n',
  ] as const;

  const extraBody: Record<string, number> = {};
  for (const key of passthroughKeys) {
    const value = opts[key];
    // null and undefined both mean "unset"; only real numbers are forwarded.
    if (typeof value === 'number') {
      extraBody[key] = value;
    }
  }

  if (Object.keys(extraBody).length === 0) {
    return undefined;
  }
  return extraBody;
}

// v1.13.1-A: convert BooCode's OpenAI-shaped history into AI SDK
// ModelMessage[]. Tool result messages need a `toolName` field that the
// OpenAI shape doesn't carry; we look it up by scanning earlier assistant
// `tool_calls` entries for a matching id.
/**
 * Translates BooCode's OpenAI-shaped history into AI SDK ModelMessage[].
 *
 * - system / user: forwarded with string content (null becomes '').
 * - assistant: plain string content when there are neither tool calls nor
 *   reasoning; otherwise a parts array ordered reasoning, text, tool-calls.
 * - tool: a single tool-result part. The tool name is recovered from the
 *   assistant `tool_calls` entry with the same id ('unknown' if none matches).
 * - any other role is dropped.
 *
 * @param messages - History in OpenAI chat-completion shape.
 * @returns The converted messages, in the same relative order.
 */
function toModelMessages(messages: OpenAiMessage[]): ModelMessage[] {
  type AssistantPart =
    | { type: 'reasoning'; text: string }
    | { type: 'text'; text: string }
    | { type: 'tool-call'; toolCallId: string; toolName: string; input: unknown };
  type ToolOutput = { type: 'text'; value: string } | { type: 'json'; value: JSONValue };

  // Tool-call arguments arrive as a JSON string. Empty means "no arguments";
  // anything unparseable is wrapped under `_raw` so the model sees the same
  // blob it emitted (mirrors the buildMessagesPayload upstream convention).
  const decodeArguments = (serialized: string): unknown => {
    try {
      return serialized.length > 0 ? JSON.parse(serialized) : {};
    } catch {
      return { _raw: serialized };
    }
  };

  // Tool results are forwarded as JSON when they parse, as plain text otherwise.
  const decodeToolOutput = (raw: string): ToolOutput => {
    try {
      // JSON.parse returns `any`; the cast is sound for any successfully parsed value.
      return { type: 'json', value: JSON.parse(raw) as JSONValue };
    } catch {
      return { type: 'text', value: raw };
    }
  };

  // Pass 1: index every assistant tool call by id so tool results can recover
  // the `toolName` the OpenAI shape does not carry.
  const nameByCallId = new Map<string, string>();
  for (const message of messages) {
    if (message.role !== 'assistant' || !message.tool_calls) continue;
    for (const call of message.tool_calls) {
      nameByCallId.set(call.id, call.function.name);
    }
  }

  // Pass 2: convert each message according to its role.
  const converted: ModelMessage[] = [];
  for (const message of messages) {
    switch (message.role) {
      case 'system':
      case 'user':
        converted.push({ role: message.role, content: message.content ?? '' });
        break;

      case 'assistant': {
        const calls = message.tool_calls ?? [];
        const reasoning = typeof message.reasoning === 'string' ? message.reasoning : '';
        if (calls.length === 0 && reasoning.length === 0) {
          // Bare text assistant. null content with no tool calls is
          // degenerate but harmless to forward as an empty string.
          converted.push({ role: 'assistant', content: message.content ?? '' });
          break;
        }
        // v1.13.1-C: the reasoning part goes first so reasoning models can
        // pick their prior thinking back up across tool boundaries.
        const parts: AssistantPart[] = [];
        if (reasoning.length > 0) {
          parts.push({ type: 'reasoning', text: reasoning });
        }
        if (message.content && message.content.length > 0) {
          parts.push({ type: 'text', text: message.content });
        }
        for (const call of calls) {
          const input = decodeArguments(call.function.arguments);
          parts.push({
            type: 'tool-call',
            toolCallId: call.id,
            toolName: call.function.name,
            input,
          });
        }
        converted.push({ role: 'assistant', content: parts });
        break;
      }

      case 'tool': {
        const toolCallId = message.tool_call_id ?? '';
        const toolName = nameByCallId.get(toolCallId) ?? 'unknown';
        const output = decodeToolOutput(message.content ?? '');
        converted.push({
          role: 'tool',
          content: [{ type: 'tool-result', toolCallId, toolName, output }],
        });
        break;
      }

      default:
        // Roles outside system/user/assistant/tool are not forwarded.
        break;
    }
  }
  return converted;
}

// Build the AI SDK tools record from BooCode's JSON-schema tool definitions.
// No `execute` field: BooCode runs tools itself in tool-phase.ts; streamText
// surfaces the tool-call parts via fullStream and we capture them for the
// outer loop to dispatch.
/**
 * Turns BooCode's JSON-schema tool definitions into the AI SDK tools record,
 * keyed by tool name.
 *
 * The tools are declared without an `execute` handler, so streamText only
 * reports the calls; dispatching them is left to the caller.
 *
 * @param schemas - Tool definitions in OpenAI function-schema shape.
 * @returns One AI SDK tool per schema; a repeated name keeps the last definition.
 */
function buildAiTools(schemas: ToolJsonSchema[]): Record<string, ReturnType<typeof tool>> {
  const registry: Record<string, ReturnType<typeof tool>> = {};
  schemas.forEach(({ function: fn }) => {
    registry[fn.name] = tool({
      description: fn.description,
      inputSchema: jsonSchema(fn.parameters),
    });
  });
  return registry;
}

// v1.10.5 Qwen-coder XML fallback. Some local models (notably qwen3-coder via
// llama-swap) emit tool calls as inline XML inside delta.content rather than
// the structured tool_calls field. We extract them out of the streamed text
// before flushing it to the client.
//
// Qwen shape:
//   <tool_call>
//   <function=NAME>
//   <parameter=KEY>VALUE</parameter>
//   ...
//   </function>
//   </tool_call>
//
// v1.13.16: also recognize Anthropic <invoke> markup that qwen3.6-35b-a3b-mxfp4
// drifts to (training-data residue from Claude Code documentation):
//   <invoke name="NAME">
//   <parameter name="KEY">VALUE</parameter>
//   </invoke>
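// Both formats share the synthetic xml_call_${idx} ID space: idx is the call's
// position in the combined toolCalls list at the moment it is extracted, so
// blocks of either format (and any structured tool-call parts) advance the same
// counter. Multiple blocks may appear back-to-back in either format and they
// never nest.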
/**
 * Runs one streamed completion through the AI SDK `streamText` call and
 * aggregates the stream into a StreamResult.
 *
 * Text deltas are passed through extractToolCallBlocks first, so inline XML
 * tool-call blocks are lifted into `toolCalls` and only the remaining prose
 * reaches `onDelta` / `content`.
 *
 * @param ctx - Supplies `config` (for upstreamModel) and `log`.
 * @param model - Model identifier handed to upstreamModel.
 * @param messages - OpenAI-shaped history; converted with toModelMessages.
 * @param opts - Tools plus sampler knobs. `tools` of null or [] sends no tools.
 * @param onDelta - Called with each chunk of flushed text, in stream order.
 * @param onUsage - Called at most once after the stream ends, and only when at
 *   least one of the two token counts is a number.
 * @param signal - Optional abort signal, forwarded to streamText.
 * @param agent - Optional agent, forwarded to upstreamModel (undefined → null).
 * @returns finishReason, accumulated text, tool calls (structured + XML),
 *   token counts (null when unreported) and accumulated reasoning text.
 * @throws The stream's `error` part (wrapped in an Error when it is not one),
 *   or an Error named 'AbortError' when `signal` is aborted by stream end.
 */
export async function streamCompletion(
  ctx: StreamAdapterContext,
  model: string,
  messages: OpenAiMessage[],
  opts: StreamOptions,
  onDelta: (content: string) => void,
  onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
  signal?: AbortSignal,
  agent?: Agent | null,
): Promise<StreamResult> {
  const aiMessages = toModelMessages(messages);
  // Both null and an empty array mean "send no tools": aiTools stays undefined
  // and the tools/toolChoice/repair settings are left out of the call below.
  const hasTools = opts.tools !== null && opts.tools.length > 0;
  const aiTools = hasTools ? buildAiTools(opts.tools!) : undefined;

  // Only used for the elapsed_ms field of the reasoning debug log at the end.
  const startedAt = Date.now();
  // v1.13.1-C: accumulate reasoning text across reasoning-delta parts.
  // qwen3.6 emits these on a separate channel from text content; we capture
  // them per stream so finalizeCompletion can dual-write a 'reasoning' part.
  // Replaces the v1.13.1-A counter-only diagnostic.
  let reasoningAccumulated = '';

  // v1.13.3: experimental_repairToolCall keeps the stream alive when the
  // model emits a malformed tool call (bad JSON args, unknown name, etc.).
  // Without a repair function streamText throws and the WHOLE stream dies;
  // with one, the SDK invokes us and we route the bad call through normally.
  // Strategy: pass through unmodified. executeToolPhase's existing error
  // path (unknown tool name → "unknown tool: X" result; zod-reject → tool
  // 'X' rejected — fieldname: required) already gives the model a clean
  // recovery surface on the next turn. Logging gives us visibility into
  // how often qwen3.6 actually emits broken calls.
  const repairToolCall: ToolCallRepairFunction<NonNullable<typeof aiTools>> = async ({
    toolCall,
    error,
  }) => {
    ctx.log.warn(
      {
        toolCallId: toolCall.toolCallId,
        toolName: toolCall.toolName,
        error: error.message,
      },
      'malformed tool call surfaced via repairToolCall',
    );
    return toolCall;
  };

  // v2.6 #11: llama.cpp sampler extensions (top_k, min_p, top_n_sigma, dry_*)
  // ride providerOptions.openaiCompatible — they are NOT standardized streamText
  // settings. NB: top_k used to be passed below as the AI-SDK `topK` setting;
  // the openai-compatible provider dropped it with an "unsupported feature: topK"
  // warning and min_p was never wired at all, so both were dead on the wire
  // before this. They now go through the same extraBody path as the new params.
  const samplerBody = buildSamplerProviderOptions(opts);

  // Every optional setting is spread in conditionally, so unset knobs
  // (null/undefined) are absent from the call rather than passed as undefined.
  const result = streamText({
    model: upstreamModel(ctx.config, model, agent ?? null),
    messages: aiMessages,
    ...(aiTools
      ? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall }
      : {}),
    ...(typeof opts.temperature === 'number' ? { temperature: opts.temperature } : {}),
    ...(typeof opts.top_p === 'number' ? { topP: opts.top_p } : {}),
    ...(typeof opts.presence_penalty === 'number' ? { presencePenalty: opts.presence_penalty } : {}),
    ...(samplerBody ? { providerOptions: { openaiCompatible: samplerBody } } : {}),
    abortSignal: signal,
  });

  // `content` is the text already flushed to onDelta; `pendingBuffer` is text
  // extractToolCallBlocks asked us to hold back (its `remaining`) until the
  // next chunk arrives.
  let content = '';
  let pendingBuffer = '';
  let finishReason: string | null = null;
  // v1.13.1-A: AI SDK emits one `tool-call` part per fully-aggregated call,
  // so we no longer need the OpenAI-index reassembly map the manual SSE
  // parser used. XML tool calls extracted from text content go into the
  // same flat list and keep the v1.10.5 synthetic id convention.
  const toolCalls: ToolCall[] = [];

  for await (const part of result.fullStream) {
    switch (part.type) {
      case 'text-delta': {
        pendingBuffer += part.text;
        // v1.13.16: unified extraction. The helper finds the earliest-opening
        // complete <tool_call> or <invoke> block, flushes prose between/around
        // them, holds any partial opener for the next chunk, and silently
        // drops blocks that fail to parse (matches pre-v1.13.16 behavior).
        const extracted = extractToolCallBlocks(pendingBuffer);
        if (extracted.flushed.length > 0) {
          content += extracted.flushed;
          onDelta(extracted.flushed);
        }
        for (const call of extracted.calls) {
          // The synthetic id uses the call's index in the combined list, so it
          // stays unique even when structured tool-call parts are interleaved.
          const synthIdx = toolCalls.length;
          toolCalls.push({
            id: `xml_call_${synthIdx}`,
            name: call.name,
            args: call.args,
          });
        }
        pendingBuffer = extracted.remaining;
        break;
      }
      case 'tool-call': {
        // AI SDK has already parsed the input into an object. Match the
        // ToolCall shape BooCode passes around in toolCallsBuffer downstream.
        toolCalls.push({
          id: part.toolCallId,
          name: part.toolName,
          args: (part.input ?? {}) as Record<string, unknown>,
        });
        break;
      }
      case 'reasoning-delta': {
        // v1.13.1-C: accumulate; finalizeCompletion / executeToolPhase
        // dual-write the resulting text as a kind='reasoning' part.
        if (typeof part.text === 'string') {
          reasoningAccumulated += part.text;
        }
        break;
      }
      case 'finish': {
        if (typeof part.finishReason === 'string') {
          finishReason = part.finishReason;
        }
        break;
      }
      case 'error': {
        // Surface stream errors as exceptions; non-Error payloads are
        // stringified into a fresh Error so callers always catch an Error.
        const err = part.error;
        throw err instanceof Error ? err : new Error(String(err));
      }
      // Intentional no-op: start, start-step, text-start, text-end,
      // reasoning-start, reasoning-end, source, file, tool-input-start,
      // tool-input-delta, tool-input-end, tool-result, tool-error,
      // finish-step, raw. We only care about the aggregated tool-call and
      // text-delta paths above; the rest are AI SDK lifecycle/streaming
      // breadcrumbs that don't change BooCode's persistence or WS contract.
      default:
        break;
    }
  }

  // v1.13.1-A: drain any buffered partial XML opener as plain text. The
  // pre-AI-SDK path did this on stream end too — better to leak `<tool_c`
  // than vanish the text.
  // NOTE: this runs before the abort check below, so an aborted stream can
  // still produce one final onDelta call.
  if (pendingBuffer.length > 0) {
    content += pendingBuffer;
    onDelta(pendingBuffer);
    pendingBuffer = '';
  }

  // AI SDK v6 fullStream returns normally on abort; check signal explicitly.
  // Without this throw the row would land as status='complete' with partial
  // content instead of going through handleAbortOrError → status='cancelled'.
  // Smoke D caught this in v1.13.1-A — don't refactor it away.
  if (signal?.aborted) {
    const abortErr = new Error('aborted');
    abortErr.name = 'AbortError';
    throw abortErr;
  }

  // Usage lands as a promise on the result; awaiting after fullStream is
  // drained is safe. AI SDK v6 names: `inputTokens` / `outputTokens`.
  let promptTokens: number | null = null;
  let completionTokens: number | null = null;
  try {
    const usage = await result.usage;
    if (typeof usage.inputTokens === 'number') promptTokens = usage.inputTokens;
    if (typeof usage.outputTokens === 'number') completionTokens = usage.outputTokens;
  } catch {
    // Some providers omit usage on partial streams; leave both null.
  }

  // Skip the callback entirely when neither count was reported.
  if (onUsage && (promptTokens !== null || completionTokens !== null)) {
    onUsage(promptTokens, completionTokens);
  }

  if (reasoningAccumulated.length > 0) {
    ctx.log.debug(
      { reasoningChars: reasoningAccumulated.length, model, elapsed_ms: Date.now() - startedAt },
      'streamCompletion: captured reasoning',
    );
  }

  return {
    finishReason,
    content,
    toolCalls,
    promptTokens,
    completionTokens,
    reasoning: reasoningAccumulated,
  };
}