// P5 (SPLIT SKETCH): the generic AI-SDK adapter, split out of stream-phase.ts.
// This module is the v1.13.1-A streamText adapter and nothing else — it has NO
// SQL, broker, or BooCode persistence dependencies (its only `ctx` access is
// config + log), so it can be unit-tested without standing up a DB or broker.
// stream-phase.ts (the I/O layer) re-exports the public names below so existing
// importers (`./stream-phase.js`) are unchanged.
import type { FastifyBaseLogger } from 'fastify';
import type { Config } from '../../config.js';
import type { Agent, ToolCall } from '../../types/api.js';
import type { ToolJsonSchema } from '../tools.js';
import type { OpenAiMessage } from './payload.js';
import { extractToolCallBlocks } from './tool-call-parser.js';
import type { StreamResult } from './types.js';
import { upstreamModel } from './provider.js';
import {
  jsonSchema,
  streamText,
  tool,
  type JSONValue,
  type ModelMessage,
  type ToolCallRepairFunction,
} from 'ai';

// The slice of InferenceContext the adapter actually needs. Narrowing it here
// (instead of taking the full InferenceContext) keeps the adapter free of the
// SQL/broker/publish surface. InferenceContext structurally satisfies this, so
// callers pass their ctx unchanged.
export interface StreamAdapterContext {
  config: Config;
  log: FastifyBaseLogger;
}

// Per-request knobs for streamCompletion: the tool schemas plus the sampler
// settings. Every sampler field is optional; null and undefined both mean
// "not set" and the field is left out of the request.
export interface StreamOptions {
  // null = omit tools entirely (compact phase); [] = caller stripped all tools
  // (rare; we still omit from the request body to avoid OpenAI 400).
  tools: ToolJsonSchema[] | null;
  temperature?: number;
  top_p?: number | null;
  top_k?: number | null;
  min_p?: number | null;
  presence_penalty?: number | null;
  // v2.6 sampling-streamjson-tokens (#11): llama.cpp sampler extensions. These
  // are NOT standard AI-SDK streamText options and are NOT serialized by the
  // openai-compatible provider's standardized-settings path (topK is even
  // explicitly dropped with an "unsupported feature: topK" warning). They reach
  // llama-server only via providerOptions.openaiCompatible (see
  // buildSamplerProviderOptions).
  top_n_sigma?: number | null;
  dry_multiplier?: number | null;
  dry_base?: number | null;
  dry_allowed_length?: number | null;
  dry_penalty_last_n?: number | null;
}

// P5: the 10-field sampler-options literal that was copy-pasted at 4 sites
// (the three sentinel summaries + executeStreamPhase). Builds the StreamOptions
// sampler subset from an agent's frontmatter knobs. `temperature` is
// `agent?.temperature` (already number|undefined); the nullable fields strip
// null → undefined so they're omitted from the request body when unset. Keep
// this in lockstep with the StreamOptions sampler fields — a new sampler knob
// (the v2.7.3 dry_* family did this) is added here once instead of at 4 sites.
//
// SamplerOpts is StreamOptions minus `tools`, i.e. exactly the ten sampler
// fields, so a knob added to StreamOptions shows up here automatically.
export type SamplerOpts = Omit<StreamOptions, 'tools'>;

// Returns all ten sampler keys for `agent`. A null agent, or a field the agent
// leaves null/undefined, yields undefined for that key. `??` (not `||`) is
// deliberate: a legitimate 0 (e.g. min_p: 0) must survive.
export function samplerOptsFromAgent(agent: Agent | null): SamplerOpts {
  return {
    temperature: agent?.temperature,
    top_p: agent?.top_p ?? undefined,
    top_k: agent?.top_k ?? undefined,
    min_p: agent?.min_p ?? undefined,
    presence_penalty: agent?.presence_penalty ?? undefined,
    top_n_sigma: agent?.top_n_sigma ?? undefined,
    dry_multiplier: agent?.dry_multiplier ?? undefined,
    dry_base: agent?.dry_base ?? undefined,
    dry_allowed_length: agent?.dry_allowed_length ?? undefined,
    dry_penalty_last_n: agent?.dry_penalty_last_n ?? undefined,
  };
}

// v2.6 #11: build the providerOptions.openaiCompatible extraBody object for the
// llama.cpp sampler extensions. @ai-sdk/openai-compatible (2.0.47) merges every
// non-reserved key under providerOptions.openaiCompatible straight into the
// chat-completion request body (see its getArgs: the Object.fromEntries spread
// filtered against openaiCompatibleLanguageModelChatOptions.shape).
// This is the ONLY working passthrough for these params:
//   - top_k / min_p were latently dropped before this: top_k was passed as the
//     AI-SDK `topK` setting which the openai-compatible provider rejects as
//     unsupported; min_p was never passed to streamText at all.
//   - top_n_sigma + the dry_* family have no AI-SDK equivalent.
// Keys use llama-server's snake_case body names so they land verbatim.
//
// Only fields whose value is a number are copied (null/undefined are skipped,
// 0 is kept). Returns undefined when nothing was copied so the caller can omit
// providerOptions altogether.
function buildSamplerProviderOptions(opts: StreamOptions): Record<string, JSONValue> | undefined {
  const body: Record<string, JSONValue> = {};
  if (typeof opts.top_k === 'number') body.top_k = opts.top_k;
  if (typeof opts.min_p === 'number') body.min_p = opts.min_p;
  if (typeof opts.top_n_sigma === 'number') body.top_n_sigma = opts.top_n_sigma;
  if (typeof opts.dry_multiplier === 'number') body.dry_multiplier = opts.dry_multiplier;
  if (typeof opts.dry_base === 'number') body.dry_base = opts.dry_base;
  if (typeof opts.dry_allowed_length === 'number') body.dry_allowed_length = opts.dry_allowed_length;
  if (typeof opts.dry_penalty_last_n === 'number') body.dry_penalty_last_n = opts.dry_penalty_last_n;
  return Object.keys(body).length > 0 ? body : undefined;
}

// v1.13.1-A: convert BooCode's OpenAI-shaped history into AI SDK
// ModelMessage[]. Tool result messages need a `toolName` field that the
// OpenAI shape doesn't carry; we look it up by scanning earlier assistant
// `tool_calls` entries for a matching id.
//
// Messages whose role is none of system/user/assistant/tool are dropped.
function toModelMessages(messages: OpenAiMessage[]): ModelMessage[] {
  // Pass 1: tool_call id → tool name, so tool results can be labelled below.
  const toolNameById = new Map<string, string>();
  for (const m of messages) {
    if (m.role === 'assistant' && m.tool_calls) {
      for (const tc of m.tool_calls) {
        toolNameById.set(tc.id, tc.function.name);
      }
    }
  }

  // Pass 2: translate each message in order.
  const out: ModelMessage[] = [];
  for (const m of messages) {
    if (m.role === 'system' || m.role === 'user') {
      out.push({ role: m.role, content: m.content ?? '' });
      continue;
    }

    if (m.role === 'assistant') {
      const hasTools = (m.tool_calls?.length ?? 0) > 0;
      // Narrow once instead of asserting non-null at the push site.
      const reasoning = typeof m.reasoning === 'string' ? m.reasoning : '';
      const hasReasoning = reasoning.length > 0;
      if (!hasTools && !hasReasoning) {
        // Bare text assistant (string content). null content + no tool_calls
        // is degenerate but harmless to forward.
        out.push({ role: 'assistant', content: m.content ?? '' });
        continue;
      }
      // v1.13.1-C: AI SDK ReasoningPart precedes text + tool-calls in the
      // assistant content array. Reasoning models (qwen3.6) consume their
      // prior reasoning context to resume mid-thought across tool boundaries.
      const parts: Array<
        | { type: 'reasoning'; text: string }
        | { type: 'text'; text: string }
        | { type: 'tool-call'; toolCallId: string; toolName: string; input: unknown }
      > = [];
      if (hasReasoning) {
        parts.push({ type: 'reasoning', text: reasoning });
      }
      if (m.content && m.content.length > 0) {
        parts.push({ type: 'text', text: m.content });
      }
      for (const tc of m.tool_calls ?? []) {
        let input: unknown = {};
        try {
          // Empty argument string means "no arguments", not invalid JSON.
          input = tc.function.arguments.length > 0 ? JSON.parse(tc.function.arguments) : {};
        } catch {
          // Malformed args from a prior turn: pass through as a raw blob so
          // the model sees the same shape it emitted. Wraps the string under
          // _raw to match the buildMessagesPayload upstream convention.
          input = { _raw: tc.function.arguments };
        }
        parts.push({ type: 'tool-call', toolCallId: tc.id, toolName: tc.function.name, input });
      }
      out.push({ role: 'assistant', content: parts });
      continue;
    }

    if (m.role === 'tool') {
      const toolCallId = m.tool_call_id ?? '';
      // No matching assistant tool_call in this history → 'unknown'.
      const toolName = toolNameById.get(toolCallId) ?? 'unknown';
      const raw = m.content ?? '';
      let output: { type: 'text'; value: string } | { type: 'json'; value: JSONValue };
      try {
        // JSON.parse returns `any`; cast to JSONValue since the upstream
        // tool_results column is already JSON-serializable by construction.
        output = { type: 'json', value: JSON.parse(raw) as JSONValue };
      } catch {
        // Not JSON (including the empty string): forward verbatim as text.
        output = { type: 'text', value: raw };
      }
      out.push({
        role: 'tool',
        content: [{ type: 'tool-result', toolCallId, toolName, output }],
      });
      continue;
    }
  }
  return out;
}

// Build the AI SDK tools record from BooCode's JSON-schema tool definitions.
// No `execute` field: BooCode runs tools itself in tool-phase.ts; streamText
// surfaces the tool-call parts via fullStream and we capture them for the
// outer loop to dispatch. Keyed by function name, so a later schema with the
// same name replaces an earlier one.
function buildAiTools(schemas: ToolJsonSchema[]): Record<string, ReturnType<typeof tool>> {
  const out: Record<string, ReturnType<typeof tool>> = {};
  for (const s of schemas) {
    out[s.function.name] = tool({
      description: s.function.description,
      inputSchema: jsonSchema(s.function.parameters),
    });
  }
  return out;
}

// v1.10.5 Qwen-coder XML fallback. Some local models (notably qwen3-coder via
// llama-swap) emit tool calls as inline XML inside delta.content rather than
// the structured tool_calls field. We extract them out of the streamed text
// before flushing it to the client.
//
// NOTE(review): the tag spellings in the two examples below are reconstructed;
// tool-call-parser.ts is authoritative for the exact markup accepted.
//
// Qwen shape:
//
//   <tool_call>
//   <function=NAME>
//   <parameter=KEY>VALUE</parameter>
//   ...
//   </function>
//   </tool_call>
//
// v1.13.16: also recognize Anthropic markup that qwen3.6-35b-a3b-mxfp4
// drifts to (training-data residue from Claude Code documentation):
//
//   <invoke name="NAME">
//   <parameter name="KEY">VALUE</parameter>
//   </invoke>
//
// Both formats share the synthetic xml_call_${idx} ID space; the counter
// increments across whichever opener appears first. Multiple blocks may
// appear back-to-back in either format and they never nest.
//
// streamCompletion runs one streamText call and folds its fullStream into a
// StreamResult.
//   ctx      config + logger only (see StreamAdapterContext)
//   model    model id handed to upstreamModel
//   messages OpenAI-shaped history, converted via toModelMessages
//   opts     tools + sampler knobs; null or empty tools omits tools entirely
//   onDelta  called with each chunk of user-visible text as it is flushed
//   onUsage  called once after the stream if at least one token count is known
//   signal   abort signal; an aborted signal makes this throw an AbortError
//   agent    forwarded to upstreamModel (null when absent)
// Throws the stream's error part (wrapped in Error if it is not one already),
// or an Error named 'AbortError' when `signal` is aborted.
export async function streamCompletion(
  ctx: StreamAdapterContext,
  model: string,
  messages: OpenAiMessage[],
  opts: StreamOptions,
  onDelta: (content: string) => void,
  onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
  signal?: AbortSignal,
  agent?: Agent | null,
): Promise<StreamResult> {
  const aiMessages = toModelMessages(messages);
  // null and [] both mean "send no tools"; the narrowing makes `!` unnecessary.
  const aiTools =
    opts.tools !== null && opts.tools.length > 0 ? buildAiTools(opts.tools) : undefined;
  const startedAt = Date.now();

  // v1.13.1-C: accumulate reasoning text across reasoning-delta parts.
  // qwen3.6 emits these on a separate channel from text content; we capture
  // them per stream so finalizeCompletion can dual-write a 'reasoning' part.
  // Replaces the v1.13.1-A counter-only diagnostic.
  let reasoningAccumulated = '';

  // v1.13.3: experimental_repairToolCall keeps the stream alive when the
  // model emits a malformed tool call (bad JSON args, unknown name, etc.).
  // Without a repair function streamText throws and the WHOLE stream dies;
  // with one, the SDK invokes us and we route the bad call through normally.
  // Strategy: pass through unmodified. executeToolPhase's existing error
  // path (unknown tool name → "unknown tool: X" result; zod-reject → tool
  // 'X' rejected — fieldname: required) already gives the model a clean
  // recovery surface on the next turn. Logging gives us visibility into
  // how often qwen3.6 actually emits broken calls.
  const repairToolCall: ToolCallRepairFunction<ReturnType<typeof buildAiTools>> = async ({
    toolCall,
    error,
  }) => {
    ctx.log.warn(
      {
        toolCallId: toolCall.toolCallId,
        toolName: toolCall.toolName,
        error: error.message,
      },
      'malformed tool call surfaced via repairToolCall',
    );
    return toolCall;
  };

  // v2.6 #11: llama.cpp sampler extensions (top_k, min_p, top_n_sigma, dry_*)
  // ride providerOptions.openaiCompatible — they are NOT standardized streamText
  // settings. NB: top_k used to be passed below as the AI-SDK `topK` setting;
  // the openai-compatible provider dropped it with an "unsupported feature: topK"
  // warning and min_p was never wired at all, so both were dead on the wire
  // before this. They now go through the same extraBody path as the new params.
  const samplerBody = buildSamplerProviderOptions(opts);

  // Each optional setting is spread in only when set, so unset knobs are absent
  // from the call rather than present-but-undefined.
  const result = streamText({
    model: upstreamModel(ctx.config, model, agent ?? null),
    messages: aiMessages,
    ...(aiTools
      ? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall }
      : {}),
    ...(typeof opts.temperature === 'number' ? { temperature: opts.temperature } : {}),
    ...(typeof opts.top_p === 'number' ? { topP: opts.top_p } : {}),
    ...(typeof opts.presence_penalty === 'number' ? { presencePenalty: opts.presence_penalty } : {}),
    ...(samplerBody ? { providerOptions: { openaiCompatible: samplerBody } } : {}),
    abortSignal: signal,
  });

  let content = '';
  // Text received but not yet flushed (may hold a partial XML opener).
  let pendingBuffer = '';
  let finishReason: string | null = null;
  // v1.13.1-A: AI SDK emits one `tool-call` part per fully-aggregated call,
  // so we no longer need the OpenAI-index reassembly map the manual SSE
  // parser used. XML tool calls extracted from text content go into the
  // same flat list and keep the v1.10.5 synthetic id convention.
  const toolCalls: ToolCall[] = [];

  for await (const part of result.fullStream) {
    switch (part.type) {
      case 'text-delta': {
        pendingBuffer += part.text;
        // v1.13.16: unified extraction. The helper finds the earliest-opening
        // complete tool-call block (either markup family), flushes prose
        // between/around them, holds any partial opener for the next chunk,
        // and silently drops blocks that fail to parse (matches pre-v1.13.16
        // behavior).
        const extracted = extractToolCallBlocks(pendingBuffer);
        if (extracted.flushed.length > 0) {
          content += extracted.flushed;
          onDelta(extracted.flushed);
        }
        for (const call of extracted.calls) {
          // Index is the running length of the shared list, so synthetic ids
          // stay unique even when structured calls are interleaved.
          const synthIdx = toolCalls.length;
          toolCalls.push({
            id: `xml_call_${synthIdx}`,
            name: call.name,
            args: call.args,
          });
        }
        pendingBuffer = extracted.remaining;
        break;
      }
      case 'tool-call': {
        // AI SDK has already parsed the input into an object. Match the
        // ToolCall shape BooCode passes around in toolCallsBuffer downstream.
        toolCalls.push({
          id: part.toolCallId,
          name: part.toolName,
          args: (part.input ?? {}) as Record<string, unknown>,
        });
        break;
      }
      case 'reasoning-delta': {
        // v1.13.1-C: accumulate; finalizeCompletion / executeToolPhase
        // dual-write the resulting text as a kind='reasoning' part.
        if (typeof part.text === 'string') {
          reasoningAccumulated += part.text;
        }
        break;
      }
      case 'finish': {
        if (typeof part.finishReason === 'string') {
          finishReason = part.finishReason;
        }
        break;
      }
      case 'error': {
        const err = part.error;
        throw err instanceof Error ? err : new Error(String(err));
      }
      // Intentional no-op: start, start-step, text-start, text-end,
      // reasoning-start, reasoning-end, source, file, tool-input-start,
      // tool-input-delta, tool-input-end, tool-result, tool-error,
      // finish-step, raw. We only care about the aggregated tool-call and
      // text-delta paths above; the rest are AI SDK lifecycle/streaming
      // breadcrumbs that don't change BooCode's persistence or WS contract.
      default:
        break;
    }
  }

  // v1.13.1-A: drain any buffered partial XML opener as plain text. The
  // pre-AI-SDK path did this on stream end too — better to leak a partial
  // opener into the message than to silently lose text.
  if (pendingBuffer.length > 0) {
    content += pendingBuffer;
    onDelta(pendingBuffer);
    pendingBuffer = '';
  }

  // AI SDK v6 fullStream returns normally on abort; check signal explicitly.
  // Without this throw the row would land as status='complete' with partial
  // content instead of going through handleAbortOrError → status='cancelled'.
  // Smoke D caught this in v1.13.1-A — don't refactor it away.
  if (signal?.aborted) {
    const abortErr = new Error('aborted');
    abortErr.name = 'AbortError';
    throw abortErr;
  }

  // Usage lands as a promise on the result; awaiting after fullStream is
  // drained is safe. AI SDK v6 names: `inputTokens` / `outputTokens`.
  let promptTokens: number | null = null;
  let completionTokens: number | null = null;
  try {
    const usage = await result.usage;
    if (typeof usage.inputTokens === 'number') promptTokens = usage.inputTokens;
    if (typeof usage.outputTokens === 'number') completionTokens = usage.outputTokens;
  } catch {
    // Some providers omit usage on partial streams; leave both null.
  }
  if (onUsage && (promptTokens !== null || completionTokens !== null)) {
    onUsage(promptTokens, completionTokens);
  }

  if (reasoningAccumulated.length > 0) {
    ctx.log.debug(
      { reasoningChars: reasoningAccumulated.length, model, elapsed_ms: Date.now() - startedAt },
      'streamCompletion: captured reasoning',
    );
  }

  return {
    finishReason,
    content,
    toolCalls,
    promptTokens,
    completionTokens,
    reasoning: reasoningAccumulated,
  };
}