diff --git a/apps/server/src/services/inference.ts b/apps/server/src/services/inference.ts index 8c4af97..dd23623 100644 --- a/apps/server/src/services/inference.ts +++ b/apps/server/src/services/inference.ts @@ -310,6 +310,70 @@ interface StreamOptions { temperature?: number; } +// v1.10.5 Qwen-coder XML fallback. Some local models (notably qwen3-coder via +// llama-swap) emit tool calls as inline XML inside delta.content rather than +// the structured delta.tool_calls field. The XML shape is: +// +// +// +// VALUE +// +// ...more parameters... +// +// +// Multiple blocks may appear back-to-back; they never nest. +// streamCompletion buffers delta.content, extracts complete blocks, parses +// them via parseXmlToolCall, and pushes synthetic entries into the existing +// toolCallsBuffer alongside any native JSON-format tool calls. +const XML_TOOL_OPEN = ''; +const XML_TOOL_CLOSE = ''; + +function parseXmlToolCall( + block: string, +): { name: string; args: Record } | null { + const nameMatch = block.match(/]+)>/); + if (!nameMatch || !nameMatch[1]) return null; + const name = nameMatch[1].trim(); + if (!name) return null; + const args: Record = {}; + // Non-greedy body so each … pair is matched + // independently even when multiple appear in the same block. + const paramRe = /]+)>([\s\S]*?)<\/parameter>/g; + for (const m of block.matchAll(paramRe)) { + const key = (m[1] ?? '').trim(); + if (!key) continue; + const raw = (m[2] ?? '').trim(); + try { + args[key] = JSON.parse(raw); + } catch { + args[key] = raw; + } + } + return { name, args }; +} + +// Locate the first character that begins (or completely contains) an +// unfinished opener in `s`. Returns -1 when `s` can be flushed +// to the client in full without risking a partial tag leak. +// Case 1: a full `` opener with no matching closer — caller +// must keep everything from that index forward until the next +// chunk arrives with the closer. +// Case 2: `s` ends with a strict prefix of `` (e.g. ` pair before reaching this check. +function partialXmlOpenerStart(s: string): number { + const fullOpener = s.indexOf(XML_TOOL_OPEN); + if (fullOpener !== -1) return fullOpener; + const lastLt = s.lastIndexOf('<'); + if (lastLt === -1) return -1; + const suffix = s.slice(lastLt); + if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) { + return lastLt; + } + return -1; +} + async function streamCompletion( ctx: InferenceContext, model: string, @@ -344,6 +408,10 @@ async function streamCompletion( } let content = ''; + // v1.10.5: holds delta.content bytes that may contain a partial XML tool + // call. Anything not part of a (possibly forming) + // pair is flushed to content + onDelta as soon as we know it's safe. + let pendingBuffer = ''; let finishReason: string | null = null; let promptTokens: number | null = null; let completionTokens: number | null = null; @@ -377,8 +445,50 @@ async function streamCompletion( if (!choice) continue; const delta = choice.delta ?? {}; if (typeof delta.content === 'string' && delta.content.length > 0) { - content += delta.content; - onDelta(delta.content); + // v1.10.5 XML fallback. Append, then extract any complete tool_call + // blocks before deciding what's safe to flush as visible content. + pendingBuffer += delta.content; + while (true) { + const startIdx = pendingBuffer.indexOf(XML_TOOL_OPEN); + if (startIdx === -1) break; + const closeIdx = pendingBuffer.indexOf(XML_TOOL_CLOSE, startIdx); + if (closeIdx === -1) break; + const blockEnd = closeIdx + XML_TOOL_CLOSE.length; + const block = pendingBuffer.slice(startIdx, blockEnd); + // Any text before the opener is plain content — flush it now. + if (startIdx > 0) { + const before = pendingBuffer.slice(0, startIdx); + content += before; + onDelta(before); + } + const parsedCall = parseXmlToolCall(block); + if (parsedCall) { + const synthIdx = toolCallsBuffer.size; + toolCallsBuffer.set(synthIdx, { + id: `xml_call_${synthIdx}`, + name: parsedCall.name, + argsText: JSON.stringify(parsedCall.args), + }); + } + // If parsing failed we still drop the block — emitting unparseable + // XML to the chat would look worse than silently swallowing it. + pendingBuffer = pendingBuffer.slice(blockEnd); + } + // After all complete blocks are out, hold back any (partial or full) + // unclosed opener; flush the rest. + const partialIdx = partialXmlOpenerStart(pendingBuffer); + if (partialIdx >= 0) { + if (partialIdx > 0) { + const flush = pendingBuffer.slice(0, partialIdx); + content += flush; + onDelta(flush); + } + pendingBuffer = pendingBuffer.slice(partialIdx); + } else if (pendingBuffer.length > 0) { + content += pendingBuffer; + onDelta(pendingBuffer); + pendingBuffer = ''; + } } if (Array.isArray(delta.tool_calls)) { for (const tc of delta.tool_calls) { @@ -393,6 +503,15 @@ async function streamCompletion( if (choice.finish_reason) finishReason = choice.finish_reason; } + // v1.10.5: if the stream ended mid-XML (e.g. model truncated, no closer + // ever arrived), flush whatever was buffered as plain content so it isn't + // silently dropped. Better to show a stray `` than vanish text. + if (pendingBuffer.length > 0) { + content += pendingBuffer; + onDelta(pendingBuffer); + pendingBuffer = ''; + } + const toolCalls: ToolCall[] = []; for (const [, t] of [...toolCallsBuffer.entries()].sort(([a], [b]) => a - b)) { let args: Record = {};