diff --git a/apps/server/src/services/inference.ts b/apps/server/src/services/inference.ts
index 8c4af97..dd23623 100644
--- a/apps/server/src/services/inference.ts
+++ b/apps/server/src/services/inference.ts
@@ -310,6 +310,70 @@ interface StreamOptions {
temperature?: number;
}
+// v1.10.5 Qwen-coder XML fallback. Some local models (notably qwen3-coder via
+// llama-swap) emit tool calls as inline XML inside delta.content rather than
+// the structured delta.tool_calls field. The XML shape is:
+//
+//   <tool_call>
+//   <function=NAME>
+//   <parameter=KEY>
+//   VALUE
+//   </parameter>
+//   ...more parameters...
+//   </function>
+//   </tool_call>
+//
+// Multiple <tool_call> blocks may appear back-to-back; they never nest.
+// streamCompletion buffers delta.content, extracts complete blocks, parses
+// them via parseXmlToolCall, and pushes synthetic entries into the existing
+// toolCallsBuffer alongside any native JSON-format tool calls.
+//
+// Both delimiters must be non-empty: the extraction loop locates blocks with
+// indexOf(), and an empty needle matches at offset 0 on every pass, so the
+// buffer would never shrink and the loop would never terminate.
+const XML_TOOL_OPEN = '<tool_call>';
+const XML_TOOL_CLOSE = '</tool_call>';
+
+// Parse one complete <tool_call>…</tool_call> block into a tool name plus an
+// argument map.
+//
+// - The tool name is the text captured after `<function=` up to the next `>`,
+//   trimmed. Returns null when no such tag is present or the name is empty
+//   after trimming.
+// - Every <parameter=KEY>VALUE</parameter> pair becomes args[KEY]. VALUE is
+//   trimmed and passed through JSON.parse so numbers, booleans, arrays and
+//   objects keep their types; text that is not valid JSON is stored as the
+//   trimmed string. When a KEY repeats, the last occurrence wins.
+function parseXmlToolCall(
+  block: string,
+): { name: string; args: Record<string, unknown> } | null {
+  const nameMatch = block.match(/<function=([^>]+)>/);
+  if (!nameMatch || !nameMatch[1]) return null;
+  const name = nameMatch[1].trim();
+  if (!name) return null;
+  const args: Record<string, unknown> = {};
+  // Non-greedy body so each <parameter=KEY>…</parameter> pair is matched
+  // independently even when multiple appear in the same block.
+  const paramRe = /<parameter=([^>]+)>([\s\S]*?)<\/parameter>/g;
+  for (const m of block.matchAll(paramRe)) {
+    const key = (m[1] ?? '').trim();
+    if (!key) continue;
+    const raw = (m[2] ?? '').trim();
+    try {
+      args[key] = JSON.parse(raw);
+    } catch {
+      // Not JSON (e.g. a bare path or free text): keep the literal string.
+      args[key] = raw;
+    }
+  }
+  return { name, args };
+}
+
+// Find where a still-forming tool-call opener begins inside `s`, so the
+// caller knows how much of `s` can be emitted without leaking a partial tag.
+//
+// Returns, in priority order:
+//   1. the index of the first complete XML_TOOL_OPEN found in `s`
+//      (streamCompletion removes complete opener…closer pairs before calling
+//      this, so an opener that is still present has no closer yet);
+//   2. the index of the last `<` in `s`, when the text from that `<` to the
+//      end of `s` is a strict (shorter) prefix of XML_TOOL_OPEN;
+//   3. -1 when neither applies and all of `s` may be flushed.
+function partialXmlOpenerStart(s: string): number {
+  const openerAt = s.indexOf(XML_TOOL_OPEN);
+  if (openerAt !== -1) {
+    return openerAt;
+  }
+
+  const tailStart = s.lastIndexOf('<');
+  if (tailStart === -1) {
+    return -1;
+  }
+
+  // Only the text after the final `<` can be the start of a tag that the
+  // next chunk might complete.
+  const tail = s.slice(tailStart);
+  const isStrictPrefix =
+    tail.length < XML_TOOL_OPEN.length && XML_TOOL_OPEN.startsWith(tail);
+  return isStrictPrefix ? tailStart : -1;
+}
+
async function streamCompletion(
ctx: InferenceContext,
model: string,
@@ -344,6 +408,10 @@ async function streamCompletion(
}
let content = '';
+ // v1.10.5: holds delta.content bytes that may contain a partial XML tool
+ // call. Anything not part of a (possibly forming) <tool_call>…</tool_call>
+ // pair is flushed to content + onDelta as soon as we know it's safe.
+ let pendingBuffer = '';
let finishReason: string | null = null;
let promptTokens: number | null = null;
let completionTokens: number | null = null;
@@ -377,8 +445,50 @@ async function streamCompletion(
if (!choice) continue;
const delta = choice.delta ?? {};
if (typeof delta.content === 'string' && delta.content.length > 0) {
- content += delta.content;
- onDelta(delta.content);
+ // v1.10.5 XML fallback. Append, then extract any complete tool_call
+ // blocks before deciding what's safe to flush as visible content.
+ pendingBuffer += delta.content;
+ while (true) {
+ const startIdx = pendingBuffer.indexOf(XML_TOOL_OPEN);
+ if (startIdx === -1) break;
+ const closeIdx = pendingBuffer.indexOf(XML_TOOL_CLOSE, startIdx);
+ if (closeIdx === -1) break;
+ const blockEnd = closeIdx + XML_TOOL_CLOSE.length;
+ const block = pendingBuffer.slice(startIdx, blockEnd);
+ // Any text before the opener is plain content — flush it now.
+ if (startIdx > 0) {
+ const before = pendingBuffer.slice(0, startIdx);
+ content += before;
+ onDelta(before);
+ }
+ const parsedCall = parseXmlToolCall(block);
+ if (parsedCall) {
+ const synthIdx = toolCallsBuffer.size;
+ toolCallsBuffer.set(synthIdx, {
+ id: `xml_call_${synthIdx}`,
+ name: parsedCall.name,
+ argsText: JSON.stringify(parsedCall.args),
+ });
+ }
+ // If parsing failed we still drop the block — emitting unparseable
+ // XML to the chat would look worse than silently swallowing it.
+ pendingBuffer = pendingBuffer.slice(blockEnd);
+ }
+ // After all complete blocks are out, hold back any (partial or full)
+ // unclosed opener; flush the rest.
+ const partialIdx = partialXmlOpenerStart(pendingBuffer);
+ if (partialIdx >= 0) {
+ if (partialIdx > 0) {
+ const flush = pendingBuffer.slice(0, partialIdx);
+ content += flush;
+ onDelta(flush);
+ }
+ pendingBuffer = pendingBuffer.slice(partialIdx);
+ } else if (pendingBuffer.length > 0) {
+ content += pendingBuffer;
+ onDelta(pendingBuffer);
+ pendingBuffer = '';
+ }
}
if (Array.isArray(delta.tool_calls)) {
for (const tc of delta.tool_calls) {
@@ -393,6 +503,15 @@ async function streamCompletion(
if (choice.finish_reason) finishReason = choice.finish_reason;
}
+ // v1.10.5: if the stream ended mid-XML (e.g. model truncated, no closer
+ // ever arrived), flush whatever was buffered as plain content so it isn't
+ // silently dropped. Better to show a stray `<tool_call>` than to lose text.
+ if (pendingBuffer.length > 0) {
+ content += pendingBuffer;
+ onDelta(pendingBuffer);
+ pendingBuffer = '';
+ }
+
const toolCalls: ToolCall[] = [];
for (const [, t] of [...toolCallsBuffer.entries()].sort(([a], [b]) => a - b)) {
let args: Record = {};