Batch 1 — tool-call-parser.ts: replaces xml-parser.ts with a port of
Unsloth's tool_call_parser.py. Adds balanced-brace JSON scanner,
single-param fast path, hasToolSignal/stripToolMarkup/parseToolCallsFromText
exports, and stream-finalization stripping at all three final-write sites
(error-handler, finalizeCompletion, executeToolPhase). Anthropic <invoke>
shape preserved. 75+12 tests.
Batch 2 — web/html-to-md.ts: parse5 tree-walking HTML-to-Markdown converter
ported from Unsloth's _html_to_md.py. Replaces web_fetch's regex stripHtml
with structured markdown output (headings, links, lists, tables, code blocks,
blockquotes, entity decoding). 29 tests.
Batch 3 — llama-args-validator.ts: port of llama_server_args.py deny-list
validator. Wired into AGENTS.md frontmatter parser — llama_extra_args field
validated at load time, rejects managed flags (model identity, networking,
auth/TLS, server UI). No runtime consumer yet (llama-swap boundary). 76 tests.
All three files carry SPDX-License-Identifier: AGPL-3.0-only headers.
LICENSE flipped to AGPL-3.0-only in prior commit (a938cf1).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
427 lines
15 KiB
TypeScript
427 lines
15 KiB
TypeScript
// SPDX-License-Identifier: AGPL-3.0-only
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
// Ported from studio/backend/core/inference/tool_call_parser.py.
// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/tool_call_parser.py

// ── Constants ────────────────────────────────────────────────────────────

/** Opening tag of a `<tool_call>…</tool_call>` block. */
export const XML_TOOL_OPEN = '<tool_call>';
/** Closing tag paired with {@link XML_TOOL_OPEN}. */
export const XML_TOOL_CLOSE = '</tool_call>';
/**
 * Prefix of an `<invoke name="…">` opener. There is deliberately no trailing
 * `>` because the tag carries a `name` attribute.
 */
export const INVOKE_TOOL_OPEN = '<invoke';
/** Closing tag paired with {@link INVOKE_TOOL_OPEN}. */
export const INVOKE_TOOL_CLOSE = '</invoke>';

/** Substrings that indicate text may contain tool-call markup. */
export const TOOL_XML_SIGNALS = [XML_TOOL_OPEN, '<function=', INVOKE_TOOL_OPEN] as const;

/**
 * Leading strings of tool results that denote a failure.
 *
 * NOTE(review): not consumed anywhere in this file; presumably callers test
 * tool output with `startsWith` against these — confirm at the call sites.
 */
export const TOOL_ERROR_PREFIXES = [
  'Error',
  'Search failed',
  'Execution error',
  'Blocked:',
  'Exit code',
  'Failed to fetch',
  'Failed to resolve',
  'No query provided',
] as const;

/** Message telling the model it repeated an identical tool call. */
export const DUPLICATE_CALL_NUDGE =
  'You already made this exact call. Do not repeat the same tool ' +
  'call. Try a different approach: fetch a URL from previous ' +
  'results, use Python to process data you already have, or ' +
  'provide your final answer now.';

/**
 * Message suggesting a different approach after a tool error. It starts with
 * two newlines, so it reads as a suffix appended to other text.
 */
export const TOOL_ERROR_NUDGE =
  '\n\nThe tool call encountered an issue. Please try a different ' +
  'approach or rephrase your request.';

/** Message telling the model that no tool calls remain and it must answer. */
export const BUDGET_EXHAUSTED_NUDGE =
  'You have used all available tool calls. Based on everything you ' +
  'have found so far, provide your final answer now. Do not call ' +
  'any more tools.';

// ── Strip patterns ───────────────────────────────────────────────────────

/** Complete tool-call blocks: an opener followed (lazily) by its closer. */
const TOOL_CLOSED_PATS: readonly RegExp[] = [
  /<tool_call>.*?<\/tool_call>/gs,
  /<function=\w+>.*?<\/function>/gs,
  /<invoke\s[^>]*>.*?<\/invoke>/gs,
];

/**
 * The closed patterns plus open-ended ones that swallow everything from an
 * unterminated opener to the end of the text. Order matters: closed blocks
 * are removed first so that the open-ended patterns only see leftovers.
 */
const TOOL_ALL_PATS: readonly RegExp[] = [
  ...TOOL_CLOSED_PATS,
  /<tool_call>.*$/gs,
  /<function=\w+>.*$/gs,
  /<invoke\s[^>]*>.*$/gs,
];

// ── Strip / signal ───────────────────────────────────────────────────────

/**
 * Removes tool-call markup from `text`.
 *
 * @param text - Text that may contain tool-call markup.
 * @param opts - With `final` set, unterminated openers and everything after
 *   them are removed as well, and the result is trimmed. Without it only
 *   complete blocks are removed and surrounding whitespace is left untouched.
 * @returns The text with the matching markup deleted.
 */
export function stripToolMarkup(text: string, opts?: { final?: boolean }): string {
  const isFinal = Boolean(opts?.final);
  const patterns = isFinal ? TOOL_ALL_PATS : TOOL_CLOSED_PATS;
  // String.replace with a global regex always scans from index 0, so sharing
  // the module-level pattern objects between calls is safe.
  const stripped = patterns.reduce((acc, pat) => acc.replace(pat, ''), text);
  return isFinal ? stripped.trim() : stripped;
}

/**
 * Reports whether `text` contains any of the `TOOL_XML_SIGNALS` substrings.
 *
 * This is a plain substring test; it does not check that the markup is
 * complete or well formed.
 */
export function hasToolSignal(text: string): boolean {
  return TOOL_XML_SIGNALS.some((signal) => text.includes(signal));
}

// ── parseToolCallsFromText (Unsloth port + Anthropic extension) ──────────

/** A parsed tool call; the name follows the OpenAI tool-call shape. */
export interface OpenAiToolCall {
  /** `call_<n>`, numbered from the caller-supplied offset. */
  id: string;
  type: 'function';
  /**
   * `arguments` is a string: either the model's own string value or the
   * `JSON.stringify` of the parsed arguments.
   */
  function: { name: string; arguments: string };
}

const TC_JSON_START_RE = /<tool_call>\s*\{/g;
const TC_FUNC_START_RE = /<function=(\w+)>\s*/g;
const TC_END_TAG_RE = /<\/tool_call>/;
const TC_FUNC_CLOSE_RE = /\s*<\/function>\s*$/;
const TC_PARAM_START_RE = /<parameter=(\w+)>\s*/g;
const TC_PARAM_CLOSE_RE = /\s*<\/parameter>\s*$/;

const TC_INVOKE_START_RE = /<invoke\s+name\s*=\s*(?:"([^"]*)"|'([^']*)')\s*>/g;
const TC_INVOKE_END_TAG_RE = /<\/invoke>/;
const TC_INVOKE_CLOSE_RE = /\s*<\/invoke>\s*$/;
const TC_INVOKE_PARAM_RE = /<parameter\s+name\s*=\s*(?:"([^"]*)"|'([^']*)')\s*>/g;
const TC_INVOKE_PARAM_CLOSE_RE = /\s*<\/parameter>\s*$/;

/** Regex bundle describing one tag-based tool-call syntax. */
interface TaggedCallSyntax {
  /** Global regex for the call opener; the name is capture group 1 or 2. */
  readonly startRe: RegExp;
  /** Non-global regex for the tag that ends a call body. */
  readonly endTagRe: RegExp;
  /** Optional trailing closer stripped from the end of the body. */
  readonly bodyCloseRe: RegExp;
  /** Global regex for a parameter opener; the name is capture group 1 or 2. */
  readonly paramStartRe: RegExp;
  /** Optional trailing `</parameter>` stripped from each value. */
  readonly paramCloseRe: RegExp;
}

// Pattern 2: <function=name><parameter=key>value. The body ends at
// </tool_call> (or the next opener), not at </function>, because code
// parameter values can contain that literal.
const FUNCTION_SYNTAX: TaggedCallSyntax = {
  startRe: TC_FUNC_START_RE,
  endTagRe: TC_END_TAG_RE,
  bodyCloseRe: TC_FUNC_CLOSE_RE,
  paramStartRe: TC_PARAM_START_RE,
  paramCloseRe: TC_PARAM_CLOSE_RE,
};

// Pattern 3: <invoke name="..."><parameter name="...">value (Anthropic shape).
const INVOKE_SYNTAX: TaggedCallSyntax = {
  startRe: TC_INVOKE_START_RE,
  endTagRe: TC_INVOKE_END_TAG_RE,
  bodyCloseRe: TC_INVOKE_CLOSE_RE,
  paramStartRe: TC_INVOKE_PARAM_RE,
  paramCloseRe: TC_INVOKE_PARAM_CLOSE_RE,
};

/** Position of a named opener tag inside a string. */
interface TagStart {
  readonly name: string;
  /** Index of the first character of the tag. */
  readonly index: number;
  /** Index just past the tag, including any whitespace the regex consumed. */
  readonly end: number;
}

/**
 * Returns the index of the `}` that balances the `{` at `start`, or -1 when
 * the braces never balance. Braces inside double-quoted strings are ignored
 * and backslash escapes inside strings are skipped.
 */
function scanBalancedBraces(content: string, start: number): number {
  let depth = 0;
  let inString = false;
  for (let i = start; i < content.length; i++) {
    const ch = content[i];
    if (inString) {
      if (ch === '\\') {
        i++; // skip the escaped character as well
      } else if (ch === '"') {
        inString = false;
      }
    } else if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/** Collects every opener matched by the global regex `re`, skipping empty names. */
function findTagStarts(text: string, re: RegExp): TagStart[] {
  re.lastIndex = 0; // matchAll starts from the source regex's lastIndex
  const starts: TagStart[] = [];
  for (const match of text.matchAll(re)) {
    const name = (match[1] ?? match[2] ?? '').trim();
    if (!name) continue;
    const index = match.index ?? 0;
    starts.push({ name, index, end: index + match[0].length });
  }
  return starts;
}

/**
 * Splits a call body into parameter values. Each value runs from its opener
 * to the next opener (or the end of the body), so with a single parameter an
 * embedded `</parameter>` inside a code string is preserved; only a trailing
 * closer is removed.
 */
function sliceParamValues(body: string, syntax: TaggedCallSyntax): Record<string, string> {
  const params = findTagStarts(body, syntax.paramStartRe);
  const args: Record<string, string> = {};
  params.forEach((param, i) => {
    const valueEnd = params[i + 1]?.index ?? body.length;
    args[param.name] = body.slice(param.end, valueEnd).replace(syntax.paramCloseRe, '').trim();
  });
  return args;
}

/** Parses every call written in the given tag syntax; closing tags are optional. */
function parseTaggedCalls(
  content: string,
  syntax: TaggedCallSyntax,
): Array<{ name: string; args: Record<string, string> }> {
  const starts = findTagStarts(content, syntax.startRe);
  return starts.map((start, i) => {
    const nextStart = starts[i + 1]?.index ?? content.length;
    const endTag = syntax.endTagRe.exec(content.slice(start.end));
    const endAt = endTag ? start.end + endTag.index : content.length;
    const body = content
      .slice(start.end, Math.min(endAt, nextStart))
      .replace(syntax.bodyCloseRe, '');
    return { name: start.name, args: sliceParamValues(body, syntax) };
  });
}

/**
 * Extracts tool calls from free-form model output.
 *
 * Three syntaxes are tried in order, and a later one is only attempted when
 * the earlier ones produced no calls:
 *   1. `<tool_call>{json}</tool_call>`, located with a balanced-brace scanner;
 *   2. `<function=name><parameter=key>value`;
 *   3. `<invoke name="..."><parameter name="...">value`.
 *
 * @param content - Raw model text.
 * @param opts - `idOffset` is added to the running index used in `call_<n>`
 *   ids (default 0).
 * @returns The parsed calls in document order; empty when nothing matched.
 */
export function parseToolCallsFromText(
  content: string,
  opts?: { idOffset?: number },
): OpenAiToolCall[] {
  const idOffset = opts?.idOffset ?? 0;
  const toolCalls: OpenAiToolCall[] = [];
  const addCall = (name: string, args: string): void => {
    toolCalls.push({
      id: `call_${idOffset + toolCalls.length}`,
      type: 'function',
      function: { name, arguments: args },
    });
  };

  // Pattern 1: <tool_call>{json}</tool_call>.
  TC_JSON_START_RE.lastIndex = 0;
  let m: RegExpExecArray | null;
  while ((m = TC_JSON_START_RE.exec(content)) !== null) {
    const braceStart = m.index + m[0].length - 1;
    const braceEnd = scanBalancedBraces(content, braceStart);
    if (braceEnd === -1) continue;
    let obj: Record<string, unknown>;
    try {
      obj = JSON.parse(content.slice(braceStart, braceEnd + 1)) as Record<string, unknown>;
    } catch {
      continue; // malformed JSON -- skip, but keep scanning inside it
    }
    const name = typeof obj.name === 'string' ? obj.name : '';
    const rawArgs = obj.arguments ?? {};
    addCall(name, typeof rawArgs === 'string' ? rawArgs : JSON.stringify(rawArgs));
    // Resume after the consumed object. Without this, a literal
    // "<tool_call>{...}" inside one of its string values would be picked up
    // again and reported as an extra, bogus call.
    TC_JSON_START_RE.lastIndex = braceEnd + 1;
  }

  // Patterns 2 and 3: tag syntaxes, each only if nothing was found so far.
  for (const syntax of [FUNCTION_SYNTAX, INVOKE_SYNTAX]) {
    if (toolCalls.length > 0) break;
    for (const call of parseTaggedCalls(content, syntax)) {
      addCall(call.name, JSON.stringify(call.args));
    }
  }

  return toolCalls;
}

// ── BooCode streaming helpers ────────────────────────────────────────────

/** A tool call extracted from a markup block, with decoded argument values. */
export interface ParsedCall {
  /** Tool name taken from the block's opener. */
  name: string;
  /** Argument values keyed by parameter name. */
  args: Record<string, unknown>;
}

/** Exact (trimmed) strings treated as unfilled template values. */
const PLACEHOLDER_LITERALS = new Set(['...', 'placeholder', '<path>', '<file>']);
/** A value that is nothing but a single `<…>` token. */
const ANGLE_BRACKET_SENTINEL_RE = /^<[^>]+>$/;

/**
 * Reports whether an argument value looks like an unfilled placeholder.
 *
 * Only strings can be placeholders. After trimming, a string counts when it
 * is empty, is one of `PLACEHOLDER_LITERALS`, or is a single angle-bracket
 * token such as `<your-path>`.
 */
export function isPlaceholderArgValue(value: unknown): boolean {
  if (typeof value !== 'string') return false;
  const candidate = value.trim();
  return (
    candidate === '' ||
    PLACEHOLDER_LITERALS.has(candidate) ||
    ANGLE_BRACKET_SENTINEL_RE.test(candidate)
  );
}

/** True when at least one value in `args` is a placeholder. */
function hasPlaceholderArgs(args: Record<string, unknown>): boolean {
  return Object.values(args).some((value) => isPlaceholderArgValue(value));
}

/**
 * Emits a debug line for a call that was rejected because one of its
 * arguments is a placeholder. The structured fields are passed first and the
 * message second.
 */
function logRejectedPlaceholder(parsed: ParsedCall): void {
  const { name: toolName, args } = parsed;
  console.debug({ toolName, args }, 'rejected placeholder tool call at parse time');
}

const QWEN_FUNCTION_RE = /<function\s*=\s*([^>\s]+)\s*>/;
const QWEN_PARAM_RE = /<parameter\s*=\s*([^>\s]+)\s*>([\s\S]*?)<\/parameter>/g;

/**
 * Parses one `<function=name><parameter=key>value</parameter>…` block.
 *
 * Each parameter value is trimmed and decoded with `JSON.parse`; when that
 * fails the trimmed text is kept as a string. Only parameters with an
 * explicit `</parameter>` closer are recognised. A repeated key keeps the
 * last value.
 *
 * @param block - Text containing the function block.
 * @returns The parsed call, or `null` when no `<function=…>` opener is found.
 */
export function parseXmlToolCall(block: string): ParsedCall | null {
  const name = QWEN_FUNCTION_RE.exec(block)?.[1]?.trim();
  if (!name) return null;

  const entries: Array<[string, unknown]> = [];
  for (const match of block.matchAll(QWEN_PARAM_RE)) {
    const key = (match[1] ?? '').trim();
    if (!key) continue;
    const raw = (match[2] ?? '').trim();
    let value: unknown;
    try {
      value = JSON.parse(raw);
    } catch {
      value = raw; // not JSON -- keep the literal text
    }
    entries.push([key, value]);
  }
  // Object.fromEntries defines own properties, so a model-supplied key such
  // as "__proto__" becomes an ordinary entry instead of replacing the
  // prototype of the args object.
  return { name, args: Object.fromEntries(entries) };
}

const INVOKE_NAME_RE =
  /<invoke\s+name\s*=\s*("([^"]*)"|'([^']*)')\s*>/;
const INVOKE_PARAM_RE =
  /<parameter\s+name\s*=\s*("([^"]*)"|'([^']*)')\s*>([\s\S]*?)<\/parameter>/g;

/**
 * Parses one `<invoke name="…"><parameter name="…">value</parameter>…` block.
 * Attribute values may use double or single quotes.
 *
 * Each parameter value is trimmed and decoded with `JSON.parse`; when that
 * fails the trimmed text is kept as a string. Only parameters with an
 * explicit `</parameter>` closer are recognised. A repeated key keeps the
 * last value.
 *
 * @param block - Text containing the invoke block.
 * @returns The parsed call, or `null` when there is no opener or its name is
 *   empty.
 */
export function parseInvokeToolCall(block: string): ParsedCall | null {
  const opener = INVOKE_NAME_RE.exec(block);
  if (opener === null) return null;
  // Group 2 holds a double-quoted name, group 3 a single-quoted one.
  const name = (opener[2] ?? opener[3] ?? '').trim();
  if (!name) return null;

  const entries: Array<[string, unknown]> = [];
  for (const match of block.matchAll(INVOKE_PARAM_RE)) {
    const key = (match[2] ?? match[3] ?? '').trim();
    if (!key) continue;
    const raw = (match[4] ?? '').trim();
    let value: unknown;
    try {
      value = JSON.parse(raw);
    } catch {
      value = raw; // not JSON -- keep the literal text
    }
    entries.push([key, value]);
  }
  // Object.fromEntries defines own properties, so a model-supplied key such
  // as "__proto__" becomes an ordinary entry instead of replacing the
  // prototype of the args object.
  return { name, args: Object.fromEntries(entries) };
}

/** Openers that the streaming extractor holds back. */
const ALL_OPENERS = [XML_TOOL_OPEN, INVOKE_TOOL_OPEN] as const;

/**
 * Finds where tool-call markup begins, or might be beginning, in `s`.
 *
 * @returns The index of the earliest complete opener if one is present.
 *   Otherwise the index of the last `<` when the text from there to the end
 *   is a proper prefix of some opener (e.g. a chunk ending in `<tool_`).
 *   Otherwise -1.
 */
export function partialXmlOpenerStart(s: string): number {
  const fullHits = ALL_OPENERS
    .map((opener) => s.indexOf(opener))
    .filter((idx) => idx !== -1);
  if (fullHits.length > 0) return Math.min(...fullHits);

  // No complete opener. Openers contain no inner '<', so a half-written one
  // can only start at the last '<' in the string.
  const lastLt = s.lastIndexOf('<');
  if (lastLt === -1) return -1;
  const suffix = s.slice(lastLt);
  const isOpenerPrefix = ALL_OPENERS.some(
    (opener) => suffix.length < opener.length && opener.startsWith(suffix),
  );
  return isOpenerPrefix ? lastLt : -1;
}

/** Result of scanning a streaming buffer for tool-call blocks. */
export interface ToolCallExtraction {
  /** Text that is not part of an accepted tool call. */
  flushed: string;
  /** Calls parsed from complete blocks, in buffer order. */
  calls: ParsedCall[];
  /** Trailing text held back because it starts with a full or partial opener. */
  remaining: string;
}

/** One block syntax: its delimiters and the parser for the delimited text. */
interface OpenerSpec {
  open: string;
  close: string;
  parse: (block: string) => ParsedCall | null;
}

const OPENER_SPECS: ReadonlyArray<OpenerSpec> = [
  { open: XML_TOOL_OPEN, close: XML_TOOL_CLOSE, parse: parseXmlToolCall },
  { open: INVOKE_TOOL_OPEN, close: INVOKE_TOOL_CLOSE, parse: parseInvokeToolCall },
];

/** A complete block located in the buffer. */
interface ClosedBlock {
  spec: OpenerSpec;
  openIdx: number;
  closeIdx: number;
}

/**
 * Finds the earliest-opening block at or after `from` that also has a closer.
 * A syntax whose first opener is not yet closed is skipped. On equal open
 * positions the syntax listed first in `OPENER_SPECS` wins.
 */
function findNextClosedBlock(buffer: string, from: number): ClosedBlock | null {
  let best: ClosedBlock | null = null;
  for (const spec of OPENER_SPECS) {
    const openIdx = buffer.indexOf(spec.open, from);
    if (openIdx === -1) continue;
    const closeIdx = buffer.indexOf(spec.close, openIdx);
    if (closeIdx === -1) continue;
    if (best === null || openIdx < best.openIdx) {
      best = { spec, openIdx, closeIdx };
    }
  }
  return best;
}

/**
 * Splits a streaming buffer into plain text, parsed tool calls, and a
 * held-back tail.
 *
 * For every complete block, in order:
 *   - text before it is appended to `flushed`;
 *   - if the block parses and has no placeholder arguments it is added to
 *     `calls`;
 *   - if it parses but has a placeholder argument it is logged and its raw
 *     text is appended to `flushed`;
 *   - if it does not parse it is dropped (neither flushed nor returned).
 *
 * After the last complete block, the tail is flushed up to the first full or
 * partial opener; the rest is returned in `remaining`.
 */
export function extractToolCallBlocks(buffer: string): ToolCallExtraction {
  const calls: ParsedCall[] = [];
  let flushed = '';
  let pos = 0;

  while (pos < buffer.length) {
    const next = findNextClosedBlock(buffer, pos);
    if (next === null) break;

    flushed += buffer.slice(pos, next.openIdx);
    const blockEnd = next.closeIdx + next.spec.close.length;
    const block = buffer.slice(next.openIdx, blockEnd);
    const parsed = next.spec.parse(block);
    if (parsed !== null) {
      if (hasPlaceholderArgs(parsed.args)) {
        logRejectedPlaceholder(parsed);
        flushed += block;
      } else {
        calls.push(parsed);
      }
    }
    pos = blockEnd;
  }

  const tail = buffer.slice(pos);
  const holdFrom = partialXmlOpenerStart(tail);
  if (holdFrom === -1) {
    return { flushed: flushed + tail, calls, remaining: '' };
  }
  return {
    flushed: flushed + tail.slice(0, holdFrom),
    calls,
    remaining: tail.slice(holdFrom),
  };
}