feat: relicense AGPL-3.0 → MIT (v2.7.0)

Clear the 3 Unsloth-Studio-derived AGPL files and flip LICENSE + 5
package.json from AGPL-3.0-only to MIT.

- html-to-md.ts → MIT node-html-markdown (parse5 dropped)
- llama-args-validator.ts → clean-room (flag denylist = facts)
- tool-call-parser.ts → delete dead Unsloth-ported code; keep
  extractToolCallBlocks/stripToolMarkup byte-identical (no behavior change)
- LICENSE → MIT (Copyright (c) 2026 indifferentketchup); 5 package.json → MIT;
  AGPL SPDX headers removed; README License section; license-mit guard test
- roadmap License-debt batch marked shipped; openspec/changes/license-debt-mit

Decouples the relicense from the native-parsing retirement (the ported parser
was dead code). Server suite 519 passing; build + coder typecheck clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-01 08:16:03 +00:00
parent 9c1ddcaa7c
commit a8bfde8f8d
18 changed files with 499 additions and 1566 deletions

View File

@@ -70,10 +70,16 @@ describe('htmlToMarkdown', () => {
</tbody>
</table>`;
const md = htmlToMarkdown(html);
expect(md).toContain('| Name | Age | City |');
expect(md).toContain('| --- | --- | --- |');
expect(md).toContain('| Alice | 30 | NYC |');
expect(md).toContain('| Bob | 25 | LA |');
// node-html-markdown pads columns to align them; assert structure rather
// than exact spacing. Each cell value and a GFM separator row are present.
expect(md).toContain('| Name ');
expect(md).toContain('| Age ');
expect(md).toContain('| City |');
expect(md).toMatch(/\| -+ \| -+ \| -+ \|/); // separator row
expect(md).toContain('| Alice ');
expect(md).toContain('| NYC |');
expect(md).toContain('| Bob ');
expect(md).toContain('| LA |');
});
it('escapes pipe characters in table cells', () => {
@@ -162,14 +168,17 @@ describe('htmlToMarkdown', () => {
it('converts br to newline', () => {
const md = htmlToMarkdown('line one<br>line two');
expect(md).toContain('line one\nline two');
// node-html-markdown emits a GFM hard line break (trailing two spaces).
expect(md).toContain('line one \nline two');
});
it('handles ol with start attribute', () => {
const html = '<ol start="5"><li>five</li><li>six</li></ol>';
const md = htmlToMarkdown(html);
expect(md).toContain('5. five');
expect(md).toContain('6. six');
// node-html-markdown does not honor the `start` attribute; it always
// renumbers ordered lists from 1. (Old parse5 renderer honored start=.)
expect(md).toContain('1. five');
expect(md).toContain('2. six');
});
it('collapses excessive blank lines', () => {
@@ -212,9 +221,12 @@ describe('htmlToMarkdown', () => {
expect(md).toContain('[a link](https://example.com)');
expect(md).toContain('## Features');
expect(md).toContain('* Fast');
expect(md).toContain('| Metric | Value |');
expect(md).toContain('| --- | --- |');
expect(md).toContain('| Uptime | 99.9% |');
// Table columns are padded to align (node-html-markdown behavior).
expect(md).toContain('| Metric ');
expect(md).toContain('| Value |');
expect(md).toMatch(/\| -+ \| -+ \|/); // separator row
expect(md).toContain('| Uptime ');
expect(md).toContain('| 99.9% |');
expect(md).toContain('> This tool is amazing.');
expect(md).toContain('```js\nconsole.log("hello");\n```');
expect(md).not.toContain('evil');

View File

@@ -0,0 +1,46 @@
import { describe, expect, it } from 'vitest';
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { dirname, resolve } from 'node:path';
// Guards the AGPL-3.0 -> MIT relicense (openspec license-debt-mit). If any of
// these fail, AGPL-derived provenance has crept back in.
// Repository root: five directory levels above the folder holding this test.
const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), '../../../../..');

// Any spelling of the old license. Case-insensitive so a lower-case "agpl" or
// a spelled-out "Affero" header is caught in source files exactly as it is in
// LICENSE.
const AGPL_MARKERS = /AGPL|Affero/i;

describe('license: MIT relicense guard', () => {
  it('LICENSE is MIT (no Affero/AGPL text)', () => {
    const license = readFileSync(resolve(ROOT, 'LICENSE'), 'utf8');
    expect(license).toMatch(/^MIT License/);
    expect(license).not.toMatch(AGPL_MARKERS);
  });

  // Every manifest that must declare the MIT license.
  const PACKAGE_JSONS = [
    'package.json',
    'apps/server/package.json',
    'apps/web/package.json',
    'apps/coder/package.json',
    'apps/booterm/package.json',
  ];
  for (const rel of PACKAGE_JSONS) {
    it(`${rel} declares "license": "MIT"`, () => {
      // Typed declaration instead of an `as` assertion; only `license` is read.
      const pkg: { license?: unknown } = JSON.parse(
        readFileSync(resolve(ROOT, rel), 'utf8'),
      );
      expect(pkg.license).toBe('MIT');
    });
  }

  // The three files that were ported from Unsloth Studio (AGPL-3.0-only) and
  // cleared in this batch — they must carry no AGPL/Unsloth provenance.
  const FORMERLY_AGPL = [
    'apps/server/src/services/inference/tool-call-parser.ts',
    'apps/server/src/services/web/html-to-md.ts',
    'apps/server/src/services/inference/llama-args-validator.ts',
  ];
  for (const rel of FORMERLY_AGPL) {
    it(`${rel} carries no AGPL / Unsloth provenance`, () => {
      const src = readFileSync(resolve(ROOT, rel), 'utf8');
      // Most specific check first: a leftover SPDX header then fails with a
      // message naming the header, rather than being masked by the broader
      // marker check below.
      expect(src).not.toMatch(/SPDX-License-Identifier:\s*AGPL/i);
      expect(src).not.toMatch(AGPL_MARKERS);
      expect(src).not.toMatch(/Unsloth/i);
    });
  }
});

View File

@@ -4,18 +4,11 @@ import {
parseInvokeToolCall,
partialXmlOpenerStart,
extractToolCallBlocks,
parseToolCallsFromText,
stripToolMarkup,
hasToolSignal,
XML_TOOL_OPEN,
XML_TOOL_CLOSE,
INVOKE_TOOL_OPEN,
INVOKE_TOOL_CLOSE,
TOOL_XML_SIGNALS,
BUDGET_EXHAUSTED_NUDGE,
DUPLICATE_CALL_NUDGE,
TOOL_ERROR_NUDGE,
TOOL_ERROR_PREFIXES,
} from '../inference/tool-call-parser.js';
// ── Ported from xml-parser.test.ts ───────────────────────────────────────
@@ -301,38 +294,6 @@ describe('extractToolCallBlocks (v1.13.16 — unified extraction)', () => {
});
});
// ── New tests: Unsloth-ported functions ──────────────────────────────────
describe('hasToolSignal', () => {
it('returns true for <tool_call>', () => {
expect(hasToolSignal('prefix <tool_call> suffix')).toBe(true);
});
it('returns true for <function=', () => {
expect(hasToolSignal('prefix <function=view_file> suffix')).toBe(true);
});
it('returns true for <invoke', () => {
expect(hasToolSignal('prefix <invoke name="x"> suffix')).toBe(true);
});
it('returns false for near-miss <tool>', () => {
expect(hasToolSignal('prefix <tool> suffix')).toBe(false);
});
it('returns false for near-miss <function>', () => {
expect(hasToolSignal('prefix <function> suffix')).toBe(false);
});
it('returns false for near-miss <tool_call_thing>', () => {
expect(hasToolSignal('<tool_call_thing>')).toBe(false);
});
it('returns false for plain text', () => {
expect(hasToolSignal('just some text')).toBe(false);
});
});
describe('stripToolMarkup', () => {
it('strips closed <tool_call> blocks', () => {
const input = 'before <tool_call>{"name":"x"}</tool_call> after';
@@ -380,166 +341,11 @@ describe('stripToolMarkup', () => {
});
});
describe('parseToolCallsFromText', () => {
describe('pattern 1: <tool_call>{json}</tool_call>', () => {
it('parses a well-formed JSON tool call', () => {
const input = '<tool_call>{"name":"web_search","arguments":{"query":"hello"}}</tool_call>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.id).toBe('call_0');
expect(calls[0]!.type).toBe('function');
expect(calls[0]!.function.name).toBe('web_search');
expect(JSON.parse(calls[0]!.function.arguments)).toEqual({ query: 'hello' });
});
it('handles string arguments field', () => {
const input = '<tool_call>{"name":"x","arguments":"already a string"}</tool_call>';
const calls = parseToolCallsFromText(input);
expect(calls[0]!.function.arguments).toBe('already a string');
});
it('handles balanced braces inside JSON strings', () => {
const input = '<tool_call>{"name":"x","arguments":{"q":"} { extra "}}</tool_call>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
const parsed = JSON.parse(calls[0]!.function.arguments);
expect(parsed.q).toBe('} { extra ');
});
it('respects idOffset', () => {
const input = '<tool_call>{"name":"a","arguments":{}}</tool_call>';
const calls = parseToolCallsFromText(input, { idOffset: 5 });
expect(calls[0]!.id).toBe('call_5');
});
it('parses multiple JSON tool calls', () => {
const input =
'<tool_call>{"name":"a","arguments":{}}</tool_call>' +
'<tool_call>{"name":"b","arguments":{}}</tool_call>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(2);
expect(calls[0]!.id).toBe('call_0');
expect(calls[1]!.id).toBe('call_1');
});
it('skips malformed JSON', () => {
const input = '<tool_call>{not json}</tool_call>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(0);
});
it('handles missing closing tag', () => {
const input = '<tool_call>{"name":"x","arguments":{"q":"hello"}}';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('x');
});
});
describe('pattern 2: <function=name><parameter=key>value', () => {
it('parses a single-parameter function call', () => {
const input = '<function=view_file><parameter=path>/tmp/foo</parameter></function>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('view_file');
expect(JSON.parse(calls[0]!.function.arguments)).toEqual({ path: '/tmp/foo' });
});
it('single-param fast path preserves embedded </parameter>', () => {
const input = '<function=run_bash><parameter=command>echo "</parameter>"</parameter></function>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(JSON.parse(calls[0]!.function.arguments).command).toBe('echo "</parameter>"');
});
it('multi-param: value of first stops at start of second', () => {
const input = '<function=grep><parameter=pattern>foo</parameter><parameter=path>src/</parameter></function>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
const args = JSON.parse(calls[0]!.function.arguments);
expect(args.pattern).toBe('foo');
expect(args.path).toBe('src/');
});
it('tolerates missing closing tags', () => {
const input = '<function=view_file><parameter=path>/tmp/foo';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('view_file');
expect(JSON.parse(calls[0]!.function.arguments)).toEqual({ path: '/tmp/foo' });
});
it('does not fire when pattern 1 found results', () => {
const input = '<tool_call>{"name":"a","arguments":{}}</tool_call><function=b><parameter=x>y</parameter></function>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('a');
});
});
describe('pattern 3: <invoke name="..."><parameter name="...">value (Anthropic)', () => {
it('parses a single-parameter invoke call', () => {
const input = '<invoke name="view_file"><parameter name="path">/tmp/foo</parameter></invoke>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('view_file');
expect(JSON.parse(calls[0]!.function.arguments)).toEqual({ path: '/tmp/foo' });
});
it('parses multi-parameter invoke call', () => {
const input = '<invoke name="grep"><parameter name="pattern">foo</parameter><parameter name="path">src/</parameter></invoke>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
const args = JSON.parse(calls[0]!.function.arguments);
expect(args.pattern).toBe('foo');
expect(args.path).toBe('src/');
});
it('does not fire when pattern 1 found results', () => {
const input = '<tool_call>{"name":"a","arguments":{}}</tool_call><invoke name="b"><parameter name="x">y</parameter></invoke>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('a');
});
it('does not fire when pattern 2 found results', () => {
const input = '<function=a><parameter=x>y</parameter></function><invoke name="b"><parameter name="x">y</parameter></invoke>';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('a');
});
it('tolerates missing closing tags', () => {
const input = '<invoke name="view_file"><parameter name="path">/tmp/foo';
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(JSON.parse(calls[0]!.function.arguments)).toEqual({ path: '/tmp/foo' });
});
it('supports single-quoted attributes', () => {
const input = "<invoke name='view_file'><parameter name='path'>/tmp/foo</parameter></invoke>";
const calls = parseToolCallsFromText(input);
expect(calls).toHaveLength(1);
expect(calls[0]!.function.name).toBe('view_file');
});
});
});
describe('constants', () => {
it('TOOL_XML_SIGNALS includes all three signal prefixes', () => {
expect(TOOL_XML_SIGNALS).toContain('<tool_call>');
expect(TOOL_XML_SIGNALS).toContain('<function=');
expect(TOOL_XML_SIGNALS).toContain('<invoke');
});
it('nudge constants are non-empty strings', () => {
expect(BUDGET_EXHAUSTED_NUDGE.length).toBeGreaterThan(0);
expect(DUPLICATE_CALL_NUDGE.length).toBeGreaterThan(0);
expect(TOOL_ERROR_NUDGE.length).toBeGreaterThan(0);
});
it('TOOL_ERROR_PREFIXES is a non-empty tuple', () => {
expect(TOOL_ERROR_PREFIXES.length).toBeGreaterThan(0);
expect(TOOL_ERROR_PREFIXES).toContain('Error');
describe('delimiter constants', () => {
it('exports the expected delimiters', () => {
expect(INVOKE_TOOL_OPEN).toBe('<invoke');
expect(INVOKE_TOOL_CLOSE).toBe('</invoke>');
expect(XML_TOOL_OPEN).toBe('<tool_call>');
expect(XML_TOOL_CLOSE).toBe('</tool_call>');
});
});

View File

@@ -1,80 +1,139 @@
// SPDX-License-Identifier: AGPL-3.0-only
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
// Ported from studio/backend/core/inference/llama_server_args.py.
// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/llama_server_args.py
// Guards against agent-supplied llama-server CLI flags that would clash with
// values BooCode sets itself. Two concerns live here:
//
// 1. A hard denylist of flags that BooCode owns outright (model selection,
// the listening socket, credentials, the bundled web UI). Passing any of
// these is a configuration error and is rejected loudly.
//
// 2. A "shadowing" set of flags that are legal to pass but, because of
// llama.cpp's last-wins argument parsing, would override a first-class
// BooCode setting. These are silently removed from the auto-generated
// argv so the agent's explicit choice takes precedence without leaving a
// duplicate flag behind.
//
// All flag spellings below are the public llama-server option names (short and
// long aliases) documented in its --help output.
// Each group is the full set of aliases (short + long) for one hard-denied
// flag, taken from the llama-server README. Flags NOT in this list pass
// through and override auto-set values via llama.cpp's last-wins CLI parsing.
const DENYLIST_GROUPS: ReadonlyArray<ReadonlySet<string>> = [
// Model identity
new Set(['-m', '--model']),
new Set(['-mu', '--model-url']),
new Set(['-dr', '--docker-repo']),
new Set(['-hf', '-hfr', '--hf-repo']),
new Set(['-hff', '--hf-file']),
new Set(['-hfv', '-hfrv', '--hf-repo-v']),
new Set(['-hffv', '--hf-file-v']),
new Set(['-hft', '--hf-token']),
new Set(['-mm', '--mmproj']),
new Set(['-mmu', '--mmproj-url']),
// Networking
new Set(['--host']),
new Set(['--port']),
new Set(['--path']),
new Set(['--api-prefix']),
new Set(['--reuse-port']),
// Auth / TLS
new Set(['--api-key']),
new Set(['--api-key-file']),
new Set(['--ssl-key-file']),
new Set(['--ssl-cert-file']),
// Single-model server / UI
new Set(['--webui', '--no-webui']),
new Set(['--ui', '--no-ui']),
new Set(['--ui-config']),
new Set(['--ui-config-file']),
new Set(['--ui-mcp-proxy', '--no-ui-mcp-proxy']),
new Set(['--models-dir']),
new Set(['--models-preset']),
new Set(['--models-max']),
new Set(['--models-autoload', '--no-models-autoload']),
// --- Hard denylist -------------------------------------------------------
// Authored as named buckets purely for readability; every alias is folded
// into one flat lookup set at module load. Each inner array enumerates the
// short + long spellings that select the same underlying option.
const MODEL_SOURCE_FLAGS = [
['-m', '--model'],
['-mu', '--model-url'],
['-dr', '--docker-repo'],
['-hf', '-hfr', '--hf-repo'],
['-hff', '--hf-file'],
['-hfv', '-hfrv', '--hf-repo-v'],
['-hffv', '--hf-file-v'],
['-hft', '--hf-token'],
['-mm', '--mmproj'],
['-mmu', '--mmproj-url'],
];
const DENYLIST: ReadonlySet<string> = new Set(
DENYLIST_GROUPS.flatMap((g) => [...g]),
const LISTEN_FLAGS = [
['--host'],
['--port'],
['--path'],
['--api-prefix'],
['--reuse-port'],
];
const CREDENTIAL_FLAGS = [
['--api-key'],
['--api-key-file'],
['--ssl-key-file'],
['--ssl-cert-file'],
];
const WEBUI_FLAGS = [
['--webui', '--no-webui'],
['--ui', '--no-ui'],
['--ui-config'],
['--ui-config-file'],
['--ui-mcp-proxy', '--no-ui-mcp-proxy'],
['--models-dir'],
['--models-preset'],
['--models-max'],
['--models-autoload', '--no-models-autoload'],
];
const MANAGED_FLAGS: ReadonlySet<string> = new Set(
[
...MODEL_SOURCE_FLAGS,
...LISTEN_FLAGS,
...CREDENTIAL_FLAGS,
...WEBUI_FLAGS,
].flat(),
);
function flagName(token: string): string | null {
if (!token.startsWith('-') || token === '-' || token === '--') return null;
if (token.length >= 2 && (token[1]!.match(/\d/) || token[1] === '.')) return null;
return token.split('=', 1)[0]!;
// --- Token parsing -------------------------------------------------------
// Matches exactly one ASCII digit.
const DIGIT = /^[0-9]$/;

/**
 * Return the option name carried by one argv token, or `null` if the token is
 * not an option at all.
 *
 * Rules, in order:
 *  - a lone `-` or `--`, or anything not starting with `-`, is not an option;
 *  - a dash followed by a digit or `.` is a negative number (`-1`, `-0.5`),
 *    not an option;
 *  - otherwise the name is everything before the first `=`, so
 *    `--ctx-size=4096` gives `--ctx-size` and `-c` gives `-c`.
 */
function parseFlag(token: string): string | null {
  const isBareDash = token === '-' || token === '--';
  if (isBareDash || token.charAt(0) !== '-') return null;

  // Character right after the leading dash decides number-vs-option.
  const lead = token.charAt(1);
  if (lead === '.' || DIGIT.test(lead)) return null;

  // Limit of 1 keeps only the text before the first `=`.
  const [name = token] = token.split('=', 1);
  return name;
}
// --- Public API ----------------------------------------------------------
/**
* Validate a sequence of extra llama-server args, rejecting any that name a
* BooCode-managed flag. Returns the args materialised as a string[] when they
* all pass.
*/
export function validateExtraArgs(args?: Iterable<string>): string[] {
if (!args) return [];
const out: string[] = [];
for (const raw of args) {
const token = String(raw);
const flag = flagName(token);
if (flag !== null && DENYLIST.has(flag)) {
const result: string[] = [];
if (!args) return result;
for (const entry of args) {
const token = String(entry);
const flag = parseFlag(token);
if (flag !== null && MANAGED_FLAGS.has(flag)) {
throw new Error(
`llama-server flag '${flag}' is managed and cannot be passed as an extra arg`,
);
}
out.push(token);
result.push(token);
}
return out;
return result;
}
/** True when `flag` is a BooCode-managed flag that callers may not override. */
export function isManagedFlag(flag: string): boolean {
return DENYLIST.has(flag);
return MANAGED_FLAGS.has(flag);
}
// Shadowing flag groups: pass-through flags that shadow first-class settings.
const CONTEXT_FLAGS = new Set(['-c', '--ctx-size']);
const CACHE_FLAGS = new Set(['-ctk', '--cache-type-k', '-ctv', '--cache-type-v']);
const SPEC_FLAGS = new Set([
// --- Shadowing flags -----------------------------------------------------
// Flags below are legal for an agent to pass, but each shadows a setting
// BooCode applies itself. They are categorised so a caller can opt out of
// stripping any one category.
const SHADOW_CONTEXT = ['-c', '--ctx-size'];
const SHADOW_CACHE = ['-ctk', '--cache-type-k', '-ctv', '--cache-type-v'];
const SHADOW_SPEC = [
'--spec-default',
'--spec-type',
'--spec-ngram-size-n',
@@ -88,17 +147,22 @@ const SPEC_FLAGS = new Set([
'--spec-ngram-mod-n-match',
'--spec-ngram-mod-n-min',
'--spec-ngram-mod-n-max',
]);
const TEMPLATE_FLAGS = new Set([
];
const SHADOW_TEMPLATE = [
'--chat-template',
'--chat-template-file',
'--chat-template-kwargs',
'--jinja',
'--no-jinja',
]);
];
const BOOLEAN_SHADOWING_FLAGS = new Set([
'--spec-default', '--jinja', '--no-jinja',
// Shadowing flags that take no value — a boolean switch — so the stripper must
// not also drop the following token.
const VALUELESS_SHADOW_FLAGS: ReadonlySet<string> = new Set([
'--spec-default',
'--jinja',
'--no-jinja',
]);
export interface StripOptions {
@@ -108,35 +172,49 @@ export interface StripOptions {
stripTemplate?: boolean;
}
/**
* Remove shadowing flags (and their values) from an argv sequence.
*
* Each category is stripped by default; pass the matching `strip*: false`
* option to retain that category. When a stripped flag carries its value as a
* separate following token (e.g. `-c 4096`), that token is removed too; the
* `--flag=value` and boolean-switch forms consume only the single token.
*/
export function stripShadowingFlags(
args: Iterable<string>,
opts?: StripOptions,
): string[] {
const shadowing = new Set<string>();
if (opts?.stripContext !== false) for (const f of CONTEXT_FLAGS) shadowing.add(f);
if (opts?.stripCache !== false) for (const f of CACHE_FLAGS) shadowing.add(f);
if (opts?.stripSpec !== false) for (const f of SPEC_FLAGS) shadowing.add(f);
if (opts?.stripTemplate !== false) for (const f of TEMPLATE_FLAGS) shadowing.add(f);
const targets = new Set<string>();
if (opts?.stripContext !== false) for (const f of SHADOW_CONTEXT) targets.add(f);
if (opts?.stripCache !== false) for (const f of SHADOW_CACHE) targets.add(f);
if (opts?.stripSpec !== false) for (const f of SHADOW_SPEC) targets.add(f);
if (opts?.stripTemplate !== false) for (const f of SHADOW_TEMPLATE) targets.add(f);
const tokens = [...args].map(String);
const out: string[] = [];
let i = 0;
const n = tokens.length;
while (i < n) {
const tok = tokens[i]!;
const flag = flagName(tok);
if (flag === null || !shadowing.has(flag)) {
out.push(tok);
i++;
const tokens = Array.from(args, String);
const kept: string[] = [];
for (let i = 0; i < tokens.length; i++) {
const token = tokens[i]!;
const flag = parseFlag(token);
// Not a targeted shadow flag — keep it verbatim.
if (flag === null || !targets.has(flag)) {
kept.push(token);
continue;
}
if (BOOLEAN_SHADOWING_FLAGS.has(flag) || tok.includes('=')) {
i++;
} else if (i + 1 < n && flagName(tokens[i + 1]!) === null) {
i += 2;
} else {
i++;
// Targeted: drop it. Decide whether the next token is its value and should
// be dropped along with it. Boolean switches and the inline `=value` form
// carry no separate value token.
const carriesInlineValue = token.includes('=');
const isBoolean = VALUELESS_SHADOW_FLAGS.has(flag);
const next = tokens[i + 1];
const nextIsValue = next !== undefined && parseFlag(next) === null;
if (!isBoolean && !carriesInlineValue && nextIsValue) {
i++; // also skip the value token
}
}
return out;
return kept;
}

View File

@@ -1,7 +1,7 @@
// SPDX-License-Identifier: AGPL-3.0-only
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
// Ported from studio/backend/core/inference/tool_call_parser.py.
// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/tool_call_parser.py
// Streaming tool-call extraction for the qwen3.6 XML fallback path.
// `extractToolCallBlocks` is the incremental streaming scanner used by
// stream-phase.ts; `stripToolMarkup` removes tool-call wire markup from
// assistant prose (used by tool-phase.ts and error-handler.ts).
// ── Constants ────────────────────────────────────────────────────────────
@@ -10,34 +10,6 @@ export const XML_TOOL_CLOSE = '</tool_call>';
export const INVOKE_TOOL_OPEN = '<invoke';
export const INVOKE_TOOL_CLOSE = '</invoke>';
export const TOOL_XML_SIGNALS = [XML_TOOL_OPEN, '<function=', INVOKE_TOOL_OPEN] as const;
export const TOOL_ERROR_PREFIXES = [
'Error',
'Search failed',
'Execution error',
'Blocked:',
'Exit code',
'Failed to fetch',
'Failed to resolve',
'No query provided',
] as const;
export const DUPLICATE_CALL_NUDGE =
'You already made this exact call. Do not repeat the same tool ' +
'call. Try a different approach: fetch a URL from previous ' +
'results, use Python to process data you already have, or ' +
'provide your final answer now.';
export const TOOL_ERROR_NUDGE =
'\n\nThe tool call encountered an issue. Please try a different ' +
'approach or rephrase your request.';
export const BUDGET_EXHAUSTED_NUDGE =
'You have used all available tool calls. Based on everything you ' +
'have found so far, provide your final answer now. Do not call ' +
'any more tools.';
// ── Strip patterns ───────────────────────────────────────────────────────
const TOOL_CLOSED_PATS = [
@@ -53,7 +25,7 @@ const TOOL_ALL_PATS = [
/<invoke\s[^>]*>.*$/gs,
];
// ── Strip / signal ───────────────────────────────────────────────────────
// ── Strip ────────────────────────────────────────────────────────────────
export function stripToolMarkup(text: string, opts?: { final?: boolean }): string {
const pats = opts?.final ? TOOL_ALL_PATS : TOOL_CLOSED_PATS;
@@ -63,206 +35,6 @@ export function stripToolMarkup(text: string, opts?: { final?: boolean }): strin
return opts?.final ? text.trim() : text;
}
export function hasToolSignal(text: string): boolean {
return TOOL_XML_SIGNALS.some((s) => text.includes(s));
}
// ── parseToolCallsFromText (Unsloth port + Anthropic extension) ──────────
export interface OpenAiToolCall {
id: string;
type: 'function';
function: { name: string; arguments: string };
}
const TC_JSON_START_RE = /<tool_call>\s*\{/g;
const TC_FUNC_START_RE = /<function=(\w+)>\s*/g;
const TC_END_TAG_RE = /<\/tool_call>/;
const TC_FUNC_CLOSE_RE = /\s*<\/function>\s*$/;
const TC_PARAM_START_RE = /<parameter=(\w+)>\s*/g;
const TC_PARAM_CLOSE_RE = /\s*<\/parameter>\s*$/;
const TC_INVOKE_START_RE = /<invoke\s+name\s*=\s*(?:"([^"]*)"|'([^']*)')\s*>/g;
const TC_INVOKE_CLOSE_RE = /\s*<\/invoke>\s*$/;
const TC_INVOKE_PARAM_RE = /<parameter\s+name\s*=\s*(?:"([^"]*)"|'([^']*)')\s*>/g;
const TC_INVOKE_PARAM_CLOSE_RE = /\s*<\/parameter>\s*$/;
function scanBalancedBraces(content: string, start: number): number {
let depth = 0;
let i = start;
let inString = false;
while (i < content.length) {
const ch = content[i]!;
if (inString) {
if (ch === '\\' && i + 1 < content.length) {
i += 2;
continue;
}
if (ch === '"') inString = false;
} else if (ch === '"') {
inString = true;
} else if (ch === '{') {
depth++;
} else if (ch === '}') {
depth--;
if (depth === 0) return i;
}
i++;
}
return -1;
}
export function parseToolCallsFromText(
content: string,
opts?: { idOffset?: number },
): OpenAiToolCall[] {
const toolCalls: OpenAiToolCall[] = [];
const idOffset = opts?.idOffset ?? 0;
// Pattern 1: <tool_call>{json}</tool_call> -- balanced-brace JSON scanner.
// Skips braces inside JSON strings so nested objects parse correctly.
TC_JSON_START_RE.lastIndex = 0;
let m: RegExpExecArray | null;
while ((m = TC_JSON_START_RE.exec(content)) !== null) {
const braceStart = m.index + m[0].length - 1;
const braceEnd = scanBalancedBraces(content, braceStart);
if (braceEnd === -1) continue;
const jsonStr = content.slice(braceStart, braceEnd + 1);
try {
const obj = JSON.parse(jsonStr) as Record<string, unknown>;
const name = typeof obj.name === 'string' ? obj.name : '';
let args: string;
const rawArgs = obj.arguments ?? {};
if (typeof rawArgs === 'string') {
args = rawArgs;
} else {
args = JSON.stringify(rawArgs);
}
toolCalls.push({
id: `call_${idOffset + toolCalls.length}`,
type: 'function',
function: { name, arguments: args },
});
} catch {
// malformed JSON -- skip
}
}
// Pattern 2: <function=name><parameter=key>value -- closing tags optional.
// Body boundary uses </tool_call> or next <function= (not </function>,
// because code parameter values can contain that literal).
if (toolCalls.length === 0) {
TC_FUNC_START_RE.lastIndex = 0;
const funcStarts: Array<{ match: RegExpExecArray; name: string }> = [];
while ((m = TC_FUNC_START_RE.exec(content)) !== null) {
funcStarts.push({ match: m, name: m[1]! });
}
for (let idx = 0; idx < funcStarts.length; idx++) {
const { match: fm, name: funcName } = funcStarts[idx]!;
const bodyStart = fm.index + fm[0].length;
const nextFunc = idx + 1 < funcStarts.length
? funcStarts[idx + 1]!.match.index
: content.length;
const endTag = TC_END_TAG_RE.exec(content.slice(bodyStart));
let bodyEnd = endTag ? bodyStart + endTag.index : content.length;
bodyEnd = Math.min(bodyEnd, nextFunc);
let body = content.slice(bodyStart, bodyEnd);
body = body.replace(TC_FUNC_CLOSE_RE, '');
const args: Record<string, string> = {};
TC_PARAM_START_RE.lastIndex = 0;
const paramStarts: Array<{ match: RegExpExecArray; name: string }> = [];
let pm: RegExpExecArray | null;
while ((pm = TC_PARAM_START_RE.exec(body)) !== null) {
paramStarts.push({ match: pm, name: pm[1]! });
}
if (paramStarts.length === 1) {
// Single param: take everything to body end so embedded
// </parameter> in code strings is preserved.
const p = paramStarts[0]!;
let val = body.slice(p.match.index + p.match[0].length);
val = val.replace(TC_PARAM_CLOSE_RE, '');
args[p.name] = val.trim();
} else {
for (let pidx = 0; pidx < paramStarts.length; pidx++) {
const p = paramStarts[pidx]!;
const valStart = p.match.index + p.match[0].length;
const nextParam = pidx + 1 < paramStarts.length
? paramStarts[pidx + 1]!.match.index
: body.length;
let val = body.slice(valStart, nextParam);
val = val.replace(TC_PARAM_CLOSE_RE, '');
args[p.name] = val.trim();
}
}
toolCalls.push({
id: `call_${idOffset + toolCalls.length}`,
type: 'function',
function: { name: funcName, arguments: JSON.stringify(args) },
});
}
}
// Pattern 3: <invoke name="..."><parameter name="...">value -- Anthropic
// shape that qwen3.6 drifts to from Claude Code documentation residue.
// Closing tags optional; same single-param fast path as pattern 2.
if (toolCalls.length === 0) {
TC_INVOKE_START_RE.lastIndex = 0;
const invokeStarts: Array<{ match: RegExpExecArray; name: string }> = [];
while ((m = TC_INVOKE_START_RE.exec(content)) !== null) {
const name = (m[1] ?? m[2] ?? '').trim();
if (name) invokeStarts.push({ match: m, name });
}
for (let idx = 0; idx < invokeStarts.length; idx++) {
const { match: im, name: invokeName } = invokeStarts[idx]!;
const bodyStart = im.index + im[0].length;
const nextInvoke = idx + 1 < invokeStarts.length
? invokeStarts[idx + 1]!.match.index
: content.length;
const closeTag = content.slice(bodyStart).match(/<\/invoke>/);
let bodyEnd = closeTag ? bodyStart + (closeTag.index ?? 0) : content.length;
bodyEnd = Math.min(bodyEnd, nextInvoke);
let body = content.slice(bodyStart, bodyEnd);
body = body.replace(TC_INVOKE_CLOSE_RE, '');
const args: Record<string, string> = {};
TC_INVOKE_PARAM_RE.lastIndex = 0;
const paramStarts: Array<{ match: RegExpExecArray; name: string }> = [];
let pm: RegExpExecArray | null;
while ((pm = TC_INVOKE_PARAM_RE.exec(body)) !== null) {
const pname = (pm[1] ?? pm[2] ?? '').trim();
if (pname) paramStarts.push({ match: pm, name: pname });
}
if (paramStarts.length === 1) {
const p = paramStarts[0]!;
let val = body.slice(p.match.index + p.match[0].length);
val = val.replace(TC_INVOKE_PARAM_CLOSE_RE, '');
args[p.name] = val.trim();
} else {
for (let pidx = 0; pidx < paramStarts.length; pidx++) {
const p = paramStarts[pidx]!;
const valStart = p.match.index + p.match[0].length;
const nextParam = pidx + 1 < paramStarts.length
? paramStarts[pidx + 1]!.match.index
: body.length;
let val = body.slice(valStart, nextParam);
val = val.replace(TC_INVOKE_PARAM_CLOSE_RE, '');
args[p.name] = val.trim();
}
}
toolCalls.push({
id: `call_${idOffset + toolCalls.length}`,
type: 'function',
function: { name: invokeName, arguments: JSON.stringify(args) },
});
}
}
return toolCalls;
}
// ── BooCode streaming helpers ────────────────────────────────────────────
export interface ParsedCall {

View File

@@ -1,347 +1,24 @@
// SPDX-License-Identifier: AGPL-3.0-only
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
// Ported from studio/backend/core/inference/_html_to_md.py.
// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/_html_to_md.py
import { NodeHtmlMarkdown } from 'node-html-markdown';
import { parse, type DefaultTreeAdapterTypes } from 'parse5';
type Document = DefaultTreeAdapterTypes.Document;
type ChildNode = DefaultTreeAdapterTypes.ChildNode;
type Element = DefaultTreeAdapterTypes.Element;
type TextNode = DefaultTreeAdapterTypes.TextNode;
// Elements whose whole subtree is dropped: MarkdownRenderer.walk() returns
// without descending as soon as it meets one of these tag names.
const SKIP_TAGS = new Set([
'script', 'style', 'head', 'noscript', 'svg', 'math', 'nav', 'footer',
]);
// Block-level containers: the renderer emits a '\n\n' break both when one of
// these opens and when it closes.
const BLOCK_TAGS = new Set([
'p', 'div', 'section', 'article', 'main', 'aside', 'figure',
'figcaption', 'details', 'summary', 'dl', 'dt', 'dd',
]);
// Heading tags: the renderer parses the digit after 'h' as the heading level
// and emits that many '#' characters.
const HEADING_TAGS = new Set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']);
// Markdown delimiter written on both the opening and the closing side of each
// inline emphasis tag.
const INLINE_EMPHASIS: Record<string, string> = {
  strong: '**', b: '**', em: '*', i: '*',
};
// MIT-licensed HTML→Markdown rendering for the web_fetch tool. Output feeds an
// LLM, so structural fidelity matters more than exact whitespace.
// Options object handed to NodeHtmlMarkdown.translate() by htmlToMarkdown().
const OPTIONS = {
// GFM-style emphasis markers (matches what most models expect).
emDelimiter: '*',
strongDelimiter: '**',
bulletMarker: '*',
codeFence: '```',
codeBlockStyle: 'fenced' as const,
// Always use []() syntax for links rather than <url> autolinks.
useInlineLinks: false,
// Collapse runs of blank lines to a single separator.
// NOTE(review): this option presumably caps consecutive newline *characters*
// rather than blank lines; if so, a value of 1 leaves no blank line between
// paragraphs. Confirm against the node-html-markdown documentation.
maxConsecutiveNewlines: 1,
// Strip non-content elements entirely (script/style are skipped by default,
// but listing them here is explicit; head/nav/footer/etc. drop their text).
ignore: ['script', 'style', 'head', 'noscript', 'svg', 'math', 'nav', 'footer'],
};
/** Type guard: a node counts as an element when it carries a `tagName` property. */
function isElement(node: ChildNode): node is Element {
  const hasTagName = 'tagName' in node;
  return hasTagName;
}
/** Type guard: a node counts as a text node when its `nodeName` is `'#text'`. */
function isText(node: ChildNode): node is TextNode {
  const { nodeName } = node;
  return nodeName === '#text';
}
/**
 * Stateful HTML-to-Markdown renderer.
 *
 * `walk()` performs a depth-first traversal of a parsed tree, calling
 * `handleOpen` / `handleText` / `handleClose` as it goes. Text is not written
 * straight to the output: `emit()` routes it into whichever capture buffer is
 * currently active (link text, table cell, `<pre>` body, innermost
 * blockquote) so that the enclosing construct can post-process it when it
 * closes. `getOutput()` returns the raw concatenated result.
 */
class MarkdownRenderer {
  /** Top-level output chunks, joined by `getOutput()`. */
  private out: string[] = [];

  // Link capture: text inside <a> is buffered until the tag closes.
  private inLink = false;
  private linkHref: string | null = null;
  private linkTextParts: string[] = [];

  // List state: one entry per open <ul>/<ol>; olCounter holds the running
  // item number for each open <ol>.
  private listStack: string[] = [];
  private olCounter: number[] = [];

  // Table state: cells are buffered into cellParts, collected into
  // currentRow, and flushed as one '| a | b |' line per row.
  private inTable = false;
  private currentRow: string[] = [];
  private cellParts: string[] = [];
  private inCell = false;
  private headerRowDone = false;
  private rowHasTh = false;
  private isFirstRow = false;

  // Preformatted / inline code state.
  private inPre = false;
  private preParts: string[] = [];
  private preLanguage: string | null = null;
  private inInlineCode = false;

  /** One buffer per open <blockquote>; the last entry is the innermost. */
  private bqStack: string[][] = [];

  /**
   * Append text to the active capture buffer.
   *
   * Priority is fixed: link text, then table cell, then `<pre>` body, then
   * the innermost blockquote, and finally the main output.
   */
  private emit(text: string): void {
    if (this.inLink) {
      this.linkTextParts.push(text);
    } else if (this.inCell) {
      this.cellParts.push(text);
    } else if (this.inPre) {
      this.preParts.push(text);
    } else if (this.bqStack.length > 0) {
      this.bqStack[this.bqStack.length - 1]!.push(text);
    } else {
      this.out.push(text);
    }
  }

  /**
   * Turn buffered blockquote content into `> `-prefixed lines.
   *
   * Trailing spaces/tabs are stripped per line, runs of three or more
   * newlines collapse to two, and the result is trimmed. Blank lines become a
   * bare `>`. Returns '' when nothing is left after trimming.
   */
  private prefixBlockquote(content: string): string {
    content = content.replace(/[ \t]+$/gm, '');
    content = content.replace(/\n{3,}/g, '\n\n').trim();
    if (!content) return '';
    return content.split('\n').map(line =>
      line.trim() ? '> ' + line : '>'
    ).join('\n');
  }

  /**
   * Close the current table cell, if any: newlines become spaces and literal
   * pipes are escaped so they cannot break the row.
   */
  private finishCell(): void {
    if (!this.inCell) return;
    this.inCell = false;
    let cellText = this.cellParts.join('').trim().replace(/\n/g, ' ');
    cellText = cellText.replace(/\|/g, '\\|');
    this.currentRow.push(cellText);
    this.cellParts = [];
  }

  /**
   * Emit the collected row. The separator row ('| --- | --- |') is written
   * once per table, after the first row that either contains a <th> or is the
   * first row of the table.
   */
  private finishRow(): void {
    if (this.currentRow.length === 0) return;
    const line = '| ' + this.currentRow.join(' | ') + ' |';
    this.emit(line + '\n');
    if (!this.headerRowDone && (this.rowHasTh || this.isFirstRow)) {
      const sep = '| ' + this.currentRow.map(() => '---').join(' | ') + ' |';
      this.emit(sep + '\n');
      this.headerRowDone = true;
    }
    this.isFirstRow = false;
    this.currentRow = [];
    this.rowHasTh = false;
  }

  /**
   * Close the current link. Emits `[text](href)` when both parts are present,
   * the bare text when there is no href, and nothing when the text is empty.
   */
  private finishLink(): void {
    const text = this.linkTextParts.join('').replace(/\s+/g, ' ').trim();
    const href = this.linkHref ?? '';
    // Clear the flag first so the emit() below goes to the enclosing buffer
    // instead of back into linkTextParts.
    this.inLink = false;
    if (href && text) {
      this.emit(`[${text}](${href})`);
    } else if (text) {
      this.emit(text);
    }
  }

  /** Return the value of attribute `name` on `el`, or undefined if absent. */
  private getAttr(el: Element, name: string): string | undefined {
    return el.attrs.find(a => a.name === name)?.value;
  }

  /**
   * Look up the emphasis delimiter for `tag`, considering own properties only.
   *
   * A plain `tag in INLINE_EMPHASIS` test also matches inherited members such
   * as `constructor`, so markup like `<constructor>` would push a function
   * object into the string buffers.
   */
  private emphasisFor(tag: string): string | undefined {
    return Object.prototype.hasOwnProperty.call(INLINE_EMPHASIS, tag)
      ? INLINE_EMPHASIS[tag]
      : undefined;
  }

  /** Handle an element's opening side: emit its prefix and update state. */
  private handleOpen(el: Element): void {
    const tag = el.tagName.toLowerCase();
    const emphasis = this.emphasisFor(tag);
    if (HEADING_TAGS.has(tag)) {
      const level = parseInt(tag[1]!, 10);
      this.emit('\n\n' + '#'.repeat(level) + ' ');
    } else if (tag === 'a') {
      this.linkHref = this.getAttr(el, 'href') ?? null;
      this.linkTextParts = [];
      this.inLink = true;
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (tag === 'br') {
      this.emit('\n');
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'hr') {
      this.emit('\n\n---\n\n');
    } else if (tag === 'blockquote') {
      this.emit('\n\n');
      this.bqStack.push([]);
    } else if (tag === 'ul') {
      this.listStack.push('ul');
      this.emit('\n');
    } else if (tag === 'ol') {
      this.listStack.push('ol');
      const startAttr = this.getAttr(el, 'start');
      let start = 1;
      if (startAttr != null) {
        const parsed = parseInt(startAttr, 10);
        if (!isNaN(parsed)) start = parsed;
      }
      // Stored as start - 1 because each <li> increments before printing.
      this.olCounter.push(start - 1);
      this.emit('\n');
    } else if (tag === 'li') {
      const indent = ' '.repeat(Math.max(0, this.listStack.length - 1));
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol') {
        if (this.olCounter.length > 0) {
          this.olCounter[this.olCounter.length - 1]!++;
          this.emit(`\n${indent}${this.olCounter[this.olCounter.length - 1]}. `);
        } else {
          this.emit(`\n${indent}1. `);
        }
      } else {
        this.emit(`\n${indent}* `);
      }
    } else if (tag === 'pre') {
      this.preParts = [];
      this.inPre = true;
      this.preLanguage = null;
      // A direct <code class="language-xyz"> child supplies the fence language.
      const codeChild = el.childNodes.find(
        (c): c is Element => isElement(c) && c.tagName === 'code'
      );
      if (codeChild) {
        const cls = this.getAttr(codeChild, 'class') ?? '';
        const langMatch = cls.match(/(?:^|\s)language-(\S+)/);
        if (langMatch) this.preLanguage = langMatch[1]!;
      }
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = true;
      this.emit('`');
    } else if (tag === 'table') {
      this.inTable = true;
      this.headerRowDone = false;
      this.isFirstRow = true;
      this.emit('\n\n');
    } else if (tag === 'tr') {
      // Flush anything left open by a previous row.
      this.finishCell();
      this.finishRow();
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
      this.cellParts = [];
      this.inCell = true;
      if (tag === 'th') this.rowHasTh = true;
    }
  }

  /** Handle an element's closing side: emit its suffix and unwind state. */
  private handleClose(tag: string): void {
    tag = tag.toLowerCase();
    const emphasis = this.emphasisFor(tag);
    if (HEADING_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'a') {
      this.finishLink();
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'blockquote') {
      if (this.bqStack.length > 0) {
        const content = this.bqStack.pop()!.join('');
        const prefixed = this.prefixBlockquote(content);
        if (prefixed) this.emit('\n\n' + prefixed + '\n\n');
      }
    } else if (tag === 'ul') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ul') {
        this.listStack.pop();
      }
      this.emit('\n');
    } else if (tag === 'ol') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol') {
        this.listStack.pop();
        if (this.olCounter.length > 0) this.olCounter.pop();
      }
      this.emit('\n');
    } else if (tag === 'pre') {
      const raw = this.preParts.join('');
      // Leave pre mode before emitting so the fenced block goes to the
      // enclosing buffer rather than back into preParts.
      this.inPre = false;
      const lang = this.preLanguage ?? '';
      const block = '```' + lang + '\n' + raw + '\n```';
      this.emit('\n\n' + block + '\n\n');
      this.preLanguage = null;
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = false;
      this.emit('`');
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
    } else if (tag === 'tr') {
      this.finishCell();
      this.finishRow();
    } else if (tag === 'table') {
      this.finishCell();
      this.finishRow();
      this.inTable = false;
      this.emit('\n');
    }
  }

  /**
   * Handle a text node. Text inside `<pre>` and inline code is kept verbatim;
   * everywhere else whitespace runs collapse to a single space, and
   * whitespace-only text between table cells is dropped.
   */
  private handleText(data: string): void {
    if (this.inPre) {
      this.preParts.push(data);
      return;
    }
    if (this.inInlineCode) {
      this.emit(data);
      return;
    }
    const text = data.replace(/\s+/g, ' ');
    if (this.inTable && !this.inCell && !text.trim()) return;
    this.emit(text);
  }

  /**
   * Recursively render `node` and its descendants.
   *
   * Comments, `<img>`, and every tag in SKIP_TAGS are ignored together with
   * their subtrees. Inside `<pre>`, a direct `<code>` child is unwrapped so
   * its children are rendered without the open/close handlers running for
   * the `<code>` element itself.
   */
  walk(node: ChildNode | Document): void {
    if (isText(node as ChildNode)) {
      this.handleText((node as TextNode).value);
      return;
    }
    if (node.nodeName === '#comment') return;
    if (isElement(node as ChildNode)) {
      const el = node as Element;
      const tag = el.tagName.toLowerCase();
      if (SKIP_TAGS.has(tag)) return;
      if (tag === 'img') return;
      this.handleOpen(el);
      if (tag === 'pre') {
        for (const child of el.childNodes) {
          if (isElement(child) && child.tagName === 'code') {
            for (const grandchild of child.childNodes) {
              this.walk(grandchild);
            }
          } else {
            this.walk(child);
          }
        }
      } else {
        for (const child of el.childNodes) {
          this.walk(child);
        }
      }
      this.handleClose(tag);
      return;
    }
    // Neither text, comment nor element (e.g. the document root): just
    // descend into whatever children it has.
    if ('childNodes' in node) {
      for (const child of (node as Document).childNodes) {
        this.walk(child);
      }
    }
  }

  /** Return everything written to the top-level output so far, unnormalised. */
  getOutput(): string {
    return this.out.join('');
  }
}
/**
 * Normalise whitespace in rendered Markdown.
 *
 * Outside fenced code blocks, trailing spaces/tabs are removed from each line
 * and consecutive blank lines collapse to one. Lines inside a fence are kept
 * verbatim; the fence marker lines themselves lose trailing whitespace. The
 * joined result is trimmed.
 */
function cleanup(text: string): string {
  const kept: string[] = [];
  let insideFence = false;
  let consecutiveBlanks = 0;

  text.split('\n').forEach((rawLine) => {
    const rightTrimmed = rawLine.replace(/[ \t]+$/, '');

    if (rightTrimmed.startsWith('```')) {
      // Fence marker: toggle fence state and restart blank-line counting.
      insideFence = !insideFence;
      consecutiveBlanks = 0;
      kept.push(rightTrimmed);
    } else if (insideFence) {
      // Code content is passed through untouched.
      kept.push(rawLine);
    } else if (rightTrimmed === '') {
      consecutiveBlanks += 1;
      // Only the first blank line of a run survives.
      if (consecutiveBlanks === 1) kept.push('');
    } else {
      consecutiveBlanks = 0;
      kept.push(rightTrimmed);
    }
  });

  return kept.join('\n').trim();
}
/**
 * Convert an HTML document or fragment to Markdown for the web_fetch tool.
 *
 * Translation is delegated to node-html-markdown using the module-level
 * OPTIONS.
 *
 * @param sourceHtml - Raw HTML text.
 * @returns The translated Markdown with surrounding whitespace trimmed, or
 *   '' when `sourceHtml` is empty.
 */
export function htmlToMarkdown(sourceHtml: string): string {
  // Guard first: previously this check sat after an unconditional return and
  // could never run.
  if (!sourceHtml) return '';
  return NodeHtmlMarkdown.translate(sourceHtml, OPTIONS).trim();
}