v2.4.0-unsloth-studio-lift: port 3 Unsloth Studio AGPL-3.0 modules

Batch 1 — tool-call-parser.ts: replaces xml-parser.ts with a port of
Unsloth's tool_call_parser.py. Adds balanced-brace JSON scanner,
single-param fast path, hasToolSignal/stripToolMarkup/parseToolCallsFromText
exports, and stream-finalization stripping at all three final-write sites
(error-handler, finalizeCompletion, executeToolPhase). Anthropic <invoke>
shape preserved. 75+12 tests.

Batch 2 — web/html-to-md.ts: parse5 tree-walking HTML-to-Markdown converter
ported from Unsloth's _html_to_md.py. Replaces web_fetch's regex stripHtml
with structured markdown output (headings, links, lists, tables, code blocks,
blockquotes, entity decoding). 29 tests.

Batch 3 — llama-args-validator.ts: port of llama_server_args.py deny-list
validator. Wired into AGENTS.md frontmatter parser — llama_extra_args field
validated at load time, rejects managed flags (model identity, networking,
auth/TLS, server UI). No runtime consumer yet (llama-swap boundary). 76 tests.

All three files carry SPDX-License-Identifier: AGPL-3.0-only headers.
LICENSE flipped to AGPL-3.0-only in prior commit (a938cf1).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-26 23:30:50 +00:00
parent a938cf1d42
commit 90a6761b07
17 changed files with 1672 additions and 311 deletions

View File

@@ -12,6 +12,7 @@ import { z } from 'zod';
import { isPublicUrl } from './url_guard.js';
import type { ToolDef } from './tools.js';
import { truncateIfNeeded } from './truncate.js';
import { htmlToMarkdown } from './web/index.js';
const WebFetchInput = z.object({
url: z.string().min(1).max(2048),
@@ -38,29 +39,9 @@ export type WebFetchOutput =
}
| { error: string; reason: string; content_type?: string };
function stripHtml(html: string): { text: string; title: string | undefined } {
// Title first, before we destroy the markup. Trim collapsed whitespace.
function extractTitle(html: string): string | undefined {
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
const title = titleMatch?.[1]?.replace(/\s+/g, ' ').trim() || undefined;
// Drop script + style + comments entirely (their CONTENT must not leak —
// a regex tag stripper alone would expose inline JS as plain text).
const text = html
.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ')
.replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi, ' ')
.replace(/<!--[\s\S]*?-->/g, ' ')
.replace(/<[^>]+>/g, ' ')
// Minimal entity decode — full coverage would need a table; covering
// the five common ones plus &nbsp; is enough for snippet readability.
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/\s+/g, ' ')
.trim();
return { text, title };
return titleMatch?.[1]?.replace(/\s+/g, ' ').trim() || undefined;
}
// v1.11.10: streaming body reader. Aborts the response stream the instant
@@ -211,9 +192,8 @@ export async function executeWebFetch(
let textRaw: string;
let title: string | undefined;
if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
const stripped = stripHtml(body);
textRaw = stripped.text;
title = stripped.title;
title = extractTitle(body);
textRaw = htmlToMarkdown(body);
} else if (
contentType.includes('text/plain') ||
contentType.includes('text/markdown') ||