v2.4.0-unsloth-studio-lift: port 3 Unsloth Studio AGPL-3.0 modules

Batch 1 — tool-call-parser.ts: replaces xml-parser.ts with a port of Unsloth's tool_call_parser.py. Adds a balanced-brace JSON scanner, a single-param fast path, the hasToolSignal/stripToolMarkup/parseToolCallsFromText exports, and stream-finalization stripping at all three final-write sites (error-handler, finalizeCompletion, executeToolPhase). The Anthropic <invoke> shape is preserved. 75+12 tests. Batch 2 — web/html-to-md.ts: a parse5 tree-walking HTML-to-Markdown converter ported from Unsloth's _html_to_md.py. Replaces web_fetch's regex-based stripHtml with structured markdown output (headings, links, lists, tables, code blocks, blockquotes, entity decoding). 29 tests. Batch 3 — llama-args-validator.ts: a port of the llama_server_args.py deny-list validator. Wired into the AGENTS.md frontmatter parser: the llama_extra_args field is validated at load time, and managed flags (model identity, networking, auth/TLS, server UI) are rejected. No runtime consumer yet (llama-swap boundary). 76 tests. All three files carry SPDX-License-Identifier: AGPL-3.0-only headers. LICENSE was switched to AGPL-3.0-only in the prior commit (a938cf1). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-26 23:30:50 +00:00
parent a938cf1d42
commit 90a6761b07
17 changed files with 1672 additions and 311 deletions
--- a/apps/server/src/services/web_fetch.ts
+++ b/apps/server/src/services/web_fetch.ts
@@ -12,6 +12,7 @@ import { z } from 'zod';
 import { isPublicUrl } from './url_guard.js';
 import type { ToolDef } from './tools.js';
 import { truncateIfNeeded } from './truncate.js';
+import { htmlToMarkdown } from './web/index.js';

 const WebFetchInput = z.object({
  url: z.string().min(1).max(2048),
@@ -38,29 +39,9 @@ export type WebFetchOutput =
    }
  | { error: string; reason: string; content_type?: string };

-function stripHtml(html: string): { text: string; title: string | undefined } {
-  // Title first, before we destroy the markup. Trim collapsed whitespace.
+function extractTitle(html: string): string | undefined {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
-  const title = titleMatch?.[1]?.replace(/\s+/g, ' ').trim() || undefined;
-  // Drop script + style + comments entirely (their CONTENT must not leak —
-  // a regex tag stripper alone would expose inline JS as plain text).
-  const text = html
-    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
-    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ')
-    .replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi, ' ')
-    .replace(/<!--[\s\S]*?-->/g, ' ')
-    .replace(/<[^>]+>/g, ' ')
-    // Minimal entity decode — full coverage would need a table; covering
-    // the five common ones plus &nbsp; is enough for snippet readability.
-    .replace(/&nbsp;/g, ' ')
-    .replace(/&amp;/g, '&')
-    .replace(/&lt;/g, '<')
-    .replace(/&gt;/g, '>')
-    .replace(/&quot;/g, '"')
-    .replace(/&#39;/g, "'")
-    .replace(/\s+/g, ' ')
-    .trim();
-  return { text, title };
+  return titleMatch?.[1]?.replace(/\s+/g, ' ').trim() || undefined;
 }

 // v1.11.10: streaming body reader. Aborts the response stream the instant
@@ -211,9 +192,8 @@ export async function executeWebFetch(
  let textRaw: string;
  let title: string | undefined;
  if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
-    const stripped = stripHtml(body);
-    textRaw = stripped.text;
-    title = stripped.title;
+    title = extractTitle(body);
+    textRaw = htmlToMarkdown(body);
  } else if (
    contentType.includes('text/plain') ||
    contentType.includes('text/markdown') ||