v2.4.0-unsloth-studio-lift: port 3 Unsloth Studio AGPL-3.0 modules
Batch 1 — tool-call-parser.ts: replaces xml-parser.ts with a port of
Unsloth's tool_call_parser.py. Adds balanced-brace JSON scanner,
single-param fast path, hasToolSignal/stripToolMarkup/parseToolCallsFromText
exports, and stream-finalization stripping at all three final-write sites
(error-handler, finalizeCompletion, executeToolPhase). Anthropic <invoke>
shape preserved. 75+12 tests.
Batch 2 — web/html-to-md.ts: parse5 tree-walking HTML-to-Markdown converter
ported from Unsloth's _html_to_md.py. Replaces web_fetch's regex stripHtml
with structured markdown output (headings, links, lists, tables, code blocks,
blockquotes, entity decoding). 29 tests.
Batch 3 — llama-args-validator.ts: port of llama_server_args.py deny-list
validator. Wired into AGENTS.md frontmatter parser — llama_extra_args field
validated at load time, rejects managed flags (model identity, networking,
auth/TLS, server UI). No runtime consumer yet (llama-swap boundary). 76 tests.
All three files carry SPDX-License-Identifier: AGPL-3.0-only headers.
LICENSE flipped to AGPL-3.0-only in prior commit (a938cf1).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
347
apps/server/src/services/web/html-to-md.ts
Normal file
347
apps/server/src/services/web/html-to-md.ts
Normal file
@@ -0,0 +1,347 @@
|
||||
// SPDX-License-Identifier: AGPL-3.0-only
|
||||
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
|
||||
// Ported from studio/backend/core/inference/_html_to_md.py.
|
||||
// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/_html_to_md.py
|
||||
|
||||
import { parse, type DefaultTreeAdapterTypes } from 'parse5';
|
||||
|
||||
type Document = DefaultTreeAdapterTypes.Document;
|
||||
type ChildNode = DefaultTreeAdapterTypes.ChildNode;
|
||||
type Element = DefaultTreeAdapterTypes.Element;
|
||||
type TextNode = DefaultTreeAdapterTypes.TextNode;
|
||||
|
||||
/** Elements whose whole subtree is ignored by the renderer's tree walk. */
const SKIP_TAGS = new Set<string>([
  'script',
  'style',
  'head',
  'noscript',
  'svg',
  'math',
  'nav',
  'footer',
]);

/** Container elements that are surrounded by a blank line on open and close. */
const BLOCK_TAGS = new Set<string>([
  'p',
  'div',
  'section',
  'article',
  'main',
  'aside',
  'figure',
  'figcaption',
  'details',
  'summary',
  'dl',
  'dt',
  'dd',
]);

/** Heading elements; the digit in the tag name gives the number of `#` marks. */
const HEADING_TAGS = new Set<string>([
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
]);
|
||||
|
||||
/**
 * Markdown delimiter written on both sides of an inline emphasis element.
 *
 * The table is built on a null-prototype object so that `tag in INLINE_EMPHASIS`
 * and `INLINE_EMPHASIS[tag]` only ever see the four keys listed here. With an
 * ordinary object literal, an element named after an inherited
 * `Object.prototype` member (for example `<constructor>`) passes the `in` test
 * and the lookup yields a function instead of a delimiter string.
 */
const INLINE_EMPHASIS: Record<string, string> = Object.assign(
  Object.create(null) as Record<string, string>,
  { strong: '**', b: '**', em: '*', i: '*' },
);
|
||||
|
||||
/**
 * Type guard: reports whether `node` carries a `tagName` property, which is
 * how this module tells element nodes apart from the other child-node kinds.
 */
function isElement(node: ChildNode): node is Element {
  const hasTagName = 'tagName' in node;
  return hasTagName;
}
|
||||
|
||||
/** Type guard: reports whether `node` is a text node (`nodeName` of `#text`). */
function isText(node: ChildNode): node is TextNode {
  const { nodeName } = node;
  return nodeName === '#text';
}
|
||||
|
||||
/**
 * Stateful renderer that walks a parsed HTML tree and accumulates Markdown.
 *
 * Usage: call `walk(document)` once, then `getOutput()` for the raw Markdown
 * (callers are expected to post-process whitespace themselves).
 *
 * Generated text is routed through `emit`, which appends to the innermost
 * active buffer in this fixed priority order: link text, table cell,
 * `<pre>` body, innermost blockquote, top-level output.
 */
class MarkdownRenderer {
  /** Top-level Markdown fragments, concatenated by `getOutput`. */
  private out: string[] = [];

  // Link state: text is buffered until the closing `</a>` so the whole
  // `[text](href)` form can be emitted at once.
  private inLink = false;
  private linkHref: string | null = null;
  private linkTextParts: string[] = [];

  // List state: one entry ('ul' | 'ol') per open list; `olCounter` holds the
  // last number used for each open ordered list.
  private listStack: string[] = [];
  private olCounter: number[] = [];

  // Table state: cells are buffered into `cellParts`, finished cells into
  // `currentRow`, and a separator row is emitted once after the header row.
  private inTable = false;
  private currentRow: string[] = [];
  private cellParts: string[] = [];
  private inCell = false;
  private headerRowDone = false;
  private rowHasTh = false;
  private isFirstRow = false;

  // Code state: `<pre>` content is buffered verbatim and fenced on close.
  private inPre = false;
  private preParts: string[] = [];
  private preLanguage: string | null = null;
  private inInlineCode = false;

  /** One buffer per open `<blockquote>`; the last entry is the innermost. */
  private bqStack: string[][] = [];

  /**
   * Returns the emphasis delimiter for `tag`, or `undefined` when the tag is
   * not an emphasis element.
   *
   * Uses an own-property check instead of the `in` operator: `in` also matches
   * inherited `Object.prototype` members, so an element such as
   * `<constructor>` would otherwise be treated as emphasis and a function
   * would be pushed into the output.
   */
  private static emphasisMarker(tag: string): string | undefined {
    return Object.prototype.hasOwnProperty.call(INLINE_EMPHASIS, tag)
      ? INLINE_EMPHASIS[tag]
      : undefined;
  }

  /** Appends `text` to the innermost active buffer (see class comment). */
  private emit(text: string): void {
    if (this.inLink) {
      this.linkTextParts.push(text);
    } else if (this.inCell) {
      this.cellParts.push(text);
    } else if (this.inPre) {
      this.preParts.push(text);
    } else if (this.bqStack.length > 0) {
      this.bqStack[this.bqStack.length - 1]!.push(text);
    } else {
      this.out.push(text);
    }
  }

  /**
   * Normalises blockquote content and prefixes every line with `>`.
   *
   * Trailing spaces/tabs are removed per line, runs of three or more newlines
   * collapse to one blank line, and blank lines become a bare `>`.
   *
   * @returns the quoted text, or `''` when the content is empty after trimming.
   */
  private prefixBlockquote(content: string): string {
    content = content.replace(/[ \t]+$/gm, '');
    content = content.replace(/\n{3,}/g, '\n\n').trim();
    if (!content) return '';
    return content
      .split('\n')
      .map(line => (line.trim() ? '> ' + line : '>'))
      .join('\n');
  }

  /**
   * Closes the current table cell, if any, and appends it to `currentRow`.
   * Newlines are flattened to spaces and `|` is escaped so the cell cannot
   * break the pipe-table row.
   */
  private finishCell(): void {
    if (!this.inCell) return;
    this.inCell = false;
    let cellText = this.cellParts.join('').trim().replace(/\n/g, ' ');
    cellText = cellText.replace(/\|/g, '\\|');
    this.currentRow.push(cellText);
    this.cellParts = [];
  }

  /**
   * Emits the buffered row as a pipe-table line. The `---` separator row is
   * emitted once per table, after the first row that either contains a `<th>`
   * or is the first row of the table.
   */
  private finishRow(): void {
    if (this.currentRow.length === 0) return;
    const line = '| ' + this.currentRow.join(' | ') + ' |';
    this.emit(line + '\n');
    if (!this.headerRowDone && (this.rowHasTh || this.isFirstRow)) {
      const sep = '| ' + this.currentRow.map(() => '---').join(' | ') + ' |';
      this.emit(sep + '\n');
      this.headerRowDone = true;
    }
    this.isFirstRow = false;
    this.currentRow = [];
    this.rowHasTh = false;
  }

  /**
   * Emits the buffered link as `[text](href)`; falls back to the bare text
   * when there is no href, and emits nothing when there is no text.
   */
  private finishLink(): void {
    const text = this.linkTextParts.join('').replace(/\s+/g, ' ').trim();
    const href = this.linkHref ?? '';
    // Leave link mode first so the emit calls below reach the enclosing buffer.
    this.inLink = false;
    if (href && text) {
      this.emit(`[${text}](${href})`);
    } else if (text) {
      this.emit(text);
    }
  }

  /** Returns the value of attribute `name` on `el`, or `undefined` if absent. */
  private getAttr(el: Element, name: string): string | undefined {
    return el.attrs.find(a => a.name === name)?.value;
  }

  /** Emits the opening Markdown for `el` and updates renderer state. */
  private handleOpen(el: Element): void {
    const tag = el.tagName.toLowerCase();
    const emphasis = MarkdownRenderer.emphasisMarker(tag);

    if (HEADING_TAGS.has(tag)) {
      // The second character of 'h1'..'h6' is the heading level.
      const level = parseInt(tag[1]!, 10);
      this.emit('\n\n' + '#'.repeat(level) + ' ');
    } else if (tag === 'a') {
      this.linkHref = this.getAttr(el, 'href') ?? null;
      this.linkTextParts = [];
      this.inLink = true;
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (tag === 'br') {
      this.emit('\n');
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'hr') {
      this.emit('\n\n---\n\n');
    } else if (tag === 'blockquote') {
      // Emit the separator into the enclosing buffer, then start a new one.
      this.emit('\n\n');
      this.bqStack.push([]);
    } else if (tag === 'ul') {
      this.listStack.push('ul');
      this.emit('\n');
    } else if (tag === 'ol') {
      this.listStack.push('ol');
      const startAttr = this.getAttr(el, 'start');
      let start = 1;
      if (startAttr != null) {
        const parsed = parseInt(startAttr, 10);
        if (!isNaN(parsed)) start = parsed;
      }
      // Stored as start - 1 because each <li> increments before printing.
      this.olCounter.push(start - 1);
      this.emit('\n');
    } else if (tag === 'li') {
      // NOTE(review): one space per nesting level as written here; CommonMark
      // nests a child list only when it is indented to the parent's content
      // column, so confirm the intended indent width.
      const indent = ' '.repeat(Math.max(0, this.listStack.length - 1));
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol') {
        if (this.olCounter.length > 0) {
          this.olCounter[this.olCounter.length - 1]!++;
          this.emit(`\n${indent}${this.olCounter[this.olCounter.length - 1]}. `);
        } else {
          this.emit(`\n${indent}1. `);
        }
      } else {
        this.emit(`\n${indent}* `);
      }
    } else if (tag === 'pre') {
      this.preParts = [];
      this.inPre = true;
      this.preLanguage = null;
      // A direct <code class="language-xyz"> child supplies the fence language.
      const codeChild = el.childNodes.find(
        (c): c is Element => isElement(c) && c.tagName === 'code'
      );
      if (codeChild) {
        const cls = this.getAttr(codeChild, 'class') ?? '';
        const langMatch = cls.match(/(?:^|\s)language-(\S+)/);
        if (langMatch) this.preLanguage = langMatch[1]!;
      }
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = true;
      this.emit('`');
    } else if (tag === 'table') {
      this.inTable = true;
      this.headerRowDone = false;
      this.isFirstRow = true;
      this.emit('\n\n');
    } else if (tag === 'tr') {
      // Flush any cell/row still pending before starting a new row.
      this.finishCell();
      this.finishRow();
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
      this.cellParts = [];
      this.inCell = true;
      if (tag === 'th') this.rowHasTh = true;
    }
  }

  /** Emits the closing Markdown for `tag` and updates renderer state. */
  private handleClose(tag: string): void {
    tag = tag.toLowerCase();
    const emphasis = MarkdownRenderer.emphasisMarker(tag);

    if (HEADING_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'a') {
      this.finishLink();
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'blockquote') {
      if (this.bqStack.length > 0) {
        // Pop first so the quoted text lands in the enclosing buffer.
        const content = this.bqStack.pop()!.join('');
        const prefixed = this.prefixBlockquote(content);
        if (prefixed) this.emit('\n\n' + prefixed + '\n\n');
      }
    } else if (tag === 'ul') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ul') {
        this.listStack.pop();
      }
      this.emit('\n');
    } else if (tag === 'ol') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol') {
        this.listStack.pop();
        if (this.olCounter.length > 0) this.olCounter.pop();
      }
      this.emit('\n');
    } else if (tag === 'pre') {
      const raw = this.preParts.join('');
      // Leave pre mode before emitting so the fence reaches the outer buffer.
      this.inPre = false;
      const lang = this.preLanguage ?? '';
      const block = '```' + lang + '\n' + raw + '\n```';
      this.emit('\n\n' + block + '\n\n');
      this.preLanguage = null;
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = false;
      this.emit('`');
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
    } else if (tag === 'tr') {
      this.finishCell();
      this.finishRow();
    } else if (tag === 'table') {
      this.finishCell();
      this.finishRow();
      this.inTable = false;
      this.emit('\n');
    }
  }

  /**
   * Handles a text node. Text inside `<pre>` is appended verbatim to the pre
   * buffer, inline-code text is emitted unchanged, and all other text has
   * whitespace runs collapsed to a single space. Whitespace-only text between
   * table cells is dropped.
   */
  private handleText(data: string): void {
    if (this.inPre) {
      this.preParts.push(data);
      return;
    }
    if (this.inInlineCode) {
      this.emit(data);
      return;
    }
    const text = data.replace(/\s+/g, ' ');
    if (this.inTable && !this.inCell && !text.trim()) return;
    this.emit(text);
  }

  /**
   * Recursively renders `node` and its descendants.
   *
   * Comments, `<img>` elements and every element listed in `SKIP_TAGS` are
   * ignored together with their subtrees. For `<pre>`, a direct `<code>`
   * child is unwrapped (its children are walked directly) so it does not
   * produce inline-code backticks inside the fence.
   */
  walk(node: ChildNode | Document): void {
    if (isText(node as ChildNode)) {
      this.handleText((node as TextNode).value);
      return;
    }
    if (node.nodeName === '#comment') return;

    if (isElement(node as ChildNode)) {
      const el = node as Element;
      const tag = el.tagName.toLowerCase();
      if (SKIP_TAGS.has(tag)) return;
      if (tag === 'img') return;

      this.handleOpen(el);

      if (tag === 'pre') {
        for (const child of el.childNodes) {
          if (isElement(child) && child.tagName === 'code') {
            for (const grandchild of child.childNodes) {
              this.walk(grandchild);
            }
          } else {
            this.walk(child);
          }
        }
      } else {
        for (const child of el.childNodes) {
          this.walk(child);
        }
      }

      this.handleClose(tag);
      return;
    }

    // Non-element containers (e.g. the document root): just descend.
    if ('childNodes' in node) {
      for (const child of (node as Document).childNodes) {
        this.walk(child);
      }
    }
  }

  /** Returns everything emitted to the top-level buffer so far, unnormalised. */
  getOutput(): string {
    return this.out.join('');
  }
}
|
||||
|
||||
/**
 * Normalises whitespace in rendered Markdown.
 *
 * Outside fenced code blocks, trailing spaces/tabs are removed from each line
 * and consecutive blank lines collapse to a single blank line. Lines inside a
 * fence (between lines starting with three backticks) are kept untouched; the
 * fence marker lines themselves only lose trailing whitespace. The final
 * result is trimmed at both ends.
 */
function cleanup(text: string): string {
  const kept: string[] = [];
  let insideFence = false;
  let previousWasBlank = false;

  text.split('\n').forEach(rawLine => {
    const withoutTrailing = rawLine.replace(/[ \t]+$/, '');

    if (withoutTrailing.startsWith('```')) {
      // Fence marker: toggle fence mode and restart blank-line tracking.
      insideFence = !insideFence;
      previousWasBlank = false;
      kept.push(withoutTrailing);
    } else if (insideFence) {
      // Code content is preserved exactly, including trailing whitespace.
      kept.push(rawLine);
    } else if (withoutTrailing === '') {
      // Keep only the first blank line of a run.
      if (!previousWasBlank) kept.push('');
      previousWasBlank = true;
    } else {
      previousWasBlank = false;
      kept.push(withoutTrailing);
    }
  });

  return kept.join('\n').trim();
}
|
||||
|
||||
/**
 * Converts an HTML string to Markdown.
 *
 * Line endings are normalised to `\n`, the HTML is parsed into a tree, the
 * tree is rendered with `MarkdownRenderer`, and the result is passed through
 * `cleanup` before being returned.
 *
 * @param sourceHtml - raw HTML markup
 * @returns the Markdown rendering of `sourceHtml`
 */
export function htmlToMarkdown(sourceHtml: string): string {
  const withUnixNewlines = sourceHtml.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  const renderer = new MarkdownRenderer();
  renderer.walk(parse(withUnixNewlines));
  const rawMarkdown = renderer.getOutput();
  return cleanup(rawMarkdown);
}
|
||||
1
apps/server/src/services/web/index.ts
Normal file
1
apps/server/src/services/web/index.ts
Normal file
@@ -0,0 +1 @@
|
||||
export { htmlToMarkdown } from './html-to-md.js';
|
||||
Reference in New Issue
Block a user