feat: relicense AGPL-3.0 → MIT (v2.7.0)
Clear the 3 Unsloth-Studio-derived AGPL files and flip LICENSE + 5 package.json from AGPL-3.0-only to MIT.

- html-to-md.ts → MIT node-html-markdown (parse5 dropped)
- llama-args-validator.ts → clean-room (flag denylist = facts)
- tool-call-parser.ts → delete dead Unsloth-ported code; keep extractToolCallBlocks/stripToolMarkup byte-identical (no behavior change)
- LICENSE → MIT (Copyright (c) 2026 indifferentketchup); 5 package.json → MIT; AGPL SPDX headers removed; README License section; license-mit guard test
- roadmap License-debt batch marked shipped; openspec/changes/license-debt-mit

Decouples the relicense from the native-parsing retirement (the ported parser was dead code). Server suite 519 passing; build + coder typecheck clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,347 +1,24 @@
|
||||
// SPDX-License-Identifier: AGPL-3.0-only
|
||||
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
|
||||
// Ported from studio/backend/core/inference/_html_to_md.py.
|
||||
// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/_html_to_md.py
|
||||
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
||||
|
||||
import { parse, type DefaultTreeAdapterTypes } from 'parse5';
|
||||
|
||||
type Document = DefaultTreeAdapterTypes.Document;
|
||||
type ChildNode = DefaultTreeAdapterTypes.ChildNode;
|
||||
type Element = DefaultTreeAdapterTypes.Element;
|
||||
type TextNode = DefaultTreeAdapterTypes.TextNode;
|
||||
|
||||
/** Elements whose entire subtree is dropped by the renderer's tree walk. */
const SKIP_TAGS = new Set<string>([
  'script',
  'style',
  'head',
  'noscript',
  'svg',
  'math',
  'nav',
  'footer',
]);

/** Block-level containers; the renderer emits a blank-line break around them. */
const BLOCK_TAGS = new Set<string>([
  'p',
  'div',
  'section',
  'article',
  'main',
  'aside',
  'figure',
  'figcaption',
  'details',
  'summary',
  'dl',
  'dt',
  'dd',
]);

/** Heading elements; the digit in the tag name is used as the heading level. */
const HEADING_TAGS = new Set<string>([
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
]);
|
||||
|
||||
const INLINE_EMPHASIS: Record<string, string> = {
|
||||
strong: '**', b: '**', em: '*', i: '*',
|
||||
// MIT-licensed HTML→Markdown rendering for the web_fetch tool. Output feeds an
|
||||
// LLM, so structural fidelity matters more than exact whitespace.
|
||||
// Tags whose content is dropped from the Markdown output entirely.
const IGNORED_TAGS = ['script', 'style', 'head', 'noscript', 'svg', 'math', 'nav', 'footer'];

/** Options object handed to NodeHtmlMarkdown.translate(). */
const OPTIONS = {
  // Emphasis and list markers: *em*, **strong**, "* " bullets.
  emDelimiter: '*',
  strongDelimiter: '**',
  bulletMarker: '*',

  // Code blocks are rendered as ``` fences.
  codeBlockStyle: 'fenced' as const,
  codeFence: '```',

  // Keep links in []() form instead of <url> autolinks.
  useInlineLinks: false,

  // Cap on consecutive newlines in the output.
  // NOTE(review): if the library counts newline characters (not blank lines),
  // a value of 1 removes the blank line between paragraphs — confirm this is
  // the intended separator for the downstream consumer.
  maxConsecutiveNewlines: 1,

  // Non-content elements to strip.
  ignore: IGNORED_TAGS,
};
|
||||
|
||||
/**
 * Type guard: true when the node exposes a `tagName` property (own or
 * inherited), which is how element nodes are told apart from other children.
 */
function isElement(node: ChildNode): node is Element {
  const hasTagName = Reflect.has(node, 'tagName');
  return hasTagName;
}
|
||||
|
||||
/** Type guard: true when the node's `nodeName` is exactly `#text`. */
function isText(node: ChildNode): node is TextNode {
  const { nodeName } = node;
  return nodeName === '#text';
}
|
||||
|
||||
/**
 * Stateful HTML → Markdown renderer.
 *
 * `walk()` visits a parsed tree depth-first, calling `handleOpen` before an
 * element's children and `handleClose` after them. Generated text goes through
 * `emit()`, which redirects it into whichever buffer is currently open
 * (link text, table cell, <pre> body, blockquote) before it reaches the
 * top-level output returned by `getOutput()`.
 */
class MarkdownRenderer {
  /** Top-level output fragments, concatenated by getOutput(). */
  private out: string[] = [];

  // Link state: text inside <a> is buffered until the element closes.
  private inLink = false;
  private linkHref: string | null = null;
  private linkTextParts: string[] = [];

  // List state: one entry ('ul' | 'ol') per open list; olCounter holds the
  // last number emitted for each open <ol>.
  private listStack: string[] = [];
  private olCounter: number[] = [];

  // Table state: cells are buffered into currentRow, rows are flushed by finishRow().
  private inTable = false;
  private currentRow: string[] = [];
  private cellParts: string[] = [];
  private inCell = false;
  private headerRowDone = false;
  private rowHasTh = false;
  private isFirstRow = false;

  // Code state: <pre> bodies are buffered verbatim; inline <code> bypasses
  // whitespace collapsing.
  private inPre = false;
  private preParts: string[] = [];
  private preLanguage: string | null = null;
  private inInlineCode = false;

  /** One buffer per open <blockquote>; innermost is last. */
  private bqStack: string[][] = [];

  /**
   * Route a text fragment to the innermost open buffer.
   * Priority: link text, table cell, <pre> body, blockquote, then top-level output.
   */
  private emit(text: string): void {
    if (this.inLink) {
      this.linkTextParts.push(text);
    } else if (this.inCell) {
      this.cellParts.push(text);
    } else if (this.inPre) {
      this.preParts.push(text);
    } else if (this.bqStack.length > 0) {
      this.bqStack[this.bqStack.length - 1]!.push(text);
    } else {
      this.out.push(text);
    }
  }

  /**
   * Look up the emphasis delimiter for a tag.
   *
   * Uses an own-property check rather than `in`: `in` also matches inherited
   * keys, so a tag literally named `constructor` would otherwise resolve to
   * Object's constructor function and be stringified into the output.
   */
  private emphasisMarker(tag: string): string | undefined {
    return Object.prototype.hasOwnProperty.call(INLINE_EMPHASIS, tag)
      ? INLINE_EMPHASIS[tag]
      : undefined;
  }

  /**
   * Normalise buffered blockquote content and prefix every line with `>`.
   * Returns '' when nothing but whitespace was buffered.
   */
  private prefixBlockquote(content: string): string {
    content = content.replace(/[ \t]+$/gm, '');
    content = content.replace(/\n{3,}/g, '\n\n').trim();
    if (!content) return '';
    // Blank lines become a bare '>' so the quote stays contiguous.
    return content
      .split('\n')
      .map(line => (line.trim() ? '> ' + line : '>'))
      .join('\n');
  }

  /** Close the current table cell (if any) and append its text to the row. */
  private finishCell(): void {
    if (!this.inCell) return;
    this.inCell = false;
    // A table row must stay on one line, and literal pipes would split cells.
    let cellText = this.cellParts.join('').trim().replace(/\n/g, ' ');
    cellText = cellText.replace(/\|/g, '\\|');
    this.currentRow.push(cellText);
    this.cellParts = [];
  }

  /**
   * Emit the buffered row. The first row of a table, or the first row that
   * contains a <th>, is followed by the `---` separator line exactly once.
   */
  private finishRow(): void {
    if (this.currentRow.length === 0) return;
    const line = '| ' + this.currentRow.join(' | ') + ' |';
    this.emit(line + '\n');
    if (!this.headerRowDone && (this.rowHasTh || this.isFirstRow)) {
      const sep = '| ' + this.currentRow.map(() => '---').join(' | ') + ' |';
      this.emit(sep + '\n');
      this.headerRowDone = true;
    }
    this.isFirstRow = false;
    this.currentRow = [];
    this.rowHasTh = false;
  }

  /**
   * Flush the buffered link. Emits `[text](href)` when both are present,
   * plain text when there is no href, and nothing when the text is empty.
   */
  private finishLink(): void {
    const text = this.linkTextParts.join('').replace(/\s+/g, ' ').trim();
    const href = this.linkHref ?? '';
    // Leave link mode first so the emit below goes to the enclosing buffer.
    this.inLink = false;
    if (href && text) {
      this.emit(`[${text}](${href})`);
    } else if (text) {
      this.emit(text);
    }
  }

  /** Return the value of attribute `name` on `el`, or undefined when absent. */
  private getAttr(el: Element, name: string): string | undefined {
    return el.attrs.find(a => a.name === name)?.value;
  }

  /** Handle an element's opening edge: emit leading markup and set up state. */
  private handleOpen(el: Element): void {
    const tag = el.tagName.toLowerCase();
    const emphasis = this.emphasisMarker(tag);

    if (HEADING_TAGS.has(tag)) {
      // The heading level is the digit in h1..h6.
      const level = parseInt(tag[1]!, 10);
      this.emit('\n\n' + '#'.repeat(level) + ' ');
    } else if (tag === 'a') {
      this.linkHref = this.getAttr(el, 'href') ?? null;
      this.linkTextParts = [];
      this.inLink = true;
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (tag === 'br') {
      this.emit('\n');
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'hr') {
      this.emit('\n\n---\n\n');
    } else if (tag === 'blockquote') {
      // Emit the break before pushing so it lands in the enclosing buffer.
      this.emit('\n\n');
      this.bqStack.push([]);
    } else if (tag === 'ul') {
      this.listStack.push('ul');
      this.emit('\n');
    } else if (tag === 'ol') {
      this.listStack.push('ol');
      const startAttr = this.getAttr(el, 'start');
      let start = 1;
      if (startAttr != null) {
        const parsed = parseInt(startAttr, 10);
        if (!isNaN(parsed)) start = parsed;
      }
      // Stored one below the start because <li> increments before printing.
      this.olCounter.push(start - 1);
      this.emit('\n');
    } else if (tag === 'li') {
      // One space of indent per nesting level beyond the first.
      const indent = ' '.repeat(Math.max(0, this.listStack.length - 1));
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol') {
        if (this.olCounter.length > 0) {
          this.olCounter[this.olCounter.length - 1]!++;
          this.emit(`\n${indent}${this.olCounter[this.olCounter.length - 1]}. `);
        } else {
          this.emit(`\n${indent}1. `);
        }
      } else {
        this.emit(`\n${indent}* `);
      }
    } else if (tag === 'pre') {
      this.preParts = [];
      this.inPre = true;
      this.preLanguage = null;
      // A direct <code class="language-xyz"> child supplies the fence language.
      const codeChild = el.childNodes.find(
        (c): c is Element => isElement(c) && c.tagName === 'code'
      );
      if (codeChild) {
        const cls = this.getAttr(codeChild, 'class') ?? '';
        const langMatch = cls.match(/(?:^|\s)language-(\S+)/);
        if (langMatch) this.preLanguage = langMatch[1]!;
      }
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = true;
      this.emit('`');
    } else if (tag === 'table') {
      this.inTable = true;
      this.headerRowDone = false;
      this.isFirstRow = true;
      this.emit('\n\n');
    } else if (tag === 'tr') {
      // Flush anything left open by a previous row.
      this.finishCell();
      this.finishRow();
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
      this.cellParts = [];
      this.inCell = true;
      if (tag === 'th') this.rowHasTh = true;
    }
  }

  /** Handle an element's closing edge: emit trailing markup and flush buffers. */
  private handleClose(tag: string): void {
    tag = tag.toLowerCase();
    const emphasis = this.emphasisMarker(tag);

    if (HEADING_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'a') {
      this.finishLink();
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'blockquote') {
      if (this.bqStack.length > 0) {
        // Pop first so the prefixed text is emitted into the parent buffer.
        const content = this.bqStack.pop()!.join('');
        const prefixed = this.prefixBlockquote(content);
        if (prefixed) this.emit('\n\n' + prefixed + '\n\n');
      }
    } else if (tag === 'ul') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ul') {
        this.listStack.pop();
      }
      this.emit('\n');
    } else if (tag === 'ol') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol') {
        this.listStack.pop();
        if (this.olCounter.length > 0) this.olCounter.pop();
      }
      this.emit('\n');
    } else if (tag === 'pre') {
      const raw = this.preParts.join('');
      // Leave <pre> mode before emitting so the fence is not buffered into preParts.
      this.inPre = false;
      const lang = this.preLanguage ?? '';
      const block = '```' + lang + '\n' + raw + '\n```';
      this.emit('\n\n' + block + '\n\n');
      this.preLanguage = null;
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = false;
      this.emit('`');
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
    } else if (tag === 'tr') {
      this.finishCell();
      this.finishRow();
    } else if (tag === 'table') {
      this.finishCell();
      this.finishRow();
      this.inTable = false;
      this.emit('\n');
    }
  }

  /**
   * Handle a text node. <pre> text is buffered verbatim, inline-code text is
   * emitted unchanged, and all other text has whitespace runs collapsed to a
   * single space.
   */
  private handleText(data: string): void {
    if (this.inPre) {
      this.preParts.push(data);
      return;
    }
    if (this.inInlineCode) {
      this.emit(data);
      return;
    }
    const text = data.replace(/\s+/g, ' ');
    // Whitespace between table structure tags (outside any cell) is dropped.
    if (this.inTable && !this.inCell && !text.trim()) return;
    this.emit(text);
  }

  /**
   * Recursively render `node` and its descendants.
   * Comments, <img>, and any element in SKIP_TAGS (with its subtree) are ignored.
   */
  walk(node: ChildNode | Document): void {
    if (isText(node as ChildNode)) {
      this.handleText((node as TextNode).value);
      return;
    }
    if (node.nodeName === '#comment') return;

    if (isElement(node as ChildNode)) {
      const el = node as Element;
      const tag = el.tagName.toLowerCase();
      if (SKIP_TAGS.has(tag)) return;
      if (tag === 'img') return;

      this.handleOpen(el);

      if (tag === 'pre') {
        // Unwrap a direct <code> child: walk its children without running the
        // open/close handlers on the <code> element itself.
        for (const child of el.childNodes) {
          if (isElement(child) && child.tagName === 'code') {
            for (const grandchild of child.childNodes) {
              this.walk(grandchild);
            }
          } else {
            this.walk(child);
          }
        }
      } else {
        for (const child of el.childNodes) {
          this.walk(child);
        }
      }

      this.handleClose(tag);
      return;
    }

    // Non-element containers (e.g. the document root): just descend.
    if ('childNodes' in node) {
      for (const child of (node as Document).childNodes) {
        this.walk(child);
      }
    }
  }

  /** Return everything emitted to the top-level buffer so far as one string. */
  getOutput(): string {
    return this.out.join('');
  }
}
|
||||
|
||||
/**
 * Tidy rendered Markdown line by line.
 *
 * Outside code fences: trailing spaces/tabs are removed and consecutive blank
 * lines are reduced to one. Lines beginning with ``` toggle fence mode and are
 * themselves right-trimmed; lines inside a fence are passed through untouched.
 * The joined result is trimmed at both ends.
 */
function cleanup(text: string): string {
  const kept: string[] = [];
  let insideFence = false;
  let blanksSeen = 0;

  text.split('\n').forEach((rawLine) => {
    const rightTrimmed = rawLine.replace(/[ \t]+$/, '');

    if (rightTrimmed.startsWith('```')) {
      // Fence delimiter: flip mode and restart blank-line counting.
      insideFence = !insideFence;
      blanksSeen = 0;
      kept.push(rightTrimmed);
    } else if (insideFence) {
      // Code content keeps its exact whitespace.
      kept.push(rawLine);
    } else if (rightTrimmed === '') {
      // Only the first blank line of a run survives.
      blanksSeen += 1;
      if (blanksSeen === 1) kept.push('');
    } else {
      blanksSeen = 0;
      kept.push(rightTrimmed);
    }
  });

  return kept.join('\n').trim();
}
|
||||
|
||||
/**
 * Convert an HTML string to Markdown using node-html-markdown.
 *
 * @param sourceHtml HTML document or fragment to translate.
 * @returns The translated Markdown with leading/trailing whitespace trimmed,
 *   or '' when `sourceHtml` is empty.
 */
export function htmlToMarkdown(sourceHtml: string): string {
  // Guard first: nothing to translate, so skip the library call entirely.
  if (!sourceHtml) return '';
  return NodeHtmlMarkdown.translate(sourceHtml, OPTIONS).trim();
}
|
||||
|
||||
Reference in New Issue
Block a user