// SPDX-License-Identifier: AGPL-3.0-only
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.

// Ported from studio/backend/core/inference/_html_to_md.py.
// Original: https://github.com/unslothai/unsloth/blob/main/studio/backend/core/inference/_html_to_md.py

import { parse, type DefaultTreeAdapterTypes } from 'parse5';

type Document = DefaultTreeAdapterTypes.Document;
type ChildNode = DefaultTreeAdapterTypes.ChildNode;
type Element = DefaultTreeAdapterTypes.Element;
type TextNode = DefaultTreeAdapterTypes.TextNode;

/** Any node the renderer can be asked to walk: the document root or one of its descendants. */
type WalkableNode = ChildNode | Document;

/** Elements whose entire subtree is dropped from the output. */
const SKIP_TAGS: ReadonlySet<string> = new Set([
  'script',
  'style',
  'head',
  'noscript',
  'svg',
  'math',
  'nav',
  'footer',
]);

/** Elements rendered as a paragraph-like block (blank line before and after). */
const BLOCK_TAGS: ReadonlySet<string> = new Set([
  'p',
  'div',
  'section',
  'article',
  'main',
  'aside',
  'figure',
  'figcaption',
  'details',
  'summary',
  'dl',
  'dt',
  'dd',
]);

/** Heading elements; the digit in the tag name is the Markdown heading level. */
const HEADING_TAGS: ReadonlySet<string> = new Set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']);

/**
 * Inline emphasis tag -> Markdown delimiter.
 *
 * This is a Map rather than a plain object on purpose: looking a tag up with
 * `tag in obj` also matches keys inherited from Object.prototype, so an
 * unknown element named e.g. `constructor` would be treated as emphasis and a
 * function would be emitted instead of a string.
 */
const INLINE_EMPHASIS: ReadonlyMap<string, string> = new Map([
  ['strong', '**'],
  ['b', '**'],
  ['em', '*'],
  ['i', '*'],
]);

/** Indentation emitted per list nesting level in front of a list marker. */
const LIST_INDENT = '  ';

/** Type guard: true when the node carries a `tagName`, i.e. is an element. */
function isElement(node: WalkableNode): node is Element {
  return 'tagName' in node;
}

/** Type guard: true when the node is a text node. */
function isText(node: WalkableNode): node is TextNode {
  return node.nodeName === '#text';
}

/**
 * Walks a parse5 tree and accumulates Markdown text.
 *
 * Output is routed through {@link MarkdownRenderer.emit}, which redirects text
 * into whichever buffer is currently active (link text, table cell, `<pre>`
 * body, innermost blockquote) before falling back to the main output.
 */
class MarkdownRenderer {
  /** Top-level output fragments. */
  private out: string[] = [];

  // Link state: text is buffered until the closing </a>.
  private inLink = false;
  private linkHref: string | null = null;
  private linkTextParts: string[] = [];

  // List state: one entry per open list; olCounter tracks the last emitted
  // number of each open <ol>.
  private listStack: string[] = [];
  private olCounter: number[] = [];

  // Table state.
  private inTable = false;
  private currentRow: string[] = [];
  private cellParts: string[] = [];
  private inCell = false;
  private headerRowDone = false;
  private rowHasTh = false;
  private isFirstRow = false;

  // Preformatted / code state.
  private inPre = false;
  private preParts: string[] = [];
  private preLanguage: string | null = null;
  private inInlineCode = false;

  /** One buffer per open <blockquote>; the last entry is the innermost. */
  private bqStack: string[][] = [];

  /**
   * Appends text to the currently active buffer.
   *
   * Priority order: link text, table cell, `<pre>` body, innermost
   * blockquote, then the main output.
   */
  private emit(text: string): void {
    if (this.inLink) {
      this.linkTextParts.push(text);
    } else if (this.inCell) {
      this.cellParts.push(text);
    } else if (this.inPre) {
      this.preParts.push(text);
    } else if (this.bqStack.length > 0) {
      this.bqStack[this.bqStack.length - 1]!.push(text);
    } else {
      this.out.push(text);
    }
  }

  /**
   * Turns buffered blockquote content into `> `-prefixed lines.
   *
   * Trailing spaces/tabs are stripped per line, runs of three or more
   * newlines collapse to two, and the result is trimmed. Blank lines become a
   * bare `>`. Returns an empty string when nothing is left.
   */
  private prefixBlockquote(content: string): string {
    const normalized = content
      .replace(/[ \t]+$/gm, '')
      .replace(/\n{3,}/g, '\n\n')
      .trim();
    if (!normalized) return '';
    return normalized
      .split('\n')
      .map((line) => (line.trim() ? '> ' + line : '>'))
      .join('\n');
  }

  /** Closes the open table cell (if any) and appends its text to the current row. */
  private finishCell(): void {
    if (!this.inCell) return;
    this.inCell = false;
    // A Markdown table row is a single line, so newlines become spaces and
    // literal pipes are escaped.
    let cellText = this.cellParts.join('').trim().replace(/\n/g, ' ');
    cellText = cellText.replace(/\|/g, '\\|');
    this.currentRow.push(cellText);
    this.cellParts = [];
  }

  /**
   * Emits the current table row, followed by a `---` separator row when this
   * is the header row (the first row of the table, or a row containing <th>)
   * and no separator has been emitted yet.
   */
  private finishRow(): void {
    if (this.currentRow.length === 0) return;
    const line = '| ' + this.currentRow.join(' | ') + ' |';
    this.emit(line + '\n');
    if (!this.headerRowDone && (this.rowHasTh || this.isFirstRow)) {
      const sep = '| ' + this.currentRow.map(() => '---').join(' | ') + ' |';
      this.emit(sep + '\n');
      this.headerRowDone = true;
    }
    this.isFirstRow = false;
    this.currentRow = [];
    this.rowHasTh = false;
  }

  /**
   * Emits the buffered link as `[text](href)`; falls back to plain text when
   * there is no href, and emits nothing when the link has no text.
   */
  private finishLink(): void {
    const text = this.linkTextParts.join('').replace(/\s+/g, ' ').trim();
    const href = this.linkHref ?? '';
    // Leave link mode first so the emit below goes to the enclosing buffer.
    this.inLink = false;
    if (href && text) {
      this.emit(`[${text}](${href})`);
    } else if (text) {
      this.emit(text);
    }
  }

  /** Returns the value of attribute `name` on `el`, or undefined when absent. */
  private getAttr(el: Element, name: string): string | undefined {
    return el.attrs.find((a) => a.name === name)?.value;
  }

  /** Emits the opening Markdown for `el` and updates renderer state. */
  private handleOpen(el: Element): void {
    const tag = el.tagName.toLowerCase();
    const emphasis = INLINE_EMPHASIS.get(tag);

    if (HEADING_TAGS.has(tag)) {
      const level = parseInt(tag.charAt(1), 10);
      this.emit('\n\n' + '#'.repeat(level) + ' ');
    } else if (tag === 'a') {
      this.linkHref = this.getAttr(el, 'href') ?? null;
      this.linkTextParts = [];
      this.inLink = true;
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (tag === 'br') {
      this.emit('\n');
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'hr') {
      this.emit('\n\n---\n\n');
    } else if (tag === 'blockquote') {
      this.emit('\n\n');
      this.bqStack.push([]);
    } else if (tag === 'ul') {
      this.listStack.push('ul');
      this.emit('\n');
    } else if (tag === 'ol') {
      this.listStack.push('ol');
      const startAttr = this.getAttr(el, 'start');
      let start = 1;
      if (startAttr != null) {
        const parsed = parseInt(startAttr, 10);
        if (!isNaN(parsed)) start = parsed;
      }
      // Stored as start - 1 because each <li> increments before emitting.
      this.olCounter.push(start - 1);
      this.emit('\n');
    } else if (tag === 'li') {
      const indent = LIST_INDENT.repeat(Math.max(0, this.listStack.length - 1));
      const inOrderedList =
        this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol';
      if (inOrderedList) {
        if (this.olCounter.length > 0) {
          const top = this.olCounter.length - 1;
          this.olCounter[top]!++;
          this.emit(`\n${indent}${this.olCounter[top]}. `);
        } else {
          this.emit(`\n${indent}1. `);
        }
      } else {
        this.emit(`\n${indent}* `);
      }
    } else if (tag === 'pre') {
      this.preParts = [];
      this.inPre = true;
      this.preLanguage = null;
      // A direct <code class="language-xyz"> child supplies the fence language.
      const codeChild = el.childNodes.find(
        (c): c is Element => isElement(c) && c.tagName === 'code',
      );
      if (codeChild) {
        const cls = this.getAttr(codeChild, 'class') ?? '';
        const langMatch = cls.match(/(?:^|\s)language-(\S+)/);
        if (langMatch) this.preLanguage = langMatch[1]!;
      }
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = true;
      this.emit('`');
    } else if (tag === 'table') {
      this.inTable = true;
      this.headerRowDone = false;
      this.isFirstRow = true;
      this.emit('\n\n');
    } else if (tag === 'tr') {
      // Flush anything left open by a previous row before starting a new one.
      this.finishCell();
      this.finishRow();
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
      this.cellParts = [];
      this.inCell = true;
      if (tag === 'th') this.rowHasTh = true;
    }
  }

  /** Emits the closing Markdown for `tag` and updates renderer state. */
  private handleClose(tag: string): void {
    tag = tag.toLowerCase();
    const emphasis = INLINE_EMPHASIS.get(tag);

    if (HEADING_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'a') {
      this.finishLink();
    } else if (emphasis !== undefined) {
      this.emit(emphasis);
    } else if (BLOCK_TAGS.has(tag)) {
      this.emit('\n\n');
    } else if (tag === 'blockquote') {
      if (this.bqStack.length > 0) {
        // Pop first so the prefixed text is emitted into the enclosing buffer.
        const content = this.bqStack.pop()!.join('');
        const prefixed = this.prefixBlockquote(content);
        if (prefixed) this.emit('\n\n' + prefixed + '\n\n');
      }
    } else if (tag === 'ul') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ul') {
        this.listStack.pop();
      }
      this.emit('\n');
    } else if (tag === 'ol') {
      if (this.listStack.length > 0 && this.listStack[this.listStack.length - 1] === 'ol') {
        this.listStack.pop();
        if (this.olCounter.length > 0) this.olCounter.pop();
      }
      this.emit('\n');
    } else if (tag === 'pre') {
      const raw = this.preParts.join('');
      // Leave pre mode first so the fenced block goes to the enclosing buffer.
      this.inPre = false;
      const lang = this.preLanguage ?? '';
      const block = '```' + lang + '\n' + raw + '\n```';
      this.emit('\n\n' + block + '\n\n');
      this.preLanguage = null;
    } else if (tag === 'code' && !this.inPre) {
      this.inInlineCode = false;
      this.emit('`');
    } else if (tag === 'th' || tag === 'td') {
      this.finishCell();
    } else if (tag === 'tr') {
      this.finishCell();
      this.finishRow();
    } else if (tag === 'table') {
      this.finishCell();
      this.finishRow();
      this.inTable = false;
      this.emit('\n');
    }
  }

  /**
   * Handles a text node: kept verbatim inside `<pre>` and inline code,
   * otherwise whitespace runs collapse to a single space. Whitespace-only
   * text between table rows/cells is dropped.
   */
  private handleText(data: string): void {
    if (this.inPre) {
      this.preParts.push(data);
      return;
    }
    if (this.inInlineCode) {
      this.emit(data);
      return;
    }
    const text = data.replace(/\s+/g, ' ');
    if (this.inTable && !this.inCell && !text.trim()) return;
    this.emit(text);
  }

  /**
   * Recursively renders `node` and its descendants.
   *
   * Comments, `<img>` and every tag in SKIP_TAGS are ignored together with
   * their subtrees. Non-element containers (such as the document root) are
   * only descended into.
   */
  walk(node: ChildNode | Document): void {
    if (isText(node)) {
      this.handleText(node.value);
      return;
    }
    if (node.nodeName === '#comment') return;

    if (isElement(node)) {
      const tag = node.tagName.toLowerCase();
      if (SKIP_TAGS.has(tag) || tag === 'img') return;

      this.handleOpen(node);
      if (tag === 'pre') {
        // A <code> directly inside <pre> is unwrapped: its children are
        // walked without opening/closing the <code> element itself.
        for (const child of node.childNodes) {
          if (isElement(child) && child.tagName === 'code') {
            for (const grandchild of child.childNodes) {
              this.walk(grandchild);
            }
          } else {
            this.walk(child);
          }
        }
      } else {
        for (const child of node.childNodes) {
          this.walk(child);
        }
      }
      this.handleClose(tag);
      return;
    }

    if ('childNodes' in node) {
      for (const child of (node as Document).childNodes) {
        this.walk(child);
      }
    }
  }

  /** Returns everything emitted to the main output so far, concatenated. */
  getOutput(): string {
    return this.out.join('');
  }
}

/**
 * Normalizes rendered Markdown.
 *
 * Outside fenced code blocks, trailing spaces/tabs are removed and
 * consecutive blank lines collapse to one. Lines inside a fence are kept
 * verbatim. A line starting with three backticks toggles fence state. The
 * final text is trimmed.
 */
function cleanup(text: string): string {
  const out: string[] = [];
  let inFence = false;
  let blankRun = 0;

  for (const line of text.split('\n')) {
    const stripped = line.replace(/[ \t]+$/, '');
    if (stripped.startsWith('```')) {
      inFence = !inFence;
      blankRun = 0;
      out.push(stripped);
      continue;
    }
    if (inFence) {
      out.push(line);
      continue;
    }
    if (!stripped) {
      blankRun++;
      if (blankRun <= 1) out.push('');
      continue;
    }
    blankRun = 0;
    out.push(stripped);
  }

  return out.join('\n').trim();
}

/**
 * Converts an HTML string to Markdown.
 *
 * Line endings are normalized to `\n` before parsing.
 *
 * @param sourceHtml - HTML markup to convert.
 * @returns The Markdown rendering with blank lines collapsed and outer
 *   whitespace trimmed.
 */
export function htmlToMarkdown(sourceHtml: string): string {
  const normalized = sourceHtml.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  const doc = parse(normalized);
  const renderer = new MarkdownRenderer();
  renderer.walk(doc);
  return cleanup(renderer.getOutput());
}