Files
boocode/apps/server/src/services/__tests__/html-to-md.test.ts
indifferentketchup a8bfde8f8d feat: relicense AGPL-3.0 → MIT (v2.7.0)
Clear the 3 Unsloth-Studio-derived AGPL files and flip LICENSE + 5
package.json from AGPL-3.0-only to MIT.

- html-to-md.ts → MIT node-html-markdown (parse5 dropped)
- llama-args-validator.ts → clean-room (flag denylist = facts)
- tool-call-parser.ts → delete dead Unsloth-ported code; keep
  extractToolCallBlocks/stripToolMarkup byte-identical (no behavior change)
- LICENSE → MIT (Copyright (c) 2026 indifferentketchup); 5 package.json → MIT;
  AGPL SPDX headers removed; README License section; license-mit guard test
- roadmap License-debt batch marked shipped; openspec/changes/license-debt-mit

Decouples the relicense from the native-parsing retirement (the ported parser
was dead code). Server suite 519 passing; build + coder typecheck clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 08:16:03 +00:00

236 lines
8.0 KiB
TypeScript

import { describe, expect, it } from 'vitest';
import { htmlToMarkdown } from '../web/html-to-md.js';
describe('htmlToMarkdown', () => {
// A lone <h1> must come back as exactly one ATX level-1 heading line.
it('converts h1 heading', () => {
  const markdown = htmlToMarkdown('<h1>Title</h1>');
  expect(markdown).toBe('# Title');
});
// Each heading level N should appear with N leading '#' characters.
it('converts h1 through h6', () => {
  const markdown = htmlToMarkdown(
    '<h1>One</h1><h2>Two</h2><h3>Three</h3><h4>Four</h4><h5>Five</h5><h6>Six</h6>',
  );
  const expectedHeadings = [
    '# One',
    '## Two',
    '### Three',
    '#### Four',
    '##### Five',
    '###### Six',
  ];
  for (const heading of expectedHeadings) {
    expect(markdown).toContain(heading);
  }
});
// An <a> carrying an href becomes an inline markdown link: [text](url).
it('converts anchor with href', () => {
  const linkHtml = '<a href="https://example.com">click here</a>';
  const markdown = htmlToMarkdown(linkHtml);
  expect(markdown).toBe('[click here](https://example.com)');
});
// With no href there is nothing to link to, so only the anchor text survives.
it('converts anchor without href to plain text', () => {
  const markdown = htmlToMarkdown('<a>just text</a>');
  expect(markdown).toBe('just text');
});
// <strong>/<b> map to '**…**' and <em>/<i> map to '*…*'.
it('converts bold and italic', () => {
  const cases: Array<[string, string]> = [
    ['<strong>bold</strong>', '**bold**'],
    ['<b>bold</b>', '**bold**'],
    ['<em>italic</em>', '*italic*'],
    ['<i>italic</i>', '*italic*'],
  ];
  for (const [html, expected] of cases) {
    expect(htmlToMarkdown(html)).toBe(expected);
  }
});
// Nested <strong><em> collapses into a single triple-asterisk span.
it('handles combined bold+italic', () => {
  const nestedEmphasis = '<strong><em>bold italic</em></strong>';
  expect(htmlToMarkdown(nestedEmphasis)).toBe('***bold italic***');
});
// Every <li> of a <ul> is rendered as a '* ' bullet.
it('converts unordered list', () => {
  const markdown = htmlToMarkdown('<ul><li>one</li><li>two</li><li>three</li></ul>');
  for (const bullet of ['* one', '* two', '* three']) {
    expect(markdown).toContain(bullet);
  }
});
// Items of an <ol> are numbered sequentially starting at 1.
it('converts ordered list', () => {
  const markdown = htmlToMarkdown('<ol><li>first</li><li>second</li></ol>');
  for (const numbered of ['1. first', '2. second']) {
    expect(markdown).toContain(numbered);
  }
});
// A <ul> inside an <li> keeps both levels; the inner bullet is preceded by
// whitespace so it reads as a child of the outer item.
it('handles nested lists', () => {
  const nestedHtml = '<ul><li>outer<ul><li>inner</li></ul></li></ul>';
  const markdown = htmlToMarkdown(nestedHtml);
  for (const bullet of ['* outer', ' * inner']) {
    expect(markdown).toContain(bullet);
  }
});
// A <thead>/<tbody> table becomes a GFM pipe table: header row, dash
// separator row, then one row per <tr>.
it('converts 3-column GFM table with header', () => {
const html = `
<table>
<thead><tr><th>Name</th><th>Age</th><th>City</th></tr></thead>
<tbody>
<tr><td>Alice</td><td>30</td><td>NYC</td></tr>
<tr><td>Bob</td><td>25</td><td>LA</td></tr>
</tbody>
</table>`;
  const md = htmlToMarkdown(html);
  // node-html-markdown pads columns to align them; assert structure rather
  // than exact spacing. Each cell value and a GFM separator row are present.
  expect(md).toContain('| Name ');
  expect(md).toContain('| Age ');
  expect(md).toContain('| City |');
  expect(md).toMatch(/\| -+ \| -+ \| -+ \|/); // separator row
  expect(md).toContain('| Alice ');
  // "NYC" and "LA" are shorter than the "City" header, so padding may add
  // extra spaces before the closing pipe. Accept one or more.
  expect(md).toMatch(/\| NYC +\|/);
  expect(md).toContain('| Bob ');
  expect(md).toMatch(/\| LA +\|/);
});
// A literal '|' inside a cell must be backslash-escaped so it is not read as
// a column delimiter.
it('escapes pipe characters in table cells', () => {
  const markdown = htmlToMarkdown(
    '<table><tr><th>A</th></tr><tr><td>x | y</td></tr></table>',
  );
  expect(markdown).toContain('x \\| y');
});
// Paragraph text inside <blockquote> is prefixed with '> '.
it('converts blockquote', () => {
  const markdown = htmlToMarkdown('<blockquote><p>quoted text</p></blockquote>');
  expect(markdown).toContain('> quoted text');
});
// Each paragraph inside a <blockquote> carries its own '> ' prefix.
it('converts multi-line blockquote', () => {
  const markdown = htmlToMarkdown(
    '<blockquote><p>line one</p><p>line two</p></blockquote>',
  );
  for (const quotedLine of ['> line one', '> line two']) {
    expect(markdown).toContain(quotedLine);
  }
});
// <pre><code> without a language class yields a bare triple-backtick fence.
it('converts fenced code block', () => {
  const markdown = htmlToMarkdown('<pre><code>const x = 1;</code></pre>');
  expect(markdown).toContain('```\nconst x = 1;\n```');
});
// The suffix of a `language-*` class on <code> becomes the fence info string.
it('preserves language hint from code class', () => {
  const markdown = htmlToMarkdown(
    '<pre><code class="language-py">print("hello")</code></pre>',
  );
  expect(markdown).toContain('```py\nprint("hello")\n```');
});
// <code> outside of <pre> is wrapped in single backticks.
it('converts inline code', () => {
  const markdown = htmlToMarkdown('use <code>npm install</code> to install');
  expect(markdown).toContain('`npm install`');
});
// Named entities are decoded to their literal characters.
it('decodes HTML entities', () => {
  const markdown = htmlToMarkdown('&amp; &lt; &gt; &quot;');
  expect(markdown).toBe('& < > "');
});
// A decimal character reference (&#39;) is decoded to the character itself.
it('decodes numeric character references', () => {
  const markdown = htmlToMarkdown('&#39;');
  expect(markdown).toBe("'");
});
// &nbsp; must end up as some whitespace character between the two words.
it('decodes &nbsp; as space', () => {
  expect(htmlToMarkdown('hello&nbsp;world')).toMatch(/hello\s+world/);
});
// <script> bodies are dropped while the surrounding paragraphs are kept.
it('skips script content', () => {
  const markdown = htmlToMarkdown(
    '<p>before</p><script>alert("xss")</script><p>after</p>',
  );
  expect(markdown).not.toContain('alert');
  for (const kept of ['before', 'after']) {
    expect(markdown).toContain(kept);
  }
});
// <style> bodies are dropped while regular text is kept.
it('skips style content', () => {
  const markdown = htmlToMarkdown('<p>text</p><style>body { color: red }</style>');
  expect(markdown).not.toContain('color');
  expect(markdown).toContain('text');
});
// Unclosed tags must neither throw nor lose the text they wrap.
it('does not throw on malformed HTML', () => {
  const brokenHtml = '<p>unclosed <b>bold <i>italic';
  expect(() => htmlToMarkdown(brokenHtml)).not.toThrow();
  const markdown = htmlToMarkdown(brokenHtml);
  for (const word of ['bold', 'italic']) {
    expect(markdown).toContain(word);
  }
});
// Empty in, empty out.
it('returns empty string for empty input', () => {
  const markdown = htmlToMarkdown('');
  expect(markdown).toBe('');
});
// Input made only of spaces and newlines produces an empty result.
it('returns empty string for whitespace-only input', () => {
  const markdown = htmlToMarkdown(' \n\n ');
  expect(markdown).toBe('');
});
// <hr> is rendered as a '---' thematic break.
it('converts hr to horizontal rule', () => {
  expect(htmlToMarkdown('<p>above</p><hr><p>below</p>')).toContain('---');
});
// <br> becomes a markdown hard line break between the two text runs.
it('converts br to newline', () => {
  const md = htmlToMarkdown('line one<br>line two');
  // node-html-markdown emits a GFM hard line break: two trailing spaces
  // before the newline. The count is written as an explicit `{2}` quantifier
  // so the assertion stays correct even if whitespace in this file is
  // collapsed by tooling.
  expect(md).toMatch(/line one {2}\nline two/);
});
// Pins the current renderer behavior for <ol start="…">.
it('handles ol with start attribute', () => {
  const markdown = htmlToMarkdown('<ol start="5"><li>five</li><li>six</li></ol>');
  // node-html-markdown ignores `start` and always numbers from 1. The
  // previous parse5-based renderer honored it.
  for (const numbered of ['1. five', '2. six']) {
    expect(markdown).toContain(numbered);
  }
});
// Empty paragraphs must not leave runs of three or more consecutive newlines.
it('collapses excessive blank lines', () => {
  const markdown = htmlToMarkdown('<p>one</p><p></p><p></p><p></p><p>two</p>');
  expect(markdown.match(/\n{3,}/g)).toBeNull();
});
// Golden test: a small Hacker News-style page exercising headings, inline
// formatting, a list, a table, a blockquote, a fenced code block and a
// <script> that must be dropped, all in one document.
it('golden: HN-style snippet produces structured markdown', () => {
  const page = `
<html>
<head><title>Test Page</title></head>
<body>
<h1>Welcome</h1>
<p>This is a <strong>test</strong> page with <a href="https://example.com">a link</a>.</p>
<h2>Features</h2>
<ul>
<li>Fast</li>
<li>Reliable</li>
<li>Secure</li>
</ul>
<h2>Data</h2>
<table>
<thead><tr><th>Metric</th><th>Value</th></tr></thead>
<tbody>
<tr><td>Uptime</td><td>99.9%</td></tr>
<tr><td>Latency</td><td>42ms</td></tr>
</tbody>
</table>
<blockquote><p>This tool is amazing.</p></blockquote>
<pre><code class="language-js">console.log("hello");</code></pre>
<script>evil();</script>
</body>
</html>`;
  const markdown = htmlToMarkdown(page);

  // Headings, inline formatting, list, and the table header row.
  // Table columns are padded to align (node-html-markdown behavior).
  const beforeSeparator = [
    '# Welcome',
    '**test**',
    '[a link](https://example.com)',
    '## Features',
    '* Fast',
    '| Metric ',
    '| Value |',
  ];
  for (const fragment of beforeSeparator) {
    expect(markdown).toContain(fragment);
  }

  expect(markdown).toMatch(/\| -+ \| -+ \|/); // separator row

  // Table body, blockquote, and fenced code with its language hint.
  const afterSeparator = [
    '| Uptime ',
    '| 99.9% |',
    '> This tool is amazing.',
    '```js\nconsole.log("hello");\n```',
  ];
  for (const fragment of afterSeparator) {
    expect(markdown).toContain(fragment);
  }

  // Script bodies and raw <title> markup must not leak into the output.
  for (const forbidden of ['evil', '<title>']) {
    expect(markdown).not.toContain(forbidden);
  }
});
});