Batch 1 — tool-call-parser.ts: replaces xml-parser.ts with a port of
Unsloth's tool_call_parser.py. Adds balanced-brace JSON scanner,
single-param fast path, hasToolSignal/stripToolMarkup/parseToolCallsFromText
exports, and stream-finalization stripping at all three final-write sites
(error-handler, finalizeCompletion, executeToolPhase). Anthropic <invoke>
shape preserved. 75+12 tests.
Batch 2 — web/html-to-md.ts: parse5 tree-walking HTML-to-Markdown converter
ported from Unsloth's _html_to_md.py. Replaces web_fetch's regex stripHtml
with structured markdown output (headings, links, lists, tables, code blocks,
blockquotes, entity decoding). 29 tests.
Batch 3 — llama-args-validator.ts: port of llama_server_args.py deny-list
validator. Wired into AGENTS.md frontmatter parser — llama_extra_args field
validated at load time, rejects managed flags (model identity, networking,
auth/TLS, server UI). No runtime consumer yet (llama-swap boundary). 76 tests.
All three files carry SPDX-License-Identifier: AGPL-3.0-only headers.
LICENSE flipped to AGPL-3.0-only in prior commit (a938cf1).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
224 lines
7.4 KiB
TypeScript
224 lines
7.4 KiB
TypeScript
import { describe, expect, it } from 'vitest';
import { htmlToMarkdown } from '../web/html-to-md.js';

/**
 * Behavioural tests for `htmlToMarkdown`.
 *
 * Each case feeds a small HTML fragment to the converter and checks the
 * Markdown it returns. Exact-output cases use `toBe`; cases where only a
 * fragment of the output matters use `toContain` so surrounding whitespace
 * does not make them brittle.
 *
 * NOTE: the entity tests must spell their inputs as entity references
 * (`&amp;`, `&#39;`, `&nbsp;`, ...). If a tool decodes them in this file the
 * tests either stop compiling or pass without exercising any decoding.
 */
describe('htmlToMarkdown', () => {
  // --- Headings -----------------------------------------------------------

  it('converts h1 heading', () => {
    expect(htmlToMarkdown('<h1>Title</h1>')).toBe('# Title');
  });

  it('converts h1 through h6', () => {
    const html = '<h1>One</h1><h2>Two</h2><h3>Three</h3><h4>Four</h4><h5>Five</h5><h6>Six</h6>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('# One');
    expect(md).toContain('## Two');
    expect(md).toContain('### Three');
    expect(md).toContain('#### Four');
    expect(md).toContain('##### Five');
    expect(md).toContain('###### Six');
  });

  // --- Links and inline emphasis -----------------------------------------

  it('converts anchor with href', () => {
    expect(htmlToMarkdown('<a href="https://example.com">click here</a>'))
      .toBe('[click here](https://example.com)');
  });

  it('converts anchor without href to plain text', () => {
    expect(htmlToMarkdown('<a>just text</a>')).toBe('just text');
  });

  it('converts bold and italic', () => {
    expect(htmlToMarkdown('<strong>bold</strong>')).toBe('**bold**');
    expect(htmlToMarkdown('<b>bold</b>')).toBe('**bold**');
    expect(htmlToMarkdown('<em>italic</em>')).toBe('*italic*');
    expect(htmlToMarkdown('<i>italic</i>')).toBe('*italic*');
  });

  it('handles combined bold+italic', () => {
    const md = htmlToMarkdown('<strong><em>bold italic</em></strong>');
    expect(md).toBe('***bold italic***');
  });

  // --- Lists --------------------------------------------------------------

  it('converts unordered list', () => {
    const html = '<ul><li>one</li><li>two</li><li>three</li></ul>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('* one');
    expect(md).toContain('* two');
    expect(md).toContain('* three');
  });

  it('converts ordered list', () => {
    const html = '<ol><li>first</li><li>second</li></ol>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('1. first');
    expect(md).toContain('2. second');
  });

  it('handles nested lists', () => {
    const html = '<ul><li>outer<ul><li>inner</li></ul></li></ul>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('* outer');
    // The leading space asserts that the inner item is indented under its parent.
    expect(md).toContain(' * inner');
  });

  // --- Tables -------------------------------------------------------------

  it('converts 3-column GFM table with header', () => {
    const html = `
      <table>
        <thead><tr><th>Name</th><th>Age</th><th>City</th></tr></thead>
        <tbody>
          <tr><td>Alice</td><td>30</td><td>NYC</td></tr>
          <tr><td>Bob</td><td>25</td><td>LA</td></tr>
        </tbody>
      </table>`;
    const md = htmlToMarkdown(html);
    expect(md).toContain('| Name | Age | City |');
    expect(md).toContain('| --- | --- | --- |');
    expect(md).toContain('| Alice | 30 | NYC |');
    expect(md).toContain('| Bob | 25 | LA |');
  });

  it('escapes pipe characters in table cells', () => {
    const html = '<table><tr><th>A</th></tr><tr><td>x | y</td></tr></table>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('x \\| y');
  });

  // --- Blockquotes --------------------------------------------------------

  it('converts blockquote', () => {
    const html = '<blockquote><p>quoted text</p></blockquote>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('> quoted text');
  });

  it('converts multi-line blockquote', () => {
    const html = '<blockquote><p>line one</p><p>line two</p></blockquote>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('> line one');
    expect(md).toContain('> line two');
  });

  // --- Code ---------------------------------------------------------------

  it('converts fenced code block', () => {
    const html = '<pre><code>const x = 1;</code></pre>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('```\nconst x = 1;\n```');
  });

  it('preserves language hint from code class', () => {
    const html = '<pre><code class="language-py">print("hello")</code></pre>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('```py\nprint("hello")\n```');
  });

  it('converts inline code', () => {
    expect(htmlToMarkdown('use <code>npm install</code> to install'))
      .toContain('`npm install`');
  });

  // --- Entity decoding ----------------------------------------------------

  it('decodes HTML entities', () => {
    // Input uses named references; output must be the literal characters.
    expect(htmlToMarkdown('&amp; &lt; &gt; &quot;')).toBe('& < > "');
  });

  it('decodes numeric character references', () => {
    // &#39; is the decimal reference for an apostrophe.
    expect(htmlToMarkdown('&#39;')).toBe("'");
  });

  it('decodes &nbsp; as space', () => {
    const md = htmlToMarkdown('hello&nbsp;world');
    // \s matches both U+0020 and U+00A0, so either form of space is accepted.
    expect(md).toMatch(/hello\s+world/);
  });

  // --- Non-content elements ----------------------------------------------

  it('skips script content', () => {
    const html = '<p>before</p><script>alert("xss")</script><p>after</p>';
    const md = htmlToMarkdown(html);
    expect(md).not.toContain('alert');
    expect(md).toContain('before');
    expect(md).toContain('after');
  });

  it('skips style content', () => {
    const html = '<p>text</p><style>body { color: red }</style>';
    const md = htmlToMarkdown(html);
    expect(md).not.toContain('color');
    expect(md).toContain('text');
  });

  // --- Robustness ---------------------------------------------------------

  it('does not throw on malformed HTML', () => {
    expect(() => htmlToMarkdown('<p>unclosed <b>bold <i>italic')).not.toThrow();
    const md = htmlToMarkdown('<p>unclosed <b>bold <i>italic');
    expect(md).toContain('bold');
    expect(md).toContain('italic');
  });

  it('returns empty string for empty input', () => {
    expect(htmlToMarkdown('')).toBe('');
  });

  it('returns empty string for whitespace-only input', () => {
    expect(htmlToMarkdown(' \n\n ')).toBe('');
  });

  // --- Breaks, rules and spacing -----------------------------------------

  it('converts hr to horizontal rule', () => {
    const md = htmlToMarkdown('<p>above</p><hr><p>below</p>');
    expect(md).toContain('---');
  });

  it('converts br to newline', () => {
    const md = htmlToMarkdown('line one<br>line two');
    expect(md).toContain('line one\nline two');
  });

  it('handles ol with start attribute', () => {
    const html = '<ol start="5"><li>five</li><li>six</li></ol>';
    const md = htmlToMarkdown(html);
    expect(md).toContain('5. five');
    expect(md).toContain('6. six');
  });

  it('collapses excessive blank lines', () => {
    const html = '<p>one</p><p></p><p></p><p></p><p>two</p>';
    const md = htmlToMarkdown(html);
    // Three or more consecutive newlines would mean more than one blank line.
    const blankRuns = md.match(/\n{3,}/g);
    expect(blankRuns).toBeNull();
  });

  // Golden test: small Hacker News-style snippet
  it('golden: HN-style snippet produces structured markdown', () => {
    const html = `
      <html>
      <head><title>Test Page</title></head>
      <body>
        <h1>Welcome</h1>
        <p>This is a <strong>test</strong> page with <a href="https://example.com">a link</a>.</p>
        <h2>Features</h2>
        <ul>
          <li>Fast</li>
          <li>Reliable</li>
          <li>Secure</li>
        </ul>
        <h2>Data</h2>
        <table>
          <thead><tr><th>Metric</th><th>Value</th></tr></thead>
          <tbody>
            <tr><td>Uptime</td><td>99.9%</td></tr>
            <tr><td>Latency</td><td>42ms</td></tr>
          </tbody>
        </table>
        <blockquote><p>This tool is amazing.</p></blockquote>
        <pre><code class="language-js">console.log("hello");</code></pre>
        <script>evil();</script>
      </body>
      </html>`;
    const md = htmlToMarkdown(html);
    expect(md).toContain('# Welcome');
    expect(md).toContain('**test**');
    expect(md).toContain('[a link](https://example.com)');
    expect(md).toContain('## Features');
    expect(md).toContain('* Fast');
    expect(md).toContain('| Metric | Value |');
    expect(md).toContain('| --- | --- |');
    expect(md).toContain('| Uptime | 99.9% |');
    expect(md).toContain('> This tool is amazing.');
    expect(md).toContain('```js\nconsole.log("hello");\n```');
    expect(md).not.toContain('evil');
    expect(md).not.toContain('<title>');
  });
});