v1.11.9: manual redirect handling — re-run URL guard on each hop

This commit is contained in:
2026-05-21 00:37:35 +00:00
parent 4e67a265ac
commit ab01e04d77
2 changed files with 233 additions and 74 deletions

View File

@@ -343,3 +343,113 @@ describe('executeWebFetch — size + truncation', () => {
expect('content' in result && result.truncated).toBe(false); expect('content' in result && result.truncated).toBe(false);
}); });
}); });
// ============================================================================
// v1.11.9: manual redirect handling — re-run URL guard on each hop
// ============================================================================
/**
 * Test helper: builds a bodyless redirect-style Response.
 *
 * @param loc    Value for the `Location` header, or `null` to omit the header
 *               entirely (used to exercise the "redirect without Location" path).
 * @param status HTTP status code to report; defaults to 302.
 * @returns A Response carrying only the status and the optional Location header.
 */
function redirect(loc: string | null, status = 302): Response {
  const headers: Record<string, string> = {};
  if (loc !== null) headers['location'] = loc;
  // Pass a `null` body rather than '': the Response constructor throws a
  // TypeError when any body (even an empty string) is combined with a
  // null-body status such as 304, which would make this helper unusable
  // for that 3xx code.
  return new Response(null, { status, headers });
}
describe('executeWebFetch — redirect handling', () => {
  // Builds a fetch stub that hands out the given responses one call at a
  // time, in order.
  const queueResponses = (...responses: Response[]) =>
    responses.reduce(
      (stub, response) => stub.mockResolvedValueOnce(response),
      vi.fn<typeof fetch>(),
    );

  // Runs the tool against `url` with the stub injected as the fetcher.
  const fetchVia = (url: string, stub: unknown) =>
    executeWebFetch({ url }, stub as typeof fetch);

  it('blocks a redirect target that resolves to a private IP (AWS IMDS)', async () => {
    // A public origin answers 302 -> 169.254.169.254 (link-local). With
    // `redirect: 'follow'` that hop was invisible; the manual loop must
    // run isPublicUrl on the resolved target and refuse it.
    const stub = queueResponses(redirect('http://169.254.169.254/latest/meta-data/'));

    const outcome = await fetchVia('https://example.com/redirect', stub);

    expect('error' in outcome).toBe(true);
    if ('error' in outcome) {
      expect(outcome.error).toBe('blocked_by_url_guard');
      // The reason has to identify this as a redirect hop so logs can tell
      // it apart from a blocked initial URL.
      expect(outcome.reason).toMatch(/redirect target/);
    }
    // The private target itself must never be requested.
    expect(stub).toHaveBeenCalledTimes(1);
  });

  it('follows a public-to-public redirect and returns the final body', async () => {
    const stub = queueResponses(
      redirect('https://example.org/final'),
      mockResponse('ok body', { contentType: 'text/plain' }),
    );

    const outcome = await fetchVia('https://example.com/start', stub);

    expect('content' in outcome).toBe(true);
    if ('content' in outcome) {
      expect(outcome.content).toBe('ok body');
      // The reported URL is the one the body was actually served from.
      expect(outcome.url).toBe('https://example.org/final');
    }
    expect(stub).toHaveBeenCalledTimes(2);
  });

  it('bails after MAX_REDIRECTS hops with a Too many redirects error', async () => {
    // Six consecutive redirects: one more than the loop tolerates. Every
    // Location names a different public host, so the URL guard passes and
    // only the redirectCount > MAX_REDIRECTS branch can end the call.
    const chain = [
      'https://a.example/',
      'https://b.example/',
      'https://c.example/',
      'https://d.example/',
      'https://e.example/',
      'https://f.example/',
    ].map((target) => redirect(target));
    const stub = queueResponses(...chain);

    const outcome = await fetchVia('https://start.example/', stub);

    expect('error' in outcome).toBe(true);
    if ('error' in outcome) {
      expect(outcome.error).toBe('too_many_redirects');
      expect(outcome.reason).toMatch(/Too many redirects/);
    }
  });

  it('errors when a 30x response omits the Location header', async () => {
    const stub = queueResponses(redirect(null, 302));

    const outcome = await fetchVia('https://example.com/', stub);

    expect('error' in outcome).toBe(true);
    if ('error' in outcome) {
      expect(outcome.error).toBe('redirect_missing_location');
      expect(outcome.reason).toMatch(/no Location/);
    }
  });

  it('resolves a relative Location against the current URL', async () => {
    // The request to https://example.com/path is answered with the relative
    // `Location: /foo`. RFC 9110 resolves that against the request URL, so
    // the follow-up fetch must receive the absolute https://example.com/foo.
    const stub = queueResponses(
      redirect('/foo'),
      mockResponse('final', { contentType: 'text/plain' }),
    );

    const outcome = await fetchVia('https://example.com/path', stub);

    expect('content' in outcome && outcome.content).toBe('final');
    expect(stub).toHaveBeenCalledTimes(2);
    expect(stub.mock.calls[1]![0]).toBe('https://example.com/foo');
  });
});

View File

@@ -22,6 +22,9 @@ const DEFAULT_MAX_CHARS = 8_000;
const MAX_CHARS_CAP = 32_000; const MAX_CHARS_CAP = 32_000;
const FETCH_TIMEOUT_MS = 15_000; const FETCH_TIMEOUT_MS = 15_000;
const MAX_BYTES = 5 * 1024 * 1024; const MAX_BYTES = 5 * 1024 * 1024;
// v1.11.9: cap redirect chains. Each hop re-runs isPublicUrl on the
// resolved target so a public-IP origin can't 302 us into a private IP.
const MAX_REDIRECTS = 5;
// Output shape. Each variant uses a discriminator the LLM can branch on. // Output shape. Each variant uses a discriminator the LLM can branch on.
export type WebFetchOutput = export type WebFetchOutput =
// @@ -74,89 +77,135 @@
/**
 * Maps a rejected fetch (or body read) onto the tool's error variants.
 *
 * AbortSignal.timeout fires a DOMException with name 'TimeoutError'; older
 * runtimes / polyfills may surface 'AbortError'. Both are reported as
 * `timeout`; anything else becomes `fetch_failed` with the error message.
 */
function describeFetchFailure(err: unknown): WebFetchOutput {
  if (err instanceof Error && (err.name === 'TimeoutError' || err.name === 'AbortError')) {
    return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` };
  }
  return { error: 'fetch_failed', reason: err instanceof Error ? err.message : String(err) };
}

/**
 * Best-effort release of a response whose body will not be read, so the
 * unread stream is not left dangling. Never throws.
 */
async function discardBody(res: Response): Promise<void> {
  try {
    await res.body?.cancel();
  } catch {
    // Cleanup only — a failure here must not change the tool's result.
  }
}

/**
 * Fetches `input.url` and returns its text content, or an error variant.
 *
 * v1.11.9: redirects are followed manually. `redirect: 'follow'` in fetch
 * doesn't expose intermediate hops — a public-IP origin that 302s us to
 * 169.254.169.254 would silently bypass isPublicUrl. Each hop is resolved
 * here and the URL guard is re-run on the resolved target before it is
 * requested, so a mid-chain hostile redirect gets blocked.
 *
 * Timeout semantics changed from v1.11.8: AbortSignal.timeout applies per
 * fetch hop (vs. one 15s budget shared across the whole call). Up to
 * MAX_REDIRECTS + 1 requests can be issued, so the worst case is roughly
 * (MAX_REDIRECTS + 1) × FETCH_TIMEOUT_MS — still bounded.
 *
 * @param input   Tool input; `url` is required, `max_chars` optionally
 *                limits the returned content (capped at MAX_CHARS_CAP).
 * @param fetcher fetch implementation; injectable for tests.
 * @returns The final (post-redirect) URL, content and metadata on success,
 *          otherwise an `{ error, reason }` variant. Failures of the fetch,
 *          the redirect resolution and the body read are all reported
 *          through the return value rather than by rejecting.
 */
export async function executeWebFetch(
  input: WebFetchInputT,
  fetcher: typeof fetch = fetch,
): Promise<WebFetchOutput> {
  const maxChars = Math.min(input.max_chars ?? DEFAULT_MAX_CHARS, MAX_CHARS_CAP);

  let currentUrl = input.url;
  let redirectCount = 0;
  let res: Response;

  while (true) {
    // Guard every URL we are about to request — the initial one and each
    // redirect target alike.
    const guard = isPublicUrl(currentUrl);
    if (!guard.ok) {
      return {
        error: 'blocked_by_url_guard',
        reason: redirectCount === 0
          ? (guard.reason ?? 'unknown')
          : `redirect target ${currentUrl} blocked: ${guard.reason ?? 'unknown'}`,
      };
    }

    let hop: Response;
    try {
      hop = await fetcher(currentUrl, {
        method: 'GET',
        redirect: 'manual',
        signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
        headers: {
          'User-Agent': 'BooCode/1.11.9',
          Accept: 'text/html,text/plain,application/json,*/*',
        },
      });
    } catch (err) {
      return describeFetchFailure(err);
    }

    if (hop.status < 300 || hop.status >= 400) {
      res = hop;
      break;
    }

    // Redirect hop: only the Location header matters, the body is dropped.
    const loc = hop.headers.get('location');
    await discardBody(hop);
    if (!loc) {
      return {
        error: 'redirect_missing_location',
        reason: `${hop.status} redirect with no Location header`,
      };
    }
    redirectCount += 1;
    if (redirectCount > MAX_REDIRECTS) {
      return {
        error: 'too_many_redirects',
        reason: `Too many redirects (exceeded ${MAX_REDIRECTS} hops)`,
      };
    }
    // Resolve relative Location against the URL we just hit (RFC 9110).
    // The next loop iteration re-runs isPublicUrl on the new currentUrl.
    // `new URL` throws on an unparseable Location; report that as a failed
    // fetch instead of letting the exception escape the tool.
    try {
      currentUrl = new URL(loc, currentUrl).toString();
    } catch {
      return { error: 'fetch_failed', reason: `redirect with invalid Location: ${loc}` };
    }
  }

  if (!res.ok) {
    await discardBody(res);
    return { error: 'upstream_status', reason: `HTTP ${res.status}` };
  }

  // Pre-flight size check via Content-Length when the server provides it.
  const lenHeader = res.headers.get('content-length');
  if (lenHeader) {
    const len = Number(lenHeader);
    if (Number.isFinite(len) && len > MAX_BYTES) {
      await discardBody(res);
      return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` };
    }
  }

  const contentType = (res.headers.get('content-type') ?? '').toLowerCase();

  // Read body. We rely on the 5MB cap by checking length after consumption
  // — most malicious or accidental large responses also exceed it via the
  // Content-Length pre-flight above. A truly hostile server that lies
  // about length AND streams gigabytes would defeat that; the per-hop
  // 15s timeout is the secondary fence. Because that timeout (or a dropped
  // connection) can fire while the body is still streaming, the read is
  // guarded so the caller gets an error variant rather than a rejection.
  let body: string;
  try {
    body = await res.text();
  } catch (err) {
    return describeFetchFailure(err);
  }

  // v1.11.8 review: byte-count, not char-count. A 5MB cap on body.length
  // (UTF-16 code units) lets a multi-byte payload (emoji, CJK) pass when
  // its wire size already exceeded MAX_BYTES.
  const bodyBytes = Buffer.byteLength(body, 'utf8');
  if (bodyBytes > MAX_BYTES) {
    return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` };
  }

  let textRaw: string;
  let title: string | undefined;
  if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
    const stripped = stripHtml(body);
    textRaw = stripped.text;
    title = stripped.title;
  } else if (
    contentType.includes('text/plain') ||
    contentType.includes('text/markdown') ||
    contentType.includes('application/json') ||
    contentType.includes('text/xml') ||
    contentType.includes('application/xml')
  ) {
    textRaw = body;
  } else {
    return {
      error: 'unsupported_content_type',
      reason: `content-type ${contentType || '(none)'} not supported`,
      content_type: contentType,
    };
  }

  const truncated = truncate(textRaw, maxChars);
  // Report the FINAL URL (post-redirects) so the LLM knows where the body
  // came from — useful for citations and for the model to reason about
  // domain trust.
  return {
    url: currentUrl,
    title,
    content: truncated.content,
    content_type: contentType,
    truncated: truncated.truncated,
  };
}
export const webFetch: ToolDef<WebFetchInputT> = { export const webFetch: ToolDef<WebFetchInputT> = {