From ab01e04d775ae19c74111c44da3a3fe932c15838 Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Thu, 21 May 2026 00:37:35 +0000 Subject: [PATCH] =?UTF-8?q?v1.11.9:=20manual=20redirect=20handling=20?= =?UTF-8?q?=E2=80=94=20re-run=20URL=20guard=20on=20each=20hop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/services/__tests__/web_tools.test.ts | 110 ++++++++++ apps/server/src/services/web_fetch.ts | 197 +++++++++++------- 2 files changed, 233 insertions(+), 74 deletions(-) diff --git a/apps/server/src/services/__tests__/web_tools.test.ts b/apps/server/src/services/__tests__/web_tools.test.ts index 1b0583f..b7cc41b 100644 --- a/apps/server/src/services/__tests__/web_tools.test.ts +++ b/apps/server/src/services/__tests__/web_tools.test.ts @@ -343,3 +343,113 @@ describe('executeWebFetch — size + truncation', () => { expect('content' in result && result.truncated).toBe(false); }); }); + +// ============================================================================ +// v1.11.9: manual redirect handling — re-run URL guard on each hop +// ============================================================================ + +// Helper: build a 30x redirect Response. status 302 by default; tests +// pass other codes (or omit the Location header) when they need to. +function redirect(loc: string | null, status = 302): Response { + const headers: Record = {}; + if (loc !== null) headers['location'] = loc; + return new Response('', { status, headers }); +} + +describe('executeWebFetch — redirect handling', () => { + it('blocks a redirect target that resolves to a private IP (AWS IMDS)', async () => { + // Public-IP origin 302s into 169.254.169.254 (link-local). Pre-v1.11.9 + // `redirect: 'follow'` would silently follow this; the new manual + // loop re-runs isPublicUrl on the resolved target and blocks. + const fakeFetch = vi + .fn() + .mockResolvedValueOnce(redirect('http://169.254.169.254/latest/meta-data/')); + const result = await executeWebFetch( + { url: 'https://example.com/redirect' }, + fakeFetch as unknown as typeof fetch, + ); + expect('error' in result).toBe(true); + if ('error' in result) { + expect(result.error).toBe('blocked_by_url_guard'); + // Reason should make it clear this was a REDIRECT hop, not the + // initial URL — so logs can distinguish the two failure modes. + expect(result.reason).toMatch(/redirect target/); + } + // Critical: the second fetch (the private target) must NOT happen. + expect(fakeFetch).toHaveBeenCalledTimes(1); + }); + + it('follows a public-to-public redirect and returns the final body', async () => { + const fakeFetch = vi + .fn() + .mockResolvedValueOnce(redirect('https://example.org/final')) + .mockResolvedValueOnce(mockResponse('ok body', { contentType: 'text/plain' })); + const result = await executeWebFetch( + { url: 'https://example.com/start' }, + fakeFetch as unknown as typeof fetch, + ); + expect('content' in result).toBe(true); + if ('content' in result) { + expect(result.content).toBe('ok body'); + // Final URL is reported back so the model knows where the body came from. + expect(result.url).toBe('https://example.org/final'); + } + expect(fakeFetch).toHaveBeenCalledTimes(2); + }); + + it('bails after MAX_REDIRECTS hops with a Too many redirects error', async () => { + // Chain 6 redirects — one more than the loop allows. Each Location + // points at a distinct public host so the URL guard stays happy and + // we exercise the redirectCount > MAX_REDIRECTS branch specifically. + const fakeFetch = vi + .fn() + .mockResolvedValueOnce(redirect('https://a.example/')) + .mockResolvedValueOnce(redirect('https://b.example/')) + .mockResolvedValueOnce(redirect('https://c.example/')) + .mockResolvedValueOnce(redirect('https://d.example/')) + .mockResolvedValueOnce(redirect('https://e.example/')) + .mockResolvedValueOnce(redirect('https://f.example/')); + const result = await executeWebFetch( + { url: 'https://start.example/' }, + fakeFetch as unknown as typeof fetch, + ); + expect('error' in result).toBe(true); + if ('error' in result) { + expect(result.error).toBe('too_many_redirects'); + expect(result.reason).toMatch(/Too many redirects/); + } + }); + + it('errors when a 30x response omits the Location header', async () => { + const fakeFetch = vi + .fn() + .mockResolvedValueOnce(redirect(null, 302)); + const result = await executeWebFetch( + { url: 'https://example.com/' }, + fakeFetch as unknown as typeof fetch, + ); + expect('error' in result).toBe(true); + if ('error' in result) { + expect(result.error).toBe('redirect_missing_location'); + expect(result.reason).toMatch(/no Location/); + } + }); + + it('resolves a relative Location against the current URL', async () => { + // Server sends `Location: /foo` (relative) on a request to + // https://example.com/path. RFC 9110 says resolve against the + // request URL, so the next hop is https://example.com/foo. Assert + // the second fetch was called with the absolute resolved URL. + const fakeFetch = vi + .fn() + .mockResolvedValueOnce(redirect('/foo')) + .mockResolvedValueOnce(mockResponse('final', { contentType: 'text/plain' })); + const result = await executeWebFetch( + { url: 'https://example.com/path' }, + fakeFetch as unknown as typeof fetch, + ); + expect('content' in result && result.content).toBe('final'); + expect(fakeFetch).toHaveBeenCalledTimes(2); + expect(fakeFetch.mock.calls[1]![0]).toBe('https://example.com/foo'); + }); +}); diff --git a/apps/server/src/services/web_fetch.ts b/apps/server/src/services/web_fetch.ts index 09f8bc8..7fe10bf 100644 --- a/apps/server/src/services/web_fetch.ts +++ b/apps/server/src/services/web_fetch.ts @@ -22,6 +22,9 @@ const DEFAULT_MAX_CHARS = 8_000; const MAX_CHARS_CAP = 32_000; const FETCH_TIMEOUT_MS = 15_000; const MAX_BYTES = 5 * 1024 * 1024; +// v1.11.9: cap redirect chains. Each hop re-runs isPublicUrl on the +// resolved target so a public-IP origin can't 302 us into a private IP. +const MAX_REDIRECTS = 5; // Output shape. Each variant uses a discriminator the LLM can branch on. export type WebFetchOutput = @@ -74,89 +77,135 @@ export async function executeWebFetch( input: WebFetchInputT, fetcher: typeof fetch = fetch, ): Promise { - const guard = isPublicUrl(input.url); - if (!guard.ok) { - return { error: 'blocked_by_url_guard', reason: guard.reason ?? 'unknown' }; - } - const maxChars = Math.min(input.max_chars ?? DEFAULT_MAX_CHARS, MAX_CHARS_CAP); - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS); - try { - const res = await fetcher(input.url, { - signal: controller.signal, - // TODO(v1.11.9): redirect: 'manual' + re-run isPublicUrl on Location header. - // Current 'follow' allows redirect-to-private-IP bypass of URL guard. - redirect: 'follow', - headers: { 'User-Agent': 'BooCode/1.11.8', Accept: 'text/html,text/plain,application/json,*/*' }, - }); - if (!res.ok) { - return { error: 'upstream_status', reason: `HTTP ${res.status}` }; - } - // Pre-flight size check via Content-Length when the server provides it. - const lenHeader = res.headers.get('content-length'); - if (lenHeader) { - const len = Number(lenHeader); - if (Number.isFinite(len) && len > MAX_BYTES) { - return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` }; - } - } - const contentType = (res.headers.get('content-type') ?? '').toLowerCase(); - // Read body. We rely on the 5MB cap by checking length after consumption - // — most malicious or accidental large responses also exceed it via the - // Content-Length pre-flight above. A truly hostile server that lies - // about length AND streams gigabytes would defeat that; for v1.11.8 - // the 15s timeout is the secondary fence. - const body = await res.text(); - // v1.11.8 review: byte-count, not char-count. A 5MB cap on - // body.length (UTF-16 code units) lets a multi-byte payload (emoji, - // CJK) pass when its wire size already exceeded MAX_BYTES. Compute - // once and reuse for the error message. - const bodyBytes = Buffer.byteLength(body, 'utf8'); - if (bodyBytes > MAX_BYTES) { - return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` }; - } + // v1.11.9: manual redirect handling. `redirect: 'follow'` in fetch + // doesn't expose intermediate hops — a public-IP origin that 302s us + // to 169.254.169.254 would silently bypass isPublicUrl. We follow each + // hop ourselves, re-running the URL guard on the resolved target so a + // mid-chain hostile redirect gets blocked. + // + // Timeout semantics changed from v1.11.8: AbortSignal.timeout fires + // per fetch hop (vs. one 15s budget shared across the whole call). In + // the worst case a 5-hop chain can take ~5×15s before erroring — still + // bounded; trades a longer cap for simpler code. + let currentUrl = input.url; + let res: Response | undefined; + let redirectCount = 0; - let textRaw: string; - let title: string | undefined; - if (contentType.includes('text/html') || contentType.includes('application/xhtml')) { - const stripped = stripHtml(body); - textRaw = stripped.text; - title = stripped.title; - } else if ( - contentType.includes('text/plain') || - contentType.includes('text/markdown') || - contentType.includes('application/json') || - contentType.includes('text/xml') || - contentType.includes('application/xml') - ) { - textRaw = body; - } else { + while (true) { + const guard = isPublicUrl(currentUrl); + if (!guard.ok) { return { - error: 'unsupported_content_type', - reason: `content-type ${contentType || '(none)'} not supported`, - content_type: contentType, + error: 'blocked_by_url_guard', + reason: redirectCount === 0 + ? (guard.reason ?? 'unknown') + : `redirect target ${currentUrl} blocked: ${guard.reason ?? 'unknown'}`, }; } - const truncated = truncate(textRaw, maxChars); - return { - url: input.url, - title, - content: truncated.content, - content_type: contentType, - truncated: truncated.truncated, - }; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - if (err instanceof Error && err.name === 'AbortError') { - return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` }; + try { + res = await fetcher(currentUrl, { + method: 'GET', + redirect: 'manual', + signal: AbortSignal.timeout(FETCH_TIMEOUT_MS), + headers: { + 'User-Agent': 'BooCode/1.11.9', + Accept: 'text/html,text/plain,application/json,*/*', + }, + }); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + // AbortSignal.timeout fires a DOMException with name 'TimeoutError'; + // older runtimes / polyfills may surface 'AbortError'. Treat both. + if (err instanceof Error && (err.name === 'TimeoutError' || err.name === 'AbortError')) { + return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` }; + } + return { error: 'fetch_failed', reason: msg }; } - return { error: 'fetch_failed', reason: msg }; - } finally { - clearTimeout(timer); + + if (res.status >= 300 && res.status < 400) { + const loc = res.headers.get('location'); + if (!loc) { + return { + error: 'redirect_missing_location', + reason: `${res.status} redirect with no Location header`, + }; + } + redirectCount += 1; + if (redirectCount > MAX_REDIRECTS) { + return { + error: 'too_many_redirects', + reason: `Too many redirects (exceeded ${MAX_REDIRECTS} hops)`, + }; + } + // Resolve relative Location against the URL we just hit (RFC 9110). + // The next loop iteration re-runs isPublicUrl on the new currentUrl. + currentUrl = new URL(loc, currentUrl).toString(); + continue; + } + break; } + + if (!res.ok) { + return { error: 'upstream_status', reason: `HTTP ${res.status}` }; + } + // Pre-flight size check via Content-Length when the server provides it. + const lenHeader = res.headers.get('content-length'); + if (lenHeader) { + const len = Number(lenHeader); + if (Number.isFinite(len) && len > MAX_BYTES) { + return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` }; + } + } + const contentType = (res.headers.get('content-type') ?? '').toLowerCase(); + // Read body. We rely on the 5MB cap by checking length after consumption + // — most malicious or accidental large responses also exceed it via the + // Content-Length pre-flight above. A truly hostile server that lies + // about length AND streams gigabytes would defeat that; the per-hop + // 15s timeout is the secondary fence. + const body = await res.text(); + // v1.11.8 review: byte-count, not char-count. A 5MB cap on body.length + // (UTF-16 code units) lets a multi-byte payload (emoji, CJK) pass when + // its wire size already exceeded MAX_BYTES. + const bodyBytes = Buffer.byteLength(body, 'utf8'); + if (bodyBytes > MAX_BYTES) { + return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` }; + } + + let textRaw: string; + let title: string | undefined; + if (contentType.includes('text/html') || contentType.includes('application/xhtml')) { + const stripped = stripHtml(body); + textRaw = stripped.text; + title = stripped.title; + } else if ( + contentType.includes('text/plain') || + contentType.includes('text/markdown') || + contentType.includes('application/json') || + contentType.includes('text/xml') || + contentType.includes('application/xml') + ) { + textRaw = body; + } else { + return { + error: 'unsupported_content_type', + reason: `content-type ${contentType || '(none)'} not supported`, + content_type: contentType, + }; + } + + const truncated = truncate(textRaw, maxChars); + // Report the FINAL URL (post-redirects) so the LLM knows where the body + // came from — useful for citations and for the model to reason about + // domain trust. + return { + url: currentUrl, + title, + content: truncated.content, + content_type: contentType, + truncated: truncated.truncated, + }; } export const webFetch: ToolDef = {