v1.11.9: manual redirect handling — re-run URL guard on each hop

2026-05-21 00:37:35 +00:00
parent 4e67a265ac
commit ab01e04d77
2 changed files with 233 additions and 74 deletions
--- a/apps/server/src/services/tests/web_tools.test.ts
+++ b/apps/server/src/services/tests/web_tools.test.ts
@@ -343,3 +343,113 @@ describe('executeWebFetch — size + truncation', () => {
    expect('content' in result && result.truncated).toBe(false);
  });
 });
 // ============================================================================
 // v1.11.9: manual redirect handling — re-run URL guard on each hop
 // ============================================================================
 // Helper: build a 30x redirect Response. status 302 by default; tests
 // pass other codes (or omit the Location header) when they need to.
 function redirect(loc: string | null, status = 302): Response {
  const headers: Record<string, string> = {};
  if (loc !== null) headers['location'] = loc;
  return new Response('', { status, headers });
 }
 describe('executeWebFetch — redirect handling', () => {
  it('blocks a redirect target that resolves to a private IP (AWS IMDS)', async () => {
    // Public-IP origin 302s into 169.254.169.254 (link-local). Pre-v1.11.9
    // `redirect: 'follow'` would silently follow this; the new manual
    // loop re-runs isPublicUrl on the resolved target and blocks.
    const fakeFetch = vi
      .fn<typeof fetch>()
      .mockResolvedValueOnce(redirect('http://169.254.169.254/latest/meta-data/'));
    const result = await executeWebFetch(
      { url: 'https://example.com/redirect' },
      fakeFetch as unknown as typeof fetch,
    );
    expect('error' in result).toBe(true);
    if ('error' in result) {
      expect(result.error).toBe('blocked_by_url_guard');
      // Reason should make it clear this was a REDIRECT hop, not the
      // initial URL — so logs can distinguish the two failure modes.
      expect(result.reason).toMatch(/redirect target/);
    }
    // Critical: the second fetch (the private target) must NOT happen.
    expect(fakeFetch).toHaveBeenCalledTimes(1);
  });
  it('follows a public-to-public redirect and returns the final body', async () => {
    const fakeFetch = vi
      .fn<typeof fetch>()
      .mockResolvedValueOnce(redirect('https://example.org/final'))
      .mockResolvedValueOnce(mockResponse('ok body', { contentType: 'text/plain' }));
    const result = await executeWebFetch(
      { url: 'https://example.com/start' },
      fakeFetch as unknown as typeof fetch,
    );
    expect('content' in result).toBe(true);
    if ('content' in result) {
      expect(result.content).toBe('ok body');
      // Final URL is reported back so the model knows where the body came from.
      expect(result.url).toBe('https://example.org/final');
    }
    expect(fakeFetch).toHaveBeenCalledTimes(2);
  });
  it('bails after MAX_REDIRECTS hops with a Too many redirects error', async () => {
    // Chain 6 redirects — one more than the loop allows. Each Location
    // points at a distinct public host so the URL guard stays happy and
    // we exercise the redirectCount > MAX_REDIRECTS branch specifically.
    const fakeFetch = vi
      .fn<typeof fetch>()
      .mockResolvedValueOnce(redirect('https://a.example/'))
      .mockResolvedValueOnce(redirect('https://b.example/'))
      .mockResolvedValueOnce(redirect('https://c.example/'))
      .mockResolvedValueOnce(redirect('https://d.example/'))
      .mockResolvedValueOnce(redirect('https://e.example/'))
      .mockResolvedValueOnce(redirect('https://f.example/'));
    const result = await executeWebFetch(
      { url: 'https://start.example/' },
      fakeFetch as unknown as typeof fetch,
    );
    expect('error' in result).toBe(true);
    if ('error' in result) {
      expect(result.error).toBe('too_many_redirects');
      expect(result.reason).toMatch(/Too many redirects/);
    }
  });
  it('errors when a 30x response omits the Location header', async () => {
    const fakeFetch = vi
      .fn<typeof fetch>()
      .mockResolvedValueOnce(redirect(null, 302));
    const result = await executeWebFetch(
      { url: 'https://example.com/' },
      fakeFetch as unknown as typeof fetch,
    );
    expect('error' in result).toBe(true);
    if ('error' in result) {
      expect(result.error).toBe('redirect_missing_location');
      expect(result.reason).toMatch(/no Location/);
    }
  });
  it('resolves a relative Location against the current URL', async () => {
    // Server sends `Location: /foo` (relative) on a request to
    // https://example.com/path. RFC 9110 says resolve against the
    // request URL, so the next hop is https://example.com/foo. Assert
    // the second fetch was called with the absolute resolved URL.
    const fakeFetch = vi
      .fn<typeof fetch>()
      .mockResolvedValueOnce(redirect('/foo'))
      .mockResolvedValueOnce(mockResponse('final', { contentType: 'text/plain' }));
    const result = await executeWebFetch(
      { url: 'https://example.com/path' },
      fakeFetch as unknown as typeof fetch,
    );
    expect('content' in result && result.content).toBe('final');
    expect(fakeFetch).toHaveBeenCalledTimes(2);
    expect(fakeFetch.mock.calls[1]![0]).toBe('https://example.com/foo');
  });
 });
--- a/apps/server/src/services/web_fetch.ts
+++ b/apps/server/src/services/web_fetch.ts
@@ -22,6 +22,9 @@ const DEFAULT_MAX_CHARS = 8_000;
 const MAX_CHARS_CAP = 32_000;
 const FETCH_TIMEOUT_MS = 15_000;
 const MAX_BYTES = 5 * 1024 * 1024;
 // v1.11.9: cap redirect chains. Each hop re-runs isPublicUrl on the
 // resolved target so a public-IP origin can't 302 us into a private IP.
 const MAX_REDIRECTS = 5;
 // Output shape. Each variant uses a discriminator the LLM can branch on.
 export type WebFetchOutput =
@@ -74,89 +77,135 @@ export async function executeWebFetch(
  input: WebFetchInputT,
  fetcher: typeof fetch = fetch,
 ): Promise<WebFetchOutput> {
  const guard = isPublicUrl(input.url);
  if (!guard.ok) {
    return { error: 'blocked_by_url_guard', reason: guard.reason ?? 'unknown' };
  }
  const maxChars = Math.min(input.max_chars ?? DEFAULT_MAX_CHARS, MAX_CHARS_CAP);
-  const controller = new AbortController();
+  // v1.11.9: manual redirect handling. `redirect: 'follow'` in fetch
-  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
+  // doesn't expose intermediate hops — a public-IP origin that 302s us
-  try {
+  // to 169.254.169.254 would silently bypass isPublicUrl. We follow each
-    const res = await fetcher(input.url, {
+  // hop ourselves, re-running the URL guard on the resolved target so a
-      signal: controller.signal,
+  // mid-chain hostile redirect gets blocked.
-      // TODO(v1.11.9): redirect: 'manual' + re-run isPublicUrl on Location header.
+  //
-      // Current 'follow' allows redirect-to-private-IP bypass of URL guard.
+  // Timeout semantics changed from v1.11.8: AbortSignal.timeout fires
-      redirect: 'follow',
+  // per fetch hop (vs. one 15s budget shared across the whole call). In
-      headers: { 'User-Agent': 'BooCode/1.11.8', Accept: 'text/html,text/plain,application/json,*/*' },
+  // the worst case a 5-hop chain can take ~5×15s before erroring — still
-    });
+  // bounded; trades a longer cap for simpler code.
-    if (!res.ok) {
+  let currentUrl = input.url;
-      return { error: 'upstream_status', reason: `HTTP ${res.status}` };
+  let res: Response | undefined;
-    }
+  let redirectCount = 0;
    // Pre-flight size check via Content-Length when the server provides it.
    const lenHeader = res.headers.get('content-length');
    if (lenHeader) {
      const len = Number(lenHeader);
      if (Number.isFinite(len) && len > MAX_BYTES) {
        return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` };
      }
    }
    const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
    // Read body. We rely on the 5MB cap by checking length after consumption
    // — most malicious or accidental large responses also exceed it via the
    // Content-Length pre-flight above. A truly hostile server that lies
    // about length AND streams gigabytes would defeat that; for v1.11.8
    // the 15s timeout is the secondary fence.
    const body = await res.text();
    // v1.11.8 review: byte-count, not char-count. A 5MB cap on
    // body.length (UTF-16 code units) lets a multi-byte payload (emoji,
    // CJK) pass when its wire size already exceeded MAX_BYTES. Compute
    // once and reuse for the error message.
    const bodyBytes = Buffer.byteLength(body, 'utf8');
    if (bodyBytes > MAX_BYTES) {
      return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` };
    }
-    let textRaw: string;
+  while (true) {
-    let title: string | undefined;
+    const guard = isPublicUrl(currentUrl);
-    if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
+    if (!guard.ok) {
      const stripped = stripHtml(body);
      textRaw = stripped.text;
      title = stripped.title;
    } else if (
      contentType.includes('text/plain') ||
      contentType.includes('text/markdown') ||
      contentType.includes('application/json') ||
      contentType.includes('text/xml') ||
      contentType.includes('application/xml')
    ) {
      textRaw = body;
    } else {
      return {
-        error: 'unsupported_content_type',
+        error: 'blocked_by_url_guard',
-        reason: `content-type ${contentType || '(none)'} not supported`,
+        reason: redirectCount === 0
-        content_type: contentType,
+          ? (guard.reason ?? 'unknown')
          : `redirect target ${currentUrl} blocked: ${guard.reason ?? 'unknown'}`,
      };
    }
-    const truncated = truncate(textRaw, maxChars);
+    try {
-    return {
+      res = await fetcher(currentUrl, {
-      url: input.url,
+        method: 'GET',
-      title,
+        redirect: 'manual',
-      content: truncated.content,
+        signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
-      content_type: contentType,
+        headers: {
-      truncated: truncated.truncated,
+          'User-Agent': 'BooCode/1.11.9',
-    };
+          Accept: 'text/html,text/plain,application/json,*/*',
-  } catch (err) {
+        },
-    const msg = err instanceof Error ? err.message : String(err);
+      });
-    if (err instanceof Error && err.name === 'AbortError') {
+    } catch (err) {
-      return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` };
+      const msg = err instanceof Error ? err.message : String(err);
      // AbortSignal.timeout fires a DOMException with name 'TimeoutError';
      // older runtimes / polyfills may surface 'AbortError'. Treat both.
      if (err instanceof Error && (err.name === 'TimeoutError' || err.name === 'AbortError')) {
        return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` };
      }
      return { error: 'fetch_failed', reason: msg };
    }
-    return { error: 'fetch_failed', reason: msg };
+
-  } finally {
+    if (res.status >= 300 && res.status < 400) {
-    clearTimeout(timer);
+      const loc = res.headers.get('location');
      if (!loc) {
        return {
          error: 'redirect_missing_location',
          reason: `${res.status} redirect with no Location header`,
        };
      }
      redirectCount += 1;
      if (redirectCount > MAX_REDIRECTS) {
        return {
          error: 'too_many_redirects',
          reason: `Too many redirects (exceeded ${MAX_REDIRECTS} hops)`,
        };
      }
      // Resolve relative Location against the URL we just hit (RFC 9110).
      // The next loop iteration re-runs isPublicUrl on the new currentUrl.
      currentUrl = new URL(loc, currentUrl).toString();
      continue;
    }
    break;
  }
  if (!res.ok) {
    return { error: 'upstream_status', reason: `HTTP ${res.status}` };
  }
  // Pre-flight size check via Content-Length when the server provides it.
  const lenHeader = res.headers.get('content-length');
  if (lenHeader) {
    const len = Number(lenHeader);
    if (Number.isFinite(len) && len > MAX_BYTES) {
      return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` };
    }
  }
  const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
  // Read body. We rely on the 5MB cap by checking length after consumption
  // — most malicious or accidental large responses also exceed it via the
  // Content-Length pre-flight above. A truly hostile server that lies
  // about length AND streams gigabytes would defeat that; the per-hop
  // 15s timeout is the secondary fence.
  const body = await res.text();
  // v1.11.8 review: byte-count, not char-count. A 5MB cap on body.length
  // (UTF-16 code units) lets a multi-byte payload (emoji, CJK) pass when
  // its wire size already exceeded MAX_BYTES.
  const bodyBytes = Buffer.byteLength(body, 'utf8');
  if (bodyBytes > MAX_BYTES) {
    return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` };
  }
  let textRaw: string;
  let title: string | undefined;
  if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
    const stripped = stripHtml(body);
    textRaw = stripped.text;
    title = stripped.title;
  } else if (
    contentType.includes('text/plain') ||
    contentType.includes('text/markdown') ||
    contentType.includes('application/json') ||
    contentType.includes('text/xml') ||
    contentType.includes('application/xml')
  ) {
    textRaw = body;
  } else {
    return {
      error: 'unsupported_content_type',
      reason: `content-type ${contentType || '(none)'} not supported`,
      content_type: contentType,
    };
  }
  const truncated = truncate(textRaw, maxChars);
  // Report the FINAL URL (post-redirects) so the LLM knows where the body
  // came from — useful for citations and for the model to reason about
  // domain trust.
  return {
    url: currentUrl,
    title,
    content: truncated.content,
    content_type: contentType,
    truncated: truncated.truncated,
  };
 }
 export const webFetch: ToolDef<WebFetchInputT> = {