v1.11.9: manual redirect handling — re-run URL guard on each hop

2026-05-21 00:37:35 +00:00
parent 4e67a265ac
commit ab01e04d77
2 changed files with 233 additions and 74 deletions
--- a/apps/server/src/services/web_fetch.ts
+++ b/apps/server/src/services/web_fetch.ts
@@ -22,6 +22,9 @@ const DEFAULT_MAX_CHARS = 8_000;
 const MAX_CHARS_CAP = 32_000;
 const FETCH_TIMEOUT_MS = 15_000;
 const MAX_BYTES = 5 * 1024 * 1024;
+// v1.11.9: cap redirect chains. Each hop re-runs isPublicUrl on the
+// resolved target so a public-IP origin can't 302 us into a private IP.
+const MAX_REDIRECTS = 5;

 // Output shape. Each variant uses a discriminator the LLM can branch on.
 export type WebFetchOutput =
@@ -74,89 +77,135 @@ export async function executeWebFetch(
  input: WebFetchInputT,
  fetcher: typeof fetch = fetch,
 ): Promise<WebFetchOutput> {
-  const guard = isPublicUrl(input.url);
-  if (!guard.ok) {
-    return { error: 'blocked_by_url_guard', reason: guard.reason ?? 'unknown' };
-  }
-
  const maxChars = Math.min(input.max_chars ?? DEFAULT_MAX_CHARS, MAX_CHARS_CAP);

-  const controller = new AbortController();
-  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
-  try {
-    const res = await fetcher(input.url, {
-      signal: controller.signal,
-      // TODO(v1.11.9): redirect: 'manual' + re-run isPublicUrl on Location header.
-      // Current 'follow' allows redirect-to-private-IP bypass of URL guard.
-      redirect: 'follow',
-      headers: { 'User-Agent': 'BooCode/1.11.8', Accept: 'text/html,text/plain,application/json,*/*' },
-    });
-    if (!res.ok) {
-      return { error: 'upstream_status', reason: `HTTP ${res.status}` };
-    }
-    // Pre-flight size check via Content-Length when the server provides it.
-    const lenHeader = res.headers.get('content-length');
-    if (lenHeader) {
-      const len = Number(lenHeader);
-      if (Number.isFinite(len) && len > MAX_BYTES) {
-        return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` };
-      }
-    }
-    const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
-    // Read body. We rely on the 5MB cap by checking length after consumption
-    // — most malicious or accidental large responses also exceed it via the
-    // Content-Length pre-flight above. A truly hostile server that lies
-    // about length AND streams gigabytes would defeat that; for v1.11.8
-    // the 15s timeout is the secondary fence.
-    const body = await res.text();
-    // v1.11.8 review: byte-count, not char-count. A 5MB cap on
-    // body.length (UTF-16 code units) lets a multi-byte payload (emoji,
-    // CJK) pass when its wire size already exceeded MAX_BYTES. Compute
-    // once and reuse for the error message.
-    const bodyBytes = Buffer.byteLength(body, 'utf8');
-    if (bodyBytes > MAX_BYTES) {
-      return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` };
-    }
+  // v1.11.9: manual redirect handling. `redirect: 'follow'` in fetch
+  // doesn't expose intermediate hops — a public-IP origin that 302s us
+  // to 169.254.169.254 would silently bypass isPublicUrl. We follow each
+  // hop ourselves, re-running the URL guard on the resolved target so a
+  // mid-chain hostile redirect gets blocked.
+  //
+  // Timeout semantics changed from v1.11.8: AbortSignal.timeout fires
+  // per fetch hop (vs. one 15s budget shared across the whole call). In
+  // the worst case a 5-hop chain can take ~5×15s before erroring — still
+  // bounded; trades a longer cap for simpler code.
+  let currentUrl = input.url;
+  let res: Response | undefined;
+  let redirectCount = 0;

-    let textRaw: string;
-    let title: string | undefined;
-    if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
-      const stripped = stripHtml(body);
-      textRaw = stripped.text;
-      title = stripped.title;
-    } else if (
-      contentType.includes('text/plain') ||
-      contentType.includes('text/markdown') ||
-      contentType.includes('application/json') ||
-      contentType.includes('text/xml') ||
-      contentType.includes('application/xml')
-    ) {
-      textRaw = body;
-    } else {
+  while (true) {
+    const guard = isPublicUrl(currentUrl);
+    if (!guard.ok) {
      return {
-        error: 'unsupported_content_type',
-        reason: `content-type ${contentType || '(none)'} not supported`,
-        content_type: contentType,
+        error: 'blocked_by_url_guard',
+        reason: redirectCount === 0
+          ? (guard.reason ?? 'unknown')
+          : `redirect target ${currentUrl} blocked: ${guard.reason ?? 'unknown'}`,
      };
    }

-    const truncated = truncate(textRaw, maxChars);
-    return {
-      url: input.url,
-      title,
-      content: truncated.content,
-      content_type: contentType,
-      truncated: truncated.truncated,
-    };
-  } catch (err) {
-    const msg = err instanceof Error ? err.message : String(err);
-    if (err instanceof Error && err.name === 'AbortError') {
-      return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` };
+    try {
+      res = await fetcher(currentUrl, {
+        method: 'GET',
+        redirect: 'manual',
+        signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
+        headers: {
+          'User-Agent': 'BooCode/1.11.9',
+          Accept: 'text/html,text/plain,application/json,*/*',
+        },
+      });
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      // AbortSignal.timeout fires a DOMException with name 'TimeoutError';
+      // older runtimes / polyfills may surface 'AbortError'. Treat both.
+      if (err instanceof Error && (err.name === 'TimeoutError' || err.name === 'AbortError')) {
+        return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` };
+      }
+      return { error: 'fetch_failed', reason: msg };
    }
-    return { error: 'fetch_failed', reason: msg };
-  } finally {
-    clearTimeout(timer);
+
+    if (res.status >= 300 && res.status < 400) {
+      const loc = res.headers.get('location');
+      if (!loc) {
+        return {
+          error: 'redirect_missing_location',
+          reason: `${res.status} redirect with no Location header`,
+        };
+      }
+      redirectCount += 1;
+      if (redirectCount > MAX_REDIRECTS) {
+        return {
+          error: 'too_many_redirects',
+          reason: `Too many redirects (exceeded ${MAX_REDIRECTS} hops)`,
+        };
+      }
+      // Resolve relative Location against the URL we just hit (RFC 9110).
+      // The next loop iteration re-runs isPublicUrl on the new currentUrl.
+      currentUrl = new URL(loc, currentUrl).toString();
+      continue;
+    }
+    break;
  }
+
+  if (!res.ok) {
+    return { error: 'upstream_status', reason: `HTTP ${res.status}` };
+  }
+  // Pre-flight size check via Content-Length when the server provides it.
+  const lenHeader = res.headers.get('content-length');
+  if (lenHeader) {
+    const len = Number(lenHeader);
+    if (Number.isFinite(len) && len > MAX_BYTES) {
+      return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` };
+    }
+  }
+  const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
+  // Read body. We rely on the 5MB cap by checking length after consumption
+  // — most malicious or accidental large responses also exceed it via the
+  // Content-Length pre-flight above. A truly hostile server that lies
+  // about length AND streams gigabytes would defeat that; the per-hop
+  // 15s timeout is the secondary fence.
+  const body = await res.text();
+  // v1.11.8 review: byte-count, not char-count. A 5MB cap on body.length
+  // (UTF-16 code units) lets a multi-byte payload (emoji, CJK) pass when
+  // its wire size already exceeded MAX_BYTES.
+  const bodyBytes = Buffer.byteLength(body, 'utf8');
+  if (bodyBytes > MAX_BYTES) {
+    return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` };
+  }
+
+  let textRaw: string;
+  let title: string | undefined;
+  if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
+    const stripped = stripHtml(body);
+    textRaw = stripped.text;
+    title = stripped.title;
+  } else if (
+    contentType.includes('text/plain') ||
+    contentType.includes('text/markdown') ||
+    contentType.includes('application/json') ||
+    contentType.includes('text/xml') ||
+    contentType.includes('application/xml')
+  ) {
+    textRaw = body;
+  } else {
+    return {
+      error: 'unsupported_content_type',
+      reason: `content-type ${contentType || '(none)'} not supported`,
+      content_type: contentType,
+    };
+  }
+
+  const truncated = truncate(textRaw, maxChars);
+  // Report the FINAL URL (post-redirects) so the LLM knows where the body
+  // came from — useful for citations and for the model to reason about
+  // domain trust.
+  return {
+    url: currentUrl,
+    title,
+    content: truncated.content,
+    content_type: contentType,
+    truncated: truncated.truncated,
+  };
 }

 export const webFetch: ToolDef<WebFetchInputT> = {