v1.11.9: manual redirect handling — re-run URL guard on each hop
This commit is contained in:
@@ -22,6 +22,9 @@ const DEFAULT_MAX_CHARS = 8_000;
|
||||
const MAX_CHARS_CAP = 32_000;
|
||||
const FETCH_TIMEOUT_MS = 15_000;
|
||||
const MAX_BYTES = 5 * 1024 * 1024;
|
||||
// v1.11.9: cap redirect chains. Each hop re-runs isPublicUrl on the
|
||||
// resolved target so a public-IP origin can't 302 us into a private IP.
|
||||
const MAX_REDIRECTS = 5;
|
||||
|
||||
// Output shape. Each variant uses a discriminator the LLM can branch on.
|
||||
export type WebFetchOutput =
|
||||
@@ -74,89 +77,135 @@ export async function executeWebFetch(
|
||||
input: WebFetchInputT,
|
||||
fetcher: typeof fetch = fetch,
|
||||
): Promise<WebFetchOutput> {
|
||||
const guard = isPublicUrl(input.url);
|
||||
if (!guard.ok) {
|
||||
return { error: 'blocked_by_url_guard', reason: guard.reason ?? 'unknown' };
|
||||
}
|
||||
|
||||
const maxChars = Math.min(input.max_chars ?? DEFAULT_MAX_CHARS, MAX_CHARS_CAP);
|
||||
|
||||
const controller = new AbortController();
|
||||
const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
|
||||
try {
|
||||
const res = await fetcher(input.url, {
|
||||
signal: controller.signal,
|
||||
// TODO(v1.11.9): redirect: 'manual' + re-run isPublicUrl on Location header.
|
||||
// Current 'follow' allows redirect-to-private-IP bypass of URL guard.
|
||||
redirect: 'follow',
|
||||
headers: { 'User-Agent': 'BooCode/1.11.8', Accept: 'text/html,text/plain,application/json,*/*' },
|
||||
});
|
||||
if (!res.ok) {
|
||||
return { error: 'upstream_status', reason: `HTTP ${res.status}` };
|
||||
}
|
||||
// Pre-flight size check via Content-Length when the server provides it.
|
||||
const lenHeader = res.headers.get('content-length');
|
||||
if (lenHeader) {
|
||||
const len = Number(lenHeader);
|
||||
if (Number.isFinite(len) && len > MAX_BYTES) {
|
||||
return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` };
|
||||
}
|
||||
}
|
||||
const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
|
||||
// Read body. We rely on the 5MB cap by checking length after consumption
|
||||
// — most malicious or accidental large responses also exceed it via the
|
||||
// Content-Length pre-flight above. A truly hostile server that lies
|
||||
// about length AND streams gigabytes would defeat that; for v1.11.8
|
||||
// the 15s timeout is the secondary fence.
|
||||
const body = await res.text();
|
||||
// v1.11.8 review: byte-count, not char-count. A 5MB cap on
|
||||
// body.length (UTF-16 code units) lets a multi-byte payload (emoji,
|
||||
// CJK) pass when its wire size already exceeded MAX_BYTES. Compute
|
||||
// once and reuse for the error message.
|
||||
const bodyBytes = Buffer.byteLength(body, 'utf8');
|
||||
if (bodyBytes > MAX_BYTES) {
|
||||
return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` };
|
||||
}
|
||||
// v1.11.9: manual redirect handling. `redirect: 'follow'` in fetch
|
||||
// doesn't expose intermediate hops — a public-IP origin that 302s us
|
||||
// to 169.254.169.254 would silently bypass isPublicUrl. We follow each
|
||||
// hop ourselves, re-running the URL guard on the resolved target so a
|
||||
// mid-chain hostile redirect gets blocked.
|
||||
//
|
||||
// Timeout semantics changed from v1.11.8: AbortSignal.timeout fires
|
||||
// per fetch hop (vs. one 15s budget shared across the whole call). In
|
||||
// the worst case a 5-redirect chain makes 6 requests, so up to ~6×15s — still
|
||||
// bounded; trades a longer cap for simpler code.
|
||||
let currentUrl = input.url;
|
||||
let res: Response | undefined;
|
||||
let redirectCount = 0;
|
||||
|
||||
let textRaw: string;
|
||||
let title: string | undefined;
|
||||
if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
|
||||
const stripped = stripHtml(body);
|
||||
textRaw = stripped.text;
|
||||
title = stripped.title;
|
||||
} else if (
|
||||
contentType.includes('text/plain') ||
|
||||
contentType.includes('text/markdown') ||
|
||||
contentType.includes('application/json') ||
|
||||
contentType.includes('text/xml') ||
|
||||
contentType.includes('application/xml')
|
||||
) {
|
||||
textRaw = body;
|
||||
} else {
|
||||
while (true) {
|
||||
const guard = isPublicUrl(currentUrl);
|
||||
if (!guard.ok) {
|
||||
return {
|
||||
error: 'unsupported_content_type',
|
||||
reason: `content-type ${contentType || '(none)'} not supported`,
|
||||
content_type: contentType,
|
||||
error: 'blocked_by_url_guard',
|
||||
reason: redirectCount === 0
|
||||
? (guard.reason ?? 'unknown')
|
||||
: `redirect target ${currentUrl} blocked: ${guard.reason ?? 'unknown'}`,
|
||||
};
|
||||
}
|
||||
|
||||
const truncated = truncate(textRaw, maxChars);
|
||||
return {
|
||||
url: input.url,
|
||||
title,
|
||||
content: truncated.content,
|
||||
content_type: contentType,
|
||||
truncated: truncated.truncated,
|
||||
};
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
if (err instanceof Error && err.name === 'AbortError') {
|
||||
return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` };
|
||||
try {
|
||||
res = await fetcher(currentUrl, {
|
||||
method: 'GET',
|
||||
redirect: 'manual',
|
||||
signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
|
||||
headers: {
|
||||
'User-Agent': 'BooCode/1.11.9',
|
||||
Accept: 'text/html,text/plain,application/json,*/*',
|
||||
},
|
||||
});
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
// AbortSignal.timeout fires a DOMException with name 'TimeoutError';
|
||||
// older runtimes / polyfills may surface 'AbortError'. Treat both.
|
||||
if (err instanceof Error && (err.name === 'TimeoutError' || err.name === 'AbortError')) {
|
||||
return { error: 'timeout', reason: `aborted after ${FETCH_TIMEOUT_MS}ms` };
|
||||
}
|
||||
return { error: 'fetch_failed', reason: msg };
|
||||
}
|
||||
return { error: 'fetch_failed', reason: msg };
|
||||
} finally {
|
||||
clearTimeout(timer);
|
||||
|
||||
if (res.status >= 300 && res.status < 400) {
|
||||
const loc = res.headers.get('location');
|
||||
if (!loc) {
|
||||
return {
|
||||
error: 'redirect_missing_location',
|
||||
reason: `${res.status} redirect with no Location header`,
|
||||
};
|
||||
}
|
||||
redirectCount += 1;
|
||||
if (redirectCount > MAX_REDIRECTS) {
|
||||
return {
|
||||
error: 'too_many_redirects',
|
||||
reason: `Too many redirects (exceeded ${MAX_REDIRECTS} hops)`,
|
||||
};
|
||||
}
|
||||
// Resolve relative Location against the URL we just hit (RFC 9110).
|
||||
// The next loop iteration re-runs isPublicUrl on the new currentUrl.
|
||||
currentUrl = new URL(loc, currentUrl).toString();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (!res.ok) {
|
||||
return { error: 'upstream_status', reason: `HTTP ${res.status}` };
|
||||
}
|
||||
// Pre-flight size check via Content-Length when the server provides it.
|
||||
const lenHeader = res.headers.get('content-length');
|
||||
if (lenHeader) {
|
||||
const len = Number(lenHeader);
|
||||
if (Number.isFinite(len) && len > MAX_BYTES) {
|
||||
return { error: 'response_too_large', reason: `Content-Length ${len} > ${MAX_BYTES}` };
|
||||
}
|
||||
}
|
||||
const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
|
||||
// Read body. We rely on the 5MB cap by checking length after consumption
|
||||
// — most malicious or accidental large responses are already rejected by the
|
||||
// Content-Length pre-flight above. A truly hostile server that lies
|
||||
// about length AND streams gigabytes would defeat that; the per-hop
|
||||
// 15s timeout is the secondary fence.
|
||||
const body = await res.text();
|
||||
// v1.11.8 review: byte-count, not char-count. A 5MB cap on body.length
|
||||
// (UTF-16 code units) lets a multi-byte payload (emoji, CJK) pass when
|
||||
// its wire size already exceeded MAX_BYTES.
|
||||
const bodyBytes = Buffer.byteLength(body, 'utf8');
|
||||
if (bodyBytes > MAX_BYTES) {
|
||||
return { error: 'response_too_large', reason: `body ${bodyBytes} bytes > ${MAX_BYTES}` };
|
||||
}
|
||||
|
||||
let textRaw: string;
|
||||
let title: string | undefined;
|
||||
if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
|
||||
const stripped = stripHtml(body);
|
||||
textRaw = stripped.text;
|
||||
title = stripped.title;
|
||||
} else if (
|
||||
contentType.includes('text/plain') ||
|
||||
contentType.includes('text/markdown') ||
|
||||
contentType.includes('application/json') ||
|
||||
contentType.includes('text/xml') ||
|
||||
contentType.includes('application/xml')
|
||||
) {
|
||||
textRaw = body;
|
||||
} else {
|
||||
return {
|
||||
error: 'unsupported_content_type',
|
||||
reason: `content-type ${contentType || '(none)'} not supported`,
|
||||
content_type: contentType,
|
||||
};
|
||||
}
|
||||
|
||||
const truncated = truncate(textRaw, maxChars);
|
||||
// Report the FINAL URL (post-redirects) so the LLM knows where the body
|
||||
// came from — useful for citations and for the model to reason about
|
||||
// domain trust.
|
||||
return {
|
||||
url: currentUrl,
|
||||
title,
|
||||
content: truncated.content,
|
||||
content_type: contentType,
|
||||
truncated: truncated.truncated,
|
||||
};
|
||||
}
|
||||
|
||||
export const webFetch: ToolDef<WebFetchInputT> = {
|
||||
|
||||
Reference in New Issue
Block a user