Files
boocode/apps/server/src/services/url_guard.ts
indifferentketchup 2fdbb05477 v1.11.8: web_search + web_fetch tools via SearXNG
Adds two new tools registered through the existing ALL_TOOLS registry:
  - web_search hits SearXNG's JSON API (Fathom, internal Tailscale URL,
    no auth) and returns top results
  - web_fetch retrieves a URL's text content, gated by isPublicUrl
    (url_guard.ts) which blocks loopback / RFC1918 / Tailscale CGNAT /
    link-local / .local / .internal / non-http schemes

Both tools are opt-in via the existing session.web_search_enabled flag
(plumbed in v1.9, activated here). Default off. UI labels updated to
"Enable web search and fetch" / "Web search and fetch" since fetch joins
the same store. Counts against the v1.8.2 per-turn budget; covered by
the v1.11.6 doom-loop guard.

Native Node 20 fetch — no new prod dep. HTML stripping via regex (script
and style content elided wholesale). 5MB body cap, 15s fetch timeout,
8000-char default output, 32000-char cap.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 21:38:02 +00:00

79 lines
3.1 KiB
TypeScript

// v1.11.8: SSRF guard for web_fetch (and any other tool that follows a
// model-supplied URL). Sibling of path_guard.ts (workspace scope) and
// secret_guard.ts (filename deny) — same _guard.ts naming pattern. The
// spec suggested apps/server/src/services/safety/urlGuard.ts but BooCode
// has no `safety/` subdirectory and the existing guards live one level up.
//
// Block list, in order of evaluation:
// - protocol other than http: / https:
// - hostname is a known private name (localhost, 0.0.0.0, ::1)
// - hostname ends with .local or .internal (mDNS / private TLD)
// - IPv4 in any RFC1918 / loopback / CGNAT / link-local range
//
// IPv6 numeric literals aren't enumerated here. Most public hostnames
// resolve to IPv4 via DNS; an IPv6-only attack surface against a
// chat-app deployment is exotic enough to defer until a real abuse case
// motivates a comprehensive check. The protocol + name-suffix checks
// already cover the common LAN-targeting cases.
export interface UrlGuardResult {
ok: boolean;
reason?: string;
}
export function isPublicUrl(input: string): UrlGuardResult {
let u: URL;
try {
u = new URL(input);
} catch {
return { ok: false, reason: 'invalid_url' };
}
if (u.protocol !== 'http:' && u.protocol !== 'https:') {
return { ok: false, reason: `unsupported_protocol: ${u.protocol}` };
}
const host = u.hostname.toLowerCase();
if (host.length === 0) {
return { ok: false, reason: 'empty_host' };
}
// Bare-name targets
if (host === 'localhost' || host === '0.0.0.0') {
return { ok: false, reason: `private_host: ${host}` };
}
// node's URL strips the [] from a literal IPv6 host. Both forms checked.
if (host === '::1' || host === '[::1]') {
return { ok: false, reason: `loopback_v6: ${host}` };
}
// mDNS / private TLDs
if (host.endsWith('.local') || host.endsWith('.internal')) {
return { ok: false, reason: `private_suffix: ${host}` };
}
// IPv4 numeric ranges. Matches host that's all-numeric octets only — DNS
// names that happen to start with digits (e.g. 1password.com) won't match.
const ipv4 = host.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
if (ipv4) {
const o1 = Number(ipv4[1]);
const o2 = Number(ipv4[2]);
// Loopback 127.0.0.0/8
if (o1 === 127) return { ok: false, reason: `loopback: ${host}` };
// RFC1918 10.0.0.0/8
if (o1 === 10) return { ok: false, reason: `rfc1918: ${host}` };
// RFC1918 172.16.0.0/12
if (o1 === 172 && o2 >= 16 && o2 <= 31) return { ok: false, reason: `rfc1918: ${host}` };
// RFC1918 192.168.0.0/16
if (o1 === 192 && o2 === 168) return { ok: false, reason: `rfc1918: ${host}` };
// CGNAT / Tailscale 100.64.0.0/10
if (o1 === 100 && o2 >= 64 && o2 <= 127) return { ok: false, reason: `cgnat: ${host}` };
// Link-local 169.254.0.0/16 (covers AWS/GCP metadata IMDS)
if (o1 === 169 && o2 === 254) return { ok: false, reason: `link_local: ${host}` };
// Source net 0.0.0.0/8 (rare but possible)
if (o1 === 0) return { ok: false, reason: `zero_net: ${host}` };
}
return { ok: true };
}