v2.4.1-sidecar-routing: route per-agent flags to llama-sidecar + tool gap fix

Batch 3c: when an agent has llama_extra_args in AGENTS.md, provider.ts
routes inference through LLAMA_SIDECAR_URL instead of LLAMA_SWAP_URL.
The X-Agent-Flags header is built from the agent's flags. A boot-time
guard refuses to start if any agent has llama_extra_args but
LLAMA_SIDECAR_URL is unset. PrefixFingerprint gains a route field
(swap/sidecar) for per-turn visibility. Adds 9 provider tests.

AGENTS.md tool gap: all agents (except Prompt Builder) were missing 8
tools that were added after the original tool lists were written:
request_read_access, view_truncated_output, ask_user_input, git_status,
get_blast_radius, get_hot_files, get_middleware, get_routes. The missing
request_read_access caused silent "permission denied" errors when reading
files outside the project root.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-27 19:28:08 +00:00
parent 90a6761b07
commit bcfc94fa47
8 changed files with 155 additions and 26 deletions

View File

@@ -1,37 +1,84 @@
import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
import type { LanguageModel } from 'ai';
// TODO: When per-agent llama-server flag overrides are added, route them
// through validateExtraArgs (./llama-args-validator.ts) first.
// v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
// config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
// upstream without touching env vars. No apiKey — llama-swap is unauth in our
// Tailscale topology and exposing it over the public internet is gated by
// Authelia at the Caddy layer, not by API keys.
//
// v2.4.1-sidecar: when the agent has llama_extra_args, route through
// llama-sidecar instead. A fresh provider is created per call (not cached)
// because the X-Agent-Flags header varies per agent. The llama-swap path
// stays cached since it has no per-request headers.
// NOTE(review): `cache` has the same type as `swapCache` below and looks like
// its pre-rename counterpart left over from the diff view — confirm whether
// it is still needed.
const cache = new Map<string, ReturnType<typeof createOpenAICompatible>>();
// Memoized llama-swap providers, keyed by the exact baseURL string handed to
// getSwapProvider (i.e. before any `/v1` suffix is appended).
const swapCache = new Map<string, ReturnType<typeof createOpenAICompatible>>();
function getProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
let provider = cache.get(baseURL);
/**
 * Returns the llama-swap provider for `baseURL`, creating and memoizing it in
 * `swapCache` on first use.
 *
 * @param baseURL - Upstream base URL; `/v1` is appended unless the string
 *   already ends with it. The cache key is the string exactly as passed in.
 * @returns The cached (or newly created) OpenAI-compatible provider.
 */
function getSwapProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
let provider = swapCache.get(baseURL);
// Cache miss: build the provider once and reuse it for later calls.
if (!provider) {
provider = createOpenAICompatible({
name: 'llama-swap',
// Only an exact trailing `/v1` is recognized; anything else gets `/v1` appended.
baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`,
// v1.13.7: @ai-sdk/openai-compatible defaults includeUsage=false, which
// omits `stream_options.include_usage` from the request body. Without
// it, llama.cpp / llama-swap never emits the trailing usage block, so
// `result.usage` resolves with inputTokens=outputTokens=undefined and
// tokens_used / ctx_used land as NULL in every messages row. Setting
// true here re-enables the per-stream usage payload across all models
// served via the llama-swap provider.
includeUsage: true,
});
// NOTE(review): this write targets `cache`, not `swapCache`; it looks like a
// removed line carried over from the diff view — confirm before relying on it.
cache.set(baseURL, provider);
swapCache.set(baseURL, provider);
}
return provider;
}
export function upstreamModel(baseURL: string, modelId: string): LanguageModel {
return getProvider(baseURL).chatModel(modelId);
/**
 * Builds a new llama-sidecar provider carrying the agent's flags.
 *
 * Nothing is memoized here: every call constructs a fresh provider whose
 * `X-Agent-Flags` header is the given flags joined with single spaces.
 *
 * @param baseURL - Sidecar base URL; `/v1` is appended unless the string
 *   already ends with it.
 * @param flags - Flag tokens to send in the `X-Agent-Flags` header.
 * @returns A newly created OpenAI-compatible provider named `llama-sidecar`.
 */
function sidecarProvider(
  baseURL: string,
  flags: string[],
): ReturnType<typeof createOpenAICompatible> {
  let v1BaseURL = baseURL;
  if (!baseURL.endsWith('/v1')) {
    v1BaseURL = `${baseURL}/v1`;
  }

  const agentFlagsHeader = flags.join(' ');

  const provider = createOpenAICompatible({
    name: 'llama-sidecar',
    baseURL: v1BaseURL,
    // Request the per-stream usage payload, same as the llama-swap provider.
    includeUsage: true,
    headers: {
      'X-Agent-Flags': agentFlagsHeader,
    },
  });
  return provider;
}
/**
 * Which upstream an inference request is sent to: `'swap'` uses the provider
 * built from LLAMA_SWAP_URL, `'sidecar'` the one built from LLAMA_SIDECAR_URL.
 */
export type InferenceRoute = 'swap' | 'sidecar';
/** Routing decision produced by resolveRoute. */
export interface RoutingInfo {
/** Selected upstream. */
route: InferenceRoute;
/** The agent's llama_extra_args on the sidecar route; `null` on the swap route. */
flags: string[] | null;
}
/** Minimal agent shape needed for routing; only llama_extra_args is read. */
interface AgentLike {
/** Per-agent extra flags; `null` or an empty array selects the swap route. */
llama_extra_args: string[] | null;
}
/** Minimal config shape needed to pick an upstream URL. */
interface ConfigLike {
/** Base URL used for the swap route. */
LLAMA_SWAP_URL: string;
/** Base URL used for the sidecar route; upstreamModel throws if it is needed but unset. */
LLAMA_SIDECAR_URL?: string;
}
/**
 * Decides which upstream an agent's inference should use.
 *
 * An agent with at least one entry in `llama_extra_args` is routed to the
 * sidecar and its flags array is returned as-is (same reference). Everything
 * else — no agent, `null` flags, or an empty flags array — goes to swap with
 * `flags: null`.
 *
 * @param agent - The agent to route, or `null` when there is none.
 * @returns The selected route together with the flags to forward, if any.
 */
export function resolveRoute(agent: AgentLike | null): RoutingInfo {
  const extraArgs = agent?.llama_extra_args ?? null;

  // Guard: missing or empty flags fall back to the default swap route.
  if (extraArgs === null || !(extraArgs.length > 0)) {
    return { route: 'swap', flags: null };
  }

  return { route: 'sidecar', flags: extraArgs };
}
/**
 * Returns the chat model for `modelId` on the upstream chosen for `agent`.
 *
 * Routing is delegated to resolveRoute: agents with llama_extra_args use a
 * provider from sidecarProvider (built per call with the agent's flags);
 * all other calls use the memoized provider from getSwapProvider.
 *
 * @param config - Supplies LLAMA_SWAP_URL and, optionally, LLAMA_SIDECAR_URL.
 * @param modelId - Model identifier passed to the provider's `chatModel`.
 * @param agent - Optional agent; omitted, `undefined` and `null` are all
 *   treated as "no agent".
 * @returns The language model bound to the selected upstream.
 * @throws Error when the sidecar route is selected but LLAMA_SIDECAR_URL is
 *   missing or empty.
 */
export function upstreamModel(
  config: ConfigLike,
  modelId: string,
  agent?: AgentLike | null,
): LanguageModel {
  const routing = resolveRoute(agent ?? null);

  // Default path: no per-agent flags, so the shared llama-swap provider is used.
  if (routing.route !== 'sidecar') {
    return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
  }

  const sidecarURL = config.LLAMA_SIDECAR_URL;
  if (!sidecarURL) {
    throw new Error(
      `Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
    );
  }

  // resolveRoute only returns 'sidecar' together with a non-empty flags array.
  const provider = sidecarProvider(sidecarURL, routing.flags!);
  return provider.chatModel(modelId);
}