v2.4.1-sidecar-routing: route per-agent flags to llama-sidecar + tool gap fix
Batch 3c: when an agent has llama_extra_args in AGENTS.md, provider.ts routes inference through LLAMA_SIDECAR_URL instead of LLAMA_SWAP_URL. X-Agent-Flags header built from the agent's flags. Boot-time guard refuses to start if any agent has llama_extra_args but LLAMA_SIDECAR_URL is unset. PrefixFingerprint gains a route field (swap/sidecar) for per-turn visibility. 9 provider tests. AGENTS.md tool gap: all agents (except Prompt Builder) were missing 8 tools that were added after the original tool lists were written: request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes. The missing request_read_access caused silent "permission denied" when reading files outside the project root. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,37 +1,84 @@
|
||||
import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
|
||||
import type { LanguageModel } from 'ai';
|
||||
|
||||
// TODO: When per-agent llama-server flag overrides are added, route them
// through validateExtraArgs (./llama-args-validator.ts) first.
|
||||
|
||||
// v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
// config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
// upstream without touching env vars. No apiKey — llama-swap is unauth in our
// Tailscale topology and exposing it over the public internet is gated by
// Authelia at the Caddy layer, not by API keys.
//
// v2.4.1-sidecar: when the agent has llama_extra_args, route through
// llama-sidecar instead. A fresh provider is created per call (not cached)
// because the X-Agent-Flags header varies per agent. The llama-swap path
// stays cached since it has no per-request headers.
|
||||
|
||||
const cache = new Map<string, ReturnType<typeof createOpenAICompatible>>();
|
||||
const swapCache = new Map<string, ReturnType<typeof createOpenAICompatible>>();
|
||||
|
||||
function getProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
|
||||
let provider = cache.get(baseURL);
|
||||
/**
 * Return the llama-swap provider for `baseURL`, creating it on first use.
 *
 * Providers are memoized in `swapCache` (keyed by the URL exactly as passed
 * in), so repeated calls with the same URL reuse one provider instance.
 *
 * @param baseURL - llama-swap base URL, with or without a trailing `/v1`
 *   and with or without trailing slashes.
 * @returns The OpenAI-compatible provider pointed at `<baseURL>/v1`.
 */
function getSwapProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
  let provider = swapCache.get(baseURL);
  if (!provider) {
    // Strip trailing slashes before the `/v1` check so "http://host/" and
    // "http://host/v1/" do not turn into "http://host//v1" or
    // "http://host/v1//v1".
    const trimmed = baseURL.replace(/\/+$/, '');
    provider = createOpenAICompatible({
      name: 'llama-swap',
      baseURL: trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`,
      // v1.13.7: @ai-sdk/openai-compatible defaults includeUsage=false, which
      // omits `stream_options.include_usage` from the request body. Without
      // it, llama.cpp / llama-swap never emits the trailing usage block, so
      // `result.usage` resolves with inputTokens=outputTokens=undefined and
      // tokens_used / ctx_used land as NULL in every messages row. Setting
      // true here re-enables the per-stream usage payload across all models
      // served via the llama-swap provider.
      includeUsage: true,
    });
    swapCache.set(baseURL, provider);
  }
  return provider;
}
|
||||
|
||||
export function upstreamModel(baseURL: string, modelId: string): LanguageModel {
|
||||
return getProvider(baseURL).chatModel(modelId);
|
||||
/**
 * Build a llama-sidecar provider that carries one agent's flags.
 *
 * A new provider is created on every call rather than cached, because the
 * `X-Agent-Flags` header value differs from agent to agent.
 *
 * @param baseURL - llama-sidecar base URL, with or without a trailing `/v1`
 *   and with or without trailing slashes.
 * @param flags - The agent's extra llama flags; sent space-joined in the
 *   `X-Agent-Flags` header.
 * @returns An OpenAI-compatible provider pointed at `<baseURL>/v1`.
 */
function sidecarProvider(
  baseURL: string,
  flags: string[],
): ReturnType<typeof createOpenAICompatible> {
  // Strip trailing slashes before the `/v1` check so "http://host/" and
  // "http://host/v1/" do not turn into "http://host//v1" or
  // "http://host/v1//v1".
  const trimmed = baseURL.replace(/\/+$/, '');
  return createOpenAICompatible({
    name: 'llama-sidecar',
    baseURL: trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`,
    includeUsage: true,
    headers: {
      'X-Agent-Flags': flags.join(' '),
    },
  });
}
|
||||
|
||||
/** Which upstream serves an inference call: llama-swap or llama-sidecar. */
export type InferenceRoute = 'swap' | 'sidecar';

/** Routing decision produced by `resolveRoute`. */
export interface RoutingInfo {
  /** The upstream selected for the call. */
  route: InferenceRoute;
  /**
   * The agent's extra llama flags when `route` is `'sidecar'`;
   * `null` when `route` is `'swap'`.
   */
  flags: string[] | null;
}
|
||||
|
||||
/** Minimal structural view of an agent: only the field that routing reads. */
interface AgentLike {
  /** Extra llama flags configured for the agent, or `null` when it has none. */
  llama_extra_args: string[] | null;
}
|
||||
|
||||
/** Minimal structural view of the app config: only the upstream URLs. */
interface ConfigLike {
  /** Base URL of llama-swap; used when the agent has no extra flags. */
  LLAMA_SWAP_URL: string;
  /**
   * Base URL of llama-sidecar. Optional, but `upstreamModel` throws if it is
   * missing for an agent that has extra flags.
   */
  LLAMA_SIDECAR_URL?: string;
}
|
||||
|
||||
/**
 * Decide which upstream an agent's inference should go through.
 *
 * @param agent - The agent making the call, or `null` when there is none.
 * @returns `{ route: 'sidecar', flags }` when the agent has at least one
 *   entry in `llama_extra_args` (the same array instance is returned, not a
 *   copy); otherwise `{ route: 'swap', flags: null }`.
 */
export function resolveRoute(agent: AgentLike | null): RoutingInfo {
  const flags = agent?.llama_extra_args;
  // A missing agent, null flags, and an empty flag list all mean "no
  // per-agent overrides", so they all stay on llama-swap.
  if (flags && flags.length > 0) {
    return { route: 'sidecar', flags };
  }
  return { route: 'swap', flags: null };
}
|
||||
|
||||
/**
 * Resolve the language model to call for `modelId`, picking the upstream
 * from the agent's flags.
 *
 * Agents with a non-empty `llama_extra_args` go through llama-sidecar with
 * their flags attached; everything else goes through the cached llama-swap
 * provider.
 *
 * @param config - Supplies `LLAMA_SWAP_URL` and, optionally, `LLAMA_SIDECAR_URL`.
 * @param modelId - Model identifier passed to the provider's `chatModel`.
 * @param agent - The calling agent; omitted or `null` is treated as "no flags".
 * @returns The chat model bound to the selected upstream.
 * @throws Error when the agent has flags but `config.LLAMA_SIDECAR_URL` is
 *   unset or empty.
 */
export function upstreamModel(
  config: ConfigLike,
  modelId: string,
  agent?: AgentLike | null,
): LanguageModel {
  const { route, flags } = resolveRoute(agent ?? null);
  if (route === 'sidecar') {
    const url = config.LLAMA_SIDECAR_URL;
    if (!url) {
      throw new Error(
        `Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
      );
    }
    // resolveRoute only returns 'sidecar' together with a non-empty flags
    // array, so the non-null assertion is safe here.
    return sidecarProvider(url, flags!).chatModel(modelId);
  }
  return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
}
|
||||
|
||||
Reference in New Issue
Block a user