v2.4.1-sidecar-routing: route per-agent flags to llama-sidecar + tool gap fix

Batch 3c: when an agent has llama_extra_args in AGENTS.md, provider.ts
routes inference through LLAMA_SIDECAR_URL instead of LLAMA_SWAP_URL and
sends the agent's flags, space-joined, in an X-Agent-Flags header. A
boot-time guard refuses to start if any agent has llama_extra_args but
LLAMA_SIDECAR_URL is unset. PrefixFingerprint gains a route field
(swap/sidecar) for per-turn visibility. Adds 9 provider tests.

AGENTS.md tool gap: all agents (except Prompt Builder) were missing 8
tools that were added after the original tool lists were written:
request_read_access, view_truncated_output, ask_user_input, git_status,
get_blast_radius, get_hot_files, get_middleware, get_routes. The missing
request_read_access caused silent "permission denied" when reading files
outside the project root.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-27 19:28:08 +00:00
parent 90a6761b07
commit bcfc94fa47
8 changed files with 155 additions and 26 deletions

View File

@@ -25,6 +25,8 @@ const ConfigSchema = z.object({
// v2.0.5: cheaper model for titles, summaries, labeling. Falls back to // v2.0.5: cheaper model for titles, summaries, labeling. Falls back to
// session model (auto_name) or DEFAULT_MODEL when unset. // session model (auto_name) or DEFAULT_MODEL when unset.
FAST_MODEL: z.string().optional(), FAST_MODEL: z.string().optional(),
TASK_MODEL_URL: z.string().url().optional(),
LLAMA_SIDECAR_URL: z.string().url().optional(),
}); });
export type Config = z.infer<typeof ConfigSchema>; export type Config = z.infer<typeof ConfigSchema>;

View File

@@ -28,7 +28,7 @@ import { cleanupTruncations } from './services/truncate.js';
import { loadMcpConfig } from './services/mcp-config.js'; import { loadMcpConfig } from './services/mcp-config.js';
import { initialize as initMcp, getTools as getMcpTools, shutdown as shutdownMcp } from './services/mcp-client.js'; import { initialize as initMcp, getTools as getMcpTools, shutdown as shutdownMcp } from './services/mcp-client.js';
import { appendMcpTools } from './services/tools.js'; import { appendMcpTools } from './services/tools.js';
import { refreshToolNames } from './services/agents.js'; import { refreshToolNames, getAgentsForProject } from './services/agents.js';
async function main() { async function main() {
const config = loadConfig(); const config = loadConfig();
@@ -91,6 +91,20 @@ async function main() {
} }
app.addHook('onClose', async () => { await shutdownMcp(); }); app.addHook('onClose', async () => { await shutdownMcp(); });
// Boot-time guard: if any agent has llama_extra_args but LLAMA_SIDECAR_URL
// is unset, fail fast. Silent fallback would defeat per-agent flags.
if (!config.LLAMA_SIDECAR_URL) {
const { agents } = await getAgentsForProject('');
const offending = agents.find(a => a.llama_extra_args && a.llama_extra_args.length > 0);
if (offending) {
app.log.fatal(
{ agent: offending.name },
`Agent "${offending.name}" has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
);
process.exit(1);
}
}
await app.register(fastifyWebsocket); await app.register(fastifyWebsocket);
app.get('/api/health', async () => { app.get('/api/health', async () => {

View File

@@ -0,0 +1,58 @@
import { describe, expect, it } from 'vitest';
import { resolveRoute, upstreamModel } from '../inference/provider.js';
// resolveRoute decides the upstream purely from the agent's llama_extra_args:
// only a non-empty array selects the sidecar, everything else stays on swap.
describe('resolveRoute', () => {
  // Expected result for every case that must stay on llama-swap.
  const SWAP_ROUTING = { route: 'swap', flags: null };

  it('routes to swap when agent is null', () => {
    const routing = resolveRoute(null);
    expect(routing).toEqual(SWAP_ROUTING);
  });

  it('routes to swap when agent has no llama_extra_args', () => {
    const routing = resolveRoute({ llama_extra_args: null });
    expect(routing).toEqual(SWAP_ROUTING);
  });

  it('routes to swap when agent has empty llama_extra_args', () => {
    const routing = resolveRoute({ llama_extra_args: [] });
    expect(routing).toEqual(SWAP_ROUTING);
  });

  it('routes to sidecar when agent has llama_extra_args', () => {
    const { route, flags } = resolveRoute({ llama_extra_args: ['--top-k', '20'] });
    expect(route).toBe('sidecar');
    expect(flags).toEqual(['--top-k', '20']);
  });
});
// upstreamModel picks the provider (llama-swap vs llama-sidecar) from the
// agent's llama_extra_args. Each test asserts the provider name on the
// returned model, not just that a model exists: with both URLs configured, a
// bare toBeDefined() would still pass if the call were routed to the wrong
// upstream.
describe('upstreamModel', () => {
  const swapConfig = { LLAMA_SWAP_URL: 'http://localhost:8401' };
  const fullConfig = {
    LLAMA_SWAP_URL: 'http://localhost:8401',
    LLAMA_SIDECAR_URL: 'http://localhost:8402',
  };

  // The openai-compatible provider reports `<name>.<modelType>` as the model's
  // provider, so the prefix identifies which upstream was selected.
  const SWAP_PROVIDER = /^llama-swap/;
  const SIDECAR_PROVIDER = /^llama-sidecar/;

  it('returns a model for swap route (no agent)', () => {
    const model = upstreamModel(swapConfig, 'test-model');
    expect(model).toBeDefined();
    expect((model as any).modelId).toBe('test-model');
    expect((model as any).provider).toMatch(SWAP_PROVIDER);
  });

  it('returns a model for swap route (agent without extra args)', () => {
    const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: null });
    expect(model).toBeDefined();
    expect((model as any).provider).toMatch(SWAP_PROVIDER);
  });

  it('returns a model for sidecar route', () => {
    const model = upstreamModel(fullConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] });
    expect(model).toBeDefined();
    expect((model as any).modelId).toBe('test-model');
    // Both URLs are set here, so only the provider name proves the sidecar was used.
    expect((model as any).provider).toMatch(SIDECAR_PROVIDER);
  });

  it('throws when sidecar route requested but URL missing', () => {
    expect(() =>
      upstreamModel(swapConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] }),
    ).toThrow(/LLAMA_SIDECAR_URL/);
  });

  it('routes to swap for empty llama_extra_args array', () => {
    const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: [] });
    expect(model).toBeDefined();
    expect((model as any).provider).toMatch(SWAP_PROVIDER);
  });
});

View File

@@ -1,37 +1,84 @@
import { createOpenAICompatible } from '@ai-sdk/openai-compatible'; import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
import type { LanguageModel } from 'ai'; import type { LanguageModel } from 'ai';
// TODO: When per-agent llama-server flag overrides are added, route them
// through validateExtraArgs (./llama-args-validator.ts) first.
// v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from // v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
// config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the // config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
// upstream without touching env vars. No apiKey — llama-swap is unauth in our // upstream without touching env vars. No apiKey — llama-swap is unauth in our
// Tailscale topology and exposing it over the public internet is gated by // Tailscale topology and exposing it over the public internet is gated by
// Authelia at the Caddy layer, not by API keys. // Authelia at the Caddy layer, not by API keys.
//
// v2.4.1-sidecar: when the agent has llama_extra_args, route through
// llama-sidecar instead. A fresh provider is created per call (not cached)
// because the X-Agent-Flags header varies per agent. The llama-swap path
// stays cached since it has no per-request headers.
const cache = new Map<string, ReturnType<typeof createOpenAICompatible>>(); const swapCache = new Map<string, ReturnType<typeof createOpenAICompatible>>();
function getProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> { function getSwapProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
let provider = cache.get(baseURL); let provider = swapCache.get(baseURL);
if (!provider) { if (!provider) {
provider = createOpenAICompatible({ provider = createOpenAICompatible({
name: 'llama-swap', name: 'llama-swap',
baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`, baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`,
// v1.13.7: @ai-sdk/openai-compatible defaults includeUsage=false, which
// omits `stream_options.include_usage` from the request body. Without
// it, llama.cpp / llama-swap never emits the trailing usage block, so
// `result.usage` resolves with inputTokens=outputTokens=undefined and
// tokens_used / ctx_used land as NULL in every messages row. Setting
// true here re-enables the per-stream usage payload across all models
// served via the llama-swap provider.
includeUsage: true, includeUsage: true,
}); });
cache.set(baseURL, provider); swapCache.set(baseURL, provider);
} }
return provider; return provider;
} }
export function upstreamModel(baseURL: string, modelId: string): LanguageModel { function sidecarProvider(
return getProvider(baseURL).chatModel(modelId); baseURL: string,
flags: string[],
): ReturnType<typeof createOpenAICompatible> {
return createOpenAICompatible({
name: 'llama-sidecar',
baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`,
includeUsage: true,
headers: {
'X-Agent-Flags': flags.join(' '),
},
});
}
// Which upstream a completion is sent to: 'swap' uses config.LLAMA_SWAP_URL,
// 'sidecar' uses config.LLAMA_SIDECAR_URL.
export type InferenceRoute = 'swap' | 'sidecar';
// Result of resolveRoute: the chosen upstream plus the agent flags to forward.
export interface RoutingInfo {
route: InferenceRoute;
// The agent's llama_extra_args on the sidecar route; null on the swap route.
flags: string[] | null;
}
// Minimal agent shape needed for routing; only llama_extra_args is read.
interface AgentLike {
llama_extra_args: string[] | null;
}
// Minimal config shape needed for routing.
interface ConfigLike {
LLAMA_SWAP_URL: string;
// Optional, but upstreamModel throws if it is missing on the sidecar route.
LLAMA_SIDECAR_URL?: string;
}
/**
 * Decide which upstream an agent's inference should use.
 *
 * @param agent - The active agent, or null when no agent is selected.
 * @returns `{ route: 'sidecar', flags }` when the agent carries a non-empty
 *   llama_extra_args array (the same array instance is returned as `flags`);
 *   otherwise `{ route: 'swap', flags: null }`.
 */
export function resolveRoute(agent: AgentLike | null): RoutingInfo {
  const extraArgs = agent?.llama_extra_args ?? null;
  // A null or empty list means there are no flags to forward, so stay on swap.
  const hasExtraArgs = extraArgs !== null && extraArgs.length > 0;
  return hasExtraArgs
    ? { route: 'sidecar', flags: extraArgs }
    : { route: 'swap', flags: null };
}
/**
 * Build the language model handle for one completion, choosing the upstream
 * from the agent's llama_extra_args (see resolveRoute).
 *
 * @param config - Supplies LLAMA_SWAP_URL and, optionally, LLAMA_SIDECAR_URL.
 * @param modelId - Model identifier passed to the provider's chatModel().
 * @param agent - Active agent; omitted or null is treated as "no agent".
 * @returns The chat model from the sidecar provider when the agent has
 *   extra args, otherwise from the llama-swap provider.
 * @throws Error when the sidecar route is selected but LLAMA_SIDECAR_URL is unset.
 */
export function upstreamModel(
config: ConfigLike,
modelId: string,
agent?: AgentLike | null,
): LanguageModel {
// Normalize an omitted agent to null so resolveRoute sees a single "no agent" value.
const { route, flags } = resolveRoute(agent ?? null);
if (route === 'sidecar') {
const url = config.LLAMA_SIDECAR_URL;
// Fail loudly rather than fall back to llama-swap and drop the agent's flags.
if (!url) {
throw new Error(
`Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
);
}
// flags is non-null here: resolveRoute only returns 'sidecar' together with a non-empty array.
return sidecarProvider(url, flags!).chatModel(modelId);
}
return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
} }

View File

@@ -157,7 +157,8 @@ export async function streamCompletion(
opts: StreamOptions, opts: StreamOptions,
onDelta: (content: string) => void, onDelta: (content: string) => void,
onUsage: ((prompt: number | null, completion: number | null) => void) | undefined, onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
signal?: AbortSignal signal?: AbortSignal,
agent?: Agent | null,
): Promise<StreamResult> { ): Promise<StreamResult> {
const aiMessages = toModelMessages(messages); const aiMessages = toModelMessages(messages);
const hasTools = opts.tools !== null && opts.tools.length > 0; const hasTools = opts.tools !== null && opts.tools.length > 0;
@@ -195,7 +196,7 @@ export async function streamCompletion(
}; };
const result = streamText({ const result = streamText({
model: upstreamModel(ctx.config.LLAMA_SWAP_URL, model), model: upstreamModel(ctx.config, model, agent ?? null),
messages: aiMessages, messages: aiMessages,
...(aiTools ...(aiTools
? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall } ? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall }
@@ -458,7 +459,8 @@ export async function executeStreamPhase(
}, USAGE_THROTTLE_MS - elapsed); }, USAGE_THROTTLE_MS - elapsed);
} }
}, },
signal signal,
agent,
); );
} finally { } finally {
if (pendingFlushTimer) { if (pendingFlushTimer) {

View File

@@ -21,6 +21,7 @@ import { createHash } from 'node:crypto';
import { readFile, stat } from 'node:fs/promises'; import { readFile, stat } from 'node:fs/promises';
import type { Agent, Project, Session } from '../types/api.js'; import type { Agent, Project, Session } from '../types/api.js';
import { getAgentsMtimes } from './agents.js'; import { getAgentsMtimes } from './agents.js';
import { resolveRoute } from './inference/provider.js';
const BASE_SYSTEM_PROMPT = (projectPath: string) => const BASE_SYSTEM_PROMPT = (projectPath: string) =>
`You are BooCode Chat, a code investigation assistant. The user is working on a project located at ${projectPath}. Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed. Be concise. Cite file paths and line numbers when discussing code. Do not hallucinate file contents — read the file first. Tool results may be truncated; if so, narrow your query rather than guessing.`; `You are BooCode Chat, a code investigation assistant. The user is working on a project located at ${projectPath}. Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed. Be concise. Cite file paths and line numbers when discussing code. Do not hallucinate file contents — read the file first. Tool results may be truncated; if so, narrow your query rather than guessing.`;
@@ -98,6 +99,7 @@ export interface PrefixFingerprint {
has_agent_system_prompt: boolean; has_agent_system_prompt: boolean;
has_session_override: boolean; has_session_override: boolean;
has_project_override: boolean; has_project_override: boolean;
route: 'swap' | 'sidecar';
} }
export interface PrefixDrift { export interface PrefixDrift {
@@ -125,6 +127,7 @@ interface ObservedInputs {
has_agent_system_prompt: boolean; has_agent_system_prompt: boolean;
has_session_override: boolean; has_session_override: boolean;
has_project_override: boolean; has_project_override: boolean;
route: 'swap' | 'sidecar';
} }
interface ObserverEntry { interface ObserverEntry {
@@ -183,6 +186,7 @@ export async function buildSystemPromptWithFingerprint(
has_agent_system_prompt: !!(agent && agent.system_prompt.trim().length > 0), has_agent_system_prompt: !!(agent && agent.system_prompt.trim().length > 0),
has_session_override: sessionPrompt.length > 0, has_session_override: sessionPrompt.length > 0,
has_project_override: projectPrompt.length > 0, has_project_override: projectPrompt.length > 0,
route: resolveRoute(agent).route,
}; };
const fingerprint: PrefixFingerprint = { const fingerprint: PrefixFingerprint = {
@@ -199,6 +203,7 @@ export async function buildSystemPromptWithFingerprint(
has_agent_system_prompt: inputs.has_agent_system_prompt, has_agent_system_prompt: inputs.has_agent_system_prompt,
has_session_override: inputs.has_session_override, has_session_override: inputs.has_session_override,
has_project_override: inputs.has_project_override, has_project_override: inputs.has_project_override,
route: inputs.route,
}; };
let drift: PrefixDrift | null = null; let drift: PrefixDrift | null = null;

View File

@@ -7,7 +7,7 @@ top_p: 0.95
top_k: 20 top_k: 20
min_p: 0.0 min_p: 0.0
presence_penalty: 0.0 presence_penalty: 0.0
tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
description: Reviews code for bugs, security issues, and maintainability. Read-only. description: Reviews code for bugs, security issues, and maintainability. Read-only.
--- ---
You review code. Find real problems, not style nits. You review code. Find real problems, not style nits.
@@ -46,7 +46,7 @@ top_p: 0.95
top_k: 20 top_k: 20
min_p: 0.0 min_p: 0.0
presence_penalty: 0.0 presence_penalty: 0.0
tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
description: Diagnoses bugs from error messages, logs, or described symptoms. description: Diagnoses bugs from error messages, logs, or described symptoms.
--- ---
You diagnose bugs. Form a hypothesis, prove it with evidence from the code. You diagnose bugs. Form a hypothesis, prove it with evidence from the code.
@@ -72,7 +72,7 @@ top_k: 20
min_p: 0.0 min_p: 0.0
presence_penalty: 0.0 presence_penalty: 0.0
steps: 5 steps: 5
tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
description: Proposes refactors for clarity, deduplication, or decoupling. Read-only — outputs plans, not edits. description: Proposes refactors for clarity, deduplication, or decoupling. Read-only — outputs plans, not edits.
--- ---
You propose refactors. You do not apply them. The user applies via OpenCode or Claude Code. You propose refactors. You do not apply them. The user applies via OpenCode or Claude Code.
@@ -115,7 +115,7 @@ top_k: 20
min_p: 0.0 min_p: 0.0
presence_penalty: 1.5 presence_penalty: 1.5
steps: 20 steps: 20
tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
description: Designs new features, modules, or architectural changes. Outputs a build plan. description: Designs new features, modules, or architectural changes. Outputs a build plan.
--- ---
You design. You produce build plans, not code. You design. You produce build plans, not code.
@@ -157,7 +157,7 @@ top_p: 0.95
top_k: 20 top_k: 20
min_p: 0.0 min_p: 0.0
presence_penalty: 0.0 presence_penalty: 0.0
tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
description: Audits code for security vulnerabilities. Read-only. description: Audits code for security vulnerabilities. Read-only.
--- ---
You audit for security issues. Concrete findings only, no generic warnings. You audit for security issues. Concrete findings only, no generic warnings.
@@ -240,7 +240,7 @@ top_p: 0.95
top_k: 20 top_k: 20
min_p: 0.0 min_p: 0.0
presence_penalty: 0.0 presence_penalty: 0.0
tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
description: Discovers and maps unfamiliar codebases. Reads architecture, traces data flow, identifies key symbols. description: Discovers and maps unfamiliar codebases. Reads architecture, traces data flow, identifies key symbols.
--- ---
You map codebases. Start broad, then drill into specifics. You map codebases. Start broad, then drill into specifics.

View File

@@ -11,6 +11,7 @@ services:
CONTAINER_GUIDANCE_FILE: /app/BOOCHAT.md CONTAINER_GUIDANCE_FILE: /app/BOOCHAT.md
DATABASE_URL: postgres://boocode:${POSTGRES_PASSWORD}@boocode_db:5432/boochat DATABASE_URL: postgres://boocode:${POSTGRES_PASSWORD}@boocode_db:5432/boochat
BOOCODER_URL: http://100.114.205.53:9502 BOOCODER_URL: http://100.114.205.53:9502
LLAMA_SIDECAR_URL: http://100.101.41.16:8402
volumes: volumes:
- /opt:/opt - /opt:/opt
- /opt/projects:/opt/projects:rw - /opt/projects:/opt/projects:rw