v2.4.1-sidecar-routing: route per-agent flags to llama-sidecar + tool gap fix
Batch 3c: when an agent has llama_extra_args in AGENTS.md, provider.ts routes inference through LLAMA_SIDECAR_URL instead of LLAMA_SWAP_URL. X-Agent-Flags header built from the agent's flags. Boot-time guard refuses to start if any agent has llama_extra_args but LLAMA_SIDECAR_URL is unset. PrefixFingerprint gains a route field (swap/sidecar) for per-turn visibility. 9 provider tests. AGENTS.md tool gap: all agents (except Prompt Builder) were missing 8 tools that were added after the original tool lists were written: request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes. The missing request_read_access caused silent "permission denied" when reading files outside the project root. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,8 @@ const ConfigSchema = z.object({
|
||||
// v2.0.5: cheaper model for titles, summaries, labeling. Falls back to
|
||||
// session model (auto_name) or DEFAULT_MODEL when unset.
|
||||
FAST_MODEL: z.string().optional(),
|
||||
TASK_MODEL_URL: z.string().url().optional(),
|
||||
LLAMA_SIDECAR_URL: z.string().url().optional(),
|
||||
});
|
||||
|
||||
export type Config = z.infer<typeof ConfigSchema>;
|
||||
|
||||
@@ -28,7 +28,7 @@ import { cleanupTruncations } from './services/truncate.js';
|
||||
import { loadMcpConfig } from './services/mcp-config.js';
|
||||
import { initialize as initMcp, getTools as getMcpTools, shutdown as shutdownMcp } from './services/mcp-client.js';
|
||||
import { appendMcpTools } from './services/tools.js';
|
||||
import { refreshToolNames } from './services/agents.js';
|
||||
import { refreshToolNames, getAgentsForProject } from './services/agents.js';
|
||||
|
||||
async function main() {
|
||||
const config = loadConfig();
|
||||
@@ -91,6 +91,20 @@ async function main() {
|
||||
}
|
||||
app.addHook('onClose', async () => { await shutdownMcp(); });
|
||||
|
||||
// Boot-time guard: if any agent has llama_extra_args but LLAMA_SIDECAR_URL
|
||||
// is unset, fail fast. Silent fallback would defeat per-agent flags.
|
||||
if (!config.LLAMA_SIDECAR_URL) {
|
||||
const { agents } = await getAgentsForProject('');
|
||||
const offending = agents.find(a => a.llama_extra_args && a.llama_extra_args.length > 0);
|
||||
if (offending) {
|
||||
app.log.fatal(
|
||||
{ agent: offending.name },
|
||||
`Agent "${offending.name}" has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
|
||||
);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
await app.register(fastifyWebsocket);
|
||||
|
||||
app.get('/api/health', async () => {
|
||||
|
||||
58
apps/server/src/services/__tests__/provider.test.ts
Normal file
58
apps/server/src/services/__tests__/provider.test.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { resolveRoute, upstreamModel } from '../inference/provider.js';
|
||||
|
||||
// resolveRoute decides the upstream purely from the agent's llama_extra_args.
describe('resolveRoute', () => {
  // Result expected for every agent shape that carries no usable flags.
  const swapRouting = { route: 'swap', flags: null };

  it('routes to swap when agent is null', () => {
    const routing = resolveRoute(null);
    expect(routing).toEqual(swapRouting);
  });

  it('routes to swap when agent has no llama_extra_args', () => {
    const routing = resolveRoute({ llama_extra_args: null });
    expect(routing).toEqual(swapRouting);
  });

  it('routes to swap when agent has empty llama_extra_args', () => {
    const routing = resolveRoute({ llama_extra_args: [] });
    expect(routing).toEqual(swapRouting);
  });

  it('routes to sidecar when agent has llama_extra_args', () => {
    const { route, flags } = resolveRoute({ llama_extra_args: ['--top-k', '20'] });
    expect(route).toBe('sidecar');
    expect(flags).toEqual(['--top-k', '20']);
  });
});
|
||||
|
||||
// upstreamModel must pick llama-swap or llama-sidecar based on the agent's
// llama_extra_args, and refuse the sidecar route when its URL is missing.
describe('upstreamModel', () => {
  const swapConfig = { LLAMA_SWAP_URL: 'http://localhost:8401' };
  const fullConfig = {
    LLAMA_SWAP_URL: 'http://localhost:8401',
    LLAMA_SIDECAR_URL: 'http://localhost:8402',
  };

  // Narrow view of the returned model: only the two fields asserted below.
  // NOTE(review): `provider` is expected to be "<provider name>.chat" for
  // @ai-sdk/openai-compatible chat models — confirm against the installed
  // version. Asserting it is what actually pins the chosen route; checking
  // `toBeDefined()` alone passes even when the wrong upstream is selected.
  const inspect = (model: unknown) => model as { modelId: string; provider: string };

  it('returns a model for swap route (no agent)', () => {
    const model = inspect(upstreamModel(swapConfig, 'test-model'));
    expect(model.modelId).toBe('test-model');
    expect(model.provider).toContain('llama-swap');
  });

  it('returns a model for swap route (agent without extra args)', () => {
    const model = inspect(upstreamModel(swapConfig, 'test-model', { llama_extra_args: null }));
    expect(model.modelId).toBe('test-model');
    expect(model.provider).toContain('llama-swap');
  });

  it('returns a model for sidecar route', () => {
    const model = inspect(
      upstreamModel(fullConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] }),
    );
    expect(model.modelId).toBe('test-model');
    expect(model.provider).toContain('llama-sidecar');
  });

  it('throws when sidecar route requested but URL missing', () => {
    expect(() =>
      upstreamModel(swapConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] }),
    ).toThrow(/LLAMA_SIDECAR_URL/);
  });

  it('routes to swap for empty llama_extra_args array', () => {
    const model = inspect(upstreamModel(swapConfig, 'test-model', { llama_extra_args: [] }));
    expect(model.modelId).toBe('test-model');
    expect(model.provider).toContain('llama-swap');
  });
});
|
||||
@@ -1,37 +1,84 @@
|
||||
import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
|
||||
import type { LanguageModel } from 'ai';
|
||||
|
||||
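// TODO(v2.4.1-sidecar): per-agent llama-server flag overrides now exist (sent
// to llama-sidecar as X-Agent-Flags below), but nothing in this module passes
// them through validateExtraArgs (./llama-args-validator.ts). Confirm they are
// validated before they reach upstreamModel, or add the call here.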
|
||||
|
||||
// v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
|
||||
// config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
|
||||
// upstream without touching env vars. No apiKey — llama-swap is unauth in our
|
||||
// Tailscale topology and exposing it over the public internet is gated by
|
||||
// Authelia at the Caddy layer, not by API keys.
|
||||
//
|
||||
// v2.4.1-sidecar: when the agent has llama_extra_args, route through
|
||||
// llama-sidecar instead. A fresh provider is created per call (not cached)
|
||||
// because the X-Agent-Flags header varies per agent. The llama-swap path
|
||||
// stays cached since it has no per-request headers.
|
||||
|
||||
/** Provider instance type produced by `createOpenAICompatible`. */
type OpenAICompatibleProvider = ReturnType<typeof createOpenAICompatible>;

// llama-swap providers keyed by normalized API root. Only this route is
// cached: it sends no per-request headers, so one instance per URL suffices.
const swapCache = new Map<string, OpenAICompatibleProvider>();

/**
 * Normalizes an upstream base URL to its OpenAI-compatible API root.
 *
 * Trailing slashes are stripped before the `/v1` check so that
 * `http://host/` and `http://host/v1/` do not turn into `http://host//v1`
 * and `http://host/v1//v1`.
 *
 * @param baseURL Upstream URL, with or without a `/v1` suffix or trailing slash.
 * @returns The URL ending in exactly one `/v1`, without a trailing slash.
 */
function apiRootURL(baseURL: string): string {
  const trimmed = baseURL.replace(/\/+$/, '');
  return trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`;
}

/**
 * Returns the cached llama-swap provider for `baseURL`, creating it on first use.
 *
 * @param baseURL llama-swap base URL (see `apiRootURL` for accepted forms).
 */
function getSwapProvider(baseURL: string): OpenAICompatibleProvider {
  const apiRoot = apiRootURL(baseURL);
  const cached = swapCache.get(apiRoot);
  if (cached) {
    return cached;
  }

  const provider = createOpenAICompatible({
    name: 'llama-swap',
    baseURL: apiRoot,
    // v1.13.7: @ai-sdk/openai-compatible defaults includeUsage=false, which
    // omits `stream_options.include_usage` from the request body. Without
    // it, llama.cpp / llama-swap never emits the trailing usage block, so
    // `result.usage` resolves with inputTokens=outputTokens=undefined and
    // tokens_used / ctx_used land as NULL in every messages row. Setting
    // true here re-enables the per-stream usage payload across all models
    // served via the llama-swap provider.
    includeUsage: true,
  });
  swapCache.set(apiRoot, provider);
  return provider;
}

/**
 * Builds a fresh llama-sidecar provider carrying the agent's flags.
 *
 * Not cached: the X-Agent-Flags header differs per agent, so every call gets
 * its own provider instance.
 *
 * @param baseURL llama-sidecar base URL (see `apiRootURL` for accepted forms).
 * @param flags   Per-agent llama-server arguments, sent space-joined in the
 *                X-Agent-Flags header.
 */
function sidecarProvider(baseURL: string, flags: string[]): OpenAICompatibleProvider {
  return createOpenAICompatible({
    name: 'llama-sidecar',
    baseURL: apiRootURL(baseURL),
    includeUsage: true,
    headers: {
      // NOTE(review): an argument that itself contains a space becomes
      // ambiguous after joining — confirm how the sidecar splits this header.
      'X-Agent-Flags': flags.join(' '),
    },
  });
}
|
||||
|
||||
/** Upstream an inference request is sent to. */
export type InferenceRoute = 'swap' | 'sidecar';

/** Routing decision for a single agent. */
export interface RoutingInfo {
  route: InferenceRoute;
  /** The agent's llama_extra_args on the sidecar route; null on the swap route. */
  flags: string[] | null;
}

// Minimal structural view of an agent: only the field routing depends on.
interface AgentLike {
  llama_extra_args: readonly string[] | null;
}

// Minimal structural view of the server config: only the two upstream URLs.
interface ConfigLike {
  LLAMA_SWAP_URL: string;
  LLAMA_SIDECAR_URL?: string;
}

/**
 * Decides which upstream serves an agent.
 *
 * An agent with at least one entry in `llama_extra_args` goes to the sidecar;
 * a missing agent, or one with null/empty args, goes to llama-swap.
 *
 * @param agent Agent to route; `null` or `undefined` means "no agent".
 * @returns The route plus a copy of the agent's flags (null on the swap
 *          route). The copy keeps callers from mutating the agent's own
 *          array through the returned object.
 */
export function resolveRoute(agent?: AgentLike | null): RoutingInfo {
  const extraArgs = agent?.llama_extra_args;
  if (!extraArgs || extraArgs.length === 0) {
    return { route: 'swap', flags: null };
  }
  return { route: 'sidecar', flags: [...extraArgs] };
}
|
||||
|
||||
/**
 * Returns the language model handle for `modelId` on the upstream chosen by
 * `resolveRoute` for this agent.
 *
 * @param config  Source of LLAMA_SWAP_URL and the optional LLAMA_SIDECAR_URL.
 * @param modelId Model identifier passed to the provider's `chatModel`.
 * @param agent   Agent whose `llama_extra_args` select the route; omitted or
 *                null routes to llama-swap.
 * @returns A chat model bound to llama-swap or llama-sidecar.
 * @throws Error when the sidecar route is selected but LLAMA_SIDECAR_URL is
 *         not set — there is deliberately no fallback to llama-swap.
 */
export function upstreamModel(
  config: ConfigLike,
  modelId: string,
  agent?: AgentLike | null,
): LanguageModel {
  const { route, flags } = resolveRoute(agent ?? null);

  if (route === 'swap') {
    return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
  }

  const sidecarURL = config.LLAMA_SIDECAR_URL;
  if (!sidecarURL) {
    throw new Error('Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set');
  }
  // resolveRoute always pairs 'sidecar' with flags; this guard narrows the
  // type without a non-null assertion and fails loudly if that ever changes.
  if (flags === null) {
    throw new Error('Sidecar route resolved without llama_extra_args');
  }
  return sidecarProvider(sidecarURL, flags).chatModel(modelId);
}
|
||||
|
||||
@@ -157,7 +157,8 @@ export async function streamCompletion(
|
||||
opts: StreamOptions,
|
||||
onDelta: (content: string) => void,
|
||||
onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
|
||||
signal?: AbortSignal
|
||||
signal?: AbortSignal,
|
||||
agent?: Agent | null,
|
||||
): Promise<StreamResult> {
|
||||
const aiMessages = toModelMessages(messages);
|
||||
const hasTools = opts.tools !== null && opts.tools.length > 0;
|
||||
@@ -195,7 +196,7 @@ export async function streamCompletion(
|
||||
};
|
||||
|
||||
const result = streamText({
|
||||
model: upstreamModel(ctx.config.LLAMA_SWAP_URL, model),
|
||||
model: upstreamModel(ctx.config, model, agent ?? null),
|
||||
messages: aiMessages,
|
||||
...(aiTools
|
||||
? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall }
|
||||
@@ -458,7 +459,8 @@ export async function executeStreamPhase(
|
||||
}, USAGE_THROTTLE_MS - elapsed);
|
||||
}
|
||||
},
|
||||
signal
|
||||
signal,
|
||||
agent,
|
||||
);
|
||||
} finally {
|
||||
if (pendingFlushTimer) {
|
||||
|
||||
@@ -21,6 +21,7 @@ import { createHash } from 'node:crypto';
|
||||
import { readFile, stat } from 'node:fs/promises';
|
||||
import type { Agent, Project, Session } from '../types/api.js';
|
||||
import { getAgentsMtimes } from './agents.js';
|
||||
import { resolveRoute } from './inference/provider.js';
|
||||
|
||||
const BASE_SYSTEM_PROMPT = (projectPath: string) =>
|
||||
`You are BooCode Chat, a code investigation assistant. The user is working on a project located at ${projectPath}. Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed. Be concise. Cite file paths and line numbers when discussing code. Do not hallucinate file contents — read the file first. Tool results may be truncated; if so, narrow your query rather than guessing.`;
|
||||
@@ -98,6 +99,7 @@ export interface PrefixFingerprint {
|
||||
has_agent_system_prompt: boolean;
|
||||
has_session_override: boolean;
|
||||
has_project_override: boolean;
|
||||
route: 'swap' | 'sidecar';
|
||||
}
|
||||
|
||||
export interface PrefixDrift {
|
||||
@@ -125,6 +127,7 @@ interface ObservedInputs {
|
||||
has_agent_system_prompt: boolean;
|
||||
has_session_override: boolean;
|
||||
has_project_override: boolean;
|
||||
route: 'swap' | 'sidecar';
|
||||
}
|
||||
|
||||
interface ObserverEntry {
|
||||
@@ -183,6 +186,7 @@ export async function buildSystemPromptWithFingerprint(
|
||||
has_agent_system_prompt: !!(agent && agent.system_prompt.trim().length > 0),
|
||||
has_session_override: sessionPrompt.length > 0,
|
||||
has_project_override: projectPrompt.length > 0,
|
||||
route: resolveRoute(agent).route,
|
||||
};
|
||||
|
||||
const fingerprint: PrefixFingerprint = {
|
||||
@@ -199,6 +203,7 @@ export async function buildSystemPromptWithFingerprint(
|
||||
has_agent_system_prompt: inputs.has_agent_system_prompt,
|
||||
has_session_override: inputs.has_session_override,
|
||||
has_project_override: inputs.has_project_override,
|
||||
route: inputs.route,
|
||||
};
|
||||
|
||||
let drift: PrefixDrift | null = null;
|
||||
|
||||
Reference in New Issue
Block a user