v2.4.1-sidecar-routing: route per-agent flags to llama-sidecar + tool gap fix

Batch 3c: when an agent has llama_extra_args in AGENTS.md, provider.ts routes inference through LLAMA_SIDECAR_URL instead of LLAMA_SWAP_URL. The X-Agent-Flags header is built from the agent's flags. A boot-time guard refuses to start if any agent has llama_extra_args but LLAMA_SIDECAR_URL is unset. PrefixFingerprint gains a route field (swap/sidecar) for per-turn visibility. The config schema gains two optional URL settings, LLAMA_SIDECAR_URL and TASK_MODEL_URL. Adds 9 provider tests. AGENTS.md tool gap: all agents (except Prompt Builder) were missing 8 tools that were added after the original tool lists were written: request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes. The missing request_read_access caused silent "permission denied" errors when reading files outside the project root. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-27 19:28:08 +00:00
parent 90a6761b07
commit bcfc94fa47
8 changed files with 155 additions and 26 deletions
--- a/apps/server/src/config.ts
+++ b/apps/server/src/config.ts
@@ -25,6 +25,8 @@ const ConfigSchema = z.object({
  // v2.0.5: cheaper model for titles, summaries, labeling. Falls back to
  // session model (auto_name) or DEFAULT_MODEL when unset.
  FAST_MODEL: z.string().optional(),
+  TASK_MODEL_URL: z.string().url().optional(),
+  LLAMA_SIDECAR_URL: z.string().url().optional(),
 });

 export type Config = z.infer<typeof ConfigSchema>;
--- a/apps/server/src/index.ts
+++ b/apps/server/src/index.ts
@@ -28,7 +28,7 @@ import { cleanupTruncations } from './services/truncate.js';
 import { loadMcpConfig } from './services/mcp-config.js';
 import { initialize as initMcp, getTools as getMcpTools, shutdown as shutdownMcp } from './services/mcp-client.js';
 import { appendMcpTools } from './services/tools.js';
-import { refreshToolNames } from './services/agents.js';
+import { refreshToolNames, getAgentsForProject } from './services/agents.js';

 async function main() {
  const config = loadConfig();
@@ -91,6 +91,20 @@ async function main() {
  }
  app.addHook('onClose', async () => { await shutdownMcp(); });

+  // Boot-time guard: if any agent has llama_extra_args but LLAMA_SIDECAR_URL
+  // is unset, fail fast. Silent fallback would defeat per-agent flags.
+  if (!config.LLAMA_SIDECAR_URL) {
+    const { agents } = await getAgentsForProject('');
+    const offending = agents.find(a => a.llama_extra_args && a.llama_extra_args.length > 0);
+    if (offending) {
+      app.log.fatal(
+        { agent: offending.name },
+        `Agent "${offending.name}" has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
+      );
+      process.exit(1);
+    }
+  }
+
  await app.register(fastifyWebsocket);

  app.get('/api/health', async () => {
--- a/apps/server/src/services/tests/provider.test.ts
+++ b/apps/server/src/services/tests/provider.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+import { resolveRoute, upstreamModel } from '../inference/provider.js';
+
+describe('resolveRoute', () => {
+  it('routes to swap when agent is null', () => {
+    expect(resolveRoute(null)).toEqual({ route: 'swap', flags: null });
+  });
+
+  it('routes to swap when agent has no llama_extra_args', () => {
+    expect(resolveRoute({ llama_extra_args: null })).toEqual({ route: 'swap', flags: null });
+  });
+
+  it('routes to swap when agent has empty llama_extra_args', () => {
+    expect(resolveRoute({ llama_extra_args: [] })).toEqual({ route: 'swap', flags: null });
+  });
+
+  it('routes to sidecar when agent has llama_extra_args', () => {
+    const result = resolveRoute({ llama_extra_args: ['--top-k', '20'] });
+    expect(result.route).toBe('sidecar');
+    expect(result.flags).toEqual(['--top-k', '20']);
+  });
+});
+
+describe('upstreamModel', () => {
+  const swapConfig = { LLAMA_SWAP_URL: 'http://localhost:8401' };
+  const fullConfig = {
+    LLAMA_SWAP_URL: 'http://localhost:8401',
+    LLAMA_SIDECAR_URL: 'http://localhost:8402',
+  };
+
+  it('returns a model for swap route (no agent)', () => {
+    const model = upstreamModel(swapConfig, 'test-model');
+    expect(model).toBeDefined();
+    expect((model as any).modelId).toBe('test-model');
+  });
+
+  it('returns a model for swap route (agent without extra args)', () => {
+    const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: null });
+    expect(model).toBeDefined();
+  });
+
+  it('returns a model for sidecar route', () => {
+    const model = upstreamModel(fullConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] });
+    expect(model).toBeDefined();
+    expect((model as any).modelId).toBe('test-model');
+  });
+
+  it('throws when sidecar route requested but URL missing', () => {
+    expect(() =>
+      upstreamModel(swapConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] }),
+    ).toThrow(/LLAMA_SIDECAR_URL/);
+  });
+
+  it('routes to swap for empty llama_extra_args array', () => {
+    const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: [] });
+    expect(model).toBeDefined();
+  });
+});
--- a/apps/server/src/services/inference/provider.ts
+++ b/apps/server/src/services/inference/provider.ts
@@ -1,37 +1,84 @@
 import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
 import type { LanguageModel } from 'ai';

-// TODO: When per-agent llama-server flag overrides are added, route them
-// through validateExtraArgs (./llama-args-validator.ts) first.
-
 // v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
 // config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
 // upstream without touching env vars. No apiKey — llama-swap is unauth in our
 // Tailscale topology and exposing it over the public internet is gated by
 // Authelia at the Caddy layer, not by API keys.
+//
+// v2.4.1-sidecar: when the agent has llama_extra_args, route through
+// llama-sidecar instead. A fresh provider is created per call (not cached)
+// because the X-Agent-Flags header varies per agent. The llama-swap path
+// stays cached since it has no per-request headers.

-const cache = new Map<string, ReturnType<typeof createOpenAICompatible>>();
+const swapCache = new Map<string, ReturnType<typeof createOpenAICompatible>>();

-function getProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
-  let provider = cache.get(baseURL);
+function getSwapProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
+  let provider = swapCache.get(baseURL);
  if (!provider) {
    provider = createOpenAICompatible({
      name: 'llama-swap',
      baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`,
-      // v1.13.7: @ai-sdk/openai-compatible defaults includeUsage=false, which
-      // omits `stream_options.include_usage` from the request body. Without
-      // it, llama.cpp / llama-swap never emits the trailing usage block, so
-      // `result.usage` resolves with inputTokens=outputTokens=undefined and
-      // tokens_used / ctx_used land as NULL in every messages row. Setting
-      // true here re-enables the per-stream usage payload across all models
-      // served via the llama-swap provider.
      includeUsage: true,
    });
-    cache.set(baseURL, provider);
+    swapCache.set(baseURL, provider);
  }
  return provider;
 }

-export function upstreamModel(baseURL: string, modelId: string): LanguageModel {
-  return getProvider(baseURL).chatModel(modelId);
+function sidecarProvider(
+  baseURL: string,
+  flags: string[],
+): ReturnType<typeof createOpenAICompatible> {
+  return createOpenAICompatible({
+    name: 'llama-sidecar',
+    baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`,
+    includeUsage: true,
+    headers: {
+      'X-Agent-Flags': flags.join(' '),
+    },
+  });
+}
+
+export type InferenceRoute = 'swap' | 'sidecar';
+
+export interface RoutingInfo {
+  route: InferenceRoute;
+  flags: string[] | null;
+}
+
+interface AgentLike {
+  llama_extra_args: string[] | null;
+}
+
+interface ConfigLike {
+  LLAMA_SWAP_URL: string;
+  LLAMA_SIDECAR_URL?: string;
+}
+
+export function resolveRoute(agent: AgentLike | null): RoutingInfo {
+  const flags = agent?.llama_extra_args;
+  if (flags && flags.length > 0) {
+    return { route: 'sidecar', flags };
+  }
+  return { route: 'swap', flags: null };
+}
+
+export function upstreamModel(
+  config: ConfigLike,
+  modelId: string,
+  agent?: AgentLike | null,
+): LanguageModel {
+  const { route, flags } = resolveRoute(agent ?? null);
+  if (route === 'sidecar') {
+    const url = config.LLAMA_SIDECAR_URL;
+    if (!url) {
+      throw new Error(
+        `Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
+      );
+    }
+    return sidecarProvider(url, flags!).chatModel(modelId);
+  }
+  return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
 }
--- a/apps/server/src/services/inference/stream-phase.ts
+++ b/apps/server/src/services/inference/stream-phase.ts
@@ -157,7 +157,8 @@ export async function streamCompletion(
  opts: StreamOptions,
  onDelta: (content: string) => void,
  onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
-  signal?: AbortSignal
+  signal?: AbortSignal,
+  agent?: Agent | null,
 ): Promise<StreamResult> {
  const aiMessages = toModelMessages(messages);
  const hasTools = opts.tools !== null && opts.tools.length > 0;
@@ -195,7 +196,7 @@ export async function streamCompletion(
  };

  const result = streamText({
-    model: upstreamModel(ctx.config.LLAMA_SWAP_URL, model),
+    model: upstreamModel(ctx.config, model, agent ?? null),
    messages: aiMessages,
    ...(aiTools
      ? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall }
@@ -458,7 +459,8 @@ export async function executeStreamPhase(
          }, USAGE_THROTTLE_MS - elapsed);
        }
      },
-      signal
+      signal,
+      agent,
    );
  } finally {
    if (pendingFlushTimer) {
--- a/apps/server/src/services/system-prompt.ts
+++ b/apps/server/src/services/system-prompt.ts
@@ -21,6 +21,7 @@ import { createHash } from 'node:crypto';
 import { readFile, stat } from 'node:fs/promises';
 import type { Agent, Project, Session } from '../types/api.js';
 import { getAgentsMtimes } from './agents.js';
+import { resolveRoute } from './inference/provider.js';

 const BASE_SYSTEM_PROMPT = (projectPath: string) =>
  `You are BooCode Chat, a code investigation assistant. The user is working on a project located at ${projectPath}. Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed. Be concise. Cite file paths and line numbers when discussing code. Do not hallucinate file contents — read the file first. Tool results may be truncated; if so, narrow your query rather than guessing.`;
@@ -98,6 +99,7 @@ export interface PrefixFingerprint {
  has_agent_system_prompt: boolean;
  has_session_override: boolean;
  has_project_override: boolean;
+  route: 'swap' | 'sidecar';
 }

 export interface PrefixDrift {
@@ -125,6 +127,7 @@ interface ObservedInputs {
  has_agent_system_prompt: boolean;
  has_session_override: boolean;
  has_project_override: boolean;
+  route: 'swap' | 'sidecar';
 }

 interface ObserverEntry {
@@ -183,6 +186,7 @@ export async function buildSystemPromptWithFingerprint(
    has_agent_system_prompt: !!(agent && agent.system_prompt.trim().length > 0),
    has_session_override: sessionPrompt.length > 0,
    has_project_override: projectPrompt.length > 0,
+    route: resolveRoute(agent).route,
  };

  const fingerprint: PrefixFingerprint = {
@@ -199,6 +203,7 @@ export async function buildSystemPromptWithFingerprint(
    has_agent_system_prompt: inputs.has_agent_system_prompt,
    has_session_override: inputs.has_session_override,
    has_project_override: inputs.has_project_override,
+    route: inputs.route,
  };

  let drift: PrefixDrift | null = null;
--- a/data/AGENTS.md
+++ b/data/AGENTS.md
@@ -7,7 +7,7 @@ top_p: 0.95
 top_k: 20
 min_p: 0.0
 presence_penalty: 0.0
-tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes]
+tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
 description: Reviews code for bugs, security issues, and maintainability. Read-only.
 ---
 You review code. Find real problems, not style nits.
@@ -46,7 +46,7 @@ top_p: 0.95
 top_k: 20
 min_p: 0.0
 presence_penalty: 0.0
-tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes]
+tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
 description: Diagnoses bugs from error messages, logs, or described symptoms.
 ---
 You diagnose bugs. Form a hypothesis, prove it with evidence from the code.
@@ -72,7 +72,7 @@ top_k: 20
 min_p: 0.0
 presence_penalty: 0.0
 steps: 5
-tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes]
+tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
 description: Proposes refactors for clarity, deduplication, or decoupling. Read-only — outputs plans, not edits.
 ---
 You propose refactors. You do not apply them. The user applies via OpenCode or Claude Code.
@@ -115,7 +115,7 @@ top_k: 20
 min_p: 0.0
 presence_penalty: 1.5
 steps: 20
-tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes]
+tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
 description: Designs new features, modules, or architectural changes. Outputs a build plan.
 ---
 You design. You produce build plans, not code.
@@ -157,7 +157,7 @@ top_p: 0.95
 top_k: 20
 min_p: 0.0
 presence_penalty: 0.0
-tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes]
+tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
 description: Audits code for security vulnerabilities. Read-only.
 ---
 You audit for security issues. Concrete findings only, no generic warnings.
@@ -240,7 +240,7 @@ top_p: 0.95
 top_k: 20
 min_p: 0.0
 presence_penalty: 0.0
-tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes]
+tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes]
 description: Discovers and maps unfamiliar codebases. Reads architecture, traces data flow, identifies key symbols.
 ---
 You map codebases. Start broad, then drill into specifics.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,6 +11,7 @@ services:
      CONTAINER_GUIDANCE_FILE: /app/BOOCHAT.md
      DATABASE_URL: postgres://boocode:${POSTGRES_PASSWORD}@boocode_db:5432/boochat
      BOOCODER_URL: http://100.114.205.53:9502
+      LLAMA_SIDECAR_URL: http://100.101.41.16:8402
    volumes:
      - /opt:/opt
      - /opt/projects:/opt/projects:rw