diff --git a/apps/server/src/config.ts b/apps/server/src/config.ts index 1fb641d..ace037b 100644 --- a/apps/server/src/config.ts +++ b/apps/server/src/config.ts @@ -25,6 +25,8 @@ const ConfigSchema = z.object({ // v2.0.5: cheaper model for titles, summaries, labeling. Falls back to // session model (auto_name) or DEFAULT_MODEL when unset. FAST_MODEL: z.string().optional(), + TASK_MODEL_URL: z.string().url().optional(), + LLAMA_SIDECAR_URL: z.string().url().optional(), }); export type Config = z.infer; diff --git a/apps/server/src/index.ts b/apps/server/src/index.ts index 35a31d0..d083d46 100644 --- a/apps/server/src/index.ts +++ b/apps/server/src/index.ts @@ -28,7 +28,7 @@ import { cleanupTruncations } from './services/truncate.js'; import { loadMcpConfig } from './services/mcp-config.js'; import { initialize as initMcp, getTools as getMcpTools, shutdown as shutdownMcp } from './services/mcp-client.js'; import { appendMcpTools } from './services/tools.js'; -import { refreshToolNames } from './services/agents.js'; +import { refreshToolNames, getAgentsForProject } from './services/agents.js'; async function main() { const config = loadConfig(); @@ -91,6 +91,20 @@ async function main() { } app.addHook('onClose', async () => { await shutdownMcp(); }); + // Boot-time guard: if any agent has llama_extra_args but LLAMA_SIDECAR_URL + // is unset, fail fast. Silent fallback would defeat per-agent flags. + if (!config.LLAMA_SIDECAR_URL) { + const { agents } = await getAgentsForProject(''); + const offending = agents.find(a => a.llama_extra_args && a.llama_extra_args.length > 0); + if (offending) { + app.log.fatal( + { agent: offending.name }, + `Agent "${offending.name}" has llama_extra_args but LLAMA_SIDECAR_URL is not set`, + ); + process.exit(1); + } + } + await app.register(fastifyWebsocket); app.get('/api/health', async () => { diff --git a/apps/server/src/services/__tests__/provider.test.ts b/apps/server/src/services/__tests__/provider.test.ts new file mode 100644 index 0000000..bc9ef1f --- /dev/null +++ b/apps/server/src/services/__tests__/provider.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; +import { resolveRoute, upstreamModel } from '../inference/provider.js'; + +describe('resolveRoute', () => { + it('routes to swap when agent is null', () => { + expect(resolveRoute(null)).toEqual({ route: 'swap', flags: null }); + }); + + it('routes to swap when agent has no llama_extra_args', () => { + expect(resolveRoute({ llama_extra_args: null })).toEqual({ route: 'swap', flags: null }); + }); + + it('routes to swap when agent has empty llama_extra_args', () => { + expect(resolveRoute({ llama_extra_args: [] })).toEqual({ route: 'swap', flags: null }); + }); + + it('routes to sidecar when agent has llama_extra_args', () => { + const result = resolveRoute({ llama_extra_args: ['--top-k', '20'] }); + expect(result.route).toBe('sidecar'); + expect(result.flags).toEqual(['--top-k', '20']); + }); +}); + +describe('upstreamModel', () => { + const swapConfig = { LLAMA_SWAP_URL: 'http://localhost:8401' }; + const fullConfig = { + LLAMA_SWAP_URL: 'http://localhost:8401', + LLAMA_SIDECAR_URL: 'http://localhost:8402', + }; + + it('returns a model for swap route (no agent)', () => { + const model = upstreamModel(swapConfig, 'test-model'); + expect(model).toBeDefined(); + expect((model as any).modelId).toBe('test-model'); + }); + + it('returns a model for swap route (agent without extra args)', () => { + const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: null }); + expect(model).toBeDefined(); + }); + + it('returns a model for sidecar route', () => { + const model = upstreamModel(fullConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] }); + expect(model).toBeDefined(); + expect((model as any).modelId).toBe('test-model'); + }); + + it('throws when sidecar route requested but URL missing', () => { + expect(() => + upstreamModel(swapConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] }), + ).toThrow(/LLAMA_SIDECAR_URL/); + }); + + it('routes to swap for empty llama_extra_args array', () => { + const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: [] }); + expect(model).toBeDefined(); + }); +}); diff --git a/apps/server/src/services/inference/provider.ts b/apps/server/src/services/inference/provider.ts index ce0ae79..4b4e69f 100644 --- a/apps/server/src/services/inference/provider.ts +++ b/apps/server/src/services/inference/provider.ts @@ -1,37 +1,84 @@ import { createOpenAICompatible } from '@ai-sdk/openai-compatible'; import type { LanguageModel } from 'ai'; -// TODO: When per-agent llama-server flag overrides are added, route them -// through validateExtraArgs (./llama-args-validator.ts) first. - // v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from // config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the // upstream without touching env vars. No apiKey — llama-swap is unauth in our // Tailscale topology and exposing it over the public internet is gated by // Authelia at the Caddy layer, not by API keys. +// +// v2.4.1-sidecar: when the agent has llama_extra_args, route through +// llama-sidecar instead. A fresh provider is created per call (not cached) +// because the X-Agent-Flags header varies per agent. The llama-swap path +// stays cached since it has no per-request headers. -const cache = new Map>(); +const swapCache = new Map>(); -function getProvider(baseURL: string): ReturnType { - let provider = cache.get(baseURL); +function getSwapProvider(baseURL: string): ReturnType { + let provider = swapCache.get(baseURL); if (!provider) { provider = createOpenAICompatible({ name: 'llama-swap', baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`, - // v1.13.7: @ai-sdk/openai-compatible defaults includeUsage=false, which - // omits `stream_options.include_usage` from the request body. Without - // it, llama.cpp / llama-swap never emits the trailing usage block, so - // `result.usage` resolves with inputTokens=outputTokens=undefined and - // tokens_used / ctx_used land as NULL in every messages row. Setting - // true here re-enables the per-stream usage payload across all models - // served via the llama-swap provider. includeUsage: true, }); - cache.set(baseURL, provider); + swapCache.set(baseURL, provider); } return provider; } -export function upstreamModel(baseURL: string, modelId: string): LanguageModel { - return getProvider(baseURL).chatModel(modelId); +function sidecarProvider( + baseURL: string, + flags: string[], +): ReturnType { + return createOpenAICompatible({ + name: 'llama-sidecar', + baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`, + includeUsage: true, + headers: { + 'X-Agent-Flags': flags.join(' '), + }, + }); +} + +export type InferenceRoute = 'swap' | 'sidecar'; + +export interface RoutingInfo { + route: InferenceRoute; + flags: string[] | null; +} + +interface AgentLike { + llama_extra_args: string[] | null; +} + +interface ConfigLike { + LLAMA_SWAP_URL: string; + LLAMA_SIDECAR_URL?: string; +} + +export function resolveRoute(agent: AgentLike | null): RoutingInfo { + const flags = agent?.llama_extra_args; + if (flags && flags.length > 0) { + return { route: 'sidecar', flags }; + } + return { route: 'swap', flags: null }; +} + +export function upstreamModel( + config: ConfigLike, + modelId: string, + agent?: AgentLike | null, +): LanguageModel { + const { route, flags } = resolveRoute(agent ?? null); + if (route === 'sidecar') { + const url = config.LLAMA_SIDECAR_URL; + if (!url) { + throw new Error( + `Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`, + ); + } + return sidecarProvider(url, flags!).chatModel(modelId); + } + return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId); } diff --git a/apps/server/src/services/inference/stream-phase.ts b/apps/server/src/services/inference/stream-phase.ts index c1e6a38..95f2fa9 100644 --- a/apps/server/src/services/inference/stream-phase.ts +++ b/apps/server/src/services/inference/stream-phase.ts @@ -157,7 +157,8 @@ export async function streamCompletion( opts: StreamOptions, onDelta: (content: string) => void, onUsage: ((prompt: number | null, completion: number | null) => void) | undefined, - signal?: AbortSignal + signal?: AbortSignal, + agent?: Agent | null, ): Promise { const aiMessages = toModelMessages(messages); const hasTools = opts.tools !== null && opts.tools.length > 0; @@ -195,7 +196,7 @@ export async function streamCompletion( }; const result = streamText({ - model: upstreamModel(ctx.config.LLAMA_SWAP_URL, model), + model: upstreamModel(ctx.config, model, agent ?? null), messages: aiMessages, ...(aiTools ? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall } @@ -458,7 +459,8 @@ export async function executeStreamPhase( }, USAGE_THROTTLE_MS - elapsed); } }, - signal + signal, + agent, ); } finally { if (pendingFlushTimer) { diff --git a/apps/server/src/services/system-prompt.ts b/apps/server/src/services/system-prompt.ts index 9272a47..b9a21d5 100644 --- a/apps/server/src/services/system-prompt.ts +++ b/apps/server/src/services/system-prompt.ts @@ -21,6 +21,7 @@ import { createHash } from 'node:crypto'; import { readFile, stat } from 'node:fs/promises'; import type { Agent, Project, Session } from '../types/api.js'; import { getAgentsMtimes } from './agents.js'; +import { resolveRoute } from './inference/provider.js'; const BASE_SYSTEM_PROMPT = (projectPath: string) => `You are BooCode Chat, a code investigation assistant. The user is working on a project located at ${projectPath}. Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed. Be concise. Cite file paths and line numbers when discussing code. Do not hallucinate file contents — read the file first. Tool results may be truncated; if so, narrow your query rather than guessing.`; @@ -98,6 +99,7 @@ export interface PrefixFingerprint { has_agent_system_prompt: boolean; has_session_override: boolean; has_project_override: boolean; + route: 'swap' | 'sidecar'; } export interface PrefixDrift { @@ -125,6 +127,7 @@ interface ObservedInputs { has_agent_system_prompt: boolean; has_session_override: boolean; has_project_override: boolean; + route: 'swap' | 'sidecar'; } interface ObserverEntry { @@ -183,6 +186,7 @@ export async function buildSystemPromptWithFingerprint( has_agent_system_prompt: !!(agent && agent.system_prompt.trim().length > 0), has_session_override: sessionPrompt.length > 0, has_project_override: projectPrompt.length > 0, + route: resolveRoute(agent).route, }; const fingerprint: PrefixFingerprint = { @@ -199,6 +203,7 @@ export async function buildSystemPromptWithFingerprint( has_agent_system_prompt: inputs.has_agent_system_prompt, has_session_override: inputs.has_session_override, has_project_override: inputs.has_project_override, + route: inputs.route, }; let drift: PrefixDrift | null = null; diff --git a/data/AGENTS.md b/data/AGENTS.md index 7a26291..3fd0602 100644 --- a/data/AGENTS.md +++ b/data/AGENTS.md @@ -7,7 +7,7 @@ top_p: 0.95 top_k: 20 min_p: 0.0 presence_penalty: 0.0 -tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] +tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes] description: Reviews code for bugs, security issues, and maintainability. Read-only. --- You review code. Find real problems, not style nits. @@ -46,7 +46,7 @@ top_p: 0.95 top_k: 20 min_p: 0.0 presence_penalty: 0.0 -tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] +tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes] description: Diagnoses bugs from error messages, logs, or described symptoms. --- You diagnose bugs. Form a hypothesis, prove it with evidence from the code. @@ -72,7 +72,7 @@ top_k: 20 min_p: 0.0 presence_penalty: 0.0 steps: 5 -tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] +tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes] description: Proposes refactors for clarity, deduplication, or decoupling. Read-only — outputs plans, not edits. --- You propose refactors. You do not apply them. The user applies via OpenCode or Claude Code. @@ -115,7 +115,7 @@ top_k: 20 min_p: 0.0 presence_penalty: 1.5 steps: 20 -tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] +tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes] description: Designs new features, modules, or architectural changes. Outputs a build plan. --- You design. You produce build plans, not code. @@ -157,7 +157,7 @@ top_p: 0.95 top_k: 20 min_p: 0.0 presence_penalty: 0.0 -tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] +tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes] description: Audits code for security vulnerabilities. Read-only. --- You audit for security issues. Concrete findings only, no generic warnings. @@ -240,7 +240,7 @@ top_p: 0.95 top_k: 20 min_p: 0.0 presence_penalty: 0.0 -tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes] +tools: [find_files, get_codebase_overview, get_dependencies, get_file_analysis, get_framework_analysis, get_semantic_neighborhoods, get_symbol_info, grep, list_dir, search_symbols, view_file, watch_changes, request_read_access, view_truncated_output, ask_user_input, git_status, get_blast_radius, get_hot_files, get_middleware, get_routes] description: Discovers and maps unfamiliar codebases. Reads architecture, traces data flow, identifies key symbols. --- You map codebases. Start broad, then drill into specifics. diff --git a/docker-compose.yml b/docker-compose.yml index 1263efb..8adbb3b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,6 +11,7 @@ services: CONTAINER_GUIDANCE_FILE: /app/BOOCHAT.md DATABASE_URL: postgres://boocode:${POSTGRES_PASSWORD}@boocode_db:5432/boochat BOOCODER_URL: http://100.114.205.53:9502 + LLAMA_SIDECAR_URL: http://100.101.41.16:8402 volumes: - /opt:/opt - /opt/projects:/opt/projects:rw