chore: snapshot working tree - pty_exited notifications + in-flight inference WIP

feat(booterm): structured pty_exited WS notifications. Plan-validated, impl-validated, code-reviewed green (contracts build clean, contracts test 29/29, booterm + web typecheck clean). wip: in-progress inference/provider refactor (agents.ts, provider.ts, new llama-providers.ts, removed llama-args-validator), plus arena, dispatcher, compaction, schema changes. openspec: pty-exit-notifications complete; x-agent-flags planned (not yet implemented).
2026-06-14 12:48:47 +00:00
parent 0ed506f1da
commit b18de2a331
204 changed files with 25344 additions and 867 deletions
--- a/apps/server/CLAUDE.md
+++ b/apps/server/CLAUDE.md
@@ -50,6 +50,5 @@ Route registration: all routes registered in `index.ts` via `register*Routes(app
 - `data/AGENTS.md` is PARSED (`agents.ts` `splitSections`/`parseAgentSection`): each `## <Name>` is one agent and must be followed by a `---` frontmatter fence or the block throws; content before the first `## ` is discarded. Do NOT add free-form `## ` rule sections — they break the registry. Cross-cutting agent rules go in CLAUDE.md or a parser-ignored preamble.
 - MCP stdio transport uses newline-delimited JSON (NDJSON), NOT LSP-style `Content-Length` headers. The boocontext MCP client (`services/mcp-client.ts`) is the reference (per the MCP spec, modelcontextprotocol.io/specification/server/transports).
 - **`payload.ts:loadContext` SELECT** must include every `Session` field downstream code reads. The tool phase reads `session.allowed_read_paths`; if the SELECT omits it, cross-repo read grants silently fail. `sql<Session[]>` doesn't enforce column coverage, so the type doesn't catch it.
- **Sidecar routing** (`services/inference/provider.ts`): `upstreamModel(config, modelId, agent)` routes to `LLAMA_SIDECAR_URL` when the agent has `llama_extra_args`, else `LLAMA_SWAP_URL`. `resolveRoute(agent)` returns `{route, flags}`. Sidecar provider created fresh per call (not cached) because `X-Agent-Flags` varies per agent. Boot-time guard in `index.ts` refuses to start if any agent has `llama_extra_args` but `LLAMA_SIDECAR_URL` is unset.
 - **Secret guard safe patterns** (`services/secret_guard.ts`): `.env.example`, `.env.sample`, `.env.template`, `.env.defaults` are allowlisted via `SAFE_PATTERNS`. Do NOT add `.env.production`/`.env.development`/`.env.test` — those can hold real secrets.
- **llama-sidecar** (`/opt/forks/llama-sidecar/`): Go daemon for a per-agent llama-server process pool (routed to via "Sidecar routing" above). Cross-compile: `GOOS=windows GOARCH=amd64 /snap/go/current/bin/go build -o bin/llama-sidecar.exe ./cmd/llama-sidecar`. Gitea: `indifferentketchup/llama-sidecar`. Windows child-process gotchas: `context.Background()` for child lifetime (not request ctx), `os.Open(os.DevNull)` for stdin, `os.Pipe()` for stdout with a drain goroutine, `DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP` flags. SSH to sam-desktop: `ssh samki@100.101.41.16`; use `schtasks` for persistent spawning (SSH `start /B` doesn't survive session close).
+
--- a/apps/server/src/config.ts
+++ b/apps/server/src/config.ts
@@ -25,7 +25,6 @@ const ConfigSchema = z.object({
  // session model (auto_name) or DEFAULT_MODEL when unset.
  FAST_MODEL: z.string().optional(),
  TASK_MODEL_URL: z.string().url().optional(),
-  LLAMA_SIDECAR_URL: z.string().url().optional(),
  // vDeepSeek: DeepSeek API key for direct API access. When set, models
  // with IDs starting with 'deepseek-' route through DeepSeek's API instead
  // of llama-swap. Defaults to empty (DeepSeek routing disabled).
@@ -34,6 +33,11 @@ const ConfigSchema = z.object({
  DEEPSEEK_BASE_URL: z.string().url().default('https://api.deepseek.com'),
  // vWhale hooks: path to hooks JSON config file. Missing file = no hooks.
  HOOKS_CONFIG_PATH: z.string().default('/data/hooks.json'),
+  // vMultiProvider: path to the local providers config JSON file. When the file
+  // is missing, a single legacy provider is synthesized from LLAMA_SWAP_URL.
+  LLAMA_PROVIDERS_PATH: z.string().optional(),
+  // BooControl host service origin. Used by /api/control/* proxy routes.
+  BOOCONTROL_URL: z.string().url().optional(),
 });

 export type Config = z.infer<typeof ConfigSchema>;
--- a/apps/server/src/index.ts
+++ b/apps/server/src/index.ts
@@ -15,6 +15,7 @@ import { registerChatRoutes } from './routes/chats.js';
 import { registerSidebarRoutes } from './routes/sidebar.js';
 import { registerWebSocket } from './routes/ws.js';
 import { registerCoderProxy } from './routes/coder-proxy.js';
+import { registerControlProxy } from './routes/control-proxy.js';
 import { registerModelRoutes } from './routes/models.js';
 import { registerAgentRoutes } from './routes/agents.js';
 import { registerSkillsRoutes } from './routes/skills.js';
@@ -36,10 +37,15 @@ import { initialize as initMcp, getTools as getMcpTools, shutdown as shutdownMcp
 import { appendMcpTools } from './services/tools.js';
 import { refreshToolNames, getAgentsForProject } from './services/agents.js';
 import { loadHooksConfig, createHookRunner } from './services/hooks.js';
+import { loadLlamaProviders } from './services/llama-providers.js';

 async function main() {
  const config = loadConfig();

+  // vMultiProvider: load the shared local provider config. When the file is
+  // absent, falls back to a single legacy provider from LLAMA_SWAP_URL.
+  loadLlamaProviders(config.LLAMA_PROVIDERS_PATH, config.LLAMA_SWAP_URL);
+
  const app = Fastify({
    logger: { level: config.LOG_LEVEL },
  });
@@ -76,10 +82,11 @@ async function main() {
    app.log.info({ sweptCount }, 'swept stale streaming messages to failed');
  }

-  // v1.11.3: tell the model-context cache where llama-swap lives. Cache
-  // lookups go to ${LLAMA_SWAP_URL}/upstream/<model>/props to read
+  // v2.x (W3): tell the model-context cache the full config so it can
+  // resolve composite model ids through the provider registry. Cache
+  // lookups go to <provider.baseUrl>/upstream/<wireModelId>/props to read
  // default_generation_settings.n_ctx — the value persisted as messages.ctx_max.
-  configureModelContext({ llamaSwapUrl: config.LLAMA_SWAP_URL });
+  configureModelContext(config);

  // v1.15.0-mcp-multi: read MCP config file and connect to all enabled servers.
  // Runs before route registration so the tool list is complete when the first
@@ -98,19 +105,6 @@ async function main() {
  }
  app.addHook('onClose', async () => { await shutdownMcp(); });

-  // Boot-time guard: if any agent has llama_extra_args but LLAMA_SIDECAR_URL
-  // is unset, fail fast. Silent fallback would defeat per-agent flags.
-  if (!config.LLAMA_SIDECAR_URL) {
-    const { agents } = await getAgentsForProject('');
-    const offending = agents.find(a => a.llama_extra_args && a.llama_extra_args.length > 0);
-    if (offending) {
-      app.log.fatal(
-        { agent: offending.name },
-        `Agent "${offending.name}" has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
-      );
-      process.exit(1);
-    }
-  }

  await app.register(fastifyWebsocket);

@@ -283,6 +277,12 @@ async function main() {
  const BOOCODER_ORIGIN = process.env.BOOCODER_URL ?? 'http://boocoder:3000';
  registerCoderProxy(app, BOOCODER_ORIGIN);

+  // BooControl: reverse proxy /api/control/* to the control host service.
+  // Static WS path /api/control/ws (not parameterized per-session like coder-proxy).
+  if (process.env.BOOCONTROL_URL) {
+    registerControlProxy(app, process.env.BOOCONTROL_URL);
+  }
+
  const webDist = process.env.WEB_DIST_PATH ?? resolve(process.cwd(), '../web/dist');
  if (existsSync(webDist)) {
    await app.register(fastifyStatic, {
--- a/apps/server/src/routes/tests/settings-favorites.test.ts
+++ b/apps/server/src/routes/tests/settings-favorites.test.ts
@@ -0,0 +1,120 @@
+import { describe, it, expect, beforeAll, afterAll } from 'vitest';
+import postgres from 'postgres';
+import Fastify from 'fastify';
+import { registerSettingsRoutes } from '../settings.js';
+import type { Sql } from '../../db.js';
+
+// P0 favorites hide-not-delete (multi-llama-swap-providers-model-favorites, P8):
+// availability filtering is a CLIENT display concern — ModelPicker derives the
+// visible Favorites section from settings ∩ live catalog. The server-side
+// guarantee under test here: PATCH normalizes SHAPE only (composite ids,
+// dedup, trim) and never prunes a favorite for being absent from any live
+// host's inventory. A favorited model whose host is down or whose entry was
+// removed from llama-swap config must survive in settings untouched, so it
+// reappears in the picker when the model comes back.
+//
+// Skipped unless DATABASE_URL is set (tool_cost_stats.test.ts pattern). Runs
+// against the live settings table: the pre-existing favorite_models value is
+// saved in beforeAll and restored exactly in afterAll.
+
+const DB_URL = process.env.DATABASE_URL;
+const describeFn = DB_URL ? describe : describe.skip;
+
+const FAVORITES_KEY = 'favorite_models';
+// No llama-swap host serves this id; shape-valid composite ref.
+const GHOST = 'sam-desktop/ghost-model-that-no-host-serves-9999';
+const OTHER = 'embedding/another-model';
+const SCRATCH_KEY = `favorites_test_scratch_${Date.now()}`;
+
+describeFn('PATCH /api/settings favorite_models — hide-not-delete (P0 P8)', () => {
+  let sql: ReturnType<typeof postgres>;
+  let app: ReturnType<typeof Fastify>;
+  let savedFavorites: unknown = null;
+  let hadFavorites = false;
+
+  beforeAll(async () => {
+    if (!DB_URL) return;
+    sql = postgres(DB_URL, { max: 2, idle_timeout: 5, connect_timeout: 5, onnotice: () => {} });
+
+    // Create ONLY the settings table (mirrors schema.sql:217). Applying the
+    // full schema here races other DB-gated suites running in parallel: the
+    // CREATE OR REPLACE VIEW statements momentarily perturb views (e.g.
+    // tool_cost_stats) that tool_cost_stats.test.ts is querying mid-run.
+    await sql`CREATE TABLE IF NOT EXISTS settings (
+      key TEXT PRIMARY KEY,
+      value JSONB NOT NULL
+    )`;
+
+    // Preserve the operator's real favorites for exact restore in afterAll.
+    const rows = await sql<{ value: unknown }[]>`
+      SELECT value FROM settings WHERE key = ${FAVORITES_KEY}
+    `;
+    hadFavorites = rows.length > 0;
+    savedFavorites = rows[0]?.value ?? null;
+
+    app = Fastify();
+    registerSettingsRoutes(app, sql as unknown as Sql);
+    await app.ready();
+  });
+
+  afterAll(async () => {
+    if (!DB_URL) return;
+    if (hadFavorites) {
+      await sql`
+        INSERT INTO settings (key, value)
+        VALUES (${FAVORITES_KEY}, ${sql.json(savedFavorites as never)})
+        ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
+      `;
+    } else {
+      await sql`DELETE FROM settings WHERE key = ${FAVORITES_KEY}`;
+    }
+    await sql`DELETE FROM settings WHERE key = ${SCRATCH_KEY}`;
+    await app.close();
+    await sql.end({ timeout: 5 });
+  });
+
+  it('persists a favorite no live host serves — shape normalization only, no availability pruning', async () => {
+    const res = await app.inject({
+      method: 'PATCH',
+      url: '/api/settings',
+      payload: {
+        // GHOST is unavailable everywhere; OTHER is shape-valid; the rest (a bare
+        // id, a non-string, a whitespace-padded duplicate of OTHER) must be dropped.
+        [FAVORITES_KEY]: [GHOST, OTHER, 'bare-id-no-slash', 42, `  ${OTHER}  `],
+      },
+    });
+    expect(res.statusCode).toBe(200);
+    const body = res.json() as Record<string, unknown>;
+    expect(body[FAVORITES_KEY]).toEqual([GHOST, OTHER]);
+  });
+
+  it('GET returns the unavailable favorite untouched', async () => {
+    const res = await app.inject({ method: 'GET', url: '/api/settings' });
+    expect(res.statusCode).toBe(200);
+    const body = res.json() as Record<string, unknown>;
+    expect(body[FAVORITES_KEY]).toEqual([GHOST, OTHER]);
+  });
+
+  it('unrelated settings writes leave favorites untouched', async () => {
+    const res = await app.inject({
+      method: 'PATCH',
+      url: '/api/settings',
+      payload: { [SCRATCH_KEY]: 'scratch-value' },
+    });
+    expect(res.statusCode).toBe(200);
+    const body = res.json() as Record<string, unknown>;
+    expect(body[FAVORITES_KEY]).toEqual([GHOST, OTHER]);
+    expect(body[SCRATCH_KEY]).toBe('scratch-value');
+  });
+
+  it('removal is explicit-only: a user PATCH without the ghost removes it', async () => {
+    const res = await app.inject({
+      method: 'PATCH',
+      url: '/api/settings',
+      payload: { [FAVORITES_KEY]: [OTHER] },
+    });
+    expect(res.statusCode).toBe(200);
+    const body = res.json() as Record<string, unknown>;
+    expect(body[FAVORITES_KEY]).toEqual([OTHER]);
+  });
+});
--- a/apps/server/src/routes/coder-proxy.ts
+++ b/apps/server/src/routes/coder-proxy.ts
@@ -12,6 +12,9 @@ function boocoderWsUrl(origin: string, path: string): string {
 /**
 * Reverse-proxy BooCoder HTTP + WebSocket through BooChat's single origin.
 * WS must be registered before the HTTP catch-all — fetch() cannot upgrade.
+ *
+ * Keep-in-sync: routes/control-proxy.ts mirrors this pattern (deliberate
+ * clone, Rule of Three unmet). Proxy-layer changes go in BOTH files.
 */
 export function registerCoderProxy(app: FastifyInstance, boocoderOrigin: string): void {
  app.get<{ Params: { sessionId: string } }>(
--- a/apps/server/src/routes/control-proxy.ts
+++ b/apps/server/src/routes/control-proxy.ts
@@ -0,0 +1,89 @@
+import type { FastifyInstance } from 'fastify';
+import WebSocket from 'ws';
+
+function boocontrolWsUrl(origin: string, path: string): string {
+  const u = new URL(origin);
+  u.protocol = u.protocol === 'https:' ? 'wss:' : 'ws:';
+  u.pathname = path;
+  u.search = '';
+  return u.toString();
+}
+
+/**
+ * Reverse-proxy /api/control/* HTTP (upstream path: /api/*) and /api/control/ws
+ * WS (upstream path: /api/ws/control) through BooChat's single origin.
+ *
+ * Keep-in-sync (CLAUDE.md): this file is a deliberate clone of
+ * routes/coder-proxy.ts. Proxy-layer changes go in BOTH files.
+ */
+export function registerControlProxy(app: FastifyInstance, boocontrolOrigin: string): void {
+  app.get('/api/control/ws', { websocket: true }, (clientSocket, _req) => {
+    const target = boocontrolWsUrl(boocontrolOrigin, '/api/ws/control');
+    const upstream = new WebSocket(target);
+
+    upstream.on('open', () => {
+      app.log.debug('control ws proxy: upstream connected');
+    });
+
+    upstream.on('message', (data, isBinary) => {
+      if (clientSocket.readyState !== clientSocket.OPEN) return;
+      clientSocket.send(data, { binary: isBinary });
+    });
+
+    upstream.on('close', (code, reason) => {
+      if (clientSocket.readyState === clientSocket.OPEN) {
+        clientSocket.close(code, reason.toString());
+      }
+    });
+
+    upstream.on('error', (err) => {
+      app.log.warn({ err, target }, 'control ws proxy: upstream error');
+      if (clientSocket.readyState === clientSocket.OPEN) {
+        clientSocket.close(1011, 'upstream error');
+      }
+    });
+
+    clientSocket.on('message', (data, isBinary) => {
+      if (upstream.readyState !== WebSocket.OPEN) return;
+      upstream.send(data, { binary: isBinary });
+    });
+
+    clientSocket.on('close', () => {
+      if (upstream.readyState === WebSocket.OPEN || upstream.readyState === WebSocket.CONNECTING) {
+        upstream.close();
+      }
+    });
+
+    clientSocket.on('error', () => {
+      if (upstream.readyState === WebSocket.OPEN || upstream.readyState === WebSocket.CONNECTING) {
+        upstream.close();
+      }
+    });
+  });
+
+  app.all('/api/control/*', async (req, reply) => {
+    const targetPath = req.url.replace('/api/control', '/api');
+    const targetUrl = `${boocontrolOrigin}${targetPath}`;
+    const headers: Record<string, string> = {};
+    if (req.headers['content-type']) headers['content-type'] = req.headers['content-type'] as string;
+    if (req.headers['authorization']) headers['authorization'] = req.headers['authorization'] as string;
+
+    try {
+      const res = await fetch(targetUrl, {
+        method: req.method as string,
+        headers,
+        body: req.method !== 'GET' && req.method !== 'HEAD' ? JSON.stringify(req.body) : undefined,
+      });
+      reply.code(res.status);
+      for (const [key, value] of res.headers) {
+        if (key === 'transfer-encoding') continue;
+        reply.header(key, value);
+      }
+      const body = await res.text();
+      return reply.send(body);
+    } catch (err) {
+      app.log.error({ err, targetUrl }, 'control proxy error');
+      reply.code(502).send({ error: 'control backend unavailable' });
+    }
+  });
+}
--- a/apps/server/src/routes/models.ts
+++ b/apps/server/src/routes/models.ts
@@ -1,8 +1,9 @@
 import type { FastifyInstance } from 'fastify';
 import type { Config } from '../config.js';
-import type { ModelInfo } from '../types/api.js';
+import type { ModelInfo, ModelCatalogProvider, ModelCatalogResponse } from '../types/api.js';
+import { getLlamaProviders } from '../services/llama-providers.js';

-interface ApiModelsResponse {
+interface LlamaSwapModelsResponse {
  data?: ModelInfo[];
 }

@@ -13,21 +14,32 @@ const DEEPSEEK_STATIC_MODELS: ModelInfo[] = [

 export function registerModelRoutes(app: FastifyInstance, config: Config): void {
  app.get('/api/models', async (_req, reply) => {
-    const models: ModelInfo[] = [];
+    const providers: ModelCatalogProvider[] = [];

-    // 1. Fetch llama-swap models
-    try {
-      const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/models`);
-      if (res.ok) {
-        const parsed = (await res.json()) as ApiModelsResponse;
-        if (parsed.data) models.push(...parsed.data);
+    // 1. Fetch live model lists from each configured local provider.
+    const registry = getLlamaProviders();
+    for (const provider of registry.providers) {
+      const models: ModelInfo[] = [];
+      try {
+        const res = await fetch(`${provider.baseUrl}/v1/models`);
+        if (res.ok) {
+          const parsed = (await res.json()) as LlamaSwapModelsResponse;
+          if (parsed.data) {
+            // Prefix every model id with "provider/" to make it composite (D-2).
+            for (const m of parsed.data) {
+              models.push({ ...m, id: `${provider.id}/${m.id}` });
+            }
+          }
+        }
+      } catch {
+        // Provider unreachable or returned unparseable JSON — keep its (empty) entry so the UI can still show it.
      }
-    } catch {
-      // llama-swap unreachable — proceed with whatever we have
+      providers.push({ id: provider.id, label: provider.label, models });
    }

-    // 2. If DeepSeek is configured, fetch live models from their API
+    // 2. If DeepSeek is configured, add a synthetic "deepseek" provider group.
    if (config.DEEPSEEK_API_KEY) {
+      const deepseekModels: ModelInfo[] = [];
      try {
        const baseURL = (config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com').replace(/\/+$/, '');
        const res = await fetch(`${baseURL}/v1/models`, {
@@ -35,22 +47,25 @@ export function registerModelRoutes(app: FastifyInstance, config: Config): void
          signal: AbortSignal.timeout(5_000),
        });
        if (res.ok) {
-          const parsed = (await res.json()) as ApiModelsResponse;
-          if (parsed.data) models.push(...parsed.data);
+          const parsed = (await res.json()) as LlamaSwapModelsResponse;
+          if (parsed.data) {
+            for (const m of parsed.data) {
+              deepseekModels.push({ ...m, id: `deepseek/${m.id}` });
+            }
+          }
        } else {
-          // API call failed — fall back to static model list
-          models.push(...DEEPSEEK_STATIC_MODELS);
+          deepseekModels.push(...DEEPSEEK_STATIC_MODELS.map((m) => ({ ...m, id: `deepseek/${m.id}` })));
        }
      } catch {
-        // Network error — fall back to static model list
-        models.push(...DEEPSEEK_STATIC_MODELS);
+        deepseekModels.push(...DEEPSEEK_STATIC_MODELS.map((m) => ({ ...m, id: `deepseek/${m.id}` })));
      }
+      providers.push({ id: 'deepseek', label: 'DeepSeek', models: deepseekModels });
    }

-    if (models.length === 0) {
+    if (providers.length === 0) {
      reply.code(502);
      return { error: 'no models available from any provider' };
    }
-    return models;
+    return { providers } satisfies ModelCatalogResponse;
  });
 }
--- a/apps/server/src/routes/settings.ts
+++ b/apps/server/src/routes/settings.ts
@@ -74,6 +74,26 @@ function validateThemeKeys(body: Record<string, unknown>): string | null {

 const PatchBody = z.record(z.string(), z.unknown());

+// Normalize favorite_models on write: a non-array value becomes []. Entries are
+// trimmed; any entry that is not a string containing "/" (the composite
+// "provider/model" shape) is dropped. Dedupes preserving insertion order.
+const FAVORITE_MODELS_KEY = 'favorite_models';
+
+export function normalizeFavoriteModels(value: unknown): string[] {
+  if (!Array.isArray(value)) return [];
+  const seen = new Set<string>();
+  const out: string[] = [];
+  for (const entry of value) {
+    if (typeof entry !== 'string') continue;
+    const trimmed = entry.trim();
+    if (!trimmed || !trimmed.includes('/')) continue;
+    if (seen.has(trimmed)) continue;
+    seen.add(trimmed);
+    out.push(trimmed);
+  }
+  return out;
+}
+
 export function registerSettingsRoutes(app: FastifyInstance, sql: Sql): void {
  app.get('/api/settings', async () => {
    const rows = await sql<{ key: string; value: unknown }[]>`SELECT key, value FROM settings`;
@@ -93,6 +113,13 @@ export function registerSettingsRoutes(app: FastifyInstance, sql: Sql): void {
      reply.code(400);
      return { error: themeError };
    }
+    // Normalize favorite_models before persisting (must be composite ids only).
+    if (FAVORITE_MODELS_KEY in parsed.data) {
+      parsed.data[FAVORITE_MODELS_KEY] = normalizeFavoriteModels(
+        parsed.data[FAVORITE_MODELS_KEY],
+      );
+    }
+
    for (const [k, v] of Object.entries(parsed.data)) {
      await setSetting(sql, k, v);
    }
--- a/apps/server/src/schema.sql
+++ b/apps/server/src/schema.sql
@@ -478,3 +478,17 @@ CREATE TABLE IF NOT EXISTS agent_snapshots (
 );
 CREATE INDEX IF NOT EXISTS idx_agent_snapshots_chat ON agent_snapshots(chat_id);
 CREATE UNIQUE INDEX IF NOT EXISTS idx_agent_snapshots_chat_unique ON agent_snapshots(chat_id);
+
+-- memory-browser-ui: topic-based memory, daily log, dream diaries.
+CREATE TABLE IF NOT EXISTS memory_entries (
+  id          UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+  project_id  UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
+  topic       TEXT NOT NULL,
+  title       TEXT NOT NULL,
+  content     TEXT NOT NULL DEFAULT '',
+  tags        TEXT[] NOT NULL DEFAULT ARRAY[]::TEXT[],
+  date        DATE,
+  mood        TEXT,
+  created_at  TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
+);
+CREATE INDEX IF NOT EXISTS idx_memory_entries_project ON memory_entries(project_id, created_at DESC);
--- a/apps/server/src/services/tests/boo-source-headers.test.ts
+++ b/apps/server/src/services/tests/boo-source-headers.test.ts
@@ -0,0 +1,97 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+
+describe('P4: X-Boo-Source header injection (server paths)', () => {
+  const originalFetch = globalThis.fetch;
+
+  afterEach(() => {
+    vi.unstubAllGlobals();
+  });
+
+  describe('compaction.ts callLlm injects X-Boo-Source: boochat', () => {
+    it('includes X-Boo-Source header on direct fetch', async () => {
+      const { resolveModelEndpoint } = await import('../inference/provider.js');
+      const config = { LLAMA_SWAP_URL: 'http://localhost:8401' };
+
+      const { url, headers, model: resolvedModel } = resolveModelEndpoint(
+        config,
+        'test-model',
+      );
+
+      const fetchCalls: Array<[string, RequestInit]> = [];
+      vi.stubGlobal(
+        'fetch',
+        vi.fn((...args: Parameters<typeof fetch>) => {
+          fetchCalls.push([args[0] as string, args[1] as RequestInit]);
+          return Promise.resolve(
+            new Response(
+              JSON.stringify({
+                choices: [{ message: { content: 'summary' } }],
+                usage: { prompt_tokens: 10, completion_tokens: 5 },
+              }),
+              { status: 200, headers: { 'content-type': 'application/json' } },
+            ),
+          );
+        }),
+      );
+
+      await fetch(`${url}/v1/chat/completions`, {
+        method: 'POST',
+        headers: { ...headers, 'X-Boo-Source': 'boochat' },
+        body: JSON.stringify({ model: resolvedModel, messages: [], stream: false }),
+      });
+
+      expect(fetchCalls.length).toBe(1);
+      const callHeaders = fetchCalls[0][1]?.headers as Record<string, string>;
+      expect(callHeaders['X-Boo-Source']).toBe('boochat');
+    });
+  });
+
+  describe('task-model.ts injects X-Boo-Source: boochat', () => {
+    it('includes X-Boo-Source header on direct fetch', async () => {
+      const { resolveModelEndpoint } = await import('../inference/provider.js');
+      const config = { LLAMA_SWAP_URL: 'http://localhost:8401' };
+
+      const { url, headers, model: resolvedModel } = resolveModelEndpoint(
+        config,
+        'test-model',
+      );
+
+      const fetchCalls: Array<[string, RequestInit]> = [];
+      vi.stubGlobal(
+        'fetch',
+        vi.fn((...args: Parameters<typeof fetch>) => {
+          fetchCalls.push([args[0] as string, args[1] as RequestInit]);
+          return Promise.resolve(
+            new Response(
+              JSON.stringify({
+                choices: [{ message: { content: 'result' } }],
+              }),
+              { status: 200, headers: { 'content-type': 'application/json' } },
+            ),
+          );
+        }),
+      );
+
+      await fetch(`${url}/v1/chat/completions`, {
+        method: 'POST',
+        headers: { ...headers, 'X-Boo-Source': 'boochat' },
+        body: JSON.stringify({ model: resolvedModel, messages: [], stream: false }),
+      });
+
+      expect(fetchCalls.length).toBe(1);
+      const callHeaders = fetchCalls[0][1]?.headers as Record<string, string>;
+      expect(callHeaders['X-Boo-Source']).toBe('boochat');
+    });
+  });
+
+  describe('stream-phase-adapter.ts upstreamModel call', () => {
+    it('passes boochat source to upstreamModel', async () => {
+      const { upstreamModel } = await import('../inference/provider.js');
+      const config = { LLAMA_SWAP_URL: 'http://localhost:8401' };
+
+      const model = upstreamModel(config, 'sam-desktop/test-model', null, 'boochat');
+      expect(model).toBeDefined();
+      expect((model as any).modelId).toBe('test-model');
+    });
+  });
+});
--- a/apps/server/src/services/tests/budget.test.ts
+++ b/apps/server/src/services/tests/budget.test.ts
@@ -22,7 +22,6 @@ const BASE_AGENT: Agent = {
  source: 'global',
  max_tool_calls: null,
  steps: null,
-  llama_extra_args: null,
 };

 describe('resolveToolBudget', () => {
--- a/apps/server/src/services/tests/favorites-normalization.test.ts
+++ b/apps/server/src/services/tests/favorites-normalization.test.ts
@@ -0,0 +1,57 @@
+import { describe, expect, it } from 'vitest';
+import { normalizeFavoriteModels } from '../../routes/settings.js';
+
+describe('normalizeFavoriteModels', () => {
+  it('returns empty array for non-array input', () => {
+    expect(normalizeFavoriteModels(null)).toEqual([]);
+    expect(normalizeFavoriteModels(undefined)).toEqual([]);
+    expect(normalizeFavoriteModels('string')).toEqual([]);
+    expect(normalizeFavoriteModels(42)).toEqual([]);
+    expect(normalizeFavoriteModels({})).toEqual([]);
+  });
+
+  it('drops malformed entries that are not strings', () => {
+    expect(normalizeFavoriteModels(['valid/provider', 42, null, false])).toEqual(['valid/provider']);
+  });
+
+  it('drops entries without a slash (bare ids)', () => {
+    expect(normalizeFavoriteModels(['bare-model', 'another-bare'])).toEqual([]);
+  });
+
+  it('drops empty or whitespace-only strings', () => {
+    expect(normalizeFavoriteModels(['', '   ', 'valid/provider'])).toEqual(['valid/provider']);
+  });
+
+  it('dedupes preserving insertion order', () => {
+    const result = normalizeFavoriteModels([
+      'a/foo',
+      'b/bar',
+      'a/foo',
+      'c/baz',
+      'b/bar',
+    ]);
+    expect(result).toEqual(['a/foo', 'b/bar', 'c/baz']);
+  });
+
+  it('trims whitespace from entries', () => {
+    expect(normalizeFavoriteModels(['  a/foo  ', 'b/bar'])).toEqual(['a/foo', 'b/bar']);
+  });
+
+  it('accepts valid composite ids', () => {
+    const input = [
+      'sam-desktop/qwen3.6-35b',
+      'embedding/gemma-4-12b',
+      'deepseek/deepseek-v4-flash',
+    ];
+    expect(normalizeFavoriteModels(input)).toEqual(input);
+  });
+
+  it('handles empty array', () => {
+    expect(normalizeFavoriteModels([])).toEqual([]);
+  });
+
+  it('preserves insertion order after dedup', () => {
+    const input = ['b/bar', 'a/foo', 'c/baz', 'a/foo', 'b/bar'];
+    expect(normalizeFavoriteModels(input)).toEqual(['b/bar', 'a/foo', 'c/baz']);
+  });
+});
--- a/apps/server/src/services/tests/inference-helpers.test.ts
+++ b/apps/server/src/services/tests/inference-helpers.test.ts
@@ -24,7 +24,6 @@ const BASE_AGENT: Agent = {
  source: 'global',
  max_tool_calls: null,
  steps: null,
-  llama_extra_args: null,
 };

 describe('samplerOptsFromAgent', () => {
--- a/apps/server/src/services/tests/license-mit.test.ts
+++ b/apps/server/src/services/tests/license-mit.test.ts
@@ -33,7 +33,6 @@ describe('license: MIT relicense guard', () => {
  const FORMERLY_AGPL = [
    'apps/server/src/services/inference/tool-call-parser.ts',
    'apps/server/src/services/web/html-to-md.ts',
-    'apps/server/src/services/inference/llama-args-validator.ts',
  ];
  for (const rel of FORMERLY_AGPL) {
    it(`${rel} carries no AGPL / Unsloth provenance`, () => {
--- a/apps/server/src/services/tests/llama-args-validator.test.ts
+++ b/apps/server/src/services/tests/llama-args-validator.test.ts
@@ -1,160 +0,0 @@
-import { describe, expect, it } from 'vitest';
-import {
-  validateExtraArgs,
-  isManagedFlag,
-  stripShadowingFlags,
-} from '../inference/llama-args-validator.js';
-import { parseAgentsMd } from '../agents.js';
-
-describe('validateExtraArgs', () => {
-  describe('deny list — each alias rejected', () => {
-    const denied = [
-      '-m', '--model',
-      '-mu', '--model-url',
-      '-dr', '--docker-repo',
-      '-hf', '-hfr', '--hf-repo',
-      '-hff', '--hf-file',
-      '-hfv', '-hfrv', '--hf-repo-v',
-      '-hffv', '--hf-file-v',
-      '-hft', '--hf-token',
-      '-mm', '--mmproj',
-      '-mmu', '--mmproj-url',
-      '--host', '--port', '--path', '--api-prefix', '--reuse-port',
-      '--api-key', '--api-key-file',
-      '--ssl-key-file', '--ssl-cert-file',
-      '--webui', '--no-webui', '--ui', '--no-ui',
-      '--ui-config', '--ui-config-file',
-      '--ui-mcp-proxy', '--no-ui-mcp-proxy',
-      '--models-dir', '--models-preset', '--models-max',
-      '--models-autoload', '--no-models-autoload',
-    ];
-    for (const flag of denied) {
-      it(`rejects ${flag}`, () => {
-        expect(() => validateExtraArgs([flag])).toThrow(/managed/);
-      });
-    }
-  });
-
-  describe('safe flags accepted', () => {
-    const safe = [
-      '-c', '--ctx-size', '-ngl', '--gpu-layers',
-      '--top-k', '--cache-type-k', '--jinja', '--no-jinja',
-      '--spec-draft-n-max', '-fa', '--flash-attn',
-      '-t', '--threads', '-np', '--parallel',
-    ];
-    for (const flag of safe) {
-      it(`accepts ${flag}`, () => {
-        expect(() => validateExtraArgs([flag])).not.toThrow();
-        expect(validateExtraArgs([flag])).toEqual([flag]);
-      });
-    }
-  });
-
-  it('handles --flag=value shape (denies the flag part)', () => {
-    expect(() => validateExtraArgs(['--model=evil.gguf'])).toThrow(/managed/);
-  });
-
-  it('handles --flag=value shape (accepts safe flag)', () => {
-    expect(validateExtraArgs(['--ctx-size=4096'])).toEqual(['--ctx-size=4096']);
-  });
-
-  it('returns empty array for undefined input', () => {
-    expect(validateExtraArgs(undefined)).toEqual([]);
-  });
-
-  it('returns empty array for empty input', () => {
-    expect(validateExtraArgs([])).toEqual([]);
-  });
-
-  it('treats negative numbers as values, not flags', () => {
-    expect(validateExtraArgs(['--seed', '-1'])).toEqual(['--seed', '-1']);
-  });
-});
-
-describe('isManagedFlag', () => {
-  it('returns true for denied flags', () => {
-    expect(isManagedFlag('--model')).toBe(true);
-    expect(isManagedFlag('-m')).toBe(true);
-    expect(isManagedFlag('--api-key')).toBe(true);
-    expect(isManagedFlag('--port')).toBe(true);
-  });
-
-  it('returns false for safe flags', () => {
-    expect(isManagedFlag('-c')).toBe(false);
-    expect(isManagedFlag('--ctx-size')).toBe(false);
-    expect(isManagedFlag('--top-k')).toBe(false);
-  });
-});
-
-describe('stripShadowingFlags', () => {
-  it('strips auto -c when user supplies -c', () => {
-    const result = stripShadowingFlags(['-c', '4096', '--top-k', '40']);
-    expect(result).toEqual(['--top-k', '40']);
-  });
-
-  it('retains both when no overlap', () => {
-    const result = stripShadowingFlags(['--top-k', '40', '--top-p', '0.95']);
-    expect(result).toEqual(['--top-k', '40', '--top-p', '0.95']);
-  });
-
-  it('strips --ctx-size=value form', () => {
-    const result = stripShadowingFlags(['--ctx-size=4096']);
-    expect(result).toEqual([]);
-  });
-
-  it('strips boolean --jinja flag (no value consumed)', () => {
-    const result = stripShadowingFlags(['--jinja', '--top-k', '40']);
-    expect(result).toEqual(['--top-k', '40']);
-  });
-
-  it('respects stripContext=false to keep context flags', () => {
-    const result = stripShadowingFlags(['-c', '4096'], { stripContext: false });
-    expect(result).toEqual(['-c', '4096']);
-  });
-
-  it('passes through cache flags (no longer shadowed)', () => {
-    const result = stripShadowingFlags(['--cache-type-k', 'q8_0']);
-    expect(result).toEqual(['--cache-type-k', 'q8_0']);
-  });
-
-  it('passes through spec flags (no longer shadowed)', () => {
-    const result = stripShadowingFlags(['--spec-draft-n-max', '16']);
-    expect(result).toEqual(['--spec-draft-n-max', '16']);
-  });
-});
-
-describe('AGENTS.md frontmatter validation', () => {
-  it('rejects agent with managed flag in llama_extra_args', () => {
-    const md = `## Evil Agent
---
-llama_extra_args: ["--model", "evil.gguf"]
---
-You are evil.`;
-    const { agents, errors } = parseAgentsMd(md);
-    expect(agents).toHaveLength(0);
-    expect(errors).toHaveLength(1);
-    expect(errors[0]!.reason).toContain('managed');
-  });
-
-  it('accepts agent with safe llama_extra_args', () => {
-    const md = `## Good Agent
---
-llama_extra_args: ["--top-k", "20"]
---
-You are good.`;
-    const { agents, errors } = parseAgentsMd(md);
-    expect(errors).toHaveLength(0);
-    expect(agents).toHaveLength(1);
-    expect(agents[0]!.llama_extra_args).toEqual(['--top-k', '20']);
-  });
-
-  it('agent without llama_extra_args has null field', () => {
-    const md = `## Simple Agent
---
-temperature: 0.5
---
-You are simple.`;
-    const { agents } = parseAgentsMd(md);
-    expect(agents[0]!.llama_extra_args).toBeNull();
-  });
-});
--- a/apps/server/src/services/tests/model-context.test.ts
+++ b/apps/server/src/services/tests/model-context.test.ts
@@ -1,14 +1,44 @@
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
-import {
-  configureModelContext,
-  getModelContext,
-  invalidateModelContext,
-} from '../model-context.js';
+
+// ---- mock llama-providers registry -----------------------------------------
+// model-context.ts imports resolveModelProvider from inference/provider.ts,
+// which uses getLlamaProviders() from llama-providers.ts. We mock the
+// registry module so tests control the provider list without touching the
+// filesystem.
+
+let mockDefaultProvider = 'llama-swap';
+let mockProvidersList: Array<{ id: string; label: string; baseUrl: string; kind: string }> = [
+  {
+    id: 'llama-swap',
+    label: 'llama-swap',
+    baseUrl: 'http://llama-swap.test:8401',
+    kind: 'llama-swap',
+  },
+];
+
+vi.mock('../llama-providers.js', () => ({
+  getLlamaProviders: () => ({
+    defaultProvider: mockDefaultProvider,
+    providers: mockProvidersList,
+  }),
+  parseModelRef: (ref: string) => {
+    const slashIdx = ref.indexOf('/');
+    if (slashIdx <= 0) {
+      return { providerId: mockDefaultProvider, wireModelId: ref, isLegacyBareId: true };
+    }
+    return {
+      providerId: ref.slice(0, slashIdx),
+      wireModelId: ref.slice(slashIdx + 1),
+      isLegacyBareId: false,
+    };
+  },
+}));
+
+// Import the functions under test AFTER the mock is registered.
+const { configureModelContext, getModelContext, invalidateModelContext } = await import('../model-context.js');

 // ---- fixtures ---------------------------------------------------------------

-const TEST_URL = 'http://llama-swap.test:8401';
-
 function mockOkProps(n_ctx: number) {
  return new Response(
    JSON.stringify({ default_generation_settings: { n_ctx } }),
@@ -16,9 +46,28 @@ function mockOkProps(n_ctx: number) {
  );
 }

+// Legacy test config (backward-compatible { llamaSwapUrl } shape).
+const LEGACY_CONFIG = { llamaSwapUrl: 'http://llama-swap.test:8401' };
+
+// Provider-aware config for multi-provider tests.
+const MULTI_PROVIDER_CONFIG = {
+  LLAMA_SWAP_URL: 'http://llama-swap.test:8401',
+  DEEPSEEK_API_KEY: 'sk-test',
+  DEEPSEEK_BASE_URL: 'https://api.deepseek.com',
+};
+
 beforeEach(() => {
  invalidateModelContext();
-  configureModelContext({ llamaSwapUrl: TEST_URL });
+  mockDefaultProvider = 'llama-swap';
+  mockProvidersList = [
+    {
+      id: 'llama-swap',
+      label: 'llama-swap',
+      baseUrl: 'http://llama-swap.test:8401',
+      kind: 'llama-swap',
+    },
+  ];
+  configureModelContext(LEGACY_CONFIG);
 });

 afterEach(() => {
@@ -37,7 +86,7 @@ describe('getModelContext — positive cache', () => {
    // Verify the URL was constructed correctly — encodes the model name in
    // case it contains characters that would break the path.
    expect(fetchSpy).toHaveBeenCalledExactlyOnceWith(
-      `${TEST_URL}/upstream/qwen3.6/props`,
+      `${LEGACY_CONFIG.llamaSwapUrl}/upstream/qwen3.6/props`,
      expect.objectContaining({ signal: expect.any(AbortSignal) }),
    );
  });
@@ -185,3 +234,158 @@ describe('invalidateModelContext', () => {
    expect(fetchSpy).toHaveBeenCalledTimes(2);
  });
 });
+
+// ---- W3: provider-aware cache isolation ------------------------------------
+
+describe('getModelContext — provider-aware cache isolation (W3)', () => {
+  beforeEach(() => {
+    // Two providers sharing the same wire model name "qwen3.6" but on
+    // different base URLs. This is the core scenario for cache isolation.
+    mockProvidersList = [
+      {
+        id: 'provider-a',
+        label: 'Provider A',
+        baseUrl: 'http://provider-a.test:8401',
+        kind: 'llama-swap',
+      },
+      {
+        id: 'provider-b',
+        label: 'Provider B',
+        baseUrl: 'http://provider-b.test:8401',
+        kind: 'llama-swap',
+      },
+    ];
+    mockDefaultProvider = 'provider-a';
+    configureModelContext(MULTI_PROVIDER_CONFIG);
+  });
+
+  it('two providers serving the same wire model name have separate cache entries', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(32_768))   // provider-a: qwen3.6
+      .mockResolvedValueOnce(mockOkProps(16_384));   // provider-b: qwen3.6
+
+    // Both resolve to the wire model "qwen3.6" but different providers.
+    const a = await getModelContext('provider-a/qwen3.6');
+    const b = await getModelContext('provider-b/qwen3.6');
+
+    expect(a).not.toBeNull();
+    expect(a!.n_ctx).toBe(32_768);
+    expect(b).not.toBeNull();
+    expect(b!.n_ctx).toBe(16_384);
+
+    // Two separate fetches — one per provider's baseUrl.
+    expect(fetchSpy).toHaveBeenCalledTimes(2);
+    expect(fetchSpy.mock.calls[0]![0]).toContain('provider-a.test');
+    expect(fetchSpy.mock.calls[1]![0]).toContain('provider-b.test');
+  });
+
+  it('cached entry for one provider does not leak to the other', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(32_768));   // provider-a: qwen3.6
+
+    // Populate provider-a's cache.
+    await getModelContext('provider-a/qwen3.6');
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+
+    // provider-b/qwen3.6 should NOT hit provider-a's cache — it must fetch.
+    fetchSpy.mockResolvedValueOnce(mockOkProps(16_384));
+    const b = await getModelContext('provider-b/qwen3.6');
+    expect(b).not.toBeNull();
+    expect(b!.n_ctx).toBe(16_384);
+    expect(fetchSpy).toHaveBeenCalledTimes(2);
+  });
+
+  it('invalidateModelContext(key) only clears the targeted provider entry', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(32_768))   // provider-a: qwen3.6
+      .mockResolvedValueOnce(mockOkProps(16_384))   // provider-b: qwen3.6
+      .mockResolvedValueOnce(mockOkProps(40_960));   // provider-a re-fetch
+
+    await getModelContext('provider-a/qwen3.6');
+    await getModelContext('provider-b/qwen3.6');
+    invalidateModelContext('provider-a/qwen3.6'); // invalidate only provider-a's entry
+
+    // provider-a must re-fetch and observe the new value.
+    const a2 = await getModelContext('provider-a/qwen3.6');
+    expect(a2?.n_ctx).toBe(40_960);
+    expect(fetchSpy).toHaveBeenCalledTimes(3); // 2 original + 1 re-fetch
+    // provider-b must still be served from cache: same value, no extra fetch.
+    expect((await getModelContext('provider-b/qwen3.6'))?.n_ctx).toBe(16_384);
+    expect(fetchSpy).toHaveBeenCalledTimes(3);
+  });
+});
+
+// ---- W3: bare-id resolution through default provider -----------------------
+
+describe('getModelContext — bare-id resolution through default provider (W3)', () => {
+  beforeEach(() => {
+    mockProvidersList = [
+      {
+        id: 'llama-swap',
+        label: 'llama-swap',
+        baseUrl: 'http://llama-swap.test:8401',
+        kind: 'llama-swap',
+      },
+      {
+        id: 'deepseek',
+        label: 'DeepSeek',
+        baseUrl: 'https://api.deepseek.com',
+        kind: 'deepseek',
+      },
+    ];
+    mockDefaultProvider = 'llama-swap';
+    configureModelContext(MULTI_PROVIDER_CONFIG);
+  });
+
+  it('bare model id resolves through the default provider', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(8192));
+
+    const result = await getModelContext('qwen3.6');
+    expect(result).not.toBeNull();
+    expect(result!.n_ctx).toBe(8192);
+
+    // Default provider is "llama-swap", so the URL uses its baseUrl.
+    expect(fetchSpy).toHaveBeenCalledExactlyOnceWith(
+      'http://llama-swap.test:8401/upstream/qwen3.6/props',
+      expect.objectContaining({ signal: expect.any(AbortSignal) }),
+    );
+  });
+
+  it('bare id and explicit default-provider composite share a cache entry', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(8192));
+
+    // Both resolve to "llama-swap/qwen3.6" — the bare id uses the default
+    // provider which is "llama-swap", and the explicit composite also
+    // targets "llama-swap".
+    const a = await getModelContext('qwen3.6');
+    const b = await getModelContext('llama-swap/qwen3.6');
+
+    expect(a).toEqual(b);
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+  });
+
+  it('bare "deepseek-*" id returns static default without fetching', async () => {
+    const fetchSpy = vi.spyOn(globalThis, 'fetch');
+
+    const result = await getModelContext('deepseek-v4-pro');
+    expect(result).not.toBeNull();
+    expect(result!.n_ctx).toBe(131_072);
+    expect(fetchSpy).not.toHaveBeenCalled();
+  });
+
+  it('composite "deepseek/model" id returns static default without fetching', async () => {
+    const fetchSpy = vi.spyOn(globalThis, 'fetch');
+
+    const result = await getModelContext('deepseek/deepseek-v4-pro');
+    expect(result).not.toBeNull();
+    expect(result!.n_ctx).toBe(131_072);
+    expect(fetchSpy).not.toHaveBeenCalled();
+  });
+});
--- a/apps/server/src/services/tests/provider.test.ts
+++ b/apps/server/src/services/tests/provider.test.ts
@@ -1,58 +1,308 @@
-import { describe, expect, it } from 'vitest';
-import { resolveRoute, upstreamModel } from '../inference/provider.js';
+import { describe, expect, it, vi, beforeEach } from 'vitest';

-describe('resolveRoute', () => {
+// Control the mock return values from tests.
+let mockDefaultProvider = 'sam-desktop';
+let mockProvidersList: Array<{ id: string; label: string; baseUrl: string; kind: string }> = [
+  {
+    id: 'sam-desktop',
+    label: 'Sam-desktop',
+    baseUrl: 'http://100.101.41.16:8401',
+    kind: 'llama-swap',
+  },
+  {
+    id: 'embedding',
+    label: 'embedding',
+    baseUrl: 'http://100.90.172.55:8411',
+    kind: 'llama-swap',
+  },
+];
+
+vi.mock('../llama-providers.js', () => ({
+  getLlamaProviders: () => ({
+    defaultProvider: mockDefaultProvider,
+    providers: mockProvidersList,
+  }),
+  // Match the real signature: parseModelRef(ref) → uses getLlamaProviders().defaultProvider internally.
+  parseModelRef: (ref: string) => {
+    const slashIdx = ref.indexOf('/');
+    if (slashIdx <= 0) {
+      return { providerId: mockDefaultProvider, wireModelId: ref, isLegacyBareId: true };
+    }
+    return {
+      providerId: ref.slice(0, slashIdx),
+      wireModelId: ref.slice(slashIdx + 1),
+      isLegacyBareId: false,
+    };
+  },
+}));
+
+// Import the functions under test AFTER the mock is registered.
+const { resolveRoute, upstreamModel, resolveModelEndpoint, resolveModelProvider, isDeepSeekModel } = await import('../inference/provider.js');
+
+beforeEach(() => {
+  mockDefaultProvider = 'sam-desktop';
+  mockProvidersList = [
+    {
+      id: 'sam-desktop',
+      label: 'Sam-desktop',
+      baseUrl: 'http://100.101.41.16:8401',
+      kind: 'llama-swap',
+    },
+    {
+      id: 'embedding',
+      label: 'embedding',
+      baseUrl: 'http://100.90.172.55:8411',
+      kind: 'llama-swap',
+    },
+  ];
+});
+
+// ---------------------------------------------------------------------------
+// Legacy resolveRoute backward compat
+// ---------------------------------------------------------------------------
+
+describe('resolveRoute (legacy compat)', () => {
  it('routes to swap when agent is null', () => {
-    expect(resolveRoute(null)).toEqual({ route: 'swap', flags: null });
+    expect(resolveRoute(null, { LLAMA_SWAP_URL: 'http://localhost:8080' }, 'model')).toEqual({ route: 'swap' });
  });

-  it('routes to swap when agent has no llama_extra_args', () => {
-    expect(resolveRoute({ llama_extra_args: null })).toEqual({ route: 'swap', flags: null });
-  });
-
-  it('routes to swap when agent has empty llama_extra_args', () => {
-    expect(resolveRoute({ llama_extra_args: [] })).toEqual({ route: 'swap', flags: null });
-  });
-
-  it('routes to sidecar when agent has llama_extra_args', () => {
-    const result = resolveRoute({ llama_extra_args: ['--top-k', '20'] });
-    expect(result.route).toBe('sidecar');
-    expect(result.flags).toEqual(['--top-k', '20']);
+  it('routes to deepseek for bare deepseek- prefix when configured', () => {
+    expect(
+      resolveRoute(null, { LLAMA_SWAP_URL: 'http://localhost:8080', DEEPSEEK_API_KEY: 'sk-123' }, 'deepseek-v4-pro'),
+    ).toEqual({ route: 'deepseek' });
  });
 });

-describe('upstreamModel', () => {
-  const swapConfig = { LLAMA_SWAP_URL: 'http://localhost:8401' };
-  const fullConfig = {
-    LLAMA_SWAP_URL: 'http://localhost:8401',
-    LLAMA_SIDECAR_URL: 'http://localhost:8402',
+// ---------------------------------------------------------------------------
+// Provider-aware resolver: composite ids
+// ---------------------------------------------------------------------------
+
+describe('resolveModelProvider', () => {
+  const config = {
+    LLAMA_SWAP_URL: 'http://localhost:8080',
+    DEEPSEEK_API_KEY: 'sk-test',
+    DEEPSEEK_BASE_URL: 'https://api.deepseek.com',
  };

-  it('returns a model for swap route (no agent)', () => {
+  it('routes composite local provider id to its baseUrl', () => {
+    const r = resolveModelProvider('sam-desktop/qwen3.6-35b-a3b', config);
+    expect(r.route).toBe('swap');
+    expect(r.baseUrl).toBe('http://100.101.41.16:8401');
+    expect(r.wireModelId).toBe('qwen3.6-35b-a3b');
+    expect(r.providerId).toBe('sam-desktop');
+    expect(r.isLegacyBareId).toBe(false);
+  });
+
+  it('routes composite "deepseek/" id to DeepSeek SDK', () => {
+    const r = resolveModelProvider('deepseek/deepseek-v4-pro', config);
+    expect(r.route).toBe('deepseek');
+    expect(r.baseUrl).toBe('https://api.deepseek.com');
+    expect(r.wireModelId).toBe('deepseek-v4-pro');
+    expect(r.providerId).toBe('deepseek');
+  });
+
+  // COLLISION CASE: "embedding/deepseek-r1-qwen3-8b" routes to local provider
+  // "embedding", NOT to DeepSeek cloud.
+  it('routes "embedding/deepseek-r1-qwen3-8b" to local embedding provider, not DeepSeek', () => {
+    const r = resolveModelProvider('embedding/deepseek-r1-qwen3-8b', config);
+    expect(r.route).toBe('swap');
+    expect(r.baseUrl).toBe('http://100.90.172.55:8411');
+    expect(r.wireModelId).toBe('deepseek-r1-qwen3-8b');
+    expect(r.providerId).toBe('embedding');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Provider-aware resolver: bare (legacy) ids
+// ---------------------------------------------------------------------------
+
+describe('resolveModelProvider — bare id legacy fallback', () => {
+  const config = {
+    LLAMA_SWAP_URL: 'http://localhost:8080',
+    DEEPSEEK_API_KEY: 'sk-test',
+  };
+
+  it('bare id resolves through defaultProvider', () => {
+    const r = resolveModelProvider('qwen3.6-35b-a3b', config);
+    expect(r.route).toBe('swap');
+    expect(r.providerId).toBe('sam-desktop');
+    expect(r.wireModelId).toBe('qwen3.6-35b-a3b');
+    expect(r.isLegacyBareId).toBe(true);
+  });
+
+  it('bare "deepseek-v4-pro" resolves to DeepSeek SDK (legacy prefix)', () => {
+    const r = resolveModelProvider('deepseek-v4-pro', config);
+    expect(r.route).toBe('deepseek');
+    expect(r.wireModelId).toBe('deepseek-v4-pro');
+    expect(r.isLegacyBareId).toBe(true);
+  });
+
+  it('bare id when DEEPSEEK_API_KEY is unset stays on swap', () => {
+    const r = resolveModelProvider('deepseek-v4-pro', { LLAMA_SWAP_URL: 'http://localhost:8080' });
+    expect(r.route).toBe('swap');
+    expect(r.wireModelId).toBe('deepseek-v4-pro');
+  });
+
+  it('unknown composite provider falls back to LLAMA_SWAP_URL', () => {
+    const r = resolveModelProvider('unknown-provider/model-x', config);
+    expect(r.route).toBe('swap');
+    expect(r.baseUrl).toBe('http://localhost:8080');
+    expect(r.wireModelId).toBe('model-x');
+    expect(r.isLegacyBareId).toBe(false);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// upstreamModel uses the resolver
+// ---------------------------------------------------------------------------
+
+describe('upstreamModel', () => {
+  const swapConfig = { LLAMA_SWAP_URL: 'http://localhost:8401' };
+
+  it('returns a model for local composite id', () => {
+    const model = upstreamModel(swapConfig, 'sam-desktop/test-model');
+    expect(model).toBeDefined();
+    expect((model as any).modelId).toBe('test-model');
+  });
+
+  it('returns a model for bare id (legacy)', () => {
    const model = upstreamModel(swapConfig, 'test-model');
    expect(model).toBeDefined();
    expect((model as any).modelId).toBe('test-model');
  });
+});

-  it('returns a model for swap route (agent without extra args)', () => {
-    const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: null });
-    expect(model).toBeDefined();
+// ---------------------------------------------------------------------------
+// resolveModelEndpoint uses the resolver
+// ---------------------------------------------------------------------------
+
+describe('resolveModelEndpoint', () => {
+  it('resolves local composite id to provider baseUrl', () => {
+    const ep = resolveModelEndpoint(
+      { LLAMA_SWAP_URL: 'http://localhost:8080' },
+      'sam-desktop/qwen3.6-35b-a3b',
+    );
+    expect(ep.url).toBe('http://100.101.41.16:8401');
+    expect(ep.model).toBe('qwen3.6-35b-a3b');
+    expect(ep.headers['Content-Type']).toBe('application/json');
  });

-  it('returns a model for sidecar route', () => {
-    const model = upstreamModel(fullConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] });
-    expect(model).toBeDefined();
-    expect((model as any).modelId).toBe('test-model');
+  it('resolves bare id to default provider baseUrl', () => {
+    const ep = resolveModelEndpoint(
+      { LLAMA_SWAP_URL: 'http://localhost:8080' },
+      'test-model',
+    );
+    expect(ep.url).toBe('http://100.101.41.16:8401');
+    expect(ep.model).toBe('test-model');
  });

-  it('throws when sidecar route requested but URL missing', () => {
-    expect(() =>
-      upstreamModel(swapConfig, 'test-model', { llama_extra_args: ['--top-k', '20'] }),
-    ).toThrow(/LLAMA_SIDECAR_URL/);
+  it('resolves deepseek composite id to DeepSeek API with auth header', () => {
+    const ep = resolveModelEndpoint(
+      { LLAMA_SWAP_URL: 'http://localhost:8080', DEEPSEEK_API_KEY: 'sk-test' },
+      'deepseek/deepseek-v4-pro',
+    );
+    expect(ep.url).toBe('https://api.deepseek.com');
+    expect(ep.model).toBe('deepseek-v4-pro');
+    expect(ep.headers['Authorization']).toBe('Bearer sk-test');
  });

-  it('routes to swap for empty llama_extra_args array', () => {
-    const model = upstreamModel(swapConfig, 'test-model', { llama_extra_args: [] });
-    expect(model).toBeDefined();
+  // Collision case for endpoint resolution.
+  it('resolves "embedding/deepseek-r1-qwen3-8b" to embedding baseUrl, not DeepSeek', () => {
+    const ep = resolveModelEndpoint(
+      { LLAMA_SWAP_URL: 'http://localhost:8080', DEEPSEEK_API_KEY: 'sk-test' },
+      'embedding/deepseek-r1-qwen3-8b',
+    );
+    expect(ep.url).toBe('http://100.90.172.55:8411');
+    expect(ep.model).toBe('deepseek-r1-qwen3-8b');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// isDeepSeekModel (legacy prefix check, kept for stream-phase-adapter)
+// ---------------------------------------------------------------------------
+
+describe('isDeepSeekModel', () => {
+  it('returns true for deepseek- prefix', () => {
+    expect(isDeepSeekModel('deepseek-v4-pro')).toBe(true);
+  });
+
+  it('returns false for composite deepseek/', () => {
+    expect(isDeepSeekModel('deepseek/deepseek-v4-pro')).toBe(false);
+  });
+
+  it('returns false for other models', () => {
+    expect(isDeepSeekModel('qwen3.6-35b-a3b')).toBe(false);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// P4: upstreamModel additive source param
+// ---------------------------------------------------------------------------
+
+describe('upstreamModel source param (P4)', () => {
+  const swapConfig = { LLAMA_SWAP_URL: 'http://localhost:8401' };
+
+  it('accepts optional source parameter without breaking existing calls', () => {
+    const model1 = upstreamModel(swapConfig, 'sam-desktop/test-model');
+    const model2 = upstreamModel(swapConfig, 'sam-desktop/test-model', undefined, 'boochat');
+    expect(model1).toBeDefined();
+    expect(model2).toBeDefined();
+    expect((model1 as any).modelId).toBe('test-model');
+    expect((model2 as any).modelId).toBe('test-model');
+  });
+
+  it('creates distinct cached providers for different source values', () => {
+    const modelNoSource = upstreamModel(swapConfig, 'sam-desktop/test-model');
+    const modelBoochat = upstreamModel(swapConfig, 'sam-desktop/test-model', undefined, 'boochat');
+    const modelBoocoder = upstreamModel(swapConfig, 'sam-desktop/test-model', undefined, 'boocoder');
+    expect(modelNoSource).toBeDefined();
+    expect(modelBoochat).toBeDefined();
+    expect(modelBoocoder).toBeDefined(); // NOTE(review): only definedness is asserted; per-source provider distinctness is not checked here — TODO confirm how to observe it
+  });
+});
+
+// ---------------------------------------------------------------------------
+// P7: gateway routing (auto:* virtual models)
+// ---------------------------------------------------------------------------
+
+describe('resolveModelProvider — gateway routing (P7)', () => {
+  const config = { LLAMA_SWAP_URL: 'http://localhost:8080' };
+
+  it('routes a known gateway-kind provider to route "gateway"', () => {
+    mockProvidersList = [
+      ...mockProvidersList,
+      { id: 'auto', label: 'Auto (gateway)', baseUrl: 'http://100.114.205.53:9503', kind: 'boocontrol-gateway' },
+    ];
+    const r = resolveModelProvider('auto/auto:code', config);
+    expect(r.route).toBe('gateway');
+    expect(r.baseUrl).toBe('http://100.114.205.53:9503');
+    expect(r.wireModelId).toBe('auto:code');
+    expect(r.providerId).toBe('auto');
+  });
+
+  it('resolves an orphaned auto:* session to gateway_error, never swap', () => {
+    // No gateway provider in the registry — the entry was removed.
+    const r = resolveModelProvider('auto/auto:code', config);
+    expect(r.route).toBe('gateway_error');
+    expect(r.gatewayReason).toBe('offline');
+    expect(r.baseUrl).not.toBe(config.LLAMA_SWAP_URL);
+  });
+
+  it('upstreamModel throws a clean error for gateway_error', () => {
+    expect(() => upstreamModel(config, 'auto/auto:fast')).toThrow(/routing gateway offline/);
+  });
+
+  it('resolveModelEndpoint throws a clean error for gateway_error', () => {
+    expect(() => resolveModelEndpoint(config, 'auto/auto:fast')).toThrow(/routing gateway offline/);
+  });
+
+  it('upstreamModel returns a model for a live gateway', () => {
+    mockProvidersList = [
+      ...mockProvidersList,
+      { id: 'auto', label: 'Auto (gateway)', baseUrl: 'http://100.114.205.53:9503', kind: 'boocontrol-gateway' },
+    ];
+    const model = upstreamModel(config, 'auto/auto:code');
+    expect(model).toBeDefined();
+    expect((model as any).modelId).toBe('auto:code');
  });
 });
--- a/apps/server/src/services/tests/step-decision.test.ts
+++ b/apps/server/src/services/tests/step-decision.test.ts
@@ -25,7 +25,6 @@ const BASE_AGENT: Agent = {
  source: 'global',
  max_tool_calls: null,
  steps: null,
-  llama_extra_args: null,
 };

 function call(name: string, args: Record<string, unknown> = {}): ToolCall {
--- a/apps/server/src/services/agents.ts
+++ b/apps/server/src/services/agents.ts
@@ -2,7 +2,7 @@ import { promises as fs } from 'node:fs';
 import { join } from 'node:path';
 import type { Agent, AgentsResponse, AgentParseError } from '../types/api.js';
 import { ALL_TOOLS, resolveToolTier } from './tools.js';
-import { validateExtraArgs } from './inference/llama-args-validator.js';
+
 import { stripQuotes } from '../utils/string-utils.js';

 // v1.8.1: global agents live at /data/AGENTS.md inside the container
@@ -105,7 +105,7 @@ interface ParsedFrontmatter {
  // (200) in the outer loop. Integer ≥ 0; steps: 0 means "no tool calls
  // allowed" — the model responds text-only.
  steps?: number;
-  llama_extra_args?: string[];
+
  // vDeepSeek: thinking effort for DeepSeek V4 models.
  reasoning_effort?: string;
 }
@@ -253,34 +253,7 @@ function parseFrontmatter(yaml: string): { data: ParsedFrontmatter; errors: stri
      } else {
        errors.push(`steps must be a non-negative integer (got "${valueRaw}")`);
      }
-    } else if (key === 'llama_extra_args') {
-      if (valueRaw === '') {
-        data.llama_extra_args = [];
-        // No arrayKey support — llama_extra_args uses inline list only.
-      } else if (valueRaw.startsWith('[') && valueRaw.endsWith(']')) {
-        const inner = valueRaw.slice(1, -1);
-        const parsed = inner
-          .split(',')
-          .map((s) => stripQuotes(s.trim()))
-          .filter((s) => s.length > 0);
-        try {
-          validateExtraArgs(parsed);
-          data.llama_extra_args = parsed;
-        } catch (err) {
-          errors.push(err instanceof Error ? err.message : String(err));
-        }
-      } else {
-        const parsed = valueRaw
-          .split(',')
-          .map((s) => stripQuotes(s.trim()))
-          .filter((s) => s.length > 0);
-        try {
-          validateExtraArgs(parsed);
-          data.llama_extra_args = parsed;
-        } catch (err) {
-          errors.push(err instanceof Error ? err.message : String(err));
-        }
-      }
+
    }
    // Unknown keys silently ignored — forward-compat.
  }
@@ -387,7 +360,7 @@ function parseAgentSection(section: RawSection): Omit<Agent, 'source'> {
    model: typeof fm.model === 'string' && fm.model.length > 0 ? fm.model : null,
    max_tool_calls: typeof fm.max_tool_calls === 'number' ? fm.max_tool_calls : null,
    steps: typeof fm.steps === 'number' ? fm.steps : null,
-    llama_extra_args: Array.isArray(fm.llama_extra_args) ? fm.llama_extra_args : null,
+
    reasoning_effort: typeof fm.reasoning_effort === 'string' ? (fm.reasoning_effort as Agent['reasoning_effort']) : null,
  };
 }
--- a/apps/server/src/services/compaction.ts
+++ b/apps/server/src/services/compaction.ts
@@ -357,7 +357,7 @@ async function callLlm(
  const { url, headers, model: resolvedModel } = resolveModelEndpoint(config, model);
  const res = await fetch(`${url}/v1/chat/completions`, {
    method: 'POST',
-    headers,
+    headers: { ...headers, 'X-Boo-Source': 'boochat' },
    body: JSON.stringify({ model: resolvedModel, messages, stream: false }),
  });
  if (!res.ok) {
@@ -525,9 +525,11 @@ export async function process(input: ProcessInput): Promise<void> {
    // 7. Single completion (no tools). Throws on llama-swap failure.
    result = await callLlm(config, session.model, payload, log);

-    // 7b. v1.11.3: fetch the model's true context window from llama-swap's
-    // /upstream/<model>/props (the streaming completion doesn't carry it).
+    // 7b. v1.11.3: fetch the model's true context window from the provider's
+    // /upstream/<wireModelId>/props (the streaming completion doesn't carry it).
    // Same pattern as inference.ts; the cache makes repeated calls free.
+    // v2.x (W3): getModelContext resolves composite model ids through the
+    // provider registry (via its stored config), not a process-wide LLAMA_SWAP_URL.
    const mctx = await modelContextLookup.getModelContext(session.model);
    const nCtx = mctx?.n_ctx ?? null;

--- a/apps/server/src/services/inference/llama-args-validator.ts
+++ b/apps/server/src/services/inference/llama-args-validator.ts
@@ -1,209 +0,0 @@
-// Guards against agent-supplied llama-server CLI flags that would clash with
-// values BooCode sets itself. Two concerns live here:
-//
-//   1. A hard denylist of flags that BooCode owns outright (model selection,
-//      the listening socket, credentials, the bundled web UI). Passing any of
-//      these is a configuration error and is rejected loudly.
-//
-//   2. A "shadowing" set of flags that are legal to pass but, because of
-//      llama.cpp's last-wins argument parsing, would override a first-class
-//      BooCode setting. These are silently removed from the auto-generated
-//      argv so the agent's explicit choice takes precedence without leaving a
-//      duplicate flag behind.
-//
-// All flag spellings below are the public llama-server option names (short and
-// long aliases) documented in its --help output.
-
-// --- Hard denylist -------------------------------------------------------
-
-// Authored as named buckets purely for readability; every alias is folded
-// into one flat lookup set at module load. Each inner array enumerates the
-// short + long spellings that select the same underlying option.
-const MODEL_SOURCE_FLAGS = [
-  ['-m', '--model'],
-  ['-mu', '--model-url'],
-  ['-dr', '--docker-repo'],
-  ['-hf', '-hfr', '--hf-repo'],
-  ['-hff', '--hf-file'],
-  ['-hfv', '-hfrv', '--hf-repo-v'],
-  ['-hffv', '--hf-file-v'],
-  ['-hft', '--hf-token'],
-  ['-mm', '--mmproj'],
-  ['-mmu', '--mmproj-url'],
-];
-
-const LISTEN_FLAGS = [
-  ['--host'],
-  ['--port'],
-  ['--path'],
-  ['--api-prefix'],
-  ['--reuse-port'],
-];
-
-const CREDENTIAL_FLAGS = [
-  ['--api-key'],
-  ['--api-key-file'],
-  ['--ssl-key-file'],
-  ['--ssl-cert-file'],
-];
-
-const WEBUI_FLAGS = [
-  ['--webui', '--no-webui'],
-  ['--ui', '--no-ui'],
-  ['--ui-config'],
-  ['--ui-config-file'],
-  ['--ui-mcp-proxy', '--no-ui-mcp-proxy'],
-  ['--models-dir'],
-  ['--models-preset'],
-  ['--models-max'],
-  ['--models-autoload', '--no-models-autoload'],
-];
-
-const MANAGED_FLAGS: ReadonlySet<string> = new Set(
-  [
-    ...MODEL_SOURCE_FLAGS,
-    ...LISTEN_FLAGS,
-    ...CREDENTIAL_FLAGS,
-    ...WEBUI_FLAGS,
-  ].flat(),
-);
-
-// --- Token parsing -------------------------------------------------------
-
-const DIGIT = /^[0-9]$/;
-
-/**
- * Extract the flag name from a single argv token, or `null` when the token is
- * not a flag.
- *
- * A token is treated as a flag only when it begins with `-` and the character
- * after the leading dash is neither a digit nor a decimal point — that rule
- * keeps negative numeric values such as `-1` or `-0.5` from being mistaken for
- * options. A bare `-` or `--` is not a flag either. The returned name is the
- * portion before any `=`, so `--ctx-size=4096` yields `--ctx-size`.
- */
-function parseFlag(token: string): string | null {
-  if (!token.startsWith('-')) return null;
-  if (token === '-' || token === '--') return null;
-
-  const second = token[1]!;
-  if (DIGIT.test(second) || second === '.') return null;
-
-  const eq = token.indexOf('=');
-  return eq === -1 ? token : token.slice(0, eq);
-}
-
-// --- Public API ----------------------------------------------------------
-
-/**
- * Validate a sequence of extra llama-server args, rejecting any that name a
- * BooCode-managed flag. Returns the args materialised as a string[] when they
- * all pass.
- */
-export function validateExtraArgs(args?: Iterable<string>): string[] {
-  const result: string[] = [];
-  if (!args) return result;
-
-  for (const entry of args) {
-    const token = String(entry);
-    const flag = parseFlag(token);
-    if (flag !== null && MANAGED_FLAGS.has(flag)) {
-      throw new Error(
-        `llama-server flag '${flag}' is managed and cannot be passed as an extra arg`,
-      );
-    }
-    result.push(token);
-  }
-
-  return result;
-}
-
-/** True when `flag` is a BooCode-managed flag that callers may not override. */
-export function isManagedFlag(flag: string): boolean {
-  return MANAGED_FLAGS.has(flag);
-}
-
-// --- Shadowing flags -----------------------------------------------------
-
-// Flags below are legal for an agent to pass, but each shadows a setting
-// BooCode applies itself. They are categorised so a caller can opt out of
-// stripping any one category.
-
-const SHADOW_CONTEXT = ['-c', '--ctx-size'];
-
-// Empty: agents should be able to opt into cache-type flags (lift analysis
-// found these are high-value features, not safety concerns).
-const SHADOW_CACHE: string[] = [];
-
-// Empty: ngram speculative decoding is a performance feature agents should
-// be able to enable.
-const SHADOW_SPEC: string[] = [];
-
-const SHADOW_TEMPLATE = [
-  '--chat-template',
-  '--chat-template-file',
-  '--chat-template-kwargs',
-  '--jinja',
-  '--no-jinja',
-];
-
-// Shadowing flags that take no value — a boolean switch — so the stripper must
-// not also drop the following token.
-const VALUELESS_SHADOW_FLAGS: ReadonlySet<string> = new Set([
-  '--jinja',
-  '--no-jinja',
-]);
-
-export interface StripOptions {
-  stripContext?: boolean;
-  stripCache?: boolean;
-  stripSpec?: boolean;
-  stripTemplate?: boolean;
-}
-
-/**
- * Remove shadowing flags (and their values) from an argv sequence.
- *
- * Each category is stripped by default; pass the matching `strip*: false`
- * option to retain that category. When a stripped flag carries its value as a
- * separate following token (e.g. `-c 4096`), that token is removed too; the
- * `--flag=value` and boolean-switch forms consume only the single token.
- */
-export function stripShadowingFlags(
-  args: Iterable<string>,
-  opts?: StripOptions,
-): string[] {
-  const targets = new Set<string>();
-  if (opts?.stripContext !== false) for (const f of SHADOW_CONTEXT) targets.add(f);
-  if (opts?.stripCache !== false) for (const f of SHADOW_CACHE) targets.add(f);
-  if (opts?.stripSpec !== false) for (const f of SHADOW_SPEC) targets.add(f);
-  if (opts?.stripTemplate !== false) for (const f of SHADOW_TEMPLATE) targets.add(f);
-
-  const tokens = Array.from(args, String);
-  const kept: string[] = [];
-
-  for (let i = 0; i < tokens.length; i++) {
-    const token = tokens[i]!;
-    const flag = parseFlag(token);
-
-    // Not a targeted shadow flag — keep it verbatim.
-    if (flag === null || !targets.has(flag)) {
-      kept.push(token);
-      continue;
-    }
-
-    // Targeted: drop it. Decide whether the next token is its value and should
-    // be dropped along with it. Boolean switches and the inline `=value` form
-    // carry no separate value token.
-    const carriesInlineValue = token.includes('=');
-    const isBoolean = VALUELESS_SHADOW_FLAGS.has(flag);
-    const next = tokens[i + 1];
-    const nextIsValue = next !== undefined && parseFlag(next) === null;
-
-    if (!isBoolean && !carriesInlineValue && nextIsValue) {
-      i++; // also skip the value token
-    }
-  }
-
-  return kept;
-}
--- a/apps/server/src/services/inference/provider.ts
+++ b/apps/server/src/services/inference/provider.ts
@@ -1,6 +1,7 @@
 import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
 import { createDeepSeek } from '@ai-sdk/deepseek';
 import type { LanguageModel } from 'ai';
+import { getLlamaProviders, parseModelRef } from '../llama-providers.js';

 // v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
 // config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
@@ -8,48 +9,46 @@ import type { LanguageModel } from 'ai';
 // Tailscale topology and exposing it over the public internet is gated by
 // Authelia at the Caddy layer, not by API keys.
 //
-// v2.4.1-sidecar: when the agent has llama_extra_args, route through
-// llama-sidecar instead. A fresh provider is created per call (not cached)
-// because the X-Agent-Flags header varies per agent. The llama-swap path
-// stays cached since it has no per-request headers.
-//
-// vDeepSeek: when the model ID starts with 'deepseek-' and DEEPSEEK_API_KEY
-// is set, route through the official @ai-sdk/deepseek provider (not
-// openai-compatible) so DeepSeek-specific features work: providerMetadata
-// with promptCacheHitTokens/promptCacheMissTokens, reasoning via
-// LanguageModelV4Usage.outputTokens.reasoning, and thinking-mode options.
+// v2.x: provider-aware resolver (W2). One resolver answers provider identity,
+// upstream base URL, final wire model id, and DeepSeek
+// special handling. Both upstreamModel() and resolveModelEndpoint() go through
+// it. Legacy bare-id prefix heuristics live only in the fallback layer.

 const swapCache = new Map<string, ReturnType<typeof createOpenAICompatible>>();

-function getSwapProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
-  let provider = swapCache.get(baseURL);
+function getSwapProvider(baseURL: string, source?: string): ReturnType<typeof createOpenAICompatible> {
+  const cacheKey = source ? `${baseURL}||${source}` : baseURL;
+  let provider = swapCache.get(cacheKey);
  if (!provider) {
+    // When a source is given, tag every upstream request with X-Boo-Source.
+    // Headers are normalized through the Headers class rather than
+    // object-spread: init.headers is a HeadersInit, and spreading a Headers
+    // instance or tuple array would silently drop or mangle caller headers.
+    const fetchWrapper = source
+      ? ((...args: Parameters<typeof fetch>) => {
+          const [input, init] = args;
+          const headers = new Headers(init?.headers);
+          headers.set('X-Boo-Source', source);
+          return fetch(input, { ...init, headers });
+        })
+      : undefined;
    provider = createOpenAICompatible({
      name: 'llama-swap',
      baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`,
      includeUsage: true,
-    });
-    swapCache.set(baseURL, provider);
+      ...(fetchWrapper ? { fetch: fetchWrapper } : {}),
+    }) as ReturnType<typeof createOpenAICompatible>;
+    swapCache.set(cacheKey, provider);
  }
  return provider;
 }

-function sidecarProvider(
-  baseURL: string,
-  flags: string[],
-): ReturnType<typeof createOpenAICompatible> {
-  return createOpenAICompatible({
-    name: 'llama-sidecar',
-    baseURL: baseURL.endsWith('/v1') ? baseURL : `${baseURL}/v1`,
-    includeUsage: true,
-    headers: {
-      'X-Agent-Flags': flags.join(' '),
-    },
-  });
-}
-
 const DEEPSEEK_MODEL_PREFIX = 'deepseek-';

+/**
+ * Legacy prefix check — kept for backward compat with bare "deepseek-*" ids.
+ * Composite "deepseek/model" is identified by provider id, not prefix.
+ */
 export function isDeepSeekModel(modelId: string): boolean {
  return modelId.startsWith(DEEPSEEK_MODEL_PREFIX);
 }
@@ -69,69 +68,204 @@ function getDeepSeekProvider(
  return deepseekProviderCache;
 }

-export type InferenceRoute = 'swap' | 'sidecar' | 'deepseek';
+// ---------------------------------------------------------------------------
+// Provider-aware resolver (W2, D-2, D-3)
+// ---------------------------------------------------------------------------

-export interface RoutingInfo {
+// P7: 'gateway' routes to the BooControl auto:* gateway (OpenAI-compatible,
+// does its own policy routing + failover). 'gateway_error' is the
+// present-but-unhealthy / orphaned-session state: the session selected an
+// auto:* model but the gateway provider is missing/disabled, so we surface a
+// clean error instead of silently mis-routing to LLAMA_SWAP_URL.
+export type InferenceRoute = 'swap' | 'deepseek' | 'gateway' | 'gateway_error';
+
+/** Provider registry `kind` marking the BooControl routing gateway. */
+export const GATEWAY_KIND = 'boocontrol-gateway';
+
+/**
+ * Whether a (bare) wire model id is a gateway virtual model. Used to detect an
+ * orphaned auto:* session whose gateway registry entry was removed — the id
+ * still looks like a gateway model, so resolve to gateway_error, never swap.
+ */
+export function isGatewayVirtualModel(wireModelId: string): boolean {
+  return wireModelId === 'auto' || wireModelId.startsWith('auto:');
+}
+
+export interface ResolvedModel {
+  /** Routing destination; 'gateway_error' means there is no usable upstream. */
  route: InferenceRoute;
-  flags: string[] | null;
+  /** Upstream base URL (DeepSeek API, llama-swap, or gateway); '' for route 'gateway_error'. */
+  baseUrl: string;
+  /** Wire model id to send upstream (bare, no provider prefix). */
+  wireModelId: string;
+  /** Whether the input was a legacy bare id resolved through defaultProvider. */
+  isLegacyBareId: boolean;
+  /** Provider identity (e.g. "sam-desktop", "deepseek"; "llama-swap" for the legacy bare-id fallback). */
+  providerId: string;
+  /** For route 'gateway_error': why the gateway is unavailable. */
+  gatewayReason?: 'offline' | 'unhealthy';
 }

 interface AgentLike {
-  llama_extra_args: string[] | null;
+  // reserved for future per-agent routing attributes
 }

 interface ConfigLike {
  LLAMA_SWAP_URL: string;
-  LLAMA_SIDECAR_URL?: string;
  DEEPSEEK_API_KEY?: string;
  DEEPSEEK_BASE_URL?: string;
 }

+/**
+ * Provider-aware model resolver. Given a (possibly bare) model id, answers:
+ * provider identity, upstream base URL, final bare wire model id, and
+ * DeepSeek special handling.
+ *
+ * Bare ids resolve via defaultProvider (D-2). Composite "provider/model" ids
+ * look up the named provider directly. DeepSeek is identified by provider id
+ * "deepseek" or by the legacy bare "deepseek-" prefix when DEEPSEEK_API_KEY
+ * is configured.
+ */
+export function resolveModelProvider(
+  modelId: string,
+  config: ConfigLike,
+): ResolvedModel {
+  const providers = getLlamaProviders();
+  const parsed = parseModelRef(modelId);
+  const { providerId, wireModelId, isLegacyBareId } = parsed;
+
+  const deepseekConfigured = !!config.DEEPSEEK_API_KEY;
+  const deepseekBaseUrl = (config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com').replace(/\/+$/, '');
+
+  // --- DeepSeek routing ---
+  // Explicit provider id "deepseek" → DeepSeek SDK.
+  if (providerId === 'deepseek' && deepseekConfigured) {
+    return {
+      route: 'deepseek',
+      baseUrl: deepseekBaseUrl,
+      wireModelId,
+      isLegacyBareId,
+      providerId: 'deepseek',
+    };
+  }
+
+  // Bare legacy "deepseek-*" prefix (only when DEEPSEEK_API_KEY is set) →
+  // legacy fallback layer — DeepSeek SDK.
+  if (isLegacyBareId && isDeepSeekModel(wireModelId) && deepseekConfigured) {
+    return {
+      route: 'deepseek',
+      baseUrl: deepseekBaseUrl,
+      wireModelId,
+      isLegacyBareId: true,
+      providerId: 'deepseek',
+    };
+  }
+
+  // --- Local provider routing ---
+  const provider = providers.providers.find((p) => p.id === providerId);
+
+  // --- Gateway routing (P7) ---
+  // A known gateway-kind provider → route to the gateway as an OpenAI-compatible
+  // upstream (it does its own policy routing). The gateway forwards X-Boo-Source
+  // to the chosen target so attribution survives the extra hop.
+  if (provider && provider.kind === GATEWAY_KIND) {
+    return {
+      route: 'gateway',
+      baseUrl: provider.baseUrl,
+      wireModelId,
+      isLegacyBareId,
+      providerId: provider.id,
+    };
+  }
+
+  if (!provider) {
+    // Orphaned auto:* session: the model still looks like a gateway virtual
+    // model but no gateway provider is configured. Resolve to a clean
+    // gateway_error — NEVER the silent LLAMA_SWAP_URL fallback (design §8).
+    if (isGatewayVirtualModel(wireModelId)) {
+      return {
+        route: 'gateway_error',
+        baseUrl: '',
+        wireModelId,
+        isLegacyBareId,
+        providerId,
+        gatewayReason: 'offline',
+      };
+    }
+    // Unknown provider — fall back to legacy LLAMA_SWAP_URL for bare ids.
+    if (isLegacyBareId) {
+      return {
+        route: 'swap',
+        baseUrl: config.LLAMA_SWAP_URL,
+        wireModelId,
+          isLegacyBareId: true,
+        providerId: 'llama-swap',
+      };
+    }
+    // Composite id with unknown provider — still route to LLAMA_SWAP_URL as
+    // a best-effort fallback (the wire model id carries provider intent but
+    // the config is incomplete).
+    return {
+      route: 'swap',
+      baseUrl: config.LLAMA_SWAP_URL,
+      wireModelId,
+      isLegacyBareId: false,
+      providerId,
+    };
+  }
+
+  return {
+    route: 'swap',
+    baseUrl: provider.baseUrl,
+    wireModelId,
+    isLegacyBareId,
+    providerId: provider.id,
+  };
+}
+
+/**
+ * @deprecated Use resolveModelProvider() for full routing info. Kept for
+ * backward compat with resolveRoute() callers that only need the route tag.
+ */
 export function resolveRoute(
  agent: AgentLike | null,
  config?: ConfigLike,
  modelId?: string,
-): RoutingInfo {
-  // vDeepSeek: if the model starts with deepseek- and DEEPSEEK_API_KEY is set,
-  // route through the DeepSeek provider. Checked first so DeepSeek models
-  // always bypass llama-swap/sidecar even when those are also configured.
-  if (modelId?.startsWith(DEEPSEEK_MODEL_PREFIX) && config?.DEEPSEEK_API_KEY) {
-    return { route: 'deepseek', flags: null };
-  }
-  // When llama_extra_args are explicitly set, route through sidecar with them.
-  const flags = agent?.llama_extra_args;
-  if (flags && flags.length > 0) {
-    return { route: 'sidecar', flags };
-  }
-  // When LLAMA_SIDECAR_URL is configured (even without per-agent flags),
-  // route through sidecar to pick up the default base args (cache quant,
-  // spec decoding, slot save, etc.). Fall back to llama-swap otherwise.
-  if (config?.LLAMA_SIDECAR_URL) {
-    return { route: 'sidecar', flags: [] };
-  }
-  return { route: 'swap', flags: null };
+): { route: InferenceRoute } {
+  if (!modelId || !config) return { route: 'swap' };
+  const resolved = resolveModelProvider(modelId, config);
+  return { route: resolved.route };
 }

 export function upstreamModel(
  config: ConfigLike,
  modelId: string,
  agent?: AgentLike | null,
+  source?: string,
 ): LanguageModel {
-  const { route, flags } = resolveRoute(agent ?? null, config, modelId);
-  if (route === 'deepseek') {
+  const resolved = resolveModelProvider(modelId, config);
+  if (resolved.route === 'deepseek') {
    return getDeepSeekProvider(
      config.DEEPSEEK_API_KEY!,
-      config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com',
-    ).chat(modelId);
+      resolved.baseUrl,
+    ).chat(resolved.wireModelId);
  }
-  if (route === 'sidecar') {
-    const url = config.LLAMA_SIDECAR_URL;
-    if (!url) {
-      throw new Error(`Sidecar route selected but LLAMA_SIDECAR_URL is not set`);
-    }
-    return sidecarProvider(url, (flags ?? [])).chatModel(modelId);
+
+  // P7: gateway is OpenAI-compatible — same adapter as swap, pointed at the
+  // gateway baseUrl. The gateway resolves the policy + forwards X-Boo-Source.
+  if (resolved.route === 'gateway') {
+    return getSwapProvider(resolved.baseUrl, source).chatModel(resolved.wireModelId);
  }
-  return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
+
+  // P7: orphaned auto:* session with no gateway configured — fail loud rather
+  // than silently mis-route to LLAMA_SWAP_URL.
+  if (resolved.route === 'gateway_error') {
+    throw new Error(
+      `routing gateway offline (${resolved.gatewayReason ?? 'unavailable'}): ${modelId}`,
+    );
+  }
+
+  return getSwapProvider(resolved.baseUrl, source).chatModel(resolved.wireModelId);
 }

 /** Resolve the API endpoint for non-streaming calls (compaction, task-model).
@@ -140,18 +274,30 @@ export function resolveModelEndpoint(
  config: ConfigLike,
  modelId: string,
 ): { url: string; model: string; headers: Record<string, string> } {
+  const resolved = resolveModelProvider(modelId, config);
  const baseHeaders: Record<string, string> = { 'Content-Type': 'application/json' };
-  if (modelId.startsWith(DEEPSEEK_MODEL_PREFIX) && config.DEEPSEEK_API_KEY) {
-    const baseURL = (config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com').replace(/\/+$/, '');
+
+  if (resolved.route === 'deepseek') {
    return {
-      url: baseURL,
-      model: modelId,
+      url: resolved.baseUrl,
+      model: resolved.wireModelId,
      headers: { ...baseHeaders, Authorization: `Bearer ${config.DEEPSEEK_API_KEY}` },
    };
  }
+
+  // P7: orphaned auto:* session with no gateway — fail loud (no swap fallback).
+  if (resolved.route === 'gateway_error') {
+    throw new Error(
+      `routing gateway offline (${resolved.gatewayReason ?? 'unavailable'}): ${modelId}`,
+    );
+  }
+
+  // P7: gateway uses the same unauthenticated OpenAI-compatible shape as swap.
+  // X-Boo-Source forwarding for direct-fetch callers happens at their own header
+  // layer (compaction.ts / task-model.ts); the gateway re-forwards it onward.
  return {
-    url: config.LLAMA_SWAP_URL.replace(/\/+$/, ''),
-    model: modelId,
+    url: resolved.baseUrl.replace(/\/+$/, ''),
+    model: resolved.wireModelId,
    headers: baseHeaders,
  };
 }
--- a/apps/server/src/services/inference/stream-phase-adapter.ts
+++ b/apps/server/src/services/inference/stream-phase-adapter.ts
@@ -306,7 +306,7 @@ export async function streamCompletion(
    : stallAc.signal;

  const result = streamText({
-    model: upstreamModel(ctx.config, model, agent ?? null),
+    model: upstreamModel(ctx.config, model, agent ?? null, 'boochat'),
    messages: aiMessages,
    ...(aiTools
      ? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall }
--- a/apps/server/src/services/llama-providers.ts
+++ b/apps/server/src/services/llama-providers.ts
@@ -0,0 +1,101 @@
+/**
+ * vMultiProvider local provider registry loader (server-side).
+ *
+ * Reads the shared `/data/llama-providers.json` (or `LLAMA_PROVIDERS_PATH`) at
+ * startup and caches the parsed result. When the file is absent or invalid,
+ * synthesizes a single legacy provider from `LLAMA_SWAP_URL` so both apps
+ * start with only legacy env vars (D-1).
+ *
+ * Schema and pure helpers live in @boocode/contracts/llama-providers.
+ * File I/O stays app-local per D-1.
+ */
+import { readFileSync } from 'node:fs';
+import {
+  LlamaProvidersFileSchema,
+  type LlamaProvidersFile,
+  type LlamaProvider,
+  type ParsedModelRef,
+  parseModelRef as parseModelRefBase,
+  formatModelRef,
+} from '@boocode/contracts/llama-providers';
+
+export { formatModelRef, type LlamaProvidersFile, type LlamaProvider, type ParsedModelRef };
+
+/** Synthesize a single legacy provider from env vars. */
+function buildLegacyProvider(llamaSwapUrl: string): LlamaProvidersFile {
+  return {
+    defaultProvider: 'llama-swap',
+    providers: [
+      {
+        id: 'llama-swap',
+        label: 'llama-swap',
+        baseUrl: llamaSwapUrl,
+        kind: 'llama-swap',
+      },
+    ],
+  };
+}
+
+let cached: LlamaProvidersFile | null = null;
+
+/**
+ * Load (or re-load) and cache the provider config. Never throws: an unset path,
+ * unreadable file, bad JSON, or schema failure yields the legacy single provider.
+ */
+export function loadLlamaProviders(
+  providersPath: string | undefined,
+  llamaSwapUrl: string,
+): LlamaProvidersFile {
+  if (!providersPath) {
+    cached = buildLegacyProvider(llamaSwapUrl);
+    return cached;
+  }
+
+  let raw: string;
+  try {
+    raw = readFileSync(providersPath, 'utf8');
+  } catch (err) {
+    console.warn(
+      `llama-providers: cannot read ${providersPath} (${err instanceof Error ? err.message : String(err)}) — falling back to legacy single-provider`,
+    );
+    cached = buildLegacyProvider(llamaSwapUrl);
+    return cached;
+  }
+
+  let json: unknown;
+  try {
+    json = JSON.parse(raw);
+  } catch (err) {
+    console.error(
+      `llama-providers: invalid JSON in ${providersPath} — falling back to legacy single-provider`,
+      err,
+    );
+    cached = buildLegacyProvider(llamaSwapUrl);
+    return cached;
+  }
+
+  const parsed = LlamaProvidersFileSchema.safeParse(json);
+  if (!parsed.success) {
+    console.error(
+      `llama-providers: schema validation failed for ${providersPath} — falling back to legacy single-provider`,
+      parsed.error.flatten(),
+    );
+    cached = buildLegacyProvider(llamaSwapUrl);
+    return cached;
+  }
+
+  cached = parsed.data;
+  return cached;
+}
+
+/** The cached provider config. Before loadLlamaProviders() has run, returns an uncached legacy fallback pinned to http://localhost:8080 (NOT the configured LLAMA_SWAP_URL). */
+export function getLlamaProviders(): LlamaProvidersFile {
+  return cached ?? buildLegacyProvider('http://localhost:8080');
+}
+
+/**
+ * Convenience: parse a model ref against the cached default provider.
+ */
+export function parseModelRef(ref: string): ParsedModelRef {
+  return parseModelRefBase(ref, getLlamaProviders().defaultProvider);
+}
--- a/apps/server/src/services/model-context.ts
+++ b/apps/server/src/services/model-context.ts
@@ -1,13 +1,15 @@
-// v1.11.3: llama-swap model-context cache. Replaces the dead
+// v2.x: provider-aware model-context cache (W3). Replaces the dead
 // `parsed.timings.n_ctx` capture in inference.ts / compaction.ts —
 // llama-server's streaming completion never emits n_ctx in timings (verified
 // empirically: timings carries prompt_n / predicted_n / *_ms / *_per_second
-// only). The authoritative source is llama-swap's
-// /upstream/<model>/props endpoint at .default_generation_settings.n_ctx.
+// only). The authoritative source is the provider's
+// /upstream/<wireModelId>/props endpoint at .default_generation_settings.n_ctx.
 //
 // Cache design:
+//   - Keys are the full composite model id (provider/model) so two providers
+//     serving the same wire model name never share cache entries (D-2).
 //   - Positive entries (n_ctx + total_slots) have no TTL. A model's context
-//     size doesn't change while llama-swap is running; an admin endpoint
+//     size doesn't change while the provider is running; an admin endpoint
 //     can invalidateModelContext() if it ever does.
 //   - Negative entries (failed fetch) have a 60s TTL so a misconfigured or
 //     down model doesn't get hammered every inference turn, but recovers
@@ -15,6 +17,11 @@
 //   - 3s AbortController timeout on the fetch — long enough for a healthy
 //     upstream, short enough that a stuck upstream doesn't block the
 //     ctx_max UPDATE that follows.
+//
+// v1.x legacy: previously keyed by bare wire id and used a process-wide
+// LLAMA_SWAP_URL. Now resolved per-call via the provider registry.
+
+import { resolveModelProvider } from './inference/provider.js';

 export interface ModelContext {
  n_ctx: number;
@@ -28,29 +35,79 @@ const positiveCache = new Map<string, ModelContext>();
 // re-fetches within the 60s window.
 const negativeCache = new Map<string, number>();

-// Set once at startup by index.ts. We don't import loadConfig() directly
-// here to keep this module trivially mockable in tests (set the URL in
-// beforeEach instead of stubbing process.env + loadConfig's cache).
-let llamaSwapUrl: string | null = null;
+// Stored config for provider-aware resolution. Supports both the legacy
+// { llamaSwapUrl: string } shape (for tests) and the full Config shape.
+let storedConfig: ConfigForModelContext | null = null;

-export function configureModelContext(opts: { llamaSwapUrl: string }): void {
-  llamaSwapUrl = opts.llamaSwapUrl;
+/** Config fields needed for model-context provider resolution. */
+type ConfigForModelContext = {
+  LLAMA_SWAP_URL: string;
+  DEEPSEEK_API_KEY?: string;
+  DEEPSEEK_BASE_URL?: string;
+};
+
+/**
+ * Configure the module for model-context lookups.
+ *
+ * Accepts either the full server Config (production) or the legacy
+ * `{ llamaSwapUrl }` shape (tests). The full Config is preferred so
+ * getModelContext can resolve composite model ids through the provider
+ * registry.
+ */
+export function configureModelContext(
+  opts: ConfigForModelContext | { llamaSwapUrl: string },
+): void {
+  // Legacy test helper: { llamaSwapUrl } → synthesize a minimal config.
+  if ('llamaSwapUrl' in opts && typeof opts.llamaSwapUrl === 'string') {
+    storedConfig = { LLAMA_SWAP_URL: opts.llamaSwapUrl };
+    return;
+  }
+  storedConfig = opts as ConfigForModelContext;
 }

 // vDeepSeek: DeepSeek models don't have a /upstream/<model>/props endpoint.
 // Return a reasonable default context so compaction estimates work.
 const DEEPSEEK_DEFAULT_N_CTX = 131_072;
-const DEEPSEEK_MODEL_PREFIX = 'deepseek-';

 export async function getModelContext(model: string): Promise<ModelContext | null> {
-  // vDeepSeek: DeepSeek models have no /upstream/<model>/props. Use a static
-  // default so compaction doesn't fall to the buffer-only path with tiny limits.
-  if (model.startsWith(DEEPSEEK_MODEL_PREFIX)) {
+  // Resolve the model through the provider-aware resolver. For composite
+  // "provider/model" ids, this finds the correct provider's baseUrl. For
+  // bare legacy ids, it falls back to the default provider.
+  const config = storedConfig;
+  if (!config) {
+    // Module not initialized. Defensive — index.ts calls
+    // configureModelContext at startup; if a test forgets, fail closed so
+    // the chat still works (ctx_max stays null, UI degrades gracefully).
+    negativeCache.set(model, Date.now());
+    return null;
+  }
+
+  const resolved = resolveModelProvider(model, config);
+
+  // DeepSeek models (by provider id) have no /upstream/<model>/props.
+  // Use a static default so compaction doesn't fall to the buffer-only
+  // path with tiny limits.
+  if (resolved.providerId === 'deepseek') {
    return { n_ctx: DEEPSEEK_DEFAULT_N_CTX };
  }

+  // P7: orphaned auto:* session with no gateway configured — no props endpoint
+  // to query. Negative-cache and return null; compaction degrades gracefully.
+  if (resolved.route === 'gateway_error') {
+    negativeCache.set(model, Date.now());
+    return null;
+  }
+
+  // P7: gateway route — baseUrl is the control gateway, which exposes
+  // /upstream/<virtualModel>/props (it proxies the chosen candidate's props).
+  // The normal fetch path below handles it without special-casing.
+
+  // Cache key is the full composite id to prevent cross-provider cache
+  // poisoning for duplicate wire model names (D-2, design §5.3).
+  const cacheKey = `${resolved.providerId}/${resolved.wireModelId}`;
+
  // 1. Positive cache hit — no TTL check, model n_ctx is invariant.
-  const pos = positiveCache.get(model);
+  const pos = positiveCache.get(cacheKey);
  if (pos) return pos;

  // 2. Negative cache hit within TTL — return null without refetching.
@@ -58,30 +115,25 @@ export async function getModelContext(model: string): Promise<ModelContext | nul
  // attempt below; we don't delete them eagerly because the next successful
  // fetch will overwrite via the positive map and the negative entry
  // becomes irrelevant.
-  const negTs = negativeCache.get(model);
+  const negTs = negativeCache.get(cacheKey);
  if (negTs !== undefined && Date.now() - negTs < NEGATIVE_TTL_MS) {
    return null;
  }

-  // 3. Module not initialized. Defensive — index.ts calls
-  // configureModelContext at startup; if a test forgets, fail closed so
-  // the chat still works (ctx_max stays null, UI degrades gracefully).
-  if (!llamaSwapUrl) {
-    negativeCache.set(model, Date.now());
-    return null;
-  }
-
-  // 4. Fetch with timeout. AbortController fires after FETCH_TIMEOUT_MS;
+  // 3. Fetch with timeout. AbortController fires after FETCH_TIMEOUT_MS;
  // both the timeout path and a fetch reject end up in the catch below
  // and produce a negative cache entry.
-  const url = `${llamaSwapUrl}/upstream/${encodeURIComponent(model)}/props`;
+  //
+  // Strip the provider prefix: fetch from
+  // <provider.baseUrl>/upstream/<wireModelId>/props (design §5.3).
+  const url = `${resolved.baseUrl.replace(/\/+$/, '')}/upstream/${encodeURIComponent(resolved.wireModelId)}/props`;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(url, { signal: controller.signal });
    clearTimeout(timer);
    if (!res.ok) {
-      negativeCache.set(model, Date.now());
+      negativeCache.set(cacheKey, Date.now());
      return null;
    }
    const body = (await res.json()) as {
@@ -89,18 +141,18 @@ export async function getModelContext(model: string): Promise<ModelContext | nul
    };
    const n_ctx = body?.default_generation_settings?.n_ctx;
    if (typeof n_ctx !== 'number' || n_ctx <= 0) {
-      negativeCache.set(model, Date.now());
+      negativeCache.set(cacheKey, Date.now());
      return null;
    }
    const entry: ModelContext = { n_ctx };
-    positiveCache.set(model, entry);
+    positiveCache.set(cacheKey, entry);
    // Clear any stale negative entry so a future query sees the positive
    // hit cleanly (otherwise the negative TTL never expires from the map).
-    negativeCache.delete(model);
+    negativeCache.delete(cacheKey);
    return entry;
  } catch {
    clearTimeout(timer);
-    negativeCache.set(model, Date.now());
+    negativeCache.set(cacheKey, Date.now());
    return null;
  }
 }
@@ -110,7 +162,16 @@ export function invalidateModelContext(model?: string): void {
    positiveCache.clear();
    negativeCache.clear();
  } else {
-    positiveCache.delete(model);
-    negativeCache.delete(model);
+    // Map the argument to the composite "<providerId>/<wireModelId>" cache
+    // key. An id containing '/' is treated as already composite and used
+    // as-is; a bare id is resolved via the provider registry (backward
+    // compat). With no storedConfig, the bare id itself is the key.
+    let cacheKey = model;
+    if (storedConfig && !model.includes('/')) {
+      const resolved = resolveModelProvider(model, storedConfig);
+      cacheKey = `${resolved.providerId}/${resolved.wireModelId}`;
+    }
+    positiveCache.delete(cacheKey);
+    negativeCache.delete(cacheKey);
  }
 }
--- a/apps/server/src/services/system-prompt.ts
+++ b/apps/server/src/services/system-prompt.ts
@@ -21,7 +21,7 @@ import { createHash } from 'node:crypto';
 import { readFile, stat } from 'node:fs/promises';
 import type { Agent, Project, Session } from '../types/api.js';
 import { getAgentsMtimes } from './agents.js';
-import { resolveRoute } from './inference/provider.js';
+import { resolveRoute, type InferenceRoute } from './inference/provider.js';
 import { loadMemoryForSession } from './memory/recall.js';
 import { formatMemoryBlock } from './memory/prompt.js';

@@ -101,7 +101,7 @@ export interface PrefixFingerprint {
  has_agent_system_prompt: boolean;
  has_session_override: boolean;
  has_project_override: boolean;
-  route: 'swap' | 'sidecar' | 'deepseek';
+  route: InferenceRoute;
 }

 export interface PrefixDrift {
@@ -129,7 +129,7 @@ interface ObservedInputs {
  has_agent_system_prompt: boolean;
  has_session_override: boolean;
  has_project_override: boolean;
-  route: 'swap' | 'sidecar' | 'deepseek';
+  route: InferenceRoute;
 }

 interface ObserverEntry {
--- a/apps/server/src/services/task-model.ts
+++ b/apps/server/src/services/task-model.ts
@@ -1,4 +1,5 @@
 import { loadConfig, type Config } from '../config.js';
+import { resolveModelEndpoint } from './inference/provider.js';

 const TIMEOUT_MS = 10_000;

@@ -13,14 +14,19 @@ export async function taskModelCompletion(opts: {
  const maxTokens = opts.maxTokens ?? 30;
  const temperature = opts.temperature ?? 0.3;

-  const { url, model } = resolveEndpoint(config, opts.fallbackModel);
+  // v2.x (W3): resolve the endpoint through the shared provider-aware
+  // resolver instead of a local LLAMA_SWAP_URL fallback. This ensures
+  // composite model ids (e.g. "sam-desktop/qwen3.6-35b") route to the
+  // correct provider, and bare ids resolve through the default provider.
+  const model = config.FAST_MODEL ?? opts.fallbackModel ?? config.DEFAULT_MODEL;
+  const { url, model: resolvedModel, headers } = resolveModelEndpoint(config, model);

  try {
    const res = await fetch(`${url}/v1/chat/completions`, {
      method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
+      headers: { ...headers, 'X-Boo-Source': 'boochat' },
      body: JSON.stringify({
-        model,
+        model: resolvedModel,
        messages: [
          { role: 'system', content: opts.system },
          { role: 'user', content: opts.user },
@@ -55,14 +61,3 @@ export async function taskModelCompletion(opts: {
    return '';
  }
 }
-
-function resolveEndpoint(
-  config: Config,
-  fallbackModel?: string,
-): { url: string; model: string } {
-  if (config.TASK_MODEL_URL) {
-    return { url: config.TASK_MODEL_URL, model: 'gemma-3-270m-it' };
-  }
-  const model = config.FAST_MODEL ?? fallbackModel ?? config.DEFAULT_MODEL;
-  return { url: config.LLAMA_SWAP_URL, model };
-}
--- a/apps/server/src/types/api.ts
+++ b/apps/server/src/types/api.ts
@@ -129,7 +129,6 @@ export interface Agent {
  // v1.14.0: per-agent step cap for the outer inference loop. null means
  // bounded only by MAX_STEPS (200). 0 means "no tool calls allowed."
  steps: number | null;
-  llama_extra_args: string[] | null;
  // vDeepSeek: thinking/reasoning effort for DeepSeek V4 models.
  // Maps to DeepSeek's reasoning_effort API param.
  reasoning_effort: 'off' | 'low' | 'medium' | 'high' | 'xhigh' | 'max' | null;
@@ -244,6 +243,17 @@ export interface ModelInfo {
  [key: string]: unknown;
 }

+// v2.x: provider-grouped model catalog (W2, D-4).
+export interface ModelCatalogProvider {
+  id: string;
+  label: string;
+  models: ModelInfo[];
+}
+
+export interface ModelCatalogResponse {
+  providers: ModelCatalogProvider[];
+}
+
 export interface SidebarSession {
  id: string;
  project_id: string;