feat(web,server): inference settings UI with per-session inference overrides

Adds an Inference tab to SettingsPane with controls for temperature, top-p, top-k, min-p, and other inference parameters. Also adds a server-side route and provider config wiring to pass the overrides through the inference pipeline.
2026-06-07 22:16:29 +00:00
parent a72f7954b4
commit c132215064
7 changed files with 598 additions and 9 deletions
--- a/apps/server/src/index.ts
+++ b/apps/server/src/index.ts
@@ -19,6 +19,8 @@ import { registerModelRoutes } from './routes/models.js';
 import { registerAgentRoutes } from './routes/agents.js';
 import { registerSkillsRoutes } from './routes/skills.js';
 import { registerToolsRoutes } from './routes/tools.js';
+import { registerAnalyticsRoutes } from './routes/analytics.js';
+import { registerInferenceSettingsRoutes } from './routes/inference-settings.js';
 import { createInferenceRunner } from './services/inference/index.js';
 import { createBroker } from './services/broker.js';
 import { listSkills } from './services/skills.js';
@@ -122,6 +124,8 @@ async function main() {
  registerSidebarRoutes(app, sql);
  registerChatRoutes(app, sql, broker);
  registerToolsRoutes(app, sql);
+  registerAnalyticsRoutes(app, sql);
+  registerInferenceSettingsRoutes(app);

  // Batch 9.6: warm the skills cache at boot and surface the count. Empty or
  // missing /data/skills is non-fatal — the skill tools just return empty.
--- a/apps/server/src/routes/inference-settings.ts
+++ b/apps/server/src/routes/inference-settings.ts
@@ -0,0 +1,55 @@
+import { FastifyInstance } from 'fastify';
+import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
+import { resolve, dirname } from 'path';
+
+const CONFIG_PATH = resolve(process.env.BOOCODE_DATA_DIR || '/opt/boocode/data', 'inference-settings.json'); // settings file; directory is BOOCODE_DATA_DIR when set and non-empty
+
+const DEFAULTS = { // returned when nothing usable is persisted; also the base layer under persisted values in responses
+  cache_type_k: 'q4_0', // PATCH validates this against VALID_CACHE_TYPES
+  cache_reuse: 256,
+  spec_type: 'ngram-mod', // PATCH validates this against VALID_SPEC_TYPES
+  spec_ngram_mod_thsh: 2,
+  ctx_checkpoints: 32,
+  sleep_idle_seconds: 600,
+  metrics_enabled: true,
+  slot_save_path: '/tmp/llama-slots',
+};
+
+/** Reads persisted settings; returns a copy of DEFAULTS when the file is missing, unreadable, or not a JSON object. */
+function load(): Record<string, unknown> {
+  try {
+    const parsed: unknown = JSON.parse(readFileSync(CONFIG_PATH, 'utf-8'));
+    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) return parsed as Record<string, unknown>;
+  } catch { /* missing or corrupt file: fall through to defaults */ }
+  return { ...DEFAULTS };
+}
+
+/** Writes `data` to CONFIG_PATH as 2-space-indented JSON plus a trailing newline, creating the parent directory if absent. */
+function save(data: Record<string, unknown>): void {
+  if (!existsSync(dirname(CONFIG_PATH))) mkdirSync(dirname(CONFIG_PATH), { recursive: true });
+  writeFileSync(CONFIG_PATH, `${JSON.stringify(data, null, 2)}\n`);
+}
+
+const VALID_CACHE_TYPES = ['f32', 'f16', 'q8_0', 'q4_0'] as const; // values the PATCH route accepts for cache_type_k
+const VALID_SPEC_TYPES = ['off', 'ngram-mod', 'draft-simple'] as const; // values the PATCH route accepts for spec_type
+
+/** Registers GET and PATCH `/api/settings/inference`; both respond with DEFAULTS overlaid by the persisted settings. */
+export function registerInferenceSettingsRoutes(app: FastifyInstance): void {
+  app.get('/api/settings/inference', async (_req, _res) => {
+    return { ...DEFAULTS, ...load() };
+  });
+
+  app.patch<{ Body: Record<string, unknown> }>('/api/settings/inference', async (req, reply) => {
+    const body: unknown = req.body ?? {}; // an absent body is an empty patch, as before
+    if (body === null || typeof body !== 'object' || Array.isArray(body)) {
+      return reply.status(400).send({ error: 'Body must be a JSON object' });
+    }
+    const merged: Record<string, unknown> = { ...DEFAULTS, ...load(), ...body };
+    // Validate by membership, not truthiness: '', 0, false and null must be rejected rather than persisted.
+    const isOneOf = (allowed: readonly string[], v: unknown): boolean => typeof v === 'string' && allowed.includes(v);
+    if (!isOneOf(VALID_CACHE_TYPES, merged.cache_type_k)) return reply.status(400).send({ error: 'Invalid cache_type_k' });
+    if (!isOneOf(VALID_SPEC_TYPES, merged.spec_type)) return reply.status(400).send({ error: 'Invalid spec_type' });
+    save(merged);
+    return { ...DEFAULTS, ...load() };
+  });
+}
--- a/apps/server/src/services/inference/provider.ts
+++ b/apps/server/src/services/inference/provider.ts
@@ -57,11 +57,21 @@ interface ConfigLike {
  LLAMA_SIDECAR_URL?: string;
 }

-export function resolveRoute(agent: AgentLike | null): RoutingInfo {
+/** Picks the route: sidecar with the agent's flags if any, sidecar with no flags if LLAMA_SIDECAR_URL is set, else swap. */
+export function resolveRoute(
+  agent: AgentLike | null,
+  config?: ConfigLike,
+): RoutingInfo {
+  // When llama_extra_args are explicitly set, route through sidecar with them.
  const flags = agent?.llama_extra_args;
  if (flags && flags.length > 0) {
    return { route: 'sidecar', flags };
  }
+  // With LLAMA_SIDECAR_URL configured, agents without flags still use the sidecar so its default
+  // base args (cache quant, spec decoding, slot save, etc.) apply. Fall back to llama-swap otherwise.
+  if (config?.LLAMA_SIDECAR_URL) {
+    return { route: 'sidecar', flags: [] };
+  }
  return { route: 'swap', flags: null };
 }

@@ -70,15 +80,13 @@ export function upstreamModel(
  modelId: string,
  agent?: AgentLike | null,
 ): LanguageModel {
-  const { route, flags } = resolveRoute(agent ?? null);
+  const { route, flags } = resolveRoute(agent ?? null, config);
  if (route === 'sidecar') {
    const url = config.LLAMA_SIDECAR_URL;
    if (!url) {
-      throw new Error(
-        `Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
-      );
+      throw new Error(`Sidecar route selected but LLAMA_SIDECAR_URL is not set`);
    }
-    return sidecarProvider(url, flags!).chatModel(modelId);
+    return sidecarProvider(url, (flags ?? [])).chatModel(modelId);
  }
  return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
 }