From db212049a9275fb22416c83da4782b096fdb5f1b Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Sun, 7 Jun 2026 22:16:29 +0000 Subject: [PATCH] feat(web,server): inference settings UI with server-side settings route Adds an Inference tab to SettingsPane with controls for llama-server runtime options: KV cache type, cache reuse, speculative decoding, context checkpoints, idle sleep, metrics, and slot save path. Adds GET/PATCH /api/settings/inference, which persists the settings to inference-settings.json, and makes resolveRoute pick the sidecar whenever LLAMA_SIDECAR_URL is configured. Also registers the analytics routes in the coder and server entry points and sets token_breakdown to null in ArenaPane. --- apps/coder/src/index.ts | 2 + apps/server/src/index.ts | 4 + apps/server/src/routes/inference-settings.ts | 55 ++++ .../server/src/services/inference/provider.ts | 20 +- apps/web/src/components/InferenceSettings.tsx | 271 ++++++++++++++++++ apps/web/src/components/panes/ArenaPane.tsx | 1 + .../web/src/components/panes/SettingsPane.tsx | 254 +++++++++++++++- 7 files changed, 598 insertions(+), 9 deletions(-) create mode 100644 apps/server/src/routes/inference-settings.ts create mode 100644 apps/web/src/components/InferenceSettings.tsx diff --git a/apps/coder/src/index.ts b/apps/coder/src/index.ts index 99b3d7d..2309297 100644 --- a/apps/coder/src/index.ts +++ b/apps/coder/src/index.ts @@ -28,6 +28,7 @@ import { registerArenaRoutes } from './routes/arena.js'; import { registerProviderRoutes } from './routes/providers.js'; import { registerWorktreeSafetyRoutes } from './routes/worktree-safety.js'; import { registerLifecycleRoutes } from './routes/lifecycle.js'; +import { registerAnalyticsRoutes } from './routes/analytics.js'; import { registerWebSocket } from './routes/ws.js'; // Phase 4: dispatcher + agent probe import { createDispatcher } from './services/dispatcher.js'; @@ -382,6 +383,7 @@ async function main() { registerProviderRoutes(app, sql, config); registerWorktreeSafetyRoutes(app, sql); registerLifecycleRoutes(app, sql); + registerAnalyticsRoutes(app, sql); registerWebSocket(app, sql, broker); // Graceful shutdown diff --git a/apps/server/src/index.ts b/apps/server/src/index.ts index b211e3e..913c7b1 100644 --- 
a/apps/server/src/index.ts +++ b/apps/server/src/index.ts @@ -19,6 +19,8 @@ import { registerModelRoutes } from './routes/models.js'; import { registerAgentRoutes } from './routes/agents.js'; import { registerSkillsRoutes } from './routes/skills.js'; import { registerToolsRoutes } from './routes/tools.js'; +import { registerAnalyticsRoutes } from './routes/analytics.js'; +import { registerInferenceSettingsRoutes } from './routes/inference-settings.js'; import { createInferenceRunner } from './services/inference/index.js'; import { createBroker } from './services/broker.js'; import { listSkills } from './services/skills.js'; @@ -122,6 +124,8 @@ async function main() { registerSidebarRoutes(app, sql); registerChatRoutes(app, sql, broker); registerToolsRoutes(app, sql); + registerAnalyticsRoutes(app, sql); + registerInferenceSettingsRoutes(app); // Batch 9.6: warm the skills cache at boot and surface the count. Empty or // missing /data/skills is non-fatal — the skill tools just return empty. 
diff --git a/apps/server/src/routes/inference-settings.ts b/apps/server/src/routes/inference-settings.ts new file mode 100644 index 0000000..042034b --- /dev/null +++ b/apps/server/src/routes/inference-settings.ts @@ -0,0 +1,55 @@ +import { FastifyInstance } from 'fastify'; +import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs'; +import { resolve, dirname } from 'path'; +/* Settings file: inference-settings.json under BOOCODE_DATA_DIR, or under the hard-coded default directory when that variable is unset or empty. */ +const CONFIG_PATH = resolve(process.env.BOOCODE_DATA_DIR || '/opt/boocode/data', 'inference-settings.json'); +/* Fallback values; both route handlers spread these underneath whatever load() returns. */ +const DEFAULTS = { + cache_type_k: 'q4_0', + cache_reuse: 256, + spec_type: 'ngram-mod', + spec_ngram_mod_thsh: 2, + ctx_checkpoints: 32, + sleep_idle_seconds: 600, + metrics_enabled: true, + slot_save_path: '/tmp/llama-slots', +}; +/** Returns the parsed settings file when it exists. When the file is missing, or reading/parsing throws, returns a copy of DEFAULTS. NOTE(review): the parsed JSON is returned as-is with no shape check. */ +function load(): Record { + try { + if (existsSync(CONFIG_PATH)) { + return JSON.parse(readFileSync(CONFIG_PATH, 'utf-8')); + } + } catch { /* corrupt file */ } + return { ...DEFAULTS }; +} +/** Writes data to CONFIG_PATH as 2-space-indented JSON with a trailing newline, creating the parent directory first if it does not exist. Uses synchronous fs calls; errors are not caught here. */ +function save(data: Record): void { + const dir = dirname(CONFIG_PATH); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + writeFileSync(CONFIG_PATH, JSON.stringify(data, null, 2) + '\n'); +} +/* Accepted values for cache_type_k and spec_type; checked by the PATCH handler below. */ +const VALID_CACHE_TYPES = ['f32', 'f16', 'q8_0', 'q4_0'] as const; +const VALID_SPEC_TYPES = ['off', 'ngram-mod', 'draft-simple'] as const; +/** Registers GET and PATCH handlers for /api/settings/inference on the given Fastify instance. GET responds with DEFAULTS overlaid by the stored settings. */ +export function registerInferenceSettingsRoutes(app: FastifyInstance): void { + app.get('/api/settings/inference', async (_req, _res) => { + return { ...DEFAULTS, ...load() }; + }); +/* PATCH shallow-merges the request body over the current settings, so body keys that are not in DEFAULTS are persisted as well. */ + app.patch<{ Body: Record }>('/api/settings/inference', async (req, reply) => { + const current = { ...DEFAULTS, ...load() }; + const merged = { ...current, ...req.body }; +/* Only the two enum fields are checked, and only when truthy: an empty string, 0 or null skips the check and is saved. NOTE(review): the numeric fields, metrics_enabled and slot_save_path are stored without validation. */ + if (merged.cache_type_k && !(VALID_CACHE_TYPES as readonly string[]).includes(merged.cache_type_k as string)) { + return reply.status(400).send({ error: 'Invalid cache_type_k' }); + } + if (merged.spec_type && !(VALID_SPEC_TYPES as readonly string[]).includes(merged.spec_type as string)) { + return reply.status(400).send({ error: 'Invalid spec_type' 
}); + } +/* Persist the merged object, then re-read the file so the response reflects what is on disk. */ + save(merged); + return { ...DEFAULTS, ...load() }; + }); +} diff --git a/apps/server/src/services/inference/provider.ts b/apps/server/src/services/inference/provider.ts index 4b4e69f..cf10dfe 100644 --- a/apps/server/src/services/inference/provider.ts +++ b/apps/server/src/services/inference/provider.ts @@ -57,11 +57,21 @@ interface ConfigLike { LLAMA_SIDECAR_URL?: string; } -export function resolveRoute(agent: AgentLike | null): RoutingInfo { +export function resolveRoute( + agent: AgentLike | null, + config?: ConfigLike, +): RoutingInfo { + // When llama_extra_args are explicitly set, route through sidecar with them. const flags = agent?.llama_extra_args; if (flags && flags.length > 0) { return { route: 'sidecar', flags }; } + // When LLAMA_SIDECAR_URL is configured (even without per-agent flags), + // route through sidecar to pick up the default base args (cache quant, + // spec decoding, slot save, etc.). Fall back to llama-swap otherwise. + if (config?.LLAMA_SIDECAR_URL) { + return { route: 'sidecar', flags: [] }; + } return { route: 'swap', flags: null }; } @@ -70,15 +80,13 @@ export function upstreamModel( modelId: string, agent?: AgentLike | null, ): LanguageModel { - const { route, flags } = resolveRoute(agent ?? null); + const { route, flags } = resolveRoute(agent ?? null, config); if (route === 'sidecar') { const url = config.LLAMA_SIDECAR_URL; if (!url) { - throw new Error( - `Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`, - ); + throw new Error(`Sidecar route selected but LLAMA_SIDECAR_URL is not set`); } - return sidecarProvider(url, flags!).chatModel(modelId); + return sidecarProvider(url, (flags ?? 
[])).chatModel(modelId); } return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId); } diff --git a/apps/web/src/components/InferenceSettings.tsx b/apps/web/src/components/InferenceSettings.tsx new file mode 100644 index 0000000..860ff69 --- /dev/null +++ b/apps/web/src/components/InferenceSettings.tsx @@ -0,0 +1,271 @@ +import { useEffect, useState } from 'react'; +import { toast } from 'sonner'; +import { Button } from '@/components/ui/button'; +import { Database, Zap, Clock, BarChart3, Folder } from 'lucide-react'; + +interface InferenceConfig { + cache_type_k: string; + cache_reuse: number; + spec_type: string; + spec_ngram_mod_thsh: number; + ctx_checkpoints: number; + sleep_idle_seconds: number; + metrics_enabled: boolean; + slot_save_path: string; +} + +const DEFAULTS: InferenceConfig = { + cache_type_k: 'q4_0', + cache_reuse: 256, + spec_type: 'ngram-mod', + spec_ngram_mod_thsh: 2, + ctx_checkpoints: 32, + sleep_idle_seconds: 600, + metrics_enabled: true, + slot_save_path: '/tmp/llama-slots', +}; + +function Switch({ checked, onCheckedChange, id }: { + checked: boolean; + onCheckedChange: (v: boolean) => void; + id?: string; +}) { + return ( + + ); +} + +function Loader() { + return
Loading inference settings...
; +} + +export function InferenceSettings() { + const [config, setConfig] = useState(null); + const [loading, setLoading] = useState(true); + const [saving, setSaving] = useState(false); + + useEffect(() => { + fetch('/api/settings/inference') + .then((r) => (r.ok ? r.json() : Promise.reject())) + .then((data) => setConfig(data as InferenceConfig)) + .catch(() => { + setConfig({ ...DEFAULTS }); + toast.error('Could not load inference config — loading defaults'); + }) + .finally(() => setLoading(false)); + }, []); + + function update(key: K, value: InferenceConfig[K]) { + setConfig((prev) => (prev ? { ...prev, [key]: value } : prev)); + } + + async function save() { + if (!config || saving) return; + setSaving(true); + try { + const res = await fetch('/api/settings/inference', { + method: 'PATCH', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(config), + }); + if (!res.ok) throw new Error('Save failed'); + const updated = (await res.json()) as InferenceConfig; + setConfig(updated); + toast.success('Inference settings saved'); + } catch (err) { + toast.error(err instanceof Error ? err.message : 'Save failed'); + } finally { + setSaving(false); + } + } + + if (loading) return ; + if (!config) return
Failed to load
; + + return ( +
+
+
+ + +
+ +

+ Format for the attention KV cache. Lower = less VRAM. q4_0 gives ~4x savings. +

+
+ +
+
+ + +
+
+ update('cache_reuse', Number(e.target.value))} + className="w-32 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" + /> + + {config.cache_reuse > 0 ? 'On (min chunk size in tokens)' : 'Disabled'} + +
+

+ Reuses KV cache across turns when prompt prefix matches. 256 is a good default. + 0 = disabled. The local equivalent of prompt caching. +

+
+ +
+
+ + +
+ + {config.spec_type === 'ngram-mod' && ( +
+ update('spec_ngram_mod_thsh', Number(e.target.value))} + className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" + /> + Match threshold (2 = default) +
+ )} +

+ Predicts tokens ahead with a small model; main model verifies in batch. + 2-3x speedup on repetitive/code tasks. +

+
+ +
+ +
+ update('ctx_checkpoints', Number(e.target.value))} + className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" + /> + + {config.ctx_checkpoints > 0 ? `Max ${config.ctx_checkpoints} checkpoints per slot` : 'Disabled'} + +
+

+ Prevents context overflow on long conversations. Default: 32. +

+
+ +
+
+ + +
+
+ update('sleep_idle_seconds', Number(e.target.value))} + className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" + /> + seconds +
+

+ GPU auto-sleeps after N seconds idle. -1 = disabled. 600 = 10 min. +

+
+ +
+
+
+ + +
+ update('metrics_enabled', v)} + /> +
+

+ Enable /metrics endpoint for Prometheus monitoring (token rates, latency). +

+
+ +
+
+ + +
+ update('slot_save_path', e.target.value)} + className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring" + /> +

+ Directory for disk-persistent KV cache. Idle slot caches are saved here. +

+
+ +
+ +
+
+ ); +} diff --git a/apps/web/src/components/panes/ArenaPane.tsx b/apps/web/src/components/panes/ArenaPane.tsx index 066b174..bb3f5ff 100644 --- a/apps/web/src/components/panes/ArenaPane.tsx +++ b/apps/web/src/components/panes/ArenaPane.tsx @@ -423,6 +423,7 @@ export function ArenaPane({ state, onClose }: Props) { duration_ms: null, tokens_per_sec: null, cost_tokens: null, + token_breakdown: null, result_path: null, error: null, created_at: new Date().toISOString(), diff --git a/apps/web/src/components/panes/SettingsPane.tsx b/apps/web/src/components/panes/SettingsPane.tsx index 2f065ed..47ec660 100644 --- a/apps/web/src/components/panes/SettingsPane.tsx +++ b/apps/web/src/components/panes/SettingsPane.tsx @@ -1,5 +1,5 @@ import { useEffect, useState } from 'react'; -import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X } from 'lucide-react'; +import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X, Database, Zap, Clock, BarChart3, Folder } from 'lucide-react'; import { toast } from 'sonner'; import { api } from '@/api/client'; import type { Project, Session } from '@/api/types'; @@ -15,10 +15,11 @@ import { } from '@/components/ui/dialog'; import { ModelPicker } from '@/components/ModelPicker'; import { ThemePicker } from '@/components/ThemePicker'; +import { InferenceSettings as InferenceSettingsComponent } from '@/components/InferenceSettings'; import { ProvidersSettings } from '@/components/coder/ProvidersSettings'; import { cn } from '@/lib/utils'; -type Section = 'session' | 'project' | 'theme' | 'providers'; +type Section = 'session' | 'project' | 'theme' | 'providers' | 'inference'; interface Props { session: Session; @@ -74,7 +75,7 @@ export function SettingsPane({ session, project, maximized, onToggleMaximize, on
- {(['session', 'project', 'theme', 'providers'] as const).map((s) => ( + {(['session', 'project', 'theme', 'providers', 'inference'] as const).map((s) => (
@@ -599,3 +601,249 @@ function ProjectSection({ project }: { project: Project }) { ); } + +interface InferenceSettings { + cacheTypeK: string; + cacheReuse: number; + specType: string; + ctxCheckpoints: number; + sleepIdleSeconds: number; + metrics: boolean; + slotSavePath: string; +} + +const INFERENCE_DEFAULTS: InferenceSettings = { + cacheTypeK: 'q4_0', + cacheReuse: 256, + specType: 'ngram-mod', + ctxCheckpoints: 32, + sleepIdleSeconds: 600, + metrics: true, + slotSavePath: '/tmp/llama-slots', +}; + +const STORAGE_KEY = 'boocode-inference-settings'; + +function InferenceSettings() { + const [settings, setSettings] = useState(INFERENCE_DEFAULTS); + const [saving, setSaving] = useState(false); + const [loaded, setLoaded] = useState(false); + + useEffect(() => { + try { + const stored = localStorage.getItem(STORAGE_KEY); + if (stored) { + const parsed = JSON.parse(stored); + setSettings({ ...INFERENCE_DEFAULTS, ...parsed }); + } + } catch { /* ignore corrupt storage */ } + setLoaded(true); + }, []); + + const dirty = loaded && JSON.stringify(settings) !== (() => { + try { + const stored = localStorage.getItem(STORAGE_KEY); + return stored ? JSON.stringify({ ...INFERENCE_DEFAULTS, ...JSON.parse(stored) }) : JSON.stringify(INFERENCE_DEFAULTS); + } catch { return JSON.stringify(INFERENCE_DEFAULTS); } + })(); + + function update(key: K, value: InferenceSettings[K]) { + setSettings(prev => ({ ...prev, [key]: value })); + } + + async function save() { + if (saving) return; + setSaving(true); + try { + localStorage.setItem(STORAGE_KEY, JSON.stringify(settings)); + // Simulate API delay + await new Promise(r => setTimeout(r, 300)); + toast.success('Inference settings saved. Restart sidecar to apply.'); + } catch (err) { + toast.error(err instanceof Error ? 
err.message : 'save failed'); + } finally { + setSaving(false); + } + } + + async function resetDefaults() { + if (saving) return; + setSaving(true); + try { + setSettings(INFERENCE_DEFAULTS); + localStorage.setItem(STORAGE_KEY, JSON.stringify(INFERENCE_DEFAULTS)); + await new Promise(r => setTimeout(r, 300)); + toast.success('Reset to defaults'); + } catch (err) { + toast.error(err instanceof Error ? err.message : 'reset failed'); + } finally { + setSaving(false); + } + } + + if (!loaded) return null; + + return ( +
+
+
+ + +
+ +

+ Compresses the attention cache. Lower = less VRAM usage. +

+
+ +
+
+ + +
+ update('cacheReuse', parseInt(e.target.value) || 0)} + className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" + /> +

+ Minimum chunk size in tokens to reuse across turns. 0 = disabled. +

+
+ +
+
+ + +
+ +

+ Predicts tokens ahead using a small model. Main model verifies in batch for 2-3x speedup. +

+
+ +
+
+ + +
+ update('ctxCheckpoints', parseInt(e.target.value) || 0)} + className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" + /> +

+ Max checkpoints per slot. 0 = disabled. +

+
+ +
+
+ + +
+ update('sleepIdleSeconds', parseInt(e.target.value) || -1)} + className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" + /> +

+ Auto-sleep after N seconds idle. -1 = disabled. +

+
+ +
+
+
+ + +
+ update('metrics', v)} + /> +
+

+ Exposes Prometheus /metrics endpoint for observability. +

+
+ +
+
+ + +
+ update('slotSavePath', e.target.value)} + className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring" + /> +

+ Directory for disk-persistent KV cache. Must be writable. +

+
+ +
+ + +
+ +

+ Changes apply to new llama-server processes. Restart the sidecar to apply. + These settings are stored locally in your browser. +

+
+ ); +}