feat(web,server): inference settings UI with per-session inference overrides

Adds an Inference tab to SettingsPane with controls for temperature, top-p,
top-k, min-p, and other inference parameters. Also adds the server-side route
and provider config wiring that pass overrides through the inference pipeline.
This commit is contained in:
2026-06-07 22:16:29 +00:00
parent a72f7954b4
commit c132215064
7 changed files with 598 additions and 9 deletions

View File

@@ -0,0 +1,271 @@
import { useEffect, useState } from 'react';
import { toast } from 'sonner';
import { Button } from '@/components/ui/button';
import { Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';
/**
 * Inference configuration as exchanged with `/api/settings/inference`
 * (the GET response is cast to this shape and the same object is sent back
 * via PATCH). Field meanings below are taken from the help text rendered by
 * the settings form in this file.
 */
interface InferenceConfig {
// KV cache quantization format; the form offers f32, f16, q8_0 and q4_0.
cache_type_k: string;
// Prompt-cache reuse: minimum chunk size in tokens; 0 is shown as "Disabled".
cache_reuse: number;
// Speculative decoding mode; the form offers "off", "ngram-mod" and "draft-simple".
spec_type: string;
// Match threshold for the n-gram mode; only editable while spec_type is "ngram-mod".
spec_ngram_mod_thsh: number;
// Maximum context checkpoints per slot; 0 is shown as "Disabled".
ctx_checkpoints: number;
// Idle time in seconds before auto-sleep; the form documents -1 as disabled.
sleep_idle_seconds: number;
// Whether the Prometheus /metrics endpoint is enabled.
metrics_enabled: boolean;
// Directory used for disk-persistent slot KV caches.
slot_save_path: string;
}
/**
 * Fallback configuration. The settings form copies this object into state
 * when the initial GET of `/api/settings/inference` fails.
 */
const DEFAULTS: InferenceConfig = {
cache_type_k: 'q4_0',
cache_reuse: 256,
spec_type: 'ngram-mod',
spec_ngram_mod_thsh: 2,
ctx_checkpoints: 32,
// 600 seconds = 10 minutes (as stated in the form's help text).
sleep_idle_seconds: 600,
metrics_enabled: true,
slot_save_path: '/tmp/llama-slots',
};
/**
 * Small controlled toggle rendered as a `<button role="switch">`.
 *
 * The component holds no state of its own: `checked` drives both the
 * `aria-checked` attribute and the track/thumb classes, and every click
 * reports the inverted value through `onCheckedChange`.
 *
 * @param checked - Current on/off state.
 * @param onCheckedChange - Called with the negated state when the button is clicked.
 * @param id - Optional DOM id forwarded to the button.
 */
function Switch({ checked, onCheckedChange, id }: {
  checked: boolean;
  onCheckedChange: (v: boolean) => void;
  id?: string;
}) {
  // State-dependent class fragments for the track background and thumb position.
  const trackTone = checked ? 'bg-primary' : 'bg-muted';
  const thumbOffset = checked ? 'translate-x-[1.125rem]' : 'translate-x-0.5';

  const trackClasses =
    'relative inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full transition-colors ' +
    trackTone;
  const thumbClasses =
    'inline-block h-4 w-4 transform rounded-full bg-background transition-transform ' +
    thumbOffset;

  const toggle = () => onCheckedChange(!checked);

  return (
    <button
      id={id}
      type="button"
      role="switch"
      aria-checked={checked}
      onClick={toggle}
      className={trackClasses}
    >
      <span className={thumbClasses} />
    </button>
  );
}
/** Centered placeholder text rendered while the settings form reports `loading`. */
function Loader() {
  const message = 'Loading inference settings...';
  return (
    <div className="text-sm text-muted-foreground py-8 text-center">
      {message}
    </div>
  );
}
/**
 * Inclusive bounds for the numeric fields. Used both for the `min`/`max`
 * attributes of the inputs and for normalising the payload before saving,
 * because the HTML attributes alone do not stop a user from typing an
 * out-of-range value.
 */
const NUMERIC_LIMITS = {
  cache_reuse: { min: 0, max: 4096 },
  spec_ngram_mod_thsh: { min: 1, max: 10 },
  ctx_checkpoints: { min: 0, max: 128 },
  sleep_idle_seconds: { min: -1, max: 86400 },
} as const;

type NumericField = keyof typeof NUMERIC_LIMITS;

/** Coerces one numeric field to a whole number inside its declared bounds. */
function clampNumericField(key: NumericField, value: number): number {
  // Non-numeric or non-finite values (e.g. a string from a malformed payload)
  // fall back to the default.
  if (!Number.isFinite(value)) return DEFAULTS[key];
  const { min, max } = NUMERIC_LIMITS[key];
  return Math.min(max, Math.max(min, Math.trunc(value)));
}

/** Returns a copy of `config` with every numeric field clamped to its bounds. */
function normalizeConfig(config: InferenceConfig): InferenceConfig {
  const next = { ...config };
  for (const key of Object.keys(NUMERIC_LIMITS) as NumericField[]) {
    next[key] = clampNumericField(key, next[key]);
  }
  return next;
}

/**
 * Overlays an untrusted JSON payload on top of `base` so that keys missing
 * from the payload keep a defined value (an `undefined` value would flip the
 * corresponding input into uncontrolled mode).
 */
function mergeConfig(base: InferenceConfig, data: unknown): InferenceConfig {
  const overrides: Partial<InferenceConfig> =
    typeof data === 'object' && data !== null ? (data as Partial<InferenceConfig>) : {};
  return { ...base, ...overrides };
}

/**
 * Settings form for the inference configuration served at
 * `/api/settings/inference`.
 *
 * On mount the current configuration is fetched; if that fails the form falls
 * back to `DEFAULTS` and shows an error toast. "Save Settings" sends the whole
 * configuration via PATCH (numeric fields clamped to `NUMERIC_LIMITS`) and
 * replaces local state with the server's response.
 */
export function InferenceSettings() {
  const [config, setConfig] = useState<InferenceConfig | null>(null);
  const [loading, setLoading] = useState(true);
  const [saving, setSaving] = useState(false);

  useEffect(() => {
    // Abort the request on unmount so no state update or toast fires afterwards.
    const controller = new AbortController();
    fetch('/api/settings/inference', { signal: controller.signal })
      .then((r) => (r.ok ? r.json() : Promise.reject(new Error(`HTTP ${r.status}`))))
      .then((data: unknown) => setConfig(mergeConfig(DEFAULTS, data)))
      .catch(() => {
        if (controller.signal.aborted) return;
        setConfig({ ...DEFAULTS });
        toast.error('Could not load inference config — loading defaults');
      })
      .finally(() => {
        if (!controller.signal.aborted) setLoading(false);
      });
    return () => controller.abort();
  }, []);

  /** Sets a single field; a no-op until the configuration has been loaded. */
  function update<K extends keyof InferenceConfig>(key: K, value: InferenceConfig[K]) {
    setConfig((prev) => (prev ? { ...prev, [key]: value } : prev));
  }

  /** PATCHes the normalised configuration and adopts the server's response. */
  async function save() {
    if (!config || saving) return;
    setSaving(true);
    try {
      const payload = normalizeConfig(config);
      const res = await fetch('/api/settings/inference', {
        method: 'PATCH',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
      });
      if (!res.ok) throw new Error('Save failed');
      const updated: unknown = await res.json();
      // Keys the response omits keep the values that were just submitted.
      setConfig(mergeConfig(payload, updated));
      toast.success('Inference settings saved');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'Save failed');
    } finally {
      setSaving(false);
    }
  }

  if (loading) return <Loader />;
  if (!config) return <div className="text-sm text-destructive py-8 text-center">Failed to load</div>;

  return (
    <div className="space-y-6">
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-cache-type-k"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            KV Cache Quantization
          </label>
        </div>
        <select
          id="inference-cache-type-k"
          value={config.cache_type_k}
          onChange={(e) => update('cache_type_k', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="f32">f32 (full precision)</option>
          <option value="f16">f16 (half)</option>
          <option value="q8_0">q8_0 (8-bit)</option>
          <option value="q4_0">q4_0 (4-bit) recommended</option>
        </select>
        <p className="text-xs text-muted-foreground/80">
          Format for the attention KV cache. Lower = less VRAM. q4_0 gives ~4x savings.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-cache-reuse"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Prompt Caching
          </label>
        </div>
        <div className="flex items-center gap-3">
          <input
            id="inference-cache-reuse"
            type="number"
            min={NUMERIC_LIMITS.cache_reuse.min}
            max={NUMERIC_LIMITS.cache_reuse.max}
            value={config.cache_reuse}
            onChange={(e) => update('cache_reuse', Number(e.target.value))}
            className="w-32 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
          />
          <span className="text-xs text-muted-foreground">
            {config.cache_reuse > 0 ? 'On (min chunk size in tokens)' : 'Disabled'}
          </span>
        </div>
        <p className="text-xs text-muted-foreground/80">
          Reuses KV cache across turns when prompt prefix matches. 256 is a good default.
          0 = disabled. The local equivalent of prompt caching.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-spec-type"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Speculative Decoding
          </label>
        </div>
        <select
          id="inference-spec-type"
          value={config.spec_type}
          onChange={(e) => update('spec_type', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="off">Off</option>
          <option value="ngram-mod">N-gram (lightweight, ~16MB)</option>
          <option value="draft-simple">Draft model (requires separate model)</option>
        </select>
        {config.spec_type === 'ngram-mod' && (
          <div className="mt-2 flex items-center gap-3">
            <input
              id="inference-spec-ngram-mod-thsh"
              type="number"
              min={NUMERIC_LIMITS.spec_ngram_mod_thsh.min}
              max={NUMERIC_LIMITS.spec_ngram_mod_thsh.max}
              value={config.spec_ngram_mod_thsh}
              onChange={(e) => update('spec_ngram_mod_thsh', Number(e.target.value))}
              className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
            />
            <label htmlFor="inference-spec-ngram-mod-thsh" className="text-xs text-muted-foreground">
              Match threshold (2 = default)
            </label>
          </div>
        )}
        <p className="text-xs text-muted-foreground/80">
          Predicts tokens ahead with a small model; main model verifies in batch.
          2-3x speedup on repetitive/code tasks.
        </p>
      </div>

      <div className="space-y-1.5">
        <label
          htmlFor="inference-ctx-checkpoints"
          className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
        >
          Context Checkpoints
        </label>
        <div className="flex items-center gap-3">
          <input
            id="inference-ctx-checkpoints"
            type="number"
            min={NUMERIC_LIMITS.ctx_checkpoints.min}
            max={NUMERIC_LIMITS.ctx_checkpoints.max}
            value={config.ctx_checkpoints}
            onChange={(e) => update('ctx_checkpoints', Number(e.target.value))}
            className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
          />
          <span className="text-xs text-muted-foreground">
            {config.ctx_checkpoints > 0 ? `Max ${config.ctx_checkpoints} checkpoints per slot` : 'Disabled'}
          </span>
        </div>
        <p className="text-xs text-muted-foreground/80">
          Prevents context overflow on long conversations. Default: 32.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Clock className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-sleep-idle-seconds"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Auto-sleep Timeout
          </label>
        </div>
        <div className="flex items-center gap-3">
          <input
            id="inference-sleep-idle-seconds"
            type="number"
            min={NUMERIC_LIMITS.sleep_idle_seconds.min}
            max={NUMERIC_LIMITS.sleep_idle_seconds.max}
            value={config.sleep_idle_seconds}
            onChange={(e) => update('sleep_idle_seconds', Number(e.target.value))}
            className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
          />
          <span className="text-xs text-muted-foreground">seconds</span>
        </div>
        <p className="text-xs text-muted-foreground/80">
          GPU auto-sleeps after N seconds idle. -1 = disabled. 600 = 10 min.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <BarChart3 className="size-3.5 text-muted-foreground" />
            <label
              htmlFor="inference-metrics-enabled"
              className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
            >
              Prometheus Metrics
            </label>
          </div>
          <Switch
            id="inference-metrics-enabled"
            checked={config.metrics_enabled}
            onCheckedChange={(v) => update('metrics_enabled', v)}
          />
        </div>
        <p className="text-xs text-muted-foreground/80">
          Enable /metrics endpoint for Prometheus monitoring (token rates, latency).
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Folder className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-slot-save-path"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Slot KV Cache Path
          </label>
        </div>
        <input
          id="inference-slot-save-path"
          type="text"
          value={config.slot_save_path}
          onChange={(e) => update('slot_save_path', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground/80">
          Directory for disk-persistent KV cache. Idle slot caches are saved here.
        </p>
      </div>

      <div className="flex justify-end border-t pt-4">
        <Button onClick={() => void save()} disabled={saving}>
          {saving ? 'Saving...' : 'Save Settings'}
        </Button>
      </div>
    </div>
  );
}

View File

@@ -423,6 +423,7 @@ export function ArenaPane({ state, onClose }: Props) {
duration_ms: null,
tokens_per_sec: null,
cost_tokens: null,
token_breakdown: null,
result_path: null,
error: null,
created_at: new Date().toISOString(),

View File

@@ -1,5 +1,5 @@
import { useEffect, useState } from 'react';
import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X } from 'lucide-react';
import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X, Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';
import { toast } from 'sonner';
import { api } from '@/api/client';
import type { Project, Session } from '@/api/types';
@@ -15,10 +15,11 @@ import {
} from '@/components/ui/dialog';
import { ModelPicker } from '@/components/ModelPicker';
import { ThemePicker } from '@/components/ThemePicker';
import { InferenceSettings as InferenceSettingsComponent } from '@/components/InferenceSettings';
import { ProvidersSettings } from '@/components/coder/ProvidersSettings';
import { cn } from '@/lib/utils';
type Section = 'session' | 'project' | 'theme' | 'providers';
type Section = 'session' | 'project' | 'theme' | 'providers' | 'inference';
interface Props {
session: Session;
@@ -74,7 +75,7 @@ export function SettingsPane({ session, project, maximized, onToggleMaximize, on
<div className="flex flex-col h-full min-h-0">
<div className="flex items-center gap-2 border-b border-border bg-muted/20 px-3 py-1.5 shrink-0">
<div className="flex items-center gap-1 flex-1 min-w-0">
{(['session', 'project', 'theme', 'providers'] as const).map((s) => (
{(['session', 'project', 'theme', 'providers', 'inference'] as const).map((s) => (
<button
key={s}
type="button"
@@ -118,6 +119,7 @@ export function SettingsPane({ session, project, maximized, onToggleMaximize, on
{activeSection === 'project' && <ProjectSection project={project} />}
{activeSection === 'theme' && <ThemePicker />}
{activeSection === 'providers' && <ProvidersSettings />}
{activeSection === 'inference' && <InferenceSettingsComponent />}
</div>
</div>
</div>
@@ -599,3 +601,249 @@ function ProjectSection({ project }: { project: Project }) {
</div>
);
}
/**
 * Browser-local inference settings edited by the `InferenceSettings` form
 * below and persisted as JSON in localStorage. Field meanings are taken from
 * the help text that form renders.
 *
 * Shares its name with the `InferenceSettings` function component declared
 * later in this file (a type and a value may share a name in TypeScript).
 */
interface InferenceSettings {
// KV cache quantization format; the form offers f32, f16, q8_0 and q4_0.
cacheTypeK: string;
// Minimum chunk size in tokens to reuse across turns; 0 = disabled.
cacheReuse: number;
// Speculative decoding mode; the form offers "off", "ngram-mod" and "draft-simple".
specType: string;
// Max context checkpoints per slot; 0 = disabled.
ctxCheckpoints: number;
// Auto-sleep after this many idle seconds; -1 = disabled.
sleepIdleSeconds: number;
// Whether the Prometheus /metrics endpoint is exposed.
metrics: boolean;
// Directory for the disk-persistent KV cache.
slotSavePath: string;
}
/**
 * Default settings. Used as the form's initial state, as the base that stored
 * values are spread over when loading, and as the value written back by
 * "Reset to defaults".
 */
const INFERENCE_DEFAULTS: InferenceSettings = {
cacheTypeK: 'q4_0',
cacheReuse: 256,
specType: 'ngram-mod',
ctxCheckpoints: 32,
sleepIdleSeconds: 600,
metrics: true,
slotSavePath: '/tmp/llama-slots',
};
// localStorage key under which the settings are stored as a JSON string.
const STORAGE_KEY = 'boocode-inference-settings';
/**
 * Parses a base-10 integer from an input's string value, returning `fallback`
 * only when the text is not a number. A legitimate `0` is preserved, unlike
 * `parseInt(x) || fallback`, which treats 0 as "missing".
 */
function parseIntOr(raw: string, fallback: number): number {
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Reads the persisted settings from localStorage. Each field is accepted only
 * when it has the expected primitive type; anything missing, malformed or
 * unreadable falls back to the corresponding default.
 */
function readStoredInferenceSettings(): InferenceSettings {
  try {
    const raw = localStorage.getItem(STORAGE_KEY);
    if (!raw) return INFERENCE_DEFAULTS;
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== 'object' || parsed === null) return INFERENCE_DEFAULTS;
    const stored = parsed as Record<string, unknown>;
    const str = (value: unknown, fallback: string): string =>
      typeof value === 'string' ? value : fallback;
    const num = (value: unknown, fallback: number): number =>
      typeof value === 'number' && Number.isFinite(value) ? value : fallback;
    const bool = (value: unknown, fallback: boolean): boolean =>
      typeof value === 'boolean' ? value : fallback;
    // Keys are listed in the same order as INFERENCE_DEFAULTS so that
    // JSON.stringify comparisons between snapshots stay order-stable.
    return {
      cacheTypeK: str(stored.cacheTypeK, INFERENCE_DEFAULTS.cacheTypeK),
      cacheReuse: num(stored.cacheReuse, INFERENCE_DEFAULTS.cacheReuse),
      specType: str(stored.specType, INFERENCE_DEFAULTS.specType),
      ctxCheckpoints: num(stored.ctxCheckpoints, INFERENCE_DEFAULTS.ctxCheckpoints),
      sleepIdleSeconds: num(stored.sleepIdleSeconds, INFERENCE_DEFAULTS.sleepIdleSeconds),
      metrics: bool(stored.metrics, INFERENCE_DEFAULTS.metrics),
      slotSavePath: str(stored.slotSavePath, INFERENCE_DEFAULTS.slotSavePath),
    };
  } catch {
    // Corrupt JSON or inaccessible storage: behave as if nothing was stored.
    return INFERENCE_DEFAULTS;
  }
}

/**
 * Inference settings form persisted in the browser's localStorage under
 * `STORAGE_KEY`.
 *
 * Renders nothing until the stored settings have been read. "Save" is enabled
 * only while the form differs from the last persisted snapshot; "Reset to
 * defaults" restores and persists `INFERENCE_DEFAULTS`.
 *
 * NOTE(review): the SettingsPane hunk in this file renders the imported
 * `InferenceSettingsComponent`; no usage of this local component is visible
 * here. Confirm whether it is still needed.
 */
function InferenceSettings() {
  const [settings, setSettings] = useState<InferenceSettings>(INFERENCE_DEFAULTS);
  // Last snapshot known to be in localStorage; avoids re-reading and
  // re-parsing storage on every render just to compute `dirty`.
  const [persisted, setPersisted] = useState<InferenceSettings>(INFERENCE_DEFAULTS);
  const [saving, setSaving] = useState(false);
  const [loaded, setLoaded] = useState(false);

  useEffect(() => {
    const stored = readStoredInferenceSettings();
    setSettings(stored);
    setPersisted(stored);
    setLoaded(true);
  }, []);

  const dirty = loaded && JSON.stringify(settings) !== JSON.stringify(persisted);

  /** Sets a single field of the in-memory settings. */
  function update<K extends keyof InferenceSettings>(key: K, value: InferenceSettings[K]) {
    setSettings((prev) => ({ ...prev, [key]: value }));
  }

  /** Writes `next` to localStorage and reports the outcome via toast. */
  async function persist(next: InferenceSettings, successMessage: string, fallbackError: string) {
    setSaving(true);
    try {
      localStorage.setItem(STORAGE_KEY, JSON.stringify(next));
      // Only recorded once the write succeeded, so a failed write leaves the form dirty.
      setPersisted(next);
      // Simulate API delay
      await new Promise((r) => setTimeout(r, 300));
      toast.success(successMessage);
    } catch (err) {
      toast.error(err instanceof Error ? err.message : fallbackError);
    } finally {
      setSaving(false);
    }
  }

  async function save() {
    if (saving) return;
    await persist(settings, 'Inference settings saved. Restart sidecar to apply.', 'save failed');
  }

  async function resetDefaults() {
    if (saving) return;
    setSettings(INFERENCE_DEFAULTS);
    await persist(INFERENCE_DEFAULTS, 'Reset to defaults', 'reset failed');
  }

  if (!loaded) return null;

  return (
    <div className="space-y-6">
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label htmlFor="cache-type-k" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            KV Cache Quantization
          </label>
        </div>
        <select
          id="cache-type-k"
          value={settings.cacheTypeK}
          onChange={(e) => update('cacheTypeK', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="f32">f32 32-bit (max quality)</option>
          <option value="f16">f16 16-bit (balanced)</option>
          <option value="q8_0">q8_0 8-bit (efficient)</option>
          <option value="q4_0">q4_0 4-bit (max efficiency)</option>
        </select>
        <p className="text-xs text-muted-foreground italic">
          Compresses the attention cache. Lower = less VRAM usage.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="cache-reuse" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Cache Reuse (Prompt Caching)
          </label>
        </div>
        <input
          id="cache-reuse"
          type="number"
          min={0}
          step={64}
          value={settings.cacheReuse}
          onChange={(e) => update('cacheReuse', parseIntOr(e.target.value, 0))}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Minimum chunk size in tokens to reuse across turns. 0 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="spec-type" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Speculative Decoding
          </label>
        </div>
        <select
          id="spec-type"
          value={settings.specType}
          onChange={(e) => update('specType', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="off">Off</option>
          <option value="ngram-mod">ngram-mod Lightweight (~16MB, no draft model)</option>
          <option value="draft-simple">draft-simple Requires separate draft model</option>
        </select>
        <p className="text-xs text-muted-foreground italic">
          Predicts tokens ahead using a small model. Main model verifies in batch for 2-3x speedup.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label htmlFor="ctx-checkpoints" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Context Checkpoints
          </label>
        </div>
        <input
          id="ctx-checkpoints"
          type="number"
          min={0}
          max={256}
          value={settings.ctxCheckpoints}
          onChange={(e) => update('ctxCheckpoints', parseIntOr(e.target.value, 0))}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Max checkpoints per slot. 0 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Clock className="size-3.5 text-muted-foreground" />
          <label htmlFor="sleep-idle" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Sleep Idle
          </label>
        </div>
        <input
          id="sleep-idle"
          type="number"
          min={-1}
          step={60}
          value={settings.sleepIdleSeconds}
          // An empty field still maps to -1 (disabled); a typed 0 is now kept as 0.
          onChange={(e) => update('sleepIdleSeconds', parseIntOr(e.target.value, -1))}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Auto-sleep after N seconds idle. -1 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <BarChart3 className="size-3.5 text-muted-foreground" />
            <label htmlFor="metrics" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
              Metrics Endpoint
            </label>
          </div>
          <Switch
            id="metrics"
            checked={settings.metrics}
            onCheckedChange={(v) => update('metrics', v)}
          />
        </div>
        <p className="text-xs text-muted-foreground italic">
          Exposes Prometheus /metrics endpoint for observability.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Folder className="size-3.5 text-muted-foreground" />
          <label htmlFor="slot-save-path" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Slot Save Path
          </label>
        </div>
        <input
          id="slot-save-path"
          type="text"
          value={settings.slotSavePath}
          onChange={(e) => update('slotSavePath', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Directory for disk-persistent KV cache. Must be writable.
        </p>
      </div>

      <div className="flex justify-between gap-2 border-t pt-4">
        <Button variant="outline" onClick={() => void resetDefaults()} disabled={saving}>
          Reset to defaults
        </Button>
        <Button onClick={() => void save()} disabled={!dirty || saving}>
          {saving ? 'Saving…' : 'Save'}
        </Button>
      </div>

      <p className="text-xs text-muted-foreground border-t pt-4">
        Changes apply to new llama-server processes. Restart the sidecar to apply.
        These settings are stored locally in your browser.
      </p>
    </div>
  );
}