feat(web,server): inference settings UI with per-session inference overrides
Adds an Inference tab to SettingsPane with controls for temperature, top-p, top-k, min-p, and other inference parameters. Also adds the server-side route and provider config wiring that passes the overrides through the inference pipeline.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X } from 'lucide-react';
|
||||
import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X, Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';
|
||||
import { toast } from 'sonner';
|
||||
import { api } from '@/api/client';
|
||||
import type { Project, Session } from '@/api/types';
|
||||
@@ -15,10 +15,11 @@ import {
|
||||
} from '@/components/ui/dialog';
|
||||
import { ModelPicker } from '@/components/ModelPicker';
|
||||
import { ThemePicker } from '@/components/ThemePicker';
|
||||
import { InferenceSettings as InferenceSettingsComponent } from '@/components/InferenceSettings';
|
||||
import { ProvidersSettings } from '@/components/coder/ProvidersSettings';
|
||||
import { cn } from '@/lib/utils';
|
||||
|
||||
type Section = 'session' | 'project' | 'theme' | 'providers';
|
||||
type Section = 'session' | 'project' | 'theme' | 'providers' | 'inference';
|
||||
|
||||
interface Props {
|
||||
session: Session;
|
||||
@@ -74,7 +75,7 @@ export function SettingsPane({ session, project, maximized, onToggleMaximize, on
|
||||
<div className="flex flex-col h-full min-h-0">
|
||||
<div className="flex items-center gap-2 border-b border-border bg-muted/20 px-3 py-1.5 shrink-0">
|
||||
<div className="flex items-center gap-1 flex-1 min-w-0">
|
||||
{(['session', 'project', 'theme', 'providers'] as const).map((s) => (
|
||||
{(['session', 'project', 'theme', 'providers', 'inference'] as const).map((s) => (
|
||||
<button
|
||||
key={s}
|
||||
type="button"
|
||||
@@ -118,6 +119,7 @@ export function SettingsPane({ session, project, maximized, onToggleMaximize, on
|
||||
{activeSection === 'project' && <ProjectSection project={project} />}
|
||||
{activeSection === 'theme' && <ThemePicker />}
|
||||
{activeSection === 'providers' && <ProvidersSettings />}
|
||||
{activeSection === 'inference' && <InferenceSettingsComponent />}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -599,3 +601,249 @@ function ProjectSection({ project }: { project: Project }) {
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Shape of the inference settings edited by the settings form and persisted
 * as JSON in the browser's localStorage.
 */
interface InferenceSettings {
  /** KV cache quantization type; the form offers `f32`, `f16`, `q8_0` and `q4_0`. */
  cacheTypeK: string;
  /** Minimum chunk size in tokens to reuse across turns; `0` disables reuse. */
  cacheReuse: number;
  /** Speculative decoding mode; the form offers `off`, `ngram-mod` and `draft-simple`. */
  specType: string;
  /** Maximum context checkpoints per slot; `0` disables them. */
  ctxCheckpoints: number;
  /** Seconds of idleness before auto-sleep; `-1` disables auto-sleep. */
  sleepIdleSeconds: number;
  /** Whether the Prometheus `/metrics` endpoint is exposed. */
  metrics: boolean;
  /** Directory used for the disk-persistent KV cache. */
  slotSavePath: string;
}
|
||||
|
||||
/**
 * Values the form starts from when nothing is stored, and the values written
 * back by "Reset to defaults".
 */
const INFERENCE_DEFAULTS: InferenceSettings = {
  cacheTypeK: 'q4_0',
  cacheReuse: 256,
  specType: 'ngram-mod',
  ctxCheckpoints: 32,
  sleepIdleSeconds: 600,
  metrics: true,
  slotSavePath: '/tmp/llama-slots',
};
|
||||
|
||||
/** localStorage key under which the inference settings are read and written as JSON. */
const STORAGE_KEY = 'boocode-inference-settings';
|
||||
|
||||
/** Classes shared by the select and number controls of the inference form. */
const INFERENCE_CONTROL_CLASS =
  'w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring';

/** Classes for the italic help text shown under each control. */
const INFERENCE_HINT_CLASS = 'text-xs text-muted-foreground italic';

/**
 * Parses the text of a number input as a base-10 integer.
 *
 * Returns `fallback` when the text is not a number (for example an empty
 * field). Otherwise the parsed value is clamped to `[min, max]`, because the
 * `min`/`max` attributes of an input do not constrain what a user can type.
 * A literal `0` is a valid result and is never replaced by the fallback.
 */
function parseInferenceInt(
  raw: string,
  { fallback, min, max = Number.MAX_SAFE_INTEGER }: { fallback: number; min: number; max?: number },
): number {
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed)) return fallback;
  return Math.min(max, Math.max(min, parsed));
}

/**
 * Reads the persisted settings from localStorage.
 *
 * Every field is type-checked individually and falls back to its default, so
 * missing storage, corrupt JSON, a non-object payload or a wrongly typed field
 * can never put an invalid value into the form state.
 */
function readStoredInferenceSettings(): InferenceSettings {
  try {
    const raw = localStorage.getItem(STORAGE_KEY);
    if (!raw) return INFERENCE_DEFAULTS;

    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      return INFERENCE_DEFAULTS;
    }

    const stored = parsed as Record<string, unknown>;
    const text = (value: unknown, fallback: string): string =>
      typeof value === 'string' ? value : fallback;
    const int = (value: unknown, fallback: number): number =>
      typeof value === 'number' && Number.isFinite(value) ? value : fallback;

    return {
      cacheTypeK: text(stored.cacheTypeK, INFERENCE_DEFAULTS.cacheTypeK),
      cacheReuse: int(stored.cacheReuse, INFERENCE_DEFAULTS.cacheReuse),
      specType: text(stored.specType, INFERENCE_DEFAULTS.specType),
      ctxCheckpoints: int(stored.ctxCheckpoints, INFERENCE_DEFAULTS.ctxCheckpoints),
      sleepIdleSeconds: int(stored.sleepIdleSeconds, INFERENCE_DEFAULTS.sleepIdleSeconds),
      metrics: typeof stored.metrics === 'boolean' ? stored.metrics : INFERENCE_DEFAULTS.metrics,
      slotSavePath: text(stored.slotSavePath, INFERENCE_DEFAULTS.slotSavePath),
    };
  } catch {
    // Unavailable or corrupt storage: start from the defaults.
    return INFERENCE_DEFAULTS;
  }
}

/** Icon + uppercase label row shown above each control of the inference form. */
function InferenceFieldLabel({
  icon: Icon,
  htmlFor,
  text,
}: {
  icon: typeof Database;
  htmlFor: string;
  text: string;
}) {
  return (
    <div className="flex items-center gap-2">
      <Icon className="size-3.5 text-muted-foreground" />
      <label htmlFor={htmlFor} className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
        {text}
      </label>
    </div>
  );
}

/**
 * Form for editing the inference settings.
 *
 * Settings are loaded from localStorage after mount, edited in local state,
 * and written back to localStorage by "Save" or "Reset to defaults". "Save"
 * is enabled only while the form differs from the last persisted values.
 *
 * NOTE(review): this block only reads and writes localStorage; nothing here
 * sends the settings to a server — confirm how the sidecar is meant to pick
 * them up.
 * NOTE(review): the visible part of the file renders the imported
 * `InferenceSettingsComponent`, not this local function — confirm that this
 * component is still referenced somewhere.
 */
function InferenceSettings() {
  const [settings, setSettings] = useState<InferenceSettings>(INFERENCE_DEFAULTS);
  // Last values known to be persisted; the baseline for the dirty check.
  const [saved, setSaved] = useState<InferenceSettings>(INFERENCE_DEFAULTS);
  const [loaded, setLoaded] = useState(false);

  useEffect(() => {
    const stored = readStoredInferenceSettings();
    setSettings(stored);
    setSaved(stored);
    setLoaded(true);
  }, []);

  const dirty =
    loaded &&
    (Object.keys(INFERENCE_DEFAULTS) as (keyof InferenceSettings)[]).some(
      (key) => settings[key] !== saved[key],
    );

  function update<K extends keyof InferenceSettings>(key: K, value: InferenceSettings[K]) {
    setSettings((prev) => ({ ...prev, [key]: value }));
  }

  /** Writes `next` to localStorage and reports the outcome with a toast. */
  function persist(next: InferenceSettings, successMessage: string, failureMessage: string) {
    try {
      localStorage.setItem(STORAGE_KEY, JSON.stringify(next));
      // Only advance the baseline once the write has actually succeeded.
      setSaved(next);
      toast.success(successMessage);
    } catch (err) {
      toast.error(err instanceof Error ? err.message : failureMessage);
    }
  }

  function save() {
    persist(settings, 'Inference settings saved. Restart sidecar to apply.', 'save failed');
  }

  function resetDefaults() {
    // The form is reset even if persisting the defaults fails.
    setSettings(INFERENCE_DEFAULTS);
    persist(INFERENCE_DEFAULTS, 'Reset to defaults', 'reset failed');
  }

  // Render nothing until the stored values have been read, so the form never
  // flashes the defaults before showing the persisted settings.
  if (!loaded) return null;

  return (
    <div className="space-y-6">
      <div className="space-y-1.5">
        <InferenceFieldLabel icon={Database} htmlFor="cache-type-k" text="KV Cache Quantization" />
        <select
          id="cache-type-k"
          value={settings.cacheTypeK}
          onChange={(e) => update('cacheTypeK', e.target.value)}
          className={INFERENCE_CONTROL_CLASS}
        >
          <option value="f32">f32 — 32-bit (max quality)</option>
          <option value="f16">f16 — 16-bit (balanced)</option>
          <option value="q8_0">q8_0 — 8-bit (efficient)</option>
          <option value="q4_0">q4_0 — 4-bit (max efficiency)</option>
        </select>
        <p className={INFERENCE_HINT_CLASS}>
          Compresses the attention cache. Lower = less VRAM usage.
        </p>
      </div>

      <div className="space-y-1.5">
        <InferenceFieldLabel icon={Zap} htmlFor="cache-reuse" text="Cache Reuse (Prompt Caching)" />
        <input
          id="cache-reuse"
          type="number"
          min={0}
          step={64}
          value={settings.cacheReuse}
          onChange={(e) => update('cacheReuse', parseInferenceInt(e.target.value, { fallback: 0, min: 0 }))}
          className={INFERENCE_CONTROL_CLASS}
        />
        <p className={INFERENCE_HINT_CLASS}>
          Minimum chunk size in tokens to reuse across turns. 0 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <InferenceFieldLabel icon={Zap} htmlFor="spec-type" text="Speculative Decoding" />
        <select
          id="spec-type"
          value={settings.specType}
          onChange={(e) => update('specType', e.target.value)}
          className={INFERENCE_CONTROL_CLASS}
        >
          <option value="off">Off</option>
          <option value="ngram-mod">ngram-mod — Lightweight (~16MB, no draft model)</option>
          <option value="draft-simple">draft-simple — Requires separate draft model</option>
        </select>
        <p className={INFERENCE_HINT_CLASS}>
          Predicts tokens ahead using a small model. Main model verifies in batch for 2-3x speedup.
        </p>
      </div>

      <div className="space-y-1.5">
        <InferenceFieldLabel icon={Database} htmlFor="ctx-checkpoints" text="Context Checkpoints" />
        <input
          id="ctx-checkpoints"
          type="number"
          min={0}
          max={256}
          value={settings.ctxCheckpoints}
          onChange={(e) =>
            update('ctxCheckpoints', parseInferenceInt(e.target.value, { fallback: 0, min: 0, max: 256 }))
          }
          className={INFERENCE_CONTROL_CLASS}
        />
        <p className={INFERENCE_HINT_CLASS}>
          Max checkpoints per slot. 0 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <InferenceFieldLabel icon={Clock} htmlFor="sleep-idle" text="Sleep Idle" />
        <input
          id="sleep-idle"
          type="number"
          min={-1}
          step={60}
          value={settings.sleepIdleSeconds}
          onChange={(e) =>
            // An empty field means "disabled" (-1); a typed 0 is kept as 0.
            update('sleepIdleSeconds', parseInferenceInt(e.target.value, { fallback: -1, min: -1 }))
          }
          className={INFERENCE_CONTROL_CLASS}
        />
        <p className={INFERENCE_HINT_CLASS}>
          Auto-sleep after N seconds idle. -1 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center justify-between gap-3">
          <InferenceFieldLabel icon={BarChart3} htmlFor="metrics" text="Metrics Endpoint" />
          <Switch
            id="metrics"
            checked={settings.metrics}
            onCheckedChange={(v) => update('metrics', v)}
          />
        </div>
        <p className={INFERENCE_HINT_CLASS}>
          Exposes Prometheus /metrics endpoint for observability.
        </p>
      </div>

      <div className="space-y-1.5">
        <InferenceFieldLabel icon={Folder} htmlFor="slot-save-path" text="Slot Save Path" />
        <input
          id="slot-save-path"
          type="text"
          value={settings.slotSavePath}
          onChange={(e) => update('slotSavePath', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring"
        />
        <p className={INFERENCE_HINT_CLASS}>
          Directory for disk-persistent KV cache. Must be writable.
        </p>
      </div>

      <div className="flex justify-between gap-2 border-t pt-4">
        <Button variant="outline" onClick={() => resetDefaults()}>
          Reset to defaults
        </Button>
        <Button onClick={() => save()} disabled={!dirty}>
          Save
        </Button>
      </div>

      <p className="text-xs text-muted-foreground border-t pt-4">
        Changes apply to new llama-server processes. Restart the sidecar to apply.
        These settings are stored locally in your browser.
      </p>
    </div>
  );
}
|
||||
|
||||
Reference in New Issue
Block a user