// Settings panel for the inference server options exposed at
// '/api/settings/inference': loads the current config once, lets the user edit
// it field by field, and saves the whole object back with a PATCH.
//
// NOTE(review): in this view the file contains no JSX tags and no generic type
// arguments (see `return ( )`, `return ;`, `useState(null)` and the undeclared
// `K` below). Presumably they were lost before this copy was made - confirm
// against the original file before changing any logic here.
import { useEffect, useState } from 'react';
import { toast } from 'sonner';
import { Button } from '@/components/ui/button';
import { Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';

/**
 * Shape of the object exchanged with '/api/settings/inference' (read from the
 * GET response, sent as the PATCH body, and read back from the PATCH response).
 * Field descriptions below are taken from the help text rendered by this
 * component, not from the server.
 */
interface InferenceConfig {
  cache_type_k: string; // KV cache format; help text: "Lower = less VRAM"
  cache_reuse: number; // min chunk size in tokens for KV cache reuse; 0 is shown as 'Disabled'
  spec_type: string; // the threshold field below is only rendered when this is 'ngram-mod'
  spec_ngram_mod_thsh: number; // labelled "Match threshold (2 = default)"
  ctx_checkpoints: number; // max checkpoints per slot; 0 is shown as 'Disabled'
  sleep_idle_seconds: number; // help text: idle seconds before GPU auto-sleep, -1 = disabled
  metrics_enabled: boolean; // help text: enables the /metrics endpoint
  slot_save_path: string; // help text: directory for the disk-persistent KV cache
}

/**
 * Fallback configuration. A copy of this object becomes the editable state
 * when the initial load fails (see the `.catch` in InferenceSettings).
 */
const DEFAULTS: InferenceConfig = {
  cache_type_k: 'q4_0',
  cache_reuse: 256,
  spec_type: 'ngram-mod',
  spec_ngram_mod_thsh: 2,
  ctx_checkpoints: 32,
  sleep_idle_seconds: 600,
  metrics_enabled: true,
  slot_save_path: '/tmp/llama-slots',
};

/**
 * Controlled boolean toggle.
 *
 * @param checked current value
 * @param onCheckedChange callback that receives the new value
 * @param id optional id
 *
 * NOTE(review): the returned markup is missing in this view; `return ( )` is
 * an empty parenthesised expression and is not valid TypeScript. How `id` and
 * the two other props are wired to an element cannot be seen from here.
 */
function Switch({ checked, onCheckedChange, id }: {
  checked: boolean;
  onCheckedChange: (v: boolean) => void;
  id?: string;
}) {
  return ( );
}

/**
 * Placeholder that shows the text "Loading inference settings...".
 * NOTE(review): the wrapping element is missing in this view. Presumably this
 * is what InferenceSettings returns while `loading` is true - confirm.
 */
function Loader() { return
Loading inference settings...
; }

/**
 * Inference settings form.
 *
 * State:
 * - `config`: the editable settings, `null` until the initial request settles.
 * - `loading`: true until the initial request settles (success or failure).
 * - `saving`: true while a PATCH started by `save` is in flight.
 *
 * Numeric fields are written with `Number(e.target.value)` (see the handlers
 * further down), so an empty input string is stored as 0; for `cache_reuse`
 * and `ctx_checkpoints` the UI labels 0 as 'Disabled'.
 */
export function InferenceSettings() {
  // NOTE(review): with no type argument TypeScript infers the state type as
  // `null`, so `setConfig(data as InferenceConfig)` below would not type-check.
  // Presumably this was `useState<InferenceConfig | null>(null)` - confirm.
  const [config, setConfig] = useState(null);
  const [loading, setLoading] = useState(true);
  const [saving, setSaving] = useState(false);

  // Initial load. The empty dependency array means the effect is set up once
  // after mount. A non-OK response is turned into a rejection so that HTTP
  // errors, network errors and JSON parse errors all reach the same `.catch`.
  useEffect(() => {
    fetch('/api/settings/inference')
      .then((r) => (r.ok ? r.json() : Promise.reject()))
      // The response body is trusted as-is; the cast performs no runtime check.
      .then((data) => setConfig(data as InferenceConfig))
      .catch(() => {
        // Fall back to a copy of DEFAULTS so the form is still usable.
        // Saving afterwards sends these defaults to the server.
        setConfig({ ...DEFAULTS });
        toast.error('Could not load inference config — loading defaults');
      })
      .finally(() => setLoading(false));
  }, []);

  /**
   * Replaces one field of `config`, leaving the state untouched while it is
   * still `null`.
   *
   * NOTE(review): `K` is not declared anywhere in this view. Presumably the
   * signature was `update<K extends keyof InferenceConfig>(...)` - confirm.
   */
  function update(key: K, value: InferenceConfig[K]) {
    setConfig((prev) => (prev ? { ...prev, [key]: value } : prev));
  }

  /**
   * Sends the whole current `config` as a JSON PATCH body and replaces the
   * local state with the object the server returns. Does nothing when there
   * is no config yet or a save is already running. Any failure (non-OK status,
   * network error, unparsable response) is reported through an error toast;
   * nothing is rethrown to the caller.
   */
  async function save() {
    if (!config || saving) return;
    setSaving(true);
    try {
      const res = await fetch('/api/settings/inference', {
        method: 'PATCH',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(config),
      });
      if (!res.ok) throw new Error('Save failed');
      // The PATCH response is trusted to be the full updated config (cast only).
      const updated = (await res.json()) as InferenceConfig;
      setConfig(updated);
      toast.success('Inference settings saved');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'Save failed');
    } finally {
      // Always re-enable saving, whether the request succeeded or failed.
      setSaving(false);
    }
  }

  // NOTE(review): the element after `return` is missing in this view.
  if (loading) return ;
  // Both outcomes of the initial load set `config` (server data on success, a
  // copy of DEFAULTS on failure), so this branch is only reached when the
  // server answers OK with a falsy JSON body such as `null`.
  if (!config) return
Failed to load
; return (

Format for the attention KV cache. Lower = less VRAM. q4_0 gives ~4x savings.

update('cache_reuse', Number(e.target.value))} className="w-32 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" /> {config.cache_reuse > 0 ? 'On (min chunk size in tokens)' : 'Disabled'}

Reuses KV cache across turns when prompt prefix matches. 256 is a good default. 0 = disabled. The local equivalent of prompt caching.

{config.spec_type === 'ngram-mod' && (
update('spec_ngram_mod_thsh', Number(e.target.value))} className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" /> Match threshold (2 = default)
)}

Predicts tokens ahead with a small model; main model verifies in batch. 2-3x speedup on repetitive/code tasks.

update('ctx_checkpoints', Number(e.target.value))} className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" /> {config.ctx_checkpoints > 0 ? `Max ${config.ctx_checkpoints} checkpoints per slot` : 'Disabled'}

Prevents context overflow on long conversations. Default: 32.

update('sleep_idle_seconds', Number(e.target.value))} className="w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring" /> seconds

GPU auto-sleeps after N seconds idle. -1 = disabled. 600 = 10 min.

update('metrics_enabled', v)} />

Enable /metrics endpoint for Prometheus monitoring (token rates, latency).

update('slot_save_path', e.target.value)} className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring" />

Directory for disk-persistent KV cache. Idle slot caches are saved here.

); }