Adds an Inference tab to SettingsPane with controls for temperature, top-p, top-k, min-p, and other inference parameters. Also wires the server-side route and provider config to pass the overrides through the inference pipeline.
272 lines
10 KiB
TypeScript
272 lines
10 KiB
TypeScript
import { useEffect, useState } from 'react';
|
|
import { toast } from 'sonner';
|
|
import { Button } from '@/components/ui/button';
|
|
import { Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';
|
|
|
|
/**
 * Inference settings as read from and written to `/api/settings/inference`
 * and edited by the settings form in this file.
 *
 * Field meanings below are taken from the form's own option lists and help
 * text.
 */
interface InferenceConfig {
  /** KV cache quantization format; the form offers `f32`, `f16`, `q8_0` and `q4_0`. */
  cache_type_k: string;
  /** Prompt-cache reuse: minimum chunk size in tokens; `0` means disabled. */
  cache_reuse: number;
  /** Speculative decoding mode; the form offers `off`, `ngram-mod` and `draft-simple`. */
  spec_type: string;
  /** Match threshold, only editable while `spec_type` is `ngram-mod`. */
  spec_ngram_mod_thsh: number;
  /** Maximum number of context checkpoints per slot; `0` means disabled. */
  ctx_checkpoints: number;
  /** Idle time in seconds before auto-sleep; `-1` means disabled. */
  sleep_idle_seconds: number;
  /** Whether the Prometheus `/metrics` endpoint is enabled. */
  metrics_enabled: boolean;
  /** Directory in which idle slot KV caches are saved. */
  slot_save_path: string;
}
|
|
|
|
/**
 * Fallback configuration used by the settings form when the saved
 * configuration cannot be loaded from the server.
 */
const DEFAULTS: InferenceConfig = {
  // KV cache / prompt caching
  cache_type_k: 'q4_0',
  cache_reuse: 256,

  // Speculative decoding
  spec_type: 'ngram-mod',
  spec_ngram_mod_thsh: 2,

  // Context checkpoints and idle handling
  ctx_checkpoints: 32,
  sleep_idle_seconds: 600, // 10 minutes

  // Monitoring and persistence
  metrics_enabled: true,
  slot_save_path: '/tmp/llama-slots',
};
|
|
|
|
/**
 * Minimal on/off toggle rendered as a `<button role="switch">`.
 *
 * The component is fully controlled: it never stores its own state and only
 * reports the inverted value through `onCheckedChange` when clicked.
 *
 * @param props.checked Current state, mirrored into `aria-checked`.
 * @param props.onCheckedChange Called with the negated `checked` value on click.
 * @param props.id Optional DOM id forwarded to the button.
 */
function Switch(props: {
  checked: boolean;
  onCheckedChange: (v: boolean) => void;
  id?: string;
}) {
  const { checked, onCheckedChange, id } = props;

  // Only the track colour and the thumb offset depend on the state.
  const trackTone = checked ? 'bg-primary' : 'bg-muted';
  const thumbOffset = checked ? 'translate-x-[1.125rem]' : 'translate-x-0.5';

  const toggle = () => onCheckedChange(!checked);

  return (
    <button
      id={id}
      type="button"
      role="switch"
      aria-checked={checked}
      onClick={toggle}
      className={`relative inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full transition-colors ${trackTone}`}
    >
      <span
        className={`inline-block h-4 w-4 transform rounded-full bg-background transition-transform ${thumbOffset}`}
      />
    </button>
  );
}
|
|
|
|
/** Centered placeholder rendered while the inference settings are loading. */
function Loader() {
  const message = 'Loading inference settings...';
  return (
    <div className="text-sm text-muted-foreground py-8 text-center">{message}</div>
  );
}
|
|
|
|
/**
 * Inclusive bounds for every numeric setting. The same table feeds the
 * `min`/`max` attributes of the inputs and the clamp applied on blur and
 * before saving, so the two can never drift apart.
 */
const NUMERIC_LIMITS = {
  cache_reuse: { min: 0, max: 4096 },
  spec_ngram_mod_thsh: { min: 1, max: 10 },
  ctx_checkpoints: { min: 0, max: 128 },
  sleep_idle_seconds: { min: -1, max: 86400 },
} as const;

type NumericKey = keyof typeof NUMERIC_LIMITS;

const NUMERIC_KEYS = Object.keys(NUMERIC_LIMITS) as NumericKey[];

/**
 * Rounds `value` to an integer inside the bounds of `key`.
 * A non-finite value (NaN, ±Infinity, or a non-number that slipped through a
 * cast) falls back to the default for that key.
 */
function clampSetting(key: NumericKey, value: number): number {
  if (!Number.isFinite(value)) return DEFAULTS[key];
  const { min, max } = NUMERIC_LIMITS[key];
  return Math.min(max, Math.max(min, Math.round(value)));
}

/** Returns a copy of `config` with every numeric field clamped to its bounds. */
function normalizeConfig(config: InferenceConfig): InferenceConfig {
  const next: InferenceConfig = { ...config };
  for (const key of NUMERIC_KEYS) {
    next[key] = clampSetting(key, next[key]);
  }
  return next;
}

/**
 * Overlays a JSON payload received from the server on top of `base`, so a key
 * the server omitted keeps a defined value instead of leaving a controlled
 * input with `undefined`.
 *
 * @throws Error when the payload is not a plain JSON object.
 */
function mergeConfig(base: InferenceConfig, payload: unknown): InferenceConfig {
  if (typeof payload !== 'object' || payload === null || Array.isArray(payload)) {
    throw new Error('Malformed inference config');
  }
  return { ...base, ...(payload as Partial<InferenceConfig>) };
}

/**
 * Settings form for the inference configuration.
 *
 * On mount it loads the configuration from `GET /api/settings/inference`
 * (falling back to `DEFAULTS` with an error toast when that fails), lets the
 * user edit every field locally, and persists the result with
 * `PATCH /api/settings/inference` when "Save Settings" is pressed.
 */
export function InferenceSettings() {
  const [config, setConfig] = useState<InferenceConfig | null>(null);
  const [loading, setLoading] = useState(true);
  const [saving, setSaving] = useState(false);

  useEffect(() => {
    // Abort the request on unmount so no state update or toast fires for a
    // component that is gone (this also covers StrictMode's double mount).
    const controller = new AbortController();

    fetch('/api/settings/inference', { signal: controller.signal })
      .then((r) =>
        r.ok ? (r.json() as Promise<unknown>) : Promise.reject(new Error(`HTTP ${r.status}`)),
      )
      .then((data) => setConfig(mergeConfig(DEFAULTS, data)))
      .catch(() => {
        if (controller.signal.aborted) return;
        setConfig({ ...DEFAULTS });
        toast.error('Could not load inference config — loading defaults');
      })
      .finally(() => {
        if (!controller.signal.aborted) setLoading(false);
      });

    return () => controller.abort();
  }, []);

  /** Replaces a single field of the local configuration. */
  function update<K extends keyof InferenceConfig>(key: K, value: InferenceConfig[K]) {
    setConfig((prev) => (prev ? { ...prev, [key]: value } : prev));
  }

  /** Snaps one numeric field back into its allowed range. */
  function clampField(key: NumericKey) {
    setConfig((prev) => (prev ? { ...prev, [key]: clampSetting(key, prev[key]) } : prev));
  }

  /** Sends the (clamped) configuration to the server and adopts its answer. */
  async function save() {
    if (!config || saving) return;
    setSaving(true);
    // `min`/`max` on the inputs do not constrain typed values, so enforce
    // the bounds here before anything reaches the server.
    const payload = normalizeConfig(config);
    try {
      const res = await fetch('/api/settings/inference', {
        method: 'PATCH',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
      });
      if (!res.ok) throw new Error('Save failed');
      const body: unknown = await res.json();
      // Overlay on what was sent, so a partial response cannot blank out fields.
      setConfig(mergeConfig(payload, body));
      toast.success('Inference settings saved');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'Save failed');
    } finally {
      setSaving(false);
    }
  }

  if (loading) return <Loader />;
  if (!config) return <div className="text-sm text-destructive py-8 text-center">Failed to load</div>;

  const current: InferenceConfig = config;

  /** Renders the number input for `key`, bounded by `NUMERIC_LIMITS`. */
  const numberInput = (key: NumericKey, widthClass: string) => {
    const { min, max } = NUMERIC_LIMITS[key];
    return (
      <input
        id={`inference-${key}`}
        type="number"
        min={min}
        max={max}
        value={current[key]}
        onChange={(e) => update(key, Number(e.target.value))}
        onBlur={() => clampField(key)}
        className={`${widthClass} bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring`}
      />
    );
  };

  return (
    <div className="space-y-6">
      {/* KV cache quantization */}
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-cache_type_k"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            KV Cache Quantization
          </label>
        </div>
        <select
          id="inference-cache_type_k"
          value={current.cache_type_k}
          onChange={(e) => update('cache_type_k', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="f32">f32 (full precision)</option>
          <option value="f16">f16 (half)</option>
          <option value="q8_0">q8_0 (8-bit)</option>
          <option value="q4_0">q4_0 (4-bit) — recommended</option>
        </select>
        <p className="text-xs text-muted-foreground/80">
          Format for the attention KV cache. Lower = less VRAM. q4_0 gives ~4x savings.
        </p>
      </div>

      {/* Prompt caching */}
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-cache_reuse"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Prompt Caching
          </label>
        </div>
        <div className="flex items-center gap-3">
          {numberInput('cache_reuse', 'w-32')}
          <span className="text-xs text-muted-foreground">
            {current.cache_reuse > 0 ? 'On (min chunk size in tokens)' : 'Disabled'}
          </span>
        </div>
        <p className="text-xs text-muted-foreground/80">
          Reuses KV cache across turns when prompt prefix matches. 256 is a good default.
          0 = disabled. The local equivalent of prompt caching.
        </p>
      </div>

      {/* Speculative decoding */}
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-spec_type"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Speculative Decoding
          </label>
        </div>
        <select
          id="inference-spec_type"
          value={current.spec_type}
          onChange={(e) => update('spec_type', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="off">Off</option>
          <option value="ngram-mod">N-gram (lightweight, ~16MB)</option>
          <option value="draft-simple">Draft model (requires separate model)</option>
        </select>
        {current.spec_type === 'ngram-mod' && (
          <div className="mt-2 flex items-center gap-3">
            {numberInput('spec_ngram_mod_thsh', 'w-24')}
            <label htmlFor="inference-spec_ngram_mod_thsh" className="text-xs text-muted-foreground">
              Match threshold (2 = default)
            </label>
          </div>
        )}
        <p className="text-xs text-muted-foreground/80">
          Predicts tokens ahead with a small model; main model verifies in batch.
          2-3x speedup on repetitive/code tasks.
        </p>
      </div>

      {/* Context checkpoints */}
      <div className="space-y-1.5">
        <label
          htmlFor="inference-ctx_checkpoints"
          className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
        >
          Context Checkpoints
        </label>
        <div className="flex items-center gap-3">
          {numberInput('ctx_checkpoints', 'w-24')}
          <span className="text-xs text-muted-foreground">
            {current.ctx_checkpoints > 0 ? `Max ${current.ctx_checkpoints} checkpoints per slot` : 'Disabled'}
          </span>
        </div>
        <p className="text-xs text-muted-foreground/80">
          Prevents context overflow on long conversations. Default: 32.
        </p>
      </div>

      {/* Auto-sleep timeout */}
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Clock className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-sleep_idle_seconds"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Auto-sleep Timeout
          </label>
        </div>
        <div className="flex items-center gap-3">
          {numberInput('sleep_idle_seconds', 'w-24')}
          <span className="text-xs text-muted-foreground">seconds</span>
        </div>
        <p className="text-xs text-muted-foreground/80">
          GPU auto-sleeps after N seconds idle. -1 = disabled. 600 = 10 min.
        </p>
      </div>

      {/* Prometheus metrics */}
      <div className="space-y-1.5">
        <div className="flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <BarChart3 className="size-3.5 text-muted-foreground" />
            <label
              htmlFor="inference-metrics_enabled"
              className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
            >
              Prometheus Metrics
            </label>
          </div>
          <Switch
            id="inference-metrics_enabled"
            checked={current.metrics_enabled}
            onCheckedChange={(v) => update('metrics_enabled', v)}
          />
        </div>
        <p className="text-xs text-muted-foreground/80">
          Enable /metrics endpoint for Prometheus monitoring (token rates, latency).
        </p>
      </div>

      {/* Slot KV cache path */}
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Folder className="size-3.5 text-muted-foreground" />
          <label
            htmlFor="inference-slot_save_path"
            className="text-xs font-medium uppercase tracking-wide text-muted-foreground"
          >
            Slot KV Cache Path
          </label>
        </div>
        <input
          id="inference-slot_save_path"
          type="text"
          value={current.slot_save_path}
          onChange={(e) => update('slot_save_path', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground/80">
          Directory for disk-persistent KV cache. Idle slot caches are saved here.
        </p>
      </div>

      <div className="flex justify-end border-t pt-4">
        <Button onClick={() => void save()} disabled={saving}>
          {saving ? 'Saving...' : 'Save Settings'}
        </Button>
      </div>
    </div>
  );
}
|