import { useEffect, useState } from 'react';
import { toast } from 'sonner';
import { Button } from '@/components/ui/button';
import { Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';

/**
 * Shape of the inference settings object used in this file.
 *
 * Field meanings below are taken from the help text that appears later in
 * this file; the pairing of each text with a field is inferred from field
 * names and default values — TODO confirm against the backend schema.
 */
interface InferenceConfig {
  /** Presumably the storage format of the attention KV cache (e.g. 'q4_0'). */
  cache_type_k: string;
  /** Presumably the KV-cache reuse setting across turns; help text says 0 = disabled. */
  cache_reuse: number;
  /** Presumably selects the speculative-decoding strategy (e.g. 'ngram-mod'). */
  spec_type: string;
  /** NOTE(review): looks like a threshold for the 'ngram-mod' strategy — semantics not shown here. */
  spec_ngram_mod_thsh: number;
  /** Presumably the number of context checkpoints kept. */
  ctx_checkpoints: number;
  /** Presumably idle seconds before auto-sleep; help text says -1 = disabled. */
  sleep_idle_seconds: number;
  /** Presumably toggles the /metrics endpoint mentioned in the help text. */
  metrics_enabled: boolean;
  /** Presumably the directory where slot caches are saved. */
  slot_save_path: string;
}

/** Default value for every InferenceConfig field. */
const DEFAULTS: InferenceConfig = {
  cache_type_k: 'q4_0',
  cache_reuse: 256,
  spec_type: 'ngram-mod',
  spec_ngram_mod_thsh: 2,
  ctx_checkpoints: 32,
  sleep_idle_seconds: 600, // 600 s = 10 min
  metrics_enabled: true,
  slot_save_path: '/tmp/llama-slots',
};

/**
 * Local Switch component.
 *
 * Props:
 * - `checked`: current boolean state.
 * - `onCheckedChange`: callback that receives a boolean.
 * - `id`: optional string identifier.
 *
 * NOTE(review): the returned markup is empty in this view — confirm the
 * intended element.
 */
function Switch({ checked, onCheckedChange, id }: {
  checked: boolean;
  onCheckedChange: (v: boolean) => void;
  id?: string;
}) {
  return ( );
}

// NOTE(review): Loader's body is not fully visible here; presumably a loading placeholder — confirm.
function Loader() { return
Format for the attention KV cache. Lower precision = less VRAM. q4_0 gives ~4x savings.
Reuses KV cache across turns when prompt prefix matches. 256 is a good default. 0 = disabled. The local equivalent of prompt caching.
Predicts tokens ahead with a small model; main model verifies in batch. 2-3x speedup on repetitive/code tasks.
Prevents context overflow on long conversations. Default: 32.
GPU auto-sleeps after N seconds idle. -1 = disabled. 600 = 10 min.
Enable /metrics endpoint for Prometheus monitoring (token rates, latency).
Directory for disk-persistent KV cache. Idle slot caches are saved here.