boocode/apps/web/src/components/InferenceSettings.tsx

import { useEffect, useState } from 'react';
import { toast } from 'sonner';
import { Button } from '@/components/ui/button';
import { Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';

/**
 * Shape of the inference configuration exchanged with `/api/settings/inference`
 * (read via GET, written via PATCH by the settings form in this file).
 *
 * Field descriptions below mirror the help text shown in the settings form.
 */
interface InferenceConfig {
  /** KV cache quantization format; the form offers `f32`, `f16`, `q8_0` and `q4_0`. */
  cache_type_k: string;
  /** Prompt-cache reuse setting (min chunk size in tokens); `0` is shown as disabled. */
  cache_reuse: number;
  /** Speculative decoding mode; the form offers `off`, `ngram-mod` and `draft-simple`. */
  spec_type: string;
  /** N-gram match threshold; only editable in the form while `spec_type` is `ngram-mod`. */
  spec_ngram_mod_thsh: number;
  /** Maximum context checkpoints per slot; `0` is shown as disabled. */
  ctx_checkpoints: number;
  /** Idle time in seconds before auto-sleep; the form documents `-1` as disabled. */
  sleep_idle_seconds: number;
  /** Whether the `/metrics` endpoint for Prometheus monitoring is enabled. */
  metrics_enabled: boolean;
  /** Directory used for the disk-persistent slot KV cache. */
  slot_save_path: string;
}

/**
 * Fallback configuration. The settings form copies these values into its state
 * when the initial GET of `/api/settings/inference` fails.
 */
const DEFAULTS: InferenceConfig = {
  cache_type_k: 'q4_0',
  cache_reuse: 256,
  spec_type: 'ngram-mod',
  spec_ngram_mod_thsh: 2,
  ctx_checkpoints: 32,
  // 600 seconds = 10 minutes (as stated in the form's help text).
  sleep_idle_seconds: 600,
  metrics_enabled: true,
  slot_save_path: '/tmp/llama-slots',
};

/**
 * Small controlled toggle rendered as a `<button role="switch">`.
 *
 * @param checked - Current on/off state; drives `aria-checked` and the styling.
 * @param onCheckedChange - Called with the negated state when the button is clicked.
 * @param id - Optional DOM id forwarded to the underlying button.
 */
function Switch({ checked, onCheckedChange, id }: {
  checked: boolean;
  onCheckedChange: (v: boolean) => void;
  id?: string;
}) {
  // Only the trailing class of each element depends on the state.
  const trackTone = checked ? 'bg-primary' : 'bg-muted';
  const thumbShift = checked ? 'translate-x-[1.125rem]' : 'translate-x-0.5';

  const flip = () => onCheckedChange(!checked);

  return (
    <button
      id={id}
      type="button"
      role="switch"
      aria-checked={checked}
      onClick={flip}
      className={`relative inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full transition-colors ${trackTone}`}
    >
      <span
        className={`inline-block h-4 w-4 transform rounded-full bg-background transition-transform ${thumbShift}`}
      />
    </button>
  );
}

/** Placeholder rendered while the inference settings are still being fetched. */
function Loader() {
  const message = 'Loading inference settings...';
  return (
    <div className="text-sm text-muted-foreground py-8 text-center">{message}</div>
  );
}

/** Tailwind classes shared by every field label in the form. */
const FIELD_LABEL_CLASS = 'text-xs font-medium uppercase tracking-wide text-muted-foreground';

/** Tailwind classes shared by the helper text under each field. */
const FIELD_HINT_CLASS = 'text-xs text-muted-foreground/80';

/** Tailwind classes shared by selects and inputs; each control prepends its own width. */
const FIELD_CONTROL_CLASS =
  'bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring';

/**
 * Builds a complete config from an untrusted JSON payload.
 *
 * Fields absent from `data` are filled from `DEFAULTS`, so every form control
 * always receives a defined value (an `undefined` value would flip a React
 * input from controlled to uncontrolled). Fields that are present are kept
 * as received; no per-field type validation is performed here.
 *
 * @param data - Parsed response body of unknown shape.
 * @returns A config object with every `InferenceConfig` key present.
 */
function mergeWithDefaults(data: unknown): InferenceConfig {
  if (typeof data !== 'object' || data === null || Array.isArray(data)) {
    return { ...DEFAULTS };
  }
  return { ...DEFAULTS, ...(data as Partial<InferenceConfig>) };
}

/**
 * Settings form for the inference configuration.
 *
 * On mount it loads the config from `GET /api/settings/inference`; if that
 * fails it falls back to `DEFAULTS` and shows an error toast. Edits are kept
 * in local state and sent as a whole with `PATCH /api/settings/inference`
 * when the user presses "Save Settings"; the response body replaces the
 * local state.
 */
export function InferenceSettings() {
  const [config, setConfig] = useState<InferenceConfig | null>(null);
  const [loading, setLoading] = useState(true);
  const [saving, setSaving] = useState(false);

  useEffect(() => {
    // Abort the request on unmount so a late response cannot set state or
    // raise a toast for a component that is no longer on screen.
    const controller = new AbortController();

    async function load(): Promise<void> {
      try {
        const res = await fetch('/api/settings/inference', { signal: controller.signal });
        if (!res.ok) throw new Error(`HTTP ${res.status}`);
        const data: unknown = await res.json();
        if (controller.signal.aborted) return;
        setConfig(mergeWithDefaults(data));
      } catch {
        // An abort is a deliberate cancellation, not a load failure.
        if (controller.signal.aborted) return;
        setConfig({ ...DEFAULTS });
        toast.error('Could not load inference config — loading defaults');
      } finally {
        if (!controller.signal.aborted) setLoading(false);
      }
    }

    void load();
    return () => controller.abort();
  }, []);

  /** Replaces a single field of the config held in local state. */
  function update<K extends keyof InferenceConfig>(key: K, value: InferenceConfig[K]) {
    setConfig((prev) => (prev ? { ...prev, [key]: value } : prev));
  }

  /** Sends the current config to the server; ignored while a save is in flight. */
  async function save() {
    if (!config || saving) return;
    setSaving(true);
    try {
      const res = await fetch('/api/settings/inference', {
        method: 'PATCH',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(config),
      });
      if (!res.ok) throw new Error('Save failed');
      const updated: unknown = await res.json();
      // Same guard as on load: never let a partial response blank out a field.
      setConfig(mergeWithDefaults(updated));
      toast.success('Inference settings saved');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'Save failed');
    } finally {
      setSaving(false);
    }
  }

  if (loading) return <Loader />;
  if (!config) return <div className="text-sm text-destructive py-8 text-center">Failed to load</div>;

  // Every label below is tied to its control through htmlFor/id so that
  // assistive technology announces the field name and clicking the label
  // focuses (or, for the switch, toggles) the control.
  return (
    <div className="space-y-6">
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-cache-type-k" className={FIELD_LABEL_CLASS}>
            KV Cache Quantization
          </label>
        </div>
        <select
          id="inference-cache-type-k"
          value={config.cache_type_k}
          onChange={(e) => update('cache_type_k', e.target.value)}
          className={`w-full ${FIELD_CONTROL_CLASS}`}
        >
          <option value="f32">f32 (full precision)</option>
          <option value="f16">f16 (half)</option>
          <option value="q8_0">q8_0 (8-bit)</option>
          <option value="q4_0">q4_0 (4-bit) — recommended</option>
        </select>
        <p className={FIELD_HINT_CLASS}>
          Format for the attention KV cache. Lower = less VRAM. q4_0 gives ~4x savings.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-cache-reuse" className={FIELD_LABEL_CLASS}>
            Prompt Caching
          </label>
        </div>
        <div className="flex items-center gap-3">
          <input
            id="inference-cache-reuse"
            type="number"
            min={0}
            max={4096}
            value={config.cache_reuse}
            onChange={(e) => update('cache_reuse', Number(e.target.value))}
            className={`w-32 ${FIELD_CONTROL_CLASS}`}
          />
          <span className="text-xs text-muted-foreground">
            {config.cache_reuse > 0 ? 'On (min chunk size in tokens)' : 'Disabled'}
          </span>
        </div>
        <p className={FIELD_HINT_CLASS}>
          Reuses KV cache across turns when prompt prefix matches. 256 is a good default.
          0 = disabled. The local equivalent of prompt caching.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-spec-type" className={FIELD_LABEL_CLASS}>
            Speculative Decoding
          </label>
        </div>
        <select
          id="inference-spec-type"
          value={config.spec_type}
          onChange={(e) => update('spec_type', e.target.value)}
          className={`w-full ${FIELD_CONTROL_CLASS}`}
        >
          <option value="off">Off</option>
          <option value="ngram-mod">N-gram (lightweight, ~16MB)</option>
          <option value="draft-simple">Draft model (requires separate model)</option>
        </select>
        {config.spec_type === 'ngram-mod' && (
          <div className="mt-2 flex items-center gap-3">
            <input
              id="inference-spec-ngram-mod-thsh"
              type="number"
              min={1}
              max={10}
              value={config.spec_ngram_mod_thsh}
              onChange={(e) => update('spec_ngram_mod_thsh', Number(e.target.value))}
              className={`w-24 ${FIELD_CONTROL_CLASS}`}
            />
            <label htmlFor="inference-spec-ngram-mod-thsh" className="text-xs text-muted-foreground">
              Match threshold (2 = default)
            </label>
          </div>
        )}
        <p className={FIELD_HINT_CLASS}>
          Predicts tokens ahead with a small model; main model verifies in batch.
          2-3x speedup on repetitive/code tasks.
        </p>
      </div>

      <div className="space-y-1.5">
        <label htmlFor="inference-ctx-checkpoints" className={FIELD_LABEL_CLASS}>
          Context Checkpoints
        </label>
        <div className="flex items-center gap-3">
          <input
            id="inference-ctx-checkpoints"
            type="number"
            min={0}
            max={128}
            value={config.ctx_checkpoints}
            onChange={(e) => update('ctx_checkpoints', Number(e.target.value))}
            className={`w-24 ${FIELD_CONTROL_CLASS}`}
          />
          <span className="text-xs text-muted-foreground">
            {config.ctx_checkpoints > 0 ? `Max ${config.ctx_checkpoints} checkpoints per slot` : 'Disabled'}
          </span>
        </div>
        <p className={FIELD_HINT_CLASS}>
          Prevents context overflow on long conversations. Default: 32.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Clock className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-sleep-idle-seconds" className={FIELD_LABEL_CLASS}>
            Auto-sleep Timeout
          </label>
        </div>
        <div className="flex items-center gap-3">
          <input
            id="inference-sleep-idle-seconds"
            type="number"
            min={-1}
            max={86400}
            value={config.sleep_idle_seconds}
            onChange={(e) => update('sleep_idle_seconds', Number(e.target.value))}
            className={`w-24 ${FIELD_CONTROL_CLASS}`}
          />
          <span className="text-xs text-muted-foreground">seconds</span>
        </div>
        <p className={FIELD_HINT_CLASS}>
          GPU auto-sleeps after N seconds idle. -1 = disabled. 600 = 10 min.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <BarChart3 className="size-3.5 text-muted-foreground" />
            <label htmlFor="inference-metrics-enabled" className={FIELD_LABEL_CLASS}>
              Prometheus Metrics
            </label>
          </div>
          <Switch
            id="inference-metrics-enabled"
            checked={config.metrics_enabled}
            onCheckedChange={(v) => update('metrics_enabled', v)}
          />
        </div>
        <p className={FIELD_HINT_CLASS}>
          Enable /metrics endpoint for Prometheus monitoring (token rates, latency).
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Folder className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-slot-save-path" className={FIELD_LABEL_CLASS}>
            Slot KV Cache Path
          </label>
        </div>
        <input
          id="inference-slot-save-path"
          type="text"
          value={config.slot_save_path}
          onChange={(e) => update('slot_save_path', e.target.value)}
          className={`w-full font-mono ${FIELD_CONTROL_CLASS}`}
        />
        <p className={FIELD_HINT_CLASS}>
          Directory for disk-persistent KV cache. Idle slot caches are saved here.
        </p>
      </div>

      <div className="flex justify-end border-t pt-4">
        <Button onClick={() => void save()} disabled={saving}>
          {saving ? 'Saving...' : 'Save Settings'}
        </Button>
      </div>
    </div>
  );
}