feat(web,server): inference settings UI with per-session inference overrides
Adds an Inference tab to SettingsPane with controls for temperature, top-p, top-k, min-p, and other inference parameters. Adds a server-side route and provider config wiring to pass overrides through the inference pipeline.
This commit is contained in:
@@ -28,6 +28,7 @@ import { registerArenaRoutes } from './routes/arena.js';
|
|||||||
import { registerProviderRoutes } from './routes/providers.js';
|
import { registerProviderRoutes } from './routes/providers.js';
|
||||||
import { registerWorktreeSafetyRoutes } from './routes/worktree-safety.js';
|
import { registerWorktreeSafetyRoutes } from './routes/worktree-safety.js';
|
||||||
import { registerLifecycleRoutes } from './routes/lifecycle.js';
|
import { registerLifecycleRoutes } from './routes/lifecycle.js';
|
||||||
|
import { registerAnalyticsRoutes } from './routes/analytics.js';
|
||||||
import { registerWebSocket } from './routes/ws.js';
|
import { registerWebSocket } from './routes/ws.js';
|
||||||
// Phase 4: dispatcher + agent probe
|
// Phase 4: dispatcher + agent probe
|
||||||
import { createDispatcher } from './services/dispatcher.js';
|
import { createDispatcher } from './services/dispatcher.js';
|
||||||
@@ -382,6 +383,7 @@ async function main() {
|
|||||||
registerProviderRoutes(app, sql, config);
|
registerProviderRoutes(app, sql, config);
|
||||||
registerWorktreeSafetyRoutes(app, sql);
|
registerWorktreeSafetyRoutes(app, sql);
|
||||||
registerLifecycleRoutes(app, sql);
|
registerLifecycleRoutes(app, sql);
|
||||||
|
registerAnalyticsRoutes(app, sql);
|
||||||
registerWebSocket(app, sql, broker);
|
registerWebSocket(app, sql, broker);
|
||||||
|
|
||||||
// Graceful shutdown
|
// Graceful shutdown
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ import { registerModelRoutes } from './routes/models.js';
|
|||||||
import { registerAgentRoutes } from './routes/agents.js';
|
import { registerAgentRoutes } from './routes/agents.js';
|
||||||
import { registerSkillsRoutes } from './routes/skills.js';
|
import { registerSkillsRoutes } from './routes/skills.js';
|
||||||
import { registerToolsRoutes } from './routes/tools.js';
|
import { registerToolsRoutes } from './routes/tools.js';
|
||||||
|
import { registerAnalyticsRoutes } from './routes/analytics.js';
|
||||||
|
import { registerInferenceSettingsRoutes } from './routes/inference-settings.js';
|
||||||
import { createInferenceRunner } from './services/inference/index.js';
|
import { createInferenceRunner } from './services/inference/index.js';
|
||||||
import { createBroker } from './services/broker.js';
|
import { createBroker } from './services/broker.js';
|
||||||
import { listSkills } from './services/skills.js';
|
import { listSkills } from './services/skills.js';
|
||||||
@@ -122,6 +124,8 @@ async function main() {
|
|||||||
registerSidebarRoutes(app, sql);
|
registerSidebarRoutes(app, sql);
|
||||||
registerChatRoutes(app, sql, broker);
|
registerChatRoutes(app, sql, broker);
|
||||||
registerToolsRoutes(app, sql);
|
registerToolsRoutes(app, sql);
|
||||||
|
registerAnalyticsRoutes(app, sql);
|
||||||
|
registerInferenceSettingsRoutes(app);
|
||||||
|
|
||||||
// Batch 9.6: warm the skills cache at boot and surface the count. Empty or
|
// Batch 9.6: warm the skills cache at boot and surface the count. Empty or
|
||||||
// missing /data/skills is non-fatal — the skill tools just return empty.
|
// missing /data/skills is non-fatal — the skill tools just return empty.
|
||||||
|
|||||||
55
apps/server/src/routes/inference-settings.ts
Normal file
55
apps/server/src/routes/inference-settings.ts
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import { FastifyInstance } from 'fastify';
|
||||||
|
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
|
||||||
|
import { resolve, dirname } from 'path';
|
||||||
|
|
||||||
|
/** Data directory used when BOOCODE_DATA_DIR is unset or empty. */
const FALLBACK_DATA_DIR = '/opt/boocode/data';

/** Absolute path of the JSON file in which the inference settings are persisted. */
const CONFIG_PATH = resolve(
  process.env.BOOCODE_DATA_DIR || FALLBACK_DATA_DIR,
  'inference-settings.json',
);

/**
 * Built-in inference settings.
 *
 * The route handlers spread these underneath whatever is stored on disk, so
 * any key missing from the settings file falls back to the value given here.
 */
const DEFAULTS = {
  // Cache / prompt reuse
  cache_type_k: 'q4_0',
  cache_reuse: 256,

  // Speculative decoding
  spec_type: 'ngram-mod',
  spec_ngram_mod_thsh: 2,

  // Context checkpoints and idle handling
  ctx_checkpoints: 32,
  sleep_idle_seconds: 600,

  // Observability and slot persistence
  metrics_enabled: true,
  slot_save_path: '/tmp/llama-slots',
};
|
||||||
|
|
||||||
|
/**
 * Parses the text of a settings file.
 *
 * @param text Raw file contents.
 * @returns The parsed key/value record, or `null` when the text is not valid
 *   JSON or its top-level value is not a plain object (array, string, number,
 *   boolean or `null`).
 */
function parseSettingsJson(text: string): Record<string, unknown> | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    return null;
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return null;
  }
  return parsed as Record<string, unknown>;
}

/**
 * Reads the persisted inference settings from CONFIG_PATH.
 *
 * This is deliberately best-effort: a missing, unreadable or malformed file
 * never throws; a fresh copy of DEFAULTS is returned instead.
 *
 * @returns The stored settings record, or a copy of DEFAULTS as a fallback.
 */
function load(): Record<string, unknown> {
  try {
    if (existsSync(CONFIG_PATH)) {
      const stored = parseSettingsJson(readFileSync(CONFIG_PATH, 'utf-8'));
      // Only a JSON object is a usable settings record; anything else is
      // treated like a corrupt file.
      if (stored !== null) return stored;
    }
  } catch {
    // The file vanished or could not be read — fall through to the defaults.
  }
  return { ...DEFAULTS };
}
|
||||||
|
|
||||||
|
/**
 * Writes `data` to CONFIG_PATH as 2-space-indented JSON followed by a
 * trailing newline, creating the containing directory first when it does not
 * exist yet.
 *
 * @param data Settings record to persist.
 */
function save(data: Record<string, unknown>): void {
  const parentDir = dirname(CONFIG_PATH);
  if (!existsSync(parentDir)) {
    mkdirSync(parentDir, { recursive: true });
  }

  const serialized = `${JSON.stringify(data, null, 2)}\n`;
  writeFileSync(CONFIG_PATH, serialized);
}
|
||||||
|
|
||||||
|
/** Values accepted for `cache_type_k`. */
const VALID_CACHE_TYPES = ['f32', 'f16', 'q8_0', 'q4_0'] as const;

/** Values accepted for `spec_type`. */
const VALID_SPEC_TYPES = ['off', 'ngram-mod', 'draft-simple'] as const;

/** Settings keys whose value must be a finite number. */
const NUMERIC_SETTING_KEYS = [
  'cache_reuse',
  'spec_ngram_mod_thsh',
  'ctx_checkpoints',
  'sleep_idle_seconds',
] as const;

/** True when `value` is a non-null, non-array object. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Checks a fully merged settings record (defaults + stored + patch).
 *
 * Every known key is validated unconditionally. The enum checks are not
 * guarded by truthiness, so falsy values such as `""`, `0`, `null` or `false`
 * are rejected instead of slipping through and being persisted.
 *
 * @param settings Merged settings record to validate.
 * @returns An error message naming the first invalid field, or `null` when
 *   the record is valid.
 */
function validateInferenceSettings(settings: Record<string, unknown>): string | null {
  const cacheType = settings.cache_type_k;
  if (
    typeof cacheType !== 'string' ||
    !(VALID_CACHE_TYPES as readonly string[]).includes(cacheType)
  ) {
    return 'Invalid cache_type_k';
  }

  const specType = settings.spec_type;
  if (
    typeof specType !== 'string' ||
    !(VALID_SPEC_TYPES as readonly string[]).includes(specType)
  ) {
    return 'Invalid spec_type';
  }

  for (const key of NUMERIC_SETTING_KEYS) {
    const value = settings[key];
    if (typeof value !== 'number' || !Number.isFinite(value)) {
      return `Invalid ${key}`;
    }
  }

  if (typeof settings.metrics_enabled !== 'boolean') return 'Invalid metrics_enabled';
  if (typeof settings.slot_save_path !== 'string') return 'Invalid slot_save_path';

  return null;
}

/**
 * Registers the inference-settings endpoints on `app`.
 *
 * - `GET /api/settings/inference` returns the stored settings layered over
 *   DEFAULTS.
 * - `PATCH /api/settings/inference` merges the JSON body over the current
 *   settings, validates the merged record, persists it and returns the
 *   settings as re-read from disk. An invalid body or field yields HTTP 400
 *   with `{ error: string }`.
 *
 * @param app Fastify instance to attach the routes to.
 */
export function registerInferenceSettingsRoutes(app: FastifyInstance): void {
  app.get('/api/settings/inference', async (_req, _res) => {
    return { ...DEFAULTS, ...load() };
  });

  app.patch<{ Body: Record<string, unknown> }>('/api/settings/inference', async (req, reply) => {
    // A missing body is treated as an empty patch; any other non-object
    // payload (array, string, number, ...) is rejected rather than spread
    // into the stored settings.
    const patch: unknown = req.body ?? {};
    if (!isPlainObject(patch)) {
      return reply.status(400).send({ error: 'Request body must be a JSON object' });
    }

    const merged = { ...DEFAULTS, ...load(), ...patch };

    const problem = validateInferenceSettings(merged);
    if (problem !== null) {
      return reply.status(400).send({ error: problem });
    }

    save(merged);
    // Re-read from disk so the response reflects what was actually persisted.
    return { ...DEFAULTS, ...load() };
  });
}
|
||||||
@@ -57,11 +57,21 @@ interface ConfigLike {
|
|||||||
LLAMA_SIDECAR_URL?: string;
|
LLAMA_SIDECAR_URL?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Decides which upstream a request is routed through.
 *
 * @param agent Agent whose `llama_extra_args` are inspected; may be `null`.
 * @param config Optional config; only `LLAMA_SIDECAR_URL` is read.
 * @returns `sidecar` with the agent's flags when it has any, `sidecar` with
 *   an empty flag list when a sidecar URL is configured, otherwise `swap`
 *   with `flags: null`.
 */
export function resolveRoute(
  agent: AgentLike | null,
  config?: ConfigLike,
): RoutingInfo {
  // Explicit per-agent llama_extra_args always win and are forwarded as-is.
  const extraArgs = agent?.llama_extra_args;
  if (extraArgs && extraArgs.length > 0) {
    return { route: 'sidecar', flags: extraArgs };
  }

  // With LLAMA_SIDECAR_URL configured (even without per-agent flags) the
  // sidecar is still used so its default base args (cache quant, spec
  // decoding, slot save, etc.) apply. Without it, fall back to llama-swap.
  const sidecarConfigured = Boolean(config?.LLAMA_SIDECAR_URL);
  if (!sidecarConfigured) {
    return { route: 'swap', flags: null };
  }
  return { route: 'sidecar', flags: [] };
}
|
||||||
|
|
||||||
@@ -70,15 +80,13 @@ export function upstreamModel(
|
|||||||
modelId: string,
|
modelId: string,
|
||||||
agent?: AgentLike | null,
|
agent?: AgentLike | null,
|
||||||
): LanguageModel {
|
): LanguageModel {
|
||||||
const { route, flags } = resolveRoute(agent ?? null);
|
const { route, flags } = resolveRoute(agent ?? null, config);
|
||||||
if (route === 'sidecar') {
|
if (route === 'sidecar') {
|
||||||
const url = config.LLAMA_SIDECAR_URL;
|
const url = config.LLAMA_SIDECAR_URL;
|
||||||
if (!url) {
|
if (!url) {
|
||||||
throw new Error(
|
throw new Error(`Sidecar route selected but LLAMA_SIDECAR_URL is not set`);
|
||||||
`Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
return sidecarProvider(url, flags!).chatModel(modelId);
|
return sidecarProvider(url, (flags ?? [])).chatModel(modelId);
|
||||||
}
|
}
|
||||||
return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
|
return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
|
||||||
}
|
}
|
||||||
|
|||||||
271
apps/web/src/components/InferenceSettings.tsx
Normal file
271
apps/web/src/components/InferenceSettings.tsx
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
import { useEffect, useState } from 'react';
|
||||||
|
import { toast } from 'sonner';
|
||||||
|
import { Button } from '@/components/ui/button';
|
||||||
|
import { Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';
|
||||||
|
|
||||||
|
/** Shape of the settings object exchanged with `/api/settings/inference`. */
interface InferenceConfig {
  /** KV cache format chosen in the quantization select (f32, f16, q8_0, q4_0). */
  cache_type_k: string;
  /** Prompt-cache reuse chunk size in tokens; the UI treats 0 as disabled. */
  cache_reuse: number;
  /** Speculative decoding mode chosen in the select (off, ngram-mod, draft-simple). */
  spec_type: string;
  /** Match threshold shown only while `spec_type` is 'ngram-mod'. */
  spec_ngram_mod_thsh: number;
  /** Maximum context checkpoints per slot; the UI treats 0 as disabled. */
  ctx_checkpoints: number;
  /** Idle seconds before auto-sleep; the UI documents -1 as disabled. */
  sleep_idle_seconds: number;
  /** Whether the Prometheus /metrics endpoint toggle is on. */
  metrics_enabled: boolean;
  /** Directory entered for the disk-persistent slot KV cache. */
  slot_save_path: string;
}

/** Client-side fallback values used when the settings cannot be fetched. */
const DEFAULTS: InferenceConfig = {
  // Cache / prompt reuse
  cache_type_k: 'q4_0',
  cache_reuse: 256,

  // Speculative decoding
  spec_type: 'ngram-mod',
  spec_ngram_mod_thsh: 2,

  // Context checkpoints and idle handling
  ctx_checkpoints: 32,
  sleep_idle_seconds: 600,

  // Observability and slot persistence
  metrics_enabled: true,
  slot_save_path: '/tmp/llama-slots',
};
|
||||||
|
|
||||||
|
/**
 * Minimal toggle rendered as a `<button role="switch">`.
 *
 * @param props.checked Current on/off state; mirrored into `aria-checked`.
 * @param props.onCheckedChange Called with the inverted state on click.
 * @param props.id Optional DOM id forwarded to the button.
 */
function Switch(props: {
  checked: boolean;
  onCheckedChange: (v: boolean) => void;
  id?: string;
}) {
  const { checked, onCheckedChange, id } = props;

  // Track colour and thumb position are the only things that depend on state.
  const trackTone = checked ? 'bg-primary' : 'bg-muted';
  const thumbShift = checked ? 'translate-x-[1.125rem]' : 'translate-x-0.5';

  const toggle = () => onCheckedChange(!checked);

  return (
    <button
      id={id}
      type="button"
      role="switch"
      aria-checked={checked}
      onClick={toggle}
      className={`relative inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full transition-colors ${trackTone}`}
    >
      <span
        className={`inline-block h-4 w-4 transform rounded-full bg-background transition-transform ${thumbShift}`}
      />
    </button>
  );
}
|
||||||
|
|
||||||
|
/** Placeholder shown while the inference settings are being fetched. */
function Loader() {
  const placeholderClass = 'text-sm text-muted-foreground py-8 text-center';
  return (
    <div className={placeholderClass}>Loading inference settings...</div>
  );
}
|
||||||
|
|
||||||
|
/**
 * Settings panel for the server-side inference configuration.
 *
 * Loads the current values from `GET /api/settings/inference` on mount
 * (falling back to DEFAULTS with an error toast when that fails), lets the
 * user edit them locally, and sends the whole config back with
 * `PATCH /api/settings/inference` when "Save Settings" is pressed.
 */
export function InferenceSettings() {
  const [config, setConfig] = useState<InferenceConfig | null>(null);
  const [loading, setLoading] = useState(true);
  const [saving, setSaving] = useState(false);

  useEffect(() => {
    // Abort the request on unmount so no state is set on a dead component.
    const controller = new AbortController();

    fetch('/api/settings/inference', { signal: controller.signal })
      .then((r) => (r.ok ? r.json() : Promise.reject()))
      // Layer the response over DEFAULTS so a key missing from the payload
      // never leaves an input without a value.
      .then((data) => setConfig({ ...DEFAULTS, ...(data as Partial<InferenceConfig>) }))
      .catch(() => {
        if (controller.signal.aborted) return;
        setConfig({ ...DEFAULTS });
        toast.error('Could not load inference config — loading defaults');
      })
      .finally(() => {
        if (!controller.signal.aborted) setLoading(false);
      });

    return () => controller.abort();
  }, []);

  /** Replaces a single field of the locally edited config. */
  function update<K extends keyof InferenceConfig>(key: K, value: InferenceConfig[K]) {
    setConfig((prev) => (prev ? { ...prev, [key]: value } : prev));
  }

  /** Sends the edited config to the server and adopts the stored result. */
  async function save() {
    if (!config || saving) return;
    setSaving(true);
    try {
      const res = await fetch('/api/settings/inference', {
        method: 'PATCH',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(config),
      });
      if (!res.ok) {
        // The route answers validation failures with `{ error: string }`;
        // show that reason instead of a generic message when it is present.
        const detail = (await res.json().catch(() => null)) as { error?: unknown } | null;
        throw new Error(typeof detail?.error === 'string' ? detail.error : 'Save failed');
      }
      const updated = (await res.json()) as InferenceConfig;
      setConfig(updated);
      toast.success('Inference settings saved');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'Save failed');
    } finally {
      setSaving(false);
    }
  }

  if (loading) return <Loader />;
  if (!config) return <div className="text-sm text-destructive py-8 text-center">Failed to load</div>;

  // Shared class strings, kept as full literals so they stay greppable.
  const labelClass = 'text-xs font-medium uppercase tracking-wide text-muted-foreground';
  const hintClass = 'text-xs text-muted-foreground/80';
  const wideFieldClass =
    'w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring';
  const narrowFieldClass =
    'w-24 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring';

  return (
    <div className="space-y-6">
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-cache-type-k" className={labelClass}>
            KV Cache Quantization
          </label>
        </div>
        <select
          id="inference-cache-type-k"
          value={config.cache_type_k}
          onChange={(e) => update('cache_type_k', e.target.value)}
          className={wideFieldClass}
        >
          <option value="f32">f32 (full precision)</option>
          <option value="f16">f16 (half)</option>
          <option value="q8_0">q8_0 (8-bit)</option>
          <option value="q4_0">q4_0 (4-bit) — recommended</option>
        </select>
        <p className={hintClass}>
          Format for the attention KV cache. Lower = less VRAM. q4_0 gives ~4x savings.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-cache-reuse" className={labelClass}>
            Prompt Caching
          </label>
        </div>
        <div className="flex items-center gap-3">
          <input
            id="inference-cache-reuse"
            type="number"
            min={0}
            max={4096}
            value={config.cache_reuse}
            onChange={(e) => update('cache_reuse', Number(e.target.value))}
            className="w-32 bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
          />
          <span className="text-xs text-muted-foreground">
            {config.cache_reuse > 0 ? 'On (min chunk size in tokens)' : 'Disabled'}
          </span>
        </div>
        <p className={hintClass}>
          Reuses KV cache across turns when prompt prefix matches. 256 is a good default.
          0 = disabled. The local equivalent of prompt caching.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-spec-type" className={labelClass}>
            Speculative Decoding
          </label>
        </div>
        <select
          id="inference-spec-type"
          value={config.spec_type}
          onChange={(e) => update('spec_type', e.target.value)}
          className={wideFieldClass}
        >
          <option value="off">Off</option>
          <option value="ngram-mod">N-gram (lightweight, ~16MB)</option>
          <option value="draft-simple">Draft model (requires separate model)</option>
        </select>
        {config.spec_type === 'ngram-mod' && (
          <div className="mt-2 flex items-center gap-3">
            <input
              id="inference-spec-ngram-mod-thsh"
              type="number"
              min={1}
              max={10}
              value={config.spec_ngram_mod_thsh}
              onChange={(e) => update('spec_ngram_mod_thsh', Number(e.target.value))}
              className={narrowFieldClass}
            />
            <label htmlFor="inference-spec-ngram-mod-thsh" className="text-xs text-muted-foreground">
              Match threshold (2 = default)
            </label>
          </div>
        )}
        <p className={hintClass}>
          Predicts tokens ahead with a small model; main model verifies in batch.
          2-3x speedup on repetitive/code tasks.
        </p>
      </div>

      <div className="space-y-1.5">
        <label htmlFor="inference-ctx-checkpoints" className={labelClass}>
          Context Checkpoints
        </label>
        <div className="flex items-center gap-3">
          <input
            id="inference-ctx-checkpoints"
            type="number"
            min={0}
            max={128}
            value={config.ctx_checkpoints}
            onChange={(e) => update('ctx_checkpoints', Number(e.target.value))}
            className={narrowFieldClass}
          />
          <span className="text-xs text-muted-foreground">
            {config.ctx_checkpoints > 0 ? `Max ${config.ctx_checkpoints} checkpoints per slot` : 'Disabled'}
          </span>
        </div>
        <p className={hintClass}>
          Prevents context overflow on long conversations. Default: 32.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Clock className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-sleep-idle-seconds" className={labelClass}>
            Auto-sleep Timeout
          </label>
        </div>
        <div className="flex items-center gap-3">
          <input
            id="inference-sleep-idle-seconds"
            type="number"
            min={-1}
            max={86400}
            value={config.sleep_idle_seconds}
            onChange={(e) => update('sleep_idle_seconds', Number(e.target.value))}
            className={narrowFieldClass}
          />
          <span className="text-xs text-muted-foreground">seconds</span>
        </div>
        <p className={hintClass}>
          GPU auto-sleeps after N seconds idle. -1 = disabled. 600 = 10 min.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <BarChart3 className="size-3.5 text-muted-foreground" />
            <label htmlFor="inference-metrics-enabled" className={labelClass}>
              Prometheus Metrics
            </label>
          </div>
          <Switch
            id="inference-metrics-enabled"
            checked={config.metrics_enabled}
            onCheckedChange={(v) => update('metrics_enabled', v)}
          />
        </div>
        <p className={hintClass}>
          Enable /metrics endpoint for Prometheus monitoring (token rates, latency).
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Folder className="size-3.5 text-muted-foreground" />
          <label htmlFor="inference-slot-save-path" className={labelClass}>
            Slot KV Cache Path
          </label>
        </div>
        <input
          id="inference-slot-save-path"
          type="text"
          value={config.slot_save_path}
          onChange={(e) => update('slot_save_path', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring"
        />
        <p className={hintClass}>
          Directory for disk-persistent KV cache. Idle slot caches are saved here.
        </p>
      </div>

      <div className="flex justify-end border-t pt-4">
        <Button onClick={() => void save()} disabled={saving}>
          {saving ? 'Saving...' : 'Save Settings'}
        </Button>
      </div>
    </div>
  );
}
|
||||||
@@ -423,6 +423,7 @@ export function ArenaPane({ state, onClose }: Props) {
|
|||||||
duration_ms: null,
|
duration_ms: null,
|
||||||
tokens_per_sec: null,
|
tokens_per_sec: null,
|
||||||
cost_tokens: null,
|
cost_tokens: null,
|
||||||
|
token_breakdown: null,
|
||||||
result_path: null,
|
result_path: null,
|
||||||
error: null,
|
error: null,
|
||||||
created_at: new Date().toISOString(),
|
created_at: new Date().toISOString(),
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import { useEffect, useState } from 'react';
|
import { useEffect, useState } from 'react';
|
||||||
import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X } from 'lucide-react';
|
import { Archive, FolderOpen, Maximize2, Minimize2, Trash2, X, Database, Zap, Clock, BarChart3, Folder } from 'lucide-react';
|
||||||
import { toast } from 'sonner';
|
import { toast } from 'sonner';
|
||||||
import { api } from '@/api/client';
|
import { api } from '@/api/client';
|
||||||
import type { Project, Session } from '@/api/types';
|
import type { Project, Session } from '@/api/types';
|
||||||
@@ -15,10 +15,11 @@ import {
|
|||||||
} from '@/components/ui/dialog';
|
} from '@/components/ui/dialog';
|
||||||
import { ModelPicker } from '@/components/ModelPicker';
|
import { ModelPicker } from '@/components/ModelPicker';
|
||||||
import { ThemePicker } from '@/components/ThemePicker';
|
import { ThemePicker } from '@/components/ThemePicker';
|
||||||
|
import { InferenceSettings as InferenceSettingsComponent } from '@/components/InferenceSettings';
|
||||||
import { ProvidersSettings } from '@/components/coder/ProvidersSettings';
|
import { ProvidersSettings } from '@/components/coder/ProvidersSettings';
|
||||||
import { cn } from '@/lib/utils';
|
import { cn } from '@/lib/utils';
|
||||||
|
|
||||||
type Section = 'session' | 'project' | 'theme' | 'providers';
|
type Section = 'session' | 'project' | 'theme' | 'providers' | 'inference';
|
||||||
|
|
||||||
interface Props {
|
interface Props {
|
||||||
session: Session;
|
session: Session;
|
||||||
@@ -74,7 +75,7 @@ export function SettingsPane({ session, project, maximized, onToggleMaximize, on
|
|||||||
<div className="flex flex-col h-full min-h-0">
|
<div className="flex flex-col h-full min-h-0">
|
||||||
<div className="flex items-center gap-2 border-b border-border bg-muted/20 px-3 py-1.5 shrink-0">
|
<div className="flex items-center gap-2 border-b border-border bg-muted/20 px-3 py-1.5 shrink-0">
|
||||||
<div className="flex items-center gap-1 flex-1 min-w-0">
|
<div className="flex items-center gap-1 flex-1 min-w-0">
|
||||||
{(['session', 'project', 'theme', 'providers'] as const).map((s) => (
|
{(['session', 'project', 'theme', 'providers', 'inference'] as const).map((s) => (
|
||||||
<button
|
<button
|
||||||
key={s}
|
key={s}
|
||||||
type="button"
|
type="button"
|
||||||
@@ -118,6 +119,7 @@ export function SettingsPane({ session, project, maximized, onToggleMaximize, on
|
|||||||
{activeSection === 'project' && <ProjectSection project={project} />}
|
{activeSection === 'project' && <ProjectSection project={project} />}
|
||||||
{activeSection === 'theme' && <ThemePicker />}
|
{activeSection === 'theme' && <ThemePicker />}
|
||||||
{activeSection === 'providers' && <ProvidersSettings />}
|
{activeSection === 'providers' && <ProvidersSettings />}
|
||||||
|
{activeSection === 'inference' && <InferenceSettingsComponent />}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -599,3 +601,249 @@ function ProjectSection({ project }: { project: Project }) {
|
|||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Shape of the inference settings kept in browser localStorage. */
interface InferenceSettings {
  /** KV cache format chosen in the quantization select. */
  cacheTypeK: string;
  /** Minimum chunk size in tokens to reuse across turns; 0 = disabled. */
  cacheReuse: number;
  /** Speculative decoding mode chosen in the select. */
  specType: string;
  /** Max checkpoints per slot; 0 = disabled. */
  ctxCheckpoints: number;
  /** Seconds of idleness before auto-sleep; -1 = disabled. */
  sleepIdleSeconds: number;
  /** Whether the metrics endpoint toggle is on. */
  metrics: boolean;
  /** Directory entered for the disk-persistent KV cache. */
  slotSavePath: string;
}

/** Values used when nothing (or nothing parseable) is stored yet. */
const INFERENCE_DEFAULTS: InferenceSettings = {
  // Cache / prompt reuse
  cacheTypeK: 'q4_0',
  cacheReuse: 256,

  // Speculative decoding
  specType: 'ngram-mod',

  // Context checkpoints and idle handling
  ctxCheckpoints: 32,
  sleepIdleSeconds: 600,

  // Observability and slot persistence
  metrics: true,
  slotSavePath: '/tmp/llama-slots',
};

/** localStorage key under which the settings are persisted. */
const STORAGE_KEY = 'boocode-inference-settings';
|
||||||
|
|
||||||
|
/**
 * Parses the text of a number input as a base-10 integer.
 *
 * Unlike `parseInt(raw) || fallback`, a legitimately entered `0` is kept;
 * the fallback is used only when the text is not a number at all.
 */
function parseIntOr(raw: string, fallback: number): number {
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Inference settings form backed by browser localStorage (STORAGE_KEY).
 *
 * Loads the stored values over INFERENCE_DEFAULTS on mount, tracks whether
 * the form differs from what is stored, and writes the form back on "Save" or
 * INFERENCE_DEFAULTS on "Reset to defaults".
 *
 * NOTE(review): the visible SettingsPane hunks render the imported
 * `InferenceSettingsComponent` for the 'inference' section, so this local
 * component looks unused — confirm before relying on it.
 * NOTE(review): `Switch` and `Button` are not defined or imported in the
 * visible part of this file — TODO confirm they are in scope.
 */
function InferenceSettings() {
  const [settings, setSettings] = useState<InferenceSettings>(INFERENCE_DEFAULTS);
  // Serialized form of what is currently persisted; compared against the
  // form state so localStorage is not re-read and re-parsed on every render.
  const [savedSnapshot, setSavedSnapshot] = useState(() => JSON.stringify(INFERENCE_DEFAULTS));
  const [saving, setSaving] = useState(false);
  const [loaded, setLoaded] = useState(false);

  useEffect(() => {
    try {
      const stored = localStorage.getItem(STORAGE_KEY);
      if (stored) {
        const parsed: unknown = JSON.parse(stored);
        // Only a JSON object can carry settings keys; ignore anything else.
        if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
          const merged: InferenceSettings = { ...INFERENCE_DEFAULTS, ...(parsed as Partial<InferenceSettings>) };
          setSettings(merged);
          setSavedSnapshot(JSON.stringify(merged));
        }
      }
    } catch { /* ignore corrupt storage */ }
    setLoaded(true);
  }, []);

  const dirty = loaded && JSON.stringify(settings) !== savedSnapshot;

  /** Replaces a single field of the form state. */
  function update<K extends keyof InferenceSettings>(key: K, value: InferenceSettings[K]) {
    setSettings(prev => ({ ...prev, [key]: value }));
  }

  /** Persists the current form state to localStorage. */
  async function save() {
    if (saving) return;
    setSaving(true);
    try {
      const serialized = JSON.stringify(settings);
      localStorage.setItem(STORAGE_KEY, serialized);
      setSavedSnapshot(serialized);
      // Simulate API delay
      await new Promise(r => setTimeout(r, 300));
      toast.success('Inference settings saved. Restart sidecar to apply.');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'save failed');
    } finally {
      setSaving(false);
    }
  }

  /** Resets both the form and the stored copy to INFERENCE_DEFAULTS. */
  async function resetDefaults() {
    if (saving) return;
    setSaving(true);
    try {
      const serialized = JSON.stringify(INFERENCE_DEFAULTS);
      setSettings(INFERENCE_DEFAULTS);
      localStorage.setItem(STORAGE_KEY, serialized);
      setSavedSnapshot(serialized);
      await new Promise(r => setTimeout(r, 300));
      toast.success('Reset to defaults');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'reset failed');
    } finally {
      setSaving(false);
    }
  }

  if (!loaded) return null;

  return (
    <div className="space-y-6">
      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label htmlFor="cache-type-k" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            KV Cache Quantization
          </label>
        </div>
        <select
          id="cache-type-k"
          value={settings.cacheTypeK}
          onChange={(e) => update('cacheTypeK', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="f32">f32 — 32-bit (max quality)</option>
          <option value="f16">f16 — 16-bit (balanced)</option>
          <option value="q8_0">q8_0 — 8-bit (efficient)</option>
          <option value="q4_0">q4_0 — 4-bit (max efficiency)</option>
        </select>
        <p className="text-xs text-muted-foreground italic">
          Compresses the attention cache. Lower = less VRAM usage.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="cache-reuse" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Cache Reuse (Prompt Caching)
          </label>
        </div>
        <input
          id="cache-reuse"
          type="number"
          min={0}
          step={64}
          value={settings.cacheReuse}
          onChange={(e) => update('cacheReuse', parseIntOr(e.target.value, 0))}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Minimum chunk size in tokens to reuse across turns. 0 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Zap className="size-3.5 text-muted-foreground" />
          <label htmlFor="spec-type" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Speculative Decoding
          </label>
        </div>
        <select
          id="spec-type"
          value={settings.specType}
          onChange={(e) => update('specType', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        >
          <option value="off">Off</option>
          <option value="ngram-mod">ngram-mod — Lightweight (~16MB, no draft model)</option>
          <option value="draft-simple">draft-simple — Requires separate draft model</option>
        </select>
        <p className="text-xs text-muted-foreground italic">
          Predicts tokens ahead using a small model. Main model verifies in batch for 2-3x speedup.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Database className="size-3.5 text-muted-foreground" />
          <label htmlFor="ctx-checkpoints" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Context Checkpoints
          </label>
        </div>
        <input
          id="ctx-checkpoints"
          type="number"
          min={0}
          max={256}
          value={settings.ctxCheckpoints}
          onChange={(e) => update('ctxCheckpoints', parseIntOr(e.target.value, 0))}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Max checkpoints per slot. 0 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Clock className="size-3.5 text-muted-foreground" />
          <label htmlFor="sleep-idle" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Sleep Idle
          </label>
        </div>
        <input
          id="sleep-idle"
          type="number"
          min={-1}
          step={60}
          value={settings.sleepIdleSeconds}
          // Empty/unparseable input falls back to -1 (disabled); an entered 0 is kept.
          onChange={(e) => update('sleepIdleSeconds', parseIntOr(e.target.value, -1))}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Auto-sleep after N seconds idle. -1 = disabled.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <BarChart3 className="size-3.5 text-muted-foreground" />
            <label htmlFor="metrics" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
              Metrics Endpoint
            </label>
          </div>
          <Switch
            id="metrics"
            checked={settings.metrics}
            onCheckedChange={(v) => update('metrics', v)}
          />
        </div>
        <p className="text-xs text-muted-foreground italic">
          Exposes Prometheus /metrics endpoint for observability.
        </p>
      </div>

      <div className="space-y-1.5">
        <div className="flex items-center gap-2">
          <Folder className="size-3.5 text-muted-foreground" />
          <label htmlFor="slot-save-path" className="text-xs font-medium uppercase tracking-wide text-muted-foreground">
            Slot Save Path
          </label>
        </div>
        <input
          id="slot-save-path"
          type="text"
          value={settings.slotSavePath}
          onChange={(e) => update('slotSavePath', e.target.value)}
          className="w-full bg-background border border-border rounded px-2 py-1.5 text-sm font-mono outline-none focus:border-ring"
        />
        <p className="text-xs text-muted-foreground italic">
          Directory for disk-persistent KV cache. Must be writable.
        </p>
      </div>

      <div className="flex justify-between gap-2 border-t pt-4">
        <Button variant="outline" onClick={() => void resetDefaults()} disabled={saving}>
          Reset to defaults
        </Button>
        <Button onClick={() => void save()} disabled={!dirty || saving}>
          {saving ? 'Saving…' : 'Save'}
        </Button>
      </div>

      <p className="text-xs text-muted-foreground border-t pt-4">
        Changes apply to new llama-server processes. Restart the sidecar to apply.
        These settings are stored locally in your browser.
      </p>
    </div>
  );
}
|
||||||
|
|||||||
Reference in New Issue
Block a user