feat(web,server): inference settings UI with per-session inference overrides
Adds an Inference tab to SettingsPane with controls for temperature, top-p, top-k, min-p, and other inference parameters. Also adds the server-side route and the provider config wiring needed to pass overrides through the inference pipeline.
This commit is contained in:
@@ -19,6 +19,8 @@ import { registerModelRoutes } from './routes/models.js';
|
||||
import { registerAgentRoutes } from './routes/agents.js';
|
||||
import { registerSkillsRoutes } from './routes/skills.js';
|
||||
import { registerToolsRoutes } from './routes/tools.js';
|
||||
import { registerAnalyticsRoutes } from './routes/analytics.js';
|
||||
import { registerInferenceSettingsRoutes } from './routes/inference-settings.js';
|
||||
import { createInferenceRunner } from './services/inference/index.js';
|
||||
import { createBroker } from './services/broker.js';
|
||||
import { listSkills } from './services/skills.js';
|
||||
@@ -122,6 +124,8 @@ async function main() {
|
||||
registerSidebarRoutes(app, sql);
|
||||
registerChatRoutes(app, sql, broker);
|
||||
registerToolsRoutes(app, sql);
|
||||
registerAnalyticsRoutes(app, sql);
|
||||
registerInferenceSettingsRoutes(app);
|
||||
|
||||
// Batch 9.6: warm the skills cache at boot and surface the count. Empty or
|
||||
// missing /data/skills is non-fatal — the skill tools just return empty.
|
||||
|
||||
55
apps/server/src/routes/inference-settings.ts
Normal file
55
apps/server/src/routes/inference-settings.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import { FastifyInstance } from 'fastify';
|
||||
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
|
||||
import { resolve, dirname } from 'path';
|
||||
|
||||
// Directory holding server data files; BOOCODE_DATA_DIR wins when it is set
// to a non-empty value, otherwise the fixed install location is used.
const DATA_DIR = process.env.BOOCODE_DATA_DIR || '/opt/boocode/data';

/** Absolute path of the JSON file that stores the inference settings. */
const CONFIG_PATH = resolve(DATA_DIR, 'inference-settings.json');
|
||||
|
||||
/**
 * Built-in inference settings.
 *
 * Used as the base layer underneath whatever is stored in the settings file,
 * and returned on its own (as a copy) when no usable settings file exists.
 *
 * NOTE(review): the keys look like llama.cpp server options (KV-cache
 * quantization, speculative decoding, slot persistence) — confirm against the
 * consumer of the settings file before renaming keys or changing values.
 */
const DEFAULTS = {
  // Must stay one of VALID_CACHE_TYPES, otherwise every PATCH is rejected.
  cache_type_k: 'q4_0',
  cache_reuse: 256,
  // Must stay one of VALID_SPEC_TYPES, otherwise every PATCH is rejected.
  spec_type: 'ngram-mod',
  spec_ngram_mod_thsh: 2,
  ctx_checkpoints: 32,
  sleep_idle_seconds: 600,
  metrics_enabled: true,
  slot_save_path: '/tmp/llama-slots',
};
|
||||
|
||||
/**
 * Reads the persisted inference settings from CONFIG_PATH.
 *
 * The stored object is returned as-is (it is NOT merged with DEFAULTS here;
 * callers layer it on top of DEFAULTS themselves).
 *
 * @returns The parsed settings object, or a fresh copy of DEFAULTS when the
 *   file is missing, unreadable, not valid JSON, or valid JSON that is not a
 *   plain object (`null`, an array, a string, a number, ...).
 */
function load(): Record<string, unknown> {
  try {
    if (existsSync(CONFIG_PATH)) {
      const parsed: unknown = JSON.parse(readFileSync(CONFIG_PATH, 'utf-8'));
      // JSON.parse can yield any JSON value. Only a plain object is a valid
      // settings document; spreading an array or string into the settings
      // would inject numeric index keys.
      if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
        return parsed as Record<string, unknown>;
      }
    }
  } catch {
    // Deliberate best effort: an unreadable or corrupt file falls back to the
    // defaults below instead of failing the request.
  }
  return { ...DEFAULTS };
}
|
||||
|
||||
/**
 * Writes `data` to CONFIG_PATH as pretty-printed JSON (2-space indent,
 * trailing newline), creating the parent directory first if needed.
 *
 * @param data Settings object to persist; it replaces the file's contents.
 * @throws Whatever `mkdirSync` / `writeFileSync` throw (e.g. permission errors).
 */
function save(data: Record<string, unknown>): void {
  // With `recursive: true`, mkdirSync succeeds when the directory already
  // exists, so no separate (and racy) existence check is needed.
  mkdirSync(dirname(CONFIG_PATH), { recursive: true });
  writeFileSync(CONFIG_PATH, `${JSON.stringify(data, null, 2)}\n`);
}
|
||||
|
||||
/** Values accepted for the `cache_type_k` setting. */
const VALID_CACHE_TYPES = ['f32', 'f16', 'q8_0', 'q4_0'] as const;

/** Values accepted for the `spec_type` setting. */
const VALID_SPEC_TYPES = ['off', 'ngram-mod', 'draft-simple'] as const;

/** True when `value` is a non-null object that is not an array. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** True when `value` is a string contained in `allowed`. */
function isOneOf(allowed: readonly string[], value: unknown): boolean {
  return typeof value === 'string' && allowed.includes(value);
}

/**
 * Registers the inference-settings endpoints on `app`.
 *
 * - `GET /api/settings/inference` returns the stored settings layered over
 *   DEFAULTS.
 * - `PATCH /api/settings/inference` shallow-merges the JSON body over the
 *   current settings, validates the enum-valued fields, persists the result
 *   and returns it. Responds 400 when the body is not a JSON object or when
 *   `cache_type_k` / `spec_type` is not one of the allowed values.
 *
 * NOTE(review): keys other than `cache_type_k` and `spec_type` are persisted
 * without validation, including keys that are not present in DEFAULTS.
 *
 * @param app Fastify instance the routes are attached to.
 */
export function registerInferenceSettingsRoutes(app: FastifyInstance): void {
  app.get('/api/settings/inference', async () => ({ ...DEFAULTS, ...load() }));

  app.patch<{ Body: Record<string, unknown> }>('/api/settings/inference', async (req, reply) => {
    // A missing body is treated as an empty patch. Any other non-object body
    // (array, string, number) is rejected, because spreading it would write
    // index keys into the settings file.
    const patch: unknown = req.body ?? {};
    if (!isPlainObject(patch)) {
      return reply.status(400).send({ error: 'Request body must be a JSON object' });
    }

    const merged = { ...DEFAULTS, ...load(), ...patch };

    // DEFAULTS guarantees both keys are present after the merge, so they are
    // validated unconditionally. A truthiness guard would let "", 0, false and
    // null slip past the allow-list and be persisted.
    if (!isOneOf(VALID_CACHE_TYPES, merged.cache_type_k)) {
      return reply.status(400).send({ error: 'Invalid cache_type_k' });
    }
    if (!isOneOf(VALID_SPEC_TYPES, merged.spec_type)) {
      return reply.status(400).send({ error: 'Invalid spec_type' });
    }

    save(merged);
    // `merged` is exactly what was just written (already layered over
    // DEFAULTS), so there is no need to read the file back.
    return merged;
  });
}
|
||||
@@ -57,11 +57,21 @@ interface ConfigLike {
|
||||
LLAMA_SIDECAR_URL?: string;
|
||||
}
|
||||
|
||||
/**
 * Chooses the upstream route for a request.
 *
 * Precedence:
 *  1. The agent has a non-empty `llama_extra_args` list: sidecar, forwarding
 *     exactly those flags.
 *  2. `config.LLAMA_SIDECAR_URL` is set: sidecar with an empty flag list, so
 *     the sidecar's default base args (cache quant, spec decoding, slot save,
 *     etc.) are picked up even without per-agent flags.
 *  3. Otherwise: llama-swap, with `flags` set to `null`.
 *
 * @param agent Agent whose `llama_extra_args` are consulted; may be `null`.
 * @param config Optional config providing `LLAMA_SIDECAR_URL`.
 * @returns The selected route together with the flags to forward.
 */
export function resolveRoute(
  agent: AgentLike | null,
  config?: ConfigLike,
): RoutingInfo {
  // Explicit per-agent args always win.
  const extraArgs = agent?.llama_extra_args ?? [];
  if (extraArgs.length !== 0) {
    return { route: 'sidecar', flags: extraArgs };
  }

  // No per-agent args: use the sidecar whenever one is configured, and fall
  // back to llama-swap otherwise.
  return config?.LLAMA_SIDECAR_URL
    ? { route: 'sidecar', flags: [] }
    : { route: 'swap', flags: null };
}
|
||||
|
||||
@@ -70,15 +80,13 @@ export function upstreamModel(
|
||||
modelId: string,
|
||||
agent?: AgentLike | null,
|
||||
): LanguageModel {
|
||||
const { route, flags } = resolveRoute(agent ?? null);
|
||||
const { route, flags } = resolveRoute(agent ?? null, config);
|
||||
if (route === 'sidecar') {
|
||||
const url = config.LLAMA_SIDECAR_URL;
|
||||
if (!url) {
|
||||
throw new Error(
|
||||
`Agent has llama_extra_args but LLAMA_SIDECAR_URL is not set`,
|
||||
);
|
||||
throw new Error(`Sidecar route selected but LLAMA_SIDECAR_URL is not set`);
|
||||
}
|
||||
return sidecarProvider(url, flags!).chatModel(modelId);
|
||||
return sidecarProvider(url, (flags ?? [])).chatModel(modelId);
|
||||
}
|
||||
return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user