diff --git a/.env.example b/.env.example index ff57b77..41158c3 100644 --- a/.env.example +++ b/.env.example @@ -20,6 +20,12 @@ SEARXNG_URL=http://100.114.205.53:8888 # with FAST_MODEL when unset. # TASK_MODEL_URL=http://100.90.172.55:7995 +# DeepSeek API key. When set, models with IDs starting with 'deepseek-' +# (e.g. deepseek-chat, deepseek-reasoner, deepseek-v4-flash) route through +# DeepSeek's API instead of llama-swap. Requires a DeepSeek Platform API key. +# DEEPSEEK_API_KEY=sk-... +# DEEPSEEK_BASE_URL=https://api.deepseek.com + # v1.13.15-tools: BOOCODE_TOOLS narrows the tool whitelist sent to the LLM. # Unset (default) → all tools (~21k schema). Useful primarily for single-purpose # sessions where the model only needs read-only filesystem access. diff --git a/apps/coder/src/config.ts b/apps/coder/src/config.ts index 294e9a5..f588d52 100644 --- a/apps/coder/src/config.ts +++ b/apps/coder/src/config.ts @@ -50,6 +50,8 @@ const ConfigSchema = z.object({ // only reaped after it's been untouched this long (avoids sweeping a dir mid // ensureSessionWorktree create). 1h default. ORPHAN_WORKTREE_GRACE_MS: z.coerce.number().int().positive().default(3_600_000), + DEEPSEEK_API_KEY: z.string().optional(), + DEEPSEEK_BASE_URL: z.string().url().default('https://api.deepseek.com'), }); export type Config = z.infer; diff --git a/apps/coder/src/services/provider-snapshot.ts b/apps/coder/src/services/provider-snapshot.ts index f2b36d4..c60d65e 100644 --- a/apps/coder/src/services/provider-snapshot.ts +++ b/apps/coder/src/services/provider-snapshot.ts @@ -29,6 +29,22 @@ interface AgentRow { last_probed_at: string | Date | null; } +export async function fetchDeepSeekModels(config: Config): Promise { + if (!config.DEEPSEEK_API_KEY) return []; + try { + const baseURL = (config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com').replace(/\/+$/, ''); + const res = await fetch(`${baseURL}/v1/models`, { + headers: { Authorization: `Bearer ${config.DEEPSEEK_API_KEY}` }, + signal: AbortSignal.timeout(5_000), + }); + if (!res.ok) return []; + const parsed = (await res.json()) as { data?: Array<{ id: string }> }; + return (parsed.data ?? []).map((m) => ({ id: m.id, label: m.id })); + } catch { + return []; + } +} + export async function fetchLlamaSwapModels(config: Config): Promise { try { const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/models`); @@ -256,7 +272,13 @@ export async function getProviderSnapshot( } const build = async (): Promise => { - const llamaModels = await fetchLlamaSwapModels(config); + const [llamaModels, deepseekModels] = await Promise.all([ + fetchLlamaSwapModels(config), + fetchDeepSeekModels(config), + ]); + // Merge DeepSeek models into the llama-swap model pool so the boocode + // provider (which sources from llama-swap) also includes DeepSeek models. + const mergedModels = mergeModels(llamaModels, deepseekModels); const agents = await sql` SELECT name, install_path, supports_acp, models, commands, label, transport, last_probed_at FROM available_agents `; @@ -265,7 +287,7 @@ export async function getProviderSnapshot( const entries = await Promise.all( [...getResolvedRegistry().values()].map((resolved) => - buildProviderEntry(resolved, agentMap.get(resolved.id), llamaModels, resolvedCwd, ttlMs, force), + buildProviderEntry(resolved, agentMap.get(resolved.id), mergedModels, resolvedCwd, ttlMs, force), ), ); diff --git a/apps/server/package.json b/apps/server/package.json index 7a6ff25..f345f0a 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -77,8 +77,9 @@ "test": "vitest run" }, "dependencies": { - "@boocode/contracts": "workspace:*", + "@ai-sdk/deepseek": "^2.0.35", "@ai-sdk/openai-compatible": "^2.0.47", + "@boocode/contracts": "workspace:*", "@fastify/static": "^7.0.4", "@fastify/websocket": "^10.0.1", "@modelcontextprotocol/sdk": "^1.29.0", diff --git a/apps/server/src/config.ts b/apps/server/src/config.ts index d7b6199..d69f1a0 100644 --- a/apps/server/src/config.ts +++ b/apps/server/src/config.ts @@ -26,6 +26,14 @@ const ConfigSchema = z.object({ FAST_MODEL: z.string().optional(), TASK_MODEL_URL: z.string().url().optional(), LLAMA_SIDECAR_URL: z.string().url().optional(), + // vDeepSeek: DeepSeek API key for direct API access. When set, models + // with IDs starting with 'deepseek-' route through DeepSeek's API instead + // of llama-swap. Defaults to empty (DeepSeek routing disabled). + DEEPSEEK_API_KEY: z.string().optional(), + // Optional base URL override for DeepSeek API. Defaults to api.deepseek.com. + DEEPSEEK_BASE_URL: z.string().url().default('https://api.deepseek.com'), + // vWhale hooks: path to hooks JSON config file. Missing file = no hooks. + HOOKS_CONFIG_PATH: z.string().default('/data/hooks.json'), }); export type Config = z.infer; diff --git a/apps/server/src/index.ts b/apps/server/src/index.ts index 913c7b1..19114f6 100644 --- a/apps/server/src/index.ts +++ b/apps/server/src/index.ts @@ -31,6 +31,7 @@ import { loadMcpConfig } from './services/mcp-config.js'; import { initialize as initMcp, getTools as getMcpTools, shutdown as shutdownMcp } from './services/mcp-client.js'; import { appendMcpTools } from './services/tools.js'; import { refreshToolNames, getAgentsForProject } from './services/agents.js'; +import { loadHooksConfig, createHookRunner } from './services/hooks.js'; async function main() { const config = loadConfig(); @@ -136,11 +137,17 @@ async function main() { app.log.warn({ err }, 'skills boot walk failed'); } + // vWhale hooks: load hook config and create runner. Missing file = no hooks. + loadHooksConfig(config.HOOKS_CONFIG_PATH); + const hookRunner = createHookRunner(); + const hasHooks = Object.keys(loadHooksConfig(config.HOOKS_CONFIG_PATH).hooks).length > 0; + const inference = createInferenceRunner( { sql, config, log: app.log, + hooks: hasHooks ? hookRunner : undefined, publish: (sessionId, frame) => { // v1.13.11-b: route through the typed publishFrame so the broker's // Zod gate validates every inference frame before delivery. @@ -166,7 +173,7 @@ async function main() { // bubble up so the route can reply 500 — manual /compact failures // should be loud (the user just clicked a button). runCompaction: (chatId) => - compaction.process({ sql, config, log: app.log, broker, chatId }), + compaction.process({ sql, config, log: app.log, broker, chatId, hooks: hasHooks ? hookRunner : undefined }), cancelInference: async (sessionId, chatId) => { return inference.cancel(sessionId, chatId); }, diff --git a/apps/server/src/routes/models.ts b/apps/server/src/routes/models.ts index b3266d0..f0bd3a8 100644 --- a/apps/server/src/routes/models.ts +++ b/apps/server/src/routes/models.ts @@ -2,26 +2,55 @@ import type { FastifyInstance } from 'fastify'; import type { Config } from '../config.js'; import type { ModelInfo } from '../types/api.js'; -interface LlamaSwapModelsResponse { +interface ApiModelsResponse { data?: ModelInfo[]; } +const DEEPSEEK_STATIC_MODELS: ModelInfo[] = [ + { id: 'deepseek-v4-flash', object: 'model', created: 0, owned_by: 'deepseek' }, + { id: 'deepseek-v4-pro', object: 'model', created: 0, owned_by: 'deepseek' }, +]; + export function registerModelRoutes(app: FastifyInstance, config: Config): void { app.get('/api/models', async (_req, reply) => { + const models: ModelInfo[] = []; + + // 1. Fetch llama-swap models try { const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/models`); - if (!res.ok) { - reply.code(502); - return { error: `llama-swap returned ${res.status}` }; + if (res.ok) { + const parsed = (await res.json()) as ApiModelsResponse; + if (parsed.data) models.push(...parsed.data); } - const parsed = (await res.json()) as LlamaSwapModelsResponse; - return parsed.data ?? []; - } catch (err) { - reply.code(502); - return { - error: 'failed to reach llama-swap', - details: err instanceof Error ? err.message : String(err), - }; + } catch { + // llama-swap unreachable — proceed with whatever we have } + + // 2. If DeepSeek is configured, fetch live models from their API + if (config.DEEPSEEK_API_KEY) { + try { + const baseURL = (config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com').replace(/\/+$/, ''); + const res = await fetch(`${baseURL}/v1/models`, { + headers: { Authorization: `Bearer ${config.DEEPSEEK_API_KEY}` }, + signal: AbortSignal.timeout(5_000), + }); + if (res.ok) { + const parsed = (await res.json()) as ApiModelsResponse; + if (parsed.data) models.push(...parsed.data); + } else { + // API call failed — fall back to static model list + models.push(...DEEPSEEK_STATIC_MODELS); + } + } catch { + // Network error — fall back to static model list + models.push(...DEEPSEEK_STATIC_MODELS); + } + } + + if (models.length === 0) { + reply.code(502); + return { error: 'no models available from any provider' }; + } + return models; }); } diff --git a/apps/server/src/schema.sql b/apps/server/src/schema.sql index d6d3089..dd73ae2 100644 --- a/apps/server/src/schema.sql +++ b/apps/server/src/schema.sql @@ -126,8 +126,8 @@ SELECT FROM message_parts p WHERE p.message_id = m.id AND p.kind = 'reasoning' AND p.hidden_at IS NULL) AS reasoning_parts, -- NEW columns MUST be appended at the end: CREATE OR REPLACE VIEW can't - -- reorder/rename existing columns (42P16). m.model added last. - m.model + -- reorder/rename existing columns (42P16). cache_tokens and reasoning_tokens added last. + m.model, m.cache_tokens, m.reasoning_tokens FROM messages m; -- v1.13.20: drop legacy tool_calls/tool_results columns. Reads have routed @@ -204,6 +204,8 @@ ALTER TABLE messages ADD COLUMN IF NOT EXISTS ctx_used INTEGER; ALTER TABLE messages ADD COLUMN IF NOT EXISTS ctx_max INTEGER; ALTER TABLE messages ADD COLUMN IF NOT EXISTS started_at TIMESTAMPTZ; ALTER TABLE messages ADD COLUMN IF NOT EXISTS finished_at TIMESTAMPTZ; +ALTER TABLE messages ADD COLUMN IF NOT EXISTS cache_tokens INTEGER; +ALTER TABLE messages ADD COLUMN IF NOT EXISTS reasoning_tokens INTEGER; ALTER TABLE sessions ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(); diff --git a/apps/server/src/services/agents.ts b/apps/server/src/services/agents.ts index e4f0697..e4769bc 100644 --- a/apps/server/src/services/agents.ts +++ b/apps/server/src/services/agents.ts @@ -106,6 +106,8 @@ interface ParsedFrontmatter { // allowed" — the model responds text-only. steps?: number; llama_extra_args?: string[]; + // vDeepSeek: thinking effort for DeepSeek V4 models. + reasoning_effort?: string; } // P5: table-driven validation for the "soft-range" numeric frontmatter fields. @@ -386,6 +388,7 @@ function parseAgentSection(section: RawSection): Omit { max_tool_calls: typeof fm.max_tool_calls === 'number' ? fm.max_tool_calls : null, steps: typeof fm.steps === 'number' ? fm.steps : null, llama_extra_args: Array.isArray(fm.llama_extra_args) ? fm.llama_extra_args : null, + reasoning_effort: typeof fm.reasoning_effort === 'string' ? (fm.reasoning_effort as Agent['reasoning_effort']) : null, }; } diff --git a/apps/server/src/services/compaction.ts b/apps/server/src/services/compaction.ts index b283f7b..a0e5e2d 100644 --- a/apps/server/src/services/compaction.ts +++ b/apps/server/src/services/compaction.ts @@ -24,6 +24,8 @@ import { SUMMARY_TEMPLATE } from './compaction-prompt.js'; import * as modelContextLookup from './model-context.js'; import { SENTINEL_KINDS } from './inference/sentinels.js'; import type { OpenAiMessage } from './inference/payload.js'; +import { resolveModelEndpoint } from './inference/provider.js'; +import type { HookRunner } from './hooks.js'; // v1.13.9: ratio-only overflow trigger. Fires compaction at 85% of ctx_max // (opencode session/overflow.ts pattern). Replaces the v1.11.0-era @@ -346,20 +348,22 @@ interface CompletionResult { completionTokens: number; } -async function callLlamaSwap( +async function callLlm( config: Config, model: string, messages: OpenAiMessage[], log: FastifyBaseLogger, ): Promise { - const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, { + const { url, headers, model: resolvedModel } = resolveModelEndpoint(config, model); + const res = await fetch(`${url}/v1/chat/completions`, { method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ model, messages, stream: false }), + headers, + body: JSON.stringify({ model: resolvedModel, messages, stream: false }), }); if (!res.ok) { const text = await res.text().catch(() => ''); - throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`); + const prefix = model.startsWith('deepseek-') ? 'deepseek' : 'llama-swap'; + throw new Error(`${prefix} returned ${res.status}: ${text.slice(0, 200)}`); } const json = (await res.json()) as { choices?: Array<{ message?: { content?: string } }>; @@ -383,6 +387,8 @@ export interface ProcessInput { log: FastifyBaseLogger; broker: Broker; chatId: string; + /** vWhale: lifecycle hooks runner. Undefined when no hooks configured. */ + hooks?: HookRunner; } // Runs one round of anchored rolling compaction on `chatId`. No-ops cleanly @@ -497,6 +503,17 @@ export async function process(input: ProcessInput): Promise { at: new Date().toISOString(), }); + // vWhale: PreCompact hook (best-effort, non-blocking). + const msgBefore = messages.length; + if (input.hooks) { + input.hooks.run('PreCompact', { + event: 'PreCompact', + session_id: sessionId, + chat_id: chatId, + messages_before: msgBefore, + }).catch(() => {}); + } + // try/finally so the dot ALWAYS drops back to idle, even if the LLM call // throws or a downstream DB write fails. The succeeded flag gates the // 'compacted' frame + final log: we only signal completion to the UI when @@ -506,7 +523,7 @@ export async function process(input: ProcessInput): Promise { let result: CompletionResult | undefined; try { // 7. Single completion (no tools). Throws on llama-swap failure. - result = await callLlamaSwap(config, session.model, payload, log); + result = await callLlm(config, session.model, payload, log); // 7b. v1.11.3: fetch the model's true context window from llama-swap's // /upstream//props (the streaming completion doesn't carry it). @@ -558,6 +575,18 @@ export async function process(input: ProcessInput): Promise { `; succeeded = true; + + // vWhale: PostCompact hook (best-effort, non-blocking). + if (input.hooks) { + input.hooks.run('PostCompact', { + event: 'PostCompact', + session_id: sessionId, + chat_id: chatId, + messages_before: msgBefore, + messages_after: sel.head.length, + summary: (result?.content ?? '').slice(0, 500), + }).catch(() => {}); + } } finally { // Always restore the dot. Status='idle' (not 'error') even on failure — // the caller logs/re-surfaces the error separately; the dot doesn't diff --git a/apps/server/src/services/hooks.ts b/apps/server/src/services/hooks.ts new file mode 100644 index 0000000..ab60909 --- /dev/null +++ b/apps/server/src/services/hooks.ts @@ -0,0 +1,299 @@ +/** + * vWhale: lifecycle hook runner. Hooks are shell commands that fire at key + * points in the inference pipeline. Each hook receives a JSON payload on + * stdin and can return JSON on stdout to influence behavior. + * + * Inspired by Whale's hook system with 11 lifecycle events. BooCode + * implements the most relevant subset: PreToolUse, PostToolUse, + * UserPromptSubmit, Stop, PreCompact, PostCompact. + * + * Config: JSON file at HOOKS_CONFIG_PATH (default /data/hooks.json). + * Format: + * ```json + * { + * "hooks": { + * "PreToolUse": [ + * { "match": "shell_run", "command": "python3 /data/hooks/check_shell.py", "timeout": 30 } + * ], + * "Stop": [ + * { "command": "node /data/hooks/log_turn.mjs" } + * ] + * } + * } + * ``` + */ + +import { spawn } from 'node:child_process'; +import { readFileSync, existsSync } from 'node:fs'; +import type { FastifyBaseLogger } from 'fastify'; + +// ─── Events ─────────────────────────────────────────────────────────────── + +export type HookEvent = + | 'PreToolUse' + | 'PostToolUse' + | 'UserPromptSubmit' + | 'Stop' + | 'PreCompact' + | 'PostCompact'; + +const ALL_EVENTS: HookEvent[] = [ + 'PreToolUse', + 'PostToolUse', + 'UserPromptSubmit', + 'Stop', + 'PreCompact', + 'PostCompact', +]; + +// ─── Config ──────────────────────────────────────────────────────────────── + +export interface HookConfig { + /** Glob or exact tool name to match (PreToolUse/PostToolUse only). Omit or '*' for all. */ + match?: string; + /** Shell command to run. Receives JSON payload on stdin. */ + command: string; + /** Timeout in seconds (default 30). */ + timeout?: number; +} + +export interface HooksConfig { + hooks: Partial>; +} + +// ─── Payloads ────────────────────────────────────────────────────────────── + +export interface PreToolUsePayload { + event: 'PreToolUse'; + session_id: string; + tool_name: string; + tool_args: Record; +} + +export interface PostToolUsePayload { + event: 'PostToolUse'; + session_id: string; + tool_name: string; + tool_args: Record; + tool_result: unknown; + tool_error?: string; +} + +export interface UserPromptSubmitPayload { + event: 'UserPromptSubmit'; + session_id: string; + chat_id: string; + prompt: string; +} + +export interface StopPayload { + event: 'Stop'; + session_id: string; + chat_id: string; + last_assistant_text: string; + turn: number; +} + +export interface PreCompactPayload { + event: 'PreCompact'; + session_id: string; + chat_id: string; + messages_before: number; +} + +export interface PostCompactPayload { + event: 'PostCompact'; + session_id: string; + chat_id: string; + messages_before: number; + messages_after: number; + summary: string; +} + +export type HookPayload = + | PreToolUsePayload + | PostToolUsePayload + | UserPromptSubmitPayload + | StopPayload + | PreCompactPayload + | PostCompactPayload; + +// ─── Response ────────────────────────────────────────────────────────────── + +export type HookDecision = 'pass' | 'warn' | 'block'; + +export interface HookResponse { + decision?: HookDecision; + reason?: string; + /** When present, replaces the original tool args / user prompt. */ + updated_input?: Record | string; + /** Injected into the model's context for the next turn. */ + additional_context?: string; +} + +// ─── Runner ──────────────────────────────────────────────────────────────── + +export interface HookRunner { + /** Run all hooks for the given event. Returns the effective response. */ + run(event: HookEvent, payload: HookPayload, log?: FastifyBaseLogger): Promise; +} + +let hooksConfig: HooksConfig | null = null; +let hooksPath: string | null = null; + +/** Load hooks config from disk. Missing file = no hooks. Never throws. */ +export function loadHooksConfig(path: string): HooksConfig { + hooksPath = path; + if (!existsSync(path)) { + hooksConfig = { hooks: {} }; + return hooksConfig; + } + try { + const raw = readFileSync(path, 'utf8'); + const parsed = JSON.parse(raw) as HooksConfig; + hooksConfig = { + hooks: { ...parsed.hooks }, + }; + // Validate event names + for (const event of Object.keys(hooksConfig.hooks)) { + if (!ALL_EVENTS.includes(event as HookEvent)) { + console.warn(`hooks: unknown event '${event}' in ${path} — ignoring`); + delete hooksConfig.hooks[event as HookEvent]; + } + } + } catch (err) { + console.error(`hooks: failed to load ${path}`, err); + hooksConfig = { hooks: {} }; + } + return hooksConfig; +} + +/** Reload the config file (call after a PATCH). */ +export function reloadHooksConfig(): HooksConfig { + if (hooksPath) return loadHooksConfig(hooksPath); + hooksConfig = { hooks: {} }; + return hooksConfig; +} + +function getConfig(): HooksConfig { + return hooksConfig ?? { hooks: {} }; +} + +/** Create a HookRunner for the current config. */ +export function createHookRunner(): HookRunner { + return { + async run(event, payload, log): Promise { + const configs = getConfig().hooks[event]; + if (!configs || configs.length === 0) return { decision: 'pass' }; + + // Pre-filter by match pattern for tool events + const toolName = 'tool_name' in payload ? (payload as PreToolUsePayload).tool_name : undefined; + + let effective: HookResponse = { decision: 'pass' }; + + for (const cfg of configs) { + // Skip if match doesn't apply + if (toolName && cfg.match && cfg.match !== '*' && cfg.match !== toolName) continue; + + const result = await runSingleHook(cfg, payload, log); + // Merge decisions: block > warn > pass + if (result.decision === 'block') { + effective = { ...result, decision: 'block' }; + break; // block is terminal + } + if (result.decision === 'warn' && effective.decision !== 'block') { + effective = { ...result, decision: 'warn' }; + } + // Merge additional_context and updated_input + if (result.additional_context) { + effective.additional_context = effective.additional_context + ? effective.additional_context + '\n' + result.additional_context + : result.additional_context; + } + if (result.updated_input && !effective.updated_input) { + effective.updated_input = result.updated_input; + } + } + + return effective; + }, + }; +} + +async function runSingleHook( + cfg: HookConfig, + payload: HookPayload, + log?: FastifyBaseLogger, +): Promise { + const timeoutMs = (cfg.timeout ?? 30) * 1000; + + return new Promise((resolve) => { + const child = spawn('sh', ['-c', cfg.command], { + stdio: ['pipe', 'pipe', 'pipe'], + timeout: timeoutMs, + env: { ...process.env }, + }); + + const stdout: Buffer[] = []; + const stderr: Buffer[] = []; + + child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk)); + child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk)); + + let settled = false; + const timer = setTimeout(() => { + if (!settled) { + settled = true; + child.kill('SIGTERM'); + log?.warn({ event: payload.event, command: cfg.command }, 'hooks: timeout'); + resolve({ decision: 'warn', reason: 'hook timed out' }); + } + }, timeoutMs); + + child.on('error', (err) => { + if (!settled) { + settled = true; + clearTimeout(timer); + log?.warn({ err, event: payload.event }, 'hooks: spawn error'); + resolve({ decision: 'warn', reason: `hook failed: ${err.message}` }); + } + }); + + child.on('close', (code) => { + if (settled) return; + settled = true; + clearTimeout(timer); + + const out = Buffer.concat(stdout).toString('utf8').trim(); + const errOut = Buffer.concat(stderr).toString('utf8').trim(); + + if (code !== 0 && !out) { + log?.warn({ event: payload.event, code, stderr: errOut.slice(0, 200) }, 'hooks: non-zero exit'); + resolve({ decision: 'warn', reason: `hook exited ${code}` }); + return; + } + + // Parse stdout as JSON response + if (out) { + try { + const parsed = JSON.parse(out) as HookResponse; + resolve(parsed); + return; + } catch { + // Not JSON — treat as pass with stdout as context + if (out.length > 0) { + resolve({ decision: 'pass', additional_context: out }); + return; + } + } + } + + resolve({ decision: 'pass' }); + }); + + // Write payload to stdin + const json = JSON.stringify(payload); + child.stdin.write(json); + child.stdin.end(); + }); +} diff --git a/apps/server/src/services/inference/error-handler.ts b/apps/server/src/services/inference/error-handler.ts index 4bd438d..b0aed4a 100644 --- a/apps/server/src/services/inference/error-handler.ts +++ b/apps/server/src/services/inference/error-handler.ts @@ -122,6 +122,8 @@ export async function finalizeStreamedRow( completionTokens: number | null; promptTokens: number | null; startedAt: string | null; + cacheTokens?: number | null; + reasoningTokens?: number | null; beforeComplete?: () => Promise; }, ): Promise { @@ -137,6 +139,8 @@ export async function finalizeStreamedRow( tokens_used = ${opts.completionTokens}, ctx_used = ${opts.promptTokens}, ctx_max = ${nCtx}, + cache_tokens = ${opts.cacheTokens ?? null}, + reasoning_tokens = ${opts.reasoningTokens ?? null}, finished_at = clock_timestamp() WHERE id = ${opts.messageId} RETURNING tokens_used, ctx_used, ctx_max, finished_at @@ -149,6 +153,8 @@ export async function finalizeStreamedRow( tokens_used: updated?.tokens_used ?? null, ctx_used: updated?.ctx_used ?? null, ctx_max: updated?.ctx_max ?? null, + cache_tokens: opts.cacheTokens ?? null, + reasoning_tokens: opts.reasoningTokens ?? null, started_at: opts.startedAt, finished_at: updated?.finished_at ?? null, model: opts.model, @@ -188,7 +194,7 @@ export async function finalizeCompletion( ): Promise { const { sessionId, chatId, assistantMessageId } = args; const content = stripToolMarkup(result.content, { final: true }); - const { finishReason, promptTokens, completionTokens } = result; + const { finishReason, promptTokens, completionTokens, cacheReadTokens, reasoningTokens } = result; // v1.11.3: see executeToolPhase for the rationale. const mctx = await modelContext.getModelContext(session.model); @@ -203,6 +209,8 @@ export async function finalizeCompletion( tokens_used = ${completionTokens}, ctx_used = ${promptTokens}, ctx_max = ${nCtx}, + cache_tokens = ${cacheReadTokens ?? null}, + reasoning_tokens = ${reasoningTokens ?? null}, model = ${session.model}, finished_at = clock_timestamp() WHERE id = ${assistantMessageId} @@ -268,6 +276,8 @@ export async function finalizeCompletion( tokens_used: updated?.tokens_used ?? null, ctx_used: updated?.ctx_used ?? null, ctx_max: updated?.ctx_max ?? null, + cache_tokens: cacheReadTokens ?? null, + reasoning_tokens: reasoningTokens ?? null, started_at: startedAt, finished_at: updated?.finished_at ?? null, model: session.model, diff --git a/apps/server/src/services/inference/provider.ts b/apps/server/src/services/inference/provider.ts index cf10dfe..8191561 100644 --- a/apps/server/src/services/inference/provider.ts +++ b/apps/server/src/services/inference/provider.ts @@ -1,4 +1,5 @@ import { createOpenAICompatible } from '@ai-sdk/openai-compatible'; +import { createDeepSeek } from '@ai-sdk/deepseek'; import type { LanguageModel } from 'ai'; // v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from @@ -11,6 +12,12 @@ import type { LanguageModel } from 'ai'; // llama-sidecar instead. A fresh provider is created per call (not cached) // because the X-Agent-Flags header varies per agent. The llama-swap path // stays cached since it has no per-request headers. +// +// vDeepSeek: when the model ID starts with 'deepseek-' and DEEPSEEK_API_KEY +// is set, route through the official @ai-sdk/deepseek provider (not +// openai-compatible) so DeepSeek-specific features work: providerMetadata +// with promptCacheHitTokens/promptCacheMissTokens, reasoning via +// LanguageModelV4Usage.outputTokens.reasoning, and thinking-mode options. const swapCache = new Map>(); @@ -41,7 +48,28 @@ function sidecarProvider( }); } -export type InferenceRoute = 'swap' | 'sidecar'; +const DEEPSEEK_MODEL_PREFIX = 'deepseek-'; + +export function isDeepSeekModel(modelId: string): boolean { + return modelId.startsWith(DEEPSEEK_MODEL_PREFIX); +} + +let deepseekProviderCache: ReturnType | null = null; + +function getDeepSeekProvider( + apiKey: string, + baseURL: string, +): ReturnType { + if (!deepseekProviderCache) { + deepseekProviderCache = createDeepSeek({ + apiKey, + baseURL, + }); + } + return deepseekProviderCache; +} + +export type InferenceRoute = 'swap' | 'sidecar' | 'deepseek'; export interface RoutingInfo { route: InferenceRoute; @@ -55,12 +83,21 @@ interface AgentLike { interface ConfigLike { LLAMA_SWAP_URL: string; LLAMA_SIDECAR_URL?: string; + DEEPSEEK_API_KEY?: string; + DEEPSEEK_BASE_URL?: string; } export function resolveRoute( agent: AgentLike | null, config?: ConfigLike, + modelId?: string, ): RoutingInfo { + // vDeepSeek: if the model starts with deepseek- and DEEPSEEK_API_KEY is set, + // route through the DeepSeek provider. Checked first so DeepSeek models + // always bypass llama-swap/sidecar even when those are also configured. + if (modelId?.startsWith(DEEPSEEK_MODEL_PREFIX) && config?.DEEPSEEK_API_KEY) { + return { route: 'deepseek', flags: null }; + } // When llama_extra_args are explicitly set, route through sidecar with them. const flags = agent?.llama_extra_args; if (flags && flags.length > 0) { @@ -80,7 +117,13 @@ export function upstreamModel( modelId: string, agent?: AgentLike | null, ): LanguageModel { - const { route, flags } = resolveRoute(agent ?? null, config); + const { route, flags } = resolveRoute(agent ?? null, config, modelId); + if (route === 'deepseek') { + return getDeepSeekProvider( + config.DEEPSEEK_API_KEY!, + config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com', + ).chat(modelId); + } if (route === 'sidecar') { const url = config.LLAMA_SIDECAR_URL; if (!url) { @@ -90,3 +133,30 @@ export function upstreamModel( } return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId); } + +/** Resolve the API endpoint for non-streaming calls (compaction, task-model). + * Returns the URL + model + optional auth header for direct fetch() usage. */ +export function resolveModelEndpoint( + config: ConfigLike, + modelId: string, +): { url: string; model: string; headers: Record } { + const baseHeaders: Record = { 'Content-Type': 'application/json' }; + if (modelId.startsWith(DEEPSEEK_MODEL_PREFIX) && config.DEEPSEEK_API_KEY) { + const baseURL = (config.DEEPSEEK_BASE_URL ?? 'https://api.deepseek.com').replace(/\/+$/, ''); + return { + url: baseURL, + model: modelId, + headers: { ...baseHeaders, Authorization: `Bearer ${config.DEEPSEEK_API_KEY}` }, + }; + } + return { + url: config.LLAMA_SWAP_URL.replace(/\/+$/, ''), + model: modelId, + headers: baseHeaders, + }; +} + +/** Invalidate the cached DeepSeek provider (e.g. when env vars change at runtime). */ +export function resetDeepSeekProvider(): void { + deepseekProviderCache = null; +} diff --git a/apps/server/src/services/inference/stream-phase-adapter.ts b/apps/server/src/services/inference/stream-phase-adapter.ts index 7094649..c16262a 100644 --- a/apps/server/src/services/inference/stream-phase-adapter.ts +++ b/apps/server/src/services/inference/stream-phase-adapter.ts @@ -13,7 +13,7 @@ import type { OpenAiMessage } from './payload.js'; import { extractToolCallBlocks } from './tool-call-parser.js'; import { classifyStreamError } from './stream-error-classifier.js'; import type { StreamResult } from './types.js'; -import { upstreamModel } from './provider.js'; +import { isDeepSeekModel, upstreamModel } from './provider.js'; import { jsonSchema, streamText, @@ -51,6 +51,9 @@ export interface StreamOptions { dry_base?: number | null; dry_allowed_length?: number | null; dry_penalty_last_n?: number | null; + // vDeepSeek: thinking/reasoning effort. Maps to DeepSeek's reasoning_effort + // API param for deepseek-v4-flash / deepseek-v4-pro models. + reasoning_effort?: 'off' | 'low' | 'medium' | 'high' | 'xhigh' | 'max'; } // P5: the 10-field sampler-options literal that was copy-pasted at 4 sites @@ -74,6 +77,7 @@ export function samplerOptsFromAgent(agent: Agent | null): SamplerOpts { dry_base: agent?.dry_base ?? undefined, dry_allowed_length: agent?.dry_allowed_length ?? undefined, dry_penalty_last_n: agent?.dry_penalty_last_n ?? undefined, + reasoning_effort: agent?.reasoning_effort ?? undefined, }; } @@ -272,6 +276,19 @@ export async function streamCompletion( // before this. They now go through the same extraBody path as the new params. const samplerBody = buildSamplerProviderOptions(opts); + // vDeepSeek: build providerOptions.deepseek for DeepSeek V4 models. + let deepseekProviderOptions: + | { thinking: { type: 'enabled' | 'disabled' }; reasoningEffort?: 'low' | 'medium' | 'high' | 'xhigh' | 'max' } + | undefined; + if (isDeepSeekModel(model)) { + const dsEffort = opts.reasoning_effort; + const thinkingEnabled = dsEffort && dsEffort !== 'off'; + deepseekProviderOptions = { + thinking: { type: thinkingEnabled ? 'enabled' : 'disabled' }, + ...(thinkingEnabled ? { reasoningEffort: dsEffort } : {}), + }; + } + // F6: per-chunk stall deadline. If the model stops emitting chunks for // STALL_TIMEOUT_MS the stallAc fires through AbortSignal.any; the post-loop // abort check below then throws AbortError → handleAbortOrError writes @@ -297,7 +314,14 @@ export async function streamCompletion( ...(typeof opts.temperature === 'number' ? { temperature: opts.temperature } : {}), ...(typeof opts.top_p === 'number' ? { topP: opts.top_p } : {}), ...(typeof opts.presence_penalty === 'number' ? { presencePenalty: opts.presence_penalty } : {}), - ...(samplerBody ? { providerOptions: { openaiCompatible: samplerBody } } : {}), + ...(samplerBody || deepseekProviderOptions + ? { + providerOptions: { + ...(samplerBody ? { openaiCompatible: samplerBody } : {}), + ...(deepseekProviderOptions ? { deepseek: deepseekProviderOptions } : {}), + }, + } + : {}), abortSignal: effectiveSignal, }); @@ -401,12 +425,26 @@ export async function streamCompletion( // Usage lands as a promise on the result; awaiting after fullStream is // drained is safe. AI SDK v6 names: `inputTokens` / `outputTokens`. + // Some providers (llama-swap via openai-compatible) return plain numbers; + // others (deepseek via @ai-sdk/deepseek) return {total, cacheRead, noCache, ...}. let promptTokens: number | null = null; let completionTokens: number | null = null; + let cacheReadTokens: number | null = null; + let reasoningTokens: number | null = null; try { const usage = await result.usage; - if (typeof usage.inputTokens === 'number') promptTokens = usage.inputTokens; - if (typeof usage.outputTokens === 'number') completionTokens = usage.outputTokens; + if (typeof usage.inputTokens === 'number') { + promptTokens = usage.inputTokens; + } else if (usage.inputTokens && typeof usage.inputTokens === 'object') { + promptTokens = (usage.inputTokens as Record).total ?? null; + cacheReadTokens = (usage.inputTokens as Record).cacheRead ?? null; + } + if (typeof usage.outputTokens === 'number') { + completionTokens = usage.outputTokens; + } else if (usage.outputTokens && typeof usage.outputTokens === 'object') { + completionTokens = (usage.outputTokens as Record).total ?? null; + reasoningTokens = (usage.outputTokens as Record).reasoning ?? null; + } } catch { // Some providers omit usage on partial streams; leave both null. } @@ -422,6 +460,13 @@ export async function streamCompletion( ); } + if (cacheReadTokens !== null || reasoningTokens !== null) { + ctx.log.debug( + { promptTokens, completionTokens, cacheReadTokens, reasoningTokens, model }, + 'streamCompletion: deepseek usage breakdown', + ); + } + return { finishReason, content, @@ -429,6 +474,10 @@ export async function streamCompletion( promptTokens, completionTokens, reasoning: reasoningAccumulated, + // vDeepSeek: optional usage breakdown populated when the provider returns + // structured usage (cache hit tokens, reasoning tokens). + cacheReadTokens: cacheReadTokens ?? undefined, + reasoningTokens: reasoningTokens ?? undefined, }; } finally { // Clear the stall timer whether the stream completes normally, throws, or diff --git a/apps/server/src/services/inference/tool-input-repair.ts b/apps/server/src/services/inference/tool-input-repair.ts new file mode 100644 index 0000000..52941f0 --- /dev/null +++ b/apps/server/src/services/inference/tool-input-repair.ts @@ -0,0 +1,179 @@ +/** + * vWhale: schema-based tool input repair. When the model emits tool call args + * that don't match the expected types (common with weaker models), apply + * heuristic repairs before falling through to the Zod parse. + * + * Inspired by Whale's RepairToolInputForSpec: + * - Coerce string "true"/"false" → boolean + * - Unwrap markdown autolinks in string fields: → /path + * - Wrap bare values in arrays when schema expects array + * - Convert "42.0" decimal string → "42" for integer fields + * - Recurse into objects to repair nested properties + */ + +export interface ToolInputRepair { + field: string; + kind: string; + detail: string; +} + +const MARKDOWN_AUTOLINK_RE = /^<(?:file|path):\/\/(.+?)>$/; + +/** + * Attempt to repair tool call args against the tool's JSON Schema. + * Returns the (possibly modified) args plus a list of repairs applied. + */ +export function repairToolInput( + schema: Record | undefined, + args: Record, +): { repaired: Record; repairs: ToolInputRepair[] } { + const repairs: ToolInputRepair[] = []; + if (!schema || typeof schema !== 'object') { + return { repaired: args, repairs }; + } + + const properties = (schema as Record).properties as + Record | undefined; + if (!properties) { + return { repaired: args, repairs }; + } + + const required = new Set( + Array.isArray((schema as Record).required) + ? (schema as Record).required as string[] + : [], + ); + + const repaired: Record = {}; + for (const [key, value] of Object.entries(args)) { + const propSchema = properties[key] as Record | undefined; + if (propSchema && value !== null && value !== undefined) { + repaired[key] = repairValue(key, propSchema, value, repairs, required.has(key)); + } else { + repaired[key] = value; + } + } + + // Drop keys not in the schema (only for required fields that are missing) + // to avoid polluting the model with hallucinated params. + for (const key of Object.keys(repaired)) { + if (!(key in properties)) { + repairs.push({ field: key, kind: 'removed_unknown', detail: `Removed unknown parameter '${key}'` }); + delete repaired[key]; + } + } + + return { repaired, repairs }; +} + +function repairValue( + field: string, + schema: Record, + value: unknown, + repairs: ToolInputRepair[], + required: boolean, +): unknown { + const schemaType = schema.type; + const isArray = schemaType === 'array' || Array.isArray(schemaType) + ? schemaType === 'array' || (Array.isArray(schemaType) && schemaType.includes('array')) + : false; + const isObject = schemaType === 'object'; + const isBoolean = schemaType === 'boolean'; + const isInteger = schemaType === 'integer' || schemaType === 'number'; + const isString = schemaType === 'string'; + + // --- Array repair: wrap bare value or empty object --- + if (isArray) { + if (!Array.isArray(value)) { + if (typeof value === 'string') { + // Try parsing as JSON array first + try { + const parsed = JSON.parse(value); + if (Array.isArray(parsed)) { + repairs.push({ field, kind: 'parsed_json_array', detail: `Parsed string as JSON array for '${field}'` }); + return parsed; + } + } catch { /* not JSON */ } + } + if (typeof value === 'object' && value !== null && Object.keys(value).length === 0) { + if (required) { + repairs.push({ field, kind: 'empty_object_to_array', detail: `Converted empty object to empty array for '${field}'` }); + return []; + } + repairs.push({ field, kind: 'empty_object_to_undefined', detail: `Removed empty object for optional array '${field}'` }); + return undefined; + } + repairs.push({ field, kind: 'wrapped_in_array', detail: `Wrapped bare value in array for '${field}'` }); + return [value]; + } + // Recurse into array items + const itemsSchema = schema.items as Record | undefined; + if (itemsSchema) { + return value.map((item, i) => repairValue(`${field}[${i}]`, itemsSchema, item, repairs, required)); + } + return value; + } + + // --- Object repair: recurse into properties --- + if (isObject && typeof value === 'object' && value !== null && !Array.isArray(value)) { + const props = (schema.properties as Record) ?? {}; + const repaired: Record = {}; + for (const [k, v] of Object.entries(value as Record)) { + const propSchema = props[k] as Record | undefined; + if (propSchema) { + repaired[k] = repairValue(`${field}.${k}`, propSchema, v, repairs, required); + } else { + repaired[k] = v; + } + } + return repaired; + } + + // --- String repair: unwrap markdown autolinks --- + if (isString && typeof value === 'string') { + const match = value.match(MARKDOWN_AUTOLINK_RE); + if (match) { + repairs.push({ field, kind: 'unwrapped_markdown_link', detail: `Unwrapped markdown autolink for '${field}': ${value}` }); + return match[1]; + } + return value; + } + + // --- Boolean coercion --- + if (isBoolean && typeof value === 'string') { + const lower = value.toLowerCase(); + if (lower === 'true') { + repairs.push({ field, kind: 'coerced_to_boolean', detail: `Coerced string '${value}' → true for '${field}'` }); + return true; + } + if (lower === 'false') { + repairs.push({ field, kind: 'coerced_to_boolean', detail: `Coerced string '${value}' → false for '${field}'` }); + return false; + } + return value; + } + + // --- Integer coercion: "42.0" → 42 --- + if (isInteger && typeof value === 'string') { + const num = Number(value); + if (!Number.isNaN(num)) { + repairs.push({ field, kind: 'coerced_to_number', detail: `Coerced string '${value}' → ${num} for '${field}'` }); + return num; + } + return value; + } + + // --- Integer coercion: boolean → 0/1 --- + if (isInteger && typeof value === 'boolean') { + repairs.push({ field, kind: 'coerced_boolean_to_integer', detail: `Coerced boolean ${value} → ${value ? 1 : 0} for '${field}'` }); + return value ? 1 : 0; + } + + // --- Empty string to null for optional fields --- + if (value === '' && !required) { + repairs.push({ field, kind: 'empty_string_to_undefined', detail: `Converted empty string for optional '${field}'` }); + return undefined; + } + + return value; +} diff --git a/apps/server/src/services/inference/tool-phase.ts b/apps/server/src/services/inference/tool-phase.ts index f0e8d44..1adeb2e 100644 --- a/apps/server/src/services/inference/tool-phase.ts +++ b/apps/server/src/services/inference/tool-phase.ts @@ -6,6 +6,7 @@ import type { ToolExecCtx } from '../tools.js'; import { matchToolGlob } from '../agents.js'; import { maybeFlagForCompaction } from './payload.js'; import { insertParts, partsFromAssistantMessage, partsFromToolMessage } from './parts.js'; +import { getServerPermission } from '../mcp-client.js'; // v1.13.16: richer unknown-tool error so the model can self-correct when it // drifts to a Claude Code tool name (e.g. read_file → suggest view_file). // Applies to all unknown tool names, not just -derived ones — at the @@ -17,6 +18,7 @@ import { formatUnknownToolError } from './tool-suggestions.js'; // prompted about paths we couldn't grant anyway (e.g. /etc/passwd). import { resolveGrantRoot } from '../grant_resolver.js'; import { stripToolMarkup } from './tool-call-parser.js'; +import { repairToolInput } from './tool-input-repair.js'; import type { FailureKind } from './mistake-tracker.js'; import type { InferenceContext, @@ -34,6 +36,8 @@ async function executeToolCall( toolCall: ToolCall, extraRoots: readonly string[], toolCtx?: ToolExecCtx, + hooks?: import('../hooks.js').HookRunner, + sessionId?: string, ): Promise<{ output: unknown; truncated: boolean; error?: string; outcome: FailureKind | 'success' }> { // v#12 MistakeTracker: every return path carries an `outcome` so the turn // loop can detect a run of heterogeneous failures. The failure taxonomy @@ -48,7 +52,61 @@ async function executeToolCall( outcome: 'tool_not_found', }; } - const parsed = tool.inputSchema.safeParse(toolCall.args); + // MCP permission gate — block deny/ask before any Zod parsing or execution + const mcpPerm = getServerPermission(toolCall.name); + if (mcpPerm === 'deny') { + return { output: null, truncated: false, error: `blocked: MCP server denied tool '${toolCall.name}'`, outcome: 'permission_denied' }; + } + if (mcpPerm === 'ask') { + return { output: null, truncated: false, error: `requires approval: tool '${toolCall.name}' needs user approval`, outcome: 'permission_denied' }; + } + // vWhale: schema-based tool input repair. If the Zod parse fails, attempt + // heuristic repairs (type coercion, markdown-link unwrapping, array wrapping) + // and retry. Logs repairs for debugging. + let args = toolCall.args; + let parsed = tool.inputSchema.safeParse(args); + if (!parsed.success) { + const schema = tool.jsonSchema?.function?.parameters; + if (schema) { + const { repaired: repairedArgs, repairs } = repairToolInput( + schema as Record, + args as Record, + ); + if (repairs.length > 0) { + const retry = tool.inputSchema.safeParse(repairedArgs); + if (retry.success) { + args = repairedArgs; + parsed = retry; + } + } + } + } + // vWhale: PreToolUse hook — can block execution. + if (hooks && sessionId) { + const hookResult = await hooks.run('PreToolUse', { + event: 'PreToolUse', + session_id: sessionId, + tool_name: toolCall.name, + tool_args: args as Record, + }); + if (hookResult.decision === 'block') { + return { + output: null, + truncated: false, + error: `blocked by hook: ${hookResult.reason ?? 'PreToolUse denied'}`, + outcome: 'permission_denied', + }; + } + // Apply updated_input if the hook rewrote the args + if (hookResult.updated_input && typeof hookResult.updated_input === 'object') { + const reParsed = tool.inputSchema.safeParse(hookResult.updated_input); + if (reParsed.success) { + args = hookResult.updated_input as Record; + parsed = reParsed; + } + } + } + if (!parsed.success) { // v1.12 Track B.2: enrich the zod-reject path so the model sees a // one-line, tool-named hint ("tool 'search_symbols' rejected — query: @@ -183,6 +241,8 @@ export async function executeToolPhase( tokens_used: updated?.tokens_used ?? null, ctx_used: updated?.ctx_used ?? null, ctx_max: updated?.ctx_max ?? null, + cache_tokens: result.cacheReadTokens ?? null, + reasoning_tokens: result.reasoningTokens ?? null, started_at: startedAt, finished_at: updated?.finished_at ?? null, model: session.model, @@ -318,10 +378,22 @@ export async function executeToolPhase( }); return; } - const tres = await executeToolCall(projectRoot, tc, session.allowed_read_paths, { - sql: ctx.sql, - sessionId, - }); + const tres = await executeToolCall( + projectRoot, tc, session.allowed_read_paths, + { sql: ctx.sql, sessionId }, + ctx.hooks, sessionId, + ); + // vWhale: PostToolUse hook (best-effort, non-blocking). + if (ctx.hooks) { + ctx.hooks.run('PostToolUse', { + event: 'PostToolUse', + session_id: sessionId, + tool_name: tc.name, + tool_args: tc.args as Record, + tool_result: tres.output, + tool_error: tres.error, + }).catch(() => {}); + } // v#12 MistakeTracker: record the real execution outcome (success or a // FailureKind). This is the primary signal for heterogeneous-failure // detection. diff --git a/apps/server/src/services/inference/turn.ts b/apps/server/src/services/inference/turn.ts index 12c4ff3..f6dacb2 100644 --- a/apps/server/src/services/inference/turn.ts +++ b/apps/server/src/services/inference/turn.ts @@ -144,6 +144,7 @@ export async function runAssistantTurn( log: ctx.log, broker: ctx.broker, chatId, + hooks: ctx.hooks, }); } catch (err) { ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding'); @@ -214,6 +215,16 @@ export async function runAssistantTurn( // ---- non-tool finish → finalize and exit ---- if (result.toolCalls.length === 0) { + // vWhale: Stop hook (best-effort, non-blocking). + if (ctx.hooks) { + ctx.hooks.run('Stop', { + event: 'Stop', + session_id: sessionId, + chat_id: chatId, + last_assistant_text: result.content.slice(0, 500), + turn: stepNumber, + }).catch(() => {}); + } await finalizeCompletion(ctx, iterArgs, result, state.startedAt, iterSession); break; } @@ -309,6 +320,22 @@ export async function runAssistantTurn( assistantMessageId = toolPhaseResult.nextAssistantId!; } + // vWhale: Stop hook at post-loop exit (best-effort, non-blocking). + if (ctx.hooks) { + const loaded = await loadContext(ctx.sql, sessionId, chatId); + const lastAssistant = loaded?.history?.slice().reverse().find( + (m: import('../../types/api.js').Message) => m.role === 'assistant', + ); + const content = lastAssistant?.content ?? ''; + ctx.hooks.run('Stop', { + event: 'Stop', + session_id: sessionId, + chat_id: chatId, + last_assistant_text: content.slice(0, 500), + turn: stepNumber, + }).catch(() => {}); + } + // ---- post-loop: step-cap sentinel ---- // When the loop exits because stepNumber reached effectiveCap, the last // iteration's tool phase returned 'continue' with a nextAssistantId that diff --git a/apps/server/src/services/inference/types.ts b/apps/server/src/services/inference/types.ts index d3f3bd0..94892a3 100644 --- a/apps/server/src/services/inference/types.ts +++ b/apps/server/src/services/inference/types.ts @@ -19,6 +19,7 @@ import type { UserStreamFrame, } from '../../types/api.js'; import type { Broker } from '../broker.js'; +import type { HookRunner } from '../hooks.js'; import type { MistakeState } from './mistake-tracker.js'; export interface StreamPhaseState { @@ -77,6 +78,8 @@ export interface InferenceFrame { started_at?: string | null; finished_at?: string | null; model?: string; + cache_tokens?: number | null; + reasoning_tokens?: number | null; session_id?: string; name?: string; // orchestrator frames ([D-6]) @@ -117,6 +120,9 @@ export interface InferenceContext { // inference goes through `publish`); keeping a separate field avoids // tempting other code paths into bypassing the session-id binding. broker: Broker; + // vWhale: lifecycle hooks runner. Undefined when no hooks configured. + // Hook calls are best-effort — a failing hook never blocks inference. + hooks?: HookRunner; } export interface StreamResult { @@ -128,6 +134,12 @@ export interface StreamResult { // v1.13.1-C: reasoning text accumulated across reasoning-delta parts. // Empty string when the model doesn't emit reasoning (most cases). reasoning: string; + // vDeepSeek: optional cache-hit token count from DeepSeek's API. + // Only populated when using @ai-sdk/deepseek provider (not llama-swap). + cacheReadTokens?: number; + // vDeepSeek: optional reasoning token count from DeepSeek's API. + // Only populated when using @ai-sdk/deepseek provider (not llama-swap). + reasoningTokens?: number; } export interface TurnArgs { diff --git a/apps/server/src/services/mcp-client.ts b/apps/server/src/services/mcp-client.ts index a2017c2..f588236 100644 --- a/apps/server/src/services/mcp-client.ts +++ b/apps/server/src/services/mcp-client.ts @@ -31,11 +31,14 @@ interface McpToolDef { annotations?: McpToolAnnotations; } +export type McpPermission = 'allow' | 'ask' | 'deny'; + interface ServerState { client: Client; transport: StreamableHTTPClientTransport | StdioClientTransport; tools: ToolDef>[]; type: 'streamableHttp' | 'stdio'; + permission: McpPermission; } // ---- Module-level state ---- @@ -137,6 +140,14 @@ export async function callTool( } } +/** Return the permission level for a given MCP server. Defaults to 'allow' if unknown. */ +export function getServerPermission(prefixedToolName: string): McpPermission { + const serverName = toolToServer.get(prefixedToolName); + if (!serverName) return 'allow'; + const state = servers.get(serverName); + return state?.permission ?? 'allow'; +} + /** Return all wrapped ToolDefs from all connected servers, flattened. */ export function getTools(): ToolDef>[] { const all: ToolDef>[] = []; @@ -214,7 +225,8 @@ async function connectServer(entry: McpServerEntry): Promise { toolToServer.set(wrapped.name, name); } - servers.set(name, { client, transport, tools, type: config.type }); + const permission = (config as { permission?: McpPermission }).permission ?? 'allow'; + servers.set(name, { client, transport, tools, type: config.type, permission }); log!.info( { server: name, type: config.type, count: tools.length, names: tools.map((t) => t.name) }, diff --git a/apps/server/src/services/mcp-config.ts b/apps/server/src/services/mcp-config.ts index fcb04ba..e1e1750 100644 --- a/apps/server/src/services/mcp-config.ts +++ b/apps/server/src/services/mcp-config.ts @@ -17,12 +17,15 @@ import type { FastifyBaseLogger } from 'fastify'; // ---- Zod schema ---- +const McpPermissionSchema = z.enum(['allow', 'ask', 'deny']).default('allow'); + const McpServerConfigSchema = z.discriminatedUnion('type', [ z.object({ type: z.literal('streamableHttp'), url: z.string().url(), headers: z.record(z.string()).optional(), enabled: z.boolean().default(true), + permission: McpPermissionSchema, }), z.object({ type: z.literal('stdio'), @@ -30,6 +33,7 @@ const McpServerConfigSchema = z.discriminatedUnion('type', [ args: z.array(z.string()).default([]), env: z.record(z.string()).optional(), enabled: z.boolean().default(true), + permission: McpPermissionSchema, }), ]); diff --git a/apps/server/src/services/message-columns.ts b/apps/server/src/services/message-columns.ts index 33a4cac..42d1a40 100644 --- a/apps/server/src/services/message-columns.ts +++ b/apps/server/src/services/message-columns.ts @@ -7,10 +7,12 @@ export const MESSAGE_COLUMNS = 'id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq, ' + - 'tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata, ' + + 'tokens_used, ctx_used, ctx_max, cache_tokens, reasoning_tokens, ' + + 'started_at, finished_at, created_at, metadata, ' + 'summary, tail_start_id, compacted_at, model'; export const INFERENCE_MESSAGE_COLUMNS = 'id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq, ' + - 'tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata, ' + + 'tokens_used, ctx_used, ctx_max, cache_tokens, reasoning_tokens, ' + + 'started_at, finished_at, created_at, metadata, ' + 'reasoning_parts, model'; diff --git a/apps/server/src/services/model-context.ts b/apps/server/src/services/model-context.ts index d390bfe..4ef6710 100644 --- a/apps/server/src/services/model-context.ts +++ b/apps/server/src/services/model-context.ts @@ -37,7 +37,18 @@ export function configureModelContext(opts: { llamaSwapUrl: string }): void { llamaSwapUrl = opts.llamaSwapUrl; } +// vDeepSeek: DeepSeek models don't have a /upstream//props endpoint. +// Return a reasonable default context so compaction estimates work. +const DEEPSEEK_DEFAULT_N_CTX = 131_072; +const DEEPSEEK_MODEL_PREFIX = 'deepseek-'; + export async function getModelContext(model: string): Promise { + // vDeepSeek: DeepSeek models have no /upstream//props. Use a static + // default so compaction doesn't fall to the buffer-only path with tiny limits. + if (model.startsWith(DEEPSEEK_MODEL_PREFIX)) { + return { n_ctx: DEEPSEEK_DEFAULT_N_CTX }; + } + // 1. Positive cache hit — no TTL check, model n_ctx is invariant. const pos = positiveCache.get(model); if (pos) return pos; diff --git a/apps/server/src/services/system-prompt.ts b/apps/server/src/services/system-prompt.ts index a1dde52..533b1dd 100644 --- a/apps/server/src/services/system-prompt.ts +++ b/apps/server/src/services/system-prompt.ts @@ -101,7 +101,7 @@ export interface PrefixFingerprint { has_agent_system_prompt: boolean; has_session_override: boolean; has_project_override: boolean; - route: 'swap' | 'sidecar'; + route: 'swap' | 'sidecar' | 'deepseek'; } export interface PrefixDrift { @@ -129,7 +129,7 @@ interface ObservedInputs { has_agent_system_prompt: boolean; has_session_override: boolean; has_project_override: boolean; - route: 'swap' | 'sidecar'; + route: 'swap' | 'sidecar' | 'deepseek'; } interface ObserverEntry { diff --git a/apps/server/src/types/api.ts b/apps/server/src/types/api.ts index 0146343..a1264f7 100644 --- a/apps/server/src/types/api.ts +++ b/apps/server/src/types/api.ts @@ -127,6 +127,9 @@ export interface Agent { // bounded only by MAX_STEPS (200). 0 means "no tool calls allowed." steps: number | null; llama_extra_args: string[] | null; + // vDeepSeek: thinking/reasoning effort for DeepSeek V4 models. + // Maps to DeepSeek's reasoning_effort API param. + reasoning_effort: 'off' | 'low' | 'medium' | 'high' | 'xhigh' | 'max' | null; } // One entry per malformed `## Name` block. Per-block errors don't fail the @@ -206,6 +209,8 @@ export interface Message { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; + cache_tokens: number | null; + reasoning_tokens: number | null; started_at: string | null; finished_at: string | null; created_at: string; diff --git a/apps/web/src/api/types.ts b/apps/web/src/api/types.ts index fba90dc..ed01ff8 100644 --- a/apps/web/src/api/types.ts +++ b/apps/web/src/api/types.ts @@ -152,6 +152,8 @@ export interface Message { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; + cache_tokens: number | null; + reasoning_tokens: number | null; // model-attribution: which model produced this assistant message (null for // user/system rows + pre-attribution messages). Rendered as a chip. model: string | null; diff --git a/apps/web/src/components/MessageBubble.tsx b/apps/web/src/components/MessageBubble.tsx index 4ecb881..7f50505 100644 --- a/apps/web/src/components/MessageBubble.tsx +++ b/apps/web/src/components/MessageBubble.tsx @@ -156,9 +156,16 @@ function StatsLine({ message }: { message: Message }) { : `${ctxUsed} ctx` : null; + const cacheHit = message.cache_tokens; + const reasoning = message.reasoning_tokens; + const cachePart = typeof cacheHit === 'number' && cacheHit > 0 ? `cache ${cacheHit}` : null; + const reasoningPart = typeof reasoning === 'number' && reasoning > 0 ? `think ${reasoning}` : null; + const parts: string[] = [`${tokens} tokens`]; if (tps !== null) parts.push(`${tps.toFixed(1)} tok/s`); if (ctxPart) parts.push(ctxPart); + if (cachePart) parts.push(cachePart); + if (reasoningPart) parts.push(reasoningPart); return (
diff --git a/apps/web/src/hooks/useSessionStream.ts b/apps/web/src/hooks/useSessionStream.ts index 75c4429..32ab9d3 100644 --- a/apps/web/src/hooks/useSessionStream.ts +++ b/apps/web/src/hooks/useSessionStream.ts @@ -123,6 +123,8 @@ function applyFrame(state: State, frame: WsFrame): State { ...(frame.tokens_used !== undefined ? { tokens_used: frame.tokens_used } : {}), ...(frame.ctx_used !== undefined ? { ctx_used: frame.ctx_used } : {}), ...(frame.ctx_max !== undefined ? { ctx_max: frame.ctx_max } : {}), + ...(frame.cache_tokens !== undefined ? { cache_tokens: frame.cache_tokens } : {}), + ...(frame.reasoning_tokens !== undefined ? { reasoning_tokens: frame.reasoning_tokens } : {}), ...(frame.started_at !== undefined ? { started_at: frame.started_at } : {}), ...(frame.finished_at !== undefined ? { finished_at: frame.finished_at } : {}), ...(frame.model !== undefined ? { model: frame.model } : {}), diff --git a/data/AGENTS.md b/data/AGENTS.md index 6d4e0dd..f62779c 100644 --- a/data/AGENTS.md +++ b/data/AGENTS.md @@ -6,7 +6,7 @@ Operating rules for every agent in this registry. Full procedures live in the `c **Worktrees** — Isolate work in a worktree when it is parallel to in-progress work, risky/experimental, a hotfix interrupting other work, or splits into independent units — just create when clear, propose in one line when ambiguous, skip quick/small single-stream work. Branch from a stable base (default branch); worktrees persist (never auto-remove or auto-merge); they isolate code state, not runtime (ports/DBs/services still collide). Full heuristic: invoke `using-worktrees`. -**Sampling knobs** — Each `## Name` frontmatter block accepts these per-agent sampler fields, threaded into the llama-swap chat-completion request: `temperature`, `top_p`, `top_k`, `min_p`, `presence_penalty`, and (v2.6) `top_n_sigma`, `dry_multiplier`, `dry_base`, `dry_allowed_length`, `dry_penalty_last_n`. The `top_n_sigma` + `dry_*` repetition family curb the doom-loop-prone local model. Omit a field to leave it at the server default. Example: `top_n_sigma: 1.0`, `dry_multiplier: 0.8`, `dry_base: 1.75`, `dry_allowed_length: 2`, `dry_penalty_last_n: -1` (-1 = whole context). +**Sampling knobs** — Each `## Name` frontmatter block accepts these per-agent sampler fields, threaded into the llama-swap chat-completion request: `temperature`, `top_p`, `top_k`, `min_p`, `presence_penalty`, and (v2.6) `top_n_sigma`, `dry_multiplier`, `dry_base`, `dry_allowed_length`, `dry_penalty_last_n`. The `top_n_sigma` + `dry_*` repetition family curb the doom-loop-prone local model. Omit a field to leave it at the server default. Example: `top_n_sigma: 1.0`, `dry_multiplier: 0.8`, `dry_base: 1.75`, `dry_allowed_length: 2`, `dry_penalty_last_n: -1` (-1 = whole context). DeepSeek V4 models also accept `reasoning_effort` (low/medium/high/xhigh/max); omit to disable thinking mode. Example: `reasoning_effort: 'high'`. **Reasoning budget** — To cap a reasoning model's thinking tokens, pass `--reasoning-budget` through `llama_extra_args` (already permitted by the deny-list validator; routes the agent to llama-sidecar). Example frontmatter line: `llama_extra_args: ["--reasoning-budget", "2048"]`. This is a sidecar process flag, not a chat-completion body param — distinct from the sampling knobs above. diff --git a/packages/contracts/src/ws-frames.ts b/packages/contracts/src/ws-frames.ts index 06b9e59..2a4bf7c 100644 --- a/packages/contracts/src/ws-frames.ts +++ b/packages/contracts/src/ws-frames.ts @@ -116,6 +116,8 @@ export const MessageCompleteFrame = z.object({ tokens_used: z.number().int().nonnegative().nullable().optional(), ctx_used: z.number().int().nonnegative().nullable().optional(), ctx_max: z.number().int().positive().nullable().optional(), + cache_tokens: z.number().int().nonnegative().nullable().optional(), + reasoning_tokens: z.number().int().nonnegative().nullable().optional(), started_at: IsoTimestamp.nullable().optional(), finished_at: IsoTimestamp.nullable().optional(), // nullable: external-coder turns carry task.model, which is null when no