feat: DeepSeek API integration + Whale lift (hooks, tool repair, MCP permissions, token tracking)

DeepSeek API:
- @ai-sdk/deepseek provider replaces openai-compatible for deepseek-* models
- Token tracking: cache_hit/reasoning tokens flow API → DB → WS frames → UI
- Thinking effort levels (off/low/medium/high/xhigh/max) via AGENTS.md frontmatter
- V4 models: deepseek-v4-flash, deepseek-v4-pro
- Wired for both chat and coder panes

Whale lifts:
- Tool input repair (schema-based type coercion, markdown link unwrapping)
- Hooks system (6 lifecycle events, shell exec, JSON stdin/stdout contract)
- Per-MCP-server permissions (allow/ask/deny)
- Token tracking UI (cache N, think N in message stats line)

Infra:
- New DB columns: messages.cache_tokens, messages.reasoning_tokens
- New WS frame fields: cache_tokens, reasoning_tokens on message_complete
- Coder provider snapshot merges DeepSeek models alongside llama-swap
This commit is contained in:
2026-06-08 01:24:23 +00:00
parent 31e5d9d4ab
commit c4079dd85c
29 changed files with 916 additions and 42 deletions

View File

@@ -24,6 +24,8 @@ import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
import * as modelContextLookup from './model-context.js';
import { SENTINEL_KINDS } from './inference/sentinels.js';
import type { OpenAiMessage } from './inference/payload.js';
import { resolveModelEndpoint } from './inference/provider.js';
import type { HookRunner } from './hooks.js';
// v1.13.9: ratio-only overflow trigger. Fires compaction at 85% of ctx_max
// (opencode session/overflow.ts pattern). Replaces the v1.11.0-era
@@ -346,20 +348,22 @@ interface CompletionResult {
completionTokens: number;
}
async function callLlamaSwap(
async function callLlm(
config: Config,
model: string,
messages: OpenAiMessage[],
log: FastifyBaseLogger,
): Promise<CompletionResult> {
const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
const { url, headers, model: resolvedModel } = resolveModelEndpoint(config, model);
const res = await fetch(`${url}/v1/chat/completions`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ model, messages, stream: false }),
headers,
body: JSON.stringify({ model: resolvedModel, messages, stream: false }),
});
if (!res.ok) {
const text = await res.text().catch(() => '');
throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`);
const prefix = model.startsWith('deepseek-') ? 'deepseek' : 'llama-swap';
throw new Error(`${prefix} returned ${res.status}: ${text.slice(0, 200)}`);
}
const json = (await res.json()) as {
choices?: Array<{ message?: { content?: string } }>;
@@ -383,6 +387,8 @@ export interface ProcessInput {
log: FastifyBaseLogger;
broker: Broker;
chatId: string;
/** vWhale: lifecycle hooks runner. Undefined when no hooks configured. */
hooks?: HookRunner;
}
// Runs one round of anchored rolling compaction on `chatId`. No-ops cleanly
@@ -497,6 +503,17 @@ export async function process(input: ProcessInput): Promise<void> {
at: new Date().toISOString(),
});
// vWhale: PreCompact hook (best-effort, non-blocking).
const msgBefore = messages.length;
if (input.hooks) {
input.hooks.run('PreCompact', {
event: 'PreCompact',
session_id: sessionId,
chat_id: chatId,
messages_before: msgBefore,
}).catch(() => {});
}
// try/finally so the dot ALWAYS drops back to idle, even if the LLM call
// throws or a downstream DB write fails. The succeeded flag gates the
// 'compacted' frame + final log: we only signal completion to the UI when
@@ -506,7 +523,7 @@ export async function process(input: ProcessInput): Promise<void> {
let result: CompletionResult | undefined;
try {
// 7. Single completion (no tools). Throws if the resolved endpoint (llama-swap or DeepSeek) returns a non-OK status.
result = await callLlamaSwap(config, session.model, payload, log);
result = await callLlm(config, session.model, payload, log);
// 7b. v1.11.3: fetch the model's true context window from llama-swap's
// /upstream/<model>/props (the streaming completion doesn't carry it).
@@ -558,6 +575,18 @@ export async function process(input: ProcessInput): Promise<void> {
`;
succeeded = true;
// vWhale: PostCompact hook (best-effort, non-blocking).
if (input.hooks) {
input.hooks.run('PostCompact', {
event: 'PostCompact',
session_id: sessionId,
chat_id: chatId,
messages_before: msgBefore,
messages_after: sel.head.length,
summary: (result?.content ?? '').slice(0, 500),
}).catch(() => {});
}
} finally {
// Always restore the dot. Status='idle' (not 'error') even on failure —
// the caller logs/re-surfaces the error separately; the dot doesn't