feat: DeepSeek API integration + Whale lift (hooks, tool repair, MCP permissions, token tracking)
DeepSeek API:
- @ai-sdk/deepseek provider replaces openai-compatible for deepseek-* models
- Token tracking: cache_hit/reasoning tokens flow API → DB → WS frames → UI
- Thinking effort levels (off/low/medium/high/xhigh/max) via AGENTS.md frontmatter
- V4 models: deepseek-v4-flash, deepseek-v4-pro
- Wired for both chat and coder panes

Whale lifts:
- Tool input repair (schema-based type coercion, markdown link unwrapping)
- Hooks system (6 lifecycle events, shell exec, JSON stdin/stdout contract)
- Per-MCP-server permissions (allow/ask/deny)
- Token tracking UI (cache N, think N in message stats line)

Infra:
- New DB columns: messages.cache_tokens, messages.reasoning_tokens
- New WS frame fields: cache_tokens, reasoning_tokens on message_complete
- Coder provider snapshot merges DeepSeek models alongside llama-swap
This commit is contained in:
@@ -24,6 +24,8 @@ import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
|
||||
import * as modelContextLookup from './model-context.js';
|
||||
import { SENTINEL_KINDS } from './inference/sentinels.js';
|
||||
import type { OpenAiMessage } from './inference/payload.js';
|
||||
import { resolveModelEndpoint } from './inference/provider.js';
|
||||
import type { HookRunner } from './hooks.js';
|
||||
|
||||
// v1.13.9: ratio-only overflow trigger. Fires compaction at 85% of ctx_max
|
||||
// (opencode session/overflow.ts pattern). Replaces the v1.11.0-era
|
||||
@@ -346,20 +348,22 @@ interface CompletionResult {
|
||||
completionTokens: number;
|
||||
}
|
||||
|
||||
async function callLlamaSwap(
|
||||
async function callLlm(
|
||||
config: Config,
|
||||
model: string,
|
||||
messages: OpenAiMessage[],
|
||||
log: FastifyBaseLogger,
|
||||
): Promise<CompletionResult> {
|
||||
const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
|
||||
const { url, headers, model: resolvedModel } = resolveModelEndpoint(config, model);
|
||||
const res = await fetch(`${url}/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ model, messages, stream: false }),
|
||||
headers,
|
||||
body: JSON.stringify({ model: resolvedModel, messages, stream: false }),
|
||||
});
|
||||
if (!res.ok) {
|
||||
const text = await res.text().catch(() => '');
|
||||
throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`);
|
||||
const prefix = model.startsWith('deepseek-') ? 'deepseek' : 'llama-swap';
|
||||
throw new Error(`${prefix} returned ${res.status}: ${text.slice(0, 200)}`);
|
||||
}
|
||||
const json = (await res.json()) as {
|
||||
choices?: Array<{ message?: { content?: string } }>;
|
||||
@@ -383,6 +387,8 @@ export interface ProcessInput {
|
||||
log: FastifyBaseLogger;
|
||||
broker: Broker;
|
||||
chatId: string;
|
||||
/** vWhale: lifecycle hooks runner. Undefined when no hooks configured. */
|
||||
hooks?: HookRunner;
|
||||
}
|
||||
|
||||
// Runs one round of anchored rolling compaction on `chatId`. No-ops cleanly
|
||||
@@ -497,6 +503,17 @@ export async function process(input: ProcessInput): Promise<void> {
|
||||
at: new Date().toISOString(),
|
||||
});
|
||||
|
||||
// vWhale: PreCompact hook (best-effort, non-blocking).
|
||||
const msgBefore = messages.length;
|
||||
if (input.hooks) {
|
||||
input.hooks.run('PreCompact', {
|
||||
event: 'PreCompact',
|
||||
session_id: sessionId,
|
||||
chat_id: chatId,
|
||||
messages_before: msgBefore,
|
||||
}).catch(() => {});
|
||||
}
|
||||
|
||||
// try/finally so the dot ALWAYS drops back to idle, even if the LLM call
|
||||
// throws or a downstream DB write fails. The succeeded flag gates the
|
||||
// 'compacted' frame + final log: we only signal completion to the UI when
|
||||
@@ -506,7 +523,7 @@ export async function process(input: ProcessInput): Promise<void> {
|
||||
let result: CompletionResult | undefined;
|
||||
try {
|
||||
// 7. Single completion (no tools). Throws on provider failure (llama-swap or DeepSeek).
|
||||
result = await callLlamaSwap(config, session.model, payload, log);
|
||||
result = await callLlm(config, session.model, payload, log);
|
||||
|
||||
// 7b. v1.11.3: fetch the model's true context window from llama-swap's
|
||||
// /upstream/<model>/props (the streaming completion doesn't carry it).
|
||||
@@ -558,6 +575,18 @@ export async function process(input: ProcessInput): Promise<void> {
|
||||
`;
|
||||
|
||||
succeeded = true;
|
||||
|
||||
// vWhale: PostCompact hook (best-effort, non-blocking).
|
||||
if (input.hooks) {
|
||||
input.hooks.run('PostCompact', {
|
||||
event: 'PostCompact',
|
||||
session_id: sessionId,
|
||||
chat_id: chatId,
|
||||
messages_before: msgBefore,
|
||||
messages_after: sel.head.length,
|
||||
summary: (result?.content ?? '').slice(0, 500),
|
||||
}).catch(() => {});
|
||||
}
|
||||
} finally {
|
||||
// Always restore the dot. Status='idle' (not 'error') even on failure —
|
||||
// the caller logs/re-surfaces the error separately; the dot doesn't
|
||||
|
||||
Reference in New Issue
Block a user