feat: DeepSeek API integration + Whale lift (hooks, tool repair, MCP permissions, token tracking)

DeepSeek API:
- @ai-sdk/deepseek provider replaces openai-compatible for deepseek-* models
- Token tracking: cache_hit/reasoning tokens flow API → DB → WS frames → UI
- Thinking effort levels (off/low/medium/high/xhigh/max) via AGENTS.md frontmatter
- V4 models: deepseek-v4-flash, deepseek-v4-pro
- Wired for both chat and coder panes

Whale lifts:
- Tool input repair (schema-based type coercion, markdown link unwrapping)
- Hooks system (6 lifecycle events, shell exec, JSON stdin/stdout contract)
- Per-MCP-server permissions (allow/ask/deny)
- Token tracking UI (cache N, think N in message stats line)

Infra:
- New DB columns: messages.cache_tokens, messages.reasoning_tokens
- New WS frame fields: cache_tokens, reasoning_tokens on message_complete
- Coder provider snapshot merges DeepSeek models alongside llama-swap
2026-06-08 01:24:23 +00:00
parent 31e5d9d4ab
commit c4079dd85c
29 changed files with 916 additions and 42 deletions
--- a/apps/server/src/services/inference/stream-phase-adapter.ts
+++ b/apps/server/src/services/inference/stream-phase-adapter.ts
@@ -13,7 +13,7 @@ import type { OpenAiMessage } from './payload.js';
 import { extractToolCallBlocks } from './tool-call-parser.js';
 import { classifyStreamError } from './stream-error-classifier.js';
 import type { StreamResult } from './types.js';
-import { upstreamModel } from './provider.js';
+import { isDeepSeekModel, upstreamModel } from './provider.js';
 import {
  jsonSchema,
  streamText,
@@ -51,6 +51,9 @@ export interface StreamOptions {
  dry_base?: number | null;
  dry_allowed_length?: number | null;
  dry_penalty_last_n?: number | null;
+  // vDeepSeek: thinking/reasoning effort. Maps to DeepSeek's reasoning_effort
+  // API param for deepseek-v4-flash / deepseek-v4-pro models.
+  reasoning_effort?: 'off' | 'low' | 'medium' | 'high' | 'xhigh' | 'max';
 }

 // P5: the 10-field sampler-options literal that was copy-pasted at 4 sites
@@ -74,6 +77,7 @@ export function samplerOptsFromAgent(agent: Agent | null): SamplerOpts {
    dry_base: agent?.dry_base ?? undefined,
    dry_allowed_length: agent?.dry_allowed_length ?? undefined,
    dry_penalty_last_n: agent?.dry_penalty_last_n ?? undefined,
+    reasoning_effort: agent?.reasoning_effort ?? undefined,
  };
 }

@@ -272,6 +276,19 @@ export async function streamCompletion(
  // before this. They now go through the same extraBody path as the new params.
  const samplerBody = buildSamplerProviderOptions(opts);

+  // vDeepSeek: build providerOptions.deepseek for DeepSeek V4 models.
+  let deepseekProviderOptions:
+    | { thinking: { type: 'enabled' | 'disabled' }; reasoningEffort?: 'low' | 'medium' | 'high' | 'xhigh' | 'max' }
+    | undefined;
+  if (isDeepSeekModel(model)) {
+    const dsEffort = opts.reasoning_effort;
+    const thinkingEnabled = dsEffort && dsEffort !== 'off';
+    deepseekProviderOptions = {
+      thinking: { type: thinkingEnabled ? 'enabled' : 'disabled' },
+      ...(thinkingEnabled ? { reasoningEffort: dsEffort } : {}),
+    };
+  }
+
  // F6: per-chunk stall deadline. If the model stops emitting chunks for
  // STALL_TIMEOUT_MS the stallAc fires through AbortSignal.any; the post-loop
  // abort check below then throws AbortError → handleAbortOrError writes
@@ -297,7 +314,14 @@ export async function streamCompletion(
    ...(typeof opts.temperature === 'number' ? { temperature: opts.temperature } : {}),
    ...(typeof opts.top_p === 'number' ? { topP: opts.top_p } : {}),
    ...(typeof opts.presence_penalty === 'number' ? { presencePenalty: opts.presence_penalty } : {}),
-    ...(samplerBody ? { providerOptions: { openaiCompatible: samplerBody } } : {}),
+    ...(samplerBody || deepseekProviderOptions
+      ? {
+          providerOptions: {
+            ...(samplerBody ? { openaiCompatible: samplerBody } : {}),
+            ...(deepseekProviderOptions ? { deepseek: deepseekProviderOptions } : {}),
+          },
+        }
+      : {}),
    abortSignal: effectiveSignal,
  });

@@ -401,12 +425,26 @@ export async function streamCompletion(

  // Usage lands as a promise on the result; awaiting after fullStream is
  // drained is safe. AI SDK v6 names: `inputTokens` / `outputTokens`.
+  // Some providers (llama-swap via openai-compatible) return plain numbers;
+  // others (deepseek via @ai-sdk/deepseek) return {total, cacheRead, noCache, ...}.
  let promptTokens: number | null = null;
  let completionTokens: number | null = null;
+  let cacheReadTokens: number | null = null;
+  let reasoningTokens: number | null = null;
  try {
    const usage = await result.usage;
-    if (typeof usage.inputTokens === 'number') promptTokens = usage.inputTokens;
-    if (typeof usage.outputTokens === 'number') completionTokens = usage.outputTokens;
+    if (typeof usage.inputTokens === 'number') {
+      promptTokens = usage.inputTokens;
+    } else if (usage.inputTokens && typeof usage.inputTokens === 'object') {
+      promptTokens = (usage.inputTokens as Record<string, number | undefined>).total ?? null;
+      cacheReadTokens = (usage.inputTokens as Record<string, number | undefined>).cacheRead ?? null;
+    }
+    if (typeof usage.outputTokens === 'number') {
+      completionTokens = usage.outputTokens;
+    } else if (usage.outputTokens && typeof usage.outputTokens === 'object') {
+      completionTokens = (usage.outputTokens as Record<string, number | undefined>).total ?? null;
+      reasoningTokens = (usage.outputTokens as Record<string, number | undefined>).reasoning ?? null;
+    }
  } catch {
    // Some providers omit usage on partial streams; leave both null.
  }
@@ -422,6 +460,13 @@ export async function streamCompletion(
    );
  }

+  if (cacheReadTokens !== null || reasoningTokens !== null) {
+    ctx.log.debug(
+      { promptTokens, completionTokens, cacheReadTokens, reasoningTokens, model },
+      'streamCompletion: deepseek usage breakdown',
+    );
+  }
+
  return {
    finishReason,
    content,
@@ -429,6 +474,10 @@ export async function streamCompletion(
    promptTokens,
    completionTokens,
    reasoning: reasoningAccumulated,
+    // vDeepSeek: optional usage breakdown populated when the provider returns
+    // structured usage (cache hit tokens, reasoning tokens).
+    cacheReadTokens: cacheReadTokens ?? undefined,
+    reasoningTokens: reasoningTokens ?? undefined,
  };
  } finally {
    // Clear the stall timer whether the stream completes normally, throws, or