v1.12.2: live tok/s + ctx display next to status indicator

ChatThroughput renders inline beside StatusDot while streaming or tool_running. Subscribes to existing usage frames via sessionEvents. Hides when status drops to idle/error or data is older than 10s. Addresses the 2026-05-21 spike's UX gap where slow streams looked identical to dead streams — now there's a live token velocity readout that immediately distinguishes the two.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 20:45:53 +00:00
parent 1a0a3b1673
commit a7104691aa
7 changed files with 214 additions and 0 deletions
--- a/apps/server/src/services/inference.ts
+++ b/apps/server/src/services/inference.ts
@@ -117,6 +117,7 @@ export interface InferenceFrame {
    | 'tool_call'
    | 'tool_result'
    | 'message_complete'
+    | 'usage'
    | 'messages_deleted'
    | 'session_renamed'
    | 'chat_renamed'
@@ -145,6 +146,7 @@ export interface InferenceFrame {
  tokens_used?: number | null;
  ctx_used?: number | null;
  ctx_max?: number | null;
+  completion_tokens?: number | null;
  started_at?: string | null;
  finished_at?: string | null;
  model?: string;
@@ -444,6 +446,7 @@ async function streamCompletion(
  messages: OpenAiMessage[],
  opts: StreamOptions,
  onDelta: (content: string) => void,
+  onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
  signal?: AbortSignal
 ): Promise<StreamResult> {
  const body: Record<string, unknown> = {
@@ -499,6 +502,7 @@ async function streamCompletion(
      if (typeof parsed.usage.completion_tokens === 'number') {
        completionTokens = parsed.usage.completion_tokens;
      }
+      onUsage?.(promptTokens, completionTokens);
    }
    // v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
    // streaming completion does NOT emit n_ctx in timings (verified
@@ -728,6 +732,34 @@ async function executeStreamPhase(
  ).filter((t) => webToolsEnabled || !WEB_TOOL_NAMES.has(t.function.name));
  const effectiveTemperature = agent?.temperature;

+  // v1.12.2: ctx_max lookup is cached after the first hit per model, so this
+  // is a Map probe in steady state. We capture nCtx once at the top of the
+  // stream so the throttled usage publish doesn't refetch each tick.
+  const mctxForStream = await modelContext.getModelContext(session.model);
+  const nCtxForStream = mctxForStream?.n_ctx ?? null;
+
+  // v1.12.2: throttle live usage publishes to at most one per ~500ms. The
+  // model can land dozens of usage frames per second; unthrottled, the WS
+  // becomes a firehose of frames that add little to the rendered readout.
+  const USAGE_THROTTLE_MS = 500;
+  let lastUsageAt = 0;
+  let pendingUsage: { p: number | null; c: number | null } | null = null;
+  let usageTimer: NodeJS.Timeout | null = null;
+  const flushUsage = () => {
+    if (!pendingUsage) return;
+    const { p, c } = pendingUsage;
+    pendingUsage = null;
+    lastUsageAt = Date.now();
+    ctx.publish(sessionId, {
+      type: 'usage',
+      message_id: assistantMessageId,
+      chat_id: chatId,
+      completion_tokens: c,
+      ctx_used: p,
+      ctx_max: nCtxForStream,
+    });
+  };
+
  try {
    return await streamCompletion(
      ctx,
@@ -745,6 +777,18 @@ async function executeStreamPhase(
        ctx.log.debug({ sessionId, delta }, 'inference delta');
        scheduleFlush();
      },
+      (prompt, completion) => {
+        pendingUsage = { p: prompt, c: completion };
+        const elapsed = Date.now() - lastUsageAt;
+        if (elapsed >= USAGE_THROTTLE_MS) {
+          flushUsage();
+        } else if (!usageTimer) {
+          usageTimer = setTimeout(() => {
+            usageTimer = null;
+            flushUsage();
+          }, USAGE_THROTTLE_MS - elapsed);
+        }
+      },
      signal
    );
  } finally {
@@ -752,6 +796,10 @@ async function executeStreamPhase(
      clearTimeout(pendingFlushTimer);
      pendingFlushTimer = null;
    }
+    if (usageTimer) {
+      clearTimeout(usageTimer);
+      usageTimer = null;
+    }
    await flushPromise;
  }
 }
@@ -1238,6 +1286,7 @@ async function runCapHitSummary(
        });
        scheduleFlush();
      },
+      undefined,
      signal,
    );
    summaryOk = true;
@@ -1499,6 +1548,7 @@ async function runDoomLoopSummary(
        });
        scheduleFlush();
      },
+      undefined,
      signal,
    );
    summaryOk = true;