From a7104691aa4362e632ea78ad491ff906be02b838 Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Thu, 21 May 2026 20:45:53 +0000 Subject: [PATCH] v1.12.2: live tok/s + ctx display next to status indicator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ChatThroughput renders inline beside StatusDot while streaming or tool_running. Subscribes to existing usage frames via sessionEvents. Hides when status drops to idle/error or data is older than 10s. Addresses the 2026-05-21 spike's UX gap where slow streams looked identical to dead streams — now there's a live token velocity readout that immediately distinguishes the two. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/server/src/services/inference.ts | 50 +++++++++ apps/web/src/api/types.ts | 11 ++ apps/web/src/components/ChatTabBar.tsx | 2 + apps/web/src/components/ChatThroughput.tsx | 28 +++++ apps/web/src/components/MobileTabSwitcher.tsx | 3 + apps/web/src/hooks/useChatThroughput.ts | 106 ++++++++++++++++++ apps/web/src/hooks/useSessionStream.ts | 14 +++ 7 files changed, 214 insertions(+) create mode 100644 apps/web/src/components/ChatThroughput.tsx create mode 100644 apps/web/src/hooks/useChatThroughput.ts diff --git a/apps/server/src/services/inference.ts b/apps/server/src/services/inference.ts index 27dd24b..b9e6bec 100644 --- a/apps/server/src/services/inference.ts +++ b/apps/server/src/services/inference.ts @@ -117,6 +117,7 @@ export interface InferenceFrame { | 'tool_call' | 'tool_result' | 'message_complete' + | 'usage' | 'messages_deleted' | 'session_renamed' | 'chat_renamed' @@ -145,6 +146,7 @@ export interface InferenceFrame { tokens_used?: number | null; ctx_used?: number | null; ctx_max?: number | null; + completion_tokens?: number | null; started_at?: string | null; finished_at?: string | null; model?: string; @@ -444,6 +446,7 @@ async function streamCompletion( messages: OpenAiMessage[], opts: StreamOptions, onDelta: (content: string) => void, + 
onUsage: ((prompt: number | null, completion: number | null) => void) | undefined, signal?: AbortSignal ): Promise { const body: Record = { @@ -499,6 +502,7 @@ async function streamCompletion( if (typeof parsed.usage.completion_tokens === 'number') { completionTokens = parsed.usage.completion_tokens; } + onUsage?.(promptTokens, completionTokens); } // v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's // streaming completion does NOT emit n_ctx in timings (verified @@ -728,6 +732,34 @@ async function executeStreamPhase( ).filter((t) => webToolsEnabled || !WEB_TOOL_NAMES.has(t.function.name)); const effectiveTemperature = agent?.temperature; + // v1.12.2: ctx_max lookup is cached after the first hit per model, so this + // is a Map probe in steady state. We capture nCtx once at the top of the + // stream so the throttled usage publish doesn't refetch each tick. + const mctxForStream = await modelContext.getModelContext(session.model); + const nCtxForStream = mctxForStream?.n_ctx ?? null; + + // v1.12.2: throttle live usage publishes to ~500ms. The model can land + // dozens of usage frames per second; without a throttle the WS turns into + // a firehose for a few KB savings on each render. 
+ const USAGE_THROTTLE_MS = 500; + let lastUsageAt = 0; + let pendingUsage: { p: number | null; c: number | null } | null = null; + let usageTimer: NodeJS.Timeout | null = null; + const flushUsage = () => { + if (!pendingUsage) return; + const { p, c } = pendingUsage; + pendingUsage = null; + lastUsageAt = Date.now(); + ctx.publish(sessionId, { + type: 'usage', + message_id: assistantMessageId, + chat_id: chatId, + completion_tokens: c, + ctx_used: p, + ctx_max: nCtxForStream, + }); + }; + try { return await streamCompletion( ctx, @@ -745,6 +777,18 @@ async function executeStreamPhase( ctx.log.debug({ sessionId, delta }, 'inference delta'); scheduleFlush(); }, + (prompt, completion) => { + pendingUsage = { p: prompt, c: completion }; + const elapsed = Date.now() - lastUsageAt; + if (elapsed >= USAGE_THROTTLE_MS) { + flushUsage(); + } else if (!usageTimer) { + usageTimer = setTimeout(() => { + usageTimer = null; + flushUsage(); + }, USAGE_THROTTLE_MS - elapsed); + } + }, signal ); } finally { @@ -752,6 +796,10 @@ async function executeStreamPhase( clearTimeout(pendingFlushTimer); pendingFlushTimer = null; } + if (usageTimer) { + clearTimeout(usageTimer); + usageTimer = null; + } await flushPromise; } } @@ -1238,6 +1286,7 @@ async function runCapHitSummary( }); scheduleFlush(); }, + undefined, signal, ); summaryOk = true; @@ -1499,6 +1548,7 @@ async function runDoomLoopSummary( }); scheduleFlush(); }, + undefined, signal, ); summaryOk = true; diff --git a/apps/web/src/api/types.ts b/apps/web/src/api/types.ts index b45e542..d8fdff5 100644 --- a/apps/web/src/api/types.ts +++ b/apps/web/src/api/types.ts @@ -332,6 +332,17 @@ export type WsFrame = // to the client without a refetch. metadata?: MessageMetadata | null; } + // v1.12.2: live throughput frame, published mid-stream every ~500ms with + // the latest token + ctx counts so ChatThroughput can render tok/s and + // ctx_used while the model is still generating. 
+ | { + type: 'usage'; + message_id: string; + chat_id?: string; + completion_tokens: number | null; + ctx_used: number | null; + ctx_max: number | null; + } | { type: 'messages_deleted'; message_ids: string[]; chat_id?: string } | { type: 'chat_renamed'; chat_id: string; name: string } // v1.11: published by services/compaction.ts after the new anchored diff --git a/apps/web/src/components/ChatTabBar.tsx b/apps/web/src/components/ChatTabBar.tsx index 91111e9..e71369c 100644 --- a/apps/web/src/components/ChatTabBar.tsx +++ b/apps/web/src/components/ChatTabBar.tsx @@ -2,6 +2,7 @@ import { useState } from 'react'; import { Bot, History, MessageSquare, Plus, Terminal, X } from 'lucide-react'; import type { Chat, WorkspacePane } from '@/api/types'; import { StatusDot } from '@/components/StatusDot'; +import { ChatThroughput } from '@/components/ChatThroughput'; import { ContextMenu, ContextMenuContent, @@ -99,6 +100,7 @@ export function ChatTabBar({ > + {renamingId === chat.id ? ( 0 ? Math.round(t.tps) : null; + const showCtx = t.ctx_used != null && t.ctx_max != null; + if (tps === null && !showCtx) return null; + return ( + + {tps !== null && `${tps} tok/s`} + {tps !== null && showCtx && ' · '} + {showCtx && `${t.ctx_used!.toLocaleString()}/${t.ctx_max!.toLocaleString()}`} + + ); +} diff --git a/apps/web/src/components/MobileTabSwitcher.tsx b/apps/web/src/components/MobileTabSwitcher.tsx index 5b7aef4..11cb840 100644 --- a/apps/web/src/components/MobileTabSwitcher.tsx +++ b/apps/web/src/components/MobileTabSwitcher.tsx @@ -13,6 +13,7 @@ import { toast } from 'sonner'; import type { Chat, WorkspacePane } from '@/api/types'; import { BottomSheet } from '@/components/BottomSheet'; import { StatusDot } from '@/components/StatusDot'; +import { ChatThroughput } from '@/components/ChatThroughput'; import { DropdownMenu, DropdownMenuContent, @@ -206,6 +207,7 @@ export function MobileTabSwitcher({ > {paneIcon(active?.kind ?? 
'chat')} + {activeLabel} @@ -237,6 +239,7 @@ export function MobileTabSwitcher({ > {paneIcon(pane.kind)} + {renamingChatId === cid && cid ? ( pattern mirrors useChatStatus so any component +// can subscribe to any chatId without prop drilling. + +export interface ThroughputSample { + tps: number | null; + ctx_used: number | null; + ctx_max: number | null; +} + +interface Entry { + ctx_used: number | null; + ctx_max: number | null; + completion_tokens: number | null; + recorded_at: number; + prev_completion_tokens: number | null; + prev_recorded_at: number | null; + tps: number | null; +} + +// Stale window. After this, useChatThroughput returns null — clears the +// indicator after the stream ends without the next inference turn. +const STALE_MS = 10_000; + +const entries = new Map(); +const subscribers = new Set<() => void>(); + +function notify(): void { + for (const s of subscribers) { + try { s(); } catch { /* swallow */ } + } +} + +// v1.12.2: imported by useSessionStream's WS handler. Computes tps from the +// gap between successive completion_tokens samples; first sample yields null +// (we need two points). Skips zero-progress samples so a duplicate usage +// frame doesn't push tps to 0. +export function recordUsage( + chatId: string, + data: { completion_tokens: number | null; ctx_used: number | null; ctx_max: number | null }, +): void { + const now = Date.now(); + const prev = entries.get(chatId); + let tps: number | null = prev?.tps ?? null; + if ( + prev && + data.completion_tokens != null && + prev.completion_tokens != null && + data.completion_tokens > prev.completion_tokens && + now > prev.recorded_at + ) { + const dTokens = data.completion_tokens - prev.completion_tokens; + const dSeconds = (now - prev.recorded_at) / 1000; + tps = dTokens / dSeconds; + } + entries.set(chatId, { + ctx_used: data.ctx_used, + ctx_max: data.ctx_max, + completion_tokens: data.completion_tokens, + recorded_at: now, + prev_completion_tokens: prev?.completion_tokens ?? 
null, + prev_recorded_at: prev?.recorded_at ?? null, + tps, + }); + notify(); +} + +export function clearThroughput(chatId: string): void { + if (entries.delete(chatId)) notify(); +} + +// Periodic sweep: re-notify so stale entries fall off the UI when the +// stream ends without a follow-up frame. Light — one timer for the whole app. +const G = globalThis as Record; +if (!G.__boocode_throughput_ticker) { + G.__boocode_throughput_ticker = true; + setInterval(() => { + const now = Date.now(); + let touched = false; + for (const [k, v] of entries) { + if (now - v.recorded_at > STALE_MS) { + entries.delete(k); + touched = true; + } + } + if (touched) notify(); + }, 2_000); +} + +export function useChatThroughput(chatId: string | null | undefined): ThroughputSample | null { + const [, force] = useState({}); + useEffect(() => { + const sub = () => force({}); + subscribers.add(sub); + return () => { subscribers.delete(sub); }; + }, []); + if (!chatId) return null; + const entry = entries.get(chatId); + if (!entry) return null; + if (Date.now() - entry.recorded_at > STALE_MS) return null; + return { tps: entry.tps, ctx_used: entry.ctx_used, ctx_max: entry.ctx_max }; +} diff --git a/apps/web/src/hooks/useSessionStream.ts b/apps/web/src/hooks/useSessionStream.ts index 1ccdf8a..285558c 100644 --- a/apps/web/src/hooks/useSessionStream.ts +++ b/apps/web/src/hooks/useSessionStream.ts @@ -3,6 +3,7 @@ import { toast } from 'sonner'; import type { Message, WsFrame } from '@/api/types'; import { api } from '@/api/client'; import { sessionEvents } from './sessionEvents'; +import { recordUsage } from './useChatThroughput'; // session_renamed frame removed from WsFrame — it was declared but never // published on the per-session WS channel (server publishes via broker.publishUser @@ -125,6 +126,19 @@ function applyFrame(state: State, frame: WsFrame): State { ); return { ...state, messages: next }; } + case 'usage': { + // v1.12.2: live throughput. 
Records each sample in the module-level + // singleton consumed by ChatThroughput; no message-state mutation. + // chat_id is optional on the WsFrame type, but usage frames always include it. + if (frame.chat_id) { + recordUsage(frame.chat_id, { + completion_tokens: frame.completion_tokens, + ctx_used: frame.ctx_used, + ctx_max: frame.ctx_max, + }); + } + return state; + } case 'messages_deleted': { const removeSet = new Set(frame.message_ids); return {