v1.12.2: live tok/s + ctx display next to status indicator
ChatThroughput renders inline beside StatusDot while streaming or tool_running. Subscribes to existing usage frames via sessionEvents. Hides when status drops to idle/error or data is older than 10s. Addresses the 2026-05-21 spike's UX gap where slow streams looked identical to dead streams — now there's a live token velocity readout that immediately distinguishes the two. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -117,6 +117,7 @@ export interface InferenceFrame {
|
||||
| 'tool_call'
|
||||
| 'tool_result'
|
||||
| 'message_complete'
|
||||
| 'usage'
|
||||
| 'messages_deleted'
|
||||
| 'session_renamed'
|
||||
| 'chat_renamed'
|
||||
@@ -145,6 +146,7 @@ export interface InferenceFrame {
|
||||
tokens_used?: number | null;
|
||||
ctx_used?: number | null;
|
||||
ctx_max?: number | null;
|
||||
completion_tokens?: number | null;
|
||||
started_at?: string | null;
|
||||
finished_at?: string | null;
|
||||
model?: string;
|
||||
@@ -444,6 +446,7 @@ async function streamCompletion(
|
||||
messages: OpenAiMessage[],
|
||||
opts: StreamOptions,
|
||||
onDelta: (content: string) => void,
|
||||
onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
|
||||
signal?: AbortSignal
|
||||
): Promise<StreamResult> {
|
||||
const body: Record<string, unknown> = {
|
||||
@@ -499,6 +502,7 @@ async function streamCompletion(
|
||||
if (typeof parsed.usage.completion_tokens === 'number') {
|
||||
completionTokens = parsed.usage.completion_tokens;
|
||||
}
|
||||
onUsage?.(promptTokens, completionTokens);
|
||||
}
|
||||
// v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
|
||||
// streaming completion does NOT emit n_ctx in timings (verified
|
||||
@@ -728,6 +732,34 @@ async function executeStreamPhase(
|
||||
).filter((t) => webToolsEnabled || !WEB_TOOL_NAMES.has(t.function.name));
|
||||
const effectiveTemperature = agent?.temperature;
|
||||
|
||||
// v1.12.2: ctx_max lookup is cached after the first hit per model, so this
|
||||
// is a Map probe in steady state. We capture nCtx once at the top of the
|
||||
// stream so the throttled usage publish doesn't refetch each tick.
|
||||
const mctxForStream = await modelContext.getModelContext(session.model);
|
||||
const nCtxForStream = mctxForStream?.n_ctx ?? null;
|
||||
|
||||
// v1.12.2: throttle live usage publishes to ~500ms. The model can land
|
||||
// dozens of usage frames per second; without a throttle the WS turns into
|
||||
// a firehose for a few KB savings on each render.
|
||||
const USAGE_THROTTLE_MS = 500;
|
||||
let lastUsageAt = 0;
|
||||
let pendingUsage: { p: number | null; c: number | null } | null = null;
|
||||
let usageTimer: NodeJS.Timeout | null = null;
|
||||
const flushUsage = () => {
|
||||
if (!pendingUsage) return;
|
||||
const { p, c } = pendingUsage;
|
||||
pendingUsage = null;
|
||||
lastUsageAt = Date.now();
|
||||
ctx.publish(sessionId, {
|
||||
type: 'usage',
|
||||
message_id: assistantMessageId,
|
||||
chat_id: chatId,
|
||||
completion_tokens: c,
|
||||
ctx_used: p,
|
||||
ctx_max: nCtxForStream,
|
||||
});
|
||||
};
|
||||
|
||||
try {
|
||||
return await streamCompletion(
|
||||
ctx,
|
||||
@@ -745,6 +777,18 @@ async function executeStreamPhase(
|
||||
ctx.log.debug({ sessionId, delta }, 'inference delta');
|
||||
scheduleFlush();
|
||||
},
|
||||
(prompt, completion) => {
|
||||
pendingUsage = { p: prompt, c: completion };
|
||||
const elapsed = Date.now() - lastUsageAt;
|
||||
if (elapsed >= USAGE_THROTTLE_MS) {
|
||||
flushUsage();
|
||||
} else if (!usageTimer) {
|
||||
usageTimer = setTimeout(() => {
|
||||
usageTimer = null;
|
||||
flushUsage();
|
||||
}, USAGE_THROTTLE_MS - elapsed);
|
||||
}
|
||||
},
|
||||
signal
|
||||
);
|
||||
} finally {
|
||||
@@ -752,6 +796,10 @@ async function executeStreamPhase(
|
||||
clearTimeout(pendingFlushTimer);
|
||||
pendingFlushTimer = null;
|
||||
}
|
||||
if (usageTimer) {
|
||||
clearTimeout(usageTimer);
|
||||
usageTimer = null;
|
||||
}
|
||||
await flushPromise;
|
||||
}
|
||||
}
|
||||
@@ -1238,6 +1286,7 @@ async function runCapHitSummary(
|
||||
});
|
||||
scheduleFlush();
|
||||
},
|
||||
undefined,
|
||||
signal,
|
||||
);
|
||||
summaryOk = true;
|
||||
@@ -1499,6 +1548,7 @@ async function runDoomLoopSummary(
|
||||
});
|
||||
scheduleFlush();
|
||||
},
|
||||
undefined,
|
||||
signal,
|
||||
);
|
||||
summaryOk = true;
|
||||
|
||||
Reference in New Issue
Block a user