Compare commits

...

3 Commits

Author SHA1 Message Date
ea468ca7fb v1.12.4-rc1: extract budget, sentinels, xml-parser from inference.ts
Pure file moves. No behavior change. inference.ts retains createInferenceRunner
public surface; new files are internal to services/inference/.

- budget.ts: resolveToolBudget
- sentinels.ts: detectDoomLoop (re-exported through inference.ts),
  isCapHitSentinel, isDoomLoopSentinel, isAnySentinel
- xml-parser.ts: parseXmlToolCall, partialXmlOpenerStart

First of four refactor batches preparing inference.ts for the v1.13
AI SDK migration. inference.ts goes from 1780 LoC to ~1620.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 21:42:41 +00:00
eef4782383 v1.12.3: stale-stream banner with Retry/Discard
When an assistant message sits status='streaming' with no token activity
for 60+ seconds, the chat shows a banner above the input offering Retry
or Discard. Both clear the stale row via a new backend endpoint
POST /api/chats/:id/discard_stale that updates status='failed' and
publishes chat_status='idle'.

Closes the UX gap that caused the 2026-05-21 debugging spiral —
slow streams and dead streams now look different to the user.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 20:48:22 +00:00
a7104691aa v1.12.2: live tok/s + ctx display next to status indicator
ChatThroughput renders inline beside StatusDot while streaming or
tool_running. Subscribes to existing usage frames via sessionEvents.
Hides when status drops to idle/error or data is older than 10s.

Addresses the 2026-05-21 spike's UX gap where slow streams looked
identical to dead streams — now there's a live token velocity readout
that immediately distinguishes the two.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 20:45:53 +00:00
14 changed files with 547 additions and 120 deletions

View File

@@ -18,6 +18,12 @@ const ForkBody = z.object({
name: z.string().min(1).max(200).optional(),
});
const DiscardStaleBody = z.object({
message_id: z.string().uuid(),
});
const STALE_MIN_AGE_SECONDS = 60;
export function registerChatRoutes(
app: FastifyInstance,
sql: Sql,
@@ -320,6 +326,73 @@ export function registerChatRoutes(
}
);
// v1.12.3: explicit recovery from a stuck-streaming assistant row. The
// frontend gates this behind a 60s no-token-activity timer; the server
// re-checks the age and current status for safety. Non-streaming rows
// return 409 (frontend race; idempotent retry is fine).
app.post<{ Params: { id: string } }>(
'/api/chats/:id/discard_stale',
async (req, reply) => {
const parsed = DiscardStaleBody.safeParse(req.body ?? {});
if (!parsed.success) {
reply.code(400);
return { error: 'invalid body', details: parsed.error.flatten() };
}
const rows = await sql<{
id: string;
session_id: string;
chat_id: string;
status: string;
age_seconds: number;
}[]>`
SELECT id, session_id, chat_id, status,
EXTRACT(EPOCH FROM (clock_timestamp() - created_at))::int AS age_seconds
FROM messages
WHERE id = ${parsed.data.message_id} AND chat_id = ${req.params.id}
`;
if (rows.length === 0) {
reply.code(404);
return { error: 'message not found in chat' };
}
const msg = rows[0]!;
if (msg.status !== 'streaming') {
reply.code(409);
return { error: 'message is no longer streaming', current_status: msg.status };
}
if (msg.age_seconds < STALE_MIN_AGE_SECONDS) {
reply.code(409);
return { error: 'message is not stale yet', age_seconds: msg.age_seconds };
}
const updated = await sql<Message[]>`
UPDATE messages
SET status = 'failed',
content = COALESCE(content, ''),
finished_at = clock_timestamp()
WHERE id = ${msg.id} AND status = 'streaming'
RETURNING id, session_id, chat_id, role, content, kind, tool_calls, tool_results,
status, last_seq, tokens_used, ctx_used, ctx_max, started_at, finished_at,
created_at, metadata, summary, tail_start_id, compacted_at
`;
if (updated.length === 0) {
// Race: the row flipped out of 'streaming' between our SELECT and UPDATE.
reply.code(409);
return { error: 'message status changed mid-request' };
}
broker.publishUser('default', {
type: 'chat_status',
chat_id: msg.chat_id,
status: 'idle',
at: new Date().toISOString(),
});
broker.publish(msg.session_id, {
type: 'message_complete',
message_id: msg.id,
chat_id: msg.chat_id,
});
return updated[0];
}
);
app.get<{ Params: { id: string } }>(
'/api/chats/:id/messages',
async (req, reply) => {

View File

@@ -13,7 +13,6 @@ import type {
} from '../types/api.js';
import {
ALL_TOOLS,
READ_ONLY_TOOL_NAMES,
TOOLS_BY_NAME,
toolJsonSchemas,
type ToolJsonSchema,
@@ -28,88 +27,34 @@ import type { Broker } from './broker.js';
// async (awaits the container-guidance loader) — buildMessagesPayload below
// is therefore async too, and its three call sites in this file await it.
import { buildSystemPrompt } from './system-prompt.js';
import { resolveToolBudget } from './inference/budget.js';
import {
DOOM_LOOP_THRESHOLD,
detectDoomLoop,
isAnySentinel,
} from './inference/sentinels.js';
import {
XML_TOOL_CLOSE,
XML_TOOL_OPEN,
parseXmlToolCall,
partialXmlOpenerStart,
} from './inference/xml-parser.js';
// v1.12.4: re-exported so external callers (tests, future consumers) keep
// importing from services/inference.js as the public surface.
export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './inference/sentinels.js';
const DB_FLUSH_INTERVAL_MS = 500;
// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget.
// - Agent with explicit max_tool_calls: that value.
// - Agent with read-only-only tools: BUDGET_READ_ONLY (30).
// - Agent with any non-read-only tool: BUDGET_NON_READ_ONLY (10).
// - No agent (raw chat): BUDGET_NO_AGENT (15).
const BUDGET_READ_ONLY = 30;
const BUDGET_NON_READ_ONLY = 10;
const BUDGET_NO_AGENT = 15;
const READ_ONLY_SET: ReadonlySet<string> = new Set(READ_ONLY_TOOL_NAMES);
function resolveToolBudget(agent: Agent | null): number {
if (agent?.max_tool_calls != null) return agent.max_tool_calls;
if (!agent) return BUDGET_NO_AGENT;
const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t));
return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY;
}
// Synthetic system note appended to the cap-hit summary call. Verbatim from
// the v1.8.2 spec — do not paraphrase: the model is more reliable when the
// instruction is short, declarative, and identical across calls.
const CAP_HIT_SUMMARY_NOTE = (limit: number) =>
`You've reached the tool budget (${limit} calls). Produce the best answer you can with what you have. Do not call more tools.`;
// v1.11.6: doom-loop guard. When the model calls the same tool with the
// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message
// turn, abort the recursion and run the same wrap-up summary path as the
// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in
// session/processor.ts). Threshold of 3 is the smallest value that doesn't
// false-positive on a model that retries once after a transient error.
export const DOOM_LOOP_THRESHOLD = 3;
const DOOM_LOOP_NOTE = (name: string) =>
`You called ${name} with the same arguments ${DOOM_LOOP_THRESHOLD} times in a row. Stop calling it. Produce the best answer you can with what you have.`;
// Returns the name + args of the looping tool when the LAST
// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical (same name
// AND deep-equal args via JSON.stringify). Returns null otherwise.
// Pure; exported for unit-test access.
export function detectDoomLoop(
recentToolCalls: ToolCall[],
): { name: string; args: Record<string, unknown> } | null {
if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null;
const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD);
const ref = last[0]!;
const refArgs = JSON.stringify(ref.args);
for (let i = 1; i < last.length; i++) {
const tc = last[i]!;
if (tc.name !== ref.name) return null;
if (JSON.stringify(tc.args) !== refArgs) return null;
}
return { name: ref.name, args: ref.args };
}
function isCapHitSentinel(m: Message): boolean {
return (
m.role === 'system' &&
m.metadata !== null &&
typeof m.metadata === 'object' &&
(m.metadata as { kind?: unknown }).kind === 'cap_hit'
);
}
// v1.11.6: parallel predicate. Same UI-only semantics as cap-hit sentinels —
// never sent to the LLM (filtered by buildMessagesPayload through the
// isAnySentinel check below).
function isDoomLoopSentinel(m: Message): boolean {
return (
m.role === 'system' &&
m.metadata !== null &&
typeof m.metadata === 'object' &&
(m.metadata as { kind?: unknown }).kind === 'doom_loop'
);
}
function isAnySentinel(m: Message): boolean {
return isCapHitSentinel(m) || isDoomLoopSentinel(m);
}
export interface InferenceFrame {
type:
| 'message_started'
@@ -117,6 +62,7 @@ export interface InferenceFrame {
| 'tool_call'
| 'tool_result'
| 'message_complete'
| 'usage'
| 'messages_deleted'
| 'session_renamed'
| 'chat_renamed'
@@ -145,6 +91,7 @@ export interface InferenceFrame {
tokens_used?: number | null;
ctx_used?: number | null;
ctx_max?: number | null;
completion_tokens?: number | null;
started_at?: string | null;
finished_at?: string | null;
model?: string;
@@ -389,61 +336,13 @@ interface StreamOptions {
// streamCompletion buffers delta.content, extracts complete blocks, parses
// them via parseXmlToolCall, and pushes synthetic entries into the existing
// toolCallsBuffer alongside any native JSON-format tool calls.
const XML_TOOL_OPEN = '<tool_call>';
const XML_TOOL_CLOSE = '</tool_call>';
function parseXmlToolCall(
block: string,
): { name: string; args: Record<string, unknown> } | null {
const nameMatch = block.match(/<function=([^>]+)>/);
if (!nameMatch || !nameMatch[1]) return null;
const name = nameMatch[1].trim();
if (!name) return null;
const args: Record<string, unknown> = {};
// Non-greedy body so each <parameter=…>…</parameter> pair is matched
// independently even when multiple appear in the same block.
const paramRe = /<parameter=([^>]+)>([\s\S]*?)<\/parameter>/g;
for (const m of block.matchAll(paramRe)) {
const key = (m[1] ?? '').trim();
if (!key) continue;
const raw = (m[2] ?? '').trim();
try {
args[key] = JSON.parse(raw);
} catch {
args[key] = raw;
}
}
return { name, args };
}
// Locate the first character that begins (or completely contains) an
// unfinished <tool_call> opener in `s`. Returns -1 when `s` can be flushed
// to the client in full without risking a partial tag leak.
// Case 1: a full `<tool_call>` opener with no matching closer — caller
// must keep everything from that index forward until the next
// chunk arrives with the closer.
// Case 2: `s` ends with a strict prefix of `<tool_call>` (e.g. `<tool_c`).
// Caller must keep just that suffix in the buffer.
// Note: case 1 assumes the calling loop already extracted every complete
// <tool_call>…</tool_call> pair before reaching this check.
function partialXmlOpenerStart(s: string): number {
const fullOpener = s.indexOf(XML_TOOL_OPEN);
if (fullOpener !== -1) return fullOpener;
const lastLt = s.lastIndexOf('<');
if (lastLt === -1) return -1;
const suffix = s.slice(lastLt);
if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) {
return lastLt;
}
return -1;
}
async function streamCompletion(
ctx: InferenceContext,
model: string,
messages: OpenAiMessage[],
opts: StreamOptions,
onDelta: (content: string) => void,
onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
signal?: AbortSignal
): Promise<StreamResult> {
const body: Record<string, unknown> = {
@@ -499,6 +398,7 @@ async function streamCompletion(
if (typeof parsed.usage.completion_tokens === 'number') {
completionTokens = parsed.usage.completion_tokens;
}
onUsage?.(promptTokens, completionTokens);
}
// v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
// streaming completion does NOT emit n_ctx in timings (verified
@@ -728,6 +628,34 @@ async function executeStreamPhase(
).filter((t) => webToolsEnabled || !WEB_TOOL_NAMES.has(t.function.name));
const effectiveTemperature = agent?.temperature;
// v1.12.2: ctx_max lookup is cached after the first hit per model, so this
// is a Map probe in steady state. We capture nCtx once at the top of the
// stream so the throttled usage publish doesn't refetch each tick.
const mctxForStream = await modelContext.getModelContext(session.model);
const nCtxForStream = mctxForStream?.n_ctx ?? null;
// v1.12.2: throttle live usage publishes to ~500ms. The model can land
// dozens of usage frames per second; without a throttle the WS turns into
// a firehose for a few KB savings on each render.
const USAGE_THROTTLE_MS = 500;
let lastUsageAt = 0;
let pendingUsage: { p: number | null; c: number | null } | null = null;
let usageTimer: NodeJS.Timeout | null = null;
const flushUsage = () => {
if (!pendingUsage) return;
const { p, c } = pendingUsage;
pendingUsage = null;
lastUsageAt = Date.now();
ctx.publish(sessionId, {
type: 'usage',
message_id: assistantMessageId,
chat_id: chatId,
completion_tokens: c,
ctx_used: p,
ctx_max: nCtxForStream,
});
};
try {
return await streamCompletion(
ctx,
@@ -745,6 +673,18 @@ async function executeStreamPhase(
ctx.log.debug({ sessionId, delta }, 'inference delta');
scheduleFlush();
},
(prompt, completion) => {
pendingUsage = { p: prompt, c: completion };
const elapsed = Date.now() - lastUsageAt;
if (elapsed >= USAGE_THROTTLE_MS) {
flushUsage();
} else if (!usageTimer) {
usageTimer = setTimeout(() => {
usageTimer = null;
flushUsage();
}, USAGE_THROTTLE_MS - elapsed);
}
},
signal
);
} finally {
@@ -752,6 +692,10 @@ async function executeStreamPhase(
clearTimeout(pendingFlushTimer);
pendingFlushTimer = null;
}
if (usageTimer) {
clearTimeout(usageTimer);
usageTimer = null;
}
await flushPromise;
}
}
@@ -1238,6 +1182,7 @@ async function runCapHitSummary(
});
scheduleFlush();
},
undefined,
signal,
);
summaryOk = true;
@@ -1499,6 +1444,7 @@ async function runDoomLoopSummary(
});
scheduleFlush();
},
undefined,
signal,
);
summaryOk = true;

View File

@@ -0,0 +1,20 @@
import type { Agent } from '../../types/api.js';
import { READ_ONLY_TOOL_NAMES } from '../tools.js';
// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget.
// - Agent with explicit max_tool_calls: that value.
// - Agent with read-only-only tools: BUDGET_READ_ONLY (30).
// - Agent with any non-read-only tool: BUDGET_NON_READ_ONLY (10).
// - No agent (raw chat): BUDGET_NO_AGENT (15).
export const BUDGET_READ_ONLY = 30;
export const BUDGET_NON_READ_ONLY = 10;
export const BUDGET_NO_AGENT = 15;
const READ_ONLY_SET: ReadonlySet<string> = new Set(READ_ONLY_TOOL_NAMES);
export function resolveToolBudget(agent: Agent | null): number {
if (agent?.max_tool_calls != null) return agent.max_tool_calls;
if (!agent) return BUDGET_NO_AGENT;
const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t));
return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY;
}

View File

@@ -0,0 +1,53 @@
import type { Message, ToolCall } from '../../types/api.js';
// v1.11.6: doom-loop guard. When the model calls the same tool with the
// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message
// turn, abort the recursion and run the same wrap-up summary path as the
// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in
// session/processor.ts). Threshold of 3 is the smallest value that doesn't
// false-positive on a model that retries once after a transient error.
export const DOOM_LOOP_THRESHOLD = 3;
// Returns the name + args of the looping tool when the LAST
// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical (same name
// AND deep-equal args via JSON.stringify). Returns null otherwise.
// Pure; exported for unit-test access.
export function detectDoomLoop(
recentToolCalls: ToolCall[],
): { name: string; args: Record<string, unknown> } | null {
if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null;
const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD);
const ref = last[0]!;
const refArgs = JSON.stringify(ref.args);
for (let i = 1; i < last.length; i++) {
const tc = last[i]!;
if (tc.name !== ref.name) return null;
if (JSON.stringify(tc.args) !== refArgs) return null;
}
return { name: ref.name, args: ref.args };
}
export function isCapHitSentinel(m: Message): boolean {
return (
m.role === 'system' &&
m.metadata !== null &&
typeof m.metadata === 'object' &&
(m.metadata as { kind?: unknown }).kind === 'cap_hit'
);
}
// v1.11.6: parallel predicate. Same UI-only semantics as cap-hit sentinels —
// never sent to the LLM (filtered by buildMessagesPayload through the
// isAnySentinel check below).
export function isDoomLoopSentinel(m: Message): boolean {
return (
m.role === 'system' &&
m.metadata !== null &&
typeof m.metadata === 'object' &&
(m.metadata as { kind?: unknown }).kind === 'doom_loop'
);
}
export function isAnySentinel(m: Message): boolean {
return isCapHitSentinel(m) || isDoomLoopSentinel(m);
}

View File

@@ -0,0 +1,53 @@
// v1.10.5: XML-tag tool-call fallback. Some models emit
// <tool_call><function=foo><parameter=key>value</parameter></function></tool_call>
// in plain content instead of using the OpenAI tool_calls JSON channel.
// The streaming loop in inference.ts extracts these blocks via these helpers.
export const XML_TOOL_OPEN = '<tool_call>';
export const XML_TOOL_CLOSE = '</tool_call>';
export function parseXmlToolCall(
block: string,
): { name: string; args: Record<string, unknown> } | null {
const nameMatch = block.match(/<function=([^>]+)>/);
if (!nameMatch || !nameMatch[1]) return null;
const name = nameMatch[1].trim();
if (!name) return null;
const args: Record<string, unknown> = {};
// Non-greedy body so each <parameter=…>…</parameter> pair is matched
// independently even when multiple appear in the same block.
const paramRe = /<parameter=([^>]+)>([\s\S]*?)<\/parameter>/g;
for (const m of block.matchAll(paramRe)) {
const key = (m[1] ?? '').trim();
if (!key) continue;
const raw = (m[2] ?? '').trim();
try {
args[key] = JSON.parse(raw);
} catch {
args[key] = raw;
}
}
return { name, args };
}
// Locate the first character that begins (or completely contains) an
// unfinished <tool_call> opener in `s`. Returns -1 when `s` can be flushed
// to the client in full without risking a partial tag leak.
// Case 1: a full `<tool_call>` opener with no matching closer — caller
// must keep everything from that index forward until the next
// chunk arrives with the closer.
// Case 2: `s` ends with a strict prefix of `<tool_call>` (e.g. `<tool_c`).
// Caller must keep just that suffix in the buffer.
// Note: case 1 assumes the calling loop already extracted every complete
// <tool_call>…</tool_call> pair before reaching this check.
export function partialXmlOpenerStart(s: string): number {
const fullOpener = s.indexOf(XML_TOOL_OPEN);
if (fullOpener !== -1) return fullOpener;
const lastLt = s.lastIndexOf('<');
if (lastLt === -1) return -1;
const suffix = s.slice(lastLt);
if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) {
return lastLt;
}
return -1;
}

View File

@@ -180,6 +180,11 @@ export const api = {
request<{ ok: true }>(`/api/chats/${chatId}/compact`, { method: 'POST' }),
stop: (chatId: string) =>
request<{ stopped: boolean }>(`/api/chats/${chatId}/stop`, { method: 'POST' }),
discardStale: (chatId: string, messageId: string) =>
request<Message>(`/api/chats/${chatId}/discard_stale`, {
method: 'POST',
body: JSON.stringify({ message_id: messageId }),
}),
forceSend: (chatId: string, content: string) =>
request<{ user_message_id: string; assistant_message_id: string }>(
`/api/chats/${chatId}/force_send`,

View File

@@ -332,6 +332,17 @@ export type WsFrame =
// to the client without a refetch.
metadata?: MessageMetadata | null;
}
// v1.12.2: live throughput frame, published mid-stream every ~500ms with
// the latest token + ctx counts so ChatThroughput can render tok/s and
// ctx_used while the model is still generating.
| {
type: 'usage';
message_id: string;
chat_id?: string;
completion_tokens: number | null;
ctx_used: number | null;
ctx_max: number | null;
}
| { type: 'messages_deleted'; message_ids: string[]; chat_id?: string }
| { type: 'chat_renamed'; chat_id: string; name: string }
// v1.11: published by services/compaction.ts after the new anchored

View File

@@ -2,6 +2,7 @@ import { useState } from 'react';
import { Bot, History, MessageSquare, Plus, Terminal, X } from 'lucide-react';
import type { Chat, WorkspacePane } from '@/api/types';
import { StatusDot } from '@/components/StatusDot';
import { ChatThroughput } from '@/components/ChatThroughput';
import {
ContextMenu,
ContextMenuContent,
@@ -99,6 +100,7 @@ export function ChatTabBar({
>
<MessageSquare size={12} className="shrink-0" />
<StatusDot chatId={chat.id} />
<ChatThroughput chatId={chat.id} />
{renamingId === chat.id ? (
<input
autoFocus

View File

@@ -0,0 +1,28 @@
import { useChatStatus } from '@/hooks/useChatStatus';
import { useChatThroughput } from '@/hooks/useChatThroughput';
import { cn } from '@/lib/utils';
interface Props {
chatId: string | null | undefined;
className?: string;
}
// v1.12.2: inline throughput readout. Renders next to StatusDot while the
// chat is streaming or running a tool. Hidden in idle/error/waiting states
// — the dot already communicates those.
export function ChatThroughput({ chatId, className }: Props) {
const status = useChatStatus(chatId);
const t = useChatThroughput(chatId);
if (!chatId || !t) return null;
if (status !== 'streaming' && status !== 'tool_running') return null;
const tps = t.tps != null && t.tps > 0 ? Math.round(t.tps) : null;
const showCtx = t.ctx_used != null && t.ctx_max != null;
if (tps === null && !showCtx) return null;
return (
<span className={cn('text-xs text-muted-foreground tabular-nums', className)}>
{tps !== null && `${tps} tok/s`}
{tps !== null && showCtx && ' · '}
{showCtx && `${t.ctx_used!.toLocaleString()}/${t.ctx_max!.toLocaleString()}`}
</span>
);
}

View File

@@ -13,6 +13,7 @@ import { toast } from 'sonner';
import type { Chat, WorkspacePane } from '@/api/types';
import { BottomSheet } from '@/components/BottomSheet';
import { StatusDot } from '@/components/StatusDot';
import { ChatThroughput } from '@/components/ChatThroughput';
import {
DropdownMenu,
DropdownMenuContent,
@@ -206,6 +207,7 @@ export function MobileTabSwitcher({
>
<span className="shrink-0 text-muted-foreground">{paneIcon(active?.kind ?? 'chat')}</span>
<StatusDot chatId={activeChatId} />
<ChatThroughput chatId={activeChatId} />
<span className="truncate flex-1 text-left">{activeLabel}</span>
<ChevronDown size={14} className="opacity-60 shrink-0" />
</button>
@@ -237,6 +239,7 @@ export function MobileTabSwitcher({
>
<span className="shrink-0 text-muted-foreground">{paneIcon(pane.kind)}</span>
<StatusDot chatId={cid ?? null} />
<ChatThroughput chatId={cid ?? null} />
{renamingChatId === cid && cid ? (
<input
autoFocus

View File

@@ -0,0 +1,34 @@
interface Props {
onRetry: () => void;
onDiscard: () => void;
}
// v1.12.3: shown when an assistant message has been 'streaming' for 60+
// seconds without new tokens. Lives above ChatInput in ChatPane. Retry
// discards the stuck row then resends the last user message; Discard just
// clears the row and drops the dot to idle.
export function StaleStreamBanner({ onRetry, onDiscard }: Props) {
return (
<div className="border border-amber-500/30 bg-amber-500/5 rounded-md p-3 mb-2 mx-4 flex items-center justify-between gap-2">
<span className="text-sm text-muted-foreground">
Previous response didn't complete.
</span>
<div className="flex gap-2">
<button
type="button"
onClick={onRetry}
className="text-xs px-2 py-1 rounded border border-border hover:bg-accent max-md:min-h-[44px] max-md:px-3"
>
Retry
</button>
<button
type="button"
onClick={onDiscard}
className="text-xs px-2 py-1 rounded border border-border hover:bg-accent max-md:min-h-[44px] max-md:px-3"
>
Discard
</button>
</div>
</div>
);
}

View File

@@ -5,6 +5,7 @@ import { api } from '@/api/client';
import { useSessionStream } from '@/hooks/useSessionStream';
import { MessageList } from '@/components/MessageList';
import { ChatInput } from '@/components/ChatInput';
import { StaleStreamBanner } from '@/components/StaleStreamBanner';
import {
DropdownMenu,
DropdownMenuContent,
@@ -44,6 +45,38 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
const chatMessages = stream.messages.filter((m) => m.chat_id === chatId);
const streaming = chatMessages.some((m) => m.status === 'streaming');
// v1.12.3: stale-stream detection. Watches the (at most one) streaming
// assistant row. If its content length doesn't grow for STALE_THRESHOLD_MS,
// assume the upstream call is dead and surface the recovery banner. We use
// content length as the activity signal because every token delta extends
// it; last_seq isn't currently bumped per delta.
const STALE_THRESHOLD_MS = 60_000;
const streamingMsg = chatMessages.find((m) => m.status === 'streaming' && m.role === 'assistant');
const streamingId = streamingMsg?.id ?? null;
const streamingLen = streamingMsg?.content.length ?? 0;
const lastActivityRef = useRef<{ id: string; len: number; at: number } | null>(null);
const [stale, setStale] = useState(false);
useEffect(() => {
if (!streamingId) {
lastActivityRef.current = null;
setStale(false);
return;
}
const prev = lastActivityRef.current;
if (!prev || prev.id !== streamingId || prev.len !== streamingLen) {
lastActivityRef.current = { id: streamingId, len: streamingLen, at: Date.now() };
setStale(false);
}
const interval = setInterval(() => {
const a = lastActivityRef.current;
if (!a) return;
if (Date.now() - a.at >= STALE_THRESHOLD_MS) {
setStale(true);
}
}, 5_000);
return () => clearInterval(interval);
}, [streamingId, streamingLen]);
// v1.11.5: per-chat model context limit comes from chat.model_context_limit
// populated by GET /api/sessions/:id/chats. Threaded into ChatInput so
// ContextBar can render a zero-state before the first assistant message.
@@ -87,6 +120,45 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
}
}
const handleDiscardStale = useCallback(async () => {
if (!streamingId) return;
try {
await api.chats.discardStale(chatId, streamingId);
setStale(false);
lastActivityRef.current = null;
} catch (err) {
// 409 (race) is benign — the row already terminated some other way.
const msg = err instanceof Error ? err.message : 'discard failed';
if (!msg.includes('409')) toast.error(msg);
setStale(false);
}
}, [chatId, streamingId]);
const handleRetryStale = useCallback(async () => {
if (!streamingId) return;
const lastUser = [...chatMessages].reverse().find((m) => m.role === 'user' && m.kind === 'message');
if (!lastUser) {
toast.error('no prior user message to retry');
return;
}
try {
await api.chats.discardStale(chatId, streamingId);
} catch (err) {
const msg = err instanceof Error ? err.message : 'discard failed';
if (!msg.includes('409')) {
toast.error(msg);
return;
}
}
setStale(false);
lastActivityRef.current = null;
try {
await api.messages.send(chatId, lastUser.content);
} catch (err) {
toast.error(err instanceof Error ? err.message : 'retry send failed');
}
}, [chatId, streamingId, chatMessages]);
const handleForceSend = useCallback(async (content: string) => {
const trimmed = content.trim();
if (!trimmed) return;
@@ -187,6 +259,13 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
</div>
)}
{stale && streamingId && (
<StaleStreamBanner
onRetry={() => void handleRetryStale()}
onDiscard={() => void handleDiscardStale()}
/>
)}
<ChatInput
disabled={false}
projectId={projectId}

View File

@@ -0,0 +1,106 @@
import { useEffect, useState } from 'react';
// v1.12.2: live throughput stream consumer. Fed by useSessionStream when a
// 'usage' WS frame lands. Renders next to StatusDot via ChatThroughput.
//
// Singleton + Set<setState> pattern mirrors useChatStatus so any component
// can subscribe to any chatId without prop drilling.
export interface ThroughputSample {
tps: number | null;
ctx_used: number | null;
ctx_max: number | null;
}
interface Entry {
ctx_used: number | null;
ctx_max: number | null;
completion_tokens: number | null;
recorded_at: number;
prev_completion_tokens: number | null;
prev_recorded_at: number | null;
tps: number | null;
}
// Stale window. After this, useChatThroughput returns null — clears the
// indicator after the stream ends without the next inference turn.
const STALE_MS = 10_000;
const entries = new Map<string, Entry>();
const subscribers = new Set<() => void>();
function notify(): void {
for (const s of subscribers) {
try { s(); } catch { /* swallow */ }
}
}
// v1.12.2: imported by useSessionStream's WS handler. Computes tps from the
// gap between successive completion_tokens samples; first sample yields null
// (we need two points). Skips zero-progress samples so a duplicate usage
// frame doesn't push tps to 0.
export function recordUsage(
chatId: string,
data: { completion_tokens: number | null; ctx_used: number | null; ctx_max: number | null },
): void {
const now = Date.now();
const prev = entries.get(chatId);
let tps: number | null = prev?.tps ?? null;
if (
prev &&
data.completion_tokens != null &&
prev.completion_tokens != null &&
data.completion_tokens > prev.completion_tokens &&
now > prev.recorded_at
) {
const dTokens = data.completion_tokens - prev.completion_tokens;
const dSeconds = (now - prev.recorded_at) / 1000;
tps = dTokens / dSeconds;
}
entries.set(chatId, {
ctx_used: data.ctx_used,
ctx_max: data.ctx_max,
completion_tokens: data.completion_tokens,
recorded_at: now,
prev_completion_tokens: prev?.completion_tokens ?? null,
prev_recorded_at: prev?.recorded_at ?? null,
tps,
});
notify();
}
export function clearThroughput(chatId: string): void {
if (entries.delete(chatId)) notify();
}
// Periodic sweep: re-notify so stale entries fall off the UI when the
// stream ends without a follow-up frame. Light — one timer for the whole app.
const G = globalThis as Record<string, unknown>;
if (!G.__boocode_throughput_ticker) {
G.__boocode_throughput_ticker = true;
setInterval(() => {
const now = Date.now();
let touched = false;
for (const [k, v] of entries) {
if (now - v.recorded_at > STALE_MS) {
entries.delete(k);
touched = true;
}
}
if (touched) notify();
}, 2_000);
}
export function useChatThroughput(chatId: string | null | undefined): ThroughputSample | null {
const [, force] = useState({});
useEffect(() => {
const sub = () => force({});
subscribers.add(sub);
return () => { subscribers.delete(sub); };
}, []);
if (!chatId) return null;
const entry = entries.get(chatId);
if (!entry) return null;
if (Date.now() - entry.recorded_at > STALE_MS) return null;
return { tps: entry.tps, ctx_used: entry.ctx_used, ctx_max: entry.ctx_max };
}

View File

@@ -3,6 +3,7 @@ import { toast } from 'sonner';
import type { Message, WsFrame } from '@/api/types';
import { api } from '@/api/client';
import { sessionEvents } from './sessionEvents';
import { recordUsage } from './useChatThroughput';
// session_renamed frame removed from WsFrame — it was declared but never
// published on the per-session WS channel (server publishes via broker.publishUser
@@ -125,6 +126,19 @@ function applyFrame(state: State, frame: WsFrame): State {
);
return { ...state, messages: next };
}
case 'usage': {
// v1.12.2: live throughput. Side-effects into the module-level
// singleton consumed by ChatThroughput; no message-state mutation.
// chat_id is the optional ws-frame field; usage frames always include it.
if (frame.chat_id) {
recordUsage(frame.chat_id, {
completion_tokens: frame.completion_tokens,
ctx_used: frame.ctx_used,
ctx_max: frame.ctx_max,
});
}
return state;
}
case 'messages_deleted': {
const removeSet = new Set(frame.message_ids);
return {