v1.12.3: stale-stream banner with Retry/Discard

When an assistant message sits status='streaming' with no token activity for 60+ seconds, the chat shows a banner above the input offering Retry or Discard. Both clear the stale row via a new backend endpoint POST /api/chats/:id/discard_stale that updates status='failed' and publishes chat_status='idle'. Closes the UX gap that caused the 2026-05-21 debugging spiral — slow streams and dead streams now look different to the user. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
v1.12.2: live tok/s + ctx display next to status indicator
2026-05-21 20:48:22 +00:00 · 2026-05-21 20:45:53 +00:00
11 changed files with 405 additions and 0 deletions
--- a/apps/server/src/routes/chats.ts
+++ b/apps/server/src/routes/chats.ts
@@ -18,6 +18,12 @@ const ForkBody = z.object({
  name: z.string().min(1).max(200).optional(),
 });

+const DiscardStaleBody = z.object({
+  message_id: z.string().uuid(),
+});
+
+const STALE_MIN_AGE_SECONDS = 60;
+
 export function registerChatRoutes(
  app: FastifyInstance,
  sql: Sql,
@@ -320,6 +326,73 @@ export function registerChatRoutes(
    }
  );

+  // v1.12.3: explicit recovery from a stuck-streaming assistant row. The
+  // frontend gates this behind a 60s no-token-activity timer; the server
+  // re-checks the row's age (since created_at) and current status for safety.
+  // Non-streaming or too-young rows return 409 (frontend race; retry is safe).
+  app.post<{ Params: { id: string } }>(
+    '/api/chats/:id/discard_stale',
+    async (req, reply) => {
+      const parsed = DiscardStaleBody.safeParse(req.body ?? {});
+      if (!parsed.success) {
+        reply.code(400);
+        return { error: 'invalid body', details: parsed.error.flatten() };
+      }
+      const rows = await sql<{
+        id: string;
+        session_id: string;
+        chat_id: string;
+        status: string;
+        age_seconds: number;
+      }[]>`
+        SELECT id, session_id, chat_id, status,
+               EXTRACT(EPOCH FROM (clock_timestamp() - created_at))::int AS age_seconds
+        FROM messages
+        WHERE id = ${parsed.data.message_id} AND chat_id = ${req.params.id}
+      `;
+      if (rows.length === 0) {
+        reply.code(404);
+        return { error: 'message not found in chat' };
+      }
+      const msg = rows[0]!;
+      if (msg.status !== 'streaming') {
+        reply.code(409);
+        return { error: 'message is no longer streaming', current_status: msg.status };
+      }
+      if (msg.age_seconds < STALE_MIN_AGE_SECONDS) {
+        reply.code(409);
+        return { error: 'message is not stale yet', age_seconds: msg.age_seconds };
+      }
+      const updated = await sql<Message[]>`
+        UPDATE messages
+        SET status = 'failed',
+            content = COALESCE(content, ''),
+            finished_at = clock_timestamp()
+        WHERE id = ${msg.id} AND status = 'streaming'
+        RETURNING id, session_id, chat_id, role, content, kind, tool_calls, tool_results,
+                  status, last_seq, tokens_used, ctx_used, ctx_max, started_at, finished_at,
+                  created_at, metadata, summary, tail_start_id, compacted_at
+      `;
+      if (updated.length === 0) {
+        // Race: the row flipped out of 'streaming' between our SELECT and UPDATE.
+        reply.code(409);
+        return { error: 'message status changed mid-request' };
+      }
+      broker.publishUser('default', {
+        type: 'chat_status',
+        chat_id: msg.chat_id,
+        status: 'idle',
+        at: new Date().toISOString(),
+      });
+      broker.publish(msg.session_id, {
+        type: 'message_complete',
+        message_id: msg.id,
+        chat_id: msg.chat_id,
+      });
+      return updated[0];
+    }
+  );
+
  app.get<{ Params: { id: string } }>(
    '/api/chats/:id/messages',
    async (req, reply) => {
--- a/apps/server/src/services/inference.ts
+++ b/apps/server/src/services/inference.ts
@@ -117,6 +117,7 @@ export interface InferenceFrame {
    | 'tool_call'
    | 'tool_result'
    | 'message_complete'
+    | 'usage'
    | 'messages_deleted'
    | 'session_renamed'
    | 'chat_renamed'
@@ -145,6 +146,7 @@ export interface InferenceFrame {
  tokens_used?: number | null;
  ctx_used?: number | null;
  ctx_max?: number | null;
+  completion_tokens?: number | null;
  started_at?: string | null;
  finished_at?: string | null;
  model?: string;
@@ -444,6 +446,7 @@ async function streamCompletion(
  messages: OpenAiMessage[],
  opts: StreamOptions,
  onDelta: (content: string) => void,
+  onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
  signal?: AbortSignal
 ): Promise<StreamResult> {
  const body: Record<string, unknown> = {
@@ -499,6 +502,7 @@ async function streamCompletion(
      if (typeof parsed.usage.completion_tokens === 'number') {
        completionTokens = parsed.usage.completion_tokens;
      }
+      onUsage?.(promptTokens, completionTokens);
    }
    // v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
    // streaming completion does NOT emit n_ctx in timings (verified
@@ -728,6 +732,34 @@ async function executeStreamPhase(
  ).filter((t) => webToolsEnabled || !WEB_TOOL_NAMES.has(t.function.name));
  const effectiveTemperature = agent?.temperature;

+  // v1.12.2: ctx_max lookup is cached after the first hit per model, so this
+  // is a Map probe in steady state. We capture nCtx once at the top of the
+  // stream so the throttled usage publish doesn't refetch each tick.
+  const mctxForStream = await modelContext.getModelContext(session.model);
+  const nCtxForStream = mctxForStream?.n_ctx ?? null;
+
+  // v1.12.2: throttle live usage publishes to ~500ms. The model can land
+  // dozens of usage frames per second; without a throttle the WS turns into
+  // a firehose for a few KB savings on each render.
+  const USAGE_THROTTLE_MS = 500;
+  let lastUsageAt = 0;
+  let pendingUsage: { p: number | null; c: number | null } | null = null;
+  let usageTimer: NodeJS.Timeout | null = null;
+  const flushUsage = () => {
+    if (!pendingUsage) return;
+    const { p, c } = pendingUsage;
+    pendingUsage = null;
+    lastUsageAt = Date.now();
+    ctx.publish(sessionId, {
+      type: 'usage',
+      message_id: assistantMessageId,
+      chat_id: chatId,
+      completion_tokens: c,
+      ctx_used: p,
+      ctx_max: nCtxForStream,
+    });
+  };
+
  try {
    return await streamCompletion(
      ctx,
@@ -745,6 +777,18 @@ async function executeStreamPhase(
        ctx.log.debug({ sessionId, delta }, 'inference delta');
        scheduleFlush();
      },
+      (prompt, completion) => {
+        pendingUsage = { p: prompt, c: completion };
+        const elapsed = Date.now() - lastUsageAt;
+        if (elapsed >= USAGE_THROTTLE_MS) {
+          flushUsage();
+        } else if (!usageTimer) {
+          usageTimer = setTimeout(() => {
+            usageTimer = null;
+            flushUsage();
+          }, USAGE_THROTTLE_MS - elapsed);
+        }
+      },
      signal
    );
  } finally {
@@ -752,6 +796,10 @@ async function executeStreamPhase(
      clearTimeout(pendingFlushTimer);
      pendingFlushTimer = null;
    }
+    if (usageTimer) {
+      clearTimeout(usageTimer);
+      usageTimer = null;
+    }
    await flushPromise;
  }
 }
@@ -1238,6 +1286,7 @@ async function runCapHitSummary(
        });
        scheduleFlush();
      },
+      undefined,
      signal,
    );
    summaryOk = true;
@@ -1499,6 +1548,7 @@ async function runDoomLoopSummary(
        });
        scheduleFlush();
      },
+      undefined,
      signal,
    );
    summaryOk = true;
--- a/apps/web/src/api/client.ts
+++ b/apps/web/src/api/client.ts
@@ -180,6 +180,11 @@ export const api = {
      request<{ ok: true }>(`/api/chats/${chatId}/compact`, { method: 'POST' }),
    stop: (chatId: string) =>
      request<{ stopped: boolean }>(`/api/chats/${chatId}/stop`, { method: 'POST' }),
+    discardStale: (chatId: string, messageId: string) =>
+      request<Message>(`/api/chats/${chatId}/discard_stale`, {
+        method: 'POST',
+        body: JSON.stringify({ message_id: messageId }),
+      }),
    forceSend: (chatId: string, content: string) =>
      request<{ user_message_id: string; assistant_message_id: string }>(
        `/api/chats/${chatId}/force_send`,
--- a/apps/web/src/api/types.ts
+++ b/apps/web/src/api/types.ts
@@ -332,6 +332,17 @@ export type WsFrame =
      // to the client without a refetch.
      metadata?: MessageMetadata | null;
    }
+  // v1.12.2: live throughput frame, published mid-stream every ~500ms with
+  // the latest token + ctx counts so ChatThroughput can render tok/s and
+  // ctx_used while the model is still generating.
+  | {
+      type: 'usage';
+      message_id: string;
+      chat_id?: string;
+      completion_tokens: number | null;
+      ctx_used: number | null;
+      ctx_max: number | null;
+    }
  | { type: 'messages_deleted'; message_ids: string[]; chat_id?: string }
  | { type: 'chat_renamed'; chat_id: string; name: string }
  // v1.11: published by services/compaction.ts after the new anchored
--- a/apps/web/src/components/ChatTabBar.tsx
+++ b/apps/web/src/components/ChatTabBar.tsx
@@ -2,6 +2,7 @@ import { useState } from 'react';
 import { Bot, History, MessageSquare, Plus, Terminal, X } from 'lucide-react';
 import type { Chat, WorkspacePane } from '@/api/types';
 import { StatusDot } from '@/components/StatusDot';
+import { ChatThroughput } from '@/components/ChatThroughput';
 import {
  ContextMenu,
  ContextMenuContent,
@@ -99,6 +100,7 @@ export function ChatTabBar({
              >
                <MessageSquare size={12} className="shrink-0" />
                <StatusDot chatId={chat.id} />
+                <ChatThroughput chatId={chat.id} />
                {renamingId === chat.id ? (
                  <input
                    autoFocus
--- a/apps/web/src/components/ChatThroughput.tsx
+++ b/apps/web/src/components/ChatThroughput.tsx
@@ -0,0 +1,28 @@
+import { useChatStatus } from '@/hooks/useChatStatus';
+import { useChatThroughput } from '@/hooks/useChatThroughput';
+import { cn } from '@/lib/utils';
+
+interface Props {
+  chatId: string | null | undefined;
+  className?: string;
+}
+
+// v1.12.2: inline "<n> tok/s · <ctx_used>/<ctx_max>" readout shown next to StatusDot.
+// Renders nothing unless the chat is streaming or running a tool and a sample is available.
+export function ChatThroughput({ chatId, className }: Props) {
+  const status = useChatStatus(chatId);
+  const sample = useChatThroughput(chatId);
+  if (!chatId || !sample || (status !== 'streaming' && status !== 'tool_running')) return null;
+  const { tps: rawTps, ctx_used: used, ctx_max: max } = sample;
+  const rate = rawTps != null && rawTps > 0 ? Math.round(rawTps) : null;
+  const ctxLabel =
+    used != null && max != null ? `${used.toLocaleString()}/${max.toLocaleString()}` : null;
+  if (rate === null && ctxLabel === null) return null;
+  return (
+    <span className={cn('text-xs text-muted-foreground tabular-nums', className)}>
+      {rate !== null && `${rate} tok/s`}
+      {rate !== null && ctxLabel !== null && ' · '}
+      {ctxLabel}
+    </span>
+  );
+}
--- a/apps/web/src/components/MobileTabSwitcher.tsx
+++ b/apps/web/src/components/MobileTabSwitcher.tsx
@@ -13,6 +13,7 @@ import { toast } from 'sonner';
 import type { Chat, WorkspacePane } from '@/api/types';
 import { BottomSheet } from '@/components/BottomSheet';
 import { StatusDot } from '@/components/StatusDot';
+import { ChatThroughput } from '@/components/ChatThroughput';
 import {
  DropdownMenu,
  DropdownMenuContent,
@@ -206,6 +207,7 @@ export function MobileTabSwitcher({
        >
          <span className="shrink-0 text-muted-foreground">{paneIcon(active?.kind ?? 'chat')}</span>
          <StatusDot chatId={activeChatId} />
+          <ChatThroughput chatId={activeChatId} />
          <span className="truncate flex-1 text-left">{activeLabel}</span>
          <ChevronDown size={14} className="opacity-60 shrink-0" />
        </button>
@@ -237,6 +239,7 @@ export function MobileTabSwitcher({
              >
                <span className="shrink-0 text-muted-foreground">{paneIcon(pane.kind)}</span>
                <StatusDot chatId={cid ?? null} />
+                <ChatThroughput chatId={cid ?? null} />
                {renamingChatId === cid && cid ? (
                  <input
                    autoFocus
--- a/apps/web/src/components/StaleStreamBanner.tsx
+++ b/apps/web/src/components/StaleStreamBanner.tsx
@@ -0,0 +1,34 @@
+interface Props {
+  onRetry: () => void;
+  onDiscard: () => void;
+}
+
+// v1.12.3: recovery banner for an assistant message that has been 'streaming'
+// for 60+ seconds without new tokens. ChatPane renders it above ChatInput.
+// Retry discards the stuck row and then resends the last user message;
+// Discard only clears the row and drops the dot to idle.
+export function StaleStreamBanner({ onRetry, onDiscard }: Props) {
+  const wrapperClass =
+    'border border-amber-500/30 bg-amber-500/5 rounded-md p-3 mb-2 mx-4 flex items-center justify-between gap-2';
+  // Both actions share one look; only the label and the callback differ.
+  const buttonClass =
+    'text-xs px-2 py-1 rounded border border-border hover:bg-accent max-md:min-h-[44px] max-md:px-3';
+  const actions = [
+    { label: 'Retry', run: onRetry },
+    { label: 'Discard', run: onDiscard },
+  ];
+  return (
+    <div className={wrapperClass}>
+      <span className="text-sm text-muted-foreground">
+        Previous response didn't complete.
+      </span>
+      <div className="flex gap-2">
+        {actions.map(({ label, run }) => (
+          <button key={label} type="button" onClick={run} className={buttonClass}>
+            {label}
+          </button>
+        ))}
+      </div>
+    </div>
+  );
+}
--- a/apps/web/src/components/panes/ChatPane.tsx
+++ b/apps/web/src/components/panes/ChatPane.tsx
@@ -5,6 +5,7 @@ import { api } from '@/api/client';
 import { useSessionStream } from '@/hooks/useSessionStream';
 import { MessageList } from '@/components/MessageList';
 import { ChatInput } from '@/components/ChatInput';
+import { StaleStreamBanner } from '@/components/StaleStreamBanner';
 import {
  DropdownMenu,
  DropdownMenuContent,
@@ -44,6 +45,38 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,

  const chatMessages = stream.messages.filter((m) => m.chat_id === chatId);
  const streaming = chatMessages.some((m) => m.status === 'streaming');
+
+  // v1.12.3: stale-stream detection. Watches the (at most one) streaming
+  // assistant row. If its content length doesn't grow for STALE_THRESHOLD_MS,
+  // assume the upstream call is dead and surface the recovery banner. We use
+  // content length as the activity signal because every token delta extends
+  // it; last_seq isn't currently bumped per delta.
+  const STALE_THRESHOLD_MS = 60_000;
+  const streamingMsg = chatMessages.find((m) => m.status === 'streaming' && m.role === 'assistant');
+  const streamingId = streamingMsg?.id ?? null;
+  const streamingLen = streamingMsg?.content.length ?? 0;
+  const lastActivityRef = useRef<{ id: string; len: number; at: number } | null>(null);
+  const [stale, setStale] = useState(false);
+  useEffect(() => {
+    if (!streamingId) {
+      lastActivityRef.current = null;
+      setStale(false);
+      return;
+    }
+    const prev = lastActivityRef.current;
+    if (!prev || prev.id !== streamingId || prev.len !== streamingLen) {
+      lastActivityRef.current = { id: streamingId, len: streamingLen, at: Date.now() };
+      setStale(false);
+    }
+    const interval = setInterval(() => {
+      const a = lastActivityRef.current;
+      if (!a) return;
+      if (Date.now() - a.at >= STALE_THRESHOLD_MS) {
+        setStale(true);
+      }
+    }, 5_000);
+    return () => clearInterval(interval);
+  }, [streamingId, streamingLen]);
  // v1.11.5: per-chat model context limit comes from chat.model_context_limit
  // populated by GET /api/sessions/:id/chats. Threaded into ChatInput so
  // ContextBar can render a zero-state before the first assistant message.
@@ -87,6 +120,45 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
    }
  }

+  const handleDiscardStale = useCallback(async () => {
+    if (!streamingId) return;
+    try {
+      await api.chats.discardStale(chatId, streamingId);
+      setStale(false);
+      lastActivityRef.current = null;
+    } catch (err) {
+      // 409 (race) is benign — the row already terminated some other way.
+      const msg = err instanceof Error ? err.message : 'discard failed';
+      if (!msg.includes('409')) toast.error(msg);
+      setStale(false);
+    }
+  }, [chatId, streamingId]);
+
+  const handleRetryStale = useCallback(async () => {
+    if (!streamingId) return;
+    const lastUser = [...chatMessages].reverse().find((m) => m.role === 'user' && m.kind === 'message');
+    if (!lastUser) {
+      toast.error('no prior user message to retry');
+      return;
+    }
+    try {
+      await api.chats.discardStale(chatId, streamingId);
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : 'discard failed';
+      if (!msg.includes('409')) {
+        toast.error(msg);
+        return;
+      }
+    }
+    setStale(false);
+    lastActivityRef.current = null;
+    try {
+      await api.messages.send(chatId, lastUser.content);
+    } catch (err) {
+      toast.error(err instanceof Error ? err.message : 'retry send failed');
+    }
+  }, [chatId, streamingId, chatMessages]);
+
  const handleForceSend = useCallback(async (content: string) => {
    const trimmed = content.trim();
    if (!trimmed) return;
@@ -187,6 +259,13 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
        </div>
      )}

+      {stale && streamingId && (
+        <StaleStreamBanner
+          onRetry={() => void handleRetryStale()}
+          onDiscard={() => void handleDiscardStale()}
+        />
+      )}
+
      <ChatInput
        disabled={false}
        projectId={projectId}
--- a/apps/web/src/hooks/useChatThroughput.ts
+++ b/apps/web/src/hooks/useChatThroughput.ts
@@ -0,0 +1,106 @@
+import { useEffect, useState } from 'react';
+
+// v1.12.2: live throughput stream consumer. Fed by useSessionStream when a
+// 'usage' WS frame lands. Renders next to StatusDot via ChatThroughput.
+//
+// Singleton + Set<setState> pattern mirrors useChatStatus so any component
+// can subscribe to any chatId without prop drilling.
+
+export interface ThroughputSample {
+  tps: number | null;
+  ctx_used: number | null;
+  ctx_max: number | null;
+}
+
+interface Entry {
+  ctx_used: number | null;
+  ctx_max: number | null;
+  completion_tokens: number | null;
+  recorded_at: number;
+  prev_completion_tokens: number | null;
+  prev_recorded_at: number | null;
+  tps: number | null;
+}
+
+// Stale window. A sample older than this makes useChatThroughput return null,
+// so the indicator clears once a stream ends and no further usage frame arrives.
+const STALE_MS = 10_000;
+
+const entries = new Map<string, Entry>();
+const subscribers = new Set<() => void>();
+
+function notify(): void {
+  subscribers.forEach((wake) => {
+    try { wake(); } catch { /* isolate failures: keep notifying the rest */ }
+  });
+}
+
+// v1.12.2: imported by useSessionStream's WS handler. Derives tps from the
+// gap between successive completion_tokens samples; the first sample of a
+// generation yields null (two points are needed). A zero-progress (duplicate)
+// sample keeps the previous tps instead of pushing it to 0. A sample whose
+// counter went backwards means a new generation started, so the old rate is
+// dropped rather than shown against the new message.
+export function recordUsage(
+  chatId: string,
+  data: { completion_tokens: number | null; ctx_used: number | null; ctx_max: number | null },
+): void {
+  const now = Date.now();
+  const prev = entries.get(chatId);
+  const cur = data.completion_tokens;
+  const last = prev?.completion_tokens ?? null;
+  let tps: number | null = prev?.tps ?? null;
+  if (prev && cur != null && last != null) {
+    if (cur < last) {
+      tps = null;
+    } else if (cur > last && now > prev.recorded_at) {
+      tps = (cur - last) / ((now - prev.recorded_at) / 1000);
+    }
+  }
+  entries.set(chatId, {
+    ctx_used: data.ctx_used,
+    ctx_max: data.ctx_max,
+    completion_tokens: cur,
+    recorded_at: now,
+    prev_completion_tokens: last,
+    prev_recorded_at: prev?.recorded_at ?? null,
+    tps,
+  });
+  notify();
+}
+
+export function clearThroughput(chatId: string): void {
+  if (entries.delete(chatId)) notify();
+}
+
+// Periodic sweep: evict entries older than STALE_MS and re-notify so the UI
+// drops them when the stream ends without a follow-up frame. The globalThis
+// flag keeps it to one timer even if this module is evaluated more than once.
+const G = globalThis as Record<string, unknown>;
+if (!G.__boocode_throughput_ticker) {
+  G.__boocode_throughput_ticker = true;
+  setInterval(() => {
+    const cutoff = Date.now() - STALE_MS;
+    const expired: string[] = [];
+    entries.forEach((entry, id) => {
+      if (entry.recorded_at < cutoff) expired.push(id);
+    });
+    if (expired.length === 0) return;
+    for (const id of expired) entries.delete(id);
+    notify();
+  }, 2_000);
+}
+
+export function useChatThroughput(chatId: string | null | undefined): ThroughputSample | null {
+  // Re-render on every store notification; the store itself lives outside React.
+  const [, bump] = useState({});
+  useEffect(() => {
+    const listener = () => bump({});
+    subscribers.add(listener);
+    return () => void subscribers.delete(listener);
+  }, []);
+  const entry = chatId ? entries.get(chatId) : undefined;
+  if (!entry || Date.now() - entry.recorded_at > STALE_MS) return null;
+  const { tps, ctx_used, ctx_max } = entry;
+  return { tps, ctx_used, ctx_max };
+}
--- a/apps/web/src/hooks/useSessionStream.ts
+++ b/apps/web/src/hooks/useSessionStream.ts
@@ -3,6 +3,7 @@ import { toast } from 'sonner';
 import type { Message, WsFrame } from '@/api/types';
 import { api } from '@/api/client';
 import { sessionEvents } from './sessionEvents';
+import { recordUsage } from './useChatThroughput';

 // session_renamed frame removed from WsFrame — it was declared but never
 // published on the per-session WS channel (server publishes via broker.publishUser
@@ -125,6 +126,19 @@ function applyFrame(state: State, frame: WsFrame): State {
      );
      return { ...state, messages: next };
    }
+    case 'usage': {
+      // v1.12.2: live throughput. Side-effects into the module-level
+      // singleton consumed by ChatThroughput; no message-state mutation.
+      // chat_id is the optional ws-frame field; usage frames always include it.
+      if (frame.chat_id) {
+        recordUsage(frame.chat_id, {
+          completion_tokens: frame.completion_tokens,
+          ctx_used: frame.ctx_used,
+          ctx_max: frame.ctx_max,
+        });
+      }
+      return state;
+    }
    case 'messages_deleted': {
      const removeSet = new Set(frame.message_ids);
      return {