Compare commits
3 Commits
v1.12.1-st
...
ea468ca7fb
| Author | SHA1 | Date | |
|---|---|---|---|
| ea468ca7fb | |||
| eef4782383 | |||
| a7104691aa |
@@ -18,6 +18,12 @@ const ForkBody = z.object({
|
||||
name: z.string().min(1).max(200).optional(),
|
||||
});
|
||||
|
||||
const DiscardStaleBody = z.object({
|
||||
message_id: z.string().uuid(),
|
||||
});
|
||||
|
||||
const STALE_MIN_AGE_SECONDS = 60;
|
||||
|
||||
export function registerChatRoutes(
|
||||
app: FastifyInstance,
|
||||
sql: Sql,
|
||||
@@ -320,6 +326,73 @@ export function registerChatRoutes(
|
||||
}
|
||||
);
|
||||
|
||||
// v1.12.3: explicit recovery from a stuck-streaming assistant row. The
|
||||
// frontend gates this behind a 60s no-token-activity timer; the server
|
||||
// re-checks the age and current status for safety. Non-streaming rows
|
||||
// return 409 (frontend race; idempotent retry is fine).
|
||||
app.post<{ Params: { id: string } }>(
|
||||
'/api/chats/:id/discard_stale',
|
||||
async (req, reply) => {
|
||||
const parsed = DiscardStaleBody.safeParse(req.body ?? {});
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid body', details: parsed.error.flatten() };
|
||||
}
|
||||
const rows = await sql<{
|
||||
id: string;
|
||||
session_id: string;
|
||||
chat_id: string;
|
||||
status: string;
|
||||
age_seconds: number;
|
||||
}[]>`
|
||||
SELECT id, session_id, chat_id, status,
|
||||
EXTRACT(EPOCH FROM (clock_timestamp() - created_at))::int AS age_seconds
|
||||
FROM messages
|
||||
WHERE id = ${parsed.data.message_id} AND chat_id = ${req.params.id}
|
||||
`;
|
||||
if (rows.length === 0) {
|
||||
reply.code(404);
|
||||
return { error: 'message not found in chat' };
|
||||
}
|
||||
const msg = rows[0]!;
|
||||
if (msg.status !== 'streaming') {
|
||||
reply.code(409);
|
||||
return { error: 'message is no longer streaming', current_status: msg.status };
|
||||
}
|
||||
if (msg.age_seconds < STALE_MIN_AGE_SECONDS) {
|
||||
reply.code(409);
|
||||
return { error: 'message is not stale yet', age_seconds: msg.age_seconds };
|
||||
}
|
||||
const updated = await sql<Message[]>`
|
||||
UPDATE messages
|
||||
SET status = 'failed',
|
||||
content = COALESCE(content, ''),
|
||||
finished_at = clock_timestamp()
|
||||
WHERE id = ${msg.id} AND status = 'streaming'
|
||||
RETURNING id, session_id, chat_id, role, content, kind, tool_calls, tool_results,
|
||||
status, last_seq, tokens_used, ctx_used, ctx_max, started_at, finished_at,
|
||||
created_at, metadata, summary, tail_start_id, compacted_at
|
||||
`;
|
||||
if (updated.length === 0) {
|
||||
// Race: the row flipped out of 'streaming' between our SELECT and UPDATE.
|
||||
reply.code(409);
|
||||
return { error: 'message status changed mid-request' };
|
||||
}
|
||||
broker.publishUser('default', {
|
||||
type: 'chat_status',
|
||||
chat_id: msg.chat_id,
|
||||
status: 'idle',
|
||||
at: new Date().toISOString(),
|
||||
});
|
||||
broker.publish(msg.session_id, {
|
||||
type: 'message_complete',
|
||||
message_id: msg.id,
|
||||
chat_id: msg.chat_id,
|
||||
});
|
||||
return updated[0];
|
||||
}
|
||||
);
|
||||
|
||||
app.get<{ Params: { id: string } }>(
|
||||
'/api/chats/:id/messages',
|
||||
async (req, reply) => {
|
||||
|
||||
@@ -13,7 +13,6 @@ import type {
|
||||
} from '../types/api.js';
|
||||
import {
|
||||
ALL_TOOLS,
|
||||
READ_ONLY_TOOL_NAMES,
|
||||
TOOLS_BY_NAME,
|
||||
toolJsonSchemas,
|
||||
type ToolJsonSchema,
|
||||
@@ -28,88 +27,34 @@ import type { Broker } from './broker.js';
|
||||
// async (awaits the container-guidance loader) — buildMessagesPayload below
|
||||
// is therefore async too, and its three call sites in this file await it.
|
||||
import { buildSystemPrompt } from './system-prompt.js';
|
||||
import { resolveToolBudget } from './inference/budget.js';
|
||||
import {
|
||||
DOOM_LOOP_THRESHOLD,
|
||||
detectDoomLoop,
|
||||
isAnySentinel,
|
||||
} from './inference/sentinels.js';
|
||||
import {
|
||||
XML_TOOL_CLOSE,
|
||||
XML_TOOL_OPEN,
|
||||
parseXmlToolCall,
|
||||
partialXmlOpenerStart,
|
||||
} from './inference/xml-parser.js';
|
||||
|
||||
// v1.12.4: re-exported so external callers (tests, future consumers) keep
|
||||
// importing from services/inference.js as the public surface.
|
||||
export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './inference/sentinels.js';
|
||||
|
||||
const DB_FLUSH_INTERVAL_MS = 500;
|
||||
|
||||
// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget.
|
||||
// - Agent with explicit max_tool_calls: that value.
|
||||
// - Agent with read-only-only tools: BUDGET_READ_ONLY (30).
|
||||
// - Agent with any non-read-only tool: BUDGET_NON_READ_ONLY (10).
|
||||
// - No agent (raw chat): BUDGET_NO_AGENT (15).
|
||||
const BUDGET_READ_ONLY = 30;
|
||||
const BUDGET_NON_READ_ONLY = 10;
|
||||
const BUDGET_NO_AGENT = 15;
|
||||
|
||||
const READ_ONLY_SET: ReadonlySet<string> = new Set(READ_ONLY_TOOL_NAMES);
|
||||
|
||||
function resolveToolBudget(agent: Agent | null): number {
|
||||
if (agent?.max_tool_calls != null) return agent.max_tool_calls;
|
||||
if (!agent) return BUDGET_NO_AGENT;
|
||||
const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t));
|
||||
return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY;
|
||||
}
|
||||
|
||||
// Synthetic system note appended to the cap-hit summary call. Verbatim from
|
||||
// the v1.8.2 spec — do not paraphrase: the model is more reliable when the
|
||||
// instruction is short, declarative, and identical across calls.
|
||||
const CAP_HIT_SUMMARY_NOTE = (limit: number) =>
|
||||
`You've reached the tool budget (${limit} calls). Produce the best answer you can with what you have. Do not call more tools.`;
|
||||
|
||||
// v1.11.6: doom-loop guard. When the model calls the same tool with the
|
||||
// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message
|
||||
// turn, abort the recursion and run the same wrap-up summary path as the
|
||||
// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in
|
||||
// session/processor.ts). Threshold of 3 is the smallest value that doesn't
|
||||
// false-positive on a model that retries once after a transient error.
|
||||
export const DOOM_LOOP_THRESHOLD = 3;
|
||||
|
||||
const DOOM_LOOP_NOTE = (name: string) =>
|
||||
`You called ${name} with the same arguments ${DOOM_LOOP_THRESHOLD} times in a row. Stop calling it. Produce the best answer you can with what you have.`;
|
||||
|
||||
// Returns the name + args of the looping tool when the LAST
|
||||
// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical (same name
|
||||
// AND deep-equal args via JSON.stringify). Returns null otherwise.
|
||||
// Pure; exported for unit-test access.
|
||||
export function detectDoomLoop(
|
||||
recentToolCalls: ToolCall[],
|
||||
): { name: string; args: Record<string, unknown> } | null {
|
||||
if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null;
|
||||
const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD);
|
||||
const ref = last[0]!;
|
||||
const refArgs = JSON.stringify(ref.args);
|
||||
for (let i = 1; i < last.length; i++) {
|
||||
const tc = last[i]!;
|
||||
if (tc.name !== ref.name) return null;
|
||||
if (JSON.stringify(tc.args) !== refArgs) return null;
|
||||
}
|
||||
return { name: ref.name, args: ref.args };
|
||||
}
|
||||
|
||||
function isCapHitSentinel(m: Message): boolean {
|
||||
return (
|
||||
m.role === 'system' &&
|
||||
m.metadata !== null &&
|
||||
typeof m.metadata === 'object' &&
|
||||
(m.metadata as { kind?: unknown }).kind === 'cap_hit'
|
||||
);
|
||||
}
|
||||
|
||||
// v1.11.6: parallel predicate. Same UI-only semantics as cap-hit sentinels —
|
||||
// never sent to the LLM (filtered by buildMessagesPayload through the
|
||||
// isAnySentinel check below).
|
||||
function isDoomLoopSentinel(m: Message): boolean {
|
||||
return (
|
||||
m.role === 'system' &&
|
||||
m.metadata !== null &&
|
||||
typeof m.metadata === 'object' &&
|
||||
(m.metadata as { kind?: unknown }).kind === 'doom_loop'
|
||||
);
|
||||
}
|
||||
|
||||
function isAnySentinel(m: Message): boolean {
|
||||
return isCapHitSentinel(m) || isDoomLoopSentinel(m);
|
||||
}
|
||||
|
||||
export interface InferenceFrame {
|
||||
type:
|
||||
| 'message_started'
|
||||
@@ -117,6 +62,7 @@ export interface InferenceFrame {
|
||||
| 'tool_call'
|
||||
| 'tool_result'
|
||||
| 'message_complete'
|
||||
| 'usage'
|
||||
| 'messages_deleted'
|
||||
| 'session_renamed'
|
||||
| 'chat_renamed'
|
||||
@@ -145,6 +91,7 @@ export interface InferenceFrame {
|
||||
tokens_used?: number | null;
|
||||
ctx_used?: number | null;
|
||||
ctx_max?: number | null;
|
||||
completion_tokens?: number | null;
|
||||
started_at?: string | null;
|
||||
finished_at?: string | null;
|
||||
model?: string;
|
||||
@@ -389,61 +336,13 @@ interface StreamOptions {
|
||||
// streamCompletion buffers delta.content, extracts complete blocks, parses
|
||||
// them via parseXmlToolCall, and pushes synthetic entries into the existing
|
||||
// toolCallsBuffer alongside any native JSON-format tool calls.
|
||||
const XML_TOOL_OPEN = '<tool_call>';
|
||||
const XML_TOOL_CLOSE = '</tool_call>';
|
||||
|
||||
function parseXmlToolCall(
|
||||
block: string,
|
||||
): { name: string; args: Record<string, unknown> } | null {
|
||||
const nameMatch = block.match(/<function=([^>]+)>/);
|
||||
if (!nameMatch || !nameMatch[1]) return null;
|
||||
const name = nameMatch[1].trim();
|
||||
if (!name) return null;
|
||||
const args: Record<string, unknown> = {};
|
||||
// Non-greedy body so each <parameter=…>…</parameter> pair is matched
|
||||
// independently even when multiple appear in the same block.
|
||||
const paramRe = /<parameter=([^>]+)>([\s\S]*?)<\/parameter>/g;
|
||||
for (const m of block.matchAll(paramRe)) {
|
||||
const key = (m[1] ?? '').trim();
|
||||
if (!key) continue;
|
||||
const raw = (m[2] ?? '').trim();
|
||||
try {
|
||||
args[key] = JSON.parse(raw);
|
||||
} catch {
|
||||
args[key] = raw;
|
||||
}
|
||||
}
|
||||
return { name, args };
|
||||
}
|
||||
|
||||
// Locate the first character that begins (or completely contains) an
|
||||
// unfinished <tool_call> opener in `s`. Returns -1 when `s` can be flushed
|
||||
// to the client in full without risking a partial tag leak.
|
||||
// Case 1: a full `<tool_call>` opener with no matching closer — caller
|
||||
// must keep everything from that index forward until the next
|
||||
// chunk arrives with the closer.
|
||||
// Case 2: `s` ends with a strict prefix of `<tool_call>` (e.g. `<tool_c`).
|
||||
// Caller must keep just that suffix in the buffer.
|
||||
// Note: case 1 assumes the calling loop already extracted every complete
|
||||
// <tool_call>…</tool_call> pair before reaching this check.
|
||||
function partialXmlOpenerStart(s: string): number {
|
||||
const fullOpener = s.indexOf(XML_TOOL_OPEN);
|
||||
if (fullOpener !== -1) return fullOpener;
|
||||
const lastLt = s.lastIndexOf('<');
|
||||
if (lastLt === -1) return -1;
|
||||
const suffix = s.slice(lastLt);
|
||||
if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) {
|
||||
return lastLt;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
async function streamCompletion(
|
||||
ctx: InferenceContext,
|
||||
model: string,
|
||||
messages: OpenAiMessage[],
|
||||
opts: StreamOptions,
|
||||
onDelta: (content: string) => void,
|
||||
onUsage: ((prompt: number | null, completion: number | null) => void) | undefined,
|
||||
signal?: AbortSignal
|
||||
): Promise<StreamResult> {
|
||||
const body: Record<string, unknown> = {
|
||||
@@ -499,6 +398,7 @@ async function streamCompletion(
|
||||
if (typeof parsed.usage.completion_tokens === 'number') {
|
||||
completionTokens = parsed.usage.completion_tokens;
|
||||
}
|
||||
onUsage?.(promptTokens, completionTokens);
|
||||
}
|
||||
// v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
|
||||
// streaming completion does NOT emit n_ctx in timings (verified
|
||||
@@ -728,6 +628,34 @@ async function executeStreamPhase(
|
||||
).filter((t) => webToolsEnabled || !WEB_TOOL_NAMES.has(t.function.name));
|
||||
const effectiveTemperature = agent?.temperature;
|
||||
|
||||
// v1.12.2: ctx_max lookup is cached after the first hit per model, so this
|
||||
// is a Map probe in steady state. We capture nCtx once at the top of the
|
||||
// stream so the throttled usage publish doesn't refetch each tick.
|
||||
const mctxForStream = await modelContext.getModelContext(session.model);
|
||||
const nCtxForStream = mctxForStream?.n_ctx ?? null;
|
||||
|
||||
// v1.12.2: throttle live usage publishes to ~500ms. The model can land
|
||||
// dozens of usage frames per second; without a throttle the WS turns into
|
||||
// a firehose for a few KB savings on each render.
|
||||
const USAGE_THROTTLE_MS = 500;
|
||||
let lastUsageAt = 0;
|
||||
let pendingUsage: { p: number | null; c: number | null } | null = null;
|
||||
let usageTimer: NodeJS.Timeout | null = null;
|
||||
const flushUsage = () => {
|
||||
if (!pendingUsage) return;
|
||||
const { p, c } = pendingUsage;
|
||||
pendingUsage = null;
|
||||
lastUsageAt = Date.now();
|
||||
ctx.publish(sessionId, {
|
||||
type: 'usage',
|
||||
message_id: assistantMessageId,
|
||||
chat_id: chatId,
|
||||
completion_tokens: c,
|
||||
ctx_used: p,
|
||||
ctx_max: nCtxForStream,
|
||||
});
|
||||
};
|
||||
|
||||
try {
|
||||
return await streamCompletion(
|
||||
ctx,
|
||||
@@ -745,6 +673,18 @@ async function executeStreamPhase(
|
||||
ctx.log.debug({ sessionId, delta }, 'inference delta');
|
||||
scheduleFlush();
|
||||
},
|
||||
(prompt, completion) => {
|
||||
pendingUsage = { p: prompt, c: completion };
|
||||
const elapsed = Date.now() - lastUsageAt;
|
||||
if (elapsed >= USAGE_THROTTLE_MS) {
|
||||
flushUsage();
|
||||
} else if (!usageTimer) {
|
||||
usageTimer = setTimeout(() => {
|
||||
usageTimer = null;
|
||||
flushUsage();
|
||||
}, USAGE_THROTTLE_MS - elapsed);
|
||||
}
|
||||
},
|
||||
signal
|
||||
);
|
||||
} finally {
|
||||
@@ -752,6 +692,10 @@ async function executeStreamPhase(
|
||||
clearTimeout(pendingFlushTimer);
|
||||
pendingFlushTimer = null;
|
||||
}
|
||||
if (usageTimer) {
|
||||
clearTimeout(usageTimer);
|
||||
usageTimer = null;
|
||||
}
|
||||
await flushPromise;
|
||||
}
|
||||
}
|
||||
@@ -1238,6 +1182,7 @@ async function runCapHitSummary(
|
||||
});
|
||||
scheduleFlush();
|
||||
},
|
||||
undefined,
|
||||
signal,
|
||||
);
|
||||
summaryOk = true;
|
||||
@@ -1499,6 +1444,7 @@ async function runDoomLoopSummary(
|
||||
});
|
||||
scheduleFlush();
|
||||
},
|
||||
undefined,
|
||||
signal,
|
||||
);
|
||||
summaryOk = true;
|
||||
|
||||
20
apps/server/src/services/inference/budget.ts
Normal file
20
apps/server/src/services/inference/budget.ts
Normal file
@@ -0,0 +1,20 @@
|
||||
import type { Agent } from '../../types/api.js';
|
||||
import { READ_ONLY_TOOL_NAMES } from '../tools.js';
|
||||
|
||||
// v1.8.2: tool-call budget defaults. Resolved per-turn by resolveToolBudget.
|
||||
// - Agent with explicit max_tool_calls: that value.
|
||||
// - Agent with read-only-only tools: BUDGET_READ_ONLY (30).
|
||||
// - Agent with any non-read-only tool: BUDGET_NON_READ_ONLY (10).
|
||||
// - No agent (raw chat): BUDGET_NO_AGENT (15).
|
||||
export const BUDGET_READ_ONLY = 30;
|
||||
export const BUDGET_NON_READ_ONLY = 10;
|
||||
export const BUDGET_NO_AGENT = 15;
|
||||
|
||||
const READ_ONLY_SET: ReadonlySet<string> = new Set(READ_ONLY_TOOL_NAMES);
|
||||
|
||||
export function resolveToolBudget(agent: Agent | null): number {
|
||||
if (agent?.max_tool_calls != null) return agent.max_tool_calls;
|
||||
if (!agent) return BUDGET_NO_AGENT;
|
||||
const allReadOnly = agent.tools.every((t) => READ_ONLY_SET.has(t));
|
||||
return allReadOnly ? BUDGET_READ_ONLY : BUDGET_NON_READ_ONLY;
|
||||
}
|
||||
53
apps/server/src/services/inference/sentinels.ts
Normal file
53
apps/server/src/services/inference/sentinels.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
import type { Message, ToolCall } from '../../types/api.js';
|
||||
|
||||
// v1.11.6: doom-loop guard. When the model calls the same tool with the
|
||||
// same arguments DOOM_LOOP_THRESHOLD times in a row within one user-message
|
||||
// turn, abort the recursion and run the same wrap-up summary path as the
|
||||
// cap-hit case. Ported from opencode (DOOM_LOOP_THRESHOLD in
|
||||
// session/processor.ts). Threshold of 3 is the smallest value that doesn't
|
||||
// false-positive on a model that retries once after a transient error.
|
||||
export const DOOM_LOOP_THRESHOLD = 3;
|
||||
|
||||
// Returns the name + args of the looping tool when the LAST
|
||||
// DOOM_LOOP_THRESHOLD entries in `recentToolCalls` are identical (same name
|
||||
// AND deep-equal args via JSON.stringify). Returns null otherwise.
|
||||
// Pure; exported for unit-test access.
|
||||
export function detectDoomLoop(
|
||||
recentToolCalls: ToolCall[],
|
||||
): { name: string; args: Record<string, unknown> } | null {
|
||||
if (recentToolCalls.length < DOOM_LOOP_THRESHOLD) return null;
|
||||
const last = recentToolCalls.slice(-DOOM_LOOP_THRESHOLD);
|
||||
const ref = last[0]!;
|
||||
const refArgs = JSON.stringify(ref.args);
|
||||
for (let i = 1; i < last.length; i++) {
|
||||
const tc = last[i]!;
|
||||
if (tc.name !== ref.name) return null;
|
||||
if (JSON.stringify(tc.args) !== refArgs) return null;
|
||||
}
|
||||
return { name: ref.name, args: ref.args };
|
||||
}
|
||||
|
||||
export function isCapHitSentinel(m: Message): boolean {
|
||||
return (
|
||||
m.role === 'system' &&
|
||||
m.metadata !== null &&
|
||||
typeof m.metadata === 'object' &&
|
||||
(m.metadata as { kind?: unknown }).kind === 'cap_hit'
|
||||
);
|
||||
}
|
||||
|
||||
// v1.11.6: parallel predicate. Same UI-only semantics as cap-hit sentinels —
|
||||
// never sent to the LLM (filtered by buildMessagesPayload through the
|
||||
// isAnySentinel check below).
|
||||
export function isDoomLoopSentinel(m: Message): boolean {
|
||||
return (
|
||||
m.role === 'system' &&
|
||||
m.metadata !== null &&
|
||||
typeof m.metadata === 'object' &&
|
||||
(m.metadata as { kind?: unknown }).kind === 'doom_loop'
|
||||
);
|
||||
}
|
||||
|
||||
export function isAnySentinel(m: Message): boolean {
|
||||
return isCapHitSentinel(m) || isDoomLoopSentinel(m);
|
||||
}
|
||||
53
apps/server/src/services/inference/xml-parser.ts
Normal file
53
apps/server/src/services/inference/xml-parser.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
// v1.10.5: XML-tag tool-call fallback. Some models emit
|
||||
// <tool_call><function=foo><parameter=key>value</parameter></function></tool_call>
|
||||
// in plain content instead of using the OpenAI tool_calls JSON channel.
|
||||
// The streaming loop in inference.ts extracts these blocks via these helpers.
|
||||
|
||||
export const XML_TOOL_OPEN = '<tool_call>';
|
||||
export const XML_TOOL_CLOSE = '</tool_call>';
|
||||
|
||||
export function parseXmlToolCall(
|
||||
block: string,
|
||||
): { name: string; args: Record<string, unknown> } | null {
|
||||
const nameMatch = block.match(/<function=([^>]+)>/);
|
||||
if (!nameMatch || !nameMatch[1]) return null;
|
||||
const name = nameMatch[1].trim();
|
||||
if (!name) return null;
|
||||
const args: Record<string, unknown> = {};
|
||||
// Non-greedy body so each <parameter=…>…</parameter> pair is matched
|
||||
// independently even when multiple appear in the same block.
|
||||
const paramRe = /<parameter=([^>]+)>([\s\S]*?)<\/parameter>/g;
|
||||
for (const m of block.matchAll(paramRe)) {
|
||||
const key = (m[1] ?? '').trim();
|
||||
if (!key) continue;
|
||||
const raw = (m[2] ?? '').trim();
|
||||
try {
|
||||
args[key] = JSON.parse(raw);
|
||||
} catch {
|
||||
args[key] = raw;
|
||||
}
|
||||
}
|
||||
return { name, args };
|
||||
}
|
||||
|
||||
// Locate the first character that begins (or completely contains) an
|
||||
// unfinished <tool_call> opener in `s`. Returns -1 when `s` can be flushed
|
||||
// to the client in full without risking a partial tag leak.
|
||||
// Case 1: a full `<tool_call>` opener with no matching closer — caller
|
||||
// must keep everything from that index forward until the next
|
||||
// chunk arrives with the closer.
|
||||
// Case 2: `s` ends with a strict prefix of `<tool_call>` (e.g. `<tool_c`).
|
||||
// Caller must keep just that suffix in the buffer.
|
||||
// Note: case 1 assumes the calling loop already extracted every complete
|
||||
// <tool_call>…</tool_call> pair before reaching this check.
|
||||
export function partialXmlOpenerStart(s: string): number {
|
||||
const fullOpener = s.indexOf(XML_TOOL_OPEN);
|
||||
if (fullOpener !== -1) return fullOpener;
|
||||
const lastLt = s.lastIndexOf('<');
|
||||
if (lastLt === -1) return -1;
|
||||
const suffix = s.slice(lastLt);
|
||||
if (XML_TOOL_OPEN.startsWith(suffix) && suffix.length < XML_TOOL_OPEN.length) {
|
||||
return lastLt;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
@@ -180,6 +180,11 @@ export const api = {
|
||||
request<{ ok: true }>(`/api/chats/${chatId}/compact`, { method: 'POST' }),
|
||||
stop: (chatId: string) =>
|
||||
request<{ stopped: boolean }>(`/api/chats/${chatId}/stop`, { method: 'POST' }),
|
||||
discardStale: (chatId: string, messageId: string) =>
|
||||
request<Message>(`/api/chats/${chatId}/discard_stale`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify({ message_id: messageId }),
|
||||
}),
|
||||
forceSend: (chatId: string, content: string) =>
|
||||
request<{ user_message_id: string; assistant_message_id: string }>(
|
||||
`/api/chats/${chatId}/force_send`,
|
||||
|
||||
@@ -332,6 +332,17 @@ export type WsFrame =
|
||||
// to the client without a refetch.
|
||||
metadata?: MessageMetadata | null;
|
||||
}
|
||||
// v1.12.2: live throughput frame, published mid-stream every ~500ms with
|
||||
// the latest token + ctx counts so ChatThroughput can render tok/s and
|
||||
// ctx_used while the model is still generating.
|
||||
| {
|
||||
type: 'usage';
|
||||
message_id: string;
|
||||
chat_id?: string;
|
||||
completion_tokens: number | null;
|
||||
ctx_used: number | null;
|
||||
ctx_max: number | null;
|
||||
}
|
||||
| { type: 'messages_deleted'; message_ids: string[]; chat_id?: string }
|
||||
| { type: 'chat_renamed'; chat_id: string; name: string }
|
||||
// v1.11: published by services/compaction.ts after the new anchored
|
||||
|
||||
@@ -2,6 +2,7 @@ import { useState } from 'react';
|
||||
import { Bot, History, MessageSquare, Plus, Terminal, X } from 'lucide-react';
|
||||
import type { Chat, WorkspacePane } from '@/api/types';
|
||||
import { StatusDot } from '@/components/StatusDot';
|
||||
import { ChatThroughput } from '@/components/ChatThroughput';
|
||||
import {
|
||||
ContextMenu,
|
||||
ContextMenuContent,
|
||||
@@ -99,6 +100,7 @@ export function ChatTabBar({
|
||||
>
|
||||
<MessageSquare size={12} className="shrink-0" />
|
||||
<StatusDot chatId={chat.id} />
|
||||
<ChatThroughput chatId={chat.id} />
|
||||
{renamingId === chat.id ? (
|
||||
<input
|
||||
autoFocus
|
||||
|
||||
28
apps/web/src/components/ChatThroughput.tsx
Normal file
28
apps/web/src/components/ChatThroughput.tsx
Normal file
@@ -0,0 +1,28 @@
|
||||
import { useChatStatus } from '@/hooks/useChatStatus';
|
||||
import { useChatThroughput } from '@/hooks/useChatThroughput';
|
||||
import { cn } from '@/lib/utils';
|
||||
|
||||
interface Props {
|
||||
chatId: string | null | undefined;
|
||||
className?: string;
|
||||
}
|
||||
|
||||
// v1.12.2: inline throughput readout. Renders next to StatusDot while the
|
||||
// chat is streaming or running a tool. Hidden in idle/error/waiting states
|
||||
// — the dot already communicates those.
|
||||
export function ChatThroughput({ chatId, className }: Props) {
|
||||
const status = useChatStatus(chatId);
|
||||
const t = useChatThroughput(chatId);
|
||||
if (!chatId || !t) return null;
|
||||
if (status !== 'streaming' && status !== 'tool_running') return null;
|
||||
const tps = t.tps != null && t.tps > 0 ? Math.round(t.tps) : null;
|
||||
const showCtx = t.ctx_used != null && t.ctx_max != null;
|
||||
if (tps === null && !showCtx) return null;
|
||||
return (
|
||||
<span className={cn('text-xs text-muted-foreground tabular-nums', className)}>
|
||||
{tps !== null && `${tps} tok/s`}
|
||||
{tps !== null && showCtx && ' · '}
|
||||
{showCtx && `${t.ctx_used!.toLocaleString()}/${t.ctx_max!.toLocaleString()}`}
|
||||
</span>
|
||||
);
|
||||
}
|
||||
@@ -13,6 +13,7 @@ import { toast } from 'sonner';
|
||||
import type { Chat, WorkspacePane } from '@/api/types';
|
||||
import { BottomSheet } from '@/components/BottomSheet';
|
||||
import { StatusDot } from '@/components/StatusDot';
|
||||
import { ChatThroughput } from '@/components/ChatThroughput';
|
||||
import {
|
||||
DropdownMenu,
|
||||
DropdownMenuContent,
|
||||
@@ -206,6 +207,7 @@ export function MobileTabSwitcher({
|
||||
>
|
||||
<span className="shrink-0 text-muted-foreground">{paneIcon(active?.kind ?? 'chat')}</span>
|
||||
<StatusDot chatId={activeChatId} />
|
||||
<ChatThroughput chatId={activeChatId} />
|
||||
<span className="truncate flex-1 text-left">{activeLabel}</span>
|
||||
<ChevronDown size={14} className="opacity-60 shrink-0" />
|
||||
</button>
|
||||
@@ -237,6 +239,7 @@ export function MobileTabSwitcher({
|
||||
>
|
||||
<span className="shrink-0 text-muted-foreground">{paneIcon(pane.kind)}</span>
|
||||
<StatusDot chatId={cid ?? null} />
|
||||
<ChatThroughput chatId={cid ?? null} />
|
||||
{renamingChatId === cid && cid ? (
|
||||
<input
|
||||
autoFocus
|
||||
|
||||
34
apps/web/src/components/StaleStreamBanner.tsx
Normal file
34
apps/web/src/components/StaleStreamBanner.tsx
Normal file
@@ -0,0 +1,34 @@
|
||||
interface Props {
|
||||
onRetry: () => void;
|
||||
onDiscard: () => void;
|
||||
}
|
||||
|
||||
// v1.12.3: shown when an assistant message has been 'streaming' for 60+
|
||||
// seconds without new tokens. Lives above ChatInput in ChatPane. Retry
|
||||
// discards the stuck row then resends the last user message; Discard just
|
||||
// clears the row and drops the dot to idle.
|
||||
export function StaleStreamBanner({ onRetry, onDiscard }: Props) {
|
||||
return (
|
||||
<div className="border border-amber-500/30 bg-amber-500/5 rounded-md p-3 mb-2 mx-4 flex items-center justify-between gap-2">
|
||||
<span className="text-sm text-muted-foreground">
|
||||
Previous response didn't complete.
|
||||
</span>
|
||||
<div className="flex gap-2">
|
||||
<button
|
||||
type="button"
|
||||
onClick={onRetry}
|
||||
className="text-xs px-2 py-1 rounded border border-border hover:bg-accent max-md:min-h-[44px] max-md:px-3"
|
||||
>
|
||||
Retry
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
onClick={onDiscard}
|
||||
className="text-xs px-2 py-1 rounded border border-border hover:bg-accent max-md:min-h-[44px] max-md:px-3"
|
||||
>
|
||||
Discard
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -5,6 +5,7 @@ import { api } from '@/api/client';
|
||||
import { useSessionStream } from '@/hooks/useSessionStream';
|
||||
import { MessageList } from '@/components/MessageList';
|
||||
import { ChatInput } from '@/components/ChatInput';
|
||||
import { StaleStreamBanner } from '@/components/StaleStreamBanner';
|
||||
import {
|
||||
DropdownMenu,
|
||||
DropdownMenuContent,
|
||||
@@ -44,6 +45,38 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
|
||||
|
||||
const chatMessages = stream.messages.filter((m) => m.chat_id === chatId);
|
||||
const streaming = chatMessages.some((m) => m.status === 'streaming');
|
||||
|
||||
// v1.12.3: stale-stream detection. Watches the (at most one) streaming
|
||||
// assistant row. If its content length doesn't grow for STALE_THRESHOLD_MS,
|
||||
// assume the upstream call is dead and surface the recovery banner. We use
|
||||
// content length as the activity signal because every token delta extends
|
||||
// it; last_seq isn't currently bumped per delta.
|
||||
const STALE_THRESHOLD_MS = 60_000;
|
||||
const streamingMsg = chatMessages.find((m) => m.status === 'streaming' && m.role === 'assistant');
|
||||
const streamingId = streamingMsg?.id ?? null;
|
||||
const streamingLen = streamingMsg?.content.length ?? 0;
|
||||
const lastActivityRef = useRef<{ id: string; len: number; at: number } | null>(null);
|
||||
const [stale, setStale] = useState(false);
|
||||
useEffect(() => {
|
||||
if (!streamingId) {
|
||||
lastActivityRef.current = null;
|
||||
setStale(false);
|
||||
return;
|
||||
}
|
||||
const prev = lastActivityRef.current;
|
||||
if (!prev || prev.id !== streamingId || prev.len !== streamingLen) {
|
||||
lastActivityRef.current = { id: streamingId, len: streamingLen, at: Date.now() };
|
||||
setStale(false);
|
||||
}
|
||||
const interval = setInterval(() => {
|
||||
const a = lastActivityRef.current;
|
||||
if (!a) return;
|
||||
if (Date.now() - a.at >= STALE_THRESHOLD_MS) {
|
||||
setStale(true);
|
||||
}
|
||||
}, 5_000);
|
||||
return () => clearInterval(interval);
|
||||
}, [streamingId, streamingLen]);
|
||||
// v1.11.5: per-chat model context limit comes from chat.model_context_limit
|
||||
// populated by GET /api/sessions/:id/chats. Threaded into ChatInput so
|
||||
// ContextBar can render a zero-state before the first assistant message.
|
||||
@@ -87,6 +120,45 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
|
||||
}
|
||||
}
|
||||
|
||||
const handleDiscardStale = useCallback(async () => {
|
||||
if (!streamingId) return;
|
||||
try {
|
||||
await api.chats.discardStale(chatId, streamingId);
|
||||
setStale(false);
|
||||
lastActivityRef.current = null;
|
||||
} catch (err) {
|
||||
// 409 (race) is benign — the row already terminated some other way.
|
||||
const msg = err instanceof Error ? err.message : 'discard failed';
|
||||
if (!msg.includes('409')) toast.error(msg);
|
||||
setStale(false);
|
||||
}
|
||||
}, [chatId, streamingId]);
|
||||
|
||||
const handleRetryStale = useCallback(async () => {
|
||||
if (!streamingId) return;
|
||||
const lastUser = [...chatMessages].reverse().find((m) => m.role === 'user' && m.kind === 'message');
|
||||
if (!lastUser) {
|
||||
toast.error('no prior user message to retry');
|
||||
return;
|
||||
}
|
||||
try {
|
||||
await api.chats.discardStale(chatId, streamingId);
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : 'discard failed';
|
||||
if (!msg.includes('409')) {
|
||||
toast.error(msg);
|
||||
return;
|
||||
}
|
||||
}
|
||||
setStale(false);
|
||||
lastActivityRef.current = null;
|
||||
try {
|
||||
await api.messages.send(chatId, lastUser.content);
|
||||
} catch (err) {
|
||||
toast.error(err instanceof Error ? err.message : 'retry send failed');
|
||||
}
|
||||
}, [chatId, streamingId, chatMessages]);
|
||||
|
||||
const handleForceSend = useCallback(async (content: string) => {
|
||||
const trimmed = content.trim();
|
||||
if (!trimmed) return;
|
||||
@@ -187,6 +259,13 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
|
||||
</div>
|
||||
)}
|
||||
|
||||
{stale && streamingId && (
|
||||
<StaleStreamBanner
|
||||
onRetry={() => void handleRetryStale()}
|
||||
onDiscard={() => void handleDiscardStale()}
|
||||
/>
|
||||
)}
|
||||
|
||||
<ChatInput
|
||||
disabled={false}
|
||||
projectId={projectId}
|
||||
|
||||
106
apps/web/src/hooks/useChatThroughput.ts
Normal file
106
apps/web/src/hooks/useChatThroughput.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
|
||||
// v1.12.2: live throughput stream consumer. Fed by useSessionStream when a
|
||||
// 'usage' WS frame lands. Renders next to StatusDot via ChatThroughput.
|
||||
//
|
||||
// Singleton + Set<setState> pattern mirrors useChatStatus so any component
|
||||
// can subscribe to any chatId without prop drilling.
|
||||
|
||||
export interface ThroughputSample {
|
||||
tps: number | null;
|
||||
ctx_used: number | null;
|
||||
ctx_max: number | null;
|
||||
}
|
||||
|
||||
interface Entry {
|
||||
ctx_used: number | null;
|
||||
ctx_max: number | null;
|
||||
completion_tokens: number | null;
|
||||
recorded_at: number;
|
||||
prev_completion_tokens: number | null;
|
||||
prev_recorded_at: number | null;
|
||||
tps: number | null;
|
||||
}
|
||||
|
||||
// Stale window. After this, useChatThroughput returns null — clears the
|
||||
// indicator after the stream ends without the next inference turn.
|
||||
const STALE_MS = 10_000;
|
||||
|
||||
const entries = new Map<string, Entry>();
|
||||
const subscribers = new Set<() => void>();
|
||||
|
||||
function notify(): void {
|
||||
for (const s of subscribers) {
|
||||
try { s(); } catch { /* swallow */ }
|
||||
}
|
||||
}
|
||||
|
||||
// v1.12.2: imported by useSessionStream's WS handler. Computes tps from the
|
||||
// gap between successive completion_tokens samples; first sample yields null
|
||||
// (we need two points). Skips zero-progress samples so a duplicate usage
|
||||
// frame doesn't push tps to 0.
|
||||
export function recordUsage(
|
||||
chatId: string,
|
||||
data: { completion_tokens: number | null; ctx_used: number | null; ctx_max: number | null },
|
||||
): void {
|
||||
const now = Date.now();
|
||||
const prev = entries.get(chatId);
|
||||
let tps: number | null = prev?.tps ?? null;
|
||||
if (
|
||||
prev &&
|
||||
data.completion_tokens != null &&
|
||||
prev.completion_tokens != null &&
|
||||
data.completion_tokens > prev.completion_tokens &&
|
||||
now > prev.recorded_at
|
||||
) {
|
||||
const dTokens = data.completion_tokens - prev.completion_tokens;
|
||||
const dSeconds = (now - prev.recorded_at) / 1000;
|
||||
tps = dTokens / dSeconds;
|
||||
}
|
||||
entries.set(chatId, {
|
||||
ctx_used: data.ctx_used,
|
||||
ctx_max: data.ctx_max,
|
||||
completion_tokens: data.completion_tokens,
|
||||
recorded_at: now,
|
||||
prev_completion_tokens: prev?.completion_tokens ?? null,
|
||||
prev_recorded_at: prev?.recorded_at ?? null,
|
||||
tps,
|
||||
});
|
||||
notify();
|
||||
}
|
||||
|
||||
export function clearThroughput(chatId: string): void {
|
||||
if (entries.delete(chatId)) notify();
|
||||
}
|
||||
|
||||
// Periodic sweep: re-notify so stale entries fall off the UI when the
|
||||
// stream ends without a follow-up frame. Light — one timer for the whole app.
|
||||
const G = globalThis as Record<string, unknown>;
|
||||
if (!G.__boocode_throughput_ticker) {
|
||||
G.__boocode_throughput_ticker = true;
|
||||
setInterval(() => {
|
||||
const now = Date.now();
|
||||
let touched = false;
|
||||
for (const [k, v] of entries) {
|
||||
if (now - v.recorded_at > STALE_MS) {
|
||||
entries.delete(k);
|
||||
touched = true;
|
||||
}
|
||||
}
|
||||
if (touched) notify();
|
||||
}, 2_000);
|
||||
}
|
||||
|
||||
export function useChatThroughput(chatId: string | null | undefined): ThroughputSample | null {
|
||||
const [, force] = useState({});
|
||||
useEffect(() => {
|
||||
const sub = () => force({});
|
||||
subscribers.add(sub);
|
||||
return () => { subscribers.delete(sub); };
|
||||
}, []);
|
||||
if (!chatId) return null;
|
||||
const entry = entries.get(chatId);
|
||||
if (!entry) return null;
|
||||
if (Date.now() - entry.recorded_at > STALE_MS) return null;
|
||||
return { tps: entry.tps, ctx_used: entry.ctx_used, ctx_max: entry.ctx_max };
|
||||
}
|
||||
@@ -3,6 +3,7 @@ import { toast } from 'sonner';
|
||||
import type { Message, WsFrame } from '@/api/types';
|
||||
import { api } from '@/api/client';
|
||||
import { sessionEvents } from './sessionEvents';
|
||||
import { recordUsage } from './useChatThroughput';
|
||||
|
||||
// session_renamed frame removed from WsFrame — it was declared but never
|
||||
// published on the per-session WS channel (server publishes via broker.publishUser
|
||||
@@ -125,6 +126,19 @@ function applyFrame(state: State, frame: WsFrame): State {
|
||||
);
|
||||
return { ...state, messages: next };
|
||||
}
|
||||
case 'usage': {
|
||||
// v1.12.2: live throughput. Side-effects into the module-level
|
||||
// singleton consumed by ChatThroughput; no message-state mutation.
|
||||
// chat_id is the optional ws-frame field; usage frames always include it.
|
||||
if (frame.chat_id) {
|
||||
recordUsage(frame.chat_id, {
|
||||
completion_tokens: frame.completion_tokens,
|
||||
ctx_used: frame.ctx_used,
|
||||
ctx_max: frame.ctx_max,
|
||||
});
|
||||
}
|
||||
return state;
|
||||
}
|
||||
case 'messages_deleted': {
|
||||
const removeSet = new Set(frame.message_ids);
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user