feat: post-review backlog hardening (cancel/parser/stall/history/9502)

Five independent items from the post-review backlog. F1: Stop on an external
agent task now aborts the running child via a per-task AbortController registry
reachable from the cancel route, and finalizes the assistant message as
cancelled (fixing two latent bugs — catch blocks left the message streaming,
and warm success-paths wrote complete on an aborted turn); warm pools/worktrees
are preserved and the native path is unchanged. F2/F3: prune the tool-call
parser to its two load-bearing exports (unexport eight zero-caller symbols, add
a gate test for the <invoke>-as-text fallback) and route placeholder-rejection
logging through pino. F6: a 90s per-chunk stall-timeout wraps native inference's
fullStream via AbortSignal.any so a hung stream finalizes the message instead of
hanging — no retry (a pure classifyStreamError helper is added). F7: a read-only
view_session_history MCP tool (newest-N, chronological). F9: retire the unused
apps/coder/web :9502 fallback SPA, keeping every API/WS/health/MCP route.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-03 02:23:11 +00:00
parent 9a139633b8
commit f32fd928b3
48 changed files with 1014 additions and 2254 deletions

View File

@@ -22,6 +22,12 @@ import { shouldUseClaudeSdk } from './backends/claude-sdk-routing.js';
import type { AgentBackend, AgentEvent } from './agent-backend.js';
import { publishAgentStatus } from './agent-status-publish.js';
import type { AgentStatus } from './normalize-agent-status.js';
import { createCancelRegistry } from './cancel-registry.js';
import {
finalizeStreamingMessage,
classifyTerminalStatus,
type TerminalMessageStatus,
} from './finalize-message.js';
interface InferenceRunner {
enqueue: (sessionId: string, chatId: string, assistantId: string, user: string) => void;
@@ -43,7 +49,11 @@ interface Deps {
const POLL_INTERVAL_MS = 2_000;
const COMPLETION_POLL_MS = 2_000;
export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<void> } {
export function createDispatcher(deps: Deps): {
cancelExternalTask(taskId: string): boolean;
start(): void;
stop(): Promise<void>;
} {
const { sql, inference, broker, log, config } = deps;
let timer: ReturnType<typeof setInterval> | null = null;
let listener: { unlisten: () => Promise<void> } | null = null;
@@ -55,6 +65,13 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
// turn at a time.
const inflight = new Map<string, Promise<void>>();
// F1: per-task abort registry. Each external run-function registers its per-turn
// AbortController here (keyed by task id); the cancel route reaches it through the
// exported `cancelExternalTask`; the run's `.finally` deletes the entry. Native
// boocode tasks are never registered, so a Stop on one returns false and falls
// through to the unchanged inference.cancel path.
const taskControllers = createCancelRegistry();
// Shared entry point for both the poll timer and the NOTIFY listener. poll()'s
// `polling`/`stopping` guard makes this safe to call concurrently — a notify
// arriving mid-poll returns immediately and never double-dispatches.
@@ -83,6 +100,40 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
publishAgentStatus(broker.publishFrame, sessionId, chatId, agent, status, reason);
}
// F1 (OCE-001/OCE-002): finalize a streaming assistant message into a terminal
// state and publish the matching message_complete frame. Best-effort + idempotent
// (the helper's `WHERE status='streaming'` guard) — a failure here must never mask
// the original abort/error, so it logs and swallows.
function finalizeMessage(
sessionId: string,
chatId: string,
assistantId: string,
status: TerminalMessageStatus,
model: string | null,
content?: string,
): Promise<boolean> {
return finalizeStreamingMessage(sql, broker.publishFrame, {
sessionId,
chatId,
assistantId,
status,
model,
content,
}).catch((err) => {
log.error({ err: err instanceof Error ? err.message : String(err), assistantId }, 'dispatcher: finalizeStreamingMessage failed');
return false;
});
}
// F1: the cancel route's reach into an in-flight external run. Idempotent — a
// double-Stop re-aborts an already-aborted controller (no-op) and a Stop on a
// finished/native task returns false. Aborting only fires the backend's per-turn
// cancel (session.abort / session/cancel / interrupt / child.kill); it never kills
// a warm pool process, so persistent worktrees + pooled backends are preserved.
function cancelExternalTask(taskId: string): boolean {
return taskControllers.cancel(taskId);
}
async function poll(): Promise<void> {
// `polling` serializes poll() execution itself (timer + NOTIFY can fire
// concurrently) so we never double-select a task. It does NOT serialize task
@@ -116,6 +167,9 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
// with the same key is skipped and a concurrent poll can't re-pick it.
const p = runTask(task).finally(() => {
inflight.delete(key);
// F1: drop the abort controller once the run settles. After this, a Stop
// on the (now-finished) task returns false — cancel-after-exit is safe.
taskControllers.delete(task.id);
});
inflight.set(key, p);
}
@@ -312,13 +366,16 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
return;
}
// Create an abort controller for this task
const ac = new AbortController();
// F1: register the per-task abort controller so a Stop reaches this run.
const ac = taskControllers.register(taskId);
// #10: hoisted above the try so the catch block can report `error` status with
// the (chat, agent) key. Empty until resolved below; guarded before use.
let sessionId = '';
let chatId = '';
// F1: hoisted so the catch / abort short-circuit can finalize the streaming
// assistant row. Empty until the row is created; finalize no-ops on ''.
let assistantId = '';
try {
// Mark running
@@ -384,7 +441,7 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
VALUES (${sessionId}, ${chatId}, 'assistant', '', 'streaming', ${task.model}, clock_timestamp())
RETURNING id
`;
const assistantId = assistantMsg!.id;
assistantId = assistantMsg!.id;
// write-edit-robustness #4: pre-turn worktree checkpoint (best-effort; a
// failure logs and never breaks dispatch). This path uses a per-task worktree
@@ -526,6 +583,20 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
}
}
// F1: abort short-circuit BEFORE the unconditional 'complete' write. A Stop
// (cancelExternalTask → ac.abort) or shutdown finalizes the streaming row as
// 'cancelled' (keeping whatever streamed) instead of recording 'complete',
// and skips the diff. This one-shot path owns a per-task worktree, so we DO
// tear it down here (unlike the warm paths, which keep their persistent one).
if (ac.signal.aborted || stopping) {
await finalizeMessage(sessionId, chatId, assistantId, 'cancelled', task.model, assistantContent);
await sql`UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}`;
emitAgentStatus(sessionId, chatId, agent, 'idle', stopping ? 'shutdown' : 'cancelled');
await cleanupWorktree(projectPath, taskId);
clearTaskCommands(taskId);
return;
}
await sql`
UPDATE messages
SET content = ${assistantContent}, status = 'complete', finished_at = clock_timestamp()
@@ -539,14 +610,6 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
model: task.model,
} as WsFrame);
if (stopping) {
await sql`
UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}
`;
await cleanupWorktree(projectPath, taskId);
return;
}
// Step 3: Diff the worktree and queue pending changes
log.info({ taskId }, 'dispatcher: diffing worktree');
const diff = await diffWorktree(worktreePath, projectPath, { signal: ac.signal });
@@ -587,18 +650,26 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
} catch (err) {
const errMsg = err instanceof Error ? err.message : String(err);
const status = classifyTerminalStatus({ aborted: ac.signal.aborted, error: err });
log.error({ taskId, agent, err: errMsg }, 'dispatcher: external agent error');
// Guard `NOT IN ('cancelled','completed')` so a genuine error in the catch
// never overwrites a state the cancel route already wrote (user-Stop wins).
await sql`
UPDATE tasks
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId}
SET state = ${status}, ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId} AND state NOT IN ('cancelled', 'completed')
`.catch(() => {});
// F1 (OCE-001): finalize the streaming assistant message — the catch
// previously updated only `tasks` and left the message 'streaming' forever
// (the BooChat 5-min sweep runs in a different process and can't reach it).
await finalizeMessage(sessionId, chatId, assistantId, status, task.model);
// #10: external-agent turn failed/crashed. chatId may be unbound if the throw
// preceded its assignment — guard so the status publish never masks the real
// error.
if (chatId) emitAgentStatus(sessionId, chatId, agent, 'error', 'failed');
if (chatId) emitAgentStatus(sessionId, chatId, agent, status === 'cancelled' ? 'idle' : 'error', status === 'cancelled' ? 'cancelled' : 'failed');
// Best-effort cleanup
await cleanupWorktree(projectPath, taskId);
@@ -652,11 +723,14 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
return;
}
const ac = new AbortController();
// F1: register the per-task abort controller so a Stop reaches this run.
const ac = taskControllers.register(taskId);
// #10: hoisted so the catch can report `error` with the (chat, agent) key.
let sessionId = '';
let chatId = '';
// F1: hoisted so the catch / abort short-circuit can finalize the streaming row.
let assistantId = '';
try {
// execution_path = 'acp' — the schema CHECK has no 'opencode_server' value
@@ -728,7 +802,7 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
VALUES (${sessionId}, ${chatId}, 'assistant', '', 'streaming', ${task.model}, clock_timestamp())
RETURNING id
`;
const assistantId = assistantMsg!.id;
assistantId = assistantMsg!.id;
// write-edit-robustness #4: pre-turn checkpoint of the persistent session
// worktree (best-effort; never breaks dispatch). worktreeId comes from the
@@ -856,6 +930,18 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
await persistExternalAgentTurn(sql, assistantId, [...toolSnaps.values()], reasoningText);
// F1: abort short-circuit BEFORE the unconditional 'complete' write — fixes
// the warm success-path recording 'complete' on a Stop'd turn. The abort fired
// session.abort on the prompt only: the persistent session worktree is kept
// (no cleanup) and the pooled opencode server stays warm for the next turn.
if (ac.signal.aborted || stopping) {
await finalizeMessage(sessionId, chatId, assistantId, 'cancelled', task.model, assistantContent);
await sql`UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}`;
emitAgentStatus(sessionId, chatId, agent, 'idle', stopping ? 'shutdown' : 'cancelled');
clearTaskCommands(taskId);
return; // worktree persists (no cleanup); backend stays warm
}
await sql`
UPDATE messages
SET content = ${assistantContent}, status = 'complete', finished_at = clock_timestamp()
@@ -868,11 +954,6 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
model: task.model,
} as WsFrame);
if (stopping) {
await sql`UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}`;
return; // worktree persists (no cleanup); backend stays warm
}
// 1.10: diff the persistent worktree against its captured baseline and
// SUPERSEDE the session's prior pending row (latest-wins, one accumulating
// diff) instead of stacking. Stamp agent for DiffPanel attribution.
@@ -920,14 +1001,17 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
clearTaskCommands(taskId);
} catch (err) {
const errMsg = err instanceof Error ? err.message : String(err);
const status = classifyTerminalStatus({ aborted: ac.signal.aborted, error: err });
log.error({ taskId, agent, err: errMsg }, 'dispatcher: opencode server error');
await sql`
UPDATE tasks
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId}
SET state = ${status}, ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId} AND state NOT IN ('cancelled', 'completed')
`.catch(() => {});
// F1 (OCE-001): finalize the streaming message (was left 'streaming').
await finalizeMessage(sessionId, chatId, assistantId, status, task.model);
// #10: turn crashed.
if (chatId) emitAgentStatus(sessionId, chatId, agent, 'error', 'crashed');
if (chatId) emitAgentStatus(sessionId, chatId, agent, status === 'cancelled' ? 'idle' : 'error', status === 'cancelled' ? 'cancelled' : 'crashed');
clearTaskCommands(taskId);
// No worktree cleanup (persistent); backend stays warm for the next turn.
}
@@ -988,7 +1072,10 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
return;
}
const ac = new AbortController();
// F1: register the per-task abort controller so a Stop reaches this run.
const ac = taskControllers.register(taskId);
// F1: hoisted so the catch / abort short-circuit can finalize the streaming row.
let assistantId = '';
try {
await sql`
@@ -1010,7 +1097,7 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
VALUES (${sessionId}, ${chatId}, 'assistant', '', 'streaming', ${task.model}, clock_timestamp())
RETURNING id
`;
const assistantId = assistantMsg!.id;
assistantId = assistantMsg!.id;
// write-edit-robustness #4: pre-turn checkpoint of the persistent session
// worktree (best-effort; never breaks dispatch). Same worktree the opencode
@@ -1121,6 +1208,18 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
await persistExternalAgentTurn(sql, assistantId, [...toolSnaps.values()], reasoningText);
// F1: abort short-circuit BEFORE the unconditional 'complete' write — fixes
// the warm success-path recording 'complete' on a Stop'd turn. The abort fired
// session/cancel on the warm connection only (never killed the child), so the
// persistent worktree is kept and the pooled (chat,agent) backend stays warm.
if (ac.signal.aborted || stopping) {
await finalizeMessage(sessionId, chatId, assistantId, 'cancelled', task.model, assistantContent);
await sql`UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}`;
emitAgentStatus(sessionId, chatId, agent, 'idle', stopping ? 'shutdown' : 'cancelled');
clearTaskCommands(taskId);
return; // worktree persists (no cleanup); backend stays warm
}
await sql`
UPDATE messages
SET content = ${assistantContent}, status = 'complete', finished_at = clock_timestamp()
@@ -1133,11 +1232,6 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
model: task.model,
} as WsFrame);
if (stopping) {
await sql`UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}`;
return; // worktree persists (no cleanup); backend stays warm
}
// Diff the persistent worktree against its captured baseline and SUPERSEDE
// the session's prior pending row (latest-wins) — identical to opencode.
const diff = await diffWorktree(worktreePath, projectPath, {
@@ -1184,14 +1278,17 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
clearTaskCommands(taskId);
} catch (err) {
const errMsg = err instanceof Error ? err.message : String(err);
const status = classifyTerminalStatus({ aborted: ac.signal.aborted, error: err });
log.error({ taskId, agent, err: errMsg }, 'dispatcher: warm ACP error');
await sql`
UPDATE tasks
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId}
SET state = ${status}, ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId} AND state NOT IN ('cancelled', 'completed')
`.catch(() => {});
// F1 (OCE-001): finalize the streaming message (was left 'streaming').
await finalizeMessage(sessionId, chatId, assistantId, status, task.model);
// #10: turn crashed.
emitAgentStatus(sessionId, chatId, agent, 'error', 'crashed');
emitAgentStatus(sessionId, chatId, agent, status === 'cancelled' ? 'idle' : 'error', status === 'cancelled' ? 'cancelled' : 'crashed');
clearTaskCommands(taskId);
// No worktree cleanup (persistent); backend stays warm for the next turn.
}
@@ -1245,7 +1342,10 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
return;
}
const ac = new AbortController();
// F1: register the per-task abort controller so a Stop reaches this run.
const ac = taskControllers.register(taskId);
// F1: hoisted so the catch / abort short-circuit can finalize the streaming row.
let assistantId = '';
try {
await sql`
@@ -1267,7 +1367,7 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
VALUES (${sessionId}, ${chatId}, 'assistant', '', 'streaming', ${task.model}, clock_timestamp())
RETURNING id
`;
const assistantId = assistantMsg!.id;
assistantId = assistantMsg!.id;
// write-edit-robustness #4: pre-turn checkpoint of the persistent session
// worktree (best-effort; never breaks dispatch).
@@ -1376,6 +1476,18 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
await persistExternalAgentTurn(sql, assistantId, [...toolSnaps.values()], reasoningText);
// F1: abort short-circuit BEFORE the unconditional 'complete' write — fixes
// the warm success-path recording 'complete' on a Stop'd turn. The abort fired
// the SDK interrupt on the same query generator only (never killed the warm
// process), so the persistent worktree is kept and the backend stays warm.
if (ac.signal.aborted || stopping) {
await finalizeMessage(sessionId, chatId, assistantId, 'cancelled', task.model, assistantContent);
await sql`UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}`;
emitAgentStatus(sessionId, chatId, agent, 'idle', stopping ? 'shutdown' : 'cancelled');
clearTaskCommands(taskId);
return; // worktree persists (no cleanup); backend stays warm
}
// ctx_used/ctx_max from the SDK result (1M-aware) → the assistant message, so
// the ContextBar renders a real context-window fill for claude.
await sql`
@@ -1391,11 +1503,6 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
model: task.model,
} as WsFrame);
if (stopping) {
await sql`UPDATE tasks SET state = 'cancelled', ended_at = clock_timestamp() WHERE id = ${taskId}`;
return; // worktree persists (no cleanup); backend stays warm
}
// Diff the persistent worktree against its captured baseline and SUPERSEDE
// the session's prior pending row (latest-wins) — identical to opencode/ACP.
const diff = await diffWorktree(worktreePath, projectPath, {
@@ -1442,14 +1549,17 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
clearTaskCommands(taskId);
} catch (err) {
const errMsg = err instanceof Error ? err.message : String(err);
const status = classifyTerminalStatus({ aborted: ac.signal.aborted, error: err });
log.error({ taskId, agent, err: errMsg }, 'dispatcher: claude SDK error');
await sql`
UPDATE tasks
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId}
SET state = ${status}, ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
WHERE id = ${taskId} AND state NOT IN ('cancelled', 'completed')
`.catch(() => {});
// F1 (OCE-001): finalize the streaming message (was left 'streaming').
await finalizeMessage(sessionId, chatId, assistantId, status, task.model);
// #10: turn crashed.
emitAgentStatus(sessionId, chatId, agent, 'error', 'crashed');
emitAgentStatus(sessionId, chatId, agent, status === 'cancelled' ? 'idle' : 'error', status === 'cancelled' ? 'cancelled' : 'crashed');
clearTaskCommands(taskId);
// No worktree cleanup (persistent); backend stays warm for the next turn.
}
@@ -1476,6 +1586,7 @@ export function createDispatcher(deps: Deps): { start(): void; stop(): Promise<v
}
return {
cancelExternalTask,
start() {
log.info('dispatcher: starting poll loop + tasks_new listener');