// boocode/apps/server/src/services/inference/error-handler.ts

import type { MessageMetadata, Session } from '../../types/api.js';
import {
  decideHtmlArtifactWrite,
  detectHtmlArtifact,
  deriveHtmlTitle,
  HTML_ARTIFACT_MAX_BYTES,
} from '../artifacts.js';
import * as modelContext from '../model-context.js';
import { maybeFlagForCompaction } from './payload.js';
import { insertParts, partsFromAssistantMessage } from './parts.js';
import type { PartInsert } from './parts.js';
import { stripToolMarkup } from './tool-call-parser.js';
import type { InferenceContext, StreamResult, TurnArgs } from './types.js';

/**
 * Finalizes an assistant message whose stream ended early, either through an
 * abort (an `Error` named `'AbortError'`) or through any other thrown value.
 *
 * Steps, in order:
 *  1. Persist the accumulated content (tool markup stripped) on the message
 *     row with status `'cancelled'` (abort) or `'failed'` (anything else).
 *     Failures additionally store a structured error metadata blob.
 *  2. Bump the session's `updated_at` and, when the session row was found,
 *     publish `session_updated`.
 *  3. Publish `chat_status` plus either `message_complete` (abort) or `error`
 *     (failure), and log the outcome.
 *
 * A missing session row (the UPDATE ... RETURNING yields no rows) is logged
 * and skipped rather than thrown, so the status and terminal frames in step 3
 * are still delivered.
 *
 * @param ctx - Inference context supplying `sql`, `publish`, `publishUser` and `log`.
 * @param args - Identifiers of the turn being finalized.
 * @param accumulated - Content streamed before the abort/error.
 * @param err - The thrown value that ended the stream.
 */
export async function handleAbortOrError(
  ctx: InferenceContext,
  args: TurnArgs,
  accumulated: string,
  err: unknown
): Promise<void> {
  const { sessionId, chatId, assistantMessageId } = args;
  const isAbort = err instanceof Error && err.name === 'AbortError';
  const finalStatus = isAbort ? 'cancelled' : 'failed';
  const errMsg = err instanceof Error ? err.message : String(err);
  // Work on a local instead of reassigning the parameter.
  const content = stripToolMarkup(accumulated, { final: true });
  // v1.8.2: persist a structured error metadata blob on genuine failures so
  // the bubble can render the reason on reload without re-deriving from the
  // (one-shot) WS error frame. User-initiated abort skips this — there's no
  // "reason" to surface for a stop the user already explicitly chose.
  const errorMetadata: MessageMetadata | null = isAbort
    ? null
    : { kind: 'error', error_reason: 'llm_provider_error', error_text: errMsg };
  if (errorMetadata) {
    await ctx.sql`
      UPDATE messages
      SET status = ${finalStatus},
          content = ${content},
          finished_at = clock_timestamp(),
          metadata = ${ctx.sql.json(errorMetadata as never)}
      WHERE id = ${assistantMessageId}
    `;
  } else {
    await ctx.sql`
      UPDATE messages
      SET status = ${finalStatus},
          content = ${content},
          finished_at = clock_timestamp()
      WHERE id = ${assistantMessageId}
    `;
  }
  const [failSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
    UPDATE sessions SET updated_at = clock_timestamp()
    WHERE id = ${sessionId}
    RETURNING project_id, name, updated_at
  `;
  if (failSessRow) {
    ctx.publishUser({
      type: 'session_updated',
      session_id: sessionId,
      project_id: failSessRow.project_id,
      name: failSessRow.name,
      updated_at: failSessRow.updated_at,
    });
  } else {
    // The UPDATE matched no session row. Throwing here would skip the
    // chat_status and message_complete/error frames below, so only the
    // session_updated notification is dropped.
    ctx.log.warn(
      { sessionId, chatId, assistantMessageId },
      'session row not found while finalizing aborted/failed inference; skipping session_updated',
    );
  }
  // v1.8 mobile-tabs: cancellation is a user-initiated stop, treat as idle;
  // genuine errors flip the dot red. v1.8.2: error path also carries a
  // machine-readable `reason` so the UI can render specifics inline.
  if (isAbort) {
    // v1.12.1: defensive cancellation write. The status=${finalStatus} UPDATE
    // above already sets 'cancelled' for the AbortError case, but a row can
    // leak as 'streaming' when the abort fires between the post-tool-phase
    // INSERT (executeToolPhase) and the next runAssistantTurn's stream setup,
    // bypassing the try/catch around executeStreamPhase. The status guard
    // makes this a no-op when the earlier write already landed.
    await ctx.sql`
      UPDATE messages
      SET status = 'cancelled', content = ${content}, finished_at = clock_timestamp()
      WHERE id = ${assistantMessageId} AND status = 'streaming'
    `;
    ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() });
    ctx.publish(sessionId, {
      type: 'message_complete',
      message_id: assistantMessageId,
      chat_id: chatId,
    });
    ctx.log.info({ sessionId, chatId, assistantMessageId }, 'inference cancelled');
  } else {
    ctx.publishUser({
      type: 'chat_status',
      chat_id: chatId,
      status: 'error',
      at: new Date().toISOString(),
      reason: 'llm_provider_error',
    });
    ctx.publish(sessionId, {
      type: 'error',
      message_id: assistantMessageId,
      chat_id: chatId,
      error: errMsg,
      reason: 'llm_provider_error',
    });
    // chatId included so the failure log carries the same ids as the cancel log.
    ctx.log.error({ err, sessionId, chatId, assistantMessageId }, 'inference failed');
  }
}

// P5: the success-finalize atom shared by the wrap-up summaries
// (sentinel-summaries.ts) and the synthesis pass (synthesisPipeline.ts). Both
// previously hand-rolled this exact ceremony — n_ctx lookup, the complete
// UPDATE (content/status/tokens/ctx/ctx_max/finished_at; NO model column), and
// the message_complete frame with the full token fields. Single-sourcing it
// means a message_complete frame-contract change lands in one place instead of
// silently skipping the summary/synthesis paths.
//
// `beforeComplete` runs AFTER the UPDATE and BEFORE the message_complete frame
// — synthesis uses it to write its kind='synthesis' part in the original order
// (UPDATE → insertParts → message_complete), preserving timing exactly.
//
// NOTE: finalizeCompletion does NOT use this — it additionally writes the
// `model` column, the text/reasoning/html_artifact parts, the compaction flag,
// and the session_updated bump, which this atom deliberately omits (the summary
// and synthesis paths handle those — or not — themselves).
/**
 * Marks a streamed message row as complete and announces it.
 *
 * Looks up the model's `n_ctx`, writes content/status/token columns on the
 * message row, runs the optional `beforeComplete` hook, then publishes a
 * `message_complete` frame on the session channel.
 *
 * @param ctx - Inference context supplying `sql` and `publish`.
 * @param opts - Row identifiers, final content, token counts, and an optional
 *   `beforeComplete` hook awaited between the UPDATE and the published frame.
 */
export async function finalizeStreamedRow(
  ctx: InferenceContext,
  opts: {
    sessionId: string;
    chatId: string;
    messageId: string;
    model: string;
    content: string;
    completionTokens: number | null;
    promptTokens: number | null;
    startedAt: string | null;
    cacheTokens?: number | null;
    reasoningTokens?: number | null;
    beforeComplete?: () => Promise<void>;
  },
): Promise<void> {
  // Shape of the columns read back from the UPDATE below.
  type FinalizedRow = {
    tokens_used: number | null;
    ctx_used: number | null;
    ctx_max: number | null;
    finished_at: string | null;
  };

  const { sessionId, chatId, messageId, model } = opts;
  // Optional counters are normalized to null once and reused for both the
  // column write and the published frame.
  const cacheTokens = opts.cacheTokens ?? null;
  const reasoningTokens = opts.reasoningTokens ?? null;

  // v1.11.3: see executeToolPhase for the rationale.
  const contextInfo = await modelContext.getModelContext(model);
  const contextWindow = contextInfo?.n_ctx ?? null;

  const [row] = await ctx.sql<FinalizedRow[]>`
    UPDATE messages
    SET content = ${opts.content},
        status = 'complete',
        tokens_used = ${opts.completionTokens},
        ctx_used = ${opts.promptTokens},
        ctx_max = ${contextWindow},
        cache_tokens = ${cacheTokens},
        reasoning_tokens = ${reasoningTokens},
        finished_at = clock_timestamp()
    WHERE id = ${messageId}
    RETURNING tokens_used, ctx_used, ctx_max, finished_at
  `;

  // The hook runs after the row is written and before the frame goes out.
  if (opts.beforeComplete) {
    await opts.beforeComplete();
  }

  // Token/ctx fields come from the row read back; they fall back to null
  // when the UPDATE returned nothing.
  const completionFrame = {
    type: 'message_complete',
    message_id: messageId,
    chat_id: chatId,
    tokens_used: row?.tokens_used ?? null,
    ctx_used: row?.ctx_used ?? null,
    ctx_max: row?.ctx_max ?? null,
    cache_tokens: cacheTokens,
    reasoning_tokens: reasoningTokens,
    started_at: opts.startedAt,
    finished_at: row?.finished_at ?? null,
    model,
  };
  ctx.publish(sessionId, completionFrame);
}

// P5: minimal empty-finalize for the mistake-escalate path. The escalate
// branch in runAssistantTurn stops the turn cap-hit-style; the next assistant
// row is still 'streaming', so it's finalized as an empty complete row (no
// tokens, no parts, no session bump — the escalate branch handles the sentinel
// + chat_status itself). Centralizing the status-column write + message_complete
// frame here keeps it next to the other finalize paths so a status-column
// change is found in one place.
/**
 * Finalizes the turn's assistant row as an empty, complete message.
 *
 * Writes `content = ''` / `status = 'complete'` / `finished_at` on the row,
 * then publishes a bare `message_complete` frame (no token fields) on the
 * session channel.
 *
 * @param ctx - Inference context supplying `sql` and `publish`.
 * @param args - Identifiers of the turn whose assistant row is finalized.
 */
export async function finalizeEmpty(
  ctx: InferenceContext,
  args: TurnArgs,
): Promise<void> {
  const targetSession = args.sessionId;
  const targetChat = args.chatId;
  const targetMessage = args.assistantMessageId;

  await ctx.sql`
    UPDATE messages
    SET content = '', status = 'complete', finished_at = clock_timestamp()
    WHERE id = ${targetMessage}
  `;

  const completionFrame = {
    type: 'message_complete',
    message_id: targetMessage,
    chat_id: targetChat,
  };
  ctx.publish(targetSession, completionFrame);
}

/**
 * Finalizes a successfully streamed assistant turn.
 *
 * Steps, in order:
 *  1. Strip tool markup from the streamed content and mark the message row
 *     `'complete'`, writing token counts, `ctx_max` and the `model` column.
 *  2. Insert the message parts derived from the content/reasoning, plus a
 *     sibling `html_artifact` part when HTML is detected and within the cap.
 *  3. Run the compaction check for the chat.
 *  4. Bump the session's `updated_at` and, when the session row was found,
 *     publish `session_updated`.
 *  5. Publish `chat_status: idle` and the full `message_complete` frame, then
 *     log the outcome.
 *
 * A missing session row (the UPDATE ... RETURNING yields no rows) is logged
 * and skipped rather than thrown, so the frames in step 5 are still delivered
 * after the message row has already been written as complete.
 *
 * @param ctx - Inference context supplying `sql`, `publish`, `publishUser` and `log`.
 * @param args - Identifiers of the turn being finalized.
 * @param result - Stream outcome: content, reasoning, finish reason and token counts.
 * @param startedAt - Value echoed as `started_at` in the `message_complete` frame.
 * @param session - Session whose `model` is recorded on the row and in the frame.
 */
export async function finalizeCompletion(
  ctx: InferenceContext,
  args: TurnArgs,
  result: StreamResult,
  startedAt: string | null,
  session: Session
): Promise<void> {
  const { sessionId, chatId, assistantMessageId } = args;
  const content = stripToolMarkup(result.content, { final: true });
  const { finishReason, promptTokens, completionTokens, cacheReadTokens, reasoningTokens } = result;

  // v1.11.3: see executeToolPhase for the rationale.
  const mctx = await modelContext.getModelContext(session.model);
  const nCtx = mctx?.n_ctx ?? null;

  const [updated] = await ctx.sql<
    { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
  >`
    UPDATE messages
    SET content = ${content},
        status = 'complete',
        tokens_used = ${completionTokens},
        ctx_used = ${promptTokens},
        ctx_max = ${nCtx},
        cache_tokens = ${cacheReadTokens ?? null},
        reasoning_tokens = ${reasoningTokens ?? null},
        model = ${session.model},
        finished_at = clock_timestamp()
    WHERE id = ${assistantMessageId}
    RETURNING tokens_used, ctx_used, ctx_max, finished_at
  `;
  // v1.13.0: dual-write the text part. finalizeCompletion is the terminal
  // path for text-only assistant turns (no tool calls); tool_calls are null
  // here by construction (the tool-bearing path goes through executeToolPhase).
  // v1.13.1-C: include result.reasoning so reasoning-channel models capture
  // a kind='reasoning' part alongside the text.
  // TODO(v1.13.1): wrap the UPDATE above and this insertParts in a single
  // sql.begin before flipping read authority to message_parts.
  const baseParts: PartInsert[] = partsFromAssistantMessage({
    content,
    tool_calls: null,
    reasoning: result.reasoning,
  }).map((p) => ({
    ...p,
    message_id: assistantMessageId,
  }));
  // v1.14.x-html-artifact-panes: opportunistic HTML detection. Adds a
  // SIBLING html_artifact part — never replaces the text part. 1MB cap is
  // graceful: oversized payloads are skipped and the assistant message
  // lands as plain content (warn logged).
  const htmlContent = detectHtmlArtifact(content);
  if (htmlContent !== null) {
    const decision = decideHtmlArtifactWrite(htmlContent);
    if (!decision.write) {
      ctx.log.warn(
        { assistantMessageId, byteLen: decision.byteLen, cap: HTML_ARTIFACT_MAX_BYTES },
        'html_artifact exceeded 1MB cap; skipping artifact part',
      );
    } else {
      const title = deriveHtmlTitle(htmlContent);
      // Place the artifact after the highest existing sequence (0 when there
      // are no other parts).
      const nextSeq = baseParts.reduce((m, p) => Math.max(m, p.sequence), -1) + 1;
      baseParts.push({
        message_id: assistantMessageId,
        sequence: nextSeq,
        kind: 'html_artifact',
        payload: {
          html_content: htmlContent,
          char_count: htmlContent.length,
          title,
        },
      });
    }
  }
  await insertParts(ctx.sql, baseParts);
  // v1.11: flag for compaction on the terminal turn too. Catches the common
  // case of a turn that hit the limit without invoking tools.
  await maybeFlagForCompaction(ctx, chatId, updated);
  const [completeSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
    UPDATE sessions SET updated_at = clock_timestamp()
    WHERE id = ${sessionId}
    RETURNING project_id, name, updated_at
  `;
  if (completeSessRow) {
    ctx.publishUser({
      type: 'session_updated',
      session_id: sessionId,
      project_id: completeSessRow.project_id,
      name: completeSessRow.name,
      updated_at: completeSessRow.updated_at,
    });
  } else {
    // The UPDATE matched no session row. The message row is already written
    // as complete, so throwing here would skip the idle status and the
    // message_complete frame; only the session_updated notification is dropped.
    ctx.log.warn(
      { sessionId, chatId, assistantMessageId },
      'session row not found while finalizing completed inference; skipping session_updated',
    );
  }
  ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() });
  ctx.publish(sessionId, {
    type: 'message_complete',
    message_id: assistantMessageId,
    chat_id: chatId,
    tokens_used: updated?.tokens_used ?? null,
    ctx_used: updated?.ctx_used ?? null,
    ctx_max: updated?.ctx_max ?? null,
    cache_tokens: cacheReadTokens ?? null,
    reasoning_tokens: reasoningTokens ?? null,
    started_at: startedAt,
    finished_at: updated?.finished_at ?? null,
    model: session.model,
  });
  ctx.log.info(
    {
      sessionId,
      chatId,
      assistantMessageId,
      finishReason,
      chars: content.length,
      tokens_used: updated?.tokens_used,
      ctx_used: updated?.ctx_used,
    },
    'inference complete'
  );
}