From 8fa7b7fce930a6638bd055435ebd053a69bb2f12 Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Thu, 21 May 2026 22:09:50 +0000 Subject: [PATCH] v1.12.4-rc2: extract payload + error-handler from inference.ts - payload.ts: buildMessagesPayload (re-exported), loadContext, maybeFlagForCompaction - error-handler.ts: handleAbortOrError, finalizeCompletion Both new files type-import InferenceContext/StreamResult/TurnArgs from inference.ts; ESM elides type imports so there's no runtime cycle. handleAbortOrError turned out not to call the summary functions, so no back-edge needed. inference.ts shrinks from ~1676 to ~1401 LoC. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/server/src/services/inference.ts | 309 +----------------- .../src/services/inference/error-handler.ts | 148 +++++++++ apps/server/src/services/inference/payload.ts | 155 +++++++++ 3 files changed, 320 insertions(+), 292 deletions(-) create mode 100644 apps/server/src/services/inference/error-handler.ts create mode 100644 apps/server/src/services/inference/payload.ts diff --git a/apps/server/src/services/inference.ts b/apps/server/src/services/inference.ts index 8698d2e..662b17e 100644 --- a/apps/server/src/services/inference.ts +++ b/apps/server/src/services/inference.ts @@ -23,15 +23,10 @@ import { getAgentById } from './agents.js'; import * as compaction from './compaction.js'; import * as modelContext from './model-context.js'; import type { Broker } from './broker.js'; -// v1.12: prompt assembly extracted to its own module. buildSystemPrompt is -// async (awaits the container-guidance loader) — buildMessagesPayload below -// is therefore async too, and its three call sites in this file await it. -import { buildSystemPrompt } from './system-prompt.js'; import { resolveToolBudget } from './inference/budget.js'; import { DOOM_LOOP_THRESHOLD, detectDoomLoop, - isAnySentinel, } from './inference/sentinels.js'; import { XML_TOOL_CLOSE, @@ -39,10 +34,21 @@ import { parseXmlToolCall, partialXmlOpenerStart, } from './inference/xml-parser.js'; +import { + buildMessagesPayload, + loadContext, + maybeFlagForCompaction, + type OpenAiMessage, +} from './inference/payload.js'; +import { + finalizeCompletion, + handleAbortOrError, +} from './inference/error-handler.js'; // v1.12.4: re-exported so external callers (tests, future consumers) keep // importing from services/inference.js as the public surface. export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './inference/sentinels.js'; +export { buildMessagesPayload } from './inference/payload.js'; const DB_FLUSH_INTERVAL_MS = 500; @@ -101,17 +107,6 @@ export interface InferenceFrame { export type FramePublisher = (sessionId: string, frame: InferenceFrame) => void; -interface OpenAiMessage { - role: 'system' | 'user' | 'assistant' | 'tool'; - content: string | null; - tool_calls?: Array<{ - id: string; - type: 'function'; - function: { name: string; arguments: string }; - }>; - tool_call_id?: string; -} - interface ChatCompletionDelta { role?: string; content?: string | null; @@ -149,140 +144,14 @@ export interface InferenceContext { broker: Broker; } +// v1.12.4: payload assembly extracted to ./inference/payload.ts — +// buildMessagesPayload, loadContext, maybeFlagForCompaction, and the +// OpenAiMessage shape live there now. Re-exported below to preserve the +// public surface (tests import buildMessagesPayload from this module). // v1.12: buildSystemPrompt moved to services/system-prompt.ts. See that // module for the resolution order doc and the container-guidance layer. // buildMessagesPayload is async now because buildSystemPrompt awaits the // guidance cache lookup. -export async function buildMessagesPayload( - session: Session, - project: Project, - history: Message[], - agent: Agent | null = null -): Promise { - const out: OpenAiMessage[] = []; - const systemPrompt = await buildSystemPrompt(project, session, agent); - out.push({ role: 'system', content: systemPrompt }); - - // Find the latest compact marker — only send messages from that point onwards - let startIdx = 0; - for (let i = history.length - 1; i >= 0; i--) { - if (history[i]!.kind === 'compact') { - startIdx = i; - break; - } - } - - for (let i = startIdx; i < history.length; i++) { - const m = history[i]!; - if (m.kind === 'compact') { - out.push({ role: 'system', content: m.content }); - continue; - } - // v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never - // send them to the LLM. The synthetic instruction note lives only inside - // the summary call's messages array and is never persisted, so on a - // follow-up turn the model resumes with a clean context. - if (isAnySentinel(m)) continue; - if (m.role === 'assistant' && m.status === 'streaming') continue; - if (m.role === 'assistant' && m.status === 'cancelled') continue; - if (m.role === 'tool') { - const tr = m.tool_results; - if (!tr) continue; - const outputText = tr.error - ? `error: ${tr.error}` - : typeof tr.output === 'string' - ? tr.output - : JSON.stringify(tr.output); - out.push({ - role: 'tool', - content: outputText, - tool_call_id: tr.tool_call_id, - }); - continue; - } - if (m.role === 'assistant') { - const msg: OpenAiMessage = { - role: 'assistant', - content: m.content && m.content.length > 0 ? m.content : null, - }; - if (m.tool_calls && m.tool_calls.length > 0) { - msg.tool_calls = m.tool_calls.map((tc) => ({ - id: tc.id, - type: 'function' as const, - function: { name: tc.name, arguments: JSON.stringify(tc.args) }, - })); - } - out.push(msg); - continue; - } - out.push({ role: 'user', content: m.content }); - } - return out; -} - -async function loadContext( - sql: Sql, - sessionId: string, - chatId: string -): Promise<{ session: Session; project: Project; history: Message[] } | null> { - const sessionRows = await sql` - SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at, - agent_id, web_search_enabled - FROM sessions WHERE id = ${sessionId} - `; - if (sessionRows.length === 0) return null; - const session = sessionRows[0]!; - - const projectRows = await sql` - SELECT id, name, path, added_at, last_session_id, status, gitea_remote, - default_system_prompt, default_web_search_enabled - FROM projects WHERE id = ${session.project_id} - `; - if (projectRows.length === 0) return null; - const project = projectRows[0]!; - - // v1.11: filter compacted messages out of the inference assembly. The GET - // /api/sessions/:id/messages endpoint still returns everything (so the UI - // can show history with the summary card inline); only LLM payloads skip - // compacted rows. compacted_at IS NULL keeps the active summary + tail. - const history = await sql` - SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq, - tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata - FROM messages - WHERE chat_id = ${chatId} AND compacted_at IS NULL - ORDER BY created_at ASC, id ASC - `; - - return { session, project, history }; -} - -// v1.11: shared helper used after both finalizeCompletion and executeToolPhase -// persist their token counts. Reads tokens off the just-UPDATEd row (which -// the caller returns from RETURNING), runs compaction.isOverflow, and flips -// chats.needs_compaction. The next runAssistantTurn invocation acts on it. -// Silent on missing tokens — llama-swap occasionally omits usage on truncated -// streams, and we'd rather miss one overflow than crash the inference path. -async function maybeFlagForCompaction( - ctx: InferenceContext, - chatId: string, - updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined, -): Promise { - if (!updated) return; - const promptTokens = updated.ctx_used; - const completionTokens = updated.tokens_used; - const contextLimit = updated.ctx_max; - if (typeof promptTokens !== 'number') return; - if (typeof completionTokens !== 'number') return; - if (typeof contextLimit !== 'number') return; - const overflow = compaction.isOverflow( - { prompt_tokens: promptTokens, completion_tokens: completionTokens }, - contextLimit, - ); - if (!overflow) return; - await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`; - ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction'); -} - async function* sseLines(stream: ReadableStream): AsyncGenerator { const reader = stream.getReader(); const decoder = new TextDecoder('utf-8'); @@ -306,7 +175,7 @@ async function* sseLines(stream: ReadableStream): AsyncGenerator { - const { sessionId, chatId, assistantMessageId } = args; - const isAbort = err instanceof Error && err.name === 'AbortError'; - const finalStatus = isAbort ? 'cancelled' : 'failed'; - const errMsg = err instanceof Error ? err.message : String(err); - // v1.8.2: persist a structured error metadata blob on genuine failures so - // the bubble can render the reason on reload without re-deriving from the - // (one-shot) WS error frame. User-initiated abort skips this — there's no - // "reason" to surface for a stop the user already explicitly chose. - const errorMetadata: MessageMetadata | null = isAbort - ? null - : { kind: 'error', error_reason: 'llm_provider_error', error_text: errMsg }; - if (errorMetadata) { - await ctx.sql` - UPDATE messages - SET status = ${finalStatus}, - content = ${accumulated}, - finished_at = clock_timestamp(), - metadata = ${ctx.sql.json(errorMetadata as never)} - WHERE id = ${assistantMessageId} - `; - } else { - await ctx.sql` - UPDATE messages - SET status = ${finalStatus}, - content = ${accumulated}, - finished_at = clock_timestamp() - WHERE id = ${assistantMessageId} - `; - } - const [failSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>` - UPDATE sessions SET updated_at = clock_timestamp() - WHERE id = ${sessionId} - RETURNING project_id, name, updated_at - `; - ctx.publishUser({ type: 'session_updated', session_id: sessionId, project_id: failSessRow!.project_id, name: failSessRow!.name, updated_at: failSessRow!.updated_at }); - // v1.8 mobile-tabs: cancellation is a user-initiated stop, treat as idle; - // genuine errors flip the dot red. v1.8.2: error path also carries a - // machine-readable `reason` so the UI can render specifics inline. - if (isAbort) { - // v1.12.1: defensive cancellation write. The status=${finalStatus} UPDATE - // above already sets 'cancelled' for the AbortError case, but a row can - // leak as 'streaming' when the abort fires between the post-tool-phase - // INSERT (executeToolPhase) and the next runAssistantTurn's stream setup, - // bypassing the try/catch around executeStreamPhase. The status guard - // makes this a no-op when the earlier write already landed. - await ctx.sql` - UPDATE messages - SET status = 'cancelled', content = ${accumulated}, finished_at = clock_timestamp() - WHERE id = ${args.assistantMessageId} AND status = 'streaming' - `; - ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() }); - ctx.publish(sessionId, { - type: 'message_complete', - message_id: assistantMessageId, - chat_id: chatId, - }); - ctx.log.info({ sessionId, chatId, assistantMessageId }, 'inference cancelled'); - } else { - ctx.publishUser({ - type: 'chat_status', - chat_id: chatId, - status: 'error', - at: new Date().toISOString(), - reason: 'llm_provider_error', - }); - ctx.publish(sessionId, { - type: 'error', - message_id: assistantMessageId, - chat_id: chatId, - error: errMsg, - reason: 'llm_provider_error', - }); - ctx.log.error({ err, sessionId, assistantMessageId }, 'inference failed'); - } -} - async function executeToolPhase( ctx: InferenceContext, args: TurnArgs, @@ -929,68 +716,6 @@ async function executeToolPhase( }); } -async function finalizeCompletion( - ctx: InferenceContext, - args: TurnArgs, - result: StreamResult, - startedAt: string | null, - session: Session -): Promise { - const { sessionId, chatId, assistantMessageId } = args; - const { content, finishReason, promptTokens, completionTokens } = result; - - // v1.11.3: see executeToolPhase for the rationale. - const mctx = await modelContext.getModelContext(session.model); - const nCtx = mctx?.n_ctx ?? null; - - const [updated] = await ctx.sql< - { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[] - >` - UPDATE messages - SET content = ${content}, - status = 'complete', - tokens_used = ${completionTokens}, - ctx_used = ${promptTokens}, - ctx_max = ${nCtx}, - finished_at = clock_timestamp() - WHERE id = ${assistantMessageId} - RETURNING tokens_used, ctx_used, ctx_max, finished_at - `; - // v1.11: flag for compaction on the terminal turn too. Catches the common - // case of a turn that hit the limit without invoking tools. - await maybeFlagForCompaction(ctx, chatId, updated); - const [completeSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>` - UPDATE sessions SET updated_at = clock_timestamp() - WHERE id = ${sessionId} - RETURNING project_id, name, updated_at - `; - ctx.publishUser({ type: 'session_updated', session_id: sessionId, project_id: completeSessRow!.project_id, name: completeSessRow!.name, updated_at: completeSessRow!.updated_at }); - ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() }); - ctx.publish(sessionId, { - type: 'message_complete', - message_id: assistantMessageId, - chat_id: chatId, - tokens_used: updated?.tokens_used ?? null, - ctx_used: updated?.ctx_used ?? null, - ctx_max: updated?.ctx_max ?? null, - started_at: startedAt, - finished_at: updated?.finished_at ?? null, - model: session.model, - }); - ctx.log.info( - { - sessionId, - chatId, - assistantMessageId, - finishReason, - chars: content.length, - tokens_used: updated?.tokens_used, - ctx_used: updated?.ctx_used, - }, - 'inference complete' - ); -} - async function runAssistantTurn( ctx: InferenceContext, args: TurnArgs, diff --git a/apps/server/src/services/inference/error-handler.ts b/apps/server/src/services/inference/error-handler.ts new file mode 100644 index 0000000..b98e202 --- /dev/null +++ b/apps/server/src/services/inference/error-handler.ts @@ -0,0 +1,148 @@ +import type { MessageMetadata, Session } from '../../types/api.js'; +import * as modelContext from '../model-context.js'; +import { maybeFlagForCompaction } from './payload.js'; +import type { InferenceContext, StreamResult, TurnArgs } from '../inference.js'; + +export async function handleAbortOrError( + ctx: InferenceContext, + args: TurnArgs, + accumulated: string, + err: unknown +): Promise { + const { sessionId, chatId, assistantMessageId } = args; + const isAbort = err instanceof Error && err.name === 'AbortError'; + const finalStatus = isAbort ? 'cancelled' : 'failed'; + const errMsg = err instanceof Error ? err.message : String(err); + // v1.8.2: persist a structured error metadata blob on genuine failures so + // the bubble can render the reason on reload without re-deriving from the + // (one-shot) WS error frame. User-initiated abort skips this — there's no + // "reason" to surface for a stop the user already explicitly chose. + const errorMetadata: MessageMetadata | null = isAbort + ? null + : { kind: 'error', error_reason: 'llm_provider_error', error_text: errMsg }; + if (errorMetadata) { + await ctx.sql` + UPDATE messages + SET status = ${finalStatus}, + content = ${accumulated}, + finished_at = clock_timestamp(), + metadata = ${ctx.sql.json(errorMetadata as never)} + WHERE id = ${assistantMessageId} + `; + } else { + await ctx.sql` + UPDATE messages + SET status = ${finalStatus}, + content = ${accumulated}, + finished_at = clock_timestamp() + WHERE id = ${assistantMessageId} + `; + } + const [failSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>` + UPDATE sessions SET updated_at = clock_timestamp() + WHERE id = ${sessionId} + RETURNING project_id, name, updated_at + `; + ctx.publishUser({ type: 'session_updated', session_id: sessionId, project_id: failSessRow!.project_id, name: failSessRow!.name, updated_at: failSessRow!.updated_at }); + // v1.8 mobile-tabs: cancellation is a user-initiated stop, treat as idle; + // genuine errors flip the dot red. v1.8.2: error path also carries a + // machine-readable `reason` so the UI can render specifics inline. + if (isAbort) { + // v1.12.1: defensive cancellation write. The status=${finalStatus} UPDATE + // above already sets 'cancelled' for the AbortError case, but a row can + // leak as 'streaming' when the abort fires between the post-tool-phase + // INSERT (executeToolPhase) and the next runAssistantTurn's stream setup, + // bypassing the try/catch around executeStreamPhase. The status guard + // makes this a no-op when the earlier write already landed. + await ctx.sql` + UPDATE messages + SET status = 'cancelled', content = ${accumulated}, finished_at = clock_timestamp() + WHERE id = ${args.assistantMessageId} AND status = 'streaming' + `; + ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() }); + ctx.publish(sessionId, { + type: 'message_complete', + message_id: assistantMessageId, + chat_id: chatId, + }); + ctx.log.info({ sessionId, chatId, assistantMessageId }, 'inference cancelled'); + } else { + ctx.publishUser({ + type: 'chat_status', + chat_id: chatId, + status: 'error', + at: new Date().toISOString(), + reason: 'llm_provider_error', + }); + ctx.publish(sessionId, { + type: 'error', + message_id: assistantMessageId, + chat_id: chatId, + error: errMsg, + reason: 'llm_provider_error', + }); + ctx.log.error({ err, sessionId, assistantMessageId }, 'inference failed'); + } +} + +export async function finalizeCompletion( + ctx: InferenceContext, + args: TurnArgs, + result: StreamResult, + startedAt: string | null, + session: Session +): Promise { + const { sessionId, chatId, assistantMessageId } = args; + const { content, finishReason, promptTokens, completionTokens } = result; + + // v1.11.3: see executeToolPhase for the rationale. + const mctx = await modelContext.getModelContext(session.model); + const nCtx = mctx?.n_ctx ?? null; + + const [updated] = await ctx.sql< + { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[] + >` + UPDATE messages + SET content = ${content}, + status = 'complete', + tokens_used = ${completionTokens}, + ctx_used = ${promptTokens}, + ctx_max = ${nCtx}, + finished_at = clock_timestamp() + WHERE id = ${assistantMessageId} + RETURNING tokens_used, ctx_used, ctx_max, finished_at + `; + // v1.11: flag for compaction on the terminal turn too. Catches the common + // case of a turn that hit the limit without invoking tools. + await maybeFlagForCompaction(ctx, chatId, updated); + const [completeSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>` + UPDATE sessions SET updated_at = clock_timestamp() + WHERE id = ${sessionId} + RETURNING project_id, name, updated_at + `; + ctx.publishUser({ type: 'session_updated', session_id: sessionId, project_id: completeSessRow!.project_id, name: completeSessRow!.name, updated_at: completeSessRow!.updated_at }); + ctx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'idle', at: new Date().toISOString() }); + ctx.publish(sessionId, { + type: 'message_complete', + message_id: assistantMessageId, + chat_id: chatId, + tokens_used: updated?.tokens_used ?? null, + ctx_used: updated?.ctx_used ?? null, + ctx_max: updated?.ctx_max ?? null, + started_at: startedAt, + finished_at: updated?.finished_at ?? null, + model: session.model, + }); + ctx.log.info( + { + sessionId, + chatId, + assistantMessageId, + finishReason, + chars: content.length, + tokens_used: updated?.tokens_used, + ctx_used: updated?.ctx_used, + }, + 'inference complete' + ); +} diff --git a/apps/server/src/services/inference/payload.ts b/apps/server/src/services/inference/payload.ts new file mode 100644 index 0000000..ec0d9f5 --- /dev/null +++ b/apps/server/src/services/inference/payload.ts @@ -0,0 +1,155 @@ +import type { Sql } from '../../db.js'; +import type { + Agent, + Message, + Project, + Session, +} from '../../types/api.js'; +import * as compaction from '../compaction.js'; +import { buildSystemPrompt } from '../system-prompt.js'; +import { isAnySentinel } from './sentinels.js'; +import type { InferenceContext } from '../inference.js'; + +export interface OpenAiMessage { + role: 'system' | 'user' | 'assistant' | 'tool'; + content: string | null; + tool_calls?: Array<{ + id: string; + type: 'function'; + function: { name: string; arguments: string }; + }>; + tool_call_id?: string; +} + +// v1.12: buildSystemPrompt lives in services/system-prompt.ts. It awaits the +// container-guidance loader, so this function is async too and every call +// site in inference.ts awaits the result. +export async function buildMessagesPayload( + session: Session, + project: Project, + history: Message[], + agent: Agent | null = null +): Promise { + const out: OpenAiMessage[] = []; + const systemPrompt = await buildSystemPrompt(project, session, agent); + out.push({ role: 'system', content: systemPrompt }); + + // Find the latest compact marker — only send messages from that point onwards + let startIdx = 0; + for (let i = history.length - 1; i >= 0; i--) { + if (history[i]!.kind === 'compact') { + startIdx = i; + break; + } + } + + for (let i = startIdx; i < history.length; i++) { + const m = history[i]!; + if (m.kind === 'compact') { + out.push({ role: 'system', content: m.content }); + continue; + } + // v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never + // send them to the LLM. The synthetic instruction note lives only inside + // the summary call's messages array and is never persisted, so on a + // follow-up turn the model resumes with a clean context. + if (isAnySentinel(m)) continue; + if (m.role === 'assistant' && m.status === 'streaming') continue; + if (m.role === 'assistant' && m.status === 'cancelled') continue; + if (m.role === 'tool') { + const tr = m.tool_results; + if (!tr) continue; + const outputText = tr.error + ? `error: ${tr.error}` + : typeof tr.output === 'string' + ? tr.output + : JSON.stringify(tr.output); + out.push({ + role: 'tool', + content: outputText, + tool_call_id: tr.tool_call_id, + }); + continue; + } + if (m.role === 'assistant') { + const msg: OpenAiMessage = { + role: 'assistant', + content: m.content && m.content.length > 0 ? m.content : null, + }; + if (m.tool_calls && m.tool_calls.length > 0) { + msg.tool_calls = m.tool_calls.map((tc) => ({ + id: tc.id, + type: 'function' as const, + function: { name: tc.name, arguments: JSON.stringify(tc.args) }, + })); + } + out.push(msg); + continue; + } + out.push({ role: 'user', content: m.content }); + } + return out; +} + +export async function loadContext( + sql: Sql, + sessionId: string, + chatId: string +): Promise<{ session: Session; project: Project; history: Message[] } | null> { + const sessionRows = await sql` + SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at, + agent_id, web_search_enabled + FROM sessions WHERE id = ${sessionId} + `; + if (sessionRows.length === 0) return null; + const session = sessionRows[0]!; + + const projectRows = await sql` + SELECT id, name, path, added_at, last_session_id, status, gitea_remote, + default_system_prompt, default_web_search_enabled + FROM projects WHERE id = ${session.project_id} + `; + if (projectRows.length === 0) return null; + const project = projectRows[0]!; + + // v1.11: filter compacted messages out of the inference assembly. The GET + // /api/sessions/:id/messages endpoint still returns everything (so the UI + // can show history with the summary card inline); only LLM payloads skip + // compacted rows. compacted_at IS NULL keeps the active summary + tail. + const history = await sql` + SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq, + tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata + FROM messages + WHERE chat_id = ${chatId} AND compacted_at IS NULL + ORDER BY created_at ASC, id ASC + `; + + return { session, project, history }; +} + +// v1.11: shared helper used after both finalizeCompletion and executeToolPhase +// persist their token counts. Reads tokens off the just-UPDATEd row (which +// the caller returns from RETURNING), runs compaction.isOverflow, and flips +// chats.needs_compaction. The next runAssistantTurn invocation acts on it. +// Silent on missing tokens — llama-swap occasionally omits usage on truncated +// streams, and we'd rather miss one overflow than crash the inference path. +export async function maybeFlagForCompaction( + ctx: InferenceContext, + chatId: string, + updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined, +): Promise { + if (!updated) return; + const promptTokens = updated.ctx_used; + const completionTokens = updated.tokens_used; + const contextLimit = updated.ctx_max; + if (typeof promptTokens !== 'number') return; + if (typeof completionTokens !== 'number') return; + if (typeof contextLimit !== 'number') return; + const overflow = compaction.isOverflow( + { prompt_tokens: promptTokens, completion_tokens: completionTokens }, + contextLimit, + ); + if (!overflow) return; + await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`; + ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction'); +}