Second half of the WebSocket-frame-typing batch. Phase A (8b568b3)
landed the schemas + frontend receive validation + publishFrame /
publishUserFrame wrappers. This commit converts the existing publish
call sites so every server-emitted WS frame now goes through Zod
validation at the broker boundary.
Conversion strategy: change once in the inference / skills adapters in
index.ts (so ctx.publish / ctx.publishUser propagate to publishFrame /
publishUserFrame for ALL ~50 inference + auto_name call sites in one
move), then bulk-replace the ~30 direct broker.publish* call sites in
the routes + compaction.
Files touched:
- index.ts: inference + skills route adapters now call publishFrame /
publishUserFrame internally; raw broker.publishUser('default', ...)
call in the stale-row sweeper also converted.
- routes/projects.ts (7 sites), routes/chats.ts (9 sites),
routes/sessions.ts (8 sites): all broker.publishUser(...) → broker.
publishUserFrame(...).
- services/compaction.ts (3 sites): 2 publishUser, 1 publish.
Real protocol drift surfaced by Zod, fixed in the same commit:
services/compaction.ts:442 was publishing chat_status with status:
'working' — the v1.12.1 chat_status widening (CLAUDE.md:55) dropped
this enum value in favor of streaming|tool_running|waiting_for_input|
idle|error. The compaction.ts site was missed during v1.12.1; the
frame had been published with an unknown enum value ever since (the
frontend useChatStatus quietly ignored it). Corrected to 'streaming'
— compaction's LLM call has the same dot-state semantic as an
inference turn. This is exactly the class of bug v1.13.11 exists to
catch.
Schema relaxation: OpaqueObject (the bag type for nested entities like
Project / Chat / Session / WorkspacePane embedded in WS frames) was
z.object({}).passthrough(), which Zod outputs as {} & {[k:string]:
unknown}. The strict-typed entities don't have index signatures so
TypeScript rejected them at publishFrame call sites. Relaxed to
z.unknown() — runtime validation still accepts the value, dev-time
narrowing happens via the existing hand-maintained types. Trade-off:
frame-level drift detection stays sharp; nested-payload validation
goes to follow-up work as the brief intended.
Schema audit:
grep -rn "broker\.publish(\|broker\.publishUser(" apps/server/src \
--include="*.ts" | grep -v "broker.ts\|__tests__\|.bak"
→ 0 results. Every server publish goes through publishFrame /
publishUserFrame. The remaining ctx.publish / ctx.publishUser sites
in services/inference/* + services/auto_name.ts route through the
index.ts adapter, which calls publishFrame internally.
Tests: 219/219 pass (unchanged from v1.13.11-a; the Phase B conversion
is mechanical and doesn't add test cases).
Smoke: clean container boot, no ws-frame-validation-failed entries
under normal traffic. Sidebar list refresh + agent picker open both
pass through useUserEvents without drops.
~70 LoC across 7 files. v1.13.11 closed.
543 lines
21 KiB
TypeScript
543 lines
21 KiB
TypeScript
// v1.11: anchored rolling compaction. Ported algorithms (not Effect-TS code)
// from opencode (packages/opencode/src/session/{compaction,overflow}.ts).
//
// What's different from BooCode's legacy /compact:
//   - Operates per-chat (chats have N:1 to sessions; history is per-chat).
//   - Detects overflow automatically after each inference completion using
//     llama-swap's reported n_ctx; flags chats.needs_compaction=true.
//   - On the next turn (or manual /compact) we summarize the *head* (messages
//     prior to a preserved tail of N user-turns) into a single
//     summary=true assistant row. Older messages get compacted_at-stamped so
//     inference assembly filters them out; the GET endpoint still returns
//     them so the UI can show history with the summary card inline.
//   - The summary is *anchored rolling* — exactly one live summary=true row
//     per chat. Subsequent compactions read the prior summary as
//     previousSummary, ask the LLM to update-merge it, then mark the prior
//     summary row compacted_at too (it stays in the UI but isn't sent to the
//     LLM again).
|
|
|
|
import type { FastifyBaseLogger } from 'fastify';
|
|
import type { Sql } from '../db.js';
|
|
import type { Config } from '../config.js';
|
|
import type { Broker } from './broker.js';
|
|
import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
|
|
import * as modelContextLookup from './model-context.js';
|
|
|
|
// v1.13.9: ratio-only overflow trigger. Compaction fires once a turn has
// consumed 85% of ctx_max (opencode session/overflow.ts pattern). This
// supersedes the v1.11.0-era `ctx_max - 20_000` formula, which degenerated to
// 0 for contexts ≤20k and left the summarizer only 7-8% headroom at 262k. A
// ratio keeps a consistent 15% headroom at any scale, so small-ctx models no
// longer end up with an effectively-disabled trigger.
const EARLY_TRIGGER_RATIO = 0.85;

// Floor and ceiling (in estimated tokens) that select() clamps the preserved
// tail budget to.
const MIN_PRESERVE_RECENT_TOKENS = 2_000;
const MAX_PRESERVE_RECENT_TOKENS = 8_000;

// How many of the most recent user turns select() tries to keep by default.
const DEFAULT_TAIL_TURNS = 2;
|
|
|
|
// The slice of a Message row that compaction reads. Selecting only these
// columns keeps process() independent of api.ts mutations and reduces DB
// egress.
export interface CompactionMessage {
  id: string;
  role: 'user' | 'assistant' | 'system' | 'tool';
  content: string;
  // 'compact' marks a legacy /compact row; buildHeadPayload() forwards those
  // as system context.
  kind: 'message' | 'compact';
  // true on an anchored summary row.
  summary: boolean;
  status: 'streaming' | 'complete' | 'failed' | 'cancelled';
  tool_calls: Array<{ id: string; name: string; args: Record<string, unknown> }> | null;
  tool_results: {
    tool_call_id: string;
    output: unknown;
    truncated: boolean;
    error?: string;
  } | null;
  // v1.13.6: reasoning_parts captured by v1.13.1-C and read back through
  // messages_with_parts. They are embedded into the head-assembly payload as
  // prose so the summarizer LLM sees what the model was reasoning through
  // when it chose its tool calls.
  reasoning_parts: Array<{ text: string }> | null;
  // metadata.kind === 'cap_hit' on a system row identifies a cap-hit sentinel.
  metadata: { kind?: string } | null;
  created_at: string;
}
|
|
|
|
// === overflow ===
|
|
|
|
// Token budget at which overflow fires: 85% of contextLimit (opencode
// session/overflow.ts pattern), rounded down. An unknown limit (0, negative,
// NaN) yields 0, which callers read as "do not trigger overflow" — inference
// keeps flowing rather than compacting a turn we can't size.
export function usable(contextLimit: number): number {
  const limitKnown = Boolean(contextLimit) && contextLimit > 0;
  return limitKnown ? Math.floor(EARLY_TRIGGER_RATIO * contextLimit) : 0;
}
|
|
|
|
// Token counts for one completion, as consumed by isOverflow().
export interface Usage {
  // Tokens in the prompt.
  prompt_tokens: number;
  // Tokens the model generated.
  completion_tokens: number;
}
|
|
|
|
// Reports whether the turn consumed at least usable(contextLimit) tokens
// (prompt + completion). With an unknown limit the answer is always false:
// we never auto-trigger compaction without a budget — better to keep
// inference flowing than to fall into a compaction we can't size properly.
export function isOverflow(usage: Usage, contextLimit: number): boolean {
  const threshold = usable(contextLimit);
  const consumed = usage.prompt_tokens + usage.completion_tokens;
  return threshold > 0 && consumed >= threshold;
}
|
|
|
|
// === selection ===
|
|
|
|
// One user turn inside a message array, as a half-open index range.
interface Turn {
  // Index of the user message that opens the turn.
  start: number;
  // Index one past the turn's last message (the next turn's start, or the
  // array length for the final turn).
  end: number;
  // id of the message at `start`.
  id: string;
}
|
|
|
|
// Rough token estimate: length of the JSON-serialized messages divided by 4,
// rounded up. Matches opencode's Token.estimate (which also goes through
// JSON.stringify). This is an approximation, not a tokenizer — it is only
// meant for the tail-fitting math in select() / splitTurn().
export function estimate(messages: CompactionMessage[]): number {
  const serializedLength = JSON.stringify(messages).length;
  return Math.ceil(serializedLength / 4);
}
|
|
|
|
// Produces one Turn per user message that is NOT a summary row. Each turn
// runs up to the start of the next turn; the last one runs to
// messages.length.
export function turns(messages: CompactionMessage[]): Turn[] {
  const found: Turn[] = [];
  for (const [index, message] of messages.entries()) {
    const opensTurn = message.role === 'user' && !message.summary;
    if (!opensTurn) continue;
    // Close the previous turn at the point where this one begins.
    const previous = found[found.length - 1];
    if (previous) previous.end = index;
    found.push({ start: index, end: messages.length, id: message.id });
  }
  return found;
}
|
|
|
|
// For a turn that doesn't fit whole: scan candidate cut points from start+1
// forward and take the first suffix [cut, turn.end) whose estimate is within
// budget. Because the scan moves forward, the first hit is the largest
// suffix that fits. Returns the index and id of the first preserved message,
// or undefined when the budget is exhausted, the turn has a single message,
// or no suffix fits.
function splitTurn(
  messages: CompactionMessage[],
  turn: Turn,
  budget: number,
): { start: number; id: string } | undefined {
  if (budget <= 0 || turn.end - turn.start <= 1) return undefined;

  for (let cut = turn.start + 1; cut < turn.end; cut += 1) {
    const suffixSize = estimate(messages.slice(cut, turn.end));
    const overBudget = suffixSize > budget;
    if (!overBudget) {
      return { start: cut, id: messages[cut]!.id };
    }
  }
  return undefined;
}
|
|
|
|
// Outcome of select().
export interface SelectResult {
  // Messages to summarize. On a full-preserve result this is the whole input.
  head: CompactionMessage[];
  // id of the first preserved tail message; undefined means "nothing to
  // compact this round".
  tail_start_id: string | undefined;
}
|
|
|
|
// Picks the boundary between the "head" (to be summarized) and the "tail"
// (preserved verbatim).
//   1. The tail budget is 25% of usable(contextLimit), clamped to
//      [MIN_PRESERVE_RECENT_TOKENS, MAX_PRESERVE_RECENT_TOKENS].
//   2. Consider the last `tailTurns` user turns, newest first, keeping each
//      whole turn while the running total stays within budget.
//   3. The first turn that doesn't fit whole is split mid-turn (splitTurn)
//      with whatever budget remains, and the walk stops there.
//   4. If nothing could be kept, or the kept range starts at index 0
//      (everything fit), the result is full-preserve: head is the whole
//      input and tail_start_id is undefined — no compaction this round.
export function select(
  messages: CompactionMessage[],
  contextLimit: number,
  tailTurns: number = DEFAULT_TAIL_TURNS,
): SelectResult {
  const preserveAll: SelectResult = { head: messages, tail_start_id: undefined };
  if (tailTurns <= 0) return preserveAll;

  const target = Math.floor(usable(contextLimit) * 0.25);
  const budget = Math.min(
    MAX_PRESERVE_RECENT_TOKENS,
    Math.max(MIN_PRESERVE_RECENT_TOKENS, target),
  );

  const allTurns = turns(messages);
  if (allTurns.length === 0) return preserveAll;

  // Newest turn first.
  const newestFirst = allTurns.slice(-tailTurns).reverse();

  let used = 0;
  let boundary: { start: number; id: string } | undefined;
  for (const turn of newestFirst) {
    const turnSize = estimate(messages.slice(turn.start, turn.end));
    if (used + turnSize > budget) {
      // Doesn't fit whole: try to keep a suffix of it, then stop walking.
      boundary = splitTurn(messages, turn, budget - used) ?? boundary;
      break;
    }
    used += turnSize;
    boundary = { start: turn.start, id: turn.id };
  }

  if (boundary === undefined || boundary.start === 0) return preserveAll;

  return {
    head: messages.slice(0, boundary.start),
    tail_start_id: boundary.id,
  };
}
|
|
|
|
// === prompt assembly ===
|
|
|
|
// Assembles the final user message asking the model to (re)produce the
// anchored summary: an anchor instruction, then SUMMARY_TEMPLATE, then any
// `context` entries, separated by blank lines. With a non-empty
// previousSummary the anchor asks for an update-merge and embeds the old
// summary in <previous-summary> tags; otherwise it asks for a fresh summary.
// `context` is reserved for future plugin injection; callers pass [] today.
export function buildPrompt(
  previousSummary: string | undefined,
  context: string[],
): string {
  let anchor = 'Create a new anchored summary from the conversation history above.';
  if (previousSummary) {
    const updateLines = [
      'Update the anchored summary below using the conversation history above.',
      'Preserve still-true details, remove stale details, and merge in the new facts.',
      '<previous-summary>',
      previousSummary,
      '</previous-summary>',
    ];
    anchor = updateLines.join('\n');
  }
  const sections = [anchor, SUMMARY_TEMPLATE, ...context];
  return sections.join('\n\n');
}
|
|
|
|
// === OpenAI conversion ===
// Compaction-local: this intentionally does NOT call inference.ts
// buildMessagesPayload, because that uses the legacy "find latest
// kind='compact' marker and skip everything before it" short-circuit, which
// would silently drop pre-legacy-compact history before the LLM sees it.
// Compaction wants to send the entire head, full stop.
|
|
|
|
// One message in the OpenAI chat-completions wire shape, as sent to
// llama-swap. v1.13.6: exported for unit-test access (reasoning render
// coverage).
export interface OpenAiMessage {
  role: 'system' | 'user' | 'assistant' | 'tool';
  // null when an assistant message has no text body.
  content: string | null;
  // Set on assistant messages that invoked tools; `arguments` is a JSON
  // string.
  tool_calls?: Array<{
    id: string;
    type: 'function';
    function: { name: string; arguments: string };
  }>;
  // Set on tool messages to reference the call they answer.
  tool_call_id?: string;
}
|
|
|
|
// True for a system row whose metadata.kind is 'cap_hit'. buildHeadPayload()
// leaves such rows out of the summarizer payload.
function isCapHitSentinel(m: CompactionMessage): boolean {
  if (m.role !== 'system') return false;
  return m.metadata?.kind === 'cap_hit';
}
|
|
|
|
// Converts the head (the messages about to be summarized) into OpenAI chat
// messages for the summarizer call. v1.13.6: exported for unit-test access
// (reasoning render coverage).
//
// Per-row handling, in order:
//   - cap-hit sentinel rows are dropped;
//   - assistant rows still 'streaming' or 'cancelled' are dropped;
//   - legacy kind='compact' rows become system messages;
//   - summary rows become plain assistant messages;
//   - tool rows become tool messages (dropped when tool_results is null);
//   - assistant rows carry their reasoning (if any) in a <reasoning> prefix,
//     plus tool_calls;
//   - everything else (user rows, non-sentinel system rows) is sent with
//     role 'user'.
export function buildHeadPayload(head: CompactionMessage[]): OpenAiMessage[] {
  const out: OpenAiMessage[] = [];
  for (const m of head) {
    if (isCapHitSentinel(m)) continue;
    if (m.role === 'assistant' && (m.status === 'streaming' || m.status === 'cancelled')) continue;

    if (m.kind === 'compact') {
      // Legacy compact row — pass through as system context. The new
      // anchored summary will subsume it, but the LLM should see it during
      // the bridging round so it can carry forward the still-true bits.
      out.push({ role: 'system', content: m.content });
      continue;
    }

    if (m.summary) {
      // Defense in depth: process() filters these out of the select-input
      // already. If one slips through, render it as assistant content so we
      // never crash here.
      out.push({ role: 'assistant', content: m.content });
      continue;
    }

    if (m.role === 'tool') {
      const tr = m.tool_results;
      if (!tr) continue;
      let outputText: string;
      if (tr.error) {
        outputText = `error: ${tr.error}`;
      } else if (typeof tr.output === 'string') {
        outputText = tr.output;
      } else {
        // JSON.stringify(undefined) returns undefined, which would break the
        // `content: string | null` contract and make the key vanish from the
        // serialized request. Coalesce a missing output to JSON null.
        outputText = JSON.stringify(tr.output ?? null);
      }
      out.push({ role: 'tool', content: outputText, tool_call_id: tr.tool_call_id });
      continue;
    }

    if (m.role === 'assistant') {
      // v1.13.6: embed reasoning text as prose prefixed onto the assistant
      // content. OpenAI wire shape doesn't carry reasoning as a structured
      // field, but the summarizer is reading text — a tagged prose block
      // gives it the same signal. The <reasoning>...</reasoning> wrapper lets
      // the summarizer distinguish reasoning from the user-visible answer.
      let body = m.content && m.content.length > 0 ? m.content : '';
      if (m.reasoning_parts && m.reasoning_parts.length > 0) {
        const reasoning = m.reasoning_parts.map((r) => r.text).join('');
        body = body.length > 0
          ? `<reasoning>${reasoning}</reasoning>\n\n${body}`
          : `<reasoning>${reasoning}</reasoning>`;
      }
      const msg: OpenAiMessage = {
        role: 'assistant',
        content: body.length > 0 ? body : null,
      };
      if (m.tool_calls && m.tool_calls.length > 0) {
        msg.tool_calls = m.tool_calls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          // Same undefined-guard as above: a call stored without args is
          // sent as an empty JSON object rather than a missing field.
          function: { name: tc.name, arguments: JSON.stringify(tc.args ?? {}) },
        }));
      }
      out.push(msg);
      continue;
    }

    out.push({ role: 'user', content: m.content });
  }
  return out;
}
|
|
|
|
// === llama-swap call ===
|
|
|
|
// Result of the non-streaming summary completion. Opencode streams; for a
// one-shot summary call a single POST is less code and the latency hit is
// acceptable (the user doesn't see this directly — useSessionStream emits the
// toast + refetches on the 'compacted' frame).
interface CompletionResult {
  // Text of the first choice's message ('' when the response carried none).
  content: string;
  // usage.prompt_tokens from the response, 0 when absent.
  promptTokens: number;
  // usage.completion_tokens from the response, 0 when absent.
  completionTokens: number;
}
|
|
|
|
// POSTs a single non-streaming chat completion to
// `${config.LLAMA_SWAP_URL}/v1/chat/completions` and returns the first
// choice's text plus the reported token usage (0 when usage is absent).
//
// Throws when:
//   - the HTTP status is not ok (message carries the status and the first
//     200 chars of the body);
//   - the response has no usable text. An empty summary must never reach
//     process(): it would be inserted as the anchored summary while the head
//     it was supposed to describe gets stamped compacted, silently discarding
//     that history from future prompts.
async function callLlamaSwap(
  config: Config,
  model: string,
  messages: OpenAiMessage[],
  log: FastifyBaseLogger,
): Promise<CompletionResult> {
  const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model, messages, stream: false }),
  });
  if (!res.ok) {
    // Best-effort body read; an unreadable body must not mask the status.
    const text = await res.text().catch(() => '');
    throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`);
  }
  const json = (await res.json()) as {
    choices?: Array<{ message?: { content?: string } }>;
    usage?: { prompt_tokens?: number; completion_tokens?: number };
  };
  // v1.11.3: removed the dead `json.timings?.n_ctx` read — llama-server's
  // completions don't emit n_ctx in timings. ctx_max on the summary row
  // comes from model-context.getModelContext in process().
  const content = json.choices?.[0]?.message?.content ?? '';
  const promptTokens = json.usage?.prompt_tokens ?? 0;
  const completionTokens = json.usage?.completion_tokens ?? 0;
  // Logged before the emptiness check so token counts are visible even for a
  // rejected completion.
  log.debug({ promptTokens, completionTokens, chars: content.length }, 'compaction llm complete');
  if (typeof content !== 'string' || content.trim().length === 0) {
    throw new Error('llama-swap returned an empty completion');
  }
  return { content, promptTokens, completionTokens };
}
|
|
|
|
// === entry point ===
|
|
|
|
// Everything process() needs for one compaction round.
export interface ProcessInput {
  // SQL tagged-template client used for every query.
  sql: Sql;
  // Read for LLAMA_SWAP_URL by the completion call.
  config: Config;
  log: FastifyBaseLogger;
  // Used to publish the chat_status and 'compacted' frames.
  broker: Broker;
  // Chat to compact.
  chatId: string;
}
|
|
|
|
// Runs one round of anchored rolling compaction on `chatId`.
//
// Returns without doing anything when the chat or its session can't be
// found. No-ops cleanly (clearing needs_compaction) when the chat has no live
// messages or select() decides there is nothing reasonable to compact.
// Throws on LLM failure or a failed DB write — callers decide whether to
// log+swallow or surface. The chat dot is always reset to 'idle' once the
// LLM call has been attempted.
//
// NOTE(review): steps 8-10 are three separate statements, not one
// transaction, so a failure between them can leave the new summary row
// inserted while the head is not yet stamped. Consider wrapping them in a
// transaction if the Sql client supports it — confirm against db.ts.
export async function process(input: ProcessInput): Promise<void> {
  const { sql, config, log, broker, chatId } = input;

  // 1. Resolve chat → session for model + WS publish channel.
  const chatRows = await sql<{ id: string; session_id: string }[]>`
    SELECT id, session_id FROM chats WHERE id = ${chatId}
  `;
  if (chatRows.length === 0) {
    log.warn({ chatId }, 'compaction: chat not found');
    return;
  }
  const chat = chatRows[0]!;
  const sessionId = chat.session_id;

  const sessRows = await sql<{ id: string; model: string }[]>`
    SELECT id, model FROM sessions WHERE id = ${sessionId}
  `;
  if (sessRows.length === 0) {
    log.warn({ chatId, sessionId }, 'compaction: session not found');
    return;
  }
  const session = sessRows[0]!;

  // 2. All currently-active messages in this chat (compacted_at IS NULL).
  // ORDER BY (created_at, id) matches loadContext in inference.ts so the
  // turns() boundary logic sees the same sequence the LLM will.
  // v1.13.1-B: reads tool_calls/tool_results via the parts-merged view so
  // the compaction payload matches what the LLM saw on the original turn.
  // v1.13.6: also pulls reasoning_parts (added in v1.13.1-C) so summaries
  // capture what the model was working through before each tool call.
  const messages = await sql<CompactionMessage[]>`
    SELECT id, role, content, kind, summary, status, tool_calls, tool_results,
           reasoning_parts, metadata, created_at
    FROM messages_with_parts
    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;
  if (messages.length === 0) {
    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
    return;
  }

  // 3. Find the prior anchored summary (newest summary=true row). Its content
  // becomes previousSummary — the anchor in the prompt. Filter it out of the
  // select-input so we don't double-encode (it's already in the anchor text).
  const previousSummary = messages.filter((m) => m.summary).at(-1)?.content;
  const forSelect = messages.filter((m) => !m.summary);

  // 4. Resolve a recent context limit from the ctx_max value stored on this
  // chat's messages: take the most recent non-null one (the assumption being
  // that the same model is still running). When no row has a ctx_max the
  // limit is 0; usable(0) is 0, so select() falls back to its minimum tail
  // budget.
  const ctxRows = await sql<{ ctx_max: number | null }[]>`
    SELECT ctx_max FROM messages
    WHERE chat_id = ${chatId} AND ctx_max IS NOT NULL
    ORDER BY created_at DESC LIMIT 1
  `;
  const contextLimit = ctxRows[0]?.ctx_max ?? 0;

  // 5. Decide head / tail.
  const sel = select(forSelect, contextLimit);
  if (!sel.tail_start_id || sel.head.length === 0) {
    // Full preserve — nothing to compact this round. Clear the flag so we
    // don't loop. (Could happen when the chat is short or the budget swung
    // wider after a model context bump.)
    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
    log.info({ chatId, contextLimit, msgCount: messages.length }, 'compaction: nothing to compact');
    return;
  }

  // 6. Build the OpenAI request: head as user/assistant/tool turns + a final
  // user message carrying buildPrompt(previousSummary, []). No system prompt
  // — matches opencode (`system: []`); the template + anchor are sufficient.
  const headPayload = buildHeadPayload(sel.head);
  const finalUser: OpenAiMessage = { role: 'user', content: buildPrompt(previousSummary, []) };
  const payload = [...headPayload, finalUser];

  log.info(
    {
      chatId,
      contextLimit,
      headLen: sel.head.length,
      tailStartId: sel.tail_start_id,
      hadPrevSummary: previousSummary !== undefined,
    },
    'compaction: invoking model',
  );

  // 6a. Flip the chat dot for the duration of the LLM call + DB writes.
  // v1.13.11-b: publish status='streaming' (the v1.12.1-widened replacement
  // for the dropped 'working' value). Compaction's LLM call has the same
  // semantic as an inference turn for dot-state purposes. The v1.12.1
  // chat_status widening missed this site; v1.13.11's WsFrame Zod schema
  // surfaced the drift via the unknown-enum-value check.
  broker.publishUserFrame('default', {
    type: 'chat_status',
    chat_id: chatId,
    status: 'streaming',
    at: new Date().toISOString(),
  });

  // try/finally so the dot ALWAYS drops back to idle, even if the LLM call
  // throws or a downstream DB write fails. The succeeded flag gates the
  // 'compacted' frame + final log: we only signal completion to the UI when
  // the new summary row actually landed.
  let succeeded = false;
  let newId = '';
  let result: CompletionResult | undefined;
  try {
    // 7. Single completion (no tools). Throws on llama-swap failure.
    result = await callLlamaSwap(config, session.model, payload, log);

    // 7b. v1.11.3: fetch the model's context window via model-context (the
    // completion response doesn't carry it). Same pattern as inference.ts.
    // NOTE(review): presumably cached inside model-context, so repeated
    // calls are cheap — confirm there.
    const mctx = await modelContextLookup.getModelContext(session.model);
    const nCtx = mctx?.n_ctx ?? null;

    // 8. Insert the new anchored summary row. role='assistant' per spec; the
    // UI distinguishes via summary=true. tail_start_id points at the first
    // preserved tail message so debug surfaces / future tools can reason
    // about the boundary without re-deriving from compacted_at.
    const insertRows = await sql<{ id: string }[]>`
      INSERT INTO messages (
        session_id, chat_id, role, content, kind, status,
        summary, tail_start_id,
        tokens_used, ctx_used, ctx_max,
        created_at, finished_at
      )
      VALUES (
        ${sessionId}, ${chatId}, 'assistant', ${result.content}, 'message', 'complete',
        true, ${sel.tail_start_id},
        ${result.completionTokens}, ${result.promptTokens}, ${nCtx},
        clock_timestamp(), clock_timestamp()
      )
      RETURNING id
    `;
    newId = insertRows[0]!.id;

    // 9. Mark every prior live message (head + prior summary) as compacted.
    //   - Head rows: everything ordered strictly before the tail start. The
    //     bound is the (created_at, id) pair, matching the ORDER BY in step
    //     2, so head rows that share the tail start's timestamp are stamped
    //     too. Bounding by created_at alone would leave them live.
    //   - Prior summary rows: stamped unconditionally. A prior summary was
    //     inserted at its own compaction time, which can be LATER than the
    //     new tail start (e.g. two compactions with only one turn between
    //     them), so a timestamp bound would leave two live summaries.
    //   - The row we just inserted is excluded explicitly; with the
    //     summary=true clause this exclusion is required, not just defensive.
    await sql`
      UPDATE messages
      SET compacted_at = clock_timestamp()
      WHERE chat_id = ${chatId}
        AND compacted_at IS NULL
        AND id != ${newId}
        AND (
          summary = true
          OR (created_at, id) < (
            SELECT created_at, id FROM messages WHERE id = ${sel.tail_start_id}
          )
        )
    `;

    // 10. Clear the flag and bump the chat's updated_at so the sidebar
    // reflects recent activity.
    await sql`
      UPDATE chats
      SET needs_compaction = false, updated_at = clock_timestamp()
      WHERE id = ${chatId}
    `;

    succeeded = true;
  } finally {
    // Always restore the dot. Status='idle' (not 'error') even on failure —
    // the caller logs/re-surfaces the error separately; the dot doesn't
    // need to stay red across reloads for a transient compaction blip.
    broker.publishUserFrame('default', {
      type: 'chat_status',
      chat_id: chatId,
      status: 'idle',
      at: new Date().toISOString(),
    });
  }

  // 11. Tell the client. useSessionStream subscribes to the per-session WS
  // channel; the handler refetches messages (so the new summary row + the
  // compacted_at-stamped older rows render correctly) and fires a sonner
  // toast. Order matters: idle must precede 'compacted' so the dot is
  // already green by the time the refetch toast appears.
  if (succeeded) {
    broker.publishFrame(sessionId, {
      type: 'compacted',
      session_id: sessionId,
      chat_id: chatId,
      summary_message_id: newId,
    });
    log.info(
      {
        chatId,
        newId,
        completionTokens: result?.completionTokens,
        promptTokens: result?.promptTokens,
      },
      'compaction: complete',
    );
  }
}
|