v1.11: opencode-style compaction port

- compaction.ts: usable/isOverflow/estimate/turns/select/buildPrompt/process
- compaction-prompt.ts: SUMMARY_TEMPLATE verbatim from opencode
- schema: messages.{compacted_at,summary,tail_start_id} + chats.needs_compaction
- inference: auto-trigger on overflow, pre-fetch compaction before next turn
- /compact slash command rewired to new path
- WS: chat_status working/idle around compaction + compacted frame
- frontend: SummaryCard + sonner toast on compacted
- 24 unit tests for pure functions
2026-05-20 19:05:35 +00:00
parent 6aab4f7d2a
commit dc43dd44f9
14 changed files with 1063 additions and 113 deletions
--- a/apps/server/src/services/inference.ts
+++ b/apps/server/src/services/inference.ts
@@ -21,6 +21,8 @@ import {
 import { PathScopeError, resolveProjectRoot } from './path_guard.js';
 import { maybeAutoNameChat } from './auto_name.js';
 import { getAgentById } from './agents.js';
+import * as compaction from './compaction.js';
+import type { Broker } from './broker.js';

 const BASE_SYSTEM_PROMPT = (projectPath: string) =>
  `You are BooCode Chat, a code investigation assistant. The user is working on a project located at ${projectPath}. Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed. Be concise. Cite file paths and line numbers when discussing code. Do not hallucinate file contents — read the file first. Tool results may be truncated; if so, narrow your query rather than guessing.`;
@@ -147,6 +149,12 @@ export interface InferenceContext {
  log: FastifyBaseLogger;
  publish: FramePublisher;
  publishUser: (frame: UserStreamFrame) => void;
+  // v1.11: passed through so compaction.process can publish 'compacted'
+  // frames on the same session WS channel useSessionStream subscribes to.
+  // Compaction is the only path that needs the raw broker handle (regular
+  // inference goes through `publish`); keeping a separate field avoids
+  // tempting other code paths into bypassing the session-id binding.
+  broker: Broker;
 }

 // Resolution order: base prompt < agent.system_prompt < user prompt, where
@@ -260,17 +268,48 @@ async function loadContext(
  if (projectRows.length === 0) return null;
  const project = projectRows[0]!;

+  // v1.11: filter compacted messages out of the inference assembly. The GET
+  // /api/sessions/:id/messages endpoint still returns everything (so the UI
+  // can show history with the summary card inline); only LLM payloads skip
+  // compacted rows. compacted_at IS NULL keeps the active summary + tail.
  const history = await sql<Message[]>`
    SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
           tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
    FROM messages
-    WHERE chat_id = ${chatId}
+    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;

  return { session, project, history };
 }

+// v1.11: shared helper called by both finalizeCompletion and executeToolPhase
+// right after they persist token counts. Takes the row the caller got back from
+// its UPDATE ... RETURNING, runs compaction.isOverflow on it, and sets
+// chats.needs_compaction = true on overflow. The next runAssistantTurn invocation
+// acts on the flag. Silent no-op when any count is missing — llama-swap occasionally omits
+// usage on truncated streams, and we'd rather miss one overflow than crash the inference path.
+async function maybeFlagForCompaction(
+  ctx: InferenceContext,
+  chatId: string,
+  updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined,
+): Promise<void> {
+  if (!updated) return; // caller had no row to hand over
+  const promptTokens = updated.ctx_used; // ctx_used is passed on as prompt_tokens
+  const completionTokens = updated.tokens_used; // tokens_used is passed on as completion_tokens
+  const contextLimit = updated.ctx_max; // ctx_max is passed on as the context limit
+  if (typeof promptTokens !== 'number') return;
+  if (typeof completionTokens !== 'number') return;
+  if (typeof contextLimit !== 'number') return;
+  const overflow = compaction.isOverflow(
+    { prompt_tokens: promptTokens, completion_tokens: completionTokens },
+    contextLimit,
+  );
+  if (!overflow) return;
+  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
+  ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction');
+}
+
 async function* sseLines(stream: ReadableStream<Uint8Array>): AsyncGenerator<string> {
  const reader = stream.getReader();
  const decoder = new TextDecoder('utf-8');
@@ -758,6 +797,10 @@ async function executeToolPhase(
    WHERE id = ${assistantMessageId}
    RETURNING tokens_used, ctx_used, ctx_max, finished_at
  `;
+  // v1.11: flag for compaction if this turn pushed us over the usable budget.
+  // We never compact here, mid-tool-phase; this only sets the flag, and the check at
+  // the top of runAssistantTurn (defined below) acts on it on its next invocation.
+  await maybeFlagForCompaction(ctx, chatId, updated);
  const [toolSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
    UPDATE sessions SET updated_at = clock_timestamp()
    WHERE id = ${sessionId}
@@ -889,6 +932,9 @@ async function finalizeCompletion(
    WHERE id = ${assistantMessageId}
    RETURNING tokens_used, ctx_used, ctx_max, finished_at
  `;
+  // v1.11: flag for compaction on the terminal turn too. Catches the common
+  // case of a turn that hit the limit without invoking tools.
+  await maybeFlagForCompaction(ctx, chatId, updated);
  const [completeSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
    UPDATE sessions SET updated_at = clock_timestamp()
    WHERE id = ${sessionId}
@@ -927,6 +973,29 @@ async function runAssistantTurn(
 ): Promise<void> {
  const { sessionId, chatId } = args;

+  // v1.11: if the prior turn flagged this chat for compaction, run it first
+  // so loadContext below reads the post-compaction history. We swallow
+  // compaction failures (clearing the flag so we don't loop) and proceed
+  // with the un-compacted history — a slow turn that hits the model's
+  // hard limit is recoverable; a dead session is not.
+  const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
+    SELECT needs_compaction FROM chats WHERE id = ${chatId}
+  `;
+  if (chatFlag[0]?.needs_compaction) {
+    try {
+      await compaction.process({
+        sql: ctx.sql,
+        config: ctx.config,
+        log: ctx.log,
+        broker: ctx.broker,
+        chatId,
+      });
+    } catch (err) {
+      ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
+      await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
+    }
+  }
+
  const loaded = await loadContext(ctx.sql, sessionId, chatId);
  if (!loaded) {
    ctx.log.warn({ sessionId }, 'inference: session or project missing');
@@ -1237,81 +1306,6 @@ async function insertCapHitSentinel(
  });
 }

-const COMPACT_SYSTEM_PROMPT =
-  'Summarize the preceding conversation into a dense but complete context paragraph. Preserve all key facts, decisions, file paths, code patterns, and action items. Do not add any new information. Output only the summary paragraph.';
-
-async function runCompact(
-  ctx: InferenceContext,
-  sessionId: string,
-  chatId: string,
-  compactMessageId: string
-): Promise<void> {
-  const loaded = await loadContext(ctx.sql, sessionId, chatId);
-  if (!loaded) return;
-  const { session, project, history } = loaded;
-
-  const messagesForSummary = buildMessagesPayload(session, project,
-    history.filter((m) => m.id !== compactMessageId)
-  );
-  messagesForSummary.push({
-    role: 'system',
-    content: COMPACT_SYSTEM_PROMPT,
-  });
-
-  ctx.publish(sessionId, {
-    type: 'message_started',
-    message_id: compactMessageId,
-    chat_id: chatId,
-    role: 'assistant',
-  });
-
-  let content = '';
-  try {
-    const result = await streamCompletion(
-      ctx,
-      session.model,
-      messagesForSummary,
-      { tools: null },
-      (delta) => {
-        content += delta;
-        ctx.publish(sessionId, {
-          type: 'delta',
-          message_id: compactMessageId,
-          chat_id: chatId,
-          content: delta,
-        });
-      }
-    );
-    content = result.content;
-  } catch (err) {
-    const errMsg = err instanceof Error ? err.message : String(err);
-    await ctx.sql`
-      UPDATE messages SET status = 'failed', content = ${content}, finished_at = clock_timestamp()
-      WHERE id = ${compactMessageId}
-    `;
-    ctx.publish(sessionId, {
-      type: 'error',
-      message_id: compactMessageId,
-      chat_id: chatId,
-      error: errMsg,
-    });
-    return;
-  }
-
-  const preCompactCount = history.filter((m) => m.id !== compactMessageId && m.kind !== 'compact').length;
-  const summary = `[Context compacted — ${preCompactCount} messages summarized]\n\n${content}`;
-
-  await ctx.sql`
-    UPDATE messages SET content = ${summary}, status = 'complete', finished_at = clock_timestamp()
-    WHERE id = ${compactMessageId}
-  `;
-  ctx.publish(sessionId, {
-    type: 'message_complete',
-    message_id: compactMessageId,
-    chat_id: chatId,
-  });
-}
-
 interface InferenceRegistration {
  controller: AbortController;
  completed: Promise<void>;
@@ -1328,6 +1322,10 @@ export function createInferenceRunner(
      const callCtx: InferenceContext = {
        ...ctx,
        publishUser: (frame) => publishUserFn(user, frame),
+        // v1.11: broker comes in via ctx (set at registration time). The spread
+        // above already copies it; it is restated here only to make explicit that
+        // the per-call ctx carries it without touching enqueue/cancel signatures.
+        broker: ctx.broker,
      };
      // v1.8 mobile-tabs: announce working before the async loop starts so
      // every device subscribed to the user channel sees the amber dot.
@@ -1357,20 +1355,6 @@ export function createInferenceRunner(
      })();
    },

-    enqueueCompact(sessionId: string, chatId: string, compactMessageId: string, user: string) {
-      const callCtx: InferenceContext = {
-        ...ctx,
-        publishUser: (frame) => publishUserFn(user, frame),
-      };
-      void (async () => {
-        try {
-          await runCompact(callCtx, sessionId, chatId, compactMessageId);
-        } catch (err) {
-          callCtx.log.error({ err }, 'unhandled compact error');
-        }
-      })();
-    },
-
    async cancel(_sessionId: string, chatId: string): Promise<boolean> {
      const reg = registry.get(chatId);
      if (!reg) return false;