boocode/apps/server/src/services/compaction.ts

// v1.11: anchored rolling compaction. Ported algorithms (not Effect-TS code)
// from opencode (packages/opencode/src/session/{compaction,overflow}.ts).
//
// What's different from BooCode's legacy /compact:
//   - Operates per-chat (chats have N:1 to sessions; history is per-chat).
//   - Detects overflow automatically after each inference completion using
//     llama-swap's reported n_ctx; flags chats.needs_compaction=true.
//   - On the next turn (or manual /compact) we summarize the *head* (messages
//     prior to a preserved tail of N user-turns) into a single
//     summary=true assistant row. Older messages get compacted_at-stamped so
//     inference assembly filters them out; the GET endpoint still returns
//     them so the UI can show history with the summary card inline.
//   - The summary is *anchored rolling* — exactly one live summary=true row
//     per chat. Subsequent compactions read the prior summary as
//     previousSummary, ask the LLM to update-merge it, then mark the prior
//     summary row compacted_at too (it stays in the UI but isn't sent to the
//     LLM again).

import type { FastifyBaseLogger } from 'fastify';
import type { Sql } from '../db.js';
import type { Config } from '../config.js';
import type { Broker } from './broker.js';
import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
import * as modelContextLookup from './model-context.js';

// v1.13.9: ratio-only overflow trigger. Fires compaction at 85% of ctx_max
// (opencode session/overflow.ts pattern). Replaces the v1.11.0-era
// `ctx_max - 20_000` formula which degenerated to 0 for contexts ≤20k and
// gave only 7-8% headroom to the summarizer at 262k. Ratio gives consistent
// 15% headroom at any scale, and small-ctx models no longer get an
// effectively-disabled trigger.
const EARLY_TRIGGER_RATIO = 0.85;
// Lower / upper clamp (in estimated tokens) applied by select() to the budget
// reserved for the verbatim-preserved recent tail.
const MIN_PRESERVE_RECENT_TOKENS = 2_000;
const MAX_PRESERVE_RECENT_TOKENS = 8_000;
// Default for select()'s tailTurns parameter: how many trailing user-turns
// select() tries to keep out of the summarized head.
const DEFAULT_TAIL_TURNS = 2;

// Subset of Message fields compaction touches. Selecting only what's needed
// keeps process() independent of api.ts mutations and reduces DB egress.
export interface CompactionMessage {
  // Row id. turns()/splitTurn() report it as the boundary id, and process()
  // stores it as tail_start_id on the new summary row.
  id: string;
  role: 'user' | 'assistant' | 'system' | 'tool';
  content: string;
  // 'compact' marks a legacy compact row; buildHeadPayload passes those
  // through as system context.
  kind: 'message' | 'compact';
  // true on anchored-summary rows. turns() never starts a turn on one and
  // process() filters them out before select().
  summary: boolean;
  // buildHeadPayload skips assistant rows that are 'streaming' or 'cancelled'.
  status: 'streaming' | 'complete' | 'failed' | 'cancelled';
  // Assistant tool invocations; rendered as OpenAI function calls with the
  // args JSON-encoded.
  tool_calls: Array<{ id: string; name: string; args: Record<string, unknown> }> | null;
  // Result carried by a role='tool' row. When `error` is set it is rendered
  // instead of `output`.
  tool_results: { tool_call_id: string; output: unknown; truncated: boolean; error?: string } | null;
  // v1.13.6: reasoning_parts captured by v1.13.1-C and read back through
  // messages_with_parts. Embedded into the head-assembly payload as prose so
  // the summarizer LLM sees what the model was reasoning through when it
  // chose its tool calls.
  reasoning_parts: Array<{ text: string }> | null;
  // A system row whose metadata.kind === 'cap_hit' is a sentinel that
  // buildHeadPayload drops from the summarizer payload.
  metadata: { kind?: string } | null;
  // Selected straight from the DB; process() orders rows by (created_at, id).
  created_at: string;
}

// === overflow ===

// Token budget at which overflow fires: EARLY_TRIGGER_RATIO of contextLimit,
// rounded down (opencode session/overflow.ts pattern). An unknown limit
// (0, negative, NaN, missing) yields 0, which callers read as "do not trigger
// overflow" — inference keeps flowing rather than compacting a turn we
// cannot size.
export function usable(contextLimit: number): number {
  const limitUnknown = !contextLimit || contextLimit <= 0;
  return limitUnknown ? 0 : Math.floor(EARLY_TRIGGER_RATIO * contextLimit);
}

// Token counts for one completion, as consumed by isOverflow().
export interface Usage {
  // Prompt-side token count; isOverflow() adds it to completion_tokens.
  prompt_tokens: number;
  // Completion-side token count.
  completion_tokens: number;
}

// Reports whether prompt + completion tokens have reached the usable()
// budget for this context limit. With an unknown limit the budget is 0 and
// the answer is always false: we never auto-trigger a compaction we cannot
// size properly — better to keep inference flowing.
export function isOverflow(usage: Usage, contextLimit: number): boolean {
  const threshold = usable(contextLimit);
  return threshold > 0 && usage.prompt_tokens + usage.completion_tokens >= threshold;
}

// === selection ===

// One user-turn inside a message array, as produced by turns().
interface Turn {
  // Index of the user message that opens the turn.
  start: number;
  // Exclusive end index: the next turn's start, or messages.length for the
  // final turn.
  end: number;
  // id of the opening user message.
  id: string;
}

// Rough token estimate: one token per four characters of the JSON-serialized
// messages, rounded up. Mirrors opencode's Token.estimate (which also goes
// through JSON.stringify). Good enough for the tail-fitting math in
// select()/splitTurn(); a real tokenizer isn't needed here.
export function estimate(messages: CompactionMessage[]): number {
  const charsPerToken = 4;
  const serialized = JSON.stringify(messages);
  return Math.ceil(serialized.length / charsPerToken);
}

// Splits the message list into user-turns. Every user message that is NOT a
// summary row opens a turn; the turn runs up to (not including) the next
// such user message, and the last turn runs to messages.length.
export function turns(messages: CompactionMessage[]): Turn[] {
  const found: Turn[] = [];
  for (const [index, message] of messages.entries()) {
    if (message.role !== 'user' || message.summary) continue;
    // A new turn closes the one before it.
    const open = found[found.length - 1];
    if (open !== undefined) open.end = index;
    found.push({ start: index, end: messages.length, id: message.id });
  }
  return found;
}

// For a turn that is too large to keep whole: scan candidate cut points from
// start+1 forward and take the first suffix [cut, turn.end) whose estimate
// fits the remaining budget — scanning forward means the first hit is also
// the largest fitting suffix. Returns the index and id of the first preserved
// message, or undefined when the budget is spent, the turn has a single
// message, or no suffix fits.
function splitTurn(
  messages: CompactionMessage[],
  turn: Turn,
  budget: number,
): { start: number; id: string } | undefined {
  const { start: turnStart, end: turnEnd } = turn;
  if (budget <= 0 || turnEnd - turnStart <= 1) return undefined;

  let cut = turnStart + 1;
  while (cut < turnEnd) {
    const tooBig = estimate(messages.slice(cut, turnEnd)) > budget;
    if (!tooBig) return { start: cut, id: messages[cut]!.id };
    cut += 1;
  }
  return undefined;
}

// Outcome of select(): where the summarized head ends and the preserved tail
// begins.
export interface SelectResult {
  // Messages before the boundary. When select() decides on full-preserve this
  // is the entire input and tail_start_id is undefined.
  head: CompactionMessage[];
  // id of the first preserved (tail) message; undefined means "no compaction
  // this round".
  tail_start_id: string | undefined;
}

// Picks the boundary between the "head" (to be summarized) and the "tail"
// (kept verbatim):
//   1. The tail budget is 25% of usable(contextLimit), clamped to
//      [MIN_PRESERVE_RECENT_TOKENS, MAX_PRESERVE_RECENT_TOKENS].
//   2. Only the last `tailTurns` user-turns are candidates; they are added
//      newest-first while they fit the budget.
//   3. The first turn that does not fit whole is split mid-turn via
//      splitTurn() and the walk stops there.
//   4. If nothing could be kept, or the boundary lands on index 0 (everything
//      fit), the result is full-preserve: head is the whole input and
//      tail_start_id is undefined — no compaction this round.
export function select(
  messages: CompactionMessage[],
  contextLimit: number,
  tailTurns: number = DEFAULT_TAIL_TURNS,
): SelectResult {
  const preserveAll: SelectResult = { head: messages, tail_start_id: undefined };
  if (tailTurns <= 0) return preserveAll;

  const target = Math.floor(usable(contextLimit) * 0.25);
  const tailBudget = Math.min(
    MAX_PRESERVE_RECENT_TOKENS,
    Math.max(MIN_PRESERVE_RECENT_TOKENS, target),
  );

  const userTurns = turns(messages);
  if (userTurns.length === 0) return preserveAll;

  let spent = 0;
  let boundary: { start: number; id: string } | undefined;
  // slice() copies, so reversing here leaves userTurns untouched.
  for (const turn of userTurns.slice(-tailTurns).reverse()) {
    const cost = estimate(messages.slice(turn.start, turn.end));
    if (spent + cost <= tailBudget) {
      spent += cost;
      boundary = { start: turn.start, id: turn.id };
      continue;
    }
    // Turn does not fit whole: keep whatever suffix of it still fits, then
    // stop — older turns are never considered past this point.
    boundary = splitTurn(messages, turn, tailBudget - spent) ?? boundary;
    break;
  }

  if (boundary === undefined || boundary.start === 0) return preserveAll;
  return { head: messages.slice(0, boundary.start), tail_start_id: boundary.id };
}

// === prompt assembly ===

// Assembles the closing user message that asks the model to (re)produce the
// anchored summary. With a previous summary the model is told to update it
// (the old text travels inside <previous-summary> tags); without one it is
// told to create a fresh summary. SUMMARY_TEMPLATE always follows, then any
// `context` entries — reserved for future plugin injection; callers pass []
// today. Sections are separated by blank lines.
export function buildPrompt(
  previousSummary: string | undefined,
  context: string[],
): string {
  const sections: string[] = [];
  if (previousSummary) {
    const updateAnchor = [
      'Update the anchored summary below using the conversation history above.',
      'Preserve still-true details, remove stale details, and merge in the new facts.',
      '<previous-summary>',
      previousSummary,
      '</previous-summary>',
    ];
    sections.push(updateAnchor.join('\n'));
  } else {
    sections.push('Create a new anchored summary from the conversation history above.');
  }
  sections.push(SUMMARY_TEMPLATE, ...context);
  return sections.join('\n\n');
}

// === OpenAI conversion (compaction-local; intentionally does NOT call
// inference.ts buildMessagesPayload because that uses the legacy "find latest
// kind='compact' marker and skip everything before it" short-circuit, which
// would silently drop pre-legacy-compact history before the LLM sees it.
// Compaction wants to send the entire head, full stop.) ===

// v1.13.6: exported for unit-test access (reasoning render coverage).
// Wire shape of one chat message in the request buildHeadPayload assembles
// and callLlamaSwap posts to /v1/chat/completions.
export interface OpenAiMessage {
  role: 'system' | 'user' | 'assistant' | 'tool';
  // null when an assistant message has neither text nor reasoning to render.
  content: string | null;
  // Set only on assistant messages that invoked tools; `arguments` is the
  // JSON-encoded args object.
  tool_calls?: Array<{
    id: string;
    type: 'function';
    function: { name: string; arguments: string };
  }>;
  // Set on role='tool' messages; names the tool call this result answers.
  tool_call_id?: string;
}

// True for a system row whose metadata is tagged kind='cap_hit'; such rows
// are left out of the summarizer payload.
function isCapHitSentinel(m: CompactionMessage): boolean {
  if (m.role !== 'system') return false;
  return m.metadata?.kind === 'cap_hit';
}

// v1.13.6: exported for unit-test access (reasoning render coverage).
//
// Converts the head (the messages about to be summarized) into OpenAI chat
// messages for the summarizer call. Per row, in order of precedence:
//   - cap-hit sentinels and assistant rows still 'streaming' or 'cancelled'
//     are dropped;
//   - legacy kind='compact' rows become system messages;
//   - summary rows become plain assistant messages;
//   - tool rows become tool messages (dropped when tool_results is null);
//   - assistant rows carry their reasoning (wrapped in <reasoning> tags) in
//     front of the content, plus any tool_calls;
//   - every remaining row (user rows, and system rows that are not cap-hit
//     sentinels) is emitted with role 'user'.
export function buildHeadPayload(head: CompactionMessage[]): OpenAiMessage[] {
  const out: OpenAiMessage[] = [];
  for (const m of head) {
    if (isCapHitSentinel(m)) continue;
    if (m.role === 'assistant' && (m.status === 'streaming' || m.status === 'cancelled')) continue;
    if (m.kind === 'compact') {
      // Legacy compact row — pass through as system context. The new
      // anchored summary will subsume it, but the LLM should see it during
      // the bridging round so it can carry forward the still-true bits.
      out.push({ role: 'system', content: m.content });
      continue;
    }
    if (m.summary) {
      // Defense in depth: process() filters these out of the select-input
      // already. If one slips through, render it as assistant content so we
      // never crash here.
      out.push({ role: 'assistant', content: m.content });
      continue;
    }
    if (m.role === 'tool') {
      const tr = m.tool_results;
      if (!tr) continue;
      let outputText: string;
      if (tr.error) {
        outputText = `error: ${tr.error}`;
      } else if (typeof tr.output === 'string') {
        outputText = tr.output;
      } else {
        // JSON.stringify yields undefined (not a string) for an undefined
        // output. Without the fallback the `content` key would vanish from
        // the serialized request and the tool message would be malformed.
        const serialized: string | undefined = JSON.stringify(tr.output);
        outputText = serialized ?? '';
      }
      out.push({ role: 'tool', content: outputText, tool_call_id: tr.tool_call_id });
      continue;
    }
    if (m.role === 'assistant') {
      // v1.13.6: embed reasoning text as prose prefixed onto the assistant
      // content. OpenAI wire shape doesn't carry reasoning as a structured
      // field, but the summarizer is reading text — a tagged prose block
      // gives it the same signal. We mirror the AI SDK ReasoningPart shape
      // by using a <reasoning>...</reasoning> wrapper so the summarizer can
      // distinguish reasoning from user-visible answer.
      let body = m.content && m.content.length > 0 ? m.content : '';
      if (m.reasoning_parts && m.reasoning_parts.length > 0) {
        const reasoning = m.reasoning_parts.map((r) => r.text).join('');
        body = body.length > 0
          ? `<reasoning>${reasoning}</reasoning>\n\n${body}`
          : `<reasoning>${reasoning}</reasoning>`;
      }
      const msg: OpenAiMessage = {
        role: 'assistant',
        // null (not '') when there is nothing to say, e.g. a pure tool-call turn.
        content: body.length > 0 ? body : null,
      };
      if (m.tool_calls && m.tool_calls.length > 0) {
        msg.tool_calls = m.tool_calls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
        }));
      }
      out.push(msg);
      continue;
    }
    out.push({ role: 'user', content: m.content });
  }
  return out;
}

// === llama-swap call ===

// Non-streaming completion. Opencode streams; for a one-shot summary call a
// single POST is less code and the latency hit is acceptable (the user
// doesn't see this directly — useSessionStream emits the toast + refetches
// on the 'compacted' frame).
// What callLlamaSwap extracts from one chat-completions response.
interface CompletionResult {
  // Text of the first choice's message.
  content: string;
  // usage.prompt_tokens from the response; 0 when the field is absent.
  promptTokens: number;
  // usage.completion_tokens from the response; 0 when the field is absent.
  completionTokens: number;
}

// Posts one non-streaming chat completion to llama-swap and returns the first
// choice's text together with the reported token usage (0 when absent).
//
// Throws when the HTTP status is not ok (message carries the status and the
// first 200 chars of the body) and when the completion text is missing or
// whitespace-only: an empty summary must never reach process(), which would
// otherwise store it and stamp the summarized head as compacted, discarding
// that context.
async function callLlamaSwap(
  config: Config,
  model: string,
  messages: OpenAiMessage[],
  log: FastifyBaseLogger,
): Promise<CompletionResult> {
  const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model, messages, stream: false }),
  });
  if (!res.ok) {
    // Best-effort body read; a failure here must not mask the status error.
    const text = await res.text().catch(() => '');
    throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`);
  }
  const json = (await res.json()) as {
    choices?: Array<{ message?: { content?: string } }>;
    usage?: { prompt_tokens?: number; completion_tokens?: number };
  };
  // v1.11.3: removed the dead `json.timings?.n_ctx` read — llama-server's
  // completions don't emit n_ctx in timings. ctx_max on the summary row
  // comes from model-context.getModelContext below in process().
  const content = json.choices?.[0]?.message?.content ?? '';
  const promptTokens = json.usage?.prompt_tokens ?? 0;
  const completionTokens = json.usage?.completion_tokens ?? 0;
  if (content.trim().length === 0) {
    throw new Error('llama-swap returned an empty summary (no completion content)');
  }
  log.debug({ promptTokens, completionTokens, chars: content.length }, 'compaction llm complete');
  return { content, promptTokens, completionTokens };
}

// === entry point ===

// Dependencies and target for one process() run.
export interface ProcessInput {
  // Tagged-template SQL client used for every read and write in process().
  sql: Sql;
  // Supplies LLAMA_SWAP_URL for the summarizer call.
  config: Config;
  log: FastifyBaseLogger;
  // Publishes the chat_status frames and the final 'compacted' frame.
  broker: Broker;
  // The chat to compact.
  chatId: string;
}

// Runs one round of anchored rolling compaction on `chatId`. No-ops cleanly
// (clearing needs_compaction) when there's nothing reasonable to compact.
// Throws on LLM failure — callers decide whether to log+swallow or surface.
//
// Flow: load the chat's live messages → pick head/tail with select() → ask
// the model for an updated summary of the head → insert the new summary row →
// stamp the head and every older live summary row as compacted → clear the
// flag → notify the client. Returns early (after a warning) when the chat or
// its session row is missing.
export async function process(input: ProcessInput): Promise<void> {
  const { sql, config, log, broker, chatId } = input;

  // 1. Resolve chat → session for model + WS publish channel.
  const chatRows = await sql<{ id: string; session_id: string }[]>`
    SELECT id, session_id FROM chats WHERE id = ${chatId}
  `;
  if (chatRows.length === 0) {
    log.warn({ chatId }, 'compaction: chat not found');
    return;
  }
  const chat = chatRows[0]!;
  const sessionId = chat.session_id;

  const sessRows = await sql<{ id: string; model: string }[]>`
    SELECT id, model FROM sessions WHERE id = ${sessionId}
  `;
  if (sessRows.length === 0) {
    log.warn({ chatId, sessionId }, 'compaction: session not found');
    return;
  }
  const session = sessRows[0]!;

  // 2. All currently-active messages in this chat (compacted_at IS NULL).
  // ORDER BY (created_at, id) matches loadContext in inference.ts so the
  // turns() boundary logic sees the same sequence the LLM will.
  // v1.13.1-B: reads tool_calls/tool_results via the parts-merged view so
  // the compaction payload matches what the LLM saw on the original turn.
  // v1.13.6: also pulls reasoning_parts (added in v1.13.1-C) so summaries
  // capture what the model was working through before each tool call.
  const messages = await sql<CompactionMessage[]>`
    SELECT id, role, content, kind, summary, status, tool_calls, tool_results,
           reasoning_parts, metadata, created_at
    FROM messages_with_parts
    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;
  if (messages.length === 0) {
    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
    return;
  }

  // 3. Find the prior anchored summary (newest summary=true row). Its content
  // becomes previousSummary — the anchor in the prompt. Filter it out of the
  // select-input so we don't double-encode (it's already in the anchor text).
  const previousSummary = messages.filter((m) => m.summary).at(-1)?.content;
  const forSelect = messages.filter((m) => !m.summary);

  // 4. Resolve a recent context limit: the newest non-null messages.ctx_max
  // recorded for this chat (assumes the same model is still running). When
  // no row carries one, contextLimit is 0; usable(0) is 0, so select() falls
  // back to the MIN_PRESERVE_RECENT_TOKENS tail budget.
  const ctxRows = await sql<{ ctx_max: number | null }[]>`
    SELECT ctx_max FROM messages
    WHERE chat_id = ${chatId} AND ctx_max IS NOT NULL
    ORDER BY created_at DESC LIMIT 1
  `;
  const contextLimit = ctxRows[0]?.ctx_max ?? 0;

  // 5. Decide head / tail.
  const sel = select(forSelect, contextLimit);
  if (!sel.tail_start_id || sel.head.length === 0) {
    // Full preserve — nothing to compact this round. Clear the flag so we
    // don't loop. (Could happen when the chat is short or the budget swung
    // wider after a model context bump.)
    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
    log.info({ chatId, contextLimit, msgCount: messages.length }, 'compaction: nothing to compact');
    return;
  }

  // 6. Build the OpenAI request: head as user/assistant/tool turns + a final
  // user message carrying buildPrompt(previousSummary, []). No system prompt
  // — matches opencode (`system: []`); the template + anchor are sufficient.
  const headPayload = buildHeadPayload(sel.head);
  const finalUser: OpenAiMessage = { role: 'user', content: buildPrompt(previousSummary, []) };
  const payload = [...headPayload, finalUser];

  log.info(
    {
      chatId,
      contextLimit,
      headLen: sel.head.length,
      tailStartId: sel.tail_start_id,
      hadPrevSummary: previousSummary !== undefined,
    },
    'compaction: invoking model',
  );

  // 6a. Flip the chat dot for the duration of the LLM call + DB writes.
  // v1.13.11-b: publish status='streaming' (the v1.12.1-widened replacement
  // for the dropped 'working' value). Compaction's LLM call has the same
  // semantic as an inference turn for dot-state purposes. The v1.12.1
  // chat_status widening missed this site; v1.13.11's WsFrame Zod schema
  // surfaced the drift via the unknown-enum-value check.
  broker.publishUserFrame('default', {
    type: 'chat_status',
    chat_id: chatId,
    status: 'streaming',
    at: new Date().toISOString(),
  });

  // try/finally so the dot ALWAYS drops back to idle, even if the LLM call
  // throws or a downstream DB write fails. The succeeded flag gates the
  // 'compacted' frame + final log: we only signal completion to the UI when
  // the new summary row actually landed.
  let succeeded = false;
  let newId = '';
  let result: CompletionResult | undefined;
  try {
    // 7. Single completion (no tools). Throws on llama-swap failure.
    result = await callLlamaSwap(config, session.model, payload, log);

    // 7b. v1.11.3: fetch the model's true context window from llama-swap's
    // /upstream/<model>/props (the completion response doesn't carry it).
    // Same pattern as inference.ts; the cache makes repeated calls free.
    const mctx = await modelContextLookup.getModelContext(session.model);
    const nCtx = mctx?.n_ctx ?? null;

    // 8. Insert the new anchored summary row. role='assistant' per spec; the
    // UI distinguishes via summary=true. tail_start_id points at the first
    // preserved tail message so debug surfaces / future tools can reason
    // about the boundary without re-deriving from compacted_at.
    const insertRows = await sql<{ id: string }[]>`
      INSERT INTO messages (
        session_id, chat_id, role, content, kind, status,
        summary, tail_start_id,
        tokens_used, ctx_used, ctx_max,
        created_at, finished_at
      )
      VALUES (
        ${sessionId}, ${chatId}, 'assistant', ${result.content}, 'message', 'complete',
        true, ${sel.tail_start_id},
        ${result.completionTokens}, ${result.promptTokens}, ${nCtx},
        clock_timestamp(), clock_timestamp()
      )
      RETURNING id
    `;
    newId = insertRows[0]!.id;

    // 9. Mark every prior live message (head + prior summary) as compacted.
    // Head rows are bound by "created_at strictly less than tail_start_id's
    // created_at" so the preserved tail stays compacted_at=NULL. Older
    // summary rows are matched explicitly via summary = true: a prior summary
    // is stamped with the time of the previous compaction, which can be LATER
    // than the new tail start (e.g. a manual /compact after fewer than
    // tailTurns new turns). The created_at bound alone would then leave two
    // live summary rows. The new summary row is excluded by id.
    await sql`
      UPDATE messages
      SET compacted_at = clock_timestamp()
      WHERE chat_id = ${chatId}
        AND compacted_at IS NULL
        AND id != ${newId}
        AND (
          summary = true
          OR created_at < (SELECT created_at FROM messages WHERE id = ${sel.tail_start_id})
        )
    `;

    // 10. Clear the flag and bump the chat's updated_at so the sidebar
    // reflects recent activity.
    await sql`
      UPDATE chats
      SET needs_compaction = false, updated_at = clock_timestamp()
      WHERE id = ${chatId}
    `;

    succeeded = true;
  } finally {
    // Always restore the dot. Status='idle' (not 'error') even on failure —
    // the caller logs/re-surfaces the error separately; the dot doesn't
    // need to stay red across reloads for a transient compaction blip.
    broker.publishUserFrame('default', {
      type: 'chat_status',
      chat_id: chatId,
      status: 'idle',
      at: new Date().toISOString(),
    });
  }

  // 11. Tell the client. useSessionStream subscribes to the per-session WS
  // channel; the handler refetches messages (so the new summary row + the
  // compacted_at-stamped older rows render correctly) and fires a sonner
  // toast. Order matters: idle must precede 'compacted' so the dot is
  // already green by the time the refetch toast appears.
  if (succeeded) {
    broker.publishFrame(sessionId, {
      type: 'compacted',
      session_id: sessionId,
      chat_id: chatId,
      summary_message_id: newId,
    });
    log.info(
      {
        chatId,
        newId,
        completionTokens: result?.completionTokens,
        promptTokens: result?.promptTokens,
      },
      'compaction: complete',
    );
  }
}