v1.13.4: two-tier compaction prune — opencode pattern half-shipped in v1.11.0

- message_parts.hidden_at timestamptz column (NULL by default) with a partial index on (message_id) WHERE hidden_at IS NULL for the common visible-parts filter.
- messages_with_parts view changed from COALESCE(parts, legacy) to CASE WHEN EXISTS(any parts of kind) THEN visible-parts ELSE legacy. COALESCE would have leaked hidden parts back via the legacy fallback when every part was pruned (smoke caught it pre-commit). The CASE distinguishes "no parts at all → fall back to legacy column for pre-v1.13.0 history" from "all parts hidden → return null/empty so the row drops out of the model payload" exactly.
- prune.ts: scans tool_result parts newest-first, protects the last 40k tokens (PROTECTED_TOKENS), marks older candidates hidden when their combined estimate clears 20k (PRUNE_TRIGGER_TOKENS — equal to COMPACTION_BUFFER from v1.11.0, so a successful prune is exactly the budget the summary path would have freed). Stops at chats.tail_start_id so it doesn't double-erase across the last summary boundary. Pure decision helper selectPruneTargets exported separately for unit tests.
- Wired into maybeFlagForCompaction: prune runs synchronously when overflow is detected; if it freed >= PRUNE_TRIGGER_TOKENS, the needs_compaction flag is NOT set and the (expensive) summary inference call is skipped this turn. The next turn's overflow check re-evaluates from scratch.
- 6 new unit tests in prune.test.ts cover: empty input, protection-only (no candidates), candidates below trigger, candidates above trigger, candidates straddling a summary boundary, exactly-protection-tokens. 179 tests total (was 173).

Smoke verified post-rebuild:
- \d message_parts shows hidden_at + partial index.
- View definition shows AND p.hidden_at IS NULL filters on all three subselects.
- Synthetic hide-then-restore confirmed the view drops the tool_result jsonb to null when its only part is hidden, and restores when un-hidden.
- EXPLAIN ANALYZE on the 42-message stress chat: 0.325ms (faster than v1.13.1-B's 1.018ms — EXISTS short-circuits cleanly for the common no-parts case).
- Normal turn (plain text prompt) completes unaffected.

Closes a v1.11.0 design item that was scoped but never implemented. With v1.13's parts table the prune is dramatically cheaper to write — pre-parts it would have meant editing JSON blobs in-place; now it's a hidden_at flag and a view subselect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
v1.13.3: cleanup bundle — statement timeout + alpha ordering + stuck-row sweeper + repairToolCall
2026-05-22 07:02:17 +00:00 · 2026-05-22 06:46:03 +00:00
8 changed files with 388 additions and 18 deletions
--- a/apps/server/src/index.ts
+++ b/apps/server/src/index.ts
@@ -201,6 +201,46 @@ async function main() {
    app.log.info(`serving static frontend from ${webDist}`);
  }

+  // v1.13.3: in-process sweeper for streaming rows orphaned by a mid-session
+  // crash. The boot sweep (above) only fires once at startup; this timer
+  // covers the in-flight case with the same 5-minute threshold, checked on a
+  // 60s cadence. Staleness is judged by created_at alone: any row still
+  // marked 'streaming' five minutes after it was created is set to 'failed'.
+  // Each affected chat then gets one chat_status='idle' event on the user
+  // channel so the UI dot drops without a refresh — same pattern as
+  // handleAbortOrError.
+  const SWEEP_INTERVAL_MS = 60_000;
+  const sweepStaleStreaming = async (): Promise<void> => {
+    try {
+      const staleRows = await sql<{ id: string; chat_id: string }[]>`
+        UPDATE messages
+        SET status = 'failed', finished_at = clock_timestamp()
+        WHERE status = 'streaming'
+          AND created_at < NOW() - INTERVAL '5 minutes'
+        RETURNING id, chat_id
+      `;
+      if (staleRows.length === 0) return;
+      app.log.warn(
+        { swept: staleRows.length, ids: staleRows.map((r) => r.id) },
+        'swept stale streaming rows',
+      );
+      // Several swept messages may belong to one chat; publish once per chat.
+      // A Set iterates in first-insertion order, and every event in a sweep
+      // shares the same timestamp.
+      const affectedChats = new Set<string>(staleRows.map((r) => r.chat_id));
+      const now = new Date().toISOString();
+      for (const chatId of affectedChats) {
+        broker.publishUser('default', {
+          type: 'chat_status',
+          chat_id: chatId,
+          status: 'idle',
+          at: now,
+        });
+      }
+    } catch (err) {
+      // Best-effort: a failed sweep is logged and retried on the next tick.
+      app.log.error({ err }, 'stuck-row sweeper failed');
+    }
+  };
+  // `void` marks the promise as intentionally not awaited; the sweeper
+  // catches its own errors. The timer is cleared when the app closes.
+  const sweepTimer = setInterval(() => { void sweepStaleStreaming(); }, SWEEP_INTERVAL_MS);
+  app.addHook('onClose', async () => { clearInterval(sweepTimer); });
+
  const shutdown = async (signal: string) => {
    app.log.info(`received ${signal}, shutting down`);
    try {
--- a/apps/server/src/schema.sql
+++ b/apps/server/src/schema.sql
@@ -1,3 +1,10 @@
+-- v1.13.3: statement_timeout is set at database level via:
+--   ALTER DATABASE boocode SET statement_timeout = '30s';
+-- ALTER DATABASE can't run inside a DO block, so this is an operational
+-- step rather than schema. Re-apply after a volume reset: the setting is
+-- stored in the pg_db_role_setting catalog inside the data volume, so it
+-- survives `docker compose up --build` but NOT a
+-- `docker volume rm boocode_pgdata`.
+
 CREATE TABLE IF NOT EXISTS projects (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  name TEXT NOT NULL,
@@ -49,6 +56,24 @@ CREATE TABLE IF NOT EXISTS message_parts (
 );
 CREATE INDEX IF NOT EXISTS message_parts_msg_seq_idx ON message_parts (message_id, sequence);

+-- v1.13.4: prune support. hidden_at marks parts that have been pruned out
+-- of the model payload by the two-tier compaction prune (services/inference/
+-- prune.ts). Rows stay in the DB so frontend can still display them with a
+-- "hidden" indicator (out of scope this dispatch). messages_with_parts
+-- view filters these out — see below.
+-- ADD COLUMN IF NOT EXISTS keeps the migration idempotent in a single
+-- statement, the same form the messages column migrations in this file use,
+-- and resolves the table through search_path like every other statement
+-- here rather than through a catalog lookup by bare table name.
+ALTER TABLE message_parts ADD COLUMN IF NOT EXISTS hidden_at timestamptz NULL;
+-- Partial index over visible (non-hidden) parts only, for the common
+-- "visible parts only" filter.
+CREATE INDEX IF NOT EXISTS message_parts_hidden_idx
+  ON message_parts (message_id) WHERE hidden_at IS NULL;
+
 -- v1.13.1-B: read-path view. Read sites SELECT FROM messages_with_parts
 -- instead of messages so tool_calls / tool_results / reasoning_parts come
 -- from the granular message_parts table. The COALESCE means pre-v1.13.0
@@ -66,23 +91,32 @@ SELECT
  m.last_seq, m.tokens_used, m.ctx_used, m.ctx_max,
  m.started_at, m.finished_at, m.created_at, m.metadata,
  m.summary, m.tail_start_id, m.compacted_at,
-  COALESCE(
+  -- v1.13.4: prune semantics need to distinguish "no parts row exists"
+  -- (pre-v1.13.0 fallback to legacy column) from "all parts hidden"
+  -- (prune intended — return null/empty so the row drops from the model
+  -- payload). A naive COALESCE would fall back to the legacy column when
+  -- every part is hidden, undoing the prune. CASE on EXISTS(any kind)
+  -- splits the two cases.
+  CASE
+    WHEN EXISTS (SELECT 1 FROM message_parts pp
+                  WHERE pp.message_id = m.id AND pp.kind = 'tool_call')
+    THEN (SELECT jsonb_agg(p.payload ORDER BY p.sequence)
+            FROM message_parts p
+           WHERE p.message_id = m.id AND p.kind = 'tool_call' AND p.hidden_at IS NULL)
+    ELSE m.tool_calls
+  END AS tool_calls,
+  CASE
+    WHEN EXISTS (SELECT 1 FROM message_parts pp
+                  WHERE pp.message_id = m.id AND pp.kind = 'tool_result')
+    THEN (SELECT p.payload
+            FROM message_parts p
+           WHERE p.message_id = m.id AND p.kind = 'tool_result' AND p.hidden_at IS NULL
+           ORDER BY p.sequence LIMIT 1)
+    ELSE m.tool_results
+  END AS tool_results,
  (SELECT jsonb_agg(p.payload ORDER BY p.sequence)
     FROM message_parts p
-      WHERE p.message_id = m.id AND p.kind = 'tool_call'),
-    m.tool_calls
-  ) AS tool_calls,
-  COALESCE(
-    (SELECT p.payload
-       FROM message_parts p
-      WHERE p.message_id = m.id AND p.kind = 'tool_result'
-      ORDER BY p.sequence
-      LIMIT 1),
-    m.tool_results
-  ) AS tool_results,
-  (SELECT jsonb_agg(p.payload ORDER BY p.sequence)
-     FROM message_parts p
-    WHERE p.message_id = m.id AND p.kind = 'reasoning') AS reasoning_parts
+    WHERE p.message_id = m.id AND p.kind = 'reasoning' AND p.hidden_at IS NULL) AS reasoning_parts
 FROM messages m;

 ALTER TABLE messages ADD COLUMN IF NOT EXISTS tokens_used INTEGER;
--- a/apps/server/src/services/tests/prune.test.ts
+++ b/apps/server/src/services/tests/prune.test.ts
@@ -0,0 +1,96 @@
+import { describe, it, expect, beforeEach } from 'vitest';
+import {
+  selectPruneTargets,
+  PROTECTED_TOKENS,
+  PRUNE_TRIGGER_TOKENS,
+  type PartForPrune,
+} from '../inference/prune.js';
+
+// Test fixture: build a tool_result part whose payload size yields a known
+// token estimate (chars/4). The decision logic only cares about
+// JSON.stringify(payload).length, so a payload that stringifies to `4n`
+// chars produces exactly `n` tokens.
+let seq = 0;
+function part(tokens: number, createdAt: Date): PartForPrune {
+  seq += 1;
+  // JSON.stringify on a string of plain 'x' characters only adds the two
+  // surrounding quotes, so a text of 4*tokens - 2 chars stringifies to
+  // exactly 4*tokens chars and Math.ceil(length / 4) === tokens. The
+  // estimate is exact, not approximate. Requires tokens >= 1: repeat()
+  // throws a RangeError on the negative count that tokens = 0 would give.
+  const text = 'x'.repeat(tokens * 4 - 2);
+  return { id: `p${seq}`, payload: text, created_at: createdAt };
+}
+
+// Fixed reference instant so every fixture timestamp is deterministic.
+const T_NOW = new Date('2026-05-22T12:00:00Z');
+// Returns the instant `secondsBack` seconds before T_NOW.
+function ago(secondsBack: number): Date {
+  const offsetMs = secondsBack * 1000;
+  return new Date(T_NOW.getTime() - offsetMs);
+}
+
+describe('selectPruneTargets', () => {
+  // part() numbers ids from a module-level counter; reset it so each test
+  // sees p1, p2, ... in construction order.
+  beforeEach(() => {
+    seq = 0;
+  });
+
+  it('returns nothing when there are no parts', () => {
+    expect(selectPruneTargets([], null)).toEqual({ ids: [], freedTokens: 0 });
+  });
+
+  it('returns nothing when total tokens are under the protection window', () => {
+    const parts: PartForPrune[] = [
+      part(10_000, ago(10)),
+      part(10_000, ago(20)),
+    ]; // 20k total, all protected
+    expect(selectPruneTargets(parts, null)).toEqual({ ids: [], freedTokens: 0 });
+  });
+
+  it('returns nothing when candidate total is below the prune trigger', () => {
+    // Protection fills with the 40k newest, candidates only 5k. Below 20k trigger.
+    const parts: PartForPrune[] = [
+      part(20_000, ago(10)),
+      part(20_000, ago(20)),
+      // Past protection; a 5k total won't trigger.
+      part(5_000, ago(30)),
+    ];
+    const result = selectPruneTargets(parts, null);
+    expect(result.ids).toEqual([]);
+    expect(result.freedTokens).toBe(0);
+  });
+
+  it('hides candidates past protection when their total clears the trigger', () => {
+    // Newest 40k protected; older 30k cleanly above the 20k trigger.
+    const parts: PartForPrune[] = [
+      part(20_000, ago(10)),
+      part(20_000, ago(20)),
+      // Past protection, total 30k freed.
+      part(15_000, ago(30)),
+      part(15_000, ago(40)),
+    ];
+    const result = selectPruneTargets(parts, null);
+    expect(result.ids).toEqual(['p3', 'p4']);
+    expect(result.freedTokens).toBeGreaterThanOrEqual(PRUNE_TRIGGER_TOKENS);
+  });
+
+  it('stops at the compaction summary boundary', () => {
+    // Newest 40k fills the protection window; the two older 15k parts are
+    // candidates. With no boundary both are hidden (30k clears the 20k
+    // trigger), so the boundary is the only thing that can change the
+    // outcome below.
+    const parts: PartForPrune[] = [
+      part(20_000, ago(10)),
+      part(20_000, ago(20)), // reaches PROTECTED_TOKENS; still protected
+      part(15_000, ago(30)), // candidate
+      part(15_000, ago(40)), // older than the ago(35) boundary
+    ];
+    expect(selectPruneTargets(parts, null).ids).toEqual(['p3', 'p4']);
+
+    // Boundary at ago(35): the scan stops before the ago(40) part, leaving a
+    // single 15k candidate — below the trigger, so nothing is hidden.
+    expect(selectPruneTargets(parts, ago(35))).toEqual({ ids: [], freedTokens: 0 });
+
+    // A boundary older than every part excludes nothing.
+    expect(selectPruneTargets(parts, ago(50)).ids).toEqual(['p3', 'p4']);
+  });
+
+  it('does not prune when only protected parts exist (no candidates)', () => {
+    // Exactly PROTECTED_TOKENS of newest parts; no older candidates.
+    const parts: PartForPrune[] = [part(PROTECTED_TOKENS, ago(10))];
+    expect(selectPruneTargets(parts, null)).toEqual({ ids: [], freedTokens: 0 });
+  });
+});
--- a/apps/server/src/services/tests/tools.test.ts
+++ b/apps/server/src/services/tests/tools.test.ts
@@ -0,0 +1,14 @@
+import { describe, it, expect } from 'vitest';
+import { ALL_TOOLS } from '../tools.js';
+
+describe('ALL_TOOLS registry', () => {
+  // v1.13.3: the registry must already be alpha-sorted when the module
+  // loads. llama.cpp's prompt cache hits on byte-identical prefixes and the
+  // tool list lives near the top of the system prompt, so any order drift
+  // invalidates every cached turn. The registry sort is the single source of
+  // truth; downstream helpers (toolJsonSchemas, TOOLS_BY_NAME, buildAiTools)
+  // inherit it.
+  it('exports tools in alphabetical order by name', () => {
+    const exported = ALL_TOOLS.map((tool) => tool.name);
+    // Sort a copy so the comparison is against the order as exported.
+    const alphabetical = exported.slice().sort((left, right) => left.localeCompare(right));
+    expect(exported).toEqual(alphabetical);
+  });
+});
--- a/apps/server/src/services/inference/payload.ts
+++ b/apps/server/src/services/inference/payload.ts
@@ -8,6 +8,7 @@ import type {
 import * as compaction from '../compaction.js';
 import { buildSystemPrompt } from '../system-prompt.js';
 import { isAnySentinel } from './sentinels.js';
+import { PRUNE_TRIGGER_TOKENS, prune } from './prune.js';
 import type { InferenceContext } from './turn.js';

 export interface OpenAiMessage {
@@ -166,6 +167,26 @@ export async function maybeFlagForCompaction(
    contextLimit,
  );
  if (!overflow) return;
+
+  // v1.13.4: try the cheap prune first. If it freed at least the buffer
+  // worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER),
+  // we're below the threshold again — skip flagging summarize for the next
+  // turn. The next turn's overflow check will re-evaluate from scratch.
+  // Prune failures (DB errors etc.) propagate so the surrounding inference
+  // path sees them; the catch in finalizeCompletion / executeToolPhase
+  // doesn't shield this — by design, we want to know if prune is broken.
+  const pruned = await prune({ sql: ctx.sql, chatId });
+  if (pruned.hidden > 0) {
+    ctx.log.info(
+      { chatId, hidden: pruned.hidden, freedTokens: pruned.freedTokens },
+      'inference: prune freed context budget',
+    );
+  }
+  if (pruned.freedTokens >= PRUNE_TRIGGER_TOKENS) {
+    // Prune handled it; skip the (expensive) summarize path.
+    return;
+  }
+
  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
  ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction');
 }
--- a/apps/server/src/services/inference/prune.ts
+++ b/apps/server/src/services/inference/prune.ts
@@ -0,0 +1,127 @@
+import type { Sql } from '../../db.js';
+
+// v1.13.4: two-tier compaction prune. Opencode's prune half (the cheap one);
+// summarize half shipped in v1.11.0 as services/compaction.ts.
+//
+// Algorithm: scan tool_result parts newest-first. Protect the last
+// PROTECTED_TOKENS of content (the model recently saw these — pruning them
+// kills coherence). Older parts are candidates. Mark them hidden_at only
+// if the candidate pool would free at least PRUNE_TRIGGER_TOKENS — pruning
+// 3 small tool_results to recover 500 tokens isn't worth the loss of
+// fidelity for the model's next turn.
+//
+// Stops at the last compaction summary boundary (chats.tail_start_id). The
+// v1.11.0 summary already encodes everything before that point; pruning
+// across the boundary would double-erase.
+
+// Newest-first token budget that selectPruneTargets never hides: parts are
+// protected until the running estimate reaches this value (the part that
+// reaches it is itself still protected).
+export const PROTECTED_TOKENS = 40_000;
+// Minimum combined estimate the older candidates must reach before any of
+// them is hidden; below this, nothing is pruned.
+export const PRUNE_TRIGGER_TOKENS = 20_000;
+
+// Rough char-to-token estimate: four characters per token, rounded up.
+// (Per the original note, compaction's usable() relies on the same heuristic
+// implicitly via its buffer constant — not verifiable from this file.)
+function estimateTokens(text: string): number {
+  const charsPerToken = 4;
+  return Math.ceil(text.length / charsPerToken);
+}
+
+// Token estimate for an arbitrary part payload, based on the length of its
+// JSON serialization. null/undefined payloads are measured as the empty
+// string (which serializes to two quote characters).
+function payloadTokens(payload: unknown): number {
+  const serialized = JSON.stringify(payload ?? '');
+  return estimateTokens(serialized);
+}
+
+// Outcome of one prune() pass.
+export interface PruneResult {
+  // Number of parts marked hidden by this pass (0 when nothing was pruned).
+  hidden: number;
+  // Estimated tokens (chars/4) of the parts hidden by this pass.
+  freedTokens: number;
+}
+
+// Input row for selectPruneTargets, the pure decision helper exported for
+// unit-test access. Callers pass these ordered newest-first together with
+// an optional cutoff (last compaction summary boundary); the helper returns
+// the part ids to hide plus their total token estimate, and the caller does
+// the DB UPDATE.
+export interface PartForPrune {
+  // message_parts row id.
+  id: string;
+  // Part payload; only the length of its JSON serialization is used.
+  payload: unknown;
+  // Creation time of the owning message (prune() selects m.created_at).
+  created_at: Date;
+}
+
+/**
+ * Decides which parts a prune pass should hide. Pure: no I/O, no mutation of
+ * the input.
+ *
+ * Walks the parts newest-first. Parts are protected until the running token
+ * estimate reaches PROTECTED_TOKENS (the part that reaches it is protected
+ * too); every later part is a candidate. The walk stops at the first part
+ * created before `tailStartCreatedAt`, because the anchored summary already
+ * covers everything older.
+ *
+ * @param partsNewestFirst Parts ordered newest-first.
+ * @param tailStartCreatedAt Creation time of the last summary boundary, or
+ *   null when there is none.
+ * @returns All candidate ids and their combined estimate when that estimate
+ *   is at least PRUNE_TRIGGER_TOKENS; otherwise `{ ids: [], freedTokens: 0 }`.
+ */
+export function selectPruneTargets(
+  partsNewestFirst: ReadonlyArray<PartForPrune>,
+  tailStartCreatedAt: Date | null,
+): { ids: string[]; freedTokens: number } {
+  const ids: string[] = [];
+  let freedTokens = 0;
+  let protectedSoFar = 0;
+
+  for (const { id, payload, created_at: createdAt } of partsNewestFirst) {
+    // Older than the summary boundary: stop rather than double-erase.
+    if (tailStartCreatedAt !== null && createdAt < tailStartCreatedAt) break;
+
+    const tokens = payloadTokens(payload);
+    if (protectedSoFar < PROTECTED_TOKENS) {
+      // Still filling the protection window; this part stays visible even
+      // if it is the one that pushes the total past the threshold.
+      protectedSoFar += tokens;
+      continue;
+    }
+    ids.push(id);
+    freedTokens += tokens;
+  }
+
+  // All-or-nothing: a small candidate pool is not worth the lost fidelity.
+  if (ids.length === 0 || freedTokens < PRUNE_TRIGGER_TOKENS) {
+    return { ids: [], freedTokens: 0 };
+  }
+  return { ids, freedTokens };
+}
+
+/**
+ * Runs one prune pass for a chat: loads its visible tool_result parts
+ * newest-first, asks selectPruneTargets which of them to hide, and stamps
+ * hidden_at on those rows.
+ *
+ * @param args.sql Database handle used for all three statements.
+ * @param args.chatId Chat whose parts are scanned.
+ * @returns How many parts were hidden and their estimated token total; both
+ *   are 0 when the chat has no visible tool_result parts or the candidates
+ *   do not clear the trigger.
+ * Database errors are not caught here and propagate to the caller.
+ */
+export async function prune(args: {
+  sql: Sql;
+  chatId: string;
+}): Promise<PruneResult> {
+  const { sql, chatId } = args;
+
+  // Visible tool_result parts of this chat, newest message first. Every row
+  // also carries chats.tail_start_id so the summary boundary is known
+  // without a separate chats lookup.
+  const visibleParts = await sql<{
+    id: string;
+    payload: unknown;
+    created_at: Date;
+    tail_start_id: string | null;
+  }[]>`
+    SELECT p.id, p.payload, m.created_at,
+      (SELECT c.tail_start_id FROM chats c WHERE c.id = ${chatId}) AS tail_start_id
+    FROM message_parts p
+    JOIN messages m ON m.id = p.message_id
+    WHERE m.chat_id = ${chatId}
+      AND p.kind = 'tool_result'
+      AND p.hidden_at IS NULL
+    ORDER BY m.created_at DESC, p.sequence DESC
+  `;
+
+  const newest = visibleParts[0];
+  if (newest === undefined) {
+    return { hidden: 0, freedTokens: 0 };
+  }
+
+  // Resolve the boundary message to a timestamp once; parts of messages
+  // created before it are off-limits. A missing boundary row means no cutoff.
+  const boundaryMessageId = newest.tail_start_id ?? null;
+  let tailStartCreatedAt: Date | null = null;
+  if (boundaryMessageId) {
+    const boundaryRows = await sql<{ created_at: Date }[]>`
+      SELECT created_at FROM messages WHERE id = ${boundaryMessageId}
+    `;
+    tailStartCreatedAt = boundaryRows[0]?.created_at ?? null;
+  }
+
+  const { ids, freedTokens } = selectPruneTargets(visibleParts, tailStartCreatedAt);
+  if (ids.length === 0) {
+    return { hidden: 0, freedTokens: 0 };
+  }
+
+  await sql`
+    UPDATE message_parts
+    SET hidden_at = clock_timestamp()
+    WHERE id = ANY(${ids})
+  `;
+  return { hidden: ids.length, freedTokens };
+}
--- a/apps/server/src/services/inference/stream-phase.ts
+++ b/apps/server/src/services/inference/stream-phase.ts
@@ -19,7 +19,14 @@ import type {
  TurnArgs,
 } from './turn.js';
 import { upstreamModel } from './provider.js';
-import { jsonSchema, streamText, tool, type JSONValue, type ModelMessage } from 'ai';
+import {
+  jsonSchema,
+  streamText,
+  tool,
+  type JSONValue,
+  type ModelMessage,
+  type ToolCallRepairFunction,
+} from 'ai';

 interface StreamOptions {
  // null = omit tools entirely (compact phase); [] = caller stripped all tools
@@ -155,10 +162,36 @@ export async function streamCompletion(
  // Replaces the v1.13.1-A counter-only diagnostic.
  let reasoningAccumulated = '';

+  // v1.13.3: experimental_repairToolCall hook for malformed tool calls (bad
+  // JSON args, unknown name, etc.). Intent: keep the stream alive instead of
+  // letting one broken call kill it. The hook repairs nothing — it logs the
+  // failure and hands the call back unmodified, relying on
+  // executeToolPhase's existing error results (unknown tool name →
+  // "unknown tool: X"; zod-reject → tool 'X' rejected — fieldname: required)
+  // to give the model a clean recovery surface on the next turn. The log
+  // line gives visibility into how often qwen3.6 actually emits broken calls.
+  // NOTE(review): the returned call is identical to the one that just failed
+  // validation, so whether it is accepted afterwards depends on the SDK
+  // version — confirm against the installed `ai` release that this hook
+  // changes stream behavior and is not logging-only.
+  const repairToolCall: ToolCallRepairFunction<NonNullable<typeof aiTools>> = async (failure) => {
+    const { toolCall, error } = failure;
+    const { toolCallId, toolName } = toolCall;
+    ctx.log.warn(
+      { toolCallId, toolName, error: error.message },
+      'malformed tool call surfaced via repairToolCall',
+    );
+    return toolCall;
+  };
+
  const result = streamText({
    model: upstreamModel(ctx.config.LLAMA_SWAP_URL, model),
    messages: aiMessages,
-    ...(aiTools ? { tools: aiTools, toolChoice: 'auto' as const } : {}),
+    ...(aiTools
+      ? { tools: aiTools, toolChoice: 'auto' as const, experimental_repairToolCall: repairToolCall }
+      : {}),
    ...(typeof opts.temperature === 'number' ? { temperature: opts.temperature } : {}),
    abortSignal: signal,
  });
--- a/apps/server/src/services/tools.ts
+++ b/apps/server/src/services/tools.ts
@@ -527,6 +527,11 @@ export const askUserInput: ToolDef<AskUserInputInputT> = {
  },
 };

+// v1.13.3: alpha-sorted by tool.name at module load. llama.cpp's prompt
+// cache hits on byte-identical prefixes; the tool list lives near the top
+// of the system prompt, so any order drift would invalidate every cached
+// turn. Single source of truth for ordering lives here — toolJsonSchemas()
+// and TOOLS_BY_NAME inherit it.
 export const ALL_TOOLS: ReadonlyArray<ToolDef<unknown>> = [
  viewFile as ToolDef<unknown>,
  listDir as ToolDef<unknown>,
@@ -553,7 +558,7 @@ export const ALL_TOOLS: ReadonlyArray<ToolDef<unknown>> = [
  watchChanges as ToolDef<unknown>,
  getSemanticNeighborhoods as ToolDef<unknown>,
  getFrameworkAnalysis as ToolDef<unknown>,
-];
+].sort((a, b) => a.name.localeCompare(b.name));

 // v1.8.2: forward-compatible read-only whitelist. An agent whose `tools` is
 // fully contained in this set gets a generous default tool budget (30);