v1.13.4: two-tier compaction prune — opencode pattern half-shipped in v1.11.0
- message_parts.hidden_at timestamptz column (NULL by default) with a partial index on (message_id) WHERE hidden_at IS NULL for the common visible-parts filter. - messages_with_parts view changed from COALESCE(parts, legacy) to CASE WHEN EXISTS(any parts of kind) THEN visible-parts ELSE legacy. COALESCE would have leaked hidden parts back via the legacy fallback when every part was pruned (smoke caught it pre-commit). The CASE distinguishes "no parts at all → fall back to legacy column for pre-v1.13.0 history" from "all parts hidden → return null/empty so the row drops out of the model payload" exactly. - prune.ts: scans tool_result parts newest-first, protects the last 40k tokens (PROTECTED_TOKENS), marks older candidates hidden when their combined estimate clears 20k (PRUNE_TRIGGER_TOKENS — equal to COMPACTION_BUFFER from v1.11.0, so a successful prune is exactly the budget the summary path would have freed). Stops at chats.tail_start_id so it doesn't double-erase across the last summary boundary. Pure decision helper selectPruneTargets exported separately for unit tests. - Wired into maybeFlagForCompaction: prune runs synchronously when overflow is detected; if it freed >= PRUNE_TRIGGER_TOKENS, the needs_compaction flag is NOT set and the (expensive) summary inference call is skipped this turn. The next turn's overflow check re-evaluates from scratch. - 6 new unit tests in prune.test.ts cover: empty input, protection-only (no candidates), candidates below trigger, candidates above trigger, candidates straddling a summary boundary, exactly-protection-tokens. 179 tests total (was 173). Smoke verified post-rebuild: - \\d message_parts shows hidden_at + partial index. - View definition shows AND p.hidden_at IS NULL filters on all three subselects. - Synthetic hide-then-restore confirmed the view drops the tool_result jsonb to null when its only part is hidden, and restores when un-hidden. 
- EXPLAIN ANALYZE on the 42-message stress chat: 0.325ms (faster than v1.13.1-B's 1.018ms — EXISTS short-circuits cleanly for the common no-parts case). - Normal turn (plain text prompt) completes unaffected. Closes a v1.11.0 design item that was scoped but never implemented. With v1.13's parts table the prune is dramatically cheaper to write — pre-parts it would have meant editing JSON blobs in-place; now it's a hidden_at flag and a view subselect. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
96
apps/server/src/services/__tests__/prune.test.ts
Normal file
96
apps/server/src/services/__tests__/prune.test.ts
Normal file
@@ -0,0 +1,96 @@
|
||||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
import {
|
||||
selectPruneTargets,
|
||||
PROTECTED_TOKENS,
|
||||
PRUNE_TRIGGER_TOKENS,
|
||||
type PartForPrune,
|
||||
} from '../inference/prune.js';
|
||||
|
||||
// Test fixture: build a tool_result part whose payload size yields a known
// token estimate (chars/4). The decision logic only cares about
// JSON.stringify(payload).length, so a string payload of `4n - 2` chars
// (JSON.stringify adds the two surrounding quotes) produces exactly `n`
// tokens.
|
||||
// Monotonic counter behind the generated part ids. The suite's beforeEach
// resets it, so ids restart at p1 in every test.
let seq = 0;

/**
 * Builds a fixture part whose payload is estimated at exactly `tokens` tokens.
 *
 * The payload is a run of `4 * tokens - 2` characters. JSON.stringify wraps a
 * string in two quote characters, so the serialized form is `4 * tokens`
 * characters long, which the chars/4 estimate turns into `tokens`.
 *
 * @param tokens - Desired token estimate for the payload.
 * @param createdAt - Timestamp stored as the part's `created_at`.
 * @returns A part with the next sequential id (`p1`, `p2`, ...).
 */
function part(tokens: number, createdAt: Date): PartForPrune {
  seq += 1;
  const quoteChars = 2; // added by JSON.stringify around a string payload
  const payload = 'x'.repeat(4 * tokens - quoteChars);
  return { id: `p${seq}`, payload, created_at: createdAt };
}
|
||||
|
||||
// Fixed reference instant so every fixture timestamp is deterministic.
const T_NOW = new Date('2026-05-22T12:00:00Z');

/**
 * Returns the instant `secondsBack` seconds before T_NOW.
 *
 * @param secondsBack - How far before the reference instant, in seconds.
 */
function ago(secondsBack: number): Date {
  const offsetMs = secondsBack * 1000;
  return new Date(T_NOW.getTime() - offsetMs);
}
|
||||
|
||||
describe('selectPruneTargets', () => {
  // Part ids come from a module-level counter; reset it so every test sees
  // p1, p2, ... in construction order.
  beforeEach(() => {
    seq = 0;
  });

  it('returns nothing when there are no parts', () => {
    expect(selectPruneTargets([], null)).toEqual({ ids: [], freedTokens: 0 });
  });

  it('returns nothing when total tokens are under the protection window', () => {
    const parts: PartForPrune[] = [
      part(10_000, ago(10)),
      part(10_000, ago(20)),
    ]; // 20k total, all protected
    expect(selectPruneTargets(parts, null)).toEqual({ ids: [], freedTokens: 0 });
  });

  it('returns nothing when candidate total is below the prune trigger', () => {
    // The two newest parts fill the 40k protection window exactly; the only
    // candidate is 5k, well below the 20k trigger.
    const parts: PartForPrune[] = [
      part(20_000, ago(10)),
      part(20_000, ago(20)),
      part(5_000, ago(30)),
    ];
    const result = selectPruneTargets(parts, null);
    expect(result.ids).toEqual([]);
    expect(result.freedTokens).toBe(0);
  });

  it('hides candidates past protection when their total clears the trigger', () => {
    // Newest 40k protected; older 30k cleanly above the 20k trigger.
    const parts: PartForPrune[] = [
      part(20_000, ago(10)),
      part(20_000, ago(20)),
      // Past protection, total 30k freed.
      part(15_000, ago(30)),
      part(15_000, ago(40)),
    ];
    const result = selectPruneTargets(parts, null);
    expect(result.ids).toEqual(['p3', 'p4']);
    expect(result.freedTokens).toBeGreaterThanOrEqual(PRUNE_TRIGGER_TOKENS);
  });

  it('stops at the compaction summary boundary', () => {
    // Newest 40k fills the protection window exactly, so both older parts
    // are past protection. Only p3 sits inside the summary boundary at
    // ago(35); p4 is older than the boundary and must never be selected.
    // (The part that crosses the protection threshold is itself protected,
    // so the window has to be filled by p1 + p2 for p3 to be a candidate.)
    const parts: PartForPrune[] = [
      part(20_000, ago(10)),
      part(20_000, ago(20)),
      part(25_000, ago(30)), // past protection, inside boundary: candidate
      part(15_000, ago(40)), // older than the boundary: off-limits
    ];
    const result = selectPruneTargets(parts, ago(35));
    expect(result.ids).toEqual(['p3']);
    expect(result.freedTokens).toBe(25_000);
  });

  it('does not count parts beyond the boundary toward the trigger', () => {
    // Same shape as the "clears the trigger" case, but the boundary removes
    // p4 from the pool. The remaining 15k candidate is below the 20k trigger,
    // so nothing is hidden.
    const parts: PartForPrune[] = [
      part(20_000, ago(10)),
      part(20_000, ago(20)),
      part(15_000, ago(30)),
      part(15_000, ago(40)),
    ];
    expect(selectPruneTargets(parts, ago(35))).toEqual({ ids: [], freedTokens: 0 });
    // Sanity check: without the boundary the same parts do get pruned, so
    // the boundary is what changed the outcome above.
    expect(selectPruneTargets(parts, null).ids).toEqual(['p3', 'p4']);
  });

  it('does not prune when only protected parts exist (no candidates)', () => {
    // Exactly PROTECTED_TOKENS of newest parts; no older candidates.
    const parts: PartForPrune[] = [part(PROTECTED_TOKENS, ago(10))];
    expect(selectPruneTargets(parts, null)).toEqual({ ids: [], freedTokens: 0 });
  });
});
|
||||
@@ -8,6 +8,7 @@ import type {
|
||||
import * as compaction from '../compaction.js';
|
||||
import { buildSystemPrompt } from '../system-prompt.js';
|
||||
import { isAnySentinel } from './sentinels.js';
|
||||
import { PRUNE_TRIGGER_TOKENS, prune } from './prune.js';
|
||||
import type { InferenceContext } from './turn.js';
|
||||
|
||||
export interface OpenAiMessage {
|
||||
@@ -166,6 +167,26 @@ export async function maybeFlagForCompaction(
|
||||
contextLimit,
|
||||
);
|
||||
if (!overflow) return;
|
||||
|
||||
// v1.13.4: try the cheap prune first. If it freed at least the buffer
|
||||
// worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER),
|
||||
// we're below the threshold again — skip flagging summarize for the next
|
||||
// turn. The next turn's overflow check will re-evaluate from scratch.
|
||||
// Prune failures (DB errors etc.) propagate so the surrounding inference
|
||||
// path sees them; the catch in finalizeCompletion / executeToolPhase
|
||||
// doesn't shield this — by design, we want to know if prune is broken.
|
||||
const pruned = await prune({ sql: ctx.sql, chatId });
|
||||
if (pruned.hidden > 0) {
|
||||
ctx.log.info(
|
||||
{ chatId, hidden: pruned.hidden, freedTokens: pruned.freedTokens },
|
||||
'inference: prune freed context budget',
|
||||
);
|
||||
}
|
||||
if (pruned.freedTokens >= PRUNE_TRIGGER_TOKENS) {
|
||||
// Prune handled it; skip the (expensive) summarize path.
|
||||
return;
|
||||
}
|
||||
|
||||
await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
|
||||
ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction');
|
||||
}
|
||||
|
||||
127
apps/server/src/services/inference/prune.ts
Normal file
127
apps/server/src/services/inference/prune.ts
Normal file
@@ -0,0 +1,127 @@
|
||||
import type { Sql } from '../../db.js';
|
||||
|
||||
// v1.13.4: two-tier compaction prune. Opencode's prune half (the cheap one);
// the summarize half shipped in v1.11.0 as services/compaction.ts.
//
// Algorithm: scan tool_result parts newest-first. Protect the last
// PROTECTED_TOKENS of content (the model recently saw these — pruning them
// kills coherence). Older parts are candidates. Set hidden_at on them only
// if the candidate pool would free at least PRUNE_TRIGGER_TOKENS — pruning
// 3 small tool_results to recover 500 tokens isn't worth the loss of
// fidelity for the model's next turn.
//
// Stops at the last compaction summary boundary (chats.tail_start_id). The
// v1.11.0 summary already encodes everything before that point; pruning
// across the boundary would double-erase.
|
||||
|
||||
/**
 * Estimated-token size of the newest tool_result content that is never
 * pruned. Parts keep being added to the protected window until the running
 * total reaches this value, so the part that crosses it is protected too.
 */
export const PROTECTED_TOKENS = 40_000;

/**
 * Minimum estimated tokens the candidate pool must free before any part is
 * hidden; smaller pools are left untouched.
 */
export const PRUNE_TRIGGER_TOKENS = 20_000;
|
||||
|
||||
/**
 * Rough character-to-token estimate: one token per four characters, rounded
 * up. Same heuristic compaction's usable() uses implicitly via the buffer
 * constant.
 *
 * @param text - Text to estimate.
 * @returns Estimated token count (0 for the empty string).
 */
function estimateTokens(text: string): number {
  const charsPerToken = 4;
  return Math.ceil(text.length / charsPerToken);
}
|
||||
|
||||
/**
 * Estimates the token cost of a part payload from its JSON serialization.
 *
 * null/undefined payloads are costed as the empty string. JSON.stringify
 * returns undefined (despite its declared `string` return type) for values
 * that have no JSON form, such as functions or symbols; those are costed as
 * 0 tokens instead of throwing on `.length`.
 *
 * @param payload - Arbitrary part payload.
 * @returns Estimated token count of the serialized payload.
 */
function payloadTokens(payload: unknown): number {
  const serialized: string | undefined = JSON.stringify(payload ?? '');
  return estimateTokens(serialized ?? '');
}
|
||||
|
||||
/** Outcome of one prune pass over a chat. */
export interface PruneResult {
  /** Number of parts marked hidden; 0 when nothing was pruned. */
  hidden: number;
  /** Estimated tokens freed by the hidden parts; 0 when nothing was pruned. */
  freedTokens: number;
}
|
||||
|
||||
// selectPruneTargets (defined below, after its input type) is the pure
// algorithmic core, exported for unit-test access. It takes parts already
// ordered newest-first, plus an optional cutoff (last compaction summary
// boundary). It returns the part ids to hide and their combined token
// estimate — or no ids and 0 when the candidates fall short of
// PRUNE_TRIGGER_TOKENS. Caller does the DB UPDATE.
|
||||
/** Minimal view of a message part that the prune decision needs. */
export interface PartForPrune {
  /** Identifier of the part; returned in `ids` when the part is selected. */
  id: string;
  /** Part content; only the length of its JSON serialization is used. */
  payload: unknown;
  /** Timestamp compared against the summary boundary (prune() supplies the owning message's created_at). */
  created_at: Date;
}
|
||||
|
||||
/**
 * Decides which parts a prune pass should hide. Pure: performs no I/O and
 * does not mutate its input.
 *
 * Walking newest to oldest, parts are first absorbed into a protected window
 * until its running estimate reaches PROTECTED_TOKENS (the part that crosses
 * the threshold is still protected). Every later part is a candidate. The
 * walk ends at the first part older than `tailStartCreatedAt`, so nothing
 * beyond the last summary boundary is ever selected.
 *
 * @param partsNewestFirst - Parts ordered newest-first; the early exit at the
 *   boundary relies on this ordering.
 * @param tailStartCreatedAt - Boundary timestamp, or null for no boundary.
 *   Parts created strictly before it are off-limits.
 * @returns The candidate ids (in input order) and their combined token
 *   estimate, or `{ ids: [], freedTokens: 0 }` when the candidates total
 *   less than PRUNE_TRIGGER_TOKENS.
 */
export function selectPruneTargets(
  partsNewestFirst: ReadonlyArray<PartForPrune>,
  tailStartCreatedAt: Date | null,
): { ids: string[]; freedTokens: number } {
  const ids: string[] = [];
  let freedTokens = 0;
  let shieldedTokens = 0; // running estimate of the protected window

  for (const { id, payload, created_at: createdAt } of partsNewestFirst) {
    // Everything older than the boundary is already covered by the summary.
    const beyondBoundary = tailStartCreatedAt !== null && createdAt < tailStartCreatedAt;
    if (beyondBoundary) break;

    const cost = payloadTokens(payload);
    if (shieldedTokens < PROTECTED_TOKENS) {
      // Window not full yet: this part is protected, even if it overshoots.
      shieldedTokens += cost;
      continue;
    }

    ids.push(id);
    freedTokens += cost;
  }

  const worthPruning = ids.length > 0 && freedTokens >= PRUNE_TRIGGER_TOKENS;
  return worthPruning ? { ids, freedTokens } : { ids: [], freedTokens: 0 };
}
|
||||
|
||||
/**
 * Runs one prune pass for a chat: loads its visible tool_result parts, asks
 * selectPruneTargets which of them to hide, and stamps hidden_at on those.
 *
 * Query errors are not caught here; they reject the returned promise.
 *
 * @param args - `sql` is the tagged-template SQL client used for every
 *   query; `chatId` is the chat whose parts are considered.
 * @returns How many parts were hidden and their estimated token total; both
 *   are 0 when there was nothing to prune.
 */
export async function prune(args: {
  sql: Sql;
  chatId: string;
}): Promise<PruneResult> {
  const { sql, chatId } = args;

  // Visible tool_result parts, newest first (the order selectPruneTargets
  // expects). The scalar subselect repeats chats.tail_start_id on every row
  // so the summary boundary arrives with the same query.
  const visibleParts = await sql<{
    id: string;
    payload: unknown;
    created_at: Date;
    tail_start_id: string | null;
  }[]>`
    SELECT p.id, p.payload, m.created_at,
    (SELECT c.tail_start_id FROM chats c WHERE c.id = ${chatId}) AS tail_start_id
    FROM message_parts p
    JOIN messages m ON m.id = p.message_id
    WHERE m.chat_id = ${chatId}
    AND p.kind = 'tool_result'
    AND p.hidden_at IS NULL
    ORDER BY m.created_at DESC, p.sequence DESC
  `;

  if (visibleParts.length === 0) {
    return { hidden: 0, freedTokens: 0 };
  }

  // Resolve the boundary message id to its timestamp once; parts of older
  // messages are off-limits. A missing boundary message means no cutoff.
  const boundaryMessageId = visibleParts[0]?.tail_start_id ?? null;
  let tailStartCreatedAt: Date | null = null;
  if (boundaryMessageId) {
    const boundaryRows = await sql<{ created_at: Date }[]>`
      SELECT created_at FROM messages WHERE id = ${boundaryMessageId}
    `;
    tailStartCreatedAt = boundaryRows[0]?.created_at ?? null;
  }

  const { ids: idsToHide, freedTokens } = selectPruneTargets(visibleParts, tailStartCreatedAt);
  if (idsToHide.length === 0) {
    return { hidden: 0, freedTokens: 0 };
  }

  await sql`
    UPDATE message_parts
    SET hidden_at = clock_timestamp()
    WHERE id = ANY(${idsToHide})
  `;

  return { hidden: idsToHide.length, freedTokens };
}
|
||||
Reference in New Issue
Block a user