Files
boocode/apps/server/src/services/__tests__/compaction.test.ts
indifferentketchup bcc89d8adc feat: MistakeTracker + file-provenance ledger (v2.7.4)
Two native-inference hardening features from boocode_code_review_v2 §1 #12.

MistakeTracker: new pure mistake-tracker.ts tracks consecutive heterogeneous
tool failures (kinds surfaced per tool from tool-phase.ts). On 3 in a row the
turn loop soft-nudges (model-facing recovery guidance + mistake_recovery
sentinel + reset), then escalates to stopping the turn (cap-hit-style, Continue
affordance) on a re-trip. Complements doom-loop (identical repeats) + cap-hit.

File-provenance ledger: compaction.ts derives a deterministic ## Files Read list
from the head messages' read-tool calls and injects it into the rolling-summary
prompt so provenance survives compaction (no new table; read-only).

mistake_recovery sentinel: MessageMetadata arm (server + web) + MessageBubble
render branch. Built by 2 parallel agents. Server 545 tests passing (23 new);
build + web tsc clean. Native-inference only. Builds on v2.7.3.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 13:05:03 +00:00

428 lines
16 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { describe, it, expect } from 'vitest';
import {
usable,
isOverflow,
estimate,
turns,
select,
buildPrompt,
buildHeadPayload,
deriveFilesRead,
buildFilesReadContext,
type CompactionMessage,
} from '../compaction.js';
import { SUMMARY_TEMPLATE } from '../compaction-prompt.js';
// ---- fixture ----------------------------------------------------------------
// Tiny constructor for the message shape `compaction.ts` consumes. Default
// values match the post-CP1 schema (summary=false, kind='message', complete).
// Tests that need a summary row pass `summary: true`.
let counter = 0;
function mkMsg(
role: CompactionMessage['role'],
content: string,
overrides: Partial<CompactionMessage> = {},
): CompactionMessage {
counter += 1;
return {
id: `m${counter}`,
role,
content,
kind: 'message',
summary: false,
status: 'complete',
tool_calls: null,
tool_results: null,
reasoning_parts: null,
metadata: null,
created_at: new Date(counter * 1000).toISOString(),
...overrides,
};
}
// ---- usable -----------------------------------------------------------------
// v1.13.9: ratio-only early trigger at 0.85 × contextLimit. Replaces the
// v1.11.0-era `contextLimit - 20_000` math, which degenerated to 0 for
// contexts ≤20k and gave only 7-8% headroom at 262k.
describe('usable() — ratio-only early trigger (v1.13.9)', () => {
it('returns floor(0.85 * limit) for the qwen3.6 daily-driver context', () => {
// floor(0.85 * 262144) = floor(222822.4) = 222822 — 15% headroom for
// the summarizer to do its turn without itself overflowing.
expect(usable(262144)).toBe(222822);
});
it('returns 0.85× for a mid-sized context', () => {
expect(usable(100_000)).toBe(85_000);
});
it('returns 0.85× for a small context (no degenerate 0)', () => {
// floor(0.85 * 8192) = 6963. Under the old formula this returned 0
// (8192 - 20_000 clamped to 0), effectively disabling compaction for
// small-context models. The ratio keeps the trigger active.
expect(usable(8192)).toBe(6963);
});
it('returns 0 for zero or negative contextLimit', () => {
expect(usable(0)).toBe(0);
expect(usable(-1)).toBe(0);
});
});
// ---- isOverflow -------------------------------------------------------------
describe('isOverflow', () => {
it('returns false when usable is 0 (unknown contextLimit)', () => {
expect(isOverflow({ prompt_tokens: 999_999, completion_tokens: 0 }, 0)).toBe(false);
expect(isOverflow({ prompt_tokens: 0, completion_tokens: 999_999 }, -1)).toBe(false);
});
it('returns false at 50% of usable', () => {
// v1.13.9: usable(100k) = 85k → 50% ≈ 42.5k.
expect(isOverflow({ prompt_tokens: 30_000, completion_tokens: 10_000 }, 100_000)).toBe(false);
});
it('returns false just under usable', () => {
// v1.13.9: 84_000 + 999 = 84_999 < 85_000 budget.
expect(isOverflow({ prompt_tokens: 84_000, completion_tokens: 999 }, 100_000)).toBe(false);
});
it('returns true exactly at usable (>=, not strict >)', () => {
// v1.13.9: 85_000 == usable(100_000).
expect(isOverflow({ prompt_tokens: 85_000, completion_tokens: 0 }, 100_000)).toBe(true);
});
it('returns true above usable', () => {
// 50_000 + 40_000 = 90_000 > 85_000.
expect(isOverflow({ prompt_tokens: 50_000, completion_tokens: 40_000 }, 100_000)).toBe(true);
});
});
// ---- estimate ---------------------------------------------------------------
describe('estimate', () => {
it('returns a tiny value for an empty array (JSON.stringify([]) is "[]")', () => {
// Math.ceil('[]'.length / 4) = 1. Documented here so the next reader
// doesn't think "0" is the expected baseline — char-count/4 will never
// be exactly 0 for any JSON-serializable input.
expect(estimate([])).toBe(1);
});
it('scales roughly with content length', () => {
const tiny = estimate([mkMsg('user', 'hi')]);
const big = estimate([mkMsg('user', 'x'.repeat(4000))]);
expect(big).toBeGreaterThan(tiny);
expect(big).toBeGreaterThanOrEqual(1000); // 4000 chars / 4 = 1000 floor
});
it('is deterministic across repeated calls', () => {
const msgs = [mkMsg('user', 'one'), mkMsg('assistant', 'two')];
expect(estimate(msgs)).toBe(estimate(msgs));
});
});
// ---- turns ------------------------------------------------------------------
describe('turns', () => {
it('returns [] for an empty message list', () => {
expect(turns([])).toEqual([]);
});
it('returns one turn for a single user message', () => {
const u = mkMsg('user', 'hi');
const result = turns([u]);
expect(result).toHaveLength(1);
expect(result[0]).toEqual({ start: 0, end: 1, id: u.id });
});
it('returns two turns for user/assistant/user/assistant', () => {
const u1 = mkMsg('user', 'q1');
const a1 = mkMsg('assistant', 'a1');
const u2 = mkMsg('user', 'q2');
const a2 = mkMsg('assistant', 'a2');
const result = turns([u1, a1, u2, a2]);
expect(result).toEqual([
{ start: 0, end: 2, id: u1.id },
{ start: 2, end: 4, id: u2.id },
]);
});
it('extends the final turn end to include trailing non-user messages', () => {
// Spec wording: "user/assistant + trailing system → trailing included
// in last turn's range". Single-turn variant: [user, assistant, system]
// should produce one turn with end=3 (covers all three indices).
const u = mkMsg('user', 'q');
const a = mkMsg('assistant', 'a');
const s = mkMsg('system', 'note');
const result = turns([u, a, s]);
expect(result).toEqual([{ start: 0, end: 3, id: u.id }]);
});
it('skips user rows flagged as summary (anchored-rolling rows)', () => {
// Defense-in-depth — process() pre-filters summary rows, but turns()
// also skips them so a misuse from another caller doesn't create a
// bogus turn boundary on the summary row itself.
const u1 = mkMsg('user', 'q1');
const a1 = mkMsg('assistant', 'a1');
const sum = mkMsg('user', 'rolled-up', { summary: true });
const u2 = mkMsg('user', 'q2');
const result = turns([u1, a1, sum, u2]);
expect(result.map((t) => t.id)).toEqual([u1.id, u2.id]);
});
});
// ---- select -----------------------------------------------------------------
describe('select', () => {
it('returns empty head + undefined tail for an empty message list', () => {
const result = select([], 100_000);
expect(result.head).toEqual([]);
expect(result.tail_start_id).toBeUndefined();
});
it('full-preserves when there are fewer turns than tail_turns', () => {
// 1 turn but tail_turns=2: keep === turn0 → keep.start === 0 →
// sentinel-return path that signals "no compaction this round".
const u = mkMsg('user', 'only');
const a = mkMsg('assistant', 'a');
const result = select([u, a], 100_000, 2);
expect(result.head).toEqual([u, a]);
expect(result.tail_start_id).toBeUndefined();
});
it('keeps the last tail_turns turns when they all fit the budget', () => {
// 3 turns, all small. tail_turns=2 means keep the last 2; head =
// messages[0..turn2.start] = just turn1's content.
const u1 = mkMsg('user', 'q1');
const a1 = mkMsg('assistant', 'a1');
const u2 = mkMsg('user', 'q2');
const a2 = mkMsg('assistant', 'a2');
const u3 = mkMsg('user', 'q3');
const a3 = mkMsg('assistant', 'a3');
const msgs = [u1, a1, u2, a2, u3, a3];
const result = select(msgs, 100_000, 2);
// Turn boundaries: [0,2), [2,4), [4,6). slice(-2) = turns at 2 and 4.
// Walking backward: u3 fits, then u2 fits → keep={start:2, id:u2.id}.
expect(result.tail_start_id).toBe(u2.id);
expect(result.head).toEqual([u1, a1]);
});
it('splits a turn mid-stream when the whole turn would overflow the budget', () => {
// tail_turns=1 so we look only at the most recent turn. Stuff it past
// 8k of content (max preserve budget) and the splitter walks forward
// looking for the largest suffix that fits.
const u1 = mkMsg('user', 'q1');
const a1 = mkMsg('assistant', 'a1');
const u2 = mkMsg('user', 'q2 with a giant payload');
const huge = mkMsg('assistant', 'X'.repeat(40_000)); // ~10k tokens
const smallTail = mkMsg('assistant', 'short answer');
const msgs = [u1, a1, u2, huge, smallTail];
const result = select(msgs, 100_000, 1);
// The split walks from turn.start+1 forward; the first index whose
// [i, end) slice fits the budget becomes the new keep. We don't assert
// a specific id (depends on character math), only that compaction was
// triggered (tail_start_id set, head non-empty) and that the head
// doesn't include the final small message.
expect(result.tail_start_id).toBeDefined();
expect(result.head.length).toBeGreaterThan(0);
expect(result.head).not.toContain(smallTail);
});
it('full-preserves when no split point fits', () => {
// Single oversized turn; splitTurn walks but each suffix is still too
// big. After the loop, keep is undefined → full-preserve sentinel.
// Force this with a sub-buffer context so budget is the floor (2k),
// and a single 40k-char message.
const u = mkMsg('user', 'oversized');
const a = mkMsg('assistant', 'Y'.repeat(40_000));
const result = select([u, a], 30_000, 1);
// v1.13.9: usable(30k) = floor(0.85*30k) = 25500 → budget =
// min(8k, max(2k, floor(25500*0.25))) = min(8k, max(2k, 6375)) = 6375.
// 40k chars ≈ 10k tokens. Still can't fit (10k > 6375).
expect(result.tail_start_id).toBeUndefined();
expect(result.head).toEqual([u, a]);
});
});
// ---- buildPrompt ------------------------------------------------------------
describe('buildPrompt', () => {
it('opens with the "create new" anchor when previousSummary is undefined', () => {
const out = buildPrompt(undefined, []);
expect(out.startsWith('Create a new anchored summary')).toBe(true);
expect(out).toContain(SUMMARY_TEMPLATE);
expect(out).not.toContain('<previous-summary>');
});
it('opens with the "update" anchor and embeds previousSummary verbatim', () => {
const prev = '## Goal\n- finish v1.11 compaction';
const out = buildPrompt(prev, []);
expect(out.startsWith('Update the anchored summary')).toBe(true);
expect(out).toContain('<previous-summary>');
expect(out).toContain(prev);
expect(out).toContain('</previous-summary>');
expect(out).toContain(SUMMARY_TEMPLATE);
});
it('appends extra context strings after the template (reserved for plugin injection)', () => {
const out = buildPrompt(undefined, ['extra-context-line']);
expect(out.endsWith('extra-context-line')).toBe(true);
});
});
// ---- buildHeadPayload (v1.13.6) -----------------------------------------------
describe('buildHeadPayload reasoning render', () => {
it('emits reasoning as a <reasoning> tag prefixed onto the assistant content', () => {
const out = buildHeadPayload([
mkMsg('user', 'show me the file'),
mkMsg('assistant', 'reading it now', {
reasoning_parts: [{ text: 'user wants src/index.ts; I should view it' }],
}),
]);
expect(out).toHaveLength(2);
expect(out[1]!.role).toBe('assistant');
expect(out[1]!.content).toBe(
'<reasoning>user wants src/index.ts; I should view it</reasoning>\n\nreading it now',
);
});
it('emits a standalone <reasoning> tag when reasoning is present but content is empty (tool-call-only turn)', () => {
const out = buildHeadPayload([
mkMsg('assistant', '', {
reasoning_parts: [{ text: 'jumping straight to grep' }],
tool_calls: [{ id: 'c1', name: 'grep', args: { pattern: 'foo' } }],
}),
]);
expect(out).toHaveLength(1);
expect(out[0]!.content).toBe('<reasoning>jumping straight to grep</reasoning>');
expect(out[0]!.tool_calls).toHaveLength(1);
expect(out[0]!.tool_calls![0]!.function.name).toBe('grep');
});
it('joins multiple reasoning parts without separators (matches the streaming concat)', () => {
const out = buildHeadPayload([
mkMsg('assistant', 'final answer', {
reasoning_parts: [{ text: 'first thought ' }, { text: 'second thought' }],
}),
]);
expect(out[0]!.content).toBe(
'<reasoning>first thought second thought</reasoning>\n\nfinal answer',
);
});
it('omits the reasoning tag entirely when reasoning_parts is null or empty', () => {
const out = buildHeadPayload([
mkMsg('assistant', 'plain answer', { reasoning_parts: null }),
mkMsg('assistant', 'other answer', { reasoning_parts: [] }),
]);
expect(out[0]!.content).toBe('plain answer');
expect(out[1]!.content).toBe('other answer');
expect(out[0]!.content).not.toContain('<reasoning>');
expect(out[1]!.content).not.toContain('<reasoning>');
});
});
// ---- buildHeadPayload sentinel stripping (#12) -------------------------------
describe('buildHeadPayload strips all UI sentinels', () => {
it('drops cap_hit, doom_loop, and mistake_recovery system rows', () => {
const out = buildHeadPayload([
mkMsg('user', 'do the thing'),
mkMsg('system', 'budget reached', { metadata: { kind: 'cap_hit' } }),
mkMsg('system', 'looping', { metadata: { kind: 'doom_loop' } }),
mkMsg('system', 'repeated errors', { metadata: { kind: 'mistake_recovery' } }),
mkMsg('assistant', 'answer'),
]);
// Only the user + assistant rows survive; all three sentinels stripped.
expect(out).toHaveLength(2);
expect(out[0]!.role).toBe('user');
expect(out[1]!.role).toBe('assistant');
});
it('keeps a non-sentinel system row (e.g. compact bridge) untouched', () => {
const out = buildHeadPayload([
mkMsg('system', 'legacy compact', { kind: 'compact', metadata: null }),
mkMsg('user', 'q'),
]);
expect(out[0]!.role).toBe('system');
expect(out[0]!.content).toBe('legacy compact');
});
});
// ---- file-provenance ledger (#12, Part B) -----------------------------------
describe('deriveFilesRead', () => {
it('returns [] when the head has no read-tool calls', () => {
expect(deriveFilesRead([mkMsg('user', 'hi'), mkMsg('assistant', 'hello')])).toEqual([]);
});
it('extracts the path arg from view_file / list_dir / grep / find_files', () => {
const head = [
mkMsg('assistant', '', {
tool_calls: [
{ id: 'c1', name: 'view_file', args: { path: 'src/index.ts' } },
{ id: 'c2', name: 'list_dir', args: { path: 'src' } },
{ id: 'c3', name: 'grep', args: { pattern: 'TODO', path: 'apps' } },
{ id: 'c4', name: 'find_files', args: { pattern: '**/*.ts', path: 'lib' } },
],
}),
];
expect(deriveFilesRead(head)).toEqual(['apps', 'lib', 'src', 'src/index.ts']);
});
it('dedupes and sorts paths across multiple assistant turns', () => {
const head = [
mkMsg('assistant', '', { tool_calls: [{ id: 'c1', name: 'view_file', args: { path: 'b.ts' } }] }),
mkMsg('assistant', '', { tool_calls: [{ id: 'c2', name: 'view_file', args: { path: 'a.ts' } }] }),
mkMsg('assistant', '', { tool_calls: [{ id: 'c3', name: 'view_file', args: { path: 'b.ts' } }] }),
];
expect(deriveFilesRead(head)).toEqual(['a.ts', 'b.ts']);
});
it('ignores non-read tools and grep calls without a path arg', () => {
const head = [
mkMsg('assistant', '', {
tool_calls: [
{ id: 'c1', name: 'web_search', args: { query: 'x' } },
{ id: 'c2', name: 'grep', args: { pattern: 'foo' } }, // no path → root, skipped
{ id: 'c3', name: 'view_file', args: { path: 'kept.ts' } },
],
}),
];
expect(deriveFilesRead(head)).toEqual(['kept.ts']);
});
it('ignores read-tool calls on non-assistant rows', () => {
const head = [
mkMsg('user', '', { tool_calls: [{ id: 'c1', name: 'view_file', args: { path: 'nope.ts' } }] }),
];
expect(deriveFilesRead(head)).toEqual([]);
});
});
describe('buildFilesReadContext', () => {
it('returns null when nothing was read (no empty section injected)', () => {
expect(buildFilesReadContext([mkMsg('user', 'hi')])).toBeNull();
});
it('formats a ## Files Read block with sorted bullet paths', () => {
const head = [
mkMsg('assistant', '', {
tool_calls: [
{ id: 'c1', name: 'view_file', args: { path: 'z.ts' } },
{ id: 'c2', name: 'view_file', args: { path: 'a.ts' } },
],
}),
];
expect(buildFilesReadContext(head)).toBe('## Files Read\n- a.ts\n- z.ts');
});
});
describe('SUMMARY_TEMPLATE includes the Files Read section (#12)', () => {
it('declares a ## Files Read section the model must maintain', () => {
expect(SUMMARY_TEMPLATE).toContain('## Files Read');
});
});