diff --git a/apps/server/src/services/__tests__/compaction.test.ts b/apps/server/src/services/__tests__/compaction.test.ts index 0188f89..f65997d 100644 --- a/apps/server/src/services/__tests__/compaction.test.ts +++ b/apps/server/src/services/__tests__/compaction.test.ts @@ -41,49 +41,58 @@ function mkMsg( // ---- usable ----------------------------------------------------------------- -describe('usable', () => { - it('returns 0 when contextLimit is 0', () => { +// v1.13.9: ratio-only early trigger at 0.85 × contextLimit. Replaces the +// v1.11.0-era `contextLimit - 20_000` math, which degenerated to 0 for +// contexts ≤20k and gave only 7-8% headroom at 262k. +describe('usable() — ratio-only early trigger (v1.13.9)', () => { + it('returns floor(0.85 * limit) for the qwen3.6 daily-driver context', () => { + // floor(0.85 * 262144) = floor(222822.4) = 222822 — 15% headroom for + // the summarizer to do its turn without itself overflowing. + expect(usable(262144)).toBe(222822); + }); + + it('returns 0.85× for a mid-sized context', () => { + expect(usable(100_000)).toBe(85_000); + }); + + it('returns 0.85× for a small context (no degenerate 0)', () => { + // floor(0.85 * 8192) = 6963. Under the old formula this returned 0 + // (8192 - 20_000 clamped to 0), effectively disabling compaction for + // small-context models. The ratio keeps the trigger active. + expect(usable(8192)).toBe(6963); + }); + + it('returns 0 for zero or negative contextLimit', () => { expect(usable(0)).toBe(0); - }); - - it('returns 0 when contextLimit is below the 20k buffer', () => { - // Math.max(0, x - 20000) clamps the subtraction so we never report - // negative headroom. A 10k-context model reports 0 usable, which makes - // isOverflow short-circuit to false (correct — we can't size the - // compaction with no headroom). 
- expect(usable(10_000)).toBe(0); - expect(usable(19_999)).toBe(0); - expect(usable(20_000)).toBe(0); - }); - - it('subtracts the 20k buffer from a normal-sized context window', () => { - expect(usable(100_000)).toBe(80_000); - expect(usable(32_768)).toBe(12_768); + expect(usable(-1)).toBe(0); }); }); // ---- isOverflow ------------------------------------------------------------- describe('isOverflow', () => { - it('returns false when usable is 0 (unknown / sub-buffer context)', () => { + it('returns false when usable is 0 (unknown contextLimit)', () => { expect(isOverflow({ prompt_tokens: 999_999, completion_tokens: 0 }, 0)).toBe(false); - expect(isOverflow({ prompt_tokens: 0, completion_tokens: 999_999 }, 10_000)).toBe(false); + expect(isOverflow({ prompt_tokens: 0, completion_tokens: 999_999 }, -1)).toBe(false); }); it('returns false at 50% of usable', () => { - // usable(100k) = 80k → 50% = 40k. + // v1.13.9: usable(100k) = 85k → 40k used ≈ 47% (roughly half). expect(isOverflow({ prompt_tokens: 30_000, completion_tokens: 10_000 }, 100_000)).toBe(false); }); it('returns false just under usable', () => { - expect(isOverflow({ prompt_tokens: 79_000, completion_tokens: 999 }, 100_000)).toBe(false); + // v1.13.9: 84_000 + 999 = 84_999 < 85_000 budget. + expect(isOverflow({ prompt_tokens: 84_000, completion_tokens: 999 }, 100_000)).toBe(false); }); it('returns true exactly at usable (>=, not strict >)', () => { - expect(isOverflow({ prompt_tokens: 80_000, completion_tokens: 0 }, 100_000)).toBe(true); + // v1.13.9: 85_000 == usable(100_000). + expect(isOverflow({ prompt_tokens: 85_000, completion_tokens: 0 }, 100_000)).toBe(true); }); it('returns true above usable', () => { + // 50_000 + 40_000 = 90_000 > 85_000.
expect(isOverflow({ prompt_tokens: 50_000, completion_tokens: 40_000 }, 100_000)).toBe(true); }); }); @@ -226,8 +235,9 @@ describe('select', () => { const u = mkMsg('user', 'oversized'); const a = mkMsg('assistant', 'Y'.repeat(40_000)); const result = select([u, a], 30_000, 1); - // usable(30k) = 10k → budget = min(8k, max(2k, floor(10k*0.25))) = - // min(8k, max(2k, 2500)) = 2500. 40k chars ≈ 10k tokens. Can't fit. + // v1.13.9: usable(30k) = floor(0.85*30k) = 25500 → budget = + // min(8k, max(2k, floor(25500*0.25))) = min(8k, max(2k, 6375)) = 6375. + // 40k chars ≈ 10k tokens. Still can't fit (10k > 6375). expect(result.tail_start_id).toBeUndefined(); expect(result.head).toEqual([u, a]); }); diff --git a/apps/server/src/services/compaction.ts b/apps/server/src/services/compaction.ts index 609decc..b941a7d 100644 --- a/apps/server/src/services/compaction.ts +++ b/apps/server/src/services/compaction.ts @@ -23,7 +23,13 @@ import type { Broker } from './broker.js'; import { SUMMARY_TEMPLATE } from './compaction-prompt.js'; import * as modelContextLookup from './model-context.js'; -const COMPACTION_BUFFER = 20_000; +// v1.13.9: ratio-only overflow trigger. Fires compaction at 85% of ctx_max +// (opencode session/overflow.ts pattern). Replaces the v1.11.0-era +// `ctx_max - 20_000` formula which degenerated to 0 for contexts ≤20k and +// gave only 7-8% headroom to the summarizer at 262k. Ratio gives consistent +// 15% headroom at any scale, and small-ctx models no longer get an +// effectively-disabled trigger. +const EARLY_TRIGGER_RATIO = 0.85; const MIN_PRESERVE_RECENT_TOKENS = 2_000; const MAX_PRESERVE_RECENT_TOKENS = 8_000; const DEFAULT_TAIL_TURNS = 2; @@ -50,13 +56,13 @@ export interface CompactionMessage { // === overflow === -// Tokens we hold in reserve for the model's response so a near-full context -// can still produce a useful turn. Mirrors opencode's COMPACTION_BUFFER. 
-// Returns 0 when the context limit is unknown (caller treats 0 as "do not -// trigger overflow"); avoids dividing-by-zero downstream. +// Returns the token budget at which overflow fires. Triggers compaction at +// 85% of contextLimit (opencode session/overflow.ts pattern). Returns 0 when +// the context limit is unknown — caller treats 0 as "do not trigger overflow", +// keeping inference flowing rather than compacting a turn we can't size. export function usable(contextLimit: number): number { if (!contextLimit || contextLimit <= 0) return 0; - return Math.max(0, contextLimit - COMPACTION_BUFFER); + return Math.floor(EARLY_TRIGGER_RATIO * contextLimit); } export interface Usage { diff --git a/apps/server/src/services/inference/payload.ts b/apps/server/src/services/inference/payload.ts index edba789..0e7f56c 100644 --- a/apps/server/src/services/inference/payload.ts +++ b/apps/server/src/services/inference/payload.ts @@ -199,10 +199,13 @@ export async function maybeFlagForCompaction( ); if (!overflow) return; - // v1.13.4: try the cheap prune first. If it freed at least the buffer - // worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER), - // we're below the threshold again — skip flagging summarize for the next - // turn. The next turn's overflow check will re-evaluate from scratch. + // v1.13.4: try the cheap prune first. If it freed at least + // PRUNE_TRIGGER_TOKENS (20k) worth of context, we assume we're below the threshold + // again — skip flagging summarize for the next turn. The next turn's + // overflow check will re-evaluate from scratch. + // v1.13.9: the overflow trigger above is now 85% of ctx_max (was + // ctx_max - 20k). PRUNE_TRIGGER_TOKENS stays at 20k as the prune-freed + // threshold — independent of the overflow formula. // Prune failures (DB errors etc.) propagate so the surrounding inference // path sees them; the catch in finalizeCompletion / executeToolPhase // doesn't shield this — by design, we want to know if prune is broken.