v1.13.9: compaction overflow trigger — 0.85 × ctx_max early trigger

Opencode pattern (session/overflow.ts): fire compaction at 85% of ctx_max,
replacing the v1.11.0-era `ctx_max - 20_000` formula.

Old formula: usable = max(0, ctx_max - 20_000)
  - ctx=262144 → trigger at 242144 (92.4%) — only 7.6% headroom
  - ctx=100000 → trigger at 80000 (80.0%)
  - ctx= 32000 → trigger at 12000 (37.5%) — over-eager
  - ctx<=20000 → trigger at 0 — never fires

New formula: usable = floor(0.85 * ctx_max)
  - ctx=262144 → trigger at 222822 (85.0%) — 15% headroom for summarizer
  - ctx=100000 → trigger at 85000 (85.0%)
  - ctx= 32000 → trigger at 27200 (85.0%)
  - ctx=  8192 → trigger at 6963 (85.0%)

The ratio gives consistent headroom at any context scale. The qwen3.6 daily
driver gets ~19k tokens more breathing room before overflow; small-ctx models
no longer degenerate to never-triggering.

usable() was the only consumer of COMPACTION_BUFFER, so that constant is
deleted; the new EARLY_TRIGGER_RATIO constant takes its place. isOverflow()
and the maybeFlagForCompaction() call site at payload.ts:184 are unchanged —
the formula swap is internal to compaction.ts. The payload.ts comment is
touched only to drop the stale COMPACTION_BUFFER reference
(PRUNE_TRIGGER_TOKENS stays at 20k as the prune-freed threshold; it is
independent of the overflow formula).

Tests: 4 new usable() corner cases (262k/100k/8k/zero+negative), plus 5
isOverflow() numbers shifted to match the 85k budget at ctx=100k. 195/195
server tests pass (was 194).

Smoke: ratio math verified by unit tests at all four corners. Live cap-hit
verification is deferred — it requires accumulating >222k tokens in a session
under qwen3.6-35b-a3b-mxfp4 (was >242k pre-fix); it will surface organically
in extended use.
2026-05-22 13:59:14 +00:00
3 changed files with 53 additions and 34 deletions
--- a/apps/server/src/services/tests/compaction.test.ts
+++ b/apps/server/src/services/tests/compaction.test.ts
@@ -41,49 +41,58 @@ function mkMsg(

 // ---- usable -----------------------------------------------------------------

-describe('usable', () => {
-  it('returns 0 when contextLimit is 0', () => {
+// v1.13.9: ratio-only early trigger at 0.85 × contextLimit. Replaces the
+// v1.11.0-era `contextLimit - 20_000` math, which degenerated to 0 for
+// contexts ≤20k and gave only 7-8% headroom at 262k.
+describe('usable() — ratio-only early trigger (v1.13.9)', () => {
+  it('returns floor(0.85 * limit) for the qwen3.6 daily-driver context', () => {
+    // floor(0.85 * 262144) = floor(222822.4) = 222822 — 15% headroom for
+    // the summarizer to do its turn without itself overflowing.
+    expect(usable(262144)).toBe(222822);
+  });
+
+  it('returns 0.85× for a mid-sized context', () => {
+    expect(usable(100_000)).toBe(85_000);
+  });
+
+  it('returns 0.85× for a small context (no degenerate 0)', () => {
+    // floor(0.85 * 8192) = 6963. Under the old formula this returned 0
+    // (8192 - 20_000 clamped to 0), effectively disabling compaction for
+    // small-context models. The ratio keeps the trigger active.
+    expect(usable(8192)).toBe(6963);
+  });
+
+  it('returns 0 for zero or negative contextLimit', () => {
    expect(usable(0)).toBe(0);
-  });
-
-  it('returns 0 when contextLimit is below the 20k buffer', () => {
-    // Math.max(0, x - 20000) clamps the subtraction so we never report
-    // negative headroom. A 10k-context model reports 0 usable, which makes
-    // isOverflow short-circuit to false (correct — we can't size the
-    // compaction with no headroom).
-    expect(usable(10_000)).toBe(0);
-    expect(usable(19_999)).toBe(0);
-    expect(usable(20_000)).toBe(0);
-  });
-
-  it('subtracts the 20k buffer from a normal-sized context window', () => {
-    expect(usable(100_000)).toBe(80_000);
-    expect(usable(32_768)).toBe(12_768);
+    expect(usable(-1)).toBe(0);
  });
 });

 // ---- isOverflow -------------------------------------------------------------

 describe('isOverflow', () => {
-  it('returns false when usable is 0 (unknown / sub-buffer context)', () => {
+  it('returns false when usable is 0 (unknown contextLimit)', () => {
    expect(isOverflow({ prompt_tokens: 999_999, completion_tokens: 0 }, 0)).toBe(false);
-    expect(isOverflow({ prompt_tokens: 0, completion_tokens: 999_999 }, 10_000)).toBe(false);
+    expect(isOverflow({ prompt_tokens: 0, completion_tokens: 999_999 }, -1)).toBe(false);
  });

  it('returns false at 50% of usable', () => {
-    // usable(100k) = 80k → 50% = 40k.
+    // v1.13.9: usable(100k) = 85k, so 50% = 42.5k; 30k + 10k = 40k (~47%) sits below it.
    expect(isOverflow({ prompt_tokens: 30_000, completion_tokens: 10_000 }, 100_000)).toBe(false);
  });

  it('returns false just under usable', () => {
-    expect(isOverflow({ prompt_tokens: 79_000, completion_tokens: 999 }, 100_000)).toBe(false);
+    // v1.13.9: 84_000 + 999 = 84_999 < 85_000 budget.
+    expect(isOverflow({ prompt_tokens: 84_000, completion_tokens: 999 }, 100_000)).toBe(false);
  });

  it('returns true exactly at usable (>=, not strict >)', () => {
-    expect(isOverflow({ prompt_tokens: 80_000, completion_tokens: 0 }, 100_000)).toBe(true);
+    // v1.13.9: 85_000 == usable(100_000).
+    expect(isOverflow({ prompt_tokens: 85_000, completion_tokens: 0 }, 100_000)).toBe(true);
  });

  it('returns true above usable', () => {
+    // 50_000 + 40_000 = 90_000 > 85_000.
    expect(isOverflow({ prompt_tokens: 50_000, completion_tokens: 40_000 }, 100_000)).toBe(true);
  });
 });
@@ -226,8 +235,9 @@ describe('select', () => {
    const u = mkMsg('user', 'oversized');
    const a = mkMsg('assistant', 'Y'.repeat(40_000));
    const result = select([u, a], 30_000, 1);
-    // usable(30k) = 10k → budget = min(8k, max(2k, floor(10k*0.25))) =
-    // min(8k, max(2k, 2500)) = 2500. 40k chars ≈ 10k tokens. Can't fit.
+    // v1.13.9: usable(30k) = floor(0.85*30k) = 25500 → budget =
+    // min(8k, max(2k, floor(25500*0.25))) = min(8k, max(2k, 6375)) = 6375.
+    // 40k chars ≈ 10k tokens. Still can't fit (10k > 6375).
    expect(result.tail_start_id).toBeUndefined();
    expect(result.head).toEqual([u, a]);
  });
--- a/apps/server/src/services/compaction.ts
+++ b/apps/server/src/services/compaction.ts
@@ -23,7 +23,13 @@ import type { Broker } from './broker.js';
 import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
 import * as modelContextLookup from './model-context.js';

-const COMPACTION_BUFFER = 20_000;
+// v1.13.9: ratio-only overflow trigger. Fires compaction at 85% of ctx_max
+// (opencode session/overflow.ts pattern). Replaces the v1.11.0-era
+// `ctx_max - 20_000` formula which degenerated to 0 for contexts ≤20k and
+// gave only 7-8% headroom to the summarizer at 262k. Ratio gives consistent
+// 15% headroom at any scale, and small-ctx models no longer get an
+// effectively-disabled trigger.
+const EARLY_TRIGGER_RATIO = 0.85;
 const MIN_PRESERVE_RECENT_TOKENS = 2_000;
 const MAX_PRESERVE_RECENT_TOKENS = 8_000;
 const DEFAULT_TAIL_TURNS = 2;
@@ -50,13 +56,13 @@ export interface CompactionMessage {

 // === overflow ===

-// Tokens we hold in reserve for the model's response so a near-full context
-// can still produce a useful turn. Mirrors opencode's COMPACTION_BUFFER.
-// Returns 0 when the context limit is unknown (caller treats 0 as "do not
-// trigger overflow"); avoids dividing-by-zero downstream.
+// Returns the token budget at which overflow fires. Triggers compaction at
+// 85% of contextLimit (opencode session/overflow.ts pattern). Returns 0 when
+// the context limit is unknown — caller treats 0 as "do not trigger overflow",
+// keeping inference flowing rather than compacting a turn we can't size.
 export function usable(contextLimit: number): number {
  if (!contextLimit || contextLimit <= 0) return 0;
-  return Math.max(0, contextLimit - COMPACTION_BUFFER);
+  return Math.floor(EARLY_TRIGGER_RATIO * contextLimit);
 }

 export interface Usage {
--- a/apps/server/src/services/inference/payload.ts
+++ b/apps/server/src/services/inference/payload.ts
@@ -199,10 +199,13 @@ export async function maybeFlagForCompaction(
  );
  if (!overflow) return;

-  // v1.13.4: try the cheap prune first. If it freed at least the buffer
-  // worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER),
-  // we're below the threshold again — skip flagging summarize for the next
-  // turn. The next turn's overflow check will re-evaluate from scratch.
+  // v1.13.4: try the cheap prune first. If it freed at least
+  // PRUNE_TRIGGER_TOKENS (20k) worth of context, we're below the threshold
+  // again — skip flagging summarize for the next turn. The next turn's
+  // overflow check will re-evaluate from scratch.
+  // v1.13.9: the overflow trigger above is now 85% of ctx_max (was
+  // ctx_max - 20k). PRUNE_TRIGGER_TOKENS stays at 20k as the prune-freed
+  // threshold — independent of the overflow formula.
  // Prune failures (DB errors etc.) propagate so the surrounding inference
  // path sees them; the catch in finalizeCompletion / executeToolPhase
  // doesn't shield this — by design, we want to know if prune is broken.