merge v1.11.3-ctxmax

v1.11.3: fix ctx_max capture via /props endpoint
- llama-server does not emit n_ctx in timings (confirmed empirically); dead code at inference.ts:479 and compaction.ts:300 never fired
- New model-context.ts: cached fetch of /upstream/<model>/props with positive-cache (no TTL) and 60s negative-cache
- Wired into all 4 ctx_max write sites: 3 in inference.ts (executeToolPhase, finalizeCompletion, runCapHitSummary) and 1 in compaction.ts (summary row INSERT)
- AbortController 3s timeout, lenient parsing with sensible defaults
- 12 new vitest cases for the cache module (59 total)
- 7 historical assistant rows backfilled manually (see notes)
2026-05-20 19:29:26 +00:00 · 2026-05-20 19:29:26 +00:00 · 2026-05-20 19:18:27 +00:00 · 2026-05-20 19:05:35 +00:00 · 2026-05-20 19:05:35 +00:00 · 2026-05-20 18:13:55 +00:00
20 changed files with 1572 additions and 141 deletions
--- a/apps/server/src/index.ts
+++ b/apps/server/src/index.ts
@@ -19,6 +19,8 @@ import { registerSkillsRoutes } from './routes/skills.js';
 import { createInferenceRunner } from './services/inference.js';
 import { createBroker } from './services/broker.js';
 import { listSkills } from './services/skills.js';
+import * as compaction from './services/compaction.js';
+import { configureModelContext } from './services/model-context.js';

 async function main() {
  const config = loadConfig();
@@ -47,6 +49,11 @@ async function main() {
  await applySchema(sql);
  app.log.info('database schema applied');

+  // v1.11.3: tell the model-context cache where llama-swap lives. Cache
+  // lookups go to ${LLAMA_SWAP_URL}/upstream/<model>/props to read
+  // default_generation_settings.n_ctx — the value persisted as messages.ctx_max.
+  configureModelContext({ llamaSwapUrl: config.LLAMA_SWAP_URL });
+
  await app.register(fastifyWebsocket);

  app.get('/api/health', async () => {
@@ -81,6 +88,11 @@ async function main() {
      publish: (sessionId, frame) => {
        broker.publish(sessionId, frame as unknown as Record<string, unknown> & { type: string });
      },
+      // v1.11: broker handle for compaction.process to publish 'compacted'
+      // frames on the per-session channel. Inference's regular publish path
+      // is bound to (sessionId, InferenceFrame); compaction publishes a
+      // different frame shape, so it goes through the raw broker.
+      broker,
    },
    (user, frame) => {
      broker.publishUser(user, frame as unknown as Record<string, unknown> & { type: string });
@@ -90,9 +102,13 @@ async function main() {
    enqueueInference: (sessionId, chatId, assistantId, user) => {
      inference.enqueue(sessionId, chatId, assistantId, user);
    },
-    enqueueCompact: (sessionId, chatId, compactId, user) => {
-      inference.enqueueCompact(sessionId, chatId, compactId, user);
-    },
+    // v1.11: synchronous compaction. Awaits the LLM call inside the route's
+    // request lifecycle; the new summary row arrives via the WS 'compacted'
+    // frame published from inside compaction.process. We let the error
+    // bubble up so the route can reply 500 — manual /compact failures
+    // should be loud (the user just clicked a button).
+    runCompaction: (chatId) =>
+      compaction.process({ sql, config, log: app.log, broker, chatId }),
    cancelInference: async (sessionId, chatId) => {
      return inference.cancel(sessionId, chatId);
    },
--- a/apps/server/src/routes/chats.ts
+++ b/apps/server/src/routes/chats.ts
@@ -316,7 +316,8 @@ export function registerChatRoutes(
      }
      const rows = await sql<Message[]>`
        SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
-               tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
+               tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
+               summary, tail_start_id, compacted_at
        FROM messages
        WHERE chat_id = ${req.params.id}
        ORDER BY created_at ASC, id ASC
--- a/apps/server/src/routes/messages.ts
+++ b/apps/server/src/routes/messages.ts
@@ -49,7 +49,12 @@ const AskUserInputArgs = z.object({

 interface MessageHandlers {
  enqueueInference: (sessionId: string, chatId: string, assistantMessageId: string, user: string) => void;
-  enqueueCompact: (sessionId: string, chatId: string, compactMessageId: string, user: string) => void;
+  // v1.11: returns a promise that resolves after compaction.process finishes
+  // (await the LLM call). Throws on failure — the route surfaces a 500.
+  // Replaces the v1.10 enqueueCompact (which fired-and-forgot a kind='compact'
+  // streaming row). The new anchored-rolling strategy inserts a single
+  // summary=true assistant row only after the LLM responds.
+  runCompaction: (chatId: string) => Promise<void>;
  publishUserMessage: (
    sessionId: string,
    chatId: string,
@@ -81,9 +86,15 @@ export function registerMessageRoutes(
        reply.code(404);
        return { error: 'session not found' };
      }
+      // v1.11: returns ALL messages including compacted ones. The UI
+      // distinguishes via the new `summary` flag (renders an accordion
+      // SummaryCard) and shows compacted_at-stamped rows inline for context.
+      // Internal inference assembly filters compacted_at IS NULL separately —
+      // see services/inference.ts loadContext + services/compaction.ts.
      const rows = await sql<Message[]>`
        SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
-               tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
+               tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
+               summary, tail_start_id, compacted_at
        FROM messages
        WHERE session_id = ${req.params.id}
        ORDER BY created_at ASC, id ASC
@@ -251,29 +262,30 @@ export function registerMessageRoutes(
    }
  );

+  // v1.11: manual /compact. Was a streaming kind='compact' row inserted by
+  // this handler; now delegates to the anchored-rolling compaction service.
+  // Synchronous (we await the LLM call) — callers either await or rely on
+  // the 'compacted' WS frame to refresh their view. The response carries
+  // no body of interest; the new summary row arrives via the WS frame.
  app.post<{ Params: { id: string } }>(
    '/api/chats/:id/compact',
    async (req, reply) => {
-      const chatRows = await sql<Chat[]>`
-        SELECT id, session_id FROM chats WHERE id = ${req.params.id} AND status = 'open'
+      const chatRows = await sql<{ id: string }[]>`
+        SELECT id FROM chats WHERE id = ${req.params.id} AND status = 'open'
      `;
      if (chatRows.length === 0) {
        reply.code(404);
        return { error: 'chat not found' };
      }
-      const chat = chatRows[0]!;
-      const sessionId = chat.session_id;
-
-      const [compactMsg] = await sql<{ id: string }[]>`
-        INSERT INTO messages (session_id, chat_id, role, content, kind, status, created_at)
-        VALUES (${sessionId}, ${chat.id}, 'system', '', 'compact', 'streaming', clock_timestamp())
-        RETURNING id
-      `;
-
-      handlers.enqueueCompact(sessionId, chat.id, compactMsg!.id, 'default');
-
-      reply.code(202);
-      return { compact_message_id: compactMsg!.id };
+      try {
+        await handlers.runCompaction(chatRows[0]!.id);
+      } catch (err) {
+        req.log.error({ err, chatId: chatRows[0]!.id }, 'manual compaction failed');
+        reply.code(500);
+        return { error: err instanceof Error ? err.message : 'compaction failed' };
+      }
+      reply.code(200);
+      return { ok: true };
    }
  );

--- a/apps/server/src/routes/ws.ts
+++ b/apps/server/src/routes/ws.ts
@@ -21,9 +21,12 @@ export function registerWebSocket(
        return;
      }

+      // v1.11: snapshot includes compaction fields so MessageBubble can
+      // render the SummaryCard for summary=true rows on first connect.
      const messages = await sql<Message[]>`
        SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
-               tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
+               tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
+               summary, tail_start_id, compacted_at
        FROM messages
        WHERE session_id = ${sessionId}
        ORDER BY created_at ASC, id ASC
--- a/apps/server/src/schema.sql
+++ b/apps/server/src/schema.sql
@@ -179,3 +179,25 @@ INSERT INTO settings (key, value) VALUES ('theme_mode', '"dark"') ON CONFLICT (k
 ALTER TABLE projects ADD COLUMN IF NOT EXISTS default_system_prompt TEXT NOT NULL DEFAULT '';
 ALTER TABLE projects ADD COLUMN IF NOT EXISTS default_web_search_enabled BOOLEAN NOT NULL DEFAULT false;
 ALTER TABLE sessions ADD COLUMN IF NOT EXISTS web_search_enabled BOOLEAN;
+
+-- v1.11: anchored rolling compaction.
+--   compacted_at  — marks rows that are "behind the curtain" of the latest
+--                   summary. Inference assembly filters compacted_at IS NULL;
+--                   the API GET still returns all rows so the UI can show
+--                   history with the summary card inline.
+--   summary       — true on the assistant row that IS the anchored summary.
+--                   At most one row per chat is the "current" summary
+--                   (every prior summary row is itself compacted_at-stamped
+--                   when superseded, leaving one live anchor).
+--   tail_start_id — points at the first preserved message that the summary
+--                   covers up to (exclusive). Lets the UI/debug reason about
+--                   the boundary without re-deriving from compacted_at.
+--   needs_compaction — flag on chats (not sessions) because chat history is
+--                   per-chat; sessions have 1:N chats. Set true post-overflow,
+--                   cleared by compaction.process at the start of the next
+--                   inference turn.
+ALTER TABLE messages ADD COLUMN IF NOT EXISTS compacted_at TIMESTAMPTZ;
+ALTER TABLE messages ADD COLUMN IF NOT EXISTS summary BOOLEAN NOT NULL DEFAULT FALSE;
+ALTER TABLE messages ADD COLUMN IF NOT EXISTS tail_start_id UUID REFERENCES messages(id) ON DELETE SET NULL;
+ALTER TABLE chats ADD COLUMN IF NOT EXISTS needs_compaction BOOLEAN NOT NULL DEFAULT FALSE;
+CREATE INDEX IF NOT EXISTS idx_messages_chat_compacted ON messages (chat_id, compacted_at);
--- a/apps/server/src/services/tests/compaction.test.ts
+++ b/apps/server/src/services/tests/compaction.test.ts
@@ -0,0 +1,258 @@
+import { describe, it, expect } from 'vitest';
+import {
+  usable,
+  isOverflow,
+  estimate,
+  turns,
+  select,
+  buildPrompt,
+  type CompactionMessage,
+} from '../compaction.js';
+import { SUMMARY_TEMPLATE } from '../compaction-prompt.js';
+
+// ---- fixture ----------------------------------------------------------------
+// Tiny constructor for the message shape `compaction.ts` consumes. Default
+// values match the post-CP1 schema (summary=false, kind='message', complete).
+// Tests that need a summary row pass `summary: true`.
+
+let counter = 0;
+function mkMsg(
+  role: CompactionMessage['role'],
+  content: string,
+  overrides: Partial<CompactionMessage> = {},
+): CompactionMessage {
+  counter += 1;
+  return {
+    id: `m${counter}`,
+    role,
+    content,
+    kind: 'message',
+    summary: false,
+    status: 'complete',
+    tool_calls: null,
+    tool_results: null,
+    metadata: null,
+    created_at: new Date(counter * 1000).toISOString(),
+    ...overrides,
+  };
+}
+
+// ---- usable -----------------------------------------------------------------
+
+describe('usable', () => {
+  it('returns 0 when contextLimit is 0', () => {
+    expect(usable(0)).toBe(0);
+  });
+
+  it('returns 0 when contextLimit is below the 20k buffer', () => {
+    // Math.max(0, x - 20000) clamps the subtraction so we never report
+    // negative headroom. A 10k-context model reports 0 usable, which makes
+    // isOverflow short-circuit to false (correct — we can't size the
+    // compaction with no headroom).
+    expect(usable(10_000)).toBe(0);
+    expect(usable(19_999)).toBe(0);
+    expect(usable(20_000)).toBe(0);
+  });
+
+  it('subtracts the 20k buffer from a normal-sized context window', () => {
+    expect(usable(100_000)).toBe(80_000);
+    expect(usable(32_768)).toBe(12_768);
+  });
+});
+
+// ---- isOverflow -------------------------------------------------------------
+
+describe('isOverflow', () => {
+  it('returns false when usable is 0 (unknown / sub-buffer context)', () => {
+    expect(isOverflow({ prompt_tokens: 999_999, completion_tokens: 0 }, 0)).toBe(false);
+    expect(isOverflow({ prompt_tokens: 0, completion_tokens: 999_999 }, 10_000)).toBe(false);
+  });
+
+  it('returns false at 50% of usable', () => {
+    // usable(100k) = 80k → 50% = 40k.
+    expect(isOverflow({ prompt_tokens: 30_000, completion_tokens: 10_000 }, 100_000)).toBe(false);
+  });
+
+  it('returns false just under usable', () => {
+    expect(isOverflow({ prompt_tokens: 79_000, completion_tokens: 999 }, 100_000)).toBe(false);
+  });
+
+  it('returns true exactly at usable (>=, not strict >)', () => {
+    expect(isOverflow({ prompt_tokens: 80_000, completion_tokens: 0 }, 100_000)).toBe(true);
+  });
+
+  it('returns true above usable', () => {
+    expect(isOverflow({ prompt_tokens: 50_000, completion_tokens: 40_000 }, 100_000)).toBe(true);
+  });
+});
+
+// ---- estimate ---------------------------------------------------------------
+
+describe('estimate', () => {
+  it('returns a tiny value for an empty array (JSON.stringify([]) is "[]")', () => {
+    // Math.ceil('[]'.length / 4) = 1. Documented here so the next reader
+    // doesn't think "0" is the expected baseline — char-count/4 will never
+    // be exactly 0 for any JSON-serializable input.
+    expect(estimate([])).toBe(1);
+  });
+
+  it('scales roughly with content length', () => {
+    const tiny = estimate([mkMsg('user', 'hi')]);
+    const big = estimate([mkMsg('user', 'x'.repeat(4000))]);
+    expect(big).toBeGreaterThan(tiny);
+    expect(big).toBeGreaterThanOrEqual(1000); // 4000 chars / 4 = 1000 floor
+  });
+
+  it('is deterministic across repeated calls', () => {
+    const msgs = [mkMsg('user', 'one'), mkMsg('assistant', 'two')];
+    expect(estimate(msgs)).toBe(estimate(msgs));
+  });
+});
+
+// ---- turns ------------------------------------------------------------------
+
+describe('turns', () => {
+  it('returns [] for an empty message list', () => {
+    expect(turns([])).toEqual([]);
+  });
+
+  it('returns one turn for a single user message', () => {
+    const u = mkMsg('user', 'hi');
+    const result = turns([u]);
+    expect(result).toHaveLength(1);
+    expect(result[0]).toEqual({ start: 0, end: 1, id: u.id });
+  });
+
+  it('returns two turns for user/assistant/user/assistant', () => {
+    const u1 = mkMsg('user', 'q1');
+    const a1 = mkMsg('assistant', 'a1');
+    const u2 = mkMsg('user', 'q2');
+    const a2 = mkMsg('assistant', 'a2');
+    const result = turns([u1, a1, u2, a2]);
+    expect(result).toEqual([
+      { start: 0, end: 2, id: u1.id },
+      { start: 2, end: 4, id: u2.id },
+    ]);
+  });
+
+  it('extends the final turn end to include trailing non-user messages', () => {
+    // Spec wording: "user/assistant + trailing system → trailing included
+    // in last turn's range". Single-turn variant: [user, assistant, system]
+    // should produce one turn with end=3 (covers all three indices).
+    const u = mkMsg('user', 'q');
+    const a = mkMsg('assistant', 'a');
+    const s = mkMsg('system', 'note');
+    const result = turns([u, a, s]);
+    expect(result).toEqual([{ start: 0, end: 3, id: u.id }]);
+  });
+
+  it('skips user rows flagged as summary (anchored-rolling rows)', () => {
+    // Defense-in-depth — process() pre-filters summary rows, but turns()
+    // also skips them so a misuse from another caller doesn't create a
+    // bogus turn boundary on the summary row itself.
+    const u1 = mkMsg('user', 'q1');
+    const a1 = mkMsg('assistant', 'a1');
+    const sum = mkMsg('user', 'rolled-up', { summary: true });
+    const u2 = mkMsg('user', 'q2');
+    const result = turns([u1, a1, sum, u2]);
+    expect(result.map((t) => t.id)).toEqual([u1.id, u2.id]);
+  });
+});
+
+// ---- select -----------------------------------------------------------------
+
+describe('select', () => {
+  it('returns empty head + undefined tail for an empty message list', () => {
+    const result = select([], 100_000);
+    expect(result.head).toEqual([]);
+    expect(result.tail_start_id).toBeUndefined();
+  });
+
+  it('full-preserves when there are fewer turns than tail_turns', () => {
+    // 1 turn but tail_turns=2: keep === turn0 → keep.start === 0 →
+    // sentinel-return path that signals "no compaction this round".
+    const u = mkMsg('user', 'only');
+    const a = mkMsg('assistant', 'a');
+    const result = select([u, a], 100_000, 2);
+    expect(result.head).toEqual([u, a]);
+    expect(result.tail_start_id).toBeUndefined();
+  });
+
+  it('keeps the last tail_turns turns when they all fit the budget', () => {
+    // 3 turns, all small. tail_turns=2 means keep the last 2; head =
+    // messages[0..turn2.start] = just turn1's content.
+    const u1 = mkMsg('user', 'q1');
+    const a1 = mkMsg('assistant', 'a1');
+    const u2 = mkMsg('user', 'q2');
+    const a2 = mkMsg('assistant', 'a2');
+    const u3 = mkMsg('user', 'q3');
+    const a3 = mkMsg('assistant', 'a3');
+    const msgs = [u1, a1, u2, a2, u3, a3];
+    const result = select(msgs, 100_000, 2);
+    // Turn boundaries: [0,2), [2,4), [4,6). slice(-2) = turns at 2 and 4.
+    // Walking backward: u3 fits, then u2 fits → keep={start:2, id:u2.id}.
+    expect(result.tail_start_id).toBe(u2.id);
+    expect(result.head).toEqual([u1, a1]);
+  });
+
+  it('splits a turn mid-stream when the whole turn would overflow the budget', () => {
+    // tail_turns=1 so we look only at the most recent turn. Stuff it past
+    // 8k of content (max preserve budget) and the splitter walks forward
+    // looking for the largest suffix that fits.
+    const u1 = mkMsg('user', 'q1');
+    const a1 = mkMsg('assistant', 'a1');
+    const u2 = mkMsg('user', 'q2 with a giant payload');
+    const huge = mkMsg('assistant', 'X'.repeat(40_000)); // ~10k tokens
+    const smallTail = mkMsg('assistant', 'short answer');
+    const msgs = [u1, a1, u2, huge, smallTail];
+    const result = select(msgs, 100_000, 1);
+    // The split walks from turn.start+1 forward; the first index whose
+    // [i, end) slice fits the budget becomes the new keep. We don't assert
+    // a specific id (depends on character math), only that compaction was
+    // triggered (tail_start_id set, head non-empty) and that the head
+    // doesn't include the final small message.
+    expect(result.tail_start_id).toBeDefined();
+    expect(result.head.length).toBeGreaterThan(0);
+    expect(result.head).not.toContain(smallTail);
+  });
+
+  it('full-preserves when no split point fits', () => {
+    // Single oversized turn; splitTurn walks but each suffix is still too
+    // big. After the loop, keep is undefined → full-preserve sentinel.
+    // Force this with a small 30k context (only 10k usable after the 20k
+    // buffer) so the preserve budget is tiny, and a single 40k-char message.
+    const u = mkMsg('user', 'oversized');
+    const a = mkMsg('assistant', 'Y'.repeat(40_000));
+    const result = select([u, a], 30_000, 1);
+    // usable(30k) = 10k → budget = min(8k, max(2k, floor(10k*0.25))) =
+    // min(8k, max(2k, 2500)) = 2500. 40k chars ≈ 10k tokens. Can't fit.
+    expect(result.tail_start_id).toBeUndefined();
+    expect(result.head).toEqual([u, a]);
+  });
+});
+
+// ---- buildPrompt ------------------------------------------------------------
+
+describe('buildPrompt', () => {
+  it('opens with the "create new" anchor when previousSummary is undefined', () => {
+    const out = buildPrompt(undefined, []);
+    expect(out.startsWith('Create a new anchored summary')).toBe(true);
+    expect(out).toContain(SUMMARY_TEMPLATE);
+    expect(out).not.toContain('<previous-summary>');
+  });
+
+  it('opens with the "update" anchor and embeds previousSummary verbatim', () => {
+    const prev = '## Goal\n- finish v1.11 compaction';
+    const out = buildPrompt(prev, []);
+    expect(out.startsWith('Update the anchored summary')).toBe(true);
+    expect(out).toContain('<previous-summary>');
+    expect(out).toContain(prev);
+    expect(out).toContain('</previous-summary>');
+    expect(out).toContain(SUMMARY_TEMPLATE);
+  });
+
+  it('appends extra context strings after the template (reserved for plugin injection)', () => {
+    const out = buildPrompt(undefined, ['extra-context-line']);
+    expect(out.endsWith('extra-context-line')).toBe(true);
+  });
+});
--- a/apps/server/src/services/tests/model-context.test.ts
+++ b/apps/server/src/services/tests/model-context.test.ts
@@ -0,0 +1,205 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+import {
+  configureModelContext,
+  getModelContext,
+  invalidateModelContext,
+} from '../model-context.js';
+
+// ---- fixtures ---------------------------------------------------------------
+
+const TEST_URL = 'http://llama-swap.test:8401';
+
+function mockOkProps(n_ctx: number, total_slots = 1) {
+  return new Response(
+    JSON.stringify({
+      default_generation_settings: { n_ctx },
+      total_slots,
+    }),
+    { status: 200, headers: { 'Content-Type': 'application/json' } },
+  );
+}
+
+beforeEach(() => {
+  invalidateModelContext();
+  configureModelContext({ llamaSwapUrl: TEST_URL });
+});
+
+afterEach(() => {
+  vi.restoreAllMocks();
+  vi.useRealTimers();
+});
+
+// ---- positive cache ---------------------------------------------------------
+
+describe('getModelContext — positive cache', () => {
+  it('returns the parsed body on a 200 with valid shape', async () => {
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(mockOkProps(262_144, 1));
+    const result = await getModelContext('qwen3.6');
+    expect(result).not.toBeNull();
+    expect(result!.n_ctx).toBe(262_144);
+    expect(result!.total_slots).toBe(1);
+    expect(typeof result!.fetched_at).toBe('number');
+    // Verify the URL shape. 'qwen3.6' has no characters that need escaping,
+    // so this pins the path layout only, not any encoding of the model name.
+    expect(fetchSpy).toHaveBeenCalledExactlyOnceWith(
+      `${TEST_URL}/upstream/qwen3.6/props`,
+      expect.objectContaining({ signal: expect.any(AbortSignal) }),
+    );
+  });
+
+  it('serves the second call from cache without refetching', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(262_144));
+    const a = await getModelContext('qwen3.6');
+    const b = await getModelContext('qwen3.6');
+    expect(a).toEqual(b);
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+  });
+
+  it('defaults total_slots to 1 when the server omits it', async () => {
+    // Mirror the docstring claim — total_slots is informational and we don't
+    // reject the response just because it's missing.
+    vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(
+      new Response(JSON.stringify({ default_generation_settings: { n_ctx: 8192 } }), {
+        status: 200,
+      }),
+    );
+    const result = await getModelContext('partial-model');
+    expect(result).not.toBeNull();
+    expect(result!.n_ctx).toBe(8192);
+    expect(result!.total_slots).toBe(1);
+  });
+});
+
+// ---- negative cache (single-shot) ------------------------------------------
+
+describe('getModelContext — negative cache (single failure modes)', () => {
+  it('returns null and negative-caches when default_generation_settings is missing', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(new Response(JSON.stringify({ total_slots: 1 }), { status: 200 }));
+    const result = await getModelContext('broken');
+    expect(result).toBeNull();
+    // Second call within TTL must not refetch.
+    const result2 = await getModelContext('broken');
+    expect(result2).toBeNull();
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+  });
+
+  it('returns null and negative-caches when n_ctx is missing inside default_generation_settings', async () => {
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(
+      new Response(JSON.stringify({ default_generation_settings: {}, total_slots: 1 }), {
+        status: 200,
+      }),
+    );
+    await getModelContext('half-broken');
+    await getModelContext('half-broken');
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+  });
+
+  it('returns null and negative-caches on non-200 (404)', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(new Response('not found', { status: 404 }));
+    const result = await getModelContext('missing-model');
+    expect(result).toBeNull();
+    const result2 = await getModelContext('missing-model');
+    expect(result2).toBeNull();
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+  });
+
+  it('returns null and negative-caches on network error', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockRejectedValueOnce(new TypeError('fetch failed: connect ECONNREFUSED'));
+    const result = await getModelContext('down-upstream');
+    expect(result).toBeNull();
+    const result2 = await getModelContext('down-upstream');
+    expect(result2).toBeNull();
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+  });
+});
+
+// ---- negative cache TTL -----------------------------------------------------
+
+describe('getModelContext — negative cache TTL', () => {
+  it('does NOT refetch when a second call lands within the 60s TTL', async () => {
+    vi.useFakeTimers();
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(new Response('boom', { status: 500 }));
+
+    await getModelContext('flapping');
+    vi.advanceTimersByTime(30_000);
+    await getModelContext('flapping');
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+  });
+
+  it('refetches when the second call lands after the 60s TTL expires', async () => {
+    vi.useFakeTimers();
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(new Response('boom', { status: 500 }))
+      // Recovered upstream on the retry — we expect a positive cache hit
+      // after this fires.
+      .mockResolvedValueOnce(mockOkProps(8192));
+
+    await getModelContext('flapping');
+    vi.advanceTimersByTime(61_000);
+    const result = await getModelContext('flapping');
+    expect(result).not.toBeNull();
+    expect(result!.n_ctx).toBe(8192);
+    expect(fetchSpy).toHaveBeenCalledTimes(2);
+  });
+});
+
+// ---- invalidateModelContext -------------------------------------------------
+
+describe('invalidateModelContext', () => {
+  it('clears a single positive entry by model name', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(8192))
+      .mockResolvedValueOnce(mockOkProps(8192));
+
+    await getModelContext('cleared');
+    invalidateModelContext('cleared');
+    await getModelContext('cleared');
+    expect(fetchSpy).toHaveBeenCalledTimes(2);
+  });
+
+  it('clears ALL entries when called with no arg', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(mockOkProps(8192))
+      .mockResolvedValueOnce(mockOkProps(16_384))
+      // After the full clear, both models re-fetch.
+      .mockResolvedValueOnce(mockOkProps(8192))
+      .mockResolvedValueOnce(mockOkProps(16_384));
+
+    await getModelContext('alpha');
+    await getModelContext('beta');
+    invalidateModelContext();
+    await getModelContext('alpha');
+    await getModelContext('beta');
+    expect(fetchSpy).toHaveBeenCalledTimes(4);
+  });
+
+  it('clearing a positive entry also clears the matching negative entry', async () => {
+    // Negative-only state: first call fails (negative-caches), then we
+    // invalidate by name and the next call should fetch again rather than
+    // serve the stale negative entry.
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(new Response('boom', { status: 500 }))
+      .mockResolvedValueOnce(mockOkProps(4096));
+
+    await getModelContext('formerly-broken');
+    invalidateModelContext('formerly-broken');
+    const result = await getModelContext('formerly-broken');
+    expect(result).not.toBeNull();
+    expect(result!.n_ctx).toBe(4096);
+    expect(fetchSpy).toHaveBeenCalledTimes(2);
+  });
+});
--- a/apps/server/src/services/compaction-prompt.ts
+++ b/apps/server/src/services/compaction-prompt.ts
@@ -0,0 +1,40 @@
+// v1.11: anchored rolling summary template. Verbatim port from opencode
+// (packages/opencode/src/session/compaction.ts SUMMARY_TEMPLATE). Kept in a
+// separate module so the long template literal doesn't bloat compaction.ts.
+
+export const SUMMARY_TEMPLATE = `Output exactly the Markdown structure shown inside <template> and keep the section order unchanged. Do not include the <template> tags in your response.
+<template>
+## Goal
+- [single-sentence task summary]
+
+## Constraints & Preferences
+- [user constraints, preferences, specs, or "(none)"]
+
+## Progress
+### Done
+- [completed work or "(none)"]
+
+### In Progress
+- [current work or "(none)"]
+
+### Blocked
+- [blockers or "(none)"]
+
+## Key Decisions
+- [decision and why, or "(none)"]
+
+## Next Steps
+- [ordered next actions or "(none)"]
+
+## Critical Context
+- [important technical facts, errors, open questions, or "(none)"]
+
+## Relevant Files
+- [file or directory path: why it matters, or "(none)"]
+</template>
+
+Rules:
+- Keep every section, even when empty.
+- Use terse bullets, not prose paragraphs.
+- Preserve exact file paths, commands, error strings, and identifiers when known.
+- Do not mention the summary process or that context was compacted.`;
--- a/apps/server/src/services/compaction.ts
+++ b/apps/server/src/services/compaction.ts
@@ -0,0 +1,510 @@
+// v1.11: anchored rolling compaction. Ported algorithms (not Effect-TS code)
+// from opencode (packages/opencode/src/session/{compaction,overflow}.ts).
+//
+// What's different from BooCode's legacy /compact:
+//   - Operates per-chat (chats have N:1 to sessions; history is per-chat).
+//   - Detects overflow automatically after each inference completion using
+//     llama-swap's reported n_ctx; flags chats.needs_compaction=true.
+//   - On the next turn (or manual /compact) we summarize the *head* (messages
+//     prior to a preserved tail of N user-turns) into a single
+//     summary=true assistant row. Older messages get compacted_at-stamped so
+//     inference assembly filters them out; the GET endpoint still returns
+//     them so the UI can show history with the summary card inline.
+//   - The summary is *anchored rolling* — exactly one live summary=true row
+//     per chat. Subsequent compactions read the prior summary as
+//     previousSummary, ask the LLM to update-merge it, then mark the prior
+//     summary row compacted_at too (it stays in the UI but isn't sent to the
+//     LLM again).
+
+import type { FastifyBaseLogger } from 'fastify';
+import type { Sql } from '../db.js';
+import type { Config } from '../config.js';
+import type { Broker } from './broker.js';
+import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
+import * as modelContextLookup from './model-context.js';
+
+const COMPACTION_BUFFER = 20_000; // tokens usable() subtracts from the context limit
+const MIN_PRESERVE_RECENT_TOKENS = 2_000; // lower clamp of select()'s tail budget
+const MAX_PRESERVE_RECENT_TOKENS = 8_000; // upper clamp of select()'s tail budget
+const DEFAULT_TAIL_TURNS = 2; // default for select()'s tailTurns parameter
+
+// Subset of Message fields compaction touches. Selecting only what's needed
+// keeps process() independent of api.ts mutations and reduces DB egress.
+export interface CompactionMessage {
+  id: string;
+  role: 'user' | 'assistant' | 'system' | 'tool';
+  content: string;
+  kind: 'message' | 'compact'; // 'compact' marks a legacy compact row
+  summary: boolean; // true on anchored summary rows
+  status: 'streaming' | 'complete' | 'failed' | 'cancelled';
+  tool_calls: Array<{ id: string; name: string; args: Record<string, unknown> }> | null;
+  tool_results: { tool_call_id: string; output: unknown; truncated: boolean; error?: string } | null;
+  metadata: { kind?: string } | null; // kind 'cap_hit' on a system row = cap-hit sentinel
+  created_at: string;
+}
+
+// === overflow ===
+
+// Context budget left after holding COMPACTION_BUFFER tokens in reserve for
+// the model's response. Mirrors opencode's COMPACTION_BUFFER.
+// Returns 0 when the context limit is unknown or non-positive (isOverflow
+// treats 0 as "do not trigger overflow"); the result is never negative.
+export function usable(contextLimit: number): number {
+  if (!contextLimit || contextLimit <= 0) return 0;
+  return Math.max(0, contextLimit - COMPACTION_BUFFER);
+}
+
+export interface Usage {
+  prompt_tokens: number; // prompt-side token count of the turn
+  completion_tokens: number; // tokens generated by the turn
+}
+
+// True when prompt + completion tokens reach usable(contextLimit). Unknown
+// limit → false (we never auto-trigger compaction without a budget — better to
+// keep inference flowing than to fall into a compaction we can't size properly).
+export function isOverflow(usage: Usage, contextLimit: number): boolean {
+  const budget = usable(contextLimit);
+  if (budget <= 0) return false;
+  return (usage.prompt_tokens + usage.completion_tokens) >= budget;
+}
+
+// === selection ===
+
+interface Turn {
+  start: number; // index of the turn's user message in the messages array
+  end: number; // exclusive end: next turn's start, or messages.length for the last turn
+  id: string; // id of the user message at `start`
+}
+
+// Char-count / 4 token estimate (rounded up) over JSON.stringify(messages),
+// matching opencode's Token.estimate. Adequate for tail-fitting math; we
+// don't need a real tokenizer here — the 20k buffer absorbs the slop.
+export function estimate(messages: CompactionMessage[]): number {
+  return Math.ceil(JSON.stringify(messages).length / 4);
+}
+
+// Walk messages, return one Turn per user message that is NOT a summary row.
+// end = next-user-start (exclusive); final turn ends at messages.length.
+export function turns(messages: CompactionMessage[]): Turn[] {
+  const result: Turn[] = [];
+  for (let i = 0; i < messages.length; i++) {
+    const m = messages[i]!;
+    if (m.role !== 'user') continue;
+    if (m.summary) continue;
+    result.push({ start: i, end: messages.length, id: m.id }); // provisional end, tightened below
+  }
+  for (let i = 0; i < result.length - 1; i++) {
+    result[i]!.end = result[i + 1]!.start; // each turn stops where the next one begins
+  }
+  return result;
+}
+
+// Inside a turn that doesn't fit whole, walk forward from start+1 looking for
+// the largest suffix that fits the remaining budget. Returns the index and id
+// of the first preserved message, or undefined if no suffix fits.
+function splitTurn(
+  messages: CompactionMessage[],
+  turn: Turn,
+  budget: number,
+): { start: number; id: string } | undefined {
+  if (budget <= 0) return undefined;
+  if (turn.end - turn.start <= 1) return undefined; // turn is only its user message: nothing to split
+  for (let start = turn.start + 1; start < turn.end; start++) {
+    const size = estimate(messages.slice(start, turn.end));
+    if (size > budget) continue; // still too large, try a shorter suffix
+    return { start, id: messages[start]!.id }; // first fit scanning forward = largest fitting suffix
+  }
+  return undefined;
+}
+
+export interface SelectResult {
+  head: CompactionMessage[]; // messages before the tail boundary (all messages on full-preserve)
+  tail_start_id: string | undefined; // id of the first preserved message; undefined = full-preserve
+}
+
+// Choose the boundary between the "head" (to be summarized) and the "tail"
+// (preserved verbatim). Strategy:
+//   1. Reserve a budget for the recent tail. Default ranges [2k, 8k] tokens
+//      with 25% of usable() as the target.
+//   2. Take the last `tailTurns` user-turns; greedily fit from newest back.
+//   3. If the next-older turn doesn't fit whole, split it mid-turn.
+//   4. If we couldn't keep anything OR everything fit (keep.start === 0),
+//      return full-preserve (tail_start_id undefined → no compaction this round).
+export function select(
+  messages: CompactionMessage[],
+  contextLimit: number,
+  tailTurns: number = DEFAULT_TAIL_TURNS,
+): SelectResult {
+  if (tailTurns <= 0) return { head: messages, tail_start_id: undefined }; // no tail requested
+  const budget = Math.min(
+    MAX_PRESERVE_RECENT_TOKENS,
+    Math.max(MIN_PRESERVE_RECENT_TOKENS, Math.floor(usable(contextLimit) * 0.25)),
+  );
+
+  const all = turns(messages);
+  if (all.length === 0) return { head: messages, tail_start_id: undefined }; // no user turn to anchor a tail
+  const recent = all.slice(-tailTurns);
+
+  let total = 0;
+  let keep: { start: number; id: string } | undefined;
+  for (let i = recent.length - 1; i >= 0; i--) {
+    const turn = recent[i]!;
+    const size = estimate(messages.slice(turn.start, turn.end));
+    if (total + size <= budget) {
+      total += size;
+      keep = { start: turn.start, id: turn.id };
+      continue;
+    }
+    const remaining = budget - total;
+    const split = splitTurn(messages, turn, remaining);
+    if (split) keep = split;
+    break; // stop at the first turn that doesn't fit whole
+  }
+
+  if (!keep || keep.start === 0) {
+    return { head: messages, tail_start_id: undefined };
+  }
+  return {
+    head: messages.slice(0, keep.start),
+    tail_start_id: keep.id,
+  };
+}
+
+// === prompt assembly ===
+
+// Build the final user message that asks the model to (re)produce the
+// anchored summary. `context` entries are appended after the template
+// (reserved for future plugin injection; callers pass [] today).
+export function buildPrompt(
+  previousSummary: string | undefined,
+  context: string[],
+): string {
+  const anchor = previousSummary // an empty-string summary takes the "create new" branch
+    ? [
+        'Update the anchored summary below using the conversation history above.',
+        'Preserve still-true details, remove stale details, and merge in the new facts.',
+        '<previous-summary>',
+        previousSummary,
+        '</previous-summary>',
+      ].join('\n')
+    : 'Create a new anchored summary from the conversation history above.';
+  return [anchor, SUMMARY_TEMPLATE, ...context].join('\n\n'); // sections separated by a blank line
+}
+
+// === OpenAI conversion (compaction-local; intentionally does NOT call
+// inference.ts buildMessagesPayload because that uses the legacy "find latest
+// kind='compact' marker and skip everything before it" shortcircuit, which
+// would silently drop pre-legacy-compact history before the LLM sees it.
+// Compaction wants to send the entire head, full stop.) ===
+
+interface OpenAiMessage {
+  role: 'system' | 'user' | 'assistant' | 'tool';
+  content: string | null; // null for assistant rows with empty text (see buildHeadPayload)
+  tool_calls?: Array<{
+    id: string;
+    type: 'function';
+    function: { name: string; arguments: string }; // arguments is JSON.stringify of the stored args
+  }>;
+  tool_call_id?: string; // set on role 'tool' messages
+}
+
+function isCapHitSentinel(m: CompactionMessage): boolean { // system row tagged metadata.kind 'cap_hit'
+  return m.role === 'system' && m.metadata != null && m.metadata.kind === 'cap_hit';
+}
+
+function buildHeadPayload(head: CompactionMessage[]): OpenAiMessage[] { // head rows → OpenAI chat messages
+  const out: OpenAiMessage[] = [];
+  for (const m of head) {
+    if (isCapHitSentinel(m)) continue; // sentinel rows are never sent to the model
+    if (m.role === 'assistant' && (m.status === 'streaming' || m.status === 'cancelled')) continue;
+    if (m.kind === 'compact') {
+      // Legacy compact row — pass through as system context. The new
+      // anchored summary will subsume it, but the LLM should see it during
+      // the bridging round so it can carry forward the still-true bits.
+      out.push({ role: 'system', content: m.content });
+      continue;
+    }
+    if (m.summary) {
+      // Defense in depth: process() filters these out of the select-input
+      // already. If one slips through, render it as assistant content so we
+      // never crash here.
+      out.push({ role: 'assistant', content: m.content });
+      continue;
+    }
+    if (m.role === 'tool') {
+      const tr = m.tool_results;
+      if (!tr) continue; // tool row without a stored result: nothing to send
+      const outputText = tr.error
+        ? `error: ${tr.error}`
+        : typeof tr.output === 'string'
+          ? tr.output
+          : JSON.stringify(tr.output);
+      out.push({ role: 'tool', content: outputText, tool_call_id: tr.tool_call_id });
+      continue;
+    }
+    if (m.role === 'assistant') {
+      const msg: OpenAiMessage = {
+        role: 'assistant',
+        content: m.content && m.content.length > 0 ? m.content : null,
+      };
+      if (m.tool_calls && m.tool_calls.length > 0) {
+        msg.tool_calls = m.tool_calls.map((tc) => ({
+          id: tc.id,
+          type: 'function' as const,
+          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
+        }));
+      }
+      out.push(msg);
+      continue;
+    }
+    out.push({ role: 'user', content: m.content }); // NOTE(review): non-sentinel system rows also land here as 'user' — confirm intended
+  }
+  return out;
+}
+
+// === llama-swap call ===
+
+// Non-streaming completion. Opencode streams; for a one-shot summary call a
+// single POST is less code and the latency hit is acceptable (the user
+// doesn't see this directly — useSessionStream emits the toast + refetches
+// on the 'compacted' frame).
+interface CompletionResult {
+  content: string; // summary text; callLlamaSwap guarantees it is not blank
+  promptTokens: number; // usage.prompt_tokens, 0 when the response omits usage
+  completionTokens: number; // usage.completion_tokens, 0 when the response omits usage
+}
+
+async function callLlamaSwap(
+  config: Config,
+  model: string,
+  messages: OpenAiMessage[],
+  log: FastifyBaseLogger,
+): Promise<CompletionResult> { // rejects on a non-2xx status or a blank summary
+  const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ model, messages, stream: false }),
+  });
+  if (!res.ok) {
+    const text = await res.text().catch(() => ''); // body is best-effort detail for the error message
+    throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`);
+  }
+  const json = (await res.json()) as {
+    choices?: Array<{ message?: { content?: string } }>;
+    usage?: { prompt_tokens?: number; completion_tokens?: number };
+  };
+  // v1.11.3: no n_ctx here (llama-server omits it from timings; process()
+  // uses model-context). Blank summaries throw so they never replace the head.
+  const content = json.choices?.[0]?.message?.content ?? '';
+  if (content.trim() === '') throw new Error('llama-swap returned an empty summary');
+  const promptTokens = json.usage?.prompt_tokens ?? 0;
+  const completionTokens = json.usage?.completion_tokens ?? 0;
+  log.debug({ promptTokens, completionTokens, chars: content.length }, 'compaction llm complete');
+  return { content, promptTokens, completionTokens };
+}
+
+// === entry point ===
+
+export interface ProcessInput {
+  sql: Sql; // tagged-template client used for every query in process()
+  config: Config; // supplies LLAMA_SWAP_URL for the summary call
+  log: FastifyBaseLogger;
+  broker: Broker; // publishes chat_status and 'compacted' frames
+  chatId: string; // chat to compact
+}
+
+// Runs one round of anchored rolling compaction on `chatId`. No-ops cleanly
+// (clearing needs_compaction) when there's nothing reasonable to compact.
+// Throws on LLM or DB failure — callers decide whether to log+swallow or surface.
+export async function process(input: ProcessInput): Promise<void> {
+  const { sql, config, log, broker, chatId } = input;
+
+  // 1. Resolve chat → session for model + WS publish channel.
+  const chatRows = await sql<{ id: string; session_id: string }[]>`
+    SELECT id, session_id FROM chats WHERE id = ${chatId}
+  `;
+  if (chatRows.length === 0) {
+    log.warn({ chatId }, 'compaction: chat not found');
+    return;
+  }
+  const chat = chatRows[0]!;
+  const sessionId = chat.session_id;
+
+  const sessRows = await sql<{ id: string; model: string }[]>`
+    SELECT id, model FROM sessions WHERE id = ${sessionId}
+  `;
+  if (sessRows.length === 0) {
+    log.warn({ chatId, sessionId }, 'compaction: session not found');
+    return;
+  }
+  const session = sessRows[0]!;
+
+  // 2. All currently-active messages in this chat (compacted_at IS NULL).
+  // ORDER BY (created_at, id) matches loadContext in inference.ts so the
+  // turns() boundary logic sees the same sequence the LLM will.
+  const messages = await sql<CompactionMessage[]>`
+    SELECT id, role, content, kind, summary, status, tool_calls, tool_results, metadata, created_at
+    FROM messages
+    WHERE chat_id = ${chatId} AND compacted_at IS NULL
+    ORDER BY created_at ASC, id ASC
+  `;
+  if (messages.length === 0) {
+    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
+    return;
+  }
+
+  // 3. Find the prior anchored summary (newest summary=true row). Its content
+  // becomes previousSummary — the anchor in the prompt. Filter it out of the
+  // select-input so we don't double-encode (it's already in the anchor text).
+  const previousSummary = messages.filter((m) => m.summary).at(-1)?.content;
+  const forSelect = messages.filter((m) => !m.summary);
+
+  // 4. Resolve a recent context limit. Completions persist the model's n_ctx
+  // on messages.ctx_max (since v1.11.3 via model-context.getModelContext, not
+  // timings.n_ctx). Use the most recent non-null value in this chat (assumes
+  // the same model is still running). When none exists we pass 0: usable()
+  // returns 0 and select() falls back to its minimum tail budget.
+  const ctxRows = await sql<{ ctx_max: number | null }[]>`
+    SELECT ctx_max FROM messages
+    WHERE chat_id = ${chatId} AND ctx_max IS NOT NULL
+    ORDER BY created_at DESC LIMIT 1
+  `;
+  const contextLimit = ctxRows[0]?.ctx_max ?? 0;
+
+  // 5. Decide head / tail.
+  const sel = select(forSelect, contextLimit);
+  if (!sel.tail_start_id || sel.head.length === 0) {
+    // Full preserve — nothing to compact this round. Clear the flag so we
+    // don't loop. (Could happen when the chat is short or the budget swung
+    // wider after a model context bump.)
+    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
+    log.info({ chatId, contextLimit, msgCount: messages.length }, 'compaction: nothing to compact');
+    return;
+  }
+
+  // 6. Build the OpenAI request: head as user/assistant/tool turns + a final
+  // user message carrying buildPrompt(previousSummary, []). No system prompt
+  // — matches opencode (`system: []`); the template + anchor are sufficient.
+  const headPayload = buildHeadPayload(sel.head);
+  const finalUser: OpenAiMessage = { role: 'user', content: buildPrompt(previousSummary, []) };
+  const payload = [...headPayload, finalUser];
+
+  log.info(
+    {
+      chatId,
+      contextLimit,
+      headLen: sel.head.length,
+      tailStartId: sel.tail_start_id,
+      hadPrevSummary: previousSummary !== undefined,
+    },
+    'compaction: invoking model',
+  );
+
+  // 6a. Flip the chat dot amber for the duration of the LLM call + DB writes.
+  // Same { type: 'chat_status', status: 'working', at } shape inference.ts
+  // emits at runner enqueue. publishUser → broadcasts on the per-user channel
+  // (all devices / tabs see it) since chat_status is a user-channel frame in
+  // BooCode (see useChatStatus.ts, which is the consumer).
+  broker.publishUser('default', {
+    type: 'chat_status',
+    chat_id: chatId,
+    status: 'working',
+    at: new Date().toISOString(),
+  });
+
+  // try/finally so the dot ALWAYS drops back to idle, even if the LLM call
+  // throws or a downstream DB write fails. The succeeded flag gates the
+  // 'compacted' frame + final log: we only signal completion to the UI when
+  // the new summary row actually landed.
+  let succeeded = false;
+  let newId = '';
+  let result: CompletionResult | undefined;
+  try {
+    // 7. Single completion (no tools). Throws on llama-swap failure.
+    result = await callLlamaSwap(config, session.model, payload, log);
+
+    // 7b. v1.11.3: fetch the model's true context window from llama-swap's
+    // /upstream/<model>/props (the completion response doesn't carry it).
+    // Same pattern as inference.ts; the cache makes repeated calls free.
+    const mctx = await modelContextLookup.getModelContext(session.model);
+    const nCtx = mctx?.n_ctx ?? null;
+
+    // 8. Insert the new anchored summary row. role='assistant' per spec; the
+    // UI distinguishes via summary=true. tail_start_id points at the first
+    // preserved tail message so debug surfaces / future tools can reason
+    // about the boundary without re-deriving from compacted_at.
+    const insertRows = await sql<{ id: string }[]>`
+      INSERT INTO messages (
+        session_id, chat_id, role, content, kind, status,
+        summary, tail_start_id,
+        tokens_used, ctx_used, ctx_max,
+        created_at, finished_at
+      )
+      VALUES (
+        ${sessionId}, ${chatId}, 'assistant', ${result.content}, 'message', 'complete',
+        true, ${sel.tail_start_id},
+        ${result.completionTokens}, ${result.promptTokens}, ${nCtx},
+        clock_timestamp(), clock_timestamp()
+      )
+      RETURNING id
+    `;
+    newId = insertRows[0]!.id;
+
+    // 9. Mark every prior live message (head + prior summary) as compacted.
+    // Head rows are bound by "created_at strictly less than tail_start_id's
+    // created_at" so the preserved tail stays compacted_at=NULL. Live summary
+    // rows are stamped regardless of timestamp (a prior summary can be newer
+    // than the tail start); `id != newId` keeps the row just inserted live.
+    await sql`
+      UPDATE messages
+      SET compacted_at = clock_timestamp()
+      WHERE chat_id = ${chatId}
+        AND compacted_at IS NULL
+        AND id != ${newId}
+        AND (summary = true OR created_at < (SELECT created_at FROM messages WHERE id = ${sel.tail_start_id}))
+    `;
+
+    // 10. Clear the flag and bump the chat's updated_at so the sidebar
+    // reflects recent activity.
+    await sql`
+      UPDATE chats
+      SET needs_compaction = false, updated_at = clock_timestamp()
+      WHERE id = ${chatId}
+    `;
+
+    succeeded = true;
+  } finally {
+    // Always restore the dot. Status='idle' (not 'error') even on failure —
+    // the caller logs/re-surfaces the error separately; the dot doesn't
+    // need to stay red across reloads for a transient compaction blip.
+    broker.publishUser('default', {
+      type: 'chat_status',
+      chat_id: chatId,
+      status: 'idle',
+      at: new Date().toISOString(),
+    });
+  }
+
+  // 11. Tell the client. useSessionStream subscribes to the per-session WS
+  // channel; the handler refetches messages (so the new summary row + the
+  // compacted_at-stamped older rows render correctly) and fires a sonner
+  // toast. Order matters: idle must precede 'compacted' so the dot is
+  // already green by the time the refetch toast appears.
+  if (succeeded) {
+    broker.publish(sessionId, {
+      type: 'compacted',
+      session_id: sessionId,
+      chat_id: chatId,
+      summary_message_id: newId,
+    });
+    log.info(
+      {
+        chatId,
+        newId,
+        completionTokens: result?.completionTokens,
+        promptTokens: result?.promptTokens,
+      },
+      'compaction: complete',
+    );
+  }
+}
--- a/apps/server/src/services/inference.ts
+++ b/apps/server/src/services/inference.ts
@@ -21,6 +21,9 @@ import {
 import { PathScopeError, resolveProjectRoot } from './path_guard.js';
 import { maybeAutoNameChat } from './auto_name.js';
 import { getAgentById } from './agents.js';
+import * as compaction from './compaction.js';
+import * as modelContext from './model-context.js';
+import type { Broker } from './broker.js';

 const BASE_SYSTEM_PROMPT = (projectPath: string) =>
  `You are BooCode Chat, a code investigation assistant. The user is working on a project located at ${projectPath}. Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed. Be concise. Cite file paths and line numbers when discussing code. Do not hallucinate file contents — read the file first. Tool results may be truncated; if so, narrow your query rather than guessing.`;
@@ -136,9 +139,6 @@ interface ChatCompletionChunk {
    completion_tokens?: number;
    total_tokens?: number;
  };
-  timings?: {
-    n_ctx?: number;
-  };
 }

 export interface InferenceContext {
@@ -147,6 +147,12 @@ export interface InferenceContext {
  log: FastifyBaseLogger;
  publish: FramePublisher;
  publishUser: (frame: UserStreamFrame) => void;
+  // v1.11: passed through so compaction.process can publish 'compacted'
+  // frames on the same session WS channel useSessionStream subscribes to.
+  // Compaction is the only path that needs the raw broker handle (regular
+  // inference goes through `publish`); keeping a separate field avoids
+  // tempting other code paths into bypassing the session-id binding.
+  broker: Broker;
 }

 // Resolution order: base prompt < agent.system_prompt < user prompt, where
@@ -260,17 +266,48 @@ async function loadContext(
  if (projectRows.length === 0) return null;
  const project = projectRows[0]!;

+  // v1.11: filter compacted messages out of the inference assembly. The GET
+  // /api/sessions/:id/messages endpoint still returns everything (so the UI
+  // can show history with the summary card inline); only LLM payloads skip
+  // compacted rows. compacted_at IS NULL keeps the active summary + tail.
  const history = await sql<Message[]>`
    SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
           tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
    FROM messages
-    WHERE chat_id = ${chatId}
+    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;

  return { session, project, history };
 }

+// v1.11: shared helper used after both finalizeCompletion and executeToolPhase
+// persist their token counts. Reads ctx_used / tokens_used / ctx_max off the
+// just-UPDATEd row (the caller's RETURNING result), runs compaction.isOverflow,
+// and sets chats.needs_compaction=true; the next runAssistantTurn acts on it.
+// Silent when any of the three is missing — llama-swap occasionally omits usage
+// on truncated streams, and we'd rather miss one overflow than crash inference.
+async function maybeFlagForCompaction(
+  ctx: InferenceContext,
+  chatId: string,
+  updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined,
+): Promise<void> {
+  if (!updated) return; // UPDATE ... RETURNING matched no row
+  const promptTokens = updated.ctx_used;
+  const completionTokens = updated.tokens_used;
+  const contextLimit = updated.ctx_max;
+  if (typeof promptTokens !== 'number') return;
+  if (typeof completionTokens !== 'number') return;
+  if (typeof contextLimit !== 'number') return;
+  const overflow = compaction.isOverflow(
+    { prompt_tokens: promptTokens, completion_tokens: completionTokens },
+    contextLimit,
+  );
+  if (!overflow) return;
+  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
+  ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction');
+}
+
 async function* sseLines(stream: ReadableStream<Uint8Array>): AsyncGenerator<string> {
  const reader = stream.getReader();
  const decoder = new TextDecoder('utf-8');
@@ -300,7 +337,6 @@ interface StreamResult {
  toolCalls: ToolCall[];
  promptTokens: number | null;
  completionTokens: number | null;
-  nCtx: number | null;
 }

 interface StreamOptions {
@@ -415,7 +451,6 @@ async function streamCompletion(
  let finishReason: string | null = null;
  let promptTokens: number | null = null;
  let completionTokens: number | null = null;
-  let nCtx: number | null = null;
  const toolCallsBuffer = new Map<number, { id: string; name: string; argsText: string }>();

  for await (const line of sseLines(res.body)) {
@@ -437,9 +472,11 @@ async function streamCompletion(
        completionTokens = parsed.usage.completion_tokens;
      }
    }
-    if (parsed.timings && typeof parsed.timings.n_ctx === 'number') {
-      nCtx = parsed.timings.n_ctx;
-    }
+    // v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
+    // streaming completion does NOT emit n_ctx in timings (verified
+    // empirically); the authoritative source is llama-swap's
+    // /upstream/<model>/props endpoint, fetched per-turn via
+    // model-context.getModelContext() at the finalization sites below.

    const choice = parsed.choices?.[0];
    if (!choice) continue;
@@ -525,7 +562,7 @@ async function streamCompletion(
    toolCalls.push({ id: t.id || `call_${toolCalls.length}`, name: t.name, args });
  }

-  return { finishReason, content, toolCalls, promptTokens, completionTokens, nCtx };
+  return { finishReason, content, toolCalls, promptTokens, completionTokens };
 }

 async function executeToolCall(
@@ -742,7 +779,14 @@ async function executeToolPhase(
  projectRoot: string
 ): Promise<void> {
  const { sessionId, chatId, assistantMessageId, toolsUsed, signal } = args;
-  const { content, toolCalls, promptTokens, completionTokens, nCtx } = result;
+  const { content, toolCalls, promptTokens, completionTokens } = result;
+
+  // v1.11.3: ctx_max comes from llama-swap /upstream/<model>/props, not the
+  // streaming completion (which doesn't emit n_ctx). getModelContext caches
+  // the positive lookup for the process lifetime, so this is a single Map
+  // hit after the first invocation per model.
+  const mctx = await modelContext.getModelContext(session.model);
+  const nCtx = mctx?.n_ctx ?? null;

  const [updated] = await ctx.sql<
    { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
@@ -758,6 +802,10 @@ async function executeToolPhase(
    WHERE id = ${assistantMessageId}
    RETURNING tokens_used, ctx_used, ctx_max, finished_at
  `;
+  // v1.11: flag for compaction if this turn pushed us over the usable budget.
+  // We never compact mid-loop (the recursive runAssistantTurn keeps tools
+  // flowing); the flag fires on the NEXT turn, in runAssistantTurn's pre-loadContext check.
+  await maybeFlagForCompaction(ctx, chatId, updated);
  const [toolSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
    UPDATE sessions SET updated_at = clock_timestamp()
    WHERE id = ${sessionId}
@@ -874,7 +922,11 @@ async function finalizeCompletion(
  session: Session
 ): Promise<void> {
  const { sessionId, chatId, assistantMessageId } = args;
-  const { content, finishReason, promptTokens, completionTokens, nCtx } = result;
+  const { content, finishReason, promptTokens, completionTokens } = result;
+
+  // v1.11.3: see executeToolPhase for the rationale.
+  const mctx = await modelContext.getModelContext(session.model);
+  const nCtx = mctx?.n_ctx ?? null;

  const [updated] = await ctx.sql<
    { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
@@ -889,6 +941,9 @@ async function finalizeCompletion(
    WHERE id = ${assistantMessageId}
    RETURNING tokens_used, ctx_used, ctx_max, finished_at
  `;
+  // v1.11: flag for compaction on the terminal turn too. Catches the common
+  // case of a turn that hit the limit without invoking tools.
+  await maybeFlagForCompaction(ctx, chatId, updated);
  const [completeSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
    UPDATE sessions SET updated_at = clock_timestamp()
    WHERE id = ${sessionId}
@@ -927,6 +982,29 @@ async function runAssistantTurn(
 ): Promise<void> {
  const { sessionId, chatId } = args;

+  // v1.11: if the prior turn flagged this chat for compaction, run it first
+  // so loadContext below reads the post-compaction history. We swallow
+  // compaction failures (clearing the flag so we don't loop) and proceed
+  // with the un-compacted history — a slow turn that hits the model's
+  // hard limit is recoverable; a dead session is not.
+  const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
+    SELECT needs_compaction FROM chats WHERE id = ${chatId}
+  `;
+  if (chatFlag[0]?.needs_compaction) {
+    try {
+      await compaction.process({
+        sql: ctx.sql,
+        config: ctx.config,
+        log: ctx.log,
+        broker: ctx.broker,
+        chatId,
+      });
+    } catch (err) {
+      ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
+      await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
+    }
+  }
+
  const loaded = await loadContext(ctx.sql, sessionId, chatId);
  if (!loaded) {
    ctx.log.warn({ sessionId }, 'inference: session or project missing');
@@ -1081,6 +1159,9 @@ async function runCapHitSummary(
  // even on a partial / failed summary the chat history shows where the
  // budget was hit.
  if (summaryOk && result) {
+    // v1.11.3: see executeToolPhase for the rationale.
+    const mctx = await modelContext.getModelContext(session.model);
+    const nCtx = mctx?.n_ctx ?? null;
    const [updated] = await ctx.sql<
      { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
    >`
@@ -1089,7 +1170,7 @@ async function runCapHitSummary(
          status = 'complete',
          tokens_used = ${result.completionTokens},
          ctx_used = ${result.promptTokens},
-          ctx_max = ${result.nCtx},
+          ctx_max = ${nCtx},
          finished_at = clock_timestamp()
      WHERE id = ${assistantMessageId}
      RETURNING tokens_used, ctx_used, ctx_max, finished_at
@@ -1237,81 +1318,6 @@ async function insertCapHitSentinel(
  });
 }

-const COMPACT_SYSTEM_PROMPT =
-  'Summarize the preceding conversation into a dense but complete context paragraph. Preserve all key facts, decisions, file paths, code patterns, and action items. Do not add any new information. Output only the summary paragraph.';
-
-async function runCompact(
-  ctx: InferenceContext,
-  sessionId: string,
-  chatId: string,
-  compactMessageId: string
-): Promise<void> {
-  const loaded = await loadContext(ctx.sql, sessionId, chatId);
-  if (!loaded) return;
-  const { session, project, history } = loaded;
-
-  const messagesForSummary = buildMessagesPayload(session, project,
-    history.filter((m) => m.id !== compactMessageId)
-  );
-  messagesForSummary.push({
-    role: 'system',
-    content: COMPACT_SYSTEM_PROMPT,
-  });
-
-  ctx.publish(sessionId, {
-    type: 'message_started',
-    message_id: compactMessageId,
-    chat_id: chatId,
-    role: 'assistant',
-  });
-
-  let content = '';
-  try {
-    const result = await streamCompletion(
-      ctx,
-      session.model,
-      messagesForSummary,
-      { tools: null },
-      (delta) => {
-        content += delta;
-        ctx.publish(sessionId, {
-          type: 'delta',
-          message_id: compactMessageId,
-          chat_id: chatId,
-          content: delta,
-        });
-      }
-    );
-    content = result.content;
-  } catch (err) {
-    const errMsg = err instanceof Error ? err.message : String(err);
-    await ctx.sql`
-      UPDATE messages SET status = 'failed', content = ${content}, finished_at = clock_timestamp()
-      WHERE id = ${compactMessageId}
-    `;
-    ctx.publish(sessionId, {
-      type: 'error',
-      message_id: compactMessageId,
-      chat_id: chatId,
-      error: errMsg,
-    });
-    return;
-  }
-
-  const preCompactCount = history.filter((m) => m.id !== compactMessageId && m.kind !== 'compact').length;
-  const summary = `[Context compacted — ${preCompactCount} messages summarized]\n\n${content}`;
-
-  await ctx.sql`
-    UPDATE messages SET content = ${summary}, status = 'complete', finished_at = clock_timestamp()
-    WHERE id = ${compactMessageId}
-  `;
-  ctx.publish(sessionId, {
-    type: 'message_complete',
-    message_id: compactMessageId,
-    chat_id: chatId,
-  });
-}
-
 interface InferenceRegistration {
  controller: AbortController;
  completed: Promise<void>;
@@ -1328,6 +1334,10 @@ export function createInferenceRunner(
      const callCtx: InferenceContext = {
        ...ctx,
        publishUser: (frame) => publishUserFn(user, frame),
+        // v1.11: broker comes in via ctx (set at registration time). Repeated
+        // here so the destructure carries it onto the per-call ctx without
+        // having to add it to every enqueue/cancel signature individually.
+        broker: ctx.broker,
      };
      // v1.8 mobile-tabs: announce working before the async loop starts so
      // every device subscribed to the user channel sees the amber dot.
@@ -1357,20 +1367,6 @@ export function createInferenceRunner(
      })();
    },

-    enqueueCompact(sessionId: string, chatId: string, compactMessageId: string, user: string) {
-      const callCtx: InferenceContext = {
-        ...ctx,
-        publishUser: (frame) => publishUserFn(user, frame),
-      };
-      void (async () => {
-        try {
-          await runCompact(callCtx, sessionId, chatId, compactMessageId);
-        } catch (err) {
-          callCtx.log.error({ err }, 'unhandled compact error');
-        }
-      })();
-    },
-
    async cancel(_sessionId: string, chatId: string): Promise<boolean> {
      const reg = registry.get(chatId);
      if (!reg) return false;
--- a/apps/server/src/services/model-context.ts
+++ b/apps/server/src/services/model-context.ts
@@ -0,0 +1,113 @@
+// v1.11.3: llama-swap model-context cache. Replaces the dead
+// `parsed.timings.n_ctx` capture in inference.ts / compaction.ts —
+// llama-server's streaming completion never emits n_ctx in timings (verified
+// empirically: timings carries prompt_n / predicted_n / *_ms / *_per_second
+// only). The authoritative source is llama-swap's
+// /upstream/<model>/props endpoint at .default_generation_settings.n_ctx.
+//
+// Cache design:
+//   - Positive entries (n_ctx + total_slots) have no TTL. A model's context
+//     size doesn't change while llama-swap is running; an admin endpoint
+//     can invalidateModelContext() if it ever does.
+//   - Negative entries (failed fetch) have a 60s TTL so a misconfigured or
+//     down model doesn't get hammered every inference turn, but recovers
+//     within a minute once the upstream comes back.
+//   - 3s AbortController timeout on the fetch — long enough for a healthy
+//     upstream, short enough that a stuck upstream doesn't block the
+//     ctx_max UPDATE that follows.
+
+/** Cached context metadata for one model, as produced by getModelContext(). */
+export interface ModelContext {
+  /** Context window size, read from default_generation_settings.n_ctx of the /props response; always a positive number. */
+  n_ctx: number;
+  /** total_slots from the /props response; 1 when that field is missing or not a positive number. */
+  total_slots: number;
+  /** Date.now() (unix ms) taken when the entry was created. */
+  fetched_at: number;
+}
+
+// How long (ms) a failed lookup suppresses re-fetching the same model.
+const NEGATIVE_TTL_MS = 60_000;
+// Delay (ms) before the AbortController handed to fetch() is aborted.
+const FETCH_TIMEOUT_MS = 3_000;
+
+// Successful lookups keyed by model name. Entries are never expired; they
+// are only removed through invalidateModelContext().
+const positiveCache = new Map<string, ModelContext>();
+// Value is the unix-ms timestamp of the last failed fetch. Used to gate
+// re-fetches within the 60s window.
+const negativeCache = new Map<string, number>();
+
+// Set once at startup by index.ts. We don't import loadConfig() directly
+// here to keep this module trivially mockable in tests (set the URL in
+// beforeEach instead of stubbing process.env + loadConfig's cache).
+let llamaSwapUrl: string | null = null;
+
+/**
+ * Tell the cache where llama-swap lives. Must be called before
+ * getModelContext() can perform any fetch; until then lookups resolve to
+ * null.
+ *
+ * @param opts.llamaSwapUrl - Base URL of the llama-swap instance. Trailing
+ *   slashes are removed so the `${llamaSwapUrl}/upstream/<model>/props` path
+ *   built by getModelContext() never contains a double slash.
+ */
+export function configureModelContext(opts: { llamaSwapUrl: string }): void {
+  llamaSwapUrl = opts.llamaSwapUrl.replace(/\/+$/, '');
+}
+
+/**
+ * Look up the context-window metadata for `model`, fetching it from
+ * llama-swap's `/upstream/<model>/props` endpoint on a cache miss.
+ *
+ * Failures (module not configured, network error, timeout, non-2xx status,
+ * unparsable body, missing or non-positive `n_ctx`) record a negative-cache
+ * entry and resolve to `null` instead of throwing.
+ *
+ * @param model - Model name; used as the cache key and URL-encoded into the
+ *   request path.
+ * @returns The cached or freshly fetched entry, or `null` when unavailable.
+ */
+export async function getModelContext(model: string): Promise<ModelContext | null> {
+  // 1. Positive cache hit — no TTL check, model n_ctx is invariant.
+  const pos = positiveCache.get(model);
+  if (pos) return pos;
+
+  // 2. Negative cache hit within TTL — return null without refetching.
+  // Stale negative entries (older than the TTL) fall through to a fresh
+  // attempt below; they are either refreshed by the next failure or removed
+  // by the next success.
+  const negTs = negativeCache.get(model);
+  if (negTs !== undefined && Date.now() - negTs < NEGATIVE_TTL_MS) {
+    return null;
+  }
+
+  // 3. Module not initialized. Defensive — index.ts calls
+  // configureModelContext at startup; if a test forgets, fail closed so
+  // the chat still works (ctx_max stays null, UI degrades gracefully).
+  if (!llamaSwapUrl) {
+    negativeCache.set(model, Date.now());
+    return null;
+  }
+
+  // 4. Fetch with timeout. The timer stays armed until the body has been
+  // read: aborting the signal also rejects a pending res.json(), so an
+  // upstream that sends headers and then stalls cannot hang the caller.
+  // Timeout, fetch rejection and JSON parse errors all land in the catch
+  // below and produce a negative cache entry.
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
+  try {
+    // Built inside the try so a URIError from encodeURIComponent (malformed
+    // model name) is treated like any other failed lookup.
+    const url = `${llamaSwapUrl}/upstream/${encodeURIComponent(model)}/props`;
+    const res = await fetch(url, { signal: controller.signal });
+    if (!res.ok) {
+      negativeCache.set(model, Date.now());
+      return null;
+    }
+    const body = (await res.json()) as {
+      default_generation_settings?: { n_ctx?: number };
+      total_slots?: number;
+    };
+    const n_ctx = body?.default_generation_settings?.n_ctx;
+    if (typeof n_ctx !== 'number' || n_ctx <= 0) {
+      negativeCache.set(model, Date.now());
+      return null;
+    }
+    // total_slots is informational; default to 1 if missing rather than
+    // reject the whole response. Most local llama-swap setups run a
+    // single slot anyway.
+    const total_slots =
+      typeof body?.total_slots === 'number' && body.total_slots > 0 ? body.total_slots : 1;
+    const entry: ModelContext = { n_ctx, total_slots, fetched_at: Date.now() };
+    positiveCache.set(model, entry);
+    // Drop any stale negative entry so it does not linger in the map.
+    negativeCache.delete(model);
+    return entry;
+  } catch {
+    negativeCache.set(model, Date.now());
+    return null;
+  } finally {
+    // Single cleanup point for every exit path (success, early return, error).
+    clearTimeout(timer);
+  }
+}
+
+/**
+ * Forget cached lookups so the next getModelContext() call refetches.
+ *
+ * @param model - Model whose positive and negative entries are dropped;
+ *   omit it to empty both caches entirely.
+ */
+export function invalidateModelContext(model?: string): void {
+  for (const cache of [positiveCache, negativeCache]) {
+    if (model === undefined) cache.clear();
+    else cache.delete(model);
+  }
+}
--- a/apps/server/src/types/api.ts
+++ b/apps/server/src/types/api.ts
@@ -159,6 +159,12 @@ export interface Message {
  // v1.8.2: per-message metadata. See MessageMetadata for the discriminated
  // shapes currently in use.
  metadata: MessageMetadata | null;
+  // v1.11: anchored rolling compaction. Optional so consumers that SELECT
+  // the pre-v1.11 column set still type-check. See compaction.ts +
+  // schema.sql for semantics.
+  summary?: boolean;
+  tail_start_id?: string | null;
+  compacted_at?: string | null;
 }

 export interface ModelInfo {
--- a/apps/web/src/api/client.ts
+++ b/apps/web/src/api/client.ts
@@ -168,8 +168,11 @@ export const api = {
      request<void>(`/api/chats/${chatId}`, { method: 'DELETE' }),
    messages: (chatId: string) =>
      request<Message[]>(`/api/chats/${chatId}/messages`),
+    // v1.11: anchored-rolling compaction. POST awaits the LLM call inside
+    // the route's lifecycle; the new summary row arrives via the 'compacted'
+    // WS frame (useSessionStream refetches + toasts).
    compact: (chatId: string) =>
-      request<{ compact_message_id: string }>(`/api/chats/${chatId}/compact`, { method: 'POST' }),
+      request<{ ok: true }>(`/api/chats/${chatId}/compact`, { method: 'POST' }),
    stop: (chatId: string) =>
      request<{ stopped: boolean }>(`/api/chats/${chatId}/stop`, { method: 'POST' }),
    forceSend: (chatId: string, content: string) =>
--- a/apps/web/src/api/types.ts
+++ b/apps/web/src/api/types.ts
@@ -145,6 +145,19 @@ export interface Message {
  // v1.8.2: per-message metadata; see MessageMetadata. null for the vast
  // majority of messages.
  metadata: MessageMetadata | null;
+  // v1.11: anchored rolling compaction fields. Optional on the wire so that
+  // older API responses (or test fixtures) parse without explicit nulls.
+  //   summary       — true on the assistant row that holds the active
+  //                   anchored summary. Render via SummaryCard.
+  //   tail_start_id — first preserved tail message the summary covers up to
+  //                   (exclusive). Diagnostic only on the client.
+  //   compacted_at  — set on rows that are "behind the curtain" of the
+  //                   current summary. Returned by the GET endpoint so the
+  //                   UI can show history, but the server-side inference
+  //                   assembly filters these out.
+  summary?: boolean;
+  tail_start_id?: string | null;
+  compacted_at?: string | null;
 }

 export interface ModelInfo {
@@ -305,6 +318,11 @@ export type WsFrame =
    }
  | { type: 'messages_deleted'; message_ids: string[]; chat_id?: string }
  | { type: 'chat_renamed'; chat_id: string; name: string }
+  // v1.11: published by services/compaction.ts after the new anchored
+  // summary row lands. Carries the new summary row id for diagnostics; the
+  // session-stream handler ignores the id and re-fetches the full message
+  // list (the cohort of compacted_at-stamped rows changed too).
+  | { type: 'compacted'; session_id: string; chat_id: string; summary_message_id: string }
  // v1.8.2: `reason` discriminates structured failures (the UI prefers it
  // over `error` text when present).
  | { type: 'error'; message_id?: string; chat_id?: string; error: string; reason?: ErrorReason };
--- a/apps/web/src/components/ChatTabBar.tsx
+++ b/apps/web/src/components/ChatTabBar.tsx
@@ -1,5 +1,5 @@
 import { useState } from 'react';
-import { History, MessageSquare, Plus, X } from 'lucide-react';
+import { Bot, History, MessageSquare, Plus, Terminal, X } from 'lucide-react';
 import type { Chat, WorkspacePane } from '@/api/types';
 import { StatusDot } from '@/components/StatusDot';
 import {
@@ -9,6 +9,12 @@ import {
  ContextMenuSeparator,
  ContextMenuTrigger,
 } from '@/components/ui/context-menu';
+import {
+  DropdownMenu,
+  DropdownMenuContent,
+  DropdownMenuItem,
+  DropdownMenuTrigger,
+} from '@/components/ui/dropdown-menu';
 import { useLongPress } from '@/hooks/useLongPress';
 import { cn } from '@/lib/utils';

@@ -20,7 +26,7 @@ interface Props {
  onCloseOthers: (chatId: string) => void;
  onCloseToRight: (chatId: string) => void;
  onCloseAll: () => void;
-  onNewChat: () => void;
+  onAddPane: (kind: 'chat' | 'terminal' | 'agent') => void;
  onShowHistory: () => void;
  onRename: (chatId: string, name: string) => Promise<void>;
  onRemovePane?: () => void;
@@ -34,7 +40,7 @@ export function ChatTabBar({
  onCloseOthers,
  onCloseToRight,
  onCloseAll,
-  onNewChat,
+  onAddPane,
  onShowHistory,
  onRename,
  onRemovePane,
@@ -125,7 +131,7 @@ export function ChatTabBar({
              </div>
            </ContextMenuTrigger>
            <ContextMenuContent>
-              <ContextMenuItem onSelect={() => onNewChat()}>
+              <ContextMenuItem onSelect={() => onAddPane('chat')}>
                New chat
              </ContextMenuItem>
              <ContextMenuSeparator />
@@ -164,15 +170,29 @@ export function ChatTabBar({
      )}

      <div className="flex items-center ml-auto gap-0.5 px-1 shrink-0">
+        <DropdownMenu>
+          <DropdownMenuTrigger asChild>
            <button
              type="button"
-          onClick={onNewChat}
              className="inline-flex items-center justify-center p-1 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:min-h-[44px] max-md:min-w-[44px]"
-          aria-label="New chat"
-          title="New chat"
+              aria-label="New pane"
+              title="New pane"
            >
              <Plus size={12} />
            </button>
+          </DropdownMenuTrigger>
+          <DropdownMenuContent align="end" className="min-w-40">
+            <DropdownMenuItem onSelect={() => onAddPane('chat')}>
+              <MessageSquare size={14} /> New chat
+            </DropdownMenuItem>
+            <DropdownMenuItem onSelect={() => onAddPane('terminal')}>
+              <Terminal size={14} /> New terminal
+            </DropdownMenuItem>
+            <DropdownMenuItem onSelect={() => onAddPane('agent')}>
+              <Bot size={14} /> New agent
+            </DropdownMenuItem>
+          </DropdownMenuContent>
+        </DropdownMenu>
        <button
          type="button"
          onClick={onShowHistory}
--- a/apps/web/src/components/ContextBar.tsx
+++ b/apps/web/src/components/ContextBar.tsx
@@ -0,0 +1,86 @@
+import type { Message } from '@/api/types';
+
+interface Props {
+  // Message list to scan; the newest entry carrying both ctx_used and a
+  // positive ctx_max drives the bar.
+  messages: Message[];
+}
+
+// v1.11.2: persistent context-usage indicator above MessageList. Mirrors the
+// server-side compaction.usable() formula — color thresholds are computed
+// against (max - 20k buffer), not raw max, so the bar turns amber / orange /
+// red at the same boundaries auto-compaction will fire. The popover above
+// the input (ChatContextPopover) uses raw-% thresholds and is intentionally
+// kept separate (it's a different surface and a different signal).
+const COMPACTION_BUFFER = 20_000;
+
+// Scan from the newest message backwards and return the first ctx_used /
+// ctx_max pair where both values are present and ctx_max is not <= 0. Rows
+// that carry ctx_used without a usable ctx_max (early v1, before llama-swap's
+// n_ctx capture worked) are passed over. Returns null when no row qualifies,
+// in which case the caller renders nothing.
+function latestPair(messages: Message[]): { used: number; max: number } | null {
+  let idx = messages.length;
+  while (idx-- > 0) {
+    const { ctx_used: used, ctx_max: max } = messages[idx]!;
+    if (used == null || max == null || max <= 0) continue;
+    return { used, max };
+  }
+  return null;
+}
+
+// Pair of Tailwind classes describing one severity level of the bar.
+interface ColorTier {
+  // Tailwind utility for the label / numbers. Uses literal palette names
+  // rather than design tokens because we want three distinct severities
+  // (amber → orange → red) and BooCode only defines one warning token
+  // (`destructive`). Literal classes keep the gradation explicit.
+  text: string;
+  // Tailwind background utility applied to the filled portion of the track.
+  bar: string;
+}
+
+// Map usage of the usable budget (1.0 = budget fully consumed) to a severity
+// tier: red from 95%, orange from 80%, amber from 60%, muted below that.
+function tierFor(usablePct: number): ColorTier {
+  // Ordered from most to least severe; the first floor reached wins.
+  const ladder: ReadonlyArray<readonly [number, ColorTier]> = [
+    [0.95, { text: 'text-red-600 dark:text-red-400', bar: 'bg-red-500' }],
+    [0.80, { text: 'text-orange-600 dark:text-orange-400', bar: 'bg-orange-500' }],
+    [0.60, { text: 'text-amber-600 dark:text-amber-400', bar: 'bg-amber-500' }],
+  ];
+  const match = ladder.find(([floor]) => usablePct >= floor);
+  return match ? match[1] : { text: 'text-muted-foreground', bar: 'bg-muted-foreground/40' };
+}
+
+/**
+ * Thin usage bar showing "<used> / <max> (NN%)" above a fill track. Renders
+ * nothing when no message carries a usable ctx_used / ctx_max pair.
+ */
+export function ContextBar({ messages }: Props) {
+  const latest = latestPair(messages);
+  if (latest === null) return null;
+  const { used, max } = latest;
+
+  // Colour severity is measured against the budget left after the compaction
+  // reserve; when no budget is left (max <= reserve) the ratio is pinned to 0.
+  const usable = Math.max(0, max - COMPACTION_BUFFER);
+  const tier = tierFor(usable > 0 ? used / usable : 0);
+
+  // The printed percentage and the fill width follow the raw used / max
+  // ratio. The width is clamped to [0, 100] so used > max cannot overflow the
+  // track visually.
+  const pct = used / max;
+  const fillPct = Math.min(100, Math.max(0, pct * 100));
+  const compactionThresholdPct = max > 0 ? Math.round((usable / max) * 100) : 0;
+
+  return (
+    <div className="border-b px-4 py-1 shrink-0">
+      <div className="max-w-[1000px] mx-auto w-full">
+        <div className="flex items-baseline justify-between text-[10px] font-mono leading-tight">
+          {/* "Context" on >=sm, "Ctx" on phones to save horizontal space. */}
+          <span className={tier.text}>
+            <span className="hidden sm:inline">Context</span>
+            <span className="sm:hidden">Ctx</span>
+          </span>
+          <span
+            className={tier.text}
+            title={`Auto-compaction at ~${compactionThresholdPct}%`}
+          >
+            {used.toLocaleString()} / {max.toLocaleString()}{' '}
+            <span className="max-[380px]:hidden">({Math.round(pct * 100)}%)</span>
+          </span>
+        </div>
+        <div className="mt-1 h-1 rounded-full bg-muted overflow-hidden">
+          <div
+            className={`h-full ${tier.bar} transition-[width] duration-300`}
+            style={{ width: `${fillPct}%` }}
+          />
+        </div>
+      </div>
+    </div>
+  );
+}
--- a/apps/web/src/components/MessageBubble.tsx
+++ b/apps/web/src/components/MessageBubble.tsx
@@ -537,7 +537,70 @@ function CompactCard({ message, sessionChats }: { message: Message; sessionChats
  );
 }

+// v1.11 anchored rolling summary card. Rendered for rows with summary=true
+// (inserted by services/compaction.ts as role='assistant' rows); the legacy
+// kind='compact' system rows produced by v1.10 /compact keep using
+// CompactCard. Starts collapsed: the header shows the timestamp, the summary
+// markdown appears when expanded, and the copy button mirrors CompactCard's
+// affordance.
+function SummaryCard({ message }: { message: Message }) {
+  const [expanded, setExpanded] = useState(false);
+  const [copied, setCopied] = useState(false);
+
+  // finished_at marks when the summary actually landed; rows without it fall
+  // back to created_at. Both are ISO strings.
+  const stamp = message.finished_at ?? message.created_at;
+  const headerTs = stamp ? new Date(stamp).toLocaleString() : '';
+
+  const Chevron = expanded ? ChevronDown : ChevronRight;
+  const CopyIcon = copied ? Check : Copy;
+
+  const toggle = () => setExpanded(!expanded);
+
+  // Copy the raw summary text; the check icon shows for 1.2s on success.
+  const handleCopy = async (): Promise<void> => {
+    try {
+      await navigator.clipboard.writeText(message.content);
+      setCopied(true);
+      setTimeout(() => setCopied(false), 1200);
+      toast.success('Summary copied to clipboard');
+    } catch {
+      toast.error('Copy failed');
+    }
+  };
+
+  return (
+    <div className="rounded-lg border border-primary/30 bg-primary/5 text-sm">
+      <div className="flex items-center gap-2 px-3 py-2">
+        <button
+          type="button"
+          onClick={toggle}
+          className="flex items-center gap-1.5 flex-1 min-w-0 text-left text-muted-foreground hover:text-foreground"
+        >
+          <Chevron size={14} />
+          <span className="text-xs font-medium truncate">
+            Compacted summary — {headerTs}
+          </span>
+        </button>
+        <button
+          type="button"
+          onClick={() => void handleCopy()}
+          className="p-1 rounded hover:bg-muted text-muted-foreground"
+          aria-label="Copy summary"
+          title="Copy summary"
+        >
+          <CopyIcon size={12} />
+        </button>
+      </div>
+      {expanded && (
+        <div className="px-3 pb-3 text-xs leading-relaxed border-t pt-2">
+          <MarkdownBody content={message.content} />
+        </div>
+      )}
+    </div>
+  );
+}
+
 export function MessageBubble({ message, sessionChats, capHitInfo }: Props) {
+  // v1.11: anchored rolling summary row. Checked BEFORE the kind==='compact'
+  // branch because summary=true never coexists with kind='compact' (new
+  // compactions emit role='assistant' rows with kind='message'+summary=true).
+  if (message.summary) {
+    return <SummaryCard message={message} />;
+  }
  if (message.kind === 'compact') {
    return <CompactCard message={message} sessionChats={sessionChats} />;
  }
--- a/apps/web/src/components/Workspace.tsx
+++ b/apps/web/src/components/Workspace.tsx
@@ -1,5 +1,5 @@
 import { useEffect, useMemo, useState } from 'react';
-import { PanelRight, MessageSquare, Terminal, Bot, Clipboard, X } from 'lucide-react';
+import { PanelRight, MessageSquare, Terminal, Bot, Clipboard, Plus, X } from 'lucide-react';
 import type { Chat, Project, Session, WorkspacePane } from '@/api/types';
 import { MAX_PANES, type UseWorkspacePanesResult } from '@/hooks/useWorkspacePanes';
 import type { UseSessionChatsResult } from '@/hooks/useSessionChats';
@@ -227,7 +227,10 @@ export function Workspace({
                  onCloseOthers={(chatId) => closeOtherTabs(idx, chatId)}
                  onCloseToRight={(chatId) => closeTabsToRight(idx, chatId)}
                  onCloseAll={() => closeAllTabs(idx)}
-                  onNewChat={() => void createChat(idx)}
+                  onAddPane={(kind) => {
+                    if (kind === 'chat') void createChat(idx);
+                    else addSplitPane(kind);
+                  }}
                  onShowHistory={() => showLandingPage(idx)}
                  onRename={renameChat}
                  onRemovePane={panes.length > 1 ? () => removePane(idx) : undefined}
@@ -239,6 +242,30 @@ export function Workspace({
                  <span className="text-xs text-muted-foreground">
                    {terminalLabels.get(pane.id) ?? 'Terminal'}
                  </span>
+                  <DropdownMenu>
+                    <DropdownMenuTrigger asChild>
+                      <button
+                        type="button"
+                        onClick={(e) => e.stopPropagation()}
+                        className="ml-auto inline-flex items-center justify-center size-5 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:size-7"
+                        aria-label="New pane"
+                        title="New pane"
+                      >
+                        <Plus size={12} />
+                      </button>
+                    </DropdownMenuTrigger>
+                    <DropdownMenuContent align="end" className="min-w-40">
+                      <DropdownMenuItem onSelect={() => addSplitPane('chat')}>
+                        <MessageSquare size={14} /> New chat
+                      </DropdownMenuItem>
+                      <DropdownMenuItem onSelect={() => addSplitPane('terminal')}>
+                        <Terminal size={14} /> New terminal
+                      </DropdownMenuItem>
+                      <DropdownMenuItem onSelect={() => addSplitPane('agent')}>
+                        <Bot size={14} /> New agent
+                      </DropdownMenuItem>
+                    </DropdownMenuContent>
+                  </DropdownMenu>
                  {/* v1.10.4: iOS Safari restricts navigator.clipboard.readText
                      outside direct user gestures. A real button click IS a
                      gesture, so this works where keystroke-driven paste may
@@ -250,7 +277,7 @@ export function Workspace({
                      e.stopPropagation();
                      terminalsRegistry.get(pane.id)?.paste();
                    }}
-                    className="ml-auto inline-flex items-center justify-center size-5 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:size-7"
+                    className="inline-flex items-center justify-center size-5 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:size-7"
                    aria-label="Paste from clipboard"
                    title="Paste from clipboard"
                  >
--- a/apps/web/src/components/panes/ChatPane.tsx
+++ b/apps/web/src/components/panes/ChatPane.tsx
@@ -7,6 +7,7 @@ import { useChatContextStats } from '@/hooks/useChatContextStats';
 import { MessageList } from '@/components/MessageList';
 import { ChatInput } from '@/components/ChatInput';
 import { ChatContextPopover } from '@/components/ChatContextPopover';
+import { ContextBar } from '@/components/ContextBar';
 import {
  DropdownMenu,
  DropdownMenuContent,
@@ -125,6 +126,10 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,

  return (
    <div className="flex flex-col h-full min-h-0">
+      {/* v1.11.2: persistent context-usage indicator. Renders null when there
+          are no assistant messages yet (fresh chat). shrink-0 keeps it out of
+          the MessageList scroll region — bar stays pinned, list scrolls. */}
+      <ContextBar messages={chatMessages} />
      <MessageList messages={chatMessages} sessionChats={sessionChats} />

      {/* Queued messages */}
--- a/apps/web/src/hooks/useSessionStream.ts
+++ b/apps/web/src/hooks/useSessionStream.ts
@@ -1,5 +1,7 @@
 import { useEffect, useRef, useState } from 'react';
+import { toast } from 'sonner';
 import type { Message, WsFrame } from '@/api/types';
+import { api } from '@/api/client';
 import { sessionEvents } from './sessionEvents';

 // session_renamed frame removed from WsFrame — it was declared but never
@@ -161,6 +163,12 @@ function applyFrame(state: State, frame: WsFrame): State {
        : state.messages;
      return { ...state, messages: next, error: frame.error };
    }
+    case 'compacted': {
+      // v1.11: side effects (refetch + toast) live in ws.onmessage; the
+      // reducer just no-ops so TS exhaustiveness is satisfied without
+      // duplicating async work inside a synchronous reducer.
+      return state;
+    }
  }
 }

@@ -196,6 +204,25 @@ export function useSessionStream(sessionId: string | undefined) {
      ws.onmessage = (ev) => {
        try {
          const frame = JSON.parse(typeof ev.data === 'string' ? ev.data : '') as WsFrame;
+          // v1.11: on a compaction completion, re-fetch the message list so
+          // the new summary row + the cohort of compacted_at-stamped older
+          // rows render correctly. We dispatch the fresh list as a synthetic
+          // 'snapshot' frame so the reducer's existing path handles state
+          // replacement (no need for a parallel "refetched" path).
+          // The toast is purely UX feedback; missing it would still leave
+          // the chat in a valid state.
+          if (frame.type === 'compacted') {
+            toast.success('Context compacted to free space');
+            void api.messages
+              .list(frame.session_id)
+              .then((messages) => {
+                setState((s) => applyFrame(s, { type: 'snapshot', messages }));
+              })
+              .catch((err: unknown) => {
+                console.warn('compacted refetch failed', err);
+              });
+            return;
+          }
          setState((s) => applyFrame(s, frame));
        } catch (err) {
          console.warn('bad ws frame', err);
Author	SHA1	Message	Date
indifferentketchup	3a5cf0c81a	merge v1.11.3-ctxmax	2026-05-20 19:29:26 +00:00
indifferentketchup	89dcfb95dc	v1.11.3: fix ctx_max capture via /props endpoint - llama-server does not emit n_ctx in timings (confirmed empirically); dead code at inference.ts:479 and compaction.ts:300 never fired - New model-context.ts: cached fetch of /upstream/<model>/props with positive-cache (no TTL) and 60s negative-cache - Wired into all 4 ctx_max write sites: 3 in inference.ts (executeToolPhase, finalizeCompletion, runCapHitSummary) and 1 in compaction.ts (summary row INSERT) - AbortController 3s timeout, lenient parsing with sensible defaults - 12 new vitest cases for the cache module (59 total) - 7 historical assistant rows backfilled manually (see notes)	2026-05-20 19:29:26 +00:00
indifferentketchup	8cd270a5da	ContextBar: persistent context-usage indicator above MessageList Walks chat messages newest-first for the latest ctx_used/ctx_max pair. Color tiers fire against (max - 20k compaction reserve) so the bar warns amber/orange/red at the same boundaries auto-compaction triggers. "Context" → "Ctx" at <640px, (NN%) drops at <380px. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-20 19:18:27 +00:00
indifferentketchup	c48de06f42	merge v1.11-compaction	2026-05-20 19:05:35 +00:00
indifferentketchup	dc43dd44f9	v1.11: opencode-style compaction port - compaction.ts: usable/isOverflow/estimate/turns/select/buildPrompt/process - compaction-prompt.ts: SUMMARY_TEMPLATE verbatim from opencode - schema: messages.{compacted_at,summary,tail_start_id} + chats.needs_compaction - inference: auto-trigger on overflow, pre-fetch compaction before next turn - /compact slash command rewired to new path - WS: chat_status working/idle around compaction + compacted frame - frontend: SummaryCard + sonner toast on compacted - 24 unit tests for pure functions	2026-05-20 19:05:35 +00:00
indifferentketchup	6aab4f7d2a	ChatTabBar: + button dropdown to add chat / terminal / agent pane Replaces single onNewChat handler with onAddPane(kind). Terminal pane header gets matching + dropdown. Context menu "New chat" stays. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-20 18:13:55 +00:00