Compare commits
6 Commits
2d841ee0b4
...
v1.11.0-co
| Author | SHA1 | Date | |
|---|---|---|---|
| | 3a5cf0c81a | | |
| | 89dcfb95dc | | |
| | 8cd270a5da | | |
| | c48de06f42 | | |
| | dc43dd44f9 | | |
| | 6aab4f7d2a | | |
@@ -19,6 +19,8 @@ import { registerSkillsRoutes } from './routes/skills.js';
|
||||
import { createInferenceRunner } from './services/inference.js';
|
||||
import { createBroker } from './services/broker.js';
|
||||
import { listSkills } from './services/skills.js';
|
||||
import * as compaction from './services/compaction.js';
|
||||
import { configureModelContext } from './services/model-context.js';
|
||||
|
||||
async function main() {
|
||||
const config = loadConfig();
|
||||
@@ -47,6 +49,11 @@ async function main() {
|
||||
await applySchema(sql);
|
||||
app.log.info('database schema applied');
|
||||
|
||||
// v1.11.3: tell the model-context cache where llama-swap lives. Cache
|
||||
// lookups go to ${LLAMA_SWAP_URL}/upstream/<model>/props to read
|
||||
// default_generation_settings.n_ctx — the value persisted as messages.ctx_max.
|
||||
configureModelContext({ llamaSwapUrl: config.LLAMA_SWAP_URL });
|
||||
|
||||
await app.register(fastifyWebsocket);
|
||||
|
||||
app.get('/api/health', async () => {
|
||||
@@ -81,6 +88,11 @@ async function main() {
|
||||
publish: (sessionId, frame) => {
|
||||
broker.publish(sessionId, frame as unknown as Record<string, unknown> & { type: string });
|
||||
},
|
||||
// v1.11: broker handle for compaction.process to publish 'compacted'
|
||||
// frames on the per-session channel. Inference's regular publish path
|
||||
// is bound to (sessionId, InferenceFrame); compaction publishes a
|
||||
// different frame shape, so it goes through the raw broker.
|
||||
broker,
|
||||
},
|
||||
(user, frame) => {
|
||||
broker.publishUser(user, frame as unknown as Record<string, unknown> & { type: string });
|
||||
@@ -90,9 +102,13 @@ async function main() {
|
||||
enqueueInference: (sessionId, chatId, assistantId, user) => {
|
||||
inference.enqueue(sessionId, chatId, assistantId, user);
|
||||
},
|
||||
enqueueCompact: (sessionId, chatId, compactId, user) => {
|
||||
inference.enqueueCompact(sessionId, chatId, compactId, user);
|
||||
},
|
||||
// v1.11: synchronous compaction. Awaits the LLM call inside the route's
|
||||
// request lifecycle; the new summary row arrives via the WS 'compacted'
|
||||
// frame published from inside compaction.process. We let the error
|
||||
// bubble up so the route can reply 500 — manual /compact failures
|
||||
// should be loud (the user just clicked a button).
|
||||
runCompaction: (chatId) =>
|
||||
compaction.process({ sql, config, log: app.log, broker, chatId }),
|
||||
cancelInference: async (sessionId, chatId) => {
|
||||
return inference.cancel(sessionId, chatId);
|
||||
},
|
||||
|
||||
@@ -316,7 +316,8 @@ export function registerChatRoutes(
|
||||
}
|
||||
const rows = await sql<Message[]>`
|
||||
SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
|
||||
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
|
||||
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
|
||||
summary, tail_start_id, compacted_at
|
||||
FROM messages
|
||||
WHERE chat_id = ${req.params.id}
|
||||
ORDER BY created_at ASC, id ASC
|
||||
|
||||
@@ -49,7 +49,12 @@ const AskUserInputArgs = z.object({
|
||||
|
||||
interface MessageHandlers {
|
||||
enqueueInference: (sessionId: string, chatId: string, assistantMessageId: string, user: string) => void;
|
||||
enqueueCompact: (sessionId: string, chatId: string, compactMessageId: string, user: string) => void;
|
||||
// v1.11: returns a promise that resolves after compaction.process finishes
|
||||
// (await the LLM call). Throws on failure — the route surfaces a 500.
|
||||
// Replaces the v1.10 enqueueCompact (which fired-and-forgot a kind='compact'
|
||||
// streaming row). The new anchored-rolling strategy inserts a single
|
||||
// summary=true assistant row only after the LLM responds.
|
||||
runCompaction: (chatId: string) => Promise<void>;
|
||||
publishUserMessage: (
|
||||
sessionId: string,
|
||||
chatId: string,
|
||||
@@ -81,9 +86,15 @@ export function registerMessageRoutes(
|
||||
reply.code(404);
|
||||
return { error: 'session not found' };
|
||||
}
|
||||
// v1.11: returns ALL messages including compacted ones. The UI
|
||||
// distinguishes via the new `summary` flag (renders an accordion
|
||||
// SummaryCard) and shows compacted_at-stamped rows inline for context.
|
||||
// Internal inference assembly filters compacted_at IS NULL separately —
|
||||
// see services/inference.ts loadContext + services/compaction.ts.
|
||||
const rows = await sql<Message[]>`
|
||||
SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
|
||||
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
|
||||
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
|
||||
summary, tail_start_id, compacted_at
|
||||
FROM messages
|
||||
WHERE session_id = ${req.params.id}
|
||||
ORDER BY created_at ASC, id ASC
|
||||
@@ -251,29 +262,30 @@ export function registerMessageRoutes(
|
||||
}
|
||||
);
|
||||
|
||||
// v1.11: manual /compact. Was a streaming kind='compact' row inserted by
|
||||
// this handler; now delegates to the anchored-rolling compaction service.
|
||||
// Synchronous (we await the LLM call) — callers either await or rely on
|
||||
// the 'compacted' WS frame to refresh their view. The response carries
|
||||
// no body of interest; the new summary row arrives via the WS frame.
|
||||
app.post<{ Params: { id: string } }>(
|
||||
'/api/chats/:id/compact',
|
||||
async (req, reply) => {
|
||||
const chatRows = await sql<Chat[]>`
|
||||
SELECT id, session_id FROM chats WHERE id = ${req.params.id} AND status = 'open'
|
||||
const chatRows = await sql<{ id: string }[]>`
|
||||
SELECT id FROM chats WHERE id = ${req.params.id} AND status = 'open'
|
||||
`;
|
||||
if (chatRows.length === 0) {
|
||||
reply.code(404);
|
||||
return { error: 'chat not found' };
|
||||
}
|
||||
const chat = chatRows[0]!;
|
||||
const sessionId = chat.session_id;
|
||||
|
||||
const [compactMsg] = await sql<{ id: string }[]>`
|
||||
INSERT INTO messages (session_id, chat_id, role, content, kind, status, created_at)
|
||||
VALUES (${sessionId}, ${chat.id}, 'system', '', 'compact', 'streaming', clock_timestamp())
|
||||
RETURNING id
|
||||
`;
|
||||
|
||||
handlers.enqueueCompact(sessionId, chat.id, compactMsg!.id, 'default');
|
||||
|
||||
reply.code(202);
|
||||
return { compact_message_id: compactMsg!.id };
|
||||
try {
|
||||
await handlers.runCompaction(chatRows[0]!.id);
|
||||
} catch (err) {
|
||||
req.log.error({ err, chatId: chatRows[0]!.id }, 'manual compaction failed');
|
||||
reply.code(500);
|
||||
return { error: err instanceof Error ? err.message : 'compaction failed' };
|
||||
}
|
||||
reply.code(200);
|
||||
return { ok: true };
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
@@ -21,9 +21,12 @@ export function registerWebSocket(
|
||||
return;
|
||||
}
|
||||
|
||||
// v1.11: snapshot includes compaction fields so MessageBubble can
|
||||
// render the SummaryCard for summary=true rows on first connect.
|
||||
const messages = await sql<Message[]>`
|
||||
SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
|
||||
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
|
||||
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
|
||||
summary, tail_start_id, compacted_at
|
||||
FROM messages
|
||||
WHERE session_id = ${sessionId}
|
||||
ORDER BY created_at ASC, id ASC
|
||||
|
||||
@@ -179,3 +179,25 @@ INSERT INTO settings (key, value) VALUES ('theme_mode', '"dark"') ON CONFLICT (k
|
||||
ALTER TABLE projects ADD COLUMN IF NOT EXISTS default_system_prompt TEXT NOT NULL DEFAULT '';
|
||||
ALTER TABLE projects ADD COLUMN IF NOT EXISTS default_web_search_enabled BOOLEAN NOT NULL DEFAULT false;
|
||||
ALTER TABLE sessions ADD COLUMN IF NOT EXISTS web_search_enabled BOOLEAN;
|
||||
|
||||
-- v1.11: anchored rolling compaction.
|
||||
-- compacted_at — marks rows that are "behind the curtain" of the latest
|
||||
-- summary. Inference assembly filters compacted_at IS NULL;
|
||||
-- the API GET still returns all rows so the UI can show
|
||||
-- history with the summary card inline.
|
||||
-- summary — true on the assistant row that IS the anchored summary.
|
||||
-- Exactly one row per chat is the "current" summary
|
||||
-- (every prior summary row is itself compacted_at-stamped
|
||||
-- when superseded, leaving one live anchor).
|
||||
-- tail_start_id — points at the first preserved message that the summary
|
||||
-- covers up to (exclusive). Lets the UI/debug reason about
|
||||
-- the boundary without re-deriving from compacted_at.
|
||||
-- needs_compaction — flag on chats (not sessions) because chat history is
|
||||
-- per-chat; sessions have 1:N chats. Set true post-overflow,
|
||||
-- cleared by compaction.process at the start of the next
|
||||
-- inference turn.
|
||||
ALTER TABLE messages ADD COLUMN IF NOT EXISTS compacted_at TIMESTAMPTZ;
|
||||
ALTER TABLE messages ADD COLUMN IF NOT EXISTS summary BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
ALTER TABLE messages ADD COLUMN IF NOT EXISTS tail_start_id UUID REFERENCES messages(id) ON DELETE SET NULL;
|
||||
ALTER TABLE chats ADD COLUMN IF NOT EXISTS needs_compaction BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
CREATE INDEX IF NOT EXISTS idx_messages_chat_compacted ON messages (chat_id, compacted_at);
|
||||
|
||||
258
apps/server/src/services/__tests__/compaction.test.ts
Normal file
258
apps/server/src/services/__tests__/compaction.test.ts
Normal file
@@ -0,0 +1,258 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
usable,
|
||||
isOverflow,
|
||||
estimate,
|
||||
turns,
|
||||
select,
|
||||
buildPrompt,
|
||||
type CompactionMessage,
|
||||
} from '../compaction.js';
|
||||
import { SUMMARY_TEMPLATE } from '../compaction-prompt.js';
|
||||
|
||||
// ---- fixture ----------------------------------------------------------------
|
||||
// Tiny constructor for the message shape `compaction.ts` consumes. Default
|
||||
// values match the post-CP1 schema (summary=false, kind='message', complete).
|
||||
// Tests that need a summary row pass `summary: true`.
|
||||
|
||||
let counter = 0;

/**
 * Builds a `CompactionMessage` fixture.
 *
 * Each call bumps the module-level `counter`; the new value drives both the
 * generated id (`m<counter>`) and `created_at` (`counter` seconds after the
 * Unix epoch), so fixtures created later sort later.
 *
 * @param role      Message role.
 * @param content   Message body.
 * @param overrides Fields that replace the defaults (applied last).
 * @returns A complete, non-summary, `kind: 'message'` row unless overridden.
 */
function mkMsg(
  role: CompactionMessage['role'],
  content: string,
  overrides: Partial<CompactionMessage> = {},
): CompactionMessage {
  counter += 1;
  const seq = counter;
  const defaults: CompactionMessage = {
    id: `m${seq}`,
    role,
    content,
    kind: 'message',
    summary: false,
    status: 'complete',
    tool_calls: null,
    tool_results: null,
    metadata: null,
    created_at: new Date(seq * 1000).toISOString(),
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
// ---- usable -----------------------------------------------------------------
|
||||
|
||||
// usable(): context window minus the 20k reserve, never negative.
describe('usable', () => {
  it('returns 0 when contextLimit is 0', () => {
    const headroom = usable(0);
    expect(headroom).toBe(0);
  });

  it('returns 0 when contextLimit is below the 20k buffer', () => {
    // Limits at or under the 20k reserve must clamp to 0 instead of going
    // negative. A 0 result is what makes isOverflow bail out with false,
    // since a compaction cannot be sized without headroom.
    for (const limit of [10_000, 19_999, 20_000]) {
      expect(usable(limit)).toBe(0);
    }
  });

  it('subtracts the 20k buffer from a normal-sized context window', () => {
    const cases: Array<[number, number]> = [
      [100_000, 80_000],
      [32_768, 12_768],
    ];
    for (const [limit, expected] of cases) {
      expect(usable(limit)).toBe(expected);
    }
  });
});
|
||||
|
||||
// ---- isOverflow -------------------------------------------------------------
|
||||
|
||||
// isOverflow(): compares prompt + completion tokens against usable(limit).
describe('isOverflow', () => {
  it('returns false when usable is 0 (unknown / sub-buffer context)', () => {
    const promptHeavy = { prompt_tokens: 999_999, completion_tokens: 0 };
    const completionHeavy = { prompt_tokens: 0, completion_tokens: 999_999 };
    expect(isOverflow(promptHeavy, 0)).toBe(false);
    expect(isOverflow(completionHeavy, 10_000)).toBe(false);
  });

  it('returns false at 50% of usable', () => {
    // A 100k window leaves 80k usable; 30k + 10k is half of that.
    const halfFull = { prompt_tokens: 30_000, completion_tokens: 10_000 };
    expect(isOverflow(halfFull, 100_000)).toBe(false);
  });

  it('returns false just under usable', () => {
    const oneShort = { prompt_tokens: 79_000, completion_tokens: 999 };
    expect(isOverflow(oneShort, 100_000)).toBe(false);
  });

  it('returns true exactly at usable (>=, not strict >)', () => {
    const exactlyFull = { prompt_tokens: 80_000, completion_tokens: 0 };
    expect(isOverflow(exactlyFull, 100_000)).toBe(true);
  });

  it('returns true above usable', () => {
    const overFull = { prompt_tokens: 50_000, completion_tokens: 40_000 };
    expect(isOverflow(overFull, 100_000)).toBe(true);
  });
});
|
||||
|
||||
// ---- estimate ---------------------------------------------------------------
|
||||
|
||||
// estimate(): character-count based token estimate over serialized messages.
describe('estimate', () => {
  it('returns a tiny value for an empty array (JSON.stringify([]) is "[]")', () => {
    // '[]' is two characters and ceil(2 / 4) is 1, so the baseline is 1,
    // not 0. No JSON-serializable input can produce an estimate of exactly 0.
    const none: CompactionMessage[] = [];
    expect(estimate(none)).toBe(1);
  });

  it('scales roughly with content length', () => {
    const shortCost = estimate([mkMsg('user', 'hi')]);
    const longCost = estimate([mkMsg('user', 'x'.repeat(4000))]);
    expect(longCost).toBeGreaterThan(shortCost);
    // 4000 content characters alone account for at least 1000 tokens.
    expect(longCost).toBeGreaterThanOrEqual(1000);
  });

  it('is deterministic across repeated calls', () => {
    const pair = [mkMsg('user', 'one'), mkMsg('assistant', 'two')];
    const firstRun = estimate(pair);
    const secondRun = estimate(pair);
    expect(firstRun).toBe(secondRun);
  });
});
|
||||
|
||||
// ---- turns ------------------------------------------------------------------
|
||||
|
||||
// turns(): splits a message list into user-initiated turns ({ start, end, id }).
describe('turns', () => {
  it('returns [] for an empty message list', () => {
    const none: CompactionMessage[] = [];
    expect(turns(none)).toEqual([]);
  });

  it('returns one turn for a single user message', () => {
    const question = mkMsg('user', 'hi');
    const found = turns([question]);
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual({ start: 0, end: 1, id: question.id });
  });

  it('returns two turns for user/assistant/user/assistant', () => {
    const q1 = mkMsg('user', 'q1');
    const r1 = mkMsg('assistant', 'a1');
    const q2 = mkMsg('user', 'q2');
    const r2 = mkMsg('assistant', 'a2');
    const expected = [
      { start: 0, end: 2, id: q1.id },
      { start: 2, end: 4, id: q2.id },
    ];
    expect(turns([q1, r1, q2, r2])).toEqual(expected);
  });

  it('extends the final turn end to include trailing non-user messages', () => {
    // Trailing non-user rows belong to the last turn: [user, assistant,
    // system] is one turn whose end (3) covers all three indices.
    const question = mkMsg('user', 'q');
    const reply = mkMsg('assistant', 'a');
    const note = mkMsg('system', 'note');
    const found = turns([question, reply, note]);
    expect(found).toEqual([{ start: 0, end: 3, id: question.id }]);
  });

  it('skips user rows flagged as summary (anchored-rolling rows)', () => {
    // process() already filters summary rows out, but turns() ignores them
    // as well so another caller cannot turn the summary row into a bogus
    // turn boundary.
    const q1 = mkMsg('user', 'q1');
    const r1 = mkMsg('assistant', 'a1');
    const rolledUp = mkMsg('user', 'rolled-up', { summary: true });
    const q2 = mkMsg('user', 'q2');
    const found = turns([q1, r1, rolledUp, q2]);
    const ids = found.map(({ id }) => id);
    expect(ids).toEqual([q1.id, q2.id]);
  });
});
|
||||
|
||||
// ---- select -----------------------------------------------------------------
|
||||
|
||||
// select(): picks the head to summarize and the id where the preserved tail starts.
describe('select', () => {
  it('returns empty head + undefined tail for an empty message list', () => {
    const { head, tail_start_id } = select([], 100_000);
    expect(head).toEqual([]);
    expect(tail_start_id).toBeUndefined();
  });

  it('full-preserves when there are fewer turns than tail_turns', () => {
    // Only one turn exists while two are requested, so the kept range starts
    // at index 0. That is the sentinel result meaning "nothing to compact".
    const question = mkMsg('user', 'only');
    const reply = mkMsg('assistant', 'a');
    const { head, tail_start_id } = select([question, reply], 100_000, 2);
    expect(head).toEqual([question, reply]);
    expect(tail_start_id).toBeUndefined();
  });

  it('keeps the last tail_turns turns when they all fit the budget', () => {
    // Three small turns at [0,2), [2,4), [4,6). With tail_turns=2 the last
    // two are preserved, so the tail starts at q2 and the head is turn one.
    const q1 = mkMsg('user', 'q1');
    const r1 = mkMsg('assistant', 'a1');
    const q2 = mkMsg('user', 'q2');
    const r2 = mkMsg('assistant', 'a2');
    const q3 = mkMsg('user', 'q3');
    const r3 = mkMsg('assistant', 'a3');
    const history = [q1, r1, q2, r2, q3, r3];
    const picked = select(history, 100_000, 2);
    expect(picked.tail_start_id).toBe(q2.id);
    expect(picked.head).toEqual([q1, r1]);
  });

  it('splits a turn mid-stream when the whole turn would overflow the budget', () => {
    // tail_turns=1 looks only at the newest turn. That turn carries ~10k
    // tokens, more than the 8k maximum preserve budget, so it has to be
    // split at the largest suffix that still fits.
    const q1 = mkMsg('user', 'q1');
    const r1 = mkMsg('assistant', 'a1');
    const q2 = mkMsg('user', 'q2 with a giant payload');
    const giant = mkMsg('assistant', 'X'.repeat(40_000)); // ~10k tokens
    const closing = mkMsg('assistant', 'short answer');
    const history = [q1, r1, q2, giant, closing];
    const picked = select(history, 100_000, 1);
    // The exact split id depends on character math, so only the outcome is
    // asserted: compaction happened (tail id set, head non-empty) and the
    // small closing message stayed out of the head.
    expect(picked.tail_start_id).toBeDefined();
    expect(picked.head.length).toBeGreaterThan(0);
    expect(picked.head).not.toContain(closing);
  });

  it('full-preserves when no split point fits', () => {
    // A single oversized turn under a small window: a 30k context leaves
    // 10k usable, giving a preserve budget of
    // min(8k, max(2k, floor(10k * 0.25))) = 2500 tokens. The 40k-character
    // reply is roughly 10k tokens, so no suffix fits and select() falls back
    // to the full-preserve sentinel.
    const question = mkMsg('user', 'oversized');
    const reply = mkMsg('assistant', 'Y'.repeat(40_000));
    const picked = select([question, reply], 30_000, 1);
    expect(picked.tail_start_id).toBeUndefined();
    expect(picked.head).toEqual([question, reply]);
  });
});
|
||||
|
||||
// ---- buildPrompt ------------------------------------------------------------
|
||||
|
||||
// buildPrompt(): anchor sentence + optional previous summary + template + extras.
describe('buildPrompt', () => {
  it('opens with the "create new" anchor when previousSummary is undefined', () => {
    const prompt = buildPrompt(undefined, []);
    const opensWithCreate = prompt.startsWith('Create a new anchored summary');
    expect(opensWithCreate).toBe(true);
    expect(prompt).toContain(SUMMARY_TEMPLATE);
    expect(prompt).not.toContain('<previous-summary>');
  });

  it('opens with the "update" anchor and embeds previousSummary verbatim', () => {
    const earlier = '## Goal\n- finish v1.11 compaction';
    const prompt = buildPrompt(earlier, []);
    const opensWithUpdate = prompt.startsWith('Update the anchored summary');
    expect(opensWithUpdate).toBe(true);
    expect(prompt).toContain('<previous-summary>');
    expect(prompt).toContain(earlier);
    expect(prompt).toContain('</previous-summary>');
    expect(prompt).toContain(SUMMARY_TEMPLATE);
  });

  it('appends extra context strings after the template (reserved for plugin injection)', () => {
    const extras = ['extra-context-line'];
    const prompt = buildPrompt(undefined, extras);
    const endsWithExtra = prompt.endsWith('extra-context-line');
    expect(endsWithExtra).toBe(true);
  });
});
|
||||
205
apps/server/src/services/__tests__/model-context.test.ts
Normal file
205
apps/server/src/services/__tests__/model-context.test.ts
Normal file
@@ -0,0 +1,205 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
configureModelContext,
|
||||
getModelContext,
|
||||
invalidateModelContext,
|
||||
} from '../model-context.js';
|
||||
|
||||
// ---- fixtures ---------------------------------------------------------------
|
||||
|
||||
/** Base URL handed to configureModelContext before every test. */
const TEST_URL = 'http://llama-swap.test:8401';

/**
 * Builds a 200 JSON `Response` shaped like a successful props lookup.
 *
 * @param n_ctx       Value placed at `default_generation_settings.n_ctx`.
 * @param total_slots Value placed at `total_slots` (defaults to 1).
 */
function mockOkProps(n_ctx: number, total_slots = 1) {
  const payload = {
    default_generation_settings: { n_ctx },
    total_slots,
  };
  const init = { status: 200, headers: { 'Content-Type': 'application/json' } };
  return new Response(JSON.stringify(payload), init);
}
|
||||
|
||||
// Every test starts from an empty cache that points at the fake upstream.
beforeEach(function resetModelContext() {
  invalidateModelContext();
  configureModelContext({ llamaSwapUrl: TEST_URL });
});

// Undo fetch spies and any fake timers a test installed.
afterEach(function restoreGlobals() {
  vi.restoreAllMocks();
  vi.useRealTimers();
});
|
||||
|
||||
// ---- positive cache ---------------------------------------------------------
|
||||
|
||||
// Successful lookups: parsed once, then served from cache.
describe('getModelContext — positive cache', () => {
  it('returns the parsed body on a 200 with valid shape', async () => {
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(mockOkProps(262_144, 1));

    const ctx = await getModelContext('qwen3.6');

    expect(ctx).not.toBeNull();
    expect(ctx!.n_ctx).toBe(262_144);
    expect(ctx!.total_slots).toBe(1);
    expect(typeof ctx!.fetched_at).toBe('number');
    // The request must target <base>/upstream/<model>/props and carry an
    // abort signal.
    const expectedUrl = `${TEST_URL}/upstream/qwen3.6/props`;
    const expectedInit = expect.objectContaining({ signal: expect.any(AbortSignal) });
    expect(fetchSpy).toHaveBeenCalledExactlyOnceWith(expectedUrl, expectedInit);
  });

  it('serves the second call from cache without refetching', async () => {
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(mockOkProps(262_144));

    const first = await getModelContext('qwen3.6');
    const second = await getModelContext('qwen3.6');

    expect(first).toEqual(second);
    expect(fetchSpy).toHaveBeenCalledTimes(1);
  });

  it('defaults total_slots to 1 when the server omits it', async () => {
    // total_slots is informational; a response without it is still accepted.
    const body = JSON.stringify({ default_generation_settings: { n_ctx: 8192 } });
    vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(new Response(body, { status: 200 }));

    const ctx = await getModelContext('partial-model');

    expect(ctx).not.toBeNull();
    expect(ctx!.n_ctx).toBe(8192);
    expect(ctx!.total_slots).toBe(1);
  });
});
|
||||
|
||||
// ---- negative cache (single-shot) ------------------------------------------
|
||||
|
||||
// Each failure mode yields null and is remembered, so a repeat call does not refetch.
describe('getModelContext — negative cache (single failure modes)', () => {
  it('returns null and negative-caches when default_generation_settings is missing', async () => {
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(new Response(JSON.stringify({ total_slots: 1 }), { status: 200 }));

    const first = await getModelContext('broken');
    expect(first).toBeNull();
    // A repeat call inside the TTL must be answered without another fetch.
    const second = await getModelContext('broken');
    expect(second).toBeNull();
    expect(fetchSpy).toHaveBeenCalledTimes(1);
  });

  it('returns null and negative-caches when n_ctx is missing inside default_generation_settings', async () => {
    const body = JSON.stringify({ default_generation_settings: {}, total_slots: 1 });
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(new Response(body, { status: 200 }));

    await getModelContext('half-broken');
    await getModelContext('half-broken');

    expect(fetchSpy).toHaveBeenCalledTimes(1);
  });

  it('returns null and negative-caches on non-200 (404)', async () => {
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(new Response('not found', { status: 404 }));

    const first = await getModelContext('missing-model');
    expect(first).toBeNull();
    const second = await getModelContext('missing-model');
    expect(second).toBeNull();
    expect(fetchSpy).toHaveBeenCalledTimes(1);
  });

  it('returns null and negative-caches on network error', async () => {
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockRejectedValueOnce(new TypeError('fetch failed: connect ECONNREFUSED'));

    const first = await getModelContext('down-upstream');
    expect(first).toBeNull();
    const second = await getModelContext('down-upstream');
    expect(second).toBeNull();
    expect(fetchSpy).toHaveBeenCalledTimes(1);
  });
});
|
||||
|
||||
// ---- negative cache TTL -----------------------------------------------------
|
||||
|
||||
// Negative entries expire: no refetch inside 60s, a fresh fetch after it.
describe('getModelContext — negative cache TTL', () => {
  it('does NOT refetch when a second call lands within the 60s TTL', async () => {
    vi.useFakeTimers();
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(new Response('boom', { status: 500 }));

    await getModelContext('flapping');
    vi.advanceTimersByTime(30_000);
    await getModelContext('flapping');

    expect(fetchSpy).toHaveBeenCalledTimes(1);
  });

  it('refetches when the second call lands after the 60s TTL expires', async () => {
    vi.useFakeTimers();
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(new Response('boom', { status: 500 }));
    // The upstream has recovered by the time of the retry, so the second
    // fetch should produce a usable (positive) result.
    fetchSpy.mockResolvedValueOnce(mockOkProps(8192));

    await getModelContext('flapping');
    vi.advanceTimersByTime(61_000);
    const recovered = await getModelContext('flapping');

    expect(recovered).not.toBeNull();
    expect(recovered!.n_ctx).toBe(8192);
    expect(fetchSpy).toHaveBeenCalledTimes(2);
  });
});
|
||||
|
||||
// ---- invalidateModelContext -------------------------------------------------
|
||||
|
||||
// invalidateModelContext(): drops one model's entry, or every entry when called bare.
describe('invalidateModelContext', () => {
  it('clears a single positive entry by model name', async () => {
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(mockOkProps(8192));
    fetchSpy.mockResolvedValueOnce(mockOkProps(8192));

    await getModelContext('cleared');
    invalidateModelContext('cleared');
    await getModelContext('cleared');

    expect(fetchSpy).toHaveBeenCalledTimes(2);
  });

  it('clears ALL entries when called with no arg', async () => {
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(mockOkProps(8192));
    fetchSpy.mockResolvedValueOnce(mockOkProps(16_384));
    // Once everything is cleared, both models have to be fetched again.
    fetchSpy.mockResolvedValueOnce(mockOkProps(8192));
    fetchSpy.mockResolvedValueOnce(mockOkProps(16_384));

    await getModelContext('alpha');
    await getModelContext('beta');
    invalidateModelContext();
    await getModelContext('alpha');
    await getModelContext('beta');

    expect(fetchSpy).toHaveBeenCalledTimes(4);
  });

  it('clearing a positive entry also clears the matching negative entry', async () => {
    // The first lookup fails and is negative-cached. After an explicit
    // invalidation the next lookup must hit the network again instead of
    // replaying the stale failure.
    const fetchSpy = vi.spyOn(globalThis, 'fetch');
    fetchSpy.mockResolvedValueOnce(new Response('boom', { status: 500 }));
    fetchSpy.mockResolvedValueOnce(mockOkProps(4096));

    await getModelContext('formerly-broken');
    invalidateModelContext('formerly-broken');
    const recovered = await getModelContext('formerly-broken');

    expect(recovered).not.toBeNull();
    expect(recovered!.n_ctx).toBe(4096);
    expect(fetchSpy).toHaveBeenCalledTimes(2);
  });
});
|
||||
40
apps/server/src/services/compaction-prompt.ts
Normal file
40
apps/server/src/services/compaction-prompt.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
// v1.11: anchored rolling summary template. Verbatim port from opencode
|
||||
// (packages/opencode/src/session/compaction.ts SUMMARY_TEMPLATE). Kept in a
|
||||
// separate module so the long template literal doesn't bloat compaction.ts.
|
||||
|
||||
/**
 * Instruction block that tells the model which Markdown sections the anchored
 * summary must contain, followed by the formatting rules. Assembled from one
 * array entry per output line; empty entries are the blank separator lines.
 */
export const SUMMARY_TEMPLATE = [
  'Output exactly the Markdown structure shown inside <template> and keep the section order unchanged. Do not include the <template> tags in your response.',
  '<template>',
  '## Goal',
  '- [single-sentence task summary]',
  '',
  '## Constraints & Preferences',
  '- [user constraints, preferences, specs, or "(none)"]',
  '',
  '## Progress',
  '### Done',
  '- [completed work or "(none)"]',
  '',
  '### In Progress',
  '- [current work or "(none)"]',
  '',
  '### Blocked',
  '- [blockers or "(none)"]',
  '',
  '## Key Decisions',
  '- [decision and why, or "(none)"]',
  '',
  '## Next Steps',
  '- [ordered next actions or "(none)"]',
  '',
  '## Critical Context',
  '- [important technical facts, errors, open questions, or "(none)"]',
  '',
  '## Relevant Files',
  '- [file or directory path: why it matters, or "(none)"]',
  '</template>',
  '',
  'Rules:',
  '- Keep every section, even when empty.',
  '- Use terse bullets, not prose paragraphs.',
  '- Preserve exact file paths, commands, error strings, and identifiers when known.',
  '- Do not mention the summary process or that context was compacted.',
].join('\n');
|
||||
510
apps/server/src/services/compaction.ts
Normal file
510
apps/server/src/services/compaction.ts
Normal file
@@ -0,0 +1,510 @@
|
||||
// v1.11: anchored rolling compaction. Ported algorithms (not Effect-TS code)
|
||||
// from opencode (packages/opencode/src/session/{compaction,overflow}.ts).
|
||||
//
|
||||
// What's different from BooCode's legacy /compact:
|
||||
// - Operates per-chat (chats have N:1 to sessions; history is per-chat).
|
||||
// - Detects overflow automatically after each inference completion using
|
||||
// llama-swap's reported n_ctx; flags chats.needs_compaction=true.
|
||||
// - On the next turn (or manual /compact) we summarize the *head* (messages
|
||||
// prior to a preserved tail of N user-turns) into a single
|
||||
// summary=true assistant row. Older messages get compacted_at-stamped so
|
||||
// inference assembly filters them out; the GET endpoint still returns
|
||||
// them so the UI can show history with the summary card inline.
|
||||
// - The summary is *anchored rolling* — exactly one live summary=true row
|
||||
// per chat. Subsequent compactions read the prior summary as
|
||||
// previousSummary, ask the LLM to update-merge it, then mark the prior
|
||||
// summary row compacted_at too (it stays in the UI but isn't sent to the
|
||||
// LLM again).
|
||||
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { Sql } from '../db.js';
|
||||
import type { Config } from '../config.js';
|
||||
import type { Broker } from './broker.js';
|
||||
import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
|
||||
import * as modelContextLookup from './model-context.js';
|
||||
|
||||
/** Tokens subtracted from the model's context window by usable(). */
const COMPACTION_BUFFER = 20_000;

/** Lower bound, in estimated tokens, of the tail budget computed by select(). */
const MIN_PRESERVE_RECENT_TOKENS = 2_000;

/** Upper bound, in estimated tokens, of the tail budget computed by select(). */
const MAX_PRESERVE_RECENT_TOKENS = 8_000;

/** Default number of trailing user-turns select() tries to keep verbatim. */
const DEFAULT_TAIL_TURNS = 2;
|
||||
|
||||
/**
 * The slice of a message row that compaction works with. Only the columns
 * compaction needs are selected, which keeps process() independent of api.ts
 * mutations and reduces DB egress.
 */
export interface CompactionMessage {
  /** Message row id. */
  id: string;
  /** Author of the row. */
  role: 'user' | 'assistant' | 'system' | 'tool';
  /** Message text. */
  content: string;
  /** 'compact' marks a legacy compact row; everything else is 'message'. */
  kind: 'message' | 'compact';
  /** True for an anchored summary row produced by compaction. */
  summary: boolean;
  /** Lifecycle state of the row. */
  status: 'streaming' | 'complete' | 'failed' | 'cancelled';
  /** Tool invocations requested by an assistant row, if any. */
  tool_calls: { id: string; name: string; args: Record<string, unknown> }[] | null;
  /** Result payload carried by a tool row, if any. */
  tool_results: {
    tool_call_id: string;
    output: unknown;
    truncated: boolean;
    error?: string;
  } | null;
  /** Free-form metadata; `kind: 'cap_hit'` identifies the cap-hit sentinel. */
  metadata: { kind?: string } | null;
  /** Creation timestamp as returned by the database driver. */
  created_at: string;
}
|
||||
|
||||
// === overflow ===
|
||||
|
||||
/**
 * Portion of the context window that conversation history may occupy.
 *
 * COMPACTION_BUFFER tokens are held in reserve for the model's response so a
 * near-full context can still produce a useful turn (mirrors opencode's
 * COMPACTION_BUFFER).
 *
 * @param contextLimit - Model context window in tokens. Zero, negative, or
 *   NaN means "unknown".
 * @returns `contextLimit - COMPACTION_BUFFER`, never below 0. Returns 0 when
 *   the limit is unknown; callers treat 0 as "do not trigger overflow", which
 *   also avoids dividing by zero downstream.
 */
export function usable(contextLimit: number): number {
  const limitKnown = contextLimit > 0;
  if (!limitKnown) return 0;

  const headroom = contextLimit - COMPACTION_BUFFER;
  return headroom > 0 ? headroom : 0;
}
|
||||
|
||||
/** Token counts of one completion, as consumed by isOverflow(). */
export interface Usage {
  /** Tokens in the prompt that was sent. */
  prompt_tokens: number;
  /** Tokens the model generated in response. */
  completion_tokens: number;
}
|
||||
|
||||
/**
 * Reports whether the last completion consumed at least the usable budget.
 *
 * With an unknown context limit the answer is always `false`: compaction is
 * never auto-triggered without a budget, because keeping inference flowing is
 * preferable to starting a compaction that cannot be sized properly.
 *
 * @param usage - Prompt and completion token counts of the finished turn.
 * @param contextLimit - Model context window in tokens (0 = unknown).
 * @returns `true` when prompt + completion tokens >= usable(contextLimit).
 */
export function isOverflow(usage: Usage, contextLimit: number): boolean {
  const budget = usable(contextLimit);
  // Short-circuit keeps `usage` untouched when there is no budget to compare to.
  return budget > 0 && usage.prompt_tokens + usage.completion_tokens >= budget;
}
|
||||
|
||||
// === selection ===
|
||||
|
||||
/** A user-turn: one non-summary user message plus everything up to the next one. */
interface Turn {
  /** Index of the user message that opens the turn. */
  start: number;
  /** Exclusive end index: the next turn's start, or messages.length for the last turn. */
  end: number;
  /** Id of the user message at `start`. */
  id: string;
}
|
||||
|
||||
/**
 * Rough token estimate: serialized character count divided by four, rounded
 * up. Matches opencode's Token.estimate, which also goes through
 * JSON.stringify. This is adequate for tail-fitting math; no real tokenizer
 * is needed because the 20k buffer absorbs the slop.
 *
 * @param messages - Messages to size.
 * @returns Estimated token count of the JSON-serialized messages.
 */
export function estimate(messages: CompactionMessage[]): number {
  const charsPerToken = 4;
  const serialized = JSON.stringify(messages);
  return Math.ceil(serialized.length / charsPerToken);
}
|
||||
|
||||
/**
 * Splits a message list into user-turns.
 *
 * Every user message that is not a summary row opens a turn. A turn ends
 * where the next turn starts; the final turn ends at `messages.length`.
 * Messages that precede the first qualifying user message belong to no turn.
 *
 * @param messages - Chat messages in chronological order.
 * @returns One Turn per qualifying user message, in order.
 */
export function turns(messages: CompactionMessage[]): Turn[] {
  const found: Turn[] = [];
  messages.forEach((message, index) => {
    const opensTurn = message.role === 'user' && !message.summary;
    if (!opensTurn) return;

    // Close the previously opened turn at the point where this one begins.
    const previous = found[found.length - 1];
    if (previous) previous.end = index;

    found.push({ start: index, end: messages.length, id: message.id });
  });
  return found;
}
|
||||
|
||||
/**
 * Finds a mid-turn cut for a turn that does not fit the budget whole.
 *
 * Candidate cut points are tried from `turn.start + 1` forward, so the first
 * one whose suffix fits is also the largest suffix that fits.
 *
 * NOTE(review): the cut can land on any message, including a `tool` row whose
 * originating assistant tool call then ends up in the head — confirm the
 * inference payload builder tolerates a tail that starts that way.
 *
 * @param messages - Full message list the turn indexes into.
 * @param turn - The turn to split.
 * @param budget - Remaining tail budget in estimated tokens.
 * @returns Index and id of the first preserved message, or `undefined` when
 *   the budget is exhausted, the turn has a single message, or no suffix fits.
 */
function splitTurn(
  messages: CompactionMessage[],
  turn: Turn,
  budget: number,
): { start: number; id: string } | undefined {
  const turnLength = turn.end - turn.start;
  if (budget <= 0 || turnLength <= 1) return undefined;

  let candidate = turn.start + 1;
  while (candidate < turn.end) {
    const suffixSize = estimate(messages.slice(candidate, turn.end));
    if (!(suffixSize > budget)) {
      return { start: candidate, id: messages[candidate]!.id };
    }
    candidate += 1;
  }
  return undefined;
}
|
||||
|
||||
/** Outcome of select(): what to summarize and where the preserved tail begins. */
export interface SelectResult {
  /** Messages to be summarized. Equals the full input when nothing is compacted. */
  head: CompactionMessage[];
  /** Id of the first preserved tail message; `undefined` means full preserve. */
  tail_start_id: string | undefined;
}
|
||||
|
||||
/**
 * Picks the boundary between the "head" (to be summarized) and the "tail"
 * (kept verbatim).
 *
 * Strategy:
 *   1. Compute a tail budget: 25% of usable(contextLimit), clamped to
 *      [MIN_PRESERVE_RECENT_TOKENS, MAX_PRESERVE_RECENT_TOKENS].
 *   2. Consider only the last `tailTurns` user-turns and fit them greedily,
 *      newest first.
 *   3. The first turn that does not fit whole is split mid-turn if possible;
 *      no older turn is considered after that.
 *   4. If nothing could be kept, or the kept range starts at index 0
 *      (everything fit), the result is a full preserve.
 *
 * @param messages - Live, non-summary chat messages in chronological order.
 * @param contextLimit - Model context window in tokens (0 = unknown).
 * @param tailTurns - How many trailing user-turns to try to keep.
 * @returns The head to summarize and the id of the first tail message. A
 *   full preserve is signalled by `head === messages` and
 *   `tail_start_id === undefined`.
 */
export function select(
  messages: CompactionMessage[],
  contextLimit: number,
  tailTurns: number = DEFAULT_TAIL_TURNS,
): SelectResult {
  const preserveAll: SelectResult = { head: messages, tail_start_id: undefined };
  if (tailTurns <= 0) return preserveAll;

  const userTurns = turns(messages);
  if (userTurns.length === 0) return preserveAll;

  const target = Math.floor(usable(contextLimit) * 0.25);
  const budget = Math.min(
    MAX_PRESERVE_RECENT_TOKENS,
    Math.max(MIN_PRESERVE_RECENT_TOKENS, target),
  );

  let spent = 0;
  let boundary: { start: number; id: string } | undefined;
  const newestFirst = userTurns.slice(-tailTurns).reverse();
  for (const turn of newestFirst) {
    const cost = estimate(messages.slice(turn.start, turn.end));
    if (spent + cost > budget) {
      // Turn does not fit whole: keep as much of its end as the budget allows,
      // otherwise fall back to whatever newer turns were already kept.
      boundary = splitTurn(messages, turn, budget - spent) ?? boundary;
      break;
    }
    spent += cost;
    boundary = { start: turn.start, id: turn.id };
  }

  if (boundary === undefined || boundary.start === 0) return preserveAll;

  return {
    head: messages.slice(0, boundary.start),
    tail_start_id: boundary.id,
  };
}
|
||||
|
||||
// === prompt assembly ===
|
||||
|
||||
/**
 * Builds the final user message that asks the model to create or update the
 * anchored summary.
 *
 * @param previousSummary - Text of the prior anchored summary. When it is
 *   missing or empty the model is asked to create a fresh summary; otherwise
 *   it is asked to update the previous one, which is embedded in
 *   `<previous-summary>` tags.
 * @param context - Extra sections appended after the template. Reserved for
 *   future plugin injection; callers pass `[]` today.
 * @returns Anchor instruction, SUMMARY_TEMPLATE and context sections joined
 *   by blank lines.
 */
export function buildPrompt(
  previousSummary: string | undefined,
  context: string[],
): string {
  let anchor = 'Create a new anchored summary from the conversation history above.';
  if (previousSummary) {
    anchor =
      'Update the anchored summary below using the conversation history above.\n' +
      'Preserve still-true details, remove stale details, and merge in the new facts.\n' +
      '<previous-summary>\n' +
      `${previousSummary}\n` +
      '</previous-summary>';
  }

  const sections = [anchor, SUMMARY_TEMPLATE].concat(context);
  return sections.join('\n\n');
}
|
||||
|
||||
// === OpenAI conversion (compaction-local; intentionally does NOT call
|
||||
// inference.ts buildMessagesPayload because that uses the legacy "find latest
|
||||
// kind='compact' marker and skip everything before it" shortcircuit, which
|
||||
// would silently drop pre-legacy-compact history before the LLM sees it.
|
||||
// Compaction wants to send the entire head, full stop.) ===
|
||||
|
||||
/** One entry of the `messages` array sent to the chat-completions endpoint. */
interface OpenAiMessage {
  /** Speaker of the message. */
  role: 'system' | 'user' | 'assistant' | 'tool';
  /** Message text; `null` for an assistant message without text. */
  content: string | null;
  /** Function calls requested by an assistant message. */
  tool_calls?: {
    id: string;
    type: 'function';
    /** `arguments` is the JSON-encoded argument object. */
    function: { name: string; arguments: string };
  }[];
  /** On tool messages: id of the tool call this result answers. */
  tool_call_id?: string;
}
|
||||
|
||||
/** True for the system-role sentinel row whose metadata kind is 'cap_hit'. */
function isCapHitSentinel(m: CompactionMessage): boolean {
  return m.role === 'system' && m.metadata?.kind === 'cap_hit';
}

/**
 * Renders a tool result as the text content of a `tool` message.
 *
 * An error wins over the output. String outputs pass through unchanged and
 * anything else is JSON-encoded. JSON.stringify returns `undefined` for
 * values it cannot represent (e.g. an absent output), so that case falls back
 * to an empty string and the message always carries a string `content`.
 */
function renderToolOutput(tr: NonNullable<CompactionMessage['tool_results']>): string {
  if (tr.error) return `error: ${tr.error}`;
  if (typeof tr.output === 'string') return tr.output;
  const encoded: string | undefined = JSON.stringify(tr.output);
  return encoded ?? '';
}

/**
 * Converts the head (the messages about to be summarized) into
 * chat-completions messages.
 *
 * Conversion rules, applied in this order per row:
 *   - cap-hit sentinel rows are dropped;
 *   - assistant rows still streaming or cancelled are dropped;
 *   - legacy `kind='compact'` rows are passed through as system context so
 *     the model can carry their still-true parts into the new summary;
 *   - summary rows are rendered as assistant text (defense in depth:
 *     process() already filters them out of the select input);
 *   - tool rows become `tool` messages; rows without tool_results are dropped;
 *   - assistant rows keep their text and tool calls; a row with neither is
 *     dropped, because an assistant message with null content and no tool
 *     calls is not a valid chat-completions message;
 *   - every remaining row is sent with role 'user'.
 *
 * NOTE(review): the last rule also applies to system rows that are neither
 * cap-hit sentinels nor legacy compact rows — confirm that sending them as
 * 'user' is intended.
 *
 * @param head - Messages selected for summarization, in chronological order.
 * @returns The messages to place before the final summary-request message.
 */
function buildHeadPayload(head: CompactionMessage[]): OpenAiMessage[] {
  const out: OpenAiMessage[] = [];
  for (const m of head) {
    if (isCapHitSentinel(m)) continue;
    if (m.role === 'assistant' && (m.status === 'streaming' || m.status === 'cancelled')) continue;

    if (m.kind === 'compact') {
      out.push({ role: 'system', content: m.content });
      continue;
    }

    if (m.summary) {
      out.push({ role: 'assistant', content: m.content });
      continue;
    }

    if (m.role === 'tool') {
      const tr = m.tool_results;
      if (!tr) continue;
      out.push({ role: 'tool', content: renderToolOutput(tr), tool_call_id: tr.tool_call_id });
      continue;
    }

    if (m.role === 'assistant') {
      const text = m.content && m.content.length > 0 ? m.content : null;
      const calls = m.tool_calls && m.tool_calls.length > 0 ? m.tool_calls : null;
      // Nothing to say and nothing to call: skip rather than emit an invalid message.
      if (text === null && calls === null) continue;

      const msg: OpenAiMessage = { role: 'assistant', content: text };
      if (calls) {
        msg.tool_calls = calls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
        }));
      }
      out.push(msg);
      continue;
    }

    out.push({ role: 'user', content: m.content });
  }
  return out;
}
|
||||
|
||||
// === llama-swap call ===
|
||||
|
||||
// Non-streaming completion. Opencode streams; for a one-shot summary call a
|
||||
// single POST is less code and the latency hit is acceptable (the user
|
||||
// doesn't see this directly — useSessionStream emits the toast + refetches
|
||||
// on the 'compacted' frame).
|
||||
/** What callLlamaSwap() extracts from a chat-completions response. */
interface CompletionResult {
  /** Text of the first choice's message. */
  content: string;
  /** `usage.prompt_tokens` from the response, or 0 when absent. */
  promptTokens: number;
  /** `usage.completion_tokens` from the response, or 0 when absent. */
  completionTokens: number;
}
|
||||
|
||||
/**
 * Sends one non-streaming chat-completions request to llama-swap and returns
 * the summary text plus token usage.
 *
 * @param config - Server config; only `LLAMA_SWAP_URL` is read.
 * @param model - Model name to put in the request body.
 * @param messages - Complete message list (head payload + final prompt).
 * @param log - Logger; receives a debug entry with token counts and length.
 * @returns The first choice's content and the reported token usage (0 when
 *   the response carries no usage).
 * @throws Error when the HTTP status is not OK (message includes the status
 *   and the first 200 characters of the body) or when the response contains
 *   no non-blank content. The second case matters because the caller stamps
 *   history as compacted once it has a summary: an empty summary must never
 *   replace real messages.
 */
async function callLlamaSwap(
  config: Config,
  model: string,
  messages: OpenAiMessage[],
  log: FastifyBaseLogger,
): Promise<CompletionResult> {
  const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model, messages, stream: false }),
  });
  if (!res.ok) {
    // Best effort: a body that cannot be read must not mask the status code.
    const text = await res.text().catch(() => '');
    throw new Error(`llama-swap returned ${res.status}: ${text.slice(0, 200)}`);
  }

  const json = (await res.json()) as {
    choices?: Array<{ message?: { content?: string | null } }>;
    usage?: { prompt_tokens?: number; completion_tokens?: number };
  };
  // v1.11.3: no `json.timings?.n_ctx` read here — llama-server's completions
  // don't emit n_ctx in timings. ctx_max on the summary row comes from
  // model-context.getModelContext in process().
  const content = json.choices?.[0]?.message?.content ?? '';
  if (content.trim().length === 0) {
    throw new Error('llama-swap returned an empty summary');
  }
  const promptTokens = json.usage?.prompt_tokens ?? 0;
  const completionTokens = json.usage?.completion_tokens ?? 0;
  log.debug({ promptTokens, completionTokens, chars: content.length }, 'compaction llm complete');
  return { content, promptTokens, completionTokens };
}
|
||||
|
||||
// === entry point ===
|
||||
|
||||
/** Everything process() needs to run one compaction round. */
export interface ProcessInput {
  /** Database handle used for every query and update. */
  sql: Sql;
  /** Server config; supplies the llama-swap URL. */
  config: Config;
  /** Logger for progress and diagnostics. */
  log: FastifyBaseLogger;
  /** Broker used to publish chat_status and 'compacted' frames. */
  broker: Broker;
  /** Chat to compact. */
  chatId: string;
}
|
||||
|
||||
/**
 * Runs one round of anchored rolling compaction on `chatId`.
 *
 * Flow: load the chat's live messages, split them into head and tail with
 * select(), ask the model for an updated anchored summary of the head, insert
 * that summary as a new `summary=true` assistant row, stamp the head and every
 * older live summary row with `compacted_at`, clear `needs_compaction`, and
 * publish a 'compacted' frame.
 *
 * No-ops cleanly (clearing needs_compaction) when there is nothing reasonable
 * to compact. Returns without touching anything when the chat or its session
 * cannot be found.
 *
 * NOTE(review): steps 8–10 are separate statements rather than one
 * transaction. A failure between them leaves the new summary inserted while
 * the head is still live — consider wrapping them if `Sql` offers
 * transactions.
 *
 * @param input - Database, config, logger, broker and the chat id.
 * @throws Error on LLM failure (including an empty summary) or when a
 *   database write fails; callers decide whether to log+swallow or surface.
 */
export async function process(input: ProcessInput): Promise<void> {
  const { sql, config, log, broker, chatId } = input;

  // 1. Resolve chat → session for model + WS publish channel.
  const chatRows = await sql<{ id: string; session_id: string }[]>`
    SELECT id, session_id FROM chats WHERE id = ${chatId}
  `;
  if (chatRows.length === 0) {
    log.warn({ chatId }, 'compaction: chat not found');
    return;
  }
  const chat = chatRows[0]!;
  const sessionId = chat.session_id;

  const sessRows = await sql<{ id: string; model: string }[]>`
    SELECT id, model FROM sessions WHERE id = ${sessionId}
  `;
  if (sessRows.length === 0) {
    log.warn({ chatId, sessionId }, 'compaction: session not found');
    return;
  }
  const session = sessRows[0]!;

  // 2. All currently-active messages in this chat (compacted_at IS NULL).
  // ORDER BY (created_at, id) matches loadContext in inference.ts so the
  // turns() boundary logic sees the same sequence the LLM will.
  const messages = await sql<CompactionMessage[]>`
    SELECT id, role, content, kind, summary, status, tool_calls, tool_results, metadata, created_at
    FROM messages
    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;
  if (messages.length === 0) {
    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
    return;
  }

  // 3. Find the prior anchored summary (newest summary=true row). Its content
  // becomes previousSummary — the anchor in the prompt. Filter summaries out
  // of the select-input so they are not double-encoded (the text is already
  // in the anchor).
  const previousSummary = messages.filter((m) => m.summary).at(-1)?.content;
  const forSelect = messages.filter((m) => !m.summary);

  // 4. Resolve a recent context limit from the most recent non-null
  // messages.ctx_max in this chat (assumes the same model is still running).
  // When no row carries one, contextLimit is 0 and select() falls back to the
  // minimum tail budget (see usable()).
  const ctxRows = await sql<{ ctx_max: number | null }[]>`
    SELECT ctx_max FROM messages
    WHERE chat_id = ${chatId} AND ctx_max IS NOT NULL
    ORDER BY created_at DESC LIMIT 1
  `;
  const contextLimit = ctxRows[0]?.ctx_max ?? 0;

  // 5. Decide head / tail.
  const sel = select(forSelect, contextLimit);
  if (!sel.tail_start_id || sel.head.length === 0) {
    // Full preserve — nothing to compact this round. Clear the flag so we
    // don't loop. (Could happen when the chat is short or the budget swung
    // wider after a model context bump.)
    await sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
    log.info({ chatId, contextLimit, msgCount: messages.length }, 'compaction: nothing to compact');
    return;
  }

  // 6. Build the OpenAI request: head as user/assistant/tool turns + a final
  // user message carrying buildPrompt(previousSummary, []). No system prompt
  // — matches opencode (`system: []`); the template + anchor are sufficient.
  const headPayload = buildHeadPayload(sel.head);
  const finalUser: OpenAiMessage = { role: 'user', content: buildPrompt(previousSummary, []) };
  const payload = [...headPayload, finalUser];

  log.info(
    {
      chatId,
      contextLimit,
      headLen: sel.head.length,
      tailStartId: sel.tail_start_id,
      hadPrevSummary: previousSummary !== undefined,
    },
    'compaction: invoking model',
  );

  // 6a. Flip the chat dot amber for the duration of the LLM call + DB writes.
  // Same { type: 'chat_status', status: 'working', at } shape inference.ts
  // emits at runner enqueue. publishUser broadcasts on the per-user channel
  // (all devices / tabs see it) since chat_status is a user-channel frame in
  // BooCode (see useChatStatus.ts, which is the consumer).
  broker.publishUser('default', {
    type: 'chat_status',
    chat_id: chatId,
    status: 'working',
    at: new Date().toISOString(),
  });

  // try/finally so the dot ALWAYS drops back to idle, even if the LLM call
  // throws or a downstream DB write fails. The succeeded flag gates the
  // 'compacted' frame + final log: completion is only signalled to the UI
  // when the new summary row actually landed.
  let succeeded = false;
  let newId = '';
  let result: CompletionResult | undefined;
  try {
    // 7. Single non-streaming completion (no tools). Throws on llama-swap failure.
    result = await callLlamaSwap(config, session.model, payload, log);

    // 7a. Never replace real history with a blank summary: everything below
    // is destructive from the model's point of view.
    if (result.content.trim().length === 0) {
      throw new Error('compaction: model returned an empty summary');
    }

    // 7b. v1.11.3: fetch the model's context window from llama-swap's
    // /upstream/<model>/props (the completion response doesn't carry it).
    // Same pattern as inference.ts; the cache makes repeated calls cheap.
    const mctx = await modelContextLookup.getModelContext(session.model);
    const nCtx = mctx?.n_ctx ?? null;

    // 8. Insert the new anchored summary row. role='assistant' per spec; the
    // UI distinguishes via summary=true. tail_start_id points at the first
    // preserved tail message so debug surfaces / future tools can reason
    // about the boundary without re-deriving from compacted_at.
    const insertRows = await sql<{ id: string }[]>`
      INSERT INTO messages (
        session_id, chat_id, role, content, kind, status,
        summary, tail_start_id,
        tokens_used, ctx_used, ctx_max,
        created_at, finished_at
      )
      VALUES (
        ${sessionId}, ${chatId}, 'assistant', ${result.content}, 'message', 'complete',
        true, ${sel.tail_start_id},
        ${result.completionTokens}, ${result.promptTokens}, ${nCtx},
        clock_timestamp(), clock_timestamp()
      )
      RETURNING id
    `;
    const inserted = insertRows[0];
    if (!inserted) {
      throw new Error('compaction: summary insert returned no row');
    }
    newId = inserted.id;

    // 9. Mark every prior live message (head + prior summaries) as compacted.
    //   - Head rows are the live rows created strictly before the tail start,
    //     so the preserved tail stays compacted_at=NULL.
    //   - Prior summary rows are matched by `summary IS TRUE` rather than by
    //     timestamp: a summary is inserted "now" during its own round, which
    //     is usually later than the messages that form the next round's tail,
    //     so a created_at comparison alone would leave it live and break the
    //     "exactly one live summary row per chat" invariant.
    //   - The row inserted in step 8 is excluded explicitly.
    await sql`
      UPDATE messages
      SET compacted_at = clock_timestamp()
      WHERE chat_id = ${chatId}
        AND compacted_at IS NULL
        AND id != ${newId}
        AND (
          summary IS TRUE
          OR created_at < (SELECT created_at FROM messages WHERE id = ${sel.tail_start_id})
        )
    `;

    // 10. Clear the flag and bump the chat's updated_at so the sidebar
    // reflects recent activity.
    await sql`
      UPDATE chats
      SET needs_compaction = false, updated_at = clock_timestamp()
      WHERE id = ${chatId}
    `;

    succeeded = true;
  } finally {
    // Always restore the dot. Status='idle' (not 'error') even on failure —
    // the caller logs/re-surfaces the error separately; the dot doesn't
    // need to stay red across reloads for a transient compaction blip.
    broker.publishUser('default', {
      type: 'chat_status',
      chat_id: chatId,
      status: 'idle',
      at: new Date().toISOString(),
    });
  }

  // 11. Tell the client. useSessionStream subscribes to the per-session WS
  // channel; the handler refetches messages (so the new summary row + the
  // compacted_at-stamped older rows render correctly) and fires a sonner
  // toast. Order matters: idle must precede 'compacted' so the dot is
  // already green by the time the refetch toast appears.
  if (succeeded) {
    broker.publish(sessionId, {
      type: 'compacted',
      session_id: sessionId,
      chat_id: chatId,
      summary_message_id: newId,
    });
    log.info(
      {
        chatId,
        newId,
        completionTokens: result?.completionTokens,
        promptTokens: result?.promptTokens,
      },
      'compaction: complete',
    );
  }
}
|
||||
@@ -21,6 +21,9 @@ import {
|
||||
import { PathScopeError, resolveProjectRoot } from './path_guard.js';
|
||||
import { maybeAutoNameChat } from './auto_name.js';
|
||||
import { getAgentById } from './agents.js';
|
||||
import * as compaction from './compaction.js';
|
||||
import * as modelContext from './model-context.js';
|
||||
import type { Broker } from './broker.js';
|
||||
|
||||
/**
 * Builds the base system prompt for a chat.
 *
 * @param projectPath - Filesystem location of the user's project, embedded
 *   verbatim in the prompt.
 * @returns The prompt as one line of space-separated sentences.
 */
const BASE_SYSTEM_PROMPT = (projectPath: string) => {
  const sentences = [
    'You are BooCode Chat, a code investigation assistant.',
    `The user is working on a project located at ${projectPath}.`,
    'Use the file-read tools (view_file, list_dir, grep, find_files) to investigate code when needed.',
    'Be concise.',
    'Cite file paths and line numbers when discussing code.',
    'Do not hallucinate file contents — read the file first.',
    'Tool results may be truncated; if so, narrow your query rather than guessing.',
  ];
  return sentences.join(' ');
};
|
||||
@@ -136,9 +139,6 @@ interface ChatCompletionChunk {
|
||||
completion_tokens?: number;
|
||||
total_tokens?: number;
|
||||
};
|
||||
timings?: {
|
||||
n_ctx?: number;
|
||||
};
|
||||
}
|
||||
|
||||
export interface InferenceContext {
|
||||
@@ -147,6 +147,12 @@ export interface InferenceContext {
|
||||
log: FastifyBaseLogger;
|
||||
publish: FramePublisher;
|
||||
publishUser: (frame: UserStreamFrame) => void;
|
||||
// v1.11: passed through so compaction.process can publish 'compacted'
|
||||
// frames on the same session WS channel useSessionStream subscribes to.
|
||||
// Compaction is the only path that needs the raw broker handle (regular
|
||||
// inference goes through `publish`); keeping a separate field avoids
|
||||
// tempting other code paths into bypassing the session-id binding.
|
||||
broker: Broker;
|
||||
}
|
||||
|
||||
// Resolution order: base prompt < agent.system_prompt < user prompt, where
|
||||
@@ -260,17 +266,48 @@ async function loadContext(
|
||||
if (projectRows.length === 0) return null;
|
||||
const project = projectRows[0]!;
|
||||
|
||||
// v1.11: filter compacted messages out of the inference assembly. The GET
|
||||
// /api/sessions/:id/messages endpoint still returns everything (so the UI
|
||||
// can show history with the summary card inline); only LLM payloads skip
|
||||
// compacted rows. compacted_at IS NULL keeps the active summary + tail.
|
||||
const history = await sql<Message[]>`
|
||||
SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
|
||||
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
|
||||
FROM messages
|
||||
WHERE chat_id = ${chatId}
|
||||
WHERE chat_id = ${chatId} AND compacted_at IS NULL
|
||||
ORDER BY created_at ASC, id ASC
|
||||
`;
|
||||
|
||||
return { session, project, history };
|
||||
}
|
||||
|
||||
/**
 * v1.11: shared helper run after both finalizeCompletion and executeToolPhase
 * persist their token counts.
 *
 * Takes the token columns of the just-updated message row (the caller passes
 * what its UPDATE ... RETURNING produced), asks compaction.isOverflow whether
 * the turn reached the usable budget, and if so sets chats.needs_compaction.
 * The next runAssistantTurn invocation acts on the flag.
 *
 * Silent when the row or any of its counts is missing: llama-swap
 * occasionally omits usage on truncated streams, and missing one overflow is
 * preferable to crashing the inference path.
 *
 * @param ctx - Inference context; supplies `sql` and `log`.
 * @param chatId - Chat whose flag may be raised.
 * @param updated - tokens_used / ctx_used / ctx_max of the updated row, or
 *   `undefined` when the UPDATE returned nothing.
 */
async function maybeFlagForCompaction(
  ctx: InferenceContext,
  chatId: string,
  updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined,
): Promise<void> {
  if (!updated) return;

  const {
    ctx_used: promptTokens,
    tokens_used: completionTokens,
    ctx_max: contextLimit,
  } = updated;
  if (
    typeof promptTokens !== 'number' ||
    typeof completionTokens !== 'number' ||
    typeof contextLimit !== 'number'
  ) {
    return;
  }

  const usage = { prompt_tokens: promptTokens, completion_tokens: completionTokens };
  if (!compaction.isOverflow(usage, contextLimit)) return;

  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
  ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction');
}
|
||||
|
||||
async function* sseLines(stream: ReadableStream<Uint8Array>): AsyncGenerator<string> {
|
||||
const reader = stream.getReader();
|
||||
const decoder = new TextDecoder('utf-8');
|
||||
@@ -300,7 +337,6 @@ interface StreamResult {
|
||||
toolCalls: ToolCall[];
|
||||
promptTokens: number | null;
|
||||
completionTokens: number | null;
|
||||
nCtx: number | null;
|
||||
}
|
||||
|
||||
interface StreamOptions {
|
||||
@@ -415,7 +451,6 @@ async function streamCompletion(
|
||||
let finishReason: string | null = null;
|
||||
let promptTokens: number | null = null;
|
||||
let completionTokens: number | null = null;
|
||||
let nCtx: number | null = null;
|
||||
const toolCallsBuffer = new Map<number, { id: string; name: string; argsText: string }>();
|
||||
|
||||
for await (const line of sseLines(res.body)) {
|
||||
@@ -437,9 +472,11 @@ async function streamCompletion(
|
||||
completionTokens = parsed.usage.completion_tokens;
|
||||
}
|
||||
}
|
||||
if (parsed.timings && typeof parsed.timings.n_ctx === 'number') {
|
||||
nCtx = parsed.timings.n_ctx;
|
||||
}
|
||||
// v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
|
||||
// streaming completion does NOT emit n_ctx in timings (verified
|
||||
// empirically); the authoritative source is llama-swap's
|
||||
// /upstream/<model>/props endpoint, fetched per-turn via
|
||||
// model-context.getModelContext() at the finalization sites below.
|
||||
|
||||
const choice = parsed.choices?.[0];
|
||||
if (!choice) continue;
|
||||
@@ -525,7 +562,7 @@ async function streamCompletion(
|
||||
toolCalls.push({ id: t.id || `call_${toolCalls.length}`, name: t.name, args });
|
||||
}
|
||||
|
||||
return { finishReason, content, toolCalls, promptTokens, completionTokens, nCtx };
|
||||
return { finishReason, content, toolCalls, promptTokens, completionTokens };
|
||||
}
|
||||
|
||||
async function executeToolCall(
|
||||
@@ -742,7 +779,14 @@ async function executeToolPhase(
|
||||
projectRoot: string
|
||||
): Promise<void> {
|
||||
const { sessionId, chatId, assistantMessageId, toolsUsed, signal } = args;
|
||||
const { content, toolCalls, promptTokens, completionTokens, nCtx } = result;
|
||||
const { content, toolCalls, promptTokens, completionTokens } = result;
|
||||
|
||||
// v1.11.3: ctx_max comes from llama-swap /upstream/<model>/props, not the
|
||||
// streaming completion (which doesn't emit n_ctx). getModelContext caches
|
||||
// the positive lookup for the process lifetime, so this is a single Map
|
||||
// hit after the first invocation per model.
|
||||
const mctx = await modelContext.getModelContext(session.model);
|
||||
const nCtx = mctx?.n_ctx ?? null;
|
||||
|
||||
const [updated] = await ctx.sql<
|
||||
{ tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
|
||||
@@ -758,6 +802,10 @@ async function executeToolPhase(
|
||||
WHERE id = ${assistantMessageId}
|
||||
RETURNING tokens_used, ctx_used, ctx_max, finished_at
|
||||
`;
|
||||
// v1.11: flag for compaction if this turn pushed us over the usable budget.
|
||||
// We never compact mid-loop (the recursive runAssistantTurn keeps tools
|
||||
// flowing); the flag fires on the NEXT turn's pre-fetch hook above.
|
||||
await maybeFlagForCompaction(ctx, chatId, updated);
|
||||
const [toolSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
|
||||
UPDATE sessions SET updated_at = clock_timestamp()
|
||||
WHERE id = ${sessionId}
|
||||
@@ -874,7 +922,11 @@ async function finalizeCompletion(
|
||||
session: Session
|
||||
): Promise<void> {
|
||||
const { sessionId, chatId, assistantMessageId } = args;
|
||||
const { content, finishReason, promptTokens, completionTokens, nCtx } = result;
|
||||
const { content, finishReason, promptTokens, completionTokens } = result;
|
||||
|
||||
// v1.11.3: see executeToolPhase for the rationale.
|
||||
const mctx = await modelContext.getModelContext(session.model);
|
||||
const nCtx = mctx?.n_ctx ?? null;
|
||||
|
||||
const [updated] = await ctx.sql<
|
||||
{ tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
|
||||
@@ -889,6 +941,9 @@ async function finalizeCompletion(
|
||||
WHERE id = ${assistantMessageId}
|
||||
RETURNING tokens_used, ctx_used, ctx_max, finished_at
|
||||
`;
|
||||
// v1.11: flag for compaction on the terminal turn too. Catches the common
|
||||
// case of a turn that hit the limit without invoking tools.
|
||||
await maybeFlagForCompaction(ctx, chatId, updated);
|
||||
const [completeSessRow] = await ctx.sql<{ project_id: string; name: string; updated_at: string }[]>`
|
||||
UPDATE sessions SET updated_at = clock_timestamp()
|
||||
WHERE id = ${sessionId}
|
||||
@@ -927,6 +982,29 @@ async function runAssistantTurn(
|
||||
): Promise<void> {
|
||||
const { sessionId, chatId } = args;
|
||||
|
||||
// v1.11: if the prior turn flagged this chat for compaction, run it first
|
||||
// so loadContext below reads the post-compaction history. We swallow
|
||||
// compaction failures (clearing the flag so we don't loop) and proceed
|
||||
// with the un-compacted history — a slow turn that hits the model's
|
||||
// hard limit is recoverable; a dead session is not.
|
||||
const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
|
||||
SELECT needs_compaction FROM chats WHERE id = ${chatId}
|
||||
`;
|
||||
if (chatFlag[0]?.needs_compaction) {
|
||||
try {
|
||||
await compaction.process({
|
||||
sql: ctx.sql,
|
||||
config: ctx.config,
|
||||
log: ctx.log,
|
||||
broker: ctx.broker,
|
||||
chatId,
|
||||
});
|
||||
} catch (err) {
|
||||
ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
|
||||
await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
|
||||
}
|
||||
}
|
||||
|
||||
const loaded = await loadContext(ctx.sql, sessionId, chatId);
|
||||
if (!loaded) {
|
||||
ctx.log.warn({ sessionId }, 'inference: session or project missing');
|
||||
@@ -1081,6 +1159,9 @@ async function runCapHitSummary(
|
||||
// even on a partial / failed summary the chat history shows where the
|
||||
// budget was hit.
|
||||
if (summaryOk && result) {
|
||||
// v1.11.3: see executeToolPhase for the rationale.
|
||||
const mctx = await modelContext.getModelContext(session.model);
|
||||
const nCtx = mctx?.n_ctx ?? null;
|
||||
const [updated] = await ctx.sql<
|
||||
{ tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
|
||||
>`
|
||||
@@ -1089,7 +1170,7 @@ async function runCapHitSummary(
|
||||
status = 'complete',
|
||||
tokens_used = ${result.completionTokens},
|
||||
ctx_used = ${result.promptTokens},
|
||||
ctx_max = ${result.nCtx},
|
||||
ctx_max = ${nCtx},
|
||||
finished_at = clock_timestamp()
|
||||
WHERE id = ${assistantMessageId}
|
||||
RETURNING tokens_used, ctx_used, ctx_max, finished_at
|
||||
@@ -1237,81 +1318,6 @@ async function insertCapHitSentinel(
|
||||
});
|
||||
}
|
||||
|
||||
const COMPACT_SYSTEM_PROMPT =
|
||||
'Summarize the preceding conversation into a dense but complete context paragraph. Preserve all key facts, decisions, file paths, code patterns, and action items. Do not add any new information. Output only the summary paragraph.';
|
||||
|
||||
/**
 * Streams a model-written summary of a chat into the placeholder message row
 * `compactMessageId`.
 *
 * Flow: load the chat context, build the prompt from every history row except
 * the placeholder itself, append the compaction instruction as a trailing
 * system message, then stream the completion while publishing frames on the
 * session channel. On success the row is stored as 'complete' with a
 * "[Context compacted — N messages summarized]" header; on failure the row is
 * stored as 'failed' with whatever text had streamed so far and an 'error'
 * frame is published.
 *
 * Returns without doing anything when loadContext yields nothing.
 */
async function runCompact(
  ctx: InferenceContext,
  sessionId: string,
  chatId: string,
  compactMessageId: string
): Promise<void> {
  const context = await loadContext(ctx.sql, sessionId, chatId);
  if (!context) return;
  const { session, project, history } = context;

  // The placeholder row must not be fed back to the model as history.
  const priorMessages = history.filter((m) => m.id !== compactMessageId);
  const prompt = buildMessagesPayload(session, project, priorMessages);
  prompt.push({ role: 'system', content: COMPACT_SYSTEM_PROMPT });

  ctx.publish(sessionId, {
    type: 'message_started',
    message_id: compactMessageId,
    chat_id: chatId,
    role: 'assistant',
  });

  // Accumulates streamed deltas so a mid-stream failure can persist the
  // partial text; replaced by the final content once the stream resolves.
  let text = '';
  const forwardDelta = (delta: string): void => {
    text += delta;
    ctx.publish(sessionId, {
      type: 'delta',
      message_id: compactMessageId,
      chat_id: chatId,
      content: delta,
    });
  };

  try {
    const completion = await streamCompletion(
      ctx,
      session.model,
      prompt,
      { tools: null },
      forwardDelta
    );
    text = completion.content;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    await ctx.sql`
      UPDATE messages SET status = 'failed', content = ${text}, finished_at = clock_timestamp()
      WHERE id = ${compactMessageId}
    `;
    ctx.publish(sessionId, {
      type: 'error',
      message_id: compactMessageId,
      chat_id: chatId,
      error: reason,
    });
    return;
  }

  // Header count excludes the placeholder and any rows whose kind is 'compact'.
  let summarizedCount = 0;
  for (const m of history) {
    if (m.id !== compactMessageId && m.kind !== 'compact') summarizedCount += 1;
  }
  const stored = `[Context compacted — ${summarizedCount} messages summarized]\n\n${text}`;

  await ctx.sql`
    UPDATE messages SET content = ${stored}, status = 'complete', finished_at = clock_timestamp()
    WHERE id = ${compactMessageId}
  `;
  ctx.publish(sessionId, {
    type: 'message_complete',
    message_id: compactMessageId,
    chat_id: chatId,
  });
}
|
||||
|
||||
interface InferenceRegistration {
|
||||
controller: AbortController;
|
||||
completed: Promise<void>;
|
||||
@@ -1328,6 +1334,10 @@ export function createInferenceRunner(
|
||||
const callCtx: InferenceContext = {
|
||||
...ctx,
|
||||
publishUser: (frame) => publishUserFn(user, frame),
|
||||
// v1.11: broker comes in via ctx (set at registration time). Repeated
|
||||
// here so the destructure carries it onto the per-call ctx without
|
||||
// having to add it to every enqueue/cancel signature individually.
|
||||
broker: ctx.broker,
|
||||
};
|
||||
// v1.8 mobile-tabs: announce working before the async loop starts so
|
||||
// every device subscribed to the user channel sees the amber dot.
|
||||
@@ -1357,20 +1367,6 @@ export function createInferenceRunner(
|
||||
})();
|
||||
},
|
||||
|
||||
// Fire-and-forget compaction: builds a per-call context whose publishUser is
// bound to `user`, starts runCompact without awaiting it, and logs (rather
// than propagates) any rejection.
enqueueCompact(sessionId: string, chatId: string, compactMessageId: string, user: string) {
  const perCallCtx: InferenceContext = {
    ...ctx,
    publishUser: (frame) => publishUserFn(user, frame),
  };
  void runCompact(perCallCtx, sessionId, chatId, compactMessageId).catch((err: unknown) => {
    perCallCtx.log.error({ err }, 'unhandled compact error');
  });
},
|
||||
|
||||
async cancel(_sessionId: string, chatId: string): Promise<boolean> {
|
||||
const reg = registry.get(chatId);
|
||||
if (!reg) return false;
|
||||
|
||||
113
apps/server/src/services/model-context.ts
Normal file
113
apps/server/src/services/model-context.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
// v1.11.3: llama-swap model-context cache. Replaces the dead
|
||||
// `parsed.timings.n_ctx` capture in inference.ts / compaction.ts —
|
||||
// llama-server's streaming completion never emits n_ctx in timings (verified
|
||||
// empirically: timings carries prompt_n / predicted_n / *_ms / *_per_second
|
||||
// only). The authoritative source is llama-swap's
|
||||
// /upstream/<model>/props endpoint at .default_generation_settings.n_ctx.
|
||||
//
|
||||
// Cache design:
|
||||
// - Positive entries (n_ctx + total_slots) have no TTL. A model's context
|
||||
// size doesn't change while llama-swap is running; an admin endpoint
|
||||
// can invalidateModelContext() if it ever does.
|
||||
// - Negative entries (failed fetch) have a 60s TTL so a misconfigured or
|
||||
// down model doesn't get hammered every inference turn, but recovers
|
||||
// within a minute once the upstream comes back.
|
||||
// - 3s AbortController timeout on the fetch — long enough for a healthy
|
||||
// upstream, short enough that a stuck upstream doesn't block the
|
||||
// ctx_max UPDATE that follows.
|
||||
|
||||
/** Context-window facts for one model, as read from its /props endpoint. */
export interface ModelContext {
  /** Value of default_generation_settings.n_ctx in the /props response. */
  n_ctx: number;
  /** Value of total_slots in the /props response; 1 when absent or invalid. */
  total_slots: number;
  /** Date.now() at the moment this entry was cached. */
  fetched_at: number;
}

/** How long a failed lookup suppresses re-fetching the same model. */
const NEGATIVE_TTL_MS = 60_000;
/** Upper bound on one /props round trip, headers and body included. */
const FETCH_TIMEOUT_MS = 3_000;

// Successful lookups, keyed by model name. Entries never expire on their own;
// invalidateModelContext() is the only way out.
const positiveCache = new Map<string, ModelContext>();
// Failed lookups, keyed by model name. The value is the unix-ms timestamp of
// the failure and gates re-fetches for NEGATIVE_TTL_MS.
const negativeCache = new Map<string, number>();

// Base URL, set through configureModelContext(). Kept as module state rather
// than read from config here so tests can set it directly.
let llamaSwapUrl: string | null = null;

/**
 * Sets the base URL used for /props lookups.
 *
 * Trailing slashes are stripped so the request path never contains `//`.
 */
export function configureModelContext(opts: { llamaSwapUrl: string }): void {
  llamaSwapUrl = opts.llamaSwapUrl.replace(/\/+$/, '');
}

/** Records a failed lookup for `model` and returns the null callers receive. */
function rememberFailure(model: string): null {
  negativeCache.set(model, Date.now());
  return null;
}

/**
 * Returns the cached or freshly fetched context info for `model`, or null
 * when it cannot be determined. Never throws: every failure path resolves to
 * null so callers can persist a null ctx_max and carry on.
 *
 * Lookup order:
 *   1. positive cache hit: returned as-is, no TTL;
 *   2. negative cache hit younger than NEGATIVE_TTL_MS: null, no fetch;
 *   3. module not configured: null, and nothing is cached because no
 *      upstream was contacted, so lookups work as soon as it is configured;
 *   4. GET `${llamaSwapUrl}/upstream/<model>/props`, aborted after
 *      FETCH_TIMEOUT_MS.
 *
 * @param model Model name; URL-encoded into the request path.
 */
export async function getModelContext(model: string): Promise<ModelContext | null> {
  const cached = positiveCache.get(model);
  if (cached) return cached;

  const failedAt = negativeCache.get(model);
  if (failedAt !== undefined && Date.now() - failedAt < NEGATIVE_TTL_MS) {
    return null;
  }

  if (!llamaSwapUrl) return null;

  const url = `${llamaSwapUrl}/upstream/${encodeURIComponent(model)}/props`;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(url, { signal: controller.signal });
    if (!res.ok) return rememberFailure(model);

    // The timer is still armed here on purpose: the abort signal also covers
    // the body read, so an upstream that sends headers and then stalls cannot
    // hang the caller.
    const body = (await res.json()) as {
      default_generation_settings?: { n_ctx?: number };
      total_slots?: number;
    } | null;

    const n_ctx = body?.default_generation_settings?.n_ctx;
    if (typeof n_ctx !== 'number' || !Number.isFinite(n_ctx) || n_ctx <= 0) {
      return rememberFailure(model);
    }

    // total_slots is informational, so a missing or bad value falls back to 1
    // instead of rejecting the whole response.
    const slots = body?.total_slots;
    const total_slots =
      typeof slots === 'number' && Number.isFinite(slots) && slots > 0 ? slots : 1;

    const entry: ModelContext = { n_ctx, total_slots, fetched_at: Date.now() };
    positiveCache.set(model, entry);
    negativeCache.delete(model);
    return entry;
  } catch {
    // Timeout abort, network failure, or unparseable body.
    return rememberFailure(model);
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Drops cached entries, positive and negative, for one model, or for every
 * model when `model` is omitted.
 */
export function invalidateModelContext(model?: string): void {
  if (model === undefined) {
    positiveCache.clear();
    negativeCache.clear();
    return;
  }
  positiveCache.delete(model);
  negativeCache.delete(model);
}
|
||||
@@ -159,6 +159,12 @@ export interface Message {
|
||||
// v1.8.2: per-message metadata. See MessageMetadata for the discriminated
|
||||
// shapes currently in use.
|
||||
metadata: MessageMetadata | null;
|
||||
// v1.11: anchored rolling compaction. Optional so consumers that SELECT
|
||||
// the pre-v1.11 column set still type-check. See compaction.ts +
|
||||
// schema.sql for semantics.
|
||||
summary?: boolean;
|
||||
tail_start_id?: string | null;
|
||||
compacted_at?: string | null;
|
||||
}
|
||||
|
||||
export interface ModelInfo {
|
||||
|
||||
@@ -168,8 +168,11 @@ export const api = {
|
||||
request<void>(`/api/chats/${chatId}`, { method: 'DELETE' }),
|
||||
messages: (chatId: string) =>
|
||||
request<Message[]>(`/api/chats/${chatId}/messages`),
|
||||
// v1.11: anchored-rolling compaction. POST awaits the LLM call inside
|
||||
// the route's lifecycle; the new summary row arrives via the 'compacted'
|
||||
// WS frame (useSessionStream refetches + toasts).
|
||||
compact: (chatId: string) =>
|
||||
request<{ compact_message_id: string }>(`/api/chats/${chatId}/compact`, { method: 'POST' }),
|
||||
request<{ ok: true }>(`/api/chats/${chatId}/compact`, { method: 'POST' }),
|
||||
stop: (chatId: string) =>
|
||||
request<{ stopped: boolean }>(`/api/chats/${chatId}/stop`, { method: 'POST' }),
|
||||
forceSend: (chatId: string, content: string) =>
|
||||
|
||||
@@ -145,6 +145,19 @@ export interface Message {
|
||||
// v1.8.2: per-message metadata; see MessageMetadata. null for the vast
|
||||
// majority of messages.
|
||||
metadata: MessageMetadata | null;
|
||||
// v1.11: anchored rolling compaction fields. Optional on the wire so that
|
||||
// older API responses (or test fixtures) parse without explicit nulls.
|
||||
// summary — true on the assistant row that holds the active
|
||||
// anchored summary. Render via SummaryCard.
|
||||
// tail_start_id — first preserved tail message the summary covers up to
|
||||
// (exclusive). Diagnostic only on the client.
|
||||
// compacted_at — set on rows that are "behind the curtain" of the
|
||||
// current summary. Returned by the GET endpoint so the
|
||||
// UI can show history, but the server-side inference
|
||||
// assembly filters these out.
|
||||
summary?: boolean;
|
||||
tail_start_id?: string | null;
|
||||
compacted_at?: string | null;
|
||||
}
|
||||
|
||||
export interface ModelInfo {
|
||||
@@ -305,6 +318,11 @@ export type WsFrame =
|
||||
}
|
||||
| { type: 'messages_deleted'; message_ids: string[]; chat_id?: string }
|
||||
| { type: 'chat_renamed'; chat_id: string; name: string }
|
||||
// v1.11: published by services/compaction.ts after the new anchored
|
||||
// summary row lands. Carries the new summary row id for diagnostics; the
|
||||
// session-stream handler ignores the id and re-fetches the full message
|
||||
// list (the cohort of compacted_at-stamped rows changed too).
|
||||
| { type: 'compacted'; session_id: string; chat_id: string; summary_message_id: string }
|
||||
// v1.8.2: `reason` discriminates structured failures (the UI prefers it
|
||||
// over `error` text when present).
|
||||
| { type: 'error'; message_id?: string; chat_id?: string; error: string; reason?: ErrorReason };
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { useState } from 'react';
|
||||
import { History, MessageSquare, Plus, X } from 'lucide-react';
|
||||
import { Bot, History, MessageSquare, Plus, Terminal, X } from 'lucide-react';
|
||||
import type { Chat, WorkspacePane } from '@/api/types';
|
||||
import { StatusDot } from '@/components/StatusDot';
|
||||
import {
|
||||
@@ -9,6 +9,12 @@ import {
|
||||
ContextMenuSeparator,
|
||||
ContextMenuTrigger,
|
||||
} from '@/components/ui/context-menu';
|
||||
import {
|
||||
DropdownMenu,
|
||||
DropdownMenuContent,
|
||||
DropdownMenuItem,
|
||||
DropdownMenuTrigger,
|
||||
} from '@/components/ui/dropdown-menu';
|
||||
import { useLongPress } from '@/hooks/useLongPress';
|
||||
import { cn } from '@/lib/utils';
|
||||
|
||||
@@ -20,7 +26,7 @@ interface Props {
|
||||
onCloseOthers: (chatId: string) => void;
|
||||
onCloseToRight: (chatId: string) => void;
|
||||
onCloseAll: () => void;
|
||||
onNewChat: () => void;
|
||||
onAddPane: (kind: 'chat' | 'terminal' | 'agent') => void;
|
||||
onShowHistory: () => void;
|
||||
onRename: (chatId: string, name: string) => Promise<void>;
|
||||
onRemovePane?: () => void;
|
||||
@@ -34,7 +40,7 @@ export function ChatTabBar({
|
||||
onCloseOthers,
|
||||
onCloseToRight,
|
||||
onCloseAll,
|
||||
onNewChat,
|
||||
onAddPane,
|
||||
onShowHistory,
|
||||
onRename,
|
||||
onRemovePane,
|
||||
@@ -125,7 +131,7 @@ export function ChatTabBar({
|
||||
</div>
|
||||
</ContextMenuTrigger>
|
||||
<ContextMenuContent>
|
||||
<ContextMenuItem onSelect={() => onNewChat()}>
|
||||
<ContextMenuItem onSelect={() => onAddPane('chat')}>
|
||||
New chat
|
||||
</ContextMenuItem>
|
||||
<ContextMenuSeparator />
|
||||
@@ -164,15 +170,29 @@ export function ChatTabBar({
|
||||
)}
|
||||
|
||||
<div className="flex items-center ml-auto gap-0.5 px-1 shrink-0">
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
<button
|
||||
type="button"
|
||||
onClick={onNewChat}
|
||||
className="inline-flex items-center justify-center p-1 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:min-h-[44px] max-md:min-w-[44px]"
|
||||
aria-label="New chat"
|
||||
title="New chat"
|
||||
aria-label="New pane"
|
||||
title="New pane"
|
||||
>
|
||||
<Plus size={12} />
|
||||
</button>
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="end" className="min-w-40">
|
||||
<DropdownMenuItem onSelect={() => onAddPane('chat')}>
|
||||
<MessageSquare size={14} /> New chat
|
||||
</DropdownMenuItem>
|
||||
<DropdownMenuItem onSelect={() => onAddPane('terminal')}>
|
||||
<Terminal size={14} /> New terminal
|
||||
</DropdownMenuItem>
|
||||
<DropdownMenuItem onSelect={() => onAddPane('agent')}>
|
||||
<Bot size={14} /> New agent
|
||||
</DropdownMenuItem>
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
<button
|
||||
type="button"
|
||||
onClick={onShowHistory}
|
||||
|
||||
86
apps/web/src/components/ContextBar.tsx
Normal file
86
apps/web/src/components/ContextBar.tsx
Normal file
@@ -0,0 +1,86 @@
|
||||
import type { Message } from '@/api/types';
|
||||
|
||||
interface Props {
|
||||
messages: Message[];
|
||||
}
|
||||
|
||||
// v1.11.2: persistent context-usage indicator above MessageList. Mirrors the
|
||||
// server-side compaction.usable() formula — color thresholds are computed
|
||||
// against (max - 20k buffer), not raw max, so the bar turns amber/orange
|
||||
// /red at the same boundaries auto-compaction will fire. The popover above
|
||||
// the input (ChatContextPopover) uses raw-% thresholds and is intentionally
|
||||
// kept separate (it's a different surface and a different signal).
|
||||
const COMPACTION_BUFFER = 20_000;
|
||||
|
||||
// Walk newest-first; first message with both ctx_used and ctx_max non-null
|
||||
// AND ctx_max > 0 wins. Older messages may have ctx_used but missing ctx_max
|
||||
// (early v1 before llama-swap's n_ctx capture worked) — skip them and keep
|
||||
// walking. If nothing usable in the chat, caller renders null.
|
||||
/**
 * Finds the most recent usable context reading in `messages`.
 *
 * Scans from the end of the array and returns the first message whose
 * ctx_used and ctx_max are both non-null and whose ctx_max is positive.
 *
 * @returns `{ used, max }` from that message, or null when none qualifies.
 */
function latestPair(messages: Message[]): { used: number; max: number } | null {
  for (let i = messages.length - 1; i >= 0; i--) {
    const candidate = messages[i];
    // Guard rather than assert: keeps the loop sound under
    // noUncheckedIndexedAccess without a non-null `!`.
    if (candidate == null) continue;
    const { ctx_used: used, ctx_max: max } = candidate;
    if (used == null || max == null || max <= 0) continue;
    return { used, max };
  }
  return null;
}
|
||||
|
||||
/** Tailwind class pair for one severity level of the context bar. */
interface ColorTier {
  // Class for the label and the numbers. Literal palette names are used so
  // the amber / orange / red gradation stays explicit.
  text: string;
  // Class for the fill of the bar.
  bar: string;
}

// Severity tiers, highest threshold first; the first row whose `min` is
// reached wins.
const COLOR_TIERS: ReadonlyArray<{ min: number; text: string; bar: string }> = [
  { min: 0.95, text: 'text-red-600 dark:text-red-400', bar: 'bg-red-500' },
  { min: 0.8, text: 'text-orange-600 dark:text-orange-400', bar: 'bg-orange-500' },
  { min: 0.6, text: 'text-amber-600 dark:text-amber-400', bar: 'bg-amber-500' },
];

/**
 * Maps a usage ratio to its colour tier.
 *
 * @param usablePct Usage as a fraction (1.0 = full). Values below 0.60, and
 *   NaN, map to the muted tier.
 */
function tierFor(usablePct: number): ColorTier {
  const hit = COLOR_TIERS.find((tier) => usablePct >= tier.min);
  if (hit) return { text: hit.text, bar: hit.bar };
  return { text: 'text-muted-foreground', bar: 'bg-muted-foreground/40' };
}
|
||||
|
||||
/**
 * Context-usage strip: a label, a "used / max (pct%)" readout and a thin fill
 * bar. Renders nothing when no message carries a usable ctx_used / ctx_max
 * pair.
 *
 * The colour tier is driven by usage relative to (max - COMPACTION_BUFFER);
 * the fill width and the printed percentage are relative to raw max.
 */
export function ContextBar({ messages }: Props) {
  const pair = latestPair(messages);
  if (!pair) return null;

  const { used, max } = pair;
  const pct = used / max;
  const usable = Math.max(0, max - COMPACTION_BUFFER);
  // When the window is no larger than the buffer there is no usable budget to
  // measure against; fall back to the raw ratio so the bar still escalates
  // instead of staying muted while the window fills up.
  // NOTE(review): confirm how server-side compaction treats such windows.
  const usablePct = usable > 0 ? used / usable : pct;
  const tier = tierFor(usablePct);

  // Fill width follows raw usage and is clamped to [0, 100] so a reading
  // above max cannot overflow the track.
  const fillPct = Math.min(100, Math.max(0, pct * 100));
  const compactionThresholdPct = max > 0 ? Math.round((usable / max) * 100) : 0;

  return (
    <div className="border-b px-4 py-1 shrink-0">
      <div className="max-w-[1000px] mx-auto w-full">
        <div className="flex items-baseline justify-between text-[10px] font-mono leading-tight">
          {/* "Context" on >=sm, "Ctx" on phones to save horizontal space. */}
          <span className={tier.text}>
            <span className="hidden sm:inline">Context</span>
            <span className="sm:hidden">Ctx</span>
          </span>
          <span
            className={tier.text}
            title={`Auto-compaction at ~${compactionThresholdPct}%`}
          >
            {used.toLocaleString()} / {max.toLocaleString()}{' '}
            <span className="max-[380px]:hidden">({Math.round(pct * 100)}%)</span>
          </span>
        </div>
        <div className="mt-1 h-1 rounded-full bg-muted overflow-hidden">
          <div
            className={`h-full ${tier.bar} transition-[width] duration-300`}
            style={{ width: `${fillPct}%` }}
          />
        </div>
      </div>
    </div>
  );
}
|
||||
@@ -537,7 +537,70 @@ function CompactCard({ message, sessionChats }: { message: Message; sessionChats
|
||||
);
|
||||
}
|
||||
|
||||
// v1.11 anchored rolling summary. Inserted by services/compaction.ts as a
|
||||
// role='assistant', summary=true row. Distinct from legacy CompactCard
|
||||
// (which renders the kind='compact' system rows produced by v1.10 /compact).
|
||||
// Collapsed by default; header shows the timestamp; body renders the
|
||||
// summary markdown when expanded. Copy button matches CompactCard's affordance.
|
||||
/**
 * Collapsible card for a summary message row.
 *
 * Collapsed by default. The header shows a localized timestamp
 * (finished_at, falling back to created_at); the body renders the message
 * content as markdown when expanded. The copy button writes the raw content
 * to the clipboard and swaps its icon for 1.2s as confirmation.
 */
function SummaryCard({ message }: { message: Message }) {
  const [expanded, setExpanded] = useState(false);
  const [copied, setCopied] = useState(false);

  const ts = message.finished_at ?? message.created_at;
  const headerTs = ts ? new Date(ts).toLocaleString() : '';

  async function handleCopy() {
    try {
      await navigator.clipboard.writeText(message.content);
      setCopied(true);
      setTimeout(() => setCopied(false), 1200);
      toast.success('Summary copied to clipboard');
    } catch {
      toast.error('Copy failed');
    }
  }

  return (
    <div className="rounded-lg border border-primary/30 bg-primary/5 text-sm">
      <div className="flex items-center gap-2 px-3 py-2">
        <button
          type="button"
          // Expose the disclosure state to assistive technology.
          aria-expanded={expanded}
          // Functional update so rapid clicks toggle from the latest state.
          onClick={() => setExpanded((open) => !open)}
          className="flex items-center gap-1.5 flex-1 min-w-0 text-left text-muted-foreground hover:text-foreground"
        >
          {expanded ? <ChevronDown size={14} /> : <ChevronRight size={14} />}
          <span className="text-xs font-medium truncate">
            Compacted summary — {headerTs}
          </span>
        </button>
        <button
          type="button"
          onClick={() => void handleCopy()}
          className="p-1 rounded hover:bg-muted text-muted-foreground"
          aria-label="Copy summary"
          title="Copy summary"
        >
          {copied ? <Check size={12} /> : <Copy size={12} />}
        </button>
      </div>
      {expanded && (
        <div className="px-3 pb-3 text-xs leading-relaxed border-t pt-2">
          <MarkdownBody content={message.content} />
        </div>
      )}
    </div>
  );
}
|
||||
|
||||
export function MessageBubble({ message, sessionChats, capHitInfo }: Props) {
|
||||
// v1.11: anchored rolling summary row. Checked BEFORE the kind==='compact'
|
||||
// branch because summary=true never coexists with kind='compact' (new
|
||||
// compactions emit role='assistant' rows with kind='message'+summary=true).
|
||||
if (message.summary) {
|
||||
return <SummaryCard message={message} />;
|
||||
}
|
||||
if (message.kind === 'compact') {
|
||||
return <CompactCard message={message} sessionChats={sessionChats} />;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { useEffect, useMemo, useState } from 'react';
|
||||
import { PanelRight, MessageSquare, Terminal, Bot, Clipboard, X } from 'lucide-react';
|
||||
import { PanelRight, MessageSquare, Terminal, Bot, Clipboard, Plus, X } from 'lucide-react';
|
||||
import type { Chat, Project, Session, WorkspacePane } from '@/api/types';
|
||||
import { MAX_PANES, type UseWorkspacePanesResult } from '@/hooks/useWorkspacePanes';
|
||||
import type { UseSessionChatsResult } from '@/hooks/useSessionChats';
|
||||
@@ -227,7 +227,10 @@ export function Workspace({
|
||||
onCloseOthers={(chatId) => closeOtherTabs(idx, chatId)}
|
||||
onCloseToRight={(chatId) => closeTabsToRight(idx, chatId)}
|
||||
onCloseAll={() => closeAllTabs(idx)}
|
||||
onNewChat={() => void createChat(idx)}
|
||||
onAddPane={(kind) => {
|
||||
if (kind === 'chat') void createChat(idx);
|
||||
else addSplitPane(kind);
|
||||
}}
|
||||
onShowHistory={() => showLandingPage(idx)}
|
||||
onRename={renameChat}
|
||||
onRemovePane={panes.length > 1 ? () => removePane(idx) : undefined}
|
||||
@@ -239,6 +242,30 @@ export function Workspace({
|
||||
<span className="text-xs text-muted-foreground">
|
||||
{terminalLabels.get(pane.id) ?? 'Terminal'}
|
||||
</span>
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
<button
|
||||
type="button"
|
||||
onClick={(e) => e.stopPropagation()}
|
||||
className="ml-auto inline-flex items-center justify-center size-5 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:size-7"
|
||||
aria-label="New pane"
|
||||
title="New pane"
|
||||
>
|
||||
<Plus size={12} />
|
||||
</button>
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="end" className="min-w-40">
|
||||
<DropdownMenuItem onSelect={() => addSplitPane('chat')}>
|
||||
<MessageSquare size={14} /> New chat
|
||||
</DropdownMenuItem>
|
||||
<DropdownMenuItem onSelect={() => addSplitPane('terminal')}>
|
||||
<Terminal size={14} /> New terminal
|
||||
</DropdownMenuItem>
|
||||
<DropdownMenuItem onSelect={() => addSplitPane('agent')}>
|
||||
<Bot size={14} /> New agent
|
||||
</DropdownMenuItem>
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
{/* v1.10.4: iOS Safari restricts navigator.clipboard.readText
|
||||
outside direct user gestures. A real button click IS a
|
||||
gesture, so this works where keystroke-driven paste may
|
||||
@@ -250,7 +277,7 @@ export function Workspace({
|
||||
e.stopPropagation();
|
||||
terminalsRegistry.get(pane.id)?.paste();
|
||||
}}
|
||||
className="ml-auto inline-flex items-center justify-center size-5 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:size-7"
|
||||
className="inline-flex items-center justify-center size-5 rounded text-muted-foreground hover:bg-muted hover:text-foreground max-md:size-7"
|
||||
aria-label="Paste from clipboard"
|
||||
title="Paste from clipboard"
|
||||
>
|
||||
|
||||
@@ -7,6 +7,7 @@ import { useChatContextStats } from '@/hooks/useChatContextStats';
|
||||
import { MessageList } from '@/components/MessageList';
|
||||
import { ChatInput } from '@/components/ChatInput';
|
||||
import { ChatContextPopover } from '@/components/ChatContextPopover';
|
||||
import { ContextBar } from '@/components/ContextBar';
|
||||
import {
|
||||
DropdownMenu,
|
||||
DropdownMenuContent,
|
||||
@@ -125,6 +126,10 @@ export function ChatPane({ sessionId, chatId, projectId, agentId, onAgentChange,
|
||||
|
||||
return (
|
||||
<div className="flex flex-col h-full min-h-0">
|
||||
{/* v1.11.2: persistent context-usage indicator. Renders null when there
|
||||
are no assistant messages yet (fresh chat). shrink-0 keeps it out of
|
||||
the MessageList scroll region — bar stays pinned, list scrolls. */}
|
||||
<ContextBar messages={chatMessages} />
|
||||
<MessageList messages={chatMessages} sessionChats={sessionChats} />
|
||||
|
||||
{/* Queued messages */}
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import { useEffect, useRef, useState } from 'react';
|
||||
import { toast } from 'sonner';
|
||||
import type { Message, WsFrame } from '@/api/types';
|
||||
import { api } from '@/api/client';
|
||||
import { sessionEvents } from './sessionEvents';
|
||||
|
||||
// session_renamed frame removed from WsFrame — it was declared but never
|
||||
@@ -161,6 +163,12 @@ function applyFrame(state: State, frame: WsFrame): State {
|
||||
: state.messages;
|
||||
return { ...state, messages: next, error: frame.error };
|
||||
}
|
||||
case 'compacted': {
|
||||
// v1.11: side effects (refetch + toast) live in ws.onmessage; the
|
||||
// reducer just no-ops so TS exhaustiveness is satisfied without
|
||||
// duplicating async work inside a synchronous reducer.
|
||||
return state;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -196,6 +204,25 @@ export function useSessionStream(sessionId: string | undefined) {
|
||||
ws.onmessage = (ev) => {
|
||||
try {
|
||||
const frame = JSON.parse(typeof ev.data === 'string' ? ev.data : '') as WsFrame;
|
||||
// v1.11: on a compaction completion, re-fetch the message list so
|
||||
// the new summary row + the cohort of compacted_at-stamped older
|
||||
// rows render correctly. We dispatch the fresh list as a synthetic
|
||||
// 'snapshot' frame so the reducer's existing path handles state
|
||||
// replacement (no need for a parallel "refetched" path).
|
||||
// The toast is purely UX feedback; missing it would still leave
|
||||
// the chat in a valid state.
|
||||
if (frame.type === 'compacted') {
|
||||
toast.success('Context compacted to free space');
|
||||
void api.messages
|
||||
.list(frame.session_id)
|
||||
.then((messages) => {
|
||||
setState((s) => applyFrame(s, { type: 'snapshot', messages }));
|
||||
})
|
||||
.catch((err: unknown) => {
|
||||
console.warn('compacted refetch failed', err);
|
||||
});
|
||||
return;
|
||||
}
|
||||
setState((s) => applyFrame(s, frame));
|
||||
} catch (err) {
|
||||
console.warn('bad ws frame', err);
|
||||
|
||||
Reference in New Issue
Block a user