- message_parts.hidden_at timestamptz column (NULL by default) with a partial index on (message_id) WHERE hidden_at IS NULL for the common visible-parts filter. - messages_with_parts view changed from COALESCE(parts, legacy) to CASE WHEN EXISTS(any parts of kind) THEN visible-parts ELSE legacy. COALESCE would have leaked hidden parts back via the legacy fallback when every part was pruned (smoke caught it pre-commit). The CASE distinguishes "no parts at all → fall back to legacy column for pre-v1.13.0 history" from "all parts hidden → return null/empty so the row drops out of the model payload" exactly. - prune.ts: scans tool_result parts newest-first, protects the last 40k tokens (PROTECTED_TOKENS), marks older candidates hidden when their combined estimate clears 20k (PRUNE_TRIGGER_TOKENS — equal to COMPACTION_BUFFER from v1.11.0, so a successful prune is exactly the budget the summary path would have freed). Stops at chats.tail_start_id so it doesn't double-erase across the last summary boundary. Pure decision helper selectPruneTargets exported separately for unit tests. - Wired into maybeFlagForCompaction: prune runs synchronously when overflow is detected; if it freed >= PRUNE_TRIGGER_TOKENS, the needs_compaction flag is NOT set and the (expensive) summary inference call is skipped this turn. The next turn's overflow check re-evaluates from scratch. - 6 new unit tests in prune.test.ts cover: empty input, protection-only (no candidates), candidates below trigger, candidates above trigger, candidates straddling a summary boundary, exactly-protection-tokens. 179 tests total (was 173). Smoke verified post-rebuild: - \\d message_parts shows hidden_at + partial index. - View definition shows AND p.hidden_at IS NULL filters on all three subselects. - Synthetic hide-then-restore confirmed the view drops the tool_result jsonb to null when its only part is hidden, and restores when un-hidden. 
- EXPLAIN ANALYZE on the 42-message stress chat: 0.325ms (faster than v1.13.1-B's 1.018ms — EXISTS short-circuits cleanly for the common no-parts case). - Normal turn (plain text prompt) completes unaffected. Closes a v1.11.0 design item that was scoped but never implemented. With v1.13's parts table the prune is dramatically cheaper to write — pre-parts it would have meant editing JSON blobs in-place; now it's a hidden_at flag and a view subselect. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
193 lines
7.6 KiB
TypeScript
193 lines
7.6 KiB
TypeScript
import type { Sql } from '../../db.js';
|
|
import type {
|
|
Agent,
|
|
Message,
|
|
Project,
|
|
Session,
|
|
} from '../../types/api.js';
|
|
import * as compaction from '../compaction.js';
|
|
import { buildSystemPrompt } from '../system-prompt.js';
|
|
import { isAnySentinel } from './sentinels.js';
|
|
import { PRUNE_TRIGGER_TOKENS, prune } from './prune.js';
|
|
import type { InferenceContext } from './turn.js';
|
|
|
|
/**
 * One entry of the OpenAI-style `messages` array produced by
 * `buildMessagesPayload`.
 */
export interface OpenAiMessage {
  /** Chat role of the entry. */
  role: 'system' | 'user' | 'assistant' | 'tool';
  /** Message text; `null` when an assistant message carries no text. */
  content: string | null;
  /** Function-style tool invocations attached to an assistant message. */
  tool_calls?: {
    id: string;
    type: 'function';
    /** `arguments` is the JSON-encoded argument object. */
    function: { name: string; arguments: string };
  }[];
  /** On `tool` messages: id of the tool call this result answers. */
  tool_call_id?: string;
  /**
   * v1.13.1-C: reasoning text from a prior assistant turn, sourced from
   * message_parts kind='reasoning' rows joined in via reasoning_parts on
   * the messages_with_parts view. stream-phase.ts/toModelMessages threads
   * this into the AI SDK ReasoningPart when forwarding to the model so
   * reasoning models can resume mid-thought across tool-call boundaries.
   */
  reasoning?: string;
}
|
|
|
|
/**
 * Assemble the `messages` array for one inference call.
 *
 * v1.12: buildSystemPrompt lives in services/system-prompt.ts. It awaits the
 * container-guidance loader, so this function is async too and every call
 * site in inference.ts awaits the result.
 *
 * @param session - Session the turn belongs to (forwarded to buildSystemPrompt).
 * @param project - Project the session belongs to (forwarded to buildSystemPrompt).
 * @param history - Chat messages in chronological order.
 * @param agent - Optional agent forwarded to buildSystemPrompt; defaults to `null`.
 * @returns The system prompt followed by the replayable part of `history`.
 */
export async function buildMessagesPayload(
  session: Session,
  project: Project,
  history: Message[],
  agent: Agent | null = null
): Promise<OpenAiMessage[]> {
  const systemPrompt = await buildSystemPrompt(project, session, agent);
  const out: OpenAiMessage[] = [{ role: 'system', content: systemPrompt }];

  // Find the latest compact marker — only send messages from that point
  // onwards. Without a marker the whole history is replayed.
  let startIdx = 0;
  for (let i = history.length - 1; i >= 0; i--) {
    if (history[i]?.kind === 'compact') {
      startIdx = i;
      break;
    }
  }

  for (const m of history.slice(startIdx)) {
    // The compact marker itself is replayed as a system message.
    if (m.kind === 'compact') {
      out.push({ role: 'system', content: m.content });
      continue;
    }

    // v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never
    // send them to the LLM. The synthetic instruction note lives only inside
    // the summary call's messages array and is never persisted, so on a
    // follow-up turn the model resumes with a clean context.
    if (isAnySentinel(m)) continue;

    // Assistant rows that are still streaming or were cancelled are skipped.
    if (m.role === 'assistant' && (m.status === 'streaming' || m.status === 'cancelled')) {
      continue;
    }

    if (m.role === 'tool') {
      const tr = m.tool_results;
      // A tool row without a result payload contributes nothing.
      if (!tr) continue;

      let outputText: string;
      if (tr.error) {
        outputText = `error: ${tr.error}`;
      } else if (typeof tr.output === 'string') {
        outputText = tr.output;
      } else {
        // JSON.stringify(undefined) returns undefined, which would leak a
        // non-string `content` into the payload. Coalesce to null so a
        // missing output is rendered like an explicit null ("null").
        outputText = JSON.stringify(tr.output ?? null);
      }

      out.push({
        role: 'tool',
        content: outputText,
        tool_call_id: tr.tool_call_id,
      });
      continue;
    }

    if (m.role === 'assistant') {
      const msg: OpenAiMessage = {
        role: 'assistant',
        // Empty text is sent as null rather than ''.
        content: m.content && m.content.length > 0 ? m.content : null,
      };

      if (m.tool_calls && m.tool_calls.length > 0) {
        msg.tool_calls = m.tool_calls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
        }));
      }

      // v1.13.1-C: collapse reasoning_parts into a single string. The view
      // returns them ordered by sequence; multiple reasoning parts on one
      // message are rare but concat preserves ordering. Skip when absent.
      if (m.reasoning_parts && m.reasoning_parts.length > 0) {
        msg.reasoning = m.reasoning_parts.map((p) => p.text ?? '').join('');
      }

      out.push(msg);
      continue;
    }

    // Every remaining row is forwarded as a user message.
    out.push({ role: 'user', content: m.content });
  }

  return out;
}
|
|
|
|
/**
 * Load everything needed to assemble an inference payload for one chat:
 * the session row, its project row, and the chat's non-compacted messages.
 *
 * @param sql - Database handle used as a tagged-template query function.
 * @param sessionId - Id of the session to load.
 * @param chatId - Id of the chat whose messages are loaded.
 * @returns The loaded context, or `null` when the session row or its
 *   project row is not found.
 */
export async function loadContext(
  sql: Sql,
  sessionId: string,
  chatId: string
): Promise<{ session: Session; project: Project; history: Message[] } | null> {
  const sessions = await sql<Session[]>`
    SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at,
           agent_id, web_search_enabled
    FROM sessions WHERE id = ${sessionId}
  `;
  const session = sessions[0];
  if (sessions.length === 0 || session === undefined) return null;

  const projects = await sql<Project[]>`
    SELECT id, name, path, added_at, last_session_id, status, gitea_remote,
           default_system_prompt, default_web_search_enabled
    FROM projects WHERE id = ${session.project_id}
  `;
  const project = projects[0];
  if (projects.length === 0 || project === undefined) return null;

  // v1.11: filter compacted messages out of the inference assembly. The GET
  // /api/sessions/:id/messages endpoint still returns everything (so the UI
  // can show history with the summary card inline); only LLM payloads skip
  // compacted rows. compacted_at IS NULL keeps the active summary + tail.
  // v1.13.1-B: reads tool_calls/tool_results via the parts-merged view.
  // v1.13.1-C: also pull reasoning_parts so assistant messages from
  // reasoning models can be replayed with their reasoning context preserved.
  const history = await sql<Message[]>`
    SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
           tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
           reasoning_parts
    FROM messages_with_parts
    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;

  return { session, project, history };
}
|
|
|
|
/**
 * v1.11: shared helper used after both finalizeCompletion and
 * executeToolPhase persist their token counts. Reads tokens off the
 * just-UPDATEd row (which the caller returns from RETURNING), runs
 * compaction.isOverflow, and flips chats.needs_compaction. The next
 * runAssistantTurn invocation acts on it.
 *
 * Silent on missing tokens — llama-swap occasionally omits usage on
 * truncated streams, and we'd rather miss one overflow than crash the
 * inference path.
 *
 * @param ctx - Inference context; supplies `sql` and `log`.
 * @param chatId - Chat to prune and, if still needed, flag.
 * @param updated - Token counters of the updated message row; when it is
 *   `undefined` or any counter is not a number the function does nothing.
 */
export async function maybeFlagForCompaction(
  ctx: InferenceContext,
  chatId: string,
  updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined,
): Promise<void> {
  if (!updated) return;

  const {
    ctx_used: promptTokens,
    tokens_used: completionTokens,
    ctx_max: contextLimit,
  } = updated;
  if (
    typeof promptTokens !== 'number' ||
    typeof completionTokens !== 'number' ||
    typeof contextLimit !== 'number'
  ) {
    return;
  }

  const usage = { prompt_tokens: promptTokens, completion_tokens: completionTokens };
  if (!compaction.isOverflow(usage, contextLimit)) return;

  // v1.13.4: try the cheap prune first. If it freed at least the buffer
  // worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER),
  // we're below the threshold again — skip flagging summarize for the next
  // turn. The next turn's overflow check will re-evaluate from scratch.
  // Prune failures (DB errors etc.) propagate so the surrounding inference
  // path sees them; the catch in finalizeCompletion / executeToolPhase
  // doesn't shield this — by design, we want to know if prune is broken.
  const { hidden, freedTokens } = await prune({ sql: ctx.sql, chatId });
  if (hidden > 0) {
    ctx.log.info({ chatId, hidden, freedTokens }, 'inference: prune freed context budget');
  }

  // Prune handled it; skip the (expensive) summarize path.
  if (freedTokens >= PRUNE_TRIGGER_TOKENS) return;

  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
  ctx.log.info(
    { chatId, promptTokens, completionTokens, contextLimit },
    'inference: flagged for compaction',
  );
}
|