import type { Sql } from '../../db.js';
import type {
  Agent,
  Message,
  Project,
  Session,
} from '../../types/api.js';
import * as compaction from '../compaction.js';
import { buildSystemPrompt } from '../system-prompt.js';
import { isAnySentinel } from './sentinels.js';
import { PRUNE_TRIGGER_TOKENS, prune } from './prune.js';
import type { InferenceContext } from './turn.js';

/** One entry of the chat-completions style message array sent to the model. */
export interface OpenAiMessage {
  role: 'system' | 'user' | 'assistant' | 'tool';
  content: string | null;
  tool_calls?: Array<{
    id: string;
    type: 'function';
    function: { name: string; arguments: string };
  }>;
  tool_call_id?: string;
  // v1.13.1-C: reasoning text from a prior assistant turn, sourced from
  // message_parts kind='reasoning' rows joined in via reasoning_parts on
  // the messages_with_parts view. stream-phase.ts/toModelMessages threads
  // this into the AI SDK ReasoningPart when forwarding to the model so
  // reasoning models can resume mid-thought across tool-call boundaries.
  reasoning?: string;
}

/**
 * Builds the ordered message list for one inference call.
 *
 * The list always starts with a `system` message holding the awaited result of
 * `buildSystemPrompt(project, session, agent)`. History is then replayed from
 * the most recent `kind === 'compact'` entry (or from index 0 when there is
 * none); the compact entry itself is emitted as a `system` message.
 *
 * Skipped entries: anything `isAnySentinel` accepts, assistant messages whose
 * status is `'streaming'` or `'cancelled'`, and tool messages that carry no
 * `tool_results`. Any message that is not a compact marker, tool message or
 * assistant message is emitted with role `'user'`.
 *
 * v1.12: buildSystemPrompt lives in services/system-prompt.ts. It awaits the
 * container-guidance loader, so this function is async too and every call
 * site in inference.ts awaits the result.
 *
 * @param session - Session passed through to `buildSystemPrompt`.
 * @param project - Project passed through to `buildSystemPrompt`.
 * @param history - Messages in replay order.
 * @param agent - Optional agent passed through to `buildSystemPrompt`.
 * @returns The system prompt followed by the converted history.
 */
export async function buildMessagesPayload(
  session: Session,
  project: Project,
  history: Message[],
  agent: Agent | null = null
): Promise<OpenAiMessage[]> {
  const out: OpenAiMessage[] = [];
  const systemPrompt = await buildSystemPrompt(project, session, agent);
  out.push({ role: 'system', content: systemPrompt });

  // Find the latest compact marker — only send messages from that point onwards
  let startIdx = 0;
  for (let i = history.length - 1; i >= 0; i--) {
    if (history[i]?.kind === 'compact') {
      startIdx = i;
      break;
    }
  }

  for (const m of history.slice(startIdx)) {
    if (m.kind === 'compact') {
      out.push({ role: 'system', content: m.content });
      continue;
    }
    // v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never
    // send them to the LLM. The synthetic instruction note lives only inside
    // the summary call's messages array and is never persisted, so on a
    // follow-up turn the model resumes with a clean context.
    if (isAnySentinel(m)) continue;
    if (m.role === 'assistant' && m.status === 'streaming') continue;
    if (m.role === 'assistant' && m.status === 'cancelled') continue;

    if (m.role === 'tool') {
      const tr = m.tool_results;
      if (!tr) continue;
      // JSON.stringify(undefined) returns undefined, which would violate
      // `content: string | null`; coalesce a missing output to JSON "null".
      const outputText = tr.error
        ? `error: ${tr.error}`
        : typeof tr.output === 'string'
          ? tr.output
          : JSON.stringify(tr.output ?? null);
      out.push({
        role: 'tool',
        content: outputText,
        tool_call_id: tr.tool_call_id,
      });
      continue;
    }

    if (m.role === 'assistant') {
      const msg: OpenAiMessage = {
        role: 'assistant',
        // Empty text is sent as null rather than as an empty string.
        content: m.content && m.content.length > 0 ? m.content : null,
      };
      if (m.tool_calls && m.tool_calls.length > 0) {
        msg.tool_calls = m.tool_calls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
        }));
      }
      // v1.13.1-C: collapse reasoning_parts into a single string. The view
      // returns them ordered by sequence; multiple reasoning parts on one
      // message are rare but concat preserves ordering. Skip when absent.
      if (m.reasoning_parts && m.reasoning_parts.length > 0) {
        msg.reasoning = m.reasoning_parts.map((p) => p.text ?? '').join('');
      }
      out.push(msg);
      continue;
    }

    out.push({ role: 'user', content: m.content });
  }

  return out;
}

/**
 * Loads everything needed to assemble an inference payload for one chat.
 *
 * Runs three queries in sequence: the session row by `sessionId`, the project
 * row by that session's `project_id`, and the chat's rows from
 * `messages_with_parts` where `compacted_at IS NULL`, ordered by
 * `created_at ASC, id ASC`.
 *
 * NOTE(review): the row-type arguments on `sql<...>` assume `Sql` is a
 * postgres.js-style generic tagged template — confirm against db.js.
 *
 * @param sql - Tagged-template query function.
 * @param sessionId - Primary key of the session to load.
 * @param chatId - Chat whose non-compacted messages form the history.
 * @returns The session, project and history, or `null` when either the
 *   session row or the project row does not exist.
 */
export async function loadContext(
  sql: Sql,
  sessionId: string,
  chatId: string
): Promise<{ session: Session; project: Project; history: Message[] } | null> {
  const sessionRows = await sql<Session[]>` SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at, agent_id, web_search_enabled FROM sessions WHERE id = ${sessionId} `;
  if (sessionRows.length === 0) return null;
  const session = sessionRows[0]!;

  const projectRows = await sql<Project[]>` SELECT id, name, path, added_at, last_session_id, status, gitea_remote, default_system_prompt, default_web_search_enabled FROM projects WHERE id = ${session.project_id} `;
  if (projectRows.length === 0) return null;
  const project = projectRows[0]!;

  // v1.11: filter compacted messages out of the inference assembly. The GET
  // /api/sessions/:id/messages endpoint still returns everything (so the UI
  // can show history with the summary card inline); only LLM payloads skip
  // compacted rows. compacted_at IS NULL keeps the active summary + tail.
  // v1.13.1-B: reads tool_calls/tool_results via the parts-merged view.
  // v1.13.1-C: also pull reasoning_parts so assistant messages from
  // reasoning models can be replayed with their reasoning context preserved.
  const history = await sql<Message[]>` SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq, tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata, reasoning_parts FROM messages_with_parts WHERE chat_id = ${chatId} AND compacted_at IS NULL ORDER BY created_at ASC, id ASC `;

  return { session, project, history };
}

// v1.11: shared helper used after both finalizeCompletion and executeToolPhase
// persist their token counts. Reads tokens off the just-UPDATEd row (which
// the caller returns from RETURNING), runs compaction.isOverflow, and flips
// chats.needs_compaction. The next runAssistantTurn invocation acts on it.
// Silent on missing tokens — llama-swap occasionally omits usage on truncated
// streams, and we'd rather miss one overflow than crash the inference path.
//
// Flow: return early when `updated` is undefined or any of ctx_used /
// tokens_used / ctx_max is not a number; return when compaction.isOverflow
// reports no overflow; otherwise run prune and, unless it freed at least
// PRUNE_TRIGGER_TOKENS, set chats.needs_compaction = true for `chatId`.
//
//   ctx     — supplies the `sql` tag and the `log` used below.
//   chatId  — chat to prune and, if still needed, flag.
//   updated — token counters of the row the caller just updated.
export async function maybeFlagForCompaction(
  ctx: InferenceContext,
  chatId: string,
  updated:
    | { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null }
    | undefined,
): Promise<void> {
  if (!updated) return;

  const promptTokens = updated.ctx_used;
  const completionTokens = updated.tokens_used;
  const contextLimit = updated.ctx_max;
  if (typeof promptTokens !== 'number') return;
  if (typeof completionTokens !== 'number') return;
  if (typeof contextLimit !== 'number') return;

  const overflow = compaction.isOverflow(
    { prompt_tokens: promptTokens, completion_tokens: completionTokens },
    contextLimit,
  );
  if (!overflow) return;

  // v1.13.4: try the cheap prune first. If it freed at least the buffer
  // worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER),
  // we're below the threshold again — skip flagging summarize for the next
  // turn. The next turn's overflow check will re-evaluate from scratch.
  // Prune failures (DB errors etc.) propagate so the surrounding inference
  // path sees them; the catch in finalizeCompletion / executeToolPhase
  // doesn't shield this — by design, we want to know if prune is broken.
  const pruned = await prune({ sql: ctx.sql, chatId });
  if (pruned.hidden > 0) {
    ctx.log.info(
      { chatId, hidden: pruned.hidden, freedTokens: pruned.freedTokens },
      'inference: prune freed context budget',
    );
  }
  if (pruned.freedTokens >= PRUNE_TRIGGER_TOKENS) {
    // Prune handled it; skip the (expensive) summarize path.
    return;
  }

  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
  ctx.log.info(
    { chatId, promptTokens, completionTokens, contextLimit },
    'inference: flagged for compaction',
  );
}