v1.12.4-rc2: extract payload + error-handler from inference.ts
- payload.ts: buildMessagesPayload (re-exported), loadContext, maybeFlagForCompaction
- error-handler.ts: handleAbortOrError, finalizeCompletion

Both new files type-import InferenceContext/StreamResult/TurnArgs from inference.ts; ESM elides type imports, so there is no runtime cycle. handleAbortOrError turned out not to call the summary functions, so no back-edge is needed. inference.ts shrinks from ~1676 to ~1401 LoC.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
155
apps/server/src/services/inference/payload.ts
Normal file
155
apps/server/src/services/inference/payload.ts
Normal file
@@ -0,0 +1,155 @@
|
||||
import type { Sql } from '../../db.js';
|
||||
import type {
|
||||
Agent,
|
||||
Message,
|
||||
Project,
|
||||
Session,
|
||||
} from '../../types/api.js';
|
||||
import * as compaction from '../compaction.js';
|
||||
import { buildSystemPrompt } from '../system-prompt.js';
|
||||
import { isAnySentinel } from './sentinels.js';
|
||||
import type { InferenceContext } from '../inference.js';
|
||||
|
||||
/**
 * One entry of the `messages` array produced by `buildMessagesPayload`.
 */
export interface OpenAiMessage {
  /** Speaker of the message. */
  role: 'system' | 'user' | 'assistant' | 'tool';

  /** Message text; `null` is used for assistant messages with no text. */
  content: string | null;

  /** Function calls requested by an assistant message, if any. */
  tool_calls?: {
    id: string;
    type: 'function';
    /** `arguments` carries the call arguments as a JSON-encoded string. */
    function: { name: string; arguments: string };
  }[];

  /** On `tool` messages: id of the tool call this result belongs to. */
  tool_call_id?: string;
}
|
||||
|
||||
/**
 * Serializes `value` as JSON, returning `fallback` when `JSON.stringify`
 * produces `undefined` (it does so for `undefined`, functions and symbols).
 */
function stringifyOrFallback(value: unknown, fallback: string): string {
  // The lib typings say `string`, but the runtime result can be `undefined`.
  return (JSON.stringify(value) as string | undefined) ?? fallback;
}

/**
 * Builds the `messages` array for one model call from the stored history.
 *
 * v1.12: buildSystemPrompt lives in services/system-prompt.ts. It awaits the
 * container-guidance loader, so this function is async too and every call
 * site in inference.ts awaits the result.
 *
 * The result always starts with the system prompt. History is then replayed
 * from the most recent `compact` row (inclusive), or from the beginning when
 * no such row exists.
 *
 * @param session - Session forwarded to `buildSystemPrompt`.
 * @param project - Project forwarded to `buildSystemPrompt`.
 * @param history - Stored messages, oldest first.
 * @param agent - Optional agent forwarded to `buildSystemPrompt`.
 * @returns The system prompt followed by the converted history messages.
 */
export async function buildMessagesPayload(
  session: Session,
  project: Project,
  history: Message[],
  agent: Agent | null = null
): Promise<OpenAiMessage[]> {
  const systemPrompt = await buildSystemPrompt(project, session, agent);
  const out: OpenAiMessage[] = [{ role: 'system', content: systemPrompt }];

  // Find the latest compact marker — only send messages from that point onwards
  let startIdx = 0;
  for (let i = history.length - 1; i >= 0; i--) {
    if (history[i]?.kind === 'compact') {
      startIdx = i;
      break;
    }
  }

  for (const m of history.slice(startIdx)) {
    // A compact row is replayed as an extra system message.
    if (m.kind === 'compact') {
      out.push({ role: 'system', content: m.content });
      continue;
    }

    // v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never
    // send them to the LLM. The synthetic instruction note lives only inside
    // the summary call's messages array and is never persisted, so on a
    // follow-up turn the model resumes with a clean context.
    if (isAnySentinel(m)) continue;

    // Assistant rows that are still streaming or were cancelled are skipped.
    if (m.role === 'assistant' && (m.status === 'streaming' || m.status === 'cancelled')) {
      continue;
    }

    if (m.role === 'tool') {
      const tr = m.tool_results;
      if (!tr) continue;
      // An error wins over output; non-string output is JSON-encoded. A
      // missing output becomes '' so `content` is never `undefined`.
      const outputText = tr.error
        ? `error: ${tr.error}`
        : typeof tr.output === 'string'
          ? tr.output
          : stringifyOrFallback(tr.output, '');
      out.push({
        role: 'tool',
        content: outputText,
        tool_call_id: tr.tool_call_id,
      });
      continue;
    }

    if (m.role === 'assistant') {
      const msg: OpenAiMessage = {
        role: 'assistant',
        // Empty text is sent as null rather than ''.
        content: m.content && m.content.length > 0 ? m.content : null,
      };
      if (m.tool_calls && m.tool_calls.length > 0) {
        msg.tool_calls = m.tool_calls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          // `arguments` must be a string; fall back to '{}' when args are absent.
          function: { name: tc.name, arguments: stringifyOrFallback(tc.args, '{}') },
        }));
      }
      out.push(msg);
      continue;
    }

    // Every remaining row is sent as a user message.
    out.push({ role: 'user', content: m.content });
  }

  return out;
}
|
||||
|
||||
/**
 * Loads the session, its project and the chat's non-compacted message
 * history, using three sequential queries.
 *
 * @param sql - Tagged-template SQL client.
 * @param sessionId - Id of the session row to load.
 * @param chatId - Id of the chat whose messages form the history.
 * @returns The loaded triple, or `null` when either the session row or its
 *   project row does not exist.
 */
export async function loadContext(
  sql: Sql,
  sessionId: string,
  chatId: string
): Promise<{ session: Session; project: Project; history: Message[] } | null> {
  const [session] = await sql<Session[]>`
    SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at,
           agent_id, web_search_enabled
    FROM sessions WHERE id = ${sessionId}
  `;
  if (!session) return null;

  // The project is looked up through the session's project_id.
  const [project] = await sql<Project[]>`
    SELECT id, name, path, added_at, last_session_id, status, gitea_remote,
           default_system_prompt, default_web_search_enabled
    FROM projects WHERE id = ${session.project_id}
  `;
  if (!project) return null;

  // v1.11: filter compacted messages out of the inference assembly. The GET
  // /api/sessions/:id/messages endpoint still returns everything (so the UI
  // can show history with the summary card inline); only LLM payloads skip
  // compacted rows. compacted_at IS NULL keeps the active summary + tail.
  const history = await sql<Message[]>`
    SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
           tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata
    FROM messages
    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;

  return { session, project, history };
}
|
||||
|
||||
/**
 * Sets `chats.needs_compaction` when the latest token counts overflow the
 * context limit.
 *
 * v1.11: shared helper used after both finalizeCompletion and executeToolPhase
 * persist their token counts. Reads tokens off the just-UPDATEd row (which
 * the caller returns from RETURNING), runs compaction.isOverflow, and flips
 * chats.needs_compaction. The next runAssistantTurn invocation acts on it.
 *
 * Silent on missing tokens — llama-swap occasionally omits usage on truncated
 * streams, and we'd rather miss one overflow than crash the inference path.
 *
 * @param ctx - Inference context; its `sql` client and `log` are used.
 * @param chatId - Id of the chat row to flag.
 * @param updated - Token columns of the updated message row, or `undefined`
 *   when no row is available.
 */
export async function maybeFlagForCompaction(
  ctx: InferenceContext,
  chatId: string,
  updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined,
): Promise<void> {
  if (!updated) return;

  const {
    ctx_used: promptTokens,
    tokens_used: completionTokens,
    ctx_max: contextLimit,
  } = updated;

  // Any missing count means there is nothing to evaluate; return quietly.
  if (
    typeof promptTokens !== 'number' ||
    typeof completionTokens !== 'number' ||
    typeof contextLimit !== 'number'
  ) {
    return;
  }

  const usage = { prompt_tokens: promptTokens, completion_tokens: completionTokens };
  if (!compaction.isOverflow(usage, contextLimit)) return;

  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
  ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction');
}
|
||||
Reference in New Issue
Block a user