Files
boocode/apps/server/src/services/inference/payload.ts
indifferentketchup 471f2b0ea8 v1.13.7: stability bundle — usage capture + payload/UI sanitization
Five fixes for latent regressions surfaced during the v1.13.x.cosmetic
revert investigation. None alter schema or compaction; all cleanup
against the v1.13.1-A AI SDK migration's hidden surface.

(1) provider.ts — includeUsage: true on createOpenAICompatible.
@ai-sdk/openai-compatible defaults this false, omitting
stream_options.include_usage from the request body; llama-swap never
emitted the usage block, so result.usage.inputTokens/outputTokens
resolved undefined and tokens_used / ctx_used landed NULL in every
assistant row since v1.13.1-A. No historical backfill.

(2) MessageList.tsx — hasText = m.content.trim().length > 0.
AI SDK v6 streaming occasionally emits a leading "\n" text-delta on
tool-call-only turns; the literal newline passed length > 0 and
rendered an empty bubble + ActionRow between every tool call. Trim
catches it without changing semantics for genuine content.

(3) MessageBubble.tsx — same trim on hasContent for the no-tool-calls
path. Defensive symmetry with MessageList.flatten.

(4) payload.ts — buildMessagesPayload skips assistant rows with
status='failed' AND assistant rows with status='complete' + empty
content + no tool_calls. Without this, a trailing empty/failed
assistant + the next attempt's placeholder produced "Cannot have 2
or more assistant messages at the end of the list" rejections from
the OpenAI-compatible upstream after cap-hit + Continue.

(5) budget.ts — BUDGET_NO_AGENT 15 → 30. Every tool in ALL_TOOLS is
read-only today; the 15-cap was forward-looking for write tools that
haven't landed. No-agent mode now matches BUDGET_READ_ONLY.

47 LoC across 5 files. 190/190 server tests pass.

Verified live: new assistant turns populate StatsLine token data;
single-tool-call turns no longer render the stray empty-bubble +
ActionRow between tool calls; Continue after cap-hit no longer hits
the trailing-assistant API rejection.
2026-05-22 13:24:19 +00:00

212 lines
8.6 KiB
TypeScript

import type { Sql } from '../../db.js';
import type {
Agent,
Message,
Project,
Session,
} from '../../types/api.js';
import * as compaction from '../compaction.js';
import { buildSystemPrompt } from '../system-prompt.js';
import { isAnySentinel } from './sentinels.js';
import { PRUNE_TRIGGER_TOKENS, prune } from './prune.js';
import type { InferenceContext } from './turn.js';
/**
 * One entry of the `messages` array sent to the OpenAI-compatible upstream.
 */
export interface OpenAiMessage {
  /** Speaker of the message. */
  role: 'system' | 'user' | 'assistant' | 'tool';
  /** Message text; `null` when there is no text to send. */
  content: string | null;
  /** Function calls requested by the message, in OpenAI wire format. */
  tool_calls?: {
    id: string;
    type: 'function';
    /** `arguments` is the JSON-encoded argument object, not the object itself. */
    function: { name: string; arguments: string };
  }[];
  /** Id of the tool call that this message is the result of. */
  tool_call_id?: string;
  /**
   * v1.13.1-C: reasoning text from a prior assistant turn. It is sourced from
   * message_parts rows with kind='reasoning', joined in via reasoning_parts on
   * the messages_with_parts view. stream-phase.ts/toModelMessages threads it
   * into the AI SDK ReasoningPart when forwarding to the model, so reasoning
   * models can resume mid-thought across tool-call boundaries.
   */
  reasoning?: string;
}
/**
 * Build the OpenAI-compatible `messages` array for one inference request.
 *
 * The payload always begins with the system prompt returned by
 * `buildSystemPrompt`. History is then replayed in array order, starting at
 * the most recent `kind === 'compact'` row (forwarded as a `system` message)
 * or at the first row when there is no compact marker.
 *
 * Rows that are never forwarded:
 * - sentinel rows (`isAnySentinel`);
 * - assistant rows whose status is `streaming`, `cancelled` or `failed`;
 * - `complete` assistant rows with blank content and no tool calls;
 * - tool rows that carry no `tool_results`.
 *
 * v1.12: buildSystemPrompt lives in services/system-prompt.ts. It awaits the
 * container-guidance loader, so this function is async too and every call
 * site in inference.ts awaits the result.
 *
 * @param session - Session the request belongs to; forwarded to `buildSystemPrompt`.
 * @param project - Project of the session; forwarded to `buildSystemPrompt`.
 * @param history - Message rows in the order they should be replayed.
 * @param agent - Optional agent forwarded to `buildSystemPrompt`; defaults to `null`.
 * @returns The messages to send upstream, system prompt first.
 */
export async function buildMessagesPayload(
  session: Session,
  project: Project,
  history: Message[],
  agent: Agent | null = null
): Promise<OpenAiMessage[]> {
  const out: OpenAiMessage[] = [];
  const systemPrompt = await buildSystemPrompt(project, session, agent);
  out.push({ role: 'system', content: systemPrompt });

  // Find the latest compact marker — only send messages from that point onwards.
  let startIdx = 0;
  for (let i = history.length - 1; i >= 0; i--) {
    if (history[i]!.kind === 'compact') {
      startIdx = i;
      break;
    }
  }

  for (let i = startIdx; i < history.length; i++) {
    const m = history[i]!;

    if (m.kind === 'compact') {
      out.push({ role: 'system', content: m.content });
      continue;
    }

    // v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never
    // send them to the LLM. The synthetic instruction note lives only inside
    // the summary call's messages array and is never persisted, so on a
    // follow-up turn the model resumes with a clean context.
    if (isAnySentinel(m)) continue;

    if (m.role === 'assistant' && m.status === 'streaming') continue;
    if (m.role === 'assistant' && m.status === 'cancelled') continue;

    // v1.13.7: skip failed assistant turns. A failed row carries no usable
    // content for the model, and leaving it in the payload alongside any
    // following assistant message produces "Cannot have 2 or more assistant
    // messages at the end of the list" from the OpenAI-compatible upstream.
    if (m.role === 'assistant' && m.status === 'failed') continue;

    // v1.13.7: skip "empty" completed assistants — clen=0 + no tool_calls.
    // These can land when an upstream stream returns finishReason='stop' with
    // no text/tool output (network blip, rate limit recovery, model quirk).
    // Same risk as the failed-status case: a trailing empty assistant plus
    // the next attempt's assistant placeholder = two trailing assistants and
    // the API rejects the whole payload.
    if (
      m.role === 'assistant' &&
      m.status === 'complete' &&
      (m.content == null || m.content.trim().length === 0) &&
      (m.tool_calls == null || m.tool_calls.length === 0)
    ) {
      continue;
    }

    if (m.role === 'tool') {
      // NOTE(review): tool rows are forwarded even when the assistant row that
      // issued the call was skipped above — confirm the upstream tolerates a
      // tool message without a matching tool_calls entry.
      const tr = m.tool_results;
      if (!tr) continue;

      let outputText: string;
      if (tr.error) {
        outputText = `error: ${tr.error}`;
      } else if (typeof tr.output === 'string') {
        outputText = tr.output;
      } else {
        // JSON.stringify is typed as returning `string`, but at runtime it
        // yields `undefined` for an `undefined` input (also functions and
        // symbols). Fall back to an empty string so the tool message always
        // carries a string `content` instead of silently losing the field
        // when the payload is serialized.
        const serialized: string | undefined = JSON.stringify(tr.output);
        outputText = serialized ?? '';
      }

      out.push({
        role: 'tool',
        content: outputText,
        tool_call_id: tr.tool_call_id,
      });
      continue;
    }

    if (m.role === 'assistant') {
      const msg: OpenAiMessage = {
        role: 'assistant',
        // Empty text is sent as null rather than as an empty string.
        content: m.content && m.content.length > 0 ? m.content : null,
      };
      if (m.tool_calls && m.tool_calls.length > 0) {
        // The wire format carries arguments as a JSON string, not an object.
        msg.tool_calls = m.tool_calls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
        }));
      }
      // v1.13.1-C: collapse reasoning_parts into a single string. The view
      // returns them ordered by sequence; multiple reasoning parts on one
      // message are rare but concat preserves ordering. Skip when absent.
      if (m.reasoning_parts && m.reasoning_parts.length > 0) {
        msg.reasoning = m.reasoning_parts.map((p) => p.text ?? '').join('');
      }
      out.push(msg);
      continue;
    }

    // Any remaining row is forwarded as a user message.
    out.push({ role: 'user', content: m.content });
  }

  return out;
}
/**
 * Load everything needed to assemble an inference payload for one chat: the
 * session row, the project it points at, and the chat's message history.
 *
 * The three queries run one after another; the history query is only issued
 * once both the session and the project have been found.
 *
 * @param sql - Tagged-template SQL client used for all three queries.
 * @param sessionId - Id of the session to load.
 * @param chatId - Id of the chat whose messages are loaded.
 * @returns The session, project and history, or `null` when either the
 *   session or its project row does not exist.
 */
export async function loadContext(
  sql: Sql,
  sessionId: string,
  chatId: string
): Promise<{ session: Session; project: Project; history: Message[] } | null> {
  const matchingSessions = await sql<Session[]>`
SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at,
agent_id, web_search_enabled
FROM sessions WHERE id = ${sessionId}
`;
  const session = matchingSessions[0];
  if (!session) return null;

  const matchingProjects = await sql<Project[]>`
SELECT id, name, path, added_at, last_session_id, status, gitea_remote,
default_system_prompt, default_web_search_enabled
FROM projects WHERE id = ${session.project_id}
`;
  const project = matchingProjects[0];
  if (!project) return null;

  // v1.11: compacted messages are left out of the inference assembly. The GET
  // /api/sessions/:id/messages endpoint still returns everything (so the UI
  // can show history with the summary card inline); only LLM payloads skip
  // compacted rows. compacted_at IS NULL keeps the active summary + tail.
  // v1.13.1-B: tool_calls/tool_results are read via the parts-merged view.
  // v1.13.1-C: reasoning_parts is pulled as well so assistant messages from
  // reasoning models can be replayed with their reasoning context preserved.
  const history = await sql<Message[]>`
SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
reasoning_parts
FROM messages_with_parts
WHERE chat_id = ${chatId} AND compacted_at IS NULL
ORDER BY created_at ASC, id ASC
`;

  return { session, project, history };
}
/**
 * v1.11: shared helper used after both finalizeCompletion and
 * executeToolPhase persist their token counts.
 *
 * It reads the token counts off the just-UPDATEd row (which the caller
 * returns from RETURNING), runs compaction.isOverflow, and — unless a prune
 * frees enough budget — flips chats.needs_compaction. The next
 * runAssistantTurn invocation acts on that flag.
 *
 * Silent on missing tokens: llama-swap occasionally omits usage on truncated
 * streams, and we'd rather miss one overflow than crash the inference path.
 *
 * @param ctx - Inference context; supplies the SQL client and the logger.
 * @param chatId - Chat whose compaction flag may be set.
 * @param updated - Token columns of the updated message row, or `undefined`
 *   when no row is available.
 */
export async function maybeFlagForCompaction(
  ctx: InferenceContext,
  chatId: string,
  updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined,
): Promise<void> {
  if (!updated) return;

  const {
    ctx_used: promptTokens,
    tokens_used: completionTokens,
    ctx_max: contextLimit,
  } = updated;

  // All three counts must be present before an overflow check makes sense.
  if (
    typeof promptTokens !== 'number' ||
    typeof completionTokens !== 'number' ||
    typeof contextLimit !== 'number'
  ) {
    return;
  }

  const usage = { prompt_tokens: promptTokens, completion_tokens: completionTokens };
  if (!compaction.isOverflow(usage, contextLimit)) return;

  // v1.13.4: try the cheap prune first. If it freed at least the buffer
  // worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER),
  // we're below the threshold again, so summarize is not flagged for the
  // next turn; that turn's overflow check re-evaluates from scratch.
  // Prune failures (DB errors etc.) propagate so the surrounding inference
  // path sees them; the catch in finalizeCompletion / executeToolPhase
  // doesn't shield this — by design, we want to know if prune is broken.
  const { hidden, freedTokens } = await prune({ sql: ctx.sql, chatId });
  if (hidden > 0) {
    ctx.log.info({ chatId, hidden, freedTokens }, 'inference: prune freed context budget');
  }

  // Only fall through to the (expensive) summarize path when prune alone
  // did not free enough.
  const pruneWasEnough = freedTokens >= PRUNE_TRIGGER_TOKENS;
  if (!pruneWasEnough) {
    await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
    ctx.log.info({ chatId, promptTokens, completionTokens, contextLimit }, 'inference: flagged for compaction');
  }
}