Five fixes for latent regressions surfaced during the v1.13.x.cosmetic revert investigation. None alter schema or compaction; all cleanup against the v1.13.1-A AI SDK migration's hidden surface. (1) provider.ts — includeUsage: true on createOpenAICompatible. @ai-sdk/openai-compatible defaults this false, omitting stream_options.include_usage from the request body; llama-swap never emitted the usage block, so result.usage.inputTokens/outputTokens resolved undefined and tokens_used / ctx_used landed NULL in every assistant row since v1.13.1-A. No historical backfill. (2) MessageList.tsx — hasText = m.content.trim().length > 0. AI SDK v6 streaming occasionally emits a leading "\n" text-delta on tool-call-only turns; the literal newline passed length > 0 and rendered an empty bubble + ActionRow between every tool call. Trim catches it without changing semantics for genuine content. (3) MessageBubble.tsx — same trim on hasContent for the no-tool-calls path. Defensive symmetry with MessageList.flatten. (4) payload.ts — buildMessagesPayload skips assistant rows with status='failed' AND assistant rows with status='complete' + empty content + no tool_calls. Without this, a trailing empty/failed assistant + the next attempt's placeholder produced "Cannot have 2 or more assistant messages at the end of the list" rejections from the OpenAI-compatible upstream after cap-hit + Continue. (5) budget.ts — BUDGET_NO_AGENT 15 → 30. Every tool in ALL_TOOLS is read-only today; the 15-cap was forward-looking for write tools that haven't landed. No-agent mode now matches BUDGET_READ_ONLY. 47 LoC across 5 files. 190/190 server tests pass. Verified live: new assistant turns populate StatsLine token data; single-tool-call turns no longer render the stray empty-bubble + ActionRow between tool calls; Continue after cap-hit no longer hits the trailing-assistant API rejection.
212 lines
8.6 KiB
TypeScript
212 lines
8.6 KiB
TypeScript
import type { Sql } from '../../db.js';
|
|
import type {
|
|
Agent,
|
|
Message,
|
|
Project,
|
|
Session,
|
|
} from '../../types/api.js';
|
|
import * as compaction from '../compaction.js';
|
|
import { buildSystemPrompt } from '../system-prompt.js';
|
|
import { isAnySentinel } from './sentinels.js';
|
|
import { PRUNE_TRIGGER_TOKENS, prune } from './prune.js';
|
|
import type { InferenceContext } from './turn.js';
|
|
|
|
/**
 * One function-style tool invocation attached to an assistant message.
 *
 * Extracted from `OpenAiMessage.tool_calls` so the element shape can be
 * referenced by name; structurally identical to the previous inline type.
 */
export interface OpenAiToolCall {
  /** Call identifier; `tool` messages carry the id they answer in `tool_call_id`. */
  id: string;
  type: 'function';
  /** `arguments` is a JSON-encoded string, not a parsed object. */
  function: { name: string; arguments: string };
}

/**
 * A single entry of the `messages` array handed to the OpenAI-compatible
 * chat endpoint.
 */
export interface OpenAiMessage {
  role: 'system' | 'user' | 'assistant' | 'tool';
  /** Message text; `null` when an assistant turn carries no text. */
  content: string | null;
  /** Tool invocations requested by an assistant turn. */
  tool_calls?: OpenAiToolCall[];
  /** Set on `tool` messages: id of the tool call this result belongs to. */
  tool_call_id?: string;
  // v1.13.1-C: reasoning text from a prior assistant turn, sourced from
  // message_parts kind='reasoning' rows joined in via reasoning_parts on
  // the messages_with_parts view. stream-phase.ts/toModelMessages threads
  // this into the AI SDK ReasoningPart when forwarding to the model so
  // reasoning models can resume mid-thought across tool-call boundaries.
  reasoning?: string;
}
|
|
|
|
// v1.12: buildSystemPrompt lives in services/system-prompt.ts. It awaits the
// container-guidance loader, so this function is async too and every call
// site in inference.ts awaits the result.
/**
 * Assemble the `messages` array for one inference request.
 *
 * The payload always starts with the system prompt. History is replayed from
 * the most recent `kind === 'compact'` row onwards (the compact row itself is
 * sent as a system message); when there is no such row the whole history is
 * replayed. Rows that must never reach the model are dropped:
 * sentinels, assistant rows that are streaming / cancelled / failed, completed
 * assistant rows with neither text nor tool calls, and tool rows without
 * `tool_results`.
 *
 * @param session - Session the request belongs to; forwarded to `buildSystemPrompt`.
 * @param project - Project the session belongs to; forwarded to `buildSystemPrompt`.
 * @param history - Message rows in the order they should be replayed.
 * @param agent - Optional agent forwarded to `buildSystemPrompt`; defaults to `null`.
 * @returns The ordered messages, system prompt first.
 */
export async function buildMessagesPayload(
  session: Session,
  project: Project,
  history: Message[],
  agent: Agent | null = null
): Promise<OpenAiMessage[]> {
  const systemPrompt = await buildSystemPrompt(project, session, agent);
  const out: OpenAiMessage[] = [{ role: 'system', content: systemPrompt }];

  // Find the latest compact marker — only send messages from that point onwards
  let startIdx = 0;
  for (let i = history.length - 1; i >= 0; i--) {
    if (history[i]?.kind === 'compact') {
      startIdx = i;
      break;
    }
  }

  for (const m of history.slice(startIdx)) {
    if (m.kind === 'compact') {
      out.push({ role: 'system', content: m.content });
      continue;
    }

    // v1.8.2 / v1.11.6: cap-hit and doom-loop sentinels are UI-only — never
    // send them to the LLM. The synthetic instruction note lives only inside
    // the summary call's messages array and is never persisted, so on a
    // follow-up turn the model resumes with a clean context.
    if (isAnySentinel(m)) continue;

    if (m.role === 'tool') {
      // NOTE(review): tool rows are forwarded even when the assistant row that
      // issued the call was skipped below (cancelled / failed) — confirm such
      // orphaned results cannot occur, since upstreams may reject them.
      const tr = m.tool_results;
      if (!tr) continue;

      let outputText: string;
      if (tr.error) {
        outputText = `error: ${tr.error}`;
      } else if (typeof tr.output === 'string') {
        outputText = tr.output;
      } else {
        // JSON.stringify yields undefined (not a string) for an undefined
        // output; fall back to '' so `content` never leaves the payload as
        // undefined and gets dropped during request serialisation.
        const encoded: string | undefined = JSON.stringify(tr.output);
        outputText = encoded ?? '';
      }

      out.push({
        role: 'tool',
        content: outputText,
        tool_call_id: tr.tool_call_id,
      });
      continue;
    }

    if (m.role === 'assistant') {
      // In-flight and cancelled turns are never replayed.
      if (m.status === 'streaming' || m.status === 'cancelled') continue;

      // v1.13.7: skip failed assistant turns. A failed row carries no usable
      // content for the model, and leaving it in the payload alongside any
      // following assistant message produces "Cannot have 2 or more assistant
      // messages at the end of the list" from the OpenAI-compatible upstream.
      if (m.status === 'failed') continue;

      const text = m.content ?? '';
      const toolCalls = m.tool_calls ?? [];

      // v1.13.7: skip "empty" completed assistants — clen=0 + no tool_calls.
      // These can land when an upstream stream returns finishReason='stop' with
      // no text/tool output (network blip, rate limit recovery, model quirk).
      // Same risk as the failed-status case: a trailing empty assistant plus
      // the next attempt's assistant placeholder = two trailing assistants and
      // the API rejects the whole payload.
      if (m.status === 'complete' && text.trim().length === 0 && toolCalls.length === 0) {
        continue;
      }

      const msg: OpenAiMessage = {
        role: 'assistant',
        // Non-empty text is forwarded untouched (no trimming); empty becomes null.
        content: text.length > 0 ? text : null,
      };

      if (toolCalls.length > 0) {
        msg.tool_calls = toolCalls.map((tc) => ({
          id: tc.id,
          type: 'function' as const,
          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
        }));
      }

      // v1.13.1-C: collapse reasoning_parts into a single string. The view
      // returns them ordered by sequence; multiple reasoning parts on one
      // message are rare but concat preserves ordering. Skip when absent.
      if (m.reasoning_parts && m.reasoning_parts.length > 0) {
        msg.reasoning = m.reasoning_parts.map((p) => p.text ?? '').join('');
      }

      out.push(msg);
      continue;
    }

    // Every remaining row is forwarded with the user role.
    out.push({ role: 'user', content: m.content });
  }

  return out;
}
|
|
|
|
/**
 * Load everything needed to assemble an inference payload: the session row,
 * its project row, and the chat's message history.
 *
 * @param sql - Tagged-template SQL client used for all three queries.
 * @param sessionId - Primary key of the session to load.
 * @param chatId - Chat whose messages form the history.
 * @returns The loaded rows, or `null` when either the session or its project
 *   does not exist. History is ordered by `created_at`, then `id`, ascending.
 */
export async function loadContext(
  sql: Sql,
  sessionId: string,
  chatId: string
): Promise<{ session: Session; project: Project; history: Message[] } | null> {
  const sessionRows = await sql<Session[]>`
    SELECT id, project_id, name, model, system_prompt, status, created_at, updated_at,
           agent_id, web_search_enabled
    FROM sessions WHERE id = ${sessionId}
  `;
  // Guarding on the first row replaces the length check + non-null assertion.
  const session = sessionRows[0];
  if (session === undefined) return null;

  const projectRows = await sql<Project[]>`
    SELECT id, name, path, added_at, last_session_id, status, gitea_remote,
           default_system_prompt, default_web_search_enabled
    FROM projects WHERE id = ${session.project_id}
  `;
  const project = projectRows[0];
  if (project === undefined) return null;

  // v1.11: filter compacted messages out of the inference assembly. The GET
  // /api/sessions/:id/messages endpoint still returns everything (so the UI
  // can show history with the summary card inline); only LLM payloads skip
  // compacted rows. compacted_at IS NULL keeps the active summary + tail.
  // v1.13.1-B: reads tool_calls/tool_results via the parts-merged view.
  // v1.13.1-C: also pull reasoning_parts so assistant messages from
  // reasoning models can be replayed with their reasoning context preserved.
  const history = await sql<Message[]>`
    SELECT id, session_id, chat_id, role, content, kind, tool_calls, tool_results, status, last_seq,
           tokens_used, ctx_used, ctx_max, started_at, finished_at, created_at, metadata,
           reasoning_parts
    FROM messages_with_parts
    WHERE chat_id = ${chatId} AND compacted_at IS NULL
    ORDER BY created_at ASC, id ASC
  `;

  return { session, project, history };
}
|
|
|
|
// v1.11: shared helper used after both finalizeCompletion and executeToolPhase
// persist their token counts. Reads tokens off the just-UPDATEd row (which
// the caller returns from RETURNING), runs compaction.isOverflow, and flips
// chats.needs_compaction. The next runAssistantTurn invocation acts on it.
// Silent on missing tokens — llama-swap occasionally omits usage on truncated
// streams, and we'd rather miss one overflow than crash the inference path.
/**
 * Decide, from a freshly persisted token count, whether the chat needs
 * compaction, trying the cheap prune before flagging it.
 *
 * @param ctx - Inference context; supplies the SQL client and the logger.
 * @param chatId - Chat to prune and, if still needed, flag.
 * @param updated - Token columns of the row that was just updated. The call
 *   is a no-op when this is missing or any of the three values is not a number.
 */
export async function maybeFlagForCompaction(
  ctx: InferenceContext,
  chatId: string,
  updated: { tokens_used: number | null; ctx_used: number | null; ctx_max: number | null } | undefined,
): Promise<void> {
  if (!updated) return;

  const {
    ctx_used: promptTokens,
    tokens_used: completionTokens,
    ctx_max: contextLimit,
  } = updated;

  // Any missing count means usage was not reported — stay silent.
  if (
    typeof promptTokens !== 'number' ||
    typeof completionTokens !== 'number' ||
    typeof contextLimit !== 'number'
  ) {
    return;
  }

  const usage = { prompt_tokens: promptTokens, completion_tokens: completionTokens };
  if (!compaction.isOverflow(usage, contextLimit)) return;

  // v1.13.4: try the cheap prune first. If it freed at least the buffer
  // worth of tokens (PRUNE_TRIGGER_TOKENS, identical to COMPACTION_BUFFER),
  // we're below the threshold again — skip flagging summarize for the next
  // turn. The next turn's overflow check will re-evaluate from scratch.
  // Prune failures (DB errors etc.) propagate so the surrounding inference
  // path sees them; the catch in finalizeCompletion / executeToolPhase
  // doesn't shield this — by design, we want to know if prune is broken.
  const { hidden, freedTokens } = await prune({ sql: ctx.sql, chatId });
  if (hidden > 0) {
    ctx.log.info({ chatId, hidden, freedTokens }, 'inference: prune freed context budget');
  }

  // Prune handled it; skip the (expensive) summarize path.
  if (freedTokens >= PRUNE_TRIGGER_TOKENS) return;

  await ctx.sql`UPDATE chats SET needs_compaction = true WHERE id = ${chatId}`;
  ctx.log.info(
    { chatId, promptTokens, completionTokens, contextLimit },
    'inference: flagged for compaction',
  );
}
|