Recon during planning disproved the original v1.13.7 (DB-cache) premise: buildSystemPrompt already runs over inputs mtime-cached at the file layer (BOOCHAT.md in system-prompt.ts:25, AGENTS.md global+per-project in agents.ts:245), and DB scalars are byte-stable until edited. The output is microsecond pure-string concat with no I/O. Skills aren't in the prefix; tools live in a separate request body field alpha-sorted by v1.13.3. This batch closes the verification gap with instrumentation, not implementation: - system-prompt.ts: buildSystemPromptWithFingerprint canonical impl computes SHA-256 over the assembled prefix, runs a per-session Map<sessionId, lastHash> observer, emits PrefixFingerprint per call and PrefixDrift (with field-level changed_inputs) on hash change. buildSystemPrompt is now a thin shim returning .prompt. - agents.ts: getAgentsMtimes accessor — cache-read only, no I/O. - payload.ts: buildMessagesPayload takes optional log argument; when passed, emits prefix-fingerprint (info) + prefix-drift (warn). - turn.ts + sentinel-summaries.ts: pass ctx.log at 3 production call sites; sentinel summaries log too so any drift across cap-hit / doom-loop paths surfaces. - system-prompt.test.ts: 4 new tests (byte-identical, no-drift-on- stable, drift-fires-with-changed-inputs, cross-session-no-drift). 194/194 tests pass (was 190). Smoke: 5 messages in a fresh session produced 7 prefix-fingerprint logs (extras from buildMessagesPayload being called from sentinel summary paths), all with identical prefix_hash and prefix_length=2907, zero prefix-drift. Prefix is byte-stable in steady-state. Decision: original system_prompt_cache DB table from the roadmap is permanently dropped. The v1.12.0 mtime caches at the input layer plus alpha tool ordering at the request body (v1.13.3) already address the load-bearing cache-stability surfaces. Instrumentation stays so the claim can be re-verified at any time.
330 lines
12 KiB
TypeScript
330 lines
12 KiB
TypeScript
import type { FastifyBaseLogger } from 'fastify';
|
|
import type { Sql } from '../../db.js';
|
|
import type { Config } from '../../config.js';
|
|
import type {
|
|
Agent,
|
|
ErrorReason,
|
|
Message,
|
|
MessageMetadata,
|
|
Project,
|
|
Session,
|
|
ToolCall,
|
|
UserStreamFrame,
|
|
} from '../../types/api.js';
|
|
import { ALL_TOOLS } from '../tools.js';
|
|
import { resolveProjectRoot } from '../path_guard.js';
|
|
import { maybeAutoNameChat } from '../auto_name.js';
|
|
import { getAgentById } from '../agents.js';
|
|
import * as compaction from '../compaction.js';
|
|
import * as modelContext from '../model-context.js';
|
|
import type { Broker } from '../broker.js';
|
|
import { resolveToolBudget } from './budget.js';
|
|
import {
|
|
DOOM_LOOP_THRESHOLD,
|
|
detectDoomLoop,
|
|
} from './sentinels.js';
|
|
import {
|
|
buildMessagesPayload,
|
|
loadContext,
|
|
} from './payload.js';
|
|
import {
|
|
finalizeCompletion,
|
|
handleAbortOrError,
|
|
} from './error-handler.js';
|
|
import {
|
|
executeStreamPhase,
|
|
streamCompletion,
|
|
} from './stream-phase.js';
|
|
import { executeToolPhase } from './tool-phase.js';
|
|
import { DB_FLUSH_INTERVAL_MS, type StreamPhaseState } from './types.js';
|
|
import {
|
|
runCapHitSummary,
|
|
runDoomLoopSummary,
|
|
} from './sentinel-summaries.js';
|
|
|
|
// v1.12.4: re-exported so external callers (tests, future consumers) keep
|
|
// importing from services/inference.js as the public surface.
|
|
export { detectDoomLoop, DOOM_LOOP_THRESHOLD } from './sentinels.js';
|
|
export { buildMessagesPayload } from './payload.js';
|
|
|
|
export interface InferenceFrame {
|
|
type:
|
|
| 'message_started'
|
|
| 'delta'
|
|
| 'tool_call'
|
|
| 'tool_result'
|
|
| 'message_complete'
|
|
| 'usage'
|
|
| 'messages_deleted'
|
|
| 'session_renamed'
|
|
| 'chat_renamed'
|
|
| 'error';
|
|
message_id?: string;
|
|
message_ids?: string[];
|
|
chat_id?: string;
|
|
tool_message_id?: string;
|
|
tool_call_id?: string;
|
|
// v1.8.2: 'system' added so cap-hit sentinel messages can announce themselves
|
|
// through the normal message_started → delta → message_complete sequence.
|
|
role?: 'assistant' | 'tool' | 'user' | 'system';
|
|
content?: string;
|
|
tool_call?: ToolCall;
|
|
output?: unknown;
|
|
truncated?: boolean;
|
|
error?: string;
|
|
// v1.8.2: structured error reason. Set on `type: 'error'` so the UI can
|
|
// surface a specific message; `error` stays the human-readable text.
|
|
reason?: ErrorReason;
|
|
// v1.8.2: piggybacks on `message_complete` so static or terminally-resolved
|
|
// messages can carry their persisted metadata to the live stream without a
|
|
// refetch (sentinels carry { kind: 'cap_hit', ... }; failed messages carry
|
|
// { kind: 'error', ... }).
|
|
metadata?: MessageMetadata | null;
|
|
tokens_used?: number | null;
|
|
ctx_used?: number | null;
|
|
ctx_max?: number | null;
|
|
completion_tokens?: number | null;
|
|
started_at?: string | null;
|
|
finished_at?: string | null;
|
|
model?: string;
|
|
session_id?: string;
|
|
name?: string;
|
|
}
|
|
|
|
export type FramePublisher = (sessionId: string, frame: InferenceFrame) => void;
|
|
|
|
export interface InferenceContext {
|
|
sql: Sql;
|
|
config: Config;
|
|
log: FastifyBaseLogger;
|
|
publish: FramePublisher;
|
|
publishUser: (frame: UserStreamFrame) => void;
|
|
// v1.11: passed through so compaction.process can publish 'compacted'
|
|
// frames on the same session WS channel useSessionStream subscribes to.
|
|
// Compaction is the only path that needs the raw broker handle (regular
|
|
// inference goes through `publish`); keeping a separate field avoids
|
|
// tempting other code paths into bypassing the session-id binding.
|
|
broker: Broker;
|
|
}
|
|
|
|
// v1.12.4: payload assembly extracted to ./inference/payload.ts (tests
|
|
// import buildMessagesPayload from this module, so a re-export below
|
|
// preserves the public surface). Stream + tool phases extracted to
|
|
// ./inference/stream-phase.ts and ./inference/tool-phase.ts.
|
|
|
|
export interface StreamResult {
|
|
finishReason: string | null;
|
|
content: string;
|
|
toolCalls: ToolCall[];
|
|
promptTokens: number | null;
|
|
completionTokens: number | null;
|
|
// v1.13.1-C: reasoning text accumulated across reasoning-delta parts.
|
|
// Empty string when the model doesn't emit reasoning (most cases).
|
|
reasoning: string;
|
|
}
|
|
|
|
|
|
export interface TurnArgs {
|
|
sessionId: string;
|
|
chatId: string;
|
|
assistantMessageId: string;
|
|
// v1.8.2: cumulative tool calls executed this run. Compared against the
|
|
// resolved budget at the top of each turn. Replaces the older `depth`
|
|
// counter (which counted iterations, not invocations).
|
|
toolsUsed: number;
|
|
// v1.11.6: ordered tool calls executed in this user-message turn (across
|
|
// recursive runAssistantTurn invocations). Reset to [] at user-message
|
|
// boundaries by runInference, same as toolsUsed. Doom-loop check at the
|
|
// top of runAssistantTurn slices the last DOOM_LOOP_THRESHOLD entries.
|
|
recentToolCalls: ToolCall[];
|
|
signal: AbortSignal | undefined;
|
|
}
|
|
|
|
|
|
export async function runAssistantTurn(
|
|
ctx: InferenceContext,
|
|
args: TurnArgs,
|
|
): Promise<void> {
|
|
const { sessionId, chatId } = args;
|
|
|
|
// v1.11: if the prior turn flagged this chat for compaction, run it first
|
|
// so loadContext below reads the post-compaction history. We swallow
|
|
// compaction failures (clearing the flag so we don't loop) and proceed
|
|
// with the un-compacted history — a slow turn that hits the model's
|
|
// hard limit is recoverable; a dead session is not.
|
|
const chatFlag = await ctx.sql<{ needs_compaction: boolean }[]>`
|
|
SELECT needs_compaction FROM chats WHERE id = ${chatId}
|
|
`;
|
|
if (chatFlag[0]?.needs_compaction) {
|
|
try {
|
|
await compaction.process({
|
|
sql: ctx.sql,
|
|
config: ctx.config,
|
|
log: ctx.log,
|
|
broker: ctx.broker,
|
|
chatId,
|
|
});
|
|
} catch (err) {
|
|
ctx.log.warn({ err, chatId }, 'auto-compaction failed; clearing flag and proceeding');
|
|
await ctx.sql`UPDATE chats SET needs_compaction = false WHERE id = ${chatId}`;
|
|
}
|
|
}
|
|
|
|
const loaded = await loadContext(ctx.sql, sessionId, chatId);
|
|
if (!loaded) {
|
|
ctx.log.warn({ sessionId }, 'inference: session or project missing');
|
|
return;
|
|
}
|
|
const { session, project, history } = loaded;
|
|
const projectRoot = await resolveProjectRoot(project.path);
|
|
// Agent resolution is per-turn so PATCH agent_id mid-conversation takes
|
|
// effect on the next message. Unknown agent_id returns null silently —
|
|
// session falls back to base prompt + all tools + default temperature.
|
|
const agent = session.agent_id
|
|
? await getAgentById(project.path, session.agent_id)
|
|
: null;
|
|
|
|
// v1.8.2: cap-hit replaces the older "tool loop depth exceeded" failure.
|
|
// When we've already burned the budget *before* this turn even runs, we
|
|
// skip straight to the summary flow — the in-flight assistant message slot
|
|
// gets reused for the wrap-up reply instead of being marked failed.
|
|
const budget = resolveToolBudget(agent);
|
|
if (args.toolsUsed >= budget) {
|
|
await runCapHitSummary(ctx, args, session, project, history, agent, budget);
|
|
return;
|
|
}
|
|
|
|
// v1.11.6: doom-loop guard. Detected BEFORE the budget cap (the model can
|
|
// burn through 3 identical calls long before the 15-call budget fires).
|
|
// Same in-flight-slot-reuse pattern as runCapHitSummary — wrap-up reply
|
|
// lands in args.assistantMessageId, then a doom_loop sentinel is inserted
|
|
// to make the abort visible in the chat history.
|
|
const loop = detectDoomLoop(args.recentToolCalls);
|
|
if (loop) {
|
|
await runDoomLoopSummary(ctx, args, session, project, history, agent, loop);
|
|
return;
|
|
}
|
|
|
|
const messages = await buildMessagesPayload(session, project, history, agent, ctx.log);
|
|
|
|
// v1.11.8: resolve per-chat web-tools opt-in. Tri-state on the wire:
|
|
// - session.web_search_enabled = null → inherit project default
|
|
// - session.web_search_enabled = true/false → explicit
|
|
// Both web_search and web_fetch are gated by this single flag (the UI
|
|
// label is "Enable web search and fetch" — same store, both tools).
|
|
// Default is false unless explicitly opted in, matching the v1.9
|
|
// plumbing intent ("inert until Batch 8 ships the actual tools").
|
|
const webToolsEnabled =
|
|
session.web_search_enabled ?? project.default_web_search_enabled ?? false;
|
|
|
|
const state: StreamPhaseState = { accumulated: '', startedAt: null };
|
|
let result: StreamResult;
|
|
try {
|
|
result = await executeStreamPhase(ctx, args, session, messages, state, agent, webToolsEnabled);
|
|
} catch (err) {
|
|
await handleAbortOrError(ctx, args, state.accumulated, err);
|
|
return;
|
|
}
|
|
|
|
if (result.toolCalls.length > 0) {
|
|
await executeToolPhase(ctx, args, result, state.startedAt, session, projectRoot);
|
|
return;
|
|
}
|
|
|
|
await finalizeCompletion(ctx, args, result, state.startedAt, session);
|
|
}
|
|
|
|
export async function runInference(
|
|
ctx: InferenceContext,
|
|
sessionId: string,
|
|
chatId: string,
|
|
assistantMessageId: string,
|
|
signal?: AbortSignal
|
|
): Promise<void> {
|
|
// v1.8.2: every fresh inference (initial send, regenerate, force_send,
|
|
// continue) starts with a clean budget. Tool-call accumulation across
|
|
// Continue invocations is what the hard ceiling guards against, not the
|
|
// per-call budget.
|
|
// v1.11.6: recentToolCalls also resets — doom-loop detection is scoped
|
|
// to a single user-message turn, so a Continue starts with no history.
|
|
return runAssistantTurn(ctx, {
|
|
sessionId,
|
|
chatId,
|
|
assistantMessageId,
|
|
toolsUsed: 0,
|
|
recentToolCalls: [],
|
|
signal,
|
|
});
|
|
}
|
|
|
|
// v1.8.2: cap-hit summary flow. Called instead of erroring when the loop
|
|
// hits its budget. Reuses the in-flight assistant message slot to stream a
|
|
// short wrap-up reply with the synthetic note prepended and tools disabled,
|
|
// then always inserts a cap_hit sentinel afterward (regardless of summary
|
|
// outcome) so the UI can show a Continue affordance.
|
|
interface InferenceRegistration {
|
|
controller: AbortController;
|
|
completed: Promise<void>;
|
|
}
|
|
|
|
export function createInferenceRunner(
|
|
ctx: Omit<InferenceContext, 'publishUser'>,
|
|
publishUserFn: (user: string, frame: UserStreamFrame) => void
|
|
) {
|
|
const registry = new Map<string, InferenceRegistration>();
|
|
|
|
return {
|
|
enqueue(sessionId: string, chatId: string, assistantMessageId: string, user: string) {
|
|
const callCtx: InferenceContext = {
|
|
...ctx,
|
|
publishUser: (frame) => publishUserFn(user, frame),
|
|
// v1.11: broker comes in via ctx (set at registration time). Repeated
|
|
// here so the destructure carries it onto the per-call ctx without
|
|
// having to add it to every enqueue/cancel signature individually.
|
|
broker: ctx.broker,
|
|
};
|
|
// v1.8 mobile-tabs: announce working before the async loop starts so
|
|
// every device subscribed to the user channel sees the amber dot.
|
|
callCtx.publishUser({ type: 'chat_status', chat_id: chatId, status: 'streaming', at: new Date().toISOString() });
|
|
const controller = new AbortController();
|
|
let resolveCompleted!: () => void;
|
|
const completed = new Promise<void>((res) => { resolveCompleted = res; });
|
|
const registration: InferenceRegistration = { controller, completed };
|
|
registry.set(chatId, registration);
|
|
void (async () => {
|
|
try {
|
|
await runInference(callCtx, sessionId, chatId, assistantMessageId, controller.signal);
|
|
setImmediate(() => {
|
|
void maybeAutoNameChat(callCtx, chatId, sessionId).catch((err: Error) => {
|
|
callCtx.log.warn({ err, chatId }, 'auto-name failed');
|
|
});
|
|
});
|
|
} catch (err) {
|
|
callCtx.log.error({ err }, 'unhandled inference error');
|
|
} finally {
|
|
resolveCompleted();
|
|
// Only clear our own registration; a force-send may have replaced it.
|
|
if (registry.get(chatId) === registration) {
|
|
registry.delete(chatId);
|
|
}
|
|
}
|
|
})();
|
|
},
|
|
|
|
async cancel(_sessionId: string, chatId: string): Promise<boolean> {
|
|
const reg = registry.get(chatId);
|
|
if (!reg) return false;
|
|
reg.controller.abort();
|
|
// Swallow — we just need to wait for the catch/finally to persist state.
|
|
await reg.completed.catch(() => {});
|
|
return true;
|
|
},
|
|
|
|
hasActive(chatId: string): boolean {
|
|
return registry.has(chatId);
|
|
},
|
|
};
|
|
}
|
|
|
|
export const _toolNames = ALL_TOOLS.map((t) => t.name);
|