v1.11.3: fix ctx_max capture via /props endpoint
- llama-server does not emit n_ctx in timings (confirmed empirically); the dead code at inference.ts:479 and compaction.ts:300 never fired.
- New model-context.ts: cached fetch of /upstream/<model>/props with a positive cache (no TTL) and a 60s negative cache.
- Wired into all 4 ctx_max write sites: 3 in inference.ts (executeToolPhase, finalizeCompletion, runCapHitSummary) and 1 in compaction.ts (summary row INSERT).
- AbortController 3s timeout; lenient parsing with sensible defaults.
- 12 new vitest cases for the cache module (59 total).
- 7 historical assistant rows backfilled manually (see notes).
This commit is contained in:
@@ -22,6 +22,7 @@ import { PathScopeError, resolveProjectRoot } from './path_guard.js';
|
||||
import { maybeAutoNameChat } from './auto_name.js';
|
||||
import { getAgentById } from './agents.js';
|
||||
import * as compaction from './compaction.js';
|
||||
import * as modelContext from './model-context.js';
|
||||
import type { Broker } from './broker.js';
|
||||
|
||||
const BASE_SYSTEM_PROMPT = (projectPath: string) =>
|
||||
@@ -138,9 +139,6 @@ interface ChatCompletionChunk {
|
||||
completion_tokens?: number;
|
||||
total_tokens?: number;
|
||||
};
|
||||
timings?: {
|
||||
n_ctx?: number;
|
||||
};
|
||||
}
|
||||
|
||||
export interface InferenceContext {
|
||||
@@ -339,7 +337,6 @@ interface StreamResult {
|
||||
toolCalls: ToolCall[];
|
||||
promptTokens: number | null;
|
||||
completionTokens: number | null;
|
||||
nCtx: number | null;
|
||||
}
|
||||
|
||||
interface StreamOptions {
|
||||
@@ -454,7 +451,6 @@ async function streamCompletion(
|
||||
let finishReason: string | null = null;
|
||||
let promptTokens: number | null = null;
|
||||
let completionTokens: number | null = null;
|
||||
let nCtx: number | null = null;
|
||||
const toolCallsBuffer = new Map<number, { id: string; name: string; argsText: string }>();
|
||||
|
||||
for await (const line of sseLines(res.body)) {
|
||||
@@ -476,9 +472,11 @@ async function streamCompletion(
|
||||
completionTokens = parsed.usage.completion_tokens;
|
||||
}
|
||||
}
|
||||
if (parsed.timings && typeof parsed.timings.n_ctx === 'number') {
|
||||
nCtx = parsed.timings.n_ctx;
|
||||
}
|
||||
// v1.11.3: removed dead `parsed.timings.n_ctx` read. llama-server's
|
||||
// streaming completion does NOT emit n_ctx in timings (verified
|
||||
// empirically); the authoritative source is llama-swap's
|
||||
// /upstream/<model>/props endpoint, looked up (cached) each turn via
|
||||
// model-context.getModelContext() at the finalization sites below.
|
||||
|
||||
const choice = parsed.choices?.[0];
|
||||
if (!choice) continue;
|
||||
@@ -564,7 +562,7 @@ async function streamCompletion(
|
||||
toolCalls.push({ id: t.id || `call_${toolCalls.length}`, name: t.name, args });
|
||||
}
|
||||
|
||||
return { finishReason, content, toolCalls, promptTokens, completionTokens, nCtx };
|
||||
return { finishReason, content, toolCalls, promptTokens, completionTokens };
|
||||
}
|
||||
|
||||
async function executeToolCall(
|
||||
@@ -781,7 +779,14 @@ async function executeToolPhase(
|
||||
projectRoot: string
|
||||
): Promise<void> {
|
||||
const { sessionId, chatId, assistantMessageId, toolsUsed, signal } = args;
|
||||
const { content, toolCalls, promptTokens, completionTokens, nCtx } = result;
|
||||
const { content, toolCalls, promptTokens, completionTokens } = result;
|
||||
|
||||
// v1.11.3: ctx_max comes from llama-swap /upstream/<model>/props, not the
|
||||
// streaming completion (which doesn't emit n_ctx). getModelContext caches
|
||||
// the positive lookup for the process lifetime, so this is a single Map
|
||||
// hit after the first invocation per model.
|
||||
const mctx = await modelContext.getModelContext(session.model);
|
||||
const nCtx = mctx?.n_ctx ?? null;
|
||||
|
||||
const [updated] = await ctx.sql<
|
||||
{ tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
|
||||
@@ -917,7 +922,11 @@ async function finalizeCompletion(
|
||||
session: Session
|
||||
): Promise<void> {
|
||||
const { sessionId, chatId, assistantMessageId } = args;
|
||||
const { content, finishReason, promptTokens, completionTokens, nCtx } = result;
|
||||
const { content, finishReason, promptTokens, completionTokens } = result;
|
||||
|
||||
// v1.11.3: see executeToolPhase for the rationale.
|
||||
const mctx = await modelContext.getModelContext(session.model);
|
||||
const nCtx = mctx?.n_ctx ?? null;
|
||||
|
||||
const [updated] = await ctx.sql<
|
||||
{ tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
|
||||
@@ -1150,6 +1159,9 @@ async function runCapHitSummary(
|
||||
// even on a partial / failed summary the chat history shows where the
|
||||
// budget was hit.
|
||||
if (summaryOk && result) {
|
||||
// v1.11.3: see executeToolPhase for the rationale.
|
||||
const mctx = await modelContext.getModelContext(session.model);
|
||||
const nCtx = mctx?.n_ctx ?? null;
|
||||
const [updated] = await ctx.sql<
|
||||
{ tokens_used: number | null; ctx_used: number | null; ctx_max: number | null; finished_at: string | null }[]
|
||||
>`
|
||||
@@ -1158,7 +1170,7 @@ async function runCapHitSummary(
|
||||
status = 'complete',
|
||||
tokens_used = ${result.completionTokens},
|
||||
ctx_used = ${result.promptTokens},
|
||||
ctx_max = ${result.nCtx},
|
||||
ctx_max = ${nCtx},
|
||||
finished_at = clock_timestamp()
|
||||
WHERE id = ${assistantMessageId}
|
||||
RETURNING tokens_used, ctx_used, ctx_max, finished_at
|
||||
|
||||
Reference in New Issue
Block a user