v1.11.3: fix ctx_max capture via /props endpoint

- llama-server does not emit n_ctx in timings (confirmed empirically);
  dead code at inference.ts:479 and compaction.ts:300 never fired
- New model-context.ts: cached fetch of /upstream/<model>/props;
  successful lookups are cached with no TTL, failed lookups for 60s
- Wired into all 4 ctx_max write sites: 3 in inference.ts
  (executeToolPhase, finalizeCompletion, runCapHitSummary) and
  1 in compaction.ts (summary row INSERT)
- AbortController 3s timeout, lenient parsing with sensible defaults
- 12 new vitest cases for the cache module (59 total)
- 7 historical assistant rows backfilled manually (see notes)
This commit is contained in:
2026-05-20 19:29:26 +00:00
parent 8cd270a5da
commit 89dcfb95dc
5 changed files with 361 additions and 18 deletions

View File

@@ -21,6 +21,7 @@ import type { Sql } from '../db.js';
import type { Config } from '../config.js';
import type { Broker } from './broker.js';
import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
import * as modelContextLookup from './model-context.js';
const COMPACTION_BUFFER = 20_000;
const MIN_PRESERVE_RECENT_TOKENS = 2_000;
@@ -271,7 +272,6 @@ interface CompletionResult {
content: string;
promptTokens: number;
completionTokens: number;
nCtx: number | null;
}
async function callLlamaSwap(
@@ -292,14 +292,15 @@ async function callLlamaSwap(
const json = (await res.json()) as {
choices?: Array<{ message?: { content?: string } }>;
usage?: { prompt_tokens?: number; completion_tokens?: number };
timings?: { n_ctx?: number };
};
// v1.11.3: removed the dead `json.timings?.n_ctx` read — llama-server's
// completions don't emit n_ctx in timings. ctx_max on the summary row
// comes from model-context.getModelContext below in process().
const content = json.choices?.[0]?.message?.content ?? '';
const promptTokens = json.usage?.prompt_tokens ?? 0;
const completionTokens = json.usage?.completion_tokens ?? 0;
const nCtx = typeof json.timings?.n_ctx === 'number' ? json.timings.n_ctx : null;
log.debug({ promptTokens, completionTokens, nCtx, chars: content.length }, 'compaction llm complete');
return { content, promptTokens, completionTokens, nCtx };
log.debug({ promptTokens, completionTokens, chars: content.length }, 'compaction llm complete');
return { content, promptTokens, completionTokens };
}
// === entry point ===
@@ -422,6 +423,12 @@ export async function process(input: ProcessInput): Promise<void> {
// 7. Single completion (no tools). Throws on llama-swap failure.
result = await callLlamaSwap(config, session.model, payload, log);
// 7b. v1.11.3: fetch the model's true context window from llama-swap's
// /upstream/<model>/props (the completion response doesn't carry it).
// Same pattern as inference.ts; the cache makes repeated calls free.
const mctx = await modelContextLookup.getModelContext(session.model);
const nCtx = mctx?.n_ctx ?? null;
// 8. Insert the new anchored summary row. role='assistant' per spec; the
// UI distinguishes via summary=true. tail_start_id points at the first
// preserved tail message so debug surfaces / future tools can reason
@@ -436,7 +443,7 @@ export async function process(input: ProcessInput): Promise<void> {
VALUES (
${sessionId}, ${chatId}, 'assistant', ${result.content}, 'message', 'complete',
true, ${sel.tail_start_id},
${result.completionTokens}, ${result.promptTokens}, ${result.nCtx},
${result.completionTokens}, ${result.promptTokens}, ${nCtx},
clock_timestamp(), clock_timestamp()
)
RETURNING id