v1.11.3: fix ctx_max capture via /props endpoint

- llama-server does not emit n_ctx in timings (confirmed empirically), so the reads at inference.ts:479 and compaction.ts:300 were dead code that never fired
- New model-context.ts: cached fetch of /upstream/<model>/props with a positive cache (no TTL) and a 60s negative cache
- Wired into all 4 ctx_max write sites: 3 in inference.ts (executeToolPhase, finalizeCompletion, runCapHitSummary) and 1 in compaction.ts (summary row INSERT)
- AbortController 3s timeout, lenient parsing with sensible defaults
- 12 new vitest cases for the cache module (59 total)
- 7 historical assistant rows backfilled manually (see notes)
2026-05-20 19:29:26 +00:00
parent 8cd270a5da
commit 89dcfb95dc
5 changed files with 361 additions and 18 deletions
--- a/apps/server/src/services/compaction.ts
+++ b/apps/server/src/services/compaction.ts
@@ -21,6 +21,7 @@ import type { Sql } from '../db.js';
 import type { Config } from '../config.js';
 import type { Broker } from './broker.js';
 import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
+import * as modelContextLookup from './model-context.js';

 const COMPACTION_BUFFER = 20_000;
 const MIN_PRESERVE_RECENT_TOKENS = 2_000;
@@ -271,7 +272,6 @@ interface CompletionResult {
  content: string;
  promptTokens: number;
  completionTokens: number;
-  nCtx: number | null;
 }

 async function callLlamaSwap(
@@ -292,14 +292,15 @@ async function callLlamaSwap(
  const json = (await res.json()) as {
    choices?: Array<{ message?: { content?: string } }>;
    usage?: { prompt_tokens?: number; completion_tokens?: number };
-    timings?: { n_ctx?: number };
  };
+  // v1.11.3: removed the dead `json.timings?.n_ctx` read — llama-server's
+  // completions don't emit n_ctx in timings. ctx_max on the summary row
+  // comes from model-context.getModelContext below in process().
  const content = json.choices?.[0]?.message?.content ?? '';
  const promptTokens = json.usage?.prompt_tokens ?? 0;
  const completionTokens = json.usage?.completion_tokens ?? 0;
-  const nCtx = typeof json.timings?.n_ctx === 'number' ? json.timings.n_ctx : null;
-  log.debug({ promptTokens, completionTokens, nCtx, chars: content.length }, 'compaction llm complete');
-  return { content, promptTokens, completionTokens, nCtx };
+  log.debug({ promptTokens, completionTokens, chars: content.length }, 'compaction llm complete');
+  return { content, promptTokens, completionTokens };
 }

 // === entry point ===
@@ -422,6 +423,12 @@ export async function process(input: ProcessInput): Promise<void> {
    // 7. Single completion (no tools). Throws on llama-swap failure.
    result = await callLlamaSwap(config, session.model, payload, log);

+    // 7b. v1.11.3: fetch the model's context window from llama-swap's
+    // /upstream/<model>/props (the completion response doesn't carry it).
+    // Same lookup as inference.ts; cached, so repeat calls are cheap.
+    const mctx = await modelContextLookup.getModelContext(session.model);
+    const nCtx = mctx?.n_ctx ?? null;
+
    // 8. Insert the new anchored summary row. role='assistant' per spec; the
    // UI distinguishes via summary=true. tail_start_id points at the first
    // preserved tail message so debug surfaces / future tools can reason
@@ -436,7 +443,7 @@ export async function process(input: ProcessInput): Promise<void> {
      VALUES (
        ${sessionId}, ${chatId}, 'assistant', ${result.content}, 'message', 'complete',
        true, ${sel.tail_start_id},
-        ${result.completionTokens}, ${result.promptTokens}, ${result.nCtx},
+        ${result.completionTokens}, ${result.promptTokens}, ${nCtx},
        clock_timestamp(), clock_timestamp()
      )
      RETURNING id