v1.11.3: fix ctx_max capture via /props endpoint

- llama-server does not emit n_ctx in timings (confirmed empirically); the dead code at inference.ts:479 and compaction.ts:300 never fired
- New model-context.ts: cached fetch of /upstream/<model>/props with a positive cache (no TTL) and a 60s negative cache
- Wired into all 4 ctx_max write sites: 3 in inference.ts (executeToolPhase, finalizeCompletion, runCapHitSummary) and 1 in compaction.ts (summary row INSERT)
- AbortController 3s timeout, lenient parsing with sensible defaults
- 12 new vitest cases for the cache module (59 total)
- 7 historical assistant rows backfilled manually (see notes)
2026-05-20 19:29:26 +00:00
parent 8cd270a5da
commit 89dcfb95dc
5 changed files with 361 additions and 18 deletions
--- a/apps/server/src/index.ts
+++ b/apps/server/src/index.ts
@@ -20,6 +20,7 @@ import { createInferenceRunner } from './services/inference.js';
 import { createBroker } from './services/broker.js';
 import { listSkills } from './services/skills.js';
 import * as compaction from './services/compaction.js';
+import { configureModelContext } from './services/model-context.js';

 async function main() {
  const config = loadConfig();
@@ -48,6 +49,11 @@ async function main() {
  await applySchema(sql);
  app.log.info('database schema applied');

+  // v1.11.3: tell the model-context cache where llama-swap lives. Cache
+  // lookups go to ${LLAMA_SWAP_URL}/upstream/<model>/props to read
+  // default_generation_settings.n_ctx — the value persisted as messages.ctx_max.
+  configureModelContext({ llamaSwapUrl: config.LLAMA_SWAP_URL });
+
  await app.register(fastifyWebsocket);

  app.get('/api/health', async () => {