// v1.11.3: llama-swap model-context cache. Replaces the dead
// `parsed.timings.n_ctx` capture in inference.ts / compaction.ts —
// llama-server's streaming completion never emits n_ctx in timings (verified
// empirically: timings carries prompt_n / predicted_n / *_ms / *_per_second
// only). The authoritative source is llama-swap's
// /upstream/<model>/props endpoint at .default_generation_settings.n_ctx.
//
// Cache design:
//   - Positive entries (n_ctx + total_slots) have no TTL. A model's context
//     size doesn't change while llama-swap is running; an admin endpoint
//     can invalidateModelContext() if it ever does.
//   - Negative entries (failed fetch) have a 60s TTL so a misconfigured or
//     down model doesn't get hammered every inference turn, but recovers
//     within a minute once the upstream comes back.
//   - 3s AbortController timeout covering the whole request (headers AND
//     body read) — long enough for a healthy upstream, short enough that a
//     stuck upstream doesn't block the ctx_max UPDATE that follows.

/** Context limits reported by llama-swap for one model. */
export interface ModelContext {
  /** Context window size from `.default_generation_settings.n_ctx`. */
  n_ctx: number;
  /** Slot count from `.total_slots`; defaults to 1 when the upstream omits it. */
  total_slots: number;
  /** Unix-ms timestamp at which this entry was fetched. */
  fetched_at: number;
}

/** Shape of the subset of the `/props` response body that this module reads. */
interface UpstreamProps {
  default_generation_settings?: { n_ctx?: number };
  total_slots?: number;
}

const NEGATIVE_TTL_MS = 60_000;
const FETCH_TIMEOUT_MS = 3_000;

const positiveCache = new Map<string, ModelContext>();

// Value is the unix-ms timestamp of the last failed fetch. Used to gate
// re-fetches within the 60s window.
const negativeCache = new Map<string, number>();

// Set once at startup by index.ts. We don't import loadConfig() directly
// here to keep this module trivially mockable in tests (set the URL in
// beforeEach instead of stubbing process.env + loadConfig's cache).
let llamaSwapUrl: string | null = null;

/**
 * Sets the llama-swap base URL used by {@link getModelContext}.
 *
 * @param opts.llamaSwapUrl Base URL of the llama-swap instance. A trailing
 *   slash is tolerated; it is stripped when the request URL is built.
 */
export function configureModelContext(opts: { llamaSwapUrl: string }): void {
  llamaSwapUrl = opts.llamaSwapUrl;
}

/** Records a failed lookup for `model` and returns the `null` result. */
function rememberFailure(model: string): null {
  negativeCache.set(model, Date.now());
  return null;
}

/**
 * Returns the context limits for `model`, fetching them from llama-swap on
 * the first request and serving them from cache afterwards.
 *
 * Never throws: every failure (module not configured, network error,
 * timeout, non-2xx status, malformed body) is recorded in the negative
 * cache and reported as `null`.
 *
 * @param model Model name as known to llama-swap; it is URL-encoded before
 *   being placed in the request path.
 * @returns The cached or freshly fetched context, or `null` when it is
 *   unavailable.
 */
export async function getModelContext(model: string): Promise<ModelContext | null> {
  // 1. Positive cache hit — no TTL check, model n_ctx is invariant.
  const cached = positiveCache.get(model);
  if (cached) return cached;

  // 2. Negative cache hit within TTL — return null without refetching.
  //    Stale negative entries (older than the TTL) fall through to a fresh
  //    attempt below; they are either refreshed by another failure or
  //    deleted by the next success.
  const failedAt = negativeCache.get(model);
  if (failedAt !== undefined && Date.now() - failedAt < NEGATIVE_TTL_MS) {
    return null;
  }

  // 3. Module not initialized. Defensive — index.ts calls
  //    configureModelContext at startup; if a test forgets, fail closed so
  //    the chat still works (ctx_max stays null, UI degrades gracefully).
  if (!llamaSwapUrl) return rememberFailure(model);

  // 4. Fetch with timeout. Strip trailing slashes from the base so a URL
  //    configured as "http://host/" doesn't produce "//upstream/...".
  const base = llamaSwapUrl.replace(/\/+$/, '');
  const url = `${base}/upstream/${encodeURIComponent(model)}/props`;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

  try {
    const res = await fetch(url, { signal: controller.signal });
    if (!res.ok) return rememberFailure(model);

    // The abort timer is deliberately still armed here: fetch() resolves as
    // soon as headers arrive, so an upstream that stalls mid-body would
    // otherwise hang this call forever.
    const body = (await res.json()) as UpstreamProps | null;

    const n_ctx = body?.default_generation_settings?.n_ctx;
    // isFinite also rejects NaN and the Infinity that JSON.parse yields for
    // overflowing literals such as 1e999.
    if (typeof n_ctx !== 'number' || !Number.isFinite(n_ctx) || n_ctx <= 0) {
      return rememberFailure(model);
    }

    // total_slots is informational; default to 1 if missing rather than
    // reject the whole response. Most local llama-swap setups run a
    // single slot anyway.
    const reportedSlots = body?.total_slots;
    const total_slots =
      typeof reportedSlots === 'number' && reportedSlots > 0 ? reportedSlots : 1;

    const entry: ModelContext = { n_ctx, total_slots, fetched_at: Date.now() };
    positiveCache.set(model, entry);
    // Clear any stale negative entry so it doesn't linger in the map
    // forever now that the positive entry shadows it.
    negativeCache.delete(model);
    return entry;
  } catch {
    // Timeout abort, network failure, or unparsable JSON: all best-effort
    // failures that end up as a negative cache entry.
    return rememberFailure(model);
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Drops cached results so the next {@link getModelContext} call refetches.
 *
 * @param model Model to invalidate; when omitted, both caches are cleared
 *   entirely.
 */
export function invalidateModelContext(model?: string): void {
  if (model === undefined) {
    positiveCache.clear();
    negativeCache.clear();
  } else {
    positiveCache.delete(model);
    negativeCache.delete(model);
  }
}