Files
boocode/apps/server/src/services/model-context.ts
indifferentketchup 89dcfb95dc v1.11.3: fix ctx_max capture via /props endpoint
- llama-server does not emit n_ctx in timings (confirmed empirically);
  dead code at inference.ts:479 and compaction.ts:300 never fired
- New model-context.ts: cached fetch of /upstream/<model>/props
  with positive-cache (no TTL) and 60s negative-cache
- Wired into all 4 ctx_max write sites: 3 in inference.ts
  (executeToolPhase, finalizeCompletion, runCapHitSummary) and
  1 in compaction.ts (summary row INSERT)
- AbortController 3s timeout, lenient parsing with sensible defaults
- 12 new vitest cases for the cache module (59 total)
- 7 historical assistant rows backfilled manually (see notes)
2026-05-20 19:29:26 +00:00

114 lines
4.4 KiB
TypeScript

// v1.11.3: llama-swap model-context cache. Replaces the dead
// `parsed.timings.n_ctx` capture in inference.ts / compaction.ts —
// llama-server's streaming completion never emits n_ctx in timings (verified
// empirically: timings carries prompt_n / predicted_n / *_ms / *_per_second
// only). The authoritative source is llama-swap's
// /upstream/<model>/props endpoint at .default_generation_settings.n_ctx.
//
// Cache design:
// - Positive entries (n_ctx + total_slots) have no TTL. A model's context
// size doesn't change while llama-swap is running; an admin endpoint
// can invalidateModelContext() if it ever does.
// - Negative entries (failed fetch) have a 60s TTL so a misconfigured or
// down model doesn't get hammered every inference turn, but recovers
// within a minute once the upstream comes back.
// - 3s AbortController timeout on the fetch — long enough for a healthy
// upstream, short enough that a stuck upstream doesn't block the
// ctx_max UPDATE that follows.
export interface ModelContext {
n_ctx: number;
total_slots: number;
fetched_at: number;
}
const NEGATIVE_TTL_MS = 60_000;
const FETCH_TIMEOUT_MS = 3_000;
const positiveCache = new Map<string, ModelContext>();
// Value is the unix-ms timestamp of the last failed fetch. Used to gate
// re-fetches within the 60s window.
const negativeCache = new Map<string, number>();
// Set once at startup by index.ts. We don't import loadConfig() directly
// here to keep this module trivially mockable in tests (set the URL in
// beforeEach instead of stubbing process.env + loadConfig's cache).
let llamaSwapUrl: string | null = null;
export function configureModelContext(opts: { llamaSwapUrl: string }): void {
llamaSwapUrl = opts.llamaSwapUrl;
}
export async function getModelContext(model: string): Promise<ModelContext | null> {
// 1. Positive cache hit — no TTL check, model n_ctx is invariant.
const pos = positiveCache.get(model);
if (pos) return pos;
// 2. Negative cache hit within TTL — return null without refetching.
// Stale negative entries (older than the TTL) fall through to a fresh
// attempt below; we don't delete them eagerly because the next successful
// fetch will overwrite via the positive map and the negative entry
// becomes irrelevant.
const negTs = negativeCache.get(model);
if (negTs !== undefined && Date.now() - negTs < NEGATIVE_TTL_MS) {
return null;
}
// 3. Module not initialized. Defensive — index.ts calls
// configureModelContext at startup; if a test forgets, fail closed so
// the chat still works (ctx_max stays null, UI degrades gracefully).
if (!llamaSwapUrl) {
negativeCache.set(model, Date.now());
return null;
}
// 4. Fetch with timeout. AbortController fires after FETCH_TIMEOUT_MS;
// both the timeout path and a fetch reject end up in the catch below
// and produce a negative cache entry.
const url = `${llamaSwapUrl}/upstream/${encodeURIComponent(model)}/props`;
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
try {
const res = await fetch(url, { signal: controller.signal });
clearTimeout(timer);
if (!res.ok) {
negativeCache.set(model, Date.now());
return null;
}
const body = (await res.json()) as {
default_generation_settings?: { n_ctx?: number };
total_slots?: number;
};
const n_ctx = body?.default_generation_settings?.n_ctx;
if (typeof n_ctx !== 'number' || n_ctx <= 0) {
negativeCache.set(model, Date.now());
return null;
}
// total_slots is informational; default to 1 if missing rather than
// reject the whole response. Most local llama-swap setups run a
// single slot anyway.
const total_slots =
typeof body?.total_slots === 'number' && body.total_slots > 0 ? body.total_slots : 1;
const entry: ModelContext = { n_ctx, total_slots, fetched_at: Date.now() };
positiveCache.set(model, entry);
// Clear any stale negative entry so a future query sees the positive
// hit cleanly (otherwise the negative TTL never expires from the map).
negativeCache.delete(model);
return entry;
} catch {
clearTimeout(timer);
negativeCache.set(model, Date.now());
return null;
}
}
export function invalidateModelContext(model?: string): void {
if (model === undefined) {
positiveCache.clear();
negativeCache.clear();
} else {
positiveCache.delete(model);
negativeCache.delete(model);
}
}