- llama-server does not emit n_ctx in timings (confirmed empirically); the dead code at inference.ts:479 and compaction.ts:300 never fired.
- New model-context.ts: cached fetch of /upstream/<model>/props with a positive cache (no TTL) and a 60s negative cache.
- Wired into all 4 ctx_max write sites: 3 in inference.ts (executeToolPhase, finalizeCompletion, runCapHitSummary) and 1 in compaction.ts (summary row INSERT).
- AbortController 3s timeout; lenient parsing with sensible defaults.
- 12 new vitest cases for the cache module (59 total).
- 7 historical assistant rows backfilled manually (see notes).
114 lines
4.4 KiB
TypeScript
114 lines
4.4 KiB
TypeScript
// v1.11.3: llama-swap model-context cache. Replaces the dead
// `parsed.timings.n_ctx` capture in inference.ts / compaction.ts —
// llama-server's streaming completion never emits n_ctx in timings (verified
// empirically: timings carries prompt_n / predicted_n / *_ms / *_per_second
// only). The authoritative source is llama-swap's
// /upstream/<model>/props endpoint at .default_generation_settings.n_ctx.
//
// Cache design:
//   - Positive entries (n_ctx + total_slots) have no TTL. A model's context
//     size doesn't change while llama-swap is running; an admin endpoint
//     can call invalidateModelContext() if it ever does.
//   - Negative entries (failed fetch) have a 60s TTL so a misconfigured or
//     down model doesn't get hammered every inference turn, but recovers
//     within a minute once the upstream comes back.
//   - 3s AbortController timeout on the fetch — long enough for a healthy
//     upstream, short enough that a stuck upstream doesn't block the
//     ctx_max UPDATE that follows.
|
|
|
|
/**
 * Context-window facts for one model, as stored in the positive cache and
 * returned by `getModelContext`.
 */
export interface ModelContext {
  /** Context size read from `default_generation_settings.n_ctx` of the props response. */
  n_ctx: number;

  /** Slot count read from `total_slots` of the props response; informational. */
  total_slots: number;

  /** `Date.now()` value (unix milliseconds) captured when the entry was built. */
  fetched_at: number;
}
|
|
|
|
/** How long (ms) a failed lookup suppresses further fetch attempts for the same model. */
const NEGATIVE_TTL_MS = 60 * 1_000;

/** Upper bound (ms) on a single props request before it is aborted. */
const FETCH_TIMEOUT_MS = 3 * 1_000;
|
|
|
|
// Successful lookups keyed by model name. Entries are never expired by age.
const positiveCache: Map<string, ModelContext> = new Map();

// Failed lookups keyed by model name. The value is the unix-ms timestamp of
// the most recent failure and gates re-fetches inside the 60s window.
const negativeCache: Map<string, number> = new Map();

// Base URL of llama-swap, assigned once at startup by index.ts. This module
// deliberately does not import loadConfig(): tests can simply set the URL in
// beforeEach rather than stubbing process.env and loadConfig's own cache.
let llamaSwapUrl: string | null = null;
|
|
|
|
/**
 * Record the llama-swap base URL that `getModelContext` will query.
 *
 * @param opts - Options bag; `llamaSwapUrl` is stored as given.
 */
export function configureModelContext(opts: { llamaSwapUrl: string }): void {
  const { llamaSwapUrl: configuredUrl } = opts;
  llamaSwapUrl = configuredUrl;
}
|
|
|
|
/**
 * Look up the context size and slot count for `model`, consulting the
 * in-memory caches before asking llama-swap's `/upstream/<model>/props`.
 *
 * Never throws: every failure mode (module not configured, network error,
 * timeout, non-2xx status, unparsable or incomplete body) records a negative
 * cache entry and resolves to `null`.
 *
 * @param model - Model name; used as the cache key and URL-encoded into the
 *   request path.
 * @returns The cached or freshly fetched entry, or `null` when the value is
 *   unavailable.
 */
export async function getModelContext(model: string): Promise<ModelContext | null> {
  // 1. Positive cache hit — no TTL check, model n_ctx is invariant.
  const pos = positiveCache.get(model);
  if (pos) return pos;

  // 2. Negative cache hit within TTL — return null without refetching.
  //    Stale negative entries (older than the TTL) fall through to a fresh
  //    attempt below; they are not deleted eagerly because a successful
  //    fetch removes them and a failed one overwrites the timestamp.
  const negTs = negativeCache.get(model);
  if (negTs !== undefined && Date.now() - negTs < NEGATIVE_TTL_MS) {
    return null;
  }

  // 3. Module not initialized. Defensive — index.ts calls
  //    configureModelContext at startup; if a test forgets, fail closed so
  //    the chat still works (ctx_max stays null, UI degrades gracefully).
  if (!llamaSwapUrl) {
    negativeCache.set(model, Date.now());
    return null;
  }

  // 4. Fetch with timeout. Trailing slashes on the configured base URL are
  //    dropped so the request path never starts with `//upstream/...`.
  const baseUrl = llamaSwapUrl.replace(/\/+$/, '');
  const url = `${baseUrl}/upstream/${encodeURIComponent(model)}/props`;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(url, { signal: controller.signal });
    if (!res.ok) {
      negativeCache.set(model, Date.now());
      return null;
    }

    // The timer is intentionally still armed here: an upstream that sends
    // headers and then stalls the body must not block the caller. Aborting
    // the signal rejects the pending json() read, which lands in the catch.
    const body = (await res.json()) as {
      default_generation_settings?: { n_ctx?: number };
      total_slots?: number;
    } | null;

    // JSON can carry values such as 1e999 that parse to Infinity, so require
    // a finite positive number rather than just `typeof === 'number'`.
    const n_ctx = body?.default_generation_settings?.n_ctx;
    if (typeof n_ctx !== 'number' || !Number.isFinite(n_ctx) || n_ctx <= 0) {
      negativeCache.set(model, Date.now());
      return null;
    }

    // total_slots is informational; default to 1 if missing or invalid rather
    // than reject the whole response. Most local llama-swap setups run a
    // single slot anyway.
    const rawSlots = body?.total_slots;
    const total_slots =
      typeof rawSlots === 'number' && Number.isFinite(rawSlots) && rawSlots > 0 ? rawSlots : 1;

    const entry: ModelContext = { n_ctx, total_slots, fetched_at: Date.now() };
    positiveCache.set(model, entry);
    // Drop any earlier failure record so it does not linger in the map.
    negativeCache.delete(model);
    return entry;
  } catch {
    // Timeout, network failure, and body-parse failure are all treated the
    // same way: remember the failure and report "unknown" to the caller.
    negativeCache.set(model, Date.now());
    return null;
  } finally {
    // Single cleanup point for every exit path, success or failure.
    clearTimeout(timer);
  }
}
|
|
|
|
/**
 * Forget cached lookups so the next `getModelContext` call refetches.
 *
 * @param model - When given, only that model's positive and negative entries
 *   are removed; when omitted, both caches are emptied entirely.
 */
export function invalidateModelContext(model?: string): void {
  if (model !== undefined) {
    positiveCache.delete(model);
    negativeCache.delete(model);
    return;
  }

  positiveCache.clear();
  negativeCache.clear();
}
|