// boocode/apps/server/src/services/model-context.ts

// v1.11.3: llama-swap model-context cache. Replaces the dead
// `parsed.timings.n_ctx` capture in inference.ts / compaction.ts —
// llama-server's streaming completion never emits n_ctx in timings (verified
// empirically: timings carries prompt_n / predicted_n / *_ms / *_per_second
// only). The authoritative source is llama-swap's
// /upstream/<model>/props endpoint at .default_generation_settings.n_ctx.
//
// Cache design:
//   - Positive entries (n_ctx + total_slots) have no TTL. A model's context
//     size doesn't change while llama-swap is running; an admin endpoint
//     can invalidateModelContext() if it ever does.
//   - Negative entries (failed fetch) have a 60s TTL so a misconfigured or
//     down model doesn't get hammered every inference turn, but recovers
//     within a minute once the upstream comes back.
//   - 3s AbortController timeout on the fetch — long enough for a healthy
//     upstream, short enough that a stuck upstream doesn't block the
//     ctx_max UPDATE that follows.

/**
 * Context-window metadata for a single model, as produced and cached by
 * getModelContext().
 */
export interface ModelContext {
  /**
   * Context size taken from `default_generation_settings.n_ctx` of the props
   * response. getModelContext() only stores values greater than zero.
   */
  n_ctx: number;
  /**
   * Slot count taken from `total_slots` of the props response; set to 1 when
   * the response has no positive numeric value for it.
   */
  total_slots: number;
  /** `Date.now()` (unix ms) captured when the entry was built. */
  fetched_at: number;
}

// How long (ms) a failed lookup suppresses further fetch attempts for the
// same model.
const NEGATIVE_TTL_MS = 60_000;
// Delay (ms) after which getModelContext() aborts its in-flight fetch.
const FETCH_TIMEOUT_MS = 3_000;

// Successful lookups keyed by model name. Nothing in this module expires
// these entries; they are only removed via invalidateModelContext().
const positiveCache = new Map<string, ModelContext>();
// Value is the unix-ms timestamp of the last failed fetch. Used to gate
// re-fetches within the 60s window.
const negativeCache = new Map<string, number>();

// Set once at startup by index.ts. We don't import loadConfig() directly
// here to keep this module trivially mockable in tests (set the URL in
// beforeEach instead of stubbing process.env + loadConfig's cache).
// While this is null, getModelContext() records a negative entry and
// returns null without attempting a fetch.
let llamaSwapUrl: string | null = null;

/**
 * Sets the llama-swap base URL that getModelContext() fetches from.
 *
 * Trailing slashes are stripped so that appending `/upstream/<model>/props`
 * never produces a `//upstream/...` path when the configured URL ends in `/`.
 *
 * @param opts.llamaSwapUrl Base URL of the llama-swap instance, e.g.
 *   `http://localhost:8080` (with or without a trailing slash).
 */
export function configureModelContext(opts: { llamaSwapUrl: string }): void {
  llamaSwapUrl = opts.llamaSwapUrl.replace(/\/+$/, '');
}

/**
 * Resolves the context size (`n_ctx`) and slot count for `model` from
 * llama-swap's `/upstream/<model>/props` endpoint, with caching.
 *
 * Lookup order:
 *   1. Positive cache: returned as-is, no TTL.
 *   2. Negative cache: a failure younger than NEGATIVE_TTL_MS short-circuits
 *      to `null` without touching the network.
 *   3. Network fetch, bounded end-to-end (headers AND body) by
 *      FETCH_TIMEOUT_MS.
 *
 * Fails closed: an unconfigured module, a timeout, a network error, a
 * non-2xx status, or a malformed body all record a negative entry and
 * resolve to `null` rather than rejecting.
 *
 * @param model Model name as llama-swap knows it; URL-encoded into the path.
 * @returns The cached or freshly fetched entry, or `null` when unavailable.
 */
export async function getModelContext(model: string): Promise<ModelContext | null> {
  // 1. Positive cache hit: no TTL check, model n_ctx is treated as invariant.
  const pos = positiveCache.get(model);
  if (pos) return pos;

  // 2. Negative cache hit within TTL: return null without refetching.
  // Entries older than the TTL fall through to a fresh attempt; they are
  // either refreshed on another failure or deleted on success below.
  const negTs = negativeCache.get(model);
  if (negTs !== undefined && Date.now() - negTs < NEGATIVE_TTL_MS) {
    return null;
  }

  // 3. Module not initialized. Defensive: index.ts calls
  // configureModelContext at startup; if a test forgets, fail closed so
  // the chat still works (ctx_max stays null, UI degrades gracefully).
  if (!llamaSwapUrl) {
    negativeCache.set(model, Date.now());
    return null;
  }

  // 4. Fetch with timeout. The timer stays armed until the body has been
  // read and parsed: aborting the signal after headers arrive still errors
  // the body stream, so an upstream that sends headers and then stalls
  // cannot hang the caller. Timer cleanup happens once, in `finally`.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    // Built inside the try: encodeURIComponent throws URIError on malformed
    // surrogate pairs, which should fail closed like any other error.
    const url = `${llamaSwapUrl}/upstream/${encodeURIComponent(model)}/props`;
    const res = await fetch(url, { signal: controller.signal });
    if (!res.ok) {
      // Discard the unread body so the underlying connection is released
      // promptly instead of waiting for garbage collection.
      void res.body?.cancel().catch(() => undefined);
      negativeCache.set(model, Date.now());
      return null;
    }
    const body = (await res.json()) as {
      default_generation_settings?: { n_ctx?: number };
      total_slots?: number;
    };
    const n_ctx = body?.default_generation_settings?.n_ctx;
    // Reject missing, non-numeric, non-finite (JSON `1e999` parses to
    // Infinity) and non-positive values.
    if (typeof n_ctx !== 'number' || !Number.isFinite(n_ctx) || n_ctx <= 0) {
      negativeCache.set(model, Date.now());
      return null;
    }
    // total_slots is informational; default to 1 if missing or invalid
    // rather than reject the whole response.
    const rawSlots = body?.total_slots;
    const total_slots =
      typeof rawSlots === 'number' && Number.isFinite(rawSlots) && rawSlots > 0 ? rawSlots : 1;
    const entry: ModelContext = { n_ctx, total_slots, fetched_at: Date.now() };
    positiveCache.set(model, entry);
    // Drop any earlier failure record for this model; it would otherwise
    // linger in the negative map even though it can never be consulted
    // while the positive entry exists.
    negativeCache.delete(model);
    return entry;
  } catch {
    // Deliberate best-effort: timeout, network failure, invalid JSON and
    // URL-encoding errors are all treated as "context unknown for now".
    negativeCache.set(model, Date.now());
    return null;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Drops cached lookups so that the next getModelContext() call goes back to
 * the upstream.
 *
 * @param model Model whose positive and negative entries are removed. When
 *   omitted, both caches are emptied entirely.
 */
export function invalidateModelContext(model?: string): void {
  // Targeted invalidation: forget only the named model.
  if (model !== undefined) {
    positiveCache.delete(model);
    negativeCache.delete(model);
    return;
  }

  // No model given: wipe every entry from both caches.
  positiveCache.clear();
  negativeCache.clear();
}