/**
 * v2.6 Phase 3 — pure lifecycle decision helpers.
 *
 * The eviction / LRU-cap / busy-aware-restart / reaper-target logic, factored out
 * of AgentPool + the backends + the periodic sweeper so it's unit-testable with no
 * DB, no child processes, no timers (modeled on
 * apps/server/src/services/inference/prune.ts:selectPruneTargets — a pure decision
 * core the caller acts on).
 *
 * Four decisions live here:
 *   1. selectIdleEvictionTargets — which warm backends to evict for being idle.
 *   2. selectLruEvictionTargets — which warm backends to evict to honour a max-live
 *      cap (least-recently-used beyond the cap), NEVER a busy one.
 *   3. decideRestart (busy-aware crash restart) — openchamber's skip-while-busy +
 *      stale-grace state machine, re-implemented for BooCode's per-(chat,agent) pool.
 *   4. selectOrphanWorktreeTargets — which on-disk worktree dirs are reap candidates.
 *
 * "Busy" = the backend has an in-flight turn. The hard rule (design §6, decisions):
 * never evict or force-restart a busy backend; defer with a stale-grace.
 */

// ─── Idle TTL eviction (3.1) ─────────────────────────────────────────────────

/** Default idle TTL before a warm backend/session is evicted (design §6 ~30 min). */
export const DEFAULT_IDLE_TTL_MS = 30 * 60 * 1000;

/** A pool entry as the decision helpers see it (no backend internals). */
export interface PoolEntrySnapshot {
  /** Pool key `${primary}:${agent}` — opaque to the decision, used for selection. */
  key: string;
  /** Epoch ms of the last turn activity (start or settle) on this backend. */
  lastActiveAt: number;
  /** True iff a turn is in flight right now. Busy entries are never evicted. */
  busy: boolean;
}

/**
 * Idle eviction: an entry is evictable when it has been idle (no turn) for at
 * least `ttlMs` (the bound is inclusive) AND is not currently busy.
 *
 * Pure: `now` is injected so tests don't depend on wall-clock. Busy entries are
 * categorically excluded — a long-running turn that exceeds the TTL must NOT be
 * torn down mid-stream (the §6 / openchamber busy rule).
 *
 * @param entries Snapshot of the pool; not mutated.
 * @param now Injected clock, epoch ms.
 * @param ttlMs Idle threshold in ms; defaults to {@link DEFAULT_IDLE_TTL_MS}.
 * @returns Keys to evict, in the order they appear in `entries`.
 */
export function selectIdleEvictionTargets(
  entries: ReadonlyArray<PoolEntrySnapshot>,
  now: number,
  ttlMs: number = DEFAULT_IDLE_TTL_MS,
): string[] {
  const out: string[] = [];
  for (const e of entries) {
    if (e.busy) continue;
    // A NaN age (bad timestamp) fails this comparison, so such entries are kept.
    if (now - e.lastActiveAt >= ttlMs) out.push(e.key);
  }
  return out;
}

// ─── LRU cap (3.4) ───────────────────────────────────────────────────────────

/** Default max live warm backends/worktrees before the LRU cap evicts (env-overridable). */
export const DEFAULT_MAX_LIVE_BACKENDS = 10;

/**
 * LRU cap: when the total number of live entries (busy and idle together) exceeds
 * `cap`, select the least-recently-used idle entries (oldest `lastActiveAt` first)
 * for eviction — `total - cap` of them, or every idle entry if there are fewer.
 *
 * Busy entries count toward the total but are never selected, so a burst of
 * concurrent busy turns can transiently exceed the cap rather than kill live work.
 *
 * Pure / deterministic: ties on `lastActiveAt` are broken by key for stable test
 * output. The input array is not mutated (sorting happens on a filtered copy).
 *
 * @param entries Snapshot of the pool.
 * @param cap Max live entries; defaults to {@link DEFAULT_MAX_LIVE_BACKENDS}.
 *   Negative values are treated as 0 and fractional values are rounded down.
 *   A NaN cap selects nothing.
 * @returns Keys to evict, least-recently-used first.
 */
export function selectLruEvictionTargets(
  entries: ReadonlyArray<PoolEntrySnapshot>,
  cap: number = DEFAULT_MAX_LIVE_BACKENDS,
): string[] {
  // An unparseable cap must not be read as "evict everything"; select nothing.
  if (Number.isNaN(cap)) return [];

  // Counts are integers, so "at most cap remain" means at most floor(cap).
  const limit = Math.max(0, Math.floor(cap));
  const overBy = entries.length - limit;
  if (overBy <= 0) return [];

  // Only idle entries are eligible to be evicted.
  const evictable = entries
    .filter((e) => !e.busy)
    .sort(
      (a, b) =>
        a.lastActiveAt - b.lastActiveAt || (a.key < b.key ? -1 : a.key > b.key ? 1 : 0),
    );

  // Busy entries can't be evicted, so the number we CAN remove is bounded by the
  // evictable pool; slice() naturally stops at the end of it.
  return evictable.slice(0, overBy).map((e) => e.key);
}

// ─── Busy-aware crash restart (3.2) — openchamber lift ───────────────────────

/**
 * Default grace after which a backend that has stayed unhealthy WHILE busy is
 * force-restarted anyway (openchamber's STALE_BUSY_GRACE_MS = 2 min). Guards
 * against a permanently-stuck "busy" turn wedging recovery forever.
 */
export const DEFAULT_STALE_BUSY_GRACE_MS = 2 * 60 * 1000;

/** Default consecutive health-check failures before a restart is attempted. */
export const DEFAULT_HEALTH_FAILURE_THRESHOLD = 3;

export interface RestartDecisionInput {
  /** True iff the process is actually dead (exited). A dead process restarts
   *  immediately regardless of busy/threshold — there's nothing to protect. */
  processExited: boolean;
  /** Consecutive failed health probes so far (including the current one). */
  consecutiveFailures: number;
  /** Whether the backend currently has an in-flight turn. */
  busy: boolean;
  /** Epoch ms when the unhealthy-while-busy window started, or 0 if not in one. */
  unhealthyBusySince: number;
  /** Injected clock. */
  now: number;
  /** Overrides {@link DEFAULT_HEALTH_FAILURE_THRESHOLD} when provided. */
  failureThreshold?: number;
  /** Overrides {@link DEFAULT_STALE_BUSY_GRACE_MS} when provided. */
  staleBusyGraceMs?: number;
}

export type RestartDecision =
  | { action: 'restart'; reason: 'process-exited' | 'threshold' | 'stale-busy-grace' }
  | { action: 'wait'; reason: 'below-threshold' | 'busy-grace' }
  | { action: 'none'; reason: 'healthy' };

/**
 * Decide whether to restart a backend after a health probe. Mirrors
 * openchamber's `runHealthCheckCycle` + `shouldSkipRestartForBusySessions`,
 * re-implemented as a pure function over injected state (the caller owns the
 * mutable counters + the actual restart side-effect).
 *
 * Order (matches openchamber):
 *   - healthy → none (checked first, so it wins over every other field).
 *   - process exited → restart now (nothing live to protect).
 *   - below failure threshold → wait (transient blip; the next probe re-checks).
 *   - threshold reached + idle → restart now.
 *   - threshold reached + busy → skip UNLESS the unhealthy-busy window exceeded
 *     the stale grace, then force restart.
 *
 * The stale-grace branch only fires when `unhealthyBusySince` is > 0; with the
 * value left at 0 a busy backend waits indefinitely, so the caller must record
 * the window start.
 *
 * `healthy: true` callers don't need to reach here; it is accepted for
 * completeness so the caller can pass through and reset counters on a single
 * code path.
 *
 * @param input Probe state plus the injected clock.
 * @returns The action to take and the reason it was chosen.
 */
export function decideRestart(input: RestartDecisionInput & { healthy?: boolean }): RestartDecision {
  if (input.healthy) return { action: 'none', reason: 'healthy' };
  if (input.processExited) return { action: 'restart', reason: 'process-exited' };

  const threshold = input.failureThreshold ?? DEFAULT_HEALTH_FAILURE_THRESHOLD;
  if (input.consecutiveFailures < threshold) {
    return { action: 'wait', reason: 'below-threshold' };
  }

  if (!input.busy) {
    return { action: 'restart', reason: 'threshold' };
  }

  // Busy + unhealthy at/over threshold: defer, but not forever.
  const grace = input.staleBusyGraceMs ?? DEFAULT_STALE_BUSY_GRACE_MS;
  if (input.unhealthyBusySince > 0 && input.now - input.unhealthyBusySince >= grace) {
    return { action: 'restart', reason: 'stale-busy-grace' };
  }
  return { action: 'wait', reason: 'busy-grace' };
}

// ─── Orphan worktree reaper target selection (3.4) ───────────────────────────

/** Default TTL: an on-disk worktree dir with no live `worktrees` row is reaped
 *  only after it's been orphaned at least this long (mtime-based grace so a
 *  just-created dir mid-`ensureSessionWorktree` race is never swept). */
export const DEFAULT_ORPHAN_WORKTREE_GRACE_MS = 60 * 60 * 1000; // 1h

export interface OnDiskWorktree {
  /** Absolute path of the worktree dir on disk. */
  path: string;
  /** Last-modified epoch ms of the dir (newest of dir + contents, caller's choice). */
  mtimeMs: number;
}

/**
 * Reaper target selection: which on-disk worktree dirs are orphans safe to
 * inspect-and-reap. An orphan is a dir under the worktree base that has NO live
 * `worktrees` row (path not in `liveWorktreePaths`) AND whose mtime is at least
 * `graceMs` old (so an in-flight create isn't swept).
 *
 * Paths are matched by exact string equality against `liveWorktreePaths`, so the
 * caller must supply both sides in the same normalized form.
 *
 * A dir whose age cannot be computed (NaN `mtimeMs`, `now` or `graceMs`) is never
 * selected: with no evidence that the grace has elapsed, it is left alone.
 *
 * Pure — the caller (the sweeper) then runs the at-risk preflight (dirty/unpushed)
 * on each returned path and only physically removes the SAFE ones. This helper
 * never decides to remove work-at-risk; it only narrows the candidate set.
 *
 * @param onDisk Worktree dirs found on disk; not mutated.
 * @param liveWorktreePaths Paths that still have a live `worktrees` row.
 * @param now Injected clock, epoch ms.
 * @param graceMs Minimum age in ms; defaults to {@link DEFAULT_ORPHAN_WORKTREE_GRACE_MS}.
 * @returns Candidate paths, in the order they appear in `onDisk`.
 */
export function selectOrphanWorktreeTargets(
  onDisk: ReadonlyArray<OnDiskWorktree>,
  liveWorktreePaths: ReadonlySet<string>,
  now: number,
  graceMs: number = DEFAULT_ORPHAN_WORKTREE_GRACE_MS,
): string[] {
  const out: string[] = [];
  for (const w of onDisk) {
    if (liveWorktreePaths.has(w.path)) continue; // tracked → not an orphan
    const ageMs = now - w.mtimeMs;
    // Written as a negated `>=` rather than `<`: a NaN age fails every
    // comparison, and this form skips it instead of selecting it.
    if (!(ageMs >= graceMs)) continue; // too fresh or unknown → could be mid-create
    out.push(w.path);
  }
  return out;
}