Idle TTL eviction per (chat,agent) + LRU cap (never a busy backend); pure lifecycle-decisions.ts (TDD). Crash recovery lifts openchamber's health-monitor + busy-aware-restart + stale-grace state machine into opencode-server.ts (+ port reclaim) and warm-acp.ts; opencode crash -> fresh sessions, ACP -> re-session/new. F.1 turn-guard + U.6 usage preserved (their tests pass). Orphan worktree reaper (1h grace, superset-style dirty/unpushed preflight, Paseo soft-delete) + close hooks + diff re-baseline after apply_pending. 35 new tests + DB-opt-in reconnect test; 215 coder tests pass; tsc + build clean. Completes v2.6. Follow-ups out of scope: apps/server close-hook caller, 3.7 DiffPanel staging hint, live smokes. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
247 lines
9.8 KiB
TypeScript
247 lines
9.8 KiB
TypeScript
/**
 * v2.6 — AgentPool.
 *
 * Lazy get-or-create registry of `AgentBackend` instances keyed by
 * `${primary}:${agent}` (primary = chatId for warm-ACP, a fixed sentinel for the
 * single shared opencode server). Phase 0 shipped the skeleton (Map + health +
 * dispose). Phase 3 adds the LIFECYCLE: per-entry idle tracking, a periodic
 * idle-TTL + LRU-cap sweep (the pure decisions live in
 * `backends/lifecycle-decisions.ts`), and a `closeChat` helper for the chat-close
 * hook. Reattach after eviction is implicit — the next turn's `ensureSession`
 * rebuilds the backend from `agent_sessions` / `worktrees` (DB is the source of
 * truth; the in-memory pool is a warm cache).
 *
 * The hard rule (design §6): NEVER evict a busy backend (one with an in-flight
 * turn). `selectIdleEvictionTargets` / `selectLruEvictionTargets` enforce it via
 * `backend.isBusy()`; a long turn that outlives the TTL is left alone.
 *
 * Spec: openspec/changes/v2-6-persistent-agent-sessions/design.md §2 / §6.
 */
|
|
import type { FastifyBaseLogger } from 'fastify';
|
|
import type { AgentBackend } from './agent-backend.js';
|
|
import {
|
|
selectIdleEvictionTargets,
|
|
selectLruEvictionTargets,
|
|
DEFAULT_IDLE_TTL_MS,
|
|
DEFAULT_MAX_LIVE_BACKENDS,
|
|
} from './backends/lifecycle-decisions.js';
|
|
|
|
/** One pooled backend together with the bookkeeping the eviction sweep reads. */
interface PoolEntry {
  /** First half of the pool key (the `primary` passed to `register`). */
  primary: string;
  /** Second half of the pool key (the `agent` passed to `register`). */
  agent: string;
  /** The live backend instance held for this (primary, agent). */
  backend: AgentBackend;
  /**
   * Epoch milliseconds of the most recent activity mark; set on register and
   * refreshed on get/touch. The idle-TTL and LRU decisions are driven by it.
   */
  lastActiveAt: number;
}
|
|
|
|
/** Tuning knobs for an `AgentPool`; every field is optional. */
export interface AgentPoolOpts {
  /**
   * How long a non-busy backend may sit idle before the sweep evicts it.
   * Falls back to `DEFAULT_IDLE_TTL_MS` (documented as 30 min).
   */
  idleTtlMs?: number;
  /**
   * Upper bound on live backends; above it the LRU cap evicts the
   * least-recently-used ones. Falls back to `DEFAULT_MAX_LIVE_BACKENDS`.
   */
  maxLive?: number;
  /**
   * Interval between sweep passes. Falls back to 60s (mirrors the server's
   * periodic sweeper).
   */
  sweepIntervalMs?: number;
  /** Logger used for sweep / dispose diagnostics; logging is skipped if absent. */
  log?: FastifyBaseLogger;
}
|
|
|
|
/** Sweep cadence used when `AgentPoolOpts.sweepIntervalMs` is not supplied: one minute. */
const DEFAULT_SWEEP_INTERVAL_MS = 60 * 1000;
|
|
|
|
/**
 * In-memory registry of live `AgentBackend` instances keyed by
 * `${primary}:${agent}`, with an optional periodic sweep that evicts idle and
 * over-cap backends. A backend reporting `isBusy()` is never evicted.
 */
export class AgentPool {
  private readonly backends = new Map<string, PoolEntry>();
  private idleTtlMs: number;
  private maxLive: number;
  private sweepIntervalMs: number;
  private log: FastifyBaseLogger | undefined;
  private sweepTimer: ReturnType<typeof setInterval> | null = null;
  /** Serializes sweep runs so a slow eviction can't overlap the next tick. */
  private sweeping = false;

  /**
   * @param opts Optional knobs; any omitted field falls back to its default
   *   (`DEFAULT_IDLE_TTL_MS`, `DEFAULT_MAX_LIVE_BACKENDS`,
   *   `DEFAULT_SWEEP_INTERVAL_MS`, no logger).
   */
  constructor(opts: AgentPoolOpts = {}) {
    this.idleTtlMs = opts.idleTtlMs ?? DEFAULT_IDLE_TTL_MS;
    this.maxLive = opts.maxLive ?? DEFAULT_MAX_LIVE_BACKENDS;
    this.sweepIntervalMs = opts.sweepIntervalMs ?? DEFAULT_SWEEP_INTERVAL_MS;
    this.log = opts.log;
  }

  /**
   * Apply env-derived knobs to the module singleton at bootstrap (before
   * `startReaper`). Only overrides explicitly-provided fields. A changed
   * `sweepIntervalMs` is not applied to a timer that is already running.
   */
  configure(opts: AgentPoolOpts): void {
    if (opts.idleTtlMs != null) this.idleTtlMs = opts.idleTtlMs;
    if (opts.maxLive != null) this.maxLive = opts.maxLive;
    if (opts.sweepIntervalMs != null) this.sweepIntervalMs = opts.sweepIntervalMs;
    if (opts.log) this.log = opts.log;
  }

  /** Build the map key for a (primary, agent) pair. */
  private key(primary: string, agent: string): string {
    return `${primary}:${agent}`;
  }

  /**
   * Map lookup only. Spawning happens in the dispatcher (Phase 1/2). A hit also
   * marks the entry recently-active so a resolve-without-prompt doesn't get it
   * evicted out from under an imminent turn.
   *
   * @returns The pooled backend, or `undefined` when the key isn't pooled.
   */
  get(primary: string, agent: string): AgentBackend | undefined {
    const entry = this.backends.get(this.key(primary, agent));
    if (entry) entry.lastActiveAt = Date.now();
    return entry?.backend;
  }

  /**
   * Store a backend instance for this (primary, agent), stamped as active now.
   * An existing entry under the same key is replaced (the previous backend is
   * not disposed here).
   */
  register(primary: string, agent: string, backend: AgentBackend): void {
    this.backends.set(this.key(primary, agent), { primary, agent, backend, lastActiveAt: Date.now() });
  }

  /**
   * Mark a backend recently-active (call at turn start AND settle so a long turn
   * keeps its slot warm). No-op if the key isn't pooled.
   */
  touch(primary: string, agent: string): void {
    const entry = this.backends.get(this.key(primary, agent));
    if (entry) entry.lastActiveAt = Date.now();
  }

  /**
   * Snapshot for the decision helpers. `busy` is read live from the backend; a
   * backend without `isBusy` counts as not busy.
   */
  private snapshots(): { key: string; lastActiveAt: number; busy: boolean }[] {
    const out: { key: string; lastActiveAt: number; busy: boolean }[] = [];
    for (const [key, e] of this.backends) {
      out.push({ key, lastActiveAt: e.lastActiveAt, busy: e.backend.isBusy?.() ?? false });
    }
    return out;
  }

  /** Summary for the health endpoint: pooled count and how many report busy. */
  health(): { size: number; busy: number } {
    let busy = 0;
    for (const e of this.backends.values()) if (e.backend.isBusy?.()) busy++;
    return { size: this.backends.size, busy };
  }

  // ─── Phase 3: idle-TTL + LRU eviction sweep ──────────────────────────────────

  /**
   * Start the periodic idle + LRU sweep. Idempotent; unref'd so it never holds
   * the process open on its own.
   *
   * @param log Optional logger that replaces the current one.
   */
  startReaper(log?: FastifyBaseLogger): void {
    if (log) this.log = log;
    if (this.sweepTimer) return;
    this.sweepTimer = setInterval(() => {
      void this.sweep().catch((err) => {
        this.log?.warn({ err: errMsg(err) }, 'agent-pool: sweep error');
      });
    }, this.sweepIntervalMs);
    this.sweepTimer.unref?.();
  }

  /** Stop the periodic sweep if it is running. Safe to call repeatedly. */
  stopReaper(): void {
    if (this.sweepTimer) {
      clearInterval(this.sweepTimer);
      this.sweepTimer = null;
    }
  }

  /**
   * One sweep pass: evict idle-past-TTL backends, then enforce the LRU cap.
   * Deduped (a key can't appear in both lists for one pass). Busy backends are
   * excluded by the decision helpers — a live turn is never torn down.
   *
   * Because each teardown is awaited, the pool can change between the snapshot
   * and a given target's turn in the eviction loop. A target is therefore
   * skipped when it has become busy OR when its activity stamp changed since
   * the snapshot (touched, fetched via `get`, or re-registered); the next pass
   * re-evaluates it with fresh data.
   *
   * @param now Reference time in epoch ms handed to the probes and the idle
   *   decision. Defaults to the current time.
   * @returns The keys removed in this pass; empty when another pass is already
   *   running.
   */
  async sweep(now: number = Date.now()): Promise<{ evicted: string[] }> {
    if (this.sweeping) return { evicted: [] };
    this.sweeping = true;
    try {
      // Phase 3: drive each backend's optional proactive health probe first (the
      // opencode server's busy-aware hung-detect + self-restart). Best-effort —
      // a probe must never fail the sweep, whether it rejects or throws
      // synchronously, hence try/await rather than `.catch()` on the result.
      for (const e of this.backends.values()) {
        if (!e.backend.tickHealth) continue;
        try {
          await e.backend.tickHealth(now);
        } catch (err) {
          this.log?.warn({ key: this.key(e.primary, e.agent), err: errMsg(err) }, 'agent-pool: tickHealth threw');
        }
      }

      const snaps = this.snapshots();
      const idle = selectIdleEvictionTargets(snaps, now, this.idleTtlMs);
      // LRU runs on what remains after idle eviction, so the two never double-evict.
      const idleSet = new Set(idle);
      const remaining = snaps.filter((s) => !idleSet.has(s.key));
      const lru = selectLruEvictionTargets(remaining, this.maxLive);
      const targets = [...idle, ...lru];
      if (targets.length === 0) return { evicted: [] };

      // Activity stamp each key had when the decision was made.
      const activityAtSnapshot = new Map<string, number>();
      for (const s of snaps) activityAtSnapshot.set(s.key, s.lastActiveAt);

      const evicted: string[] = [];
      for (const key of targets) {
        const entry = this.backends.get(key);
        if (!entry) continue;
        // Re-check busy right before teardown — a turn may have started since the
        // snapshot.
        if (entry.backend.isBusy?.()) continue;
        // The decision is stale if the entry was used or replaced while an
        // earlier target was being disposed; leave it for the next pass.
        if (entry.lastActiveAt !== activityAtSnapshot.get(key)) continue;
        this.backends.delete(key);
        try {
          await entry.backend.dispose();
        } catch (err) {
          this.log?.warn({ key, err: errMsg(err) }, 'agent-pool: backend dispose threw during eviction');
        }
        evicted.push(key);
      }
      if (evicted.length > 0) {
        this.log?.info({ evicted, size: this.backends.size }, 'agent-pool: evicted idle/over-cap backends');
      }
      return { evicted };
    } finally {
      this.sweeping = false;
    }
  }

  // ─── Phase 3: chat-close cleanup (3.3) ───────────────────────────────────────

  /**
   * Tear down every pooled backend whose primary is exactly this chat. Used by
   * the chat-close hook. The opencode server is shared (keyed on a sentinel, not
   * the chat), so it is NOT disposed here — only its session is closed via
   * `closeSession`, which the hook calls directly with the per-(chat,agent)
   * handle. Skips busy entries (a close mid-turn is rare but must not kill a
   * live stream — the idle sweep reaps it shortly after).
   *
   * Matching uses the stored `primary`, not a string prefix of the key, so a
   * chat id that happens to be a `:`-delimited prefix of another primary cannot
   * tear down that other primary's backends.
   *
   * @returns The keys it removed.
   */
  async closeChat(chatId: string): Promise<string[]> {
    const removed: string[] = [];
    for (const [key, entry] of [...this.backends]) {
      if (entry.primary !== chatId) continue;
      // The loop walks a copy; skip anything replaced or removed while an
      // earlier dispose was awaited.
      if (this.backends.get(key) !== entry) continue;
      if (entry.backend.isBusy?.()) continue;
      this.backends.delete(key);
      try {
        await entry.backend.dispose();
      } catch (err) {
        this.log?.warn({ key, err: errMsg(err) }, 'agent-pool: dispose threw during closeChat');
      }
      removed.push(key);
    }
    return removed;
  }

  /** Look up a backend by exact key without bumping its activity (for closeSession). */
  peek(primary: string, agent: string): AgentBackend | undefined {
    return this.backends.get(this.key(primary, agent))?.backend;
  }

  /**
   * Stop the reaper, clear the map, and dispose every backend concurrently.
   * Tolerates throwing backends: each dispose runs inside its own async wrapper,
   * so neither a rejection nor a synchronous throw prevents the others from
   * being disposed. Failures are logged, never rethrown.
   */
  async dispose(): Promise<void> {
    this.stopReaper();
    const entries = [...this.backends.values()];
    this.backends.clear();
    await Promise.all(
      entries.map(async (e) => {
        try {
          await e.backend.dispose();
        } catch (err) {
          this.log?.warn({ key: this.key(e.primary, e.agent), err: errMsg(err) }, 'agent-pool: dispose threw during shutdown');
        }
      }),
    );
  }
}
|
|
|
|
/**
 * Reduce an arbitrary thrown value to a loggable string.
 *
 * @param e Whatever was thrown or rejected.
 * @returns The `message` of an `Error`, otherwise the `String(...)` conversion
 *   of the value.
 */
function errMsg(e: unknown): string {
  if (e instanceof Error) {
    return e.message;
  }
  return String(e);
}
|
|
|
|
/**
 * Fixed `primary` under which the shared opencode server is pooled. There is
 * one such server per BooCoder process and it multiplexes every opencode
 * session internally, so it is keyed on this sentinel instead of a chat id.
 * As a result `closeChat(chatId)` does not tear it down — only the per-chat
 * session is closed. Exported so the dispatcher and the lifecycle close-hook
 * share one definition of the key and cannot drift apart.
 */
export const OPENCODE_POOL_KEY = '__opencode_server__';
|
|
|
|
/**
 * Process-wide pool instance, built with default options. The dispatcher
 * registers backends into it; the server's onClose hook sweeps and drains it.
 */
export const agentPool: AgentPool = new AgentPool();
|