feat(coder): v2.6 Phase 3 — lifecycle hardening (idle evict, crash recovery, worktree reaper)
Idle TTL eviction per (chat,agent) + LRU cap (never a busy backend); pure lifecycle-decisions.ts (TDD). Crash recovery lifts openchamber's health-monitor + busy-aware-restart + stale-grace state machine into opencode-server.ts (+ port reclaim) and warm-acp.ts; opencode crash -> fresh sessions, ACP -> re-session/new. F.1 turn-guard + U.6 usage preserved (their tests pass). Orphan worktree reaper (1h grace, superset-style dirty/unpushed preflight, Paseo soft-delete) + close hooks + diff re-baseline after apply_pending. 35 new tests + DB-opt-in reconnect test; 215 coder tests pass; tsc + build clean. Completes v2.6. Follow-ups out of scope: apps/server close-hook caller, 3.7 DiffPanel staging hint, live smokes. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,44 +1,246 @@
|
||||
/**
|
||||
* v2.6 — AgentPool (Phase 0 scaffold).
|
||||
* v2.6 — AgentPool.
|
||||
*
|
||||
* Lazy get-or-create registry of `AgentBackend` instances keyed by
|
||||
* `${sessionId}:${agent}`. Phase 0 ships the skeleton only: an in-memory Map,
|
||||
* lookup / register / health, and clean disposal wired to the server's onClose.
|
||||
* Spawning lands in Phase 1/2; nothing populates the map yet.
|
||||
* `${primary}:${agent}` (primary = chatId for warm-ACP, a fixed sentinel for the
|
||||
* single shared opencode server). Phase 0 shipped the skeleton (Map + health +
|
||||
* dispose). Phase 3 adds the LIFECYCLE: per-entry idle tracking, a periodic
|
||||
* idle-TTL + LRU-cap sweep (the pure decisions live in
|
||||
* `backends/lifecycle-decisions.ts`), and a `closeChat` helper for the chat-close
|
||||
* hook. Reattach after eviction is implicit — the next turn's `ensureSession`
|
||||
* rebuilds the backend from `agent_sessions` / `worktrees` (DB is the source of
|
||||
* truth; the in-memory pool is a warm cache).
|
||||
*
|
||||
* Spec: openspec/changes/v2-6-persistent-agent-sessions/design.md §2.
|
||||
* The hard rule (design §6): NEVER evict a busy backend (one with an in-flight
|
||||
* turn). `selectIdleEvictionTargets` / `selectLruEvictionTargets` enforce it via
|
||||
* `backend.isBusy()`; a long turn that outlives the TTL is left alone.
|
||||
*
|
||||
* Spec: openspec/changes/v2-6-persistent-agent-sessions/design.md §2 / §6.
|
||||
*/
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { AgentBackend } from './agent-backend.js';
|
||||
import {
|
||||
selectIdleEvictionTargets,
|
||||
selectLruEvictionTargets,
|
||||
DEFAULT_IDLE_TTL_MS,
|
||||
DEFAULT_MAX_LIVE_BACKENDS,
|
||||
} from './backends/lifecycle-decisions.js';
|
||||
|
||||
interface PoolEntry {
|
||||
primary: string;
|
||||
agent: string;
|
||||
backend: AgentBackend;
|
||||
/** Epoch ms of the last turn boundary (register or touch). Drives idle/LRU. */
|
||||
lastActiveAt: number;
|
||||
}
|
||||
|
||||
export interface AgentPoolOpts {
|
||||
/** Idle TTL before a non-busy backend is evicted. Default 30 min. */
|
||||
idleTtlMs?: number;
|
||||
/** Max live backends before the LRU cap evicts the least-recently-used. */
|
||||
maxLive?: number;
|
||||
/** Sweep cadence. Default 60s (mirrors the server's periodic sweeper). */
|
||||
sweepIntervalMs?: number;
|
||||
log?: FastifyBaseLogger;
|
||||
}
|
||||
|
||||
const DEFAULT_SWEEP_INTERVAL_MS = 60_000;
|
||||
|
||||
export class AgentPool {
|
||||
private readonly backends = new Map<string, AgentBackend>();
|
||||
private readonly backends = new Map<string, PoolEntry>();
|
||||
private idleTtlMs: number;
|
||||
private maxLive: number;
|
||||
private sweepIntervalMs: number;
|
||||
private log: FastifyBaseLogger | undefined;
|
||||
private sweepTimer: ReturnType<typeof setInterval> | null = null;
|
||||
/** Serializes sweep runs so a slow eviction can't overlap the next tick. */
|
||||
private sweeping = false;
|
||||
|
||||
private key(sessionId: string, agent: string): string {
|
||||
return `${sessionId}:${agent}`;
|
||||
constructor(opts: AgentPoolOpts = {}) {
|
||||
this.idleTtlMs = opts.idleTtlMs ?? DEFAULT_IDLE_TTL_MS;
|
||||
this.maxLive = opts.maxLive ?? DEFAULT_MAX_LIVE_BACKENDS;
|
||||
this.sweepIntervalMs = opts.sweepIntervalMs ?? DEFAULT_SWEEP_INTERVAL_MS;
|
||||
this.log = opts.log;
|
||||
}
|
||||
|
||||
/** Map lookup only. Spawning is Phase 1/2 — never creates here. */
|
||||
get(sessionId: string, agent: string): AgentBackend | undefined {
|
||||
return this.backends.get(this.key(sessionId, agent));
|
||||
/** Apply env-derived knobs to the module singleton at bootstrap (before
|
||||
* startReaper). Only overrides explicitly-provided fields. */
|
||||
configure(opts: AgentPoolOpts): void {
|
||||
if (opts.idleTtlMs != null) this.idleTtlMs = opts.idleTtlMs;
|
||||
if (opts.maxLive != null) this.maxLive = opts.maxLive;
|
||||
if (opts.sweepIntervalMs != null) this.sweepIntervalMs = opts.sweepIntervalMs;
|
||||
if (opts.log) this.log = opts.log;
|
||||
}
|
||||
|
||||
/** Store a backend instance for this (session, agent). */
|
||||
register(sessionId: string, agent: string, backend: AgentBackend): void {
|
||||
this.backends.set(this.key(sessionId, agent), backend);
|
||||
private key(primary: string, agent: string): string {
|
||||
return `${primary}:${agent}`;
|
||||
}
|
||||
|
||||
/** Map lookup only. Spawning happens in the dispatcher (Phase 1/2). A hit also
|
||||
* marks the entry recently-active so a resolve-without-prompt doesn't get it
|
||||
* evicted out from under an imminent turn. */
|
||||
get(primary: string, agent: string): AgentBackend | undefined {
|
||||
const entry = this.backends.get(this.key(primary, agent));
|
||||
if (entry) entry.lastActiveAt = Date.now();
|
||||
return entry?.backend;
|
||||
}
|
||||
|
||||
/** Store a backend instance for this (primary, agent). */
|
||||
register(primary: string, agent: string, backend: AgentBackend): void {
|
||||
this.backends.set(this.key(primary, agent), { primary, agent, backend, lastActiveAt: Date.now() });
|
||||
}
|
||||
|
||||
/** Mark a backend recently-active (call at turn start AND settle so a long turn
|
||||
* keeps its slot warm). No-op if the key isn't pooled. */
|
||||
touch(primary: string, agent: string): void {
|
||||
const entry = this.backends.get(this.key(primary, agent));
|
||||
if (entry) entry.lastActiveAt = Date.now();
|
||||
}
|
||||
|
||||
/** Snapshot for the decision helpers (busy is read live from the backend). */
|
||||
private snapshots(): { key: string; lastActiveAt: number; busy: boolean }[] {
|
||||
const out: { key: string; lastActiveAt: number; busy: boolean }[] = [];
|
||||
for (const [key, e] of this.backends) {
|
||||
out.push({ key, lastActiveAt: e.lastActiveAt, busy: e.backend.isBusy?.() ?? false });
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/** Summary for the health endpoint. */
|
||||
health(): { size: number } {
|
||||
return { size: this.backends.size };
|
||||
health(): { size: number; busy: number } {
|
||||
let busy = 0;
|
||||
for (const e of this.backends.values()) if (e.backend.isBusy?.()) busy++;
|
||||
return { size: this.backends.size, busy };
|
||||
}
|
||||
|
||||
// ─── Phase 3: idle-TTL + LRU eviction sweep ──────────────────────────────────
|
||||
|
||||
/** Start the periodic idle + LRU sweep. Idempotent; unref'd so it never holds
|
||||
* the process open on its own. */
|
||||
startReaper(log?: FastifyBaseLogger): void {
|
||||
if (log) this.log = log;
|
||||
if (this.sweepTimer) return;
|
||||
this.sweepTimer = setInterval(() => {
|
||||
void this.sweep().catch((err) => {
|
||||
this.log?.warn({ err: errMsg(err) }, 'agent-pool: sweep error');
|
||||
});
|
||||
}, this.sweepIntervalMs);
|
||||
this.sweepTimer.unref?.();
|
||||
}
|
||||
|
||||
stopReaper(): void {
|
||||
if (this.sweepTimer) {
|
||||
clearInterval(this.sweepTimer);
|
||||
this.sweepTimer = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* One sweep pass: evict idle-past-TTL backends, then enforce the LRU cap.
|
||||
* Deduped (a key can't appear in both lists for one pass). Busy backends are
|
||||
* excluded by the decision helpers — a live turn is never torn down.
|
||||
*/
|
||||
async sweep(now: number = Date.now()): Promise<{ evicted: string[] }> {
|
||||
if (this.sweeping) return { evicted: [] };
|
||||
this.sweeping = true;
|
||||
try {
|
||||
// Phase 3: drive each backend's optional proactive health probe first (the
|
||||
// opencode server's busy-aware hung-detect + self-restart). Best-effort —
|
||||
// a probe must never fail the sweep.
|
||||
for (const e of this.backends.values()) {
|
||||
if (e.backend.tickHealth) {
|
||||
await e.backend.tickHealth(now).catch((err) => {
|
||||
this.log?.warn({ key: this.key(e.primary, e.agent), err: errMsg(err) }, 'agent-pool: tickHealth threw');
|
||||
});
|
||||
}
|
||||
}
|
||||
const snaps = this.snapshots();
|
||||
const idle = selectIdleEvictionTargets(snaps, now, this.idleTtlMs);
|
||||
// LRU runs on what remains after idle eviction, so the two never double-evict.
|
||||
const idleSet = new Set(idle);
|
||||
const remaining = snaps.filter((s) => !idleSet.has(s.key));
|
||||
const lru = selectLruEvictionTargets(remaining, this.maxLive);
|
||||
const targets = [...idle, ...lru];
|
||||
if (targets.length === 0) return { evicted: [] };
|
||||
|
||||
const evicted: string[] = [];
|
||||
for (const key of targets) {
|
||||
const entry = this.backends.get(key);
|
||||
if (!entry) continue;
|
||||
// Re-check busy right before teardown — a turn may have started since the
|
||||
// snapshot. Defensive; the decision already excluded busy at snapshot time.
|
||||
if (entry.backend.isBusy?.()) continue;
|
||||
this.backends.delete(key);
|
||||
try {
|
||||
await entry.backend.dispose();
|
||||
} catch (err) {
|
||||
this.log?.warn({ key, err: errMsg(err) }, 'agent-pool: backend dispose threw during eviction');
|
||||
}
|
||||
evicted.push(key);
|
||||
}
|
||||
if (evicted.length > 0) {
|
||||
this.log?.info({ evicted, size: this.backends.size }, 'agent-pool: evicted idle/over-cap backends');
|
||||
}
|
||||
return { evicted };
|
||||
} finally {
|
||||
this.sweeping = false;
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Phase 3: chat-close cleanup (3.3) ───────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Tear down every pooled backend whose key is for this chat. Used by the
|
||||
* chat-close hook. The opencode server is shared (keyed on a sentinel, not the
|
||||
* chat), so it is NOT disposed here — only its session is closed via
|
||||
* `closeSession`, which the hook calls directly with the per-(chat,agent)
|
||||
* handle. Returns the keys it removed. Skips busy entries (a close mid-turn is
|
||||
* rare but must not kill a live stream — the idle sweep reaps it shortly after).
|
||||
*/
|
||||
async closeChat(chatId: string): Promise<string[]> {
|
||||
const removed: string[] = [];
|
||||
const prefix = `${chatId}:`;
|
||||
for (const [key, entry] of [...this.backends]) {
|
||||
if (!key.startsWith(prefix)) continue;
|
||||
if (entry.backend.isBusy?.()) continue;
|
||||
this.backends.delete(key);
|
||||
try {
|
||||
await entry.backend.dispose();
|
||||
} catch (err) {
|
||||
this.log?.warn({ key, err: errMsg(err) }, 'agent-pool: dispose threw during closeChat');
|
||||
}
|
||||
removed.push(key);
|
||||
}
|
||||
return removed;
|
||||
}
|
||||
|
||||
/** Look up a backend by exact key without bumping its activity (for closeSession). */
|
||||
peek(primary: string, agent: string): AgentBackend | undefined {
|
||||
return this.backends.get(this.key(primary, agent))?.backend;
|
||||
}
|
||||
|
||||
/** Dispose every backend and clear the map. Tolerates throwing backends. */
|
||||
async dispose(): Promise<void> {
|
||||
this.stopReaper();
|
||||
const entries = [...this.backends.values()];
|
||||
this.backends.clear();
|
||||
await Promise.allSettled(entries.map((b) => b.dispose()));
|
||||
await Promise.allSettled(entries.map((e) => e.backend.dispose()));
|
||||
}
|
||||
}
|
||||
|
||||
/** Single shared instance — referenced only by the server's onClose hook in Phase 0. */
|
||||
function errMsg(e: unknown): string {
|
||||
return e instanceof Error ? e.message : String(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* The shared opencode server is pooled under a FIXED sentinel (one server per
|
||||
* BooCoder process, multiplexing all opencode sessions internally) rather than a
|
||||
* chat id — so it is NOT torn down by `closeChat(chatId)` (only its per-chat
|
||||
* session is closed). Exported so the dispatcher + the lifecycle close-hook agree
|
||||
* on the key without drift.
|
||||
*/
|
||||
export const OPENCODE_POOL_KEY = '__opencode_server__';
|
||||
|
||||
/** Single shared instance — registered by the dispatcher, swept + drained by the
|
||||
* server's onClose hook. */
|
||||
export const agentPool = new AgentPool();
|
||||
|
||||
Reference in New Issue
Block a user