feat(coder): v2.6 Phase 3 — lifecycle hardening (idle evict, crash recovery, worktree reaper)
Idle TTL eviction per (chat,agent) + LRU cap (never a busy backend); pure lifecycle-decisions.ts (TDD). Crash recovery lifts openchamber's health-monitor + busy-aware-restart + stale-grace state machine into opencode-server.ts (+ port reclaim) and warm-acp.ts; opencode crash -> fresh sessions, ACP -> re-session/new. F.1 turn-guard + U.6 usage preserved (their tests pass). Orphan worktree reaper (1h grace, superset-style dirty/unpushed preflight, Paseo soft-delete) + close hooks + diff re-baseline after apply_pending. 35 new tests + DB-opt-in reconnect test; 215 coder tests pass; tsc + build clean. Completes v2.6. Follow-ups out of scope: apps/server close-hook caller, 3.7 DiffPanel staging hint, live smokes. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,176 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
selectIdleEvictionTargets,
|
||||
selectLruEvictionTargets,
|
||||
decideRestart,
|
||||
selectOrphanWorktreeTargets,
|
||||
DEFAULT_IDLE_TTL_MS,
|
||||
DEFAULT_MAX_LIVE_BACKENDS,
|
||||
type PoolEntrySnapshot,
|
||||
} from '../lifecycle-decisions.js';
|
||||
|
||||
/**
|
||||
* v2.6 Phase 3 — pure lifecycle decisions. No DB, no children, no timers; `now`
|
||||
* is injected. Models prune.ts:selectPruneTargets — the caller acts on the keys.
|
||||
*/
|
||||
|
||||
// Fixed injected clock shared by every case in this file.
const NOW = 1_000_000_000_000;

/** Build a pool snapshot whose last activity happened `ageMs` before NOW. */
function entry(key: string, ageMs: number, busy = false): PoolEntrySnapshot {
  const lastActiveAt = NOW - ageMs;
  return { key, lastActiveAt, busy };
}
|
||||
|
||||
describe('selectIdleEvictionTargets (3.1)', () => {
  it('evicts entries idle past the TTL', () => {
    // One entry just over the default TTL, one just under it.
    const stale = entry('a:opencode', DEFAULT_IDLE_TTL_MS + 1);
    const fresh = entry('b:goose', DEFAULT_IDLE_TTL_MS - 1);
    const picked = selectIdleEvictionTargets([stale, fresh], NOW);
    expect(picked).toEqual(['a:opencode']);
  });

  it('never evicts a busy entry even when idle past the TTL', () => {
    const longRunning = entry('a:opencode', DEFAULT_IDLE_TTL_MS * 10, /* busy */ true);
    const picked = selectIdleEvictionTargets([longRunning], NOW);
    expect(picked).toEqual([]);
  });

  it('respects a custom TTL', () => {
    const pool = [entry('a:goose', 5_000), entry('b:qwen', 500)];
    const picked = selectIdleEvictionTargets(pool, NOW, 1_000);
    expect(picked).toEqual(['a:goose']);
  });

  it('treats exactly-at-TTL as evictable (>=)', () => {
    const atBoundary = [entry('a:x', 1_000)];
    const picked = selectIdleEvictionTargets(atBoundary, NOW, 1_000);
    expect(picked).toEqual(['a:x']);
  });

  it('returns empty for an empty pool', () => {
    const picked = selectIdleEvictionTargets([], NOW);
    expect(picked).toEqual([]);
  });
});
|
||||
|
||||
describe('selectLruEvictionTargets (3.4)', () => {
  it('returns nothing when at or under the cap', () => {
    const pool = [entry('a:x', 10), entry('b:y', 20)];
    const atCap = selectLruEvictionTargets(pool, 2);
    const underCap = selectLruEvictionTargets(pool, 5);
    expect(atCap).toEqual([]);
    expect(underCap).toEqual([]);
  });

  it('evicts the least-recently-used beyond the cap', () => {
    // Ages: c = 300ms (the LRU), a = 100ms, b = 10ms.
    const pool = [entry('a:x', 100), entry('b:y', 10), entry('c:z', 300)];
    const picked = selectLruEvictionTargets(pool, 2);
    expect(picked).toEqual(['c:z']);
  });

  it('evicts multiple LRU entries to reach the cap', () => {
    const pool = [
      entry('a:x', 100),
      entry('b:y', 10),
      entry('c:z', 300),
      entry('d:w', 200),
    ];
    // Cap of 1 means three must go, oldest first: c (300), d (200), a (100).
    const picked = selectLruEvictionTargets(pool, 1);
    expect(picked).toEqual(['c:z', 'd:w', 'a:x']);
  });

  it('never evicts a busy entry even if it is the LRU', () => {
    // c is the oldest but busy, so the next-oldest idle entry (a) is chosen.
    const busyLru = entry('c:z', 300, true);
    const pool = [entry('a:x', 100), entry('b:y', 10), busyLru];
    const picked = selectLruEvictionTargets(pool, 2);
    expect(picked).toEqual(['a:x']);
  });

  it('can transiently exceed the cap when too many are busy', () => {
    // Cap of 1 with both older entries busy: only the lone idle entry can go.
    const pool = [entry('a:x', 100, true), entry('c:z', 300, true), entry('b:y', 10)];
    const picked = selectLruEvictionTargets(pool, 1);
    expect(picked).toEqual(['b:y']);
  });

  it('uses the default cap when omitted', () => {
    const pool: PoolEntrySnapshot[] = [];
    for (let i = 0; i <= DEFAULT_MAX_LIVE_BACKENDS; i += 1) {
      const label = String(i).padStart(2, '0');
      pool.push(entry(`k${label}:a`, (i + 1) * 1000));
    }
    const evicted = selectLruEvictionTargets(pool);
    // One entry over the default cap → only the single LRU (largest age) goes.
    const expectedKey = `k${String(DEFAULT_MAX_LIVE_BACKENDS).padStart(2, '0')}:a`;
    expect(evicted).toHaveLength(1);
    expect(evicted[0]).toBe(expectedKey);
  });
});
|
||||
|
||||
describe('decideRestart (3.2, busy-aware)', () => {
  // Shared probe state: idle, zero failures, no unhealthy-busy window open.
  const base = {
    consecutiveFailures: 0,
    busy: false,
    unhealthyBusySince: 0,
    now: NOW,
    failureThreshold: 3,
    staleBusyGraceMs: 120_000,
  };
  // Busy backend already well past the failure threshold.
  const busyOverThreshold = { ...base, processExited: false, consecutiveFailures: 5, busy: true };

  it('does nothing when healthy', () => {
    const decision = decideRestart({ ...base, processExited: false, healthy: true });
    expect(decision).toEqual({ action: 'none', reason: 'healthy' });
  });

  it('restarts immediately when the process exited', () => {
    const decision = decideRestart({ ...base, processExited: true, busy: true });
    expect(decision).toEqual({ action: 'restart', reason: 'process-exited' });
  });

  it('waits below the failure threshold', () => {
    const decision = decideRestart({ ...base, processExited: false, consecutiveFailures: 2 });
    expect(decision).toEqual({ action: 'wait', reason: 'below-threshold' });
  });

  it('restarts at the threshold when idle', () => {
    const decision = decideRestart({ ...base, processExited: false, consecutiveFailures: 3 });
    expect(decision).toEqual({ action: 'restart', reason: 'threshold' });
  });

  it('defers a restart while busy within the grace window', () => {
    const decision = decideRestart({ ...busyOverThreshold, unhealthyBusySince: NOW - 1_000 });
    expect(decision).toEqual({ action: 'wait', reason: 'busy-grace' });
  });

  it('force-restarts a busy backend after the stale-busy grace', () => {
    const decision = decideRestart({ ...busyOverThreshold, unhealthyBusySince: NOW - 120_001 });
    expect(decision).toEqual({ action: 'restart', reason: 'stale-busy-grace' });
  });

  it('waits (busy-grace) when busy + threshold but the window just started', () => {
    // A zero stamp means the caller is about to record the window start this cycle.
    const decision = decideRestart({ ...busyOverThreshold, unhealthyBusySince: 0 });
    expect(decision).toEqual({ action: 'wait', reason: 'busy-grace' });
  });
});
|
||||
|
||||
describe('selectOrphanWorktreeTargets (3.4)', () => {
  const GRACE_MS = 1000;

  it('skips dirs tracked by a live worktrees row', () => {
    const onDisk = [{ path: '/wt/sess-a', mtimeMs: NOW - 10_000_000 }];
    const tracked = new Set(['/wt/sess-a']);
    expect(selectOrphanWorktreeTargets(onDisk, tracked, NOW, GRACE_MS)).toEqual([]);
  });

  it('reaps an untracked dir older than the grace', () => {
    const onDisk = [{ path: '/wt/sess-orphan', mtimeMs: NOW - 5000 }];
    const picked = selectOrphanWorktreeTargets(onDisk, new Set(), NOW, GRACE_MS);
    expect(picked).toEqual(['/wt/sess-orphan']);
  });

  it('never reaps a dir younger than the grace (mid-create race)', () => {
    const onDisk = [{ path: '/wt/sess-fresh', mtimeMs: NOW - 500 }];
    const picked = selectOrphanWorktreeTargets(onDisk, new Set(), NOW, GRACE_MS);
    expect(picked).toEqual([]);
  });

  it('mixes tracked, fresh, and orphaned correctly', () => {
    const live = { path: '/wt/sess-live', mtimeMs: NOW - 10_000 };
    const fresh = { path: '/wt/sess-fresh', mtimeMs: NOW - 100 };
    const orphan = { path: '/wt/sess-orphan', mtimeMs: NOW - 10_000 };
    const tracked = new Set(['/wt/sess-live']);
    const picked = selectOrphanWorktreeTargets([live, fresh, orphan], tracked, NOW, GRACE_MS);
    expect(picked).toEqual(['/wt/sess-orphan']);
  });
});
|
||||
197
apps/coder/src/services/backends/lifecycle-decisions.ts
Normal file
197
apps/coder/src/services/backends/lifecycle-decisions.ts
Normal file
@@ -0,0 +1,197 @@
|
||||
/**
|
||||
* v2.6 Phase 3 — pure lifecycle decision helpers.
|
||||
*
|
||||
* The eviction / LRU-cap / busy-aware-restart / reaper-target logic, factored out
|
||||
* of AgentPool + the backends + the periodic sweeper so it's unit-testable with no
|
||||
* DB, no child processes, no timers (modeled on
|
||||
* apps/server/src/services/inference/prune.ts:selectPruneTargets — a pure decision
|
||||
* core the caller acts on).
|
||||
*
|
||||
* Four decisions live here:
*   1. selectIdleEvictionTargets — which warm backends to evict for being idle.
*   2. selectLruEvictionTargets — which warm backends to evict to honour a max-live
*      cap (least-recently-used beyond the cap), NEVER a busy one.
*   3. decideRestart (busy-aware) — openchamber's skip-while-busy +
*      stale-grace state machine, re-implemented for BooCode's per-(chat,agent) pool.
*   4. selectOrphanWorktreeTargets — which untracked on-disk worktree dirs are past
*      the grace window and may be handed to the reaper's at-risk preflight.
|
||||
*
|
||||
* "Busy" = the backend has an in-flight turn. The hard rule (design §6, decisions):
|
||||
* never evict or force-restart a busy backend; defer with a stale-grace.
|
||||
*/
|
||||
|
||||
// ─── Idle TTL eviction (3.1) ─────────────────────────────────────────────────
|
||||
|
||||
/** Idle time after which a warm backend/session becomes evictable (30 minutes). */
export const DEFAULT_IDLE_TTL_MS = 30 * 60_000;

/** The minimal view of a pool entry that the decision helpers need. */
export interface PoolEntrySnapshot {
  /** Pool key `${primary}:${agent}`; treated as an opaque identifier and returned to the caller. */
  key: string;
  /** Epoch ms of the most recent turn activity (start or settle) on this backend. */
  lastActiveAt: number;
  /** True while a turn is in flight. Busy entries are never selected for eviction. */
  busy: boolean;
}

/**
 * Pick the pool entries to evict for idleness.
 *
 * An entry qualifies when it is not busy AND its last activity is at least
 * `ttlMs` in the past (the comparison is inclusive: exactly-at-TTL is evicted).
 * Busy entries are skipped regardless of age so a long-running turn is never
 * torn down mid-stream.
 *
 * @param entries snapshots of the live pool entries
 * @param now     injected clock (epoch ms) so callers and tests control time
 * @param ttlMs   idle threshold in ms; defaults to DEFAULT_IDLE_TTL_MS
 * @returns the keys to evict, in the same order as `entries`
 */
export function selectIdleEvictionTargets(
  entries: ReadonlyArray<PoolEntrySnapshot>,
  now: number,
  ttlMs: number = DEFAULT_IDLE_TTL_MS,
): string[] {
  return entries
    .filter((candidate) => !candidate.busy && now - candidate.lastActiveAt >= ttlMs)
    .map((candidate) => candidate.key);
}
|
||||
|
||||
// ─── LRU cap (3.4) ───────────────────────────────────────────────────────────
|
||||
|
||||
/** Default max live warm backends/worktrees before the LRU cap evicts (env-overridable). */
export const DEFAULT_MAX_LIVE_BACKENDS = 10;

/**
 * Oldest-first ordering for eviction candidates; ties (or non-comparable
 * timestamps) fall back to the key so the result is deterministic.
 */
function compareLeastRecentlyUsed(a: PoolEntrySnapshot, b: PoolEntrySnapshot): number {
  const byAge = a.lastActiveAt - b.lastActiveAt;
  if (byAge) return byAge;
  if (a.key < b.key) return -1;
  return a.key > b.key ? 1 : 0;
}

/**
 * LRU cap: when the pool holds more than `cap` entries in total (busy ones
 * included), pick the least-recently-used IDLE entries to evict, oldest
 * `lastActiveAt` first, until the pool would be back at `cap`.
 *
 * Busy entries count toward the total but are never returned, so at most
 * `min(total - cap, idleCount)` keys come back; a burst of concurrent busy
 * turns can therefore leave the pool transiently above the cap rather than
 * kill live work.
 *
 * The cap is normalised before use:
 *   - NaN (e.g. an unparsable env override) falls back to
 *     DEFAULT_MAX_LIVE_BACKENDS instead of silently disabling the cap;
 *   - negative values are treated as 0;
 *   - fractional values are floored, so the pool never ends above the cap;
 *   - Infinity means "no cap" and yields no evictions.
 *
 * @param entries snapshots of the live pool entries (not mutated)
 * @param cap     maximum live entries; defaults to DEFAULT_MAX_LIVE_BACKENDS
 * @returns the keys to evict, least-recently-used first
 */
export function selectLruEvictionTargets(
  entries: ReadonlyArray<PoolEntrySnapshot>,
  cap: number = DEFAULT_MAX_LIVE_BACKENDS,
): string[] {
  const limit = Number.isNaN(cap) ? DEFAULT_MAX_LIVE_BACKENDS : Math.max(0, Math.floor(cap));
  const excess = entries.length - limit;
  if (excess <= 0) return [];

  // filter() copies, so sorting here never reorders the caller's array.
  return entries
    .filter((e) => !e.busy)
    .sort(compareLeastRecentlyUsed)
    .slice(0, excess)
    .map((e) => e.key);
}
|
||||
|
||||
// ─── Busy-aware crash restart (3.2) — openchamber lift ───────────────────────
|
||||
|
||||
/**
 * How long a backend may stay unhealthy WHILE busy before it is force-restarted
 * anyway (2 minutes, matching openchamber's STALE_BUSY_GRACE_MS). Prevents a
 * permanently stuck "busy" turn from blocking recovery forever.
 */
export const DEFAULT_STALE_BUSY_GRACE_MS = 2 * 60_000;

/** Number of consecutive failed health probes required before a restart is considered. */
export const DEFAULT_HEALTH_FAILURE_THRESHOLD = 3;

/** Probe state the caller hands to `decideRestart`; the caller owns all mutation. */
export interface RestartDecisionInput {
  /** True iff the process has actually exited. A dead process is restarted
   *  immediately, whatever the busy flag or failure count say. */
  processExited: boolean;
  /** Consecutive failed health probes so far (including the current one). */
  consecutiveFailures: number;
  /** Whether the backend currently has an in-flight turn. */
  busy: boolean;
  /** Epoch ms when the unhealthy-while-busy window started, or 0 if none is open. */
  unhealthyBusySince: number;
  /** Injected clock (epoch ms). */
  now: number;
  /** Overrides DEFAULT_HEALTH_FAILURE_THRESHOLD when provided. */
  failureThreshold?: number;
  /** Overrides DEFAULT_STALE_BUSY_GRACE_MS when provided. */
  staleBusyGraceMs?: number;
}

/** Outcome of a health evaluation: the action to take plus the reason for it. */
export type RestartDecision =
  | { action: 'restart'; reason: 'process-exited' | 'threshold' | 'stale-busy-grace' }
  | { action: 'wait'; reason: 'below-threshold' | 'busy-grace' }
  | { action: 'none'; reason: 'healthy' };

/**
 * Pure restart decision for one health-probe result. The caller keeps the
 * mutable counters and performs the restart; this function only classifies.
 *
 * Checks are evaluated in this order, first match wins:
 *   1. `healthy`                       → none / healthy
 *   2. `processExited`                 → restart / process-exited
 *   3. failures below the threshold    → wait / below-threshold
 *   4. at/over threshold and idle      → restart / threshold
 *   5. at/over threshold and busy      → restart / stale-busy-grace once an open
 *      unhealthy-busy window (`unhealthyBusySince > 0`) is at least the grace
 *      old; otherwise wait / busy-grace.
 *
 * @param input probe state; the optional `healthy` flag lets callers route
 *              healthy probes through the same code path
 * @returns the action to take and why
 */
export function decideRestart(input: RestartDecisionInput & { healthy?: boolean }): RestartDecision {
  const { healthy, processExited, consecutiveFailures, busy, unhealthyBusySince, now } = input;

  if (healthy) return { action: 'none', reason: 'healthy' };
  if (processExited) return { action: 'restart', reason: 'process-exited' };

  const failureThreshold = input.failureThreshold ?? DEFAULT_HEALTH_FAILURE_THRESHOLD;
  if (consecutiveFailures < failureThreshold) return { action: 'wait', reason: 'below-threshold' };

  if (!busy) return { action: 'restart', reason: 'threshold' };

  // Busy and unhealthy at/over the threshold: defer, but only up to the grace.
  const graceMs = input.staleBusyGraceMs ?? DEFAULT_STALE_BUSY_GRACE_MS;
  const graceElapsed = unhealthyBusySince > 0 && now - unhealthyBusySince >= graceMs;
  return graceElapsed
    ? { action: 'restart', reason: 'stale-busy-grace' }
    : { action: 'wait', reason: 'busy-grace' };
}
|
||||
|
||||
// ─── Orphan worktree reaper target selection (3.4) ───────────────────────────
|
||||
|
||||
/**
 * Default grace (1h): an on-disk worktree dir with no live `worktrees` row is
 * only considered for reaping once its mtime is at least this old, so a dir
 * that was just created (mid-`ensureSessionWorktree`) is never swept.
 */
export const DEFAULT_ORPHAN_WORKTREE_GRACE_MS = 60 * 60_000; // 1h

/** A worktree directory as observed on disk by the sweeper. */
export interface OnDiskWorktree {
  /** Absolute path of the worktree dir on disk. */
  path: string;
  /** Last-modified epoch ms of the dir (newest of dir + contents, caller's choice). */
  mtimeMs: number;
}

/**
 * Narrow the on-disk worktree dirs down to orphan candidates.
 *
 * A dir is a candidate when its path is NOT in `liveWorktreePaths` (no live
 * `worktrees` row tracks it) AND its mtime is at least `graceMs` before `now`.
 * Younger dirs are skipped because they may belong to an in-flight create.
 *
 * This helper only selects candidates; it performs no filesystem or git
 * checks. The caller is expected to run its own at-risk preflight
 * (dirty/unpushed) before physically removing anything.
 *
 * @param onDisk            dirs found under the worktree base
 * @param liveWorktreePaths paths that still have a live `worktrees` row
 * @param now               injected clock (epoch ms)
 * @param graceMs           minimum age in ms; defaults to DEFAULT_ORPHAN_WORKTREE_GRACE_MS
 * @returns candidate paths, in the same order as `onDisk`
 */
export function selectOrphanWorktreeTargets(
  onDisk: ReadonlyArray<OnDiskWorktree>,
  liveWorktreePaths: ReadonlySet<string>,
  now: number,
  graceMs: number = DEFAULT_ORPHAN_WORKTREE_GRACE_MS,
): string[] {
  const isUntracked = (dir: OnDiskWorktree): boolean => !liveWorktreePaths.has(dir.path);
  // Written as a negated "<" so the age check matches the guard-clause form exactly.
  const isPastGrace = (dir: OnDiskWorktree): boolean => !(now - dir.mtimeMs < graceMs);
  return onDisk.filter((dir) => isUntracked(dir) && isPastGrace(dir)).map((dir) => dir.path);
}
|
||||
@@ -21,9 +21,9 @@
|
||||
* - promptAsync is fire-and-forget (204); the turn completes via a
|
||||
* 'session.idle' event for that opencode session id.
|
||||
*/
|
||||
import { spawn, type ChildProcess } from 'node:child_process';
|
||||
import { spawn, spawnSync, type ChildProcess } from 'node:child_process';
|
||||
import { createHash } from 'node:crypto';
|
||||
import { createServer } from 'node:net';
|
||||
import { createServer, connect as netConnect } from 'node:net';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import {
|
||||
createOpencodeClient,
|
||||
@@ -39,6 +39,7 @@ import type { Sql } from '../../db.js';
|
||||
import type { AcpToolSnapshot } from '../acp-tool-snapshot.js';
|
||||
import { armAbortGuard, noteTurnActivity, consumeTerminal } from './turn-guard.js';
|
||||
import { stepEndedToUsage, type StepUsage } from './opencode-usage.js';
|
||||
import { decideRestart, DEFAULT_HEALTH_FAILURE_THRESHOLD } from './lifecycle-decisions.js';
|
||||
import type {
|
||||
AgentBackend,
|
||||
AgentEvent,
|
||||
@@ -104,6 +105,11 @@ export class OpenCodeServerBackend implements AgentBackend {
|
||||
private port: number | null = null;
|
||||
private up = false;
|
||||
private serverStarting: Promise<void> | null = null;
|
||||
// Phase 3 busy-aware health monitor (openchamber lift): consecutive failed
|
||||
// probes + the start of an unhealthy-while-busy window feed `decideRestart`.
|
||||
private consecutiveHealthFailures = 0;
|
||||
private unhealthyBusySince = 0;
|
||||
private restarting: Promise<void> | null = null;
|
||||
|
||||
/** opencode session id → demux state. Maintained by ensureSession; read by the SSE loop. */
|
||||
private readonly byOpencodeId = new Map<string, SessionState>();
|
||||
@@ -119,11 +125,30 @@ export class OpenCodeServerBackend implements AgentBackend {
|
||||
return this.up ? 'up' : 'down';
|
||||
}
|
||||
|
||||
// ─── Server lifecycle (1.2: spawn once + client + ready) ─────────────────────
|
||||
/**
 * Phase 3: reports whether at least one pooled opencode session currently has
 * an active turn. Used by `tickHealth` to defer restarts; per the Phase 3
 * design the pool also consults it before idle/LRU eviction so a session is
 * never torn down mid-stream.
 */
isBusy(): boolean {
  const sessions = Array.from(this.byOpencodeId.values());
  return sessions.some((session) => Boolean(session.activeTurn));
}
|
||||
|
||||
/** Lazy: start the single server on first use. Idempotent — one server per backend. */
|
||||
// ─── Server lifecycle (1.2: spawn once + client + ready; Phase 3 crash-restart) ──
|
||||
|
||||
/**
|
||||
* Lazy: start the single server on first use; re-spawn after a crash. Idempotent
|
||||
* within one live server — `serverStarting` caches the in-flight start, and is
|
||||
* reset to null by the crash handler so the NEXT ensureServer re-spawns a fresh
|
||||
* server (Phase 3 crash recovery). A dead-but-not-yet-reaped child (exit handler
|
||||
* raced) is also treated as needing a restart.
|
||||
*/
|
||||
private ensureServer(): Promise<void> {
|
||||
if (!this.serverStarting) this.serverStarting = this.startServer();
|
||||
const childDead = this.child != null && (this.child.exitCode !== null || this.child.signalCode !== null);
|
||||
if (!this.serverStarting || (!this.up && childDead)) {
|
||||
this.serverStarting = this.startServer();
|
||||
}
|
||||
return this.serverStarting;
|
||||
}
|
||||
|
||||
@@ -143,11 +168,15 @@ export class OpenCodeServerBackend implements AgentBackend {
|
||||
this.port = port;
|
||||
|
||||
// Child lifetime is the backend's (the pool's), NOT a request's. We never tie
|
||||
// it to a per-turn abort signal. On unexpected exit we mark down + log; crash
|
||||
// recovery is Phase 3.
|
||||
// it to a per-turn abort signal. Phase 3: on unexpected exit we recover —
|
||||
// settle any in-flight turns as failed, mark their agent_sessions rows crashed,
|
||||
// and reset `serverStarting` so the next ensureServer re-spawns. opencode keeps
|
||||
// sessions on disk, but a fresh server's in-memory state is gone, so the next
|
||||
// turn's ensureSession (rows now 'crashed') creates fresh opencode sessions.
|
||||
child.on('exit', (code, signal) => {
|
||||
this.up = false;
|
||||
this.log.warn({ code, signal, port }, 'opencode-server: child exited (recovery is Phase 3)');
|
||||
// Only react to THIS child's exit (a restart may have swapped in a new one).
|
||||
if (this.child !== child) return;
|
||||
this.handleServerCrash(code, signal, port);
|
||||
});
|
||||
|
||||
await waitForReady(child, READY_TIMEOUT_MS);
|
||||
@@ -157,6 +186,136 @@ export class OpenCodeServerBackend implements AgentBackend {
|
||||
this.log.info({ port }, 'opencode-server: ready');
|
||||
}
|
||||
|
||||
/**
 * Crash handler (Phase 3, lift of openchamber's restart-on-exit path). Runs
 * when the server child has gone away while sessions were still registered.
 * It does NOT re-spawn; the next `ensureServer` call does that lazily, which
 * avoids a restart storm when the binary itself is broken. Steps:
 *   1. abort each session's SSE loop, settle any in-flight turn as failed so
 *      its dispatcher unblocks, and clear the per-session watchdog;
 *   2. drop the demux map and the client, and null `serverStarting` so the
 *      next `ensureServer` starts a fresh server;
 *   3. mark the affected agent_sessions rows 'crashed' (fire-and-forget; a
 *      failure is logged and otherwise ignored);
 *   4. best-effort reclaim of the old port.
 *
 * @param code   exit code reported for the child, if any
 * @param signal terminating signal reported for the child, if any
 * @param port   port the dead server was bound to
 */
private handleServerCrash(code: number | null, signal: NodeJS.Signals | null, port: number): void {
  this.up = false;

  const liveSessions = Array.from(this.byOpencodeId.values());
  this.log.warn(
    { code, signal, port, liveSessions: liveSessions.length },
    'opencode-server: child exited — recovering (fail in-flight, mark crashed, re-spawn next turn)',
  );

  liveSessions.forEach((session) => {
    session.sseAbort?.abort();

    const turn = session.activeTurn;
    if (turn) {
      turn.settle({ ok: false, error: 'opencode server crashed mid-turn' });
      session.activeTurn = null;
    }

    if (session.watchdog) {
      clearTimeout(session.watchdog);
      session.watchdog = null;
    }
  });
  const crashedIds: string[] = liveSessions.map((session) => session.agentSessionId);

  // Every opencode session id is stale against a fresh server, so forget them all.
  this.byOpencodeId.clear();
  this.client = null;
  this.serverStarting = null; // force a re-spawn on the next ensureServer

  if (crashedIds.length > 0) {
    this.sql`
      UPDATE agent_sessions SET status = 'crashed'
      WHERE agent_session_id = ANY(${crashedIds}) AND status <> 'closed'
    `.catch((err) => {
      this.log.warn({ err: errMsg(err) }, 'opencode-server: failed to mark crashed sessions (non-fatal)');
    });
  }

  // Best effort only: free the old port in case a later spawn is pinned to it.
  reclaimPort(port);
}
|
||||
|
||||
/**
 * Phase 3 proactive health monitor (openchamber `runHealthCheckCycle` lift,
 * busy-aware). One call performs one probe:
 *   - returns immediately when there is no child, a restart is already in
 *     flight, or the child has already exited (that case is recovered lazily
 *     by `ensureServer`);
 *   - a healthy probe clears the failure counter and the unhealthy-busy stamp;
 *   - an unhealthy probe bumps the counter and asks `decideRestart`, which
 *     defers while the server is busy until the stale-busy grace has elapsed;
 *   - on a 'restart' decision the counters are reset and `restartServer` is
 *     awaited.
 * The unhealthy-busy stamp is recorded AFTER the decision is computed, so the
 * cycle that opens the window always evaluates with the previous stamp.
 *
 * @param now injected clock (epoch ms); defaults to `Date.now()`
 */
async tickHealth(now: number = Date.now()): Promise<void> {
  const child = this.child;
  if (!child || this.restarting) return;

  // An exited child is recovered lazily by ensureServer; don't double-restart it.
  const alreadyExited = child.exitCode !== null || child.signalCode !== null;
  if (alreadyExited) return;

  if (await this.probeHealth()) {
    this.consecutiveHealthFailures = 0;
    this.unhealthyBusySince = 0;
    return;
  }

  this.consecutiveHealthFailures += 1;
  const busy = this.isBusy();
  const { action, reason } = decideRestart({
    processExited: false,
    consecutiveFailures: this.consecutiveHealthFailures,
    busy,
    unhealthyBusySince: this.unhealthyBusySince,
    now,
    failureThreshold: DEFAULT_HEALTH_FAILURE_THRESHOLD,
  });

  // Open the unhealthy-while-busy window so a later cycle can hit the stale grace.
  if (busy && this.unhealthyBusySince === 0) {
    this.unhealthyBusySince = now;
  }

  if (action !== 'restart') return;

  this.log.warn(
    { failures: this.consecutiveHealthFailures, busy, reason },
    'opencode-server: health monitor forcing restart',
  );
  this.consecutiveHealthFailures = 0;
  this.unhealthyBusySince = 0;
  await this.restartServer();
}
|
||||
|
||||
/**
 * Single health probe against the server. Resolves true only when a client
 * exists and `client.global.health()` completes without an `error` field;
 * a missing client, an error result, or a thrown exception all count as
 * unhealthy. Never rejects.
 */
private async probeHealth(): Promise<boolean> {
  const client = this.client;
  if (!client) return false;

  try {
    const { error } = await client.global.health();
    return !error;
  } catch {
    // Any transport failure is simply an unhealthy probe.
    return false;
  }
}
|
||||
|
||||
/**
 * Force-kill the current server and reclaim its port; the next `ensureServer`
 * re-spawns lazily. Initiated by the health monitor (a hung-but-not-exited
 * process) rather than by the OS. Concurrent callers share one in-flight
 * restart via `this.restarting`.
 *
 * Sequence:
 *   1. run `handleServerCrash` to fail in-flight turns, mark sessions crashed
 *      and reset `serverStarting`;
 *   2. detach `this.child` BEFORE killing and before any `await`. The child's
 *      'exit' handler ignores children that are no longer `this.child`, so the
 *      crash path runs exactly once, and a server spawned by a concurrent
 *      `ensureServer` while we wait for the port can never be clobbered by a
 *      late `this.child = null`;
 *   3. SIGTERM the child, reclaim the port, wait up to 3s for it to be free;
 *   4. if the port is still held and the child is still running (a wedged
 *      process may ignore SIGTERM, and `reclaimPort` is best-effort), escalate
 *      to SIGKILL.
 *
 * @returns a promise that settles once the old server has been torn down
 */
private async restartServer(): Promise<void> {
  if (this.restarting) return this.restarting;
  this.restarting = (async () => {
    const child = this.child;
    const port = this.port;
    this.up = false;
    if (child) {
      // Fail in-flight turns + mark sessions crashed via the same path as a crash.
      this.handleServerCrash(null, null, port ?? 0);
      // Detach synchronously: nothing may observe the dying child as current.
      this.child = null;
      if (!child.killed) child.kill('SIGTERM');
    }
    if (port) {
      reclaimPort(port);
      const released = await waitForPortRelease(port, 3_000);
      // SIGTERM was not enough: the process is still alive and holding the port.
      if (!released && child && child.exitCode === null && child.signalCode === null) {
        child.kill('SIGKILL');
      }
    }
  })().finally(() => {
    this.restarting = null;
  });
  return this.restarting;
}
|
||||
|
||||
// ─── SSE read loop + demux + translate (1.3) + dedup (1.4) ───────────────────
|
||||
|
||||
/** Per-session SSE subscription, scoped to the session's worktree directory.
|
||||
@@ -756,6 +915,67 @@ function mapToolStatus(s: ToolState['status'] | undefined): ToolCallStatus | nul
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reclaim a loopback port a dead opencode child may still hold (lift of
 * openchamber `killProcessOnPort`). Best-effort and POSIX-only: it asks `lsof`
 * for the PIDs LISTENING on TCP `port` and sends each one SIGKILL.
 *
 * Only listeners are targeted. A bare `lsof -i :PORT` also matches processes
 * that merely have a connection involving that port number, which could kill
 * unrelated clients. The current process is never signalled, and only
 * positive integer PIDs are accepted (pid 0 would address the whole process
 * group).
 *
 * Every failure is swallowed on purpose: `lsof` may be absent, and a target
 * may already be gone (ESRCH) or not ours to signal (EPERM). Callers treat
 * this as an optimisation, not a guarantee. The `lsof` call is synchronous
 * with a short timeout.
 *
 * @param port port to free; null/0 (and any call on win32) is a no-op
 */
function reclaimPort(port: number | null): void {
  if (!port || process.platform === 'win32') return;
  try {
    const res = spawnSync(
      'lsof',
      // -n/-P skip host and port-name lookups; -t prints bare PIDs only.
      ['-nP', '-t', `-iTCP:${port}`, '-sTCP:LISTEN'],
      { encoding: 'utf8', timeout: 3_000, windowsHide: true },
    );
    const ownPid = process.pid;
    for (const token of (res.stdout || '').split(/\s+/)) {
      const pid = Number.parseInt(token, 10);
      if (!Number.isInteger(pid) || pid <= 0 || pid === ownPid) continue;
      try {
        process.kill(pid, 'SIGKILL');
      } catch {
        // ignore — best effort (process already exited or not permitted)
      }
    }
  } catch {
    // lsof absent or failed — the fresh-ephemeral-port spawn doesn't need this.
  }
}
|
||||
|
||||
/**
 * Poll 127.0.0.1:`port` until nothing accepts connections on it (lift of
 * openchamber `waitForPortRelease`).
 *
 * Each attempt opens a TCP connection and classifies the outcome:
 *   - connection refused / host unreachable → released, resolve true;
 *   - no outcome within 500ms (socket timeout) → treated as released;
 *   - connected, or any other error → still held; retry after 150ms unless
 *     the deadline has passed, in which case resolve false.
 * The promise never rejects.
 *
 * @param port      loopback port to probe
 * @param timeoutMs overall budget in ms before giving up
 * @returns true once the port looks free, false if it was still held at the deadline
 */
function waitForPortRelease(port: number, timeoutMs: number): Promise<boolean> {
  const giveUpAt = Date.now() + timeoutMs;

  return new Promise<boolean>((resolve) => {
    const probe = (): void => {
      const sock = netConnect({ port, host: '127.0.0.1' });
      let concluded = false;

      // Funnel every outcome through one place so a socket reports at most once.
      const conclude = (released: boolean): void => {
        if (concluded) return;
        concluded = true;
        sock.removeAllListeners();
        sock.destroy();

        const keepPolling = !released && Date.now() < giveUpAt;
        if (keepPolling) {
          setTimeout(probe, 150);
          return;
        }
        resolve(released);
      };

      sock.once('connect', () => conclude(false));
      sock.once('error', (err: NodeJS.ErrnoException) => {
        const nothingListening =
          Boolean(err) && (err.code === 'ECONNREFUSED' || err.code === 'EHOSTUNREACH');
        conclude(nothingListening);
      });
      sock.setTimeout(500, () => conclude(true));
    };

    probe();
  });
}
|
||||
|
||||
/** Bind-probe an ephemeral port on loopback. */
|
||||
function freePort(): Promise<number> {
|
||||
return new Promise((resolve, reject) => {
|
||||
|
||||
@@ -132,6 +132,12 @@ export class WarmAcpBackend implements AgentBackend {
|
||||
return this.up ? 'up' : 'down';
|
||||
}
|
||||
|
||||
/**
 * Phase 3: reports whether this backend's single session has a turn in
 * flight (`activeTurn` is set). Per the Phase 3 design the pool consults this
 * before idle/LRU eviction so the child is never killed mid-prompt.
 */
isBusy(): boolean {
  const turn = this.activeTurn;
  return !(turn === null || turn === undefined);
}
|
||||
|
||||
// ─── warm-process lifecycle (2.1 spawn + initialize + session/new ONCE) ───────
|
||||
|
||||
/** Lazy: spawn the warm process on first use. Idempotent — one process per backend. */
|
||||
|
||||
Reference in New Issue
Block a user