/**
 * OpenCodeServerSupervisor — the opencode `serve` child + HTTP client + port +
 * health-counter lifecycle, extracted (v2.7 audit reshape) from the backend
 * god-class. Owns spawn / ready / crash / proactive-health restart / dispose and
 * exposes `client` / `port` / `health()` / `tickHealth()` to the backend.
 *
 * Session-level recovery (failing in-flight turns, marking agent_sessions crashed,
 * tearing down SSE loops) is NOT a process concern — it's delegated back to the
 * backend through the injected `hooks.onServerDown` callback, keeping this module
 * free of the demux map / SQL / turn state.
 *
 * v2.7 concurrency hardening: `ensureServer` is guarded against the crash-window
 * double-spawn (two concurrent callers each re-spawning on different ports) via a
 * synchronous `startInFlight` flag — see `shouldStartServer`.
 */
import { spawn, type ChildProcess } from 'node:child_process';
import { createOpencodeClient, type OpencodeClient } from '@opencode-ai/sdk/v2/client';
import type { FastifyBaseLogger } from 'fastify';
import { decideRestart, DEFAULT_HEALTH_FAILURE_THRESHOLD } from './lifecycle-decisions.js';
import { reclaimPort, waitForPortRelease, freePort } from '../net/port-utils.js';

/** How long a freshly spawned server may take to print its ready line. */
const READY_TIMEOUT_MS = 30_000;
/** Substring of the stdout line that signals the server is accepting requests. */
const READY_MARKER = 'opencode server listening on';
/** How much trailing stderr is kept for start-failure error messages. */
const STDERR_TAIL_CHARS = 2000;
/** Grace between SIGTERM and SIGKILL on dispose / failed start. */
const KILL_GRACE_MS = 5_000;
/** How long a forced restart waits for the old port to be released. */
const RESTART_PORT_WAIT_MS = 3_000;

/** Info handed to the backend when the server goes down (crash or forced restart). */
export interface ServerDownInfo {
  code: number | null;
  signal: NodeJS.Signals | null;
  port: number;
}

export interface SupervisorHooks {
  /** True iff ANY pooled session has an in-flight turn (defers a busy restart). */
  isBusy: () => boolean;
  /** Session-level recovery: fail in-flight turns, mark crashed, drop demux state. */
  onServerDown: (info: ServerDownInfo) => void;
}

export interface OpenCodeServerSupervisorDeps {
  /** Absolute path to the opencode binary (resolved from available_agents). */
  opencodeBinary: string;
  log: FastifyBaseLogger;
  hooks: SupervisorHooks;
}

/**
 * Pure decision for `ensureServer`: should we (re)spawn the server right now?
 *
 * - A live, ready server (`up && hasClient`) → no.
 * - A start already in flight (`startInFlight`) → no, NEVER double-spawn — the
 *   caller joins the running start instead. This is checked BEFORE
 *   `serverStarting` so that a missing cached promise can never trigger a second
 *   spawn on a different port while the first server is still coming up.
 * - No start cached/running → yes (fresh start, post-crash re-spawn, or retry
 *   after a failed start).
 * - A cached start that already finished, but the child has since died and the
 *   crash handler hasn't reset us yet → yes.
 */
export function shouldStartServer(s: {
  up: boolean;
  hasClient: boolean;
  serverStarting: boolean;
  childDead: boolean;
  startInFlight: boolean;
}): boolean {
  if (s.up && s.hasClient) return false;
  if (s.startInFlight) return false;
  if (!s.serverStarting) return true;
  if (!s.up && s.childDead) return true;
  return false;
}

/**
 * True once the child has terminated (normally or by signal).
 *
 * Deliberately NOT `child.killed`: that flag only records that a signal was
 * successfully *sent*, so it is already true while a SIGTERM'd process is still
 * running.
 */
function hasExited(child: ChildProcess): boolean {
  return child.exitCode !== null || child.signalCode !== null;
}

/**
 * Ask the child to stop (SIGTERM) and escalate to SIGKILL if it is still running
 * after `killAfterMs`. No-op for a child that never spawned or already exited.
 * The escalation timer is unref'd so it never keeps the process alive, and it is
 * cancelled as soon as the child exits.
 */
function terminateChild(child: ChildProcess, killAfterMs: number): void {
  if (child.pid === undefined || hasExited(child)) return;
  child.kill('SIGTERM');
  const escalation = setTimeout(() => {
    if (!hasExited(child)) child.kill('SIGKILL');
  }, killAfterMs);
  escalation.unref();
  child.once('exit', () => clearTimeout(escalation));
}

export class OpenCodeServerSupervisor {
  private readonly opencodeBinary: string;
  private readonly log: FastifyBaseLogger;
  private readonly hooks: SupervisorHooks;

  private childProc: ChildProcess | null = null;
  private opencodeClient: OpencodeClient | null = null;
  private serverPort: number | null = null;
  private up = false;
  /** Cached start promise: pending while starting, resolved while the server
   *  lives; cleared on crash and when a start attempt fails. */
  private serverStarting: Promise<void> | null = null;
  /** True from the synchronous head of startServer() until it settles — the
   *  double-spawn guard reads it so a concurrent ensureServer joins instead of
   *  kicking a second spawn. */
  private startInFlight = false;

  // Phase 3 busy-aware health monitor (openchamber lift): consecutive failed
  // probes + the start of an unhealthy-while-busy window feed `decideRestart`.
  private consecutiveHealthFailures = 0;
  private unhealthyBusySince = 0;
  private restarting: Promise<void> | null = null;

  constructor(deps: OpenCodeServerSupervisorDeps) {
    this.opencodeBinary = deps.opencodeBinary;
    this.log = deps.log;
    this.hooks = deps.hooks;
  }

  /** The live opencode HTTP client, or null between (re)starts. */
  get client(): OpencodeClient | null {
    return this.opencodeClient;
  }

  /** The port of the most recently spawned server, or null before the first start. */
  get port(): number | null {
    return this.serverPort;
  }

  /** §2: liveness for the health endpoint + dispatcher fallback decision. */
  health(): 'up' | 'down' {
    return this.up ? 'up' : 'down';
  }

  isUp(): boolean {
    return this.up;
  }

  // ─── lifecycle (spawn once + client + ready; crash-restart) ──────────────────

  /**
   * Lazy: start the single server on first use; re-spawn after a crash.
   * Idempotent within one live server — `serverStarting` caches the start promise
   * and is reset by the crash handler so the NEXT ensureServer re-spawns. A
   * dead-but-not-yet-reaped child (exit handler raced) is also treated as needing
   * a restart. Concurrent callers in a crash window are coalesced via
   * `startInFlight`.
   *
   * @returns A promise that settles with the start attempt it joined or began.
   *   It rejects when that attempt fails; the failure is not cached, so a later
   *   call retries.
   */
  ensureServer(): Promise<void> {
    if (this.up && this.opencodeClient) return Promise.resolve();

    const childDead = this.childProc != null && hasExited(this.childProc);
    const mustStart = shouldStartServer({
      up: this.up,
      hasClient: this.opencodeClient != null,
      serverStarting: this.serverStarting != null,
      childDead,
      startInFlight: this.startInFlight,
    });

    if (mustStart) {
      const start = this.startServer();
      this.serverStarting = start;
      // A rejected start must not stay cached: otherwise every later call would
      // get the same stale rejection and the server would never be retried.
      // Callers still observe the rejection through `start` itself.
      void start.catch(() => {
        if (this.serverStarting === start) this.serverStarting = null;
      });
    }
    return this.serverStarting ?? Promise.resolve();
  }

  /**
   * Spawn `opencode serve` on a port obtained from `freePort()`, wait for its
   * ready line, then publish the HTTP client and mark the server up.
   *
   * @throws If no port could be obtained, the process could not be spawned, it
   *   exited or timed out before becoming ready, or it was detached (dispose)
   *   while starting. A spawned child that never became ready is terminated.
   */
  private async startServer(): Promise<void> {
    // Set synchronously (before the first await) so a concurrent ensureServer sees
    // the in-flight start and joins `serverStarting` instead of double-spawning.
    this.startInFlight = true;
    let started: { child: ChildProcess; port: number } | null = null;
    try {
      const port = await freePort();
      // Phase 1: run unsecured on loopback (opencode's documented default — serve.ts
      // only WARNS when OPENCODE_SERVER_PASSWORD is unset). The real boundary is the
      // 127.0.0.1 bind.
      const child = spawn(
        this.opencodeBinary,
        ['serve', '--hostname', '127.0.0.1', '--port', String(port)],
        {
          stdio: ['ignore', 'pipe', 'pipe'],
          env: { ...process.env },
        },
      );
      started = { child, port };
      this.childProc = child;
      this.serverPort = port;

      // Without a listener, a spawn/kill failure surfaces as an unhandled 'error'
      // event and takes the whole backend down. Start-time failures additionally
      // reject through waitForReady below.
      child.on('error', (err) => {
        this.log.warn({ err, port }, 'opencode-server: child process error');
      });

      // Child lifetime is the backend's (the pool's), NOT a request's. On unexpected
      // exit we recover: settle in-flight turns, mark sessions crashed (the backend's
      // onServerDown), reclaim the port, and reset state so the next ensureServer
      // re-spawns.
      child.on('exit', (code, signal) => {
        // Only react to THIS child's exit (a restart/dispose may have detached it
        // or swapped in a new one).
        if (this.childProc !== child) return;
        this.handleCrash(code, signal, port);
      });

      await waitForReady(child, READY_TIMEOUT_MS);

      // dispose() may have detached (and killed) the child while we were waiting;
      // never publish a client for a server we no longer own.
      if (this.childProc !== child) {
        throw new Error('opencode serve was stopped before it became ready');
      }

      this.opencodeClient = createOpencodeClient({ baseUrl: `http://127.0.0.1:${port}` });
      this.up = true;
      // Health counters describe the current server only.
      this.consecutiveHealthFailures = 0;
      this.unhealthyBusySince = 0;
      this.log.info({ port }, 'opencode-server: ready');
    } catch (err) {
      // A child that is still ours and still running never became ready (timeout
      // or spawn error). Detach it BEFORE terminating so its exit is not reported
      // as a crash, then stop it so it cannot linger as an orphan. A child that
      // already exited was handled by the exit listener above.
      if (started && this.childProc === started.child && !hasExited(started.child)) {
        this.childProc = null;
        terminateChild(started.child, KILL_GRACE_MS);
        reclaimPort(started.port);
      }
      throw err;
    } finally {
      this.startInFlight = false;
    }
  }

  /**
   * Server down (crash-exit or forced restart): reset process/port state, delegate
   * session-level recovery to the backend, and reclaim the port. Ordering is
   * up=false → session cleanup → client/serverStarting reset → reclaimPort; the
   * reset runs even if the hook throws (the error still propagates).
   */
  private handleCrash(code: number | null, signal: NodeJS.Signals | null, port: number): void {
    this.up = false;
    try {
      this.hooks.onServerDown({ code, signal, port });
    } finally {
      this.opencodeClient = null;
      // Force a re-spawn on the next ensureServer — but never drop the promise of
      // a start that is still running: concurrent callers must keep joining it
      // (it is cleared by ensureServer if it ends up rejecting).
      if (!this.startInFlight) this.serverStarting = null;
      // Reclaim the port so a re-spawn on a fixed/leaked port isn't blocked. Best
      // effort; the next start asks freePort() for a port again anyway.
      reclaimPort(port);
    }
  }

  /**
   * Phase 3 proactive health monitor (openchamber `runHealthCheckCycle` lift,
   * busy-aware). Probes the server's health endpoint; on a sustained failure of a
   * NON-busy server, force a restart so the next turn isn't blocked by a wedged
   * process. Busy servers are deferred via the stale-grace in `decideRestart`.
   *
   * No-op when never started, while a start or restart is in flight, or when the
   * child already exited (that case is recovered lazily by ensureServer).
   *
   * @param now Timestamp (ms) used for the busy-window bookkeeping; injectable
   *   for tests.
   */
  async tickHealth(now: number = Date.now()): Promise<void> {
    const child = this.childProc;
    // A server that is still starting has no client yet; probing it would only
    // accumulate bogus failures and could kill a slow-but-healthy start.
    if (!child || this.restarting || this.startInFlight) return;
    // An exited child is recovered lazily by ensureServer; don't double-restart it.
    if (hasExited(child)) return;

    const healthy = await this.probeHealth();

    // The world may have moved while we were probing (crash, restart, dispose):
    // the result then describes a server we no longer supervise.
    if (this.childProc !== child || this.restarting || hasExited(child)) return;

    if (healthy) {
      this.consecutiveHealthFailures = 0;
      this.unhealthyBusySince = 0;
      return;
    }

    this.consecutiveHealthFailures += 1;
    const busy = this.hooks.isBusy();
    const decision = decideRestart({
      processExited: false,
      consecutiveFailures: this.consecutiveHealthFailures,
      busy,
      unhealthyBusySince: this.unhealthyBusySince,
      now,
      failureThreshold: DEFAULT_HEALTH_FAILURE_THRESHOLD,
    });
    // Stamp the start of an unhealthy-while-busy window so the stale-grace can fire.
    if (busy && this.unhealthyBusySince === 0) this.unhealthyBusySince = now;

    if (decision.action === 'restart') {
      this.log.warn(
        { failures: this.consecutiveHealthFailures, busy, reason: decision.reason },
        'opencode-server: health monitor forcing restart',
      );
      this.consecutiveHealthFailures = 0;
      this.unhealthyBusySince = 0;
      await this.restartServer();
    }
  }

  /** One health probe: true only when a client exists and the call reports no error. */
  private async probeHealth(): Promise<boolean> {
    if (!this.opencodeClient) return false;
    try {
      const res = await this.opencodeClient.global.health();
      return !res.error;
    } catch {
      return false;
    }
  }

  /**
   * Force-kill the current server + reclaim its port; the next ensureServer
   * re-spawns (lazy). Mirrors handleCrash's state reset but is initiated by the
   * health monitor rather than the OS. Concurrent calls share one restart.
   */
  private restartServer(): Promise<void> {
    if (this.restarting) return this.restarting;

    const run = async (): Promise<void> => {
      const child = this.childProc;
      const port = this.serverPort;
      this.up = false;
      if (child) {
        // Detach FIRST: the exit listener then ignores this child, so the backend
        // gets exactly one onServerDown, and a server spawned while we wait below
        // is never untracked by this restart.
        this.childProc = null;
        try {
          // Fail in-flight turns + mark sessions crashed via the same path as a crash.
          this.handleCrash(null, null, port ?? 0);
        } finally {
          // A wedged process may ignore SIGTERM; escalate once the wait is over.
          terminateChild(child, RESTART_PORT_WAIT_MS);
        }
      }
      if (port) {
        reclaimPort(port);
        await waitForPortRelease(port, RESTART_PORT_WAIT_MS);
      }
    };

    this.restarting = run().finally(() => {
      this.restarting = null;
    });
    return this.restarting;
  }

  /**
   * Teardown of the child + client: marks the server down, detaches the child
   * (so its exit is not treated as a crash) and terminates it with SIGTERM,
   * escalating to SIGKILL after a grace period. Does not wait for the exit.
   */
  async dispose(): Promise<void> {
    this.up = false;
    const child = this.childProc;
    this.childProc = null;
    this.opencodeClient = null;
    if (child) terminateChild(child, KILL_GRACE_MS);
  }
}

/**
 * Resolve when the child prints the ready line on stdout; reject on timeout,
 * early exit, or a process error (e.g. the binary could not be spawned).
 *
 * stdout is matched across chunk boundaries (the ready line may arrive split),
 * and only the tail of stderr is retained for the error message. All listeners
 * and the timer are removed once the promise settles.
 */
function waitForReady(child: ChildProcess, timeoutMs: number): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    let settled = false;
    let stdoutTail = '';
    let stderrTail = '';

    const settle = (err?: Error): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      child.stdout?.off('data', onStdout);
      child.stderr?.off('data', onStderr);
      child.off('exit', onExit);
      child.off('error', onError);
      if (err) reject(err);
      else resolve();
    };

    const onStdout = (chunk: Buffer | string): void => {
      stdoutTail += chunk.toString();
      if (stdoutTail.includes(READY_MARKER)) {
        settle();
        return;
      }
      // Keep just enough to recognise a marker that straddles two chunks.
      stdoutTail = stdoutTail.slice(-(READY_MARKER.length - 1));
    };

    const onStderr = (chunk: Buffer | string): void => {
      stderrTail = (stderrTail + chunk.toString()).slice(-STDERR_TAIL_CHARS);
    };

    const onExit = (code: number | null): void =>
      settle(new Error(`opencode serve exited before ready (code ${code}); stderr: ${stderrTail}`));

    const onError = (err: Error): void =>
      settle(new Error(`opencode serve failed to start: ${err.message}; stderr: ${stderrTail}`));

    const timer = setTimeout(
      () => settle(new Error(`opencode serve not ready in ${timeoutMs}ms; stderr: ${stderrTail}`)),
      timeoutMs,
    );

    child.stdout?.on('data', onStdout);
    child.stderr?.on('data', onStderr);
    child.on('exit', onExit);
    child.on('error', onError);
  });
}