Files
boocode/apps/coder/src/services/backends/opencode-server-process.ts
indifferentketchup abe1a311d0 refactor: codebase audit cleanup — dead code, dedup, module splits
Multi-agent audit + aggressive cleanup across server/web/coder/booterm,
delivered behind a DEFER discipline so none of the in-flight files were
touched. Removes dead code/deps/columns, dedups server + coder helpers,
and splits the oversized modules (tools.ts, opencode-server.ts,
sentinel-summaries, turn.ts, TerminalPane.tsx) behind stable contracts.
Adds 78 parity/unit tests (server 587, coder 323); fixes two latent bugs
(ChatPane queue keys, FileViewerOverlay blank-line parity).

Intended tag: v2.7.12-audit-cleanup.
2026-06-02 21:12:29 +00:00

326 lines
12 KiB
TypeScript

/**
* OpenCodeServerSupervisor — the opencode `serve` child + HTTP client + port +
* health-counter lifecycle, extracted (v2.7 audit reshape) from the backend
* god-class. Owns spawn / ready / crash / proactive-health restart / dispose and
* exposes `client` / `port` / `health()` / `tickHealth()` to the backend.
*
* Session-level recovery (failing in-flight turns, marking agent_sessions crashed,
* tearing down SSE loops) is NOT a process concern — it's delegated back to the
* backend through the injected `hooks.onServerDown` callback, keeping this module
* free of the demux map / SQL / turn state.
*
* v2.7 concurrency hardening: `ensureServer` is guarded against the crash-window
* double-spawn (two concurrent callers each re-spawning on different ports) via a
* synchronous `startInFlight` flag — see `shouldStartServer`.
*/
import { spawn, type ChildProcess } from 'node:child_process';
import { createOpencodeClient, type OpencodeClient } from '@opencode-ai/sdk/v2/client';
import type { FastifyBaseLogger } from 'fastify';
import { decideRestart, DEFAULT_HEALTH_FAILURE_THRESHOLD } from './lifecycle-decisions.js';
import { reclaimPort, waitForPortRelease, freePort } from '../net/port-utils.js';
const READY_TIMEOUT_MS = 30_000;
/** Info handed to the backend when the server goes down (crash or forced restart). */
export interface ServerDownInfo {
code: number | null;
signal: NodeJS.Signals | null;
port: number;
}
export interface SupervisorHooks {
/** True iff ANY pooled session has an in-flight turn (defers a busy restart). */
isBusy: () => boolean;
/** Session-level recovery: fail in-flight turns, mark crashed, drop demux state. */
onServerDown: (info: ServerDownInfo) => void;
}
export interface OpenCodeServerSupervisorDeps {
/** Absolute path to the opencode binary (resolved from available_agents). */
opencodeBinary: string;
log: FastifyBaseLogger;
hooks: SupervisorHooks;
}
/**
* Pure decision for `ensureServer`: should we (re)spawn the server right now?
*
* - A live, ready server (`up && client`) → no.
* - A start already in flight (`startInFlight`) → no, NEVER double-spawn — join the
* running start instead. This is checked BEFORE `serverStarting` because the crash
* handler can null `serverStarting` mid-start (a crash during `await freePort()`),
* and without this guard the `!serverStarting` branch would spawn a second server
* on a different port while the first is still coming up.
* - No start cached/running → yes (fresh start or post-crash re-spawn, since the
* crash handler nulls `serverStarting`).
* - A cached start that already finished, but the child has since died and the crash
* handler hasn't reset us yet → yes.
*/
export function shouldStartServer(s: {
up: boolean;
hasClient: boolean;
serverStarting: boolean;
childDead: boolean;
startInFlight: boolean;
}): boolean {
if (s.up && s.hasClient) return false;
if (s.startInFlight) return false;
if (!s.serverStarting) return true;
if (!s.up && s.childDead) return true;
return false;
}
export class OpenCodeServerSupervisor {
private readonly opencodeBinary: string;
private readonly log: FastifyBaseLogger;
private readonly hooks: SupervisorHooks;
private childProc: ChildProcess | null = null;
private opencodeClient: OpencodeClient | null = null;
private serverPort: number | null = null;
private up = false;
private serverStarting: Promise<void> | null = null;
/** True from the synchronous head of startServer() until it settles — the
* double-spawn guard reads it so a concurrent ensureServer joins instead of
* kicking a second spawn. */
private startInFlight = false;
// Phase 3 busy-aware health monitor (openchamber lift): consecutive failed
// probes + the start of an unhealthy-while-busy window feed `decideRestart`.
private consecutiveHealthFailures = 0;
private unhealthyBusySince = 0;
private restarting: Promise<void> | null = null;
constructor(deps: OpenCodeServerSupervisorDeps) {
this.opencodeBinary = deps.opencodeBinary;
this.log = deps.log;
this.hooks = deps.hooks;
}
/** The live opencode HTTP client, or null between (re)starts. */
get client(): OpencodeClient | null {
return this.opencodeClient;
}
/** The current server port, or null before the first start. */
get port(): number | null {
return this.serverPort;
}
/** §2: liveness for the health endpoint + dispatcher fallback decision. */
health(): 'up' | 'down' {
return this.up ? 'up' : 'down';
}
isUp(): boolean {
return this.up;
}
// ─── lifecycle (spawn once + client + ready; crash-restart) ──────────────────
/**
* Lazy: start the single server on first use; re-spawn after a crash. Idempotent
* within one live server — `serverStarting` caches the in-flight start, reset to
* null by the crash handler so the NEXT ensureServer re-spawns. A dead-but-not-
* yet-reaped child (exit handler raced) is also treated as needing a restart.
* Concurrent callers in a crash window are coalesced via `startInFlight`.
*/
ensureServer(): Promise<void> {
if (this.up && this.opencodeClient) return Promise.resolve();
const childDead =
this.childProc != null && (this.childProc.exitCode !== null || this.childProc.signalCode !== null);
if (
shouldStartServer({
up: this.up,
hasClient: this.opencodeClient != null,
serverStarting: this.serverStarting != null,
childDead,
startInFlight: this.startInFlight,
})
) {
this.serverStarting = this.startServer();
}
return this.serverStarting ?? Promise.resolve();
}
private async startServer(): Promise<void> {
// Set synchronously (before the first await) so a concurrent ensureServer sees
// the in-flight start and joins `serverStarting` instead of double-spawning.
this.startInFlight = true;
try {
const port = await freePort();
// Phase 1: run unsecured on loopback (opencode's documented default — serve.ts
// only WARNS when OPENCODE_SERVER_PASSWORD is unset). The real boundary is the
// 127.0.0.1 bind.
const child = spawn(this.opencodeBinary, ['serve', '--hostname', '127.0.0.1', '--port', String(port)], {
stdio: ['ignore', 'pipe', 'pipe'],
env: { ...process.env },
});
this.childProc = child;
this.serverPort = port;
// Child lifetime is the backend's (the pool's), NOT a request's. On unexpected
// exit we recover: settle in-flight turns, mark sessions crashed (the backend's
// onServerDown), reclaim the port, and reset state so the next ensureServer
// re-spawns.
child.on('exit', (code, signal) => {
// Only react to THIS child's exit (a restart may have swapped in a new one).
if (this.childProc !== child) return;
this.handleCrash(code, signal, port);
});
await waitForReady(child, READY_TIMEOUT_MS);
this.opencodeClient = createOpencodeClient({ baseUrl: `http://127.0.0.1:${port}` });
this.up = true;
this.log.info({ port }, 'opencode-server: ready');
} finally {
this.startInFlight = false;
}
}
/**
* Server down (crash-exit or forced restart): reset process/port state, delegate
* session-level recovery to the backend, and reclaim the port. Mirrors the
* original `handleServerCrash` ordering (up=false → session cleanup → client/
* serverStarting null → reclaimPort).
*/
private handleCrash(code: number | null, signal: NodeJS.Signals | null, port: number): void {
this.up = false;
this.hooks.onServerDown({ code, signal, port });
this.opencodeClient = null;
this.serverStarting = null; // force a re-spawn on the next ensureServer
// Reclaim the port so a re-spawn on a fixed/leaked port isn't blocked. Best
// effort; the next start uses a fresh ephemeral port anyway.
reclaimPort(port);
}
/**
* Phase 3 proactive health monitor (openchamber `runHealthCheckCycle` lift,
* busy-aware). Probes /global/health; on a sustained failure of a NON-busy server,
* force a restart so the next turn isn't blocked by a wedged process. Busy servers
* are deferred via the stale-grace in `decideRestart`. No-op when never started or
* a restart is already in flight.
*/
async tickHealth(now: number = Date.now()): Promise<void> {
if (!this.childProc || this.restarting) return;
const childExited = this.childProc.exitCode !== null || this.childProc.signalCode !== null;
// An exited child is recovered lazily by ensureServer; don't double-restart it.
if (childExited) return;
const healthy = await this.probeHealth();
if (healthy) {
this.consecutiveHealthFailures = 0;
this.unhealthyBusySince = 0;
return;
}
this.consecutiveHealthFailures += 1;
const busy = this.hooks.isBusy();
const decision = decideRestart({
processExited: false,
consecutiveFailures: this.consecutiveHealthFailures,
busy,
unhealthyBusySince: this.unhealthyBusySince,
now,
failureThreshold: DEFAULT_HEALTH_FAILURE_THRESHOLD,
});
// Stamp the start of an unhealthy-while-busy window so the stale-grace can fire.
if (busy && this.unhealthyBusySince === 0) this.unhealthyBusySince = now;
if (decision.action === 'restart') {
this.log.warn(
{ failures: this.consecutiveHealthFailures, busy, reason: decision.reason },
'opencode-server: health monitor forcing restart',
);
this.consecutiveHealthFailures = 0;
this.unhealthyBusySince = 0;
await this.restartServer();
}
}
private async probeHealth(): Promise<boolean> {
if (!this.opencodeClient) return false;
try {
const res = await this.opencodeClient.global.health();
return !res.error;
} catch {
return false;
}
}
/** Force-kill the current server + reclaim its port; the next ensureServer
* re-spawns (lazy). Mirrors handleCrash's state reset but is initiated by the
* health monitor rather than the OS. */
private async restartServer(): Promise<void> {
if (this.restarting) return this.restarting;
this.restarting = (async () => {
const child = this.childProc;
const port = this.serverPort;
this.up = false;
// Fail in-flight turns + mark sessions crashed via the same path as a crash.
if (child) {
this.handleCrash(null, null, port ?? 0);
if (!child.killed) child.kill('SIGTERM');
}
if (port) {
reclaimPort(port);
await waitForPortRelease(port, 3_000);
}
this.childProc = null;
})().finally(() => {
this.restarting = null;
});
return this.restarting;
}
/** Full teardown of the child + client + port state. */
async dispose(): Promise<void> {
this.up = false;
const child = this.childProc;
this.childProc = null;
this.opencodeClient = null;
if (child && !child.killed) {
child.kill('SIGTERM');
const t = setTimeout(() => {
if (!child.killed) child.kill('SIGKILL');
}, 5_000);
t.unref();
}
}
}
/** Resolve when the child prints the ready line; reject on timeout or early exit. */
function waitForReady(child: ChildProcess, timeoutMs: number): Promise<void> {
return new Promise((resolve, reject) => {
let done = false;
let stderrBuf = '';
const finish = (err?: Error) => {
if (done) return;
done = true;
clearTimeout(timer);
child.stdout?.off('data', onOut);
child.stderr?.off('data', onErr);
child.off('exit', onExit);
if (err) reject(err);
else resolve();
};
const onOut = (buf: Buffer) => {
if (buf.toString().includes('opencode server listening on')) finish();
};
const onErr = (buf: Buffer) => {
stderrBuf += buf.toString();
};
const onExit = (code: number | null) =>
finish(new Error(`opencode serve exited before ready (code ${code}); stderr: ${stderrBuf.slice(-2000)}`));
const timer = setTimeout(
() => finish(new Error(`opencode serve not ready in ${timeoutMs}ms; stderr: ${stderrBuf.slice(-2000)}`)),
timeoutMs,
);
child.stdout?.on('data', onOut);
child.stderr?.on('data', onErr);
child.on('exit', onExit);
});
}