Multi-agent audit + aggressive cleanup across server/web/coder/booterm, delivered behind a DEFER discipline so none of the in-flight files were touched. Removes dead code/deps/columns, dedups server + coder helpers, and splits the oversized modules (tools.ts, opencode-server.ts, sentinel-summaries, turn.ts, TerminalPane.tsx) behind stable contracts. Adds 78 parity/unit tests (server 587, coder 323); fixes two latent bugs (ChatPane queue keys, FileViewerOverlay blank-line parity). Intended tag: v2.7.12-audit-cleanup.
326 lines
12 KiB
TypeScript
326 lines
12 KiB
TypeScript
/**
 * OpenCodeServerSupervisor — the opencode `serve` child + HTTP client + port +
 * health-counter lifecycle, extracted (v2.7 audit reshape) from the backend
 * god-class. Owns spawn / ready / crash / proactive-health restart / dispose and
 * exposes `client` / `port` / `health()` / `tickHealth()` to the backend.
 *
 * Session-level recovery (failing in-flight turns, marking agent_sessions crashed,
 * tearing down SSE loops) is NOT a process concern — it's delegated back to the
 * backend through the injected `hooks.onServerDown` callback, keeping this module
 * free of the demux map / SQL / turn state.
 *
 * v2.7 concurrency hardening: `ensureServer` is guarded against the crash-window
 * double-spawn (two concurrent callers each re-spawning on different ports) via a
 * synchronous `startInFlight` flag — see `shouldStartServer`.
 */
|
|
import { spawn, type ChildProcess } from 'node:child_process';
|
|
import { createOpencodeClient, type OpencodeClient } from '@opencode-ai/sdk/v2/client';
|
|
import type { FastifyBaseLogger } from 'fastify';
|
|
import { decideRestart, DEFAULT_HEALTH_FAILURE_THRESHOLD } from './lifecycle-decisions.js';
|
|
import { reclaimPort, waitForPortRelease, freePort } from '../net/port-utils.js';
|
|
|
|
/** Upper bound, in milliseconds, on waiting for a freshly spawned server to report ready. */
const READY_TIMEOUT_MS = 30 * 1_000;
|
|
|
|
/** Info handed to the backend when the server goes down (crash or forced restart). */
export interface ServerDownInfo {
  /** Exit code reported by the child's `exit` event; `null` on a forced restart. */
  code: number | null;
  /** Signal reported by the child's `exit` event; `null` on a forced restart. */
  signal: NodeJS.Signals | null;
  /** Port the server was bound to (`0` if a forced restart ran with no port recorded). */
  port: number;
}
|
|
|
|
/** Callbacks the supervisor uses to consult / notify the backend that owns the sessions. */
export interface SupervisorHooks {
  /** True iff ANY pooled session has an in-flight turn (defers a busy restart). */
  isBusy: () => boolean;
  /** Session-level recovery: fail in-flight turns, mark crashed, drop demux state. */
  onServerDown: (info: ServerDownInfo) => void;
}
|
|
|
|
/** Constructor dependencies of `OpenCodeServerSupervisor`. */
export interface OpenCodeServerSupervisorDeps {
  /** Absolute path to the opencode binary (resolved from available_agents). */
  opencodeBinary: string;
  /** Logger used for the ready / forced-restart messages. */
  log: FastifyBaseLogger;
  /** Backend callbacks (busy check + session-level recovery). */
  hooks: SupervisorHooks;
}
|
|
|
|
/**
 * Pure decision for `ensureServer`: should we (re)spawn the server right now?
 *
 * - A live, ready server (`up && hasClient`) → no.
 * - A start already in flight (`startInFlight`) → no, NEVER double-spawn — join the
 *   running start instead. This is checked BEFORE `serverStarting` because the crash
 *   handler can null `serverStarting` mid-start (a crash during `await freePort()`),
 *   and without this guard the `!serverStarting` branch would spawn a second server
 *   on a different port while the first is still coming up.
 * - No start cached/running → yes (fresh start or post-crash re-spawn, since the
 *   crash handler nulls `serverStarting`).
 * - A cached start that already finished, but the child has since died and the crash
 *   handler hasn't reset us yet → yes.
 *
 * @param s Snapshot of the supervisor state: `up` (server marked ready),
 *   `hasClient` (an HTTP client exists), `serverStarting` (a start promise is
 *   cached), `childDead` (the tracked child has exited), `startInFlight` (a start
 *   is currently running).
 * @returns `true` when the caller should kick off a new start.
 */
export function shouldStartServer(s: {
  up: boolean;
  hasClient: boolean;
  serverStarting: boolean;
  childDead: boolean;
  startInFlight: boolean;
}): boolean {
  // Already serving: nothing to do.
  if (s.up && s.hasClient) return false;
  // A start is running: join it, never spawn a second server.
  if (s.startInFlight) return false;
  // Nothing cached: fresh start or post-crash re-spawn.
  if (!s.serverStarting) return true;
  // Cached start finished but the child died before the crash handler reset us.
  if (!s.up && s.childDead) return true;
  return false;
}
|
|
|
|
/**
 * Supervises the single `opencode serve` child process: spawns it lazily, waits
 * for readiness, builds the HTTP client, reacts to crashes, force-restarts a
 * server that keeps failing health probes, and tears everything down on dispose.
 * Session-level recovery is delegated to the backend through `hooks.onServerDown`.
 */
export class OpenCodeServerSupervisor {
  /** Grace period between SIGTERM and the SIGKILL escalation. */
  private static readonly KILL_GRACE_MS = 5_000;

  private readonly opencodeBinary: string;
  private readonly log: FastifyBaseLogger;
  private readonly hooks: SupervisorHooks;

  /** The child currently tracked by the supervisor, or null when none is tracked. */
  private childProc: ChildProcess | null = null;
  private opencodeClient: OpencodeClient | null = null;
  private serverPort: number | null = null;
  private up = false;
  /** Cached start promise; nulled by the crash handler and by a failed start. */
  private serverStarting: Promise<void> | null = null;
  /** True from the synchronous head of startServer() until it settles — the
   *  double-spawn guard reads it so a concurrent ensureServer joins instead of
   *  kicking a second spawn. */
  private startInFlight = false;
  // Phase 3 busy-aware health monitor (openchamber lift): consecutive failed
  // probes + the start of an unhealthy-while-busy window feed `decideRestart`.
  private consecutiveHealthFailures = 0;
  private unhealthyBusySince = 0;
  private restarting: Promise<void> | null = null;

  constructor(deps: OpenCodeServerSupervisorDeps) {
    this.opencodeBinary = deps.opencodeBinary;
    this.log = deps.log;
    this.hooks = deps.hooks;
  }

  /** The live opencode HTTP client, or null between (re)starts. */
  get client(): OpencodeClient | null {
    return this.opencodeClient;
  }

  /** The current server port, or null before the first start. */
  get port(): number | null {
    return this.serverPort;
  }

  /** §2: liveness for the health endpoint + dispatcher fallback decision. */
  health(): 'up' | 'down' {
    return this.up ? 'up' : 'down';
  }

  /** Boolean form of `health()`. */
  isUp(): boolean {
    return this.up;
  }

  // ─── lifecycle (spawn once + client + ready; crash-restart) ──────────────────

  /**
   * Lazy: start the single server on first use; re-spawn after a crash. Idempotent
   * within one live server — `serverStarting` caches the in-flight start, reset to
   * null by the crash handler so the NEXT ensureServer re-spawns. A dead-but-not-
   * yet-reaped child (exit handler raced) is also treated as needing a restart.
   * Concurrent callers in a crash window are coalesced via `startInFlight`.
   *
   * A start that rejects is NOT kept cached: the rejection is delivered to the
   * callers that joined it, and the next ensureServer attempts a fresh start.
   *
   * @returns A promise that settles with the (possibly shared) start attempt.
   */
  ensureServer(): Promise<void> {
    if (this.up && this.opencodeClient) return Promise.resolve();
    const childDead = this.childProc != null && OpenCodeServerSupervisor.hasExited(this.childProc);
    if (
      shouldStartServer({
        up: this.up,
        hasClient: this.opencodeClient != null,
        serverStarting: this.serverStarting != null,
        childDead,
        startInFlight: this.startInFlight,
      })
    ) {
      const start = this.startServer();
      this.serverStarting = start;
      // Without this, a start that failed while the child is still alive (ready
      // timeout) or before any child existed (freePort rejection) would stay cached
      // and every later ensureServer would return the same stale rejection.
      void start.catch(() => {
        if (this.serverStarting === start) this.serverStarting = null;
      });
    }
    return this.serverStarting ?? Promise.resolve();
  }

  /**
   * Spawn the server on a free loopback port, wait for its ready line, then build
   * the client and mark the supervisor up. On any failure the spawned child (if
   * still alive) is terminated so it cannot linger as a half-started orphan.
   */
  private async startServer(): Promise<void> {
    // Set synchronously (before the first await) so a concurrent ensureServer sees
    // the in-flight start and joins `serverStarting` instead of double-spawning.
    this.startInFlight = true;
    let spawned: ChildProcess | null = null;
    try {
      const port = await freePort();

      // Phase 1: run unsecured on loopback (opencode's documented default — serve.ts
      // only WARNS when OPENCODE_SERVER_PASSWORD is unset). The real boundary is the
      // 127.0.0.1 bind.
      const child = spawn(this.opencodeBinary, ['serve', '--hostname', '127.0.0.1', '--port', String(port)], {
        stdio: ['ignore', 'pipe', 'pipe'],
        env: { ...process.env },
      });
      spawned = child;
      this.childProc = child;
      this.serverPort = port;

      // An 'error' event with no listener is thrown by the emitter, which would take
      // the whole host process down on e.g. a missing binary. Log it instead; the
      // start itself fails through waitForReady / the exit handler.
      child.on('error', (err) => {
        this.log.warn({ err, port }, 'opencode-server: child process error');
      });

      // Child lifetime is the backend's (the pool's), NOT a request's. On unexpected
      // exit we recover: settle in-flight turns, mark sessions crashed (the backend's
      // onServerDown), reclaim the port, and reset state so the next ensureServer
      // re-spawns.
      child.on('exit', (code, signal) => {
        // Only react to THIS child's exit (a restart may have swapped in a new one).
        if (this.childProc !== child) return;
        this.handleCrash(code, signal, port);
      });

      await waitForReady(child, READY_TIMEOUT_MS);

      this.opencodeClient = createOpencodeClient({ baseUrl: `http://127.0.0.1:${port}` });
      this.up = true;
      this.log.info({ port }, 'opencode-server: ready');
    } catch (err) {
      // A child that never became ready must not be left running.
      if (spawned) this.terminateChild(spawned);
      throw err;
    } finally {
      this.startInFlight = false;
    }
  }

  /**
   * Server down (crash-exit or forced restart): reset process/port state, delegate
   * session-level recovery to the backend, and reclaim the port. Mirrors the
   * original `handleServerCrash` ordering (up=false → session cleanup → client/
   * serverStarting null → reclaimPort).
   */
  private handleCrash(code: number | null, signal: NodeJS.Signals | null, port: number): void {
    this.up = false;
    this.hooks.onServerDown({ code, signal, port });
    this.opencodeClient = null;
    this.serverStarting = null; // force a re-spawn on the next ensureServer
    // Reclaim the port so a re-spawn on a fixed/leaked port isn't blocked. Best
    // effort; the next start uses a fresh ephemeral port anyway.
    reclaimPort(port);
  }

  /**
   * Phase 3 proactive health monitor (openchamber `runHealthCheckCycle` lift,
   * busy-aware). Probes /global/health; on a sustained failure of a NON-busy server,
   * force a restart so the next turn isn't blocked by a wedged process. Busy servers
   * are deferred via the stale-grace in `decideRestart`. No-op when never started or
   * a restart is already in flight.
   *
   * @param now Timestamp used for the unhealthy-while-busy window (defaults to `Date.now()`).
   */
  async tickHealth(now: number = Date.now()): Promise<void> {
    if (!this.childProc || this.restarting) return;
    // An exited child is recovered lazily by ensureServer; don't double-restart it.
    if (OpenCodeServerSupervisor.hasExited(this.childProc)) return;

    const healthy = await this.probeHealth();
    if (healthy) {
      this.consecutiveHealthFailures = 0;
      this.unhealthyBusySince = 0;
      return;
    }
    this.consecutiveHealthFailures += 1;
    const busy = this.hooks.isBusy();
    const decision = decideRestart({
      processExited: false,
      consecutiveFailures: this.consecutiveHealthFailures,
      busy,
      unhealthyBusySince: this.unhealthyBusySince,
      now,
      failureThreshold: DEFAULT_HEALTH_FAILURE_THRESHOLD,
    });
    // Stamp the start of an unhealthy-while-busy window so the stale-grace can fire.
    if (busy && this.unhealthyBusySince === 0) this.unhealthyBusySince = now;
    if (decision.action === 'restart') {
      this.log.warn(
        { failures: this.consecutiveHealthFailures, busy, reason: decision.reason },
        'opencode-server: health monitor forcing restart',
      );
      this.consecutiveHealthFailures = 0;
      this.unhealthyBusySince = 0;
      await this.restartServer();
    }
  }

  /** One health probe: false when there is no client, the call errors, or it throws. */
  private async probeHealth(): Promise<boolean> {
    if (!this.opencodeClient) return false;
    try {
      const res = await this.opencodeClient.global.health();
      return !res.error;
    } catch {
      return false;
    }
  }

  /**
   * Force-kill the current server + reclaim its port; the next ensureServer
   * re-spawns (lazy). Mirrors handleCrash's state reset but is initiated by the
   * health monitor rather than the OS. Concurrent calls share one restart.
   */
  private async restartServer(): Promise<void> {
    if (this.restarting) return this.restarting;
    this.restarting = (async () => {
      const child = this.childProc;
      const port = this.serverPort;
      this.up = false;
      if (child) {
        // Detach BEFORE any await. handleCrash nulls `serverStarting`, so an
        // ensureServer arriving during the port-release wait may spawn and track a
        // new child; clearing `childProc` afterwards would orphan that child.
        // Detaching also makes the old child's exit handler a no-op, so the
        // backend's onServerDown runs once per restart rather than twice.
        this.childProc = null;
        // Fail in-flight turns + mark sessions crashed via the same path as a crash.
        this.handleCrash(null, null, port ?? 0);
        this.terminateChild(child);
      }
      if (port) {
        reclaimPort(port);
        await waitForPortRelease(port, 3_000);
      }
    })().finally(() => {
      this.restarting = null;
    });
    return this.restarting;
  }

  /** Full teardown of the child + client + port state. */
  async dispose(): Promise<void> {
    this.up = false;
    const child = this.childProc;
    // Detach first so the child's exit handler does not treat this as a crash.
    this.childProc = null;
    this.opencodeClient = null;
    if (child) this.terminateChild(child);
  }

  /**
   * Send SIGTERM and escalate to SIGKILL if the child is still running after
   * `KILL_GRACE_MS`. Liveness is judged by `exitCode`/`signalCode`, not by
   * `child.killed`: `killed` only records that a signal was delivered, so it is
   * already true right after the SIGTERM and can never gate the escalation.
   */
  private terminateChild(child: ChildProcess): void {
    if (OpenCodeServerSupervisor.hasExited(child)) return;
    child.kill('SIGTERM');
    const escalation = setTimeout(() => {
      if (!OpenCodeServerSupervisor.hasExited(child)) child.kill('SIGKILL');
    }, OpenCodeServerSupervisor.KILL_GRACE_MS);
    // Never keep the event loop alive just for the escalation.
    escalation.unref();
    child.once('exit', () => clearTimeout(escalation));
  }

  /** True once the child has reported an exit code or a terminating signal. */
  private static hasExited(child: ChildProcess): boolean {
    return child.exitCode !== null || child.signalCode !== null;
  }
}
|
|
|
|
/**
 * Resolve when the child prints the ready line; reject on timeout, early exit, or
 * a process `error` event (e.g. the binary could not be spawned).
 *
 * The ready marker is searched across chunk boundaries: stdout `data` chunks are
 * not line-aligned, so the marker may arrive split over two chunks. Only the tail
 * of stderr is retained, since just the last 2000 characters are ever reported.
 * All listeners and the timer are removed once the promise settles.
 *
 * @param child The spawned server process (stdout/stderr expected to be piped).
 * @param timeoutMs Maximum time to wait for the ready line.
 * @returns A promise that resolves on readiness and rejects with an `Error` otherwise.
 */
function waitForReady(child: ChildProcess, timeoutMs: number): Promise<void> {
  const readyMarker = 'opencode server listening on';
  const stderrTailChars = 2000;

  return new Promise<void>((resolve, reject) => {
    let done = false;
    // Trailing stdout text that could still be the beginning of the marker.
    let stdoutCarry = '';
    let stderrBuf = '';

    const finish = (err?: Error): void => {
      if (done) return;
      done = true;
      clearTimeout(timer);
      child.stdout?.off('data', onOut);
      child.stderr?.off('data', onErr);
      child.off('exit', onExit);
      child.off('error', onError);
      if (err) reject(err);
      else resolve();
    };

    const onOut = (buf: Buffer): void => {
      const text = stdoutCarry + buf.toString();
      if (text.includes(readyMarker)) {
        finish();
        return;
      }
      // Keep just enough of the tail to complete a marker split across chunks.
      stdoutCarry = text.slice(-(readyMarker.length - 1));
    };
    const onErr = (buf: Buffer): void => {
      stderrBuf = (stderrBuf + buf.toString()).slice(-stderrTailChars);
    };
    const onExit = (code: number | null): void =>
      finish(new Error(`opencode serve exited before ready (code ${code}); stderr: ${stderrBuf.slice(-2000)}`));
    // Without an 'error' listener a spawn failure is thrown by the emitter instead
    // of failing this wait.
    const onError = (err: Error): void =>
      finish(new Error(`opencode serve failed to start: ${err.message}; stderr: ${stderrBuf.slice(-2000)}`));
    const timer = setTimeout(
      () => finish(new Error(`opencode serve not ready in ${timeoutMs}ms; stderr: ${stderrBuf.slice(-2000)}`)),
      timeoutMs,
    );

    child.stdout?.on('data', onOut);
    child.stderr?.on('data', onErr);
    child.on('exit', onExit);
    child.on('error', onError);
  });
}
|