refactor: codebase audit cleanup — dead code, dedup, module splits
Multi-agent audit + aggressive cleanup across server/web/coder/booterm, delivered behind a DEFER discipline so none of the in-flight files were touched. Removes dead code/deps/columns, dedups server + coder helpers, and splits the oversized modules (tools.ts, opencode-server.ts, sentinel-summaries, turn.ts, TerminalPane.tsx) behind stable contracts. Adds 78 parity/unit tests (server 587, coder 323); fixes two latent bugs (ChatPane queue keys, FileViewerOverlay blank-line parity). Intended tag: v2.7.12-audit-cleanup.
This commit is contained in:
325
apps/coder/src/services/backends/opencode-server-process.ts
Normal file
325
apps/coder/src/services/backends/opencode-server-process.ts
Normal file
@@ -0,0 +1,325 @@
|
||||
/**
|
||||
* OpenCodeServerSupervisor — the opencode `serve` child + HTTP client + port +
|
||||
* health-counter lifecycle, extracted (v2.7 audit reshape) from the backend
|
||||
* god-class. Owns spawn / ready / crash / proactive-health restart / dispose and
|
||||
* exposes `client` / `port` / `health()` / `tickHealth()` to the backend.
|
||||
*
|
||||
* Session-level recovery (failing in-flight turns, marking agent_sessions crashed,
|
||||
* tearing down SSE loops) is NOT a process concern — it's delegated back to the
|
||||
* backend through the injected `hooks.onServerDown` callback, keeping this module
|
||||
* free of the demux map / SQL / turn state.
|
||||
*
|
||||
* v2.7 concurrency hardening: `ensureServer` is guarded against the crash-window
|
||||
* double-spawn (two concurrent callers each re-spawning on different ports) via a
|
||||
* synchronous `startInFlight` flag — see `shouldStartServer`.
|
||||
*/
|
||||
import { spawn, type ChildProcess } from 'node:child_process';
|
||||
import { createOpencodeClient, type OpencodeClient } from '@opencode-ai/sdk/v2/client';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import { decideRestart, DEFAULT_HEALTH_FAILURE_THRESHOLD } from './lifecycle-decisions.js';
|
||||
import { reclaimPort, waitForPortRelease, freePort } from '../net/port-utils.js';
|
||||
|
||||
// Upper bound (ms) that startServer() passes to waitForReady() while waiting for the spawned child to become ready.
const READY_TIMEOUT_MS = 30_000;
|
||||
|
||||
/**
 * Payload delivered to `SupervisorHooks.onServerDown` whenever the supervised
 * server goes away — either because the child exited on its own or because the
 * supervisor forced a restart.
 */
export interface ServerDownInfo {
  /** Exit code reported by the child's `exit` event; `null` on a forced restart. */
  code: number | null;
  /** Terminating signal reported by the child's `exit` event; `null` on a forced restart. */
  signal: NodeJS.Signals | null;
  /** Port the downed server had been assigned. */
  port: number;
}
|
||||
|
||||
/**
 * Callbacks the backend injects so the supervisor can stay free of session
 * state: it asks the backend whether work is in flight, and tells it when the
 * server has gone down.
 */
export interface SupervisorHooks {
  /**
   * Reports whether ANY pooled session currently has an in-flight turn. The
   * health monitor feeds this into its restart decision so a busy server's
   * restart can be deferred.
   */
  isBusy: () => boolean;
  /**
   * Session-level recovery, run when the server goes down: fail in-flight
   * turns, mark sessions crashed, drop demux state.
   */
  onServerDown: (info: ServerDownInfo) => void;
}
|
||||
|
||||
/** Constructor dependencies for `OpenCodeServerSupervisor`. */
export interface OpenCodeServerSupervisorDeps {
  /** Absolute path of the opencode executable (resolved from available_agents). */
  opencodeBinary: string;
  /** Logger used for readiness and health-restart messages. */
  log: FastifyBaseLogger;
  /** Backend callbacks for busy detection and session-level recovery. */
  hooks: SupervisorHooks;
}
|
||||
|
||||
/**
 * Side-effect-free predicate behind `ensureServer`: given a snapshot of the
 * supervisor's state, decide whether a (re)spawn must be kicked off now.
 *
 * Decision order:
 *  1. Server is up and has a client → nothing to do.
 *  2. A start is currently running (`startInFlight`) → never spawn a second
 *     one; the caller joins the running start. This is evaluated before the
 *     `serverStarting` check because the crash handler may null
 *     `serverStarting` while a start is still awaiting, which would otherwise
 *     look like "no start cached" and trigger a double-spawn on another port.
 *  3. No cached start at all → spawn (first use, or re-spawn after the crash
 *     handler cleared the cache).
 *  4. A cached start exists, but the server is not up and its child is already
 *     dead (the crash handler has not reset state yet) → spawn.
 *  5. Anything else → keep the cached start.
 *
 * @param s Snapshot of the supervisor flags at the time of the call.
 * @returns `true` when the caller should start a new server process.
 */
export function shouldStartServer(s: {
  up: boolean;
  hasClient: boolean;
  serverStarting: boolean;
  childDead: boolean;
  startInFlight: boolean;
}): boolean {
  const alreadyServing = s.up && s.hasClient;
  if (alreadyServing || s.startInFlight) {
    return false;
  }
  const nothingCached = !s.serverStarting;
  const cachedStartIsStale = !s.up && s.childDead;
  return nothingCached || cachedStartIsStale;
}
|
||||
|
||||
/**
 * Owns the single `opencode serve` child process together with its HTTP client,
 * port and health counters: spawn, readiness, crash recovery, health-driven
 * restart and disposal.
 *
 * Session-level recovery is not done here; whenever the server goes down the
 * injected `hooks.onServerDown` callback is invoked instead.
 */
export class OpenCodeServerSupervisor {
  private readonly opencodeBinary: string;
  private readonly log: FastifyBaseLogger;
  private readonly hooks: SupervisorHooks;

  /** Child currently owned by the supervisor; null before the first start, after a forced restart, and after dispose. */
  private childProc: ChildProcess | null = null;
  private opencodeClient: OpencodeClient | null = null;
  private serverPort: number | null = null;
  private up = false;
  /** Cached promise of the latest start; nulled by `handleCrash` so the next `ensureServer` re-spawns. */
  private serverStarting: Promise<void> | null = null;
  /**
   * True from the synchronous head of startServer() until it settles — the
   * double-spawn guard reads it so a concurrent ensureServer joins instead of
   * kicking a second spawn.
   */
  private startInFlight = false;
  // Busy-aware health monitor state: consecutive failed probes and the start of
  // an unhealthy-while-busy window, both fed into `decideRestart`.
  private consecutiveHealthFailures = 0;
  private unhealthyBusySince = 0;
  /** In-flight forced restart, if any; coalesces concurrent restart requests. */
  private restarting: Promise<void> | null = null;

  constructor(deps: OpenCodeServerSupervisorDeps) {
    this.opencodeBinary = deps.opencodeBinary;
    this.log = deps.log;
    this.hooks = deps.hooks;
  }

  /** The live opencode HTTP client, or null between (re)starts. */
  get client(): OpencodeClient | null {
    return this.opencodeClient;
  }

  /** The current server port, or null before the first start. */
  get port(): number | null {
    return this.serverPort;
  }

  /** Liveness as a string, for the health endpoint + dispatcher fallback decision. */
  health(): 'up' | 'down' {
    return this.up ? 'up' : 'down';
  }

  /** Liveness as a boolean. */
  isUp(): boolean {
    return this.up;
  }

  // ─── lifecycle (spawn once + client + ready; crash-restart) ──────────────────

  /**
   * Lazily start the server on first use and re-spawn it after a crash.
   *
   * Idempotent within one live server: `serverStarting` caches the start and is
   * reset by the crash handler so the NEXT call re-spawns. A child that has
   * already exited but whose exit handler has not run yet also counts as
   * needing a restart. Concurrent callers are coalesced via `startInFlight`
   * (see `shouldStartServer`).
   *
   * @returns The promise of the current (or newly kicked) start; it rejects
   *   when that start fails.
   */
  ensureServer(): Promise<void> {
    if (this.up && this.opencodeClient) return Promise.resolve();
    const childDead = this.childProc != null && OpenCodeServerSupervisor.hasExited(this.childProc);
    if (
      shouldStartServer({
        up: this.up,
        hasClient: this.opencodeClient != null,
        serverStarting: this.serverStarting != null,
        childDead,
        startInFlight: this.startInFlight,
      })
    ) {
      this.serverStarting = this.startServer();
    }
    return this.serverStarting ?? Promise.resolve();
  }

  /**
   * Spawn `opencode serve` on a free loopback port, wait for readiness and
   * create the HTTP client.
   *
   * @throws Whatever `freePort`, `spawn` or `waitForReady` reject/throw with.
   *   If the child is still alive when the start fails (e.g. ready timeout) it
   *   is sent SIGTERM so its exit runs the normal crash path and the next
   *   `ensureServer` can re-spawn instead of returning the cached failure.
   */
  private async startServer(): Promise<void> {
    // Set synchronously (before the first await) so a concurrent ensureServer sees
    // the in-flight start and joins `serverStarting` instead of double-spawning.
    this.startInFlight = true;
    let child: ChildProcess | null = null;
    try {
      const port = await freePort();

      // Runs unsecured on loopback; the real boundary is the 127.0.0.1 bind.
      const spawned = spawn(this.opencodeBinary, ['serve', '--hostname', '127.0.0.1', '--port', String(port)], {
        stdio: ['ignore', 'pipe', 'pipe'],
        env: { ...process.env },
      });
      child = spawned;
      this.childProc = spawned;
      this.serverPort = port;

      // A ChildProcess reports spawn failures (ENOENT, EACCES, …) through an
      // 'error' event; without a listener Node throws it as an uncaught
      // exception and takes the whole process down.
      spawned.on('error', (err) => {
        this.log.error({ err, port }, 'opencode-server: child process error');
      });

      // Child lifetime is the backend's, NOT a request's. On unexpected exit we
      // recover via handleCrash so the next ensureServer re-spawns.
      spawned.on('exit', (code, signal) => {
        // Only react to THIS child's exit (a restart may have swapped in a new one).
        if (this.childProc !== spawned) return;
        this.handleCrash(code, signal, port);
      });

      await waitForReady(spawned, READY_TIMEOUT_MS);

      this.opencodeClient = createOpencodeClient({ baseUrl: `http://127.0.0.1:${port}` });
      this.up = true;
      this.log.info({ port }, 'opencode-server: ready');
    } catch (err) {
      // A child that never became ready is useless: nothing is listening for
      // its ready line any more. Terminate it so its exit resets our state.
      if (child && !OpenCodeServerSupervisor.hasExited(child)) {
        this.log.warn({ err }, 'opencode-server: start failed; terminating child');
        child.kill('SIGTERM');
      }
      throw err;
    } finally {
      this.startInFlight = false;
    }
  }

  /**
   * Server down (crash-exit or forced restart): reset process/port state,
   * delegate session-level recovery to the backend, and reclaim the port.
   * Ordering is deliberate: up=false → onServerDown → client/serverStarting
   * null → reclaimPort.
   */
  private handleCrash(code: number | null, signal: NodeJS.Signals | null, port: number): void {
    this.up = false;
    this.hooks.onServerDown({ code, signal, port });
    this.opencodeClient = null;
    this.serverStarting = null; // force a re-spawn on the next ensureServer
    // Best effort; the next start asks freePort() for a port anyway.
    reclaimPort(port);
  }

  /**
   * Proactive, busy-aware health monitor tick. Probes the server's health
   * endpoint; when `decideRestart` answers 'restart' the server is force-
   * restarted so the next turn isn't blocked by a wedged process. No-op when
   * there is no child, the child has already exited (ensureServer recovers that
   * lazily), or a restart is already in flight.
   *
   * NOTE(review): while a start is still waiting for readiness there is no
   * client yet, so the probe counts as a failure — confirm that is intended.
   *
   * @param now Current time in ms; injectable for tests.
   */
  async tickHealth(now: number = Date.now()): Promise<void> {
    if (!this.childProc || this.restarting) return;
    // An exited child is recovered lazily by ensureServer; don't double-restart it.
    if (OpenCodeServerSupervisor.hasExited(this.childProc)) return;

    const healthy = await this.probeHealth();
    if (healthy) {
      this.consecutiveHealthFailures = 0;
      this.unhealthyBusySince = 0;
      return;
    }
    this.consecutiveHealthFailures += 1;
    const busy = this.hooks.isBusy();
    const decision = decideRestart({
      processExited: false,
      consecutiveFailures: this.consecutiveHealthFailures,
      busy,
      unhealthyBusySince: this.unhealthyBusySince,
      now,
      failureThreshold: DEFAULT_HEALTH_FAILURE_THRESHOLD,
    });
    // Stamp the start of an unhealthy-while-busy window so the stale-grace can fire.
    if (busy && this.unhealthyBusySince === 0) this.unhealthyBusySince = now;
    if (decision.action === 'restart') {
      this.log.warn(
        { failures: this.consecutiveHealthFailures, busy, reason: decision.reason },
        'opencode-server: health monitor forcing restart',
      );
      this.consecutiveHealthFailures = 0;
      this.unhealthyBusySince = 0;
      await this.restartServer();
    }
  }

  /** One health probe; false when there is no client, the call errors, or it throws. */
  private async probeHealth(): Promise<boolean> {
    if (!this.opencodeClient) return false;
    try {
      const res = await this.opencodeClient.global.health();
      return !res.error;
    } catch {
      return false;
    }
  }

  /**
   * Force-kill the current server and reclaim its port; the next ensureServer
   * re-spawns (lazy). Runs the same state reset as a crash, but initiated by
   * the health monitor rather than the OS. Concurrent calls share one restart.
   */
  private async restartServer(): Promise<void> {
    if (this.restarting) return this.restarting;
    this.restarting = (async () => {
      const child = this.childProc;
      const port = this.serverPort;
      // Detach BEFORE killing. The exit listener only reacts while
      // `childProc === child`, so this keeps the kill below from running
      // handleCrash (and onServerDown) a second time, and it guarantees we never
      // overwrite a new child that a concurrent ensureServer spawns while we
      // wait for the port to be released.
      this.childProc = null;
      this.up = false;
      // Fail in-flight turns + mark sessions crashed via the same path as a crash.
      if (child) {
        this.handleCrash(null, null, port ?? 0);
        if (!OpenCodeServerSupervisor.hasExited(child)) child.kill('SIGTERM');
      }
      if (port) {
        reclaimPort(port);
        await waitForPortRelease(port, 3_000);
      }
    })().finally(() => {
      this.restarting = null;
    });
    return this.restarting;
  }

  /**
   * Tear down the child and client. Sends SIGTERM and, if the child has still
   * not exited after 5 s, SIGKILL. The child is detached first so its exit is
   * not treated as a crash.
   */
  async dispose(): Promise<void> {
    this.up = false;
    const child = this.childProc;
    this.childProc = null;
    this.opencodeClient = null;
    if (child && !OpenCodeServerSupervisor.hasExited(child)) {
      child.kill('SIGTERM');
      // `child.killed` only says a signal was SENT, so it is already true here;
      // the escalation has to look at the real exit state instead.
      const escalation = setTimeout(() => {
        if (!OpenCodeServerSupervisor.hasExited(child)) child.kill('SIGKILL');
      }, 5_000);
      escalation.unref();
      child.once('exit', () => clearTimeout(escalation));
    }
  }

  /** True once the child has an exit code or a terminating signal recorded. */
  private static hasExited(child: ChildProcess): boolean {
    return child.exitCode !== null || child.signalCode !== null;
  }
}
|
||||
|
||||
/**
 * Wait for a freshly spawned `opencode serve` child to announce readiness.
 *
 * Resolves once the child's stdout has printed the ready marker. The marker is
 * matched across chunk boundaries, since a pipe may deliver one line in several
 * `data` events.
 *
 * Rejects when, before the marker is seen:
 *  - the child emits `error` (e.g. the binary could not be spawned) — the
 *    original error is passed through;
 *  - the child exits — the message carries the exit code and stderr tail;
 *  - `timeoutMs` elapses — the message carries the stderr tail.
 *
 * Every listener added here and the timer are removed when the promise
 * settles, so later `error` events are not observed by this function.
 *
 * @param child The spawned process; its stdout/stderr are read if piped.
 * @param timeoutMs Maximum time to wait for the ready marker.
 */
function waitForReady(child: ChildProcess, timeoutMs: number): Promise<void> {
  const READY_MARKER = 'opencode server listening on';
  const STDERR_TAIL_CHARS = 2000;

  return new Promise((resolve, reject) => {
    let done = false;
    let stdoutTail = '';
    let stderrTail = '';

    const finish = (err?: Error): void => {
      if (done) return;
      done = true;
      clearTimeout(timer);
      child.stdout?.off('data', onOut);
      child.stderr?.off('data', onErr);
      child.off('exit', onExit);
      child.off('error', onError);
      if (err) reject(err);
      else resolve();
    };

    const onOut = (buf: Buffer): void => {
      stdoutTail += buf.toString();
      if (stdoutTail.includes(READY_MARKER)) {
        finish();
        return;
      }
      // Keep just enough to complete a marker that straddles the next chunk.
      stdoutTail = stdoutTail.slice(-(READY_MARKER.length - 1));
    };
    const onErr = (buf: Buffer): void => {
      // Only the tail is ever reported, so don't retain more than that.
      stderrTail = (stderrTail + buf.toString()).slice(-STDERR_TAIL_CHARS);
    };
    const onExit = (code: number | null): void =>
      finish(new Error(`opencode serve exited before ready (code ${code}); stderr: ${stderrTail.slice(-2000)}`));
    const onError = (err: Error): void => finish(err);
    const timer = setTimeout(
      () => finish(new Error(`opencode serve not ready in ${timeoutMs}ms; stderr: ${stderrTail.slice(-2000)}`)),
      timeoutMs,
    );

    child.stdout?.on('data', onOut);
    child.stderr?.on('data', onErr);
    child.on('exit', onExit);
    child.on('error', onError);
  });
}
|
||||
Reference in New Issue
Block a user