feat(coder): v2.6 Phase 3 — lifecycle hardening (idle evict, crash recovery, worktree reaper)
Idle TTL eviction per (chat,agent) + LRU cap (never a busy backend); pure lifecycle-decisions.ts (TDD). Crash recovery lifts openchamber's health-monitor + busy-aware-restart + stale-grace state machine into opencode-server.ts (+ port reclaim) and warm-acp.ts; opencode crash -> fresh sessions, ACP -> re-session/new. F.1 turn-guard + U.6 usage preserved (their tests pass). Orphan worktree reaper (1h grace, superset-style dirty/unpushed preflight, Paseo soft-delete) + close hooks + diff re-baseline after apply_pending. 35 new tests + DB-opt-in reconnect test; 215 coder tests pass; tsc + build clean. Completes v2.6. Follow-ups out of scope: apps/server close-hook caller, 3.7 DiffPanel staging hint, live smokes. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
170
apps/coder/src/services/orphan-worktree-reaper.ts
Normal file
170
apps/coder/src/services/orphan-worktree-reaper.ts
Normal file
@@ -0,0 +1,170 @@
|
||||
/**
 * v2.6 Phase 3 (3.4) — orphan worktree reaper.
 *
 * Reclaims on-disk session worktree dirs under WORKTREE_BASE that have NO live
 * (`status='active'`) row in the `worktrees` table — leaks from a crash between
 * `git worktree add` and the DB insert, a missed chat-close hook, or a manual
 * removal of the DB row. Extends the periodic-sweeper pattern (apps/server's
 * truncation + stale-streaming reaper).
 *
 * SAFETY (Paseo worktree-archive cascade + superset destroy-saga lift): before
 * removing ANY dir, run `checkWorktreeWorkAtRisk` — a dirty / unpushed / unmerged
 * worktree is SKIPPED (logged), never force-removed. The pure orphan-target
 * selection (which dirs are candidates) lives in
 * `backends/lifecycle-decisions.ts:selectOrphanWorktreeTargets` and is unit-tested;
 * this module does the DB read + fs stat + git preflight + removal side-effects.
 *
 * The mtime grace (default 1h) means a dir mid-`ensureSessionWorktree` (created on
 * disk, row not yet committed) is never swept — the grace window covers the gap.
 */
|
||||
import { readdir, stat } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { Sql } from '../db.js';
|
||||
import { WORKTREE_BASE, checkWorktreeWorkAtRisk } from './worktrees.js';
|
||||
import { hostExec } from './host-exec.js';
|
||||
import {
|
||||
selectOrphanWorktreeTargets,
|
||||
DEFAULT_ORPHAN_WORKTREE_GRACE_MS,
|
||||
} from './backends/lifecycle-decisions.js';
|
||||
|
||||
/** Dependencies consumed by `createOrphanWorktreeReaper`. */
export interface OrphanWorktreeReaperDeps {
  /** SQL client used to read the `worktrees` rows whose status is `'active'`. */
  sql: Sql;
  /** Logger that receives the reaper's start, pass-summary, and warning entries. */
  log: FastifyBaseLogger;
  /** Delay between reaper passes in milliseconds (handed to `setInterval`). */
  intervalMs: number;
  /**
   * Grace window in milliseconds forwarded to each pass. When omitted,
   * `DEFAULT_ORPHAN_WORKTREE_GRACE_MS` is used.
   */
  graceMs?: number;
}
|
||||
|
||||
/** Summary of a single `reapOrphanWorktrees` pass. */
export interface OrphanReaperResult {
  /** Number of `sess-*` directories found on disk during the pass. */
  scanned: number;
  /** Number of directories selected as orphan candidates. */
  candidates: number;
  /** Paths of the candidates that were removed successfully. */
  reaped: string[];
  /** Paths of the candidates kept because the preflight reported work at risk. */
  skippedAtRisk: string[];
}
|
||||
|
||||
/**
 * Single-pass reap: select orphan candidates, preflight at-risk, remove the safe.
 *
 * Steps:
 *  1. List `sess-*` directories directly under `WORKTREE_BASE` with their mtime.
 *  2. Load the paths of all `worktrees` rows with `status = 'active'`.
 *  3. Let `selectOrphanWorktreeTargets` pick the candidates from (1), (2),
 *     `now`, and `graceMs`.
 *  4. For each candidate run `checkWorktreeWorkAtRisk`; at-risk dirs are logged
 *     and kept, the rest are handed to `removeOrphanDir`.
 *
 * @param sql     SQL client used for the live-row query.
 * @param log     Logger for warnings and the pass summary.
 * @param graceMs Grace window forwarded to the candidate selection.
 * @param now     Reference timestamp (ms) forwarded to the candidate selection;
 *                defaults to the current time.
 * @returns Counts and path lists describing what the pass saw and did. When
 *          `WORKTREE_BASE` cannot be listed, an all-empty result is returned.
 */
export async function reapOrphanWorktrees(
  sql: Sql,
  log: FastifyBaseLogger,
  graceMs: number = DEFAULT_ORPHAN_WORKTREE_GRACE_MS,
  now: number = Date.now(),
): Promise<OrphanReaperResult> {
  // Enumerate on-disk session worktree dirs (`sess-*`). Per-task worktrees
  // (arena/new_task/MCP) are cleaned up inline by the one-shot path, so we only
  // own the persistent session dirs the warm paths leave behind.
  let dirents: string[];
  try {
    dirents = await readdir(WORKTREE_BASE);
  } catch (err: unknown) {
    // A missing base dir simply means there is nothing to reap. Any other
    // failure (permissions, not a directory, I/O) is still non-fatal, but is
    // surfaced so a misconfigured base does not silently disable the reaper.
    const code =
      typeof err === 'object' && err !== null && 'code' in err
        ? (err as { code?: unknown }).code
        : undefined;
    if (code !== 'ENOENT') {
      log.warn(
        { base: WORKTREE_BASE, err: err instanceof Error ? err.message : String(err) },
        'orphan-reaper: cannot read worktree base',
      );
    }
    return { scanned: 0, candidates: 0, reaped: [], skippedAtRisk: [] };
  }

  const onDisk: { path: string; mtimeMs: number }[] = [];
  for (const name of dirents) {
    if (!name.startsWith('sess-')) continue; // only persistent session worktrees
    const path = join(WORKTREE_BASE, name);
    try {
      const s = await stat(path);
      if (!s.isDirectory()) continue;
      onDisk.push({ path, mtimeMs: s.mtimeMs });
    } catch {
      // vanished between readdir and stat — skip
    }
  }

  // Live worktree paths from the DB (active rows only — archived/removed rows are
  // not "live", so their leftover dirs are reapable orphans).
  const liveRows = await sql<{ path: string }[]>`
    SELECT path FROM worktrees WHERE status = 'active'
  `;
  const live = new Set(liveRows.map((r) => r.path));

  const candidates = selectOrphanWorktreeTargets(onDisk, live, now, graceMs);
  const reaped: string[] = [];
  const skippedAtRisk: string[] = [];

  for (const path of candidates) {
    // Preflight: never reap work at risk. A git error forces atRisk=true (fail
    // closed), so a half-broken worktree is kept, not silently destroyed.
    const risk = await checkWorktreeWorkAtRisk(path);
    if (risk.atRisk) {
      skippedAtRisk.push(path);
      log.warn({ path, dirty: risk.dirty, unmerged: risk.unmerged, error: risk.error }, 'orphan-reaper: skipping at-risk orphan worktree');
      continue;
    }
    const removed = await removeOrphanDir(path);
    if (removed) {
      reaped.push(path);
    } else {
      // Leave a trace for dirs that could not be removed; the path is not
      // counted as reaped.
      log.warn({ path }, 'orphan-reaper: failed to remove orphan worktree');
    }
  }

  // Only log a summary when the pass reaped or skipped something.
  if (reaped.length > 0 || skippedAtRisk.length > 0) {
    log.info({ scanned: onDisk.length, candidates: candidates.length, reaped, skippedAtRisk }, 'orphan-reaper: pass complete');
  }
  return { scanned: onDisk.length, candidates: candidates.length, reaped, skippedAtRisk };
}
|
||||
|
||||
/**
 * Remove a single orphan worktree dir.
 *
 * Resolves the owning repo through the git common-dir, asks git to
 * `worktree remove --force` the path, deletes the directory with `rm -rf` as a
 * backstop, and finally runs `worktree prune` in the owning repo. Prune runs
 * AFTER the backstop on purpose: git only prunes admin entries whose working
 * directory is gone, so pruning while the dir still exists (because
 * `worktree remove` failed) would leave stale worktree metadata behind.
 *
 * Best-effort: every step is independently fault-tolerant so a partial state
 * (dir present, git untracked) still gets reclaimed.
 *
 * @param path Absolute path of the orphan worktree directory.
 * @returns `true` when the backstop `rm -rf` exited with code 0, else `false`.
 */
async function removeOrphanDir(path: string): Promise<boolean> {
  // Find the owning repo (the common git dir's parent). When the dir isn't a valid
  // worktree anymore, this fails and we fall back to a plain rm.
  const common = await hostExec(
    `git -C ${shellEscape(path)} rev-parse --path-format=absolute --git-common-dir`,
    { timeoutMs: 10_000 },
  ).catch(() => null);
  const commonDir = common && common.exitCode === 0 ? common.stdout.trim() : '';

  // The repo worktree root is the parent of the .git common dir (strip a trailing
  // `/.git` or `/.git/`). If nothing was stripped, the common dir has an
  // unexpected layout and the git steps are skipped.
  const repoRoot = commonDir.replace(/\/\.git\/?$/, '');
  const hasOwningRepo = repoRoot !== '' && repoRoot !== commonDir;

  if (hasOwningRepo) {
    await hostExec(
      `git -C ${shellEscape(repoRoot)} worktree remove ${shellEscape(path)} --force`,
      { timeoutMs: 15_000 },
    ).catch(() => {});
  }

  // Backstop: ensure the dir is gone even if the git remove no-op'd.
  const rm = await hostExec(`rm -rf ${shellEscape(path)}`, { timeoutMs: 15_000 }).catch(() => null);

  if (hasOwningRepo) {
    // Now that the directory is gone, prune can drop any leftover admin entry.
    await hostExec(
      `git -C ${shellEscape(repoRoot)} worktree prune`,
      { timeoutMs: 10_000 },
    ).catch(() => {});
  }

  return rm != null && rm.exitCode === 0;
}
|
||||
|
||||
/**
 * Minimal single-quote shell escape (mirrors worktrees.ts).
 *
 * Wraps `s` in single quotes and rewrites every embedded single quote as
 * `'\''` (close quote, escaped quote, reopen quote), so the result is a single
 * shell word.
 *
 * @param s Raw string to embed in a shell command line.
 * @returns The single-quoted form of `s`; an empty input yields `''`.
 */
function shellEscape(s: string): string {
  return "'" + s.replace(/'/g, "'\\''") + "'";
}
|
||||
|
||||
/**
 * Periodic orphan-worktree reaper, started/stopped by the bootstrap. Unref'd.
 *
 * `start()` arms a `setInterval` that runs `reapOrphanWorktrees` every
 * `deps.intervalMs` milliseconds; calling it while a timer is already armed is a
 * no-op. A tick that fires while the previous pass is still running is skipped.
 * A rejected pass is logged as a warning and never propagates. `stop()` clears
 * the timer; it does not wait for a pass that is already running.
 *
 * @param deps SQL client, logger, interval, and optional grace window.
 * @returns An object with `start` and `stop` controls.
 */
export function createOrphanWorktreeReaper(deps: OrphanWorktreeReaperDeps): { start(): void; stop(): void } {
  const { sql, log, intervalMs } = deps;
  const graceMs = deps.graceMs ?? DEFAULT_ORPHAN_WORKTREE_GRACE_MS;
  let timer: ReturnType<typeof setInterval> | null = null;
  let running = false;

  /** Run one pass, log any failure, and always release the overlap guard. */
  const runPass = async (): Promise<void> => {
    try {
      await reapOrphanWorktrees(sql, log, graceMs);
    } catch (err: unknown) {
      log.warn({ err: err instanceof Error ? err.message : String(err) }, 'orphan-reaper: pass error');
    } finally {
      running = false;
    }
  };

  return {
    start() {
      if (timer) return;
      timer = setInterval(() => {
        if (running) return; // a slow pass must not overlap the next tick
        running = true;
        void runPass();
      }, intervalMs);
      // Do not let the reaper timer keep the process alive on its own.
      timer.unref?.();
      log.info({ intervalMs, graceMs }, 'orphan-reaper: started');
    },
    stop() {
      if (timer) {
        clearInterval(timer);
        timer = null;
      }
    },
  };
}
|
||||
Reference in New Issue
Block a user