/**
 * write-edit-robustness #4 — worktree checkpoints.
 *
 * External agents (opencode / goose / qwen / claude) write DIRECTLY into the
 * shared session worktree (`/tmp/booworktrees/sess-*`); BooCode's own `rewind`
 * only reverses `pending_changes` against the project root, so it has zero coverage
 * there. A checkpoint is a pre-turn shadow-commit of the worktree tree (tracked +
 * untracked) captured WITHOUT touching the real index/working tree, stored in a
 * private GC-safe ref. `restoreCheckpoint` rewinds the worktree to that commit,
 * trims the transcript from the anchor message forward, and resets the agent
 * backend so the next turn re-establishes a fresh context consistent with the
 * restored files.
 *
 * All git goes through hostExec + shellEscape (BooCoder runs on the host; the
 * worktrees live on the host fs). Checkpoint CREATION is best-effort: a failure
 * logs and returns null — it must NEVER throw into the dispatch turn.
 */
import { randomUUID } from 'node:crypto';
import type { FastifyBaseLogger } from 'fastify';
import type { Sql } from '../db.js';
import { hostExec } from './host-exec.js';
import { agentPool, OPENCODE_POOL_KEY } from './agent-pool.js';
import type { AgentSessionHandle } from './agent-backend.js';

/** Minimal shell escape for paths/refs (single-quote wrapping). Mirrors worktrees.ts. */
function shellEscape(s: string): string {
  return "'" + s.replace(/'/g, "'\\''") + "'";
}

/**
 * Pure builder for the shadow-commit command. Captures tracked + untracked files
 * in the worktree into a temp index (so the real index/working tree is untouched),
 * writes a tree, commits it parented on HEAD, and parks the commit under a private
 * ref `refs/boocode/checkpoints/<id>` so git's GC never reclaims it. Prints ONLY
 * the resulting SHA on stdout (the trailing `printf '%s'`), so the caller parses
 * stdout.trim() directly.
 *
 * The temp index is removed by an EXIT trap registered right after `mktemp`, so it
 * is cleaned up when any later step of the `&&` chain fails — not only on success.
 *
 * `id` is the row UUID (minted before the ref so the ref name matches the row).
 * Both the worktree path and the id are shell-escaped.
 *
 * @param worktreePath Path of the worktree to snapshot.
 * @param id Checkpoint id; becomes the last segment of the private ref name.
 * @returns A single shell command line; it performs no I/O itself.
 */
export function buildShadowCommitCommand(worktreePath: string, id: string): string {
  const wt = shellEscape(worktreePath);
  const ref = shellEscape(`refs/boocode/checkpoints/${id}`);
  return (
    // The trap must come before the first git call: every later step can fail and
    // short-circuit the chain, and the temp index must not be left behind.
    `cd ${wt} && TMP=$(mktemp) && trap 'rm -f "$TMP"' EXIT ` +
    `&& GIT_INDEX_FILE="$TMP" git read-tree HEAD ` +
    `&& GIT_INDEX_FILE="$TMP" git add -A ` +
    `&& TREE=$(GIT_INDEX_FILE="$TMP" git write-tree) ` +
    `&& SHA=$(git commit-tree "$TREE" -p HEAD -m "boocode checkpoint") ` +
    `&& git update-ref ${ref} "$SHA" && printf '%s' "$SHA"`
  );
}

/** Inputs for `createCheckpoint`; the nullable fields are stored on the row as given. */
export interface CreateCheckpointArgs {
  chatId: string;
  sessionId: string | null;
  worktreeId: string | null;
  /** Worktree directory the shadow commit is taken from. */
  worktreePath: string;
  /** Anchor message: a restore trims the transcript from this message forward. */
  messageId: string | null;
  label?: string | null;
}

/**
 * Capture a pre-turn checkpoint of the session worktree. Best-effort: returns the
 * inserted row's { id, commit_sha } on success, or null on any failure (the turn
 * proceeds either way — a missing checkpoint just means no restore point for that
 * turn). NEVER throws.
 *
 * The id is minted up front so the git ref name (`refs/boocode/checkpoints/<id>`)
 * matches the DB row id, keeping ref and row in lockstep.
*/
export async function createCheckpoint(
  sql: Sql,
  args: CreateCheckpointArgs,
  opts?: { signal?: AbortSignal; log?: FastifyBaseLogger },
): Promise<{ id: string; commit_sha: string } | null> {
  const id = randomUUID();
  // Set once the shadow commit has parked its ref, so the failure path knows
  // whether there is a ref to roll back.
  let refParked = false;
  try {
    const cmd = buildShadowCommitCommand(args.worktreePath, id);
    const res = await hostExec(cmd, { signal: opts?.signal, timeoutMs: 30_000 });
    if (res.exitCode !== 0) {
      opts?.log?.warn(
        { chatId: args.chatId, worktreePath: args.worktreePath, stderr: res.stderr.trim().slice(0, 500) },
        'checkpoint: shadow-commit failed (turn proceeds without a checkpoint)',
      );
      return null;
    }
    const commitSha = res.stdout.trim();
    if (!commitSha) {
      opts?.log?.warn(
        { chatId: args.chatId, worktreePath: args.worktreePath },
        'checkpoint: shadow-commit produced no SHA (turn proceeds)',
      );
      return null;
    }
    refParked = true;
    await sql`
      INSERT INTO checkpoints (id, chat_id, session_id, worktree_id, message_id, commit_sha, label)
      VALUES (${id}, ${args.chatId}, ${args.sessionId}, ${args.worktreeId}, ${args.messageId}, ${commitSha}, ${args.label ?? null})
    `;
    opts?.log?.info({ checkpointId: id, chatId: args.chatId, commitSha }, 'checkpoint: created');
    return { id, commit_sha: commitSha };
  } catch (err) {
    opts?.log?.warn(
      { chatId: args.chatId, err: describeError(err) },
      'checkpoint: create threw (turn proceeds without a checkpoint)',
    );
    // The ref exists but its row was never written: without a rollback the ref
    // would pin the shadow commit forever with nothing in the DB pointing at it.
    if (refParked) {
      await dropCheckpointRef(args.worktreePath, id, opts?.log);
    }
    return null;
  }
}

/** Render an unknown thrown value as a short string for structured logs. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Best-effort removal of a checkpoint's private ref. Never throws: a failure is
 * logged and otherwise ignored.
 */
async function dropCheckpointRef(
  worktreePath: string,
  id: string,
  log?: FastifyBaseLogger,
): Promise<void> {
  const ref = shellEscape(`refs/boocode/checkpoints/${id}`);
  try {
    // Deliberately NOT tied to the turn's abort signal: the rollback should still
    // run when the failure that brought us here was the abort itself.
    const res = await hostExec(
      `git -C ${shellEscape(worktreePath)} update-ref -d ${ref}`,
      { signal: undefined, timeoutMs: 30_000 },
    );
    if (res.exitCode !== 0) {
      log?.warn(
        { checkpointId: id, worktreePath, stderr: res.stderr.trim().slice(0, 500) },
        'checkpoint: orphaned ref cleanup did not succeed',
      );
    }
  } catch (err) {
    log?.warn({ checkpointId: id, err: describeError(err) }, 'checkpoint: orphaned ref cleanup threw');
  }
}

/** Error the route maps to a 404 when the checkpoint can't be resolved / scoped. */
export class CheckpointNotFoundError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'CheckpointNotFoundError';
  }
}

/** Outcome of `restoreCheckpoint`; each flag reports one independent step. */
export interface RestoreCheckpointResult {
  checkpoint_id: string;
  /** Number of transcript messages removed (0 when the checkpoint has no anchor message). */
  messages_deleted: number;
  /** True only when both `git reset --hard` and `git clean -fd` exited 0. */
  worktree_reset: boolean;
  /** True when the pool teardown ran to completion without throwing. */
  backend_reset: boolean;
}

export interface RestoreCheckpointOpts {
  signal?: AbortSignal;
  log?: FastifyBaseLogger;
  /** If set, the checkpoint MUST belong to this session (route scope guard). */
  sessionId?: string;
}

/** Columns read from the `checkpoints` table when resolving a checkpoint. */
interface CheckpointRow {
  id: string;
  chat_id: string;
  session_id: string | null;
  worktree_id: string | null;
  message_id: string | null;
  commit_sha: string;
  created_at: Date;
}

/**
 * Restore a checkpoint: rewind its worktree to the shadow commit, trim the
 * transcript from the anchor message forward, reset the backend session, and drop
 * now-orphaned later checkpoints.
 *
 * Only the lookup and the scope guard are fatal. The worktree reset, the backend
 * reset and the pruning of later checkpoints are best-effort: a failure is logged
 * and reported through the result flags rather than thrown. The transcript trim is
 * not wrapped, so a failure of that query propagates to the caller.
 *
 * @param sql Database client.
 * @param checkpointId Id of the checkpoint row to restore.
 * @param opts Optional abort signal, logger and session scope.
 * @returns What was actually reset/deleted.
 * @throws CheckpointNotFoundError when the checkpoint is missing or not in the
 *   requested session (route → 404).
 */
export async function restoreCheckpoint(
  sql: Sql,
  checkpointId: string,
  opts?: RestoreCheckpointOpts,
): Promise<RestoreCheckpointResult> {
  const log = opts?.log;

  // 1. Resolve the checkpoint.
  const [cp] = await sql<CheckpointRow[]>`
    SELECT id, chat_id, session_id, worktree_id, message_id, commit_sha, created_at
    FROM checkpoints
    WHERE id = ${checkpointId}
  `;
  if (!cp) {
    throw new CheckpointNotFoundError('checkpoint not found');
  }

  // Authorization scope (fail-safe): the checkpoint's chat must belong to the
  // requested session. cp.session_id is a denormalized hint that may be null, so
  // gating on it directly fails open — resolve the owning session via chats
  // (authoritative; chat_id is NOT NULL) and deny on any mismatch or missing row.
  // Gate on "provided", not truthiness: an empty-string sessionId must be denied
  // by the comparison below instead of silently skipping the guard.
  if (opts?.sessionId != null) {
    const [owner] = await sql<{ session_id: string | null }[]>`
      SELECT session_id FROM chats WHERE id = ${cp.chat_id}
    `;
    if (!owner || owner.session_id !== opts.sessionId) {
      throw new CheckpointNotFoundError('checkpoint not in session');
    }
  }

  // 2. Resolve the worktree path (by worktree_id, else the session's active one).
  let worktreePath: string | null = null;
  if (cp.worktree_id) {
    const [wt] = await sql<{ path: string }[]>`
      SELECT path FROM worktrees WHERE id = ${cp.worktree_id}
    `;
    worktreePath = wt?.path ?? null;
  }
  if (!worktreePath) {
    const sid = cp.session_id ?? opts?.sessionId ?? null;
    if (sid) {
      const [wt] = await sql<{ path: string }[]>`
        SELECT path FROM worktrees WHERE session_id = ${sid} AND status = 'active' LIMIT 1
      `;
      worktreePath = wt?.path ?? null;
    }
  }

  // 3. Worktree reset — hard-reset to the shadow commit, then clean untracked.
  let worktreeReset = false;
  if (worktreePath) {
    const resetRes = await hostExec(
      `git -C ${shellEscape(worktreePath)} reset --hard ${shellEscape(cp.commit_sha)}`,
      { signal: opts?.signal, timeoutMs: 30_000 },
    ).catch((err: unknown) => {
      log?.warn({ checkpointId, err: describeError(err) }, 'checkpoint restore: reset --hard threw');
      return null;
    });
    if (resetRes && resetRes.exitCode === 0) {
      const cleanRes = await hostExec(
        `git -C ${shellEscape(worktreePath)} clean -fd`,
        { signal: opts?.signal, timeoutMs: 30_000 },
      ).catch((err: unknown) => {
        log?.warn({ checkpointId, err: describeError(err) }, 'checkpoint restore: clean -fd threw');
        return null;
      });
      worktreeReset = cleanRes != null && cleanRes.exitCode === 0;
      if (!worktreeReset) {
        log?.warn({ checkpointId, worktreePath }, 'checkpoint restore: clean -fd did not succeed');
      }
    } else {
      log?.warn(
        { checkpointId, worktreePath, stderr: resetRes?.stderr?.trim()?.slice(0, 500) },
        'checkpoint restore: reset --hard did not succeed',
      );
    }
  } else {
    log?.warn({ checkpointId }, 'checkpoint restore: no worktree path resolved (files not reset)');
  }

  // 4. Trim the transcript from the anchor message forward. message_parts FK to
  // messages is ON DELETE CASCADE (apps/server schema.sql:49), so parts are
  // removed with their messages — no explicit parts delete needed.
  let messagesDeleted = 0;
  if (cp.message_id) {
    const deleted = await sql<{ id: string }[]>`
      DELETE FROM messages
      WHERE chat_id = ${cp.chat_id}
        AND created_at >= (SELECT created_at FROM messages WHERE id = ${cp.message_id})
      RETURNING id
    `;
    messagesDeleted = deleted.length;
  }

  // 5. Backend reset — mark the chat's agent sessions crashed so the next turn
  // re-establishes a fresh backend, and evict the live pool session(s) for this
  // (chat, agent). Warm backends hold context server-side with no partial
  // rewind, so a full reset is the only consistent option (proposal §4).
  const agentRows = await sql<{
    agent: string;
    backend: string;
    agent_session_id: string | null;
    session_id: string | null;
    worktree_id: string | null;
  }[]>`
    SELECT agent, backend, agent_session_id, session_id, worktree_id
    FROM agent_sessions
    WHERE chat_id = ${cp.chat_id}
  `;
  await sql`
    UPDATE agent_sessions SET status = 'crashed' WHERE chat_id = ${cp.chat_id}
  `.catch((err: unknown) => {
    // Still best-effort, but no longer silent: a failure here means stale rows
    // keep their previous status.
    log?.warn(
      { checkpointId, err: describeError(err) },
      'checkpoint restore: marking agent sessions crashed failed',
    );
  });

  let backendReset = false;
  try {
    // opencode runs on the SHARED server (keyed on a sentinel, not the chat) — close
    // just this chat's session(s) on it, mirroring the lifecycle close-hook.
    const ocBackend = agentPool.peek(OPENCODE_POOL_KEY, 'opencode');
    if (ocBackend) {
      for (const row of agentRows) {
        if (row.backend !== 'opencode_server' || !row.agent_session_id) continue;
        const handle: AgentSessionHandle = {
          sessionId: row.session_id ?? '',
          agent: row.agent,
          backend: 'opencode_server',
          chatId: cp.chat_id,
          worktreeId: row.worktree_id ?? '',
          agentSessionId: row.agent_session_id,
          serverPort: null,
        };
        await ocBackend.closeSession(handle).catch((err: unknown) => {
          log?.warn(
            { checkpointId, err: describeError(err) },
            'checkpoint restore: opencode closeSession threw',
          );
        });
      }
    }
    // Warm-ACP backends are pooled under the chat id — dispose them (kills the
    // goose/qwen child). closeChat skips busy backends (a live turn isn't torn down).
    const disposed = await agentPool.closeChat(cp.chat_id);
    backendReset = true;
    log?.info({ checkpointId, chatId: cp.chat_id, disposed }, 'checkpoint restore: backend reset');
  } catch (err) {
    log?.warn({ checkpointId, err: describeError(err) }, 'checkpoint restore: backend reset threw');
  }

  // 6. Drop now-orphaned later checkpoints for this chat (their anchor messages were
  // just trimmed). Compare `created_at` SERVER-SIDE via a subquery (NOT the JS
  // Date round-trip, which truncates the stored microsecond precision to ms and
  // would make this checkpoint delete ITSELF), and exclude this checkpoint's own
  // id so it always survives — letting the user re-restore to it.
  // NOTE(review): only the rows are removed here; the matching
  // refs/boocode/checkpoints/<id> refs are left in place — confirm whether they
  // are pruned elsewhere.
  await sql`
    DELETE FROM checkpoints
    WHERE chat_id = ${cp.chat_id}
      AND id <> ${cp.id}
      AND created_at > (SELECT created_at FROM checkpoints WHERE id = ${cp.id})
  `.catch((err: unknown) => {
    log?.warn(
      { checkpointId, err: describeError(err) },
      'checkpoint restore: pruning later checkpoints failed',
    );
  });

  return {
    checkpoint_id: checkpointId,
    messages_deleted: messagesDeleted,
    worktree_reset: worktreeReset,
    backend_reset: backendReset,
  };
}