Flagged by the automated push security review on v2.7.1. - GET /checkpoints?chat_id= : the chat_id branch filtered by chat_id alone (any session's chat_id read its checkpoints). Now joins chats and gates on chats.session_id. - restoreCheckpoint scope guard was fail-open: `cp.session_id && cp.session_id !== sessionId` fell through on a null denormalized session_id, allowing a cross-session restore (worktree reset + transcript trim). Now resolves the owning session via the checkpoint's chat and denies on missing/mismatch. - Adds a DB-integration regression for the null-session_id cross-session case. Both scope authoritatively through chats.session_id (checkpoints.session_id is a nullable hint). Coder suite 234 passing; 7/7 checkpoint tests (incl. the regression) against live postgres+git; typecheck clean. Hotfix on v2.7.1. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
307 lines
12 KiB
TypeScript
307 lines
12 KiB
TypeScript
/**
 * write-edit-robustness #4 — worktree checkpoints.
 *
 * External agents (opencode / goose / qwen / claude) write DIRECTLY into the
 * shared session worktree (`/tmp/booworktrees/sess-<id>`); BooCode's own `rewind`
 * only reverses `pending_changes` against the project root, so it has zero coverage
 * there. A checkpoint is a pre-turn shadow-commit of the worktree tree (tracked +
 * untracked) captured WITHOUT touching the real index/working tree, stored in a
 * private GC-safe ref. `restoreCheckpoint` rewinds the worktree to that commit,
 * trims the transcript from the anchor message forward, and resets the agent
 * backend so the next turn re-establishes a fresh context consistent with the
 * restored files.
 *
 * All git goes through hostExec + shellEscape (BooCoder runs on the host; the
 * worktrees live on the host fs). Checkpoint CREATION is best-effort: a failure
 * logs and returns null — it must NEVER throw into the dispatch turn.
 */
|
|
import { randomUUID } from 'node:crypto';
|
|
import type { FastifyBaseLogger } from 'fastify';
|
|
import type { Sql } from '../db.js';
|
|
import { hostExec } from './host-exec.js';
|
|
import { agentPool, OPENCODE_POOL_KEY } from './agent-pool.js';
|
|
import type { AgentSessionHandle } from './agent-backend.js';
|
|
|
|
/**
 * Minimal shell escape for paths/refs. Mirrors worktrees.ts.
 *
 * The value is wrapped in single quotes; every embedded single quote is
 * rewritten as `'\''` (close the quote, emit an escaped quote, reopen).
 *
 * @param s Raw string to embed in a shell command line.
 * @returns The single-quoted form of `s`.
 */
function shellEscape(s: string): string {
  const quoted = s.split("'").join("'\\''");
  return `'${quoted}'`;
}
|
|
|
|
/**
 * Pure builder for the shadow-commit command. Captures tracked + untracked files
 * in the worktree into a temp index (so the real index/working tree is untouched),
 * writes a tree, commits it parented on HEAD, and parks the commit under a private
 * ref `refs/boocode/checkpoints/<id>` so git's GC never reclaims it. Prints ONLY
 * the resulting SHA on stdout (the trailing `printf '%s'`), so the caller parses
 * stdout.trim() directly.
 *
 * The temp index is removed on EVERY path once it has been created: the git
 * steps run inside a `{ ...; }` group whose status is saved in `RC`, then the
 * file is deleted unconditionally, and the SHA is printed only when `RC` is 0.
 * When a git step fails nothing is written to stdout and the command exits
 * non-zero.
 *
 * `id` is the row UUID (minted before the ref so the ref name matches the row).
 * Both the worktree path and the ref name are shell-escaped.
 *
 * @param worktreePath Directory the command `cd`s into before running git.
 * @param id Checkpoint id used as the last component of the private ref.
 * @returns A single shell command line.
 */
export function buildShadowCommitCommand(worktreePath: string, id: string): string {
  const wt = shellEscape(worktreePath);
  const ref = shellEscape(`refs/boocode/checkpoints/${id}`);
  // Env prefix that points a single git invocation at the scratch index.
  const tmpIndex = 'GIT_INDEX_FILE="$TMP"';
  const capture = [
    `${tmpIndex} git read-tree HEAD`,
    `${tmpIndex} git add -A`,
    `TREE=$(${tmpIndex} git write-tree)`,
    'SHA=$(git commit-tree "$TREE" -p HEAD -m "boocode checkpoint")',
    `git update-ref ${ref} "$SHA"`,
  ].join(' && ');
  return (
    `cd ${wt} && TMP=$(mktemp) && ` +
    // RC holds the status of the whole capture chain; the rm runs regardless.
    `{ ${capture}; RC=$?; rm -f "$TMP"; [ "$RC" -eq 0 ] && printf '%s' "$SHA"; }`
  );
}
|
|
|
|
/** Inputs for `createCheckpoint`; each field maps onto a `checkpoints` column except `worktreePath`. */
export interface CreateCheckpointArgs {
  /** Chat the checkpoint belongs to (stored as `chat_id`). */
  chatId: string;
  /** Session stored alongside the row (`session_id`); may be null. */
  sessionId: string | null;
  /** Worktree row id stored as `worktree_id`; may be null. */
  worktreeId: string | null;
  /** Directory in which the shadow-commit command is run. */
  worktreePath: string;
  /** Message stored as `message_id`; restore trims the transcript from it forward. May be null. */
  messageId: string | null;
  /** Optional label; stored as null when omitted. */
  label?: string | null;
}
|
|
|
|
/**
 * Capture a pre-turn checkpoint of the session worktree. Best-effort: returns the
 * inserted row's { id, commit_sha } on success, or null on any failure (the turn
 * proceeds either way — a missing checkpoint just means no restore point for that
 * turn). NEVER throws.
 *
 * The id is minted up front so the git ref name (`refs/boocode/checkpoints/<id>`)
 * matches the DB row id, keeping ref and row in lockstep. To uphold that
 * invariant, when the shadow commit succeeded but no row was recorded (the
 * INSERT threw, or no SHA came back) the private ref is deleted again on a
 * best-effort basis so it does not linger without a row.
 *
 * @param sql Database handle used for the INSERT.
 * @param args What to checkpoint and which row values to store.
 * @param opts Optional abort signal for the git command and a logger.
 * @returns `{ id, commit_sha }` of the new row, or null when no checkpoint was recorded.
 */
export async function createCheckpoint(
  sql: Sql,
  args: CreateCheckpointArgs,
  opts?: { signal?: AbortSignal; log?: FastifyBaseLogger },
): Promise<{ id: string; commit_sha: string } | null> {
  const id = randomUUID();
  // True while the ref has been created by git but no row references it yet.
  let refOrphaned = false;
  try {
    const cmd = buildShadowCommitCommand(args.worktreePath, id);
    const res = await hostExec(cmd, { signal: opts?.signal, timeoutMs: 30_000 });
    if (res.exitCode !== 0) {
      opts?.log?.warn(
        { chatId: args.chatId, worktreePath: args.worktreePath, stderr: res.stderr.trim().slice(0, 500) },
        'checkpoint: shadow-commit failed (turn proceeds without a checkpoint)',
      );
      return null;
    }
    // Exit 0 means the command ran through `git update-ref`, so the ref exists.
    refOrphaned = true;

    const commitSha = res.stdout.trim();
    if (!commitSha) {
      opts?.log?.warn(
        { chatId: args.chatId, worktreePath: args.worktreePath },
        'checkpoint: shadow-commit produced no SHA (turn proceeds)',
      );
      return null;
    }

    await sql`
      INSERT INTO checkpoints (id, chat_id, session_id, worktree_id, message_id, commit_sha, label)
      VALUES (${id}, ${args.chatId}, ${args.sessionId}, ${args.worktreeId}, ${args.messageId}, ${commitSha}, ${args.label ?? null})
    `;
    // The row now owns the ref; anything that fails after this must not drop it.
    refOrphaned = false;
    opts?.log?.info({ checkpointId: id, chatId: args.chatId, commitSha }, 'checkpoint: created');
    return { id, commit_sha: commitSha };
  } catch (err) {
    opts?.log?.warn(
      { chatId: args.chatId, err: err instanceof Error ? err.message : String(err) },
      'checkpoint: create threw (turn proceeds without a checkpoint)',
    );
    return null;
  } finally {
    if (refOrphaned) {
      await deleteOrphanedCheckpointRef(args.worktreePath, id, opts?.log);
    }
  }
}

/** Best-effort removal of `refs/boocode/checkpoints/<id>`; logs on failure and never throws. */
async function deleteOrphanedCheckpointRef(
  worktreePath: string,
  id: string,
  log?: FastifyBaseLogger,
): Promise<void> {
  try {
    const ref = shellEscape(`refs/boocode/checkpoints/${id}`);
    // Deliberately no abort signal: cleanup should still run when the turn was aborted.
    const res = await hostExec(`git -C ${shellEscape(worktreePath)} update-ref -d ${ref}`, {
      signal: undefined,
      timeoutMs: 30_000,
    });
    if (res.exitCode !== 0) {
      log?.warn(
        { checkpointId: id, worktreePath, stderr: res.stderr?.trim()?.slice(0, 500) },
        'checkpoint: orphaned ref cleanup did not succeed',
      );
    }
  } catch (err) {
    log?.warn(
      { checkpointId: id, worktreePath, err: err instanceof Error ? err.message : String(err) },
      'checkpoint: orphaned ref cleanup threw',
    );
  }
}
|
|
|
|
/**
 * Error the route maps to a 404 when the checkpoint can't be resolved / scoped.
 * Its `name` is set to `'CheckpointNotFoundError'`.
 */
export class CheckpointNotFoundError extends Error {
  /** @param message Human-readable reason, passed through to `Error`. */
  constructor(message: string) {
    super(message);
    this.name = 'CheckpointNotFoundError';
  }
}
|
|
|
|
/** Outcome summary returned by `restoreCheckpoint`. */
export interface RestoreCheckpointResult {
  /** Id of the checkpoint that was restored (echoes the requested id). */
  checkpoint_id: string;
  /** Number of `messages` rows removed by the transcript trim (0 when nothing was trimmed). */
  messages_deleted: number;
  /** True only when both `git reset --hard` and `git clean -fd` exited 0. */
  worktree_reset: boolean;
  /** True when the backend teardown step ran to completion without throwing. */
  backend_reset: boolean;
}
|
|
|
|
/** Optional knobs for `restoreCheckpoint`. */
export interface RestoreCheckpointOpts {
  /** Abort signal forwarded to the git commands run on the host. */
  signal?: AbortSignal;
  /** Logger for warnings/info emitted during the restore. */
  log?: FastifyBaseLogger;
  /** If set, the checkpoint MUST belong to this session (route scope guard). */
  sessionId?: string;
}
|
|
|
|
/** Shape of the `checkpoints` columns selected by `restoreCheckpoint`. */
interface CheckpointRow {
  /** Checkpoint id; also the last component of its private git ref. */
  id: string;
  /** Owning chat. */
  chat_id: string;
  /** Session stored with the row; nullable. */
  session_id: string | null;
  /** Worktree row id, when one was recorded. */
  worktree_id: string | null;
  /** Anchor message the transcript is trimmed from; nullable. */
  message_id: string | null;
  /** SHA of the shadow commit. */
  commit_sha: string;
  /** Row creation time. */
  created_at: Date;
}
|
|
|
|
/**
 * Restore a checkpoint: rewind its worktree to the shadow commit, trim the
 * transcript from the anchor message forward, reset the backend session, and drop
 * now-orphaned later checkpoints.
 *
 * Steps 2–6 are individually best-effort where noted: a failed git reset, a
 * failed status update, a failed backend teardown or a failed checkpoint prune is
 * logged and reflected in the result flags rather than thrown.
 *
 * @param sql Database handle.
 * @param checkpointId Id of the checkpoint to restore.
 * @param opts Optional abort signal, logger and session scope guard.
 * @returns Summary of what was actually reset/deleted.
 * @throws CheckpointNotFoundError when the checkpoint is missing or not in the
 *   requested session (route → 404).
 */
export async function restoreCheckpoint(
  sql: Sql,
  checkpointId: string,
  opts?: RestoreCheckpointOpts,
): Promise<RestoreCheckpointResult> {
  const log = opts?.log;
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  // 1. Resolve the checkpoint.
  const [cp] = await sql<CheckpointRow[]>`
    SELECT id, chat_id, session_id, worktree_id, message_id, commit_sha, created_at
    FROM checkpoints WHERE id = ${checkpointId}
  `;
  if (!cp) {
    throw new CheckpointNotFoundError('checkpoint not found');
  }

  // Authorization scope (fail-safe): the checkpoint's chat must belong to the
  // requested session. cp.session_id is a denormalized hint that may be null, so
  // gating on it directly fails open — resolve the owning session via chats
  // (authoritative; chat_id is NOT NULL) and deny on any mismatch or missing row.
  // The guard keys on `!== undefined`, not truthiness: an empty-string session id
  // must be DENIED, not treated as "no scope requested".
  let verifiedSessionId: string | null = null;
  if (opts?.sessionId !== undefined) {
    const [owner] = await sql<{ session_id: string | null }[]>`
      SELECT session_id FROM chats WHERE id = ${cp.chat_id}
    `;
    if (!owner || owner.session_id === null || owner.session_id !== opts.sessionId) {
      throw new CheckpointNotFoundError('checkpoint not in session');
    }
    verifiedSessionId = owner.session_id;
  }

  // 2. Resolve the worktree path (by worktree_id, else the session's active one).
  let worktreePath: string | null = null;
  if (cp.worktree_id) {
    const [wt] = await sql<{ path: string }[]>`
      SELECT path FROM worktrees WHERE id = ${cp.worktree_id}
    `;
    worktreePath = wt?.path ?? null;
  }
  if (!worktreePath) {
    // Prefer the session verified through chats over the nullable hint on the
    // checkpoint row, so a stale hint can never select another session's worktree.
    const sid = verifiedSessionId ?? cp.session_id ?? null;
    if (sid) {
      const [wt] = await sql<{ path: string }[]>`
        SELECT path FROM worktrees WHERE session_id = ${sid} AND status = 'active' LIMIT 1
      `;
      worktreePath = wt?.path ?? null;
    }
  }

  // 3. Worktree reset — hard-reset to the shadow commit, then clean untracked.
  // NOTE(review): `git reset --hard <sha>` also moves the worktree's current
  // branch/HEAD to the shadow commit — confirm that is intended.
  let worktreeReset = false;
  if (worktreePath) {
    const resetRes = await hostExec(
      `git -C ${shellEscape(worktreePath)} reset --hard ${shellEscape(cp.commit_sha)}`,
      { signal: opts?.signal, timeoutMs: 30_000 },
    ).catch((err: unknown) => {
      log?.warn({ checkpointId, err: describe(err) }, 'checkpoint restore: reset --hard threw');
      return null;
    });
    if (resetRes && resetRes.exitCode === 0) {
      let cleanError: string | undefined;
      const cleanRes = await hostExec(
        `git -C ${shellEscape(worktreePath)} clean -fd`,
        { signal: opts?.signal, timeoutMs: 30_000 },
      ).catch((err: unknown) => {
        // Remember why it threw so the warning below is actionable.
        cleanError = describe(err);
        return null;
      });
      worktreeReset = cleanRes != null && cleanRes.exitCode === 0;
      if (!worktreeReset) {
        log?.warn(
          { checkpointId, worktreePath, err: cleanError },
          'checkpoint restore: clean -fd did not succeed',
        );
      }
    } else {
      log?.warn(
        { checkpointId, worktreePath, stderr: resetRes?.stderr?.trim()?.slice(0, 500) },
        'checkpoint restore: reset --hard did not succeed',
      );
    }
  } else {
    log?.warn({ checkpointId }, 'checkpoint restore: no worktree path resolved (files not reset)');
  }

  // 4. Trim the transcript from the anchor message forward. message_parts FK to
  // messages is ON DELETE CASCADE (apps/server schema.sql:49), so parts are
  // removed with their messages — no explicit parts delete needed.
  let messagesDeleted = 0;
  if (cp.message_id) {
    const deleted = await sql<{ id: string }[]>`
      DELETE FROM messages
      WHERE chat_id = ${cp.chat_id}
        AND created_at >= (SELECT created_at FROM messages WHERE id = ${cp.message_id})
      RETURNING id
    `;
    messagesDeleted = deleted.length;
  }

  // 5. Backend reset — mark the chat's agent sessions crashed so the next turn
  // re-establishes a fresh backend, and evict the live pool session(s) for this
  // (chat, agent). Warm backends hold context server-side with no partial
  // rewind, so a full reset is the only consistent option (proposal §4).
  const agentRows = await sql<{ agent: string; backend: string; agent_session_id: string | null; session_id: string | null; worktree_id: string | null }[]>`
    SELECT agent, backend, agent_session_id, session_id, worktree_id
    FROM agent_sessions WHERE chat_id = ${cp.chat_id}
  `;
  try {
    await sql`
      UPDATE agent_sessions SET status = 'crashed' WHERE chat_id = ${cp.chat_id}
    `;
  } catch (err) {
    // Still best-effort, but no longer silent: a stale 'active' status is worth a log line.
    log?.warn(
      { checkpointId, chatId: cp.chat_id, err: describe(err) },
      'checkpoint restore: marking agent sessions crashed failed',
    );
  }

  let backendReset = false;
  try {
    // opencode runs on the SHARED server (keyed on a sentinel, not the chat) — close
    // just this chat's session(s) on it, mirroring the lifecycle close-hook.
    const ocBackend = agentPool.peek(OPENCODE_POOL_KEY, 'opencode');
    if (ocBackend) {
      for (const row of agentRows) {
        if (row.backend !== 'opencode_server' || !row.agent_session_id) continue;
        const handle: AgentSessionHandle = {
          sessionId: row.session_id ?? '',
          agent: row.agent,
          backend: 'opencode_server',
          chatId: cp.chat_id,
          worktreeId: row.worktree_id ?? '',
          agentSessionId: row.agent_session_id,
          serverPort: null,
        };
        await ocBackend.closeSession(handle).catch((err: unknown) => {
          log?.warn({ checkpointId, err: describe(err) }, 'checkpoint restore: opencode closeSession threw');
        });
      }
    }
    // Warm-ACP backends are pooled under the chat id — dispose them (kills the
    // goose/qwen child). closeChat skips busy backends (a live turn isn't torn down).
    const disposed = await agentPool.closeChat(cp.chat_id);
    backendReset = true;
    log?.info({ checkpointId, chatId: cp.chat_id, disposed }, 'checkpoint restore: backend reset');
  } catch (err) {
    log?.warn({ checkpointId, err: describe(err) }, 'checkpoint restore: backend reset threw');
  }

  // 6. Drop now-orphaned later checkpoints for this chat (their anchor messages were
  // just trimmed). Compare `created_at` SERVER-SIDE via a subquery (NOT the JS
  // Date round-trip, which truncates the stored microsecond precision to ms and
  // would make this checkpoint delete ITSELF), and exclude this checkpoint's own
  // id so it always survives — letting the user re-restore to it.
  // NOTE(review): only the rows are removed here; the matching
  // `refs/boocode/checkpoints/<id>` git refs appear to be left behind — confirm
  // whether something else prunes them.
  try {
    await sql`
      DELETE FROM checkpoints
      WHERE chat_id = ${cp.chat_id}
        AND id <> ${cp.id}
        AND created_at > (SELECT created_at FROM checkpoints WHERE id = ${cp.id})
    `;
  } catch (err) {
    // Best-effort: stale later checkpoints are harmless to the restore itself, but log it.
    log?.warn(
      { checkpointId, chatId: cp.chat_id, err: describe(err) },
      'checkpoint restore: pruning later checkpoints failed',
    );
  }

  return {
    checkpoint_id: checkpointId,
    messages_deleted: messagesDeleted,
    worktree_reset: worktreeReset,
    backend_reset: backendReset,
  };
}
|