feat: write/edit robustness — fuzzy patch applier + worktree checkpoints (v2.7.1)

#3 Fuzzy patch applier: new pure fuzzy-match.ts (locateMatch, exact→trim→
unicode-canon→Levenshtein≥0.66, refuse-on-ambiguous) wired into pending_changes
applyOne/rewindOne so local-model whitespace/unicode drift in old_string no
longer loses the edit.

#4 Worktree checkpoint + conversation-trim: checkpoints table + checkpoints.ts
(shadow-commit of tracked+untracked into refs/boocode/checkpoints, hooked into
the 3 external-agent dispatcher paths) + POST restore route (reset --hard +
clean -fd -> transcript trim -> backend-session reset) + "Restore to here" UI.

Built by 3 parallel agents; DB-integration testing caught a created_at
self-deletion bug. Coder suite 234 passing; server+coder build + web tsc clean.
Builds on v2.7.0-mit. openspec write-edit-robustness.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-01 12:01:57 +00:00
parent 1108d07fb2
commit 59f07e8cb8
15 changed files with 1443 additions and 11 deletions

View File

@@ -0,0 +1,271 @@
// Fuzzy patch locator for staged edits.
//
// Local quantized models (qwen3.6 and friends) frequently reproduce an
// `old_string` with small, semantically-irrelevant drift: trailing whitespace,
// a different indent width, or "smart" unicode punctuation (curly quotes, an
// en/em-dash, a non-breaking space) where the source has the plain ASCII form.
// An exact `String.includes` then fails and the queued edit is lost even though
// a human would say it obviously matches.
//
// `locateMatch` walks a ladder of progressively looser strategies and returns
// the real `[start, end)` byte-offset span in the ORIGINAL content so the caller
// can splice in `new_string` over the true file text (preserving the file's own
// whitespace/unicode, not the model's drifted copy). The ladder stops at the
// first strategy that resolves to a single span:
//
// 1. exact — indexOf; >1 hit is reported `ambiguous` (we refuse to
// guess which occurrence the model meant).
// 2. per-line ws — line-window compare ignoring per-line trailing
// whitespace and leading/trailing blank needle lines.
// 3. unicode canon — same line-window compare after folding smart
// punctuation to ASCII on both sides; the match is
// mapped back to original offsets.
// 4. levenshtein — best line-window by normalized edit-distance
// similarity; accepted only at >= SIMILARITY_THRESHOLD.
//
// Pure and dependency-free (Levenshtein is the standard iterative two-row DP),
// reimplemented from the general technique — no vendored source.
export type MatchResult =
| { kind: 'exact' | 'fuzzy'; start: number; end: number } // [start,end) offsets into content
| { kind: 'ambiguous'; count: number }
| { kind: 'not_found' };
/** Levenshtein similarity floor for the final fuzzy fallback (strategy 4). */
export const SIMILARITY_THRESHOLD = 0.66;
export function locateMatch(content: string, needle: string): MatchResult {
// Empty needle has no meaningful match.
if (needle.length === 0) return { kind: 'not_found' };
// --- 1. Exact ----------------------------------------------------------------
const exact = locateExact(content, needle);
if (exact) return exact;
// --- 2. Per-line whitespace-insensitive -------------------------------------
const ws = locateByLineWindow(content, needle);
if (ws) return ws;
// --- 3. Unicode-canonicalized whitespace pass -------------------------------
const canon = locateCanonical(content, needle);
if (canon) return canon;
// --- 4. Levenshtein similarity ----------------------------------------------
const lev = locateByLevenshtein(content, needle);
if (lev) return lev;
return { kind: 'not_found' };
}
// --- Strategy 1: exact -------------------------------------------------------
function locateExact(content: string, needle: string): MatchResult | null {
const first = content.indexOf(needle);
if (first === -1) return null;
const second = content.indexOf(needle, first + 1);
if (second === -1) {
return { kind: 'exact', start: first, end: first + needle.length };
}
// Count all occurrences so the caller can report a useful number.
let count = 2;
let idx = content.indexOf(needle, second + 1);
while (idx !== -1) {
count++;
idx = content.indexOf(needle, idx + 1);
}
return { kind: 'ambiguous', count };
}
// --- Line-window machinery ---------------------------------------------------
interface Line {
/** Raw line text (no trailing newline). */
text: string;
/** Offset of the first char of this line in the original content. */
start: number;
/** Offset one past the last char of this line (before its newline, if any). */
end: number;
}
/**
* Split content into lines, tracking each line's real offset span. The span
* EXCLUDES the trailing newline so consecutive line spans plus their newlines
* exactly reconstruct the content; the match span we hand back covers from the
* first matched line's start through the last matched line's end (i.e. without a
* trailing newline), which is what an in-place splice wants.
*/
function splitLines(content: string): Line[] {
const lines: Line[] = [];
let start = 0;
for (let i = 0; i <= content.length; i++) {
if (i === content.length || content[i] === '\n') {
lines.push({ text: content.slice(start, i), start, end: i });
start = i + 1;
}
}
return lines;
}
/** Strip leading/trailing all-blank lines; returns the trimmed slice. */
function trimBlankLines(lines: string[]): string[] {
let lo = 0;
let hi = lines.length;
while (lo < hi && lines[lo]!.trim() === '') lo++;
while (hi > lo && lines[hi - 1]!.trim() === '') hi--;
return lines.slice(lo, hi);
}
/**
* Find a contiguous window of content lines whose trailing-whitespace-trimmed
* text equals the needle's (blank-trimmed) lines. Returns the real offset span
* over the matched content lines, or null if zero match. Multiple matches →
* ambiguous. `normalize` lets the caller fold unicode before comparing.
*/
function locateByLineWindow(
content: string,
needle: string,
normalize: (s: string) => string = (s) => s,
): MatchResult | null {
const contentLines = splitLines(content);
const needleLines = trimBlankLines(needle.split('\n'));
const n = needleLines.length;
if (n === 0) return null;
// A single needle line that is itself blank can't be located meaningfully.
if (n === 1 && needleLines[0]!.trim() === '') return null;
const needleKey = needleLines.map((l) => normalize(l.trimEnd())).join('\n');
const hits: Array<{ start: number; end: number }> = [];
for (let i = 0; i + n <= contentLines.length; i++) {
const windowKey = contentLines
.slice(i, i + n)
.map((l) => normalize(l.text.trimEnd()))
.join('\n');
if (windowKey === needleKey) {
hits.push({ start: contentLines[i]!.start, end: contentLines[i + n - 1]!.end });
}
}
if (hits.length === 0) return null;
if (hits.length > 1) return { kind: 'ambiguous', count: hits.length };
return { kind: 'fuzzy', start: hits[0]!.start, end: hits[0]!.end };
}
// --- Strategy 3: unicode canonicalization ------------------------------------
/**
* Fold smart punctuation to its ASCII equivalent. Crucially this is a
* length-PRESERVING, per-character map (every replacement is one char → one
* char), so an offset into the canonical string is also a valid offset into the
* original — letting strategy 3 reuse the line-window matcher and still hand
* back true original-content offsets.
*/
function canonicalizeChar(ch: string): string {
switch (ch) {
// single quotes / apostrophes
case '': // '
case '': // '
case '': //
case '': //
return "'";
// double quotes
case '“': // "
case '”': // "
case '„': // „
case '‟': // ‟
return '"';
// dashes
case '': // en dash
case '—': // — em dash
case '': // figure dash
case '―': // ― horizontal bar
case '': // minus sign
return '-';
// spaces
case ' ': // nbsp
case '': // figure space
case '': // narrow nbsp
return ' ';
default:
return ch;
}
}
function canonicalize(s: string): string {
let out = '';
for (const ch of s) out += canonicalizeChar(ch);
return out;
}
function locateCanonical(content: string, needle: string): MatchResult | null {
// Only worth running if canonicalization actually changes something on either
// side — otherwise it's identical to strategy 2 which already failed.
const canonContent = canonicalize(content);
const canonNeedle = canonicalize(needle);
if (canonContent === content && canonNeedle === needle) return null;
// Offsets are preserved (length-preserving fold), so a match on the canonical
// content maps directly back to the original.
return locateByLineWindow(canonContent, canonNeedle);
}
// --- Strategy 4: Levenshtein similarity --------------------------------------
/** Standard iterative two-row Levenshtein edit distance. */
function levenshtein(a: string, b: string): number {
if (a === b) return 0;
if (a.length === 0) return b.length;
if (b.length === 0) return a.length;
let prev = new Array<number>(b.length + 1);
let curr = new Array<number>(b.length + 1);
for (let j = 0; j <= b.length; j++) prev[j] = j;
for (let i = 1; i <= a.length; i++) {
curr[0] = i;
const ac = a.charCodeAt(i - 1);
for (let j = 1; j <= b.length; j++) {
const cost = ac === b.charCodeAt(j - 1) ? 0 : 1;
curr[j] = Math.min(
prev[j]! + 1, // deletion
curr[j - 1]! + 1, // insertion
prev[j - 1]! + cost, // substitution
);
}
[prev, curr] = [curr, prev];
}
return prev[b.length]!;
}
/** Normalized similarity in [0,1]: 1 - dist / max(len). */
function similarity(a: string, b: string): number {
const maxLen = Math.max(a.length, b.length);
if (maxLen === 0) return 1;
return 1 - levenshtein(a, b) / maxLen;
}
function locateByLevenshtein(content: string, needle: string): MatchResult | null {
const contentLines = splitLines(content);
const needleLines = trimBlankLines(needle.split('\n'));
const n = needleLines.length;
if (n === 0) return null;
if (contentLines.length < n) return null;
const needleJoined = needleLines.map((l) => l.trim()).join('\n');
let best = -1;
let bestSpan: { start: number; end: number } | null = null;
for (let i = 0; i + n <= contentLines.length; i++) {
const window = contentLines.slice(i, i + n);
const windowJoined = window.map((l) => l.text.trim()).join('\n');
const score = similarity(windowJoined, needleJoined);
if (score > best) {
best = score;
bestSpan = { start: window[0]!.start, end: window[n - 1]!.end };
}
}
if (bestSpan && best >= SIMILARITY_THRESHOLD) {
return { kind: 'fuzzy', start: bestSpan.start, end: bestSpan.end };
}
return null;
}