#3 Fuzzy patch applier: new pure fuzzy-match.ts (locateMatch, exact → trim → unicode-canon → Levenshtein ≥ 0.66, refuse-on-ambiguous) wired into pending_changes applyOne/rewindOne so local-model whitespace/unicode drift in old_string no longer loses the edit. #4 Worktree checkpoint + conversation-trim: checkpoints table + checkpoints.ts (shadow-commit of tracked+untracked into refs/boocode/checkpoints, hooked into the 3 external-agent dispatcher paths) + POST restore route (reset --hard + clean -fd → transcript trim → backend-session reset) + "Restore to here" UI. Built by 3 parallel agents; DB-integration testing caught a created_at self-deletion bug. Coder suite 234 passing; server+coder build + web tsc clean. Builds on v2.7.0-mit. openspec write-edit-robustness. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
272 lines
9.9 KiB
TypeScript
272 lines
9.9 KiB
TypeScript
// Fuzzy patch locator for staged edits.
//
// Local quantized models (qwen3.6 and friends) frequently reproduce an
// `old_string` with small, semantically-irrelevant drift: trailing whitespace,
// a different indent width, or "smart" unicode punctuation (curly quotes, an
// en/em-dash, a non-breaking space) where the source has the plain ASCII form.
// An exact `String.includes` then fails and the queued edit is lost even though
// a human would say it obviously matches.
//
// `locateMatch` walks a ladder of progressively looser strategies and returns
// the real `[start, end)` offset span (string indices, i.e. UTF-16 code units,
// not bytes) in the ORIGINAL content so the caller can splice in `new_string`
// over the true file text (preserving the file's own whitespace/unicode, not
// the model's drifted copy). The ladder stops at the first strategy that
// produces a verdict: a single span is returned as the match, and more than
// one candidate is reported as `ambiguous` instead of falling through to a
// looser strategy:
//
//   1. exact         — indexOf; >1 hit is reported `ambiguous` (we refuse to
//                      guess which occurrence the model meant).
//   2. per-line ws   — line-window compare ignoring per-line trailing
//                      whitespace and leading/trailing blank needle lines.
//   3. unicode canon — same line-window compare after folding smart
//                      punctuation to ASCII on both sides; the match is
//                      mapped back to original offsets.
//   4. levenshtein   — best line-window by normalized edit-distance
//                      similarity; accepted only at >= SIMILARITY_THRESHOLD.
//
// Pure and dependency-free (Levenshtein is the standard iterative two-row DP),
// reimplemented from the general technique — no vendored source.

/**
 * Outcome of a `locateMatch` lookup.
 *
 * - `exact` / `fuzzy` — exactly one location was found. `start` and `end` are
 *   a half-open `[start, end)` range of string indices into the searched
 *   content. `exact` is a verbatim substring hit; `fuzzy` is a hit from one of
 *   the looser strategies.
 * - `ambiguous` — more than one candidate location was found; `count` says how
 *   many, and no span is offered.
 * - `not_found` — no strategy produced a candidate.
 */
export type MatchResult =
  | { kind: 'exact' | 'fuzzy'; start: number; end: number }
  | { kind: 'ambiguous'; count: number }
  | { kind: 'not_found' };

/**
 * Levenshtein similarity floor for the final fuzzy fallback (strategy 4).
 *
 * Similarity is `1 - editDistance / max(length)`, so it lies in `[0, 1]`; the
 * best-scoring window is accepted only when its similarity is at least this
 * value.
 */
export const SIMILARITY_THRESHOLD = 0.66;

/**
 * Locate `needle` inside `content`, tolerating small drift in the needle.
 *
 * Strategies run from strictest to loosest. Each one returns `null` when it
 * has nothing to say, and the first non-null verdict is returned as-is. That
 * verdict may be `ambiguous`, in which case the looser strategies are NOT
 * consulted.
 *
 * @param content The text to search (the original file text).
 * @param needle  The text to find (possibly a drifted copy of part of `content`).
 * @returns A located span, an ambiguity report, or `not_found`. An empty
 *          `needle` is always `not_found`.
 */
export function locateMatch(content: string, needle: string): MatchResult {
  // Empty needle has no meaningful match.
  if (needle.length === 0) return { kind: 'not_found' };

  return (
    // 1. Exact substring.
    locateExact(content, needle) ??
    // 2. Per-line trailing-whitespace-insensitive window.
    locateByLineWindow(content, needle) ??
    // 3. Same window compare after folding smart punctuation to ASCII.
    locateCanonical(content, needle) ??
    // 4. Best window by Levenshtein similarity.
    locateByLevenshtein(content, needle) ?? { kind: 'not_found' }
  );
}

// --- Strategy 1: exact -------------------------------------------------------

/**
 * Strategy 1: verbatim substring search.
 *
 * @param content The text to search.
 * @param needle  The exact text to look for.
 * @returns `exact` with the `[start, end)` span when `needle` occurs once;
 *          `ambiguous` with the number of occurrences when it occurs more
 *          than once; `null` when it does not occur or `needle` is empty.
 */
function locateExact(content: string, needle: string): MatchResult | null {
  // An empty needle "occurs" at every index and `indexOf('', i)` never returns
  // -1, so the counting loop below would never terminate.
  if (needle.length === 0) return null;

  const first = content.indexOf(needle);
  if (first === -1) return null;

  // Count all occurrences so the caller can report a useful number. The search
  // resumes one character after each hit, so overlapping occurrences count.
  let count = 1;
  for (
    let idx = content.indexOf(needle, first + 1);
    idx !== -1;
    idx = content.indexOf(needle, idx + 1)
  ) {
    count++;
  }

  if (count > 1) return { kind: 'ambiguous', count };
  return { kind: 'exact', start: first, end: first + needle.length };
}

// --- Line-window machinery ---------------------------------------------------

/** One line of the content, together with where it sits in the original string. */
interface Line {
  /** Raw line text (no trailing newline). */
  text: string;
  /** Offset of the first char of this line in the original content. */
  start: number;
  /** Offset one past the last char of this line (before its newline, if any). */
  end: number;
}

/**
 * Split content into lines, tracking each line's real offset span.
 *
 * A line ends at `\n`. When that `\n` is preceded by `\r` (a CRLF terminator)
 * the `\r` is treated as part of the terminator: it is excluded from both
 * `text` and `end`. Without this, a match span ending on such a line would
 * swallow the `\r`, and splicing over the span would leave that line with a
 * bare `\n`. A `\r` that is not followed by `\n` stays in the line text.
 *
 * The span EXCLUDES the line terminator, so consecutive line spans plus their
 * terminators exactly reconstruct the content. A match span built from the
 * first matched line's `start` through the last matched line's `end` therefore
 * carries no trailing terminator, which is what an in-place splice wants.
 *
 * @param content The text to split.
 * @returns One entry per line, in order. Always at least one entry: empty
 *          content yields a single empty line, and content ending in a
 *          newline yields a final empty line.
 */
function splitLines(content: string): Line[] {
  const CARRIAGE_RETURN = 13;
  const lines: Line[] = [];
  let start = 0;

  for (;;) {
    const newline = content.indexOf('\n', start);
    if (newline === -1) {
      // Last (or only) line: runs to the end of the content.
      lines.push({ text: content.slice(start), start, end: content.length });
      return lines;
    }

    const endsWithCr =
      newline > start && content.charCodeAt(newline - 1) === CARRIAGE_RETURN;
    const end = endsWithCr ? newline - 1 : newline;
    lines.push({ text: content.slice(start, end), start, end });
    start = newline + 1;
  }
}

/**
 * Strip leading and trailing all-blank lines.
 *
 * A line is blank when it is empty or whitespace-only (`trim()` leaves
 * nothing). Blank lines between non-blank lines are kept.
 *
 * @param lines The lines to trim; not modified.
 * @returns A new array holding the trimmed slice (empty if every line is blank).
 */
function trimBlankLines(lines: readonly string[]): string[] {
  const isBlank = (line: string): boolean => line.trim() === '';

  let lo = 0;
  let hi = lines.length;
  while (lo < hi && isBlank(lines[lo]!)) lo++;
  while (hi > lo && isBlank(lines[hi - 1]!)) hi--;
  return lines.slice(lo, hi);
}

/**
 * Find a contiguous window of content lines whose trailing-whitespace-trimmed
 * text equals the needle's (blank-trimmed) lines.
 *
 * Only TRAILING whitespace is ignored on each line; leading whitespace must
 * match. Leading and trailing blank lines of the needle are dropped before
 * comparing.
 *
 * @param content   The text to search.
 * @param needle    The text to find.
 * @param normalize Applied to every (already end-trimmed) line on both sides
 *                  before comparison, e.g. to fold unicode. Defaults to
 *                  identity.
 * @returns `fuzzy` with the span from the first matched line's start through
 *          the last matched line's end when exactly one window matches;
 *          `ambiguous` with the number of matching windows when several do;
 *          `null` when none does or the needle has no non-blank line.
 */
function locateByLineWindow(
  content: string,
  needle: string,
  normalize: (s: string) => string = (s) => s,
): MatchResult | null {
  const needleKeys = trimBlankLines(needle.split('\n')).map((l) => normalize(l.trimEnd()));
  const n = needleKeys.length;
  // Nothing but blank lines: there is no text to anchor a match on.
  if (n === 0) return null;

  const contentLines = splitLines(content);
  // Normalize each content line once up front, not once per window it is in.
  const contentKeys = contentLines.map((l) => normalize(l.text.trimEnd()));

  let count = 0;
  let firstHit: { start: number; end: number } | null = null;
  for (let i = 0; i + n <= contentLines.length; i++) {
    let same = true;
    for (let j = 0; j < n; j++) {
      if (contentKeys[i + j] !== needleKeys[j]) {
        same = false;
        break;
      }
    }
    if (!same) continue;

    count++;
    if (firstHit === null) {
      firstHit = { start: contentLines[i]!.start, end: contentLines[i + n - 1]!.end };
    }
  }

  if (firstHit === null) return null;
  if (count > 1) return { kind: 'ambiguous', count };
  return { kind: 'fuzzy', start: firstHit.start, end: firstHit.end };
}

// --- Strategy 3: unicode canonicalization ------------------------------------

/**
 * Fold one "smart" punctuation or spacing character to its ASCII equivalent.
 *
 * Crucially this is a length-PRESERVING, per-character map (every replacement
 * is one char → one char), so an offset into the canonical string is also a
 * valid offset into the original. That lets strategy 3 reuse the line-window
 * matcher and still hand back true original-content offsets.
 *
 * The case labels use `\u` escapes on purpose: several of these characters are
 * invisible or indistinguishable from their ASCII look-alikes in an editor.
 *
 * @param ch A single character.
 * @returns The ASCII replacement, or `ch` unchanged if it is not in the table.
 */
function canonicalizeChar(ch: string): string {
  switch (ch) {
    // single quotes / apostrophes
    case '\u2018': // left single quotation mark
    case '\u2019': // right single quotation mark
    case '\u201A': // single low-9 quotation mark
    case '\u201B': // single high-reversed-9 quotation mark
      return "'";
    // double quotes
    case '\u201C': // left double quotation mark
    case '\u201D': // right double quotation mark
    case '\u201E': // double low-9 quotation mark
    case '\u201F': // double high-reversed-9 quotation mark
      return '"';
    // dashes
    case '\u2013': // en dash
    case '\u2014': // em dash
    case '\u2012': // figure dash
    case '\u2015': // horizontal bar
    case '\u2212': // minus sign
      return '-';
    // spaces
    case '\u00A0': // no-break space
    case '\u2007': // figure space
    case '\u202F': // narrow no-break space
      return ' ';
    default:
      return ch;
  }
}

/**
 * Fold every character of `s` through `canonicalizeChar`.
 *
 * The string is walked by code point and each character is replaced
 * individually, so the result has the same length as the input as long as
 * every fold is one char → one char.
 *
 * @param s The text to canonicalize.
 * @returns The folded text.
 */
function canonicalize(s: string): string {
  let out = '';
  for (const ch of s) out += canonicalizeChar(ch);
  return out;
}

/**
 * Strategy 3: run the line-window matcher on unicode-canonicalized copies of
 * both sides.
 *
 * @param content The text to search.
 * @param needle  The text to find.
 * @returns Whatever the line-window matcher reports for the canonical texts,
 *          or `null` when canonicalization changes neither side.
 */
function locateCanonical(content: string, needle: string): MatchResult | null {
  const canonContent = canonicalize(content);
  const canonNeedle = canonicalize(needle);

  // Only worth running if canonicalization actually changes something on either
  // side — otherwise it's identical to strategy 2 which already failed.
  if (canonContent === content && canonNeedle === needle) return null;

  // Offsets are preserved (length-preserving fold), so a match on the canonical
  // content maps directly back to the original.
  return locateByLineWindow(canonContent, canonNeedle);
}

// --- Strategy 4: Levenshtein similarity --------------------------------------

/**
 * Standard iterative two-row Levenshtein edit distance.
 *
 * Compares UTF-16 code units. The distance is symmetric, so the two DP rows
 * are sized to the shorter string to keep memory at O(min(|a|, |b|)).
 *
 * @param a First string.
 * @param b Second string.
 * @returns The minimum number of single-unit insertions, deletions and
 *          substitutions that turn one string into the other.
 */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const [outer, inner] = a.length >= b.length ? [a, b] : [b, a];
  const width = inner.length;

  // prev[j] = distance between the first i-1 units of `outer` and the first j
  // units of `inner`; row 0 is "insert j units".
  let prev = Array.from({ length: width + 1 }, (_, j) => j);
  let curr = new Array<number>(width + 1).fill(0);

  for (let i = 1; i <= outer.length; i++) {
    curr[0] = i;
    const unit = outer.charCodeAt(i - 1);
    for (let j = 1; j <= width; j++) {
      const cost = unit === inner.charCodeAt(j - 1) ? 0 : 1;
      curr[j] = Math.min(
        prev[j]! + 1, // deletion
        curr[j - 1]! + 1, // insertion
        prev[j - 1]! + cost, // substitution
      );
    }
    // The finished row becomes `prev`; the old `prev` is reused as scratch.
    [prev, curr] = [curr, prev];
  }
  return prev[width]!;
}

/**
 * Normalized similarity in [0,1]: `1 - dist / max(len)`.
 *
 * @param a First string.
 * @param b Second string.
 * @returns 1 for identical strings (including two empty strings), falling
 *          towards 0 as the edit distance approaches the longer length.
 */
function similarity(a: string, b: string): number {
  const maxLen = Math.max(a.length, b.length);
  // Two empty strings: avoid 0 / 0 and call them identical.
  if (maxLen === 0) return 1;
  return 1 - levenshtein(a, b) / maxLen;
}

/**
 * Strategy 4: pick the window of content lines most similar to the needle.
 *
 * Both sides are compared with every line trimmed at both ends, so indentation
 * and trailing whitespace do not count against the score. The needle's leading
 * and trailing blank lines are dropped first, and windows always have as many
 * lines as the remaining needle.
 *
 * @param content The text to search.
 * @param needle  The text to find.
 * @returns `fuzzy` with the span of the single best window when its similarity
 *          is at least `SIMILARITY_THRESHOLD`; `ambiguous` when several windows
 *          share that best score (picking one would be a guess); `null` when
 *          no window reaches the threshold, the needle has no non-blank line,
 *          or the content has fewer lines than the needle.
 */
function locateByLevenshtein(content: string, needle: string): MatchResult | null {
  const needleLines = trimBlankLines(needle.split('\n'));
  const n = needleLines.length;
  if (n === 0) return null;

  const contentLines = splitLines(content);
  if (contentLines.length < n) return null;

  const needleJoined = needleLines.map((l) => l.trim()).join('\n');
  // Trim each content line once up front, not once per window it is in.
  const trimmed = contentLines.map((l) => l.text.trim());

  let best = -1;
  let tied = 0; // number of windows that scored exactly `best`
  let bestSpan: { start: number; end: number } | null = null;

  for (let i = 0; i + n <= contentLines.length; i++) {
    const windowJoined = trimmed.slice(i, i + n).join('\n');

    // The edit distance is at least the length difference, which caps the
    // similarity this window can reach. Skip the quadratic DP when that cap is
    // already below the acceptance floor or below the best score so far; such
    // a window can neither win nor tie.
    const longer = Math.max(windowJoined.length, needleJoined.length);
    const shorter = Math.min(windowJoined.length, needleJoined.length);
    const ceiling = longer === 0 ? 1 : 1 - (longer - shorter) / longer;
    if (ceiling < SIMILARITY_THRESHOLD || ceiling < best) continue;

    const score = similarity(windowJoined, needleJoined);
    if (score > best) {
      best = score;
      tied = 1;
      bestSpan = { start: contentLines[i]!.start, end: contentLines[i + n - 1]!.end };
    } else if (score === best) {
      tied++;
    }
  }

  if (bestSpan === null || best < SIMILARITY_THRESHOLD) return null;
  // Several equally good candidates: refuse to guess which one was meant.
  if (tied > 1) return { kind: 'ambiguous', count: tied };
  return { kind: 'fuzzy', start: bestSpan.start, end: bestSpan.end };
}