Root cause: two proven corruption mechanisms — (M1) non-idempotent apply stamped the same block N times when a quantized model re-emitted the same edit_file call or a turn was retried; (M2) Levenshtein tier 4 was fail-open with no uniqueness guard, silently splicing into the wrong location. Fixes applied at every layer of the pipeline: Matcher (fuzzy-match.ts): raise SIMILARITY_THRESHOLD 0.66 → 0.85; add AMBIGUITY_EPSILON uniqueness guard — two windows within 0.05 of the top score → ambiguous, not a guess; add block-anchor gate (≥3-line needles require first+last line exact match before a window is scored). Edit planner (pending_changes.ts): extract planEdit() as a pure function; idempotency guards detect already-applied states (anchored insert re-stamp, old-gone-but-new-present); findPendingDuplicate() collapses identical pending rows at queue time so M1 never reaches applyOne. Atomic writes (pending_changes.ts): temp-file + rename on the same filesystem so a crash can't leave a half-written source file; realpath() first so symlinks survive the rename. Per-file mutex (pending_changes.ts): withFileLock() serializes concurrent read-modify-write on the same path via a chained-Promise Map. EOL preservation (pending_changes.ts): normalize CRLF → LF for matching, restore native line ending on write so Windows-style files stay clean. Context isolation (inference_context.ts): replace module-level singleton with AsyncLocalStorage so concurrent inference runs (arena parallel dispatch, dispatcher poll racing a user message) each get their own scoped context with no clobbering. Tests: plan-edit.test.ts (pure planEdit unit tests), extended fuzzy-match and pending_changes_integration suites, ALS isolation test that proves overlapping runs get correct session IDs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
323 lines
13 KiB
TypeScript
323 lines
13 KiB
TypeScript
// Fuzzy patch locator for staged edits.
//
// Local quantized models (qwen3.6 and friends) frequently reproduce an
// `old_string` with small, semantically-irrelevant drift: trailing whitespace,
// a different indent width, or "smart" unicode punctuation (curly quotes, an
// en/em-dash, a non-breaking space) where the source has the plain ASCII form.
// An exact `String.includes` then fails and the queued edit is lost even though
// a human would say it obviously matches.
//
// `locateMatch` walks a ladder of progressively looser strategies and returns
// the real `[start, end)` string-offset span (UTF-16 code units, i.e. the same
// indices `indexOf`/`slice` use) in the ORIGINAL content so the caller can
// splice in `new_string` over the true file text (preserving the file's own
// whitespace/unicode, not the model's drifted copy). The ladder stops at the
// first strategy that produces a result — either a single span or an
// `ambiguous` verdict:
//
//   1. exact          — indexOf; >1 hit is reported `ambiguous` (we refuse to
//                       guess which occurrence the model meant).
//   2. per-line ws    — line-window compare ignoring per-line trailing
//                       whitespace and leading/trailing blank needle lines.
//   3. unicode canon  — same line-window compare after folding smart
//                       punctuation to ASCII on both sides; the match is
//                       mapped back to original offsets.
//   4. levenshtein    — best line-window by normalized edit-distance
//                       similarity; accepted only at >= SIMILARITY_THRESHOLD,
//                       anchored on an exact first+last line for multi-line
//                       needles, and REFUSED (ambiguous) when a second window
//                       scores within AMBIGUITY_EPSILON of the best. Like the
//                       exact/whitespace tiers, this tier fails CLOSED — it
//                       never splices over a merely-plausible guess, because a
//                       wrong-window splice corrupts the file (it leaves the
//                       real target intact and duplicates it). This mirrors
//                       opencode/cline/qwen, whose fuzzy tiers all keep the
//                       unique-match requirement rather than picking a winner.
//
// Pure and dependency-free (Levenshtein is the standard iterative two-row DP),
// reimplemented from the general technique — no vendored source.

/**
 * Outcome of a locate attempt.
 *
 * - `exact` / `fuzzy`: exactly one span was found; `start`/`end` are
 *   `[start, end)` string offsets into the searched content.
 * - `ambiguous`: more than one candidate was found; `count` is how many.
 * - `not_found`: nothing matched.
 */
export type MatchResult =
  | { kind: 'exact' | 'fuzzy'; start: number; end: number } // [start,end) offsets into content
  | { kind: 'ambiguous'; count: number }
  | { kind: 'not_found' };

/**
 * Levenshtein similarity floor for the final fuzzy fallback (strategy 4).
 * 0.66 was far too low — at two-thirds similarity a structurally-wrong window
 * (e.g. one of three near-identical form blocks) clears the bar and gets spliced
 * over, leaving the real target intact and duplicated. Competent agents anchor
 * far tighter (opencode's BlockAnchor needs an exact anchor; cline needs exact
 * first+last lines). 0.85 keeps genuine quantized-model drift (a typo, an indent
 * shift) while refusing a different block.
 */
export const SIMILARITY_THRESHOLD = 0.85;

/**
 * If a second candidate window scores within this of the best, the match is
 * ambiguous and tier 4 refuses rather than guessing — the same fail-closed
 * stance the exact and whitespace tiers take on multiple hits. Repetitive files
 * (the duplicate-block corruption case) produce near-tied windows; this is what
 * turns that into a clean "add more context" error instead of a wrong splice.
 */
export const AMBIGUITY_EPSILON = 0.05;

/**
 * Multi-line needles at or above this length must anchor on an exact (after
 * trim + unicode-fold) first AND last line before similarity is even scored —
 * the cline/opencode block-anchor rule. Below it, threshold + uniqueness alone
 * guard the match.
 */
const ANCHOR_MIN_LINES = 3;

/**
 * Locate `needle` inside `content` by trying four strategies from strictest to
 * loosest and returning the first one that yields a verdict.
 *
 * A strategy returns `null` when it found nothing, which lets the next one
 * run; any non-null result (a single span OR `ambiguous`) ends the search, so
 * a looser tier never overrides an ambiguity reported by a stricter one.
 *
 * @param content Text to search in.
 * @param needle  Text to look for; an empty needle is always `not_found`.
 * @returns The located span, `ambiguous`, or `not_found`.
 */
export function locateMatch(content: string, needle: string): MatchResult {
  // Empty needle has no meaningful match.
  if (needle.length === 0) return { kind: 'not_found' };

  // --- 1. Exact ----------------------------------------------------------------
  const exact = locateExact(content, needle);
  if (exact) return exact;

  // --- 2. Per-line whitespace-insensitive -------------------------------------
  const ws = locateByLineWindow(content, needle);
  if (ws) return ws;

  // --- 3. Unicode-canonicalized whitespace pass -------------------------------
  const canon = locateCanonical(content, needle);
  if (canon) return canon;

  // --- 4. Levenshtein similarity ----------------------------------------------
  const lev = locateByLevenshtein(content, needle);
  if (lev) return lev;

  return { kind: 'not_found' };
}

// --- Strategy 1: exact -------------------------------------------------------

/**
 * Exact substring search.
 *
 * @returns `null` when `needle` does not occur (or is empty), an `exact` span
 *   when it occurs once, and `ambiguous` with the occurrence count otherwise.
 *   Occurrences are counted by restarting the search one position after each
 *   hit, so overlapping occurrences are counted individually.
 */
function locateExact(content: string, needle: string): MatchResult | null {
  // `indexOf('')` clamps its start position and never returns -1, so an empty
  // needle would make the counting loop below spin forever. Treat it as "no hit".
  if (needle.length === 0) return null;

  const first = content.indexOf(needle);
  if (first === -1) return null;

  if (content.indexOf(needle, first + 1) === -1) {
    return { kind: 'exact', start: first, end: first + needle.length };
  }

  // Count all occurrences so the caller can report a useful number.
  let count = 0;
  for (let idx = first; idx !== -1; idx = content.indexOf(needle, idx + 1)) {
    count++;
  }
  return { kind: 'ambiguous', count };
}

// --- Line-window machinery ---------------------------------------------------

/** One line of content together with its offset span in the original string. */
interface Line {
  /** Raw line text (no trailing newline). */
  text: string;
  /** Offset of the first char of this line in the original content. */
  start: number;
  /** Offset one past the last char of this line (before its newline, if any). */
  end: number;
}

/**
 * Split content into lines, tracking each line's real offset span. The span
 * EXCLUDES the trailing newline so consecutive line spans plus their newlines
 * exactly reconstruct the content; the match span we hand back covers from the
 * first matched line's start through the last matched line's end (i.e. without a
 * trailing newline), which is what an in-place splice wants.
 *
 * Only `\n` is treated as a separator. The result always has at least one
 * entry: empty content yields a single empty line, and content ending in `\n`
 * yields a final empty line.
 */
function splitLines(content: string): Line[] {
  const lines: Line[] = [];
  let start = 0;
  for (;;) {
    const newline = content.indexOf('\n', start);
    const end = newline === -1 ? content.length : newline;
    lines.push({ text: content.slice(start, end), start, end });
    if (newline === -1) return lines;
    start = newline + 1;
  }
}

/**
 * Strip leading/trailing all-blank lines; returns the trimmed slice.
 *
 * A line is blank when it is empty after `trim()`. Interior blank lines are
 * kept. The input array is not modified; an all-blank input yields `[]`.
 */
function trimBlankLines(lines: readonly string[]): string[] {
  let lo = 0;
  let hi = lines.length;
  while (lo < hi && lines[lo]!.trim() === '') lo++;
  while (hi > lo && lines[hi - 1]!.trim() === '') hi--;
  return lines.slice(lo, hi);
}

/**
 * Find a contiguous window of content lines whose trailing-whitespace-trimmed
 * text equals the needle's (blank-trimmed) lines. Returns the real offset span
 * over the matched content lines, or null if zero match. Multiple matches →
 * ambiguous. `normalize` lets the caller fold unicode before comparing.
 *
 * @param content   Text to search in.
 * @param needle    Text to look for; leading/trailing blank lines are ignored.
 * @param normalize Per-line transform applied to both sides after `trimEnd()`.
 *   It is expected to be pure and not to introduce newlines.
 * @returns `null` when no window matches or the needle has no non-blank line;
 *   a `fuzzy` span from the first matched line's start to the last matched
 *   line's end for a single match; `ambiguous` with the match count otherwise.
 */
function locateByLineWindow(
  content: string,
  needle: string,
  normalize: (s: string) => string = (s) => s,
): MatchResult | null {
  const contentLines = splitLines(content);
  // After trimBlankLines the first and last needle lines are non-blank, so a
  // non-empty result always contains something worth locating.
  const needleKeys = trimBlankLines(needle.split('\n')).map((l) => normalize(l.trimEnd()));
  const n = needleKeys.length;
  if (n === 0 || contentLines.length < n) return null;

  // Normalize every content line once instead of once per window it belongs to.
  const contentKeys = contentLines.map((l) => normalize(l.text.trimEnd()));

  let hits = 0;
  let start = -1;
  let end = -1;
  for (let i = 0; i + n <= contentKeys.length; i++) {
    let same = true;
    for (let j = 0; j < n; j++) {
      if (contentKeys[i + j] !== needleKeys[j]) {
        same = false;
        break;
      }
    }
    if (!same) continue;
    // Only the first hit's span is ever returned; later hits just raise the count.
    if (hits === 0) {
      start = contentLines[i]!.start;
      end = contentLines[i + n - 1]!.end;
    }
    hits++;
  }

  if (hits === 0) return null;
  if (hits > 1) return { kind: 'ambiguous', count: hits };
  return { kind: 'fuzzy', start, end };
}

// --- Strategy 3: unicode canonicalization ------------------------------------

/**
 * Fold smart punctuation to its ASCII equivalent. Crucially this is a
 * length-PRESERVING, per-character map (every replacement is one char → one
 * char), so an offset into the canonical string is also a valid offset into the
 * original — letting strategy 3 reuse the line-window matcher and still hand
 * back true original-content offsets.
 *
 * The case labels are written as `\u` escapes on purpose: several of these
 * characters are invisible or indistinguishable from ASCII in most editors,
 * and raw literals are easily normalized away by tooling.
 *
 * @param ch A single character.
 * @returns The ASCII stand-in for a folded character, otherwise `ch` unchanged.
 */
function canonicalizeChar(ch: string): string {
  switch (ch) {
    // single quotes / apostrophes
    case '\u2018': // left single quotation mark
    case '\u2019': // right single quotation mark
    case '\u201A': // single low-9 quotation mark
    case '\u201B': // single high-reversed-9 quotation mark
      return "'";
    // double quotes
    case '\u201C': // left double quotation mark
    case '\u201D': // right double quotation mark
    case '\u201E': // double low-9 quotation mark
    case '\u201F': // double high-reversed-9 quotation mark
      return '"';
    // dashes
    case '\u2013': // en dash
    case '\u2014': // em dash
    case '\u2012': // figure dash
    case '\u2015': // horizontal bar
    case '\u2212': // minus sign
      return '-';
    // spaces
    case '\u00A0': // nbsp
    case '\u2007': // figure space
    case '\u202F': // narrow nbsp
      return ' ';
    default:
      return ch;
  }
}

/**
 * Apply `canonicalizeChar` to every character of `s`.
 *
 * Iteration is by code point, so a surrogate pair is handed over as one
 * two-unit string and passes through unchanged; the result therefore has the
 * same length as the input.
 */
function canonicalize(s: string): string {
  let out = '';
  for (const ch of s) out += canonicalizeChar(ch);
  return out;
}

/**
 * Strategy 3: rerun the line-window matcher on unicode-folded copies of both
 * inputs.
 *
 * @returns `null` when folding changes neither side (the result would be the
 *   same as strategy 2) or when no window matches; otherwise whatever
 *   `locateByLineWindow` reports for the folded texts.
 */
function locateCanonical(content: string, needle: string): MatchResult | null {
  // Only worth running if canonicalization actually changes something on either
  // side — otherwise it's identical to strategy 2 which already failed.
  const canonContent = canonicalize(content);
  const canonNeedle = canonicalize(needle);
  if (canonContent === content && canonNeedle === needle) return null;

  // Offsets are preserved (length-preserving fold), so a match on the canonical
  // content maps directly back to the original.
  return locateByLineWindow(canonContent, canonNeedle);
}

// --- Strategy 4: Levenshtein similarity --------------------------------------

/**
 * Standard iterative two-row Levenshtein edit distance.
 *
 * Strings are compared by UTF-16 code unit. Insertions, deletions and
 * substitutions each cost 1.
 *
 * @returns The minimum number of single-unit edits turning `a` into `b`.
 */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // The distance is symmetric, so lay the SHORTER string along the row: the two
  // working rows then need only min(|a|, |b|) + 1 cells each.
  const [outer, inner] = a.length >= b.length ? [a, b] : [b, a];
  const width = inner.length;

  // prev = distances from the empty prefix of `outer`: 0, 1, 2, ...
  let prev = Array.from({ length: width + 1 }, (_, j) => j);
  let curr = new Array<number>(width + 1).fill(0);

  for (let i = 1; i <= outer.length; i++) {
    curr[0] = i;
    const oc = outer.charCodeAt(i - 1);
    for (let j = 1; j <= width; j++) {
      const cost = oc === inner.charCodeAt(j - 1) ? 0 : 1;
      curr[j] = Math.min(
        prev[j]! + 1, // deletion
        curr[j - 1]! + 1, // insertion
        prev[j - 1]! + cost, // substitution
      );
    }
    // Reuse the two rows instead of allocating a new one per iteration.
    [prev, curr] = [curr, prev];
  }
  return prev[width]!;
}

/**
 * Normalized similarity in [0,1]: 1 - dist / max(len).
 *
 * Two empty strings are defined as fully similar (1) to avoid dividing by zero.
 */
function similarity(a: string, b: string): number {
  const maxLen = Math.max(a.length, b.length);
  if (maxLen === 0) return 1;
  return 1 - levenshtein(a, b) / maxLen;
}

/**
 * Strategy 4: pick the content line-window most similar to the needle by
 * normalized Levenshtein similarity, failing closed on weak or tied matches.
 *
 * Lines on both sides are `trim()`med before comparison, so indentation and
 * trailing whitespace do not count against the score. Windows have exactly as
 * many lines as the blank-trimmed needle.
 *
 * @returns `null` when the needle has no non-blank line, the content has fewer
 *   lines than the needle, no window passes the anchor gate, or the best score
 *   is below `SIMILARITY_THRESHOLD`; `ambiguous` (with the number of windows
 *   within `AMBIGUITY_EPSILON` of the best, the best included) when the best is
 *   not unique enough; otherwise a `fuzzy` span covering the best window from
 *   its first line's start to its last line's end.
 */
function locateByLevenshtein(content: string, needle: string): MatchResult | null {
  const contentLines = splitLines(content);
  const needleLines = trimBlankLines(needle.split('\n')).map((l) => l.trim());
  const n = needleLines.length;
  if (n === 0) return null;
  if (contentLines.length < n) return null;

  const needleJoined = needleLines.join('\n');

  // Block-anchor gate for multi-line needles: the first and last lines must match
  // exactly (after trim + unicode-fold) or the window is not even scored. This
  // stops a high interior-similarity from dragging a structurally-wrong window
  // over the threshold — the failure that duplicates blocks in repetitive files.
  const anchored = n >= ANCHOR_MIN_LINES;
  const needleFirst = canonicalize(needleLines[0]!);
  const needleLast = canonicalize(needleLines[n - 1]!);

  // Trim (and, when anchoring, fold) each content line once rather than once
  // per window that contains it.
  const trimmed = contentLines.map((l) => l.text.trim());
  const folded = anchored ? trimmed.map((t) => canonicalize(t)) : [];

  const scores: number[] = [];
  let best: { score: number; start: number; end: number } | null = null;
  for (let i = 0; i + n <= trimmed.length; i++) {
    if (anchored && (folded[i] !== needleFirst || folded[i + n - 1] !== needleLast)) {
      continue;
    }
    const score = similarity(trimmed.slice(i, i + n).join('\n'), needleJoined);
    scores.push(score);
    // Strict `>` keeps the EARLIEST window among equal top scores.
    if (best === null || score > best.score) {
      best = { score, start: contentLines[i]!.start, end: contentLines[i + n - 1]!.end };
    }
  }

  if (best === null) return null;
  if (best.score < SIMILARITY_THRESHOLD) return null;

  // Uniqueness guard: refuse when a second window is within epsilon of the best.
  // Fail closed (ambiguous) rather than silently splicing one of several lookalikes.
  const floor = best.score - AMBIGUITY_EPSILON;
  const tied = scores.filter((s) => s >= floor).length;
  if (tied > 1) return { kind: 'ambiguous', count: tied };

  return { kind: 'fuzzy', start: best.start, end: best.end };
}