// Fuzzy patch locator for staged edits.
//
// Local quantized models (qwen3.6 and friends) frequently reproduce an
// `old_string` with small, semantically-irrelevant drift: trailing whitespace,
// a different indent width, or "smart" unicode punctuation (curly quotes, an
// en/em-dash, a non-breaking space) where the source has the plain ASCII form.
// An exact `String.includes` then fails and the queued edit is lost even though
// a human would say it obviously matches.
//
// `locateMatch` walks a ladder of progressively looser strategies and returns
// the real `[start, end)` span in the ORIGINAL content so the caller can splice
// in `new_string` over the true file text (preserving the file's own
// whitespace/unicode, not the model's drifted copy). Offsets are JavaScript
// string indices (UTF-16 code units), not bytes. The ladder stops at the first
// strategy that produces a result (a single span, or an `ambiguous` verdict):
//
//   1. exact          — indexOf; >1 hit is reported `ambiguous` (we refuse to
//                       guess which occurrence the model meant).
//   2. per-line ws    — line-window compare ignoring per-line trailing
//                       whitespace and leading/trailing blank needle lines.
//   3. unicode canon  — same line-window compare after folding smart
//                       punctuation to ASCII on both sides; the match is
//                       mapped back to original offsets.
//   4. levenshtein    — best line-window by normalized edit-distance
//                       similarity; accepted only at >= SIMILARITY_THRESHOLD,
//                       and reported `ambiguous` when several windows tie for
//                       the best score.
//
// Pure and dependency-free (Levenshtein is the standard iterative two-row DP),
// reimplemented from the general technique — no vendored source.

/**
 * Outcome of a locate attempt.
 *
 * - `exact` / `fuzzy`: a single span `[start, end)` into the original content.
 * - `ambiguous`: more than one equally valid candidate; `count` says how many.
 * - `not_found`: no strategy produced a candidate.
 */
export type MatchResult =
  | { kind: 'exact' | 'fuzzy'; start: number; end: number } // [start,end) offsets into content
  | { kind: 'ambiguous'; count: number }
  | { kind: 'not_found' };

/** Levenshtein similarity floor for the final fuzzy fallback (strategy 4). */
export const SIMILARITY_THRESHOLD = 0.66;

/**
 * Locate `needle` inside `content`, tolerating whitespace / unicode / small
 * textual drift.
 *
 * @param content The full original text to search.
 * @param needle  The (possibly drifted) text to find.
 * @returns The first non-null result of the strategy ladder, or `not_found`.
 */
export function locateMatch(content: string, needle: string): MatchResult {
  // Empty needle has no meaningful match.
  if (needle.length === 0) return { kind: 'not_found' };

  // --- 1. Exact ----------------------------------------------------------------
  const exact = locateExact(content, needle);
  if (exact) return exact;

  // --- 2. Per-line whitespace-insensitive -------------------------------------
  const ws = locateByLineWindow(content, needle);
  if (ws) return ws;

  // --- 3. Unicode-canonicalized whitespace pass -------------------------------
  const canon = locateCanonical(content, needle);
  if (canon) return canon;

  // --- 4. Levenshtein similarity ----------------------------------------------
  const lev = locateByLevenshtein(content, needle);
  if (lev) return lev;

  return { kind: 'not_found' };
}

// --- Strategy 1: exact -------------------------------------------------------

/**
 * Plain substring search.
 *
 * @returns `exact` for a single occurrence, `ambiguous` (with the number of
 *          occurrences, overlapping ones included) for several, or `null` when
 *          the needle does not occur at all.
 */
function locateExact(content: string, needle: string): MatchResult | null {
  const first = content.indexOf(needle);
  if (first === -1) return null;

  const second = content.indexOf(needle, first + 1);
  if (second === -1) {
    return { kind: 'exact', start: first, end: first + needle.length };
  }

  // Count all occurrences so the caller can report a useful number.
  let count = 2;
  let idx = content.indexOf(needle, second + 1);
  while (idx !== -1) {
    count++;
    idx = content.indexOf(needle, idx + 1);
  }
  return { kind: 'ambiguous', count };
}

// --- Line-window machinery ---------------------------------------------------

interface Line {
  /** Raw line text (no trailing newline). */
  text: string;
  /** Offset of the first char of this line in the original content. */
  start: number;
  /** Offset one past the last char of this line (before its newline, if any). */
  end: number;
}

/**
 * Split content into lines, tracking each line's real offset span. The span
 * EXCLUDES the trailing newline so consecutive line spans plus their newlines
 * exactly reconstruct the content; the match span we hand back covers from the
 * first matched line's start through the last matched line's end (i.e. without a
 * trailing newline), which is what an in-place splice wants.
 *
 * Content that ends in a newline yields a final empty line.
 */
function splitLines(content: string): Line[] {
  const lines: Line[] = [];
  let start = 0;
  for (let i = 0; i <= content.length; i++) {
    if (i === content.length || content[i] === '\n') {
      lines.push({ text: content.slice(start, i), start, end: i });
      start = i + 1;
    }
  }
  return lines;
}

/** Strip leading/trailing all-blank lines; returns the trimmed slice. */
function trimBlankLines(lines: string[]): string[] {
  let lo = 0;
  let hi = lines.length;
  while (lo < hi && lines[lo]!.trim() === '') lo++;
  while (hi > lo && lines[hi - 1]!.trim() === '') hi--;
  return lines.slice(lo, hi);
}

/**
 * Find a contiguous window of content lines whose trailing-whitespace-trimmed
 * text equals the needle's (blank-trimmed) lines. Returns the real offset span
 * over the matched content lines, or null if zero match. Multiple matches →
 * ambiguous. `normalize` lets the caller fold unicode before comparing; it is
 * applied to one line at a time and is expected not to introduce newlines.
 */
function locateByLineWindow(
  content: string,
  needle: string,
  normalize: (s: string) => string = (s) => s,
): MatchResult | null {
  const contentLines = splitLines(content);
  const needleLines = trimBlankLines(needle.split('\n'));
  const n = needleLines.length;
  // After trimBlankLines the first and last needle lines are non-blank, so an
  // all-blank needle shows up here as n === 0 and nothing else needs checking.
  if (n === 0) return null;

  // Normalize every line exactly once instead of once per window.
  const needleKeys = needleLines.map((l) => normalize(l.trimEnd()));
  const contentKeys = contentLines.map((l) => normalize(l.text.trimEnd()));

  let hitCount = 0;
  let firstHit = -1;
  for (let i = 0; i + n <= contentKeys.length; i++) {
    // Line-by-line compare; bails out at the first mismatching line.
    let j = 0;
    while (j < n && contentKeys[i + j] === needleKeys[j]) j++;
    if (j === n) {
      if (hitCount === 0) firstHit = i;
      hitCount++;
    }
  }

  if (hitCount === 0) return null;
  if (hitCount > 1) return { kind: 'ambiguous', count: hitCount };
  return {
    kind: 'fuzzy',
    start: contentLines[firstHit]!.start,
    end: contentLines[firstHit + n - 1]!.end,
  };
}

// --- Strategy 3: unicode canonicalization ------------------------------------

/**
 * Fold smart punctuation to its ASCII equivalent. Crucially this is a
 * length-PRESERVING, per-character map (every replacement is one char → one
 * char), so an offset into the canonical string is also a valid offset into the
 * original — letting strategy 3 reuse the line-window matcher and still hand
 * back true original-content offsets.
 *
 * Code points are spelled as escapes so the invisible ones stay unambiguous.
 */
function canonicalizeChar(ch: string): string {
  switch (ch) {
    // single quotes / apostrophes
    case '\u2018': // left single quotation mark
    case '\u2019': // right single quotation mark
    case '\u201A': // single low-9 quotation mark
    case '\u201B': // single high-reversed-9 quotation mark
      return "'";
    // double quotes
    case '\u201C': // left double quotation mark
    case '\u201D': // right double quotation mark
    case '\u201E': // double low-9 quotation mark
    case '\u201F': // double high-reversed-9 quotation mark
      return '"';
    // dashes
    case '\u2013': // en dash
    case '\u2014': // em dash
    case '\u2012': // figure dash
    case '\u2015': // horizontal bar
    case '\u2212': // minus sign
      return '-';
    // spaces
    case '\u00A0': // nbsp
    case '\u2007': // figure space
    case '\u202F': // narrow nbsp
      return ' ';
    default:
      return ch;
  }
}

/** Apply `canonicalizeChar` to every character of `s`; length is unchanged. */
function canonicalize(s: string): string {
  let out = '';
  for (const ch of s) out += canonicalizeChar(ch);
  return out;
}

/**
 * Strategy 3: line-window match after folding smart punctuation on both sides.
 *
 * @returns The line-window result on the canonical text, or `null` when
 *          canonicalization changes neither input (strategy 2 already covered
 *          that case) or nothing matches.
 */
function locateCanonical(content: string, needle: string): MatchResult | null {
  // Only worth running if canonicalization actually changes something on either
  // side — otherwise it's identical to strategy 2 which already failed.
  const canonContent = canonicalize(content);
  const canonNeedle = canonicalize(needle);
  if (canonContent === content && canonNeedle === needle) return null;

  // Offsets are preserved (length-preserving fold), so a match on the canonical
  // content maps directly back to the original.
  return locateByLineWindow(canonContent, canonNeedle);
}

// --- Strategy 4: Levenshtein similarity --------------------------------------

/** Standard iterative two-row Levenshtein edit distance (over UTF-16 code units). */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const width = b.length + 1;
  // Row for the empty prefix of `a`: distance to b[0..j) is j insertions.
  let prev: number[] = Array.from({ length: width }, (_, j) => j);
  let curr: number[] = new Array<number>(width).fill(0);

  for (let i = 1; i <= a.length; i++) {
    curr[0] = i;
    const ac = a.charCodeAt(i - 1);
    for (let j = 1; j <= b.length; j++) {
      const cost = ac === b.charCodeAt(j - 1) ? 0 : 1;
      curr[j] = Math.min(
        prev[j]! + 1, // deletion
        curr[j - 1]! + 1, // insertion
        prev[j - 1]! + cost, // substitution
      );
    }
    // Reuse the two rows instead of allocating a new one per iteration.
    [prev, curr] = [curr, prev];
  }
  return prev[b.length]!;
}

/** Normalized similarity in [0,1]: 1 - dist / max(len). */
function similarity(a: string, b: string): number {
  const maxLen = Math.max(a.length, b.length);
  if (maxLen === 0) return 1;
  return 1 - levenshtein(a, b) / maxLen;
}

/**
 * Strategy 4: pick the window of content lines most similar to the needle.
 *
 * Both sides are compared with every line fully trimmed (so indentation drift
 * is free) and joined with '\n'.
 *
 * @returns `fuzzy` for a unique best window scoring >= SIMILARITY_THRESHOLD,
 *          `ambiguous` when several windows share that best score (picking one
 *          would be a guess), or `null` when nothing reaches the threshold.
 */
function locateByLevenshtein(content: string, needle: string): MatchResult | null {
  const contentLines = splitLines(content);
  const needleLines = trimBlankLines(needle.split('\n'));
  const n = needleLines.length;
  if (n === 0) return null;
  if (contentLines.length < n) return null;

  const needleJoined = needleLines.map((l) => l.trim()).join('\n');
  // Trim each content line once; windows below are just slices of this.
  const trimmedLines = contentLines.map((l) => l.text.trim());

  let best = -1;
  let bestIndex = -1;
  let tiedAtBest = 0;
  for (let i = 0; i + n <= contentLines.length; i++) {
    const windowJoined = trimmedLines.slice(i, i + n).join('\n');

    // Edit distance is at least the length difference, so this is an upper
    // bound on the similarity. Windows that cannot reach the threshold are
    // skipped without running the quadratic DP; they could never be accepted.
    const maxLen = Math.max(windowJoined.length, needleJoined.length);
    const lengthGap = Math.abs(windowJoined.length - needleJoined.length);
    if (maxLen > 0 && 1 - lengthGap / maxLen < SIMILARITY_THRESHOLD) continue;

    const score = similarity(windowJoined, needleJoined);
    if (score > best) {
      best = score;
      bestIndex = i;
      tiedAtBest = 1;
    } else if (score === best) {
      tiedAtBest++;
    }
  }

  if (bestIndex === -1 || best < SIMILARITY_THRESHOLD) return null;
  // Several windows are equally good: refuse to guess, like strategies 1–3.
  if (tiedAtBest > 1) return { kind: 'ambiguous', count: tiedAtBest };
  return {
    kind: 'fuzzy',
    start: contentLines[bestIndex]!.start,
    end: contentLines[bestIndex + n - 1]!.end,
  };
}