boocode/apps/coder/src/services/fuzzy-match.ts

// Fuzzy patch locator for staged edits.
//
// Local quantized models (qwen3.6 and friends) frequently reproduce an
// `old_string` with small, semantically-irrelevant drift: trailing whitespace,
// a different indent width, or "smart" unicode punctuation (curly quotes, an
// en/em-dash, a non-breaking space) where the source has the plain ASCII form.
// An exact `String.includes` then fails and the queued edit is lost even though
// a human would say it obviously matches.
//
// `locateMatch` walks a ladder of progressively looser strategies and returns
// the real `[start, end)` span in the ORIGINAL content, expressed as JavaScript
// string indices (UTF-16 code units, not bytes), so the caller can splice in
// `new_string` over the true file text (preserving the file's own
// whitespace/unicode, not the model's drifted copy). The ladder stops at the
// first strategy that produces a result, whether that is a single span or an
// `ambiguous` refusal:
//
//   1. exact            — indexOf; >1 hit is reported `ambiguous` (we refuse to
//                         guess which occurrence the model meant).
//   2. per-line ws      — line-window compare ignoring per-line trailing
//                         whitespace and leading/trailing blank needle lines.
//   3. unicode canon    — same line-window compare after folding smart
//                         punctuation to ASCII on both sides; the match is
//                         mapped back to original offsets.
//   4. levenshtein      — best line-window by normalized edit-distance
//                         similarity; accepted only at >= SIMILARITY_THRESHOLD,
//                         anchored on an exact first+last line for multi-line
//                         needles, and REFUSED (ambiguous) when a second window
//                         scores within AMBIGUITY_EPSILON of the best. Like the
//                         exact/whitespace tiers, this tier fails CLOSED — it
//                         never splices over a merely-plausible guess, because a
//                         wrong-window splice corrupts the file (it leaves the
//                         real target intact and duplicates it). This mirrors
//                         opencode/cline/qwen, whose fuzzy tiers all keep the
//                         unique-match requirement rather than picking a winner.
//
// Pure and dependency-free (Levenshtein is the standard iterative two-row DP),
// reimplemented from the general technique — no vendored source.

/**
 * Outcome of a `locateMatch` lookup.
 *
 * - `exact` / `fuzzy`: exactly one span was found; `start`/`end` are the
 *   half-open `[start, end)` string indices of that span in the content.
 *   `exact` is produced by the literal `indexOf` tier, `fuzzy` by the
 *   line-window and Levenshtein tiers.
 * - `ambiguous`: more than one candidate qualified; `count` says how many.
 * - `not_found`: no tier produced a match (also returned for an empty needle).
 */
export type MatchResult =
  | { kind: 'exact' | 'fuzzy'; start: number; end: number } // [start,end) offsets into content
  | { kind: 'ambiguous'; count: number }
  | { kind: 'not_found' };

/**
 * Levenshtein similarity floor for the final fuzzy fallback (strategy 4).
 * The score it is compared against is `1 - editDistance / max(length)` of the
 * best candidate window, and the check is inclusive (a score exactly at the
 * threshold is accepted).
 *
 * 0.66 was far too low — at two-thirds similarity a structurally-wrong window
 * (e.g. one of three near-identical form blocks) clears the bar and gets spliced
 * over, leaving the real target intact and duplicated. Competent agents anchor
 * far tighter (opencode's BlockAnchor needs an exact anchor; cline needs exact
 * first+last lines). 0.85 keeps genuine quantized-model drift (a typo, an indent
 * shift) while refusing a different block.
 */
export const SIMILARITY_THRESHOLD = 0.85;

/**
 * If a second candidate window scores within this of the best, the match is
 * ambiguous and tier 4 refuses rather than guessing — the same fail-closed
 * stance the exact and whitespace tiers take on multiple hits. Repetitive files
 * (the duplicate-block corruption case) produce near-tied windows; this is what
 * turns that into a clean "add more context" error instead of a wrong splice.
 *
 * The comparison is inclusive and relative to the best score: any window
 * scoring `>= best - AMBIGUITY_EPSILON` counts as tied, even if that window is
 * itself below `SIMILARITY_THRESHOLD`.
 */
export const AMBIGUITY_EPSILON = 0.05;

/**
 * Multi-line needles at or above this length must anchor on an exact (after
 * trim + unicode-fold) first AND last line before similarity is even scored —
 * the cline/opencode block-anchor rule. Below it, threshold + uniqueness alone
 * guard the match.
 *
 * The length is measured in lines, after leading/trailing blank needle lines
 * have been stripped.
 */
const ANCHOR_MIN_LINES = 3;

/**
 * Locate `needle` inside `content`, tolerating small model-introduced drift.
 *
 * Strategies are tried strictly in order, from tightest to loosest, and the
 * first one that returns a non-null result decides the outcome — including an
 * `ambiguous` refusal, which is NOT retried by a looser tier.
 *
 * @param content The original file text to search.
 * @param needle  The (possibly drifted) text to find.
 * @returns The single matching span, an `ambiguous` report, or `not_found`.
 */
export function locateMatch(content: string, needle: string): MatchResult {
  // An empty needle has no meaningful position, so it is never "found".
  if (needle.length === 0) return { kind: 'not_found' };

  // Tier order matters: exact → per-line whitespace → unicode fold → Levenshtein.
  const ladder: ReadonlyArray<(haystack: string, wanted: string) => MatchResult | null> = [
    locateExact,
    locateByLineWindow,
    locateCanonical,
    locateByLevenshtein,
  ];

  for (const strategy of ladder) {
    const outcome = strategy(content, needle);
    if (outcome !== null) return outcome;
  }

  return { kind: 'not_found' };
}

// --- Strategy 1: exact -------------------------------------------------------

/**
 * Strategy 1: literal substring search.
 *
 * Every occurrence is counted (each search resumes one position past the
 * previous hit, so overlapping occurrences count too). One occurrence yields
 * an `exact` span; several yield `ambiguous` with the total; none yields null
 * so the next tier can try. Callers must not pass an empty needle.
 */
function locateExact(content: string, needle: string): MatchResult | null {
  let occurrences = 0;
  let firstAt = -1;

  for (let at = content.indexOf(needle); at !== -1; at = content.indexOf(needle, at + 1)) {
    if (occurrences === 0) firstAt = at;
    occurrences += 1;
  }

  if (occurrences === 0) return null;
  if (occurrences > 1) return { kind: 'ambiguous', count: occurrences };
  return { kind: 'exact', start: firstAt, end: firstAt + needle.length };
}

// --- Line-window machinery ---------------------------------------------------

/**
 * One line of the content together with its position in the original string,
 * such that `content.slice(start, end) === text`.
 */
interface Line {
  /** Raw line text (no trailing newline). */
  text: string;
  /** Offset of the first char of this line in the original content. */
  start: number;
  /** Offset one past the last char of this line (before its newline, if any). */
  end: number;
}

/**
 * Split content into lines, tracking each line's real offset span.
 *
 * The span EXCLUDES the line terminator, so consecutive line spans plus their
 * terminators exactly reconstruct the content; a match span built from these
 * lines covers from the first matched line's start through the last matched
 * line's end (i.e. without a trailing terminator), which is what an in-place
 * splice wants.
 *
 * Both `\n` and `\r\n` are treated as terminators. Excluding the `\r` of a
 * CRLF pair matters: if it were part of the line, a span ending on that line
 * would swallow it and the splice would leave a bare `\n` behind in an
 * otherwise-CRLF file. A `\r` that is not immediately followed by `\n` is
 * ordinary line content.
 *
 * @param content Text to split; may be empty.
 * @returns At least one line. Content ending in a newline yields a final
 *          empty line positioned at `content.length`.
 */
function splitLines(content: string): Line[] {
  const lines: Line[] = [];
  let lineStart = 0;
  let newlineAt = content.indexOf('\n');

  while (newlineAt !== -1) {
    // Step back over the '\r' of a CRLF pair so it stays outside the span.
    const isCrlf = newlineAt > lineStart && content[newlineAt - 1] === '\r';
    const lineEnd = isCrlf ? newlineAt - 1 : newlineAt;
    lines.push({ text: content.slice(lineStart, lineEnd), start: lineStart, end: lineEnd });
    lineStart = newlineAt + 1;
    newlineAt = content.indexOf('\n', lineStart);
  }

  // Final (possibly empty) line with no terminator after it.
  lines.push({ text: content.slice(lineStart), start: lineStart, end: content.length });
  return lines;
}

/**
 * Drop whitespace-only lines from both ends of `lines`, keeping interior blank
 * lines. Always returns a new array; the input is not modified. An input that
 * is empty or entirely blank yields `[]`.
 */
function trimBlankLines(lines: string[]): string[] {
  const isBlank = (line: string): boolean => line.trim() === '';

  const firstKept = lines.findIndex((line) => !isBlank(line));
  if (firstKept === -1) return [];

  // A non-blank line exists at `firstKept`, so this backward scan must stop.
  let lastKept = lines.length - 1;
  while (isBlank(lines[lastKept]!)) lastKept -= 1;

  return lines.slice(firstKept, lastKept + 1);
}

/**
 * Find a contiguous window of content lines whose trailing-whitespace-trimmed
 * text equals the needle's lines (after leading/trailing blank needle lines are
 * dropped).
 *
 * Each content line is trimmed and normalized exactly once up front, and
 * windows are compared line by line with an early exit on the first mismatch,
 * so no per-window strings are built.
 *
 * @param content   Text to search.
 * @param needle    Text to find; a needle made up only of blank lines cannot
 *                  match.
 * @param normalize Optional per-line fold applied to both sides before
 *                  comparing (e.g. a unicode fold). It is expected to be pure
 *                  and to never introduce newline characters.
 * @returns `null` when no window matches, `ambiguous` with the hit count when
 *          several do, otherwise a `fuzzy` span running from the first matched
 *          line's start to the last matched line's end in `content`.
 */
function locateByLineWindow(
  content: string,
  needle: string,
  normalize: (s: string) => string = (s) => s,
): MatchResult | null {
  const needleKeys = trimBlankLines(needle.split('\n')).map((line) => normalize(line.trimEnd()));
  const n = needleKeys.length;
  // Nothing left after blank trimming → nothing meaningful to locate.
  if (n === 0) return null;

  const contentLines = splitLines(content);
  const contentKeys = contentLines.map((line) => normalize(line.text.trimEnd()));

  let hitCount = 0;
  let firstHit = -1;
  for (let i = 0; i + n <= contentKeys.length; i++) {
    let matches = true;
    for (let j = 0; j < n; j++) {
      if (contentKeys[i + j] !== needleKeys[j]) {
        matches = false;
        break;
      }
    }
    if (!matches) continue;
    if (hitCount === 0) firstHit = i;
    hitCount++;
  }

  if (hitCount === 0) return null;
  // Several identical windows: refuse to guess which one was meant.
  if (hitCount > 1) return { kind: 'ambiguous', count: hitCount };
  return {
    kind: 'fuzzy',
    start: contentLines[firstHit]!.start,
    end: contentLines[firstHit + n - 1]!.end,
  };
}

// --- Strategy 3: unicode canonicalization ------------------------------------

/**
 * Smart-punctuation → ASCII fold table used by `canonicalizeChar`. Every key
 * and every value is a single UTF-16 code unit, which is what keeps the fold
 * length-preserving.
 */
const SMART_PUNCTUATION_FOLDS: ReadonlyMap<string, string> = new Map<string, string>([
  // single quotes / apostrophes
  ['\u2018', "'"], // left single quotation mark
  ['\u2019', "'"], // right single quotation mark
  ['\u201A', "'"], // single low-9 quotation mark
  ['\u201B', "'"], // single high-reversed-9 quotation mark
  // double quotes
  ['\u201C', '"'], // left double quotation mark
  ['\u201D', '"'], // right double quotation mark
  ['\u201E', '"'], // double low-9 quotation mark
  ['\u201F', '"'], // double high-reversed-9 quotation mark
  // dashes
  ['\u2013', '-'], // en dash
  ['\u2014', '-'], // em dash
  ['\u2012', '-'], // figure dash
  ['\u2015', '-'], // horizontal bar
  ['\u2212', '-'], // minus sign
  // spaces
  ['\u00A0', ' '], // no-break space
  ['\u2007', ' '], // figure space
  ['\u202F', ' '], // narrow no-break space
]);

/**
 * Fold one smart-punctuation character to its ASCII equivalent; any other
 * input is returned unchanged.
 *
 * The mapping is strictly one char → one char. Because of that, an offset into
 * a folded string is also a valid offset into the original string, which lets
 * strategy 3 match on folded text and still report true original-content
 * offsets.
 */
function canonicalizeChar(ch: string): string {
  const folded = SMART_PUNCTUATION_FOLDS.get(ch);
  return folded === undefined ? ch : folded;
}

/**
 * Apply `canonicalizeChar` to every character of `s` and return the folded
 * string. Iteration is by code point, so characters outside the BMP are handed
 * to `canonicalizeChar` whole rather than as split surrogate halves.
 */
function canonicalize(s: string): string {
  return Array.from(s, (ch) => canonicalizeChar(ch)).join('');
}

/**
 * Strategy 3: rerun the line-window match with smart punctuation folded to
 * ASCII on both the content and the needle.
 *
 * If folding changes neither side, this would just repeat the plain line-window
 * pass that already failed, so it returns null immediately. The fold is
 * length-preserving, so the span found in the folded content is returned as-is
 * and is valid for the original content.
 */
function locateCanonical(content: string, needle: string): MatchResult | null {
  const foldedContent = canonicalize(content);
  const foldedNeedle = canonicalize(needle);

  const foldChangedSomething = foldedContent !== content || foldedNeedle !== needle;
  return foldChangedSomething ? locateByLineWindow(foldedContent, foldedNeedle) : null;
}

// --- Strategy 4: Levenshtein similarity --------------------------------------

/**
 * Levenshtein edit distance between `a` and `b`, computed with the classic
 * dynamic-programming recurrence while keeping only two rows of the table.
 * Characters are compared as UTF-16 code units.
 *
 * @returns The minimum number of single-unit insertions, deletions and
 *          substitutions needed to turn `a` into `b`.
 */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0;

  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;

  // `above` is the finished row for the previous prefix of `a`; row 0 is the
  // cost of building each prefix of `b` from the empty string.
  let above: number[] = Array.from({ length: cols + 1 }, (_, j) => j);
  let current: number[] = new Array<number>(cols + 1).fill(0);

  for (let i = 0; i < rows; i++) {
    const rowCode = a.charCodeAt(i);
    current[0] = i + 1;
    for (let j = 0; j < cols; j++) {
      const substitute = above[j]! + (rowCode === b.charCodeAt(j) ? 0 : 1);
      const remove = above[j + 1]! + 1;
      const insert = current[j]! + 1;
      current[j + 1] = Math.min(remove, insert, substitute);
    }
    // Recycle the two buffers instead of allocating a new row each pass.
    const spare = above;
    above = current;
    current = spare;
  }

  return above[cols]!;
}

/**
 * Similarity of two strings on a 0..1 scale: `1 - editDistance / longerLength`.
 * Two empty strings are defined as fully similar (1) to avoid dividing by zero.
 */
function similarity(a: string, b: string): number {
  const longest = a.length >= b.length ? a.length : b.length;
  return longest === 0 ? 1 : 1 - levenshtein(a, b) / longest;
}

/**
 * Strategy 4: pick the line window most similar to the needle by normalized
 * Levenshtein similarity, comparing lines with leading and trailing whitespace
 * removed so indentation drift does not count against a window.
 *
 * A window is accepted only if ALL of the following hold:
 *  - for needles of `ANCHOR_MIN_LINES` lines or more, its first and last lines
 *    equal the needle's first and last lines after trim + unicode fold;
 *  - it has the highest score and that score is `>= SIMILARITY_THRESHOLD`;
 *  - no other scored window is within `AMBIGUITY_EPSILON` of it.
 *
 * Per-line work (trimming, and folding for the anchor check) is done once per
 * line rather than once per window, and the quadratic edit-distance
 * computation is skipped for windows whose length alone already rules them out
 * (see the length-bound comment in the loop).
 *
 * @param content Text to search.
 * @param needle  Text to find.
 * @returns `null` when nothing clears the threshold, `ambiguous` (with the
 *          number of near-tied windows) when the best is not unique, otherwise
 *          a `fuzzy` span from the first matched line's start to the last
 *          matched line's end in `content`.
 */
function locateByLevenshtein(content: string, needle: string): MatchResult | null {
  const needleTrimmed = trimBlankLines(needle.split('\n')).map((line) => line.trim());
  const n = needleTrimmed.length;
  if (n === 0) return null;

  const contentLines = splitLines(content);
  if (contentLines.length < n) return null;

  const needleJoined = needleTrimmed.join('\n');
  const needleLen = needleJoined.length;
  const trimmed = contentLines.map((line) => line.text.trim());

  // Block-anchor gate for multi-line needles: the first and last lines must match
  // exactly (after trim + unicode-fold) or the window is not even scored. This
  // stops a high interior-similarity from dragging a structurally-wrong window
  // over the threshold — the failure that duplicates blocks in repetitive files.
  const anchored = n >= ANCHOR_MIN_LINES;
  const anchorKeys = anchored ? trimmed.map((line) => canonicalize(line)) : [];
  const needleFirst = anchored ? canonicalize(needleTrimmed[0]!) : '';
  const needleLast = anchored ? canonicalize(needleTrimmed[n - 1]!) : '';

  // A window scoring below this can neither be the accepted best (which needs
  // >= SIMILARITY_THRESHOLD) nor count as a near-tie of an accepted best.
  const relevanceFloor = SIMILARITY_THRESHOLD - AMBIGUITY_EPSILON;

  // Length of the current window's joined text, maintained incrementally:
  // the trimmed line lengths plus the n - 1 joining newlines.
  let windowLen = n - 1;
  for (let k = 0; k < n; k++) windowLen += trimmed[k]!.length;

  const candidates: Array<{ score: number; index: number }> = [];
  for (let i = 0; i + n <= trimmed.length; i++) {
    // Slide the length first, so the `continue`s below cannot desync it.
    if (i > 0) windowLen += trimmed[i + n - 1]!.length - trimmed[i - 1]!.length;

    if (anchored && (anchorKeys[i] !== needleFirst || anchorKeys[i + n - 1] !== needleLast)) {
      continue;
    }

    // Length bound: edit distance is at least the length difference, so the
    // score can never exceed `bestPossible`. When even that is under the
    // floor, the window cannot influence the result and the DP is skipped.
    const longer = Math.max(windowLen, needleLen);
    const bestPossible = 1 - Math.abs(windowLen - needleLen) / longer;
    if (bestPossible < relevanceFloor) continue;

    const windowJoined = trimmed.slice(i, i + n).join('\n');
    candidates.push({ score: similarity(windowJoined, needleJoined), index: i });
  }

  let best: { score: number; index: number } | null = null;
  for (const candidate of candidates) {
    if (best === null || candidate.score > best.score) best = candidate;
  }
  if (best === null || best.score < SIMILARITY_THRESHOLD) return null;

  // Uniqueness guard: refuse when a second window is within epsilon of the best.
  // Fail closed (ambiguous) rather than silently splicing one of several lookalikes.
  const cutoff = best.score - AMBIGUITY_EPSILON;
  let tied = 0;
  for (const candidate of candidates) {
    if (candidate.score >= cutoff) tied++;
  }
  if (tied > 1) return { kind: 'ambiguous', count: tied };

  return {
    kind: 'fuzzy',
    start: contentLines[best.index]!.start,
    end: contentLines[best.index + n - 1]!.end,
  };
}