// boocode/apps/coder/src/services/__tests__/fuzzy-match.test.ts

import { describe, it, expect } from 'vitest';
import { locateMatch, SIMILARITY_THRESHOLD } from '../fuzzy-match.js';

// Helper: narrow a locateMatch result to its resolved { start, end } offsets,
// throwing for any kind that carries no span. The tests slice `content` with
// these offsets to pin the EXACT file text the caller would replace.
function span(result: ReturnType<typeof locateMatch>): { start: number; end: number } {
  switch (result.kind) {
    case 'exact':
    case 'fuzzy': {
      const { start, end } = result;
      return { start, end };
    }
    default:
      throw new Error(`expected a located span, got ${result.kind}`);
  }
}

describe('locateMatch — strategy 1: exact', () => {
  // A verbatim needle that occurs once resolves to its own offsets.
  it('returns an exact unique span', () => {
    const haystack = 'alpha\nbeta\ngamma\n';
    const located = locateMatch(haystack, 'beta');
    expect(located.kind).toBe('exact');
    const offsets = span(located);
    expect(haystack.slice(offsets.start, offsets.end)).toBe('beta');
  });

  // The span of a verbatim needle may cross a line boundary.
  it('returns the right offsets for a multi-line exact needle', () => {
    const haystack = ['one', 'two', 'three', 'four', ''].join('\n');
    const wanted = ['two', 'three'].join('\n');
    const located = locateMatch(haystack, wanted);
    expect(located.kind).toBe('exact');
    const offsets = span(located);
    expect(haystack.slice(offsets.start, offsets.end)).toBe(wanted);
  });

  // Three verbatim occurrences: the result reports the count instead of a span.
  it('refuses when the exact needle occurs more than once', () => {
    const haystack = ['foo', 'bar', 'foo', 'bar', 'foo', ''].join('\n');
    const located = locateMatch(haystack, 'foo');
    expect(located).toEqual({ kind: 'ambiguous', count: 3 });
  });
});

describe('locateMatch — strategy 2: per-line whitespace', () => {
  it('matches across trailing-whitespace drift at the real span', () => {
    // File has trailing spaces the model dropped from a TWO-line copy. A
    // single-line needle would be located by exact indexOf (it's a substring),
    // so use two lines where line 1's trailing ws breaks an exact substring run.
    const content = 'function f() {\n  setup();   \n  return 1;\n}\n';
    const needle = '  setup();\n  return 1;'; // line 1 missing trailing spaces
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    // The returned span covers the ORIGINAL lines including the trailing spaces.
    expect(content.slice(start, end)).toBe('  setup();   \n  return 1;');
  });

  it('matches trailing-whitespace drift inside an indented multi-line block', () => {
    // Leading indentation is identical in file and needle; only the trailing
    // whitespace after `doThing();` differs. Leading-indent drift is NOT
    // exercised here — that is left to the Levenshtein tier.
    const content = ['if (x) {', '    doThing();    ', '    doOther();', '}'].join('\n');
    const needle = ['    doThing();', '    doOther();'].join('\n');
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    expect(content.slice(start, end)).toBe('    doThing();    \n    doOther();');
  });

  it('ignores leading/trailing blank needle lines', () => {
    const content = 'header\nbody line\nfooter\n';
    const needle = '\n\nbody line\n\n';
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    expect(content.slice(start, end)).toBe('body line');
  });

  it('reports ambiguous when a whitespace-window matches twice', () => {
    // A single-line needle such as 'x = 1;' is an exact substring of both
    // copies and would be reported by the exact tier, never reaching this one.
    // Use a TWO-line needle instead: the trailing whitespace after `x = 1;` in
    // each copy breaks the exact substring run, so only the per-line
    // whitespace comparison can see the two equivalent windows.
    const content = 'x = 1;  \ny = 2;\nz = 3;\nx = 1;\t\ny = 2;\n';
    const needle = 'x = 1;\ny = 2;';
    // Premise guard: the exact tier has nothing to find.
    expect(content.includes(needle)).toBe(false);
    const result = locateMatch(content, needle);
    expect(result).toEqual({ kind: 'ambiguous', count: 2 });
  });
});

describe('locateMatch — strategy 3: unicode canonicalization', () => {
  // Every non-ASCII needle character is spelled as a \u escape. Invisible or
  // look-alike code points (NBSP in particular) are otherwise indistinguishable
  // from their ASCII counterparts in the source and can be silently normalized
  // by an editor or formatter, which would turn these into exact-match tests.

  it('matches across curly quotes', () => {
    const content = "const s = 'hello';\n";
    const needle = 'const s = \u2018hello\u2019;'; // U+2018 / U+2019 curly single quotes
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    // Span maps back to ORIGINAL (straight-quote) text.
    expect(content.slice(start, end)).toBe("const s = 'hello';");
  });

  it('matches across curly double-quotes', () => {
    const content = 'log("done");\n';
    const needle = 'log(\u201Cdone\u201D);'; // U+201C / U+201D curly double quotes
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    expect(content.slice(start, end)).toBe('log("done");');
  });

  it('matches across an em-dash drift', () => {
    const content = 'range 1-10 inclusive\n';
    const needle = 'range 1\u201410 inclusive'; // U+2014 em-dash instead of '-'
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    expect(content.slice(start, end)).toBe('range 1-10 inclusive');
  });

  it('matches across a non-breaking space drift', () => {
    const content = 'a b c\n'; // plain U+0020 spaces
    const needle = 'a\u00A0b\u00A0c'; // U+00A0 no-break space between words
    // Premise guard: the needle must NOT be a verbatim substring, otherwise the
    // exact tier would answer and this test would prove nothing.
    expect(content.includes(needle)).toBe(false);
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    expect(content.slice(start, end)).toBe('a b c');
  });
});

describe('locateMatch — strategy 4: Levenshtein', () => {
  it('matches a >= threshold near-miss (small typo drift)', () => {
    // Needle has a one-char typo ('totals' vs 'total') so it is NOT an exact
    // substring and the whitespace/canonical tiers (which require equality) both
    // miss; a single-character edit is expected to stay above
    // SIMILARITY_THRESHOLD.
    const content = 'const total = sum + tax;\n';
    const needle = 'const totals = sum + tax;';
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    // Span maps to the real (correctly-spelled) file line.
    expect(content.slice(start, end)).toBe('const total = sum + tax;');
  });

  it('matches a single line with leading-indentation drift via Levenshtein', () => {
    const content = ['function g() {', '  return compute(a, b);', '}'].join('\n');
    // 6-space indent vs file's 2-space; trimEnd does not fix leading indent, so
    // this is expected to land on the Levenshtein tier. The needle is a single
    // line: multi-line needles are subject to first/last-line anchoring, which
    // this test deliberately stays clear of.
    const needle = '      return compute(a, b);';
    const result = locateMatch(content, needle);
    expect(result.kind).toBe('fuzzy');
    const { start, end } = span(result);
    // The span keeps the file's own 2-space indentation.
    expect(content.slice(start, end)).toBe('  return compute(a, b);');
  });

  it('returns not_found for a below-threshold miss', () => {
    const content = 'the quick brown fox jumps over the lazy dog\n';
    const needle = 'completely unrelated string of text here xyz';
    const result = locateMatch(content, needle);
    expect(result).toEqual({ kind: 'not_found' });
  });

  it('returns not_found for a genuinely-absent needle', () => {
    const content = 'alpha\nbeta\ngamma\n';
    const needle = 'this content does not exist anywhere at all';
    const result = locateMatch(content, needle);
    expect(result).toEqual({ kind: 'not_found' });
  });
});

describe('locateMatch — strategy 4: fail-closed on ambiguity (corruption guard)', () => {
  it('refuses (ambiguous) when two equally-similar anchored blocks both clear the bar', () => {
    // Repetitive-file scenario: both blocks open and close with the same anchor
    // lines, and each middle line is one character away from the needle's
    // middle line. Neither candidate is better, so the result must be a
    // refusal rather than a span over either block.
    const content =
      'const x = {\n' +
      '  total = aa;\n' +
      '};\n' +
      'const x = {\n' +
      '  total = bb;\n' +
      '};';
    const needle = 'const x = {\n' + '  total = ab;\n' + '};';
    const outcome = locateMatch(content, needle);
    expect(outcome.kind).toBe('ambiguous');
  });

  it('refuses a below-threshold near-miss that the old 0.66 floor would have spliced', () => {
    // The identifier differs substantially ('grandTotalAmount' vs 'totalValue').
    // Under the raised floor this must come back as not_found so the caller can
    // surface a correctable error instead of editing the wrong text.
    const fileText = 'const grandTotalAmount = a + b;\n';
    const drifted = 'const totalValue = a + b;';
    expect(locateMatch(fileText, drifted)).toEqual({ kind: 'not_found' });
  });

  it('still matches a single genuine high-similarity drift uniquely', () => {
    // One-character typo in the needle; only one candidate line exists.
    const fileText = 'const total = sum + tax;\n';
    const drifted = 'const totals = sum + tax;';
    const outcome = locateMatch(fileText, drifted);
    expect(outcome.kind).toBe('fuzzy');
    const offsets = span(outcome);
    expect(fileText.slice(offsets.start, offsets.end)).toBe('const total = sum + tax;');
  });

  it('requires an exact first+last line anchor for multi-line needles', () => {
    // The needle's opening line has no counterpart in the file, so no window
    // can be anchored and the result is not_found even though the needle's
    // middle line appears verbatim.
    const fileText = 'function compute() {\n  return a + b;\n  return done;\n}';
    const drifted = 'totally different opener\n  return a + b;\n}';
    expect(locateMatch(fileText, drifted)).toEqual({ kind: 'not_found' });
  });
});

describe('locateMatch — edge cases', () => {
  // An empty needle never resolves to a span.
  it('returns not_found for an empty needle', () => {
    const outcome = locateMatch('anything', '');
    expect(outcome).toEqual({ kind: 'not_found' });
  });

  // The exported threshold must lie in the half-open interval (0, 1].
  it('exposes a sane similarity threshold', () => {
    const threshold = SIMILARITY_THRESHOLD;
    expect(threshold).toBeGreaterThan(0);
    expect(threshold).toBeLessThanOrEqual(1);
  });
});