// BM25 ranker — pure Okapi BM25 scoring. No external deps.

/** Tuning parameters for {@link Bm25Ranker}. */
interface Bm25Config {
  /** Term-frequency saturation parameter. Defaults to 1.5. */
  k1?: number;
  /** Strength of document-length normalization. Defaults to 0.75. */
  b?: number;
}

/**
 * Ranks a fixed corpus of documents against free-text queries using BM25.
 *
 * Documents and queries are lower-cased and split on runs of whitespace.
 * Call {@link Bm25Ranker.fit} before scoring; an unfitted ranker behaves
 * like one fitted on an empty corpus.
 */
export class Bm25Ranker {
  private readonly k1: number;
  private readonly b: number;
  /** Snapshot of the documents passed to the most recent `fit` call. */
  private corpus: string[] = [];
  private docCount = 0;
  /** Mean token count per document; 0 when the corpus is empty. */
  private avgDocLen = 0;
  /** Token count of each document, indexed like `corpus`. */
  private docLengths: number[] = [];
  /** Per-document term -> occurrence count, indexed like `corpus`. */
  private docTermFreqs: Array<Map<string, number>> = [];
  /** Term -> number of documents that contain it at least once. */
  private docFreq = new Map<string, number>();
  /** Memoized IDF values for the current corpus. */
  private idfCache = new Map<string, number>();

  /**
   * @param config Optional BM25 parameters; missing fields fall back to
   *   `k1 = 1.5` and `b = 0.75`.
   */
  constructor(config?: Bm25Config) {
    this.k1 = config?.k1 ?? 1.5;
    this.b = config?.b ?? 0.75;
  }

  /**
   * Indexes `docs`, replacing any previously fitted corpus.
   *
   * The array is copied and every document is tokenized once up front, so
   * later mutation of `docs` by the caller does not affect scoring.
   *
   * @param docs Documents to index; positions in this array are the indices
   *   used by `score` and returned by `rank`.
   */
  fit(docs: readonly string[]): void {
    this.corpus = [...docs];
    this.docCount = this.corpus.length;
    this.docLengths = [];
    this.docTermFreqs = [];
    this.docFreq = new Map();
    this.idfCache = new Map();

    let totalTokens = 0;
    for (const doc of this.corpus) {
      // Lengths come from the same tokenizer used for scoring so that empty
      // tokens from leading/trailing whitespace never inflate the average.
      const tokens = this.tokenize(doc);
      const freqs = new Map<string, number>();
      for (const token of tokens) {
        freqs.set(token, (freqs.get(token) ?? 0) + 1);
      }
      for (const term of freqs.keys()) {
        this.docFreq.set(term, (this.docFreq.get(term) ?? 0) + 1);
      }
      this.docTermFreqs.push(freqs);
      this.docLengths.push(tokens.length);
      totalTokens += tokens.length;
    }

    // Guard the empty corpus: 0 / 0 would otherwise leave NaN behind.
    this.avgDocLen = this.docCount > 0 ? totalTokens / this.docCount : 0;
  }

  /** Lower-cases `text` and splits it on whitespace, dropping empty tokens. */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .split(/\s+/)
      .filter((t) => t.length > 0);
  }

  /**
   * Inverse document frequency of `term`, computed as
   * `ln(1 + (N - n + 0.5) / (n + 0.5))` where `N` is the corpus size and `n`
   * the number of documents containing the term.
   */
  private idf(term: string): number {
    const cached = this.idfCache.get(term);
    if (cached !== undefined) return cached;

    const docsWithTerm = this.docFreq.get(term) ?? 0;
    const value = Math.log(
      1 + (this.docCount - docsWithTerm + 0.5) / (docsWithTerm + 0.5),
    );
    this.idfCache.set(term, value);
    return value;
  }

  /**
   * Sums the BM25 contribution of each query term for one document.
   * Repeated query terms contribute once per occurrence.
   */
  private scoreTerms(queryTerms: readonly string[], docIndex: number): number {
    const freqs = this.docTermFreqs[docIndex];
    const docLen = this.docLengths[docIndex];
    if (freqs === undefined || docLen === undefined) return 0;

    // avgDocLen is 0 only when every document is empty, in which case no
    // term can match; the guard just keeps the arithmetic free of NaN.
    const lengthRatio = this.avgDocLen > 0 ? docLen / this.avgDocLen : 0;
    const norm = this.k1 * (1 - this.b + this.b * lengthRatio);

    let total = 0;
    for (const term of queryTerms) {
      const tf = freqs.get(term) ?? 0;
      if (tf === 0) continue;
      total += this.idf(term) * ((tf * (this.k1 + 1)) / (tf + norm));
    }
    return total;
  }

  /**
   * BM25 score of the document at `docIndex` for `query`.
   *
   * @param query Free-text query; tokenized the same way as documents.
   * @param docIndex Position of the document in the array given to `fit`.
   * @returns The score, or 0 when `docIndex` is not a valid integer index
   *   or no query term occurs in the document.
   */
  score(query: string, docIndex: number): number {
    if (
      !Number.isInteger(docIndex) ||
      docIndex < 0 ||
      docIndex >= this.docCount
    ) {
      return 0;
    }
    return this.scoreTerms(this.tokenize(query), docIndex);
  }

  /**
   * Scores every document against `query` and returns the best matches.
   *
   * @param query Free-text query.
   * @param topN Maximum number of results to consider (default 10).
   * @returns Up to `topN` entries sorted by descending score; entries whose
   *   score is not greater than 0 are omitted.
   */
  rank(query: string, topN: number = 10): Array<{ index: number; score: number }> {
    // Tokenize the query once rather than once per document.
    const queryTerms = this.tokenize(query);
    const scored = this.docTermFreqs.map((_, i) => ({
      index: i,
      score: this.scoreTerms(queryTerms, i),
    }));
    scored.sort((a, b) => b.score - a.score);
    return scored.slice(0, topN).filter((s) => s.score > 0);
  }
}