- Bm25Ranker: Okapi BM25 scoring (pure TS, no deps)
- Embedding module: ONNX-based local embeddings via onnxruntime-node
- Hybrid recall: BM25 (30%) + cosine similarity (70%) weighted merge
- Falls back to keyword-only via MEMORY_SEARCH=keyword env var
- extract_memory agent tool for persisting memory entries
68 lines
2.1 KiB
TypeScript
68 lines
2.1 KiB
TypeScript
// BM25 ranker — pure Okapi BM25 scoring. No external deps.
|
|
|
|
/**
 * Optional tuning parameters accepted by the `Bm25Ranker` constructor.
 * Any field left out falls back to the ranker's built-in default.
 */
interface Bm25Config {
  /** Term-frequency scaling parameter of the BM25 formula; defaults to 1.5 when omitted. */
  k1?: number;
  /** Weight of the document-length normalisation term; defaults to 0.75 when omitted. */
  b?: number;
}
|
|
|
|
/**
 * Okapi BM25 ranker over an in-memory corpus of plain-text documents.
 *
 * Text is lower-cased and split on whitespace; no stemming or punctuation
 * stripping is applied. Call {@link Bm25Ranker.fit} before scoring: it builds
 * a per-document term-frequency index so that later `score`/`rank` calls do
 * not need to re-tokenise the corpus.
 */
export class Bm25Ranker {
  private readonly k1: number;
  private readonly b: number;
  /** Per-document token count and term-frequency table, built by `fit`. */
  private docs: Array<{ len: number; termFreqs: Map<string, number> }>;
  /** Number of documents that contain each term at least once. */
  private docFreq: Map<string, number>;
  private avgDocLen: number;
  private readonly idfCache: Map<string, number>;
  private docCount: number;

  /**
   * @param config Optional BM25 parameters; `k1` defaults to 1.5 and `b` to 0.75.
   */
  constructor(config?: Bm25Config) {
    this.k1 = config?.k1 ?? 1.5;
    this.b = config?.b ?? 0.75;
    this.docs = [];
    this.docFreq = new Map();
    this.avgDocLen = 0;
    this.idfCache = new Map();
    this.docCount = 0;
  }

  /**
   * Indexes `docs` as the corpus, replacing any previously fitted corpus.
   *
   * The documents are tokenised once here; later changes to the caller's
   * array do not affect scoring until `fit` is called again.
   *
   * @param docs Documents to index; positions in this array are the indices
   *   used by `score` and returned by `rank`.
   */
  fit(docs: string[]): void {
    const indexed: Array<{ len: number; termFreqs: Map<string, number> }> = [];
    const docFreq = new Map<string, number>();
    let totalLen = 0;

    for (const doc of docs) {
      const tokens = this.tokenize(doc);
      const termFreqs = new Map<string, number>();
      for (const token of tokens) {
        termFreqs.set(token, (termFreqs.get(token) ?? 0) + 1);
      }
      // Each distinct term counts once per document towards document frequency.
      for (const term of termFreqs.keys()) {
        docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
      }
      // Length is measured with the same tokenizer used for scoring, so the
      // average stays consistent with the per-document length in `score`.
      totalLen += tokens.length;
      indexed.push({ len: tokens.length, termFreqs });
    }

    this.docs = indexed;
    this.docFreq = docFreq;
    this.docCount = indexed.length;
    // Avoid 0 / 0 = NaN for an empty corpus.
    this.avgDocLen = indexed.length > 0 ? totalLen / indexed.length : 0;
    this.idfCache.clear();
  }

  /** Lower-cases `text` and splits it on whitespace, dropping empty tokens. */
  private tokenize(text: string): string[] {
    return text.toLowerCase().split(/\s+/).filter((t) => t.length > 0);
  }

  /** Returns the (memoised) inverse document frequency of `term` in the fitted corpus. */
  private idf(term: string): number {
    const cached = this.idfCache.get(term);
    if (cached !== undefined) return cached;
    const docsWithTerm = this.docFreq.get(term) ?? 0;
    const value = Math.log(1 + (this.docCount - docsWithTerm + 0.5) / (docsWithTerm + 0.5));
    this.idfCache.set(term, value);
    return value;
  }

  /** Sums the BM25 contribution of every query term for one indexed document. */
  private scoreTerms(queryTerms: string[], docIndex: number): number {
    if (!Number.isInteger(docIndex)) return 0;
    const doc = this.docs[docIndex];
    if (doc === undefined) return 0;

    // avgDocLen is 0 only when no document has any token; nothing can match
    // then, but keep the arithmetic finite regardless.
    const scaledLen = this.avgDocLen > 0 ? (this.b * doc.len) / this.avgDocLen : 0;
    const lengthNorm = this.k1 * (1 - this.b + scaledLen);

    let total = 0;
    for (const term of queryTerms) {
      const tf = doc.termFreqs.get(term) ?? 0;
      if (tf === 0) continue;
      total += this.idf(term) * ((tf * (this.k1 + 1)) / (tf + lengthNorm));
    }
    return total;
  }

  /**
   * Computes the BM25 score of one document for `query`.
   *
   * A term repeated in the query contributes once per occurrence.
   *
   * @param query Free-text query; tokenised the same way as the documents.
   * @param docIndex Position of the document in the array passed to `fit`.
   * @returns The score, or 0 when `docIndex` is not a valid index or no query
   *   term occurs in the document.
   */
  score(query: string, docIndex: number): number {
    return this.scoreTerms(this.tokenize(query), docIndex);
  }

  /**
   * Ranks all fitted documents against `query`.
   *
   * @param query Free-text query.
   * @param topN Maximum number of results to return (default 10); negative
   *   values are treated as 0.
   * @returns Documents with a positive score, best first. Ties keep corpus order.
   */
  rank(query: string, topN: number = 10): Array<{ index: number; score: number }> {
    const queryTerms = this.tokenize(query);
    const hits: Array<{ index: number; score: number }> = [];
    for (let i = 0; i < this.docCount; i++) {
      const score = this.scoreTerms(queryTerms, i);
      if (score > 0) hits.push({ index: i, score });
    }
    hits.sort((a, b) => b.score - a.score);
    return hits.slice(0, Math.max(0, topN));
  }
}
|