import { describe, it, expect } from 'vitest';
import {
  MISTAKE_THRESHOLD,
  freshMistakeState,
  recordStep,
  detectMistakePattern,
  MISTAKE_RECOVERY_NOTE,
  type FailureKind,
} from '../inference/mistake-tracker.js';

// ---- helpers ----------------------------------------------------------------

/**
 * Replays a sequence of outcomes against a fresh mistake state.
 *
 * After every `recordStep` the helper consults `detectMistakePattern` and
 * records its verdict, so a test can assert on the decision made at each step.
 *
 * @param outcomes - Step outcomes to feed in order: a failure kind or `'success'`.
 * @param options - `applyNudge` (default `false`): when true, a `'nudge'`
 *   decision is followed by bumping `state.nudges` and clearing `state.run`,
 *   mirroring the nudge-handling side effect of the loop in turn.ts. When
 *   false the state is only ever mutated by `recordStep`.
 * @returns The final state (so assertions can read `.run` / `.nudges`) and the
 *   per-step decisions, index-aligned with `outcomes`.
 */
function replay(
  outcomes: (FailureKind | 'success')[],
  { applyNudge = false }: { applyNudge?: boolean } = {},
) {
  const state = freshMistakeState();
  const decisions: ReturnType<typeof detectMistakePattern>[] = [];
  for (const o of outcomes) {
    recordStep(state, o);
    const decision = detectMistakePattern(state);
    decisions.push(decision);
    if (applyNudge && decision === 'nudge') {
      // Mirror turn.ts's nudge side effect: bump the counter, reset the streak.
      state.nudges += 1;
      state.run = [];
    }
  }
  return { state, decisions };
}

// ---- fresh state ------------------------------------------------------------

describe('freshMistakeState', () => {
  it('starts with an empty run and zero nudges', () => {
    const s = freshMistakeState();
    expect(s.run).toEqual([]);
    expect(s.nudges).toBe(0);
  });
});

// ---- below threshold --------------------------------------------------------

describe('detectMistakePattern — below threshold', () => {
  it('returns null on a fresh state', () => {
    expect(detectMistakePattern(freshMistakeState())).toBeNull();
  });

  it('returns null after fewer than MISTAKE_THRESHOLD failures', () => {
    const { decisions } = replay(['zod_reject', 'exec_error']);
    expect(decisions).toEqual([null, null]);
  });
});

// ---- success reset ----------------------------------------------------------

describe('recordStep — success resets', () => {
  it("'success' clears both the run streak and the nudge counter", () => {
    const state = freshMistakeState();
    recordStep(state, 'zod_reject');
    recordStep(state, 'exec_error');
    state.nudges = 2; // simulate prior nudges
    recordStep(state, 'success');
    expect(state.run).toEqual([]);
    expect(state.nudges).toBe(0);
  });

  it('a success mid-streak prevents the threshold from tripping', () => {
    // fail, fail, success, fail, fail → streak never reaches 3.
    const { decisions } = replay([
      'zod_reject',
      'exec_error',
      'success',
      'tool_not_found',
      'permission_denied',
    ]);
    expect(decisions.every((d) => d === null)).toBe(true);
  });
});

// ---- 3-streak nudge ---------------------------------------------------------

describe('detectMistakePattern — nudge on 3-streak', () => {
  it("returns 'nudge' the first time the streak reaches MISTAKE_THRESHOLD", () => {
    const { decisions } = replay(['zod_reject', 'exec_error', 'tool_not_found']);
    expect(decisions).toEqual([null, null, 'nudge']);
  });

  it("fires 'nudge' for a streak of identical kinds too (kind-agnostic)", () => {
    const { decisions } = replay(['exec_error', 'exec_error', 'exec_error']);
    expect(decisions[2]).toBe('nudge');
  });
});

// ---- re-trip escalate -------------------------------------------------------

describe('detectMistakePattern — escalate on re-trip', () => {
  it("escalates when the streak re-trips after a nudge with no intervening success", () => {
    // 3 fails → nudge (run reset, nudges=1), then 3 more fails → escalate.
    const { decisions } = replay(
      [
        'zod_reject',
        'exec_error',
        'tool_not_found',
        'permission_denied',
        'exec_error',
        'zod_reject',
      ],
      { applyNudge: true },
    );
    expect(decisions[2]).toBe('nudge');
    expect(decisions[5]).toBe('escalate');
  });

  it("does NOT escalate if a success lands between the nudge and the next streak", () => {
    const { decisions } = replay(
      [
        'zod_reject',
        'exec_error',
        'tool_not_found', // nudge here
        'success', // clears nudges back to 0
        'exec_error',
        'zod_reject',
        'tool_not_found', // 3-streak again → nudge, NOT escalate
      ],
      { applyNudge: true },
    );
    expect(decisions[2]).toBe('nudge');
    expect(decisions[6]).toBe('nudge');
    expect(decisions).not.toContain('escalate');
  });
});

// ---- mixed kinds ------------------------------------------------------------

describe('detectMistakePattern — mixed failure kinds', () => {
  it('counts a streak of three distinct kinds toward the threshold', () => {
    // No applyNudge here, so the run is left intact for inspection after the nudge.
    const { state, decisions } = replay([
      'zod_reject',
      'tool_not_found',
      'exec_error',
    ]);
    expect(decisions[2]).toBe('nudge');
    expect(state.run).toEqual(['zod_reject', 'tool_not_found', 'exec_error']);
  });
});

// ---- contract ---------------------------------------------------------------

describe('MISTAKE_THRESHOLD + MISTAKE_RECOVERY_NOTE', () => {
  it('threshold is a positive integer (tests assume 3)', () => {
    expect(MISTAKE_THRESHOLD).toBeGreaterThan(0);
    expect(Number.isInteger(MISTAKE_THRESHOLD)).toBe(true);
  });

  it('recovery note is a non-empty model-facing string', () => {
    expect(typeof MISTAKE_RECOVERY_NOTE).toBe('string');
    expect(MISTAKE_RECOVERY_NOTE.length).toBeGreaterThan(0);
  });
});