feat: MistakeTracker + file-provenance ledger (v2.7.4)

Two native-inference hardening features from boocode_code_review_v2 §1 #12. MistakeTracker: new pure mistake-tracker.ts tracks consecutive heterogeneous tool failures (kinds surfaced per tool from tool-phase.ts). On 3 in a row the turn loop soft-nudges (model-facing recovery guidance + mistake_recovery sentinel + reset), then escalates to stopping the turn (cap-hit-style, Continue affordance) on a re-trip. Complements doom-loop (identical repeats) + cap-hit. File-provenance ledger: compaction.ts derives a deterministic ## Files Read list from the head messages' read-tool calls and injects it into the rolling-summary prompt so provenance survives compaction (no new table; read-only). mistake_recovery sentinel: MessageMetadata arm (server + web) + MessageBubble render branch. Built by 2 parallel agents. Server 545 tests passing (23 new); build + web tsc clean. Native-inference only. Builds on v2.7.3. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 13:05:03 +00:00
parent f53d6a8afd
commit bcc89d8adc
15 changed files with 816 additions and 20 deletions
--- a/apps/server/src/services/tests/mistake-tracker.test.ts
+++ b/apps/server/src/services/tests/mistake-tracker.test.ts
@@ -0,0 +1,164 @@
+import { describe, it, expect } from 'vitest';
+import {
+  MISTAKE_THRESHOLD,
+  freshMistakeState,
+  recordStep,
+  detectMistakePattern,
+  MISTAKE_RECOVERY_NOTE,
+  type FailureKind,
+} from '../inference/mistake-tracker.js';
+
+// ---- helpers ----------------------------------------------------------------
+// Replays a sequence of outcomes against a fresh state and returns the final
+// state plus the per-step decisions, so assertions can read .run / .nudges or
+// inspect each verdict. With applyNudge set, the helper mimics turn.ts: when a
+// step yields 'nudge' it bumps nudges and resets run (the loop's side effect).
+
+function replay(
+  outcomes: (FailureKind | 'success')[],
+  { applyNudge = false }: { applyNudge?: boolean } = {},
+) {
+  // Record each outcome, capture the verdict, and (only with applyNudge) apply
+  // the bump-nudges / clear-run side effect whenever that verdict is 'nudge'.
+  const state = freshMistakeState();
+  const decisions = outcomes.map((outcome) => {
+    recordStep(state, outcome);
+    const verdict = detectMistakePattern(state);
+    if (applyNudge && verdict === 'nudge') {
+      state.nudges += 1;
+      state.run = [];
+    }
+    return verdict;
+  });
+  return { state, decisions };
+}
+
+// ---- fresh state ------------------------------------------------------------
+
+describe('freshMistakeState', () => {
+  it('starts with an empty run and zero nudges', () => {
+    // A freshly created state must have an empty run and a nudge count of 0.
+    const { run, nudges } = freshMistakeState();
+    expect({ run, nudges }).toEqual({ run: [], nudges: 0 });
+  });
+});
+
+// ---- below threshold --------------------------------------------------------
+
+describe('detectMistakePattern — below threshold', () => {
+  // No decision is expected while the failure streak is shorter than three.
+  it('returns null on a fresh state', () => {
+    expect(detectMistakePattern(freshMistakeState())).toBeNull();
+  });
+
+  it('returns null after fewer than MISTAKE_THRESHOLD failures', () => {
+    expect(replay(['zod_reject', 'exec_error']).decisions).toEqual([null, null]);
+  });
+});
+
+// ---- success reset ----------------------------------------------------------
+
+describe('recordStep — success resets', () => {
+  it("'success' clears both the run streak and the nudge counter", () => {
+    // Build a dirty tracker: two failures on the streak plus a nudge count
+    // written directly, standing in for nudges issued before the success.
+    const tracker = freshMistakeState();
+    for (const kind of ['zod_reject', 'exec_error'] as const) {
+      recordStep(tracker, kind);
+    }
+    tracker.nudges = 2;
+    recordStep(tracker, 'success');
+    expect(tracker.run).toEqual([]);
+    expect(tracker.nudges).toBe(0);
+  });
+
+  it('a success mid-streak prevents the threshold from tripping', () => {
+    // Two failures on either side of a success: neither streak reaches 3.
+    const timeline: (FailureKind | 'success')[] = [
+      'zod_reject', 'exec_error', 'success', 'tool_not_found', 'permission_denied',
+    ];
+    const { decisions } = replay(timeline);
+    expect(decisions.every((verdict) => verdict === null)).toBe(true);
+  });
+});
+
+// ---- 3-streak nudge ---------------------------------------------------------
+
+describe('detectMistakePattern — nudge on 3-streak', () => {
+  // The decision appears on the third consecutive failure, whatever the kinds.
+  it("returns 'nudge' the first time the streak reaches MISTAKE_THRESHOLD", () => {
+    const verdicts = replay(['zod_reject', 'exec_error', 'tool_not_found']).decisions;
+    expect(verdicts).toEqual([null, null, 'nudge']);
+  });
+
+  it("fires 'nudge' for a streak of identical kinds too (kind-agnostic)", () => {
+    expect(replay(['exec_error', 'exec_error', 'exec_error']).decisions[2]).toBe('nudge');
+  });
+});
+
+// ---- re-trip escalate -------------------------------------------------------
+
+describe('detectMistakePattern — escalate on re-trip', () => {
+  // Both cases run replay with applyNudge, so after every 'nudge' decision the
+  // helper bumps state.nudges and clears state.run before the next outcome.
+  // Only a 'success' outcome in the timeline can bring nudges back to 0.
+  const nudgeHandled = { applyNudge: true };
+
+  it('escalates when the streak re-trips after a nudge with no intervening success', () => {
+    const firstStreak: FailureKind[] = ['zod_reject', 'exec_error', 'tool_not_found'];
+    const secondStreak: FailureKind[] = ['permission_denied', 'exec_error', 'zod_reject'];
+
+    const { decisions } = replay([...firstStreak, ...secondStreak], nudgeHandled);
+
+    // The third failure completes the first streak: nudge (run reset, nudges=1).
+    expect(decisions[2]).toBe('nudge');
+    // Three more failures with no success in between: the re-trip escalates.
+    expect(decisions[5]).toBe('escalate');
+  });
+
+  it('does NOT escalate if a success lands between the nudge and the next streak', () => {
+    const timeline: (FailureKind | 'success')[] = [
+      'zod_reject',
+      'exec_error',
+      'tool_not_found', // index 2: first streak completes → nudge
+      'success', // clears nudges back to 0
+      'exec_error',
+      'zod_reject',
+      'tool_not_found', // index 6: 3-streak again → nudge, NOT escalate
+    ];
+
+    const { decisions } = replay(timeline, nudgeHandled);
+
+    expect(decisions[2]).toBe('nudge');
+    expect(decisions[6]).toBe('nudge');
+    expect(decisions).not.toContain('escalate');
+  });
+});
+
+// ---- mixed kinds ------------------------------------------------------------
+
+describe('detectMistakePattern — mixed failure kinds', () => {
+  // Three different failure kinds in a row must trip the threshold just like
+  // three identical ones, and state.run must hold them in the order recorded.
+  // The title names the three kinds actually replayed here; it previously
+  // claimed five, which the body never exercised.
+  it('counts a streak of three distinct kinds toward the threshold', () => {
+    const { state, decisions } = replay(['zod_reject', 'tool_not_found', 'exec_error']);
+    expect(decisions[2]).toBe('nudge');
+    expect(state.run).toEqual(['zod_reject', 'tool_not_found', 'exec_error']);
+  });
+});
+
+// ---- contract ---------------------------------------------------------------
+
+describe('MISTAKE_THRESHOLD + MISTAKE_RECOVERY_NOTE', () => {
+  // Shape checks only: neither the exact threshold nor the note text is pinned.
+  it('threshold is a positive integer (tests assume 3)', () => {
+    expect(Number.isInteger(MISTAKE_THRESHOLD) && MISTAKE_THRESHOLD > 0).toBe(true);
+  });
+
+  it('recovery note is a non-empty model-facing string', () => {
+    expect(typeof MISTAKE_RECOVERY_NOTE).toBe('string');
+    expect(MISTAKE_RECOVERY_NOTE.length).toBeGreaterThan(0);
+  });
+});