Two native-inference hardening features from boocode_code_review_v2 §1 #12. MistakeTracker: new pure mistake-tracker.ts tracks consecutive heterogeneous tool failures (kinds surfaced per tool from tool-phase.ts). On 3 in a row the turn loop soft-nudges (model-facing recovery guidance + mistake_recovery sentinel + reset), then escalates to stopping the turn (cap-hit-style, Continue affordance) on a re-trip. Complements doom-loop (identical repeats) + cap-hit. File-provenance ledger: compaction.ts derives a deterministic ## Files Read list from the head messages' read-tool calls and injects it into the rolling-summary prompt so provenance survives compaction (no new table; read-only). mistake_recovery sentinel: MessageMetadata arm (server + web) + MessageBubble render branch. Built by 2 parallel agents. Server 545 tests passing (23 new); build + web tsc clean. Native-inference only. Builds on v2.7.3. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
165 lines
5.4 KiB
TypeScript
165 lines
5.4 KiB
TypeScript
import { describe, it, expect } from 'vitest';
|
|
import {
|
|
MISTAKE_THRESHOLD,
|
|
freshMistakeState,
|
|
recordStep,
|
|
detectMistakePattern,
|
|
MISTAKE_RECOVERY_NOTE,
|
|
type FailureKind,
|
|
} from '../inference/mistake-tracker.js';
|
|
|
|
// ---- helpers ----------------------------------------------------------------
|
|
/**
 * Feeds each outcome, in order, into a fresh mistake state and collects the
 * decision `detectMistakePattern` reports after every step.
 *
 * When `applyNudge` is set, a 'nudge' decision also increments `state.nudges`
 * and empties `state.run` — the side effect this helper mirrors from the
 * turn.ts loop. Without it the state is only ever touched by `recordStep`.
 *
 * @param outcomes Ordered failure kinds / 'success' markers to record.
 * @returns The final state (so assertions can read `.run` / `.nudges`) plus
 *   the decision observed after each recorded outcome, index-aligned with
 *   `outcomes`.
 */
function replay(
  outcomes: (FailureKind | 'success')[],
  { applyNudge = false }: { applyNudge?: boolean } = {},
) {
  const state = freshMistakeState();
  const decisions = outcomes.map((outcome) => {
    recordStep(state, outcome);
    const verdict = detectMistakePattern(state);
    if (verdict === 'nudge' && applyNudge) {
      // Same order as the loop being mimicked: count the nudge, then drop the streak.
      state.nudges += 1;
      state.run = [];
    }
    return verdict;
  });
  return { state, decisions };
}
|
|
|
|
// ---- fresh state ------------------------------------------------------------
|
|
|
|
// A newly created state must carry no failure streak and no nudge count.
describe('freshMistakeState', () => {
  it('starts with an empty run and zero nudges', () => {
    const { run, nudges } = freshMistakeState();
    expect(run).toEqual([]);
    expect(nudges).toBe(0);
  });
});
|
|
|
|
// ---- below threshold --------------------------------------------------------
|
|
|
|
// No decision may be produced while the streak is shorter than the threshold.
describe('detectMistakePattern — below threshold', () => {
  it('returns null on a fresh state', () => {
    const untouched = freshMistakeState();
    expect(detectMistakePattern(untouched)).toBeNull();
  });

  it('returns null after fewer than MISTAKE_THRESHOLD failures', () => {
    // Two failures only; each step must report "nothing to do".
    const outcome = replay(['zod_reject', 'exec_error']);
    expect(outcome.decisions).toEqual([null, null]);
  });
});
|
|
|
|
// ---- success reset ----------------------------------------------------------
|
|
|
|
// A recorded 'success' wipes both the failure streak and the nudge history.
describe('recordStep — success resets', () => {
  it("'success' clears both the run streak and the nudge counter", () => {
    const state = freshMistakeState();
    for (const failure of ['zod_reject', 'exec_error'] as const) {
      recordStep(state, failure);
    }
    // Pretend the loop had already nudged twice before the success arrives.
    state.nudges = 2;

    recordStep(state, 'success');

    expect(state.run).toEqual([]);
    expect(state.nudges).toBe(0);
  });

  it('a success mid-streak prevents the threshold from tripping', () => {
    // Two failures, a success, two more failures: no run of three ever forms.
    const sequence: (FailureKind | 'success')[] = [
      'zod_reject',
      'exec_error',
      'success',
      'tool_not_found',
      'permission_denied',
    ];
    const { decisions } = replay(sequence);
    const allQuiet = decisions.every((decision) => decision === null);
    expect(allQuiet).toBe(true);
  });
});
|
|
|
|
// ---- 3-streak nudge ---------------------------------------------------------
|
|
|
|
// The third consecutive failure is the first step that yields a 'nudge'.
describe('detectMistakePattern — nudge on 3-streak', () => {
  it("returns 'nudge' the first time the streak reaches MISTAKE_THRESHOLD", () => {
    const result = replay(['zod_reject', 'exec_error', 'tool_not_found']);
    expect(result.decisions).toEqual([null, null, 'nudge']);
  });

  it("fires 'nudge' for a streak of identical kinds too (kind-agnostic)", () => {
    // Same kind three times over: the kind itself must not matter.
    const sameKind: FailureKind[] = ['exec_error', 'exec_error', 'exec_error'];
    const { decisions } = replay(sameKind);
    const [, , third] = decisions;
    expect(third).toBe('nudge');
  });
});
|
|
|
|
// ---- re-trip escalate -------------------------------------------------------
|
|
|
|
// After a nudge has been applied, a second full streak escalates — unless a
// success in between cleared the nudge counter.
describe('detectMistakePattern — escalate on re-trip', () => {
  it("escalates when the streak re-trips after a nudge with no intervening success", () => {
    // First three failures → nudge (streak reset, nudges = 1);
    // the next three failures → escalate.
    const sixFailures: FailureKind[] = [
      'zod_reject',
      'exec_error',
      'tool_not_found',
      'permission_denied',
      'exec_error',
      'zod_reject',
    ];
    const result = replay(sixFailures, { applyNudge: true });
    expect(result.decisions[2]).toBe('nudge');
    expect(result.decisions[5]).toBe('escalate');
  });

  it("does NOT escalate if a success lands between the nudge and the next streak", () => {
    const withRecovery: (FailureKind | 'success')[] = [
      'zod_reject',
      'exec_error',
      'tool_not_found', // index 2: first nudge
      'success', // wipes the nudge counter back to 0
      'exec_error',
      'zod_reject',
      'tool_not_found', // index 6: streak of three again → nudge, not escalate
    ];
    const result = replay(withRecovery, { applyNudge: true });
    expect(result.decisions[2]).toBe('nudge');
    expect(result.decisions[6]).toBe('nudge');
    expect(result.decisions).not.toContain('escalate');
  });
});
|
|
|
|
// ---- mixed kinds ------------------------------------------------------------
|
|
|
|
// A streak counts failures regardless of kind: three different kinds in a row
// trip the threshold just like three identical ones.
describe('detectMistakePattern — mixed failure kinds', () => {
  // The title previously claimed "all five distinct kinds", but the sequence
  // below replays three. The title now states what is actually asserted.
  it('counts a streak of three distinct kinds toward the threshold', () => {
    const { state, decisions } = replay([
      'zod_reject',
      'tool_not_found',
      'exec_error',
    ]);
    expect(decisions[2]).toBe('nudge');
    // applyNudge is off, so replay never resets the streak itself; the
    // recorded kinds are expected to remain in state.run in arrival order.
    expect(state.run).toEqual(['zod_reject', 'tool_not_found', 'exec_error']);
  });
});
|
|
|
|
// ---- contract ---------------------------------------------------------------
|
|
|
|
// Sanity checks on the exported constants the other suites lean on.
describe('MISTAKE_THRESHOLD + MISTAKE_RECOVERY_NOTE', () => {
  it('threshold is a positive integer (tests assume 3)', () => {
    const threshold = MISTAKE_THRESHOLD;
    expect(threshold).toBeGreaterThan(0);
    expect(Number.isInteger(threshold)).toBe(true);
  });

  it('recovery note is a non-empty model-facing string', () => {
    const note = MISTAKE_RECOVERY_NOTE;
    expect(typeof note).toBe('string');
    expect(note.length).toBeGreaterThan(0);
  });
});
|