Files
boocode/apps/coder/src/services/arena-analyzer-helpers.ts
indifferentketchup d6d246c15b feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs
the same prompt against 2-6 Contestants across two concurrent lanes:
local lane (llama-swap models, serial) and cloud lane (parallel).

Added to all three registries: @boocode/contracts WsFrameSchema,
server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results
  writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-06 23:25:29 +00:00

192 lines
7.4 KiB
TypeScript

/**
* Pure, side-effect-free helpers for the Arena analyzer.
* No DB, no IO, no network — safe to unit-test directly.
*
* Covers: digest-prompt assembly, judge-prompt assembly, winner extraction
* from the judge output, the <2-survivors no-winner rule, and the
* cross-examination prompt.
*/
// ─── Shared types ─────────────────────────────────────────────────────────────
/**
* Everything buildDigestPrompt needs to describe one contestant's submission.
*/
export interface ContestantDigestInput {
/** Contestant identity; rendered in the prompt heading as `<identity> / <model>`. */
identity: string;
/** Model name; rendered next to the identity in the prompt heading. */
model: string;
/** Result text; buildDigestPrompt includes at most its first 8,000 characters. */
resultMd: string;
/** Optional diff; when non-empty, at most its first 5,000 characters are added inside a diff fence. */
diffPatch?: string;
/** Text printed verbatim after "Benchmark: " in the prompt. */
benchmarkLine: string;
}
/**
* One contestant's digest as consumed by buildJudgePrompt and buildCrossExamPrompt.
*/
export interface ContestantDigest {
/** Contestant identity; rendered in the section heading as `<identity> / <model>`. */
identity: string;
/** Model name; rendered next to the identity in the section heading. */
model: string;
/**
* Digest text, inserted verbatim (no truncation) under the contestant's heading.
* Presumably the model's answer to the digest prompt — confirm with the caller.
*/
digest: string;
/** Text printed verbatim after "Benchmark: " in the contestant's section. */
benchmarkLine: string;
}
// ─── Digest stage ─────────────────────────────────────────────────────────────
/**
 * Assemble the system and user prompts for a single contestant's digest call.
 *
 * The user prompt contains a heading with the contestant's identity and model,
 * the benchmark line, the result text (first 8,000 characters) and, when a
 * non-empty diff is supplied, the diff (first 5,000 characters) inside a
 * fenced `diff` block. Keeping each digest call small lets the later judge
 * stage work from digests instead of raw output.
 *
 * @param input - The contestant submission to summarise.
 * @returns The `system` and `user` prompt strings.
 */
export function buildDigestPrompt(input: ContestantDigestInput): { system: string; user: string } {
  const maxResultChars = 8_000;
  const maxDiffChars = 5_000;

  const system = [
    'You are an expert technical analyst evaluating the output of an AI coding or Q&A battle.',
    'Produce a concise structured digest (under 300 words, Markdown bullet points) covering:',
    '(1) correctness and quality, (2) completeness, (3) notable strengths, (4) notable weaknesses or issues.',
    'Do not reference the battle or other contestants — focus only on this submission.',
  ].join(' ');

  // One entry per output line; empty entries are the blank separator lines.
  const lines: string[] = [
    `# Contestant: ${input.identity} / ${input.model}`,
    '',
    `Benchmark: ${input.benchmarkLine}`,
    '',
    '## Result',
    '',
    input.resultMd.slice(0, maxResultChars),
  ];

  // An absent or empty diff produces no "Code Changes" section at all.
  if (input.diffPatch) {
    lines.push(
      '',
      '## Code Changes (diff)',
      '```diff',
      input.diffPatch.slice(0, maxDiffChars),
      '```',
    );
  }

  return { system, user: lines.join('\n') };
}
// ─── Judge stage ──────────────────────────────────────────────────────────────
/**
 * Assemble the system and user prompts for the comparative judge call.
 *
 * The judge is given the original task prompt (first 2,000 characters) and
 * one section per contestant digest — never raw diffs — so the context stays
 * bounded. Depending on shouldNameWinner(digests.length), the system prompt
 * tells the judge either to emit a `WINNER: <identity>/<model>` line or to
 * emit `NO_WINNER`. Callers read the verdict back with extractWinner().
 *
 * @param originalPrompt - The task prompt every contestant received.
 * @param digests - Digests of the contestants being compared.
 * @returns The `system` and `user` prompt strings.
 */
export function buildJudgePrompt(
  originalPrompt: string,
  digests: ContestantDigest[],
): { system: string; user: string } {
  let verdictRule: string;
  if (shouldNameWinner(digests.length)) {
    verdictRule = [
      'After your comparative analysis, name the best submission on its own line in this exact format:',
      'WINNER: <identity>/<model>',
      'where <identity> and <model> exactly match the heading above. No other text on that line.',
    ].join('\n');
  } else {
    verdictRule =
      'Fewer than 2 contestants succeeded. Do NOT name a winner. Write the following on its own line:\nNO_WINNER';
  }

  const system = `You are an expert judge for an AI battle. You have received digest summaries of each contestant's work on the same task. Write a comparative analysis, then follow these instructions:\n${verdictRule}`;

  // Each contestant becomes one heading + benchmark + digest section.
  const digestSections = digests.map(
    (entry) =>
      `\n## ${entry.identity} / ${entry.model}\nBenchmark: ${entry.benchmarkLine}\n${entry.digest}`,
  );

  const user = [
    '# Original Task Prompt\n',
    originalPrompt.slice(0, 2_000),
    '\n# Contestant Digests\n',
    ...digestSections,
    '\n# Instructions\nCompare the contestants and follow the winner-naming instructions above.',
  ].join('\n');

  return { system, user };
}
// ─── No-winner rule ───────────────────────────────────────────────────────────
/**
 * Decide whether the analysis may declare a winner.
 *
 * A winner is only meaningful when there is something to compare against, so
 * at least two contestants must have produced a result. With zero or one
 * success the analysis must not name a winner.
 *
 * @param succeededCount - Number of contestants that produced a result.
 * @returns `true` when a winner may be named, `false` otherwise.
 */
export function shouldNameWinner(succeededCount: number): boolean {
  const minimumSurvivors = 2;
  const enoughToCompare = succeededCount >= minimumSurvivors;
  return enoughToCompare;
}
// ─── Winner extraction ────────────────────────────────────────────────────────
/**
 * Parse the judge's text output and extract the declared winner.
 *
 * Scans line by line for `WINNER: <identity>/<model>` and returns the first
 * well-formed match. The keyword is matched case-insensitively, surrounding
 * whitespace is ignored, and Markdown decoration that models commonly add
 * (`**WINNER: a/b**`, `**WINNER:** a/b`, ``WINNER: `a/b` ``) is stripped so
 * it cannot leak into the returned names. Only the first `/` separates
 * identity from model, so model names may themselves contain slashes.
 *
 * A `WINNER:` line that is malformed (empty, or without a `/`) is skipped
 * rather than ending the scan, so prose such as "Winner: hard to call" earlier
 * in the analysis does not hide the real verdict line further down.
 *
 * @param judgeOutput - Raw text returned by the judge model.
 * @returns The winner's identity and model, or `null` when the judge wrote
 *   `WINNER: NO_WINNER` or no well-formed winner line exists.
 */
export function extractWinner(judgeOutput: string): { identity: string; model: string } | null {
  const keyword = 'WINNER:';
  // Strips whitespace plus Markdown emphasis/code markers from both ends.
  const stripDecoration = (text: string): string =>
    text.replace(/^[\s*`]+/, '').replace(/[\s*`]+$/, '');

  for (const rawLine of judgeOutput.split('\n')) {
    const line = stripDecoration(rawLine);
    if (!line.toUpperCase().startsWith(keyword)) continue;

    const rest = stripDecoration(line.slice(keyword.length));
    // An explicit NO_WINNER verdict is final: stop scanning.
    if (rest.toUpperCase() === 'NO_WINNER') return null;

    const slashIdx = rest.indexOf('/');
    // Not a verdict in the expected shape; keep looking at later lines.
    if (slashIdx === -1) continue;

    const identity = rest.slice(0, slashIdx).trim();
    const model = rest.slice(slashIdx + 1).trim();
    if (identity && model) return { identity, model };
  }
  return null;
}
// ─── Cross-examination stage ──────────────────────────────────────────────────
/**
 * Build the system + user prompts for a cross-examination call.
 *
 * The cross-examiner is shown the original task prompt (first 2,000
 * characters), every contestant digest, the proposed analysis (first 5,000
 * characters) and a note about the proposed winner, and is asked to challenge
 * the result and finish with a `VERDICT:` line.
 *
 * When no winner was proposed, the note only blames "fewer than 2 contestants
 * succeeded" if fewer than two digests are actually listed. With two or more
 * digests a missing winner has some other cause (for example the judge
 * declined to pick one), and claiming otherwise would contradict the digests
 * shown in the same prompt.
 *
 * @param opts.originalPrompt - The task prompt every contestant received.
 * @param opts.digests - Digests of the contestants under review.
 * @param opts.analysisContent - The analysis text being challenged.
 * @param opts.proposedWinner - Winner label to display, or `null` if none.
 * @param opts.examinerIdentity - Identity the examiner is told it has.
 * @param opts.examinerModel - Model name the examiner is told it runs as.
 * @returns The `system` and `user` prompt strings.
 */
export function buildCrossExamPrompt(opts: {
  originalPrompt: string;
  digests: ContestantDigest[];
  analysisContent: string;
  proposedWinner: string | null;
  examinerIdentity: string;
  examinerModel: string;
}): { system: string; user: string } {
  const system =
    `You are ${opts.examinerIdentity} (model: ${opts.examinerModel}), acting as an independent ` +
    'cross-examiner in an AI battle. Your role is to critically challenge the proposed analysis ' +
    'and winner, then give your own verdict. Be rigorous but fair. ' +
    'End your response with your verdict on its own line:\n' +
    'VERDICT: <identity>/<model> — if you agree or disagree with the proposed winner but can name one\n' +
    'VERDICT: NO_WINNER — if no clear winner exists';

  const parts: string[] = [
    '# Original Task Prompt\n',
    opts.originalPrompt.slice(0, 2_000),
    '\n# Contestant Digests\n',
  ];
  for (const d of opts.digests) {
    parts.push(`\n## ${d.identity} / ${d.model}`);
    parts.push(`Benchmark: ${d.benchmarkLine}`);
    parts.push(d.digest);
  }

  parts.push('\n# Proposed Analysis\n');
  parts.push(opts.analysisContent.slice(0, 5_000));

  if (opts.proposedWinner) {
    parts.push(`\n*(Proposed winner: ${opts.proposedWinner})*`);
  } else if (opts.digests.length < 2) {
    // Mirrors the file's no-winner rule: a winner needs at least 2 survivors.
    parts.push('\n*(No winner was proposed — fewer than 2 contestants succeeded.)*');
  } else {
    // Two or more digests are listed, so the "<2 succeeded" reason would be false.
    parts.push('\n*(No winner was proposed.)*');
  }

  parts.push(
    '\n# Your Cross-Examination\n' +
      'Challenge the analysis above, then give your independent verdict (VERDICT: … on its own line).',
  );
  return { system, user: parts.join('\n') };
}