Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
192 lines
7.4 KiB
TypeScript
192 lines
7.4 KiB
TypeScript
/**
 * Pure, side-effect-free helpers for the Arena analyzer.
 * No DB, no IO, no network — safe to unit-test directly.
 *
 * Covers: digest-prompt assembly, judge-prompt assembly, winner extraction
 * from the judge output, the <2-survivors no-winner rule, and the
 * cross-examination prompt.
 */
|
|
|
|
// ─── Shared types ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Raw material for one contestant's digest call, as consumed by
 * buildDigestPrompt().
 */
export interface ContestantDigestInput {
  /** Contestant label; rendered together with `model` in the prompt heading. */
  identity: string;
  /** Model name; rendered next to `identity` in the prompt heading. */
  model: string;
  /** The contestant's result text, placed under the "## Result" heading (truncated by the prompt builder). */
  resultMd: string;
  /** Optional patch text; when present and non-empty it is embedded in a fenced `diff` block. */
  diffPatch?: string;
  /** Benchmark summary, inserted verbatim after "Benchmark: ". */
  benchmarkLine: string;
}
|
|
|
|
/**
 * One contestant's digest, in the form the judge and cross-examination
 * prompt builders list under a per-contestant heading.
 */
export interface ContestantDigest {
  /** Contestant label; rendered together with `model` in the section heading. */
  identity: string;
  /** Model name; rendered next to `identity` in the section heading. */
  model: string;
  /** Digest text, inserted verbatim below the benchmark line. */
  digest: string;
  /** Benchmark summary, inserted verbatim after "Benchmark: ". */
  benchmarkLine: string;
}
|
|
|
|
// ─── Digest stage ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Assemble the system and user prompts for a single contestant's digest call.
 *
 * The digest is a short structured summary of one submission. Keeping each
 * call limited to one contestant keeps its context small, and lets the
 * downstream judge work from digests instead of raw diffs.
 *
 * @param input - The contestant's identity, model, result text, optional diff and benchmark line.
 * @returns The `system` and `user` prompt strings. The result text is cut to
 *   its first 8,000 characters and the diff (if any) to its first 5,000.
 */
export function buildDigestPrompt(input: ContestantDigestInput): { system: string; user: string } {
  const resultCharLimit = 8_000;
  const diffCharLimit = 5_000;
  const { identity, model, resultMd, diffPatch, benchmarkLine } = input;

  // Sentences are joined with a single space to form one paragraph.
  const system = [
    'You are an expert technical analyst evaluating the output of an AI coding or Q&A battle.',
    'Produce a concise structured digest (under 300 words, Markdown bullet points) covering:',
    '(1) correctness and quality, (2) completeness, (3) notable strengths, (4) notable weaknesses or issues.',
    'Do not reference the battle or other contestants — focus only on this submission.',
  ].join(' ');

  // The diff section is emitted only for a present, non-empty patch.
  const diffSection = diffPatch
    ? ['\n## Code Changes (diff)\n```diff', diffPatch.slice(0, diffCharLimit), '```']
    : [];

  const user = [
    `# Contestant: ${identity} / ${model}`,
    `\nBenchmark: ${benchmarkLine}`,
    '\n## Result\n',
    resultMd.slice(0, resultCharLimit),
    ...diffSection,
  ].join('\n');

  return { system, user };
}
|
|
|
|
// ─── Judge stage ──────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Assemble the system and user prompts for the comparative judge call.
 *
 * The judge is shown the per-contestant digests rather than raw diffs so the
 * context stays bounded. Its reply is expected to contain either a
 * `WINNER: <identity>/<model>` line or a `NO_WINNER` line; the caller parses
 * it with extractWinner().
 *
 * @param originalPrompt - Task prompt the contestants received; only its first 2,000 characters are included.
 * @param digests - Digests to compare. Their count is passed to
 *   shouldNameWinner() to choose between the "name a winner" and the
 *   "NO_WINNER" instruction.
 * @returns The `system` and `user` prompt strings.
 */
export function buildJudgePrompt(
  originalPrompt: string,
  digests: ContestantDigest[],
): { system: string; user: string } {
  let winnerInstruction: string;
  if (shouldNameWinner(digests.length)) {
    winnerInstruction = [
      'After your comparative analysis, name the best submission on its own line in this exact format:',
      'WINNER: <identity>/<model>',
      'where <identity> and <model> exactly match the heading above. No other text on that line.',
    ].join('\n');
  } else {
    winnerInstruction =
      'Fewer than 2 contestants succeeded. Do NOT name a winner. Write the following on its own line:\nNO_WINNER';
  }

  const preamble =
    'You are an expert judge for an AI battle. You have received digest summaries of each ' +
    "contestant's work on the same task. Write a comparative analysis, then follow these instructions:";
  const system = `${preamble}\n${winnerInstruction}`;

  const promptExcerpt = originalPrompt.slice(0, 2_000);

  // Each contestant contributes three entries: heading, benchmark line, digest body.
  const digestSections = digests.flatMap((entry) => [
    `\n## ${entry.identity} / ${entry.model}`,
    `Benchmark: ${entry.benchmarkLine}`,
    entry.digest,
  ]);

  const user = [
    '# Original Task Prompt\n',
    promptExcerpt,
    '\n# Contestant Digests\n',
    ...digestSections,
    '\n# Instructions\nCompare the contestants and follow the winner-naming instructions above.',
  ].join('\n');

  return { system, user };
}
|
|
|
|
// ─── No-winner rule ───────────────────────────────────────────────────────────
|
|
|
|
/**
 * Decide whether enough contestants succeeded for a winner to be named.
 *
 * A winner requires at least two successful results; with zero or one there
 * is nothing meaningful to compare, so the analysis must not name a winner.
 *
 * @param succeededCount - Number of contestants that produced a result.
 * @returns `true` when `succeededCount` is 2 or more, otherwise `false`.
 */
export function shouldNameWinner(succeededCount: number): boolean {
  const minimumSuccesses = 2;
  const enoughToCompare = succeededCount >= minimumSuccesses;
  return enoughToCompare;
}
|
|
|
|
// ─── Winner extraction ────────────────────────────────────────────────────────
|
|
|
|
/**
 * Parse the judge's text output and extract the declared winner.
 *
 * Scans the output line by line for `WINNER: <identity>/<model>`. The keyword
 * match is case-insensitive and surrounding whitespace is ignored. The value
 * is split at the FIRST `/`, so a model name may itself contain slashes.
 *
 * A `WINNER:` line that cannot be parsed (nothing after the keyword, no `/`,
 * or an empty identity/model) is skipped rather than ending the scan. Because
 * the keyword match is case-insensitive, an ordinary prose line such as
 * "Winner: hard to call…" would otherwise hide a valid verdict further down.
 *
 * @param judgeOutput - Full text returned by the judge.
 * @returns The first well-formed winner, or `null` when the judge explicitly
 *   wrote `WINNER: NO_WINNER` or no well-formed winner line exists (a bare
 *   `NO_WINNER` line falls in the latter case).
 */
export function extractWinner(judgeOutput: string): { identity: string; model: string } | null {
  const keyword = 'WINNER:';

  for (const rawLine of judgeOutput.split('\n')) {
    const line = rawLine.trim();
    if (!line.toUpperCase().startsWith(keyword)) continue;

    const declared = line.slice(keyword.length).trim();

    // An explicit abstention is a decision, not a malformed line: stop here.
    if (declared.toUpperCase() === 'NO_WINNER') return null;

    // Malformed verdict line: keep looking instead of giving up.
    const slashIdx = declared.indexOf('/');
    if (slashIdx === -1) continue;

    const identity = declared.slice(0, slashIdx).trim();
    const model = declared.slice(slashIdx + 1).trim();
    if (identity && model) return { identity, model };
  }

  return null;
}
|
|
|
|
// ─── Cross-examination stage ──────────────────────────────────────────────────
|
|
|
|
/**
 * Assemble the system and user prompts for a cross-examination call.
 *
 * The cross-examiner is shown the original task prompt, every contestant
 * digest and the proposed analysis, and is asked to challenge the result and
 * finish with a `VERDICT:` line of its own.
 *
 * @param opts.originalPrompt - Task prompt the contestants received; only its first 2,000 characters are included.
 * @param opts.digests - Contestant digests, listed under one heading each.
 * @param opts.analysisContent - The analysis being challenged; only its first 5,000 characters are included.
 * @param opts.proposedWinner - Winner label to show the examiner; `null` (or an empty string) means none was proposed.
 * @param opts.examinerIdentity - Identity the examiner is told it is acting as.
 * @param opts.examinerModel - Model name shown alongside the examiner identity.
 * @returns The `system` and `user` prompt strings.
 */
export function buildCrossExamPrompt(opts: {
  originalPrompt: string;
  digests: ContestantDigest[];
  analysisContent: string;
  proposedWinner: string | null;
  examinerIdentity: string;
  examinerModel: string;
}): { system: string; user: string } {
  const system =
    `You are ${opts.examinerIdentity} (model: ${opts.examinerModel}), acting as an independent ` +
    'cross-examiner in an AI battle. Your role is to critically challenge the proposed analysis ' +
    'and winner, then give your own verdict. Be rigorous but fair. ' +
    'End your response with your verdict on its own line:\n' +
    'VERDICT: <identity>/<model> — if you agree or disagree with the proposed winner but can name one\n' +
    'VERDICT: NO_WINNER — if no clear winner exists';

  const parts: string[] = [
    '# Original Task Prompt\n',
    opts.originalPrompt.slice(0, 2_000),
    '\n# Contestant Digests\n',
  ];

  for (const d of opts.digests) {
    parts.push(`\n## ${d.identity} / ${d.model}`);
    parts.push(`Benchmark: ${d.benchmarkLine}`);
    parts.push(d.digest);
  }

  parts.push('\n# Proposed Analysis\n');
  parts.push(opts.analysisContent.slice(0, 5_000));

  if (opts.proposedWinner) {
    parts.push(`\n*(Proposed winner: ${opts.proposedWinner})*`);
  } else if (opts.digests.length < 2) {
    // Same threshold as shouldNameWinner(): with fewer than two digests no
    // comparison was possible, so that reason can be stated.
    // NOTE(review): assumes `digests` holds only contestants that succeeded,
    // as it does for the judge prompt — confirm against the caller.
    parts.push('\n*(No winner was proposed — fewer than 2 contestants succeeded.)*');
  } else {
    // Two or more digests are listed above, so "fewer than 2 succeeded" would
    // contradict the prompt; the winner is missing for another reason (for
    // example, no parseable winner line was found).
    parts.push('\n*(No winner was proposed.)*');
  }

  parts.push(
    '\n# Your Cross-Examination\n' +
      'Challenge the analysis above, then give your independent verdict (VERDICT: … on its own line).',
  );

  return { system, user: parts.join('\n') };
}
|