feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: a local lane (llama-swap models, serial) and a cloud lane (parallel). The battle_* frames are added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
254
apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts
Normal file
254
apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts
Normal file
@@ -0,0 +1,254 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
buildDigestPrompt,
|
||||
buildJudgePrompt,
|
||||
buildCrossExamPrompt,
|
||||
extractWinner,
|
||||
shouldNameWinner,
|
||||
type ContestantDigest,
|
||||
type ContestantDigestInput,
|
||||
} from '../arena-analyzer-helpers.js';
|
||||
|
||||
// ─── shouldNameWinner ─────────────────────────────────────────────────────────
|
||||
|
||||
// shouldNameWinner is called with the number of contestants that succeeded.
// These cases pin the boundary: false for 0 and 1, true from 2 upward.
describe('shouldNameWinner', () => {
  it('returns false with 0 succeeded contestants', () => {
    const verdict = shouldNameWinner(0);
    expect(verdict).toBe(false);
  });

  it('returns false with exactly 1 succeeded contestant', () => {
    const verdict = shouldNameWinner(1);
    expect(verdict).toBe(false);
  });

  it('returns true with exactly 2 succeeded contestants', () => {
    const verdict = shouldNameWinner(2);
    expect(verdict).toBe(true);
  });

  it('returns true with more than 2 succeeded contestants', () => {
    // Checked in the same order as before; the first failing count aborts the test.
    for (const succeededCount of [3, 6]) {
      expect(shouldNameWinner(succeededCount)).toBe(true);
    }
  });
});
|
||||
|
||||
// ─── extractWinner ────────────────────────────────────────────────────────────
|
||||
|
||||
// extractWinner is given raw judge output and is expected to yield either an
// { identity, model } pair or null. The cases below cover keyword casing,
// the NO_WINNER sentinel, malformed lines, whitespace, and extra slashes.
describe('extractWinner', () => {
  it('extracts identity and model from a WINNER: line', () => {
    const parsed = extractWinner('Some analysis\n\nWINNER: claude/opus-4-5\n\nMore text.');
    expect(parsed).toEqual({ identity: 'claude', model: 'opus-4-5' });
  });

  it('is case-insensitive for the WINNER keyword', () => {
    const lowerCased = extractWinner('winner: boocode/qwen3.6-35b');
    expect(lowerCased).toEqual({ identity: 'boocode', model: 'qwen3.6-35b' });

    const titleCased = extractWinner('Winner: opencode/some-model');
    expect(titleCased).toEqual({ identity: 'opencode', model: 'some-model' });
  });

  it('returns null when NO_WINNER is declared', () => {
    for (const verdictLine of ['WINNER: NO_WINNER', 'winner: no_winner']) {
      expect(extractWinner(verdictLine)).toBeNull();
    }
  });

  it('returns null when no WINNER line is present', () => {
    for (const text of ['Just some analysis text with no verdict.', '']) {
      expect(extractWinner(text)).toBeNull();
    }
  });

  it('returns null when the WINNER line has no slash separator', () => {
    const parsed = extractWinner('WINNER: justidentity');
    expect(parsed).toBeNull();
  });

  it('returns null when the WINNER line is empty after the colon', () => {
    for (const verdictLine of ['WINNER:', 'WINNER: ']) {
      expect(extractWinner(verdictLine)).toBeNull();
    }
  });

  it('handles leading and trailing whitespace around the slash parts', () => {
    const expected = { identity: 'claude', model: 'opus-4-5' };
    expect(extractWinner('WINNER: claude / opus-4-5 ')).toEqual(expected);
  });

  it('picks the first WINNER line when multiple are present', () => {
    const parsed = extractWinner('WINNER: claude/opus-4-5\nWINNER: opencode/other-model');
    expect(parsed).toEqual({ identity: 'claude', model: 'opus-4-5' });
  });

  it('handles model names that contain slashes by splitting at the first slash only', () => {
    // Only the first slash separates identity from model; later slashes
    // belong to the model name ('llama-swap/qwen3.6').
    const expected = { identity: 'native', model: 'llama-swap/qwen3.6' };
    expect(extractWinner('WINNER: native/llama-swap/qwen3.6')).toEqual(expected);
  });
});
|
||||
|
||||
// ─── buildDigestPrompt ────────────────────────────────────────────────────────
|
||||
|
||||
// buildDigestPrompt is called with a single contestant's input; the tests read
// `system` and `user` strings off its return value and check what the user
// prompt contains, plus the truncation limits named in the test titles.
describe('buildDigestPrompt', () => {
  // Shared fixture; individual tests spread it and override a single field.
  const base: ContestantDigestInput = {
    identity: 'claude',
    model: 'opus-4-5',
    resultMd: '# Output\n\nSome result content.',
    benchmarkLine: '12000ms',
  };

  it('returns an object with non-empty system and user strings', () => {
    const { system, user } = buildDigestPrompt(base);
    expect(system.length).toBeGreaterThan(0);
    expect(user.length).toBeGreaterThan(0);
  });

  it('includes the contestant identity and model in the user prompt', () => {
    const { user } = buildDigestPrompt(base);
    expect(user).toContain('claude');
    expect(user).toContain('opus-4-5');
  });

  it('includes the benchmark line in the user prompt', () => {
    const { user } = buildDigestPrompt(base);
    expect(user).toContain('12000ms');
  });

  it('includes the result.md content in the user prompt', () => {
    const { user } = buildDigestPrompt(base);
    expect(user).toContain('Some result content.');
  });

  it('includes the diff.patch when provided', () => {
    const input: ContestantDigestInput = { ...base, diffPatch: '--- a/foo.ts\n+++ b/foo.ts\n+added' };
    const { user } = buildDigestPrompt(input);
    expect(user).toContain('added');
    expect(user).toContain('```diff');
  });

  it('omits the diff section when diffPatch is undefined', () => {
    const { user } = buildDigestPrompt(base);
    expect(user).not.toContain('```diff');
  });

  it('truncates resultMd longer than 8000 characters', () => {
    const longResult = 'x'.repeat(10_000);
    const { user } = buildDigestPrompt({ ...base, resultMd: longResult });
    // Coarse sanity bound: the prompt must not balloon.
    expect(user.length).toBeLessThan(15_000);
    // The bound above is also satisfied by an untruncated 10k input unless the
    // template is very large, so it does not prove truncation by itself.
    // Require that no run of filler longer than the 8000-character limit survives.
    expect(user).not.toContain('x'.repeat(8_001));
  });

  it('truncates diffPatch longer than 5000 characters', () => {
    const longDiff = '+' + 'x'.repeat(10_000);
    const { user } = buildDigestPrompt({ ...base, diffPatch: longDiff });
    // Coarse sanity bound: the prompt must not balloon.
    expect(user.length).toBeLessThan(16_000);
    // As above, the length bound alone would accept an untruncated diff.
    // No run of filler longer than the 5000-character limit may survive.
    expect(user).not.toContain('x'.repeat(5_001));
  });
});
|
||||
|
||||
// ─── buildJudgePrompt ─────────────────────────────────────────────────────────
|
||||
|
||||
// buildJudgePrompt is called with the original prompt text and a list of
// contestant digests; the tests inspect the `system` and `user` strings it
// returns.
describe('buildJudgePrompt', () => {
  // Two-contestant fixture; the single- and zero-contestant cases derive from it.
  const contestantDigests: ContestantDigest[] = [
    { identity: 'claude', model: 'opus-4-5', digest: 'Good result.', benchmarkLine: '5000ms' },
    { identity: 'opencode', model: 'qwen3.6', digest: 'Decent result.', benchmarkLine: '8000ms' },
  ];

  it('includes the original prompt in the user section', () => {
    const originalPrompt = 'Write a sorting algorithm';
    const { user } = buildJudgePrompt(originalPrompt, contestantDigests);
    expect(user).toContain(originalPrompt);
  });

  it('includes each contestant heading in the user section', () => {
    const { user } = buildJudgePrompt('prompt', contestantDigests);
    for (const fragment of ['claude', 'opus-4-5', 'opencode', 'qwen3.6']) {
      expect(user).toContain(fragment);
    }
  });

  it('includes each contestant digest text', () => {
    const { user } = buildJudgePrompt('prompt', contestantDigests);
    for (const digestText of ['Good result.', 'Decent result.']) {
      expect(user).toContain(digestText);
    }
  });

  it('instructs the model to name a WINNER when 2+ digests are provided', () => {
    const judgePrompt = buildJudgePrompt('prompt', contestantDigests);
    expect(judgePrompt.system).toContain('WINNER:');
  });

  it('instructs the model NOT to name a winner when fewer than 2 digests are provided', () => {
    const soleDigest = contestantDigests.slice(0, 1);
    const judgePrompt = buildJudgePrompt('prompt', soleDigest);
    expect(judgePrompt.system).toContain('NO_WINNER');
    expect(judgePrompt.system).not.toContain('WINNER: <identity>');
  });

  it('instructs NO_WINNER when digests list is empty', () => {
    const judgePrompt = buildJudgePrompt('prompt', []);
    expect(judgePrompt.system).toContain('NO_WINNER');
  });

  it('truncates originalPrompt longer than 2000 characters', () => {
    const oversizedPrompt = 'p'.repeat(5_000);
    const { user } = buildJudgePrompt(oversizedPrompt, contestantDigests);
    // Everything before the digests heading is the prompt section; a
    // 5000-character prompt must have been cut for it to fit under 3000.
    const [promptSection = ''] = user.split('# Contestant Digests');
    expect(promptSection.length).toBeLessThan(3_000);
  });
});
|
||||
|
||||
// ─── buildCrossExamPrompt ─────────────────────────────────────────────────────
|
||||
|
||||
// buildCrossExamPrompt is called with a single options object (original
// prompt, digests, prior analysis, proposed winner, examiner identity/model);
// the tests inspect the `system` and `user` strings it returns.
describe('buildCrossExamPrompt', () => {
  const contestantDigests: ContestantDigest[] = [
    { identity: 'claude', model: 'opus-4-5', digest: 'Strong result.', benchmarkLine: '5000ms' },
    { identity: 'boocode', model: 'qwen3.6-35b', digest: 'Decent result.', benchmarkLine: '12000ms' },
  ];

  // Baseline options; tests override single fields via spread.
  const examOptions = {
    originalPrompt: 'Write a sorting algorithm.',
    digests: contestantDigests,
    analysisContent: '# Arena Analysis\n\nClaude did better.\n\nWINNER: claude/opus-4-5',
    proposedWinner: 'claude/opus-4-5',
    examinerIdentity: 'goose',
    examinerModel: 'gpt-4o',
  };

  it('includes the examiner identity and model in the system prompt', () => {
    const examPrompt = buildCrossExamPrompt(examOptions);
    for (const fragment of ['goose', 'gpt-4o']) {
      expect(examPrompt.system).toContain(fragment);
    }
  });

  it('includes the original prompt in the user section', () => {
    const examPrompt = buildCrossExamPrompt(examOptions);
    expect(examPrompt.user).toContain('Write a sorting algorithm.');
  });

  it('includes each contestant digest', () => {
    const examPrompt = buildCrossExamPrompt(examOptions);
    for (const digestText of ['Strong result.', 'Decent result.']) {
      expect(examPrompt.user).toContain(digestText);
    }
  });

  it('includes the proposed analysis content', () => {
    const examPrompt = buildCrossExamPrompt(examOptions);
    expect(examPrompt.user).toContain('Claude did better.');
  });

  it('includes the proposed winner when set', () => {
    const examPrompt = buildCrossExamPrompt(examOptions);
    expect(examPrompt.user).toContain('claude/opus-4-5');
  });

  it('notes that no winner was proposed when proposedWinner is null', () => {
    const withoutWinner = { ...examOptions, proposedWinner: null };
    const examPrompt = buildCrossExamPrompt(withoutWinner);
    expect(examPrompt.user).toContain('No winner was proposed');
  });

  it('instructs the examiner to provide a VERDICT line', () => {
    const examPrompt = buildCrossExamPrompt(examOptions);
    expect(examPrompt.system).toContain('VERDICT:');
  });
});
|
||||
Reference in New Issue
Block a user