feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt

Arena is a new pane kind for competitive AI evaluation. A Battle runs
the same prompt against 2-6 Contestants across two concurrent lanes:
a local lane (llama-swap models, run serially) and a cloud lane (run in parallel).

The new arena frames are added to all three registries: @boocode/contracts
WsFrameSchema, server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results
  writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-06 23:25:29 +00:00
parent e04d0fdaa8
commit d6d246c15b
34 changed files with 4581 additions and 146 deletions

View File

@@ -28,6 +28,10 @@
"./worktree-risk": {
"types": "./dist/worktree-risk.d.ts",
"default": "./dist/worktree-risk.js"
},
"./arena": {
"types": "./dist/arena.d.ts",
"default": "./dist/arena.js"
}
},
"scripts": {

View File

@@ -0,0 +1,55 @@
/** Arena types — single source of truth for cross-app Arena wire contracts. */
/** Kind of battle being run. */
export type BattleType =
  | 'coding'
  | 'qa';

/** Status values a battle as a whole can take. */
export type BattleStatus =
  | 'pending'
  | 'running'
  | 'completed'
  | 'failed'
  | 'cancelled';

/** Status values a single contestant can take. */
export type ContestantStatus =
  | 'queued'
  | 'running'
  | 'done'
  | 'error';

/** Lane a contestant is assigned to. */
export type ContestantLane =
  | 'local'
  | 'cloud';
/**
 * Pane state for an Arena pane. It is carried on the WorkspacePane row and
 * mirrors OrchestratorState.
 */
export interface ArenaState {
  /** Identifier of the associated battle. */
  battle_id: string;
  /** Kind of battle. */
  battle_type: BattleType;
  /** Prompt text of the battle. */
  prompt: string;
}
/** Wire shape of a battle record. */
export interface BattleShape {
  id: string;
  project_id: string;
  battle_type: BattleType;
  prompt: string;
  status: BattleStatus;

  // Nullable fields: the type permits `null` for each of these.
  // NOTE(review): presumably `null` means "not set yet" — confirm with the server.
  winner_contestant_id: string | null;
  results_path: string | null;
  error: string | null;

  // Timestamps are carried as strings. NOTE(review): format not visible here — confirm.
  created_at: string;
  updated_at: string;
}
/** Wire shape of a single contestant record belonging to a battle. */
export interface ContestantShape {
  id: string;
  battle_id: string;

  /** Backend name (coding) or persona name (qa). Unique per (battle, model) pair. */
  identity: string;
  model: string;
  lane: ContestantLane;

  // References that the type allows to be `null`.
  task_id: string | null;
  worktree_id: string | null;

  status: ContestantStatus;

  // Metrics; each is nullable in the type.
  // NOTE(review): presumably populated once the contestant has run — confirm.
  duration_ms: number | null;
  tokens_per_sec: number | null;
  cost_tokens: number | null;

  result_path: string | null;
  error: string | null;

  created_at: string;
  updated_at: string;
}
/** Wire shape of a cross-examination record attached to a battle. */
export interface CrossExaminationShape {
  id: string;
  battle_id: string;

  /** Backend + model performing the cross-examination. */
  identity: string;
  model: string;

  // Nullable in the type.
  // NOTE(review): presumably `null` until a verdict is available — confirm.
  verdict: string | null;

  created_at: string;
}

View File

@@ -358,6 +358,53 @@ export const FlowRunStepUpdatedFrame = z.object({
report: z.string().optional(),
});
// ---- arena frames ----------------------------------------------------------

// Literal sets shared by the arena frames below. Each list is declared once so
// the frames cannot drift apart when a value is added or renamed (the battle
// status list in particular is used by two frames).
// NOTE(review): these lists appear to mirror the BattleType / BattleStatus /
// ContestantStatus / ContestantLane unions in the contracts arena module —
// keep them in sync when either side changes.
const ArenaBattleTypeEnum = z.enum(['coding', 'qa']);
const ArenaBattleStatusEnum = z.enum(['pending', 'running', 'completed', 'failed', 'cancelled']);
const ArenaContestantStatusEnum = z.enum(['queued', 'running', 'done', 'error']);
const ArenaContestantLaneEnum = z.enum(['local', 'cloud']);

// One roster row inside `battle_started`: id, identity, model and lane.
const ContestantManifestEntry = z.object({
  id: Uuid,
  identity: z.string().min(1),
  model: z.string().min(1),
  lane: ArenaContestantLaneEnum,
});

// Published once when a battle starts. Carries the contestant roster so the
// ArenaPane can build its grid immediately.
export const BattleStartedFrame = z.object({
  type: z.literal('battle_started'),
  battle_id: Uuid,
  battle_type: ArenaBattleTypeEnum,
  prompt: z.string(),
  contestants: z.array(ContestantManifestEntry),
});

// Published on every contestant status change or streaming update.
// `delta` carries the latest chunk of streaming output while status='running'.
// `battle_status` is present only on the final transition that closes the battle.
export const ContestantUpdatedFrame = z.object({
  type: z.literal('contestant_updated'),
  battle_id: Uuid,
  contestant_id: Uuid,
  status: ArenaContestantStatusEnum.optional(),
  duration_ms: z.number().int().nonnegative().optional(),
  tokens_per_sec: z.number().nonnegative().optional(),
  battle_status: ArenaBattleStatusEnum.optional(),
  delta: z.string().optional(),
  error: z.string().optional(),
});

// Published when battle-level state changes that don't ride on a contestant
// update: analysis finished, winner set, cross-exam verdict ready. The pane
// uses this to update its analysis panel and winner badge without a refetch.
// Fields are all optional — publishers include only what changed.
export const BattleUpdatedFrame = z.object({
  type: z.literal('battle_updated'),
  battle_id: Uuid,
  status: ArenaBattleStatusEnum.optional(),
  // Both `null` and an omitted key pass validation for the winner id.
  winner_contestant_id: Uuid.nullable().optional(),
  analysis_ready: z.boolean().optional(),
  cross_exam_id: Uuid.optional(),
});
// ---- discriminated union ---------------------------------------------------
export const WsFrameSchema = z.discriminatedUnion('type', [
@@ -381,6 +428,10 @@ export const WsFrameSchema = z.discriminatedUnion('type', [
// orchestrator
FlowRunStartedFrame,
FlowRunStepUpdatedFrame,
// arena
BattleStartedFrame,
ContestantUpdatedFrame,
BattleUpdatedFrame,
// per-user
ChatStatusFrame,
SessionUpdatedFrame,
@@ -425,6 +476,9 @@ export const KNOWN_FRAME_TYPES: readonly WsFrame['type'][] = [
'agent_status_updated',
'flow_run_started',
'flow_run_step_updated',
'battle_started',
'contestant_updated',
'battle_updated',
'chat_status',
'session_updated',
'session_renamed',