feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt

Arena is a new pane kind for competitive AI evaluation. A Battle runs
the same prompt against 2-6 Contestants across two concurrent lanes:
local lane (llama-swap models, serial) and cloud lane (parallel).

Added to all three registries: @boocode/contracts WsFrameSchema,
server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results
  writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore
This commit is contained in:
2026-06-06 23:25:29 +00:00
parent 84a024a5a4
commit 3474be4865
34 changed files with 4581 additions and 146 deletions

View File

@@ -3,7 +3,11 @@
// also refresh the sidebar's session list).
import type {
ArenaState,
BattleShape,
Chat,
ContestantShape,
CrossExaminationShape,
ErrorReason,
HtmlArtifactState,
MarkdownArtifactState,
@@ -231,6 +235,53 @@ export interface FlowRunStepUpdatedEvent {
report?: string;
}
/**
 * Arena: request to show the launcher dialog, emitted by the "New Arena"
 * menu items.
 */
export interface OpenArenaLauncherEvent {
  type: 'open_arena_launcher';
  /** Project id carried with the request. */
  project_id: string;
  /** Optional placement hint; one of `'new'` or `'split'`. */
  placement?: 'new' | 'split';
}
/**
 * Arena: request to open (or focus) the arena pane, emitted once a battle
 * has been created.
 */
export interface OpenArenaPaneEvent {
  type: 'open_arena_pane';
  /** Arena state for the pane being opened or focused. */
  state: ArenaState;
  /** Optional placement hint; one of `'new'` or `'split'`. */
  placement?: 'new' | 'split';
}
// Arena: battle lifecycle frames forwarded from the coder user channel.
/**
 * `battle_started` frame: identifies the battle and carries its type, prompt
 * and contestant roster.
 */
export interface BattleStartedEvent {
  type: 'battle_started';
  battle_id: string;
  battle_type: 'coding' | 'qa';
  prompt: string;
  /** One entry per contestant: id, identity, model and lane. */
  contestants: {
    id: string;
    identity: string;
    model: string;
    lane: 'local' | 'cloud';
  }[];
}
/**
 * `contestant_updated` frame for one contestant of one battle. Only the
 * discriminant and the two ids are required; every other field is optional,
 * so a frame may carry any subset of them.
 */
export interface ContestantUpdatedEvent {
  type: 'contestant_updated';
  battle_id: string;
  contestant_id: string;

  // Status fields.
  status?: 'queued' | 'running' | 'done' | 'error';
  battle_status?: 'pending' | 'running' | 'completed' | 'failed' | 'cancelled';

  // Output fields.
  // NOTE(review): `delta` is presumably an incremental chunk of streamed
  // output — confirm against the emitter.
  delta?: string;
  error?: string;

  // Measurement fields.
  duration_ms?: number;
  tokens_per_sec?: number;
}
/**
 * `battle_updated` frame for a battle. Apart from the discriminant and
 * `battle_id`, every field is optional.
 */
export interface BattleUpdatedEvent {
  type: 'battle_updated';
  battle_id: string;
  status?: 'pending' | 'running' | 'completed' | 'failed' | 'cancelled';
  /**
   * May be absent, a contestant id, or an explicit `null`.
   * NOTE(review): `null` presumably means "no winner" — confirm with the sender.
   */
  winner_contestant_id?: string | null;
  analysis_ready?: boolean;
  cross_exam_id?: string;
}
// Re-export arena API shapes for consumers that need the full battle data.
export type { BattleShape, ContestantShape, CrossExaminationShape };
export type SessionEvent =
| SessionRenamedEvent
| ProjectCreatedEvent
@@ -262,7 +313,12 @@ export type SessionEvent =
| OpenOrchestratorPaneEvent
| FlowRunStartedEvent
| FlowRunStepUpdatedEvent
| OpenFlowLauncherEvent;
| OpenFlowLauncherEvent
| OpenArenaLauncherEvent
| OpenArenaPaneEvent
| BattleStartedEvent
| ContestantUpdatedEvent
| BattleUpdatedEvent;
type Listener = (event: SessionEvent) => void;
const listeners = new Set<Listener>();

View File

@@ -8,7 +8,13 @@
import { useEffect } from 'react';
import { WsFrameSchema } from '@boocode/contracts/ws-frames';
import { sessionEvents } from './sessionEvents';
import type { FlowRunStartedEvent, FlowRunStepUpdatedEvent } from './sessionEvents';
import type {
BattleStartedEvent,
BattleUpdatedEvent,
ContestantUpdatedEvent,
FlowRunStartedEvent,
FlowRunStepUpdatedEvent,
} from './sessionEvents';
const RECONNECT_INITIAL_MS = 1000;
const RECONNECT_MAX_MS = 30_000;
@@ -49,6 +55,12 @@ export function useCoderUserEvents(): void {
sessionEvents.emit(frame as unknown as FlowRunStartedEvent);
} else if (frame.type === 'flow_run_step_updated') {
sessionEvents.emit(frame as unknown as FlowRunStepUpdatedEvent);
} else if (frame.type === 'battle_started') {
sessionEvents.emit(frame as unknown as BattleStartedEvent);
} else if (frame.type === 'contestant_updated') {
sessionEvents.emit(frame as unknown as ContestantUpdatedEvent);
} else if (frame.type === 'battle_updated') {
sessionEvents.emit(frame as unknown as BattleUpdatedEvent);
}
};

View File

@@ -204,6 +204,13 @@ function applyFrame(state: State, frame: WsFrame): State {
// No-op here to keep TS exhaustiveness satisfied.
return state;
}
case 'battle_started':
case 'contestant_updated':
case 'battle_updated': {
// Arena frames consumed by ArenaPane's own subscription.
// No-op here to keep TS exhaustiveness satisfied.
return state;
}
}
}

View File

@@ -195,6 +195,13 @@ function applyEvent(prev: SidebarResponse, event: import('./sessionEvents').Sess
case 'flow_run_step_updated':
// Consumed by useWorkspacePanes / OrchestratorPane / FlowLauncherDialog; sidebar has no stake.
return prev;
case 'open_arena_launcher':
case 'open_arena_pane':
case 'battle_started':
case 'contestant_updated':
case 'battle_updated':
// Consumed by useWorkspacePanes / ArenaPane / ArenaLauncherDialog; sidebar has no stake.
return prev;
case 'project_archived': {
const next = prev.projects.filter((p) => p.id !== event.project_id);
if (next.length === prev.projects.length) return prev;

View File

@@ -3,6 +3,7 @@ import type { DragEvent } from 'react';
import { toast } from 'sonner';
import { api } from '@/api/client';
import type {
ArenaState,
ClosedPaneEntry,
HtmlArtifactState,
MarkdownArtifactState,
@@ -187,6 +188,16 @@ function orchestratorPane(state: OrchestratorState): WorkspacePane {
};
}
/**
 * Builds a new workspace pane of kind `'arena'` around the given arena state.
 * The pane gets a freshly generated id and holds no chats (empty `chatIds`,
 * `activeChatIdx` of -1).
 */
function arenaPane(state: ArenaState): WorkspacePane {
  const pane: WorkspacePane = {
    id: generateId(),
    kind: 'arena',
    chatIds: [],
    activeChatIdx: -1,
    arena_state: state,
  };
  return pane;
}
// v1.9: settings panes are ephemeral. Filter them out before persisting so a
// page reload always returns to a clean workspace; the user re-opens via the
// sidebar Settings button when needed.
@@ -290,6 +301,8 @@ export interface UseWorkspacePanesResult {
createTab: (paneIdx: number, kind: WorkspaceTabKind) => Promise<void>;
/** Open an orchestrator run pane (or focus an existing one for the same run_id). */
addOrchestratorPane: (state: OrchestratorState) => string | null;
/** Open an arena battle pane (or focus an existing one for the same battle_id). */
addArenaPane: (state: ArenaState) => string | null;
/** Back-compat alias for createTab(paneIdx, 'coder'). */
createCoderTab: (paneIdx: number) => Promise<void>;
// Open-on-first-click, close-on-second-click. Singleton — settings panes
@@ -877,6 +890,38 @@ export function useWorkspacePanes(sessionId: string): UseWorkspacePanesResult {
});
}, [addOrchestratorPane]);
// Opens an arena pane for `state.battle_id`, or focuses the pane that already
// shows that battle. Returns the id the updater recorded for the opened or
// focused pane; the result stays null when the MAX_PANES limit blocks the open.
// NOTE(review): the return value relies on the updater having run by the time
// setPanes returns, and the updater also calls setActivePaneIdx / toast.error
// as side effects — confirm both are intended.
const addArenaPane = useCallback((state: ArenaState): string | null => {
  // Build the candidate pane (and its generated id) outside the updater so a
  // strict-mode double-invoke of the updater agrees on the id.
  const candidate = arenaPane(state);
  let openedId: string | null = null;
  setPanes((prev) => {
    const existingIdx = prev.findIndex(
      (p) => p.kind === 'arena' && p.arena_state?.battle_id === state.battle_id,
    );
    if (existingIdx >= 0) {
      // This battle already has a pane: focus it instead of adding a duplicate.
      setActivePaneIdx(existingIdx);
      openedId = prev[existingIdx]!.id;
      return prev;
    }
    if (nonSettingsCount(prev) >= MAX_PANES) {
      toast.error(`Maximum ${MAX_PANES} panes`);
      return prev;
    }
    openedId = candidate.id;
    const next = [...prev, candidate];
    // The new pane is appended last, so it becomes the active one.
    setActivePaneIdx(next.length - 1);
    return next;
  });
  return openedId;
}, []);
// Arena pane: the launcher fires `open_arena_pane` through sessionEvents;
// react to it by opening the pane. All other event types are ignored.
useEffect(() => {
  const unsubscribe = sessionEvents.subscribe((ev) => {
    if (ev.type === 'open_arena_pane') {
      addArenaPane(ev.state);
    }
  });
  // Whatever subscribe returned is handed back to React as the effect cleanup.
  return unsubscribe;
}, [addArenaPane]);
// Returns the new settings pane id when one is OPENED (so mobile callers can
// push ?pane= atomically — see addPaneAndSwitch), or null when it was closed.
// Id generated outside the updater so a strict-mode double-invoke agrees.
@@ -1121,6 +1166,7 @@ export function useWorkspacePanes(sessionId: string): UseWorkspacePanesResult {
addSplitPane,
createTab,
addOrchestratorPane,
addArenaPane,
createCoderTab,
toggleSettingsPane,
removePane,