feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt

Arena is a new pane kind for competitive AI evaluation. A Battle runs
the same prompt against 2-6 Contestants across two concurrent lanes:
local lane (llama-swap models, serial) and cloud lane (parallel).

Added to all three registries: @boocode/contracts WsFrameSchema,
server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results
  writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-06 23:25:29 +00:00
parent e04d0fdaa8
commit d6d246c15b
34 changed files with 4581 additions and 146 deletions

View File

@@ -0,0 +1,664 @@
// ArenaPane — live view for an Arena battle.
// Mirrors OrchestratorPane: header with status/winner, contestant roster
// (collapsed rows, expand-one), analysis panel, cross-examination control.
//
// Subscribes to the coder user channel (via useCoderUserEvents → sessionEvents)
// for battle_started / contestant_updated / battle_updated frames.
import { useCallback, useEffect, useRef, useState } from 'react';
import { ChevronDown, ChevronRight, Loader2, MoreHorizontal, RotateCcw, Swords, Trophy, X } from 'lucide-react';
import { toast } from 'sonner';
import { api } from '@/api/client';
import type { ArenaState, BattleShape, ContestantShape, CrossExaminationShape, ProviderSnapshotEntry } from '@/api/types';
import { sessionEvents } from '@/hooks/sessionEvents';
import { useProviderSnapshot } from '@/hooks/useProviderSnapshot';
import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuTrigger,
} from '@/components/ui/dropdown-menu';
import { cn } from '@/lib/utils';
// ─── Status dot (mirrors FlowStepStatusDot) ───────────────────────────────────
/**
 * Compact status indicator for a single contestant.
 *
 * A `running` contestant gets a spinning emerald ring. Every other status is
 * drawn as a small filled dot: emerald for `done`, destructive for `error`,
 * and a muted tone for anything else (i.e. queued).
 */
function ContestantStatusDot({ status }: { status: ContestantShape['status'] }) {
  if (status !== 'running') {
    let tone = 'bg-muted-foreground/40'; // queued
    if (status === 'done') {
      tone = 'bg-emerald-500';
    } else if (status === 'error') {
      tone = 'bg-destructive';
    }
    return (
      <span
        aria-label={status}
        className={cn('inline-block w-1.5 h-1.5 rounded-full shrink-0', tone)}
      />
    );
  }

  return (
    <span
      aria-label="running"
      className="inline-block w-3 h-3 rounded-full border-2 border-emerald-500 border-t-transparent animate-spin shrink-0"
    />
  );
}
// ─── Lane badge ───────────────────────────────────────────────────────────────
/**
 * Small pill showing which lane a contestant runs in.
 * The `local` lane is tinted sky-blue; any other lane is tinted violet.
 */
function LaneBadge({ lane }: { lane: ContestantShape['lane'] }) {
  const palette =
    lane === 'local'
      ? 'bg-sky-500/10 text-sky-600 dark:text-sky-400'
      : 'bg-violet-500/10 text-violet-600 dark:text-violet-400';

  return (
    <span className={cn('text-[10px] px-1 py-0.5 rounded shrink-0', palette)}>
      {lane}
    </span>
  );
}
// ─── Duration formatter ───────────────────────────────────────────────────────
/**
 * Render a millisecond duration as a short human-readable label.
 *
 * @param ms Duration in milliseconds, or `null` when unknown.
 * @returns `''` for `null`; `"<s>s"` when the rounded duration is under a
 *   minute; otherwise `"<m>m<ss>s"` with the seconds zero-padded to two digits.
 */
function formatDuration(ms: number | null): string {
  if (ms == null) {
    return '';
  }

  // Round to whole seconds first so 59.6s is reported as "1m00s", not "60s".
  const totalSeconds = Math.round(ms / 1000);
  if (totalSeconds < 60) {
    return `${totalSeconds}s`;
  }

  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return `${minutes}m${seconds.toString().padStart(2, '0')}s`;
}
// ─── Live ticker for running contestants ─────────────────────────────────────
/**
 * Ticking elapsed-time label for a contestant that is still running.
 *
 * @param startedAt Epoch milliseconds the elapsed time is measured from.
 *
 * The elapsed value is recomputed from the wall clock once per second; the
 * interval is torn down and re-created whenever `startedAt` changes.
 */
function LiveDuration({ startedAt }: { startedAt: number }) {
  const [elapsed, setElapsed] = useState(() => Date.now() - startedAt);

  useEffect(() => {
    const tick = () => {
      setElapsed(Date.now() - startedAt);
    };
    const timer = setInterval(tick, 1000);
    return () => {
      clearInterval(timer);
    };
  }, [startedAt]);

  return <span>{formatDuration(elapsed)}</span>;
}
// ─── DiffView ─────────────────────────────────────────────────────────────────
/** Visual category of one line of unified-diff text. */
type DiffLineKind = 'added' | 'removed' | 'hunk' | 'plain';

/** Tailwind colour classes for each diff line category. */
const DIFF_LINE_CLASS: Record<DiffLineKind, string> = {
  added: 'text-emerald-600 dark:text-emerald-400',
  removed: 'text-destructive',
  hunk: 'text-violet-500 dark:text-violet-400',
  plain: 'text-muted-foreground',
};

/** Matches a unified-diff hunk header and captures the old/new line counts. */
const DIFF_HUNK_HEADER = /^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@/;

/**
 * Classify every line of a unified diff.
 *
 * Inside a hunk (tracked via the line counts in the `@@ -a,b +c,d @@` header)
 * the first character alone decides the category, so added content such as
 * `++i;` (rendered `+++i;`) or removed content such as `-- comment` (rendered
 * `--- comment`) is still coloured as an addition/removal. Outside a hunk the
 * `+++` / `---` file headers are treated as plain text.
 */
function classifyDiffLines(lines: readonly string[]): DiffLineKind[] {
  const kinds: DiffLineKind[] = [];
  let oldLeft = 0; // old-file lines still expected in the current hunk
  let newLeft = 0; // new-file lines still expected in the current hunk

  for (const line of lines) {
    const header = DIFF_HUNK_HEADER.exec(line);
    if (header) {
      // An omitted count in a hunk header means exactly one line.
      oldLeft = header[1] === undefined ? 1 : Number(header[1]);
      newLeft = header[2] === undefined ? 1 : Number(header[2]);
      kinds.push('hunk');
      continue;
    }

    if (oldLeft > 0 || newLeft > 0) {
      const marker = line.charAt(0);
      if (marker === '+') {
        newLeft = Math.max(0, newLeft - 1);
        kinds.push('added');
        continue;
      }
      if (marker === '-') {
        oldLeft = Math.max(0, oldLeft - 1);
        kinds.push('removed');
        continue;
      }
      if (marker === ' ' || marker === '') {
        // Context line (some tools strip the lone leading space of blank lines).
        oldLeft = Math.max(0, oldLeft - 1);
        newLeft = Math.max(0, newLeft - 1);
        kinds.push('plain');
        continue;
      }
      if (marker === '\\') {
        // "\ No newline at end of file" — belongs to the hunk, counts as nothing.
        kinds.push('plain');
        continue;
      }
      // Anything else cannot be hunk content: the hunk was truncated or the
      // counts were wrong. Drop out of hunk mode and classify heuristically.
      oldLeft = 0;
      newLeft = 0;
    }

    if (line.startsWith('+') && !line.startsWith('+++')) {
      kinds.push('added');
    } else if (line.startsWith('-') && !line.startsWith('---')) {
      kinds.push('removed');
    } else if (line.startsWith('@@')) {
      kinds.push('hunk');
    } else {
      kinds.push('plain');
    }
  }

  return kinds;
}

/**
 * Read-only unified-diff viewer with per-line colouring.
 *
 * @param diff Raw unified-diff text; it is split on `\n` and each line is
 *   rendered as its own block-level span. Empty lines render as a single space
 *   so they keep their height.
 */
function DiffView({ diff }: { diff: string }) {
  const lines = diff.split('\n');
  const kinds = classifyDiffLines(lines);
  return (
    <div className="border-t border-border/50">
      <div className="px-3 pt-2 pb-1 text-[10px] font-medium uppercase tracking-wide text-muted-foreground">
        Diff
      </div>
      <pre className="px-3 pb-3 text-xs font-mono whitespace-pre leading-relaxed overflow-x-auto">
        {lines.map((line, i) => (
          // Lines are static for a given diff, so the index is a stable key.
          <span key={i} className={cn('block', DIFF_LINE_CLASS[kinds[i] ?? 'plain'])}>
            {line || ' '}
          </span>
        ))}
      </pre>
    </div>
  );
}
// ─── ContestantRow ────────────────────────────────────────────────────────────
/** Client-side view model for one contestant row in the roster. */
interface ContestantRowState {
  /** Contestant record as last fetched, patched by `contestant_updated` frames. */
  data: ContestantShape;
  /** Output text accumulated by appending the `delta` of each incoming frame. */
  output: string;
  /** Local `Date.now()` timestamp from when the contestant was first seen running; `null` otherwise. */
  startedAt: number | null;
}
/**
 * One collapsible roster row for a contestant.
 *
 * The header shows status, label, winner trophy, lane, duration, throughput
 * and error text; a sibling options menu lets the user override or clear the
 * winner. When expanded, the streamed output is shown and — for finished
 * coding contestants — the diff is lazily fetched and rendered below it.
 *
 * @param row        View model (contestant data, streamed output, start time).
 * @param isExpanded Whether the detail area is currently shown.
 * @param onToggle   Called when the header is clicked.
 * @param isWinner   Whether this contestant is the recorded winner.
 * @param battleId   Id of the owning battle (used for API calls).
 * @param battleType `'coding'` enables the diff view; `'qa'` does not.
 */
function ContestantRow({
  row,
  isExpanded,
  onToggle,
  isWinner,
  battleId,
  battleType,
}: {
  row: ContestantRowState;
  isExpanded: boolean;
  onToggle: () => void;
  isWinner: boolean;
  battleId: string;
  battleType: 'coding' | 'qa';
}) {
  const { data, output, startedAt } = row;
  const label = `${data.identity} / ${data.model}`;

  // Lazy-fetch diff for coding contestants once they are done and expanded.
  // `null` means "not fetched yet"; '' means "fetched, nothing to show / failed".
  const [diff, setDiff] = useState<string | null>(null);
  useEffect(() => {
    if (!isExpanded || battleType !== 'coding' || data.status !== 'done') return undefined;
    if (diff !== null) return undefined;
    // Guard against resolving after the row was collapsed or unmounted.
    let cancelled = false;
    api.battles.getDiff(battleId, data.id)
      .then(({ diff: d }) => {
        if (!cancelled) setDiff(d);
      })
      .catch(() => {
        if (!cancelled) setDiff('');
      });
    return () => {
      cancelled = true;
    };
  }, [isExpanded, battleType, data.status, data.id, battleId, diff]);

  async function handleSetWinner(contestantId: string | null) {
    try {
      // On success the badge is updated by the battle_updated WS frame.
      await api.battles.setWinner(battleId, { winner_contestant_id: contestantId });
    } catch (err) {
      // The badge stays unchanged; tell the user why nothing happened.
      toast.error(err instanceof Error ? err.message : 'Failed to update winner');
    }
  }

  return (
    <div>
      {/* The toggle and the options menu are siblings: a <button> must not contain another <button>. */}
      <div className="flex items-center gap-2 pr-3 hover:bg-muted/30 transition-colors">
        <button
          type="button"
          onClick={onToggle}
          aria-expanded={isExpanded}
          className="flex-1 min-w-0 flex items-center gap-2 pl-3 py-2.5 text-left"
        >
          <ContestantStatusDot status={data.status} />
          <span className="text-sm flex-1 truncate min-w-0">{label}</span>
          {isWinner && (
            <Trophy size={11} className="shrink-0 text-emerald-500" aria-label="winner" />
          )}
          <LaneBadge lane={data.lane} />
          {data.status === 'running' && startedAt != null ? (
            <span className="text-xs text-muted-foreground shrink-0 tabular-nums">
              <LiveDuration startedAt={startedAt} />
            </span>
          ) : data.duration_ms != null ? (
            <span className="text-xs text-muted-foreground shrink-0 tabular-nums">
              {formatDuration(data.duration_ms)}
            </span>
          ) : null}
          {data.tokens_per_sec != null && (
            <span className="text-xs text-muted-foreground shrink-0 hidden sm:block tabular-nums">
              {data.tokens_per_sec.toFixed(1)} tok/s
            </span>
          )}
          {data.status === 'error' && (
            <span className="text-xs text-destructive shrink-0 hidden sm:block truncate max-w-[100px]" title={data.error ?? ''}>
              {data.error ?? 'error'}
            </span>
          )}
          {isExpanded ? (
            <ChevronDown size={12} className="shrink-0 text-muted-foreground" />
          ) : (
            <ChevronRight size={12} className="shrink-0 text-muted-foreground" />
          )}
        </button>
        {/* Row menu: winner override. */}
        <DropdownMenu>
          <DropdownMenuTrigger asChild>
            <button
              type="button"
              className="shrink-0 p-0.5 rounded text-muted-foreground hover:text-foreground hover:bg-muted"
              aria-label="Contestant options"
            >
              <MoreHorizontal size={12} />
            </button>
          </DropdownMenuTrigger>
          <DropdownMenuContent align="end">
            {!isWinner && (
              <DropdownMenuItem onSelect={() => void handleSetWinner(data.id)}>
                <Trophy size={12} /> Set as winner
              </DropdownMenuItem>
            )}
            {isWinner && (
              <DropdownMenuItem onSelect={() => void handleSetWinner(null)}>
                Clear winner
              </DropdownMenuItem>
            )}
          </DropdownMenuContent>
        </DropdownMenu>
      </div>
      {isExpanded && (
        <div className="border-t border-border/50 bg-muted/10 max-h-[55vh] overflow-y-auto">
          {output.length === 0 ? (
            <div className="flex items-center justify-center py-6 text-sm text-muted-foreground">
              {data.status === 'queued'
                ? 'Waiting to start…'
                : data.status === 'error'
                  ? data.error ?? 'Error'
                  : 'Connecting…'}
            </div>
          ) : (
            <pre className="p-3 text-xs font-mono whitespace-pre-wrap leading-relaxed break-all text-foreground">
              {output}
            </pre>
          )}
          {battleType === 'coding' && data.status === 'done' && diff && (
            <DiffView diff={diff} />
          )}
        </div>
      )}
    </div>
  );
}
// ─── CrossExaminationPanel ────────────────────────────────────────────────────
/**
 * Cross-examination control: pick a backend and one of its models, run an
 * advisory re-evaluation of the battle, and list past/in-flight examinations.
 *
 * @param battleId   Battle to cross-examine.
 * @param crossExams Examinations to list; one without a verdict shows a spinner.
 * @param snapshot   Provider snapshot used to populate the two selects, or
 *   `null` when not available (both selects then only show their placeholder).
 */
function CrossExaminationPanel({
  battleId,
  crossExams,
  snapshot,
}: {
  battleId: string;
  crossExams: CrossExaminationShape[];
  snapshot: ProviderSnapshotEntry[] | null;
}) {
  const [identity, setIdentity] = useState('');
  const [model, setModel] = useState('');
  const [running, setRunning] = useState(false);

  const providers = snapshot ?? [];
  // Only installed + enabled providers are selectable as a backend.
  const identityOptions = providers
    .filter((p) => p.installed && p.enabled)
    .map((p) => ({ value: p.name, label: p.label }));
  // Models come from whichever provider matches the selected backend name.
  const selectedProvider = providers.find((p) => p.name === identity);
  const modelOptions = (selectedProvider?.models ?? []).map((m) => ({
    value: m.id,
    label: m.label,
  }));
  const runBlocked = !identity || !model || running;

  const handleRun = async () => {
    if (runBlocked) return;
    setRunning(true);
    try {
      // The verdict arrives via battle_updated frame; ArenaPane will refetch.
      await api.battles.crossExamine(battleId, { identity, model });
    } catch (err) {
      const message = err instanceof Error ? err.message : 'Cross-examination failed';
      toast.error(message);
    } finally {
      setRunning(false);
    }
  };

  const selectBase =
    'flex-1 min-w-[120px] text-xs border border-border rounded bg-background px-2 py-1.5 text-foreground focus:outline-none focus:ring-1 focus:ring-ring';

  return (
    <div className="border-t border-border p-4 flex flex-col gap-3">
      <div className="text-xs font-medium text-muted-foreground uppercase tracking-wide">
        Cross-examination
      </div>
      <p className="text-xs text-muted-foreground">
        Challenge the results with any model. The verdict is advisory and never changes the recorded winner.
      </p>
      <div className="flex gap-2 items-center flex-wrap">
        <select
          value={identity}
          onChange={(e) => {
            // Changing backend invalidates the previously chosen model.
            setIdentity(e.target.value);
            setModel('');
          }}
          className={selectBase}
          aria-label="Backend"
        >
          <option value="">Backend</option>
          {identityOptions.map(({ value, label }) => (
            <option key={value} value={value}>{label}</option>
          ))}
        </select>
        <select
          value={model}
          onChange={(e) => setModel(e.target.value)}
          disabled={!identity}
          className={`${selectBase} disabled:opacity-50`}
          aria-label="Model"
        >
          <option value="">Model</option>
          {modelOptions.map(({ value, label }) => (
            <option key={value} value={value}>{label}</option>
          ))}
        </select>
        <button
          type="button"
          onClick={() => void handleRun()}
          disabled={runBlocked}
          className="inline-flex items-center gap-1 text-xs px-2 py-1.5 rounded border border-border text-foreground hover:bg-muted disabled:opacity-50"
        >
          {running && <Loader2 size={10} className="animate-spin" />}
          Run
        </button>
      </div>
      {crossExams.length > 0 && (
        <div className="flex flex-col gap-3 mt-1">
          {crossExams.map((exam) => {
            const body = exam.verdict ? (
              <div className="text-sm whitespace-pre-wrap leading-relaxed">{exam.verdict}</div>
            ) : (
              <div className="text-xs text-muted-foreground flex items-center gap-1.5">
                <Loader2 size={10} className="animate-spin" /> Running
              </div>
            );
            return (
              <div key={exam.id} className="rounded border border-border/50 bg-muted/20 p-3">
                <div className="text-xs font-medium text-muted-foreground mb-1.5">
                  {exam.identity} / {exam.model}
                </div>
                {body}
              </div>
            );
          })}
        </div>
      )}
    </div>
  );
}
// ─── ArenaPane ────────────────────────────────────────────────────────────────
/** Props accepted by `ArenaPane`. */
interface Props {
  /** Pane state; `battle_id`, `prompt` and `battle_type` are read from it. */
  state: ArenaState;
  projectId: string; // available for future use (e.g. file browser affordance)
  /** Invoked when the user clicks the pane's close button. */
  onClose: () => void;
}
/**
 * Live pane for a single Arena battle.
 *
 * On mount (and whenever `state.battle_id` changes) the battle, its
 * contestants and cross-examinations are fetched once; afterwards the pane is
 * kept current by `battle_started`, `contestant_updated` and `battle_updated`
 * frames received through `sessionEvents`.
 *
 * Renders a header (prompt, type, winner, stop/status, options menu, close),
 * the analysis text when available, the contestant roster with at most one
 * row expanded, and — once the battle is no longer running — the
 * cross-examination panel.
 *
 * @param state   Pane state identifying the battle to display.
 * @param onClose Called when the close button is clicked.
 */
export function ArenaPane({ state, onClose }: Props) {
  const [battle, setBattle] = useState<BattleShape | null>(null);
  const [contestantRows, setContestantRows] = useState<ContestantRowState[]>([]);
  const [crossExams, setCrossExams] = useState<CrossExaminationShape[]>([]);
  const [analysis, setAnalysis] = useState<string | null>(null);
  const [expandedId, setExpandedId] = useState<string | null>(null);
  const [stopping, setStopping] = useState(false);
  const [reanalyzing, setReanalyzing] = useState(false);
  // Local start timestamps of contestants currently known to be running, keyed
  // by contestant id. Kept in a ref so event handlers can decide "first time
  // running?" without touching React state inside a state updater.
  const startTimesRef = useRef<Map<string, number>>(new Map());
  const snapshot = useProviderSnapshot();

  // Fetch current battle state on mount / battle_id change.
  useEffect(() => {
    // Ignore responses that resolve after the pane switched to another battle
    // or unmounted; otherwise a slow request could overwrite newer state.
    let cancelled = false;
    setBattle(null);
    setContestantRows([]);
    setCrossExams([]);
    setAnalysis(null);
    setExpandedId(null);
    startTimesRef.current.clear();

    api.battles.get(state.battle_id)
      .then(({ battle: b, contestants, cross_examinations }) => {
        if (cancelled) return;
        setBattle(b);

        const now = Date.now();
        const starts = startTimesRef.current;
        const fetchedRows = contestants.map((c) => {
          let startedAt: number | null = null;
          if (c.status === 'running') {
            // Reuse a start time already recorded from a WS frame, if any.
            startedAt = starts.get(c.id) ?? now;
            starts.set(c.id, startedAt);
          }
          return { data: c, startedAt };
        });
        setContestantRows((prev) => {
          // Keep any output that was streamed in while the fetch was in flight.
          const streamed = new Map<string, string>();
          for (const r of prev) streamed.set(r.data.id, r.output);
          return fetchedRows.map(({ data, startedAt }) => ({
            data,
            output: streamed.get(data.id) ?? '',
            startedAt,
          }));
        });
        setCrossExams(cross_examinations);

        // Fetch analysis text if battle is already completed.
        if (b.status === 'completed') {
          api.battles.getAnalysis(state.battle_id)
            .then(({ text }) => {
              if (!cancelled) setAnalysis(text);
            })
            .catch(() => {});
        }

        // Auto-expand first running contestant.
        const firstRunning = contestants.find((c) => c.status === 'running');
        if (firstRunning) setExpandedId(firstRunning.id);
      })
      .catch((err: unknown) => {
        if (cancelled) return;
        toast.error(err instanceof Error ? err.message : 'Failed to load battle');
      });

    return () => {
      cancelled = true;
    };
  }, [state.battle_id]);

  // Subscribe to live battle/contestant frames.
  useEffect(() => {
    // Guards the async follow-up fetches started from within the handler.
    let active = true;

    const unsubscribe = sessionEvents.subscribe((ev) => {
      if (ev.type === 'battle_started' && ev.battle_id === state.battle_id) {
        const started = ev;
        setContestantRows((prev) => {
          // Rows already present (from the fetch) are more complete; keep them.
          if (prev.length > 0) return prev;
          return started.contestants.map((c) => ({
            data: {
              id: c.id,
              battle_id: started.battle_id,
              identity: c.identity,
              model: c.model,
              lane: c.lane,
              task_id: null,
              worktree_id: null,
              status: 'queued' as const,
              duration_ms: null,
              tokens_per_sec: null,
              cost_tokens: null,
              result_path: null,
              error: null,
              created_at: new Date().toISOString(),
              updated_at: new Date().toISOString(),
            },
            output: '',
            startedAt: null,
          }));
        });
      } else if (ev.type === 'contestant_updated' && ev.battle_id === state.battle_id) {
        const frame = ev;
        const contestantId = frame.contestant_id;

        // Resolve all side effects (clock read, ref write, auto-expand) here so
        // the state updater below stays pure. `undefined` = keep row's value.
        let startedAtOverride: number | null | undefined;
        if (frame.status === 'running') {
          const known = startTimesRef.current.get(contestantId);
          if (known === undefined) {
            const now = Date.now();
            startTimesRef.current.set(contestantId, now);
            startedAtOverride = now;
            // Auto-expand only on the transition into running, so later
            // frames do not override a row the user expanded or collapsed.
            setExpandedId(contestantId);
          } else {
            startedAtOverride = known;
          }
        } else if (frame.status === 'done' || frame.status === 'error') {
          startTimesRef.current.delete(contestantId);
          startedAtOverride = null;
        }

        setContestantRows((prev) =>
          prev.map((row) => {
            if (row.data.id !== contestantId) return row;
            const updatedData: ContestantShape = {
              ...row.data,
              ...(frame.status != null ? { status: frame.status } : {}),
              ...(frame.duration_ms != null ? { duration_ms: frame.duration_ms } : {}),
              ...(frame.tokens_per_sec != null ? { tokens_per_sec: frame.tokens_per_sec } : {}),
              ...(frame.error != null ? { error: frame.error } : {}),
            };
            return {
              data: updatedData,
              output: frame.delta ? row.output + frame.delta : row.output,
              startedAt: startedAtOverride === undefined ? row.startedAt : startedAtOverride,
            };
          }),
        );

        const nextBattleStatus = frame.battle_status;
        if (nextBattleStatus) {
          setBattle((prev) => (prev ? { ...prev, status: nextBattleStatus } : prev));
        }
      } else if (ev.type === 'battle_updated' && ev.battle_id === state.battle_id) {
        const frame = ev;
        setBattle((prev) => {
          if (!prev) return prev;
          return {
            ...prev,
            ...(frame.status != null ? { status: frame.status } : {}),
            // `null` is a meaningful value here (winner cleared), so only skip when absent.
            ...(frame.winner_contestant_id !== undefined
              ? { winner_contestant_id: frame.winner_contestant_id }
              : {}),
          };
        });
        if (frame.analysis_ready) {
          api.battles.getAnalysis(state.battle_id)
            .then(({ text }) => {
              if (active) setAnalysis(text);
            })
            .catch(() => {
              if (active) setAnalysis('Analysis ready — failed to load text.');
            });
        }
        if (frame.cross_exam_id) {
          // Refetch cross-exams to get the latest verdict.
          api.battles.get(state.battle_id)
            .then(({ cross_examinations }) => {
              if (active) setCrossExams(cross_examinations);
            })
            .catch(() => {});
        }
      }
    });

    return () => {
      active = false;
      unsubscribe();
    };
  }, [state.battle_id]);

  const toggleExpand = useCallback((id: string) => {
    setExpandedId((prev) => (prev === id ? null : id));
  }, []);

  async function handleStop() {
    if (stopping) return;
    setStopping(true);
    try {
      await api.battles.stop(state.battle_id);
    } catch (err) {
      // Non-fatal, but the battle keeps running — let the user know.
      toast.error(err instanceof Error ? err.message : 'Failed to stop battle');
    } finally {
      setStopping(false);
    }
  }

  async function handleReanalyze() {
    if (reanalyzing) return;
    setReanalyzing(true);
    try {
      await api.battles.analyze(state.battle_id);
      toast.success('Re-analysis triggered');
    } catch (err) {
      toast.error(err instanceof Error ? err.message : 'Re-analysis failed');
    } finally {
      setReanalyzing(false);
    }
  }

  function handleOpenResults() {
    if (!battle?.results_path) return;
    sessionEvents.emit({ type: 'open_file_in_browser', path: battle.results_path });
  }

  function handleCopyAnalysis() {
    if (!analysis) return;
    void navigator.clipboard.writeText(analysis).catch(() => toast.error('Clipboard write failed'));
  }

  // Until the battle record has loaded, treat it as running.
  const battleStatus = battle?.status ?? 'running';
  const isRunning = battleStatus === 'running' || battleStatus === 'pending';
  const isCompleted = battleStatus === 'completed';
  const winnerId = battle?.winner_contestant_id;
  const winnerRow = winnerId ? contestantRows.find((r) => r.data.id === winnerId) : null;
  const winnerLabel = winnerRow ? `${winnerRow.data.identity} / ${winnerRow.data.model}` : null;

  return (
    <div className="flex flex-col h-full min-h-0 overflow-hidden">
      {/* Header */}
      <div className="flex items-center gap-2 border-b border-border bg-muted/20 px-3 py-2 shrink-0">
        <Swords size={13} className="text-muted-foreground shrink-0" />
        <span className="text-sm font-medium truncate min-w-0 flex-1" title={state.prompt}>
          {state.prompt.length > 60 ? state.prompt.slice(0, 60) + '…' : state.prompt}
        </span>
        <span className="text-xs text-muted-foreground shrink-0 capitalize">{state.battle_type}</span>
        {winnerLabel && (
          <span
            className="text-xs px-1.5 py-0.5 rounded bg-emerald-500/10 text-emerald-600 dark:text-emerald-400 shrink-0 hidden sm:block truncate max-w-[130px]"
            title={`Winner: ${winnerLabel}`}
          >
            {winnerLabel}
          </span>
        )}
        <div className="ml-auto flex items-center gap-1 shrink-0">
          {isRunning ? (
            <button
              type="button"
              onClick={() => void handleStop()}
              disabled={stopping}
              className="inline-flex items-center gap-1 text-xs px-1.5 py-0.5 rounded border border-border text-muted-foreground hover:text-foreground hover:bg-muted disabled:opacity-50"
              title="Stop battle"
            >
              Stop
            </button>
          ) : (
            <span
              className={cn(
                'text-xs px-1.5 py-0.5 rounded',
                isCompleted
                  ? 'text-emerald-600 bg-emerald-500/10'
                  : battleStatus === 'failed' || battleStatus === 'cancelled'
                    ? 'text-destructive bg-destructive/10'
                    : 'text-muted-foreground bg-muted/40',
              )}
            >
              {battleStatus}
            </span>
          )}
          {isCompleted && (
            <DropdownMenu>
              <DropdownMenuTrigger asChild>
                <button
                  type="button"
                  className="inline-flex items-center justify-center p-1 rounded text-muted-foreground hover:bg-muted hover:text-foreground"
                  aria-label="Battle options"
                >
                  <MoreHorizontal size={14} />
                </button>
              </DropdownMenuTrigger>
              <DropdownMenuContent align="end">
                <DropdownMenuItem onSelect={() => void handleReanalyze()} disabled={reanalyzing}>
                  <RotateCcw size={14} /> Re-analyze
                </DropdownMenuItem>
                {battle?.results_path && (
                  <DropdownMenuItem onSelect={handleOpenResults}>
                    Open results folder
                  </DropdownMenuItem>
                )}
                {analysis && (
                  <DropdownMenuItem onSelect={handleCopyAnalysis}>
                    Copy analysis
                  </DropdownMenuItem>
                )}
              </DropdownMenuContent>
            </DropdownMenu>
          )}
          <button
            type="button"
            onClick={onClose}
            className="inline-flex items-center justify-center p-1 rounded text-muted-foreground hover:bg-muted hover:text-foreground"
            aria-label="Close pane"
            title="Close pane"
          >
            <X size={12} />
          </button>
        </div>
      </div>
      {/* Body */}
      <div className="flex-1 min-h-0 overflow-y-auto">
        {/* Analysis panel */}
        {analysis && (
          <div className="border-b border-border p-4">
            <div className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2 pb-1 border-b border-border/50">
              Analysis
            </div>
            <div className="text-sm text-foreground whitespace-pre-wrap leading-relaxed">
              {analysis}
            </div>
            {winnerLabel && (
              <div className="mt-2 text-sm font-medium text-emerald-600 dark:text-emerald-400">
                Winner: {winnerLabel}
              </div>
            )}
          </div>
        )}
        {/* Empty state */}
        {contestantRows.length === 0 && !analysis && (
          <div className="flex items-center justify-center h-24 text-sm text-muted-foreground">
            Starting battle
          </div>
        )}
        {/* Contestant roster */}
        <div className="divide-y divide-border">
          {contestantRows.map((row) => (
            <ContestantRow
              key={row.data.id}
              row={row}
              isExpanded={expandedId === row.data.id}
              onToggle={() => toggleExpand(row.data.id)}
              isWinner={winnerId === row.data.id}
              battleId={state.battle_id}
              battleType={battle?.battle_type ?? state.battle_type}
            />
          ))}
        </div>
        {/* Cross-examination panel — available after battle finishes */}
        {!isRunning && (
          <CrossExaminationPanel
            battleId={state.battle_id}
            crossExams={crossExams}
            snapshot={snapshot}
          />
        )}
      </div>
    </div>
  );
}