feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: a local lane (llama-swap models, run serially) and a cloud lane (run in parallel). Arena frames are added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -28,6 +28,10 @@
|
||||
"./worktree-risk": {
|
||||
"types": "./dist/worktree-risk.d.ts",
|
||||
"default": "./dist/worktree-risk.js"
|
||||
},
|
||||
"./arena": {
|
||||
"types": "./dist/arena.d.ts",
|
||||
"default": "./dist/arena.js"
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
|
||||
55
packages/contracts/src/arena.ts
Normal file
55
packages/contracts/src/arena.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
/** Arena types — single source of truth for cross-app Arena wire contracts. */
|
||||
|
||||
/** Kind of battle: a coding contest or a question/answer contest. */
export type BattleType = 'coding' | 'qa';

/** Status values a battle can carry. */
export type BattleStatus = 'pending' | 'running' | 'completed' | 'failed' | 'cancelled';

/** Status values a single contestant can carry. */
export type ContestantStatus = 'queued' | 'running' | 'done' | 'error';

/** Lane a contestant is assigned to: `local` or `cloud`. */
export type ContestantLane = 'local' | 'cloud';
|
||||
|
||||
/**
 * Pane state — carried on the WorkspacePane row, mirrors OrchestratorState.
 */
export interface ArenaState {
  /** Id of the battle this pane state refers to. */
  battle_id: string;
  /** Kind of battle (`'coding'` or `'qa'`). */
  battle_type: BattleType;
  /** Prompt text associated with the battle. */
  prompt: string;
}
|
||||
|
||||
/**
 * Wire shape of a battle record.
 *
 * Nullable fields are always present on the object; they hold `null` rather
 * than being omitted.
 */
export interface BattleShape {
  id: string;
  project_id: string;
  battle_type: BattleType;
  prompt: string;
  status: BattleStatus;
  /** Id of the winning contestant, or `null` when none is recorded. */
  winner_contestant_id: string | null;
  /** Location of the battle's results, or `null`. */
  results_path: string | null;
  /** Error message for the battle, or `null`. */
  error: string | null;
  /** Creation timestamp as a string (format not specified by this type). */
  created_at: string;
  /** Last-update timestamp as a string (format not specified by this type). */
  updated_at: string;
}
|
||||
|
||||
/**
 * Wire shape of a contestant record belonging to a battle.
 *
 * Nullable fields are always present on the object; they hold `null` rather
 * than being omitted.
 */
export interface ContestantShape {
  id: string;
  /** Id of the battle this contestant belongs to. */
  battle_id: string;
  /** Backend name (coding) or persona name (qa). Unique per (battle, model) pair. */
  identity: string;
  model: string;
  lane: ContestantLane;
  /** Id of an associated task, or `null`. */
  task_id: string | null;
  /** Id of an associated worktree, or `null`. */
  worktree_id: string | null;
  status: ContestantStatus;
  /** Duration in milliseconds, or `null`. */
  duration_ms: number | null;
  /** Throughput in tokens per second, or `null`. */
  tokens_per_sec: number | null;
  /** Token cost, or `null`. */
  cost_tokens: number | null;
  /** Location of this contestant's result, or `null`. */
  result_path: string | null;
  /** Error message for this contestant, or `null`. */
  error: string | null;
  /** Creation timestamp as a string (format not specified by this type). */
  created_at: string;
  /** Last-update timestamp as a string (format not specified by this type). */
  updated_at: string;
}
|
||||
|
||||
/** Wire shape of a cross-examination record attached to a battle. */
export interface CrossExaminationShape {
  id: string;
  /** Id of the battle being cross-examined. */
  battle_id: string;
  /** Backend + model performing the cross-examination. */
  identity: string;
  model: string;
  /** Verdict text, or `null`. */
  verdict: string | null;
  /** Creation timestamp as a string (format not specified by this type). */
  created_at: string;
}
|
||||
@@ -358,6 +358,53 @@ export const FlowRunStepUpdatedFrame = z.object({
|
||||
report: z.string().optional(),
|
||||
});
|
||||
|
||||
// ---- arena frames ----------------------------------------------------------
|
||||
|
||||
/**
 * One roster entry in a `battle_started` frame: the contestant's id,
 * identity, model and lane. `identity` and `model` must be non-empty strings.
 */
const ContestantManifestEntry = z.object({
  id: Uuid,
  identity: z.string().min(1),
  model: z.string().min(1),
  lane: z.enum(['local', 'cloud']),
});
|
||||
|
||||
/**
 * Published once when a battle starts. Carries the contestant roster so the
 * ArenaPane can build its grid immediately.
 *
 * All fields are required; `prompt` may be an empty string and `contestants`
 * has no length constraint at the schema level.
 */
export const BattleStartedFrame = z.object({
  type: z.literal('battle_started'),
  battle_id: Uuid,
  battle_type: z.enum(['coding', 'qa']),
  prompt: z.string(),
  contestants: z.array(ContestantManifestEntry),
});
|
||||
|
||||
/**
 * Published on every contestant status change or streaming update.
 * `delta` carries the latest chunk of streaming output while status='running'.
 * `battle_status` is present only on the final transition that closes the battle.
 *
 * Only `type`, `battle_id` and `contestant_id` are required; every other
 * field is optional.
 */
export const ContestantUpdatedFrame = z.object({
  type: z.literal('contestant_updated'),
  battle_id: Uuid,
  contestant_id: Uuid,
  status: z.enum(['queued', 'running', 'done', 'error']).optional(),
  // Must be a non-negative integer when present.
  duration_ms: z.number().int().nonnegative().optional(),
  // Non-negative; fractional values are allowed.
  tokens_per_sec: z.number().nonnegative().optional(),
  battle_status: z.enum(['pending', 'running', 'completed', 'failed', 'cancelled']).optional(),
  delta: z.string().optional(),
  error: z.string().optional(),
});
|
||||
|
||||
/**
 * Published when battle-level state changes that don't ride on a contestant
 * update: analysis finished, winner set, cross-exam verdict ready. The pane
 * uses this to update its analysis panel and winner badge without a refetch.
 * Fields are all optional — publishers include only what changed.
 *
 * `type` and `battle_id` are the exceptions: both are always required.
 */
export const BattleUpdatedFrame = z.object({
  type: z.literal('battle_updated'),
  battle_id: Uuid,
  status: z.enum(['pending', 'running', 'completed', 'failed', 'cancelled']).optional(),
  // Accepts a UUID, an explicit `null`, or omission.
  winner_contestant_id: Uuid.nullable().optional(),
  analysis_ready: z.boolean().optional(),
  cross_exam_id: Uuid.optional(),
});
|
||||
|
||||
// ---- discriminated union ---------------------------------------------------
|
||||
|
||||
export const WsFrameSchema = z.discriminatedUnion('type', [
|
||||
@@ -381,6 +428,10 @@ export const WsFrameSchema = z.discriminatedUnion('type', [
|
||||
// orchestrator
|
||||
FlowRunStartedFrame,
|
||||
FlowRunStepUpdatedFrame,
|
||||
// arena
|
||||
BattleStartedFrame,
|
||||
ContestantUpdatedFrame,
|
||||
BattleUpdatedFrame,
|
||||
// per-user
|
||||
ChatStatusFrame,
|
||||
SessionUpdatedFrame,
|
||||
@@ -425,6 +476,9 @@ export const KNOWN_FRAME_TYPES: readonly WsFrame['type'][] = [
|
||||
'agent_status_updated',
|
||||
'flow_run_started',
|
||||
'flow_run_step_updated',
|
||||
'battle_started',
|
||||
'contestant_updated',
|
||||
'battle_updated',
|
||||
'chat_status',
|
||||
'session_updated',
|
||||
'session_renamed',
|
||||
|
||||
Reference in New Issue
Block a user