feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt

Arena is a new pane kind for competitive AI evaluation. A Battle runs
the same prompt against 2-6 Contestants across two concurrent lanes:
local lane (llama-swap models, serial) and cloud lane (parallel).

Added to all three registries: @boocode/contracts WsFrameSchema,
server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results
  writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore
This commit is contained in:
2026-06-06 23:25:29 +00:00
parent 84a024a5a4
commit 3474be4865
34 changed files with 4581 additions and 146 deletions

View File

@@ -54,9 +54,6 @@ DO $$ BEGIN
END IF;
END $$;
-- v2.0.5: arena support — group tasks into competitive arenas.
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS arena_id UUID;
-- Human inbox: tasks needing attention
CREATE OR REPLACE VIEW human_inbox AS
SELECT * FROM tasks WHERE state IN ('blocked', 'failed');
@@ -81,6 +78,7 @@ ALTER TABLE tasks ADD COLUMN IF NOT EXISTS thinking_option_id TEXT;
DROP VIEW IF EXISTS human_inbox;
ALTER TABLE tasks DROP COLUMN IF EXISTS feature_values;
ALTER TABLE tasks DROP COLUMN IF EXISTS worktree_path;
ALTER TABLE tasks DROP COLUMN IF EXISTS arena_id;
CREATE OR REPLACE VIEW human_inbox AS
SELECT * FROM tasks WHERE state IN ('blocked', 'failed');
@@ -157,7 +155,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS worktrees_active_path_uidx ON worktrees(path)
DROP TABLE IF EXISTS session_worktrees;
-- Dispatch hint: which chat (tab) a task belongs to. The coder message route and
-- skills route set it from the frontend tab; session-less creators (arena, MCP,
-- skills route set it from the frontend tab; session-less creators (MCP,
-- new_task, generic /api/tasks) leave it NULL and the dispatcher creates a chat.
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS chat_id UUID REFERENCES chats(id) ON DELETE SET NULL;
@@ -271,7 +269,7 @@ ALTER TABLE agent_sessions ADD CONSTRAINT agent_sessions_backend_chk
CHECK (backend IN ('opencode_server', 'acp_warm', 'claude_sdk'));
-- LISTEN/NOTIFY fast path: every tasks INSERT (from any call site — routes,
-- new_task tool, arena, MCP server) fires pg_notify('tasks_new') in the same
-- new_task tool, MCP server) fires pg_notify('tasks_new') in the same
-- transaction, so the dispatcher reacts immediately instead of waiting for the
-- fallback poll. Postgres holds the notification until COMMIT, so the listener
-- always sees the committed row. A trigger covers all insert paths with no
@@ -357,3 +355,71 @@ DO $$ BEGIN
CHECK (status IN ('pending', 'running', 'completed', 'failed', 'skipped', 'cancelled'));
END IF;
END $$;
-- Arena: battles + contestants + cross_examinations.
-- project_id carries no FK (matches tasks.project_id + flow_runs.project_id convention).
-- winner_contestant_id FK is deferred (forward reference): added via guarded ALTER below.
CREATE TABLE IF NOT EXISTS battles (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
project_id UUID NOT NULL,
battle_type TEXT NOT NULL,
prompt TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'pending',
winner_contestant_id UUID,
results_path TEXT,
error TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
CONSTRAINT battles_type_chk CHECK (battle_type IN ('coding', 'qa')),
CONSTRAINT battles_status_chk CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
CREATE TABLE IF NOT EXISTS contestants (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
battle_id UUID NOT NULL REFERENCES battles(id) ON DELETE CASCADE,
identity TEXT NOT NULL,
model TEXT NOT NULL,
lane TEXT NOT NULL,
task_id UUID REFERENCES tasks(id) ON DELETE SET NULL,
worktree_id UUID REFERENCES worktrees(id) ON DELETE SET NULL,
status TEXT NOT NULL DEFAULT 'queued',
duration_ms INTEGER,
tokens_per_sec DOUBLE PRECISION,
cost_tokens INTEGER,
result_path TEXT,
error TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
CONSTRAINT contestants_lane_chk CHECK (lane IN ('local', 'cloud')),
CONSTRAINT contestants_status_chk CHECK (status IN ('queued', 'running', 'done', 'error')),
UNIQUE (battle_id, identity, model)
);
CREATE TABLE IF NOT EXISTS cross_examinations (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
battle_id UUID NOT NULL REFERENCES battles(id) ON DELETE CASCADE,
identity TEXT NOT NULL,
model TEXT NOT NULL,
verdict TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
);
-- Add the winner FK now that contestants exists.
DO $$ BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'battles_winner_contestant_id_fkey') THEN
ALTER TABLE battles ADD CONSTRAINT battles_winner_contestant_id_fkey
FOREIGN KEY (winner_contestant_id) REFERENCES contestants(id) ON DELETE SET NULL;
END IF;
END $$;
-- battles query (GET /api/battles?project_id=).
CREATE INDEX IF NOT EXISTS battles_project_created_idx ON battles(project_id, created_at DESC);
-- Lane-scheduler advance scans (contestants WHERE battle_id = ? AND status = ?).
CREATE INDEX IF NOT EXISTS contestants_battle_status_idx ON contestants(battle_id, status);
-- onTaskTerminal callback: look up the contestant owning a completed task.
CREATE INDEX IF NOT EXISTS contestants_task_id_idx ON contestants(task_id);
-- Cross-examination listing per battle.
CREATE INDEX IF NOT EXISTS cross_examinations_battle_idx ON cross_examinations(battle_id);