feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -310,6 +310,9 @@ export function createDispatcher(deps: Deps): {
|
||||
const taskId = task.id;
|
||||
log.info({ taskId }, 'dispatcher: starting task (path A — native)');
|
||||
|
||||
// Declared before try so the catch block can write it back on the task row.
|
||||
let chatId: string | null = null;
|
||||
|
||||
try {
|
||||
// Mark running
|
||||
await sql`
|
||||
@@ -318,26 +321,29 @@ export function createDispatcher(deps: Deps): {
|
||||
WHERE id = ${taskId}
|
||||
`;
|
||||
|
||||
// Create session + chat for this task
|
||||
// Session setup: reuse a pre-created session (e.g. Q&A arena contestants
|
||||
// whose persona is stamped on the session via agent_id) or create a fresh one.
|
||||
const model = task.model ?? config.DEFAULT_MODEL;
|
||||
const sessionName = 'Task: ' + task.input.slice(0, 40);
|
||||
|
||||
const [session] = await sql<{ id: string }[]>`
|
||||
INSERT INTO sessions (project_id, name, model, status)
|
||||
VALUES (${task.project_id}, ${sessionName}, ${model}, 'open')
|
||||
RETURNING id
|
||||
`;
|
||||
const sessionId = session!.id;
|
||||
let sessionId: string;
|
||||
if (task.session_id) {
|
||||
sessionId = task.session_id;
|
||||
} else {
|
||||
const sessionName = 'Task: ' + task.input.slice(0, 40);
|
||||
const [session] = await sql<{ id: string }[]>`
|
||||
INSERT INTO sessions (project_id, name, model, status)
|
||||
VALUES (${task.project_id}, ${sessionName}, ${model}, 'open')
|
||||
RETURNING id
|
||||
`;
|
||||
sessionId = session!.id;
|
||||
await sql`UPDATE tasks SET session_id = ${sessionId} WHERE id = ${taskId}`;
|
||||
}
|
||||
|
||||
const [chat] = await sql<{ id: string }[]>`
|
||||
INSERT INTO chats (session_id, name, status)
|
||||
VALUES (${sessionId}, 'Task execution', 'open')
|
||||
RETURNING id
|
||||
`;
|
||||
const chatId = chat!.id;
|
||||
|
||||
// Link task to session
|
||||
await sql`UPDATE tasks SET session_id = ${sessionId} WHERE id = ${taskId}`;
|
||||
chatId = chat!.id;
|
||||
|
||||
// Create user message + streaming assistant
|
||||
await sql<{ id: string }[]>`
|
||||
@@ -382,7 +388,7 @@ export function createDispatcher(deps: Deps): {
|
||||
const summary = (msg?.content ?? '').slice(0, 500);
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET state = 'completed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}
|
||||
SET state = 'completed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}, chat_id = ${chatId}
|
||||
WHERE id = ${taskId}
|
||||
`;
|
||||
log.info({ taskId, costTokens }, 'dispatcher: task completed (native)');
|
||||
@@ -409,7 +415,7 @@ export function createDispatcher(deps: Deps): {
|
||||
const summary = (msg?.content ?? 'Inference failed').slice(0, 500);
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}, chat_id = ${chatId}
|
||||
WHERE id = ${taskId}
|
||||
`;
|
||||
log.warn({ taskId, finalStatus }, 'dispatcher: task failed (native)');
|
||||
@@ -419,7 +425,7 @@ export function createDispatcher(deps: Deps): {
|
||||
log.error({ taskId, err: errMsg }, 'dispatcher: task error (native)');
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}, chat_id = ${chatId}
|
||||
WHERE id = ${taskId}
|
||||
`.catch(() => {});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user