feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,136 +1,412 @@
|
||||
/**
|
||||
* v2.0.5: Arena routes — competitive dispatch of the same task to multiple agents.
|
||||
* Arena routes — HTTP surface for the Battle UI.
|
||||
*
|
||||
* POST /api/arena — create an arena with 2-5 contestants
|
||||
* GET /api/arena/:id — get all tasks in an arena
|
||||
* POST /api/arena/:id/select/:task_id — mark a task as the arena winner
|
||||
* POST /api/battles — launch a battle
|
||||
* GET /api/battles?project_id= — list battles for a project
|
||||
* GET /api/battles/:id — one battle + contestants + cross-exams
|
||||
* POST /api/battles/:id/stop — cancel a running battle
|
||||
* POST /api/battles/:id/analyze — trigger analysis (Phase 5 fills the logic)
|
||||
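* POST /api/battles/:id/cross-examine — start a cross-examination (Phase 5 fills the logic)
* POST /api/battles/generate-prompt — draft a battle prompt from a short description
* GET /api/battles/:id/analysis — read analysis.md from the battle's results folder
* PATCH /api/battles/:id/winner — manually set or clear the winner
* GET /api/battles/:id/contestants/:cid/diff — read diff.patch for one contestant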
|
||||
*
|
||||
* Mirrors the shape of runs.ts (Orchestrator routes). Battle creation delegates to
|
||||
* the battle-runner; cancellation calls cancelBattle then aborts in-flight tasks
|
||||
* via the dispatcher's cancelExternalTask.
|
||||
*/
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import { z } from 'zod';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type { Sql } from '../db.js';
|
||||
import type { Config } from '../config.js';
|
||||
import type { BattleRunner } from '../services/arena-runner.js';
|
||||
import type { ExternalCancelFn } from './tasks.js';
|
||||
import { arenaModelCall } from '../services/arena-model-call.js';
|
||||
|
||||
const ContestantSchema = z.object({
|
||||
agent: z.string().max(100).optional(),
|
||||
model: z.string().max(200).optional(),
|
||||
mode_id: z.string().max(200).optional(),
|
||||
thinking_option_id: z.string().max(200).optional(),
|
||||
// ─── Validation schemas ───────────────────────────────────────────────────────
|
||||
|
||||
// Validates that a path parameter is a UUID string; the route handlers run it
// via safeParse on req.params before using the value in a query.
const UuidParam = z.string().uuid();
|
||||
|
||||
// One contestant entry in a battle-creation request: `identity` and `model`
// must both be non-empty strings of at most 200 characters.
const ContestantInput = z.object({
|
||||
identity: z.string().min(1).max(200),
|
||||
model: z.string().min(1).max(200),
|
||||
});
|
||||
|
||||
const CreateArenaBody = z.object({
|
||||
const CreateBattleBody = z.object({
|
||||
project_id: z.string().uuid(),
|
||||
input: z.string().min(1).max(64_000),
|
||||
contestants: z.array(ContestantSchema).min(2).max(5),
|
||||
battle_type: z.enum(['coding', 'qa']),
|
||||
prompt: z.string().min(1).max(64_000),
|
||||
contestants: z
|
||||
.array(ContestantInput)
|
||||
.min(2, 'at least 2 contestants required')
|
||||
.max(6, 'at most 6 contestants allowed'),
|
||||
});
|
||||
|
||||
interface TaskRow {
|
||||
id: string;
|
||||
agent: string | null;
|
||||
model: string | null;
|
||||
mode_id: string | null;
|
||||
thinking_option_id: string | null;
|
||||
state: string;
|
||||
}
|
||||
// Query string for GET /api/battles: a required `project_id` that must be a UUID.
const ListBattlesQuery = z.object({
|
||||
project_id: z.string().uuid(),
|
||||
});
|
||||
|
||||
export function registerArenaRoutes(app: FastifyInstance, sql: Sql): void {
|
||||
// POST /api/arena — create a new arena
|
||||
app.post('/api/arena', async (req, reply) => {
|
||||
const parsed = CreateArenaBody.safeParse(req.body);
|
||||
// Request body for POST /api/battles/:id/cross-examine: the `identity` and
// `model` passed on to the battle runner's startCrossExam (both non-empty,
// at most 200 characters).
const CrossExamineBody = z.object({
|
||||
identity: z.string().min(1).max(200),
|
||||
model: z.string().min(1).max(200),
|
||||
});
|
||||
|
||||
// Request body for PATCH /api/battles/:id/winner. `winner_contestant_id` is a
// contestant UUID or an explicit null (the route describes null as clearing
// the winner); the key itself is required.
const SetWinnerBody = z.object({
|
||||
winner_contestant_id: z.string().uuid().nullable(),
|
||||
});
|
||||
|
||||
// ─── Route registration ───────────────────────────────────────────────────────
|
||||
|
||||
// Request body for POST /api/battles/generate-prompt: a short, non-empty
// `description` (at most 2,000 characters) that is sent to the model as the
// user message.
const GeneratePromptBody = z.object({
|
||||
description: z.string().min(1).max(2_000),
|
||||
});
|
||||
|
||||
export function registerArenaRoutes(
|
||||
app: FastifyInstance,
|
||||
sql: Sql,
|
||||
battleRunner: BattleRunner,
|
||||
cancelExternal: ExternalCancelFn,
|
||||
config: Config,
|
||||
): void {
|
||||
|
||||
// POST /api/battles/generate-prompt — draft a fuller battle prompt from a
|
||||
// short description using the default BooChat model. One-shot, non-streaming.
|
||||
// Must be registered BEFORE /api/battles/:id so the literal 'generate-prompt'
|
||||
// path is not mistaken for a UUID param.
|
||||
app.post('/api/battles/generate-prompt', async (req, reply) => {
|
||||
const parsed = GeneratePromptBody.safeParse(req.body);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid body', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const { project_id, input, contestants } = parsed.data;
|
||||
const arenaId = crypto.randomUUID();
|
||||
const { description } = parsed.data;
|
||||
|
||||
const tasks: TaskRow[] = [];
|
||||
for (const contestant of contestants) {
|
||||
const [task] = await sql<TaskRow[]>`
|
||||
INSERT INTO tasks (project_id, input, agent, model, mode_id, thinking_option_id, arena_id)
|
||||
VALUES (
|
||||
${project_id},
|
||||
${input},
|
||||
${contestant.agent ?? null},
|
||||
${contestant.model ?? null},
|
||||
${contestant.mode_id ?? null},
|
||||
${contestant.thinking_option_id ?? null},
|
||||
${arenaId}
|
||||
)
|
||||
RETURNING id, agent, model, mode_id, thinking_option_id, state
|
||||
`;
|
||||
tasks.push(task!);
|
||||
try {
|
||||
const prompt = await arenaModelCall({
|
||||
config,
|
||||
model: config.DEFAULT_MODEL,
|
||||
system: [
|
||||
'You are a battle-prompt writer for an AI Arena.',
|
||||
'The user gives you a short description of a coding or Q&A challenge.',
|
||||
'Expand it into a clear, self-contained prompt (2–6 sentences) that any AI model can act on.',
|
||||
'Include specific acceptance criteria where helpful.',
|
||||
'Output ONLY the prompt — no preamble, no labels, no meta-commentary.',
|
||||
].join(' '),
|
||||
user: description,
|
||||
maxTokens: 400,
|
||||
temperature: 0.6,
|
||||
});
|
||||
return { prompt };
|
||||
} catch (err) {
|
||||
app.log.warn(
|
||||
{ err: err instanceof Error ? err.message : String(err) },
|
||||
'arena generate-prompt: model call failed',
|
||||
);
|
||||
reply.code(502);
|
||||
return { error: 'model call failed' };
|
||||
}
|
||||
});
|
||||
|
||||
// POST /api/battles — launch a battle
|
||||
app.post('/api/battles', async (req, reply) => {
|
||||
const parsed = CreateBattleBody.safeParse(req.body);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid body', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const { project_id, battle_type, prompt, contestants } = parsed.data;
|
||||
|
||||
// Reject duplicate (identity, model) pairs up front — the schema UNIQUE
|
||||
// constraint would catch it too, but an early 422 is friendlier.
|
||||
const seen = new Set<string>();
|
||||
for (const c of contestants) {
|
||||
const key = `${c.identity}::${c.model}`;
|
||||
if (seen.has(key)) {
|
||||
reply.code(422);
|
||||
return {
|
||||
error: 'duplicate_contestant',
|
||||
message: `duplicate contestant: identity="${c.identity}" model="${c.model}"`,
|
||||
};
|
||||
}
|
||||
seen.add(key);
|
||||
}
|
||||
|
||||
// Verify project exists
|
||||
const [proj] = await sql<{ id: string }[]>`SELECT id FROM projects WHERE id = ${project_id}`;
|
||||
if (!proj) {
|
||||
reply.code(404);
|
||||
return { error: 'project not found' };
|
||||
}
|
||||
|
||||
const { battleId } = await battleRunner.startBattle({
|
||||
projectId: project_id,
|
||||
battleType: battle_type,
|
||||
prompt,
|
||||
contestants,
|
||||
});
|
||||
|
||||
reply.code(201);
|
||||
return {
|
||||
arena_id: arenaId,
|
||||
tasks: tasks.map((t) => ({
|
||||
id: t.id,
|
||||
agent: t.agent,
|
||||
model: t.model,
|
||||
mode_id: t.mode_id,
|
||||
thinking_option_id: t.thinking_option_id,
|
||||
state: t.state,
|
||||
})),
|
||||
};
|
||||
return { battle_id: battleId };
|
||||
});
|
||||
|
||||
// GET /api/arena/:arena_id — list all tasks in an arena
|
||||
app.get<{ Params: { arena_id: string } }>('/api/arena/:arena_id', async (req, reply) => {
|
||||
const { arena_id } = req.params;
|
||||
|
||||
// Validate UUID format
|
||||
const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
||||
if (!uuidRegex.test(arena_id)) {
|
||||
// GET /api/battles?project_id= — list battles, most-recent-first
|
||||
app.get('/api/battles', async (req, reply) => {
|
||||
const parsed = ListBattlesQuery.safeParse(req.query);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid arena_id format' };
|
||||
return { error: 'invalid query', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const tasks = await sql`
|
||||
SELECT id, project_id, state, input, output_summary, agent, model, mode_id, thinking_option_id, execution_path, session_id, started_at, ended_at, created_at, arena_id
|
||||
FROM tasks
|
||||
WHERE arena_id = ${arena_id}
|
||||
ORDER BY created_at
|
||||
const battles = await sql`
|
||||
SELECT id, project_id, battle_type, prompt, status,
|
||||
winner_contestant_id, results_path, error,
|
||||
created_at, updated_at
|
||||
FROM battles
|
||||
WHERE project_id = ${parsed.data.project_id}
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 100
|
||||
`;
|
||||
|
||||
if (tasks.length === 0) {
|
||||
reply.code(404);
|
||||
return { error: 'arena not found' };
|
||||
}
|
||||
|
||||
return { arena_id, tasks };
|
||||
return { battles };
|
||||
});
|
||||
|
||||
// POST /api/arena/:arena_id/select/:task_id — mark the winner
|
||||
app.post<{ Params: { arena_id: string; task_id: string } }>(
|
||||
'/api/arena/:arena_id/select/:task_id',
|
||||
async (req, reply) => {
|
||||
const { arena_id, task_id } = req.params;
|
||||
|
||||
// Verify the task belongs to this arena
|
||||
const rows = await sql<{ id: string; state: string; arena_id: string | null }[]>`
|
||||
SELECT id, state, arena_id FROM tasks WHERE id = ${task_id}
|
||||
`;
|
||||
|
||||
if (rows.length === 0) {
|
||||
reply.code(404);
|
||||
return { error: 'task not found' };
|
||||
}
|
||||
|
||||
const task = rows[0]!;
|
||||
if (task.arena_id !== arena_id) {
|
||||
reply.code(409);
|
||||
return { error: 'task does not belong to this arena' };
|
||||
}
|
||||
|
||||
// Mark as selected via output_summary prefix (lightweight — no schema change)
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET output_summary = COALESCE('[SELECTED] ' || output_summary, '[SELECTED]')
|
||||
WHERE id = ${task_id}
|
||||
`;
|
||||
|
||||
return { selected: true, task_id, arena_id };
|
||||
// GET /api/battles/:id — one battle + its contestants + cross-examinations
|
||||
app.get<{ Params: { id: string } }>('/api/battles/:id', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
if (!parsedId.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
);
|
||||
const id = parsedId.data;
|
||||
|
||||
const [battle] = await sql<{
|
||||
id: string;
|
||||
project_id: string;
|
||||
battle_type: string;
|
||||
prompt: string;
|
||||
status: string;
|
||||
winner_contestant_id: string | null;
|
||||
results_path: string | null;
|
||||
error: string | null;
|
||||
created_at: unknown;
|
||||
updated_at: unknown;
|
||||
}[]>`
|
||||
SELECT id, project_id, battle_type, prompt, status,
|
||||
winner_contestant_id, results_path, error,
|
||||
created_at, updated_at
|
||||
FROM battles WHERE id = ${id}
|
||||
`;
|
||||
|
||||
if (!battle) {
|
||||
reply.code(404);
|
||||
return { error: 'battle not found' };
|
||||
}
|
||||
|
||||
const contestants = await sql`
|
||||
SELECT id, battle_id, identity, model, lane, task_id, worktree_id,
|
||||
status, duration_ms, tokens_per_sec, cost_tokens, result_path, error,
|
||||
created_at, updated_at
|
||||
FROM contestants
|
||||
WHERE battle_id = ${id}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
|
||||
const crossExaminations = await sql`
|
||||
SELECT id, battle_id, identity, model, verdict, created_at
|
||||
FROM cross_examinations
|
||||
WHERE battle_id = ${id}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
|
||||
return { battle, contestants, cross_examinations: crossExaminations };
|
||||
});
|
||||
|
||||
// POST /api/battles/:id/stop — cancel a running battle.
//
// Responses:
//   400 — the :id path parameter is not a UUID
//   404 — no battle row with that id
//   409 — the battle is not in status 'running', either on the initial read
//         or by the time cancelBattle reports back
//   200 — { cancelled: true }
app.post<{ Params: { id: string } }>('/api/battles/:id/stop', async (req, reply) => {
  // Sets the status code and builds the error payload in one step.
  const fail = (status: number, error: string) => {
    reply.code(status);
    return { error };
  };

  const idCheck = UuidParam.safeParse(req.params.id);
  if (!idCheck.success) return fail(400, 'invalid id');
  const battleId = idCheck.data;

  const found = await sql<{ id: string; status: string }[]>`
    SELECT id, status FROM battles WHERE id = ${battleId}
  `;
  const battle = found[0];
  if (!battle) return fail(404, 'battle not found');
  if (battle.status !== 'running') {
    return fail(409, `cannot stop battle in status '${battle.status}'`);
  }

  // The status may have changed between the read above and this call; the
  // runner's `cancelled` flag is the authoritative answer.
  const outcome = await battleRunner.cancelBattle(battleId);
  if (!outcome.cancelled) return fail(409, 'battle is no longer running');

  // Abort any in-flight dispatcher tasks (cloud contestants running externally).
  for (const inFlightTaskId of outcome.taskIds) {
    cancelExternal(inFlightTaskId);
  }

  return { cancelled: true };
});
|
||||
|
||||
// GET /api/battles/:id/analysis — read analysis.md from the battle's results_path.
//
// Responses:
//   400 — the :id path parameter is not a UUID
//   404 — battle not found, results_path not set yet, or analysis.md unreadable
//   200 — { text } with the file contents decoded as UTF-8
//
// A missing file (ENOENT) is the expected "not ready yet" case and stays
// silent. Any other read failure (permissions, path is a directory, I/O error)
// still answers 404 so clients see the same contract, but is logged so it is
// not mistaken for "analysis has not been written yet".
app.get<{ Params: { id: string } }>('/api/battles/:id/analysis', async (req, reply) => {
  const parsedId = UuidParam.safeParse(req.params.id);
  if (!parsedId.success) {
    reply.code(400);
    return { error: 'invalid id' };
  }
  const id = parsedId.data;

  const [row] = await sql<{ results_path: string | null }[]>`
    SELECT results_path FROM battles WHERE id = ${id}
  `;
  if (!row) {
    reply.code(404);
    return { error: 'battle not found' };
  }
  if (!row.results_path) {
    reply.code(404);
    return { error: 'analysis not ready' };
  }

  try {
    const text = await readFile(join(row.results_path, 'analysis.md'), 'utf8');
    return { text };
  } catch (err) {
    // Node attaches a string `code` to filesystem errors; only ENOENT is routine.
    const code =
      err instanceof Error && 'code' in err ? (err as NodeJS.ErrnoException).code : undefined;
    if (code !== 'ENOENT') {
      app.log.warn(
        { err: err instanceof Error ? err.message : String(err), battleId: id, code },
        'arena analysis: failed to read analysis.md',
      );
    }
    reply.code(404);
    return { error: 'analysis not ready' };
  }
});
|
||||
|
||||
// POST /api/battles/:id/analyze — trigger or re-trigger analysis.
//
// Responses:
//   400 — the :id path parameter is not a UUID
//   404 — no battle row, or the runner reports that nothing was triggered
//   409 — the battle is still in status 'running'
//   202 — { triggered: true }
app.post<{ Params: { id: string } }>('/api/battles/:id/analyze', async (req, reply) => {
  // Sets the status code and builds the error payload in one step.
  const fail = (status: number, error: string) => {
    reply.code(status);
    return { error };
  };

  const idCheck = UuidParam.safeParse(req.params.id);
  if (!idCheck.success) return fail(400, 'invalid id');
  const battleId = idCheck.data;

  const found = await sql<{ id: string; status: string }[]>`
    SELECT id, status FROM battles WHERE id = ${battleId}
  `;
  const battle = found[0];
  if (!battle) return fail(404, 'battle not found');
  if (battle.status === 'running') {
    return fail(409, 'battle is still running — wait for all contestants to finish');
  }

  const outcome = await battleRunner.triggerAnalysis(battleId);
  if (!outcome.triggered) return fail(404, 'battle not found');

  reply.code(202);
  return { triggered: true };
});
|
||||
|
||||
// PATCH /api/battles/:id/winner — manually set or clear the winner.
//
// The body carries `winner_contestant_id` (a contestant UUID, or null). The
// actual update is delegated to battleRunner.setWinner, whose result flags are
// mapped onto HTTP statuses here:
//   400 — malformed :id or body
//   404 — runner reports `notFound`
//   422 — runner reports `invalidContestant` (contestant not in this battle)
//   500 — runner failed without a recognised flag
//   200 — { ok: true }
// NOTE(review): the original comment says the runner also publishes
// battle_updated so the pane badge refreshes; that is not visible from this
// handler — confirm in the runner.
app.patch<{ Params: { id: string } }>('/api/battles/:id/winner', async (req, reply) => {
  const idCheck = UuidParam.safeParse(req.params.id);
  if (!idCheck.success) {
    reply.code(400);
    return { error: 'invalid id' };
  }

  const bodyCheck = SetWinnerBody.safeParse(req.body);
  if (!bodyCheck.success) {
    reply.code(400);
    return { error: 'invalid body', details: bodyCheck.error.flatten() };
  }

  const outcome = await battleRunner.setWinner(
    idCheck.data,
    bodyCheck.data.winner_contestant_id,
  );
  if (outcome.ok) {
    return { ok: true };
  }

  // Failure path: translate the runner's flag into a status code.
  if (outcome.notFound) {
    reply.code(404);
    return { error: 'battle not found' };
  }
  if (outcome.invalidContestant) {
    reply.code(422);
    return { error: 'contestant not found in this battle' };
  }
  reply.code(500);
  return { error: 'unknown error' };
});
|
||||
|
||||
// GET /api/battles/:id/contestants/:cid/diff — read the diff.patch for a coding contestant.
//
// The contestant is looked up by BOTH its own id and the battle id, so a
// contestant id from another battle yields 404 rather than leaking its diff.
//
// Responses:
//   400 — :id or :cid is not a UUID
//   404 — contestant not found, result_path not set, or diff.patch unreadable
//   200 — { diff } with the patch text decoded as UTF-8
//
// A missing file (ENOENT) is the expected "no diff" case and stays silent. Any
// other read failure still answers 404 so clients see the same contract, but
// is logged so real filesystem problems are not hidden.
app.get<{ Params: { id: string; cid: string } }>('/api/battles/:id/contestants/:cid/diff', async (req, reply) => {
  const parsedId = UuidParam.safeParse(req.params.id);
  const parsedCid = UuidParam.safeParse(req.params.cid);
  if (!parsedId.success || !parsedCid.success) {
    reply.code(400);
    return { error: 'invalid id' };
  }

  const [contestant] = await sql<{ result_path: string | null }[]>`
    SELECT result_path FROM contestants
    WHERE id = ${parsedCid.data} AND battle_id = ${parsedId.data}
  `;
  if (!contestant) {
    reply.code(404);
    return { error: 'contestant not found' };
  }
  if (!contestant.result_path) {
    reply.code(404);
    return { error: 'diff not available' };
  }

  try {
    const text = await readFile(join(contestant.result_path, 'diff.patch'), 'utf8');
    return { diff: text };
  } catch (err) {
    // Node attaches a string `code` to filesystem errors; only ENOENT is routine.
    const code =
      err instanceof Error && 'code' in err ? (err as NodeJS.ErrnoException).code : undefined;
    if (code !== 'ENOENT') {
      app.log.warn(
        {
          err: err instanceof Error ? err.message : String(err),
          battleId: parsedId.data,
          contestantId: parsedCid.data,
          code,
        },
        'arena diff: failed to read diff.patch',
      );
    }
    reply.code(404);
    return { error: 'diff not available' };
  }
});
|
||||
|
||||
// POST /api/battles/:id/cross-examine — start a cross-examination.
//
// The body names the examiner (`identity` + `model`); both are forwarded to
// battleRunner.startCrossExam.
//
// Responses:
//   400 — malformed :id or body
//   404 — no battle row with that id
//   409 — the battle is still in status 'running'
//   202 — { cross_exam_id }
app.post<{ Params: { id: string } }>('/api/battles/:id/cross-examine', async (req, reply) => {
  const idCheck = UuidParam.safeParse(req.params.id);
  if (!idCheck.success) {
    reply.code(400);
    return { error: 'invalid id' };
  }
  const battleId = idCheck.data;

  const bodyCheck = CrossExamineBody.safeParse(req.body);
  if (!bodyCheck.success) {
    reply.code(400);
    return { error: 'invalid body', details: bodyCheck.error.flatten() };
  }
  const { identity, model } = bodyCheck.data;

  const found = await sql<{ id: string; status: string }[]>`
    SELECT id, status FROM battles WHERE id = ${battleId}
  `;
  const battle = found[0];
  if (!battle) {
    reply.code(404);
    return { error: 'battle not found' };
  }
  if (battle.status === 'running') {
    reply.code(409);
    return { error: 'battle is still running — cross-examine after all contestants finish' };
  }

  const started = await battleRunner.startCrossExam(battleId, { identity, model });

  reply.code(202);
  return { cross_exam_id: started.crossExamId };
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user