From d6d246c15b7abb44b84c1e5a37351e682d878da9 Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Sat, 6 Jun 2026 23:25:29 +0000 Subject: [PATCH] =?UTF-8?q?feat(web,coder):=20arena=20pane=20=E2=80=94=20c?= =?UTF-8?q?ompare=202-6=20AI=20competitors=20on=20same=20prompt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 --- .gitignore | 1 + CLAUDE.md | 2 +- CONTEXT.md | 67 ++ apps/coder/src/index.ts | 113 ++- apps/coder/src/routes/arena.ts | 482 ++++++++-- apps/coder/src/schema.sql | 76 +- .../__tests__/arena-analyzer-helpers.test.ts | 254 +++++ .../__tests__/arena-decisions.test.ts | 332 +++++++ .../src/services/arena-analyzer-helpers.ts | 191 ++++ apps/coder/src/services/arena-analyzer.ts | 496 ++++++++++ apps/coder/src/services/arena-decisions.ts | 186 ++++ apps/coder/src/services/arena-model-call.ts | 70 ++ apps/coder/src/services/arena-runner.ts | 895 ++++++++++++++++++ apps/coder/src/services/dispatcher.ts | 38 +- apps/server/src/schema.sql | 6 +- apps/server/src/services/inference/types.ts | 19 +- apps/web/src/App.tsx | 2 + apps/web/src/api/client.ts | 60 ++ apps/web/src/api/types.ts | 36 +- .../src/components/ArenaLauncherDialog.tsx | 410 ++++++++ apps/web/src/components/ChatTabBar.tsx | 3 + apps/web/src/components/PaneHeaderActions.tsx | 15 +- apps/web/src/components/Workspace.tsx | 23 +- apps/web/src/components/panes/ArenaPane.tsx | 664 +++++++++++++ apps/web/src/hooks/sessionEvents.ts | 58 +- apps/web/src/hooks/useCoderUserEvents.ts | 14 +- apps/web/src/hooks/useSessionStream.ts | 7 + apps/web/src/hooks/useSidebar.ts | 7 + apps/web/src/hooks/useWorkspacePanes.ts | 46 + docs/adr/0001-arena-two-lane-scheduling.md | 19 + ...-arena-dedicated-tables-not-flow-runner.md | 22 + packages/contracts/package.json | 4 + packages/contracts/src/arena.ts | 55 ++ packages/contracts/src/ws-frames.ts | 54 ++ 34 files changed, 4581 insertions(+), 146 deletions(-) create mode 100644 CONTEXT.md create mode 100644 apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts create mode 100644 apps/coder/src/services/__tests__/arena-decisions.test.ts create mode 100644 apps/coder/src/services/arena-analyzer-helpers.ts create mode 100644 apps/coder/src/services/arena-analyzer.ts create mode 100644 apps/coder/src/services/arena-decisions.ts create mode 100644 apps/coder/src/services/arena-model-call.ts create mode 100644 apps/coder/src/services/arena-runner.ts create mode 100644 apps/web/src/components/ArenaLauncherDialog.tsx create mode 100644 apps/web/src/components/panes/ArenaPane.tsx create mode 100644 docs/adr/0001-arena-two-lane-scheduling.md create mode 100644 docs/adr/0002-arena-dedicated-tables-not-flow-runner.md create mode 100644 packages/contracts/src/arena.ts diff --git a/.gitignore b/.gitignore index 61c63d2..9d45b0a 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ data/* !data/mcp.example.json !data/coder-providers.example.json codecontext/fork.tar.gz +/Arena diff --git a/CLAUDE.md b/CLAUDE.md index 8572f6c..dadc61c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -78,7 +78,7 @@ BooCoder at port 9502: `curl http://100.114.205.53:9502/api/health`. Runs as `bo - `FAST_MODEL` (optional) — cheaper model for titles, summaries, labeling (auto_name.ts, tool-summaries.ts). Falls back to session model or DEFAULT_MODEL. Set to a small llama-swap model (e.g. `nemotron-nano-4b`) to avoid loading the 35B for 20-token calls. - Qwen Code dispatch: `OPENAI_BASE_URL=http://100.101.41.16:8401/v1 OPENAI_API_KEY=dummy qwen -p "" --output-format stream-json`. Install: `npm install -g @qwen-code/qwen-code@latest`. Node ≥22 on host (container stays Node 20; BooCoder dispatches via direct spawn on host). No `--yolo` flag — `-p` runs autonomously without prompts. ACP bridge is an HTTP daemon (not stdio); use PTY dispatch. -- Arena: `POST /api/arena {project_id, input, contestants: [{agent?, model?}]}` dispatches the same task to N models/agents in parallel; each contestant gets its own task + worktree. `GET /api/arena/:id` for results; `POST /api/arena/:id/select/:task_id` picks a winner. +- Arena: `POST /api/battles {project_id, battle_type, prompt, contestants}` starts a battle; `GET /api/battles/:id` returns battle + contestants + cross-examinations; `POST /api/battles/:id/stop` cancels; `POST /api/battles/:id/analyze` triggers/re-triggers two-stage digest→judge analysis; `GET /api/battles/:id/analysis` reads `analysis.md`; `POST /api/battles/:id/cross-examine {identity, model}` runs a cross-examination. All `/api/battles*` routes are served by `apps/coder` at port 9502 (proxied through `apps/server` as `/api/coder/battles*`). ## Workflow diff --git a/CONTEXT.md b/CONTEXT.md new file mode 100644 index 0000000..e6622ab --- /dev/null +++ b/CONTEXT.md @@ -0,0 +1,67 @@ +# Context: BooCode + +Glossary of the domain language. Terms only — no implementation detail. + +## Workspace + +- **Pane** — one tile in the multi-pane workspace. Each pane has a *kind*: + Chat (BooChat), Coder (BooCoder), Terminal (BooTerm), Orchestrator, Arena, + plus artifact/settings kinds. + +- **Backend** — an AI engine a task is dispatched to: *native* (BooChat + inference on a local llama-swap model) or an *external* CLI agent (Claude Code, + OpenCode, Qwen, Goose). Code sometimes calls this the "agent" (`tasks.agent`). + +- **BooChat Agent** (a.k.a. *persona*) — a preset from the `data/AGENTS.md` + registry (e.g. "Code Reviewer", "Debugger"): a system prompt + tool whitelist + + sampling knobs that runs **on the native backend** with a chosen model. + Distinct from a Backend — this is the overloaded sense of "agent" the UI's + Agent picker selects. + +## Arena + +A way to run the **same prompt** against several AI competitors at once and pick +the best result. + +- **Battle** — one Arena run. Dated. Produces a results folder at + `//Arena//`. (The earlier API-only feature called + this an "arena"; a Battle is one such run.) + +- **Battle Type** — what is being compared: + - *Coding* — Contestants change code; a result is the **diff** they produced + (plus their explanation). Each Contestant works in its own worktree. + - *Q&A* — Contestants answer a prompt; a result is the **text answer**. No + code changes. + +- **Contestant** — one competitor in a Battle, given the Battle's prompt. What + defines a Contestant depends on Battle Type: + - *Coding* — a **Backend + Model** (e.g. Claude Code + opus, native BooCode + + 35b). Each works in its own isolated git **worktree** (a branched on-disk + copy of the project). Contestants do not see each other's work. + - *Q&A* — a **BooChat Agent (persona) + Model** (e.g. Debugger + 35b), running + on the native backend only. No worktree (no code changes). + The same model can appear under two Contestants, so a Contestant's identity is + the (backend-or-persona, model) pair, not the model alone. + +- **Benchmark** — per-Contestant performance captured during a Battle. Wall-clock + **duration** is recorded for every Contestant; **throughput** (tokens/sec) is + recorded only for local (llama-swap) models, which are the ones the speed + comparison is meaningful for. + +- **Arena results folder** (`//Arena//`) — where a + Battle's *results* are written (not the working copies — those stay in each + Contestant's worktree). Holds the per-Contestant result and the final + analysis. + +- **Lane** — how a Battle's Contestants are scheduled. The *local lane* holds + every llama-swap-backed Contestant and runs them strictly one at a time (the + local server can only load one model at a time, which also keeps their speed + Benchmark fair). The *cloud lane* holds cloud-backed Contestants (Claude Code, + OpenCode-on-cloud) and runs them all in parallel. The two lanes run + concurrently with each other. + +- **Analysis** — an end-of-Battle judgement of the Contestants' results, + produced by the default BooChat model, naming a **Winner**. + +- **Cross-examination** — an after-the-Battle step where a chosen model (from any + agent) is pointed at the Battle's results to interrogate / compare them. diff --git a/apps/coder/src/index.ts b/apps/coder/src/index.ts index caaa9ef..f007c2f 100644 --- a/apps/coder/src/index.ts +++ b/apps/coder/src/index.ts @@ -23,8 +23,8 @@ import { registerAgentSessionRoutes } from './routes/agent-sessions.js'; import { registerTaskRoutes } from './routes/tasks.js'; import { registerInboxRoutes } from './routes/inbox.js'; import { registerStatsRoutes } from './routes/stats.js'; -import { registerArenaRoutes } from './routes/arena.js'; import { registerRunsRoutes } from './routes/runs.js'; +import { registerArenaRoutes } from './routes/arena.js'; import { registerProviderRoutes } from './routes/providers.js'; import { registerWorktreeSafetyRoutes } from './routes/worktree-safety.js'; import { registerLifecycleRoutes } from './routes/lifecycle.js'; @@ -34,10 +34,13 @@ import { createDispatcher } from './services/dispatcher.js'; // Orchestrator (Phase 2): DB-backed flow-runner; advances on the dispatcher's // onTaskTerminal hook. import { createFlowRunner } from './services/flow-runner.js'; +// Arena: DB-backed battle-runner; also advances on the onTaskTerminal hook. +import { createBattleRunner, type DispatchContestantFn } from './services/arena-runner.js'; +import { createAnalyzer } from './services/arena-analyzer.js'; import { agentPool } from './services/agent-pool.js'; import { createOrphanWorktreeReaper } from './services/orphan-worktree-reaper.js'; import { probeAgents } from './services/agent-probe.js'; -import { getProviderSnapshot, persistProbedModels } from './services/provider-snapshot.js'; +import { getProviderSnapshot, persistProbedModels, fetchLlamaSwapModels } from './services/provider-snapshot.js'; import { setPermissionHooks } from './services/permission-waiter.js'; import { publishAgentStatus } from './services/agent-status-publish.js'; import { homedir } from 'node:os'; @@ -220,31 +223,119 @@ async function main() { // Orchestrator (Phase 2): the flow-runner reacts to the dispatcher's // onTaskTerminal hook to advance flow_runs. Created before the dispatcher so its - // terminal callback can be wired in. Its launch() is driven by the runs route - // (a later phase); resume on startup is a later phase too. + // terminal callback can be wired in. const flowRunner = createFlowRunner({ sql, broker, log: app.log, config }); - // Phase 4: dispatcher — polls tasks table and runs inference. onTaskTerminal - // notifies the flow-runner when a step's task settles (D-2). + // Arena SEAM (a): build the local-model set from the live llama-swap model list. + // Both bare IDs ('qwen3.6-35b') and prefixed IDs ('llama-swap/qwen3.6-35b') are + // included so opencode-style prefixed contestants and native-style bare contestants + // both classify correctly as local. + const localModelsList = await fetchLlamaSwapModels(config).catch(() => []); + const localModels = new Set([ + ...localModelsList.map((m) => m.id), + ...localModelsList.map((m) => `llama-swap/${m.id}`), + ]); + + // Arena dispatch function — Phase 4 SEAM (b). + // Coding: insert a tasks row with agent=identity (null for native/boocode); + // the dispatcher creates a worktree and runs the external agent (or native). + // Q&A: pre-create a session with agent_id stamped to the persona slug so native + // inference loads the persona's system_prompt + tools from AGENTS.md; + // task.session_id is pre-set so runNativeInference reuses the session. + const dispatchContestant: DispatchContestantFn = async ({ + projectId, + prompt, + identity, + model, + battleType, + }) => { + if (battleType === 'qa') { + const sessionName = `Arena Q&A [${identity}]: ${prompt.slice(0, 30)}`; + const [session] = await sql<{ id: string }[]>` + INSERT INTO sessions (project_id, name, model, agent_id, status) + VALUES (${projectId}, ${sessionName}, ${model}, ${identity}, 'open') + RETURNING id + `; + const [task] = await sql<{ id: string }[]>` + INSERT INTO tasks (project_id, input, model, session_id) + VALUES (${projectId}, ${prompt}, ${model}, ${session!.id}) + RETURNING id + `; + return { taskId: task!.id, sessionId: session!.id }; + } + // Coding: boocode = native inference (no external agent); any other identity + // is an external agent name (claude, opencode, qwen, goose) that maps to + // available_agents and gets its own per-task worktree via runExternalAgent. + // Session is created lazily by the dispatcher, so sessionId is unknown here. + const agentName = identity === 'boocode' ? null : identity; + const [task] = await sql<{ id: string }[]>` + INSERT INTO tasks (project_id, input, agent, model) + VALUES (${projectId}, ${prompt}, ${agentName}, ${model}) + RETURNING id + `; + return { taskId: task!.id, sessionId: null }; + }; + + // Arena analyzer: two-stage digest→judge (v1). Pluggable seam — a v2 Han + // Orchestrator flow can replace this without schema changes. + const analyzer = createAnalyzer({ + sql, + broker, + log: app.log, + config, + localModels, + }); + + // Arena battle-runner: notified on the same onTaskTerminal hook as the flow-runner. + const battleRunner = createBattleRunner({ + sql, + broker, + log: app.log, + dispatch: dispatchContestant, + onBattleComplete: (battleId) => { + void analyzer.analyze(battleId); + }, + onCrossExamStart: ({ battleId, crossExamId, identity, model }) => { + void analyzer.crossExamine(battleId, crossExamId, { identity, model }); + }, + localModels, + }); + + // Compose onTaskTerminal: both flow-runner and battle-runner are notified. + // Each ignores tasks it doesn't own (flow-runner checks flow_steps.task_id; + // battle-runner checks contestants.task_id). + const onTaskTerminal = (taskId: string, state: string): void => { + flowRunner.handleTaskTerminal(taskId, state); + battleRunner.handleTaskTerminal(taskId, state); + }; + + // Phase 4: dispatcher — polls tasks table and runs inference. The composed + // onTaskTerminal hook notifies both the flow-runner and the battle-runner when + // any task settles. const dispatcher = createDispatcher({ sql, inference: inferenceApi, broker, log: app.log, config, - onTaskTerminal: flowRunner.handleTaskTerminal, + onTaskTerminal, }); dispatcher.start(); - // Phase 5: re-advance any flow_runs that were 'running' when the service last - // stopped (D-9). Runs AFTER dispatcher.start() so re-dispatched 'pending' tasks - // are picked up by the dispatcher's startup poll. + // Re-advance in-flight flow_runs and battles after a coder restart. Both run + // AFTER dispatcher.start() so re-dispatched 'pending' tasks are picked up. void flowRunner.initResume().catch((err) => { app.log.error( { err: err instanceof Error ? err.message : String(err) }, 'flow-runner: initResume failed', ); }); + void battleRunner.initResume().catch((err) => { + app.log.error( + { err: err instanceof Error ? err.message : String(err) }, + 'arena: initResume failed', + ); + }); // v2.6 Phase 3: configure + start the agent-pool lifecycle sweep (idle-TTL + // LRU-cap eviction of warm backends, plus each backend's proactive health probe) @@ -281,8 +372,8 @@ async function main() { registerTaskRoutes(app, sql, inferenceApi, dispatcher.cancelExternalTask); registerInboxRoutes(app, sql); registerStatsRoutes(app, sql); - registerArenaRoutes(app, sql); registerRunsRoutes(app, sql, flowRunner, dispatcher.cancelExternalTask); + registerArenaRoutes(app, sql, battleRunner, dispatcher.cancelExternalTask, config); registerProviderRoutes(app, sql, config); registerWorktreeSafetyRoutes(app, sql); registerLifecycleRoutes(app, sql); diff --git a/apps/coder/src/routes/arena.ts b/apps/coder/src/routes/arena.ts index 8757fb1..7a481fc 100644 --- a/apps/coder/src/routes/arena.ts +++ b/apps/coder/src/routes/arena.ts @@ -1,136 +1,412 @@ /** - * v2.0.5: Arena routes — competitive dispatch of the same task to multiple agents. + * Arena routes — HTTP surface for the Battle UI. * - * POST /api/arena — create an arena with 2-5 contestants - * GET /api/arena/:id — get all tasks in an arena - * POST /api/arena/:id/select/:task_id — mark a task as the arena winner + * POST /api/battles — launch a battle + * GET /api/battles?project_id= — list battles for a project + * GET /api/battles/:id — one battle + contestants + cross-exams + * POST /api/battles/:id/stop — cancel a running battle + * POST /api/battles/:id/analyze — trigger analysis (Phase 5 fills the logic) + * POST /api/battles/:id/cross-examine — start a cross-examination (Phase 5 fills the logic) + * + * Mirrors the shape of runs.ts (Orchestrator routes). Battle creation delegates to + * the battle-runner; cancellation calls cancelBattle then aborts in-flight tasks + * via the dispatcher's cancelExternalTask. */ import type { FastifyInstance } from 'fastify'; import { z } from 'zod'; +import { readFile } from 'node:fs/promises'; +import { join } from 'node:path'; import type { Sql } from '../db.js'; +import type { Config } from '../config.js'; +import type { BattleRunner } from '../services/arena-runner.js'; +import type { ExternalCancelFn } from './tasks.js'; +import { arenaModelCall } from '../services/arena-model-call.js'; -const ContestantSchema = z.object({ - agent: z.string().max(100).optional(), - model: z.string().max(200).optional(), - mode_id: z.string().max(200).optional(), - thinking_option_id: z.string().max(200).optional(), +// ─── Validation schemas ─────────────────────────────────────────────────────── + +const UuidParam = z.string().uuid(); + +const ContestantInput = z.object({ + identity: z.string().min(1).max(200), + model: z.string().min(1).max(200), }); -const CreateArenaBody = z.object({ +const CreateBattleBody = z.object({ project_id: z.string().uuid(), - input: z.string().min(1).max(64_000), - contestants: z.array(ContestantSchema).min(2).max(5), + battle_type: z.enum(['coding', 'qa']), + prompt: z.string().min(1).max(64_000), + contestants: z + .array(ContestantInput) + .min(2, 'at least 2 contestants required') + .max(6, 'at most 6 contestants allowed'), }); -interface TaskRow { - id: string; - agent: string | null; - model: string | null; - mode_id: string | null; - thinking_option_id: string | null; - state: string; -} +const ListBattlesQuery = z.object({ + project_id: z.string().uuid(), +}); -export function registerArenaRoutes(app: FastifyInstance, sql: Sql): void { - // POST /api/arena — create a new arena - app.post('/api/arena', async (req, reply) => { - const parsed = CreateArenaBody.safeParse(req.body); +const CrossExamineBody = z.object({ + identity: z.string().min(1).max(200), + model: z.string().min(1).max(200), +}); + +const SetWinnerBody = z.object({ + winner_contestant_id: z.string().uuid().nullable(), +}); + +// ─── Route registration ─────────────────────────────────────────────────────── + +const GeneratePromptBody = z.object({ + description: z.string().min(1).max(2_000), +}); + +export function registerArenaRoutes( + app: FastifyInstance, + sql: Sql, + battleRunner: BattleRunner, + cancelExternal: ExternalCancelFn, + config: Config, +): void { + + // POST /api/battles/generate-prompt — draft a fuller battle prompt from a + // short description using the default BooChat model. One-shot, non-streaming. + // Must be registered BEFORE /api/battles/:id so the literal 'generate-prompt' + // path is not mistaken for a UUID param. + app.post('/api/battles/generate-prompt', async (req, reply) => { + const parsed = GeneratePromptBody.safeParse(req.body); if (!parsed.success) { reply.code(400); return { error: 'invalid body', details: parsed.error.flatten() }; } - const { project_id, input, contestants } = parsed.data; - const arenaId = crypto.randomUUID(); + const { description } = parsed.data; - const tasks: TaskRow[] = []; - for (const contestant of contestants) { - const [task] = await sql` - INSERT INTO tasks (project_id, input, agent, model, mode_id, thinking_option_id, arena_id) - VALUES ( - ${project_id}, - ${input}, - ${contestant.agent ?? null}, - ${contestant.model ?? null}, - ${contestant.mode_id ?? null}, - ${contestant.thinking_option_id ?? null}, - ${arenaId} - ) - RETURNING id, agent, model, mode_id, thinking_option_id, state - `; - tasks.push(task!); + try { + const prompt = await arenaModelCall({ + config, + model: config.DEFAULT_MODEL, + system: [ + 'You are a battle-prompt writer for an AI Arena.', + 'The user gives you a short description of a coding or Q&A challenge.', + 'Expand it into a clear, self-contained prompt (2–6 sentences) that any AI model can act on.', + 'Include specific acceptance criteria where helpful.', + 'Output ONLY the prompt — no preamble, no labels, no meta-commentary.', + ].join(' '), + user: description, + maxTokens: 400, + temperature: 0.6, + }); + return { prompt }; + } catch (err) { + app.log.warn( + { err: err instanceof Error ? err.message : String(err) }, + 'arena generate-prompt: model call failed', + ); + reply.code(502); + return { error: 'model call failed' }; } + }); + + // POST /api/battles — launch a battle + app.post('/api/battles', async (req, reply) => { + const parsed = CreateBattleBody.safeParse(req.body); + if (!parsed.success) { + reply.code(400); + return { error: 'invalid body', details: parsed.error.flatten() }; + } + + const { project_id, battle_type, prompt, contestants } = parsed.data; + + // Reject duplicate (identity, model) pairs up front — the schema UNIQUE + // constraint would catch it too, but an early 422 is friendlier. + const seen = new Set(); + for (const c of contestants) { + const key = `${c.identity}::${c.model}`; + if (seen.has(key)) { + reply.code(422); + return { + error: 'duplicate_contestant', + message: `duplicate contestant: identity="${c.identity}" model="${c.model}"`, + }; + } + seen.add(key); + } + + // Verify project exists + const [proj] = await sql<{ id: string }[]>`SELECT id FROM projects WHERE id = ${project_id}`; + if (!proj) { + reply.code(404); + return { error: 'project not found' }; + } + + const { battleId } = await battleRunner.startBattle({ + projectId: project_id, + battleType: battle_type, + prompt, + contestants, + }); reply.code(201); - return { - arena_id: arenaId, - tasks: tasks.map((t) => ({ - id: t.id, - agent: t.agent, - model: t.model, - mode_id: t.mode_id, - thinking_option_id: t.thinking_option_id, - state: t.state, - })), - }; + return { battle_id: battleId }; }); - // GET /api/arena/:arena_id — list all tasks in an arena - app.get<{ Params: { arena_id: string } }>('/api/arena/:arena_id', async (req, reply) => { - const { arena_id } = req.params; - - // Validate UUID format - const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; - if (!uuidRegex.test(arena_id)) { + // GET /api/battles?project_id= — list battles, most-recent-first + app.get('/api/battles', async (req, reply) => { + const parsed = ListBattlesQuery.safeParse(req.query); + if (!parsed.success) { reply.code(400); - return { error: 'invalid arena_id format' }; + return { error: 'invalid query', details: parsed.error.flatten() }; } - const tasks = await sql` - SELECT id, project_id, state, input, output_summary, agent, model, mode_id, thinking_option_id, execution_path, session_id, started_at, ended_at, created_at, arena_id - FROM tasks - WHERE arena_id = ${arena_id} - ORDER BY created_at + const battles = await sql` + SELECT id, project_id, battle_type, prompt, status, + winner_contestant_id, results_path, error, + created_at, updated_at + FROM battles + WHERE project_id = ${parsed.data.project_id} + ORDER BY created_at DESC + LIMIT 100 `; - if (tasks.length === 0) { - reply.code(404); - return { error: 'arena not found' }; - } - - return { arena_id, tasks }; + return { battles }; }); - // POST /api/arena/:arena_id/select/:task_id — mark the winner - app.post<{ Params: { arena_id: string; task_id: string } }>( - '/api/arena/:arena_id/select/:task_id', - async (req, reply) => { - const { arena_id, task_id } = req.params; - - // Verify the task belongs to this arena - const rows = await sql<{ id: string; state: string; arena_id: string | null }[]>` - SELECT id, state, arena_id FROM tasks WHERE id = ${task_id} - `; - - if (rows.length === 0) { - reply.code(404); - return { error: 'task not found' }; - } - - const task = rows[0]!; - if (task.arena_id !== arena_id) { - reply.code(409); - return { error: 'task does not belong to this arena' }; - } - - // Mark as selected via output_summary prefix (lightweight — no schema change) - await sql` - UPDATE tasks - SET output_summary = COALESCE('[SELECTED] ' || output_summary, '[SELECTED]') - WHERE id = ${task_id} - `; - - return { selected: true, task_id, arena_id }; + // GET /api/battles/:id — one battle + its contestants + cross-examinations + app.get<{ Params: { id: string } }>('/api/battles/:id', async (req, reply) => { + const parsedId = UuidParam.safeParse(req.params.id); + if (!parsedId.success) { + reply.code(400); + return { error: 'invalid id' }; } - ); + const id = parsedId.data; + + const [battle] = await sql<{ + id: string; + project_id: string; + battle_type: string; + prompt: string; + status: string; + winner_contestant_id: string | null; + results_path: string | null; + error: string | null; + created_at: unknown; + updated_at: unknown; + }[]>` + SELECT id, project_id, battle_type, prompt, status, + winner_contestant_id, results_path, error, + created_at, updated_at + FROM battles WHERE id = ${id} + `; + + if (!battle) { + reply.code(404); + return { error: 'battle not found' }; + } + + const contestants = await sql` + SELECT id, battle_id, identity, model, lane, task_id, worktree_id, + status, duration_ms, tokens_per_sec, cost_tokens, result_path, error, + created_at, updated_at + FROM contestants + WHERE battle_id = ${id} + ORDER BY created_at ASC + `; + + const crossExaminations = await sql` + SELECT id, battle_id, identity, model, verdict, created_at + FROM cross_examinations + WHERE battle_id = ${id} + ORDER BY created_at ASC + `; + + return { battle, contestants, cross_examinations: crossExaminations }; + }); + + // POST /api/battles/:id/stop — cancel a running battle + app.post<{ Params: { id: string } }>('/api/battles/:id/stop', async (req, reply) => { + const parsedId = UuidParam.safeParse(req.params.id); + if (!parsedId.success) { + reply.code(400); + return { error: 'invalid id' }; + } + const id = parsedId.data; + + const [row] = await sql<{ id: string; status: string }[]>` + SELECT id, status FROM battles WHERE id = ${id} + `; + if (!row) { + reply.code(404); + return { error: 'battle not found' }; + } + if (row.status !== 'running') { + reply.code(409); + return { error: `cannot stop battle in status '${row.status}'` }; + } + + const { cancelled, taskIds } = await battleRunner.cancelBattle(id); + if (!cancelled) { + reply.code(409); + return { error: 'battle is no longer running' }; + } + + // Abort any in-flight dispatcher tasks (cloud contestants running externally). + for (const taskId of taskIds) { + cancelExternal(taskId); + } + + return { cancelled: true }; + }); + + // GET /api/battles/:id/analysis — read analysis.md from the battle's results_path + app.get<{ Params: { id: string } }>('/api/battles/:id/analysis', async (req, reply) => { + const parsedId = UuidParam.safeParse(req.params.id); + if (!parsedId.success) { + reply.code(400); + return { error: 'invalid id' }; + } + const id = parsedId.data; + + const [row] = await sql<{ results_path: string | null }[]>` + SELECT results_path FROM battles WHERE id = ${id} + `; + if (!row) { + reply.code(404); + return { error: 'battle not found' }; + } + if (!row.results_path) { + reply.code(404); + return { error: 'analysis not ready' }; + } + + try { + const text = await readFile(join(row.results_path, 'analysis.md'), 'utf8'); + return { text }; + } catch { + reply.code(404); + return { error: 'analysis not ready' }; + } + }); + + // POST /api/battles/:id/analyze — trigger or re-trigger analysis + app.post<{ Params: { id: string } }>('/api/battles/:id/analyze', async (req, reply) => { + const parsedId = UuidParam.safeParse(req.params.id); + if (!parsedId.success) { + reply.code(400); + return { error: 'invalid id' }; + } + const id = parsedId.data; + + const [row] = await sql<{ id: string; status: string }[]>` + SELECT id, status FROM battles WHERE id = ${id} + `; + if (!row) { + reply.code(404); + return { error: 'battle not found' }; + } + if (row.status === 'running') { + reply.code(409); + return { error: 'battle is still running — wait for all contestants to finish' }; + } + + const result = await battleRunner.triggerAnalysis(id); + if (!result.triggered) { + reply.code(404); + return { error: 'battle not found' }; + } + + reply.code(202); + return { triggered: true }; + }); + + // PATCH /api/battles/:id/winner — manually set or clear the winner. + // Validates the contestant belongs to the battle; publishes battle_updated so + // the pane badge reflects the override immediately. Human is authoritative. + app.patch<{ Params: { id: string } }>('/api/battles/:id/winner', async (req, reply) => { + const parsedId = UuidParam.safeParse(req.params.id); + if (!parsedId.success) { + reply.code(400); + return { error: 'invalid id' }; + } + + const parsed = SetWinnerBody.safeParse(req.body); + if (!parsed.success) { + reply.code(400); + return { error: 'invalid body', details: parsed.error.flatten() }; + } + + const result = await battleRunner.setWinner(parsedId.data, parsed.data.winner_contestant_id); + if (!result.ok) { + if (result.notFound) { reply.code(404); return { error: 'battle not found' }; } + if (result.invalidContestant) { reply.code(422); return { error: 'contestant not found in this battle' }; } + reply.code(500); return { error: 'unknown error' }; + } + return { ok: true }; + }); + + // GET /api/battles/:id/contestants/:cid/diff — read the diff.patch for a coding contestant. + app.get<{ Params: { id: string; cid: string } }>('/api/battles/:id/contestants/:cid/diff', async (req, reply) => { + const parsedId = UuidParam.safeParse(req.params.id); + const parsedCid = UuidParam.safeParse(req.params.cid); + if (!parsedId.success || !parsedCid.success) { + reply.code(400); + return { error: 'invalid id' }; + } + + const [contestant] = await sql<{ result_path: string | null }[]>` + SELECT result_path FROM contestants + WHERE id = ${parsedCid.data} AND battle_id = ${parsedId.data} + `; + if (!contestant) { + reply.code(404); + return { error: 'contestant not found' }; + } + if (!contestant.result_path) { + reply.code(404); + return { error: 'diff not available' }; + } + + try { + const text = await readFile(join(contestant.result_path, 'diff.patch'), 'utf8'); + return { diff: text }; + } catch { + reply.code(404); + return { error: 'diff not available' }; + } + }); + + // POST /api/battles/:id/cross-examine — start a cross-examination + app.post<{ Params: { id: string } }>('/api/battles/:id/cross-examine', async (req, reply) => { + const parsedId = UuidParam.safeParse(req.params.id); + if (!parsedId.success) { + reply.code(400); + return { error: 'invalid id' }; + } + const id = parsedId.data; + + const parsed = CrossExamineBody.safeParse(req.body); + if (!parsed.success) { + reply.code(400); + return { error: 'invalid body', details: parsed.error.flatten() }; + } + + const [row] = await sql<{ id: string; status: string }[]>` + SELECT id, status FROM battles WHERE id = ${id} + `; + if (!row) { + reply.code(404); + return { error: 'battle not found' }; + } + if (row.status === 'running') { + reply.code(409); + return { error: 'battle is still running — cross-examine after all contestants finish' }; + } + + const { crossExamId } = await battleRunner.startCrossExam(id, { + identity: parsed.data.identity, + model: parsed.data.model, + }); + + reply.code(202); + return { cross_exam_id: crossExamId }; + }); } diff --git a/apps/coder/src/schema.sql b/apps/coder/src/schema.sql index 71ecc49..5bef50d 100644 --- a/apps/coder/src/schema.sql +++ b/apps/coder/src/schema.sql @@ -54,9 +54,6 @@ DO $$ BEGIN END IF; END $$; --- v2.0.5: arena support — group tasks into competitive arenas. -ALTER TABLE tasks ADD COLUMN IF NOT EXISTS arena_id UUID; - -- Human inbox: tasks needing attention CREATE OR REPLACE VIEW human_inbox AS SELECT * FROM tasks WHERE state IN ('blocked', 'failed'); @@ -81,6 +78,7 @@ ALTER TABLE tasks ADD COLUMN IF NOT EXISTS thinking_option_id TEXT; DROP VIEW IF EXISTS human_inbox; ALTER TABLE tasks DROP COLUMN IF EXISTS feature_values; ALTER TABLE tasks DROP COLUMN IF EXISTS worktree_path; +ALTER TABLE tasks DROP COLUMN IF EXISTS arena_id; CREATE OR REPLACE VIEW human_inbox AS SELECT * FROM tasks WHERE state IN ('blocked', 'failed'); @@ -157,7 +155,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS worktrees_active_path_uidx ON worktrees(path) DROP TABLE IF EXISTS session_worktrees; -- Dispatch hint: which chat (tab) a task belongs to. The coder message route and --- skills route set it from the frontend tab; session-less creators (arena, MCP, +-- skills route set it from the frontend tab; session-less creators (MCP, -- new_task, generic /api/tasks) leave it NULL and the dispatcher creates a chat. ALTER TABLE tasks ADD COLUMN IF NOT EXISTS chat_id UUID REFERENCES chats(id) ON DELETE SET NULL; @@ -271,7 +269,7 @@ ALTER TABLE agent_sessions ADD CONSTRAINT agent_sessions_backend_chk CHECK (backend IN ('opencode_server', 'acp_warm', 'claude_sdk')); -- LISTEN/NOTIFY fast path: every tasks INSERT (from any call site — routes, --- new_task tool, arena, MCP server) fires pg_notify('tasks_new') in the same +-- new_task tool, MCP server) fires pg_notify('tasks_new') in the same -- transaction, so the dispatcher reacts immediately instead of waiting for the -- fallback poll. Postgres holds the notification until COMMIT, so the listener -- always sees the committed row. A trigger covers all insert paths with no @@ -357,3 +355,71 @@ DO $$ BEGIN CHECK (status IN ('pending', 'running', 'completed', 'failed', 'skipped', 'cancelled')); END IF; END $$; + +-- Arena: battles + contestants + cross_examinations. +-- project_id carries no FK (matches tasks.project_id + flow_runs.project_id convention). +-- winner_contestant_id FK is deferred (forward reference): added via guarded ALTER below. +CREATE TABLE IF NOT EXISTS battles ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + project_id UUID NOT NULL, + battle_type TEXT NOT NULL, + prompt TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + winner_contestant_id UUID, + results_path TEXT, + error TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(), + CONSTRAINT battles_type_chk CHECK (battle_type IN ('coding', 'qa')), + CONSTRAINT battles_status_chk CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')) +); + +CREATE TABLE IF NOT EXISTS contestants ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + battle_id UUID NOT NULL REFERENCES battles(id) ON DELETE CASCADE, + identity TEXT NOT NULL, + model TEXT NOT NULL, + lane TEXT NOT NULL, + task_id UUID REFERENCES tasks(id) ON DELETE SET NULL, + worktree_id UUID REFERENCES worktrees(id) ON DELETE SET NULL, + status TEXT NOT NULL DEFAULT 'queued', + duration_ms INTEGER, + tokens_per_sec DOUBLE PRECISION, + cost_tokens INTEGER, + result_path TEXT, + error TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(), + CONSTRAINT contestants_lane_chk CHECK (lane IN ('local', 'cloud')), + CONSTRAINT contestants_status_chk CHECK (status IN ('queued', 'running', 'done', 'error')), + UNIQUE (battle_id, identity, model) +); + +CREATE TABLE IF NOT EXISTS cross_examinations ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + battle_id UUID NOT NULL REFERENCES battles(id) ON DELETE CASCADE, + identity TEXT NOT NULL, + model TEXT NOT NULL, + verdict TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp() +); + +-- Add the winner FK now that contestants exists. +DO $$ BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'battles_winner_contestant_id_fkey') THEN + ALTER TABLE battles ADD CONSTRAINT battles_winner_contestant_id_fkey + FOREIGN KEY (winner_contestant_id) REFERENCES contestants(id) ON DELETE SET NULL; + END IF; +END $$; + +-- battles query (GET /api/battles?project_id=). +CREATE INDEX IF NOT EXISTS battles_project_created_idx ON battles(project_id, created_at DESC); + +-- Lane-scheduler advance scans (contestants WHERE battle_id = ? AND status = ?). +CREATE INDEX IF NOT EXISTS contestants_battle_status_idx ON contestants(battle_id, status); + +-- onTaskTerminal callback: look up the contestant owning a completed task. +CREATE INDEX IF NOT EXISTS contestants_task_id_idx ON contestants(task_id); + +-- Cross-examination listing per battle. +CREATE INDEX IF NOT EXISTS cross_examinations_battle_idx ON cross_examinations(battle_id); diff --git a/apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts b/apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts new file mode 100644 index 0000000..809ea69 --- /dev/null +++ b/apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts @@ -0,0 +1,254 @@ +import { describe, it, expect } from 'vitest'; +import { + buildDigestPrompt, + buildJudgePrompt, + buildCrossExamPrompt, + extractWinner, + shouldNameWinner, + type ContestantDigest, + type ContestantDigestInput, +} from '../arena-analyzer-helpers.js'; + +// ─── shouldNameWinner ───────────────────────────────────────────────────────── + +describe('shouldNameWinner', () => { + it('returns false with 0 succeeded contestants', () => { + expect(shouldNameWinner(0)).toBe(false); + }); + + it('returns false with exactly 1 succeeded contestant', () => { + expect(shouldNameWinner(1)).toBe(false); + }); + + it('returns true with exactly 2 succeeded contestants', () => { + expect(shouldNameWinner(2)).toBe(true); + }); + + it('returns true with more than 2 succeeded contestants', () => { + expect(shouldNameWinner(3)).toBe(true); + expect(shouldNameWinner(6)).toBe(true); + }); +}); + +// ─── extractWinner ──────────────────────────────────────────────────────────── + +describe('extractWinner', () => { + it('extracts identity and model from a WINNER: line', () => { + const output = 'Some analysis\n\nWINNER: claude/opus-4-5\n\nMore text.'; + expect(extractWinner(output)).toEqual({ identity: 'claude', model: 'opus-4-5' }); + }); + + it('is case-insensitive for the WINNER keyword', () => { + expect(extractWinner('winner: boocode/qwen3.6-35b')).toEqual({ + identity: 'boocode', + model: 'qwen3.6-35b', + }); + expect(extractWinner('Winner: opencode/some-model')).toEqual({ + identity: 'opencode', + model: 'some-model', + }); + }); + + it('returns null when NO_WINNER is declared', () => { + expect(extractWinner('WINNER: NO_WINNER')).toBeNull(); + expect(extractWinner('winner: no_winner')).toBeNull(); + }); + + it('returns null when no WINNER line is present', () => { + expect(extractWinner('Just some analysis text with no verdict.')).toBeNull(); + expect(extractWinner('')).toBeNull(); + }); + + it('returns null when the WINNER line has no slash separator', () => { + expect(extractWinner('WINNER: justidentity')).toBeNull(); + }); + + it('returns null when the WINNER line is empty after the colon', () => { + expect(extractWinner('WINNER:')).toBeNull(); + expect(extractWinner('WINNER: ')).toBeNull(); + }); + + it('handles leading and trailing whitespace around the slash parts', () => { + const result = extractWinner('WINNER: claude / opus-4-5 '); + expect(result).toEqual({ identity: 'claude', model: 'opus-4-5' }); + }); + + it('picks the first WINNER line when multiple are present', () => { + const output = 'WINNER: claude/opus-4-5\nWINNER: opencode/other-model'; + expect(extractWinner(output)).toEqual({ identity: 'claude', model: 'opus-4-5' }); + }); + + it('handles model names that contain slashes by splitting at the first slash only', () => { + // edge case: model name with a slash — should still split at first slash + // identity = 'native', model = 'llama-swap/qwen3.6' + const result = extractWinner('WINNER: native/llama-swap/qwen3.6'); + expect(result).toEqual({ identity: 'native', model: 'llama-swap/qwen3.6' }); + }); +}); + +// ─── buildDigestPrompt ──────────────────────────────────────────────────────── + +describe('buildDigestPrompt', () => { + const base: ContestantDigestInput = { + identity: 'claude', + model: 'opus-4-5', + resultMd: '# Output\n\nSome result content.', + benchmarkLine: '12000ms', + }; + + it('returns an object with non-empty system and user strings', () => { + const { system, user } = buildDigestPrompt(base); + expect(system.length).toBeGreaterThan(0); + expect(user.length).toBeGreaterThan(0); + }); + + it('includes the contestant identity and model in the user prompt', () => { + const { user } = buildDigestPrompt(base); + expect(user).toContain('claude'); + expect(user).toContain('opus-4-5'); + }); + + it('includes the benchmark line in the user prompt', () => { + const { user } = buildDigestPrompt(base); + expect(user).toContain('12000ms'); + }); + + it('includes the result.md content in the user prompt', () => { + const { user } = buildDigestPrompt(base); + expect(user).toContain('Some result content.'); + }); + + it('includes the diff.patch when provided', () => { + const input: ContestantDigestInput = { ...base, diffPatch: '--- a/foo.ts\n+++ b/foo.ts\n+added' }; + const { user } = buildDigestPrompt(input); + expect(user).toContain('added'); + expect(user).toContain('```diff'); + }); + + it('omits the diff section when diffPatch is undefined', () => { + const { user } = buildDigestPrompt(base); + expect(user).not.toContain('```diff'); + }); + + it('truncates resultMd longer than 8000 characters', () => { + const longResult = 'x'.repeat(10_000); + const { user } = buildDigestPrompt({ ...base, resultMd: longResult }); + // The truncated content must not exceed 8000 chars in the sliced section. + // We just check the total user string doesn't balloon unreasonably. + expect(user.length).toBeLessThan(15_000); + }); + + it('truncates diffPatch longer than 5000 characters', () => { + const longDiff = '+' + 'x'.repeat(10_000); + const { user } = buildDigestPrompt({ ...base, diffPatch: longDiff }); + expect(user.length).toBeLessThan(16_000); + }); +}); + +// ─── buildJudgePrompt ───────────────────────────────────────────────────────── + +describe('buildJudgePrompt', () => { + const digests: ContestantDigest[] = [ + { identity: 'claude', model: 'opus-4-5', digest: 'Good result.', benchmarkLine: '5000ms' }, + { identity: 'opencode', model: 'qwen3.6', digest: 'Decent result.', benchmarkLine: '8000ms' }, + ]; + + it('includes the original prompt in the user section', () => { + const { user } = buildJudgePrompt('Write a sorting algorithm', digests); + expect(user).toContain('Write a sorting algorithm'); + }); + + it('includes each contestant heading in the user section', () => { + const { user } = buildJudgePrompt('prompt', digests); + expect(user).toContain('claude'); + expect(user).toContain('opus-4-5'); + expect(user).toContain('opencode'); + expect(user).toContain('qwen3.6'); + }); + + it('includes each contestant digest text', () => { + const { user } = buildJudgePrompt('prompt', digests); + expect(user).toContain('Good result.'); + expect(user).toContain('Decent result.'); + }); + + it('instructs the model to name a WINNER when 2+ digests are provided', () => { + const { system } = buildJudgePrompt('prompt', digests); + expect(system).toContain('WINNER:'); + }); + + it('instructs the model NOT to name a winner when fewer than 2 digests are provided', () => { + const oneDigest = digests.slice(0, 1); + const { system } = buildJudgePrompt('prompt', oneDigest); + expect(system).toContain('NO_WINNER'); + expect(system).not.toContain('WINNER: '); + }); + + it('instructs NO_WINNER when digests list is empty', () => { + const { system } = buildJudgePrompt('prompt', []); + expect(system).toContain('NO_WINNER'); + }); + + it('truncates originalPrompt longer than 2000 characters', () => { + const longPrompt = 'p'.repeat(5_000); + const { user } = buildJudgePrompt(longPrompt, digests); + // Should not contain more than 2000 chars of the prompt. + const promptSection = user.split('# Contestant Digests')[0] ?? ''; + expect(promptSection.length).toBeLessThan(3_000); + }); +}); + +// ─── buildCrossExamPrompt ───────────────────────────────────────────────────── + +describe('buildCrossExamPrompt', () => { + const digests: ContestantDigest[] = [ + { identity: 'claude', model: 'opus-4-5', digest: 'Strong result.', benchmarkLine: '5000ms' }, + { identity: 'boocode', model: 'qwen3.6-35b', digest: 'Decent result.', benchmarkLine: '12000ms' }, + ]; + + const baseOpts = { + originalPrompt: 'Write a sorting algorithm.', + digests, + analysisContent: '# Arena Analysis\n\nClaude did better.\n\nWINNER: claude/opus-4-5', + proposedWinner: 'claude/opus-4-5', + examinerIdentity: 'goose', + examinerModel: 'gpt-4o', + }; + + it('includes the examiner identity and model in the system prompt', () => { + const { system } = buildCrossExamPrompt(baseOpts); + expect(system).toContain('goose'); + expect(system).toContain('gpt-4o'); + }); + + it('includes the original prompt in the user section', () => { + const { user } = buildCrossExamPrompt(baseOpts); + expect(user).toContain('Write a sorting algorithm.'); + }); + + it('includes each contestant digest', () => { + const { user } = buildCrossExamPrompt(baseOpts); + expect(user).toContain('Strong result.'); + expect(user).toContain('Decent result.'); + }); + + it('includes the proposed analysis content', () => { + const { user } = buildCrossExamPrompt(baseOpts); + expect(user).toContain('Claude did better.'); + }); + + it('includes the proposed winner when set', () => { + const { user } = buildCrossExamPrompt(baseOpts); + expect(user).toContain('claude/opus-4-5'); + }); + + it('notes that no winner was proposed when proposedWinner is null', () => { + const { user } = buildCrossExamPrompt({ ...baseOpts, proposedWinner: null }); + expect(user).toContain('No winner was proposed'); + }); + + it('instructs the examiner to provide a VERDICT line', () => { + const { system } = buildCrossExamPrompt(baseOpts); + expect(system).toContain('VERDICT:'); + }); +}); diff --git a/apps/coder/src/services/__tests__/arena-decisions.test.ts b/apps/coder/src/services/__tests__/arena-decisions.test.ts new file mode 100644 index 0000000..176ba22 --- /dev/null +++ b/apps/coder/src/services/__tests__/arena-decisions.test.ts @@ -0,0 +1,332 @@ +import { describe, it, expect } from 'vitest'; +import { + classifyLane, + nextLocalContestant, + isBattleComplete, + computeBenchmark, + sanitizeSlug, + buildBattleSlug, + buildContestantDir, + reconcileContestantResume, + reconcileContestants, + type ContestantSlot, +} from '../arena-decisions.js'; + +// Local models = what the llama-swap server actually serves. +const LOCAL_MODELS: ReadonlySet = new Set([ + 'qwen3.6-35b-a3b-mxfp4', + 'qwen2.5-coder-7b', +]); + +// ─── classifyLane ──────────────────────────────────────────────────────────── + +describe('classifyLane', () => { + it('classifies qa battles as local regardless of identity or model', () => { + expect(classifyLane('qa', 'boocode', 'qwen3.6-35b-a3b-mxfp4', LOCAL_MODELS)).toBe('local'); + expect(classifyLane('qa', 'claude', 'claude-opus-4-5', LOCAL_MODELS)).toBe('local'); + expect(classifyLane('qa', 'Debugger', 'cloud-model', new Set())).toBe('local'); + expect(classifyLane('qa', 'opencode', 'any-model', LOCAL_MODELS)).toBe('local'); + }); + + it('classifies coding contestants as local when model is in localModels', () => { + expect(classifyLane('coding', 'boocode', 'qwen3.6-35b-a3b-mxfp4', LOCAL_MODELS)).toBe('local'); + expect(classifyLane('coding', 'opencode', 'qwen3.6-35b-a3b-mxfp4', LOCAL_MODELS)).toBe('local'); + expect(classifyLane('coding', 'qwen', 'qwen2.5-coder-7b', LOCAL_MODELS)).toBe('local'); + }); + + it('classifies coding contestants as cloud when model is not in localModels', () => { + expect(classifyLane('coding', 'claude', 'claude-opus-4-5', LOCAL_MODELS)).toBe('cloud'); + expect(classifyLane('coding', 'opencode', 'claude-opus-4-5', LOCAL_MODELS)).toBe('cloud'); + expect(classifyLane('coding', 'goose', 'gpt-4o', LOCAL_MODELS)).toBe('cloud'); + expect(classifyLane('coding', 'qwen', 'unknown-remote-model', LOCAL_MODELS)).toBe('cloud'); + }); + + it('uses the injected localModels set, not a hardcoded list', () => { + const custom = new Set(['my-local-model']); + expect(classifyLane('coding', 'any-agent', 'my-local-model', custom)).toBe('local'); + expect(classifyLane('coding', 'boocode', 'other-model', custom)).toBe('cloud'); + }); + + it('defaults to cloud for an empty localModels set', () => { + expect(classifyLane('coding', 'boocode', 'qwen3.6-35b-a3b-mxfp4', new Set())).toBe('cloud'); + expect(classifyLane('coding', 'native', 'any-local-model', new Set())).toBe('cloud'); + }); +}); + +// ─── nextLocalContestant ───────────────────────────────────────────────────── + +describe('nextLocalContestant', () => { + it('returns null for an empty list', () => { + expect(nextLocalContestant([])).toBeNull(); + }); + + it('returns null when no local contestants are queued', () => { + const slots: ContestantSlot[] = [ + { id: 'c1', lane: 'local', status: 'running' }, + { id: 'c2', lane: 'cloud', status: 'queued' }, + ]; + expect(nextLocalContestant(slots)).toBeNull(); + }); + + it('returns the first queued local contestant in order', () => { + const slots: ContestantSlot[] = [ + { id: 'c1', lane: 'local', status: 'done' }, + { id: 'c2', lane: 'local', status: 'queued' }, + { id: 'c3', lane: 'local', status: 'queued' }, + ]; + expect(nextLocalContestant(slots)).toBe('c2'); + }); + + it('skips done/error local contestants and cloud contestants', () => { + const slots: ContestantSlot[] = [ + { id: 'c1', lane: 'cloud', status: 'queued' }, + { id: 'c2', lane: 'local', status: 'error' }, + { id: 'c3', lane: 'local', status: 'queued' }, + ]; + expect(nextLocalContestant(slots)).toBe('c3'); + }); + + it('returns null when all local contestants are done or error', () => { + const slots: ContestantSlot[] = [ + { id: 'c1', lane: 'local', status: 'done' }, + { id: 'c2', lane: 'local', status: 'error' }, + ]; + expect(nextLocalContestant(slots)).toBeNull(); + }); +}); + +// ─── isBattleComplete ──────────────────────────────────────────────────────── + +describe('isBattleComplete', () => { + it('returns false for an empty list', () => { + expect(isBattleComplete([])).toBe(false); + }); + + it('returns true when all contestants are done', () => { + expect(isBattleComplete([{ status: 'done' }, { status: 'done' }])).toBe(true); + }); + + it('returns true when all contestants are error', () => { + expect(isBattleComplete([{ status: 'error' }, { status: 'error' }])).toBe(true); + }); + + it('returns true for a mixed done/error result', () => { + expect(isBattleComplete([{ status: 'done' }, { status: 'error' }, { status: 'done' }])).toBe(true); + }); + + it('returns false while any contestant is still running', () => { + expect(isBattleComplete([{ status: 'done' }, { status: 'running' }])).toBe(false); + }); + + it('returns false while any contestant is still queued', () => { + expect(isBattleComplete([{ status: 'done' }, { status: 'queued' }])).toBe(false); + }); +}); + +// ─── computeBenchmark ──────────────────────────────────────────────────────── + +describe('computeBenchmark', () => { + const t0 = new Date('2026-06-06T10:00:00.000Z'); + const t1 = new Date('2026-06-06T10:00:05.000Z'); // +5 000ms + + it('computes duration in ms for both lanes', () => { + const local = computeBenchmark(t0, t1, 100, 'local'); + expect(local.durationMs).toBe(5000); + const cloud = computeBenchmark(t0, t1, null, 'cloud'); + expect(cloud.durationMs).toBe(5000); + }); + + it('computes tokens/sec for local lane when costTokens is known', () => { + const bench = computeBenchmark(t0, t1, 500, 'local'); + expect(bench.tokensPerSec).toBeCloseTo(100, 5); // 500 / 5 = 100 tok/s + }); + + it('omits tokens/sec for cloud lane regardless of costTokens', () => { + const bench = computeBenchmark(t0, t1, 500, 'cloud'); + expect(bench.tokensPerSec).toBeNull(); + }); + + it('omits tokens/sec for local lane when costTokens is null', () => { + const bench = computeBenchmark(t0, t1, null, 'local'); + expect(bench.tokensPerSec).toBeNull(); + }); + + it('returns durationMs = 0 and null tokensPerSec when timestamps are equal', () => { + const bench = computeBenchmark(t0, t0, 100, 'local'); + expect(bench.durationMs).toBe(0); + expect(bench.tokensPerSec).toBeNull(); + }); + + it('clamps negative duration to 0 (clock skew)', () => { + const bench = computeBenchmark(t1, t0, 50, 'local'); + expect(bench.durationMs).toBe(0); + expect(bench.tokensPerSec).toBeNull(); + }); +}); + +// ─── sanitizeSlug ──────────────────────────────────────────────────────────── + +describe('sanitizeSlug', () => { + it('lowercases and preserves alphanumeric + hyphens', () => { + expect(sanitizeSlug('claude')).toBe('claude'); + expect(sanitizeSlug('claude-opus-4-5')).toBe('claude-opus-4-5'); + }); + + it('replaces spaces and special characters with hyphens', () => { + expect(sanitizeSlug('Code Reviewer')).toBe('code-reviewer'); + expect(sanitizeSlug('native/boocode')).toBe('native-boocode'); + expect(sanitizeSlug('qwen2.5-coder-35b')).toBe('qwen2-5-coder-35b'); + }); + + it('collapses consecutive non-alphanumeric runs to a single hyphen', () => { + expect(sanitizeSlug('foo bar---baz')).toBe('foo-bar-baz'); + }); + + it('strips leading and trailing hyphens', () => { + expect(sanitizeSlug('---foo---')).toBe('foo'); + }); + + it('truncates to 64 characters', () => { + const long = 'a'.repeat(100); + expect(sanitizeSlug(long).length).toBe(64); + }); +}); + +// ─── buildBattleSlug ───────────────────────────────────────────────────────── + +describe('buildBattleSlug', () => { + it('builds a deterministic dated slug from id, type, and createdAt', () => { + const id = 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'; + const createdAt = new Date('2026-06-06T12:00:00.000Z'); + const slug = buildBattleSlug(id, 'coding', createdAt); + expect(slug).toBe('2026-06-06-coding-a1b2c3d4'); + }); + + it('includes the battle type in the slug', () => { + const id = 'aaaaaaaa-0000-0000-0000-000000000000'; + const createdAt = new Date('2026-01-01T00:00:00.000Z'); + expect(buildBattleSlug(id, 'qa', createdAt)).toContain('-qa-'); + expect(buildBattleSlug(id, 'coding', createdAt)).toContain('-coding-'); + }); + + it('uses the first 8 hex chars of the uuid (dashes stripped)', () => { + const id = 'deadbeef-0000-0000-0000-000000000000'; + const slug = buildBattleSlug(id, 'coding', new Date('2026-06-06T00:00:00Z')); + expect(slug.endsWith('-deadbeef')).toBe(true); + }); +}); + +// ─── buildContestantDir ────────────────────────────────────────────────────── + +describe('buildContestantDir', () => { + it('joins sanitized identity and model with a hyphen', () => { + expect(buildContestantDir('claude', 'claude-opus-4-5')).toBe('claude-claude-opus-4-5'); + }); + + it('sanitizes both parts independently', () => { + expect(buildContestantDir('Code Reviewer', 'qwen2.5-35b')).toBe('code-reviewer-qwen2-5-35b'); + }); +}); + +// ─── reconcileContestantResume ─────────────────────────────────────────────── + +describe('reconcileContestantResume', () => { + it('keeps non-running contestants regardless of task state', () => { + for (const status of ['queued', 'done', 'error']) { + expect(reconcileContestantResume(status, 'tid', 'completed')).toBe('keep'); + expect(reconcileContestantResume(status, null, null)).toBe('keep'); + } + }); + + it('re-dispatches a running contestant with no task_id', () => { + expect(reconcileContestantResume('running', null, null)).toBe('re-dispatch'); + }); + + it('re-dispatches a running contestant whose task row is absent', () => { + expect(reconcileContestantResume('running', 'tid', null)).toBe('re-dispatch'); + }); + + it('marks done when the task completed before the terminal callback ran', () => { + expect(reconcileContestantResume('running', 'tid', 'completed')).toBe('mark-done'); + }); + + it('marks error when the task failed', () => { + expect(reconcileContestantResume('running', 'tid', 'failed')).toBe('mark-error'); + }); + + it('marks cancelled when the task was cancelled', () => { + expect(reconcileContestantResume('running', 'tid', 'cancelled')).toBe('mark-cancelled'); + }); + + it('keeps a running contestant whose task is pending (dispatcher handles it)', () => { + expect(reconcileContestantResume('running', 'tid', 'pending')).toBe('keep'); + }); + + it('re-dispatches when the task is stuck running (process died)', () => { + expect(reconcileContestantResume('running', 'tid', 'running')).toBe('re-dispatch'); + }); + + it('re-dispatches when the task is blocked (permission dialog gone on restart)', () => { + expect(reconcileContestantResume('running', 'tid', 'blocked')).toBe('re-dispatch'); + }); +}); + +// ─── reconcileContestants ──────────────────────────────────────────────────── + +describe('reconcileContestants', () => { + it('returns one decision per contestant', () => { + const contestants = [ + { contestantId: 'c1', taskId: null, status: 'done' }, + { contestantId: 'c2', taskId: 't1', status: 'running' }, + { contestantId: 'c3', taskId: 't2', status: 'running' }, + ]; + const taskStates = new Map([['t1', 'completed'], ['t2', 'running']]); + const decisions = reconcileContestants(contestants, taskStates); + expect(decisions).toHaveLength(3); + expect(decisions[0]).toEqual({ contestantId: 'c1', action: 'keep' }); + expect(decisions[1]).toEqual({ contestantId: 'c2', action: 'mark-done' }); + expect(decisions[2]).toEqual({ contestantId: 'c3', action: 're-dispatch' }); + }); + + it('re-dispatches a running contestant whose taskId is absent from taskStates', () => { + const contestants = [{ contestantId: 'c1', taskId: 'orphan', status: 'running' }]; + const decisions = reconcileContestants(contestants, new Map()); + expect(decisions[0]?.action).toBe('re-dispatch'); + }); + + it('re-dispatches a running contestant with null taskId', () => { + const contestants = [{ contestantId: 'c1', taskId: null, status: 'running' }]; + const decisions = reconcileContestants(contestants, new Map()); + expect(decisions[0]?.action).toBe('re-dispatch'); + }); + + it('returns empty array for no contestants', () => { + expect(reconcileContestants([], new Map())).toEqual([]); + }); + + it('keeps a running contestant whose task is pending', () => { + const contestants = [{ contestantId: 'c1', taskId: 't1', status: 'running' }]; + const taskStates = new Map([['t1', 'pending']]); + const decisions = reconcileContestants(contestants, taskStates); + expect(decisions[0]?.action).toBe('keep'); + }); + + it('handles a mixed battle: done/queued kept, stale running re-dispatched', () => { + const contestants = [ + { contestantId: 'c1', taskId: 't1', status: 'done' }, + { contestantId: 'c2', taskId: null, status: 'queued' }, + { contestantId: 'c3', taskId: 't2', status: 'running' }, + { contestantId: 'c4', taskId: 't3', status: 'running' }, + ]; + const taskStates = new Map([ + ['t1', 'completed'], + ['t2', 'running'], // stuck — process dead + ['t3', 'pending'], // dispatcher will handle + ]); + const decisions = reconcileContestants(contestants, taskStates); + expect(decisions.find((d) => d.contestantId === 'c1')?.action).toBe('keep'); + expect(decisions.find((d) => d.contestantId === 'c2')?.action).toBe('keep'); + expect(decisions.find((d) => d.contestantId === 'c3')?.action).toBe('re-dispatch'); + expect(decisions.find((d) => d.contestantId === 'c4')?.action).toBe('keep'); + }); +}); diff --git a/apps/coder/src/services/arena-analyzer-helpers.ts b/apps/coder/src/services/arena-analyzer-helpers.ts new file mode 100644 index 0000000..89f7270 --- /dev/null +++ b/apps/coder/src/services/arena-analyzer-helpers.ts @@ -0,0 +1,191 @@ +/** + * Pure, side-effect-free helpers for the Arena analyzer. + * No DB, no IO, no network — safe to unit-test directly. + * + * Covers: digest-prompt assembly, judge-prompt assembly, winner extraction + * from the judge output, the <2-survivors no-winner rule, and the + * cross-examination prompt. + */ + +// ─── Shared types ───────────────────────────────────────────────────────────── + +export interface ContestantDigestInput { + identity: string; + model: string; + resultMd: string; + diffPatch?: string; + benchmarkLine: string; +} + +export interface ContestantDigest { + identity: string; + model: string; + digest: string; + benchmarkLine: string; +} + +// ─── Digest stage ───────────────────────────────────────────────────────────── + +/** + * Build the system + user prompts for the per-contestant digest call. + * The digest is a short structured summary; it keeps each call's context small + * so the downstream judge only sees digests (not raw diffs). + */ +export function buildDigestPrompt(input: ContestantDigestInput): { system: string; user: string } { + const system = + 'You are an expert technical analyst evaluating the output of an AI coding or Q&A battle. ' + + 'Produce a concise structured digest (under 300 words, Markdown bullet points) covering: ' + + '(1) correctness and quality, (2) completeness, (3) notable strengths, (4) notable weaknesses or issues. ' + + 'Do not reference the battle or other contestants — focus only on this submission.'; + + const parts: string[] = [ + `# Contestant: ${input.identity} / ${input.model}`, + `\nBenchmark: ${input.benchmarkLine}`, + '\n## Result\n', + input.resultMd.slice(0, 8_000), + ]; + + if (input.diffPatch) { + parts.push('\n## Code Changes (diff)\n```diff'); + parts.push(input.diffPatch.slice(0, 5_000)); + parts.push('```'); + } + + return { system, user: parts.join('\n') }; +} + +// ─── Judge stage ────────────────────────────────────────────────────────────── + +/** + * Build the system + user prompts for the comparative judge call. + * Receives contestant digests (NOT raw diffs) to keep context bounded. + * + * The judge output must contain a line starting with WINNER: or NO_WINNER. + * The caller extracts it with extractWinner(). + */ +export function buildJudgePrompt( + originalPrompt: string, + digests: ContestantDigest[], +): { system: string; user: string } { + const canName = shouldNameWinner(digests.length); + + const winnerInstruction = canName + ? 'After your comparative analysis, name the best submission on its own line in this exact format:\n' + + 'WINNER: /\n' + + 'where and exactly match the heading above. No other text on that line.' + : 'Fewer than 2 contestants succeeded. Do NOT name a winner. Write the following on its own line:\nNO_WINNER'; + + const system = + 'You are an expert judge for an AI battle. You have received digest summaries of each ' + + "contestant's work on the same task. Write a comparative analysis, then follow these instructions:\n" + + winnerInstruction; + + const parts: string[] = [ + '# Original Task Prompt\n', + originalPrompt.slice(0, 2_000), + '\n# Contestant Digests\n', + ]; + + for (const d of digests) { + parts.push(`\n## ${d.identity} / ${d.model}`); + parts.push(`Benchmark: ${d.benchmarkLine}`); + parts.push(d.digest); + } + + parts.push( + '\n# Instructions\nCompare the contestants and follow the winner-naming instructions above.', + ); + + return { system, user: parts.join('\n') }; +} + +// ─── No-winner rule ─────────────────────────────────────────────────────────── + +/** + * Returns true when enough contestants succeeded to name a winner. + * Rule: at least 2 must have produced a result. With 0 or 1 success the + * analysis must NOT name a winner (no meaningful comparison possible). + */ +export function shouldNameWinner(succeededCount: number): boolean { + return succeededCount >= 2; +} + +// ─── Winner extraction ──────────────────────────────────────────────────────── + +/** + * Parse the judge's text output and extract the declared winner. + * Looks for a line matching: WINNER: / + * Returns null when no valid winner line is found, or when the line contains + * NO_WINNER. + * + * The parse is lenient on surrounding whitespace and case for the keyword. + */ +export function extractWinner(judgeOutput: string): { identity: string; model: string } | null { + for (const line of judgeOutput.split('\n')) { + const trimmed = line.trim(); + if (!trimmed.toUpperCase().startsWith('WINNER:')) continue; + + const rest = trimmed.slice('WINNER:'.length).trim(); + if (rest.toUpperCase() === 'NO_WINNER' || rest === '') return null; + + const slashIdx = rest.indexOf('/'); + if (slashIdx === -1) return null; + + const identity = rest.slice(0, slashIdx).trim(); + const model = rest.slice(slashIdx + 1).trim(); + if (identity && model) return { identity, model }; + } + return null; +} + +// ─── Cross-examination stage ────────────────────────────────────────────────── + +/** + * Build the system + user prompts for a cross-examination call. + * The cross-examiner sees the original prompt, contestant digests, and the + * proposed analysis, and is asked to challenge the result. + */ +export function buildCrossExamPrompt(opts: { + originalPrompt: string; + digests: ContestantDigest[]; + analysisContent: string; + proposedWinner: string | null; + examinerIdentity: string; + examinerModel: string; +}): { system: string; user: string } { + const system = + `You are ${opts.examinerIdentity} (model: ${opts.examinerModel}), acting as an independent ` + + 'cross-examiner in an AI battle. Your role is to critically challenge the proposed analysis ' + + 'and winner, then give your own verdict. Be rigorous but fair. ' + + 'End your response with your verdict on its own line:\n' + + 'VERDICT: / — if you agree or disagree with the proposed winner but can name one\n' + + 'VERDICT: NO_WINNER — if no clear winner exists'; + + const parts: string[] = [ + '# Original Task Prompt\n', + opts.originalPrompt.slice(0, 2_000), + '\n# Contestant Digests\n', + ]; + + for (const d of opts.digests) { + parts.push(`\n## ${d.identity} / ${d.model}`); + parts.push(`Benchmark: ${d.benchmarkLine}`); + parts.push(d.digest); + } + + parts.push('\n# Proposed Analysis\n'); + parts.push(opts.analysisContent.slice(0, 5_000)); + + if (opts.proposedWinner) { + parts.push(`\n*(Proposed winner: ${opts.proposedWinner})*`); + } else { + parts.push('\n*(No winner was proposed — fewer than 2 contestants succeeded.)*'); + } + + parts.push( + '\n# Your Cross-Examination\n' + + 'Challenge the analysis above, then give your independent verdict (VERDICT: … on its own line).', + ); + + return { system, user: parts.join('\n') }; +} diff --git a/apps/coder/src/services/arena-analyzer.ts b/apps/coder/src/services/arena-analyzer.ts new file mode 100644 index 0000000..b6a3192 --- /dev/null +++ b/apps/coder/src/services/arena-analyzer.ts @@ -0,0 +1,496 @@ +/** + * Arena Analyzer — pluggable seam for battle analysis and cross-examination. + * + * The Analyzer interface is the plug point: a v2 Han Orchestrator flow can + * replace the v1 two-stage digest→judge implementation without a schema change. + * + * v1 implementation uses DEFAULT_MODEL via direct llama-swap calls (arenaModelCall): + * Digest stage — one call per succeeded contestant, concurrent; produces a + * bounded summary of each result (result.md + diff.patch for + * coding, result.md for Q&A). + * Judge stage — one call with all digests + the original prompt; writes + * analysis.md, names a winner (unless < 2 succeeded), and + * updates battles.winner_contestant_id. + * + * Cross-examination: + * Local model — direct arenaModelCall to llama-swap with the chosen model. + * Cloud model — inserts a tasks row (triggers the dispatcher via pg_notify); + * polls for completion; reads output_summary as the verdict. + * In both cases the verdict is written to cross_examinations.verdict, appended + * to /cross-exam.md, and a battle_updated frame is published. + * + * Never throws — all errors are caught, logged, and swallowed so the caller + * (arena-runner's onBattleComplete / onCrossExamStart) is never wedged. + */ + +import { readFile, writeFile, mkdir } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { Sql } from '../db.js'; +import type { Broker } from '@boocode/server/broker'; +import type { WsFrame } from '@boocode/contracts/ws-frames'; +import type { FastifyBaseLogger } from 'fastify'; +import type { Config } from '../config.js'; +import type { BattleType } from '@boocode/contracts/arena'; +import { arenaModelCall } from './arena-model-call.js'; +import { + buildDigestPrompt, + buildJudgePrompt, + buildCrossExamPrompt, + extractWinner, + shouldNameWinner, + type ContestantDigest, +} from './arena-analyzer-helpers.js'; + +// ─── Public interface ───────────────────────────────────────────────────────── + +/** Pluggable analysis seam — swap to a Han Orchestrator flow in v2. */ +export interface Analyzer { + /** Run the two-stage digest→judge analysis for a completed battle. */ + analyze(battleId: string): Promise; + /** + * Run a cross-examination for an already-inserted cross_examinations row. + * The result is written back to that row and a battle_updated frame is published. + */ + crossExamine( + battleId: string, + crossExamId: string, + opts: { identity: string; model: string }, + ): Promise; +} + +// ─── Internal DB row types ──────────────────────────────────────────────────── + +interface BattleRow { + id: string; + project_id: string; + battle_type: BattleType; + prompt: string; + status: string; + results_path: string | null; + winner_contestant_id: string | null; +} + +interface ContestantRow { + id: string; + identity: string; + model: string; + lane: string; + status: string; + result_path: string | null; + duration_ms: number | null; + tokens_per_sec: number | null; +} + +// ─── Factory ────────────────────────────────────────────────────────────────── + +interface AnalyzerDeps { + sql: Sql; + broker: Broker; + log: FastifyBaseLogger; + config: Pick; + /** Model IDs served by local llama-swap — cross-exam routing uses this. */ + localModels: ReadonlySet; +} + +export function createAnalyzer(deps: AnalyzerDeps): Analyzer { + const { sql, broker, log, config, localModels } = deps; + + // ─── analyze ────────────────────────────────────────────────────────────── + + async function analyze(battleId: string): Promise { + try { + await runAnalysis(battleId); + } catch (err) { + log.error( + { err: errMsg(err), battleId }, + 'arena-analyzer: analysis failed', + ); + } + } + + async function runAnalysis(battleId: string): Promise { + const battle = await loadBattle(battleId); + if (!battle) { + log.warn({ battleId }, 'arena-analyzer: battle not found'); + return; + } + + const contestants = await loadContestants(battleId); + const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path); + + log.info( + { battleId, total: contestants.length, succeeded: succeeded.length }, + 'arena-analyzer: starting analysis', + ); + + // Digest stage — concurrent, one call per succeeded contestant. + const digests = ( + await Promise.all(succeeded.map((c) => digestContestant(battle, c))) + ).filter((d): d is ContestantDigest => d !== null); + + // Failed contestants are noted in the analysis even if they produced no digest. + const failedNotes = contestants + .filter((c) => c.status === 'error') + .map((c) => `- **${c.identity} / ${c.model}**: failed (no result)\n`); + + // Judge stage — single call with all digests. + const { analysisText, winner } = await judgeContestants(battle, digests, failedNotes); + + // Write analysis.md to the battle results folder. + const resultsPath = battle.results_path; + if (resultsPath) { + await mkdir(resultsPath, { recursive: true }); + await writeFile(join(resultsPath, 'analysis.md'), analysisText, 'utf8'); + } + + // Resolve the winner to a contestant id and update the battle row. + let winnerId: string | null = null; + if (winner && shouldNameWinner(succeeded.length)) { + const winnerContestant = contestants.find( + (c) => c.identity === winner.identity && c.model === winner.model, + ); + if (winnerContestant) { + winnerId = winnerContestant.id; + await sql` + UPDATE battles + SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp() + WHERE id = ${battleId} + `; + log.info({ battleId, winnerId, identity: winner.identity, model: winner.model }, 'arena-analyzer: winner set'); + } else { + log.warn({ battleId, winner }, 'arena-analyzer: judge named a winner not found in contestants'); + } + } + + publishUser({ + type: 'battle_updated', + battle_id: battleId, + winner_contestant_id: winnerId, + analysis_ready: true, + }); + + log.info({ battleId }, 'arena-analyzer: analysis complete'); + } + + // ─── crossExamine ───────────────────────────────────────────────────────── + + async function crossExamine( + battleId: string, + crossExamId: string, + opts: { identity: string; model: string }, + ): Promise { + try { + await runCrossExam(battleId, crossExamId, opts); + } catch (err) { + log.error( + { err: errMsg(err), battleId, crossExamId }, + 'arena-analyzer: cross-exam failed', + ); + } + } + + async function runCrossExam( + battleId: string, + crossExamId: string, + opts: { identity: string; model: string }, + ): Promise { + const battle = await loadBattle(battleId); + if (!battle) { + log.warn({ battleId }, 'arena-analyzer: battle not found for cross-exam'); + return; + } + + const contestants = await loadContestants(battleId); + + // Re-read the digests (if contestants have results) for context. + const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path); + const digests = ( + await Promise.all(succeeded.map((c) => digestContestant(battle, c))) + ).filter((d): d is ContestantDigest => d !== null); + + // Read analysis.md for the proposed analysis content. + let analysisContent = ''; + if (battle.results_path) { + analysisContent = await readFile( + join(battle.results_path, 'analysis.md'), 'utf8', + ).catch(() => ''); + } + + // Resolve proposed winner label. + let proposedWinner: string | null = null; + if (battle.winner_contestant_id) { + const w = contestants.find((c) => c.id === battle.winner_contestant_id); + if (w) proposedWinner = `${w.identity}/${w.model}`; + } + + const { system, user } = buildCrossExamPrompt({ + originalPrompt: battle.prompt, + digests, + analysisContent, + proposedWinner, + examinerIdentity: opts.identity, + examinerModel: opts.model, + }); + + log.info({ battleId, crossExamId, identity: opts.identity, model: opts.model }, 'arena-analyzer: running cross-exam'); + + const verdict = await executeModelCall({ + battleId, + projectId: battle.project_id, + identity: opts.identity, + model: opts.model, + system, + user, + }); + + // Persist verdict and append to cross-exam.md. + await sql` + UPDATE cross_examinations + SET verdict = ${verdict} + WHERE id = ${crossExamId} + `; + + if (battle.results_path) { + const crossExamPath = join(battle.results_path, 'cross-exam.md'); + const section = + `\n---\n\n# Cross-Examination by ${opts.identity} / ${opts.model}\n\n` + + `${verdict}\n`; + await writeFile(crossExamPath, section, { flag: 'a', encoding: 'utf8' }); + } + + publishUser({ + type: 'battle_updated', + battle_id: battleId, + cross_exam_id: crossExamId, + }); + + log.info({ battleId, crossExamId }, 'arena-analyzer: cross-exam complete'); + } + + // ─── Model call routing ─────────────────────────────────────────────────── + + /** + * Route a one-shot model call to llama-swap (local) or the task dispatcher + * (cloud). Cloud dispatch inserts a tasks row and polls for completion. + */ + async function executeModelCall(opts: { + battleId: string; + projectId: string; + identity: string; + model: string; + system: string; + user: string; + }): Promise { + const isLocal = localModels.has(opts.model) || localModels.has(`llama-swap/${opts.model}`); + + if (isLocal) { + return arenaModelCall({ + config, + model: opts.model, + system: opts.system, + user: opts.user, + maxTokens: 2_000, + temperature: 0.3, + }); + } + + // Cloud path: dispatch through the task system and poll for completion. + return executeCloudModelCall(opts); + } + + async function executeCloudModelCall(opts: { + projectId: string; + identity: string; + model: string; + system: string; + user: string; + }): Promise { + // The cross-exam prompt is the full input to the external agent. We embed + // the system prompt as a preamble in the user message (external agents don't + // take a separate system arg through the tasks dispatcher). + const input = `${opts.system}\n\n${opts.user}`; + + // For well-known external agents, stamp the agent name so the dispatcher + // routes via PTY/ACP. For unknown identities fall back to native inference + // (agent = null → DEFAULT_MODEL text generation). + const knownAgents = new Set(['claude', 'opencode', 'qwen', 'goose']); + const agentName = knownAgents.has(opts.identity) ? opts.identity : null; + + const [task] = await sql<{ id: string }[]>` + INSERT INTO tasks (project_id, input, agent, model) + VALUES (${opts.projectId}, ${input}, ${agentName}, ${opts.model}) + RETURNING id + `; + const taskId = task!.id; + + log.info({ taskId, identity: opts.identity, model: opts.model }, 'arena-analyzer: cloud cross-exam task dispatched'); + + // Poll until terminal (up to 5 minutes). + const timeoutMs = 5 * 60 * 1_000; + const pollMs = 2_000; + const deadline = Date.now() + timeoutMs; + + while (Date.now() < deadline) { + await sleep(pollMs); + const [row] = await sql<{ state: string; output_summary: string | null }[]>` + SELECT state, output_summary FROM tasks WHERE id = ${taskId} + `; + if (!row) break; + if (row.state === 'completed') return row.output_summary ?? ''; + if (row.state === 'failed' || row.state === 'cancelled') { + throw new Error(`cross-exam task ${row.state}: ${row.output_summary ?? ''}`); + } + } + + throw new Error(`cloud cross-exam task timed out after ${timeoutMs / 1000}s`); + } + + // ─── Digest helper ──────────────────────────────────────────────────────── + + async function digestContestant( + battle: BattleRow, + c: ContestantRow, + ): Promise { + if (!c.result_path) return null; + + const resultMd = await readFile(join(c.result_path, 'result.md'), 'utf8').catch(() => ''); + + let diffPatch: string | undefined; + if (battle.battle_type === 'coding') { + diffPatch = await readFile(join(c.result_path, 'diff.patch'), 'utf8').catch( + () => undefined, + ); + } + + const benchmarkLine = formatBenchmarkLine(c); + const { system, user } = buildDigestPrompt({ + identity: c.identity, + model: c.model, + resultMd, + diffPatch, + benchmarkLine, + }); + + let digest: string; + try { + digest = await arenaModelCall({ + config, + model: config.DEFAULT_MODEL, + system, + user, + maxTokens: 500, + temperature: 0.3, + }); + } catch (err) { + log.warn( + { err: errMsg(err), identity: c.identity, model: c.model }, + 'arena-analyzer: digest call failed — skipping contestant', + ); + return null; + } + + return { identity: c.identity, model: c.model, digest, benchmarkLine }; + } + + // ─── Judge helper ───────────────────────────────────────────────────────── + + async function judgeContestants( + battle: BattleRow, + digests: ContestantDigest[], + failedNotes: string[], + ): Promise<{ analysisText: string; winner: { identity: string; model: string } | null }> { + const { system, user } = buildJudgePrompt(battle.prompt, digests); + + let judgeOutput = ''; + try { + judgeOutput = await arenaModelCall({ + config, + model: config.DEFAULT_MODEL, + system, + user, + maxTokens: 2_000, + temperature: 0.3, + }); + } catch (err) { + log.error({ err: errMsg(err), battleId: battle.id }, 'arena-analyzer: judge call failed'); + judgeOutput = '*(Judge call failed — no comparison produced.)*'; + } + + const winner = shouldNameWinner(digests.length) ? extractWinner(judgeOutput) : null; + + const sections: string[] = [ + `# Arena Analysis`, + `\n**Battle type:** ${battle.battle_type}`, + ]; + + if (failedNotes.length > 0) { + sections.push('\n## Failed Contestants\n'); + sections.push(...failedNotes); + } + + if (digests.length > 0) { + sections.push('\n## Contestant Digests\n'); + for (const d of digests) { + sections.push(`### ${d.identity} / ${d.model}`); + sections.push(`*Benchmark: ${d.benchmarkLine}*\n`); + sections.push(d.digest); + } + } + + sections.push("\n## Judge's Verdict\n"); + sections.push(judgeOutput); + + if (winner) { + sections.push(`\n## Winner\n**${winner.identity} / ${winner.model}**`); + } else { + const reason = + digests.length < 2 + ? 'fewer than 2 contestants produced results' + : 'no clear winner identified'; + sections.push(`\n## Winner\n*No winner named (${reason}).*`); + } + + return { analysisText: sections.join('\n'), winner }; + } + + // ─── DB helpers ─────────────────────────────────────────────────────────── + + async function loadBattle(battleId: string): Promise { + const [b] = await sql` + SELECT id, project_id, battle_type, prompt, status, results_path, winner_contestant_id + FROM battles WHERE id = ${battleId} + `; + return b ?? null; + } + + async function loadContestants(battleId: string): Promise { + return sql` + SELECT id, identity, model, lane, status, result_path, duration_ms, tokens_per_sec + FROM contestants WHERE battle_id = ${battleId} + ORDER BY created_at ASC + `; + } + + // ─── Misc helpers ───────────────────────────────────────────────────────── + + function formatBenchmarkLine(c: ContestantRow): string { + const parts: string[] = []; + if (c.duration_ms !== null) parts.push(`${c.duration_ms}ms`); + if (c.tokens_per_sec !== null) parts.push(`${c.tokens_per_sec.toFixed(1)} tok/s`); + return parts.length > 0 ? parts.join(', ') : 'no benchmark'; + } + + function publishUser(frame: Record): void { + broker.publishUserFrame('default', frame as unknown as WsFrame); + } + + function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); + } + + return { analyze, crossExamine }; +} + +function errMsg(e: unknown): string { + return e instanceof Error ? e.message : String(e); +} diff --git a/apps/coder/src/services/arena-decisions.ts b/apps/coder/src/services/arena-decisions.ts new file mode 100644 index 0000000..fd2d642 --- /dev/null +++ b/apps/coder/src/services/arena-decisions.ts @@ -0,0 +1,186 @@ +/** + * Pure scheduling and classification decisions for the Arena battle-runner. + * No database, no IO. Mirrors the pattern of flow-runner-decisions.ts. + * + * Vocabulary: + * local lane — llama-swap-backed contestants, run strictly one at a time + * cloud lane — cloud-backed contestants, run all in parallel + * + * A contestant's status lifecycle: + * queued → running → done | error + */ +import type { BattleType, ContestantLane } from '@boocode/contracts/arena'; + +// ─── Lane classification ────────────────────────────────────────────────────── + +/** + * Classify a contestant into a lane. + * + * Q&A contestants always run on the native (llama-swap) backend → local. + * Coding contestants: their MODEL is checked against the localModels set + * (all model IDs served by the local llama-swap server). This means an + * opencode or qwen contestant pointed at a local model counts as local, + * which correctly captures GPU-contention and fair benchmarking (ADR 0001). + * + * @param battleType 'coding' | 'qa' + * @param identity backend name (coding) or persona name (qa) — not used for lane logic + * @param model the contestant's model id + * @param localModels set of model IDs served by the local llama-swap server + */ +export function classifyLane( + battleType: BattleType, + _identity: string, + model: string, + localModels: ReadonlySet, +): ContestantLane { + if (battleType === 'qa') return 'local'; + return localModels.has(model) ? 'local' : 'cloud'; +} + +// ─── Local-lane queue ───────────────────────────────────────────────────────── + +export interface ContestantSlot { + id: string; + lane: ContestantLane; + status: string; +} + +/** + * The next queued local contestant to dispatch — the first 'queued' contestant + * in the local lane, in creation order (caller must supply rows in created_at ASC). + * Returns null when the local queue is empty or all local slots are non-queued. + */ +export function nextLocalContestant(contestants: readonly ContestantSlot[]): string | null { + for (const c of contestants) { + if (c.lane === 'local' && c.status === 'queued') return c.id; + } + return null; +} + +// ─── Battle completion ──────────────────────────────────────────────────────── + +/** + * True when every contestant has reached a terminal state (done | error). + * Returns false for an empty list — a battle with no contestants never completes. + */ +export function isBattleComplete(contestants: readonly { status: string }[]): boolean { + if (contestants.length === 0) return false; + return contestants.every((c) => c.status === 'done' || c.status === 'error'); +} + +// ─── Benchmark ──────────────────────────────────────────────────────────────── + +export interface Benchmark { + durationMs: number; + tokensPerSec: number | null; +} + +/** + * Compute the benchmark for a contestant. + * Wall-clock duration is captured for every contestant; tokens/sec is only + * meaningful for local (llama-swap) contestants where the model has sole + * access to the GPU and the measurement is fair. + */ +export function computeBenchmark( + startedAt: Date, + endedAt: Date, + costTokens: number | null, + lane: ContestantLane, +): Benchmark { + const durationMs = Math.max(0, endedAt.getTime() - startedAt.getTime()); + const tokensPerSec = + lane === 'local' && costTokens !== null && durationMs > 0 + ? (costTokens / durationMs) * 1000 + : null; + return { durationMs, tokensPerSec }; +} + +// ─── Slug / path helpers ────────────────────────────────────────────────────── + +/** + * Sanitize a string for use as a directory name component. + * Lowercases, replaces non-alphanumeric runs with '-', trims leading/trailing + * dashes, and caps at 64 characters. + */ +export function sanitizeSlug(s: string): string { + return s + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 64); +} + +/** + * Build the dated battle slug used as the Arena results folder name. + * Format: YYYY-MM-DD-- + * Deterministic: callers can rebuild it from (id, type, created_at) on resume. + */ +export function buildBattleSlug(battleId: string, battleType: BattleType, createdAt: Date): string { + const date = createdAt.toISOString().slice(0, 10); + const shortId = battleId.replace(/-/g, '').slice(0, 8); + return `${date}-${battleType}-${shortId}`; +} + +/** + * Build the per-contestant results directory name within a battle folder. + * Format: - + */ +export function buildContestantDir(identity: string, model: string): string { + return `${sanitizeSlug(identity)}-${sanitizeSlug(model)}`; +} + +// ─── Resume reconciliation ──────────────────────────────────────────────────── + +export type ContestantResumeAction = + | 'keep' + | 're-dispatch' + | 'mark-done' + | 'mark-error' + | 'mark-cancelled'; + +export interface ContestantResumeDecision { + contestantId: string; + action: ContestantResumeAction; +} + +/** + * Decide what to do with ONE contestant during startup resume. + * Mirrors reconcileResumeStep from flow-runner-decisions.ts. + * + * @param status contestants.status + * @param taskId contestants.task_id (null when not yet dispatched) + * @param taskState tasks.state for taskId, or null if the task row is absent + */ +export function reconcileContestantResume( + status: string, + taskId: string | null, + taskState: string | null, +): ContestantResumeAction { + if (status !== 'running') return 'keep'; + if (!taskId || taskState === null) return 're-dispatch'; + switch (taskState) { + case 'completed': return 'mark-done'; + case 'failed': return 'mark-error'; + case 'cancelled': return 'mark-cancelled'; + case 'pending': return 'keep'; // dispatcher startup poll will run it normally + default: return 're-dispatch'; // 'running'/'blocked' — process is dead + } +} + +/** + * Reconcile every contestant of an in-flight battle for startup resume. + * Returns one decision per contestant. Pure — no IO. + */ +export function reconcileContestants( + contestants: ReadonlyArray<{ contestantId: string; taskId: string | null; status: string }>, + taskStates: ReadonlyMap, +): ContestantResumeDecision[] { + return contestants.map((c) => ({ + contestantId: c.contestantId, + action: reconcileContestantResume( + c.status, + c.taskId, + c.taskId ? (taskStates.get(c.taskId) ?? null) : null, + ), + })); +} diff --git a/apps/coder/src/services/arena-model-call.ts b/apps/coder/src/services/arena-model-call.ts new file mode 100644 index 0000000..35c95eb --- /dev/null +++ b/apps/coder/src/services/arena-model-call.ts @@ -0,0 +1,70 @@ +/** + * One-shot model completion for the Arena analyzer. + * + * Calls the local llama-swap server directly for a single non-streaming + * completion. Used for the digest and judge stages (always DEFAULT_MODEL) + * and for local-model cross-examinations (any local model). + * + * Mirrors apps/server/src/services/task-model.ts but targets the coder's + * config shape and uses a longer timeout appropriate for analysis calls. + */ + +import type { Config } from '../config.js'; + +const TIMEOUT_MS = 120_000; + +export async function arenaModelCall(opts: { + config: Pick; + model: string; + system: string; + user: string; + maxTokens?: number; + temperature?: number; +}): Promise { + const { config, model, system, user } = opts; + const maxTokens = opts.maxTokens ?? 2_000; + const temperature = opts.temperature ?? 0.3; + + const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model, + messages: [ + { role: 'system', content: system }, + { role: 'user', content: user }, + ], + max_tokens: maxTokens, + temperature, + stream: false, + chat_template_kwargs: { enable_thinking: false }, + }), + signal: AbortSignal.timeout(TIMEOUT_MS), + }); + + if (!res.ok) { + const text = await res.text().catch(() => ''); + throw new Error(`llama-swap responded ${res.status}: ${text.slice(0, 200)}`); + } + + const data = (await res.json()) as { + choices?: Array<{ + message?: { content?: string; reasoning_content?: string }; + }>; + }; + + const choice = data.choices?.[0]?.message; + if (!choice) return ''; + + const content = (choice.content ?? '').trim(); + if (content.length > 0) return content; + + // For thinking-mode models the answer sometimes only lands in reasoning_content. + const reasoning = (choice.reasoning_content ?? '').trim(); + if (reasoning.length > 0) { + const lines = reasoning.split('\n').filter((l) => l.trim().length > 0); + return lines[lines.length - 1] ?? ''; + } + + return ''; +} diff --git a/apps/coder/src/services/arena-runner.ts b/apps/coder/src/services/arena-runner.ts new file mode 100644 index 0000000..fa92962 --- /dev/null +++ b/apps/coder/src/services/arena-runner.ts @@ -0,0 +1,895 @@ +/** + * Arena battle-runner — DB-backed execution engine for Arena battles. + * + * Mirrors flow-runner.ts but implements the Arena's two-lane scheduler instead + * of the Orchestrator's wave scheduler. Persists to battles/contestants tables + * (not flow_runs/flow_steps). Each contestant is dispatched as a real tasks row + * via an injected DispatchContestantFn (Phase 4 wires this to the dispatcher). + * Advances on the dispatcher's onTaskTerminal hook. + * + * Scheduling: + * - Cloud lane: all contestants start immediately, in parallel. + * - Local lane: contestants run strictly one at a time (serial queue). Only + * the first local contestant runs at start; the next is dispatched when the + * current one terminates. Both lanes run concurrently with each other. + * + * Results: + * Written to /Arena//-/ + * Coding: result.md + diff.patch (from the contestant's worktree). + * Q&A: result.md with the text answer. + * + * Analyzer seam: + * onBattleComplete is called when all contestants are terminal. Phase 5 wires + * this to the two-stage digest→judge analyzer. A failed contestant does NOT + * abort the battle — others continue and the analyzer judges survivors. + */ +import type { Sql } from '../db.js'; +import type { Broker } from '@boocode/server/broker'; +import type { WsFrame } from '@boocode/contracts/ws-frames'; +import type { FastifyBaseLogger } from 'fastify'; +import type { BattleType, ContestantLane } from '@boocode/contracts/arena'; +import { mkdir, writeFile } from 'node:fs/promises'; +import { join } from 'node:path'; +import { diffWorktree } from './worktrees.js'; +import { + buildBattleSlug, + buildContestantDir, + classifyLane, + computeBenchmark, + isBattleComplete, + nextLocalContestant, + reconcileContestants, + type ContestantResumeAction, + type ContestantSlot, +} from './arena-decisions.js'; + +// ─── Public types ───────────────────────────────────────────────────────────── + +export interface ContestantSpec { + /** Backend name (coding) or persona name (qa). */ + identity: string; + model: string; +} + +export interface BattleStartOpts { + projectId: string; + battleType: BattleType; + prompt: string; + /** 2–6 contestants. Duplicate (identity, model) pairs are rejected by the schema UNIQUE constraint. */ + contestants: ContestantSpec[]; +} + +/** + * Injected dispatch function — Phase 4 wires this to the real task inserter. + * Must INSERT a tasks row and return its id. The arena-runner sets the + * contestant's task_id and status after this call. + * `sessionId` is returned when already known (Q&A pre-creates the session); + * null for coding contestants whose session is created lazily by the dispatcher. + */ +export type DispatchContestantFn = (opts: { + projectId: string; + contestantId: string; + prompt: string; + identity: string; + model: string; + battleType: BattleType; +}) => Promise<{ taskId: string; sessionId: string | null }>; + +/** + * Called once when every contestant in a battle has reached a terminal state. + * Phase 5 wires this to the two-stage digest→judge analyzer. + * Must never throw — the caller swallows errors. + */ +export type OnBattleComplete = (battleId: string) => void; + +/** + * Called after a cross_examinations row has been inserted, with its id. + * Phase 5 wires this to the analyzer's cross-examination runner. + * Must never throw — the caller swallows errors. + */ +export type OnCrossExamStart = (opts: { + battleId: string; + crossExamId: string; + identity: string; + model: string; +}) => void; + +export interface BattleRunner { + /** Start a battle: persist it + its contestants, classify lanes, dispatch initial wave. */ + startBattle(opts: BattleStartOpts): Promise<{ battleId: string }>; + /** + * Wire to createDispatcher({ onTaskTerminal }). Fires when ANY task settles; + * the runner ignores tasks it doesn't own. Never throws. + */ + handleTaskTerminal(taskId: string, state: string): void; + /** + * Re-advance any battles still marked 'running' after a coder restart. + * Mirrors flow-runner's initResume (D-9). Never throws. + */ + initResume(): Promise; + /** + * Cancel a running battle. Marks it and all non-terminal contestants cancelled, + * publishes frames, and returns the task_ids of in-flight contestants so the + * route can abort them via the dispatcher's cancelExternalTask. + */ + cancelBattle(battleId: string): Promise<{ cancelled: boolean; taskIds: string[] }>; + /** + * Trigger analysis for a completed (or manually re-analyzed) battle. + * Phase 5 wires this to the two-stage digest→judge analyzer. For now, calls + * the injected onBattleComplete seam directly. + */ + triggerAnalysis(battleId: string): Promise<{ triggered: boolean }>; + /** + * Start a cross-examination on a battle. Inserts a cross_examinations row and + * invokes the analyzer seam. Phase 5 fills the actual verdict logic. + */ + startCrossExam( + battleId: string, + opts: { identity: string; model: string }, + ): Promise<{ crossExamId: string }>; + /** + * Manually set (or clear) the winner. Validates the contestant belongs to the + * battle, updates battles.winner_contestant_id, and publishes a battle_updated + * frame so the pane reflects the override immediately. + */ + setWinner(battleId: string, winnerId: string | null): Promise<{ + ok: boolean; + notFound?: boolean; + invalidContestant?: boolean; + }>; +} + +// ─── Internal row shapes ────────────────────────────────────────────────────── + +interface ContestantRow { + id: string; + battle_id: string; + identity: string; + model: string; + lane: ContestantLane; + task_id: string | null; + worktree_id: string | null; + status: string; +} + +interface BattleRow { + id: string; + project_id: string; + battle_type: BattleType; + prompt: string; + status: string; + results_path: string | null; + created_at: Date; +} + +// ─── Deps / factory ─────────────────────────────────────────────────────────── + +interface Deps { + sql: Sql; + broker: Broker; + log: FastifyBaseLogger; + dispatch: DispatchContestantFn; + onBattleComplete: OnBattleComplete; + /** + * Called after a cross_examinations row is inserted. Phase 5 wires this to + * the analyzer's cross-examination runner. Optional: absent → no cross-exam + * logic runs (stub behaviour for tests). + */ + onCrossExamStart?: OnCrossExamStart; + /** + * Model IDs served by the local llama-swap server. Used for lane classification: + * a contestant whose model is in this set runs in the local lane (serial, GPU-fair). + * Q&A contestants are always local regardless of this set. + * Defaults to an empty set → all coding contestants go to the cloud lane. + */ + localModels?: ReadonlySet; +} + +const DEFAULT_LOCAL_MODELS: ReadonlySet = new Set(); + +export function createBattleRunner(deps: Deps): BattleRunner { + const { sql, broker, log, dispatch, onBattleComplete, onCrossExamStart } = deps; + const localModels = deps.localModels ?? DEFAULT_LOCAL_MODELS; + + // Serialize local-lane advance per battle so two near-simultaneous terminal + // callbacks don't double-dispatch the next local contestant. + const advanceChain = new Map>(); + + // Delta bridge: per-contestant broker unsubscribe functions. + // 'terminated' sentinel prevents a late-arriving setupDeltaBridge from + // registering a subscription that would never be cleaned up. + const deltaUnsubs = new Map void) | 'terminated'>(); + + function publishUser(frame: Record): void { + broker.publishUserFrame('default', frame as unknown as WsFrame); + } + + /** + * Subscribe to the contestant's inference session and forward delta frames + * to the user channel as contestant_updated{delta}. Polls for session_id + * when not immediately known (coding contestants whose session is created + * lazily by the dispatcher). Unsubscribes on termination or max retries. + */ + async function setupDeltaBridge( + battleId: string, + contestantId: string, + taskId: string, + knownSessionId: string | null, + ): Promise { + let sessionId = knownSessionId; + if (!sessionId) { + // Coding contestant: session_id is written by the dispatcher just before + // inference starts. Poll until it appears or the contestant terminates. + for (let i = 0; i < 50; i++) { + if (deltaUnsubs.get(contestantId) === 'terminated') return; + const [row] = await sql<{ session_id: string | null }[]>` + SELECT session_id FROM tasks WHERE id = ${taskId} + `.catch(() => []); + if (row?.session_id) { sessionId = row.session_id; break; } + await new Promise((r) => setTimeout(r, 200)); + } + } + if (!sessionId) return; + if (deltaUnsubs.get(contestantId) === 'terminated') return; + + const unsub = broker.subscribe(sessionId, (frame) => { + if (frame.type === 'delta') { + const deltaContent = (frame as unknown as { content?: unknown }).content; + if (typeof deltaContent === 'string') { + publishUser({ + type: 'contestant_updated', + battle_id: battleId, + contestant_id: contestantId, + delta: deltaContent, + }); + } + } + }); + + const existing = deltaUnsubs.get(contestantId); + if (existing === 'terminated') { + unsub(); + } else { + deltaUnsubs.set(contestantId, unsub); + } + } + + function teardownDeltaBridge(contestantId: string): void { + const entry = deltaUnsubs.get(contestantId); + if (typeof entry === 'function') { + entry(); + deltaUnsubs.delete(contestantId); + } else { + deltaUnsubs.set(contestantId, 'terminated'); + } + } + + // ─── startBattle ──────────────────────────────────────────────────────────── + + async function startBattle(opts: BattleStartOpts): Promise<{ battleId: string }> { + if (opts.contestants.length < 2 || opts.contestants.length > 6) { + throw new Error(`battle requires 2–6 contestants; got ${opts.contestants.length}`); + } + + const [proj] = await sql<{ path: string }[]>`SELECT path FROM projects WHERE id = ${opts.projectId}`; + if (!proj) throw new Error(`project not found: ${opts.projectId}`); + + // Insert the battle row as 'running'; update results_path once we have the id. + const [battle] = await sql<{ id: string; created_at: Date }[]>` + INSERT INTO battles (project_id, battle_type, prompt, status) + VALUES (${opts.projectId}, ${opts.battleType}, ${opts.prompt}, 'running') + RETURNING id, created_at + `; + const battleId = battle!.id; + const battleSlug = buildBattleSlug(battleId, opts.battleType, battle!.created_at); + const resultsPath = join(proj.path, 'Arena', battleSlug); + + await sql` + UPDATE battles SET results_path = ${resultsPath}, updated_at = clock_timestamp() + WHERE id = ${battleId} + `; + + // Insert all contestant rows with lane classification. + const contestantRows: Array<{ id: string; identity: string; model: string; lane: ContestantLane }> = []; + for (const spec of opts.contestants) { + const lane = classifyLane(opts.battleType, spec.identity, spec.model, localModels); + const [row] = await sql<{ id: string }[]>` + INSERT INTO contestants (battle_id, identity, model, lane, status) + VALUES (${battleId}, ${spec.identity}, ${spec.model}, ${lane}, 'queued') + RETURNING id + `; + contestantRows.push({ id: row!.id, identity: spec.identity, model: spec.model, lane }); + } + + // Write initial manifest so the results folder is always populated. + await writeManifest( + battleId, resultsPath, opts.battleType, opts.prompt, battle!.created_at, + contestantRows.map((c) => ({ identity: c.identity, model: c.model, lane: c.lane })), + null, + ).catch((err) => { + log.warn({ err: errMsg(err), battleId }, 'arena-runner: initial manifest write failed'); + }); + + publishUser({ + type: 'battle_started', + battle_id: battleId, + battle_type: opts.battleType, + prompt: opts.prompt, + contestants: contestantRows.map((c) => ({ + id: c.id, + identity: c.identity, + model: c.model, + lane: c.lane, + })), + }); + + // Dispatch: cloud lane starts all contestants in parallel; local lane starts + // only the first queued contestant (serial queue). + let localStarted = false; + for (const c of contestantRows) { + if (c.lane === 'cloud') { + await dispatchContestant(battleId, opts.projectId, opts.battleType, opts.prompt, c); + } else if (!localStarted) { + await dispatchContestant(battleId, opts.projectId, opts.battleType, opts.prompt, c); + localStarted = true; + // remaining local contestants stay 'queued' until this one finishes + } + } + + return { battleId }; + } + + async function dispatchContestant( + battleId: string, + projectId: string, + battleType: BattleType, + prompt: string, + c: { id: string; identity: string; model: string; lane: ContestantLane }, + ): Promise { + const { taskId, sessionId } = await dispatch({ + projectId, + contestantId: c.id, + prompt, + identity: c.identity, + model: c.model, + battleType, + }); + await sql` + UPDATE contestants + SET task_id = ${taskId}, status = 'running', updated_at = clock_timestamp() + WHERE id = ${c.id} + `; + publishContestantFrame(battleId, c.id, { status: 'running' }); + // Start the delta bridge in the background; unsubscribe when the contestant + // terminates (teardownDeltaBridge called in handleTaskTerminal). + void setupDeltaBridge(battleId, c.id, taskId, sessionId ?? null); + } + + // ─── local-lane advance (serialized per battle) ─────────────────────────── + + function advanceLocalLane(battleId: string): Promise { + const prev = advanceChain.get(battleId) ?? Promise.resolve(); + const next = prev + .catch(() => {}) + .then(() => + advanceLocalLaneInner(battleId).catch((err) => { + log.error({ err: errMsg(err), battleId }, 'arena-runner: advanceLocalLane failed'); + }), + ); + advanceChain.set(battleId, next); + void next.finally(() => { + if (advanceChain.get(battleId) === next) advanceChain.delete(battleId); + }); + return next; + } + + async function advanceLocalLaneInner(battleId: string): Promise { + const battle = await loadBattle(battleId); + if (!battle || battle.status !== 'running') return; + + const contestants = await loadContestants(battleId); + const slots: ContestantSlot[] = contestants.map((c) => ({ + id: c.id, + lane: c.lane, + status: c.status, + })); + + // Nothing to do if the local lane is still busy. + const localRunning = slots.some((c) => c.lane === 'local' && c.status === 'running'); + if (localRunning) return; + + const nextId = nextLocalContestant(slots); + if (!nextId) return; // local queue is exhausted + + const next = contestants.find((c) => c.id === nextId)!; + await dispatchContestant(battleId, battle.project_id, battle.battle_type, battle.prompt, { + id: next.id, + identity: next.identity, + model: next.model, + lane: next.lane, + }); + } + + // ─── handleTaskTerminal ─────────────────────────────────────────────────── + + function handleTaskTerminal(taskId: string, state: string): void { + void (async () => { + // Look up which contestant owns this task (contestants_task_id_idx). + const [row] = await sql` + SELECT id, battle_id, identity, model, lane, task_id, worktree_id, status + FROM contestants WHERE task_id = ${taskId} + `; + if (!row) return; // not an arena task — ignore + if (row.status !== 'running') return; // already settled (idempotent) + + const battle = await loadBattle(row.battle_id); + + // Pull the task row for benchmark + output. + const [task] = await sql<{ + chat_id: string | null; + started_at: Date | null; + ended_at: Date | null; + cost_tokens: number | null; + }[]>`SELECT chat_id, started_at, ended_at, cost_tokens FROM tasks WHERE id = ${taskId}`; + + const endedAt = task?.ended_at ?? new Date(); + + if (state === 'completed') { + const startedAt = task?.started_at ?? endedAt; + const bench = computeBenchmark(startedAt, endedAt, task?.cost_tokens ?? null, row.lane); + + const output = task?.chat_id ? await readChatOutput(task.chat_id) : ''; + + const resultPath = battle + ? await writeContestantResults(battle, row, output, bench).catch((err) => { + log.warn({ err: errMsg(err), contestantId: row.id }, 'arena-runner: result write failed'); + return null; + }) + : null; + + await sql` + UPDATE contestants + SET status = 'done', + duration_ms = ${Math.round(bench.durationMs)}, + tokens_per_sec = ${bench.tokensPerSec}, + cost_tokens = ${task?.cost_tokens ?? null}, + result_path = ${resultPath}, + updated_at = clock_timestamp() + WHERE id = ${row.id} AND status = 'running' + `; + teardownDeltaBridge(row.id); + + // Check if this was the last contestant. + const allContestants = await loadContestants(row.battle_id); + const battleDone = isBattleComplete(allContestants); + + publishContestantFrame(row.battle_id, row.id, { + status: 'done', + duration_ms: Math.round(bench.durationMs), + ...(bench.tokensPerSec !== null ? { tokens_per_sec: bench.tokensPerSec } : {}), + ...(battleDone ? { battle_status: 'completed' } : {}), + }); + + if (battleDone) { + await completeBattle(row.battle_id); + } else if (row.lane === 'local') { + void advanceLocalLane(row.battle_id); + } + } else { + // failed or cancelled — the contest continues; this contestant is error. + const errorMsg = state === 'cancelled' ? 'cancelled' : `task ${state}`; + await sql` + UPDATE contestants + SET status = 'error', error = ${errorMsg}, updated_at = clock_timestamp() + WHERE id = ${row.id} AND status = 'running' + `; + teardownDeltaBridge(row.id); + + const allContestants = await loadContestants(row.battle_id); + const battleDone = isBattleComplete(allContestants); + + publishContestantFrame(row.battle_id, row.id, { + status: 'error', + error: errorMsg, + ...(battleDone ? { battle_status: 'completed' } : {}), + }); + + if (battleDone) { + await completeBattle(row.battle_id); + } else if (row.lane === 'local') { + void advanceLocalLane(row.battle_id); + } + } + })().catch((err) => { + log.error({ err: errMsg(err), taskId }, 'arena-runner: handleTaskTerminal failed'); + }); + } + + // ─── battle finalization ────────────────────────────────────────────────── + + async function completeBattle(battleId: string): Promise { + const updated = await sql` + UPDATE battles SET status = 'completed', updated_at = clock_timestamp() + WHERE id = ${battleId} AND status = 'running' + `; + if (updated.count === 0) return; // already terminal (race guard) + log.info({ battleId }, 'arena-runner: battle completed'); + + // Update manifest with finished_at timestamp. + const completedBattle = await loadBattle(battleId); + if (completedBattle?.results_path) { + const contestants = await loadContestants(battleId); + await writeManifest( + battleId, + completedBattle.results_path, + completedBattle.battle_type, + completedBattle.prompt, + completedBattle.created_at, + contestants.map((c) => ({ identity: c.identity, model: c.model, lane: c.lane })), + new Date(), + ).catch((err) => { + log.warn({ err: errMsg(err), battleId }, 'arena-runner: manifest update failed'); + }); + } + + onBattleComplete(battleId); + } + + // ─── manifest writer ───────────────────────────────────────────────────── + + async function writeManifest( + battleId: string, + resultsPath: string, + battleType: BattleType, + prompt: string, + createdAt: Date, + contestants: Array<{ identity: string; model: string; lane: ContestantLane }>, + finishedAt: Date | null, + ): Promise { + await mkdir(resultsPath, { recursive: true }); + const manifest = { + id: battleId, + battle_type: battleType, + prompt, + contestants, + created_at: createdAt.toISOString(), + finished_at: finishedAt?.toISOString() ?? null, + }; + await writeFile(join(resultsPath, 'manifest.json'), JSON.stringify(manifest, null, 2), 'utf8'); + } + + // ─── results writer ─────────────────────────────────────────────────────── + + async function writeContestantResults( + battle: BattleRow, + contestant: { identity: string; model: string; lane: ContestantLane; worktree_id: string | null }, + output: string, + bench: { durationMs: number; tokensPerSec: number | null }, + ): Promise { + const resultsPath = await getOrBuildResultsPath(battle); + if (!resultsPath) throw new Error('cannot resolve results path for battle ' + battle.id); + + const contestantDir = buildContestantDir(contestant.identity, contestant.model); + const dir = join(resultsPath, contestantDir); + await mkdir(dir, { recursive: true }); + + const benchLines = [ + `duration: ${bench.durationMs}ms`, + bench.tokensPerSec != null ? `tokens/sec: ${bench.tokensPerSec.toFixed(1)}` : null, + ] + .filter(Boolean) + .join('\n'); + + const resultMd = + `# ${contestant.identity} / ${contestant.model}\n\n` + + `## Benchmark\n\n${benchLines}\n\n` + + `## Output\n\n${output}\n`; + await writeFile(join(dir, 'result.md'), resultMd, 'utf8'); + + if (battle.battle_type === 'coding' && contestant.worktree_id) { + const [wt] = await sql<{ path: string; base_commit: string | null }[]>` + SELECT path, base_commit FROM worktrees WHERE id = ${contestant.worktree_id} + `; + if (wt) { + const [proj] = await sql<{ path: string }[]>` + SELECT path FROM projects WHERE id = ${battle.project_id} + `; + if (proj) { + const diff = await diffWorktree(wt.path, proj.path, { + baseRef: wt.base_commit ?? undefined, + }).catch(() => ''); + await writeFile(join(dir, 'diff.patch'), diff, 'utf8'); + } + } + } + + return dir; + } + + /** Resolve or rebuild results_path for a battle (handles crash-before-UPDATE). */ + async function getOrBuildResultsPath(battle: BattleRow): Promise { + if (battle.results_path) return battle.results_path; + const [proj] = await sql<{ path: string }[]>`SELECT path FROM projects WHERE id = ${battle.project_id}`; + if (!proj) return null; + const slug = buildBattleSlug(battle.id, battle.battle_type, battle.created_at); + const resultsPath = join(proj.path, 'Arena', slug); + await sql` + UPDATE battles SET results_path = ${resultsPath}, updated_at = clock_timestamp() + WHERE id = ${battle.id} + `; + return resultsPath; + } + + // ─── helpers ────────────────────────────────────────────────────────────── + + async function readChatOutput(chatId: string): Promise { + const [m] = await sql<{ content: string | null }[]>` + SELECT content FROM messages + WHERE chat_id = ${chatId} AND role = 'assistant' + ORDER BY created_at DESC LIMIT 1 + `; + return m?.content ?? ''; + } + + async function loadBattle(battleId: string): Promise { + const [b] = await sql` + SELECT id, project_id, battle_type, prompt, status, results_path, created_at + FROM battles WHERE id = ${battleId} + `; + return b ?? null; + } + + async function loadContestants(battleId: string): Promise { + return sql` + SELECT id, battle_id, identity, model, lane, task_id, worktree_id, status + FROM contestants WHERE battle_id = ${battleId} + ORDER BY created_at ASC + `; + } + + function publishContestantFrame( + battleId: string, + contestantId: string, + extra: Record, + ): void { + publishUser({ + type: 'contestant_updated', + battle_id: battleId, + contestant_id: contestantId, + ...extra, + }); + } + + // ─── initResume ─────────────────────────────────────────────────────────── + + async function initResume(): Promise { + const battles = await sql` + SELECT id, project_id, battle_type, prompt, status, results_path, created_at + FROM battles WHERE status = 'running' + `; + if (battles.length === 0) return; + log.info({ count: battles.length }, 'arena-runner: resuming in-flight battles on startup'); + for (const battle of battles) { + await resumeBattle(battle).catch((err) => { + log.error({ err: errMsg(err), battleId: battle.id }, 'arena-runner: initResume failed for battle'); + }); + } + } + + async function resumeBattle(battle: BattleRow): Promise { + const contestants = await loadContestants(battle.id); + + const taskIds = contestants.map((c) => c.task_id).filter((id): id is string => id !== null); + const taskStates = new Map(); + if (taskIds.length > 0) { + const tasks = await sql<{ id: string; state: string }[]>` + SELECT id, state FROM tasks WHERE id = ANY(${taskIds}) + `; + for (const t of tasks) taskStates.set(t.id, t.state); + } + + const decisions = reconcileContestants( + contestants.map((c) => ({ contestantId: c.id, taskId: c.task_id, status: c.status })), + taskStates, + ); + + for (const decision of decisions) { + if (decision.action === 'keep') continue; + const contestant = contestants.find((c) => c.id === decision.contestantId)!; + await applyResumeDecision(battle, contestant, decision.action); + } + + // Re-check completion after applying decisions. + const updated = await loadContestants(battle.id); + if (isBattleComplete(updated)) { + await completeBattle(battle.id); + } else { + // Advance local lane in case a slot opened up. + void advanceLocalLane(battle.id); + } + + log.info({ battleId: battle.id }, 'arena-runner: battle resumed'); + } + + async function applyResumeDecision( + battle: BattleRow, + contestant: ContestantRow, + action: ContestantResumeAction, + ): Promise { + switch (action) { + case 'keep': break; + + case 'mark-done': { + const taskRow = contestant.task_id + ? (await sql<{ started_at: Date | null; ended_at: Date | null; cost_tokens: number | null; chat_id: string | null }[]>` + SELECT started_at, ended_at, cost_tokens, chat_id FROM tasks WHERE id = ${contestant.task_id}`)[0] + : null; + const endedAt = taskRow?.ended_at ?? new Date(); + const startedAt = taskRow?.started_at ?? endedAt; + const bench = computeBenchmark(startedAt, endedAt, taskRow?.cost_tokens ?? null, contestant.lane); + const output = taskRow?.chat_id ? await readChatOutput(taskRow.chat_id) : ''; + const resultPath = battle + ? await writeContestantResults(battle, contestant, output, bench).catch((err) => { + log.warn({ err: errMsg(err), contestantId: contestant.id }, 'arena-runner: resume result write failed'); + return null; + }) + : null; + await sql` + UPDATE contestants + SET status = 'done', + duration_ms = ${Math.round(bench.durationMs)}, + tokens_per_sec = ${bench.tokensPerSec}, + result_path = ${resultPath}, + updated_at = clock_timestamp() + WHERE id = ${contestant.id} + `; + break; + } + + case 'mark-error': + await sql` + UPDATE contestants + SET status = 'error', error = 'task failed before callback', + updated_at = clock_timestamp() + WHERE id = ${contestant.id} + `; + break; + + case 'mark-cancelled': + await sql` + UPDATE contestants + SET status = 'error', error = 'cancelled before callback', + updated_at = clock_timestamp() + WHERE id = ${contestant.id} + `; + break; + + case 're-dispatch': { + const { taskId } = await dispatch({ + projectId: battle.project_id, + contestantId: contestant.id, + prompt: battle.prompt, + identity: contestant.identity, + model: contestant.model, + battleType: battle.battle_type, + }); + await sql` + UPDATE contestants + SET task_id = ${taskId}, updated_at = clock_timestamp() + WHERE id = ${contestant.id} + `; + log.info( + { battleId: battle.id, contestantId: contestant.id, taskId }, + 'arena-runner: contestant re-dispatched on resume', + ); + break; + } + } + } + + // ─── cancelBattle ───────────────────────────────────────────────────────── + + async function cancelBattle(battleId: string): Promise<{ cancelled: boolean; taskIds: string[] }> { + const updated = await sql` + UPDATE battles SET status = 'cancelled', updated_at = clock_timestamp() + WHERE id = ${battleId} AND status = 'running' + `; + if (updated.count === 0) return { cancelled: false, taskIds: [] }; + + // Mark all non-terminal contestants cancelled and collect in-flight task_ids. + const contestants = await sql<{ id: string; task_id: string | null; status: string }[]>` + SELECT id, task_id, status FROM contestants + WHERE battle_id = ${battleId} AND status NOT IN ('done', 'error') + `; + + if (contestants.length > 0) { + await sql` + UPDATE contestants + SET status = 'error', error = 'battle cancelled', updated_at = clock_timestamp() + WHERE battle_id = ${battleId} AND status NOT IN ('done', 'error') + `; + for (const c of contestants) { + publishContestantFrame(battleId, c.id, { + status: 'error', + error: 'battle cancelled', + battle_status: 'cancelled', + }); + } + } + + const taskIds = contestants + .filter( + (c): c is typeof c & { task_id: string } => + c.task_id !== null && c.status === 'running', + ) + .map((c) => c.task_id); + + log.info({ battleId }, 'arena-runner: battle cancelled by request'); + return { cancelled: true, taskIds }; + } + + // ─── triggerAnalysis (Phase 5 seam) ────────────────────────────────────── + + async function triggerAnalysis(battleId: string): Promise<{ triggered: boolean }> { + const battle = await loadBattle(battleId); + if (!battle) return { triggered: false }; + log.info({ battleId }, 'arena-runner: triggerAnalysis requested'); + // Calls the injected onBattleComplete seam — Phase 5 replaces this with the + // real two-stage digest→judge analyzer (see ADR 0002 + plan Phase 5). + onBattleComplete(battleId); + return { triggered: true }; + } + + // ─── startCrossExam (Phase 5 seam) ─────────────────────────────────────── + + async function startCrossExam( + battleId: string, + opts: { identity: string; model: string }, + ): Promise<{ crossExamId: string }> { + const [row] = await sql<{ id: string }[]>` + INSERT INTO cross_examinations (battle_id, identity, model) + VALUES (${battleId}, ${opts.identity}, ${opts.model}) + RETURNING id + `; + const crossExamId = row!.id; + log.info({ battleId, crossExamId, ...opts }, 'arena-runner: cross-exam inserted, triggering analyzer'); + if (onCrossExamStart) { + try { + onCrossExamStart({ battleId, crossExamId, identity: opts.identity, model: opts.model }); + } catch (err) { + log.error({ err: err instanceof Error ? err.message : String(err), battleId, crossExamId }, 'arena-runner: onCrossExamStart threw'); + } + } + return { crossExamId }; + } + + // ─── setWinner (user override) ──────────────────────────────────────────── + + async function setWinner( + battleId: string, + winnerId: string | null, + ): Promise<{ ok: boolean; notFound?: boolean; invalidContestant?: boolean }> { + const [row] = await sql<{ id: string }[]>`SELECT id FROM battles WHERE id = ${battleId}`; + if (!row) return { ok: false, notFound: true }; + + if (winnerId !== null) { + const [c] = await sql<{ id: string }[]>` + SELECT id FROM contestants WHERE id = ${winnerId} AND battle_id = ${battleId} + `; + if (!c) return { ok: false, invalidContestant: true }; + } + + await sql` + UPDATE battles SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp() + WHERE id = ${battleId} + `; + publishUser({ type: 'battle_updated', battle_id: battleId, winner_contestant_id: winnerId }); + return { ok: true }; + } + + return { startBattle, handleTaskTerminal, initResume, cancelBattle, triggerAnalysis, startCrossExam, setWinner }; +} + +function errMsg(e: unknown): string { + return e instanceof Error ? e.message : String(e); +} diff --git a/apps/coder/src/services/dispatcher.ts b/apps/coder/src/services/dispatcher.ts index b866aa4..3a93aab 100644 --- a/apps/coder/src/services/dispatcher.ts +++ b/apps/coder/src/services/dispatcher.ts @@ -310,6 +310,9 @@ export function createDispatcher(deps: Deps): { const taskId = task.id; log.info({ taskId }, 'dispatcher: starting task (path A — native)'); + // Declared before try so the catch block can write it back on the task row. + let chatId: string | null = null; + try { // Mark running await sql` @@ -318,26 +321,29 @@ export function createDispatcher(deps: Deps): { WHERE id = ${taskId} `; - // Create session + chat for this task + // Session setup: reuse a pre-created session (e.g. Q&A arena contestants + // whose persona is stamped on the session via agent_id) or create a fresh one. const model = task.model ?? config.DEFAULT_MODEL; - const sessionName = 'Task: ' + task.input.slice(0, 40); - - const [session] = await sql<{ id: string }[]>` - INSERT INTO sessions (project_id, name, model, status) - VALUES (${task.project_id}, ${sessionName}, ${model}, 'open') - RETURNING id - `; - const sessionId = session!.id; + let sessionId: string; + if (task.session_id) { + sessionId = task.session_id; + } else { + const sessionName = 'Task: ' + task.input.slice(0, 40); + const [session] = await sql<{ id: string }[]>` + INSERT INTO sessions (project_id, name, model, status) + VALUES (${task.project_id}, ${sessionName}, ${model}, 'open') + RETURNING id + `; + sessionId = session!.id; + await sql`UPDATE tasks SET session_id = ${sessionId} WHERE id = ${taskId}`; + } const [chat] = await sql<{ id: string }[]>` INSERT INTO chats (session_id, name, status) VALUES (${sessionId}, 'Task execution', 'open') RETURNING id `; - const chatId = chat!.id; - - // Link task to session - await sql`UPDATE tasks SET session_id = ${sessionId} WHERE id = ${taskId}`; + chatId = chat!.id; // Create user message + streaming assistant await sql<{ id: string }[]>` @@ -382,7 +388,7 @@ export function createDispatcher(deps: Deps): { const summary = (msg?.content ?? '').slice(0, 500); await sql` UPDATE tasks - SET state = 'completed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens} + SET state = 'completed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}, chat_id = ${chatId} WHERE id = ${taskId} `; log.info({ taskId, costTokens }, 'dispatcher: task completed (native)'); @@ -409,7 +415,7 @@ export function createDispatcher(deps: Deps): { const summary = (msg?.content ?? 'Inference failed').slice(0, 500); await sql` UPDATE tasks - SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens} + SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}, chat_id = ${chatId} WHERE id = ${taskId} `; log.warn({ taskId, finalStatus }, 'dispatcher: task failed (native)'); @@ -419,7 +425,7 @@ export function createDispatcher(deps: Deps): { log.error({ taskId, err: errMsg }, 'dispatcher: task error (native)'); await sql` UPDATE tasks - SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)} + SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}, chat_id = ${chatId} WHERE id = ${taskId} `.catch(() => {}); } diff --git a/apps/server/src/schema.sql b/apps/server/src/schema.sql index 7df4ff5..d6d3089 100644 --- a/apps/server/src/schema.sql +++ b/apps/server/src/schema.sql @@ -372,13 +372,12 @@ ALTER TABLE messages ADD COLUMN IF NOT EXISTS tail_start_id UUID REFERENCES mess ALTER TABLE chats ADD COLUMN IF NOT EXISTS needs_compaction BOOLEAN NOT NULL DEFAULT FALSE; CREATE INDEX IF NOT EXISTS idx_messages_chat_compacted ON messages (chat_id, compacted_at); --- tasks table (provider dispatch, arena) +-- tasks table (provider dispatch) CREATE TABLE IF NOT EXISTS tasks ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE, session_id UUID REFERENCES sessions(id) ON DELETE CASCADE, parent_task_id UUID REFERENCES tasks(id), - arena_id UUID, state TEXT NOT NULL DEFAULT 'pending' CHECK (state IN ('pending','running','completed','failed','blocked','cancelled')), input TEXT NOT NULL, @@ -405,3 +404,6 @@ DO $$ BEGIN FOREIGN KEY (session_id) REFERENCES sessions(id) ON DELETE CASCADE; END IF; END $$; + +-- Remove the v2.0.5 arena_id column (replaced by the new Arena feature). +ALTER TABLE tasks DROP COLUMN IF EXISTS arena_id; diff --git a/apps/server/src/services/inference/types.ts b/apps/server/src/services/inference/types.ts index 307077d..d3f3bd0 100644 --- a/apps/server/src/services/inference/types.ts +++ b/apps/server/src/services/inference/types.ts @@ -44,7 +44,11 @@ export interface InferenceFrame { | 'chat_renamed' | 'error' | 'flow_run_started' - | 'flow_run_step_updated'; + | 'flow_run_step_updated' + // arena frames + | 'battle_started' + | 'contestant_updated' + | 'battle_updated'; message_id?: string; message_ids?: string[]; chat_id?: string; @@ -84,6 +88,19 @@ export interface InferenceFrame { status?: string; run_status?: 'running' | 'completed' | 'failed' | 'cancelled'; report?: string; + // arena frames + battle_id?: string; + battle_type?: 'coding' | 'qa'; + prompt?: string; + contestants?: Array<{ id: string; identity: string; model: string; lane: 'local' | 'cloud' }>; + contestant_id?: string; + battle_status?: 'pending' | 'running' | 'completed' | 'failed' | 'cancelled'; + duration_ms?: number; + tokens_per_sec?: number; + winner_contestant_id?: string | null; + analysis_ready?: boolean; + cross_exam_id?: string; + delta?: string; } export type FramePublisher = (sessionId: string, frame: InferenceFrame) => void; diff --git a/apps/web/src/App.tsx b/apps/web/src/App.tsx index 951499d..130ea33 100644 --- a/apps/web/src/App.tsx +++ b/apps/web/src/App.tsx @@ -16,6 +16,7 @@ import { RightRailDrawerProvider, useRightRailDrawer } from '@/hooks/useRightRai import { useViewport } from '@/hooks/useViewport'; import { ThemeFx } from '@/components/fx/ThemeFx'; import { FlowLauncherDialog } from '@/components/FlowLauncherDialog'; +import { ArenaLauncherDialog } from '@/components/ArenaLauncherDialog'; function SessionRightRail() { const { id } = useParams<{ id: string }>(); @@ -102,6 +103,7 @@ function AppShell() { + ); diff --git a/apps/web/src/api/client.ts b/apps/web/src/api/client.ts index c65a710..64e6f65 100644 --- a/apps/web/src/api/client.ts +++ b/apps/web/src/api/client.ts @@ -27,6 +27,9 @@ import type { WorkspaceState, FlowRunRow, FlowStepRow, + BattleShape, + ContestantShape, + CrossExaminationShape, } from './types'; // v2.6 Phase 1-UX §9b: chat-scoped agent-session rows. Returned by @@ -518,6 +521,63 @@ export const api = { request(`/api/projects/${projectId}/agents`), }, + // Arena battle API — proxied to boocoder at /api/coder/battles/*. + battles: { + create: (body: { + project_id: string; + battle_type: 'coding' | 'qa'; + prompt: string; + contestants: Array<{ identity: string; model: string }>; + }) => + request<{ battle_id: string }>('/api/coder/battles', { + method: 'POST', + body: JSON.stringify(body), + }), + list: (projectId: string) => + request<{ battles: BattleShape[] }>( + `/api/coder/battles?project_id=${encodeURIComponent(projectId)}`, + ), + get: (battleId: string) => + request<{ + battle: BattleShape; + contestants: ContestantShape[]; + cross_examinations: CrossExaminationShape[]; + }>(`/api/coder/battles/${encodeURIComponent(battleId)}`), + stop: (battleId: string) => + request<{ cancelled: boolean }>( + `/api/coder/battles/${encodeURIComponent(battleId)}/stop`, + { method: 'POST' }, + ), + analyze: (battleId: string) => + request<{ triggered: boolean }>( + `/api/coder/battles/${encodeURIComponent(battleId)}/analyze`, + { method: 'POST' }, + ), + crossExamine: (battleId: string, body: { identity: string; model: string }) => + request<{ cross_exam_id: string }>( + `/api/coder/battles/${encodeURIComponent(battleId)}/cross-examine`, + { method: 'POST', body: JSON.stringify(body) }, + ), + getAnalysis: (battleId: string) => + request<{ text: string }>( + `/api/coder/battles/${encodeURIComponent(battleId)}/analysis`, + ), + generatePrompt: (description: string) => + request<{ prompt: string }>('/api/coder/battles/generate-prompt', { + method: 'POST', + body: JSON.stringify({ description }), + }), + setWinner: (battleId: string, body: { winner_contestant_id: string | null }) => + request<{ ok: boolean }>( + `/api/coder/battles/${encodeURIComponent(battleId)}/winner`, + { method: 'PATCH', body: JSON.stringify(body) }, + ), + getDiff: (battleId: string, contestantId: string) => + request<{ diff: string }>( + `/api/coder/battles/${encodeURIComponent(battleId)}/contestants/${encodeURIComponent(contestantId)}/diff`, + ), + }, + skills: { list: () => request<{ skills: Skill[] }>('/api/skills'), }, diff --git a/apps/web/src/api/types.ts b/apps/web/src/api/types.ts index 75cde26..27f91d0 100644 --- a/apps/web/src/api/types.ts +++ b/apps/web/src/api/types.ts @@ -391,7 +391,8 @@ export type WorkspacePaneKind = | 'settings' | 'markdown_artifact' | 'html_artifact' - | 'orchestrator'; + | 'orchestrator' + | 'arena'; // Mixed tabs: a pane can hold tabs of different kinds (a BooChat tab next to a // BooCode tab next to a Terminal tab). Each tab carries its own kind; the active @@ -424,6 +425,10 @@ export interface OrchestratorState { band: 'small' | 'medium' | 'large'; } +// Arena pane state — single-sourced in @boocode/contracts; edit the package, not here. +import type { ArenaState, BattleShape, ContestantShape, CrossExaminationShape, BattleType, BattleStatus, ContestantStatus, ContestantLane } from '@boocode/contracts/arena'; +export type { ArenaState, BattleShape, ContestantShape, CrossExaminationShape, BattleType, BattleStatus, ContestantStatus, ContestantLane }; + // Orchestrator run API types (returned by GET /api/coder/runs/:id). export interface FlowRunRow { id: string; @@ -475,6 +480,8 @@ export interface WorkspacePane { html_artifact_state?: HtmlArtifactState; // orchestrator pane: populated only when kind === 'orchestrator'. orchestrator_state?: OrchestratorState; + // arena pane: populated only when kind === 'arena'. + arena_state?: ArenaState; } // Reopen LIFO stack entry. Shape unchanged from the prior module-level stack; @@ -592,4 +599,31 @@ export type WsFrame = status: 'pending' | 'running' | 'completed' | 'failed' | 'skipped' | 'cancelled'; run_status?: 'running' | 'completed' | 'failed' | 'cancelled'; report?: string; + } + // arena frames: battle lifecycle + per-contestant streaming + | { + type: 'battle_started'; + battle_id: string; + battle_type: 'coding' | 'qa'; + prompt: string; + contestants: Array<{ id: string; identity: string; model: string; lane: 'local' | 'cloud' }>; + } + | { + type: 'contestant_updated'; + battle_id: string; + contestant_id: string; + status?: 'queued' | 'running' | 'done' | 'error'; + duration_ms?: number; + tokens_per_sec?: number; + battle_status?: 'pending' | 'running' | 'completed' | 'failed' | 'cancelled'; + delta?: string; + error?: string; + } + | { + type: 'battle_updated'; + battle_id: string; + status?: 'pending' | 'running' | 'completed' | 'failed' | 'cancelled'; + winner_contestant_id?: string | null; + analysis_ready?: boolean; + cross_exam_id?: string; }; diff --git a/apps/web/src/components/ArenaLauncherDialog.tsx b/apps/web/src/components/ArenaLauncherDialog.tsx new file mode 100644 index 0000000..d386dec --- /dev/null +++ b/apps/web/src/components/ArenaLauncherDialog.tsx @@ -0,0 +1,410 @@ +// ArenaLauncherDialog — mirrors FlowLauncherDialog. +// Opens via sessionEvents 'open_arena_launcher'. +// Flow: pick Battle Type → write/generate prompt → add 2–6 contestants → Start. + +import { useCallback, useEffect, useRef, useState } from 'react'; +import { Loader2, Minus, Plus, Swords, TriangleAlert, X } from 'lucide-react'; +import { toast } from 'sonner'; +import { + Dialog, + DialogContent, + DialogFooter, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { Button } from '@/components/ui/button'; +import { Label } from '@/components/ui/label'; +import { api } from '@/api/client'; +import type { Agent, ProviderSnapshotEntry } from '@/api/types'; +import { sessionEvents } from '@/hooks/sessionEvents'; +import { useProviderSnapshot } from '@/hooks/useProviderSnapshot'; +import { cn } from '@/lib/utils'; + +// ─── types ──────────────────────────────────────────────────────────────────── + +type BattleType = 'coding' | 'qa'; + +interface Contestant { + key: string; // local unique key for React + identity: string; + model: string; +} + +// ─── helpers ───────────────────────────────────────────────────────────────── + +function newContestant(): Contestant { + return { key: crypto.randomUUID(), identity: '', model: '' }; +} + +function isDuplicate(contestants: Contestant[], c: Contestant): boolean { + const dups = contestants.filter( + (x) => x.key !== c.key && x.identity === c.identity && x.model === c.model && x.identity !== '', + ); + return dups.length > 0; +} + +function hasDuplicatePair(contestants: Contestant[]): boolean { + return contestants.some((c) => isDuplicate(contestants, c)); +} + +function localCount(battleType: BattleType, contestants: Contestant[], snapshot: ProviderSnapshotEntry[] | null): number { + if (battleType === 'qa') return contestants.filter((c) => c.identity !== '').length; + const boocode = snapshot?.find((e) => e.name === 'boocode'); + const localModelIds = new Set(boocode?.models.map((m) => m.id) ?? []); + return contestants.filter((c) => { + // Match bare IDs (boocode/native) and llama-swap/-prefixed IDs used by + // opencode and other external agents pointing at the local llama-swap server. + return localModelIds.has(c.model) || localModelIds.has(c.model.replace(/^llama-swap\//, '')); + }).length; +} + +// ─── ContestantRow ──────────────────────────────────────────────────────────── + +function ContestantRow({ + contestant, + battleType, + snapshot, + agents, + allContestants, + onUpdate, + onRemove, + removable, +}: { + contestant: Contestant; + battleType: BattleType; + snapshot: ProviderSnapshotEntry[] | null; + agents: Agent[]; + allContestants: Contestant[]; + onUpdate: (patch: Partial) => void; + onRemove: () => void; + removable: boolean; +}) { + const dup = isDuplicate(allContestants, contestant); + + // Identity options for Coding: installed provider names. + // Identity options for Q&A: agents by id. + const identityOptions = + battleType === 'coding' + ? (snapshot ?? []) + .filter((e) => e.installed && e.enabled) + .map((e) => ({ value: e.name, label: e.label })) + : agents.map((a) => ({ value: a.id, label: a.name })); + + // Model options: for Coding use the selected provider's models; for Q&A use boocode models. + const modelOptions: { value: string; label: string }[] = (() => { + if (battleType === 'coding') { + const provider = (snapshot ?? []).find((e) => e.name === contestant.identity); + return (provider?.models ?? []).map((m) => ({ value: m.id, label: m.label })); + } + // Q&A: native backend only — use boocode models + const boocode = (snapshot ?? []).find((e) => e.name === 'boocode'); + return (boocode?.models ?? []).map((m) => ({ value: m.id, label: m.label })); + })(); + + function handleIdentityChange(value: string) { + // Reset model when identity changes so stale model doesn't persist. + onUpdate({ identity: value, model: '' }); + } + + function handleModelChange(value: string) { + onUpdate({ model: value }); + } + + return ( +
+ + + {dup && ( + + + + )} + {removable && ( + + )} +
+ ); +} + +// ─── ArenaLauncherDialog ────────────────────────────────────────────────────── + +export function ArenaLauncherDialog() { + const [open, setOpen] = useState(false); + const [projectId, setProjectId] = useState(''); + const [placement, setPlacement] = useState<'new' | 'split'>('new'); + const [battleType, setBattleType] = useState('coding'); + const [prompt, setPrompt] = useState(''); + const [contestants, setContestants] = useState(() => [ + newContestant(), + newContestant(), + ]); + const [generating, setGenerating] = useState(false); + const [starting, setStarting] = useState(false); + const [agents, setAgents] = useState([]); + const promptRef = useRef(null); + + const snapshot = useProviderSnapshot(); + + useEffect(() => { + return sessionEvents.subscribe((ev) => { + if (ev.type !== 'open_arena_launcher') return; + setProjectId(ev.project_id); + setPlacement(ev.placement ?? 'new'); + setBattleType('coding'); + setPrompt(''); + setContestants([newContestant(), newContestant()]); + setGenerating(false); + setStarting(false); + setOpen(true); + }); + }, []); + + // Load agents list when dialog opens (for Q&A mode). + useEffect(() => { + if (!open || !projectId) return; + api.agents.list(projectId) + .then((r) => setAgents(r.agents)) + .catch(() => {}); + }, [open, projectId]); + + const handleGeneratePrompt = useCallback(async () => { + const description = prompt.trim(); + if (!description || generating) return; + setGenerating(true); + try { + const { prompt: generated } = await api.battles.generatePrompt(description); + setPrompt(generated); + promptRef.current?.focus(); + } catch (err) { + toast.error(err instanceof Error ? err.message : 'Generate failed'); + } finally { + setGenerating(false); + } + }, [prompt, generating]); + + function updateContestant(key: string, patch: Partial) { + setContestants((prev) => prev.map((c) => (c.key === key ? { ...c, ...patch } : c))); + } + + function removeContestant(key: string) { + setContestants((prev) => prev.filter((c) => c.key !== key)); + } + + function addContestant() { + if (contestants.length >= 6) return; + setContestants((prev) => [...prev, newContestant()]); + } + + const canStart = + !starting && + prompt.trim().length > 0 && + contestants.length >= 2 && + contestants.every((c) => c.identity !== '' && c.model !== '') && + !hasDuplicatePair(contestants); + + const localLaneCount = localCount(battleType, contestants, snapshot); + const showLocalWarning = localLaneCount >= 3; + + async function handleStart() { + if (!canStart) return; + setStarting(true); + try { + const { battle_id } = await api.battles.create({ + project_id: projectId, + battle_type: battleType, + prompt: prompt.trim(), + contestants: contestants.map((c) => ({ identity: c.identity, model: c.model })), + }); + sessionEvents.emit({ + type: 'open_arena_pane', + state: { battle_id, battle_type: battleType, prompt: prompt.trim() }, + placement, + }); + setOpen(false); + } catch (err) { + toast.error(err instanceof Error ? err.message : 'Failed to start battle'); + } finally { + setStarting(false); + } + } + + return ( + + + +
+ + New Arena Battle +
+

+ Run the same prompt against multiple AI competitors and pick the best result. +

+
+ +
+ {/* Battle type */} +
+ +
+ {(['coding', 'qa'] as const).map((t) => ( + + ))} +
+

+ {battleType === 'coding' + ? 'Each contestant works in its own isolated worktree. Results include a diff.' + : 'Contestants answer the prompt as text. No code changes.'} +

+
+ + {/* Prompt */} +
+
+ + +
+