feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt

Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: a local lane (llama-swap models, run serially) and a cloud lane (run in parallel). The battle_* frames are added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-06 23:25:29 +00:00
parent e04d0fdaa8
commit d6d246c15b
34 changed files with 4581 additions and 146 deletions
--- a/apps/coder/src/index.ts
+++ b/apps/coder/src/index.ts
@@ -23,8 +23,8 @@ import { registerAgentSessionRoutes } from './routes/agent-sessions.js';
 import { registerTaskRoutes } from './routes/tasks.js';
 import { registerInboxRoutes } from './routes/inbox.js';
 import { registerStatsRoutes } from './routes/stats.js';
-import { registerArenaRoutes } from './routes/arena.js';
 import { registerRunsRoutes } from './routes/runs.js';
+import { registerArenaRoutes } from './routes/arena.js';
 import { registerProviderRoutes } from './routes/providers.js';
 import { registerWorktreeSafetyRoutes } from './routes/worktree-safety.js';
 import { registerLifecycleRoutes } from './routes/lifecycle.js';
@@ -34,10 +34,13 @@ import { createDispatcher } from './services/dispatcher.js';
 // Orchestrator (Phase 2): DB-backed flow-runner; advances on the dispatcher's
 // onTaskTerminal hook.
 import { createFlowRunner } from './services/flow-runner.js';
+// Arena: DB-backed battle-runner; also advances on the onTaskTerminal hook.
+import { createBattleRunner, type DispatchContestantFn } from './services/arena-runner.js';
+import { createAnalyzer } from './services/arena-analyzer.js';
 import { agentPool } from './services/agent-pool.js';
 import { createOrphanWorktreeReaper } from './services/orphan-worktree-reaper.js';
 import { probeAgents } from './services/agent-probe.js';
-import { getProviderSnapshot, persistProbedModels } from './services/provider-snapshot.js';
+import { getProviderSnapshot, persistProbedModels, fetchLlamaSwapModels } from './services/provider-snapshot.js';
 import { setPermissionHooks } from './services/permission-waiter.js';
 import { publishAgentStatus } from './services/agent-status-publish.js';
 import { homedir } from 'node:os';
@@ -220,31 +223,119 @@ async function main() {

  // Orchestrator (Phase 2): the flow-runner reacts to the dispatcher's
  // onTaskTerminal hook to advance flow_runs. Created before the dispatcher so its
-  // terminal callback can be wired in. Its launch() is driven by the runs route
-  // (a later phase); resume on startup is a later phase too.
+  // terminal callback can be wired in.
  const flowRunner = createFlowRunner({ sql, broker, log: app.log, config });

-  // Phase 4: dispatcher — polls tasks table and runs inference. onTaskTerminal
-  // notifies the flow-runner when a step's task settles (D-2).
+  // Arena SEAM (a): build the local-model set from the live llama-swap model list.
+  // Both bare IDs ('qwen3.6-35b') and prefixed IDs ('llama-swap/qwen3.6-35b') are
+  // included so opencode-style prefixed contestants and native-style bare contestants
+  // both classify correctly as local.
+  // The list is fetched once, here, during startup. A failed fetch stays best-effort:
+  // it degrades to an empty set instead of aborting startup, but it is logged so the
+  // reason no model is classified as local is visible.
+  const localModelsList = await fetchLlamaSwapModels(config).catch((err: unknown) => {
+    app.log.warn(
+      { err: err instanceof Error ? err.message : String(err) },
+      'arena: fetchLlamaSwapModels failed; local-model set is empty',
+    );
+    return [];
+  });
+  const localModels = new Set([
+    ...localModelsList.map((m) => m.id),
+    ...localModelsList.map((m) => `llama-swap/${m.id}`),
+  ]);
+
+  // Arena dispatch function — Phase 4 SEAM (b).
+  // Coding: insert a tasks row with agent=identity (null for native/boocode);
+  //   the dispatcher creates a worktree and runs the external agent (or native).
+  // Q&A: pre-create a session with agent_id stamped to the persona slug so native
+  //   inference loads the persona's system_prompt + tools from AGENTS.md;
+  //   task.session_id is pre-set so runNativeInference reuses the session.
+  // Returns the inserted task id plus the session id (null on the coding path).
+  // Throws if an INSERT ... RETURNING unexpectedly yields no row.
+  const dispatchContestant: DispatchContestantFn = async ({
+    projectId,
+    prompt,
+    identity,
+    model,
+    battleType,
+  }) => {
+    if (battleType === 'qa') {
+      // Session name carries the persona and the first 30 characters of the prompt.
+      const sessionName = `Arena Q&A [${identity}]: ${prompt.slice(0, 30)}`;
+      // NOTE(review): the session and task inserts are separate statements, so a
+      // failed task insert leaves the session row behind — confirm this is acceptable.
+      const [session] = await sql<{ id: string }[]>`
+        INSERT INTO sessions (project_id, name, model, agent_id, status)
+        VALUES (${projectId}, ${sessionName}, ${model}, ${identity}, 'open')
+        RETURNING id
+      `;
+      // Explicit guard instead of a non-null assertion: fail with a clear message.
+      if (!session) {
+        throw new Error('arena: sessions INSERT returned no row');
+      }
+      const [task] = await sql<{ id: string }[]>`
+        INSERT INTO tasks (project_id, input, model, session_id)
+        VALUES (${projectId}, ${prompt}, ${model}, ${session.id})
+        RETURNING id
+      `;
+      if (!task) {
+        throw new Error('arena: tasks INSERT returned no row');
+      }
+      return { taskId: task.id, sessionId: session.id };
+    }
+    // Coding: boocode = native inference (no external agent); any other identity
+    // is an external agent name (claude, opencode, qwen, goose) that maps to
+    // available_agents and gets its own per-task worktree via runExternalAgent.
+    // Session is created lazily by the dispatcher, so sessionId is unknown here.
+    const agentName = identity === 'boocode' ? null : identity;
+    const [task] = await sql<{ id: string }[]>`
+      INSERT INTO tasks (project_id, input, agent, model)
+      VALUES (${projectId}, ${prompt}, ${agentName}, ${model})
+      RETURNING id
+    `;
+    if (!task) {
+      throw new Error('arena: tasks INSERT returned no row');
+    }
+    return { taskId: task.id, sessionId: null };
+  };
+
+  // Arena analyzer: two-stage digest→judge (v1). Pluggable seam — a v2 Han
+  // Orchestrator flow can replace this without schema changes.
+  const analyzer = createAnalyzer({
+    sql,
+    broker,
+    log: app.log,
+    config,
+    localModels,
+  });
+
+  // Arena battle-runner: notified on the same onTaskTerminal hook as the flow-runner.
+  // The analyzer calls in the two callbacks are fire-and-forget, so each gets its own
+  // .catch that logs the failure rather than leaving a rejected promise unhandled
+  // (same pattern as the initResume() calls in this file).
+  // NOTE(review): assumes analyze() and crossExamine() return promises, as the
+  // original `void` prefix suggests — confirm against arena-analyzer.
+  const battleRunner = createBattleRunner({
+    sql,
+    broker,
+    log: app.log,
+    dispatch: dispatchContestant,
+    onBattleComplete: (battleId) => {
+      void analyzer.analyze(battleId).catch((err: unknown) => {
+        app.log.error(
+          { battleId, err: err instanceof Error ? err.message : String(err) },
+          'arena: analyze failed',
+        );
+      });
+    },
+    onCrossExamStart: ({ battleId, crossExamId, identity, model }) => {
+      void analyzer
+        .crossExamine(battleId, crossExamId, { identity, model })
+        .catch((err: unknown) => {
+          app.log.error(
+            { battleId, crossExamId, err: err instanceof Error ? err.message : String(err) },
+            'arena: crossExamine failed',
+          );
+        });
+    },
+    localModels,
+  });
+
+  // Compose onTaskTerminal: both flow-runner and battle-runner are notified.
+  // Each ignores tasks it doesn't own (flow-runner checks flow_steps.task_id;
+  // battle-runner checks contestants.task_id).
+  // The battle-runner call sits in a finally block: a synchronous throw from the
+  // flow-runner still propagates to the caller, but can no longer prevent the
+  // battle-runner from being notified about the same task.
+  const onTaskTerminal = (taskId: string, state: string): void => {
+    try {
+      flowRunner.handleTaskTerminal(taskId, state);
+    } finally {
+      battleRunner.handleTaskTerminal(taskId, state);
+    }
+  };
+
+  // Phase 4: dispatcher — polls tasks table and runs inference. The composed
+  // onTaskTerminal hook notifies both the flow-runner and the battle-runner when
+  // any task settles.
  const dispatcher = createDispatcher({
    sql,
    inference: inferenceApi,
    broker,
    log: app.log,
    config,
-    onTaskTerminal: flowRunner.handleTaskTerminal,
+    onTaskTerminal,
  });
  dispatcher.start();

-  // Phase 5: re-advance any flow_runs that were 'running' when the service last
-  // stopped (D-9). Runs AFTER dispatcher.start() so re-dispatched 'pending' tasks
-  // are picked up by the dispatcher's startup poll.
+  // Re-advance in-flight flow_runs and battles after a coder restart. Both run
+  // AFTER dispatcher.start() so re-dispatched 'pending' tasks are picked up.
  void flowRunner.initResume().catch((err) => {
    app.log.error(
      { err: err instanceof Error ? err.message : String(err) },
      'flow-runner: initResume failed',
    );
  });
+  void battleRunner.initResume().catch((err) => {
+    app.log.error(
+      { err: err instanceof Error ? err.message : String(err) },
+      'arena: initResume failed',
+    );
+  });

  // v2.6 Phase 3: configure + start the agent-pool lifecycle sweep (idle-TTL +
  // LRU-cap eviction of warm backends, plus each backend's proactive health probe)
@@ -281,8 +372,8 @@ async function main() {
  registerTaskRoutes(app, sql, inferenceApi, dispatcher.cancelExternalTask);
  registerInboxRoutes(app, sql);
  registerStatsRoutes(app, sql);
-  registerArenaRoutes(app, sql);
  registerRunsRoutes(app, sql, flowRunner, dispatcher.cancelExternalTask);
+  registerArenaRoutes(app, sql, battleRunner, dispatcher.cancelExternalTask, config);
  registerProviderRoutes(app, sql, config);
  registerWorktreeSafetyRoutes(app, sql);
  registerLifecycleRoutes(app, sql);