feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: a local lane (llama-swap models, serial) and a cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame.

Backend (apps/coder):
- arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override
- arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL
- arena-decisions: status transitions and resume logic (unit-tested)
- arena-analyzer-helpers: pure helper functions (unit-tested)
- arena-model-call: model call utility for analysis
- arena routes: create/get/list/stop/analyze/cross-examine/winner/diff
- schema: battles, contestants, cross_examinations tables (idempotent)
- remove old /api/arena* routes and tasks.arena_id column

Frontend (apps/web):
- ArenaLauncherDialog: battle type, prompt, contestant selection
- ArenaPane: live roster, streaming output, analysis, cross-exam
- DiffView: unified diff with line-by-line color for coding contests
- Winner override per-row dropdown (Trophy icon)
- battle_updated WS handler for live winner/analysis updates
- arena pane kind in Workspace, ChatTabBar, useSidebar

Cross-app:
- ArenaState and ArenaContestantShape/WsFrame types (contracts)
- battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame
- manifest.json written per battle results folder
- /Arena added to .gitignore

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -23,8 +23,8 @@ import { registerAgentSessionRoutes } from './routes/agent-sessions.js';
|
||||
import { registerTaskRoutes } from './routes/tasks.js';
|
||||
import { registerInboxRoutes } from './routes/inbox.js';
|
||||
import { registerStatsRoutes } from './routes/stats.js';
|
||||
import { registerArenaRoutes } from './routes/arena.js';
|
||||
import { registerRunsRoutes } from './routes/runs.js';
|
||||
import { registerArenaRoutes } from './routes/arena.js';
|
||||
import { registerProviderRoutes } from './routes/providers.js';
|
||||
import { registerWorktreeSafetyRoutes } from './routes/worktree-safety.js';
|
||||
import { registerLifecycleRoutes } from './routes/lifecycle.js';
|
||||
@@ -34,10 +34,13 @@ import { createDispatcher } from './services/dispatcher.js';
|
||||
// Orchestrator (Phase 2): DB-backed flow-runner; advances on the dispatcher's
|
||||
// onTaskTerminal hook.
|
||||
import { createFlowRunner } from './services/flow-runner.js';
|
||||
// Arena: DB-backed battle-runner; also advances on the onTaskTerminal hook.
|
||||
import { createBattleRunner, type DispatchContestantFn } from './services/arena-runner.js';
|
||||
import { createAnalyzer } from './services/arena-analyzer.js';
|
||||
import { agentPool } from './services/agent-pool.js';
|
||||
import { createOrphanWorktreeReaper } from './services/orphan-worktree-reaper.js';
|
||||
import { probeAgents } from './services/agent-probe.js';
|
||||
import { getProviderSnapshot, persistProbedModels } from './services/provider-snapshot.js';
|
||||
import { getProviderSnapshot, persistProbedModels, fetchLlamaSwapModels } from './services/provider-snapshot.js';
|
||||
import { setPermissionHooks } from './services/permission-waiter.js';
|
||||
import { publishAgentStatus } from './services/agent-status-publish.js';
|
||||
import { homedir } from 'node:os';
|
||||
@@ -220,31 +223,119 @@ async function main() {
|
||||
|
||||
// Orchestrator (Phase 2): the flow-runner reacts to the dispatcher's
|
||||
// onTaskTerminal hook to advance flow_runs. Created before the dispatcher so its
|
||||
// terminal callback can be wired in. Its launch() is driven by the runs route
|
||||
// (a later phase); resume on startup is a later phase too.
|
||||
// terminal callback can be wired in.
|
||||
const flowRunner = createFlowRunner({ sql, broker, log: app.log, config });
|
||||
|
||||
// Phase 4: dispatcher — polls tasks table and runs inference. onTaskTerminal
|
||||
// notifies the flow-runner when a step's task settles (D-2).
|
||||
// Arena SEAM (a): build the local-model set from the live llama-swap model list.
|
||||
// Both bare IDs ('qwen3.6-35b') and prefixed IDs ('llama-swap/qwen3.6-35b') are
|
||||
// included so opencode-style prefixed contestants and native-style bare contestants
|
||||
// both classify correctly as local.
|
||||
// Fetching the llama-swap model list is best-effort: on failure the service
// still starts with an empty local-model set, but the failure is logged so it
// is visible rather than silently swallowed.
const localModelsList = await fetchLlamaSwapModels(config).catch((err: unknown) => {
  app.log.warn(
    { err: err instanceof Error ? err.message : String(err) },
    'arena: failed to fetch llama-swap models; local-model set is empty',
  );
  return [];
});
// Register every model under both its bare ID and its 'llama-swap/'-prefixed ID.
const localModels = new Set([
  ...localModelsList.map((m) => m.id),
  ...localModelsList.map((m) => `llama-swap/${m.id}`),
]);
|
||||
|
||||
// Arena dispatch function — Phase 4 SEAM (b).
|
||||
// Coding: insert a tasks row with agent=identity (null for native/boocode);
|
||||
// the dispatcher creates a worktree and runs the external agent (or native).
|
||||
// Q&A: pre-create a session with agent_id stamped to the persona slug so native
|
||||
// inference loads the persona's system_prompt + tools from AGENTS.md;
|
||||
// task.session_id is pre-set so runNativeInference reuses the session.
|
||||
/**
 * Queues one Arena contestant by inserting the rows the dispatcher picks up.
 *
 * - `battleType === 'qa'`: inserts a `sessions` row (with `agent_id` set to
 *   `identity`) and a `tasks` row pointing at that session, and returns both IDs.
 *   The two inserts run in a single transaction so a failed task insert cannot
 *   leave an orphaned open session behind.
 * - otherwise (coding): inserts only a `tasks` row; `agent` is `null` when
 *   `identity` is `'boocode'`, else the identity itself. `sessionId` is
 *   returned as `null`.
 *
 * @returns the new task's ID and, for Q&A, the pre-created session's ID.
 * @throws if an INSERT fails or unexpectedly returns no row.
 */
const dispatchContestant: DispatchContestantFn = async ({
  projectId,
  prompt,
  identity,
  model,
  battleType,
}) => {
  if (battleType === 'qa') {
    // Session name embeds the identity and the first 30 characters of the prompt.
    const sessionName = `Arena Q&A [${identity}]: ${prompt.slice(0, 30)}`;
    return sql.begin(async (tx) => {
      const [session] = await tx<{ id: string }[]>`
        INSERT INTO sessions (project_id, name, model, agent_id, status)
        VALUES (${projectId}, ${sessionName}, ${model}, ${identity}, 'open')
        RETURNING id
      `;
      if (!session) {
        throw new Error('arena: session INSERT returned no row');
      }
      const [task] = await tx<{ id: string }[]>`
        INSERT INTO tasks (project_id, input, model, session_id)
        VALUES (${projectId}, ${prompt}, ${model}, ${session.id})
        RETURNING id
      `;
      if (!task) {
        throw new Error('arena: task INSERT returned no row');
      }
      return { taskId: task.id, sessionId: session.id };
    });
  }

  // Coding: boocode = native inference (no external agent); any other identity
  // is an external agent name (claude, opencode, qwen, goose) that maps to
  // available_agents and gets its own per-task worktree via runExternalAgent.
  // Session is created lazily by the dispatcher, so sessionId is unknown here.
  const agentName = identity === 'boocode' ? null : identity;
  const [task] = await sql<{ id: string }[]>`
    INSERT INTO tasks (project_id, input, agent, model)
    VALUES (${projectId}, ${prompt}, ${agentName}, ${model})
    RETURNING id
  `;
  if (!task) {
    throw new Error('arena: task INSERT returned no row');
  }
  return { taskId: task.id, sessionId: null };
};
|
||||
|
||||
// Arena analyzer: two-stage digest→judge (v1). Pluggable seam — a v2 Han
|
||||
// Orchestrator flow can replace this without schema changes.
|
||||
// Arena analyzer, shared by the battle-runner callbacks below.
const analyzer = createAnalyzer({
  sql,
  broker,
  log: app.log,
  config,
  localModels,
});

// Arena battle-runner: notified on the same onTaskTerminal hook as the flow-runner.
// The analyzer calls are fire-and-forget; each gets a .catch so a rejected
// analysis is logged instead of surfacing as an unhandled promise rejection.
const battleRunner = createBattleRunner({
  sql,
  broker,
  log: app.log,
  dispatch: dispatchContestant,
  onBattleComplete: (battleId) => {
    void analyzer.analyze(battleId).catch((err: unknown) => {
      app.log.error(
        { err: err instanceof Error ? err.message : String(err), battleId },
        'arena: analyze failed',
      );
    });
  },
  onCrossExamStart: ({ battleId, crossExamId, identity, model }) => {
    void analyzer
      .crossExamine(battleId, crossExamId, { identity, model })
      .catch((err: unknown) => {
        app.log.error(
          { err: err instanceof Error ? err.message : String(err), battleId, crossExamId },
          'arena: cross-examine failed',
        );
      });
  },
  localModels,
});
|
||||
|
||||
// Compose onTaskTerminal: both flow-runner and battle-runner are notified.
|
||||
// Each ignores tasks it doesn't own (flow-runner checks flow_steps.task_id;
|
||||
// battle-runner checks contestants.task_id).
|
||||
/**
 * Composed task-terminal hook: forwards the settled task to the flow-runner
 * and then to the battle-runner.
 *
 * The battle-runner call sits in a `finally` so it still runs when the
 * flow-runner handler throws synchronously; the error then propagates to the
 * caller as before. If both handlers throw, the battle-runner's error is the
 * one that propagates.
 *
 * @param taskId ID of the task that reached a terminal state.
 * @param state  The terminal state, passed through unchanged to both runners.
 */
const onTaskTerminal = (taskId: string, state: string): void => {
  try {
    flowRunner.handleTaskTerminal(taskId, state);
  } finally {
    battleRunner.handleTaskTerminal(taskId, state);
  }
};
|
||||
|
||||
// Phase 4: dispatcher — polls tasks table and runs inference. The composed
|
||||
// onTaskTerminal hook notifies both the flow-runner and the battle-runner when
|
||||
// any task settles.
|
||||
const dispatcher = createDispatcher({
|
||||
sql,
|
||||
inference: inferenceApi,
|
||||
broker,
|
||||
log: app.log,
|
||||
config,
|
||||
onTaskTerminal: flowRunner.handleTaskTerminal,
|
||||
onTaskTerminal,
|
||||
});
|
||||
dispatcher.start();
|
||||
|
||||
// Phase 5: re-advance any flow_runs that were 'running' when the service last
|
||||
// stopped (D-9). Runs AFTER dispatcher.start() so re-dispatched 'pending' tasks
|
||||
// are picked up by the dispatcher's startup poll.
|
||||
// Re-advance in-flight flow_runs and battles after a coder restart. Both run
|
||||
// AFTER dispatcher.start() so re-dispatched 'pending' tasks are picked up.
|
||||
void flowRunner.initResume().catch((err) => {
|
||||
app.log.error(
|
||||
{ err: err instanceof Error ? err.message : String(err) },
|
||||
'flow-runner: initResume failed',
|
||||
);
|
||||
});
|
||||
void battleRunner.initResume().catch((err) => {
|
||||
app.log.error(
|
||||
{ err: err instanceof Error ? err.message : String(err) },
|
||||
'arena: initResume failed',
|
||||
);
|
||||
});
|
||||
|
||||
// v2.6 Phase 3: configure + start the agent-pool lifecycle sweep (idle-TTL +
|
||||
// LRU-cap eviction of warm backends, plus each backend's proactive health probe)
|
||||
@@ -281,8 +372,8 @@ async function main() {
|
||||
registerTaskRoutes(app, sql, inferenceApi, dispatcher.cancelExternalTask);
|
||||
registerInboxRoutes(app, sql);
|
||||
registerStatsRoutes(app, sql);
|
||||
registerArenaRoutes(app, sql);
|
||||
registerRunsRoutes(app, sql, flowRunner, dispatcher.cancelExternalTask);
|
||||
registerArenaRoutes(app, sql, battleRunner, dispatcher.cancelExternalTask, config);
|
||||
registerProviderRoutes(app, sql, config);
|
||||
registerWorktreeSafetyRoutes(app, sql);
|
||||
registerLifecycleRoutes(app, sql);
|
||||
|
||||
Reference in New Issue
Block a user