feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
496
apps/coder/src/services/arena-analyzer.ts
Normal file
496
apps/coder/src/services/arena-analyzer.ts
Normal file
@@ -0,0 +1,496 @@
|
||||
/**
|
||||
* Arena Analyzer — pluggable seam for battle analysis and cross-examination.
|
||||
*
|
||||
* The Analyzer interface is the plug point: a v2 Han Orchestrator flow can
|
||||
* replace the v1 two-stage digest→judge implementation without a schema change.
|
||||
*
|
||||
* v1 implementation uses DEFAULT_MODEL via direct llama-swap calls (arenaModelCall):
|
||||
* Digest stage — one call per succeeded contestant, concurrent; produces a
|
||||
* bounded summary of each result (result.md + diff.patch for
|
||||
* coding, result.md for Q&A).
|
||||
* Judge stage — one call with all digests + the original prompt; writes
|
||||
* analysis.md, names a winner (unless < 2 succeeded), and
|
||||
* updates battles.winner_contestant_id.
|
||||
*
|
||||
* Cross-examination:
|
||||
* Local model — direct arenaModelCall to llama-swap with the chosen model.
|
||||
* Cloud model — inserts a tasks row (triggers the dispatcher via pg_notify);
|
||||
* polls for completion; reads output_summary as the verdict.
|
||||
* In both cases the verdict is written to cross_examinations.verdict, appended
|
||||
* to <resultsPath>/cross-exam.md, and a battle_updated frame is published.
|
||||
*
|
||||
* Never throws — all errors are caught, logged, and swallowed so the caller
|
||||
* (arena-runner's onBattleComplete / onCrossExamStart) is never wedged.
|
||||
*/
|
||||
|
||||
import { readFile, writeFile, mkdir } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type { Sql } from '../db.js';
|
||||
import type { Broker } from '@boocode/server/broker';
|
||||
import type { WsFrame } from '@boocode/contracts/ws-frames';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { Config } from '../config.js';
|
||||
import type { BattleType } from '@boocode/contracts/arena';
|
||||
import { arenaModelCall } from './arena-model-call.js';
|
||||
import {
|
||||
buildDigestPrompt,
|
||||
buildJudgePrompt,
|
||||
buildCrossExamPrompt,
|
||||
extractWinner,
|
||||
shouldNameWinner,
|
||||
type ContestantDigest,
|
||||
} from './arena-analyzer-helpers.js';
|
||||
|
||||
// ─── Public interface ─────────────────────────────────────────────────────────
|
||||
|
||||
/** Pluggable analysis seam — swap to a Han Orchestrator flow in v2. */
|
||||
export interface Analyzer {
|
||||
/** Run the two-stage digest→judge analysis for a completed battle. */
|
||||
analyze(battleId: string): Promise<void>;
|
||||
/**
|
||||
* Run a cross-examination for an already-inserted cross_examinations row.
|
||||
* The result is written back to that row and a battle_updated frame is published.
|
||||
*/
|
||||
crossExamine(
|
||||
battleId: string,
|
||||
crossExamId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<void>;
|
||||
}
|
||||
|
||||
// ─── Internal DB row types ────────────────────────────────────────────────────
|
||||
|
||||
interface BattleRow {
|
||||
id: string;
|
||||
project_id: string;
|
||||
battle_type: BattleType;
|
||||
prompt: string;
|
||||
status: string;
|
||||
results_path: string | null;
|
||||
winner_contestant_id: string | null;
|
||||
}
|
||||
|
||||
interface ContestantRow {
|
||||
id: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
lane: string;
|
||||
status: string;
|
||||
result_path: string | null;
|
||||
duration_ms: number | null;
|
||||
tokens_per_sec: number | null;
|
||||
}
|
||||
|
||||
// ─── Factory ──────────────────────────────────────────────────────────────────
|
||||
|
||||
interface AnalyzerDeps {
|
||||
sql: Sql;
|
||||
broker: Broker;
|
||||
log: FastifyBaseLogger;
|
||||
config: Pick<Config, 'LLAMA_SWAP_URL' | 'DEFAULT_MODEL'>;
|
||||
/** Model IDs served by local llama-swap — cross-exam routing uses this. */
|
||||
localModels: ReadonlySet<string>;
|
||||
}
|
||||
|
||||
export function createAnalyzer(deps: AnalyzerDeps): Analyzer {
|
||||
const { sql, broker, log, config, localModels } = deps;
|
||||
|
||||
// ─── analyze ──────────────────────────────────────────────────────────────
|
||||
|
||||
async function analyze(battleId: string): Promise<void> {
|
||||
try {
|
||||
await runAnalysis(battleId);
|
||||
} catch (err) {
|
||||
log.error(
|
||||
{ err: errMsg(err), battleId },
|
||||
'arena-analyzer: analysis failed',
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async function runAnalysis(battleId: string): Promise<void> {
|
||||
const battle = await loadBattle(battleId);
|
||||
if (!battle) {
|
||||
log.warn({ battleId }, 'arena-analyzer: battle not found');
|
||||
return;
|
||||
}
|
||||
|
||||
const contestants = await loadContestants(battleId);
|
||||
const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
|
||||
|
||||
log.info(
|
||||
{ battleId, total: contestants.length, succeeded: succeeded.length },
|
||||
'arena-analyzer: starting analysis',
|
||||
);
|
||||
|
||||
// Digest stage — concurrent, one call per succeeded contestant.
|
||||
const digests = (
|
||||
await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
|
||||
).filter((d): d is ContestantDigest => d !== null);
|
||||
|
||||
// Failed contestants are noted in the analysis even if they produced no digest.
|
||||
const failedNotes = contestants
|
||||
.filter((c) => c.status === 'error')
|
||||
.map((c) => `- **${c.identity} / ${c.model}**: failed (no result)\n`);
|
||||
|
||||
// Judge stage — single call with all digests.
|
||||
const { analysisText, winner } = await judgeContestants(battle, digests, failedNotes);
|
||||
|
||||
// Write analysis.md to the battle results folder.
|
||||
const resultsPath = battle.results_path;
|
||||
if (resultsPath) {
|
||||
await mkdir(resultsPath, { recursive: true });
|
||||
await writeFile(join(resultsPath, 'analysis.md'), analysisText, 'utf8');
|
||||
}
|
||||
|
||||
// Resolve the winner to a contestant id and update the battle row.
|
||||
let winnerId: string | null = null;
|
||||
if (winner && shouldNameWinner(succeeded.length)) {
|
||||
const winnerContestant = contestants.find(
|
||||
(c) => c.identity === winner.identity && c.model === winner.model,
|
||||
);
|
||||
if (winnerContestant) {
|
||||
winnerId = winnerContestant.id;
|
||||
await sql`
|
||||
UPDATE battles
|
||||
SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp()
|
||||
WHERE id = ${battleId}
|
||||
`;
|
||||
log.info({ battleId, winnerId, identity: winner.identity, model: winner.model }, 'arena-analyzer: winner set');
|
||||
} else {
|
||||
log.warn({ battleId, winner }, 'arena-analyzer: judge named a winner not found in contestants');
|
||||
}
|
||||
}
|
||||
|
||||
publishUser({
|
||||
type: 'battle_updated',
|
||||
battle_id: battleId,
|
||||
winner_contestant_id: winnerId,
|
||||
analysis_ready: true,
|
||||
});
|
||||
|
||||
log.info({ battleId }, 'arena-analyzer: analysis complete');
|
||||
}
|
||||
|
||||
// ─── crossExamine ─────────────────────────────────────────────────────────
|
||||
|
||||
async function crossExamine(
|
||||
battleId: string,
|
||||
crossExamId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<void> {
|
||||
try {
|
||||
await runCrossExam(battleId, crossExamId, opts);
|
||||
} catch (err) {
|
||||
log.error(
|
||||
{ err: errMsg(err), battleId, crossExamId },
|
||||
'arena-analyzer: cross-exam failed',
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async function runCrossExam(
|
||||
battleId: string,
|
||||
crossExamId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<void> {
|
||||
const battle = await loadBattle(battleId);
|
||||
if (!battle) {
|
||||
log.warn({ battleId }, 'arena-analyzer: battle not found for cross-exam');
|
||||
return;
|
||||
}
|
||||
|
||||
const contestants = await loadContestants(battleId);
|
||||
|
||||
// Re-read the digests (if contestants have results) for context.
|
||||
const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
|
||||
const digests = (
|
||||
await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
|
||||
).filter((d): d is ContestantDigest => d !== null);
|
||||
|
||||
// Read analysis.md for the proposed analysis content.
|
||||
let analysisContent = '';
|
||||
if (battle.results_path) {
|
||||
analysisContent = await readFile(
|
||||
join(battle.results_path, 'analysis.md'), 'utf8',
|
||||
).catch(() => '');
|
||||
}
|
||||
|
||||
// Resolve proposed winner label.
|
||||
let proposedWinner: string | null = null;
|
||||
if (battle.winner_contestant_id) {
|
||||
const w = contestants.find((c) => c.id === battle.winner_contestant_id);
|
||||
if (w) proposedWinner = `${w.identity}/${w.model}`;
|
||||
}
|
||||
|
||||
const { system, user } = buildCrossExamPrompt({
|
||||
originalPrompt: battle.prompt,
|
||||
digests,
|
||||
analysisContent,
|
||||
proposedWinner,
|
||||
examinerIdentity: opts.identity,
|
||||
examinerModel: opts.model,
|
||||
});
|
||||
|
||||
log.info({ battleId, crossExamId, identity: opts.identity, model: opts.model }, 'arena-analyzer: running cross-exam');
|
||||
|
||||
const verdict = await executeModelCall({
|
||||
battleId,
|
||||
projectId: battle.project_id,
|
||||
identity: opts.identity,
|
||||
model: opts.model,
|
||||
system,
|
||||
user,
|
||||
});
|
||||
|
||||
// Persist verdict and append to cross-exam.md.
|
||||
await sql`
|
||||
UPDATE cross_examinations
|
||||
SET verdict = ${verdict}
|
||||
WHERE id = ${crossExamId}
|
||||
`;
|
||||
|
||||
if (battle.results_path) {
|
||||
const crossExamPath = join(battle.results_path, 'cross-exam.md');
|
||||
const section =
|
||||
`\n---\n\n# Cross-Examination by ${opts.identity} / ${opts.model}\n\n` +
|
||||
`${verdict}\n`;
|
||||
await writeFile(crossExamPath, section, { flag: 'a', encoding: 'utf8' });
|
||||
}
|
||||
|
||||
publishUser({
|
||||
type: 'battle_updated',
|
||||
battle_id: battleId,
|
||||
cross_exam_id: crossExamId,
|
||||
});
|
||||
|
||||
log.info({ battleId, crossExamId }, 'arena-analyzer: cross-exam complete');
|
||||
}
|
||||
|
||||
// ─── Model call routing ───────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Route a one-shot model call to llama-swap (local) or the task dispatcher
|
||||
* (cloud). Cloud dispatch inserts a tasks row and polls for completion.
|
||||
*/
|
||||
async function executeModelCall(opts: {
|
||||
battleId: string;
|
||||
projectId: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
system: string;
|
||||
user: string;
|
||||
}): Promise<string> {
|
||||
const isLocal = localModels.has(opts.model) || localModels.has(`llama-swap/${opts.model}`);
|
||||
|
||||
if (isLocal) {
|
||||
return arenaModelCall({
|
||||
config,
|
||||
model: opts.model,
|
||||
system: opts.system,
|
||||
user: opts.user,
|
||||
maxTokens: 2_000,
|
||||
temperature: 0.3,
|
||||
});
|
||||
}
|
||||
|
||||
// Cloud path: dispatch through the task system and poll for completion.
|
||||
return executeCloudModelCall(opts);
|
||||
}
|
||||
|
||||
async function executeCloudModelCall(opts: {
|
||||
projectId: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
system: string;
|
||||
user: string;
|
||||
}): Promise<string> {
|
||||
// The cross-exam prompt is the full input to the external agent. We embed
|
||||
// the system prompt as a preamble in the user message (external agents don't
|
||||
// take a separate system arg through the tasks dispatcher).
|
||||
const input = `${opts.system}\n\n${opts.user}`;
|
||||
|
||||
// For well-known external agents, stamp the agent name so the dispatcher
|
||||
// routes via PTY/ACP. For unknown identities fall back to native inference
|
||||
// (agent = null → DEFAULT_MODEL text generation).
|
||||
const knownAgents = new Set(['claude', 'opencode', 'qwen', 'goose']);
|
||||
const agentName = knownAgents.has(opts.identity) ? opts.identity : null;
|
||||
|
||||
const [task] = await sql<{ id: string }[]>`
|
||||
INSERT INTO tasks (project_id, input, agent, model)
|
||||
VALUES (${opts.projectId}, ${input}, ${agentName}, ${opts.model})
|
||||
RETURNING id
|
||||
`;
|
||||
const taskId = task!.id;
|
||||
|
||||
log.info({ taskId, identity: opts.identity, model: opts.model }, 'arena-analyzer: cloud cross-exam task dispatched');
|
||||
|
||||
// Poll until terminal (up to 5 minutes).
|
||||
const timeoutMs = 5 * 60 * 1_000;
|
||||
const pollMs = 2_000;
|
||||
const deadline = Date.now() + timeoutMs;
|
||||
|
||||
while (Date.now() < deadline) {
|
||||
await sleep(pollMs);
|
||||
const [row] = await sql<{ state: string; output_summary: string | null }[]>`
|
||||
SELECT state, output_summary FROM tasks WHERE id = ${taskId}
|
||||
`;
|
||||
if (!row) break;
|
||||
if (row.state === 'completed') return row.output_summary ?? '';
|
||||
if (row.state === 'failed' || row.state === 'cancelled') {
|
||||
throw new Error(`cross-exam task ${row.state}: ${row.output_summary ?? ''}`);
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error(`cloud cross-exam task timed out after ${timeoutMs / 1000}s`);
|
||||
}
|
||||
|
||||
// ─── Digest helper ────────────────────────────────────────────────────────
|
||||
|
||||
async function digestContestant(
|
||||
battle: BattleRow,
|
||||
c: ContestantRow,
|
||||
): Promise<ContestantDigest | null> {
|
||||
if (!c.result_path) return null;
|
||||
|
||||
const resultMd = await readFile(join(c.result_path, 'result.md'), 'utf8').catch(() => '');
|
||||
|
||||
let diffPatch: string | undefined;
|
||||
if (battle.battle_type === 'coding') {
|
||||
diffPatch = await readFile(join(c.result_path, 'diff.patch'), 'utf8').catch(
|
||||
() => undefined,
|
||||
);
|
||||
}
|
||||
|
||||
const benchmarkLine = formatBenchmarkLine(c);
|
||||
const { system, user } = buildDigestPrompt({
|
||||
identity: c.identity,
|
||||
model: c.model,
|
||||
resultMd,
|
||||
diffPatch,
|
||||
benchmarkLine,
|
||||
});
|
||||
|
||||
let digest: string;
|
||||
try {
|
||||
digest = await arenaModelCall({
|
||||
config,
|
||||
model: config.DEFAULT_MODEL,
|
||||
system,
|
||||
user,
|
||||
maxTokens: 500,
|
||||
temperature: 0.3,
|
||||
});
|
||||
} catch (err) {
|
||||
log.warn(
|
||||
{ err: errMsg(err), identity: c.identity, model: c.model },
|
||||
'arena-analyzer: digest call failed — skipping contestant',
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
return { identity: c.identity, model: c.model, digest, benchmarkLine };
|
||||
}
|
||||
|
||||
// ─── Judge helper ─────────────────────────────────────────────────────────
|
||||
|
||||
async function judgeContestants(
|
||||
battle: BattleRow,
|
||||
digests: ContestantDigest[],
|
||||
failedNotes: string[],
|
||||
): Promise<{ analysisText: string; winner: { identity: string; model: string } | null }> {
|
||||
const { system, user } = buildJudgePrompt(battle.prompt, digests);
|
||||
|
||||
let judgeOutput = '';
|
||||
try {
|
||||
judgeOutput = await arenaModelCall({
|
||||
config,
|
||||
model: config.DEFAULT_MODEL,
|
||||
system,
|
||||
user,
|
||||
maxTokens: 2_000,
|
||||
temperature: 0.3,
|
||||
});
|
||||
} catch (err) {
|
||||
log.error({ err: errMsg(err), battleId: battle.id }, 'arena-analyzer: judge call failed');
|
||||
judgeOutput = '*(Judge call failed — no comparison produced.)*';
|
||||
}
|
||||
|
||||
const winner = shouldNameWinner(digests.length) ? extractWinner(judgeOutput) : null;
|
||||
|
||||
const sections: string[] = [
|
||||
`# Arena Analysis`,
|
||||
`\n**Battle type:** ${battle.battle_type}`,
|
||||
];
|
||||
|
||||
if (failedNotes.length > 0) {
|
||||
sections.push('\n## Failed Contestants\n');
|
||||
sections.push(...failedNotes);
|
||||
}
|
||||
|
||||
if (digests.length > 0) {
|
||||
sections.push('\n## Contestant Digests\n');
|
||||
for (const d of digests) {
|
||||
sections.push(`### ${d.identity} / ${d.model}`);
|
||||
sections.push(`*Benchmark: ${d.benchmarkLine}*\n`);
|
||||
sections.push(d.digest);
|
||||
}
|
||||
}
|
||||
|
||||
sections.push("\n## Judge's Verdict\n");
|
||||
sections.push(judgeOutput);
|
||||
|
||||
if (winner) {
|
||||
sections.push(`\n## Winner\n**${winner.identity} / ${winner.model}**`);
|
||||
} else {
|
||||
const reason =
|
||||
digests.length < 2
|
||||
? 'fewer than 2 contestants produced results'
|
||||
: 'no clear winner identified';
|
||||
sections.push(`\n## Winner\n*No winner named (${reason}).*`);
|
||||
}
|
||||
|
||||
return { analysisText: sections.join('\n'), winner };
|
||||
}
|
||||
|
||||
// ─── DB helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
async function loadBattle(battleId: string): Promise<BattleRow | null> {
|
||||
const [b] = await sql<BattleRow[]>`
|
||||
SELECT id, project_id, battle_type, prompt, status, results_path, winner_contestant_id
|
||||
FROM battles WHERE id = ${battleId}
|
||||
`;
|
||||
return b ?? null;
|
||||
}
|
||||
|
||||
async function loadContestants(battleId: string): Promise<ContestantRow[]> {
|
||||
return sql<ContestantRow[]>`
|
||||
SELECT id, identity, model, lane, status, result_path, duration_ms, tokens_per_sec
|
||||
FROM contestants WHERE battle_id = ${battleId}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
}
|
||||
|
||||
// ─── Misc helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
function formatBenchmarkLine(c: ContestantRow): string {
|
||||
const parts: string[] = [];
|
||||
if (c.duration_ms !== null) parts.push(`${c.duration_ms}ms`);
|
||||
if (c.tokens_per_sec !== null) parts.push(`${c.tokens_per_sec.toFixed(1)} tok/s`);
|
||||
return parts.length > 0 ? parts.join(', ') : 'no benchmark';
|
||||
}
|
||||
|
||||
function publishUser(frame: Record<string, unknown>): void {
|
||||
broker.publishUserFrame('default', frame as unknown as WsFrame);
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
return { analyze, crossExamine };
|
||||
}
|
||||
|
||||
function errMsg(e: unknown): string {
|
||||
return e instanceof Error ? e.message : String(e);
|
||||
}
|
||||
Reference in New Issue
Block a user