Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
497 lines
17 KiB
TypeScript
497 lines
17 KiB
TypeScript
/**
 * Arena Analyzer — pluggable seam for battle analysis and cross-examination.
 *
 * The Analyzer interface is the plug point: a v2 Han Orchestrator flow can
 * replace the v1 two-stage digest→judge implementation without a schema change.
 *
 * v1 implementation uses DEFAULT_MODEL via direct llama-swap calls (arenaModelCall):
 *   Digest stage — one call per succeeded contestant, concurrent; produces a
 *                  bounded summary of each result (result.md + diff.patch for
 *                  coding, result.md for Q&A).
 *   Judge stage  — one call with all digests + the original prompt; writes
 *                  analysis.md, names a winner (unless < 2 succeeded), and
 *                  updates battles.winner_contestant_id.
 *
 * Cross-examination:
 *   Local model — direct arenaModelCall to llama-swap with the chosen model.
 *   Cloud model — inserts a tasks row (triggers the dispatcher via pg_notify);
 *                 polls for completion; reads output_summary as the verdict.
 * In both cases the verdict is written to cross_examinations.verdict, appended
 * to <resultsPath>/cross-exam.md, and a battle_updated frame is published.
 *
 * Never throws — all errors are caught, logged, and swallowed so the caller
 * (arena-runner's onBattleComplete / onCrossExamStart) is never wedged.
 */
|
|
|
|
import { readFile, writeFile, mkdir } from 'node:fs/promises';
|
|
import { join } from 'node:path';
|
|
import type { Sql } from '../db.js';
|
|
import type { Broker } from '@boocode/server/broker';
|
|
import type { WsFrame } from '@boocode/contracts/ws-frames';
|
|
import type { FastifyBaseLogger } from 'fastify';
|
|
import type { Config } from '../config.js';
|
|
import type { BattleType } from '@boocode/contracts/arena';
|
|
import { arenaModelCall } from './arena-model-call.js';
|
|
import {
|
|
buildDigestPrompt,
|
|
buildJudgePrompt,
|
|
buildCrossExamPrompt,
|
|
extractWinner,
|
|
shouldNameWinner,
|
|
type ContestantDigest,
|
|
} from './arena-analyzer-helpers.js';
|
|
|
|
// ─── Public interface ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Pluggable analysis seam. Callers depend only on this shape, so the v1
 * digest→judge implementation can be swapped for a Han Orchestrator flow in v2.
 */
export interface Analyzer {
  /**
   * Run the two-stage digest→judge analysis for a completed battle.
   *
   * @param battleId - Id of the battles row to analyze.
   */
  analyze(battleId: string): Promise<void>;

  /**
   * Run a cross-examination for a cross_examinations row that has already
   * been inserted. The verdict is written back to that row and a
   * battle_updated frame is published.
   *
   * @param battleId - Id of the battle being cross-examined.
   * @param crossExamId - Id of the pre-existing cross_examinations row.
   * @param opts - Identity and model of the examiner that produces the verdict.
   */
  crossExamine(
    battleId: string,
    crossExamId: string,
    opts: { identity: string; model: string },
  ): Promise<void>;
}
|
|
|
|
// ─── Internal DB row types ────────────────────────────────────────────────────
|
|
|
|
/** Columns of a `battles` row that the analyzer selects (see loadBattle). */
interface BattleRow {
  id: string;
  /** Project the battle belongs to; reused when a cloud cross-exam task is inserted. */
  project_id: string;
  /** 'coding' battles additionally feed diff.patch into each digest. */
  battle_type: BattleType;
  /** Original prompt; passed to the judge and cross-exam prompt builders. */
  prompt: string;
  status: string;
  /** Folder receiving analysis.md and cross-exam.md; when null no files are written. */
  results_path: string | null;
  /** Contestant currently recorded as the winner, if any. */
  winner_contestant_id: string | null;
}
|
|
|
|
/** Columns of a `contestants` row that the analyzer selects (see loadContestants). */
interface ContestantRow {
  id: string;
  /** Together with `model`, the pair the judge's named winner is matched against. */
  identity: string;
  model: string;
  lane: string;
  /** The analyzer treats 'done' (with a result_path) as succeeded and 'error' as failed. */
  status: string;
  /** Folder holding result.md (and diff.patch for coding battles), or null. */
  result_path: string | null;
  /** Wall-clock duration shown in the benchmark line; null when not measured. */
  duration_ms: number | null;
  /** Throughput shown in the benchmark line; null when not measured. */
  tokens_per_sec: number | null;
}
|
|
|
|
// ─── Factory ──────────────────────────────────────────────────────────────────
|
|
|
|
/** Everything createAnalyzer needs from the host application. */
interface AnalyzerDeps {
  /** Tagged-template SQL client used for battles, contestants, cross_examinations and tasks. */
  sql: Sql;
  /** Broker through which battle_updated frames are published. */
  broker: Broker;
  log: FastifyBaseLogger;
  /** Only the llama-swap endpoint and the default model are required. */
  config: Pick<Config, 'LLAMA_SWAP_URL' | 'DEFAULT_MODEL'>;
  /** Model IDs served by local llama-swap — cross-exam routing uses this. */
  localModels: ReadonlySet<string>;
}
|
|
|
|
export function createAnalyzer(deps: AnalyzerDeps): Analyzer {
|
|
const { sql, broker, log, config, localModels } = deps;
|
|
|
|
// ─── analyze ──────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Public entry point for battle analysis. Delegates to runAnalysis and
 * converts any rejection into an error log, so the returned promise resolves
 * even when the analysis itself fails.
 *
 * @param battleId - Id of the battle to analyze.
 */
function analyze(battleId: string): Promise<void> {
  return runAnalysis(battleId).catch((cause: unknown) => {
    log.error(
      { err: errMsg(cause), battleId },
      'arena-analyzer: analysis failed',
    );
  });
}
|
|
|
|
/**
 * Run the digest → judge pipeline for one battle.
 *
 * Steps:
 *  1. Load the battle and its contestants; bail out (with a warning) when the
 *     battle row does not exist.
 *  2. Digest every contestant that finished ('done') and has a result_path,
 *     concurrently. Contestants whose digest call failed are dropped.
 *  3. Ask the judge for a verdict and write analysis.md into results_path
 *     (skipped when results_path is null).
 *  4. If the judge named a winner that matches a contestant by
 *     identity + model, persist it on the battles row.
 *  5. Publish a battle_updated frame with analysis_ready = true.
 *
 * The published winner_contestant_id always mirrors what is stored on the
 * battles row: when this run did not write a new winner, the row is re-read so
 * an earlier winner is not broadcast as null while still being persisted.
 *
 * Errors are not caught here; the public `analyze` wrapper logs them.
 *
 * @param battleId - Id of the battle to analyze.
 */
async function runAnalysis(battleId: string): Promise<void> {
  const battle = await loadBattle(battleId);
  if (!battle) {
    log.warn({ battleId }, 'arena-analyzer: battle not found');
    return;
  }

  const contestants = await loadContestants(battleId);
  const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);

  log.info(
    { battleId, total: contestants.length, succeeded: succeeded.length },
    'arena-analyzer: starting analysis',
  );

  // Digest stage — concurrent, one call per succeeded contestant.
  const digests = (
    await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
  ).filter((d): d is ContestantDigest => d !== null);

  // Failed contestants are noted in the analysis even if they produced no digest.
  const failedNotes = contestants
    .filter((c) => c.status === 'error')
    .map((c) => `- **${c.identity} / ${c.model}**: failed (no result)\n`);

  // Judge stage — single call with all digests.
  const { analysisText, winner } = await judgeContestants(battle, digests, failedNotes);

  // Write analysis.md to the battle results folder.
  const resultsPath = battle.results_path;
  if (resultsPath) {
    await mkdir(resultsPath, { recursive: true });
    await writeFile(join(resultsPath, 'analysis.md'), analysisText, 'utf8');
  }

  // Resolve the winner to a contestant id and update the battle row.
  let winnerId: string | null = null;
  if (winner && shouldNameWinner(succeeded.length)) {
    const winnerContestant = contestants.find(
      (c) => c.identity === winner.identity && c.model === winner.model,
    );
    if (winnerContestant) {
      winnerId = winnerContestant.id;
      await sql`
        UPDATE battles
        SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp()
        WHERE id = ${battleId}
      `;
      log.info({ battleId, winnerId, identity: winner.identity, model: winner.model }, 'arena-analyzer: winner set');
    } else {
      log.warn({ battleId, winner }, 'arena-analyzer: judge named a winner not found in contestants');
    }
  }

  // When this run wrote no winner the battles row is untouched, so broadcast
  // whatever is persisted right now instead of an explicit null that would
  // contradict the database.
  let publishedWinnerId = winnerId;
  if (publishedWinnerId === null) {
    const current = await loadBattle(battleId);
    publishedWinnerId = current?.winner_contestant_id ?? null;
  }

  publishUser({
    type: 'battle_updated',
    battle_id: battleId,
    winner_contestant_id: publishedWinnerId,
    analysis_ready: true,
  });

  log.info({ battleId }, 'arena-analyzer: analysis complete');
}
|
|
|
|
// ─── crossExamine ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Public entry point for cross-examination. Delegates to runCrossExam and
 * converts any rejection into an error log, so the returned promise resolves
 * even when the cross-exam itself fails.
 *
 * @param battleId - Id of the battle being cross-examined.
 * @param crossExamId - Id of the already-inserted cross_examinations row.
 * @param opts - Identity and model of the examiner.
 */
function crossExamine(
  battleId: string,
  crossExamId: string,
  opts: { identity: string; model: string },
): Promise<void> {
  return runCrossExam(battleId, crossExamId, opts).catch((cause: unknown) => {
    log.error(
      { err: errMsg(cause), battleId, crossExamId },
      'arena-analyzer: cross-exam failed',
    );
  });
}
|
|
|
|
/**
 * Produce and persist one cross-examination verdict.
 *
 * Steps:
 *  1. Load the battle (warn and return if missing) and its contestants.
 *  2. Rebuild the contestant digests. Note that digestContestant issues a
 *     model call per succeeded contestant, so this is not a cached read.
 *  3. Read analysis.md (empty string if unreadable) and resolve the currently
 *     recorded winner into an "identity/model" label.
 *  4. Build the cross-exam prompt and route it to the examiner model.
 *  5. Store the verdict on the cross_examinations row, append it to
 *     cross-exam.md, and publish a battle_updated frame.
 *
 * The database row is the authoritative copy of the verdict. Appending to
 * cross-exam.md is best-effort: the folder is created if needed, and a
 * filesystem failure is logged as a warning instead of aborting, so the
 * battle_updated frame is still published for a verdict that is already stored.
 *
 * Model-call and database errors are not caught here; the public
 * `crossExamine` wrapper logs them.
 *
 * @param battleId - Id of the battle being cross-examined.
 * @param crossExamId - Id of the already-inserted cross_examinations row.
 * @param opts - Identity and model of the examiner.
 */
async function runCrossExam(
  battleId: string,
  crossExamId: string,
  opts: { identity: string; model: string },
): Promise<void> {
  const battle = await loadBattle(battleId);
  if (!battle) {
    log.warn({ battleId }, 'arena-analyzer: battle not found for cross-exam');
    return;
  }

  const contestants = await loadContestants(battleId);

  // Re-read the digests (if contestants have results) for context.
  const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
  const digests = (
    await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
  ).filter((d): d is ContestantDigest => d !== null);

  // Read analysis.md for the proposed analysis content.
  let analysisContent = '';
  if (battle.results_path) {
    analysisContent = await readFile(
      join(battle.results_path, 'analysis.md'), 'utf8',
    ).catch(() => '');
  }

  // Resolve proposed winner label.
  let proposedWinner: string | null = null;
  if (battle.winner_contestant_id) {
    const w = contestants.find((c) => c.id === battle.winner_contestant_id);
    if (w) proposedWinner = `${w.identity}/${w.model}`;
  }

  const { system, user } = buildCrossExamPrompt({
    originalPrompt: battle.prompt,
    digests,
    analysisContent,
    proposedWinner,
    examinerIdentity: opts.identity,
    examinerModel: opts.model,
  });

  log.info({ battleId, crossExamId, identity: opts.identity, model: opts.model }, 'arena-analyzer: running cross-exam');

  const verdict = await executeModelCall({
    battleId,
    projectId: battle.project_id,
    identity: opts.identity,
    model: opts.model,
    system,
    user,
  });

  // Persist verdict and append to cross-exam.md.
  await sql`
    UPDATE cross_examinations
    SET verdict = ${verdict}
    WHERE id = ${crossExamId}
  `;

  if (battle.results_path) {
    const crossExamPath = join(battle.results_path, 'cross-exam.md');
    const section =
      `\n---\n\n# Cross-Examination by ${opts.identity} / ${opts.model}\n\n` +
      `${verdict}\n`;
    try {
      // The folder may not exist yet if no analysis has been written for this battle.
      await mkdir(battle.results_path, { recursive: true });
      await writeFile(crossExamPath, section, { flag: 'a', encoding: 'utf8' });
    } catch (err) {
      // The verdict is already stored in the database; do not let a disk
      // problem prevent clients from being notified about it.
      log.warn(
        { err: errMsg(err), battleId, crossExamId, crossExamPath },
        'arena-analyzer: failed to append cross-exam.md',
      );
    }
  }

  publishUser({
    type: 'battle_updated',
    battle_id: battleId,
    cross_exam_id: crossExamId,
  });

  log.info({ battleId, crossExamId }, 'arena-analyzer: cross-exam complete');
}
|
|
|
|
// ─── Model call routing ───────────────────────────────────────────────────
|
|
|
|
/**
 * Send a one-shot prompt to the backend that serves the requested model.
 *
 * A model counts as local when `localModels` contains its id either verbatim
 * or with a `llama-swap/` prefix; such calls go straight to arenaModelCall.
 * Everything else is handed to executeCloudModelCall, which inserts a tasks
 * row and polls for completion.
 *
 * @param opts - Battle/project context plus examiner identity, model and prompts.
 * @returns The model's text output.
 */
async function executeModelCall(opts: {
  battleId: string;
  projectId: string;
  identity: string;
  model: string;
  system: string;
  user: string;
}): Promise<string> {
  const { model, system, user } = opts;
  const servedLocally = [model, `llama-swap/${model}`].some((id) => localModels.has(id));

  if (!servedLocally) {
    // Cloud path: dispatch through the task system and poll for completion.
    return executeCloudModelCall(opts);
  }

  return arenaModelCall({
    config,
    model,
    system,
    user,
    maxTokens: 2_000,
    temperature: 0.3,
  });
}
|
|
|
|
/**
 * Run a prompt through the tasks dispatcher and wait for its result.
 *
 * Inserts a tasks row, then polls it every 2 seconds for up to 5 minutes.
 * Note that the first poll happens only after one full poll interval, and a
 * task that times out is left as-is (this function does not cancel it).
 *
 * @param opts - Project id, examiner identity/model and the two prompt parts.
 * @returns The task's output_summary (empty string when it is null).
 * @throws Error when the insert returns no row, when the task row vanishes
 *   while polling, when the task ends as 'failed' or 'cancelled', or when the
 *   timeout elapses.
 */
async function executeCloudModelCall(opts: {
  projectId: string;
  identity: string;
  model: string;
  system: string;
  user: string;
}): Promise<string> {
  // The cross-exam prompt is the full input to the external agent. We embed
  // the system prompt as a preamble in the user message (external agents don't
  // take a separate system arg through the tasks dispatcher).
  const input = `${opts.system}\n\n${opts.user}`;

  // For well-known external agents, stamp the agent name so the dispatcher
  // routes via PTY/ACP. For unknown identities fall back to native inference
  // (agent = null → DEFAULT_MODEL text generation).
  const knownAgents = new Set(['claude', 'opencode', 'qwen', 'goose']);
  const agentName = knownAgents.has(opts.identity) ? opts.identity : null;

  const [task] = await sql<{ id: string }[]>`
    INSERT INTO tasks (project_id, input, agent, model)
    VALUES (${opts.projectId}, ${input}, ${agentName}, ${opts.model})
    RETURNING id
  `;
  if (!task) {
    // Guard instead of a non-null assertion: fail with a clear message rather
    // than a TypeError on `undefined.id`.
    throw new Error('cloud cross-exam task insert returned no row');
  }
  const taskId = task.id;

  log.info({ taskId, identity: opts.identity, model: opts.model }, 'arena-analyzer: cloud cross-exam task dispatched');

  // Poll until terminal (up to 5 minutes).
  const timeoutMs = 5 * 60 * 1_000;
  const pollMs = 2_000;
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    await sleep(pollMs);
    const [row] = await sql<{ state: string; output_summary: string | null }[]>`
      SELECT state, output_summary FROM tasks WHERE id = ${taskId}
    `;
    if (!row) {
      // A missing row is not a timeout; report what actually happened.
      throw new Error(`cloud cross-exam task ${taskId} no longer exists`);
    }
    if (row.state === 'completed') return row.output_summary ?? '';
    if (row.state === 'failed' || row.state === 'cancelled') {
      throw new Error(`cross-exam task ${row.state}: ${row.output_summary ?? ''}`);
    }
  }

  throw new Error(`cloud cross-exam task timed out after ${timeoutMs / 1000}s`);
}
|
|
|
|
// ─── Digest helper ────────────────────────────────────────────────────────
|
|
|
|
/**
 * Summarize one contestant's output with DEFAULT_MODEL.
 *
 * Reads result.md from the contestant's result folder (an unreadable file is
 * treated as empty text) and, for 'coding' battles only, diff.patch (an
 * unreadable file is treated as absent). The digest prompt also receives the
 * contestant's benchmark line.
 *
 * @param battle - Battle the contestant belongs to; only battle_type is read.
 * @param c - Contestant to digest.
 * @returns The digest record, or null when the contestant has no result_path
 *   or the model call failed (the failure is logged as a warning).
 */
async function digestContestant(
  battle: BattleRow,
  c: ContestantRow,
): Promise<ContestantDigest | null> {
  const resultDir = c.result_path;
  if (!resultDir) return null;

  const resultMd = await readFile(join(resultDir, 'result.md'), 'utf8').catch(() => '');

  // Only coding battles carry a patch worth showing to the digest model.
  const diffPatch =
    battle.battle_type === 'coding'
      ? await readFile(join(resultDir, 'diff.patch'), 'utf8').catch(() => undefined)
      : undefined;

  const benchmarkLine = formatBenchmarkLine(c);
  const prompt = buildDigestPrompt({
    identity: c.identity,
    model: c.model,
    resultMd,
    diffPatch,
    benchmarkLine,
  });

  try {
    const digest = await arenaModelCall({
      config,
      model: config.DEFAULT_MODEL,
      system: prompt.system,
      user: prompt.user,
      maxTokens: 500,
      temperature: 0.3,
    });
    return { identity: c.identity, model: c.model, digest, benchmarkLine };
  } catch (cause) {
    log.warn(
      { err: errMsg(cause), identity: c.identity, model: c.model },
      'arena-analyzer: digest call failed — skipping contestant',
    );
    return null;
  }
}
|
|
|
|
// ─── Judge helper ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Ask DEFAULT_MODEL to compare the digests and assemble the analysis document.
 *
 * A failed judge call is logged and replaced by a placeholder verdict, so this
 * function still returns a document. A winner is extracted from the judge
 * output only when shouldNameWinner accepts the number of digests.
 *
 * @param battle - Battle being judged (prompt, battle_type and id are read).
 * @param digests - Digests of the contestants that produced a result.
 * @param failedNotes - Pre-formatted markdown bullets for failed contestants.
 * @returns The markdown for analysis.md and the winner named by the judge, if any.
 */
async function judgeContestants(
  battle: BattleRow,
  digests: ContestantDigest[],
  failedNotes: string[],
): Promise<{ analysisText: string; winner: { identity: string; model: string } | null }> {
  const prompt = buildJudgePrompt(battle.prompt, digests);

  let verdictText: string;
  try {
    verdictText = await arenaModelCall({
      config,
      model: config.DEFAULT_MODEL,
      system: prompt.system,
      user: prompt.user,
      maxTokens: 2_000,
      temperature: 0.3,
    });
  } catch (cause) {
    log.error({ err: errMsg(cause), battleId: battle.id }, 'arena-analyzer: judge call failed');
    verdictText = '*(Judge call failed — no comparison produced.)*';
  }

  const winner = shouldNameWinner(digests.length) ? extractWinner(verdictText) : null;

  // Each optional part contributes zero or more lines; everything is joined
  // with a single newline at the end.
  const failedPart =
    failedNotes.length > 0 ? ['\n## Failed Contestants\n', ...failedNotes] : [];

  const digestPart =
    digests.length > 0
      ? [
          '\n## Contestant Digests\n',
          ...digests.flatMap((d) => [
            `### ${d.identity} / ${d.model}`,
            `*Benchmark: ${d.benchmarkLine}*\n`,
            d.digest,
          ]),
        ]
      : [];

  let winnerPart: string;
  if (winner) {
    winnerPart = `\n## Winner\n**${winner.identity} / ${winner.model}**`;
  } else {
    const reason =
      digests.length < 2
        ? 'fewer than 2 contestants produced results'
        : 'no clear winner identified';
    winnerPart = `\n## Winner\n*No winner named (${reason}).*`;
  }

  const analysisText = [
    `# Arena Analysis`,
    `\n**Battle type:** ${battle.battle_type}`,
    ...failedPart,
    ...digestPart,
    "\n## Judge's Verdict\n",
    verdictText,
    winnerPart,
  ].join('\n');

  return { analysisText, winner };
}
|
|
|
|
// ─── DB helpers ───────────────────────────────────────────────────────────
|
|
|
|
/**
 * Fetch the battles row with the given id.
 *
 * @param battleId - Id to look up.
 * @returns The row, or null when no battle has that id.
 */
async function loadBattle(battleId: string): Promise<BattleRow | null> {
  const [found = null] = await sql<BattleRow[]>`
    SELECT id, project_id, battle_type, prompt, status, results_path, winner_contestant_id
    FROM battles WHERE id = ${battleId}
  `;
  return found;
}
|
|
|
|
/**
 * Fetch all contestants of a battle, oldest first (ordered by created_at).
 *
 * @param battleId - Id of the battle whose contestants are wanted.
 * @returns The contestant rows; empty when the battle has none.
 */
async function loadContestants(battleId: string): Promise<ContestantRow[]> {
  const rows = await sql<ContestantRow[]>`
    SELECT id, identity, model, lane, status, result_path, duration_ms, tokens_per_sec
    FROM contestants WHERE battle_id = ${battleId}
    ORDER BY created_at ASC
  `;
  return rows;
}
|
|
|
|
// ─── Misc helpers ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Render a contestant's benchmark figures as a short human-readable line,
 * e.g. "1200ms, 12.3 tok/s". Figures that are null are omitted; when both are
 * null the result is "no benchmark".
 *
 * @param c - Contestant whose duration_ms and tokens_per_sec are formatted.
 * @returns The comma-separated benchmark line.
 */
function formatBenchmarkLine(c: ContestantRow): string {
  const { duration_ms: durationMs, tokens_per_sec: tokensPerSec } = c;

  const pieces = [
    durationMs !== null ? `${durationMs}ms` : null,
    tokensPerSec !== null ? `${tokensPerSec.toFixed(1)} tok/s` : null,
  ].filter((piece): piece is string => piece !== null);

  return pieces.length === 0 ? 'no benchmark' : pieces.join(', ');
}
|
|
|
|
/**
 * Publish a frame through the broker's user channel.
 *
 * The frame is passed as a loose record and cast to WsFrame, so its shape is
 * not checked by the compiler here.
 * NOTE(review): the first argument is hard-coded to 'default' — presumably the
 * user/channel id; confirm against Broker.publishUserFrame.
 *
 * @param frame - Frame payload to publish.
 */
function publishUser(frame: Record<string, unknown>): void {
  const wsFrame = frame as unknown as WsFrame;
  broker.publishUserFrame('default', wsFrame);
}
|
|
|
|
/**
 * Resolve after the given delay.
 *
 * @param ms - Delay in milliseconds, handed to setTimeout.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
return { analyze, crossExamine };
|
|
}
|
|
|
|
/**
 * Turn any thrown value into a log-friendly message.
 *
 * - Error instances yield their `message`.
 * - Other objects yield their string `message` property when they have one,
 *   otherwise their JSON form, so logs do not degrade to "[object Object]".
 * - Everything else (strings, numbers, null, undefined, ...) goes through
 *   String().
 *
 * @param e - The caught value.
 * @returns A human-readable description; never throws.
 */
function errMsg(e: unknown): string {
  if (e instanceof Error) return e.message;

  if (typeof e === 'object' && e !== null) {
    const candidate = (e as { message?: unknown }).message;
    if (typeof candidate === 'string') return candidate;
    try {
      // JSON.stringify can yield undefined at runtime (e.g. toJSON returning
      // undefined) and throws on circular structures; fall back in both cases.
      return JSON.stringify(e) ?? String(e);
    } catch {
      return String(e);
    }
  }

  return String(e);
}
|