/**
 * Arena Analyzer — pluggable seam for battle analysis and cross-examination.
 *
 * The Analyzer interface is the plug point: a v2 Han Orchestrator flow can
 * replace the v1 two-stage digest→judge implementation without a schema change.
 *
 * v1 implementation uses DEFAULT_MODEL via direct llama-swap calls (arenaModelCall):
 *   Digest stage — one call per succeeded contestant, concurrent; produces a
 *                  bounded summary of each result (result.md + diff.patch for
 *                  coding, result.md for Q&A).
 *   Judge stage  — one call with all digests + the original prompt; writes
 *                  analysis.md, names a winner (unless < 2 succeeded), and
 *                  updates battles.winner_contestant_id.
 *
 * Cross-examination:
 *   Local model — direct arenaModelCall to llama-swap with the chosen model.
 *   Cloud model — inserts a tasks row (triggers the dispatcher via pg_notify);
 *                 polls for completion; reads output_summary as the verdict.
 *   In both cases the verdict is written to cross_examinations.verdict, appended
 *   to <results_path>/cross-exam.md, and a battle_updated frame is published.
 *
 * Never throws — all errors are caught, logged, and swallowed so the caller
 * (arena-runner's onBattleComplete / onCrossExamStart) is never wedged.
 */
import { readFile, writeFile, mkdir } from 'node:fs/promises';
import { join } from 'node:path';
import type { Sql } from '../db.js';
import type { Broker } from '@boocode/server/broker';
import type { WsFrame } from '@boocode/contracts/ws-frames';
import type { FastifyBaseLogger } from 'fastify';
import type { Config } from '../config.js';
import type { BattleType } from '@boocode/contracts/arena';
import { arenaModelCall } from './arena-model-call.js';
import {
  buildDigestPrompt,
  buildJudgePrompt,
  buildCrossExamPrompt,
  extractWinner,
  shouldNameWinner,
  type ContestantDigest,
} from './arena-analyzer-helpers.js';

// ─── Constants ────────────────────────────────────────────────────────────────

/**
 * Identities that are stamped into tasks.agent for cloud cross-exams. Any other
 * identity is inserted with agent = null.
 */
const KNOWN_EXTERNAL_AGENTS: ReadonlySet<string> = new Set([
  'claude',
  'opencode',
  'qwen',
  'goose',
]);

/** Upper bound on how long a cloud cross-exam task is polled (5 minutes). */
const CLOUD_TASK_TIMEOUT_MS = 5 * 60 * 1_000;

/** Delay between two polls of the tasks row. */
const CLOUD_TASK_POLL_MS = 2_000;

// ─── Public interface ─────────────────────────────────────────────────────────

/** Pluggable analysis seam — swap to a Han Orchestrator flow in v2. */
export interface Analyzer {
  /** Run the two-stage digest→judge analysis for a completed battle. */
  analyze(battleId: string): Promise<void>;
  /**
   * Run a cross-examination for an already-inserted cross_examinations row.
   * The result is written back to that row and a battle_updated frame is published.
   */
  crossExamine(
    battleId: string,
    crossExamId: string,
    opts: { identity: string; model: string },
  ): Promise<void>;
}

// ─── Internal DB row types ────────────────────────────────────────────────────

/** Columns selected from `battles` by loadBattle. */
interface BattleRow {
  id: string;
  project_id: string;
  battle_type: BattleType;
  prompt: string;
  status: string;
  results_path: string | null;
  winner_contestant_id: string | null;
}

/** Columns selected from `contestants` by loadContestants. */
interface ContestantRow {
  id: string;
  identity: string;
  model: string;
  lane: string;
  status: string;
  result_path: string | null;
  duration_ms: number | null;
  tokens_per_sec: number | null;
}

// ─── Factory ──────────────────────────────────────────────────────────────────

interface AnalyzerDeps {
  sql: Sql;
  broker: Broker;
  log: FastifyBaseLogger;
  // NOTE(review): DEFAULT_MODEL is the only key this module reads directly; the
  // object is also forwarded to arenaModelCall — widen this Pick with whatever
  // keys that helper requires. TODO confirm against arena-model-call.
  config: Pick<Config, 'DEFAULT_MODEL'>;
  /** Model IDs served by local llama-swap — cross-exam routing uses this. */
  localModels: ReadonlySet<string>;
}

/**
 * Build the v1 Analyzer.
 *
 * @param deps Database handle, frame broker, logger, config and the set of
 *   locally served model IDs.
 * @returns An Analyzer whose two methods catch and log every error instead of
 *   rejecting.
 */
export function createAnalyzer(deps: AnalyzerDeps): Analyzer {
  const { sql, broker, log, config, localModels } = deps;

  // ─── analyze ──────────────────────────────────────────────────────────────

  /** Public entry point: runs the analysis and swallows (logs) any failure. */
  async function analyze(battleId: string): Promise<void> {
    try {
      await runAnalysis(battleId);
    } catch (err) {
      log.error(
        { err: errMsg(err), battleId },
        'arena-analyzer: analysis failed',
      );
    }
  }

  /**
   * Digest every succeeded contestant, run the judge, write analysis.md,
   * record the winner (if any) and publish a battle_updated frame.
   * May throw; analyze() is the error boundary.
   */
  async function runAnalysis(battleId: string): Promise<void> {
    const battle = await loadBattle(battleId);
    if (!battle) {
      log.warn({ battleId }, 'arena-analyzer: battle not found');
      return;
    }

    const contestants = await loadContestants(battleId);
    const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
    log.info(
      { battleId, total: contestants.length, succeeded: succeeded.length },
      'arena-analyzer: starting analysis',
    );

    // Digest stage — concurrent, one call per succeeded contestant.
    // digestContestant resolves to null on a failed model call, so a single
    // failure does not reject the whole Promise.all.
    const digests = (
      await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
    ).filter((d): d is ContestantDigest => d !== null);

    // Failed contestants are noted in the analysis even if they produced no digest.
    const failedNotes = contestants
      .filter((c) => c.status === 'error')
      .map((c) => `- **${c.identity} / ${c.model}**: failed (no result)\n`);

    // Judge stage — single call with all digests.
    const { analysisText, winner } = await judgeContestants(battle, digests, failedNotes);

    // Write analysis.md to the battle results folder.
    const resultsPath = battle.results_path;
    if (resultsPath) {
      await mkdir(resultsPath, { recursive: true });
      await writeFile(join(resultsPath, 'analysis.md'), analysisText, 'utf8');
    }

    // Resolve the winner to a contestant id and update the battle row.
    let winnerId: string | null = null;
    if (winner && shouldNameWinner(succeeded.length)) {
      const winnerContestant = contestants.find(
        (c) => c.identity === winner.identity && c.model === winner.model,
      );
      if (winnerContestant) {
        winnerId = winnerContestant.id;
        await sql`
          UPDATE battles
          SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp()
          WHERE id = ${battleId}
        `;
        log.info(
          { battleId, winnerId, identity: winner.identity, model: winner.model },
          'arena-analyzer: winner set',
        );
      } else {
        log.warn(
          { battleId, winner },
          'arena-analyzer: judge named a winner not found in contestants',
        );
      }
    }

    publishUser({
      type: 'battle_updated',
      battle_id: battleId,
      winner_contestant_id: winnerId,
      analysis_ready: true,
    });
    log.info({ battleId }, 'arena-analyzer: analysis complete');
  }

  // ─── crossExamine ─────────────────────────────────────────────────────────

  /** Public entry point: runs the cross-exam and swallows (logs) any failure. */
  async function crossExamine(
    battleId: string,
    crossExamId: string,
    opts: { identity: string; model: string },
  ): Promise<void> {
    try {
      await runCrossExam(battleId, crossExamId, opts);
    } catch (err) {
      log.error(
        { err: errMsg(err), battleId, crossExamId },
        'arena-analyzer: cross-exam failed',
      );
    }
  }

  /**
   * Build the cross-exam prompt from the battle, fresh digests and the stored
   * analysis.md, obtain a verdict from the examiner model, persist it and
   * publish a battle_updated frame.
   * May throw; crossExamine() is the error boundary.
   */
  async function runCrossExam(
    battleId: string,
    crossExamId: string,
    opts: { identity: string; model: string },
  ): Promise<void> {
    const battle = await loadBattle(battleId);
    if (!battle) {
      log.warn({ battleId }, 'arena-analyzer: battle not found for cross-exam');
      return;
    }

    const contestants = await loadContestants(battleId);

    // Rebuild the digests for context. Note that digestContestant issues a new
    // model call per contestant; digests from the analysis run are not reused.
    const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
    const digests = (
      await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
    ).filter((d): d is ContestantDigest => d !== null);

    // Read analysis.md for the proposed analysis content (empty if unreadable).
    let analysisContent = '';
    if (battle.results_path) {
      analysisContent = await readFile(
        join(battle.results_path, 'analysis.md'),
        'utf8',
      ).catch(() => '');
    }

    // Resolve proposed winner label.
    let proposedWinner: string | null = null;
    if (battle.winner_contestant_id) {
      const w = contestants.find((c) => c.id === battle.winner_contestant_id);
      if (w) proposedWinner = `${w.identity}/${w.model}`;
    }

    const { system, user } = buildCrossExamPrompt({
      originalPrompt: battle.prompt,
      digests,
      analysisContent,
      proposedWinner,
      examinerIdentity: opts.identity,
      examinerModel: opts.model,
    });

    log.info(
      { battleId, crossExamId, identity: opts.identity, model: opts.model },
      'arena-analyzer: running cross-exam',
    );

    const verdict = await executeModelCall({
      battleId,
      projectId: battle.project_id,
      identity: opts.identity,
      model: opts.model,
      system,
      user,
    });

    // Persist verdict and append to cross-exam.md.
    await sql`
      UPDATE cross_examinations
      SET verdict = ${verdict}
      WHERE id = ${crossExamId}
    `;

    if (battle.results_path) {
      // Same as the analysis.md write: make sure the folder exists first so the
      // append cannot fail on a missing directory.
      await mkdir(battle.results_path, { recursive: true });
      const crossExamPath = join(battle.results_path, 'cross-exam.md');
      const section =
        `\n---\n\n# Cross-Examination by ${opts.identity} / ${opts.model}\n\n` +
        `${verdict}\n`;
      await writeFile(crossExamPath, section, { flag: 'a', encoding: 'utf8' });
    }

    publishUser({
      type: 'battle_updated',
      battle_id: battleId,
      cross_exam_id: crossExamId,
    });
    log.info({ battleId, crossExamId }, 'arena-analyzer: cross-exam complete');
  }

  // ─── Model call routing ───────────────────────────────────────────────────

  /**
   * Route a one-shot model call to llama-swap (local) or the task dispatcher
   * (cloud). Cloud dispatch inserts a tasks row and polls for completion.
   *
   * A model counts as local when `localModels` contains either the bare model
   * id or the id prefixed with `llama-swap/`.
   */
  async function executeModelCall(opts: {
    battleId: string;
    projectId: string;
    identity: string;
    model: string;
    system: string;
    user: string;
  }): Promise<string> {
    const isLocal =
      localModels.has(opts.model) || localModels.has(`llama-swap/${opts.model}`);

    if (isLocal) {
      return arenaModelCall({
        config,
        model: opts.model,
        system: opts.system,
        user: opts.user,
        maxTokens: 2_000,
        temperature: 0.3,
      });
    }

    // Cloud path: dispatch through the task system and poll for completion.
    return executeCloudModelCall(opts);
  }

  /**
   * Insert a tasks row for the cross-exam and poll it until it reaches a
   * terminal state.
   *
   * @returns The task's output_summary ('' when null) once state is 'completed'.
   * @throws Error when the insert returns no row, the task row can no longer be
   *   found, the task ends 'failed' / 'cancelled', or polling exceeds
   *   CLOUD_TASK_TIMEOUT_MS.
   */
  async function executeCloudModelCall(opts: {
    projectId: string;
    identity: string;
    model: string;
    system: string;
    user: string;
  }): Promise<string> {
    // The cross-exam prompt is the full input to the external agent. We embed
    // the system prompt as a preamble in the user message (external agents don't
    // take a separate system arg through the tasks dispatcher).
    const input = `${opts.system}\n\n${opts.user}`;

    // For well-known external agents, stamp the agent name so the dispatcher
    // routes via PTY/ACP. For unknown identities fall back to native inference
    // (agent = null → DEFAULT_MODEL text generation).
    const agentName = KNOWN_EXTERNAL_AGENTS.has(opts.identity) ? opts.identity : null;

    const [task] = await sql<{ id: string }[]>`
      INSERT INTO tasks (project_id, input, agent, model)
      VALUES (${opts.projectId}, ${input}, ${agentName}, ${opts.model})
      RETURNING id
    `;
    if (!task) {
      throw new Error('cross-exam task insert returned no row');
    }
    const taskId = task.id;

    log.info(
      { taskId, identity: opts.identity, model: opts.model },
      'arena-analyzer: cloud cross-exam task dispatched',
    );

    // Poll until terminal (up to 5 minutes).
    const deadline = Date.now() + CLOUD_TASK_TIMEOUT_MS;

    while (Date.now() < deadline) {
      await sleep(CLOUD_TASK_POLL_MS);
      const [row] = await sql<{ state: string; output_summary: string | null }[]>`
        SELECT state, output_summary FROM tasks WHERE id = ${taskId}
      `;
      if (!row) {
        // A missing row is not a timeout; report it as what it is.
        throw new Error(`cross-exam task ${taskId} not found while polling`);
      }
      if (row.state === 'completed') return row.output_summary ?? '';
      if (row.state === 'failed' || row.state === 'cancelled') {
        throw new Error(`cross-exam task ${row.state}: ${row.output_summary ?? ''}`);
      }
    }

    throw new Error(
      `cloud cross-exam task timed out after ${CLOUD_TASK_TIMEOUT_MS / 1000}s`,
    );
  }

  // ─── Digest helper ────────────────────────────────────────────────────────

  /**
   * Summarise one contestant's result with a DEFAULT_MODEL call.
   *
   * Reads result.md (and diff.patch for 'coding' battles) from the contestant's
   * result_path; unreadable files degrade to '' / undefined instead of failing.
   *
   * @returns The digest, or null when the contestant has no result_path or the
   *   model call fails (the failure is logged as a warning).
   */
  async function digestContestant(
    battle: BattleRow,
    c: ContestantRow,
  ): Promise<ContestantDigest | null> {
    if (!c.result_path) return null;

    const resultMd = await readFile(join(c.result_path, 'result.md'), 'utf8').catch(() => '');
    let diffPatch: string | undefined;
    if (battle.battle_type === 'coding') {
      diffPatch = await readFile(join(c.result_path, 'diff.patch'), 'utf8').catch(
        () => undefined,
      );
    }

    const benchmarkLine = formatBenchmarkLine(c);
    const { system, user } = buildDigestPrompt({
      identity: c.identity,
      model: c.model,
      resultMd,
      diffPatch,
      benchmarkLine,
    });

    let digest: string;
    try {
      digest = await arenaModelCall({
        config,
        model: config.DEFAULT_MODEL,
        system,
        user,
        maxTokens: 500,
        temperature: 0.3,
      });
    } catch (err) {
      log.warn(
        { err: errMsg(err), identity: c.identity, model: c.model },
        'arena-analyzer: digest call failed — skipping contestant',
      );
      return null;
    }

    return { identity: c.identity, model: c.model, digest, benchmarkLine };
  }

  // ─── Judge helper ─────────────────────────────────────────────────────────

  /**
   * Run the judge call over all digests and assemble the analysis.md text.
   *
   * A failed judge call is logged and replaced by a placeholder verdict, so
   * this function still returns a document. A winner is only extracted when
   * shouldNameWinner(digests.length) allows it.
   */
  async function judgeContestants(
    battle: BattleRow,
    digests: ContestantDigest[],
    failedNotes: string[],
  ): Promise<{ analysisText: string; winner: { identity: string; model: string } | null }> {
    const { system, user } = buildJudgePrompt(battle.prompt, digests);

    let judgeOutput = '';
    try {
      judgeOutput = await arenaModelCall({
        config,
        model: config.DEFAULT_MODEL,
        system,
        user,
        maxTokens: 2_000,
        temperature: 0.3,
      });
    } catch (err) {
      log.error(
        { err: errMsg(err), battleId: battle.id },
        'arena-analyzer: judge call failed',
      );
      judgeOutput = '*(Judge call failed — no comparison produced.)*';
    }

    const winner = shouldNameWinner(digests.length) ? extractWinner(judgeOutput) : null;

    const sections: string[] = [
      `# Arena Analysis`,
      `\n**Battle type:** ${battle.battle_type}`,
    ];

    if (failedNotes.length > 0) {
      sections.push('\n## Failed Contestants\n');
      sections.push(...failedNotes);
    }

    if (digests.length > 0) {
      sections.push('\n## Contestant Digests\n');
      for (const d of digests) {
        sections.push(`### ${d.identity} / ${d.model}`);
        sections.push(`*Benchmark: ${d.benchmarkLine}*\n`);
        sections.push(d.digest);
      }
    }

    sections.push("\n## Judge's Verdict\n");
    sections.push(judgeOutput);

    if (winner) {
      sections.push(`\n## Winner\n**${winner.identity} / ${winner.model}**`);
    } else {
      const reason =
        digests.length < 2
          ? 'fewer than 2 contestants produced results'
          : 'no clear winner identified';
      sections.push(`\n## Winner\n*No winner named (${reason}).*`);
    }

    return { analysisText: sections.join('\n'), winner };
  }

  // ─── DB helpers ───────────────────────────────────────────────────────────

  /** Load one battle row by id; resolves to null when no row matches. */
  async function loadBattle(battleId: string): Promise<BattleRow | null> {
    const [b] = await sql<BattleRow[]>`
      SELECT id, project_id, battle_type, prompt, status, results_path, winner_contestant_id
      FROM battles
      WHERE id = ${battleId}
    `;
    return b ?? null;
  }

  /** Load all contestants of a battle, oldest first (created_at ASC). */
  async function loadContestants(battleId: string): Promise<ContestantRow[]> {
    return sql<ContestantRow[]>`
      SELECT id, identity, model, lane, status, result_path, duration_ms, tokens_per_sec
      FROM contestants
      WHERE battle_id = ${battleId}
      ORDER BY created_at ASC
    `;
  }

  // ─── Misc helpers ─────────────────────────────────────────────────────────

  /**
   * Render the contestant's duration and throughput as e.g. `1234ms, 42.0 tok/s`;
   * returns 'no benchmark' when both values are null.
   */
  function formatBenchmarkLine(c: ContestantRow): string {
    const parts: string[] = [];
    if (c.duration_ms !== null) parts.push(`${c.duration_ms}ms`);
    if (c.tokens_per_sec !== null) parts.push(`${c.tokens_per_sec.toFixed(1)} tok/s`);
    return parts.length > 0 ? parts.join(', ') : 'no benchmark';
  }

  /**
   * Publish a frame through broker.publishUserFrame under the 'default' key.
   * The frame is cast to WsFrame without validation, so callers are responsible
   * for passing a shape the contract accepts.
   */
  function publishUser(frame: Record<string, unknown>): void {
    broker.publishUserFrame('default', frame as unknown as WsFrame);
  }

  /** Resolve after `ms` milliseconds. */
  function sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  return { analyze, crossExamine };
}

/** Extract a loggable message from an unknown thrown value. */
function errMsg(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}