feat(web,coder): arena pane — compare 2-6 AI competitors on same prompt
Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -23,8 +23,8 @@ import { registerAgentSessionRoutes } from './routes/agent-sessions.js';
|
||||
import { registerTaskRoutes } from './routes/tasks.js';
|
||||
import { registerInboxRoutes } from './routes/inbox.js';
|
||||
import { registerStatsRoutes } from './routes/stats.js';
|
||||
import { registerArenaRoutes } from './routes/arena.js';
|
||||
import { registerRunsRoutes } from './routes/runs.js';
|
||||
import { registerArenaRoutes } from './routes/arena.js';
|
||||
import { registerProviderRoutes } from './routes/providers.js';
|
||||
import { registerWorktreeSafetyRoutes } from './routes/worktree-safety.js';
|
||||
import { registerLifecycleRoutes } from './routes/lifecycle.js';
|
||||
@@ -34,10 +34,13 @@ import { createDispatcher } from './services/dispatcher.js';
|
||||
// Orchestrator (Phase 2): DB-backed flow-runner; advances on the dispatcher's
|
||||
// onTaskTerminal hook.
|
||||
import { createFlowRunner } from './services/flow-runner.js';
|
||||
// Arena: DB-backed battle-runner; also advances on the onTaskTerminal hook.
|
||||
import { createBattleRunner, type DispatchContestantFn } from './services/arena-runner.js';
|
||||
import { createAnalyzer } from './services/arena-analyzer.js';
|
||||
import { agentPool } from './services/agent-pool.js';
|
||||
import { createOrphanWorktreeReaper } from './services/orphan-worktree-reaper.js';
|
||||
import { probeAgents } from './services/agent-probe.js';
|
||||
import { getProviderSnapshot, persistProbedModels } from './services/provider-snapshot.js';
|
||||
import { getProviderSnapshot, persistProbedModels, fetchLlamaSwapModels } from './services/provider-snapshot.js';
|
||||
import { setPermissionHooks } from './services/permission-waiter.js';
|
||||
import { publishAgentStatus } from './services/agent-status-publish.js';
|
||||
import { homedir } from 'node:os';
|
||||
@@ -220,31 +223,119 @@ async function main() {
|
||||
|
||||
// Orchestrator (Phase 2): the flow-runner reacts to the dispatcher's
|
||||
// onTaskTerminal hook to advance flow_runs. Created before the dispatcher so its
|
||||
// terminal callback can be wired in. Its launch() is driven by the runs route
|
||||
// (a later phase); resume on startup is a later phase too.
|
||||
// terminal callback can be wired in.
|
||||
const flowRunner = createFlowRunner({ sql, broker, log: app.log, config });
|
||||
|
||||
// Phase 4: dispatcher — polls tasks table and runs inference. onTaskTerminal
|
||||
// notifies the flow-runner when a step's task settles (D-2).
|
||||
// Arena SEAM (a): build the local-model set from the live llama-swap model list.
|
||||
// Both bare IDs ('qwen3.6-35b') and prefixed IDs ('llama-swap/qwen3.6-35b') are
|
||||
// included so opencode-style prefixed contestants and native-style bare contestants
|
||||
// both classify correctly as local.
|
||||
const localModelsList = await fetchLlamaSwapModels(config).catch(() => []);
|
||||
const localModels = new Set([
|
||||
...localModelsList.map((m) => m.id),
|
||||
...localModelsList.map((m) => `llama-swap/${m.id}`),
|
||||
]);
|
||||
|
||||
// Arena dispatch function — Phase 4 SEAM (b).
|
||||
// Coding: insert a tasks row with agent=identity (null for native/boocode);
|
||||
// the dispatcher creates a worktree and runs the external agent (or native).
|
||||
// Q&A: pre-create a session with agent_id stamped to the persona slug so native
|
||||
// inference loads the persona's system_prompt + tools from AGENTS.md;
|
||||
// task.session_id is pre-set so runNativeInference reuses the session.
|
||||
const dispatchContestant: DispatchContestantFn = async ({
|
||||
projectId,
|
||||
prompt,
|
||||
identity,
|
||||
model,
|
||||
battleType,
|
||||
}) => {
|
||||
if (battleType === 'qa') {
|
||||
const sessionName = `Arena Q&A [${identity}]: ${prompt.slice(0, 30)}`;
|
||||
const [session] = await sql<{ id: string }[]>`
|
||||
INSERT INTO sessions (project_id, name, model, agent_id, status)
|
||||
VALUES (${projectId}, ${sessionName}, ${model}, ${identity}, 'open')
|
||||
RETURNING id
|
||||
`;
|
||||
const [task] = await sql<{ id: string }[]>`
|
||||
INSERT INTO tasks (project_id, input, model, session_id)
|
||||
VALUES (${projectId}, ${prompt}, ${model}, ${session!.id})
|
||||
RETURNING id
|
||||
`;
|
||||
return { taskId: task!.id, sessionId: session!.id };
|
||||
}
|
||||
// Coding: boocode = native inference (no external agent); any other identity
|
||||
// is an external agent name (claude, opencode, qwen, goose) that maps to
|
||||
// available_agents and gets its own per-task worktree via runExternalAgent.
|
||||
// Session is created lazily by the dispatcher, so sessionId is unknown here.
|
||||
const agentName = identity === 'boocode' ? null : identity;
|
||||
const [task] = await sql<{ id: string }[]>`
|
||||
INSERT INTO tasks (project_id, input, agent, model)
|
||||
VALUES (${projectId}, ${prompt}, ${agentName}, ${model})
|
||||
RETURNING id
|
||||
`;
|
||||
return { taskId: task!.id, sessionId: null };
|
||||
};
|
||||
|
||||
// Arena analyzer: two-stage digest→judge (v1). Pluggable seam — a v2 Han
|
||||
// Orchestrator flow can replace this without schema changes.
|
||||
const analyzer = createAnalyzer({
|
||||
sql,
|
||||
broker,
|
||||
log: app.log,
|
||||
config,
|
||||
localModels,
|
||||
});
|
||||
|
||||
// Arena battle-runner: notified on the same onTaskTerminal hook as the flow-runner.
|
||||
const battleRunner = createBattleRunner({
|
||||
sql,
|
||||
broker,
|
||||
log: app.log,
|
||||
dispatch: dispatchContestant,
|
||||
onBattleComplete: (battleId) => {
|
||||
void analyzer.analyze(battleId);
|
||||
},
|
||||
onCrossExamStart: ({ battleId, crossExamId, identity, model }) => {
|
||||
void analyzer.crossExamine(battleId, crossExamId, { identity, model });
|
||||
},
|
||||
localModels,
|
||||
});
|
||||
|
||||
// Compose onTaskTerminal: both flow-runner and battle-runner are notified.
|
||||
// Each ignores tasks it doesn't own (flow-runner checks flow_steps.task_id;
|
||||
// battle-runner checks contestants.task_id).
|
||||
const onTaskTerminal = (taskId: string, state: string): void => {
|
||||
flowRunner.handleTaskTerminal(taskId, state);
|
||||
battleRunner.handleTaskTerminal(taskId, state);
|
||||
};
|
||||
|
||||
// Phase 4: dispatcher — polls tasks table and runs inference. The composed
|
||||
// onTaskTerminal hook notifies both the flow-runner and the battle-runner when
|
||||
// any task settles.
|
||||
const dispatcher = createDispatcher({
|
||||
sql,
|
||||
inference: inferenceApi,
|
||||
broker,
|
||||
log: app.log,
|
||||
config,
|
||||
onTaskTerminal: flowRunner.handleTaskTerminal,
|
||||
onTaskTerminal,
|
||||
});
|
||||
dispatcher.start();
|
||||
|
||||
// Phase 5: re-advance any flow_runs that were 'running' when the service last
|
||||
// stopped (D-9). Runs AFTER dispatcher.start() so re-dispatched 'pending' tasks
|
||||
// are picked up by the dispatcher's startup poll.
|
||||
// Re-advance in-flight flow_runs and battles after a coder restart. Both run
|
||||
// AFTER dispatcher.start() so re-dispatched 'pending' tasks are picked up.
|
||||
void flowRunner.initResume().catch((err) => {
|
||||
app.log.error(
|
||||
{ err: err instanceof Error ? err.message : String(err) },
|
||||
'flow-runner: initResume failed',
|
||||
);
|
||||
});
|
||||
void battleRunner.initResume().catch((err) => {
|
||||
app.log.error(
|
||||
{ err: err instanceof Error ? err.message : String(err) },
|
||||
'arena: initResume failed',
|
||||
);
|
||||
});
|
||||
|
||||
// v2.6 Phase 3: configure + start the agent-pool lifecycle sweep (idle-TTL +
|
||||
// LRU-cap eviction of warm backends, plus each backend's proactive health probe)
|
||||
@@ -281,8 +372,8 @@ async function main() {
|
||||
registerTaskRoutes(app, sql, inferenceApi, dispatcher.cancelExternalTask);
|
||||
registerInboxRoutes(app, sql);
|
||||
registerStatsRoutes(app, sql);
|
||||
registerArenaRoutes(app, sql);
|
||||
registerRunsRoutes(app, sql, flowRunner, dispatcher.cancelExternalTask);
|
||||
registerArenaRoutes(app, sql, battleRunner, dispatcher.cancelExternalTask, config);
|
||||
registerProviderRoutes(app, sql, config);
|
||||
registerWorktreeSafetyRoutes(app, sql);
|
||||
registerLifecycleRoutes(app, sql);
|
||||
|
||||
@@ -1,136 +1,412 @@
|
||||
/**
|
||||
* v2.0.5: Arena routes — competitive dispatch of the same task to multiple agents.
|
||||
* Arena routes — HTTP surface for the Battle UI.
|
||||
*
|
||||
* POST /api/arena — create an arena with 2-5 contestants
|
||||
* GET /api/arena/:id — get all tasks in an arena
|
||||
* POST /api/arena/:id/select/:task_id — mark a task as the arena winner
|
||||
* POST /api/battles — launch a battle
|
||||
* GET /api/battles?project_id= — list battles for a project
|
||||
* GET /api/battles/:id — one battle + contestants + cross-exams
|
||||
* POST /api/battles/:id/stop — cancel a running battle
|
||||
* POST /api/battles/:id/analyze — trigger analysis (Phase 5 fills the logic)
|
||||
* POST /api/battles/:id/cross-examine — start a cross-examination (Phase 5 fills the logic)
|
||||
*
|
||||
* Mirrors the shape of runs.ts (Orchestrator routes). Battle creation delegates to
|
||||
* the battle-runner; cancellation calls cancelBattle then aborts in-flight tasks
|
||||
* via the dispatcher's cancelExternalTask.
|
||||
*/
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import { z } from 'zod';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type { Sql } from '../db.js';
|
||||
import type { Config } from '../config.js';
|
||||
import type { BattleRunner } from '../services/arena-runner.js';
|
||||
import type { ExternalCancelFn } from './tasks.js';
|
||||
import { arenaModelCall } from '../services/arena-model-call.js';
|
||||
|
||||
const ContestantSchema = z.object({
|
||||
agent: z.string().max(100).optional(),
|
||||
model: z.string().max(200).optional(),
|
||||
mode_id: z.string().max(200).optional(),
|
||||
thinking_option_id: z.string().max(200).optional(),
|
||||
// ─── Validation schemas ───────────────────────────────────────────────────────
|
||||
|
||||
const UuidParam = z.string().uuid();
|
||||
|
||||
const ContestantInput = z.object({
|
||||
identity: z.string().min(1).max(200),
|
||||
model: z.string().min(1).max(200),
|
||||
});
|
||||
|
||||
const CreateArenaBody = z.object({
|
||||
const CreateBattleBody = z.object({
|
||||
project_id: z.string().uuid(),
|
||||
input: z.string().min(1).max(64_000),
|
||||
contestants: z.array(ContestantSchema).min(2).max(5),
|
||||
battle_type: z.enum(['coding', 'qa']),
|
||||
prompt: z.string().min(1).max(64_000),
|
||||
contestants: z
|
||||
.array(ContestantInput)
|
||||
.min(2, 'at least 2 contestants required')
|
||||
.max(6, 'at most 6 contestants allowed'),
|
||||
});
|
||||
|
||||
interface TaskRow {
|
||||
id: string;
|
||||
agent: string | null;
|
||||
model: string | null;
|
||||
mode_id: string | null;
|
||||
thinking_option_id: string | null;
|
||||
state: string;
|
||||
}
|
||||
const ListBattlesQuery = z.object({
|
||||
project_id: z.string().uuid(),
|
||||
});
|
||||
|
||||
export function registerArenaRoutes(app: FastifyInstance, sql: Sql): void {
|
||||
// POST /api/arena — create a new arena
|
||||
app.post('/api/arena', async (req, reply) => {
|
||||
const parsed = CreateArenaBody.safeParse(req.body);
|
||||
const CrossExamineBody = z.object({
|
||||
identity: z.string().min(1).max(200),
|
||||
model: z.string().min(1).max(200),
|
||||
});
|
||||
|
||||
const SetWinnerBody = z.object({
|
||||
winner_contestant_id: z.string().uuid().nullable(),
|
||||
});
|
||||
|
||||
// ─── Route registration ───────────────────────────────────────────────────────
|
||||
|
||||
const GeneratePromptBody = z.object({
|
||||
description: z.string().min(1).max(2_000),
|
||||
});
|
||||
|
||||
export function registerArenaRoutes(
|
||||
app: FastifyInstance,
|
||||
sql: Sql,
|
||||
battleRunner: BattleRunner,
|
||||
cancelExternal: ExternalCancelFn,
|
||||
config: Config,
|
||||
): void {
|
||||
|
||||
// POST /api/battles/generate-prompt — draft a fuller battle prompt from a
|
||||
// short description using the default BooChat model. One-shot, non-streaming.
|
||||
// Must be registered BEFORE /api/battles/:id so the literal 'generate-prompt'
|
||||
// path is not mistaken for a UUID param.
|
||||
app.post('/api/battles/generate-prompt', async (req, reply) => {
|
||||
const parsed = GeneratePromptBody.safeParse(req.body);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid body', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const { project_id, input, contestants } = parsed.data;
|
||||
const arenaId = crypto.randomUUID();
|
||||
const { description } = parsed.data;
|
||||
|
||||
const tasks: TaskRow[] = [];
|
||||
for (const contestant of contestants) {
|
||||
const [task] = await sql<TaskRow[]>`
|
||||
INSERT INTO tasks (project_id, input, agent, model, mode_id, thinking_option_id, arena_id)
|
||||
VALUES (
|
||||
${project_id},
|
||||
${input},
|
||||
${contestant.agent ?? null},
|
||||
${contestant.model ?? null},
|
||||
${contestant.mode_id ?? null},
|
||||
${contestant.thinking_option_id ?? null},
|
||||
${arenaId}
|
||||
)
|
||||
RETURNING id, agent, model, mode_id, thinking_option_id, state
|
||||
`;
|
||||
tasks.push(task!);
|
||||
try {
|
||||
const prompt = await arenaModelCall({
|
||||
config,
|
||||
model: config.DEFAULT_MODEL,
|
||||
system: [
|
||||
'You are a battle-prompt writer for an AI Arena.',
|
||||
'The user gives you a short description of a coding or Q&A challenge.',
|
||||
'Expand it into a clear, self-contained prompt (2–6 sentences) that any AI model can act on.',
|
||||
'Include specific acceptance criteria where helpful.',
|
||||
'Output ONLY the prompt — no preamble, no labels, no meta-commentary.',
|
||||
].join(' '),
|
||||
user: description,
|
||||
maxTokens: 400,
|
||||
temperature: 0.6,
|
||||
});
|
||||
return { prompt };
|
||||
} catch (err) {
|
||||
app.log.warn(
|
||||
{ err: err instanceof Error ? err.message : String(err) },
|
||||
'arena generate-prompt: model call failed',
|
||||
);
|
||||
reply.code(502);
|
||||
return { error: 'model call failed' };
|
||||
}
|
||||
});
|
||||
|
||||
// POST /api/battles — launch a battle
|
||||
app.post('/api/battles', async (req, reply) => {
|
||||
const parsed = CreateBattleBody.safeParse(req.body);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid body', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const { project_id, battle_type, prompt, contestants } = parsed.data;
|
||||
|
||||
// Reject duplicate (identity, model) pairs up front — the schema UNIQUE
|
||||
// constraint would catch it too, but an early 422 is friendlier.
|
||||
const seen = new Set<string>();
|
||||
for (const c of contestants) {
|
||||
const key = `${c.identity}::${c.model}`;
|
||||
if (seen.has(key)) {
|
||||
reply.code(422);
|
||||
return {
|
||||
error: 'duplicate_contestant',
|
||||
message: `duplicate contestant: identity="${c.identity}" model="${c.model}"`,
|
||||
};
|
||||
}
|
||||
seen.add(key);
|
||||
}
|
||||
|
||||
// Verify project exists
|
||||
const [proj] = await sql<{ id: string }[]>`SELECT id FROM projects WHERE id = ${project_id}`;
|
||||
if (!proj) {
|
||||
reply.code(404);
|
||||
return { error: 'project not found' };
|
||||
}
|
||||
|
||||
const { battleId } = await battleRunner.startBattle({
|
||||
projectId: project_id,
|
||||
battleType: battle_type,
|
||||
prompt,
|
||||
contestants,
|
||||
});
|
||||
|
||||
reply.code(201);
|
||||
return {
|
||||
arena_id: arenaId,
|
||||
tasks: tasks.map((t) => ({
|
||||
id: t.id,
|
||||
agent: t.agent,
|
||||
model: t.model,
|
||||
mode_id: t.mode_id,
|
||||
thinking_option_id: t.thinking_option_id,
|
||||
state: t.state,
|
||||
})),
|
||||
};
|
||||
return { battle_id: battleId };
|
||||
});
|
||||
|
||||
// GET /api/arena/:arena_id — list all tasks in an arena
|
||||
app.get<{ Params: { arena_id: string } }>('/api/arena/:arena_id', async (req, reply) => {
|
||||
const { arena_id } = req.params;
|
||||
|
||||
// Validate UUID format
|
||||
const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
||||
if (!uuidRegex.test(arena_id)) {
|
||||
// GET /api/battles?project_id= — list battles, most-recent-first
|
||||
app.get('/api/battles', async (req, reply) => {
|
||||
const parsed = ListBattlesQuery.safeParse(req.query);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid arena_id format' };
|
||||
return { error: 'invalid query', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const tasks = await sql`
|
||||
SELECT id, project_id, state, input, output_summary, agent, model, mode_id, thinking_option_id, execution_path, session_id, started_at, ended_at, created_at, arena_id
|
||||
FROM tasks
|
||||
WHERE arena_id = ${arena_id}
|
||||
ORDER BY created_at
|
||||
const battles = await sql`
|
||||
SELECT id, project_id, battle_type, prompt, status,
|
||||
winner_contestant_id, results_path, error,
|
||||
created_at, updated_at
|
||||
FROM battles
|
||||
WHERE project_id = ${parsed.data.project_id}
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 100
|
||||
`;
|
||||
|
||||
if (tasks.length === 0) {
|
||||
reply.code(404);
|
||||
return { error: 'arena not found' };
|
||||
}
|
||||
|
||||
return { arena_id, tasks };
|
||||
return { battles };
|
||||
});
|
||||
|
||||
// POST /api/arena/:arena_id/select/:task_id — mark the winner
|
||||
app.post<{ Params: { arena_id: string; task_id: string } }>(
|
||||
'/api/arena/:arena_id/select/:task_id',
|
||||
async (req, reply) => {
|
||||
const { arena_id, task_id } = req.params;
|
||||
|
||||
// Verify the task belongs to this arena
|
||||
const rows = await sql<{ id: string; state: string; arena_id: string | null }[]>`
|
||||
SELECT id, state, arena_id FROM tasks WHERE id = ${task_id}
|
||||
`;
|
||||
|
||||
if (rows.length === 0) {
|
||||
reply.code(404);
|
||||
return { error: 'task not found' };
|
||||
}
|
||||
|
||||
const task = rows[0]!;
|
||||
if (task.arena_id !== arena_id) {
|
||||
reply.code(409);
|
||||
return { error: 'task does not belong to this arena' };
|
||||
}
|
||||
|
||||
// Mark as selected via output_summary prefix (lightweight — no schema change)
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET output_summary = COALESCE('[SELECTED] ' || output_summary, '[SELECTED]')
|
||||
WHERE id = ${task_id}
|
||||
`;
|
||||
|
||||
return { selected: true, task_id, arena_id };
|
||||
// GET /api/battles/:id — one battle + its contestants + cross-examinations
|
||||
app.get<{ Params: { id: string } }>('/api/battles/:id', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
if (!parsedId.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
);
|
||||
const id = parsedId.data;
|
||||
|
||||
const [battle] = await sql<{
|
||||
id: string;
|
||||
project_id: string;
|
||||
battle_type: string;
|
||||
prompt: string;
|
||||
status: string;
|
||||
winner_contestant_id: string | null;
|
||||
results_path: string | null;
|
||||
error: string | null;
|
||||
created_at: unknown;
|
||||
updated_at: unknown;
|
||||
}[]>`
|
||||
SELECT id, project_id, battle_type, prompt, status,
|
||||
winner_contestant_id, results_path, error,
|
||||
created_at, updated_at
|
||||
FROM battles WHERE id = ${id}
|
||||
`;
|
||||
|
||||
if (!battle) {
|
||||
reply.code(404);
|
||||
return { error: 'battle not found' };
|
||||
}
|
||||
|
||||
const contestants = await sql`
|
||||
SELECT id, battle_id, identity, model, lane, task_id, worktree_id,
|
||||
status, duration_ms, tokens_per_sec, cost_tokens, result_path, error,
|
||||
created_at, updated_at
|
||||
FROM contestants
|
||||
WHERE battle_id = ${id}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
|
||||
const crossExaminations = await sql`
|
||||
SELECT id, battle_id, identity, model, verdict, created_at
|
||||
FROM cross_examinations
|
||||
WHERE battle_id = ${id}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
|
||||
return { battle, contestants, cross_examinations: crossExaminations };
|
||||
});
|
||||
|
||||
// POST /api/battles/:id/stop — cancel a running battle
|
||||
app.post<{ Params: { id: string } }>('/api/battles/:id/stop', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
if (!parsedId.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
const id = parsedId.data;
|
||||
|
||||
const [row] = await sql<{ id: string; status: string }[]>`
|
||||
SELECT id, status FROM battles WHERE id = ${id}
|
||||
`;
|
||||
if (!row) {
|
||||
reply.code(404);
|
||||
return { error: 'battle not found' };
|
||||
}
|
||||
if (row.status !== 'running') {
|
||||
reply.code(409);
|
||||
return { error: `cannot stop battle in status '${row.status}'` };
|
||||
}
|
||||
|
||||
const { cancelled, taskIds } = await battleRunner.cancelBattle(id);
|
||||
if (!cancelled) {
|
||||
reply.code(409);
|
||||
return { error: 'battle is no longer running' };
|
||||
}
|
||||
|
||||
// Abort any in-flight dispatcher tasks (cloud contestants running externally).
|
||||
for (const taskId of taskIds) {
|
||||
cancelExternal(taskId);
|
||||
}
|
||||
|
||||
return { cancelled: true };
|
||||
});
|
||||
|
||||
// GET /api/battles/:id/analysis — read analysis.md from the battle's results_path
|
||||
app.get<{ Params: { id: string } }>('/api/battles/:id/analysis', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
if (!parsedId.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
const id = parsedId.data;
|
||||
|
||||
const [row] = await sql<{ results_path: string | null }[]>`
|
||||
SELECT results_path FROM battles WHERE id = ${id}
|
||||
`;
|
||||
if (!row) {
|
||||
reply.code(404);
|
||||
return { error: 'battle not found' };
|
||||
}
|
||||
if (!row.results_path) {
|
||||
reply.code(404);
|
||||
return { error: 'analysis not ready' };
|
||||
}
|
||||
|
||||
try {
|
||||
const text = await readFile(join(row.results_path, 'analysis.md'), 'utf8');
|
||||
return { text };
|
||||
} catch {
|
||||
reply.code(404);
|
||||
return { error: 'analysis not ready' };
|
||||
}
|
||||
});
|
||||
|
||||
// POST /api/battles/:id/analyze — trigger or re-trigger analysis
|
||||
app.post<{ Params: { id: string } }>('/api/battles/:id/analyze', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
if (!parsedId.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
const id = parsedId.data;
|
||||
|
||||
const [row] = await sql<{ id: string; status: string }[]>`
|
||||
SELECT id, status FROM battles WHERE id = ${id}
|
||||
`;
|
||||
if (!row) {
|
||||
reply.code(404);
|
||||
return { error: 'battle not found' };
|
||||
}
|
||||
if (row.status === 'running') {
|
||||
reply.code(409);
|
||||
return { error: 'battle is still running — wait for all contestants to finish' };
|
||||
}
|
||||
|
||||
const result = await battleRunner.triggerAnalysis(id);
|
||||
if (!result.triggered) {
|
||||
reply.code(404);
|
||||
return { error: 'battle not found' };
|
||||
}
|
||||
|
||||
reply.code(202);
|
||||
return { triggered: true };
|
||||
});
|
||||
|
||||
// PATCH /api/battles/:id/winner — manually set or clear the winner.
|
||||
// Validates the contestant belongs to the battle; publishes battle_updated so
|
||||
// the pane badge reflects the override immediately. Human is authoritative.
|
||||
app.patch<{ Params: { id: string } }>('/api/battles/:id/winner', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
if (!parsedId.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
|
||||
const parsed = SetWinnerBody.safeParse(req.body);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid body', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const result = await battleRunner.setWinner(parsedId.data, parsed.data.winner_contestant_id);
|
||||
if (!result.ok) {
|
||||
if (result.notFound) { reply.code(404); return { error: 'battle not found' }; }
|
||||
if (result.invalidContestant) { reply.code(422); return { error: 'contestant not found in this battle' }; }
|
||||
reply.code(500); return { error: 'unknown error' };
|
||||
}
|
||||
return { ok: true };
|
||||
});
|
||||
|
||||
// GET /api/battles/:id/contestants/:cid/diff — read the diff.patch for a coding contestant.
|
||||
app.get<{ Params: { id: string; cid: string } }>('/api/battles/:id/contestants/:cid/diff', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
const parsedCid = UuidParam.safeParse(req.params.cid);
|
||||
if (!parsedId.success || !parsedCid.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
|
||||
const [contestant] = await sql<{ result_path: string | null }[]>`
|
||||
SELECT result_path FROM contestants
|
||||
WHERE id = ${parsedCid.data} AND battle_id = ${parsedId.data}
|
||||
`;
|
||||
if (!contestant) {
|
||||
reply.code(404);
|
||||
return { error: 'contestant not found' };
|
||||
}
|
||||
if (!contestant.result_path) {
|
||||
reply.code(404);
|
||||
return { error: 'diff not available' };
|
||||
}
|
||||
|
||||
try {
|
||||
const text = await readFile(join(contestant.result_path, 'diff.patch'), 'utf8');
|
||||
return { diff: text };
|
||||
} catch {
|
||||
reply.code(404);
|
||||
return { error: 'diff not available' };
|
||||
}
|
||||
});
|
||||
|
||||
// POST /api/battles/:id/cross-examine — start a cross-examination
|
||||
app.post<{ Params: { id: string } }>('/api/battles/:id/cross-examine', async (req, reply) => {
|
||||
const parsedId = UuidParam.safeParse(req.params.id);
|
||||
if (!parsedId.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid id' };
|
||||
}
|
||||
const id = parsedId.data;
|
||||
|
||||
const parsed = CrossExamineBody.safeParse(req.body);
|
||||
if (!parsed.success) {
|
||||
reply.code(400);
|
||||
return { error: 'invalid body', details: parsed.error.flatten() };
|
||||
}
|
||||
|
||||
const [row] = await sql<{ id: string; status: string }[]>`
|
||||
SELECT id, status FROM battles WHERE id = ${id}
|
||||
`;
|
||||
if (!row) {
|
||||
reply.code(404);
|
||||
return { error: 'battle not found' };
|
||||
}
|
||||
if (row.status === 'running') {
|
||||
reply.code(409);
|
||||
return { error: 'battle is still running — cross-examine after all contestants finish' };
|
||||
}
|
||||
|
||||
const { crossExamId } = await battleRunner.startCrossExam(id, {
|
||||
identity: parsed.data.identity,
|
||||
model: parsed.data.model,
|
||||
});
|
||||
|
||||
reply.code(202);
|
||||
return { cross_exam_id: crossExamId };
|
||||
});
|
||||
}
|
||||
|
||||
@@ -54,9 +54,6 @@ DO $$ BEGIN
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
-- v2.0.5: arena support — group tasks into competitive arenas.
|
||||
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS arena_id UUID;
|
||||
|
||||
-- Human inbox: tasks needing attention
|
||||
CREATE OR REPLACE VIEW human_inbox AS
|
||||
SELECT * FROM tasks WHERE state IN ('blocked', 'failed');
|
||||
@@ -81,6 +78,7 @@ ALTER TABLE tasks ADD COLUMN IF NOT EXISTS thinking_option_id TEXT;
|
||||
DROP VIEW IF EXISTS human_inbox;
|
||||
ALTER TABLE tasks DROP COLUMN IF EXISTS feature_values;
|
||||
ALTER TABLE tasks DROP COLUMN IF EXISTS worktree_path;
|
||||
ALTER TABLE tasks DROP COLUMN IF EXISTS arena_id;
|
||||
CREATE OR REPLACE VIEW human_inbox AS
|
||||
SELECT * FROM tasks WHERE state IN ('blocked', 'failed');
|
||||
|
||||
@@ -157,7 +155,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS worktrees_active_path_uidx ON worktrees(path)
|
||||
DROP TABLE IF EXISTS session_worktrees;
|
||||
|
||||
-- Dispatch hint: which chat (tab) a task belongs to. The coder message route and
|
||||
-- skills route set it from the frontend tab; session-less creators (arena, MCP,
|
||||
-- skills route set it from the frontend tab; session-less creators (MCP,
|
||||
-- new_task, generic /api/tasks) leave it NULL and the dispatcher creates a chat.
|
||||
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS chat_id UUID REFERENCES chats(id) ON DELETE SET NULL;
|
||||
|
||||
@@ -271,7 +269,7 @@ ALTER TABLE agent_sessions ADD CONSTRAINT agent_sessions_backend_chk
|
||||
CHECK (backend IN ('opencode_server', 'acp_warm', 'claude_sdk'));
|
||||
|
||||
-- LISTEN/NOTIFY fast path: every tasks INSERT (from any call site — routes,
|
||||
-- new_task tool, arena, MCP server) fires pg_notify('tasks_new') in the same
|
||||
-- new_task tool, MCP server) fires pg_notify('tasks_new') in the same
|
||||
-- transaction, so the dispatcher reacts immediately instead of waiting for the
|
||||
-- fallback poll. Postgres holds the notification until COMMIT, so the listener
|
||||
-- always sees the committed row. A trigger covers all insert paths with no
|
||||
@@ -357,3 +355,71 @@ DO $$ BEGIN
|
||||
CHECK (status IN ('pending', 'running', 'completed', 'failed', 'skipped', 'cancelled'));
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
-- Arena: battles + contestants + cross_examinations.
|
||||
-- project_id carries no FK (matches tasks.project_id + flow_runs.project_id convention).
|
||||
-- winner_contestant_id FK is deferred (forward reference): added via guarded ALTER below.
|
||||
CREATE TABLE IF NOT EXISTS battles (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
project_id UUID NOT NULL,
|
||||
battle_type TEXT NOT NULL,
|
||||
prompt TEXT NOT NULL,
|
||||
status TEXT NOT NULL DEFAULT 'pending',
|
||||
winner_contestant_id UUID,
|
||||
results_path TEXT,
|
||||
error TEXT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
|
||||
CONSTRAINT battles_type_chk CHECK (battle_type IN ('coding', 'qa')),
|
||||
CONSTRAINT battles_status_chk CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS contestants (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
battle_id UUID NOT NULL REFERENCES battles(id) ON DELETE CASCADE,
|
||||
identity TEXT NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
lane TEXT NOT NULL,
|
||||
task_id UUID REFERENCES tasks(id) ON DELETE SET NULL,
|
||||
worktree_id UUID REFERENCES worktrees(id) ON DELETE SET NULL,
|
||||
status TEXT NOT NULL DEFAULT 'queued',
|
||||
duration_ms INTEGER,
|
||||
tokens_per_sec DOUBLE PRECISION,
|
||||
cost_tokens INTEGER,
|
||||
result_path TEXT,
|
||||
error TEXT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp(),
|
||||
CONSTRAINT contestants_lane_chk CHECK (lane IN ('local', 'cloud')),
|
||||
CONSTRAINT contestants_status_chk CHECK (status IN ('queued', 'running', 'done', 'error')),
|
||||
UNIQUE (battle_id, identity, model)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS cross_examinations (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
battle_id UUID NOT NULL REFERENCES battles(id) ON DELETE CASCADE,
|
||||
identity TEXT NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
verdict TEXT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT clock_timestamp()
|
||||
);
|
||||
|
||||
-- Add the winner FK now that contestants exists.
|
||||
DO $$ BEGIN
|
||||
IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'battles_winner_contestant_id_fkey') THEN
|
||||
ALTER TABLE battles ADD CONSTRAINT battles_winner_contestant_id_fkey
|
||||
FOREIGN KEY (winner_contestant_id) REFERENCES contestants(id) ON DELETE SET NULL;
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
-- battles query (GET /api/battles?project_id=).
|
||||
CREATE INDEX IF NOT EXISTS battles_project_created_idx ON battles(project_id, created_at DESC);
|
||||
|
||||
-- Lane-scheduler advance scans (contestants WHERE battle_id = ? AND status = ?).
|
||||
CREATE INDEX IF NOT EXISTS contestants_battle_status_idx ON contestants(battle_id, status);
|
||||
|
||||
-- onTaskTerminal callback: look up the contestant owning a completed task.
|
||||
CREATE INDEX IF NOT EXISTS contestants_task_id_idx ON contestants(task_id);
|
||||
|
||||
-- Cross-examination listing per battle.
|
||||
CREATE INDEX IF NOT EXISTS cross_examinations_battle_idx ON cross_examinations(battle_id);
|
||||
|
||||
254
apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts
Normal file
254
apps/coder/src/services/__tests__/arena-analyzer-helpers.test.ts
Normal file
@@ -0,0 +1,254 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
buildDigestPrompt,
|
||||
buildJudgePrompt,
|
||||
buildCrossExamPrompt,
|
||||
extractWinner,
|
||||
shouldNameWinner,
|
||||
type ContestantDigest,
|
||||
type ContestantDigestInput,
|
||||
} from '../arena-analyzer-helpers.js';
|
||||
|
||||
// ─── shouldNameWinner ─────────────────────────────────────────────────────────
|
||||
|
||||
describe('shouldNameWinner', () => {
|
||||
it('returns false with 0 succeeded contestants', () => {
|
||||
expect(shouldNameWinner(0)).toBe(false);
|
||||
});
|
||||
|
||||
it('returns false with exactly 1 succeeded contestant', () => {
|
||||
expect(shouldNameWinner(1)).toBe(false);
|
||||
});
|
||||
|
||||
it('returns true with exactly 2 succeeded contestants', () => {
|
||||
expect(shouldNameWinner(2)).toBe(true);
|
||||
});
|
||||
|
||||
it('returns true with more than 2 succeeded contestants', () => {
|
||||
expect(shouldNameWinner(3)).toBe(true);
|
||||
expect(shouldNameWinner(6)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── extractWinner ────────────────────────────────────────────────────────────
|
||||
|
||||
describe('extractWinner', () => {
|
||||
it('extracts identity and model from a WINNER: line', () => {
|
||||
const output = 'Some analysis\n\nWINNER: claude/opus-4-5\n\nMore text.';
|
||||
expect(extractWinner(output)).toEqual({ identity: 'claude', model: 'opus-4-5' });
|
||||
});
|
||||
|
||||
it('is case-insensitive for the WINNER keyword', () => {
|
||||
expect(extractWinner('winner: boocode/qwen3.6-35b')).toEqual({
|
||||
identity: 'boocode',
|
||||
model: 'qwen3.6-35b',
|
||||
});
|
||||
expect(extractWinner('Winner: opencode/some-model')).toEqual({
|
||||
identity: 'opencode',
|
||||
model: 'some-model',
|
||||
});
|
||||
});
|
||||
|
||||
it('returns null when NO_WINNER is declared', () => {
|
||||
expect(extractWinner('WINNER: NO_WINNER')).toBeNull();
|
||||
expect(extractWinner('winner: no_winner')).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when no WINNER line is present', () => {
|
||||
expect(extractWinner('Just some analysis text with no verdict.')).toBeNull();
|
||||
expect(extractWinner('')).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when the WINNER line has no slash separator', () => {
|
||||
expect(extractWinner('WINNER: justidentity')).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when the WINNER line is empty after the colon', () => {
|
||||
expect(extractWinner('WINNER:')).toBeNull();
|
||||
expect(extractWinner('WINNER: ')).toBeNull();
|
||||
});
|
||||
|
||||
it('handles leading and trailing whitespace around the slash parts', () => {
|
||||
const result = extractWinner('WINNER: claude / opus-4-5 ');
|
||||
expect(result).toEqual({ identity: 'claude', model: 'opus-4-5' });
|
||||
});
|
||||
|
||||
it('picks the first WINNER line when multiple are present', () => {
|
||||
const output = 'WINNER: claude/opus-4-5\nWINNER: opencode/other-model';
|
||||
expect(extractWinner(output)).toEqual({ identity: 'claude', model: 'opus-4-5' });
|
||||
});
|
||||
|
||||
it('handles model names that contain slashes by splitting at the first slash only', () => {
|
||||
// edge case: model name with a slash — should still split at first slash
|
||||
// identity = 'native', model = 'llama-swap/qwen3.6'
|
||||
const result = extractWinner('WINNER: native/llama-swap/qwen3.6');
|
||||
expect(result).toEqual({ identity: 'native', model: 'llama-swap/qwen3.6' });
|
||||
});
|
||||
});
|
||||
|
||||
// ─── buildDigestPrompt ────────────────────────────────────────────────────────
|
||||
|
||||
describe('buildDigestPrompt', () => {
|
||||
const base: ContestantDigestInput = {
|
||||
identity: 'claude',
|
||||
model: 'opus-4-5',
|
||||
resultMd: '# Output\n\nSome result content.',
|
||||
benchmarkLine: '12000ms',
|
||||
};
|
||||
|
||||
it('returns an object with non-empty system and user strings', () => {
|
||||
const { system, user } = buildDigestPrompt(base);
|
||||
expect(system.length).toBeGreaterThan(0);
|
||||
expect(user.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('includes the contestant identity and model in the user prompt', () => {
|
||||
const { user } = buildDigestPrompt(base);
|
||||
expect(user).toContain('claude');
|
||||
expect(user).toContain('opus-4-5');
|
||||
});
|
||||
|
||||
it('includes the benchmark line in the user prompt', () => {
|
||||
const { user } = buildDigestPrompt(base);
|
||||
expect(user).toContain('12000ms');
|
||||
});
|
||||
|
||||
it('includes the result.md content in the user prompt', () => {
|
||||
const { user } = buildDigestPrompt(base);
|
||||
expect(user).toContain('Some result content.');
|
||||
});
|
||||
|
||||
it('includes the diff.patch when provided', () => {
|
||||
const input: ContestantDigestInput = { ...base, diffPatch: '--- a/foo.ts\n+++ b/foo.ts\n+added' };
|
||||
const { user } = buildDigestPrompt(input);
|
||||
expect(user).toContain('added');
|
||||
expect(user).toContain('```diff');
|
||||
});
|
||||
|
||||
it('omits the diff section when diffPatch is undefined', () => {
|
||||
const { user } = buildDigestPrompt(base);
|
||||
expect(user).not.toContain('```diff');
|
||||
});
|
||||
|
||||
it('truncates resultMd longer than 8000 characters', () => {
|
||||
const longResult = 'x'.repeat(10_000);
|
||||
const { user } = buildDigestPrompt({ ...base, resultMd: longResult });
|
||||
// The truncated content must not exceed 8000 chars in the sliced section.
|
||||
// We just check the total user string doesn't balloon unreasonably.
|
||||
expect(user.length).toBeLessThan(15_000);
|
||||
});
|
||||
|
||||
it('truncates diffPatch longer than 5000 characters', () => {
|
||||
const longDiff = '+' + 'x'.repeat(10_000);
|
||||
const { user } = buildDigestPrompt({ ...base, diffPatch: longDiff });
|
||||
expect(user.length).toBeLessThan(16_000);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── buildJudgePrompt ─────────────────────────────────────────────────────────
|
||||
|
||||
describe('buildJudgePrompt', () => {
|
||||
const digests: ContestantDigest[] = [
|
||||
{ identity: 'claude', model: 'opus-4-5', digest: 'Good result.', benchmarkLine: '5000ms' },
|
||||
{ identity: 'opencode', model: 'qwen3.6', digest: 'Decent result.', benchmarkLine: '8000ms' },
|
||||
];
|
||||
|
||||
it('includes the original prompt in the user section', () => {
|
||||
const { user } = buildJudgePrompt('Write a sorting algorithm', digests);
|
||||
expect(user).toContain('Write a sorting algorithm');
|
||||
});
|
||||
|
||||
it('includes each contestant heading in the user section', () => {
|
||||
const { user } = buildJudgePrompt('prompt', digests);
|
||||
expect(user).toContain('claude');
|
||||
expect(user).toContain('opus-4-5');
|
||||
expect(user).toContain('opencode');
|
||||
expect(user).toContain('qwen3.6');
|
||||
});
|
||||
|
||||
it('includes each contestant digest text', () => {
|
||||
const { user } = buildJudgePrompt('prompt', digests);
|
||||
expect(user).toContain('Good result.');
|
||||
expect(user).toContain('Decent result.');
|
||||
});
|
||||
|
||||
it('instructs the model to name a WINNER when 2+ digests are provided', () => {
|
||||
const { system } = buildJudgePrompt('prompt', digests);
|
||||
expect(system).toContain('WINNER:');
|
||||
});
|
||||
|
||||
it('instructs the model NOT to name a winner when fewer than 2 digests are provided', () => {
|
||||
const oneDigest = digests.slice(0, 1);
|
||||
const { system } = buildJudgePrompt('prompt', oneDigest);
|
||||
expect(system).toContain('NO_WINNER');
|
||||
expect(system).not.toContain('WINNER: <identity>');
|
||||
});
|
||||
|
||||
it('instructs NO_WINNER when digests list is empty', () => {
|
||||
const { system } = buildJudgePrompt('prompt', []);
|
||||
expect(system).toContain('NO_WINNER');
|
||||
});
|
||||
|
||||
it('truncates originalPrompt longer than 2000 characters', () => {
|
||||
const longPrompt = 'p'.repeat(5_000);
|
||||
const { user } = buildJudgePrompt(longPrompt, digests);
|
||||
// Should not contain more than 2000 chars of the prompt.
|
||||
const promptSection = user.split('# Contestant Digests')[0] ?? '';
|
||||
expect(promptSection.length).toBeLessThan(3_000);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── buildCrossExamPrompt ─────────────────────────────────────────────────────
|
||||
|
||||
describe('buildCrossExamPrompt', () => {
|
||||
const digests: ContestantDigest[] = [
|
||||
{ identity: 'claude', model: 'opus-4-5', digest: 'Strong result.', benchmarkLine: '5000ms' },
|
||||
{ identity: 'boocode', model: 'qwen3.6-35b', digest: 'Decent result.', benchmarkLine: '12000ms' },
|
||||
];
|
||||
|
||||
const baseOpts = {
|
||||
originalPrompt: 'Write a sorting algorithm.',
|
||||
digests,
|
||||
analysisContent: '# Arena Analysis\n\nClaude did better.\n\nWINNER: claude/opus-4-5',
|
||||
proposedWinner: 'claude/opus-4-5',
|
||||
examinerIdentity: 'goose',
|
||||
examinerModel: 'gpt-4o',
|
||||
};
|
||||
|
||||
it('includes the examiner identity and model in the system prompt', () => {
|
||||
const { system } = buildCrossExamPrompt(baseOpts);
|
||||
expect(system).toContain('goose');
|
||||
expect(system).toContain('gpt-4o');
|
||||
});
|
||||
|
||||
it('includes the original prompt in the user section', () => {
|
||||
const { user } = buildCrossExamPrompt(baseOpts);
|
||||
expect(user).toContain('Write a sorting algorithm.');
|
||||
});
|
||||
|
||||
it('includes each contestant digest', () => {
|
||||
const { user } = buildCrossExamPrompt(baseOpts);
|
||||
expect(user).toContain('Strong result.');
|
||||
expect(user).toContain('Decent result.');
|
||||
});
|
||||
|
||||
it('includes the proposed analysis content', () => {
|
||||
const { user } = buildCrossExamPrompt(baseOpts);
|
||||
expect(user).toContain('Claude did better.');
|
||||
});
|
||||
|
||||
it('includes the proposed winner when set', () => {
|
||||
const { user } = buildCrossExamPrompt(baseOpts);
|
||||
expect(user).toContain('claude/opus-4-5');
|
||||
});
|
||||
|
||||
it('notes that no winner was proposed when proposedWinner is null', () => {
|
||||
const { user } = buildCrossExamPrompt({ ...baseOpts, proposedWinner: null });
|
||||
expect(user).toContain('No winner was proposed');
|
||||
});
|
||||
|
||||
it('instructs the examiner to provide a VERDICT line', () => {
|
||||
const { system } = buildCrossExamPrompt(baseOpts);
|
||||
expect(system).toContain('VERDICT:');
|
||||
});
|
||||
});
|
||||
332
apps/coder/src/services/__tests__/arena-decisions.test.ts
Normal file
332
apps/coder/src/services/__tests__/arena-decisions.test.ts
Normal file
@@ -0,0 +1,332 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
classifyLane,
|
||||
nextLocalContestant,
|
||||
isBattleComplete,
|
||||
computeBenchmark,
|
||||
sanitizeSlug,
|
||||
buildBattleSlug,
|
||||
buildContestantDir,
|
||||
reconcileContestantResume,
|
||||
reconcileContestants,
|
||||
type ContestantSlot,
|
||||
} from '../arena-decisions.js';
|
||||
|
||||
// Local models = what the llama-swap server actually serves.
|
||||
const LOCAL_MODELS: ReadonlySet<string> = new Set([
|
||||
'qwen3.6-35b-a3b-mxfp4',
|
||||
'qwen2.5-coder-7b',
|
||||
]);
|
||||
|
||||
// ─── classifyLane ────────────────────────────────────────────────────────────
|
||||
|
||||
describe('classifyLane', () => {
|
||||
it('classifies qa battles as local regardless of identity or model', () => {
|
||||
expect(classifyLane('qa', 'boocode', 'qwen3.6-35b-a3b-mxfp4', LOCAL_MODELS)).toBe('local');
|
||||
expect(classifyLane('qa', 'claude', 'claude-opus-4-5', LOCAL_MODELS)).toBe('local');
|
||||
expect(classifyLane('qa', 'Debugger', 'cloud-model', new Set())).toBe('local');
|
||||
expect(classifyLane('qa', 'opencode', 'any-model', LOCAL_MODELS)).toBe('local');
|
||||
});
|
||||
|
||||
it('classifies coding contestants as local when model is in localModels', () => {
|
||||
expect(classifyLane('coding', 'boocode', 'qwen3.6-35b-a3b-mxfp4', LOCAL_MODELS)).toBe('local');
|
||||
expect(classifyLane('coding', 'opencode', 'qwen3.6-35b-a3b-mxfp4', LOCAL_MODELS)).toBe('local');
|
||||
expect(classifyLane('coding', 'qwen', 'qwen2.5-coder-7b', LOCAL_MODELS)).toBe('local');
|
||||
});
|
||||
|
||||
it('classifies coding contestants as cloud when model is not in localModels', () => {
|
||||
expect(classifyLane('coding', 'claude', 'claude-opus-4-5', LOCAL_MODELS)).toBe('cloud');
|
||||
expect(classifyLane('coding', 'opencode', 'claude-opus-4-5', LOCAL_MODELS)).toBe('cloud');
|
||||
expect(classifyLane('coding', 'goose', 'gpt-4o', LOCAL_MODELS)).toBe('cloud');
|
||||
expect(classifyLane('coding', 'qwen', 'unknown-remote-model', LOCAL_MODELS)).toBe('cloud');
|
||||
});
|
||||
|
||||
it('uses the injected localModels set, not a hardcoded list', () => {
|
||||
const custom = new Set(['my-local-model']);
|
||||
expect(classifyLane('coding', 'any-agent', 'my-local-model', custom)).toBe('local');
|
||||
expect(classifyLane('coding', 'boocode', 'other-model', custom)).toBe('cloud');
|
||||
});
|
||||
|
||||
it('defaults to cloud for an empty localModels set', () => {
|
||||
expect(classifyLane('coding', 'boocode', 'qwen3.6-35b-a3b-mxfp4', new Set())).toBe('cloud');
|
||||
expect(classifyLane('coding', 'native', 'any-local-model', new Set())).toBe('cloud');
|
||||
});
|
||||
});
|
||||
|
||||
// ─── nextLocalContestant ─────────────────────────────────────────────────────
|
||||
|
||||
describe('nextLocalContestant', () => {
|
||||
it('returns null for an empty list', () => {
|
||||
expect(nextLocalContestant([])).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when no local contestants are queued', () => {
|
||||
const slots: ContestantSlot[] = [
|
||||
{ id: 'c1', lane: 'local', status: 'running' },
|
||||
{ id: 'c2', lane: 'cloud', status: 'queued' },
|
||||
];
|
||||
expect(nextLocalContestant(slots)).toBeNull();
|
||||
});
|
||||
|
||||
it('returns the first queued local contestant in order', () => {
|
||||
const slots: ContestantSlot[] = [
|
||||
{ id: 'c1', lane: 'local', status: 'done' },
|
||||
{ id: 'c2', lane: 'local', status: 'queued' },
|
||||
{ id: 'c3', lane: 'local', status: 'queued' },
|
||||
];
|
||||
expect(nextLocalContestant(slots)).toBe('c2');
|
||||
});
|
||||
|
||||
it('skips done/error local contestants and cloud contestants', () => {
|
||||
const slots: ContestantSlot[] = [
|
||||
{ id: 'c1', lane: 'cloud', status: 'queued' },
|
||||
{ id: 'c2', lane: 'local', status: 'error' },
|
||||
{ id: 'c3', lane: 'local', status: 'queued' },
|
||||
];
|
||||
expect(nextLocalContestant(slots)).toBe('c3');
|
||||
});
|
||||
|
||||
it('returns null when all local contestants are done or error', () => {
|
||||
const slots: ContestantSlot[] = [
|
||||
{ id: 'c1', lane: 'local', status: 'done' },
|
||||
{ id: 'c2', lane: 'local', status: 'error' },
|
||||
];
|
||||
expect(nextLocalContestant(slots)).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
// ─── isBattleComplete ────────────────────────────────────────────────────────
|
||||
|
||||
describe('isBattleComplete', () => {
|
||||
it('returns false for an empty list', () => {
|
||||
expect(isBattleComplete([])).toBe(false);
|
||||
});
|
||||
|
||||
it('returns true when all contestants are done', () => {
|
||||
expect(isBattleComplete([{ status: 'done' }, { status: 'done' }])).toBe(true);
|
||||
});
|
||||
|
||||
it('returns true when all contestants are error', () => {
|
||||
expect(isBattleComplete([{ status: 'error' }, { status: 'error' }])).toBe(true);
|
||||
});
|
||||
|
||||
it('returns true for a mixed done/error result', () => {
|
||||
expect(isBattleComplete([{ status: 'done' }, { status: 'error' }, { status: 'done' }])).toBe(true);
|
||||
});
|
||||
|
||||
it('returns false while any contestant is still running', () => {
|
||||
expect(isBattleComplete([{ status: 'done' }, { status: 'running' }])).toBe(false);
|
||||
});
|
||||
|
||||
it('returns false while any contestant is still queued', () => {
|
||||
expect(isBattleComplete([{ status: 'done' }, { status: 'queued' }])).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── computeBenchmark ────────────────────────────────────────────────────────
|
||||
|
||||
describe('computeBenchmark', () => {
|
||||
const t0 = new Date('2026-06-06T10:00:00.000Z');
|
||||
const t1 = new Date('2026-06-06T10:00:05.000Z'); // +5 000ms
|
||||
|
||||
it('computes duration in ms for both lanes', () => {
|
||||
const local = computeBenchmark(t0, t1, 100, 'local');
|
||||
expect(local.durationMs).toBe(5000);
|
||||
const cloud = computeBenchmark(t0, t1, null, 'cloud');
|
||||
expect(cloud.durationMs).toBe(5000);
|
||||
});
|
||||
|
||||
it('computes tokens/sec for local lane when costTokens is known', () => {
|
||||
const bench = computeBenchmark(t0, t1, 500, 'local');
|
||||
expect(bench.tokensPerSec).toBeCloseTo(100, 5); // 500 / 5 = 100 tok/s
|
||||
});
|
||||
|
||||
it('omits tokens/sec for cloud lane regardless of costTokens', () => {
|
||||
const bench = computeBenchmark(t0, t1, 500, 'cloud');
|
||||
expect(bench.tokensPerSec).toBeNull();
|
||||
});
|
||||
|
||||
it('omits tokens/sec for local lane when costTokens is null', () => {
|
||||
const bench = computeBenchmark(t0, t1, null, 'local');
|
||||
expect(bench.tokensPerSec).toBeNull();
|
||||
});
|
||||
|
||||
it('returns durationMs = 0 and null tokensPerSec when timestamps are equal', () => {
|
||||
const bench = computeBenchmark(t0, t0, 100, 'local');
|
||||
expect(bench.durationMs).toBe(0);
|
||||
expect(bench.tokensPerSec).toBeNull();
|
||||
});
|
||||
|
||||
it('clamps negative duration to 0 (clock skew)', () => {
|
||||
const bench = computeBenchmark(t1, t0, 50, 'local');
|
||||
expect(bench.durationMs).toBe(0);
|
||||
expect(bench.tokensPerSec).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
// ─── sanitizeSlug ────────────────────────────────────────────────────────────
|
||||
|
||||
describe('sanitizeSlug', () => {
|
||||
it('lowercases and preserves alphanumeric + hyphens', () => {
|
||||
expect(sanitizeSlug('claude')).toBe('claude');
|
||||
expect(sanitizeSlug('claude-opus-4-5')).toBe('claude-opus-4-5');
|
||||
});
|
||||
|
||||
it('replaces spaces and special characters with hyphens', () => {
|
||||
expect(sanitizeSlug('Code Reviewer')).toBe('code-reviewer');
|
||||
expect(sanitizeSlug('native/boocode')).toBe('native-boocode');
|
||||
expect(sanitizeSlug('qwen2.5-coder-35b')).toBe('qwen2-5-coder-35b');
|
||||
});
|
||||
|
||||
it('collapses consecutive non-alphanumeric runs to a single hyphen', () => {
|
||||
expect(sanitizeSlug('foo bar---baz')).toBe('foo-bar-baz');
|
||||
});
|
||||
|
||||
it('strips leading and trailing hyphens', () => {
|
||||
expect(sanitizeSlug('---foo---')).toBe('foo');
|
||||
});
|
||||
|
||||
it('truncates to 64 characters', () => {
|
||||
const long = 'a'.repeat(100);
|
||||
expect(sanitizeSlug(long).length).toBe(64);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── buildBattleSlug ─────────────────────────────────────────────────────────
|
||||
|
||||
describe('buildBattleSlug', () => {
|
||||
it('builds a deterministic dated slug from id, type, and createdAt', () => {
|
||||
const id = 'a1b2c3d4-e5f6-7890-abcd-ef1234567890';
|
||||
const createdAt = new Date('2026-06-06T12:00:00.000Z');
|
||||
const slug = buildBattleSlug(id, 'coding', createdAt);
|
||||
expect(slug).toBe('2026-06-06-coding-a1b2c3d4');
|
||||
});
|
||||
|
||||
it('includes the battle type in the slug', () => {
|
||||
const id = 'aaaaaaaa-0000-0000-0000-000000000000';
|
||||
const createdAt = new Date('2026-01-01T00:00:00.000Z');
|
||||
expect(buildBattleSlug(id, 'qa', createdAt)).toContain('-qa-');
|
||||
expect(buildBattleSlug(id, 'coding', createdAt)).toContain('-coding-');
|
||||
});
|
||||
|
||||
it('uses the first 8 hex chars of the uuid (dashes stripped)', () => {
|
||||
const id = 'deadbeef-0000-0000-0000-000000000000';
|
||||
const slug = buildBattleSlug(id, 'coding', new Date('2026-06-06T00:00:00Z'));
|
||||
expect(slug.endsWith('-deadbeef')).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── buildContestantDir ──────────────────────────────────────────────────────
|
||||
|
||||
describe('buildContestantDir', () => {
|
||||
it('joins sanitized identity and model with a hyphen', () => {
|
||||
expect(buildContestantDir('claude', 'claude-opus-4-5')).toBe('claude-claude-opus-4-5');
|
||||
});
|
||||
|
||||
it('sanitizes both parts independently', () => {
|
||||
expect(buildContestantDir('Code Reviewer', 'qwen2.5-35b')).toBe('code-reviewer-qwen2-5-35b');
|
||||
});
|
||||
});
|
||||
|
||||
// ─── reconcileContestantResume ───────────────────────────────────────────────
|
||||
|
||||
describe('reconcileContestantResume', () => {
|
||||
it('keeps non-running contestants regardless of task state', () => {
|
||||
for (const status of ['queued', 'done', 'error']) {
|
||||
expect(reconcileContestantResume(status, 'tid', 'completed')).toBe('keep');
|
||||
expect(reconcileContestantResume(status, null, null)).toBe('keep');
|
||||
}
|
||||
});
|
||||
|
||||
it('re-dispatches a running contestant with no task_id', () => {
|
||||
expect(reconcileContestantResume('running', null, null)).toBe('re-dispatch');
|
||||
});
|
||||
|
||||
it('re-dispatches a running contestant whose task row is absent', () => {
|
||||
expect(reconcileContestantResume('running', 'tid', null)).toBe('re-dispatch');
|
||||
});
|
||||
|
||||
it('marks done when the task completed before the terminal callback ran', () => {
|
||||
expect(reconcileContestantResume('running', 'tid', 'completed')).toBe('mark-done');
|
||||
});
|
||||
|
||||
it('marks error when the task failed', () => {
|
||||
expect(reconcileContestantResume('running', 'tid', 'failed')).toBe('mark-error');
|
||||
});
|
||||
|
||||
it('marks cancelled when the task was cancelled', () => {
|
||||
expect(reconcileContestantResume('running', 'tid', 'cancelled')).toBe('mark-cancelled');
|
||||
});
|
||||
|
||||
it('keeps a running contestant whose task is pending (dispatcher handles it)', () => {
|
||||
expect(reconcileContestantResume('running', 'tid', 'pending')).toBe('keep');
|
||||
});
|
||||
|
||||
it('re-dispatches when the task is stuck running (process died)', () => {
|
||||
expect(reconcileContestantResume('running', 'tid', 'running')).toBe('re-dispatch');
|
||||
});
|
||||
|
||||
it('re-dispatches when the task is blocked (permission dialog gone on restart)', () => {
|
||||
expect(reconcileContestantResume('running', 'tid', 'blocked')).toBe('re-dispatch');
|
||||
});
|
||||
});
|
||||
|
||||
// ─── reconcileContestants ────────────────────────────────────────────────────
|
||||
|
||||
describe('reconcileContestants', () => {
|
||||
it('returns one decision per contestant', () => {
|
||||
const contestants = [
|
||||
{ contestantId: 'c1', taskId: null, status: 'done' },
|
||||
{ contestantId: 'c2', taskId: 't1', status: 'running' },
|
||||
{ contestantId: 'c3', taskId: 't2', status: 'running' },
|
||||
];
|
||||
const taskStates = new Map([['t1', 'completed'], ['t2', 'running']]);
|
||||
const decisions = reconcileContestants(contestants, taskStates);
|
||||
expect(decisions).toHaveLength(3);
|
||||
expect(decisions[0]).toEqual({ contestantId: 'c1', action: 'keep' });
|
||||
expect(decisions[1]).toEqual({ contestantId: 'c2', action: 'mark-done' });
|
||||
expect(decisions[2]).toEqual({ contestantId: 'c3', action: 're-dispatch' });
|
||||
});
|
||||
|
||||
it('re-dispatches a running contestant whose taskId is absent from taskStates', () => {
|
||||
const contestants = [{ contestantId: 'c1', taskId: 'orphan', status: 'running' }];
|
||||
const decisions = reconcileContestants(contestants, new Map());
|
||||
expect(decisions[0]?.action).toBe('re-dispatch');
|
||||
});
|
||||
|
||||
it('re-dispatches a running contestant with null taskId', () => {
|
||||
const contestants = [{ contestantId: 'c1', taskId: null, status: 'running' }];
|
||||
const decisions = reconcileContestants(contestants, new Map());
|
||||
expect(decisions[0]?.action).toBe('re-dispatch');
|
||||
});
|
||||
|
||||
it('returns empty array for no contestants', () => {
|
||||
expect(reconcileContestants([], new Map())).toEqual([]);
|
||||
});
|
||||
|
||||
it('keeps a running contestant whose task is pending', () => {
|
||||
const contestants = [{ contestantId: 'c1', taskId: 't1', status: 'running' }];
|
||||
const taskStates = new Map([['t1', 'pending']]);
|
||||
const decisions = reconcileContestants(contestants, taskStates);
|
||||
expect(decisions[0]?.action).toBe('keep');
|
||||
});
|
||||
|
||||
it('handles a mixed battle: done/queued kept, stale running re-dispatched', () => {
|
||||
const contestants = [
|
||||
{ contestantId: 'c1', taskId: 't1', status: 'done' },
|
||||
{ contestantId: 'c2', taskId: null, status: 'queued' },
|
||||
{ contestantId: 'c3', taskId: 't2', status: 'running' },
|
||||
{ contestantId: 'c4', taskId: 't3', status: 'running' },
|
||||
];
|
||||
const taskStates = new Map([
|
||||
['t1', 'completed'],
|
||||
['t2', 'running'], // stuck — process dead
|
||||
['t3', 'pending'], // dispatcher will handle
|
||||
]);
|
||||
const decisions = reconcileContestants(contestants, taskStates);
|
||||
expect(decisions.find((d) => d.contestantId === 'c1')?.action).toBe('keep');
|
||||
expect(decisions.find((d) => d.contestantId === 'c2')?.action).toBe('keep');
|
||||
expect(decisions.find((d) => d.contestantId === 'c3')?.action).toBe('re-dispatch');
|
||||
expect(decisions.find((d) => d.contestantId === 'c4')?.action).toBe('keep');
|
||||
});
|
||||
});
|
||||
191
apps/coder/src/services/arena-analyzer-helpers.ts
Normal file
191
apps/coder/src/services/arena-analyzer-helpers.ts
Normal file
@@ -0,0 +1,191 @@
|
||||
/**
|
||||
* Pure, side-effect-free helpers for the Arena analyzer.
|
||||
* No DB, no IO, no network — safe to unit-test directly.
|
||||
*
|
||||
* Covers: digest-prompt assembly, judge-prompt assembly, winner extraction
|
||||
* from the judge output, the <2-survivors no-winner rule, and the
|
||||
* cross-examination prompt.
|
||||
*/
|
||||
|
||||
// ─── Shared types ─────────────────────────────────────────────────────────────
|
||||
|
||||
export interface ContestantDigestInput {
|
||||
identity: string;
|
||||
model: string;
|
||||
resultMd: string;
|
||||
diffPatch?: string;
|
||||
benchmarkLine: string;
|
||||
}
|
||||
|
||||
export interface ContestantDigest {
|
||||
identity: string;
|
||||
model: string;
|
||||
digest: string;
|
||||
benchmarkLine: string;
|
||||
}
|
||||
|
||||
// ─── Digest stage ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Build the system + user prompts for the per-contestant digest call.
|
||||
* The digest is a short structured summary; it keeps each call's context small
|
||||
* so the downstream judge only sees digests (not raw diffs).
|
||||
*/
|
||||
export function buildDigestPrompt(input: ContestantDigestInput): { system: string; user: string } {
|
||||
const system =
|
||||
'You are an expert technical analyst evaluating the output of an AI coding or Q&A battle. ' +
|
||||
'Produce a concise structured digest (under 300 words, Markdown bullet points) covering: ' +
|
||||
'(1) correctness and quality, (2) completeness, (3) notable strengths, (4) notable weaknesses or issues. ' +
|
||||
'Do not reference the battle or other contestants — focus only on this submission.';
|
||||
|
||||
const parts: string[] = [
|
||||
`# Contestant: ${input.identity} / ${input.model}`,
|
||||
`\nBenchmark: ${input.benchmarkLine}`,
|
||||
'\n## Result\n',
|
||||
input.resultMd.slice(0, 8_000),
|
||||
];
|
||||
|
||||
if (input.diffPatch) {
|
||||
parts.push('\n## Code Changes (diff)\n```diff');
|
||||
parts.push(input.diffPatch.slice(0, 5_000));
|
||||
parts.push('```');
|
||||
}
|
||||
|
||||
return { system, user: parts.join('\n') };
|
||||
}
|
||||
|
||||
// ─── Judge stage ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Build the system + user prompts for the comparative judge call.
|
||||
* Receives contestant digests (NOT raw diffs) to keep context bounded.
|
||||
*
|
||||
* The judge output must contain a line starting with WINNER: or NO_WINNER.
|
||||
* The caller extracts it with extractWinner().
|
||||
*/
|
||||
export function buildJudgePrompt(
|
||||
originalPrompt: string,
|
||||
digests: ContestantDigest[],
|
||||
): { system: string; user: string } {
|
||||
const canName = shouldNameWinner(digests.length);
|
||||
|
||||
const winnerInstruction = canName
|
||||
? 'After your comparative analysis, name the best submission on its own line in this exact format:\n' +
|
||||
'WINNER: <identity>/<model>\n' +
|
||||
'where <identity> and <model> exactly match the heading above. No other text on that line.'
|
||||
: 'Fewer than 2 contestants succeeded. Do NOT name a winner. Write the following on its own line:\nNO_WINNER';
|
||||
|
||||
const system =
|
||||
'You are an expert judge for an AI battle. You have received digest summaries of each ' +
|
||||
"contestant's work on the same task. Write a comparative analysis, then follow these instructions:\n" +
|
||||
winnerInstruction;
|
||||
|
||||
const parts: string[] = [
|
||||
'# Original Task Prompt\n',
|
||||
originalPrompt.slice(0, 2_000),
|
||||
'\n# Contestant Digests\n',
|
||||
];
|
||||
|
||||
for (const d of digests) {
|
||||
parts.push(`\n## ${d.identity} / ${d.model}`);
|
||||
parts.push(`Benchmark: ${d.benchmarkLine}`);
|
||||
parts.push(d.digest);
|
||||
}
|
||||
|
||||
parts.push(
|
||||
'\n# Instructions\nCompare the contestants and follow the winner-naming instructions above.',
|
||||
);
|
||||
|
||||
return { system, user: parts.join('\n') };
|
||||
}
|
||||
|
||||
// ─── No-winner rule ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Returns true when enough contestants succeeded to name a winner.
|
||||
* Rule: at least 2 must have produced a result. With 0 or 1 success the
|
||||
* analysis must NOT name a winner (no meaningful comparison possible).
|
||||
*/
|
||||
export function shouldNameWinner(succeededCount: number): boolean {
|
||||
return succeededCount >= 2;
|
||||
}
|
||||
|
||||
// ─── Winner extraction ────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Parse the judge's text output and extract the declared winner.
|
||||
* Looks for a line matching: WINNER: <identity>/<model>
|
||||
* Returns null when no valid winner line is found, or when the line contains
|
||||
* NO_WINNER.
|
||||
*
|
||||
* The parse is lenient on surrounding whitespace and case for the keyword.
|
||||
*/
|
||||
export function extractWinner(judgeOutput: string): { identity: string; model: string } | null {
|
||||
for (const line of judgeOutput.split('\n')) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed.toUpperCase().startsWith('WINNER:')) continue;
|
||||
|
||||
const rest = trimmed.slice('WINNER:'.length).trim();
|
||||
if (rest.toUpperCase() === 'NO_WINNER' || rest === '') return null;
|
||||
|
||||
const slashIdx = rest.indexOf('/');
|
||||
if (slashIdx === -1) return null;
|
||||
|
||||
const identity = rest.slice(0, slashIdx).trim();
|
||||
const model = rest.slice(slashIdx + 1).trim();
|
||||
if (identity && model) return { identity, model };
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// ─── Cross-examination stage ──────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Build the system + user prompts for a cross-examination call.
|
||||
* The cross-examiner sees the original prompt, contestant digests, and the
|
||||
* proposed analysis, and is asked to challenge the result.
|
||||
*/
|
||||
export function buildCrossExamPrompt(opts: {
|
||||
originalPrompt: string;
|
||||
digests: ContestantDigest[];
|
||||
analysisContent: string;
|
||||
proposedWinner: string | null;
|
||||
examinerIdentity: string;
|
||||
examinerModel: string;
|
||||
}): { system: string; user: string } {
|
||||
const system =
|
||||
`You are ${opts.examinerIdentity} (model: ${opts.examinerModel}), acting as an independent ` +
|
||||
'cross-examiner in an AI battle. Your role is to critically challenge the proposed analysis ' +
|
||||
'and winner, then give your own verdict. Be rigorous but fair. ' +
|
||||
'End your response with your verdict on its own line:\n' +
|
||||
'VERDICT: <identity>/<model> — if you agree or disagree with the proposed winner but can name one\n' +
|
||||
'VERDICT: NO_WINNER — if no clear winner exists';
|
||||
|
||||
const parts: string[] = [
|
||||
'# Original Task Prompt\n',
|
||||
opts.originalPrompt.slice(0, 2_000),
|
||||
'\n# Contestant Digests\n',
|
||||
];
|
||||
|
||||
for (const d of opts.digests) {
|
||||
parts.push(`\n## ${d.identity} / ${d.model}`);
|
||||
parts.push(`Benchmark: ${d.benchmarkLine}`);
|
||||
parts.push(d.digest);
|
||||
}
|
||||
|
||||
parts.push('\n# Proposed Analysis\n');
|
||||
parts.push(opts.analysisContent.slice(0, 5_000));
|
||||
|
||||
if (opts.proposedWinner) {
|
||||
parts.push(`\n*(Proposed winner: ${opts.proposedWinner})*`);
|
||||
} else {
|
||||
parts.push('\n*(No winner was proposed — fewer than 2 contestants succeeded.)*');
|
||||
}
|
||||
|
||||
parts.push(
|
||||
'\n# Your Cross-Examination\n' +
|
||||
'Challenge the analysis above, then give your independent verdict (VERDICT: … on its own line).',
|
||||
);
|
||||
|
||||
return { system, user: parts.join('\n') };
|
||||
}
|
||||
496
apps/coder/src/services/arena-analyzer.ts
Normal file
496
apps/coder/src/services/arena-analyzer.ts
Normal file
@@ -0,0 +1,496 @@
|
||||
/**
|
||||
* Arena Analyzer — pluggable seam for battle analysis and cross-examination.
|
||||
*
|
||||
* The Analyzer interface is the plug point: a v2 Han Orchestrator flow can
|
||||
* replace the v1 two-stage digest→judge implementation without a schema change.
|
||||
*
|
||||
* v1 implementation uses DEFAULT_MODEL via direct llama-swap calls (arenaModelCall):
|
||||
* Digest stage — one call per succeeded contestant, concurrent; produces a
|
||||
* bounded summary of each result (result.md + diff.patch for
|
||||
* coding, result.md for Q&A).
|
||||
* Judge stage — one call with all digests + the original prompt; writes
|
||||
* analysis.md, names a winner (unless < 2 succeeded), and
|
||||
* updates battles.winner_contestant_id.
|
||||
*
|
||||
* Cross-examination:
|
||||
* Local model — direct arenaModelCall to llama-swap with the chosen model.
|
||||
* Cloud model — inserts a tasks row (triggers the dispatcher via pg_notify);
|
||||
* polls for completion; reads output_summary as the verdict.
|
||||
* In both cases the verdict is written to cross_examinations.verdict, appended
|
||||
* to <resultsPath>/cross-exam.md, and a battle_updated frame is published.
|
||||
*
|
||||
* Never throws — all errors are caught, logged, and swallowed so the caller
|
||||
* (arena-runner's onBattleComplete / onCrossExamStart) is never wedged.
|
||||
*/
|
||||
|
||||
import { readFile, writeFile, mkdir } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type { Sql } from '../db.js';
|
||||
import type { Broker } from '@boocode/server/broker';
|
||||
import type { WsFrame } from '@boocode/contracts/ws-frames';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { Config } from '../config.js';
|
||||
import type { BattleType } from '@boocode/contracts/arena';
|
||||
import { arenaModelCall } from './arena-model-call.js';
|
||||
import {
|
||||
buildDigestPrompt,
|
||||
buildJudgePrompt,
|
||||
buildCrossExamPrompt,
|
||||
extractWinner,
|
||||
shouldNameWinner,
|
||||
type ContestantDigest,
|
||||
} from './arena-analyzer-helpers.js';
|
||||
|
||||
// ─── Public interface ─────────────────────────────────────────────────────────
|
||||
|
||||
/** Pluggable analysis seam — swap to a Han Orchestrator flow in v2. */
|
||||
export interface Analyzer {
|
||||
/** Run the two-stage digest→judge analysis for a completed battle. */
|
||||
analyze(battleId: string): Promise<void>;
|
||||
/**
|
||||
* Run a cross-examination for an already-inserted cross_examinations row.
|
||||
* The result is written back to that row and a battle_updated frame is published.
|
||||
*/
|
||||
crossExamine(
|
||||
battleId: string,
|
||||
crossExamId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<void>;
|
||||
}
|
||||
|
||||
// ─── Internal DB row types ────────────────────────────────────────────────────
|
||||
|
||||
interface BattleRow {
|
||||
id: string;
|
||||
project_id: string;
|
||||
battle_type: BattleType;
|
||||
prompt: string;
|
||||
status: string;
|
||||
results_path: string | null;
|
||||
winner_contestant_id: string | null;
|
||||
}
|
||||
|
||||
interface ContestantRow {
|
||||
id: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
lane: string;
|
||||
status: string;
|
||||
result_path: string | null;
|
||||
duration_ms: number | null;
|
||||
tokens_per_sec: number | null;
|
||||
}
|
||||
|
||||
// ─── Factory ──────────────────────────────────────────────────────────────────
|
||||
|
||||
interface AnalyzerDeps {
|
||||
sql: Sql;
|
||||
broker: Broker;
|
||||
log: FastifyBaseLogger;
|
||||
config: Pick<Config, 'LLAMA_SWAP_URL' | 'DEFAULT_MODEL'>;
|
||||
/** Model IDs served by local llama-swap — cross-exam routing uses this. */
|
||||
localModels: ReadonlySet<string>;
|
||||
}
|
||||
|
||||
export function createAnalyzer(deps: AnalyzerDeps): Analyzer {
|
||||
const { sql, broker, log, config, localModels } = deps;
|
||||
|
||||
// ─── analyze ──────────────────────────────────────────────────────────────
|
||||
|
||||
async function analyze(battleId: string): Promise<void> {
|
||||
try {
|
||||
await runAnalysis(battleId);
|
||||
} catch (err) {
|
||||
log.error(
|
||||
{ err: errMsg(err), battleId },
|
||||
'arena-analyzer: analysis failed',
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async function runAnalysis(battleId: string): Promise<void> {
|
||||
const battle = await loadBattle(battleId);
|
||||
if (!battle) {
|
||||
log.warn({ battleId }, 'arena-analyzer: battle not found');
|
||||
return;
|
||||
}
|
||||
|
||||
const contestants = await loadContestants(battleId);
|
||||
const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
|
||||
|
||||
log.info(
|
||||
{ battleId, total: contestants.length, succeeded: succeeded.length },
|
||||
'arena-analyzer: starting analysis',
|
||||
);
|
||||
|
||||
// Digest stage — concurrent, one call per succeeded contestant.
|
||||
const digests = (
|
||||
await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
|
||||
).filter((d): d is ContestantDigest => d !== null);
|
||||
|
||||
// Failed contestants are noted in the analysis even if they produced no digest.
|
||||
const failedNotes = contestants
|
||||
.filter((c) => c.status === 'error')
|
||||
.map((c) => `- **${c.identity} / ${c.model}**: failed (no result)\n`);
|
||||
|
||||
// Judge stage — single call with all digests.
|
||||
const { analysisText, winner } = await judgeContestants(battle, digests, failedNotes);
|
||||
|
||||
// Write analysis.md to the battle results folder.
|
||||
const resultsPath = battle.results_path;
|
||||
if (resultsPath) {
|
||||
await mkdir(resultsPath, { recursive: true });
|
||||
await writeFile(join(resultsPath, 'analysis.md'), analysisText, 'utf8');
|
||||
}
|
||||
|
||||
// Resolve the winner to a contestant id and update the battle row.
|
||||
let winnerId: string | null = null;
|
||||
if (winner && shouldNameWinner(succeeded.length)) {
|
||||
const winnerContestant = contestants.find(
|
||||
(c) => c.identity === winner.identity && c.model === winner.model,
|
||||
);
|
||||
if (winnerContestant) {
|
||||
winnerId = winnerContestant.id;
|
||||
await sql`
|
||||
UPDATE battles
|
||||
SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp()
|
||||
WHERE id = ${battleId}
|
||||
`;
|
||||
log.info({ battleId, winnerId, identity: winner.identity, model: winner.model }, 'arena-analyzer: winner set');
|
||||
} else {
|
||||
log.warn({ battleId, winner }, 'arena-analyzer: judge named a winner not found in contestants');
|
||||
}
|
||||
}
|
||||
|
||||
publishUser({
|
||||
type: 'battle_updated',
|
||||
battle_id: battleId,
|
||||
winner_contestant_id: winnerId,
|
||||
analysis_ready: true,
|
||||
});
|
||||
|
||||
log.info({ battleId }, 'arena-analyzer: analysis complete');
|
||||
}
|
||||
|
||||
// ─── crossExamine ─────────────────────────────────────────────────────────
|
||||
|
||||
async function crossExamine(
|
||||
battleId: string,
|
||||
crossExamId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<void> {
|
||||
try {
|
||||
await runCrossExam(battleId, crossExamId, opts);
|
||||
} catch (err) {
|
||||
log.error(
|
||||
{ err: errMsg(err), battleId, crossExamId },
|
||||
'arena-analyzer: cross-exam failed',
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async function runCrossExam(
|
||||
battleId: string,
|
||||
crossExamId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<void> {
|
||||
const battle = await loadBattle(battleId);
|
||||
if (!battle) {
|
||||
log.warn({ battleId }, 'arena-analyzer: battle not found for cross-exam');
|
||||
return;
|
||||
}
|
||||
|
||||
const contestants = await loadContestants(battleId);
|
||||
|
||||
// Re-read the digests (if contestants have results) for context.
|
||||
const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
|
||||
const digests = (
|
||||
await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
|
||||
).filter((d): d is ContestantDigest => d !== null);
|
||||
|
||||
// Read analysis.md for the proposed analysis content.
|
||||
let analysisContent = '';
|
||||
if (battle.results_path) {
|
||||
analysisContent = await readFile(
|
||||
join(battle.results_path, 'analysis.md'), 'utf8',
|
||||
).catch(() => '');
|
||||
}
|
||||
|
||||
// Resolve proposed winner label.
|
||||
let proposedWinner: string | null = null;
|
||||
if (battle.winner_contestant_id) {
|
||||
const w = contestants.find((c) => c.id === battle.winner_contestant_id);
|
||||
if (w) proposedWinner = `${w.identity}/${w.model}`;
|
||||
}
|
||||
|
||||
const { system, user } = buildCrossExamPrompt({
|
||||
originalPrompt: battle.prompt,
|
||||
digests,
|
||||
analysisContent,
|
||||
proposedWinner,
|
||||
examinerIdentity: opts.identity,
|
||||
examinerModel: opts.model,
|
||||
});
|
||||
|
||||
log.info({ battleId, crossExamId, identity: opts.identity, model: opts.model }, 'arena-analyzer: running cross-exam');
|
||||
|
||||
const verdict = await executeModelCall({
|
||||
battleId,
|
||||
projectId: battle.project_id,
|
||||
identity: opts.identity,
|
||||
model: opts.model,
|
||||
system,
|
||||
user,
|
||||
});
|
||||
|
||||
// Persist verdict and append to cross-exam.md.
|
||||
await sql`
|
||||
UPDATE cross_examinations
|
||||
SET verdict = ${verdict}
|
||||
WHERE id = ${crossExamId}
|
||||
`;
|
||||
|
||||
if (battle.results_path) {
|
||||
const crossExamPath = join(battle.results_path, 'cross-exam.md');
|
||||
const section =
|
||||
`\n---\n\n# Cross-Examination by ${opts.identity} / ${opts.model}\n\n` +
|
||||
`${verdict}\n`;
|
||||
await writeFile(crossExamPath, section, { flag: 'a', encoding: 'utf8' });
|
||||
}
|
||||
|
||||
publishUser({
|
||||
type: 'battle_updated',
|
||||
battle_id: battleId,
|
||||
cross_exam_id: crossExamId,
|
||||
});
|
||||
|
||||
log.info({ battleId, crossExamId }, 'arena-analyzer: cross-exam complete');
|
||||
}
|
||||
|
||||
// ─── Model call routing ───────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Route a one-shot model call to llama-swap (local) or the task dispatcher
|
||||
* (cloud). Cloud dispatch inserts a tasks row and polls for completion.
|
||||
*/
|
||||
async function executeModelCall(opts: {
|
||||
battleId: string;
|
||||
projectId: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
system: string;
|
||||
user: string;
|
||||
}): Promise<string> {
|
||||
const isLocal = localModels.has(opts.model) || localModels.has(`llama-swap/${opts.model}`);
|
||||
|
||||
if (isLocal) {
|
||||
return arenaModelCall({
|
||||
config,
|
||||
model: opts.model,
|
||||
system: opts.system,
|
||||
user: opts.user,
|
||||
maxTokens: 2_000,
|
||||
temperature: 0.3,
|
||||
});
|
||||
}
|
||||
|
||||
// Cloud path: dispatch through the task system and poll for completion.
|
||||
return executeCloudModelCall(opts);
|
||||
}
|
||||
|
||||
async function executeCloudModelCall(opts: {
|
||||
projectId: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
system: string;
|
||||
user: string;
|
||||
}): Promise<string> {
|
||||
// The cross-exam prompt is the full input to the external agent. We embed
|
||||
// the system prompt as a preamble in the user message (external agents don't
|
||||
// take a separate system arg through the tasks dispatcher).
|
||||
const input = `${opts.system}\n\n${opts.user}`;
|
||||
|
||||
// For well-known external agents, stamp the agent name so the dispatcher
|
||||
// routes via PTY/ACP. For unknown identities fall back to native inference
|
||||
// (agent = null → DEFAULT_MODEL text generation).
|
||||
const knownAgents = new Set(['claude', 'opencode', 'qwen', 'goose']);
|
||||
const agentName = knownAgents.has(opts.identity) ? opts.identity : null;
|
||||
|
||||
const [task] = await sql<{ id: string }[]>`
|
||||
INSERT INTO tasks (project_id, input, agent, model)
|
||||
VALUES (${opts.projectId}, ${input}, ${agentName}, ${opts.model})
|
||||
RETURNING id
|
||||
`;
|
||||
const taskId = task!.id;
|
||||
|
||||
log.info({ taskId, identity: opts.identity, model: opts.model }, 'arena-analyzer: cloud cross-exam task dispatched');
|
||||
|
||||
// Poll until terminal (up to 5 minutes).
|
||||
const timeoutMs = 5 * 60 * 1_000;
|
||||
const pollMs = 2_000;
|
||||
const deadline = Date.now() + timeoutMs;
|
||||
|
||||
while (Date.now() < deadline) {
|
||||
await sleep(pollMs);
|
||||
const [row] = await sql<{ state: string; output_summary: string | null }[]>`
|
||||
SELECT state, output_summary FROM tasks WHERE id = ${taskId}
|
||||
`;
|
||||
if (!row) break;
|
||||
if (row.state === 'completed') return row.output_summary ?? '';
|
||||
if (row.state === 'failed' || row.state === 'cancelled') {
|
||||
throw new Error(`cross-exam task ${row.state}: ${row.output_summary ?? ''}`);
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error(`cloud cross-exam task timed out after ${timeoutMs / 1000}s`);
|
||||
}
|
||||
|
||||
// ─── Digest helper ────────────────────────────────────────────────────────
|
||||
|
||||
async function digestContestant(
|
||||
battle: BattleRow,
|
||||
c: ContestantRow,
|
||||
): Promise<ContestantDigest | null> {
|
||||
if (!c.result_path) return null;
|
||||
|
||||
const resultMd = await readFile(join(c.result_path, 'result.md'), 'utf8').catch(() => '');
|
||||
|
||||
let diffPatch: string | undefined;
|
||||
if (battle.battle_type === 'coding') {
|
||||
diffPatch = await readFile(join(c.result_path, 'diff.patch'), 'utf8').catch(
|
||||
() => undefined,
|
||||
);
|
||||
}
|
||||
|
||||
const benchmarkLine = formatBenchmarkLine(c);
|
||||
const { system, user } = buildDigestPrompt({
|
||||
identity: c.identity,
|
||||
model: c.model,
|
||||
resultMd,
|
||||
diffPatch,
|
||||
benchmarkLine,
|
||||
});
|
||||
|
||||
let digest: string;
|
||||
try {
|
||||
digest = await arenaModelCall({
|
||||
config,
|
||||
model: config.DEFAULT_MODEL,
|
||||
system,
|
||||
user,
|
||||
maxTokens: 500,
|
||||
temperature: 0.3,
|
||||
});
|
||||
} catch (err) {
|
||||
log.warn(
|
||||
{ err: errMsg(err), identity: c.identity, model: c.model },
|
||||
'arena-analyzer: digest call failed — skipping contestant',
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
return { identity: c.identity, model: c.model, digest, benchmarkLine };
|
||||
}
|
||||
|
||||
// ─── Judge helper ─────────────────────────────────────────────────────────
|
||||
|
||||
async function judgeContestants(
|
||||
battle: BattleRow,
|
||||
digests: ContestantDigest[],
|
||||
failedNotes: string[],
|
||||
): Promise<{ analysisText: string; winner: { identity: string; model: string } | null }> {
|
||||
const { system, user } = buildJudgePrompt(battle.prompt, digests);
|
||||
|
||||
let judgeOutput = '';
|
||||
try {
|
||||
judgeOutput = await arenaModelCall({
|
||||
config,
|
||||
model: config.DEFAULT_MODEL,
|
||||
system,
|
||||
user,
|
||||
maxTokens: 2_000,
|
||||
temperature: 0.3,
|
||||
});
|
||||
} catch (err) {
|
||||
log.error({ err: errMsg(err), battleId: battle.id }, 'arena-analyzer: judge call failed');
|
||||
judgeOutput = '*(Judge call failed — no comparison produced.)*';
|
||||
}
|
||||
|
||||
const winner = shouldNameWinner(digests.length) ? extractWinner(judgeOutput) : null;
|
||||
|
||||
const sections: string[] = [
|
||||
`# Arena Analysis`,
|
||||
`\n**Battle type:** ${battle.battle_type}`,
|
||||
];
|
||||
|
||||
if (failedNotes.length > 0) {
|
||||
sections.push('\n## Failed Contestants\n');
|
||||
sections.push(...failedNotes);
|
||||
}
|
||||
|
||||
if (digests.length > 0) {
|
||||
sections.push('\n## Contestant Digests\n');
|
||||
for (const d of digests) {
|
||||
sections.push(`### ${d.identity} / ${d.model}`);
|
||||
sections.push(`*Benchmark: ${d.benchmarkLine}*\n`);
|
||||
sections.push(d.digest);
|
||||
}
|
||||
}
|
||||
|
||||
sections.push("\n## Judge's Verdict\n");
|
||||
sections.push(judgeOutput);
|
||||
|
||||
if (winner) {
|
||||
sections.push(`\n## Winner\n**${winner.identity} / ${winner.model}**`);
|
||||
} else {
|
||||
const reason =
|
||||
digests.length < 2
|
||||
? 'fewer than 2 contestants produced results'
|
||||
: 'no clear winner identified';
|
||||
sections.push(`\n## Winner\n*No winner named (${reason}).*`);
|
||||
}
|
||||
|
||||
return { analysisText: sections.join('\n'), winner };
|
||||
}
|
||||
|
||||
// ─── DB helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
async function loadBattle(battleId: string): Promise<BattleRow | null> {
|
||||
const [b] = await sql<BattleRow[]>`
|
||||
SELECT id, project_id, battle_type, prompt, status, results_path, winner_contestant_id
|
||||
FROM battles WHERE id = ${battleId}
|
||||
`;
|
||||
return b ?? null;
|
||||
}
|
||||
|
||||
async function loadContestants(battleId: string): Promise<ContestantRow[]> {
|
||||
return sql<ContestantRow[]>`
|
||||
SELECT id, identity, model, lane, status, result_path, duration_ms, tokens_per_sec
|
||||
FROM contestants WHERE battle_id = ${battleId}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
}
|
||||
|
||||
// ─── Misc helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
function formatBenchmarkLine(c: ContestantRow): string {
|
||||
const parts: string[] = [];
|
||||
if (c.duration_ms !== null) parts.push(`${c.duration_ms}ms`);
|
||||
if (c.tokens_per_sec !== null) parts.push(`${c.tokens_per_sec.toFixed(1)} tok/s`);
|
||||
return parts.length > 0 ? parts.join(', ') : 'no benchmark';
|
||||
}
|
||||
|
||||
function publishUser(frame: Record<string, unknown>): void {
|
||||
broker.publishUserFrame('default', frame as unknown as WsFrame);
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
return { analyze, crossExamine };
|
||||
}
|
||||
|
||||
function errMsg(e: unknown): string {
|
||||
return e instanceof Error ? e.message : String(e);
|
||||
}
|
||||
186
apps/coder/src/services/arena-decisions.ts
Normal file
186
apps/coder/src/services/arena-decisions.ts
Normal file
@@ -0,0 +1,186 @@
|
||||
/**
|
||||
* Pure scheduling and classification decisions for the Arena battle-runner.
|
||||
* No database, no IO. Mirrors the pattern of flow-runner-decisions.ts.
|
||||
*
|
||||
* Vocabulary:
|
||||
* local lane — llama-swap-backed contestants, run strictly one at a time
|
||||
* cloud lane — cloud-backed contestants, run all in parallel
|
||||
*
|
||||
* A contestant's status lifecycle:
|
||||
* queued → running → done | error
|
||||
*/
|
||||
import type { BattleType, ContestantLane } from '@boocode/contracts/arena';
|
||||
|
||||
// ─── Lane classification ──────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Classify a contestant into a lane.
|
||||
*
|
||||
* Q&A contestants always run on the native (llama-swap) backend → local.
|
||||
* Coding contestants: their MODEL is checked against the localModels set
|
||||
* (all model IDs served by the local llama-swap server). This means an
|
||||
* opencode or qwen contestant pointed at a local model counts as local,
|
||||
* which correctly captures GPU-contention and fair benchmarking (ADR 0001).
|
||||
*
|
||||
* @param battleType 'coding' | 'qa'
|
||||
* @param identity backend name (coding) or persona name (qa) — not used for lane logic
|
||||
* @param model the contestant's model id
|
||||
* @param localModels set of model IDs served by the local llama-swap server
|
||||
*/
|
||||
export function classifyLane(
|
||||
battleType: BattleType,
|
||||
_identity: string,
|
||||
model: string,
|
||||
localModels: ReadonlySet<string>,
|
||||
): ContestantLane {
|
||||
if (battleType === 'qa') return 'local';
|
||||
return localModels.has(model) ? 'local' : 'cloud';
|
||||
}
|
||||
|
||||
// ─── Local-lane queue ─────────────────────────────────────────────────────────
|
||||
|
||||
export interface ContestantSlot {
|
||||
id: string;
|
||||
lane: ContestantLane;
|
||||
status: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* The next queued local contestant to dispatch — the first 'queued' contestant
|
||||
* in the local lane, in creation order (caller must supply rows in created_at ASC).
|
||||
* Returns null when the local queue is empty or all local slots are non-queued.
|
||||
*/
|
||||
export function nextLocalContestant(contestants: readonly ContestantSlot[]): string | null {
|
||||
for (const c of contestants) {
|
||||
if (c.lane === 'local' && c.status === 'queued') return c.id;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// ─── Battle completion ────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* True when every contestant has reached a terminal state (done | error).
|
||||
* Returns false for an empty list — a battle with no contestants never completes.
|
||||
*/
|
||||
export function isBattleComplete(contestants: readonly { status: string }[]): boolean {
|
||||
if (contestants.length === 0) return false;
|
||||
return contestants.every((c) => c.status === 'done' || c.status === 'error');
|
||||
}
|
||||
|
||||
// ─── Benchmark ────────────────────────────────────────────────────────────────
|
||||
|
||||
export interface Benchmark {
|
||||
durationMs: number;
|
||||
tokensPerSec: number | null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the benchmark for a contestant.
|
||||
* Wall-clock duration is captured for every contestant; tokens/sec is only
|
||||
* meaningful for local (llama-swap) contestants where the model has sole
|
||||
* access to the GPU and the measurement is fair.
|
||||
*/
|
||||
export function computeBenchmark(
|
||||
startedAt: Date,
|
||||
endedAt: Date,
|
||||
costTokens: number | null,
|
||||
lane: ContestantLane,
|
||||
): Benchmark {
|
||||
const durationMs = Math.max(0, endedAt.getTime() - startedAt.getTime());
|
||||
const tokensPerSec =
|
||||
lane === 'local' && costTokens !== null && durationMs > 0
|
||||
? (costTokens / durationMs) * 1000
|
||||
: null;
|
||||
return { durationMs, tokensPerSec };
|
||||
}
|
||||
|
||||
// ─── Slug / path helpers ──────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Sanitize a string for use as a directory name component.
|
||||
* Lowercases, replaces non-alphanumeric runs with '-', trims leading/trailing
|
||||
* dashes, and caps at 64 characters.
|
||||
*/
|
||||
export function sanitizeSlug(s: string): string {
|
||||
return s
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9]+/g, '-')
|
||||
.replace(/^-+|-+$/g, '')
|
||||
.slice(0, 64);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the dated battle slug used as the Arena results folder name.
|
||||
* Format: YYYY-MM-DD-<battleType>-<first-8-hex-of-uuid>
|
||||
* Deterministic: callers can rebuild it from (id, type, created_at) on resume.
|
||||
*/
|
||||
export function buildBattleSlug(battleId: string, battleType: BattleType, createdAt: Date): string {
|
||||
const date = createdAt.toISOString().slice(0, 10);
|
||||
const shortId = battleId.replace(/-/g, '').slice(0, 8);
|
||||
return `${date}-${battleType}-${shortId}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the per-contestant results directory name within a battle folder.
|
||||
* Format: <sanitized-identity>-<sanitized-model>
|
||||
*/
|
||||
export function buildContestantDir(identity: string, model: string): string {
|
||||
return `${sanitizeSlug(identity)}-${sanitizeSlug(model)}`;
|
||||
}
|
||||
|
||||
// ─── Resume reconciliation ────────────────────────────────────────────────────
|
||||
|
||||
export type ContestantResumeAction =
|
||||
| 'keep'
|
||||
| 're-dispatch'
|
||||
| 'mark-done'
|
||||
| 'mark-error'
|
||||
| 'mark-cancelled';
|
||||
|
||||
export interface ContestantResumeDecision {
|
||||
contestantId: string;
|
||||
action: ContestantResumeAction;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decide what to do with ONE contestant during startup resume.
|
||||
* Mirrors reconcileResumeStep from flow-runner-decisions.ts.
|
||||
*
|
||||
* @param status contestants.status
|
||||
* @param taskId contestants.task_id (null when not yet dispatched)
|
||||
* @param taskState tasks.state for taskId, or null if the task row is absent
|
||||
*/
|
||||
export function reconcileContestantResume(
|
||||
status: string,
|
||||
taskId: string | null,
|
||||
taskState: string | null,
|
||||
): ContestantResumeAction {
|
||||
if (status !== 'running') return 'keep';
|
||||
if (!taskId || taskState === null) return 're-dispatch';
|
||||
switch (taskState) {
|
||||
case 'completed': return 'mark-done';
|
||||
case 'failed': return 'mark-error';
|
||||
case 'cancelled': return 'mark-cancelled';
|
||||
case 'pending': return 'keep'; // dispatcher startup poll will run it normally
|
||||
default: return 're-dispatch'; // 'running'/'blocked' — process is dead
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconcile every contestant of an in-flight battle for startup resume.
|
||||
* Returns one decision per contestant. Pure — no IO.
|
||||
*/
|
||||
export function reconcileContestants(
|
||||
contestants: ReadonlyArray<{ contestantId: string; taskId: string | null; status: string }>,
|
||||
taskStates: ReadonlyMap<string, string>,
|
||||
): ContestantResumeDecision[] {
|
||||
return contestants.map((c) => ({
|
||||
contestantId: c.contestantId,
|
||||
action: reconcileContestantResume(
|
||||
c.status,
|
||||
c.taskId,
|
||||
c.taskId ? (taskStates.get(c.taskId) ?? null) : null,
|
||||
),
|
||||
}));
|
||||
}
|
||||
70
apps/coder/src/services/arena-model-call.ts
Normal file
70
apps/coder/src/services/arena-model-call.ts
Normal file
@@ -0,0 +1,70 @@
|
||||
/**
|
||||
* One-shot model completion for the Arena analyzer.
|
||||
*
|
||||
* Calls the local llama-swap server directly for a single non-streaming
|
||||
* completion. Used for the digest and judge stages (always DEFAULT_MODEL)
|
||||
* and for local-model cross-examinations (any local model).
|
||||
*
|
||||
* Mirrors apps/server/src/services/task-model.ts but targets the coder's
|
||||
* config shape and uses a longer timeout appropriate for analysis calls.
|
||||
*/
|
||||
|
||||
import type { Config } from '../config.js';
|
||||
|
||||
const TIMEOUT_MS = 120_000;
|
||||
|
||||
export async function arenaModelCall(opts: {
|
||||
config: Pick<Config, 'LLAMA_SWAP_URL'>;
|
||||
model: string;
|
||||
system: string;
|
||||
user: string;
|
||||
maxTokens?: number;
|
||||
temperature?: number;
|
||||
}): Promise<string> {
|
||||
const { config, model, system, user } = opts;
|
||||
const maxTokens = opts.maxTokens ?? 2_000;
|
||||
const temperature = opts.temperature ?? 0.3;
|
||||
|
||||
const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
messages: [
|
||||
{ role: 'system', content: system },
|
||||
{ role: 'user', content: user },
|
||||
],
|
||||
max_tokens: maxTokens,
|
||||
temperature,
|
||||
stream: false,
|
||||
chat_template_kwargs: { enable_thinking: false },
|
||||
}),
|
||||
signal: AbortSignal.timeout(TIMEOUT_MS),
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
const text = await res.text().catch(() => '');
|
||||
throw new Error(`llama-swap responded ${res.status}: ${text.slice(0, 200)}`);
|
||||
}
|
||||
|
||||
const data = (await res.json()) as {
|
||||
choices?: Array<{
|
||||
message?: { content?: string; reasoning_content?: string };
|
||||
}>;
|
||||
};
|
||||
|
||||
const choice = data.choices?.[0]?.message;
|
||||
if (!choice) return '';
|
||||
|
||||
const content = (choice.content ?? '').trim();
|
||||
if (content.length > 0) return content;
|
||||
|
||||
// For thinking-mode models the answer sometimes only lands in reasoning_content.
|
||||
const reasoning = (choice.reasoning_content ?? '').trim();
|
||||
if (reasoning.length > 0) {
|
||||
const lines = reasoning.split('\n').filter((l) => l.trim().length > 0);
|
||||
return lines[lines.length - 1] ?? '';
|
||||
}
|
||||
|
||||
return '';
|
||||
}
|
||||
895
apps/coder/src/services/arena-runner.ts
Normal file
895
apps/coder/src/services/arena-runner.ts
Normal file
@@ -0,0 +1,895 @@
|
||||
/**
|
||||
* Arena battle-runner — DB-backed execution engine for Arena battles.
|
||||
*
|
||||
* Mirrors flow-runner.ts but implements the Arena's two-lane scheduler instead
|
||||
* of the Orchestrator's wave scheduler. Persists to battles/contestants tables
|
||||
* (not flow_runs/flow_steps). Each contestant is dispatched as a real tasks row
|
||||
* via an injected DispatchContestantFn (Phase 4 wires this to the dispatcher).
|
||||
* Advances on the dispatcher's onTaskTerminal hook.
|
||||
*
|
||||
* Scheduling:
|
||||
* - Cloud lane: all contestants start immediately, in parallel.
|
||||
* - Local lane: contestants run strictly one at a time (serial queue). Only
|
||||
* the first local contestant runs at start; the next is dispatched when the
|
||||
* current one terminates. Both lanes run concurrently with each other.
|
||||
*
|
||||
* Results:
|
||||
* Written to <projectRoot>/Arena/<battleSlug>/<identity>-<model>/
|
||||
* Coding: result.md + diff.patch (from the contestant's worktree).
|
||||
* Q&A: result.md with the text answer.
|
||||
*
|
||||
* Analyzer seam:
|
||||
* onBattleComplete is called when all contestants are terminal. Phase 5 wires
|
||||
* this to the two-stage digest→judge analyzer. A failed contestant does NOT
|
||||
* abort the battle — others continue and the analyzer judges survivors.
|
||||
*/
|
||||
import type { Sql } from '../db.js';
|
||||
import type { Broker } from '@boocode/server/broker';
|
||||
import type { WsFrame } from '@boocode/contracts/ws-frames';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { BattleType, ContestantLane } from '@boocode/contracts/arena';
|
||||
import { mkdir, writeFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import { diffWorktree } from './worktrees.js';
|
||||
import {
|
||||
buildBattleSlug,
|
||||
buildContestantDir,
|
||||
classifyLane,
|
||||
computeBenchmark,
|
||||
isBattleComplete,
|
||||
nextLocalContestant,
|
||||
reconcileContestants,
|
||||
type ContestantResumeAction,
|
||||
type ContestantSlot,
|
||||
} from './arena-decisions.js';
|
||||
|
||||
// ─── Public types ─────────────────────────────────────────────────────────────
|
||||
|
||||
export interface ContestantSpec {
|
||||
/** Backend name (coding) or persona name (qa). */
|
||||
identity: string;
|
||||
model: string;
|
||||
}
|
||||
|
||||
export interface BattleStartOpts {
|
||||
projectId: string;
|
||||
battleType: BattleType;
|
||||
prompt: string;
|
||||
/** 2–6 contestants. Duplicate (identity, model) pairs are rejected by the schema UNIQUE constraint. */
|
||||
contestants: ContestantSpec[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Injected dispatch function — Phase 4 wires this to the real task inserter.
|
||||
* Must INSERT a tasks row and return its id. The arena-runner sets the
|
||||
* contestant's task_id and status after this call.
|
||||
* `sessionId` is returned when already known (Q&A pre-creates the session);
|
||||
* null for coding contestants whose session is created lazily by the dispatcher.
|
||||
*/
|
||||
export type DispatchContestantFn = (opts: {
|
||||
projectId: string;
|
||||
contestantId: string;
|
||||
prompt: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
battleType: BattleType;
|
||||
}) => Promise<{ taskId: string; sessionId: string | null }>;
|
||||
|
||||
/**
|
||||
* Called once when every contestant in a battle has reached a terminal state.
|
||||
* Phase 5 wires this to the two-stage digest→judge analyzer.
|
||||
* Must never throw — the caller swallows errors.
|
||||
*/
|
||||
export type OnBattleComplete = (battleId: string) => void;
|
||||
|
||||
/**
|
||||
* Called after a cross_examinations row has been inserted, with its id.
|
||||
* Phase 5 wires this to the analyzer's cross-examination runner.
|
||||
* Must never throw — the caller swallows errors.
|
||||
*/
|
||||
export type OnCrossExamStart = (opts: {
|
||||
battleId: string;
|
||||
crossExamId: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
}) => void;
|
||||
|
||||
export interface BattleRunner {
|
||||
/** Start a battle: persist it + its contestants, classify lanes, dispatch initial wave. */
|
||||
startBattle(opts: BattleStartOpts): Promise<{ battleId: string }>;
|
||||
/**
|
||||
* Wire to createDispatcher({ onTaskTerminal }). Fires when ANY task settles;
|
||||
* the runner ignores tasks it doesn't own. Never throws.
|
||||
*/
|
||||
handleTaskTerminal(taskId: string, state: string): void;
|
||||
/**
|
||||
* Re-advance any battles still marked 'running' after a coder restart.
|
||||
* Mirrors flow-runner's initResume (D-9). Never throws.
|
||||
*/
|
||||
initResume(): Promise<void>;
|
||||
/**
|
||||
* Cancel a running battle. Marks it and all non-terminal contestants cancelled,
|
||||
* publishes frames, and returns the task_ids of in-flight contestants so the
|
||||
* route can abort them via the dispatcher's cancelExternalTask.
|
||||
*/
|
||||
cancelBattle(battleId: string): Promise<{ cancelled: boolean; taskIds: string[] }>;
|
||||
/**
|
||||
* Trigger analysis for a completed (or manually re-analyzed) battle.
|
||||
* Phase 5 wires this to the two-stage digest→judge analyzer. For now, calls
|
||||
* the injected onBattleComplete seam directly.
|
||||
*/
|
||||
triggerAnalysis(battleId: string): Promise<{ triggered: boolean }>;
|
||||
/**
|
||||
* Start a cross-examination on a battle. Inserts a cross_examinations row and
|
||||
* invokes the analyzer seam. Phase 5 fills the actual verdict logic.
|
||||
*/
|
||||
startCrossExam(
|
||||
battleId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<{ crossExamId: string }>;
|
||||
/**
|
||||
* Manually set (or clear) the winner. Validates the contestant belongs to the
|
||||
* battle, updates battles.winner_contestant_id, and publishes a battle_updated
|
||||
* frame so the pane reflects the override immediately.
|
||||
*/
|
||||
setWinner(battleId: string, winnerId: string | null): Promise<{
|
||||
ok: boolean;
|
||||
notFound?: boolean;
|
||||
invalidContestant?: boolean;
|
||||
}>;
|
||||
}
|
||||
|
||||
// ─── Internal row shapes ──────────────────────────────────────────────────────
|
||||
|
||||
interface ContestantRow {
|
||||
id: string;
|
||||
battle_id: string;
|
||||
identity: string;
|
||||
model: string;
|
||||
lane: ContestantLane;
|
||||
task_id: string | null;
|
||||
worktree_id: string | null;
|
||||
status: string;
|
||||
}
|
||||
|
||||
interface BattleRow {
|
||||
id: string;
|
||||
project_id: string;
|
||||
battle_type: BattleType;
|
||||
prompt: string;
|
||||
status: string;
|
||||
results_path: string | null;
|
||||
created_at: Date;
|
||||
}
|
||||
|
||||
// ─── Deps / factory ───────────────────────────────────────────────────────────
|
||||
|
||||
interface Deps {
|
||||
sql: Sql;
|
||||
broker: Broker;
|
||||
log: FastifyBaseLogger;
|
||||
dispatch: DispatchContestantFn;
|
||||
onBattleComplete: OnBattleComplete;
|
||||
/**
|
||||
* Called after a cross_examinations row is inserted. Phase 5 wires this to
|
||||
* the analyzer's cross-examination runner. Optional: absent → no cross-exam
|
||||
* logic runs (stub behaviour for tests).
|
||||
*/
|
||||
onCrossExamStart?: OnCrossExamStart;
|
||||
/**
|
||||
* Model IDs served by the local llama-swap server. Used for lane classification:
|
||||
* a contestant whose model is in this set runs in the local lane (serial, GPU-fair).
|
||||
* Q&A contestants are always local regardless of this set.
|
||||
* Defaults to an empty set → all coding contestants go to the cloud lane.
|
||||
*/
|
||||
localModels?: ReadonlySet<string>;
|
||||
}
|
||||
|
||||
const DEFAULT_LOCAL_MODELS: ReadonlySet<string> = new Set();
|
||||
|
||||
export function createBattleRunner(deps: Deps): BattleRunner {
|
||||
const { sql, broker, log, dispatch, onBattleComplete, onCrossExamStart } = deps;
|
||||
const localModels = deps.localModels ?? DEFAULT_LOCAL_MODELS;
|
||||
|
||||
// Serialize local-lane advance per battle so two near-simultaneous terminal
|
||||
// callbacks don't double-dispatch the next local contestant.
|
||||
const advanceChain = new Map<string, Promise<void>>();
|
||||
|
||||
// Delta bridge: per-contestant broker unsubscribe functions.
|
||||
// 'terminated' sentinel prevents a late-arriving setupDeltaBridge from
|
||||
// registering a subscription that would never be cleaned up.
|
||||
const deltaUnsubs = new Map<string, (() => void) | 'terminated'>();
|
||||
|
||||
function publishUser(frame: Record<string, unknown>): void {
|
||||
broker.publishUserFrame('default', frame as unknown as WsFrame);
|
||||
}
|
||||
|
||||
/**
|
||||
* Subscribe to the contestant's inference session and forward delta frames
|
||||
* to the user channel as contestant_updated{delta}. Polls for session_id
|
||||
* when not immediately known (coding contestants whose session is created
|
||||
* lazily by the dispatcher). Unsubscribes on termination or max retries.
|
||||
*/
|
||||
async function setupDeltaBridge(
|
||||
battleId: string,
|
||||
contestantId: string,
|
||||
taskId: string,
|
||||
knownSessionId: string | null,
|
||||
): Promise<void> {
|
||||
let sessionId = knownSessionId;
|
||||
if (!sessionId) {
|
||||
// Coding contestant: session_id is written by the dispatcher just before
|
||||
// inference starts. Poll until it appears or the contestant terminates.
|
||||
for (let i = 0; i < 50; i++) {
|
||||
if (deltaUnsubs.get(contestantId) === 'terminated') return;
|
||||
const [row] = await sql<{ session_id: string | null }[]>`
|
||||
SELECT session_id FROM tasks WHERE id = ${taskId}
|
||||
`.catch(() => []);
|
||||
if (row?.session_id) { sessionId = row.session_id; break; }
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
}
|
||||
}
|
||||
if (!sessionId) return;
|
||||
if (deltaUnsubs.get(contestantId) === 'terminated') return;
|
||||
|
||||
const unsub = broker.subscribe(sessionId, (frame) => {
|
||||
if (frame.type === 'delta') {
|
||||
const deltaContent = (frame as unknown as { content?: unknown }).content;
|
||||
if (typeof deltaContent === 'string') {
|
||||
publishUser({
|
||||
type: 'contestant_updated',
|
||||
battle_id: battleId,
|
||||
contestant_id: contestantId,
|
||||
delta: deltaContent,
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const existing = deltaUnsubs.get(contestantId);
|
||||
if (existing === 'terminated') {
|
||||
unsub();
|
||||
} else {
|
||||
deltaUnsubs.set(contestantId, unsub);
|
||||
}
|
||||
}
|
||||
|
||||
function teardownDeltaBridge(contestantId: string): void {
|
||||
const entry = deltaUnsubs.get(contestantId);
|
||||
if (typeof entry === 'function') {
|
||||
entry();
|
||||
deltaUnsubs.delete(contestantId);
|
||||
} else {
|
||||
deltaUnsubs.set(contestantId, 'terminated');
|
||||
}
|
||||
}
|
||||
|
||||
// ─── startBattle ────────────────────────────────────────────────────────────
|
||||
|
||||
async function startBattle(opts: BattleStartOpts): Promise<{ battleId: string }> {
|
||||
if (opts.contestants.length < 2 || opts.contestants.length > 6) {
|
||||
throw new Error(`battle requires 2–6 contestants; got ${opts.contestants.length}`);
|
||||
}
|
||||
|
||||
const [proj] = await sql<{ path: string }[]>`SELECT path FROM projects WHERE id = ${opts.projectId}`;
|
||||
if (!proj) throw new Error(`project not found: ${opts.projectId}`);
|
||||
|
||||
// Insert the battle row as 'running'; update results_path once we have the id.
|
||||
const [battle] = await sql<{ id: string; created_at: Date }[]>`
|
||||
INSERT INTO battles (project_id, battle_type, prompt, status)
|
||||
VALUES (${opts.projectId}, ${opts.battleType}, ${opts.prompt}, 'running')
|
||||
RETURNING id, created_at
|
||||
`;
|
||||
const battleId = battle!.id;
|
||||
const battleSlug = buildBattleSlug(battleId, opts.battleType, battle!.created_at);
|
||||
const resultsPath = join(proj.path, 'Arena', battleSlug);
|
||||
|
||||
await sql`
|
||||
UPDATE battles SET results_path = ${resultsPath}, updated_at = clock_timestamp()
|
||||
WHERE id = ${battleId}
|
||||
`;
|
||||
|
||||
// Insert all contestant rows with lane classification.
|
||||
const contestantRows: Array<{ id: string; identity: string; model: string; lane: ContestantLane }> = [];
|
||||
for (const spec of opts.contestants) {
|
||||
const lane = classifyLane(opts.battleType, spec.identity, spec.model, localModels);
|
||||
const [row] = await sql<{ id: string }[]>`
|
||||
INSERT INTO contestants (battle_id, identity, model, lane, status)
|
||||
VALUES (${battleId}, ${spec.identity}, ${spec.model}, ${lane}, 'queued')
|
||||
RETURNING id
|
||||
`;
|
||||
contestantRows.push({ id: row!.id, identity: spec.identity, model: spec.model, lane });
|
||||
}
|
||||
|
||||
// Write initial manifest so the results folder is always populated.
|
||||
await writeManifest(
|
||||
battleId, resultsPath, opts.battleType, opts.prompt, battle!.created_at,
|
||||
contestantRows.map((c) => ({ identity: c.identity, model: c.model, lane: c.lane })),
|
||||
null,
|
||||
).catch((err) => {
|
||||
log.warn({ err: errMsg(err), battleId }, 'arena-runner: initial manifest write failed');
|
||||
});
|
||||
|
||||
publishUser({
|
||||
type: 'battle_started',
|
||||
battle_id: battleId,
|
||||
battle_type: opts.battleType,
|
||||
prompt: opts.prompt,
|
||||
contestants: contestantRows.map((c) => ({
|
||||
id: c.id,
|
||||
identity: c.identity,
|
||||
model: c.model,
|
||||
lane: c.lane,
|
||||
})),
|
||||
});
|
||||
|
||||
// Dispatch: cloud lane starts all contestants in parallel; local lane starts
|
||||
// only the first queued contestant (serial queue).
|
||||
let localStarted = false;
|
||||
for (const c of contestantRows) {
|
||||
if (c.lane === 'cloud') {
|
||||
await dispatchContestant(battleId, opts.projectId, opts.battleType, opts.prompt, c);
|
||||
} else if (!localStarted) {
|
||||
await dispatchContestant(battleId, opts.projectId, opts.battleType, opts.prompt, c);
|
||||
localStarted = true;
|
||||
// remaining local contestants stay 'queued' until this one finishes
|
||||
}
|
||||
}
|
||||
|
||||
return { battleId };
|
||||
}
|
||||
|
||||
async function dispatchContestant(
|
||||
battleId: string,
|
||||
projectId: string,
|
||||
battleType: BattleType,
|
||||
prompt: string,
|
||||
c: { id: string; identity: string; model: string; lane: ContestantLane },
|
||||
): Promise<void> {
|
||||
const { taskId, sessionId } = await dispatch({
|
||||
projectId,
|
||||
contestantId: c.id,
|
||||
prompt,
|
||||
identity: c.identity,
|
||||
model: c.model,
|
||||
battleType,
|
||||
});
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET task_id = ${taskId}, status = 'running', updated_at = clock_timestamp()
|
||||
WHERE id = ${c.id}
|
||||
`;
|
||||
publishContestantFrame(battleId, c.id, { status: 'running' });
|
||||
// Start the delta bridge in the background; unsubscribe when the contestant
|
||||
// terminates (teardownDeltaBridge called in handleTaskTerminal).
|
||||
void setupDeltaBridge(battleId, c.id, taskId, sessionId ?? null);
|
||||
}
|
||||
|
||||
// ─── local-lane advance (serialized per battle) ───────────────────────────
|
||||
|
||||
function advanceLocalLane(battleId: string): Promise<void> {
|
||||
const prev = advanceChain.get(battleId) ?? Promise.resolve();
|
||||
const next = prev
|
||||
.catch(() => {})
|
||||
.then(() =>
|
||||
advanceLocalLaneInner(battleId).catch((err) => {
|
||||
log.error({ err: errMsg(err), battleId }, 'arena-runner: advanceLocalLane failed');
|
||||
}),
|
||||
);
|
||||
advanceChain.set(battleId, next);
|
||||
void next.finally(() => {
|
||||
if (advanceChain.get(battleId) === next) advanceChain.delete(battleId);
|
||||
});
|
||||
return next;
|
||||
}
|
||||
|
||||
async function advanceLocalLaneInner(battleId: string): Promise<void> {
|
||||
const battle = await loadBattle(battleId);
|
||||
if (!battle || battle.status !== 'running') return;
|
||||
|
||||
const contestants = await loadContestants(battleId);
|
||||
const slots: ContestantSlot[] = contestants.map((c) => ({
|
||||
id: c.id,
|
||||
lane: c.lane,
|
||||
status: c.status,
|
||||
}));
|
||||
|
||||
// Nothing to do if the local lane is still busy.
|
||||
const localRunning = slots.some((c) => c.lane === 'local' && c.status === 'running');
|
||||
if (localRunning) return;
|
||||
|
||||
const nextId = nextLocalContestant(slots);
|
||||
if (!nextId) return; // local queue is exhausted
|
||||
|
||||
const next = contestants.find((c) => c.id === nextId)!;
|
||||
await dispatchContestant(battleId, battle.project_id, battle.battle_type, battle.prompt, {
|
||||
id: next.id,
|
||||
identity: next.identity,
|
||||
model: next.model,
|
||||
lane: next.lane,
|
||||
});
|
||||
}
|
||||
|
||||
// ─── handleTaskTerminal ───────────────────────────────────────────────────
|
||||
|
||||
function handleTaskTerminal(taskId: string, state: string): void {
|
||||
void (async () => {
|
||||
// Look up which contestant owns this task (contestants_task_id_idx).
|
||||
const [row] = await sql<ContestantRow[]>`
|
||||
SELECT id, battle_id, identity, model, lane, task_id, worktree_id, status
|
||||
FROM contestants WHERE task_id = ${taskId}
|
||||
`;
|
||||
if (!row) return; // not an arena task — ignore
|
||||
if (row.status !== 'running') return; // already settled (idempotent)
|
||||
|
||||
const battle = await loadBattle(row.battle_id);
|
||||
|
||||
// Pull the task row for benchmark + output.
|
||||
const [task] = await sql<{
|
||||
chat_id: string | null;
|
||||
started_at: Date | null;
|
||||
ended_at: Date | null;
|
||||
cost_tokens: number | null;
|
||||
}[]>`SELECT chat_id, started_at, ended_at, cost_tokens FROM tasks WHERE id = ${taskId}`;
|
||||
|
||||
const endedAt = task?.ended_at ?? new Date();
|
||||
|
||||
if (state === 'completed') {
|
||||
const startedAt = task?.started_at ?? endedAt;
|
||||
const bench = computeBenchmark(startedAt, endedAt, task?.cost_tokens ?? null, row.lane);
|
||||
|
||||
const output = task?.chat_id ? await readChatOutput(task.chat_id) : '';
|
||||
|
||||
const resultPath = battle
|
||||
? await writeContestantResults(battle, row, output, bench).catch((err) => {
|
||||
log.warn({ err: errMsg(err), contestantId: row.id }, 'arena-runner: result write failed');
|
||||
return null;
|
||||
})
|
||||
: null;
|
||||
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET status = 'done',
|
||||
duration_ms = ${Math.round(bench.durationMs)},
|
||||
tokens_per_sec = ${bench.tokensPerSec},
|
||||
cost_tokens = ${task?.cost_tokens ?? null},
|
||||
result_path = ${resultPath},
|
||||
updated_at = clock_timestamp()
|
||||
WHERE id = ${row.id} AND status = 'running'
|
||||
`;
|
||||
teardownDeltaBridge(row.id);
|
||||
|
||||
// Check if this was the last contestant.
|
||||
const allContestants = await loadContestants(row.battle_id);
|
||||
const battleDone = isBattleComplete(allContestants);
|
||||
|
||||
publishContestantFrame(row.battle_id, row.id, {
|
||||
status: 'done',
|
||||
duration_ms: Math.round(bench.durationMs),
|
||||
...(bench.tokensPerSec !== null ? { tokens_per_sec: bench.tokensPerSec } : {}),
|
||||
...(battleDone ? { battle_status: 'completed' } : {}),
|
||||
});
|
||||
|
||||
if (battleDone) {
|
||||
await completeBattle(row.battle_id);
|
||||
} else if (row.lane === 'local') {
|
||||
void advanceLocalLane(row.battle_id);
|
||||
}
|
||||
} else {
|
||||
// failed or cancelled — the contest continues; this contestant is error.
|
||||
const errorMsg = state === 'cancelled' ? 'cancelled' : `task ${state}`;
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET status = 'error', error = ${errorMsg}, updated_at = clock_timestamp()
|
||||
WHERE id = ${row.id} AND status = 'running'
|
||||
`;
|
||||
teardownDeltaBridge(row.id);
|
||||
|
||||
const allContestants = await loadContestants(row.battle_id);
|
||||
const battleDone = isBattleComplete(allContestants);
|
||||
|
||||
publishContestantFrame(row.battle_id, row.id, {
|
||||
status: 'error',
|
||||
error: errorMsg,
|
||||
...(battleDone ? { battle_status: 'completed' } : {}),
|
||||
});
|
||||
|
||||
if (battleDone) {
|
||||
await completeBattle(row.battle_id);
|
||||
} else if (row.lane === 'local') {
|
||||
void advanceLocalLane(row.battle_id);
|
||||
}
|
||||
}
|
||||
})().catch((err) => {
|
||||
log.error({ err: errMsg(err), taskId }, 'arena-runner: handleTaskTerminal failed');
|
||||
});
|
||||
}
|
||||
|
||||
// ─── battle finalization ──────────────────────────────────────────────────
|
||||
|
||||
async function completeBattle(battleId: string): Promise<void> {
|
||||
const updated = await sql`
|
||||
UPDATE battles SET status = 'completed', updated_at = clock_timestamp()
|
||||
WHERE id = ${battleId} AND status = 'running'
|
||||
`;
|
||||
if (updated.count === 0) return; // already terminal (race guard)
|
||||
log.info({ battleId }, 'arena-runner: battle completed');
|
||||
|
||||
// Update manifest with finished_at timestamp.
|
||||
const completedBattle = await loadBattle(battleId);
|
||||
if (completedBattle?.results_path) {
|
||||
const contestants = await loadContestants(battleId);
|
||||
await writeManifest(
|
||||
battleId,
|
||||
completedBattle.results_path,
|
||||
completedBattle.battle_type,
|
||||
completedBattle.prompt,
|
||||
completedBattle.created_at,
|
||||
contestants.map((c) => ({ identity: c.identity, model: c.model, lane: c.lane })),
|
||||
new Date(),
|
||||
).catch((err) => {
|
||||
log.warn({ err: errMsg(err), battleId }, 'arena-runner: manifest update failed');
|
||||
});
|
||||
}
|
||||
|
||||
onBattleComplete(battleId);
|
||||
}
|
||||
|
||||
// ─── manifest writer ─────────────────────────────────────────────────────
|
||||
|
||||
async function writeManifest(
|
||||
battleId: string,
|
||||
resultsPath: string,
|
||||
battleType: BattleType,
|
||||
prompt: string,
|
||||
createdAt: Date,
|
||||
contestants: Array<{ identity: string; model: string; lane: ContestantLane }>,
|
||||
finishedAt: Date | null,
|
||||
): Promise<void> {
|
||||
await mkdir(resultsPath, { recursive: true });
|
||||
const manifest = {
|
||||
id: battleId,
|
||||
battle_type: battleType,
|
||||
prompt,
|
||||
contestants,
|
||||
created_at: createdAt.toISOString(),
|
||||
finished_at: finishedAt?.toISOString() ?? null,
|
||||
};
|
||||
await writeFile(join(resultsPath, 'manifest.json'), JSON.stringify(manifest, null, 2), 'utf8');
|
||||
}
|
||||
|
||||
// ─── results writer ───────────────────────────────────────────────────────
|
||||
|
||||
async function writeContestantResults(
|
||||
battle: BattleRow,
|
||||
contestant: { identity: string; model: string; lane: ContestantLane; worktree_id: string | null },
|
||||
output: string,
|
||||
bench: { durationMs: number; tokensPerSec: number | null },
|
||||
): Promise<string> {
|
||||
const resultsPath = await getOrBuildResultsPath(battle);
|
||||
if (!resultsPath) throw new Error('cannot resolve results path for battle ' + battle.id);
|
||||
|
||||
const contestantDir = buildContestantDir(contestant.identity, contestant.model);
|
||||
const dir = join(resultsPath, contestantDir);
|
||||
await mkdir(dir, { recursive: true });
|
||||
|
||||
const benchLines = [
|
||||
`duration: ${bench.durationMs}ms`,
|
||||
bench.tokensPerSec != null ? `tokens/sec: ${bench.tokensPerSec.toFixed(1)}` : null,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('\n');
|
||||
|
||||
const resultMd =
|
||||
`# ${contestant.identity} / ${contestant.model}\n\n` +
|
||||
`## Benchmark\n\n${benchLines}\n\n` +
|
||||
`## Output\n\n${output}\n`;
|
||||
await writeFile(join(dir, 'result.md'), resultMd, 'utf8');
|
||||
|
||||
if (battle.battle_type === 'coding' && contestant.worktree_id) {
|
||||
const [wt] = await sql<{ path: string; base_commit: string | null }[]>`
|
||||
SELECT path, base_commit FROM worktrees WHERE id = ${contestant.worktree_id}
|
||||
`;
|
||||
if (wt) {
|
||||
const [proj] = await sql<{ path: string }[]>`
|
||||
SELECT path FROM projects WHERE id = ${battle.project_id}
|
||||
`;
|
||||
if (proj) {
|
||||
const diff = await diffWorktree(wt.path, proj.path, {
|
||||
baseRef: wt.base_commit ?? undefined,
|
||||
}).catch(() => '');
|
||||
await writeFile(join(dir, 'diff.patch'), diff, 'utf8');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return dir;
|
||||
}
|
||||
|
||||
/** Resolve or rebuild results_path for a battle (handles crash-before-UPDATE). */
|
||||
async function getOrBuildResultsPath(battle: BattleRow): Promise<string | null> {
|
||||
if (battle.results_path) return battle.results_path;
|
||||
const [proj] = await sql<{ path: string }[]>`SELECT path FROM projects WHERE id = ${battle.project_id}`;
|
||||
if (!proj) return null;
|
||||
const slug = buildBattleSlug(battle.id, battle.battle_type, battle.created_at);
|
||||
const resultsPath = join(proj.path, 'Arena', slug);
|
||||
await sql`
|
||||
UPDATE battles SET results_path = ${resultsPath}, updated_at = clock_timestamp()
|
||||
WHERE id = ${battle.id}
|
||||
`;
|
||||
return resultsPath;
|
||||
}
|
||||
|
||||
// ─── helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
async function readChatOutput(chatId: string): Promise<string> {
|
||||
const [m] = await sql<{ content: string | null }[]>`
|
||||
SELECT content FROM messages
|
||||
WHERE chat_id = ${chatId} AND role = 'assistant'
|
||||
ORDER BY created_at DESC LIMIT 1
|
||||
`;
|
||||
return m?.content ?? '';
|
||||
}
|
||||
|
||||
async function loadBattle(battleId: string): Promise<BattleRow | null> {
|
||||
const [b] = await sql<BattleRow[]>`
|
||||
SELECT id, project_id, battle_type, prompt, status, results_path, created_at
|
||||
FROM battles WHERE id = ${battleId}
|
||||
`;
|
||||
return b ?? null;
|
||||
}
|
||||
|
||||
async function loadContestants(battleId: string): Promise<ContestantRow[]> {
|
||||
return sql<ContestantRow[]>`
|
||||
SELECT id, battle_id, identity, model, lane, task_id, worktree_id, status
|
||||
FROM contestants WHERE battle_id = ${battleId}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
}
|
||||
|
||||
function publishContestantFrame(
|
||||
battleId: string,
|
||||
contestantId: string,
|
||||
extra: Record<string, unknown>,
|
||||
): void {
|
||||
publishUser({
|
||||
type: 'contestant_updated',
|
||||
battle_id: battleId,
|
||||
contestant_id: contestantId,
|
||||
...extra,
|
||||
});
|
||||
}
|
||||
|
||||
// ─── initResume ───────────────────────────────────────────────────────────
|
||||
|
||||
async function initResume(): Promise<void> {
|
||||
const battles = await sql<BattleRow[]>`
|
||||
SELECT id, project_id, battle_type, prompt, status, results_path, created_at
|
||||
FROM battles WHERE status = 'running'
|
||||
`;
|
||||
if (battles.length === 0) return;
|
||||
log.info({ count: battles.length }, 'arena-runner: resuming in-flight battles on startup');
|
||||
for (const battle of battles) {
|
||||
await resumeBattle(battle).catch((err) => {
|
||||
log.error({ err: errMsg(err), battleId: battle.id }, 'arena-runner: initResume failed for battle');
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async function resumeBattle(battle: BattleRow): Promise<void> {
|
||||
const contestants = await loadContestants(battle.id);
|
||||
|
||||
const taskIds = contestants.map((c) => c.task_id).filter((id): id is string => id !== null);
|
||||
const taskStates = new Map<string, string>();
|
||||
if (taskIds.length > 0) {
|
||||
const tasks = await sql<{ id: string; state: string }[]>`
|
||||
SELECT id, state FROM tasks WHERE id = ANY(${taskIds})
|
||||
`;
|
||||
for (const t of tasks) taskStates.set(t.id, t.state);
|
||||
}
|
||||
|
||||
const decisions = reconcileContestants(
|
||||
contestants.map((c) => ({ contestantId: c.id, taskId: c.task_id, status: c.status })),
|
||||
taskStates,
|
||||
);
|
||||
|
||||
for (const decision of decisions) {
|
||||
if (decision.action === 'keep') continue;
|
||||
const contestant = contestants.find((c) => c.id === decision.contestantId)!;
|
||||
await applyResumeDecision(battle, contestant, decision.action);
|
||||
}
|
||||
|
||||
// Re-check completion after applying decisions.
|
||||
const updated = await loadContestants(battle.id);
|
||||
if (isBattleComplete(updated)) {
|
||||
await completeBattle(battle.id);
|
||||
} else {
|
||||
// Advance local lane in case a slot opened up.
|
||||
void advanceLocalLane(battle.id);
|
||||
}
|
||||
|
||||
log.info({ battleId: battle.id }, 'arena-runner: battle resumed');
|
||||
}
|
||||
|
||||
async function applyResumeDecision(
|
||||
battle: BattleRow,
|
||||
contestant: ContestantRow,
|
||||
action: ContestantResumeAction,
|
||||
): Promise<void> {
|
||||
switch (action) {
|
||||
case 'keep': break;
|
||||
|
||||
case 'mark-done': {
|
||||
const taskRow = contestant.task_id
|
||||
? (await sql<{ started_at: Date | null; ended_at: Date | null; cost_tokens: number | null; chat_id: string | null }[]>`
|
||||
SELECT started_at, ended_at, cost_tokens, chat_id FROM tasks WHERE id = ${contestant.task_id}`)[0]
|
||||
: null;
|
||||
const endedAt = taskRow?.ended_at ?? new Date();
|
||||
const startedAt = taskRow?.started_at ?? endedAt;
|
||||
const bench = computeBenchmark(startedAt, endedAt, taskRow?.cost_tokens ?? null, contestant.lane);
|
||||
const output = taskRow?.chat_id ? await readChatOutput(taskRow.chat_id) : '';
|
||||
const resultPath = battle
|
||||
? await writeContestantResults(battle, contestant, output, bench).catch((err) => {
|
||||
log.warn({ err: errMsg(err), contestantId: contestant.id }, 'arena-runner: resume result write failed');
|
||||
return null;
|
||||
})
|
||||
: null;
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET status = 'done',
|
||||
duration_ms = ${Math.round(bench.durationMs)},
|
||||
tokens_per_sec = ${bench.tokensPerSec},
|
||||
result_path = ${resultPath},
|
||||
updated_at = clock_timestamp()
|
||||
WHERE id = ${contestant.id}
|
||||
`;
|
||||
break;
|
||||
}
|
||||
|
||||
case 'mark-error':
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET status = 'error', error = 'task failed before callback',
|
||||
updated_at = clock_timestamp()
|
||||
WHERE id = ${contestant.id}
|
||||
`;
|
||||
break;
|
||||
|
||||
case 'mark-cancelled':
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET status = 'error', error = 'cancelled before callback',
|
||||
updated_at = clock_timestamp()
|
||||
WHERE id = ${contestant.id}
|
||||
`;
|
||||
break;
|
||||
|
||||
case 're-dispatch': {
|
||||
const { taskId } = await dispatch({
|
||||
projectId: battle.project_id,
|
||||
contestantId: contestant.id,
|
||||
prompt: battle.prompt,
|
||||
identity: contestant.identity,
|
||||
model: contestant.model,
|
||||
battleType: battle.battle_type,
|
||||
});
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET task_id = ${taskId}, updated_at = clock_timestamp()
|
||||
WHERE id = ${contestant.id}
|
||||
`;
|
||||
log.info(
|
||||
{ battleId: battle.id, contestantId: contestant.id, taskId },
|
||||
'arena-runner: contestant re-dispatched on resume',
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ─── cancelBattle ─────────────────────────────────────────────────────────
|
||||
|
||||
async function cancelBattle(battleId: string): Promise<{ cancelled: boolean; taskIds: string[] }> {
|
||||
const updated = await sql`
|
||||
UPDATE battles SET status = 'cancelled', updated_at = clock_timestamp()
|
||||
WHERE id = ${battleId} AND status = 'running'
|
||||
`;
|
||||
if (updated.count === 0) return { cancelled: false, taskIds: [] };
|
||||
|
||||
// Mark all non-terminal contestants cancelled and collect in-flight task_ids.
|
||||
const contestants = await sql<{ id: string; task_id: string | null; status: string }[]>`
|
||||
SELECT id, task_id, status FROM contestants
|
||||
WHERE battle_id = ${battleId} AND status NOT IN ('done', 'error')
|
||||
`;
|
||||
|
||||
if (contestants.length > 0) {
|
||||
await sql`
|
||||
UPDATE contestants
|
||||
SET status = 'error', error = 'battle cancelled', updated_at = clock_timestamp()
|
||||
WHERE battle_id = ${battleId} AND status NOT IN ('done', 'error')
|
||||
`;
|
||||
for (const c of contestants) {
|
||||
publishContestantFrame(battleId, c.id, {
|
||||
status: 'error',
|
||||
error: 'battle cancelled',
|
||||
battle_status: 'cancelled',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const taskIds = contestants
|
||||
.filter(
|
||||
(c): c is typeof c & { task_id: string } =>
|
||||
c.task_id !== null && c.status === 'running',
|
||||
)
|
||||
.map((c) => c.task_id);
|
||||
|
||||
log.info({ battleId }, 'arena-runner: battle cancelled by request');
|
||||
return { cancelled: true, taskIds };
|
||||
}
|
||||
|
||||
// ─── triggerAnalysis (Phase 5 seam) ──────────────────────────────────────
|
||||
|
||||
async function triggerAnalysis(battleId: string): Promise<{ triggered: boolean }> {
|
||||
const battle = await loadBattle(battleId);
|
||||
if (!battle) return { triggered: false };
|
||||
log.info({ battleId }, 'arena-runner: triggerAnalysis requested');
|
||||
// Calls the injected onBattleComplete seam — Phase 5 replaces this with the
|
||||
// real two-stage digest→judge analyzer (see ADR 0002 + plan Phase 5).
|
||||
onBattleComplete(battleId);
|
||||
return { triggered: true };
|
||||
}
|
||||
|
||||
// ─── startCrossExam (Phase 5 seam) ───────────────────────────────────────
|
||||
|
||||
async function startCrossExam(
|
||||
battleId: string,
|
||||
opts: { identity: string; model: string },
|
||||
): Promise<{ crossExamId: string }> {
|
||||
const [row] = await sql<{ id: string }[]>`
|
||||
INSERT INTO cross_examinations (battle_id, identity, model)
|
||||
VALUES (${battleId}, ${opts.identity}, ${opts.model})
|
||||
RETURNING id
|
||||
`;
|
||||
const crossExamId = row!.id;
|
||||
log.info({ battleId, crossExamId, ...opts }, 'arena-runner: cross-exam inserted, triggering analyzer');
|
||||
if (onCrossExamStart) {
|
||||
try {
|
||||
onCrossExamStart({ battleId, crossExamId, identity: opts.identity, model: opts.model });
|
||||
} catch (err) {
|
||||
log.error({ err: err instanceof Error ? err.message : String(err), battleId, crossExamId }, 'arena-runner: onCrossExamStart threw');
|
||||
}
|
||||
}
|
||||
return { crossExamId };
|
||||
}
|
||||
|
||||
// ─── setWinner (user override) ────────────────────────────────────────────
|
||||
|
||||
async function setWinner(
|
||||
battleId: string,
|
||||
winnerId: string | null,
|
||||
): Promise<{ ok: boolean; notFound?: boolean; invalidContestant?: boolean }> {
|
||||
const [row] = await sql<{ id: string }[]>`SELECT id FROM battles WHERE id = ${battleId}`;
|
||||
if (!row) return { ok: false, notFound: true };
|
||||
|
||||
if (winnerId !== null) {
|
||||
const [c] = await sql<{ id: string }[]>`
|
||||
SELECT id FROM contestants WHERE id = ${winnerId} AND battle_id = ${battleId}
|
||||
`;
|
||||
if (!c) return { ok: false, invalidContestant: true };
|
||||
}
|
||||
|
||||
await sql`
|
||||
UPDATE battles SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp()
|
||||
WHERE id = ${battleId}
|
||||
`;
|
||||
publishUser({ type: 'battle_updated', battle_id: battleId, winner_contestant_id: winnerId });
|
||||
return { ok: true };
|
||||
}
|
||||
|
||||
return { startBattle, handleTaskTerminal, initResume, cancelBattle, triggerAnalysis, startCrossExam, setWinner };
|
||||
}
|
||||
|
||||
function errMsg(e: unknown): string {
|
||||
return e instanceof Error ? e.message : String(e);
|
||||
}
|
||||
@@ -310,6 +310,9 @@ export function createDispatcher(deps: Deps): {
|
||||
const taskId = task.id;
|
||||
log.info({ taskId }, 'dispatcher: starting task (path A — native)');
|
||||
|
||||
// Declared before try so the catch block can write it back on the task row.
|
||||
let chatId: string | null = null;
|
||||
|
||||
try {
|
||||
// Mark running
|
||||
await sql`
|
||||
@@ -318,26 +321,29 @@ export function createDispatcher(deps: Deps): {
|
||||
WHERE id = ${taskId}
|
||||
`;
|
||||
|
||||
// Create session + chat for this task
|
||||
// Session setup: reuse a pre-created session (e.g. Q&A arena contestants
|
||||
// whose persona is stamped on the session via agent_id) or create a fresh one.
|
||||
const model = task.model ?? config.DEFAULT_MODEL;
|
||||
const sessionName = 'Task: ' + task.input.slice(0, 40);
|
||||
|
||||
const [session] = await sql<{ id: string }[]>`
|
||||
INSERT INTO sessions (project_id, name, model, status)
|
||||
VALUES (${task.project_id}, ${sessionName}, ${model}, 'open')
|
||||
RETURNING id
|
||||
`;
|
||||
const sessionId = session!.id;
|
||||
let sessionId: string;
|
||||
if (task.session_id) {
|
||||
sessionId = task.session_id;
|
||||
} else {
|
||||
const sessionName = 'Task: ' + task.input.slice(0, 40);
|
||||
const [session] = await sql<{ id: string }[]>`
|
||||
INSERT INTO sessions (project_id, name, model, status)
|
||||
VALUES (${task.project_id}, ${sessionName}, ${model}, 'open')
|
||||
RETURNING id
|
||||
`;
|
||||
sessionId = session!.id;
|
||||
await sql`UPDATE tasks SET session_id = ${sessionId} WHERE id = ${taskId}`;
|
||||
}
|
||||
|
||||
const [chat] = await sql<{ id: string }[]>`
|
||||
INSERT INTO chats (session_id, name, status)
|
||||
VALUES (${sessionId}, 'Task execution', 'open')
|
||||
RETURNING id
|
||||
`;
|
||||
const chatId = chat!.id;
|
||||
|
||||
// Link task to session
|
||||
await sql`UPDATE tasks SET session_id = ${sessionId} WHERE id = ${taskId}`;
|
||||
chatId = chat!.id;
|
||||
|
||||
// Create user message + streaming assistant
|
||||
await sql<{ id: string }[]>`
|
||||
@@ -382,7 +388,7 @@ export function createDispatcher(deps: Deps): {
|
||||
const summary = (msg?.content ?? '').slice(0, 500);
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET state = 'completed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}
|
||||
SET state = 'completed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}, chat_id = ${chatId}
|
||||
WHERE id = ${taskId}
|
||||
`;
|
||||
log.info({ taskId, costTokens }, 'dispatcher: task completed (native)');
|
||||
@@ -409,7 +415,7 @@ export function createDispatcher(deps: Deps): {
|
||||
const summary = (msg?.content ?? 'Inference failed').slice(0, 500);
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${summary}, cost_tokens = ${costTokens}, chat_id = ${chatId}
|
||||
WHERE id = ${taskId}
|
||||
`;
|
||||
log.warn({ taskId, finalStatus }, 'dispatcher: task failed (native)');
|
||||
@@ -419,7 +425,7 @@ export function createDispatcher(deps: Deps): {
|
||||
log.error({ taskId, err: errMsg }, 'dispatcher: task error (native)');
|
||||
await sql`
|
||||
UPDATE tasks
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}
|
||||
SET state = 'failed', ended_at = clock_timestamp(), output_summary = ${errMsg.slice(0, 500)}, chat_id = ${chatId}
|
||||
WHERE id = ${taskId}
|
||||
`.catch(() => {});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user