Arena is a new pane kind for competitive AI evaluation. A Battle runs the same prompt against 2-6 Contestants across two concurrent lanes: local lane (llama-swap models, serial) and cloud lane (parallel). Added to all three registries: @boocode/contracts WsFrameSchema, server InferenceFrame, and web WsFrame. Backend (apps/coder): - arena-runner: battle scheduler, lane classifier, benchmark, results writer, resume, user winner override - arena-analyzer: two-stage digest→judge analysis on DEFAULT_MODEL - arena-decisions: status transitions and resume logic (unit-tested) - arena-analyzer-helpers: pure helper functions (unit-tested) - arena-model-call: model call utility for analysis - arena routes: create/get/list/stop/analyze/cross-examine/winner/diff - schema: battles, contestants, cross_examinations tables (idempotent) - remove old /api/arena* routes and tasks.arena_id column Frontend (apps/web): - ArenaLauncherDialog: battle type, prompt, contestant selection - ArenaPane: live roster, streaming output, analysis, cross-exam - DiffView: unified diff with line-by-line color for coding contests - Winner override per-row dropdown (Trophy icon) - battle_updated WS handler for live winner/analysis updates - arena pane kind in Workspace, ChatTabBar, useSidebar Cross-app: - ArenaState and ArenaContestantShape/WsFrame types (contracts) - battle_* frames in WsFrameSchema, InferenceFrame, and web WsFrame - manifest.json written per battle results folder - /Arena added to .gitignore Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
71 lines
2.1 KiB
TypeScript
71 lines
2.1 KiB
TypeScript
/**
 * One-shot model completion for the Arena analyzer.
 *
 * Calls the local llama-swap server directly for a single non-streaming
 * completion. Used for the digest and judge stages (always DEFAULT_MODEL)
 * and for local-model cross-examinations (any local model).
 *
 * Mirrors apps/server/src/services/task-model.ts but targets the coder's
 * config shape and uses a longer timeout appropriate for analysis calls.
 */
|
|
|
|
import type { Config } from '../config.js';
|
|
|
|
// Per-request timeout in milliseconds (120 s); handed to AbortSignal.timeout
// for the completion call made in this module.
const TIMEOUT_MS = 120_000;
|
|
|
|
/**
 * Sends a single non-streaming chat completion request to llama-swap and
 * returns the model's answer as trimmed text.
 *
 * @param opts.config - Supplies `LLAMA_SWAP_URL`, the server base URL. Trailing
 *   slashes are stripped before the endpoint path is appended.
 * @param opts.model - Model name placed in the request body.
 * @param opts.system - Content of the system message.
 * @param opts.user - Content of the user message.
 * @param opts.maxTokens - Sent as `max_tokens`; defaults to 2000.
 * @param opts.temperature - Sampling temperature; defaults to 0.3.
 * @returns The trimmed `content` of the first choice. When that is empty, the
 *   last non-blank line (trimmed) of `reasoning_content`. Otherwise `''`.
 * @throws Error when the server answers with a non-OK status; the message
 *   carries the status code and the first 200 characters of the body.
 *   Rejections from `fetch` (network failure, timeout abort) and from JSON
 *   parsing propagate unchanged.
 */
export async function arenaModelCall(opts: {
  config: Pick<Config, 'LLAMA_SWAP_URL'>;
  model: string;
  system: string;
  user: string;
  maxTokens?: number;
  temperature?: number;
}): Promise<string> {
  const { config, model, system, user } = opts;
  const maxTokens = opts.maxTokens ?? 2_000;
  const temperature = opts.temperature ?? 0.3;

  // A base URL configured as "http://host/" must not yield "http://host//v1/...".
  const baseUrl = `${config.LLAMA_SWAP_URL}`.replace(/\/+$/, '');

  const res = await fetch(`${baseUrl}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages: [
        { role: 'system', content: system },
        { role: 'user', content: user },
      ],
      max_tokens: maxTokens,
      temperature,
      stream: false,
      // Ask the chat template to skip thinking output.
      chat_template_kwargs: { enable_thinking: false },
    }),
    signal: AbortSignal.timeout(TIMEOUT_MS),
  });

  if (!res.ok) {
    // Best effort: an unreadable error body must not mask the status code.
    const text = await res.text().catch(() => '');
    throw new Error(`llama-swap responded ${res.status}: ${text.slice(0, 200)}`);
  }

  // The payload is external input: field types are checked at runtime below
  // rather than trusted, so unexpected shapes degrade to '' instead of throwing.
  const data = (await res.json()) as {
    choices?: Array<{
      message?: { content?: unknown; reasoning_content?: unknown };
    }>;
  } | null;

  const message = data?.choices?.[0]?.message;
  if (!message) return '';

  const content = typeof message.content === 'string' ? message.content.trim() : '';
  if (content.length > 0) return content;

  // For thinking-mode models the answer sometimes only lands in reasoning_content.
  const reasoning =
    typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
  const lines = reasoning
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  // Take the final non-blank line, already trimmed like the content path.
  return lines[lines.length - 1] ?? '';
}
|