/**
 * One-shot model completion for the Arena analyzer.
 *
 * Calls the local llama-swap server directly for a single non-streaming
 * completion. Used for the digest and judge stages (always DEFAULT_MODEL)
 * and for local-model cross-examinations (any local model).
 *
 * Mirrors apps/server/src/services/task-model.ts but targets the coder's
 * config shape and uses a longer timeout appropriate for analysis calls.
 */

import type { Config } from '../config.js';

/** Upper bound for a single completion request, in milliseconds. */
const TIMEOUT_MS = 120_000;

/**
 * Sends one system + user prompt pair to `${config.LLAMA_SWAP_URL}/v1/chat/completions`
 * and returns the text of the first choice.
 *
 * @param opts.config - Only `LLAMA_SWAP_URL` is read; it is used as the base URL.
 * @param opts.model - Model identifier forwarded verbatim in the request body.
 * @param opts.system - Content of the `system` message.
 * @param opts.user - Content of the `user` message.
 * @param opts.maxTokens - Sent as `max_tokens`; defaults to 2000.
 * @param opts.temperature - Sampling temperature; defaults to 0.3.
 * @returns The trimmed `content` of the first choice. When that is empty, the
 *   last non-empty line of `reasoning_content` (trimmed). When neither is
 *   available, the empty string.
 * @throws Error when the server answers with a non-2xx status; the message
 *   carries the status code and the first 200 characters of the response body.
 *   Network failures, the timeout abort and JSON parse failures propagate
 *   unchanged from `fetch` / `res.json()`.
 */
export async function arenaModelCall(opts: {
  config: Pick<Config, 'LLAMA_SWAP_URL'>;
  model: string;
  system: string;
  user: string;
  maxTokens?: number;
  temperature?: number;
}): Promise<string> {
  const { config, model, system, user } = opts;
  const maxTokens = opts.maxTokens ?? 2_000;
  const temperature = opts.temperature ?? 0.3;

  const res = await fetch(`${config.LLAMA_SWAP_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model,
      messages: [
        { role: 'system', content: system },
        { role: 'user', content: user },
      ],
      max_tokens: maxTokens,
      temperature,
      stream: false,
      // Ask the chat template to skip the thinking phase; whether this is
      // honoured depends on the model's template.
      chat_template_kwargs: { enable_thinking: false },
    }),
    signal: AbortSignal.timeout(TIMEOUT_MS),
  });

  if (!res.ok) {
    // Best effort: an unreadable error body must not mask the status code.
    const text = await res.text().catch(() => '');
    throw new Error(`llama-swap responded ${res.status}: ${text.slice(0, 200)}`);
  }

  // The payload is external input, so the text fields are typed `unknown`
  // and narrowed at runtime instead of being trusted to be strings.
  const data = (await res.json()) as {
    choices?: Array<{
      message?: { content?: unknown; reasoning_content?: unknown };
    }>;
  } | null;

  const message = data?.choices?.[0]?.message;
  if (!message) return '';

  const asTrimmedText = (value: unknown): string =>
    typeof value === 'string' ? value.trim() : '';

  const content = asTrimmedText(message.content);
  if (content.length > 0) return content;

  // For thinking-mode models the answer sometimes only lands in reasoning_content.
  const reasoning = asTrimmedText(message.reasoning_content);
  if (reasoning.length > 0) {
    const lines = reasoning.split('\n').filter((l) => l.trim().length > 0);
    // Trim the chosen line so this path matches the trimmed `content` path.
    return (lines[lines.length - 1] ?? '').trim();
  }

  return '';
}