// boocode/apps/coder/src/services/arena-analyzer.ts

/**
 * Arena Analyzer — pluggable seam for battle analysis and cross-examination.
 *
 * The Analyzer interface is the plug point: a v2 Han Orchestrator flow can
 * replace the v1 two-stage digest→judge implementation without a schema change.
 *
 * v1 implementation uses DEFAULT_MODEL via direct llama-swap calls (arenaModelCall):
 *   Digest stage  — one call per succeeded contestant, concurrent; produces a
 *                   bounded summary of each result (result.md + diff.patch for
 *                   coding, result.md for Q&A).
 *   Judge stage   — one call with all digests + the original prompt; writes
 *                   analysis.md, names a winner (unless < 2 succeeded), and
 *                   updates battles.winner_contestant_id.
 *
 * Cross-examination:
 *   Local model   — direct arenaModelCall to llama-swap with the chosen model.
 *   Cloud model   — inserts a tasks row (triggers the dispatcher via pg_notify);
 *                   polls for completion; reads output_summary as the verdict.
 *   In both cases the verdict is written to cross_examinations.verdict, appended
 *   to <resultsPath>/cross-exam.md, and a battle_updated frame is published.
 *
 * Never throws — all errors are caught, logged, and swallowed so the caller
 * (arena-runner's onBattleComplete / onCrossExamStart) is never wedged.
 */

import { readFile, writeFile, mkdir } from 'node:fs/promises';
import { join } from 'node:path';
import type { Sql } from '../db.js';
import type { Broker } from '@boocode/server/broker';
import type { WsFrame } from '@boocode/contracts/ws-frames';
import type { FastifyBaseLogger } from 'fastify';
import type { Config } from '../config.js';
import type { BattleType } from '@boocode/contracts/arena';
import { arenaModelCall } from './arena-model-call.js';
import {
  buildDigestPrompt,
  buildJudgePrompt,
  buildCrossExamPrompt,
  extractWinner,
  shouldNameWinner,
  type ContestantDigest,
} from './arena-analyzer-helpers.js';

// ─── Public interface ─────────────────────────────────────────────────────────

/**
 * Pluggable analysis seam — swap to a Han Orchestrator flow in v2.
 *
 * The implementation returned by createAnalyzer wraps both methods in a
 * try/catch that logs and swallows failures, so the returned promises resolve
 * even when the underlying work fails.
 */
export interface Analyzer {
  /**
   * Run the two-stage digest→judge analysis for a completed battle.
   *
   * @param battleId - id of the `battles` row to analyse.
   */
  analyze(battleId: string): Promise<void>;
  /**
   * Run a cross-examination for an already-inserted cross_examinations row.
   * The result is written back to that row and a battle_updated frame is published.
   *
   * @param battleId - id of the battle being cross-examined.
   * @param crossExamId - id of the existing `cross_examinations` row that receives the verdict.
   * @param opts - examiner identity and model; the model decides local vs. cloud routing.
   */
  crossExamine(
    battleId: string,
    crossExamId: string,
    opts: { identity: string; model: string },
  ): Promise<void>;
}

// ─── Internal DB row types ────────────────────────────────────────────────────

/** Subset of `battles` columns selected by loadBattle. */
interface BattleRow {
  /** Primary key; also used as log context. */
  id: string;
  /** Copied into `tasks.project_id` when a cross-exam is dispatched to the cloud path. */
  project_id: string;
  /** When equal to 'coding', diff.patch is read alongside result.md for each digest. */
  battle_type: BattleType;
  /** Original battle prompt, passed to the judge and cross-exam prompt builders. */
  prompt: string;
  /** Selected from the table but not read anywhere in this module. */
  status: string;
  /** Directory that receives analysis.md and cross-exam.md; when null no files are written. */
  results_path: string | null;
  /** Contestant id recorded by a previous analysis, or null when no winner was set. */
  winner_contestant_id: string | null;
}

/** Subset of `contestants` columns selected by loadContestants. */
interface ContestantRow {
  /** Primary key; written to battles.winner_contestant_id when this contestant wins. */
  id: string;
  /** Together with `model`, forms the label used in analysis.md and to match the judge's winner. */
  identity: string;
  /** Model name; see `identity`. */
  model: string;
  /** Selected from the table but not read anywhere in this module. */
  lane: string;
  /** 'done' (with a result_path) makes the contestant eligible for a digest; 'error' lists it as failed. */
  status: string;
  /** Directory expected to contain result.md (and diff.patch for coding battles). */
  result_path: string | null;
  /** Rendered as `<n>ms` in the benchmark line; omitted when null. */
  duration_ms: number | null;
  /** Rendered with one decimal as `<n> tok/s` in the benchmark line; omitted when null. */
  tokens_per_sec: number | null;
}

// ─── Factory ──────────────────────────────────────────────────────────────────

/** Dependencies injected into createAnalyzer. */
interface AnalyzerDeps {
  /** Tagged-template SQL client used for battles, contestants, cross_examinations and tasks. */
  sql: Sql;
  /** Used to publish battle_updated frames via publishUserFrame. */
  broker: Broker;
  /** Structured logger; every swallowed failure is reported through it. */
  log: FastifyBaseLogger;
  /** Passed through to arenaModelCall; DEFAULT_MODEL is the model for digest and judge calls. */
  config: Pick<Config, 'LLAMA_SWAP_URL' | 'DEFAULT_MODEL'>;
  /** Model IDs served by local llama-swap — cross-exam routing uses this. */
  localModels: ReadonlySet<string>;
}

export function createAnalyzer(deps: AnalyzerDeps): Analyzer {
  const { sql, broker, log, config, localModels } = deps;

  // ─── analyze ──────────────────────────────────────────────────────────────

  /**
   * Public entry point for battle analysis. Hands the work to runAnalysis and
   * turns any rejection into an error log entry, so the returned promise
   * resolves regardless of the outcome.
   */
  function analyze(battleId: string): Promise<void> {
    return runAnalysis(battleId).catch((cause: unknown) => {
      const context = { err: errMsg(cause), battleId };
      log.error(context, 'arena-analyzer: analysis failed');
    });
  }

  /**
   * Run the digest→judge pipeline for one battle.
   *
   * Steps: load the battle and its contestants, digest every succeeded
   * contestant concurrently, run the judge, write analysis.md (when the battle
   * has a results_path), record the winner on the battle row, and publish a
   * battle_updated frame.
   *
   * Errors propagate to the caller (analyze), which logs and swallows them.
   *
   * @param battleId - id of the `battles` row to analyse.
   */
  async function runAnalysis(battleId: string): Promise<void> {
    const battle = await loadBattle(battleId);
    if (!battle) {
      log.warn({ battleId }, 'arena-analyzer: battle not found');
      return;
    }

    const contestants = await loadContestants(battleId);
    const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);

    log.info(
      { battleId, total: contestants.length, succeeded: succeeded.length },
      'arena-analyzer: starting analysis',
    );

    // Digest stage — concurrent, one call per succeeded contestant.
    const digests = (
      await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
    ).filter((d): d is ContestantDigest => d !== null);

    // Failed contestants are noted in the analysis even if they produced no digest.
    const failedNotes = contestants
      .filter((c) => c.status === 'error')
      .map((c) => `- **${c.identity} / ${c.model}**: failed (no result)\n`);

    // Judge stage — single call with all digests.
    const { analysisText, winner } = await judgeContestants(battle, digests, failedNotes);

    // Write analysis.md to the battle results folder.
    const resultsPath = battle.results_path;
    if (resultsPath) {
      await mkdir(resultsPath, { recursive: true });
      await writeFile(join(resultsPath, 'analysis.md'), analysisText, 'utf8');
    }

    // Resolve the winner to a contestant id and update the battle row.
    let winnerId: string | null = null;
    if (winner && shouldNameWinner(succeeded.length)) {
      // Only contestants that actually produced a result were shown to the
      // judge, so the lookup is limited to them. Searching every contestant
      // could pick a failed row that shares identity/model with a succeeded one.
      const winnerContestant = succeeded.find(
        (c) => c.identity === winner.identity && c.model === winner.model,
      );
      if (winnerContestant) {
        winnerId = winnerContestant.id;
        await sql`
          UPDATE battles
          SET winner_contestant_id = ${winnerId}, updated_at = clock_timestamp()
          WHERE id = ${battleId}
        `;
        log.info({ battleId, winnerId, identity: winner.identity, model: winner.model }, 'arena-analyzer: winner set');
      } else {
        log.warn({ battleId, winner }, 'arena-analyzer: judge named a winner not found in contestants');
      }
    }

    publishUser({
      type: 'battle_updated',
      battle_id: battleId,
      winner_contestant_id: winnerId,
      analysis_ready: true,
    });

    log.info({ battleId }, 'arena-analyzer: analysis complete');
  }

  // ─── crossExamine ─────────────────────────────────────────────────────────

  /**
   * Public entry point for a cross-examination. Hands the work to
   * runCrossExam and turns any rejection into an error log entry, so the
   * returned promise resolves regardless of the outcome.
   */
  function crossExamine(
    battleId: string,
    crossExamId: string,
    opts: { identity: string; model: string },
  ): Promise<void> {
    return runCrossExam(battleId, crossExamId, opts).catch((cause: unknown) => {
      const context = { err: errMsg(cause), battleId, crossExamId };
      log.error(context, 'arena-analyzer: cross-exam failed');
    });
  }

  /**
   * Execute one cross-examination and persist its verdict.
   *
   * Builds the examiner prompt from the original battle prompt, freshly
   * generated contestant digests, the existing analysis.md (if readable) and
   * the currently recorded winner; routes the call through executeModelCall;
   * stores the verdict on the cross_examinations row; appends it to
   * cross-exam.md; and publishes a battle_updated frame.
   *
   * The database row is the primary record: a failure to append the markdown
   * file is logged as a warning and does not prevent the frame from being
   * published. All other errors propagate to crossExamine, which logs them.
   *
   * @param battleId - id of the battle being cross-examined.
   * @param crossExamId - id of the existing `cross_examinations` row to update.
   * @param opts - examiner identity and model.
   */
  async function runCrossExam(
    battleId: string,
    crossExamId: string,
    opts: { identity: string; model: string },
  ): Promise<void> {
    const battle = await loadBattle(battleId);
    if (!battle) {
      log.warn({ battleId }, 'arena-analyzer: battle not found for cross-exam');
      return;
    }

    const contestants = await loadContestants(battleId);

    // Regenerate the digests for context — digestContestant issues one model
    // call per succeeded contestant.
    const succeeded = contestants.filter((c) => c.status === 'done' && c.result_path);
    const digests = (
      await Promise.all(succeeded.map((c) => digestContestant(battle, c)))
    ).filter((d): d is ContestantDigest => d !== null);

    // Read analysis.md for the proposed analysis content; a missing or
    // unreadable file simply yields an empty string.
    let analysisContent = '';
    if (battle.results_path) {
      analysisContent = await readFile(
        join(battle.results_path, 'analysis.md'), 'utf8',
      ).catch(() => '');
    }

    // Resolve proposed winner label.
    let proposedWinner: string | null = null;
    if (battle.winner_contestant_id) {
      const w = contestants.find((c) => c.id === battle.winner_contestant_id);
      if (w) proposedWinner = `${w.identity}/${w.model}`;
    }

    const { system, user } = buildCrossExamPrompt({
      originalPrompt: battle.prompt,
      digests,
      analysisContent,
      proposedWinner,
      examinerIdentity: opts.identity,
      examinerModel: opts.model,
    });

    log.info({ battleId, crossExamId, identity: opts.identity, model: opts.model }, 'arena-analyzer: running cross-exam');

    const verdict = await executeModelCall({
      battleId,
      projectId: battle.project_id,
      identity: opts.identity,
      model: opts.model,
      system,
      user,
    });

    // Persist verdict first — the row is what clients read.
    await sql`
      UPDATE cross_examinations
      SET verdict = ${verdict}
      WHERE id = ${crossExamId}
    `;

    if (battle.results_path) {
      const crossExamPath = join(battle.results_path, 'cross-exam.md');
      const section =
        `\n---\n\n# Cross-Examination by ${opts.identity} / ${opts.model}\n\n` +
        `${verdict}\n`;
      try {
        // Mirror runAnalysis: make sure the folder exists before writing, so a
        // cross-exam on a battle without analysis output does not hit ENOENT.
        await mkdir(battle.results_path, { recursive: true });
        await writeFile(crossExamPath, section, { flag: 'a', encoding: 'utf8' });
      } catch (err) {
        // The verdict is already stored; do not let a filesystem problem
        // suppress the client notification below.
        log.warn(
          { err: errMsg(err), battleId, crossExamId, crossExamPath },
          'arena-analyzer: could not append cross-exam.md',
        );
      }
    }

    publishUser({
      type: 'battle_updated',
      battle_id: battleId,
      cross_exam_id: crossExamId,
    });

    log.info({ battleId, crossExamId }, 'arena-analyzer: cross-exam complete');
  }

  // ─── Model call routing ───────────────────────────────────────────────────

  /**
   * Dispatch a one-shot prompt either straight to llama-swap or through the
   * tasks dispatcher.
   *
   * A model counts as local when `localModels` contains it verbatim or under a
   * `llama-swap/` prefix; local calls go through arenaModelCall with a
   * 2 000-token cap and temperature 0.3. Anything else is handed to
   * executeCloudModelCall, which inserts a tasks row and polls for completion.
   *
   * @returns the model's text output.
   */
  async function executeModelCall(opts: {
    battleId: string;
    projectId: string;
    identity: string;
    model: string;
    system: string;
    user: string;
  }): Promise<string> {
    const { model, system, user } = opts;
    const candidateIds = [model, `llama-swap/${model}`];
    const servedLocally = candidateIds.some((id) => localModels.has(id));

    if (!servedLocally) {
      // Cloud path: dispatch through the task system and poll for completion.
      return executeCloudModelCall(opts);
    }

    return arenaModelCall({
      config,
      model,
      system,
      user,
      maxTokens: 2_000,
      temperature: 0.3,
    });
  }

  /**
   * Run a prompt through the tasks dispatcher and wait for its result.
   *
   * Inserts a `tasks` row, then polls that row every 2 s for up to 5 minutes.
   *
   * @returns `output_summary` of the completed task ('' when it is null).
   * @throws Error when the insert returns no row, the task row disappears,
   *   the task ends in 'failed' or 'cancelled', or the deadline passes.
   */
  async function executeCloudModelCall(opts: {
    projectId: string;
    identity: string;
    model: string;
    system: string;
    user: string;
  }): Promise<string> {
    // The cross-exam prompt is the full input to the external agent. We embed
    // the system prompt as a preamble in the user message (external agents don't
    // take a separate system arg through the tasks dispatcher).
    const input = `${opts.system}\n\n${opts.user}`;

    // For well-known external agents, stamp the agent name so the dispatcher
    // routes via PTY/ACP. For unknown identities fall back to native inference
    // (agent = null → DEFAULT_MODEL text generation).
    const knownAgents = new Set(['claude', 'opencode', 'qwen', 'goose']);
    const agentName = knownAgents.has(opts.identity) ? opts.identity : null;

    const [task] = await sql<{ id: string }[]>`
      INSERT INTO tasks (project_id, input, agent, model)
      VALUES (${opts.projectId}, ${input}, ${agentName}, ${opts.model})
      RETURNING id
    `;
    // Guard instead of a non-null assertion: an empty RETURNING set should be
    // reported as such rather than as "cannot read properties of undefined".
    if (!task) {
      throw new Error('cross-exam task insert returned no row');
    }
    const taskId = task.id;

    log.info({ taskId, identity: opts.identity, model: opts.model }, 'arena-analyzer: cloud cross-exam task dispatched');

    // Poll until terminal (up to 5 minutes).
    const timeoutMs = 5 * 60 * 1_000;
    const pollMs = 2_000;
    const deadline = Date.now() + timeoutMs;

    while (Date.now() < deadline) {
      await sleep(pollMs);
      const [row] = await sql<{ state: string; output_summary: string | null }[]>`
        SELECT state, output_summary FROM tasks WHERE id = ${taskId}
      `;
      if (!row) {
        // The row was removed underneath us; waiting longer cannot help, and
        // reporting this as a timeout would hide the real cause.
        throw new Error(`cross-exam task ${taskId} disappeared before reaching a terminal state`);
      }
      if (row.state === 'completed') return row.output_summary ?? '';
      if (row.state === 'failed' || row.state === 'cancelled') {
        throw new Error(`cross-exam task ${row.state}: ${row.output_summary ?? ''}`);
      }
    }

    // NOTE(review): the task row is left as-is on timeout and may still be
    // running — confirm whether the dispatcher needs an explicit cancel.
    throw new Error(`cloud cross-exam task timed out after ${timeoutMs / 1000}s`);
  }

  // ─── Digest helper ────────────────────────────────────────────────────────

  /**
   * Produce a bounded summary ("digest") of one contestant's result.
   *
   * Reads result.md from the contestant's result_path and, for coding
   * battles, diff.patch as well; unreadable files are treated as absent.
   * The content is summarised with a DEFAULT_MODEL call (500-token cap).
   *
   * When neither file yields any content, no model call is made: asking a
   * model to summarise nothing only invites invented text. A fixed placeholder
   * digest is returned instead so the contestant still appears in the analysis.
   *
   * @returns the digest, or null when the contestant has no result_path or
   *   the model call fails (logged as a warning).
   */
  async function digestContestant(
    battle: BattleRow,
    c: ContestantRow,
  ): Promise<ContestantDigest | null> {
    if (!c.result_path) return null;

    const resultMd = await readFile(join(c.result_path, 'result.md'), 'utf8').catch(() => '');

    let diffPatch: string | undefined;
    if (battle.battle_type === 'coding') {
      diffPatch = await readFile(join(c.result_path, 'diff.patch'), 'utf8').catch(
        () => undefined,
      );
    }

    const benchmarkLine = formatBenchmarkLine(c);

    const nothingToDigest = resultMd.trim() === '' && (diffPatch ?? '').trim() === '';
    if (nothingToDigest) {
      log.warn(
        { identity: c.identity, model: c.model, resultPath: c.result_path },
        'arena-analyzer: contestant has no readable result content — using placeholder digest',
      );
      return {
        identity: c.identity,
        model: c.model,
        digest: '*(No result content was found for this contestant.)*',
        benchmarkLine,
      };
    }

    const { system, user } = buildDigestPrompt({
      identity: c.identity,
      model: c.model,
      resultMd,
      diffPatch,
      benchmarkLine,
    });

    let digest: string;
    try {
      digest = await arenaModelCall({
        config,
        model: config.DEFAULT_MODEL,
        system,
        user,
        maxTokens: 500,
        temperature: 0.3,
      });
    } catch (err) {
      log.warn(
        { err: errMsg(err), identity: c.identity, model: c.model },
        'arena-analyzer: digest call failed — skipping contestant',
      );
      return null;
    }

    return { identity: c.identity, model: c.model, digest, benchmarkLine };
  }

  // ─── Judge helper ─────────────────────────────────────────────────────────

  /**
   * Run the judge stage and assemble the analysis.md content.
   *
   * With at least one digest, a single DEFAULT_MODEL call (2 000-token cap)
   * compares the digests against the original prompt. With no digests the
   * model is not called at all and a fixed note is used as the verdict.
   * A failed judge call is logged and replaced by a failure note.
   *
   * A winner is extracted only from genuine judge output, and only when
   * shouldNameWinner accepts the digest count.
   *
   * @param battle - the battle being judged.
   * @param digests - digests of the contestants that produced results.
   * @param failedNotes - pre-formatted markdown bullet lines for failed contestants.
   * @returns the full markdown text and the extracted winner (or null).
   */
  async function judgeContestants(
    battle: BattleRow,
    digests: ContestantDigest[],
    failedNotes: string[],
  ): Promise<{ analysisText: string; winner: { identity: string; model: string } | null }> {
    let judgeOutput = '*(No contestant produced a result — nothing to judge.)*';
    let judged = false;

    if (digests.length > 0) {
      const { system, user } = buildJudgePrompt(battle.prompt, digests);
      try {
        judgeOutput = await arenaModelCall({
          config,
          model: config.DEFAULT_MODEL,
          system,
          user,
          maxTokens: 2_000,
          temperature: 0.3,
        });
        judged = true;
      } catch (err) {
        log.error({ err: errMsg(err), battleId: battle.id }, 'arena-analyzer: judge call failed');
        judgeOutput = '*(Judge call failed — no comparison produced.)*';
      }
    }

    // Never parse a winner out of a placeholder — only out of real judge text.
    const winner =
      judged && shouldNameWinner(digests.length) ? extractWinner(judgeOutput) : null;

    const sections: string[] = [
      `# Arena Analysis`,
      `\n**Battle type:** ${battle.battle_type}`,
    ];

    if (failedNotes.length > 0) {
      sections.push('\n## Failed Contestants\n');
      sections.push(...failedNotes);
    }

    if (digests.length > 0) {
      sections.push('\n## Contestant Digests\n');
      for (const d of digests) {
        sections.push(`### ${d.identity} / ${d.model}`);
        sections.push(`*Benchmark: ${d.benchmarkLine}*\n`);
        sections.push(d.digest);
      }
    }

    sections.push("\n## Judge's Verdict\n");
    sections.push(judgeOutput);

    if (winner) {
      sections.push(`\n## Winner\n**${winner.identity} / ${winner.model}**`);
    } else {
      const reason =
        digests.length < 2
          ? 'fewer than 2 contestants produced results'
          : 'no clear winner identified';
      sections.push(`\n## Winner\n*No winner named (${reason}).*`);
    }

    return { analysisText: sections.join('\n'), winner };
  }

  // ─── DB helpers ───────────────────────────────────────────────────────────

  /** Fetch a single battle by id; resolves to null when no row matches. */
  async function loadBattle(battleId: string): Promise<BattleRow | null> {
    const rows = await sql<BattleRow[]>`
      SELECT id, project_id, battle_type, prompt, status, results_path, winner_contestant_id
      FROM battles WHERE id = ${battleId}
    `;
    const first = rows[0];
    if (first === undefined || first === null) {
      return null;
    }
    return first;
  }

  /** Fetch every contestant of a battle, oldest first (by created_at). */
  async function loadContestants(battleId: string): Promise<ContestantRow[]> {
    const rows = await sql<ContestantRow[]>`
      SELECT id, identity, model, lane, status, result_path, duration_ms, tokens_per_sec
      FROM contestants WHERE battle_id = ${battleId}
      ORDER BY created_at ASC
    `;
    return rows;
  }

  // ─── Misc helpers ─────────────────────────────────────────────────────────

  /**
   * Render a contestant's timing figures as a short comma-separated label,
   * e.g. `1500ms, 12.3 tok/s`. Null figures are left out; when both are null
   * the label is `no benchmark`.
   */
  function formatBenchmarkLine(c: ContestantRow): string {
    const { duration_ms: durationMs, tokens_per_sec: tokensPerSec } = c;
    const segments = [
      durationMs === null ? null : `${durationMs}ms`,
      tokensPerSec === null ? null : `${tokensPerSec.toFixed(1)} tok/s`,
    ].filter((segment): segment is string => segment !== null);

    return segments.length === 0 ? 'no benchmark' : segments.join(', ');
  }

  /**
   * Hand a frame to broker.publishUserFrame under the fixed key 'default'.
   * The loosely typed record is cast to WsFrame without validation.
   */
  function publishUser(frame: Record<string, unknown>): void {
    const wsFrame = frame as unknown as WsFrame;
    broker.publishUserFrame('default', wsFrame);
  }

  /** Return a promise that resolves once a `setTimeout` of `ms` milliseconds fires. */
  function sleep(ms: number): Promise<void> {
    return new Promise<void>((done) => {
      setTimeout(done, ms);
    });
  }

  return { analyze, crossExamine };
}

/**
 * Turn an arbitrary thrown value into a log-friendly string.
 *
 * - `Error` instances yield their `message`.
 * - Other objects are JSON-serialised, so a thrown `{ code: 'X' }` is logged
 *   as `{"code":"X"}` rather than the uninformative `[object Object]`.
 * - Everything else (and objects that cannot be serialised, e.g. circular
 *   ones) falls back to `String(e)`.
 */
function errMsg(e: unknown): string {
  if (e instanceof Error) return e.message;

  if (typeof e === 'object' && e !== null) {
    try {
      // JSON.stringify can return undefined at runtime (e.g. a toJSON that
      // returns undefined), hence the explicit string check.
      const json: unknown = JSON.stringify(e);
      if (typeof json === 'string') return json;
    } catch {
      // Circular structure or BigInt member — fall through to String().
    }
  }

  return String(e);
}