/**
 * P6.1: Advisory routing scores.
 *
 * Combines three signals per (provider_id, model) into an advisory score and
 * a set of category badges surfaced in the BooChat model picker:
 *   - eval results (eval_runs.aggregate.avgScore, split by suite kind)
 *   - live latency (control_requests gen_tps + duration over a recent window)
 *   - host health (fleet liveness — an unhealthy host can win no badge)
 *
 * Advisory only: this never enforces routing. It powers display badges
 * ("best code model right now") and the P7 gateway candidate ordering.
 *
 * The pure scoring/badge helpers are extracted for unit testing per the
 * turn-guard.ts pattern; the DB read lives in computeRoutingScores().
 */
import type { Sql } from '../db.js';
import type { FleetState } from './fleet-state.js';

/** Recent-activity window for live latency signals. */
const LIVE_WINDOW_HOURS = 24;

/** Milliseconds in one hour, used to turn the live window into a cutoff. */
const MS_PER_HOUR = 3_600_000;

export interface ModelScore {
  /** Composite picker id: `${providerId}/${model}` (matches /api/models). */
  compositeId: string;
  providerId: string;
  model: string;
  /** Avg score (0..1) from completed code-suite eval runs, or null. */
  codeScore: number | null;
  /** Avg score (0..1) from completed chat-suite eval runs, or null. */
  chatScore: number | null;
  /** Best eval score across kinds, or null when never evaluated. */
  evalScore: number | null;
  /** Avg gen tok/s over the live window, or null when no recent traffic. */
  avgGenTps: number | null;
  /** Avg request duration (ms) over the live window, or null. */
  avgLatencyMs: number | null;
  /** Recent request count in the live window. */
  sampleCount: number;
  /** Whether the owning host is currently connected. */
  healthy: boolean;
  /** Category badges this model currently wins. */
  badges: BadgeKind[];
}

export type BadgeKind = 'best-code' | 'best-chat' | 'best-fast';

/** Human-readable label for each badge kind. */
export const BADGE_LABELS: Record<BadgeKind, string> = {
  'best-code': 'Best code model now',
  'best-chat': 'Best chat model now',
  'best-fast': 'Fastest model now',
};

/** Row shape produced by the eval-score query in computeRoutingScores(). */
interface EvalRow {
  provider_id: string;
  model: string;
  suite_kind: string;
  avg_score: number | string | null;
}

/** Row shape produced by the live-latency query in computeRoutingScores(). */
interface LatencyRow {
  provider_id: string;
  model: string;
  avg_gen_tps: number | string | null;
  avg_duration_ms: number | string | null;
  sample_count: number;
}

/**
 * Normalise a numeric value coming back from the database.
 *
 * Accepts a number or a numeric string and returns a finite number, or null
 * for null/undefined, blank strings, and anything that is not a finite number.
 *
 * NOTE(review): PostgreSQL's AVG over an integer column yields `numeric`,
 * which drivers such as postgres.js return as a string. The column types of
 * control_requests are not visible from this file — confirm whether the
 * string branch is actually exercised.
 */
function toFiniteNumber(value: number | string | null | undefined): number | null {
  if (value == null) return null;
  if (typeof value === 'string' && value.trim() === '') return null;
  const n = typeof value === 'number' ? value : Number(value);
  return Number.isFinite(n) ? n : null;
}

/**
 * Pure badge assignment: given the per-model signals, award one winner per
 * category. Only healthy hosts are eligible; ties are broken by first-seen
 * order (callers sort deterministically before passing in).
 *
 * Mutates `scores` in place. Every entry's `badges` is reset first, so calling
 * this repeatedly on the same array (or on entries carrying stale badges)
 * yields exactly one holder per category rather than accumulating duplicates.
 *
 * @param scores Per-model signals; each entry's `badges` array is overwritten.
 */
export function assignBadges(scores: ModelScore[]): void {
  // Start from a clean slate so the result does not depend on prior calls.
  for (const s of scores) {
    s.badges = [];
  }

  const eligible = scores.filter((s) => s.healthy);

  const award = (pick: (s: ModelScore) => number | null, badge: BadgeKind): void => {
    let best: ModelScore | null = null;
    let bestVal = -Infinity;
    for (const s of eligible) {
      const v = pick(s);
      if (v == null) continue;
      // Strict `>` keeps the first-seen entry on ties and never selects NaN.
      if (v > bestVal) {
        bestVal = v;
        best = s;
      }
    }
    if (best) {
      best.badges.push(badge);
    }
  };

  award((s) => s.codeScore, 'best-code');
  award((s) => s.chatScore, 'best-chat');
  award((s) => s.avgGenTps, 'best-fast');
}

/**
 * Compute advisory routing scores across all (provider_id, model) pairs that
 * have either eval history or recent live traffic.
 *
 * @param sql   Tagged-template SQL client used for the two read queries.
 * @param fleet Fleet state; a model is `healthy` only when its provider's host
 *              entry reports liveness `'connected'`.
 * @returns One ModelScore per (provider_id, model), sorted by `compositeId`,
 *          with badges already assigned.
 */
export async function computeRoutingScores(
  sql: Sql,
  fleet: FleetState,
): Promise<ModelScore[]> {
  const cutoff = new Date(Date.now() - LIVE_WINDOW_HOURS * MS_PER_HOUR).toISOString();

  // The two reads are independent, so issue them together.
  const [evalRows, latencyRows] = await Promise.all([
    // 1. Eval scores — latest completed run per (provider, model, kind).
    //    Take the most recent finished run's aggregate avgScore per kind so a
    //    fresh run supersedes stale numbers.
    //    NOTE(review): if two completed runs share the same MAX(finished_at),
    //    both rows are returned and the merge below keeps whichever comes last.
    sql<EvalRow[]>`
      SELECT er.provider_id,
             er.model,
             es.kind AS suite_kind,
             (er.aggregate::jsonb ->> 'avgScore')::float AS avg_score
      FROM eval_runs er
      JOIN eval_suites es ON er.suite_id = es.id
      WHERE er.status = 'completed'
        AND er.aggregate IS NOT NULL
        AND er.finished_at = (
          SELECT MAX(er2.finished_at)
          FROM eval_runs er2
          JOIN eval_suites es2 ON er2.suite_id = es2.id
          WHERE er2.provider_id = er.provider_id
            AND er2.model = er.model
            AND es2.kind = es.kind
            AND er2.status = 'completed'
        )
    `,
    // 2. Live latency/throughput — recent control_requests per (provider, model).
    sql<LatencyRow[]>`
      SELECT provider_id,
             model,
             AVG(gen_tps) FILTER (WHERE gen_tps > 0) AS avg_gen_tps,
             AVG(duration_ms) FILTER (WHERE duration_ms > 0) AS avg_duration_ms,
             COUNT(*)::int AS sample_count
      FROM control_requests
      WHERE ts >= ${cutoff}
        AND model IS NOT NULL
      GROUP BY provider_id, model
    `,
  ]);

  // 3. Merge signals keyed by compositeId.
  const byKey = new Map<string, ModelScore>();

  const ensure = (providerId: string, model: string): ModelScore => {
    const compositeId = `${providerId}/${model}`;
    let entry = byKey.get(compositeId);
    if (!entry) {
      entry = {
        compositeId,
        providerId,
        model,
        codeScore: null,
        chatScore: null,
        evalScore: null,
        avgGenTps: null,
        avgLatencyMs: null,
        sampleCount: 0,
        healthy: fleet.hosts.get(providerId)?.liveness === 'connected',
        badges: [],
      };
      byKey.set(compositeId, entry);
    }
    return entry;
  };

  for (const row of evalRows) {
    const entry = ensure(row.provider_id, row.model);
    const score = toFiniteNumber(row.avg_score);
    // Suite kinds other than code/chat still register the model but carry no score.
    if (row.suite_kind === 'code') entry.codeScore = score;
    else if (row.suite_kind === 'chat') entry.chatScore = score;

    const best = Math.max(entry.codeScore ?? -Infinity, entry.chatScore ?? -Infinity);
    entry.evalScore = best > -Infinity ? best : null;
  }

  for (const row of latencyRows) {
    const entry = ensure(row.provider_id, row.model);
    // Coerce so the declared `number | null` holds even if the driver returns strings.
    entry.avgGenTps = toFiniteNumber(row.avg_gen_tps);
    entry.avgLatencyMs = toFiniteNumber(row.avg_duration_ms);
    entry.sampleCount = row.sample_count;
  }

  // Deterministic order before badge assignment so ties are stable.
  const scores = Array.from(byKey.values()).sort((a, b) =>
    a.compositeId < b.compositeId ? -1 : a.compositeId > b.compositeId ? 1 : 0,
  );

  assignBadges(scores);
  return scores;
}