chore: snapshot working tree - pty_exited notifications + in-flight inference WIP

feat(booterm): structured pty_exited WS notifications. Plan-validated, impl-validated, code-reviewed green (contracts build clean, contracts test 29/29, booterm + web typecheck clean). wip: in-progress inference/provider refactor (agents.ts, provider.ts, new llama-providers.ts, removed llama-args-validator), plus arena, dispatcher, compaction, schema changes. openspec: pty-exit-notifications complete; x-agent-flags planned (not yet implemented).
2026-06-14 12:48:47 +00:00
parent 0ed506f1da
commit b18de2a331
204 changed files with 25344 additions and 867 deletions
--- a/apps/control/src/services/routing-scores.ts
+++ b/apps/control/src/services/routing-scores.ts
@@ -0,0 +1,194 @@
+/**
+ * P6.1: Advisory routing scores.
+ *
+ * Combines three signals per (provider_id, model) into an advisory score and
+ * a set of category badges surfaced in the BooChat model picker:
+ *   - eval results   (eval_runs.aggregate.avgScore, split by suite kind)
+ *   - live latency   (control_requests gen_tps + duration over a recent window)
+ *   - host health    (fleet liveness — an unhealthy host can win no badge)
+ *
+ * Advisory only: this never enforces routing. It powers display badges
+ * ("best code model right now") and the P7 gateway candidate ordering.
+ *
+ * The pure scoring/badge helpers are extracted for unit testing per the
+ * turn-guard.ts pattern; the DB read lives in computeRoutingScores().
+ */
+
+import type { Sql } from '../db.js';
+import type { FleetState } from './fleet-state.js';
+
+/** Look-back window, in hours, for the control_requests latency/throughput aggregates. */
+const LIVE_WINDOW_HOURS = 24;
+
+export interface ModelScore {
+  /** Composite picker id: `${providerId}/${model}` (matches /api/models). */
+  compositeId: string;
+  providerId: string; // provider_id from the queries; also the key used to look up fleet.hosts
+  model: string;
+  /** avgScore of the latest completed code-suite eval run (expected 0..1, not enforced here), or null. */
+  codeScore: number | null;
+  /** avgScore of the latest completed chat-suite eval run (expected 0..1, not enforced here), or null. */
+  chatScore: number | null;
+  /** Higher of codeScore and chatScore, or null when both are null. */
+  evalScore: number | null;
+  /** Avg gen tok/s over the live window (only requests with gen_tps > 0), or null when none. */
+  avgGenTps: number | null;
+  /** Avg request duration (ms) over the live window (only rows with duration_ms > 0), or null. */
+  avgLatencyMs: number | null;
+  /** Request rows seen in the live window; stays 0 for models known only from eval runs. */
+  sampleCount: number;
+  /** True when the fleet host looked up by providerId reports liveness 'connected'. */
+  healthy: boolean;
+  /** Category badges this model currently wins; appended by assignBadges(). */
+  badges: BadgeKind[];
+}
+
+export type BadgeKind = 'best-code' | 'best-chat' | 'best-fast'; // one kind per category awarded by assignBadges()
+
+export const BADGE_LABELS: Record<BadgeKind, string> = { // human-readable label for each badge kind
+  'best-code': 'Best code model now', // awarded on codeScore
+  'best-chat': 'Best chat model now', // awarded on chatScore
+  'best-fast': 'Fastest model now', // awarded on avgGenTps
+};
+
+interface EvalRow { // row shape of the eval-score query in computeRoutingScores()
+  provider_id: string;
+  model: string;
+  suite_kind: string; // eval_suites.kind; only 'code' and 'chat' are consumed
+  avg_score: number | null; // (aggregate ->> 'avgScore')::float; null when aggregate has no avgScore
+}
+
+interface LatencyRow { // NOTE(review): AVG() may be Postgres numeric — confirm the driver returns numbers, not strings
+  provider_id: string;
+  model: string;
+  avg_gen_tps: number | null; // AVG(gen_tps) over rows with gen_tps > 0; null when no such row
+  avg_duration_ms: number | null; // AVG(duration_ms) over rows with duration_ms > 0; null when none
+  sample_count: number; // COUNT(*)::int over the whole group, including rows the FILTERs skip
+}
+
+/**
+ * Awards each badge ('best-code', 'best-chat', 'best-fast') to at most one entry, mutating
+ * `scores` in place by pushing onto the winner's `badges`. Only `healthy` entries compete; the
+ * strictly largest non-null value wins, so on ties the earliest entry in `scores` keeps the badge.
+ */
+export function assignBadges(scores: ModelScore[]): void {
+  const eligible = scores.filter((s) => s.healthy);
+
+  const award = (
+    pick: (s: ModelScore) => number | null,
+    badge: BadgeKind,
+  ): void => {
+    let best: ModelScore | null = null;
+    let bestVal = -Infinity;
+    for (const s of eligible) {
+      const v = pick(s);
+      if (v == null) continue; // no signal for this category
+      if (v > bestVal) { // strict `>`: an equal later value never displaces the current best
+        bestVal = v;
+        best = s;
+      }
+    }
+    if (best && bestVal > -Infinity) { // nobody qualified → the badge is simply not awarded
+      best.badges.push(badge); // appends; badges already present are never cleared
+    }
+  };
+
+  award((s) => s.codeScore, 'best-code');
+  award((s) => s.chatScore, 'best-chat');
+  award((s) => s.avgGenTps, 'best-fast');
+}
+
+/**
+ * Builds one ModelScore per (provider_id, model) found in the latest completed eval runs or in
+ * live-window control_requests; returns them sorted by compositeId with health and badges filled in.
+ */
+export async function computeRoutingScores(
+  sql: Sql,
+  fleet: FleetState,
+): Promise<ModelScore[]> {
+  // 1. Eval scores — the completed run with the newest finished_at per (provider, model, kind).
+  //    Note the MAX(finished_at) subquery does not filter on `aggregate`: if the newest completed
+  //    run has a NULL aggregate, that triple yields no row rather than falling back to an older run.
+  const evalRows = await sql<EvalRow[]>`
+    SELECT er.provider_id,
+           er.model,
+           es.kind AS suite_kind,
+           (er.aggregate::jsonb ->> 'avgScore')::float AS avg_score
+    FROM eval_runs er
+    JOIN eval_suites es ON er.suite_id = es.id
+    WHERE er.status = 'completed'
+      AND er.aggregate IS NOT NULL
+      AND er.finished_at = (
+        SELECT MAX(er2.finished_at)
+        FROM eval_runs er2
+        JOIN eval_suites es2 ON er2.suite_id = es2.id
+        WHERE er2.provider_id = er.provider_id
+          AND er2.model = er.model
+          AND es2.kind = es.kind
+          AND er2.status = 'completed'
+      )
+  `;
+
+  // 2. Live latency/throughput — control_requests no older than LIVE_WINDOW_HOURS, per (provider, model).
+  const cutoff = new Date(Date.now() - LIVE_WINDOW_HOURS * 3600_000).toISOString();
+  const latencyRows = await sql<LatencyRow[]>`
+    SELECT provider_id,
+           model,
+           AVG(gen_tps) FILTER (WHERE gen_tps > 0) AS avg_gen_tps,
+           AVG(duration_ms) FILTER (WHERE duration_ms > 0) AS avg_duration_ms,
+           COUNT(*)::int AS sample_count
+    FROM control_requests
+    WHERE ts >= ${cutoff}
+      AND model IS NOT NULL
+    GROUP BY provider_id, model
+  `;
+
+  // 3. Merge both result sets into one ModelScore per compositeId, created on first sight by ensure().
+  const byKey = new Map<string, ModelScore>();
+  const keyOf = (providerId: string, model: string) => `${providerId}/${model}`;
+
+  const ensure = (providerId: string, model: string): ModelScore => { // get-or-create the entry for this pair
+    const compositeId = keyOf(providerId, model);
+    let s = byKey.get(compositeId);
+    if (!s) {
+      s = {
+        compositeId,
+        providerId,
+        model,
+        codeScore: null,
+        chatScore: null,
+        evalScore: null,
+        avgGenTps: null,
+        avgLatencyMs: null,
+        sampleCount: 0,
+        healthy: fleet.hosts.get(providerId)?.liveness === 'connected', // unknown host → undefined → false
+        badges: [],
+      };
+      byKey.set(compositeId, s);
+    }
+    return s;
+  };
+
+  for (const row of evalRows) {
+    const s = ensure(row.provider_id, row.model);
+    if (row.suite_kind === 'code') s.codeScore = row.avg_score;
+    else if (row.suite_kind === 'chat') s.chatScore = row.avg_score; // any other suite kind is ignored
+    const best = Math.max(s.codeScore ?? -Infinity, s.chatScore ?? -Infinity); // -Infinity stands in for "no score"
+    s.evalScore = best > -Infinity ? best : null;
+  }
+
+  for (const row of latencyRows) {
+    const s = ensure(row.provider_id, row.model);
+    s.avgGenTps = row.avg_gen_tps;
+    s.avgLatencyMs = row.avg_duration_ms;
+    s.sampleCount = row.sample_count;
+  }
+
+  // Sort by compositeId (plain string comparison) so assignBadges() breaks ties in a stable order.
+  const scores = Array.from(byKey.values()).sort((a, b) =>
+    a.compositeId < b.compositeId ? -1 : a.compositeId > b.compositeId ? 1 : 0,
+  );
+
+  assignBadges(scores);
+  return scores;
+}