chore: snapshot working tree - pty_exited notifications + in-flight inference WIP
feat(booterm): structured pty_exited WS notifications. Plan-validated, impl-validated, code-reviewed green (contracts build clean, contracts test 29/29, booterm + web typecheck clean). wip: in-progress inference/provider refactor (agents.ts, provider.ts, new llama-providers.ts, removed llama-args-validator), plus arena, dispatcher, compaction, schema changes. openspec: pty-exit-notifications complete; x-agent-flags planned (not yet implemented).
This commit is contained in:
194
apps/control/src/services/routing-scores.ts
Normal file
194
apps/control/src/services/routing-scores.ts
Normal file
@@ -0,0 +1,194 @@
|
||||
/**
 * P6.1: Advisory routing scores.
 *
 * Combines three signals per (provider_id, model) into an advisory score and
 * a set of category badges surfaced in the BooChat model picker:
 *   - eval results   (eval_runs.aggregate.avgScore, split by suite kind)
 *   - live latency   (control_requests gen_tps + duration over a recent window)
 *   - host health    (fleet liveness — an unhealthy host can win no badge)
 *
 * Advisory only: this never enforces routing. It powers display badges
 * ("best code model right now") and the P7 gateway candidate ordering.
 *
 * The pure scoring/badge helpers are extracted for unit testing per the
 * turn-guard.ts pattern; the DB read lives in computeRoutingScores().
 */
|
||||
|
||||
import type { Sql } from '../db.js';
|
||||
import type { FleetState } from './fleet-state.js';
|
||||
|
||||
/**
 * Recent-activity window, in hours, for live latency signals.
 *
 * computeRoutingScores() only aggregates control_requests rows whose `ts` is
 * within this many hours of the current time.
 */
const LIVE_WINDOW_HOURS = 24;
|
||||
|
||||
/**
 * Advisory signals for one (provider, model) pair, plus the category badges
 * it currently holds.
 */
export interface ModelScore {
  /** Composite picker id: `${providerId}/${model}` (matches /api/models). */
  compositeId: string;
  /** Provider id; also the key used to look up host liveness in the fleet. */
  providerId: string;
  /** Model name as recorded on eval runs / control requests. */
  model: string;
  /** Avg score (0..1) from completed code-suite eval runs, or null. */
  codeScore: number | null;
  /** Avg score (0..1) from completed chat-suite eval runs, or null. */
  chatScore: number | null;
  /** Best eval score across kinds, or null when never evaluated. */
  evalScore: number | null;
  /** Avg gen tok/s over the live window, or null when no recent traffic. */
  avgGenTps: number | null;
  /** Avg request duration (ms) over the live window, or null. */
  avgLatencyMs: number | null;
  /** Recent request count in the live window. */
  sampleCount: number;
  /** Whether the owning host is currently connected. */
  healthy: boolean;
  /** Category badges this model currently wins. */
  badges: BadgeKind[];
}
|
||||
|
||||
/** Badge categories; assignBadges() awards each to at most one model. */
export type BadgeKind = 'best-code' | 'best-chat' | 'best-fast';

/** Human-readable label for each badge kind. */
export const BADGE_LABELS: Record<BadgeKind, string> = {
  'best-code': 'Best code model now',
  'best-chat': 'Best chat model now',
  'best-fast': 'Fastest model now',
};
|
||||
|
||||
/** Row shape produced by the eval-score query in computeRoutingScores(). */
interface EvalRow {
  provider_id: string;
  model: string;
  /** eval_suites.kind; only 'code' and 'chat' feed a score. */
  suite_kind: string;
  /** aggregate ->> 'avgScore' cast to float; null when that key yields no value. */
  avg_score: number | null;
}
|
||||
|
||||
/** Row shape produced by the live-traffic query in computeRoutingScores(). */
interface LatencyRow {
  provider_id: string;
  model: string;
  /** Mean gen_tps over rows with gen_tps > 0; null when no row qualifies. */
  avg_gen_tps: number | null;
  /** Mean duration_ms over rows with duration_ms > 0; null when no row qualifies. */
  avg_duration_ms: number | null;
  /** Number of requests in the window for this (provider, model), as an int. */
  sample_count: number;
}
|
||||
|
||||
/**
 * Badge assignment: given the per-model signals, award at most one winner per
 * category ('best-code', 'best-chat', 'best-fast', in that order).
 *
 * Only entries whose `healthy` flag is truthy compete. Within a category the
 * entry with the strictly greatest signal wins, so on a tie the entry that
 * appears earliest in `scores` keeps the badge (callers sort
 * deterministically before passing in). Entries whose signal is
 * null/undefined are skipped; a category with no candidate awards nothing.
 *
 * Mutates the input: the badge is appended to the winner's `badges` array.
 * Existing badges are not cleared.
 *
 * @param scores Per-model signals to award badges on, in tie-break order.
 */
export function assignBadges(scores: ModelScore[]): void {
  const eligible = scores.filter((s) => s.healthy);

  const award = (pick: (s: ModelScore) => number | null, badge: BadgeKind): void => {
    let best: ModelScore | null = null;
    let bestVal = -Infinity;
    for (const s of eligible) {
      const v = pick(s);
      if (v == null) continue;
      // Strict `>` keeps the first-seen entry on ties; NaN never compares
      // greater, so it can never win.
      if (v > bestVal) {
        bestVal = v;
        best = s;
      }
    }
    // `best` is only ever set after a value beat -Infinity, so a non-null
    // `best` is always a real winner.
    if (best !== null) {
      best.badges.push(badge);
    }
  };

  award((s) => s.codeScore, 'best-code');
  award((s) => s.chatScore, 'best-chat');
  award((s) => s.avgGenTps, 'best-fast');
}
|
||||
|
||||
/**
 * Compute advisory routing scores across all (provider_id, model) pairs that
 * have either eval history or recent live traffic.
 *
 * Reads eval results and live-traffic aggregates from the database, merges
 * them by composite id (`${providerId}/${model}`), stamps each entry's
 * `healthy` flag from `fleet`, then awards badges via assignBadges().
 *
 * @param sql   Tagged-template SQL client used for both reads.
 * @param fleet Fleet state; an entry is healthy only when `fleet.hosts` has
 *              its provider id with liveness 'connected'.
 * @returns One ModelScore per pair, sorted ascending by compositeId, with
 *          badges populated.
 */
export async function computeRoutingScores(
  sql: Sql,
  fleet: FleetState,
): Promise<ModelScore[]> {
  const cutoff = new Date(Date.now() - LIVE_WINDOW_HOURS * 3600_000).toISOString();

  // The two reads do not depend on each other, so issue them together.
  const [evalRows, latencyRows] = await Promise.all([
    // 1. Eval scores — latest completed run per (provider, model, kind).
    //    Take the most recent finished run's aggregate avgScore per kind so a
    //    fresh run supersedes stale numbers.
    //    NOTE(review): the MAX(finished_at) subquery does not require a
    //    non-null aggregate, and runs sharing a finished_at yield several
    //    rows (last one read wins in the merge below) — confirm both are
    //    acceptable.
    sql<EvalRow[]>`
      SELECT er.provider_id,
             er.model,
             es.kind AS suite_kind,
             (er.aggregate::jsonb ->> 'avgScore')::float AS avg_score
      FROM eval_runs er
      JOIN eval_suites es ON er.suite_id = es.id
      WHERE er.status = 'completed'
        AND er.aggregate IS NOT NULL
        AND er.finished_at = (
          SELECT MAX(er2.finished_at)
          FROM eval_runs er2
          JOIN eval_suites es2 ON er2.suite_id = es2.id
          WHERE er2.provider_id = er.provider_id
            AND er2.model = er.model
            AND es2.kind = es.kind
            AND er2.status = 'completed'
        )
    `,
    // 2. Live latency/throughput — recent control_requests per (provider, model).
    //    The averages are cast to float, like avg_score above: AVG over an
    //    integer/numeric column yields `numeric`, which some drivers hand
    //    back as a string rather than a number.
    sql<LatencyRow[]>`
      SELECT provider_id,
             model,
             (AVG(gen_tps) FILTER (WHERE gen_tps > 0))::float AS avg_gen_tps,
             (AVG(duration_ms) FILTER (WHERE duration_ms > 0))::float AS avg_duration_ms,
             COUNT(*)::int AS sample_count
      FROM control_requests
      WHERE ts >= ${cutoff}
        AND model IS NOT NULL
      GROUP BY provider_id, model
    `,
  ]);

  // 3. Merge signals keyed by compositeId.
  const byKey = new Map<string, ModelScore>();
  const keyOf = (providerId: string, model: string): string => `${providerId}/${model}`;

  // Fetch the entry for a pair, creating an empty one on first sight.
  const ensure = (providerId: string, model: string): ModelScore => {
    const compositeId = keyOf(providerId, model);
    let s = byKey.get(compositeId);
    if (!s) {
      s = {
        compositeId,
        providerId,
        model,
        codeScore: null,
        chatScore: null,
        evalScore: null,
        avgGenTps: null,
        avgLatencyMs: null,
        sampleCount: 0,
        healthy: fleet.hosts.get(providerId)?.liveness === 'connected',
        badges: [],
      };
      byKey.set(compositeId, s);
    }
    return s;
  };

  for (const row of evalRows) {
    const s = ensure(row.provider_id, row.model);
    // Suite kinds other than 'code'/'chat' still create an entry but set no score.
    if (row.suite_kind === 'code') s.codeScore = row.avg_score;
    else if (row.suite_kind === 'chat') s.chatScore = row.avg_score;
    const best = Math.max(s.codeScore ?? -Infinity, s.chatScore ?? -Infinity);
    s.evalScore = best > -Infinity ? best : null;
  }

  for (const row of latencyRows) {
    const s = ensure(row.provider_id, row.model);
    s.avgGenTps = row.avg_gen_tps;
    s.avgLatencyMs = row.avg_duration_ms;
    s.sampleCount = row.sample_count;
  }

  // Deterministic order before badge assignment so ties are stable.
  const scores = Array.from(byKey.values()).sort((a, b) =>
    a.compositeId < b.compositeId ? -1 : a.compositeId > b.compositeId ? 1 : 0,
  );

  assignBadges(scores);
  return scores;
}
|
||||
Reference in New Issue
Block a user