162 lines
5.2 KiB
TypeScript
162 lines
5.2 KiB
TypeScript
/**
 * Retention job: daily in-process timer that rolls up raw perf samples and
 * prunes old data.
 *
 * Crash-safe by construction:
 * 1. Rollup is an idempotent upsert (INSERT ... ON CONFLICT DO UPDATE).
 * 2. Raw samples are deleted only AFTER the covering buckets are committed.
 * 3. Chunked transactions: pruning deletes rows in bounded batches rather
 *    than in one long-running transaction.
 */
|
|
|
|
import type { Sql } from '../db.js';
|
|
import type { Config } from '../config.js';
|
|
|
|
/**
 * Retention settings used by the retention job.
 *
 * Each field is copied verbatim from the application `Config` by
 * `buildRetentionConfig`; units are implied by the field names
 * (hours / days / KB / MB) and are not enforced here.
 */
export interface RetentionConfig {
  /** Copied from `Config.RETENTION_RAW_HOURS`. */
  rawHours: number;

  /** Copied from `Config.RETENTION_ROLLUP_DAYS`. */
  rollupDays: number;

  /** Copied from `Config.CAPTURE_SIZE_KB`. */
  captureSizeKB: number;

  /** Copied from `Config.CAPTURE_BUDGET_MB`. */
  captureBudgetMB: number;
}
|
|
|
|
/**
 * Project the retention-related fields of the application config into a
 * `RetentionConfig`.
 *
 * @param cfg - Application configuration.
 * @returns A new object holding the four retention values, copied unchanged.
 */
export function buildRetentionConfig(cfg: Config): RetentionConfig {
  const {
    RETENTION_RAW_HOURS: rawHours,
    RETENTION_ROLLUP_DAYS: rollupDays,
    CAPTURE_SIZE_KB: captureSizeKB,
    CAPTURE_BUDGET_MB: captureBudgetMB,
  } = cfg;

  return { rawHours, rollupDays, captureSizeKB, captureBudgetMB };
}
|
|
|
|
/**
 * Roll up raw perf samples into 5-minute buckets.
 *
 * Finds every 5-minute bucket that contains at least one raw sample for
 * `providerId` with `ts` at or after `now - hours`, then upserts one
 * `control_perf_rollup_5m` row per bucket (one statement per bucket).
 *
 * Idempotent: each upsert re-aggregates the whole bucket from the raw table
 * and overwrites the stored aggregates, so a re-run never double-counts.
 *
 * @param sql - Tagged-template query function.
 * @param providerId - Provider whose samples are rolled up.
 * @param hours - Look-back window in hours, measured from the current time.
 */
export async function runRollup(sql: Sql, providerId: string, hours: number): Promise<void> {
  const bucketMs = 5 * 60_000;
  const cutoff = new Date(Date.now() - hours * 3600_000);

  // date_trunc() only accepts a unit name ('minute', 'hour', ...); passing
  // '5 minutes' is rejected by PostgreSQL. Flooring the epoch to a multiple
  // of 300 seconds yields the 5-minute bucket start on every server version.
  // GROUP BY / ORDER BY use the ordinal so they always refer to the computed
  // expression, never to a same-named input column.
  const buckets = await sql<{ bucket: Date }[]>`
    SELECT to_timestamp(floor(extract(epoch FROM ts) / 300) * 300) AS bucket
    FROM control_perf_samples
    WHERE provider_id = ${providerId}
      AND ts >= ${cutoff.toISOString()}
    GROUP BY 1
    ORDER BY 1
  `;

  for (const { bucket } of buckets) {
    // Normalise through Date once and derive the end from the normalised
    // start so both bounds are computed from the same value.
    const bucketStart = new Date(bucket);
    const bucketEnd = new Date(bucketStart.getTime() + bucketMs);
    const startIso = bucketStart.toISOString();
    const endIso = bucketEnd.toISOString();

    // Idempotent upsert: a re-run recomputes the same bucket, never double-counts.
    await sql`
      INSERT INTO control_perf_rollup_5m (provider_id, bucket, gpu_agg, sys_agg)
      SELECT
        ${providerId},
        ${startIso},
        jsonb_agg(DISTINCT jsonb_build_object('ts', ts, 'gpu', gpu)) AS gpu_agg,
        jsonb_agg(DISTINCT jsonb_build_object('ts', ts, 'sys', sys)) AS sys_agg
      FROM control_perf_samples
      WHERE provider_id = ${providerId}
        AND ts >= ${startIso}
        AND ts < ${endIso}
      GROUP BY provider_id
      ON CONFLICT (provider_id, bucket) DO UPDATE SET
        gpu_agg = EXCLUDED.gpu_agg,
        sys_agg = EXCLUDED.sys_agg
    `;
  }
}
|
|
|
|
/**
 * Prune raw perf samples older than the retention window.
 *
 * Deletes rows of `control_perf_samples` for `providerId` whose `ts` is
 * earlier than `now - hours`, in batches of at most 1000 rows with one
 * DELETE statement per batch, so no single statement touches an unbounded
 * number of rows.
 *
 * Each batch is selected and deleted inside the database by physical row id
 * (`ctid`). Timestamps are never round-tripped through JS `Date` values:
 * `Date` only has millisecond precision, so matching rows back by `ts`
 * equality can match nothing when stored timestamps are finer-grained,
 * which would leave the loop selecting the same rows forever.
 *
 * @param sql - Tagged-template query function.
 * @param providerId - Provider whose raw samples are pruned.
 * @param hours - Retention window in hours, measured from the current time.
 */
export async function pruneRawSamples(sql: Sql, providerId: string, hours: number): Promise<void> {
  const cutoffIso = new Date(Date.now() - hours * 3600_000).toISOString();
  const chunkSize = 1000;

  while (true) {
    // The retention predicate is repeated in the outer DELETE so that only
    // eligible rows can ever be removed, even if a ctid from the subquery
    // were to match an unrelated row.
    const deleted = await sql<unknown[]>`
      DELETE FROM control_perf_samples
      WHERE provider_id = ${providerId}
        AND ts < ${cutoffIso}
        AND ctid IN (
          SELECT ctid FROM control_perf_samples
          WHERE provider_id = ${providerId}
            AND ts < ${cutoffIso}
          LIMIT ${chunkSize}
        )
      RETURNING 1
    `;

    // A short (or empty) batch means nothing eligible is left; this also
    // guarantees termination whenever a statement makes no progress.
    if (deleted.length < chunkSize) break;
  }
}
|
|
|
|
/**
 * Prune activity rows (`control_requests`) older than the retention window.
 *
 * Deletes rows whose `ts` is earlier than `now - hours`, in batches of at
 * most 1000 rows with one DELETE statement per batch, to keep lock hold
 * times short.
 *
 * Each batch is selected and deleted inside the database by physical row id
 * (`ctid`). Timestamps are never round-tripped through JS `Date` values:
 * `Date` only has millisecond precision, so matching rows back by `ts`
 * equality can match nothing when stored timestamps are finer-grained,
 * which would leave the loop selecting the same rows forever.
 *
 * @param sql - Tagged-template query function.
 * @param hours - Retention window in hours, measured from the current time.
 */
export async function pruneActivity(sql: Sql, hours: number): Promise<void> {
  const cutoffIso = new Date(Date.now() - hours * 3600_000).toISOString();
  const chunkSize = 1000;

  while (true) {
    // The retention predicate is repeated in the outer DELETE so that only
    // eligible rows can ever be removed, even if a ctid from the subquery
    // were to match an unrelated row.
    const deleted = await sql<unknown[]>`
      DELETE FROM control_requests
      WHERE ts < ${cutoffIso}
        AND ctid IN (
          SELECT ctid FROM control_requests
          WHERE ts < ${cutoffIso}
          LIMIT ${chunkSize}
        )
      RETURNING 1
    `;

    // A short (or empty) batch means nothing eligible is left; this also
    // guarantees termination whenever a statement makes no progress.
    if (deleted.length < chunkSize) break;
  }
}
|
|
|
|
/**
 * Prune model events (`control_model_events`) older than the retention window.
 *
 * Deletes rows whose `ts` is earlier than `now - hours`, in batches of at
 * most 1000 rows with one DELETE statement per batch, to keep lock hold
 * times short.
 *
 * Each batch is selected and deleted inside the database by physical row id
 * (`ctid`). Timestamps are never round-tripped through JS `Date` values:
 * `Date` only has millisecond precision, so matching rows back by `ts`
 * equality can match nothing when stored timestamps are finer-grained,
 * which would leave the loop selecting the same rows forever.
 *
 * @param sql - Tagged-template query function.
 * @param hours - Retention window in hours, measured from the current time.
 */
export async function pruneModelEvents(sql: Sql, hours: number): Promise<void> {
  const cutoffIso = new Date(Date.now() - hours * 3600_000).toISOString();
  const chunkSize = 1000;

  while (true) {
    // The retention predicate is repeated in the outer DELETE so that only
    // eligible rows can ever be removed, even if a ctid from the subquery
    // were to match an unrelated row.
    const deleted = await sql<unknown[]>`
      DELETE FROM control_model_events
      WHERE ts < ${cutoffIso}
        AND ctid IN (
          SELECT ctid FROM control_model_events
          WHERE ts < ${cutoffIso}
          LIMIT ${chunkSize}
        )
      RETURNING 1
    `;

    // A short (or empty) batch means nothing eligible is left; this also
    // guarantees termination whenever a statement makes no progress.
    if (deleted.length < chunkSize) break;
  }
}
|
|
|
|
/**
 * Trim a capture JSON string to the configured per-row size cap.
 *
 * The cap is measured in UTF-8 bytes. When the input already fits it is
 * returned unchanged; otherwise the longest prefix that fits within the cap
 * AND ends on a whole code point is returned. Note that a truncated prefix
 * of a JSON document is generally no longer valid JSON.
 *
 * @param captureJson - Serialized capture, or null.
 * @param sizeKB - Size cap in KiB (1 KiB = 1024 bytes). Negative or NaN
 *   values are treated as a cap of 0 bytes.
 * @returns The (possibly truncated) string, or null when the input is null
 *   or empty.
 */
export function trimCapture(captureJson: string | null, sizeKB: number): string | null {
  if (!captureJson) return null;

  const encoded = Buffer.from(captureJson, 'utf8');
  // Clamp the cap: a negative `end` passed to subarray() counts from the END
  // of the buffer and would return almost the whole payload instead of less.
  const maxBytes = Math.max(0, Math.floor(sizeKB * 1024)) || 0;
  if (encoded.length <= maxBytes) return captureJson;

  // Trim by BYTES, not JS chars, and never cut inside a multi-byte sequence.
  // Decoding a cut sequence does not drop it: toString('utf8') substitutes
  // U+FFFD, which re-encodes to 3 bytes and can push the result back over
  // the cap. `encoded[end]` is the first excluded byte; while it is a
  // continuation byte (0b10xxxxxx) the cut is mid-code-point, so back up to
  // the lead byte and exclude the whole sequence.
  let end = maxBytes;
  while (end > 0 && ((encoded[end] ?? 0) & 0xc0) === 0x80) {
    end--;
  }
  return encoded.subarray(0, end).toString('utf8');
}
|
|
|
|
/**
 * Parse a capture JSON string into an object for sql.json().
 *
 * @param captureJson - Serialized capture, or null.
 * @returns The parsed value when it is a JSON object (or array); null when
 *   the input is null/empty, is not valid JSON, or parses to a primitive
 *   (number, string, boolean) or JSON `null`, none of which satisfy the
 *   declared object return type.
 */
export function parseCaptureJson(captureJson: string | null): Record<string, unknown> | null {
  if (!captureJson) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(captureJson);
  } catch {
    // Malformed JSON is an expected input here (e.g. a truncated capture).
    return null;
  }

  // JSON.parse happily returns primitives; narrow before asserting the type
  // so callers never receive a number or string typed as a Record.
  if (typeof parsed !== 'object' || parsed === null) return null;
  return parsed as Record<string, unknown>;
}
|