feat(conductor): task state machine — TIMED_OUT state and retriable steps
- Add 'timed_out' to flow_runs/flow_steps CHECK constraints - Add retry_count and max_retries columns to flow_steps - Add timeout detection in advanceInner loop (configurable FLOW_STEP_TIMEOUT_MS) - Add retriable logic: re-dispatch on timeout if maxRetries > 0 and retryCount < maxRetries - Add isRetriable() + shouldRetry() pure decision functions - Add timed_out handling to reconcileResumeStep and reconcileRun - Add 'timed_out' to ws-frames enum, publishStep status type
This commit is contained in:
@@ -33,11 +33,13 @@ export interface SchedulerState {
|
||||
readonly inFlight: ReadonlySet<string>;
|
||||
/** step ids pre-skipped at launch (band/when gating) — never given a row */
|
||||
readonly excluded: ReadonlySet<string>;
|
||||
/** step ids that timed out (terminal — no retries remaining or not retriable) */
|
||||
readonly timedOut: ReadonlySet<string>;
|
||||
}
|
||||
|
||||
/** A dependency is satisfied once it is done, skipped, or excluded. */
|
||||
/** A dependency is satisfied once it is done, skipped, excluded, or timed out. */
|
||||
function isSatisfied(state: SchedulerState, id: string): boolean {
|
||||
return state.done.has(id) || state.skipped.has(id) || state.excluded.has(id);
|
||||
return state.done.has(id) || state.skipped.has(id) || state.excluded.has(id) || state.timedOut.has(id);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -118,25 +120,50 @@ export function isStuck(flow: Flow, state: SchedulerState): boolean {
|
||||
* - 'mark-cancelled': task was cancelled before the callback ran; propagate so
|
||||
* advance() cancels the run.
|
||||
*/
|
||||
/**
|
||||
* True when the step definition allows retries on timeout.
|
||||
* Pure — no IO.
|
||||
*/
|
||||
export function isRetriable(step: { maxRetries?: number }): boolean {
|
||||
return (step.maxRetries ?? 0) > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* True when the step has retries remaining.
|
||||
* Pure — no IO.
|
||||
*/
|
||||
export function shouldRetry(maxRetries: number | undefined | null, retryCount: number): boolean {
|
||||
return retryCount < (maxRetries ?? 0);
|
||||
}
|
||||
|
||||
export type ResumeAction =
|
||||
| 'keep'
|
||||
| 're-dispatch'
|
||||
| 'mark-done'
|
||||
| 'mark-failed'
|
||||
| 'mark-cancelled';
|
||||
| 'mark-cancelled'
|
||||
| 'retry';
|
||||
|
||||
/**
|
||||
* Decide what to do with ONE flow step during startup resume (D-9). Pure.
|
||||
*
|
||||
* @param status - flow_steps.status
|
||||
* @param taskId - flow_steps.task_id (null for code steps or unstarted agent steps)
|
||||
* @param taskState - tasks.state for taskId, or null if the task row is absent
|
||||
* @param status - flow_steps.status
|
||||
* @param taskId - flow_steps.task_id (null for code steps or unstarted agent steps)
|
||||
* @param taskState - tasks.state for taskId, or null if the task row is absent
|
||||
* @param retryCount - flow_steps.retry_count (default 0)
|
||||
* @param maxRetries - flow_steps.max_retries (null = no retry)
|
||||
*/
|
||||
export function reconcileResumeStep(
|
||||
status: string,
|
||||
taskId: string | null,
|
||||
taskState: string | null,
|
||||
retryCount?: number,
|
||||
maxRetries?: number | null,
|
||||
): ResumeAction {
|
||||
if (status === 'timed_out') {
|
||||
if (shouldRetry(maxRetries, retryCount ?? 0)) return 'retry';
|
||||
return 'mark-failed';
|
||||
}
|
||||
if (status !== 'running') return 'keep';
|
||||
// Running step: decide by its task's current state.
|
||||
if (!taskId || taskState === null) return 're-dispatch'; // task gone or never created
|
||||
@@ -198,7 +225,7 @@ export function evaluateTriggerRule(
|
||||
* decision per step. Pure — no IO.
|
||||
*/
|
||||
export function reconcileRun(
|
||||
steps: ReadonlyArray<{ stepId: string; taskId: string | null; status: string }>,
|
||||
steps: ReadonlyArray<{ stepId: string; taskId: string | null; status: string; retryCount?: number; maxRetries?: number | null }>,
|
||||
taskStates: ReadonlyMap<string, string>,
|
||||
): StepResumeDecision[] {
|
||||
return steps.map((step) => ({
|
||||
@@ -207,6 +234,8 @@ export function reconcileRun(
|
||||
step.status,
|
||||
step.taskId,
|
||||
step.taskId ? (taskStates.get(step.taskId) ?? null) : null,
|
||||
step.retryCount,
|
||||
step.maxRetries,
|
||||
),
|
||||
}));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user