feat(conductor): task state machine — TIMED_OUT state and retriable steps

- Add 'timed_out' to flow_runs/flow_steps CHECK constraints
- Add retry_count and max_retries columns to flow_steps
- Add timeout detection in advanceInner loop (configurable FLOW_STEP_TIMEOUT_MS)
- Add retriable logic: re-dispatch on timeout if maxRetries > 0 and retryCount < maxRetries
- Add isRetriable() + shouldRetry() pure decision functions
- Add timed_out handling to reconcileResumeStep and reconcileRun
- Add 'timed_out' to ws-frames enum, publishStep status type
This commit is contained in:
2026-06-08 02:43:45 +00:00
parent 4bb0100282
commit 4715830ef0
8 changed files with 207 additions and 19 deletions

View File

@@ -266,7 +266,7 @@ CREATE INDEX IF NOT EXISTS claude_session_entries_key_idx ON claude_session_entr
-- replaces it with the three-value list).
ALTER TABLE agent_sessions DROP CONSTRAINT IF EXISTS agent_sessions_backend_chk;
ALTER TABLE agent_sessions ADD CONSTRAINT agent_sessions_backend_chk
CHECK (backend IN ('opencode_server', 'acp_warm', 'claude_sdk'));
CHECK (backend IN ('opencode_server', 'acp_warm', 'claude_sdk', 'paseo'));
-- LISTEN/NOTIFY fast path: every tasks INSERT (from any call site — routes,
-- new_task tool, MCP server) fires pg_notify('tasks_new') in the same
@@ -340,11 +340,12 @@ CREATE INDEX IF NOT EXISTS flow_steps_task_id_idx ON flow_steps(task_id);
-- edits above are no-ops on the existing DB (CREATE TABLE IF NOT EXISTS skips an
-- existing table) — widen via the repo's DROP-IF-EXISTS → guarded-ADD discipline.
-- Pure ADD of a new allowed value, so no row UPDATE is needed (no value renamed).
-- v2.9.x: widen status CHECKs to include 'timed_out' for Task State Machine.
ALTER TABLE flow_runs DROP CONSTRAINT IF EXISTS flow_runs_status_chk;
DO $$ BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'flow_runs_status_chk') THEN
ALTER TABLE flow_runs ADD CONSTRAINT flow_runs_status_chk
CHECK (status IN ('running', 'completed', 'failed', 'cancelled'));
CHECK (status IN ('running', 'completed', 'failed', 'cancelled', 'timed_out'));
END IF;
END $$;
@@ -352,10 +353,14 @@ ALTER TABLE flow_steps DROP CONSTRAINT IF EXISTS flow_steps_status_chk;
DO $$ BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'flow_steps_status_chk') THEN
ALTER TABLE flow_steps ADD CONSTRAINT flow_steps_status_chk
CHECK (status IN ('pending', 'running', 'completed', 'failed', 'skipped', 'cancelled'));
CHECK (status IN ('pending', 'running', 'completed', 'failed', 'skipped', 'cancelled', 'timed_out'));
END IF;
END $$;
-- Task State Machine: retry columns for flow_steps.
ALTER TABLE flow_steps ADD COLUMN IF NOT EXISTS retry_count INTEGER NOT NULL DEFAULT 0;
ALTER TABLE flow_steps ADD COLUMN IF NOT EXISTS max_retries INTEGER;
-- Arena: battles + contestants + cross_examinations.
-- project_id carries no FK (matches tasks.project_id + flow_runs.project_id convention).
-- winner_contestant_id FK is deferred (forward reference): added via guarded ALTER below.