# Source code for ralph.agents.idle_watchdog.timeout_policy

"""Timeout policy configuration for the idle watchdog."""

from __future__ import annotations

from dataclasses import dataclass

from ralph.timeout_defaults import (
    DESCENDANT_WAIT_POLL_SECONDS,
    DESCENDANT_WAIT_TIMEOUT_SECONDS,
    DRAIN_WINDOW_SECONDS,
    IDLE_POLL_INTERVAL_SECONDS,
    MAX_WAITING_ON_CHILD_NO_PROGRESS_SECONDS,
    MAX_WAITING_ON_CHILD_SECONDS,
    PARENT_EXIT_GRACE_SECONDS,
    PROCESS_EXIT_WAIT_SECONDS,
    SUSPECT_WAITING_ON_CHILD_SECONDS,
    WAITING_STATUS_INTERVAL_SECONDS,
)


@dataclass(frozen=True)
class TimeoutPolicy:
    """Consolidated timeout configuration for all agent timeout dimensions.

    All timeout constants that previously appeared as module-level magic
    numbers in invoke.py are consolidated here so a single config-built
    TimeoutPolicy governs every timeout decision.

    Precedence of fire conditions (in evaluation order):

    1. SESSION_CEILING_EXCEEDED — absolute wall-clock cap; activity cannot
       reset it.
    2. NO_OUTPUT_DEADLINE (+ drain window) — idle deadline since last output.
    3. CHILDREN_PERSIST_TOO_LONG — cumulative WAITING_ON_CHILD ceiling; this
       is an absolute ceiling across the session and never decays.
    4. PROCESS_EXIT_HANG — subprocess closed stdout but did not exit within
       budget.
    5. DESCENDANT_HANG — descendant-wait deadline elapsed with persistent
       WAITING_ON_CHILD (post-exit only, owned by PostExitWatchdog).

    Suspicion is purely informational and does NOT affect any fire condition.
    The ``suspect_waiting_on_child_seconds`` threshold exists only to emit an
    elevated warning event before the hard stop; crossing it never shortens
    the hard-stop ceiling.

    Attributes:
        idle_timeout_seconds: Maximum seconds without output before watchdog
            may fire. None disables the idle-timeout watchdog entirely.
            Must be positive when set.
        drain_window_seconds: After a potential timeout, the watchdog enters
            a drain window of this duration to allow late output to flush.
            Must be >= 0.
        max_waiting_on_child_seconds: Hard cumulative ceiling on time spent
            in WAITING_ON_CHILD state across the entire session. Activity
            cannot decay or reset it; once exceeded, fires
            CHILDREN_PERSIST_TOO_LONG even while children are still alive.
            Must be positive, and >= idle_timeout_seconds when that is set.
        max_session_seconds: Absolute wall-clock ceiling for the entire
            session. Activity cannot reset this ceiling. None means no
            ceiling (opt-in). When set, must be positive and
            >= idle_timeout_seconds.
        idle_poll_interval_seconds: How often the read loop polls for new
            lines. Must be positive. Values < 0.01s are intended for tests
            only.
        parent_exit_grace_seconds: Grace window after parent rc=0 exit during
            which we poll for late completion signals or appearing children.
            Must be >= 0.
        descendant_wait_timeout_seconds: Maximum time to wait for descendant
            processes to finish before declaring failure. Must be >= 0.
        descendant_wait_poll_seconds: Poll interval for descendant-wait and
            process-exit-wait loops. Must be positive. Values < 0.01s are
            intended for tests only.
        process_exit_wait_seconds: Maximum time to wait for a subprocess to
            exit after its stdout closes. Prevents hanging on subprocesses
            that close stdout but never call exit(). Must be >= 0.
        waiting_status_interval_seconds: How often to emit a PROGRESS status
            event while WAITING_ON_CHILD deferral is active. Controls only
            the status emission cadence; does NOT affect timeout safety or
            ceiling math. Must be positive.
        suspect_waiting_on_child_seconds: Cumulative WAITING time after which
            a SUSPECTED_FROZEN event is emitted. Purely informational — does
            NOT shorten the hard-stop ceiling or change the watchdog verdict.
            Must be positive and strictly less than
            max_waiting_on_child_seconds when set. None disables suspicion
            events.
        max_waiting_on_child_no_progress_seconds: Hard ceiling on cumulative
            WAITING_ON_CHILD time when corroboration shows the child is alive
            but not making progress (e.g., heartbeat-only, stale-label, or
            OS-descendant-only evidence). When set, must be positive and
            <= max_waiting_on_child_seconds. When None, the no-progress
            ceiling is disabled and max_waiting_on_child_seconds is used for
            all WAITING_ON_CHILD states.

    Raises:
        ValueError: On construction, if any field violates the constraints
            listed above. The message names the offending field.
    """

    idle_timeout_seconds: float | None
    drain_window_seconds: float = DRAIN_WINDOW_SECONDS
    max_waiting_on_child_seconds: float = MAX_WAITING_ON_CHILD_SECONDS
    max_session_seconds: float | None = None
    idle_poll_interval_seconds: float = IDLE_POLL_INTERVAL_SECONDS
    parent_exit_grace_seconds: float = PARENT_EXIT_GRACE_SECONDS
    descendant_wait_timeout_seconds: float = DESCENDANT_WAIT_TIMEOUT_SECONDS
    descendant_wait_poll_seconds: float = DESCENDANT_WAIT_POLL_SECONDS
    process_exit_wait_seconds: float = PROCESS_EXIT_WAIT_SECONDS
    waiting_status_interval_seconds: float = WAITING_STATUS_INTERVAL_SECONDS
    suspect_waiting_on_child_seconds: float | None = SUSPECT_WAITING_ON_CHILD_SECONDS
    max_waiting_on_child_no_progress_seconds: float | None = (
        MAX_WAITING_ON_CHILD_NO_PROGRESS_SECONDS
    )

    def __post_init__(self) -> None:
        """Validate all fields, raising ValueError on the first violation."""
        self._validate_idle_fields()
        self._validate_session_and_poll_fields()
        self._validate_waiting_status_fields()
        # Deliberately last: every input the earlier validators already reject
        # keeps its original error message; this check only catches values
        # that used to slip through all of them.
        self._validate_waiting_ceiling()

    def _validate_idle_fields(self) -> None:
        """Check the idle timeout, drain window, and their relation to the child ceiling."""
        if self.idle_timeout_seconds is not None and self.idle_timeout_seconds <= 0:
            msg = "idle_timeout_seconds must be positive"
            raise ValueError(msg)
        if self.drain_window_seconds < 0:
            msg = "drain_window_seconds must be >= 0"
            raise ValueError(msg)
        if (
            self.idle_timeout_seconds is not None
            and self.max_waiting_on_child_seconds < self.idle_timeout_seconds
        ):
            msg = "max_waiting_on_child_seconds must be >= idle_timeout_seconds when both set"
            raise ValueError(msg)

    def _validate_session_and_poll_fields(self) -> None:
        """Check the session ceiling plus the poll, grace, and exit-wait durations."""
        if self.max_session_seconds is not None and self.max_session_seconds <= 0:
            msg = "max_session_seconds must be positive"
            raise ValueError(msg)
        if (
            self.max_session_seconds is not None
            and self.idle_timeout_seconds is not None
            and self.max_session_seconds < self.idle_timeout_seconds
        ):
            msg = "max_session_seconds must be >= idle_timeout_seconds"
            raise ValueError(msg)
        if self.idle_poll_interval_seconds <= 0:
            msg = "idle_poll_interval_seconds must be positive"
            raise ValueError(msg)
        if self.parent_exit_grace_seconds < 0:
            msg = "parent_exit_grace_seconds must be >= 0"
            raise ValueError(msg)
        if self.descendant_wait_timeout_seconds < 0:
            msg = "descendant_wait_timeout_seconds must be >= 0"
            raise ValueError(msg)
        if self.descendant_wait_poll_seconds <= 0:
            msg = "descendant_wait_poll_seconds must be positive"
            raise ValueError(msg)
        if self.process_exit_wait_seconds < 0:
            msg = "process_exit_wait_seconds must be >= 0"
            raise ValueError(msg)

    def _validate_waiting_status_fields(self) -> None:
        """Check the status cadence and the optional suspicion / no-progress thresholds."""
        if self.waiting_status_interval_seconds <= 0:
            msg = "waiting_status_interval_seconds must be positive"
            raise ValueError(msg)
        if self.suspect_waiting_on_child_seconds is not None:
            if self.suspect_waiting_on_child_seconds <= 0:
                msg = "suspect_waiting_on_child_seconds must be positive"
                raise ValueError(msg)
            if self.suspect_waiting_on_child_seconds >= self.max_waiting_on_child_seconds:
                msg = (
                    "suspect_waiting_on_child_seconds must be strictly less than"
                    " max_waiting_on_child_seconds"
                )
                raise ValueError(msg)
        if self.max_waiting_on_child_no_progress_seconds is not None:
            if self.max_waiting_on_child_no_progress_seconds <= 0:
                msg = "max_waiting_on_child_no_progress_seconds must be positive"
                raise ValueError(msg)
            if self.max_waiting_on_child_no_progress_seconds > self.max_waiting_on_child_seconds:
                msg = (
                    "max_waiting_on_child_no_progress_seconds must be <="
                    " max_waiting_on_child_seconds"
                )
                raise ValueError(msg)

    def _validate_waiting_ceiling(self) -> None:
        """Reject a zero or negative WAITING_ON_CHILD hard ceiling.

        The other validators only bound max_waiting_on_child_seconds relative
        to optional fields, so with the idle timeout, the suspicion threshold
        and the no-progress ceiling all None it was previously unchecked.
        """
        if self.max_waiting_on_child_seconds <= 0:
            msg = "max_waiting_on_child_seconds must be positive"
            raise ValueError(msg)