agentmux_srv\backend\blockcontroller/
watchdog.rs

1// Copyright 2025, Command Line Inc.
2// SPDX-License-Identifier: Apache-2.0
3
//! Agent process watchdog: kills agent panes that exceed max-runtime or idle-output limits.
//!
//! Runs every 60 seconds and inspects every running controller whose runtime
//! status has `is_agent_pane = true`. Two independent kill conditions:
//!
//!   A) `term:agentmaxruntimehours` — wall-clock runtime since spawn (0 = disabled)
//!   B) `term:agentidletimeoutmins` — minutes since last PTY byte (0 = disabled);
//!      only evaluated for controllers that are a `ShellController`
//!
//! Both limits default to 0 (disabled) so the watchdog is opt-in.
13
14use std::sync::Arc;
15use std::time::{Duration, SystemTime, UNIX_EPOCH};
16use tokio::time::interval;
17
18use crate::backend::wconfig::ConfigWatcher;
19use super::{get_all_controllers, STATUS_RUNNING};
20
/// Period, in seconds, between two consecutive watchdog scans.
const WATCHDOG_INTERVAL_SECS: u64 = 60;
23
24/// Run the agent watchdog loop. Never returns.
25pub async fn run_watchdog_loop(config: Arc<ConfigWatcher>) {
26    let mut ticker = interval(Duration::from_secs(WATCHDOG_INTERVAL_SECS));
27    loop {
28        ticker.tick().await;
29        let settings = config.get_settings();
30        let max_runtime_hours = settings.term_agent_max_runtime_hours;
31        let idle_timeout_mins = settings.term_agent_idle_timeout_mins;
32
33        // Skip entire scan if both limits are disabled.
34        if max_runtime_hours <= 0.0 && idle_timeout_mins <= 0.0 {
35            continue;
36        }
37
38        let now_ms = SystemTime::now()
39            .duration_since(UNIX_EPOCH)
40            .map(|d| d.as_millis() as i64)
41            .unwrap_or(0);
42
43        for (block_id, ctrl) in get_all_controllers() {
44            let status = ctrl.get_runtime_status();
45            if status.shellprocstatus != STATUS_RUNNING {
46                continue;
47            }
48            if !status.is_agent_pane {
49                continue;
50            }
51
52            // ── Condition A: max runtime ──────────────────────────────────
53            if max_runtime_hours > 0.0 {
54                if let Some(spawn_ms) = status.spawn_ts_ms {
55                    let elapsed_secs = ((now_ms - spawn_ms).max(0) as u64) / 1000;
56                    let limit_secs = (max_runtime_hours * 3600.0) as u64;
57                    if elapsed_secs >= limit_secs {
58                        tracing::warn!(
59                            block_id = %block_id,
60                            elapsed_hours = elapsed_secs / 3600,
61                            limit_hours = %max_runtime_hours,
62                            "watchdog: agent pane exceeded max-runtime, stopping"
63                        );
64                        let _ = ctrl.stop(true, super::STATUS_DONE);
65                        continue;
66                    }
67                }
68            }
69
70            // ── Condition B: idle output timeout ─────────────────────────
71            if idle_timeout_mins > 0.0 {
72                if let Some(shell_ctrl) = ctrl.as_any().downcast_ref::<super::shell::ShellController>() {
73                    if let Some(idle_secs) = shell_ctrl.last_output_secs_ago() {
74                        let limit_secs = (idle_timeout_mins * 60.0) as u64;
75                        if idle_secs >= limit_secs {
76                            tracing::warn!(
77                                block_id = %block_id,
78                                idle_mins = idle_secs / 60,
79                                limit_mins = %idle_timeout_mins,
80                                "watchdog: agent pane exceeded idle-output timeout, stopping"
81                            );
82                            let _ = ctrl.stop(true, super::STATUS_DONE);
83                        }
84                    }
85                }
86            }
87        }
88    }
89}