agentmux_srv/
crash_monitor.rs

1// Copyright 2025-2026, AgentMux Corp.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Out-of-process crash dump monitor for Windows.
5//!
6//! Architecture
7//! ============
8//! Two-process pattern:
9//!   - Main process: installs a VEH handler via `crash-handler`. On crash, the handler
10//!     sends the crash context to the monitor over a Unix Domain Socket (IPC via `minidumper`).
11//!   - Monitor process: same binary, launched with `--crash-monitor`. Runs a blocking
12//!     `minidumper::Server` that receives crash contexts and writes .dmp files.
13//!
14//! Why out-of-process?
15//!   A crash may corrupt the heap or stack. Writing a minidump from inside the crashing
16//!   process is unreliable. The monitor runs in a healthy isolated process and can safely
17//!   call `MiniDumpWriteDump` even if the main process is badly corrupted.
18//!
19//! What this catches
20//! =================
21//! - Access violations (`0xC0000005`)
22//! - Heap corruption detected by heap manager
23//! - Rust panics that abort
24//! - Any exception that reaches the Vectored Exception Handler
25//!
26//! What this does NOT catch
27//! ========================
28//! - `__fastfail` (`int 0x29`, exit code `0xC0000409`) — the CPU traps directly to the
29//!   kernel which terminates the process before returning to user mode. VEH is bypassed.
30//!   Use WER `LocalDumps` for that (already configured via `enable-wer-dumps.reg`).
31//!
32//! Dump location: `C:\CrashDumps\agentmuxsrv\agentmuxsrv-<unix_ts>-<pid>.dmp`
33//! Socket path:   `C:\CrashDumps\agentmuxsrv\monitor.sock`
34
35#![cfg(windows)]
36
37use std::path::PathBuf;
38use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
39
/// Directory that receives the `.dmp` files; also the parent directory of the IPC socket.
const DUMP_DIR: &str = r"C:\CrashDumps\agentmuxsrv";
/// Filesystem path of the Unix Domain Socket shared by the monitor (server) and the
/// main process (client).
const SOCKET_PATH: &str = r"C:\CrashDumps\agentmuxsrv\monitor.sock";
/// User message kind carrying the crashing process's PID.
///
/// The payload is the PID as 4 little-endian bytes. It is sent right before the dump
/// request so the monitor can put the PID into the dump filename.
const MSG_KIND_CRASH_PID: u32 = 0;
46
47// ─── Monitor process (server side) ───────────────────────────────────────────
48
49/// Entry point for the monitor process.
50///
51/// Called when the binary is invoked with `--crash-monitor`. Runs a blocking
52/// `minidumper::Server` loop. Exits when the main process disconnects (either
53/// cleanly on shutdown, or after the crash dump is written).
54pub fn run_monitor() {
55    let dump_dir = PathBuf::from(DUMP_DIR);
56    if let Err(e) = std::fs::create_dir_all(&dump_dir) {
57        eprintln!("[crash-monitor] failed to create dump dir {}: {}", dump_dir.display(), e);
58        // Continue — create_minidump_file will fail gracefully per dump request.
59    }
60    eprintln!(
61        "[crash-monitor] started (pid={}), writing dumps to {}",
62        std::process::id(),
63        dump_dir.display()
64    );
65
66    let socket_name = minidumper::SocketName::path(SOCKET_PATH);
67    let mut server = match minidumper::Server::with_name(socket_name) {
68        Ok(s) => s,
69        Err(e) => {
70            eprintln!("[crash-monitor] failed to bind socket '{}': {}", SOCKET_PATH, e);
71            return;
72        }
73    };
74
75    let shutdown = AtomicBool::new(false);
76    let handler = Box::new(CrashDumpHandler { dump_dir, crash_pid: AtomicU32::new(0) });
77
78    if let Err(e) = server.run(handler, &shutdown, None) {
79        eprintln!("[crash-monitor] server loop ended: {}", e);
80    }
81
82    eprintln!("[crash-monitor] exiting");
83}
84
/// Server-side state of the crash monitor: where dumps go and whose crash is being dumped.
struct CrashDumpHandler {
    /// Directory in which dump files are created.
    dump_dir: PathBuf,
    /// PID reported by the crashing process through a `MSG_KIND_CRASH_PID` message.
    ///
    /// The client sends that message immediately before its dump request, so the value
    /// is in place by the time the dump file has to be named. `0` means no PID has been
    /// reported.
    crash_pid: AtomicU32,
}
92
93impl minidumper::ServerHandler for CrashDumpHandler {
94    fn create_minidump_file(&self) -> Result<(std::fs::File, PathBuf), std::io::Error> {
95        let ts = std::time::SystemTime::now()
96            .duration_since(std::time::UNIX_EPOCH)
97            .unwrap_or_default()
98            .as_secs();
99        // Use the crashing process PID sent via on_message, not std::process::id()
100        // (which would give the monitor's own PID — misleading for diagnosis).
101        let pid = self.crash_pid.load(Ordering::Relaxed);
102        let filename = if pid != 0 {
103            format!("agentmuxsrv-{}-{}.dmp", ts, pid)
104        } else {
105            format!("agentmuxsrv-{}-unknown.dmp", ts)
106        };
107        let path = self.dump_dir.join(&filename);
108        eprintln!("[crash-monitor] creating dump: {}", path.display());
109        let file = std::fs::File::create(&path)?;
110        Ok((file, path))
111    }
112
113    fn on_minidump_created(
114        &self,
115        result: Result<minidumper::MinidumpBinary, minidumper::Error>,
116    ) -> minidumper::LoopAction {
117        match result {
118            Ok(binary) => {
119                eprintln!("[crash-monitor] dump written: {}", binary.path.display());
120            }
121            Err(e) => {
122                eprintln!("[crash-monitor] failed to write dump: {}", e);
123            }
124        }
125        // Continue — keep monitoring in case of multiple clients or future restarts.
126        minidumper::LoopAction::Continue
127    }
128
129    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
130        if kind == MSG_KIND_CRASH_PID {
131            if let Ok(bytes) = <[u8; 4]>::try_from(buffer.as_slice()) {
132                let pid = u32::from_le_bytes(bytes);
133                self.crash_pid.store(pid, Ordering::Relaxed);
134                eprintln!("[crash-monitor] crash pid set to {}", pid);
135            }
136        }
137    }
138
139    fn on_client_disconnected(&self, num_clients: usize) -> minidumper::LoopAction {
140        // Exit when the last client (the main process) disconnects.
141        if num_clients == 0 {
142            minidumper::LoopAction::Exit
143        } else {
144            minidumper::LoopAction::Continue
145        }
146    }
147}
148
149// ─── Main process (client + handler side) ────────────────────────────────────
150
151/// Guard that keeps the crash handler installed for the lifetime of the process.
152/// Dropping this guard uninstalls the VEH handler.
153pub struct CrashHandlerGuard {
154    _handler: crash_handler::CrashHandler,
155}
156
157/// Spawn a crash monitor child process and attach the VEH crash handler in this process.
158///
159/// Returns `Some(guard)` on success. The guard must be kept alive (e.g. as a `let _` binding
160/// at the top of `main`) for the handler to remain active. Returns `None` on any failure —
161/// non-fatal, the process continues without the VEH handler (WER LocalDumps still works).
162pub fn spawn_and_attach() -> Option<CrashHandlerGuard> {
163    // Ensure dump directory exists before spawning the monitor.
164    let _ = std::fs::create_dir_all(DUMP_DIR);
165
166    let exe = match std::env::current_exe() {
167        Ok(p) => p,
168        Err(e) => {
169            eprintln!("[crash-handler] failed to get current exe path: {}", e);
170            return None;
171        }
172    };
173
174    // Spawn monitor process. Null stdin/stdout so it doesn't inherit the sidecar's
175    // stdin reader (which drives the stdin-EOF watchdog in the main process).
176    let child = match std::process::Command::new(&exe)
177        .arg("--crash-monitor")
178        .stdin(std::process::Stdio::null())
179        .stdout(std::process::Stdio::null())
180        .spawn()
181    {
182        Ok(c) => c,
183        Err(e) => {
184            eprintln!("[crash-handler] failed to spawn crash monitor: {}", e);
185            return None;
186        }
187    };
188    eprintln!("[crash-handler] crash monitor spawned (pid={})", child.id());
189
190    // Connect to the monitor's socket with retry — give the server time to start.
191    let socket_name = minidumper::SocketName::path(SOCKET_PATH);
192    let client = connect_with_retry(socket_name, 10, std::time::Duration::from_millis(25));
193    let client = match client {
194        Some(c) => c,
195        None => {
196            eprintln!("[crash-handler] could not connect to crash monitor after retries");
197            return None;
198        }
199    };
200
201    // Verify the connection is healthy before installing the VEH handler.
202    if let Err(e) = client.ping() {
203        eprintln!("[crash-handler] crash monitor ping failed: {}", e);
204        return None;
205    }
206
207    // Install Vectored Exception Handler.
208    //
209    // Safety contract: the closure is called in an exception context (Windows VEH).
210    // Only async-signal-safe operations are allowed:
211    //   - `client.request_dump()` uses pre-allocated IPC buffers, no heap allocation.
212    //   - No Rust std synchronisation primitives (Mutex/RwLock) are used.
213    //   - No log flushing or tracing calls (those may lock).
214    let handler = unsafe {
215        crash_handler::CrashHandler::attach(crash_handler::make_crash_event(
216            move |crash_context: &crash_handler::CrashContext| {
217                // Send our PID to the monitor BEFORE requesting the dump so that
218                // on_message sets crash_pid before create_minidump_file is called.
219                // The send uses a fixed-size stack buffer internally — no heap allocation.
220                let pid_bytes = crash_context.process_id.to_le_bytes();
221                let _ = client.send_message(MSG_KIND_CRASH_PID, &pid_bytes);
222
223                // Best-effort dump request. If the pipe is broken we can't do anything useful.
224                let _ = client.request_dump(crash_context);
225                // Return Handled(false): let the exception continue propagating so that
226                // WER still fires (necessary for __fastfail which bypasses VEH anyway,
227                // and to preserve normal Windows crash reporting for other exception types).
228                crash_handler::CrashEventResult::Handled(false)
229            },
230        ))
231    };
232
233    match handler {
234        Ok(h) => {
235            eprintln!("[crash-handler] VEH handler installed");
236            Some(CrashHandlerGuard { _handler: h })
237        }
238        Err(e) => {
239            eprintln!("[crash-handler] failed to install VEH handler: {}", e);
240            None
241        }
242    }
243}
244
245/// Try to connect to the server socket up to `attempts` times, sleeping `delay` between tries.
246fn connect_with_retry(
247    socket_name: minidumper::SocketName<'_>,
248    attempts: u32,
249    delay: std::time::Duration,
250) -> Option<minidumper::Client> {
251    for i in 0..attempts {
252        match minidumper::Client::with_name(socket_name) {
253            Ok(c) => return Some(c),
254            Err(e) if i + 1 < attempts => {
255                eprintln!("[crash-handler] connect attempt {}/{}: {}", i + 1, attempts, e);
256                std::thread::sleep(delay);
257            }
258            Err(e) => {
259                eprintln!("[crash-handler] connect attempt {}/{} failed: {}", i + 1, attempts, e);
260            }
261        }
262    }
263    None
264}