agentmux_srv/crash_monitor.rs
1// Copyright 2025-2026, AgentMux Corp.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Out-of-process crash dump monitor for Windows.
5//!
6//! Architecture
7//! ============
8//! Two-process pattern:
9//! - Main process: installs a VEH handler via `crash-handler`. On crash, the handler
10//! sends the crash context to the monitor over a Unix Domain Socket (IPC via `minidumper`).
11//! - Monitor process: same binary, launched with `--crash-monitor`. Runs a blocking
12//! `minidumper::Server` that receives crash contexts and writes .dmp files.
13//!
14//! Why out-of-process?
15//! A crash may corrupt the heap or stack. Writing a minidump from inside the crashing
16//! process is unreliable. The monitor runs in a healthy isolated process and can safely
17//! call `MiniDumpWriteDump` even if the main process is badly corrupted.
18//!
19//! What this catches
20//! =================
21//! - Access violations (`0xC0000005`)
22//! - Heap corruption detected by heap manager
23//! - Rust panics that abort
24//! - Any exception that reaches the Vectored Exception Handler
25//!
26//! What this does NOT catch
27//! ========================
28//! - `__fastfail` (`int 0x29`, exit code `0xC0000409`) — the CPU traps directly to the
29//! kernel which terminates the process before returning to user mode. VEH is bypassed.
30//! Use WER `LocalDumps` for that (already configured via `enable-wer-dumps.reg`).
31//!
32//! Dump location: `C:\CrashDumps\agentmuxsrv\agentmuxsrv-<unix_ts>-<pid>.dmp`
33//! Socket path: `C:\CrashDumps\agentmuxsrv\monitor.sock`
34
35#![cfg(windows)]
36
37use std::path::PathBuf;
38use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
39
/// Directory the monitor creates minidump files in. `SOCKET_PATH` lives in the same directory.
const DUMP_DIR: &str = r"C:\CrashDumps\agentmuxsrv";
/// Unix Domain Socket path used for crash-handler IPC.
/// The monitor binds it in `run_monitor`; the main process connects to it in `spawn_and_attach`.
const SOCKET_PATH: &str = r"C:\CrashDumps\agentmuxsrv\monitor.sock";
/// Message kind for sending the crashing process's PID before a dump request.
/// The monitor stores this PID and uses it in the dump filename.
/// The payload is the PID as 4 little-endian bytes.
const MSG_KIND_CRASH_PID: u32 = 0;
46
47// ─── Monitor process (server side) ───────────────────────────────────────────
48
49/// Entry point for the monitor process.
50///
51/// Called when the binary is invoked with `--crash-monitor`. Runs a blocking
52/// `minidumper::Server` loop. Exits when the main process disconnects (either
53/// cleanly on shutdown, or after the crash dump is written).
54pub fn run_monitor() {
55 let dump_dir = PathBuf::from(DUMP_DIR);
56 if let Err(e) = std::fs::create_dir_all(&dump_dir) {
57 eprintln!("[crash-monitor] failed to create dump dir {}: {}", dump_dir.display(), e);
58 // Continue — create_minidump_file will fail gracefully per dump request.
59 }
60 eprintln!(
61 "[crash-monitor] started (pid={}), writing dumps to {}",
62 std::process::id(),
63 dump_dir.display()
64 );
65
66 let socket_name = minidumper::SocketName::path(SOCKET_PATH);
67 let mut server = match minidumper::Server::with_name(socket_name) {
68 Ok(s) => s,
69 Err(e) => {
70 eprintln!("[crash-monitor] failed to bind socket '{}': {}", SOCKET_PATH, e);
71 return;
72 }
73 };
74
75 let shutdown = AtomicBool::new(false);
76 let handler = Box::new(CrashDumpHandler { dump_dir, crash_pid: AtomicU32::new(0) });
77
78 if let Err(e) = server.run(handler, &shutdown, None) {
79 eprintln!("[crash-monitor] server loop ended: {}", e);
80 }
81
82 eprintln!("[crash-monitor] exiting");
83}
84
/// State held by the monitor's `minidumper::ServerHandler` implementation.
struct CrashDumpHandler {
    /// PID of the crashing process, or `0` while none has been recorded.
    /// `on_message` stores it when the client sends `MSG_KIND_CRASH_PID` (which it does
    /// just before `request_dump`), and `create_minidump_file` reads it to name the file.
    crash_pid: AtomicU32,
    /// Directory in which dump files are created.
    dump_dir: PathBuf,
}
92
93impl minidumper::ServerHandler for CrashDumpHandler {
94 fn create_minidump_file(&self) -> Result<(std::fs::File, PathBuf), std::io::Error> {
95 let ts = std::time::SystemTime::now()
96 .duration_since(std::time::UNIX_EPOCH)
97 .unwrap_or_default()
98 .as_secs();
99 // Use the crashing process PID sent via on_message, not std::process::id()
100 // (which would give the monitor's own PID — misleading for diagnosis).
101 let pid = self.crash_pid.load(Ordering::Relaxed);
102 let filename = if pid != 0 {
103 format!("agentmuxsrv-{}-{}.dmp", ts, pid)
104 } else {
105 format!("agentmuxsrv-{}-unknown.dmp", ts)
106 };
107 let path = self.dump_dir.join(&filename);
108 eprintln!("[crash-monitor] creating dump: {}", path.display());
109 let file = std::fs::File::create(&path)?;
110 Ok((file, path))
111 }
112
113 fn on_minidump_created(
114 &self,
115 result: Result<minidumper::MinidumpBinary, minidumper::Error>,
116 ) -> minidumper::LoopAction {
117 match result {
118 Ok(binary) => {
119 eprintln!("[crash-monitor] dump written: {}", binary.path.display());
120 }
121 Err(e) => {
122 eprintln!("[crash-monitor] failed to write dump: {}", e);
123 }
124 }
125 // Continue — keep monitoring in case of multiple clients or future restarts.
126 minidumper::LoopAction::Continue
127 }
128
129 fn on_message(&self, kind: u32, buffer: Vec<u8>) {
130 if kind == MSG_KIND_CRASH_PID {
131 if let Ok(bytes) = <[u8; 4]>::try_from(buffer.as_slice()) {
132 let pid = u32::from_le_bytes(bytes);
133 self.crash_pid.store(pid, Ordering::Relaxed);
134 eprintln!("[crash-monitor] crash pid set to {}", pid);
135 }
136 }
137 }
138
139 fn on_client_disconnected(&self, num_clients: usize) -> minidumper::LoopAction {
140 // Exit when the last client (the main process) disconnects.
141 if num_clients == 0 {
142 minidumper::LoopAction::Exit
143 } else {
144 minidumper::LoopAction::Continue
145 }
146 }
147}
148
149// ─── Main process (client + handler side) ────────────────────────────────────
150
/// Guard that keeps the crash handler installed for the lifetime of the process.
/// Dropping this guard uninstalls the VEH handler.
///
/// Returned by `spawn_and_attach`. Bind it to a named variable such as `_guard`; a bare
/// `let _ = ...` drops the value immediately.
pub struct CrashHandlerGuard {
    // Never read: held only so that the handler is dropped together with the guard.
    _handler: crash_handler::CrashHandler,
}
156
157/// Spawn a crash monitor child process and attach the VEH crash handler in this process.
158///
159/// Returns `Some(guard)` on success. The guard must be kept alive (e.g. as a `let _` binding
160/// at the top of `main`) for the handler to remain active. Returns `None` on any failure —
161/// non-fatal, the process continues without the VEH handler (WER LocalDumps still works).
162pub fn spawn_and_attach() -> Option<CrashHandlerGuard> {
163 // Ensure dump directory exists before spawning the monitor.
164 let _ = std::fs::create_dir_all(DUMP_DIR);
165
166 let exe = match std::env::current_exe() {
167 Ok(p) => p,
168 Err(e) => {
169 eprintln!("[crash-handler] failed to get current exe path: {}", e);
170 return None;
171 }
172 };
173
174 // Spawn monitor process. Null stdin/stdout so it doesn't inherit the sidecar's
175 // stdin reader (which drives the stdin-EOF watchdog in the main process).
176 let child = match std::process::Command::new(&exe)
177 .arg("--crash-monitor")
178 .stdin(std::process::Stdio::null())
179 .stdout(std::process::Stdio::null())
180 .spawn()
181 {
182 Ok(c) => c,
183 Err(e) => {
184 eprintln!("[crash-handler] failed to spawn crash monitor: {}", e);
185 return None;
186 }
187 };
188 eprintln!("[crash-handler] crash monitor spawned (pid={})", child.id());
189
190 // Connect to the monitor's socket with retry — give the server time to start.
191 let socket_name = minidumper::SocketName::path(SOCKET_PATH);
192 let client = connect_with_retry(socket_name, 10, std::time::Duration::from_millis(25));
193 let client = match client {
194 Some(c) => c,
195 None => {
196 eprintln!("[crash-handler] could not connect to crash monitor after retries");
197 return None;
198 }
199 };
200
201 // Verify the connection is healthy before installing the VEH handler.
202 if let Err(e) = client.ping() {
203 eprintln!("[crash-handler] crash monitor ping failed: {}", e);
204 return None;
205 }
206
207 // Install Vectored Exception Handler.
208 //
209 // Safety contract: the closure is called in an exception context (Windows VEH).
210 // Only async-signal-safe operations are allowed:
211 // - `client.request_dump()` uses pre-allocated IPC buffers, no heap allocation.
212 // - No Rust std synchronisation primitives (Mutex/RwLock) are used.
213 // - No log flushing or tracing calls (those may lock).
214 let handler = unsafe {
215 crash_handler::CrashHandler::attach(crash_handler::make_crash_event(
216 move |crash_context: &crash_handler::CrashContext| {
217 // Send our PID to the monitor BEFORE requesting the dump so that
218 // on_message sets crash_pid before create_minidump_file is called.
219 // The send uses a fixed-size stack buffer internally — no heap allocation.
220 let pid_bytes = crash_context.process_id.to_le_bytes();
221 let _ = client.send_message(MSG_KIND_CRASH_PID, &pid_bytes);
222
223 // Best-effort dump request. If the pipe is broken we can't do anything useful.
224 let _ = client.request_dump(crash_context);
225 // Return Handled(false): let the exception continue propagating so that
226 // WER still fires (necessary for __fastfail which bypasses VEH anyway,
227 // and to preserve normal Windows crash reporting for other exception types).
228 crash_handler::CrashEventResult::Handled(false)
229 },
230 ))
231 };
232
233 match handler {
234 Ok(h) => {
235 eprintln!("[crash-handler] VEH handler installed");
236 Some(CrashHandlerGuard { _handler: h })
237 }
238 Err(e) => {
239 eprintln!("[crash-handler] failed to install VEH handler: {}", e);
240 None
241 }
242 }
243}
244
245/// Try to connect to the server socket up to `attempts` times, sleeping `delay` between tries.
246fn connect_with_retry(
247 socket_name: minidumper::SocketName<'_>,
248 attempts: u32,
249 delay: std::time::Duration,
250) -> Option<minidumper::Client> {
251 for i in 0..attempts {
252 match minidumper::Client::with_name(socket_name) {
253 Ok(c) => return Some(c),
254 Err(e) if i + 1 < attempts => {
255 eprintln!("[crash-handler] connect attempt {}/{}: {}", i + 1, attempts, e);
256 std::thread::sleep(delay);
257 }
258 Err(e) => {
259 eprintln!("[crash-handler] connect attempt {}/{} failed: {}", i + 1, attempts, e);
260 }
261 }
262 }
263 None
264}