agentmux_cef/
sidecar.rs

1// Copyright 2026, AgentMux Corp.
2// SPDX-License-Identifier: Apache-2.0
3//
4// Sidecar spawning and management for the CEF host.
5// Ported from src-tauri/src/sidecar/ using std::process instead of tauri-plugin-shell.
6
7use std::io::BufRead;
8use std::sync::Arc;
9
10use crate::state::AppState;
11
/// Connection details for a running agentmux-srv backend.
///
/// Produced both by `spawn_backend` (parsed from the backend's
/// `AGENTMUXSRV-ESTART` line) and by `use_launcher_endpoints` (read from
/// the environment the launcher passed), so callers can treat the two
/// startup paths the same way.
#[derive(Debug, Clone)]
pub struct BackendSpawnResult {
    /// WebSocket endpoint: the `ws:` ESTART field, or `AGENTMUX_BACKEND_WS`.
    pub ws_endpoint: String,
    /// Web endpoint: the `web:` ESTART field, or `AGENTMUX_BACKEND_WEB`.
    pub web_endpoint: String,
    /// Version string: the `version:` ESTART field, or the host's own
    /// `CARGO_PKG_VERSION` on the launcher-managed path.
    pub version: String,
    /// Instance id: the `instance:` ESTART field, or `AGENTMUX_INSTANCE_ID`.
    pub instance_id: String,
}
20
21/// Phase B.1: when the launcher has already spawned srv (env var
22/// `AGENTMUX_BACKEND_WS` is set), populate `state` from the env vars
23/// the launcher passed and return Ok — no need to spawn srv ourselves.
24///
25/// Returns `Ok(BackendSpawnResult)` mirroring the shape of
26/// `spawn_backend` so the caller in main.rs is symmetric. Returns
27/// `Err` only if env vars are present but malformed; if they're
28/// absent the caller should fall through to `spawn_backend` (the
29/// `task dev` path where the launcher isn't in the loop).
30pub fn use_launcher_endpoints(
31    state: &Arc<AppState>,
32) -> Option<Result<BackendSpawnResult, String>> {
33    // The env var being absent means "host is running standalone";
34    // fall through to spawn_backend. The env var being PRESENT but
35    // empty means the launcher set it (we're in launcher-managed
36    // mode) but with a malformed value — likely an ESTART parse
37    // mismatch in the launcher. Treat that as Err, NOT as
38    // "fall back to spawn_backend" — the latter would spawn a
39    // SECOND srv against the same data dir while the launcher's
40    // already-running srv keeps going, corrupting state. Better to
41    // fail fast and surface the launcher bug. (codex P2 @
42    // sidecar.rs:35, PR #571 round-4.)
43    let ws = std::env::var("AGENTMUX_BACKEND_WS").ok()?;
44    if ws.is_empty() {
45        return Some(Err(
46            "launcher set AGENTMUX_BACKEND_WS but it was empty — \
47             refusing to fall back to spawn_backend (would create a \
48             duplicate srv against the same data dir). This is a \
49             launcher bug; check the AGENTMUXSRV-ESTART parse path in \
50             agentmux-launcher/src/srv_spawner.rs::parse_estart."
51                .to_string(),
52        ));
53    }
54    // From here on we KNOW the launcher provided env; missing fields
55    // mean a launcher bug, so emit Err rather than fall through to
56    // the dev-mode spawn (which would fight the launcher's srv).
57    let try_get = |key: &str| -> Result<String, String> {
58        std::env::var(key).map_err(|_| format!("launcher set AGENTMUX_BACKEND_WS but not {}", key))
59    };
60    let result = (|| -> Result<BackendSpawnResult, String> {
61        let web = try_get("AGENTMUX_BACKEND_WEB")?;
62        let pid_str = try_get("AGENTMUX_BACKEND_PID")?;
63        let pid: u32 = pid_str
64            .parse()
65            .map_err(|_| format!("AGENTMUX_BACKEND_PID not a u32: {}", pid_str))?;
66        let auth_key = try_get("AGENTMUX_AUTH_KEY")?;
67        let instance_id = try_get("AGENTMUX_INSTANCE_ID")?;
68        let data_dir = try_get("AGENTMUX_DATA_DIR")?;
69        let config_dir = try_get("AGENTMUX_CONFIG_DIR")?;
70        // Frontend "user home" → the AgentMux root (`~/.agentmux/`).
71        // The frontend's `agentmuxHome()` appends multiple suffix
72        // patterns including some that are EXPLICITLY versioned
73        // (`/instances/v<version>/cli/<provider>`), so user_home_dir
74        // must be the account-wide root, NOT a per-version subdir.
75        // We pull it via DataPaths::from_env which re-derives the
76        // root consumer-side (it isn't an env var).
77        let user_home_dir = agentmux_common::DataPaths::from_env()
78            .ok_or_else(|| "DataPaths::from_env() failed inside use_launcher_endpoints".to_string())?
79            .home_dir
80            .to_string_lossy()
81            .to_string();
82
83        // Populate AppState in the same shape `spawn_backend` would.
84        // Notably we do NOT take ownership of a Child handle (launcher
85        // owns it) and we do NOT create a Job Object (launcher's J0
86        // already covers srv via assignment, not inheritance).
87        *state.auth_key.lock() = auth_key;
88        *state.backend_pid.lock() = Some(pid);
89        *state.backend_started_at.lock() = Some(chrono::Utc::now().to_rfc3339());
90        *state.version_data_dir.lock() = Some(data_dir);
91        *state.version_config_dir.lock() = Some(config_dir);
92        *state.user_home_dir.lock() = Some(user_home_dir);
93
94        let version = env!("CARGO_PKG_VERSION").to_string();
95        Ok(BackendSpawnResult {
96            ws_endpoint: ws,
97            web_endpoint: web,
98            version,
99            instance_id,
100        })
101    })();
102    Some(result)
103}
104
105/// Spawn the agentmux-srv backend sidecar and wait for it to signal
106/// readiness via a `AGENTMUXSRV-ESTART` line on stderr (30s timeout).
107pub async fn spawn_backend(state: &Arc<AppState>) -> Result<BackendSpawnResult, String> {
108    tracing::info!("spawn_backend() called");
109
110    // 1. Resolve directories.
111    //
112    // Two reachable paths:
113    //   a) The `restart_backend` RPC after launcher-managed startup —
114    //      `DataPaths::from_env()` succeeds because the host inherits
115    //      the launcher's env.
116    //   b) Standalone `task dev` (host launched directly by `task dev`,
117    //      no launcher in the loop) — env vars absent. We re-derive
118    //      via the same `agentmux_common` API the launcher would have
119    //      used, so the disk layout is identical. This is functionally
120    //      a fallback the launcher would otherwise have produced.
121    let current_version = env!("CARGO_PKG_VERSION");
122    // Mirror the env-isolation guard from main.rs: a dev build never
123    // trusts inherited AGENTMUX_* env, even on the restart_backend path
124    // where main.rs has already resolved paths once. Otherwise the
125    // sidecar would re-spawn against a stale parent's data dir.
126    let host_exe_dir = std::env::current_exe()
127        .map_err(|e| format!("current_exe() failed: {}", e))?;
128    let host_exe_dir = host_exe_dir
129        .parent()
130        .ok_or_else(|| "current_exe has no parent".to_string())?;
131    let paths = if agentmux_common::is_dev_build_exe(host_exe_dir) {
132        let mode = agentmux_common::RuntimeMode::current_path_only(host_exe_dir);
133        // resolve_path_only mirrors current_path_only's env-isolation:
134        // ignore inherited AGENTMUX_CHANNEL so a re-spawned dev sidecar
135        // doesn't redirect into a parent agentmux instance's channel
136        // (would trip the channel single-instance lock).
137        agentmux_common::DataPaths::resolve_path_only(current_version, &mode)?
138    } else {
139        match agentmux_common::DataPaths::from_env() {
140            Some(p) => p,
141            None => {
142                // No launcher env — derive from the host's own vantage.
143                // Mode detection works at this point even though the host
144                // exe lives inside `runtime/`, because portable detection
145                // uses the `agentmux-portable.marker` (not `runtime/`).
146                let mode = agentmux_common::RuntimeMode::current(host_exe_dir);
147                agentmux_common::DataPaths::resolve(current_version, &mode)?
148            }
149        }
150    };
151    let version_instance_id = format!("v{}", current_version);
152    let data_dir = paths.data_dir.clone();
153    let config_dir = paths.config_dir.clone();
154
155    tracing::info!(
156        runtime_mode = ?paths.mode.to_env_string(),
157        "Using data_dir: {}",
158        data_dir.display()
159    );
160    tracing::info!("Using config_dir: {}", config_dir.display());
161
162    // 2. Ensure directory tree (idempotent; launcher already did this
163    // at startup but we rerun in case the dir got nuked between launch
164    // and restart).
165    paths
166        .ensure_dirs()
167        .map_err(|e| format!("Failed to ensure data dirs: {}", e))?;
168
169    // Store version-specific paths in AppState for frontend IPC commands.
170    *state.version_data_dir.lock() = Some(data_dir.to_string_lossy().to_string());
171    *state.version_config_dir.lock() = Some(config_dir.to_string_lossy().to_string());
172    // Frontend "user home" → the AgentMux root (`~/.agentmux/`).
173    // The frontend appends multiple suffix patterns including some
174    // explicitly versioned (`/instances/v<version>/cli/<provider>`),
175    // so the value must be the account-wide root, not a per-version
176    // subdir.
177    *state.user_home_dir.lock() = Some(paths.home_dir.to_string_lossy().to_string());
178
179    // 3. Resolve the backend binary path
180    let backend_name = "agentmux-srv";
181    let exe_suffix = if cfg!(windows) { ".exe" } else { "" };
182
183    let backend_path = resolve_backend_binary(backend_name, exe_suffix)?;
184    tracing::info!("Using backend binary: {}", backend_path.display());
185
186    // 4. Resolve AGENTMUX_APP_PATH
187    let app_path = std::env::current_exe()
188        .ok()
189        .and_then(|p| p.parent().map(|d| d.to_path_buf()))
190        .unwrap_or_default();
191
192    let app_path_str = app_path.to_string_lossy().to_string();
193
194    // wsh has been retired — see specs/SPEC_RETIRE_WSH_2026_04_12.md.
195    // No binary to deploy anymore.
196
197    // Spawn the process
198    let auth_key = state.auth_key.lock().clone();
199    tracing::info!(
200        "Spawning agentmux-srv with auth key: {}...",
201        &auth_key[..8.min(auth_key.len())]
202    );
203
204    let mut cmd = std::process::Command::new(&backend_path);
205    cmd.args([
206        "--wavedata",
207        &data_dir.to_string_lossy(),
208        "--instance",
209        &version_instance_id,
210    ])
211    .env("AGENTMUX_AUTH_KEY", &auth_key)
212    // Canonical AGENTMUX_* env vars from `DataPaths::to_env_vars()`.
213    // Replaces the pre-unification AGENTMUX_DATA_HOME / CONFIG_HOME /
214    // SETTINGS_DIR / DEV set, which is no longer set by anyone in the
215    // chain (launcher → host, host → srv).
216    .envs(paths.to_env_vars())
217    .env("AGENTMUX_APP_PATH", &app_path_str)
218    .stdin(std::process::Stdio::piped())
219    .stdout(std::process::Stdio::piped())
220    .stderr(std::process::Stdio::piped());
221
222    #[cfg(windows)]
223    {
224        use std::os::windows::process::CommandExt;
225        cmd.creation_flags(0x08000000); // CREATE_NO_WINDOW
226    }
227
228    let mut child = cmd
229        .spawn()
230        .map_err(|e| format!("Failed to spawn agentmux-srv: {}", e))?;
231
232    let child_pid = child.id();
233    tracing::info!("Backend spawned with PID: {}", child_pid);
234
235    // 7. Store PID and start time
236    *state.backend_pid.lock() = Some(child_pid);
237    *state.backend_started_at.lock() = Some(chrono::Utc::now().to_rfc3339());
238
239    // (Phase B.1: removed host-owned Job Object J1 on srv. The
240    // launcher's J0 covers srv via direct AssignProcessToJobObject.
241    //
242    // Why not "defense-in-depth": the spec originally suggested
243    // keeping host's J1 as a backstop, but on inspection it would
244    // ACTIVELY defeat B.1's goal. KILL_ON_JOB_CLOSE on J1 fires
245    // when the host process exits — taking srv with it. After B.1,
246    // we want srv to survive host crashes (so the launcher can
247    // restart the host without losing srv state). The two reapers
248    // are mutually exclusive given that goal: launcher's J0 wants
249    // srv to live; host's J1 wants srv to die. We picked J0.
250    //
251    // The `task dev` fallback path (host runs without launcher)
252    // still loses kernel-level reaping for srv on host crash.
253    // That's an accepted regression for dev-mode-only — production
254    // (portable / installed) is unaffected. Phase 7 may add a
255    // separate Job Object for the dev path.)
256
257    // 9. Parse stderr for ESTART (30s timeout)
258    let stderr = child.stderr.take().expect("Failed to get stderr");
259
260    // Take ownership of stdout for logging
261    let stdout = child.stdout.take();
262
263    // Store the child handle
264    *state.sidecar_child.lock() = Some(child);
265
266    // Spawn stdout reader
267    if let Some(stdout) = stdout {
268        std::thread::spawn(move || {
269            let reader = std::io::BufReader::new(stdout);
270            for line in reader.lines() {
271                match line {
272                    Ok(l) => tracing::info!("[agentmux-srv stdout] {}", l),
273                    Err(_) => break,
274                }
275            }
276        });
277    }
278
279    // Parse ESTART from stderr
280    let (tx, mut rx) = tokio::sync::mpsc::channel::<BackendSpawnResult>(1);
281    let state_for_monitor = state.clone();
282
283    std::thread::spawn(move || {
284        let reader = std::io::BufReader::new(stderr);
285        let mut estart_received = false;
286        for line in reader.lines() {
287            match line {
288                Ok(l) => {
289                    if l.starts_with("AGENTMUXSRV-ESTART") {
290                        let result = parse_estart(&l);
291                        tracing::info!(
292                            "Backend started: ws={} web={} version={} instance={}",
293                            result.ws_endpoint,
294                            result.web_endpoint,
295                            result.version,
296                            result.instance_id
297                        );
298                        estart_received = true;
299                        let _ = tx.blocking_send(result);
300                    } else if let Some(event_data) = l.strip_prefix("AGENTMUXSRV-EVENT:") {
301                        tracing::debug!("Backend event: {}", event_data);
302                        // Forward events to the frontend
303                        if let Ok(payload) = serde_json::from_str::<serde_json::Value>(event_data)
304                        {
305                            crate::events::emit_event_from_state(
306                                &state_for_monitor,
307                                "agentmuxsrv-event",
308                                &payload,
309                            );
310                        } else {
311                            crate::events::emit_event_from_state(
312                                &state_for_monitor,
313                                "agentmuxsrv-event",
314                                &serde_json::json!(event_data),
315                            );
316                        }
317                    } else {
318                        tracing::info!("[agentmux-srv] {}", l);
319                    }
320                }
321                Err(_) => break,
322            }
323        }
324
325        // Process exited — emit backend-terminated
326        let pid = state_for_monitor.backend_pid.lock().unwrap_or(0);
327        if estart_received {
328            tracing::error!(
329                "[agentmux-srv] RUNTIME CRASH — pid={}",
330                pid
331            );
332        } else {
333            tracing::error!(
334                "[agentmux-srv] STARTUP CRASH — terminated before ready (pid={})",
335                pid
336            );
337        }
338
339        let payload = serde_json::json!({
340            "pid": pid,
341        });
342        crate::events::emit_event_from_state(
343            &state_for_monitor,
344            "backend-terminated",
345            &payload,
346        );
347    });
348
349    // Wait for ESTART with 30s timeout
350    let result = tokio::time::timeout(std::time::Duration::from_secs(30), rx.recv())
351        .await
352        .map_err(|_| "Timeout waiting for agentmux-srv to start (30s)".to_string())?
353        .ok_or_else(|| "agentmux-srv channel closed before sending endpoints".to_string())?;
354
355    tracing::info!(
356        "Backend successfully started: ws={} web={} version={} instance={}",
357        result.ws_endpoint,
358        result.web_endpoint,
359        result.version,
360        result.instance_id
361    );
362
363    Ok(result)
364}
365
366/// Resolve the backend binary path.
367///
368/// The CEF host lives in `runtime/` alongside the backend binary in portable builds,
369/// so `exe_dir` IS the runtime directory. Search order:
370///   1. Same dir as CEF host: {name}-{version}-{os}.{arch}.exe (versioned portable)
371///   2. Same dir as CEF host: {name}.exe (dev mode — cargo build output)
372///   3. Workspace dist/bin/: {name}-{version}-{os}.{arch}.exe
373///   4. Workspace dist/bin/: {name}.exe (plain fallback)
374fn resolve_backend_binary(
375    backend_name: &str,
376    exe_suffix: &str,
377) -> Result<std::path::PathBuf, String> {
378    let exe_path = std::env::current_exe()
379        .map_err(|e| format!("Failed to get current exe: {}", e))?;
380    let exe_dir = exe_path.parent().unwrap();
381    let version = env!("CARGO_PKG_VERSION");
382
383    tracing::info!("resolve_backend_binary: exe_dir={:?}, version={}", exe_dir, version);
384
385    let (os_name, arch) = if cfg!(target_os = "macos") {
386        ("darwin", if cfg!(target_arch = "aarch64") { "arm64" } else { "x64" })
387    } else if cfg!(target_os = "linux") {
388        ("linux", if cfg!(target_arch = "aarch64") { "arm64" } else { "x64" })
389    } else {
390        ("windows", if cfg!(target_arch = "aarch64") { "arm64" } else { "x64" })
391    };
392
393    // 1. Versioned binary in same directory as CEF host (portable layout)
394    //    e.g. runtime/agentmux-srv-0.33.37-windows.x64.exe
395    let versioned = exe_dir.join(format!(
396        "{}-{}-{}.{}{}", backend_name, version, os_name, arch, exe_suffix
397    ));
398    if versioned.exists() {
399        tracing::info!("Using versioned {} at: {:?}", backend_name, versioned);
400        return Ok(versioned);
401    }
402
403    // 2. Plain binary in same directory (dev mode — cargo build output)
404    //    e.g. target/release/agentmux-srv.exe
405    let plain = exe_dir.join(format!("{}{}", backend_name, exe_suffix));
406    if plain.exists() {
407        tracing::info!("Using dev-mode {} at: {:?}", backend_name, plain);
408        return Ok(plain);
409    }
410
411    // 3. Workspace dist/bin/ (for `task dev` / `task cef:run`)
412    let dist_bin = exe_dir.parent()
413        .and_then(|p| p.parent())
414        .map(|ws| ws.join("dist").join("bin"));
415
416    if let Some(ref dist_bin) = dist_bin {
417        let dist_versioned = dist_bin.join(format!(
418            "{}-{}-{}.{}{}", backend_name, version, os_name, arch, exe_suffix
419        ));
420        if dist_versioned.exists() {
421            tracing::info!("Using dist {} at: {:?}", backend_name, dist_versioned);
422            return Ok(dist_versioned);
423        }
424
425        let dist_plain = dist_bin.join(format!("{}{}", backend_name, exe_suffix));
426        if dist_plain.exists() {
427            tracing::info!("Using dist {} at: {:?}", backend_name, dist_plain);
428            return Ok(dist_plain);
429        }
430    }
431
432    // Diagnostic: list exe_dir contents
433    let dir_listing = std::fs::read_dir(exe_dir)
434        .map(|entries| {
435            entries
436                .filter_map(|e| e.ok())
437                .map(|e| e.file_name().to_string_lossy().to_string())
438                .filter(|n| n.contains("agentmux") || n.contains("srv"))
439                .collect::<Vec<_>>()
440                .join(", ")
441        })
442        .unwrap_or_else(|_| "unreadable".to_string());
443
444    let dist_info = dist_bin
445        .map(|d| format!("dist/bin: {:?}", d))
446        .unwrap_or_else(|| "dist/bin: N/A (no workspace root)".to_string());
447
448    Err(format!(
449        "Backend binary '{}' not found (version {}).\n\
450         exe_dir: {:?}\n\
451         Searched:\n\
452         \x20 1. {:?} (versioned, same dir)\n\
453         \x20 2. {:?} (plain, dev mode)\n\
454         \x20 3. {}\n\
455         Relevant files in exe_dir: [{}]",
456        backend_name, version, exe_dir, versioned, plain, dist_info, dir_listing
457    ))
458}
459
460/// Parse the key=value fields out of a `AGENTMUXSRV-ESTART` line.
461fn parse_estart(line: &str) -> BackendSpawnResult {
462    let parts: Vec<&str> = line.split_whitespace().collect();
463    let get = |prefix: &str| -> String {
464        parts
465            .iter()
466            .find_map(|p| p.strip_prefix(prefix))
467            .unwrap_or_default()
468            .to_string()
469    };
470    BackendSpawnResult {
471        ws_endpoint: get("ws:"),
472        web_endpoint: get("web:"),
473        version: get("version:"),
474        instance_id: get("instance:"),
475    }
476}
477
478// Phase B.1: removed `create_job_object_for_child`. Host no longer
479// owns a Job Object; launcher's J0 (in agentmux-launcher/src/main.rs)
480// covers srv via direct AssignProcessToJobObject. The same windows-sys
481// FFI pattern lives in the launcher now.