agentmux_srv\sagas/
mod.rs

1// Copyright 2026, AgentMux Corp.
2// SPDX-License-Identifier: Apache-2.0
3//
4// Phase E.5.5 — srv-side saga coordinator.
5//
6// **Why srv, not launcher:** the existing E.1a coordinator
7// framework lives in `agentmux-launcher::saga` (which still does
8// nothing — no consumers). The original Phase E spec assumed
9// sagas would fan out across host/launcher/srv via cross-process
10// IPC; the actual implementation kept that fan-out in the frontend
11// (`requestTearOff` calls srv-rpc and host-rpc directly), so every
12// saga in the E.5 plan mutates only srv state. In-process oneshot
13// dispatch beats an IPC round-trip on every saga step. See
14// `docs/retro/saga-coordinator-location-analysis-2026-04-30.md` for
15// the full reasoning, including the robustness trade-offs (which
16// are the same for both placements).
17//
18// **Shape:** sagas are async functions that:
19//   1. allocate a fresh saga_id via `alloc_saga_id`,
20//   2. emit `Event::SagaStarted` via `emit_saga_started`,
21//   3. drive their state machine via `SagaCtx::dispatch` /
22//      `SagaCtx::compensate`,
23//   4. emit `Event::SagaCompleted` or `Event::SagaFailed` via
24//      `emit_terminal` once the inner work returns.
25//
// `run_saga(name, future)` is a thin wrapper that only applies the
// 5 s timeout; the saga itself performs 1, 2 and 4 around it. Sagas
// pass the future directly (not a closure), avoiding the
// lifetime-of-SagaCtx complication that closure-style coordinators
// run into.
30//
31// **Compensation:** the saga's inner future is responsible for
32// driving compensation before returning `Err`. `SagaCtx::compensate`
33// is a best-effort dispatch that swallows errors (the saga is
34// already failing; secondary failures get logged). Idempotency of
35// compensating commands (`MoveTab` back to source, `DeleteWorkspace`,
36// etc.) keeps the cleanup safe even if a step partially applied.
37//
38// What this module does NOT close (per the location analysis §4.2):
39// * Per-step SQLite transactions in the subscriber (gap; F1.A).
40// * Host pool-promote and renderer registration outside the saga
41//   (gap; Phase F).
42// * Saga state across srv restart (gap; Phase F+).
43
44pub mod delete_block;
45pub mod delete_tab;
46pub mod delete_workspace;
47pub mod log;
48pub mod promote_block_to_tab;
49pub mod recovery;
50pub mod redock_floating_pane;
51pub mod restore_torn_off_tab;
52pub mod tear_off_block;
53pub mod tear_off_tab;
54
55// Step 7 — E.7 integration tests. Cross-saga end-to-end coverage
56// that exercises reducer + saga coordinator + persist subscriber +
57// saga log together against a real `AppState` (in-memory wstore +
58// sagalog). Per-saga unit tests under each saga module already cover
59// happy + reject paths in isolation; this module focuses on
60// multi-surface consistency (reducer/wstore/saga-log) that PR 2's
61// `compensate_unresolved` will rely on.
62#[cfg(test)]
63mod integration_tests;
64
65use std::sync::atomic::{AtomicU32, Ordering};
66
67use agentmux_common::ipc::{Command, Event};
68use serde_json::Value;
69
70use crate::sagas::log::{command_discriminant_name, SagaOutcome};
71use crate::server::AppState;
72
/// Wall-clock budget for one saga. Once the inner future exceeds
/// this, the coordinator stops waiting and reports the saga as
/// timed out.
///
/// Tear-off sagas normally finish in tens of milliseconds; five
/// seconds is deliberately generous so SQLite write spikes do not
/// cause flapping in CI.
const SAGA_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(5_000);
78
79/// Read-only context passed to a saga's inner async function.
80/// Wraps the AppState handle and the saga's allocated id.
81///
82/// Construct via [`SagaCtx::new`] — the durability log requires the
83/// per-step counter to start at zero and be owned by the ctx (so
84/// concurrent sagas don't interleave step indices).
85pub struct SagaCtx<'a> {
86    pub(crate) state: &'a AppState,
87    pub(crate) saga_id: u64,
88    /// Monotonic step index (0, 1, 2, ...) for this saga. Each
89    /// `dispatch` / `compensate` call `fetch_add(1)`s and writes the
90    /// resulting index into the saga log. Atomic because saga inner
91    /// futures may parallelise dispatches in the future (today they
92    /// don't, but the cost is one cache line).
93    pub(crate) step_index: AtomicU32,
94    /// (codex P1 PR #636 round 4.) Stack of forward-step indices that
95    /// have completed successfully and are eligible to be undone by
96    /// the next `compensate` call. `dispatch` pushes on success;
97    /// `compensate` pops to determine which original forward step
98    /// it's reversing, and marks that step `compensated` in the log.
99    /// Without this, in-process compensation only writes new
100    /// `compensated` rows at fresh indices; the original `succeeded`
101    /// rows stay `succeeded`, so resume-on-restart re-replays them
102    /// and either no-ops or worse double-applies the inverse.
103    ///
104    /// `Mutex<Vec>` (rather than a lock-free counter) because saga
105    /// inner futures could in theory parallelize compensations; in
106    /// practice they're serial today, so contention is zero.
107    pub(crate) forward_step_stack: tokio::sync::Mutex<Vec<u32>>,
108}
109
110impl<'a> SagaCtx<'a> {
111    /// Construct a fresh context for a saga that has just allocated
112    /// its `saga_id` (via [`alloc_saga_id`]).
113    pub fn new(state: &'a AppState, saga_id: u64) -> Self {
114        Self {
115            state,
116            saga_id,
117            step_index: AtomicU32::new(0),
118            forward_step_stack: tokio::sync::Mutex::new(Vec::new()),
119        }
120    }
121
122    /// Saga-id this context belongs to. Used by sagas that need to
123    /// log progress with the saga prefix.
124    #[allow(dead_code)]
125    pub fn saga_id(&self) -> u64 {
126        self.saga_id
127    }
128
129    /// Acquire the reducer's state lock for read-only inspection.
130    /// Used by sagas that need to inspect post-step state to decide
131    /// the next step (e.g. RestoreTornOffTab checking whether the
132    /// source workspace is now empty before issuing the cascade
133    /// delete). Hold briefly — the reducer is single-mutex.
134    pub async fn state_lock(&self) -> tokio::sync::MutexGuard<'_, crate::state::State> {
135        self.state.srv_state.lock().await
136    }
137
138    /// Dispatch `cmd` through the srv reducer and apply the emitted
139    /// events to SQLite + the broadcast bus, exactly like the
140    /// in-handler reducer-dispatch helpers.
141    ///
142    /// Returns the emitted event vec on success. If the reducer
143    /// emits any `Event::Error`, the error message is returned and
144    /// SQLite/bus side-effects are skipped — the caller must then
145    /// dispatch compensation for the saga's already-applied steps.
146    pub async fn dispatch(&self, cmd: Command) -> Result<Vec<Event>, String> {
147        // Saga durability — write a `pending` step row before
148        // dispatch so a crash mid-dispatch leaves a recoverable
149        // breadcrumb (PR 2's compensate-on-restart will see it).
150        let idx = self.step_index.fetch_add(1, Ordering::Relaxed);
151        let step_name = command_discriminant_name(&cmd);
152        if let Err(e) = self
153            .state
154            .saga_log
155            .start_step(self.saga_id, idx, &step_name, &cmd)
156        {
157            // Log-write failure is non-fatal: the in-memory saga
158            // path is still authoritative for THIS srv run; we lose
159            // crash-recovery for this step, but the user's command
160            // shouldn't fail because durability hiccupped.
161            tracing::warn!(
162                saga_id = self.saga_id,
163                step_index = idx,
164                "[saga] start_step log write failed: {} — continuing without durable log for this step",
165                e
166            );
167        }
168
169        let events = crate::server::service::dispatch_to_reducer(self.state, cmd).await;
170        if let Some(message) = events.iter().find_map(|e| match e {
171            Event::Error { message, .. } => Some(message.clone()),
172            _ => None,
173        }) {
174            if let Err(e) = self.state.saga_log.fail_step(self.saga_id, idx, &message) {
175                tracing::warn!(
176                    saga_id = self.saga_id,
177                    step_index = idx,
178                    "[saga] fail_step log write failed: {}",
179                    e
180                );
181            }
182            return Err(message);
183        }
184        for ev in &events {
185            if let Err(e) = crate::persist_subscriber::apply_event_to_wstore(ev, &self.state.wstore)
186            {
187                // (reagent P1 PR #631 round 2) Mark the step as
188                // failed in the durable log BEFORE returning. Without
189                // this, the step row stays in `pending` state even
190                // though the reducer already applied the command
191                // (line 139); PR 2's compensate-on-restart sees a
192                // `pending` step and can't determine whether the
193                // command was applied.
194                let err_msg = e.to_string();
195                if let Err(log_err) =
196                    self.state.saga_log.fail_step(self.saga_id, idx, &err_msg)
197                {
198                    tracing::warn!(
199                        saga_id = self.saga_id,
200                        step_index = idx,
201                        "[saga] fail_step log write failed during wstore-apply error path: {}",
202                        log_err,
203                    );
204                }
205                return Err(err_msg);
206            }
207        }
208        if let Err(e) = self.state.saga_log.finish_step(self.saga_id, idx, &events) {
209            tracing::warn!(
210                saga_id = self.saga_id,
211                step_index = idx,
212                "[saga] finish_step log write failed: {}",
213                e
214            );
215        }
216        // (codex P1 PR #636 round 4.) Track this idx as a successful
217        // forward step eligible for compensation. The next
218        // `compensate` call will pop this and mark the original step
219        // `compensated`, preventing resume-on-restart from re-replaying
220        // an inverse that already ran in-process.
221        self.forward_step_stack.lock().await.push(idx);
222        crate::server::service::publish_events(self.state, &events);
223        Ok(events)
224    }
225
226    /// Best-effort compensating dispatch. Same as `dispatch` but
227    /// SQLite-write failures are logged and swallowed. Intended for
228    /// the unwind path: the saga is already returning an error to
229    /// the caller; throwing on cleanup hides the original cause and
230    /// prevents subsequent compensating commands from running.
231    pub async fn compensate(&self, cmd: Command) {
232        // Compensation gets its own step row so the durable log
233        // distinguishes "step that succeeded forward" from "step
234        // that ran in unwind". Index continues monotonically from
235        // forward steps so `--diag sagas` shows the full sequence.
236        let idx = self.step_index.fetch_add(1, Ordering::Relaxed);
237        let step_name = command_discriminant_name(&cmd);
238        if let Err(e) = self
239            .state
240            .saga_log
241            .start_step(self.saga_id, idx, &step_name, &cmd)
242        {
243            tracing::warn!(
244                saga_id = self.saga_id,
245                step_index = idx,
246                "[saga] compensate start_step log write failed: {}",
247                e
248            );
249        }
250        let events =
251            crate::server::service::dispatch_to_reducer(self.state, cmd.clone()).await;
252        if let Some(message) = events.iter().find_map(|e| match e {
253            Event::Error { message, .. } => Some(message.clone()),
254            _ => None,
255        }) {
256            tracing::warn!(
257                saga_id = self.saga_id,
258                "[saga] compensation rejected by reducer: {} (cmd discriminant: {:?})",
259                message,
260                std::mem::discriminant(&cmd),
261            );
262            if let Err(e) = self.state.saga_log.fail_step(self.saga_id, idx, &message) {
263                tracing::warn!(
264                    saga_id = self.saga_id,
265                    step_index = idx,
266                    "[saga] compensate fail_step log write failed: {}",
267                    e
268                );
269            }
270            return;
271        }
272        for ev in &events {
273            if let Err(e) =
274                crate::persist_subscriber::apply_event_to_wstore(ev, &self.state.wstore)
275            {
276                tracing::warn!(
277                    saga_id = self.saga_id,
278                    "[saga] compensation: SQLite write failed: {}",
279                    e
280                );
281            }
282        }
283        if let Err(e) = self
284            .state
285            .saga_log
286            .compensate_step(self.saga_id, idx, &events)
287        {
288            tracing::warn!(
289                saga_id = self.saga_id,
290                step_index = idx,
291                "[saga] compensate_step log write failed: {}",
292                e
293            );
294        }
295        // (codex P1 PR #636 round 4.) Pop the most-recent successful
296        // forward step from the stack and mark its original log row
297        // as compensated. This prevents resume-on-restart from
298        // double-replaying the inverse of a step that already had
299        // in-process compensation. Idempotent — UPDATE only matches
300        // rows still in `succeeded` state.
301        if let Some(forward_idx) = self.forward_step_stack.lock().await.pop() {
302            if let Err(e) = self
303                .state
304                .saga_log
305                .mark_step_compensated(self.saga_id, forward_idx)
306            {
307                tracing::warn!(
308                    saga_id = self.saga_id,
309                    forward_step_index = forward_idx,
310                    "[saga] mark_step_compensated (live) log write failed: {} — restart may re-replay this inverse",
311                    e
312                );
313            }
314        }
315        crate::server::service::publish_events(self.state, &events);
316    }
317}
318
319/// Allocate the next saga_id. Monotonic per srv-process run.
320pub fn alloc_saga_id(state: &AppState) -> u64 {
321    state.saga_id_alloc.fetch_add(1, Ordering::Relaxed) + 1
322}
323
324/// Emit `Event::SagaStarted` for a freshly-allocated saga_id.
325/// Sagas call this immediately after `alloc_saga_id` so subscribers
326/// see the start record before any per-step events.
327///
328/// Also writes a `running` row to the durable saga log (PR 1 of
329/// SPEC_SAGA_DURABILITY_2026-05-01.md), recording `input` as the
330/// saga's arguments serialized to JSON. PR 2's `compensate_unresolved`
331/// + `--diag sagas` rely on this for crash-recovery provenance, so
332/// callers should pass a structured representation of their inputs
333/// (typically `serde_json::json!({...})`). (reagent P1 PR #631 —
334/// `Value::Null` placeholder erased provenance.)
335///
336/// **Fail-fast on log error.** (codex P1 PR #631 round 2.) If
337/// `start_saga` fails — most likely a UNIQUE constraint violation
338/// from a saga_id collision — the saga MUST NOT proceed. Otherwise
339/// later `terminate()` calls would `UPDATE saga SET ... WHERE saga_id=?`
340/// against a *different run's* row, mixing lifecycle data across
341/// sagas and silently corrupting the durability log. Returning
342/// `Err` here propagates up to the caller, which records the
343/// failure via `emit_terminal` (with a fresh saga_id allocated by
344/// the caller's `alloc_saga_id` retry path, if any).
345pub async fn emit_saga_started(
346    state: &AppState,
347    saga_id: u64,
348    name: &'static str,
349    input: serde_json::Value,
350) -> Result<(), String> {
351    if let Err(e) = state.saga_log.start_saga(saga_id, name, &input) {
352        let msg = format!(
353            "saga durable start row insert failed for saga_id={}: {} (likely ID collision; refusing to run)",
354            saga_id, e
355        );
356        tracing::error!(
357            saga_id,
358            name,
359            "[saga] {} — aborting saga to avoid corrupting prior run's lifecycle row",
360            msg,
361        );
362        return Err(msg);
363    }
364    let v = state.srv_state.lock().await.bump_version();
365    let _ = state.srv_events_tx.send(Event::SagaStarted {
366        saga_id,
367        name: name.to_string(),
368        version: v,
369    });
370    Ok(())
371}
372
/// Terminal outcome of a saga, as handed to `emit_terminal`.
///
/// (codex P1 PR #631) The original PR 1 implementation mapped every
/// `Err` to `SagaOutcome::Compensated`. That is wrong for timeout /
/// abort paths: `run_saga` wraps the inner future in
/// `tokio::time::timeout`, and a timeout cancels the future *before*
/// it can run its compensation block. Recording "compensated" when
/// nothing was compensated would hide partially-applied state from
/// PR 2's `compensate_unresolved` resume scan — exactly the failure
/// mode the durable log exists to catch.
///
/// Use `Compensated` only when the saga's unwind path actually ran.
/// Everything else (timeout, panic-converted-to-error,
/// pre-compensation early-return) is `Failed`, which keeps the saga
/// visible to the resume scan.
///
/// The type is a cheap `Copy` value: every payload is a borrowed
/// `&str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SagaTerminal<'a> {
    /// All steps applied successfully.
    Completed,
    /// The saga failed and its compensation block ran. Note that
    /// `SagaCtx::compensate` is best-effort and returns nothing, so
    /// this states that the unwind was driven to the end, not that
    /// every compensating command was accepted.
    Compensated { reason: &'a str },
    /// The saga aborted before or during compensation: timeout,
    /// panic, pre-compensation early-return, or any other path
    /// where compensation cannot be assumed to have run. This is
    /// the right choice whenever it is unknown whether compensation
    /// completed.
    Failed { reason: &'a str },
}
401
402/// Emit the saga's terminal lifecycle event + durable log row.
403///
404/// Maps `SagaTerminal` to:
405/// - `Completed` → `Event::SagaCompleted` + log state `completed`.
406/// - `Compensated { reason }` → `Event::SagaFailed { reason }` + log state `compensated`.
407/// - `Failed { reason }` → `Event::SagaFailed { reason }` + log state `failed`.
408///
409/// The renderer-facing event is the same `SagaFailed` for both
410/// non-success paths (the renderer doesn't currently distinguish).
411/// The durable log distinguishes — PR 2's resume scan picks up
412/// `failed` rows where compensation may not have run.
413pub async fn emit_terminal(state: &AppState, saga_id: u64, terminal: SagaTerminal<'_>) {
414    let log_outcome = match &terminal {
415        SagaTerminal::Completed => SagaOutcome::Completed,
416        SagaTerminal::Compensated { reason } => SagaOutcome::Compensated {
417            reason: reason.to_string(),
418        },
419        SagaTerminal::Failed { reason } => SagaOutcome::Failed {
420            reason: reason.to_string(),
421        },
422    };
423    // (codex P1 PR #636 round 7 — reverted from round 6.)
424    // Bulk-mark only on Compensated. Round 6 extended to Failed too,
425    // but BOTH bots flagged that as data-loss: timeout/abort paths
426    // classify as Failed and never run compensation, but the bulk-
427    // mark would relabel forward steps as `compensated`, hiding
428    // them from recovery and leaving side effects permanently
429    // applied.
430    //
431    // Sagas that DO unwind via inner-future ctx.compensate calls
432    // should classify as Compensated (the per-step pop already
433    // marks 1:1; this bulk call catches residual 1:N cases like
434    // tear_off_block's single DeleteWorkspace undoing both
435    // CreateWorkspace + CreateTab). `classify_run_saga_result`
436    // maps non-timeout Err → Compensated to support this; timeouts
437    // → Failed so recovery picks up un-undone rows.
438    if matches!(terminal, SagaTerminal::Compensated { .. }) {
439        if let Err(e) = state.saga_log.mark_all_succeeded_steps_compensated(saga_id) {
440            tracing::warn!(
441                saga_id,
442                "[saga] mark_all_succeeded_steps_compensated failed: {} — restart may re-replay an inverse",
443                e
444            );
445        }
446    }
447    if let Err(e) = state.saga_log.terminate(saga_id, log_outcome) {
448        tracing::warn!(
449            saga_id,
450            "[saga] terminate log write failed: {} — saga lifecycle row will look 'running' to PR 2's resume scan, which will then compensate it",
451            e
452        );
453    }
454    let v = state.srv_state.lock().await.bump_version();
455    let event = match terminal {
456        SagaTerminal::Completed => Event::SagaCompleted {
457            saga_id,
458            version: v,
459        },
460        SagaTerminal::Compensated { reason } | SagaTerminal::Failed { reason } => {
461            Event::SagaFailed {
462                saga_id,
463                reason: reason.to_string(),
464                version: v,
465            }
466        }
467    };
468    let _ = state.srv_events_tx.send(event);
469}
470
471/// Convenience: classify the standard `run_saga` `Result<Value, String>`
472/// outcome into a `SagaTerminal`.
473///
474/// - `Ok(_)` → `Completed`.
475/// - `Err(_)` → `Failed`.
476///
477/// (codex P1 PR #631 round 2.) The earlier round mapped non-timeout
478/// `Err` to `Compensated` on the assumption that "our sagas drive
479/// compensation in their inner future before returning `Err`."
480/// That's true for the *forward* dispatch failures, but
481/// `SagaCtx::compensate` is **best-effort** — if a compensating
482/// dispatch is itself rejected by the reducer, `compensate` logs a
483/// warning and returns without signaling failure. Marking those as
484/// `Compensated` would hide partially-applied state from PR 2's
485/// restart recovery (which scans for `running`/`failed` to know what
486/// to compensate).
487///
488/// Conservative default: classify all errors as `Failed`. Sagas that
489/// can *prove* compensation succeeded (e.g. a future per-step
490/// compensation-success log) construct `SagaTerminal::Compensated`
491/// directly without going through this helper.
492pub fn classify_run_saga_result(result: &Result<serde_json::Value, String>) -> SagaTerminal<'_> {
493    match result {
494        Ok(_) => SagaTerminal::Completed,
495        // Timeouts/aborts: compensation never ran (run_saga's
496        // tokio::time::timeout cancels the inner future before it
497        // can compensate). Classify as Failed so recovery picks up
498        // the un-undone forward steps.
499        Err(reason) if reason.contains("timed out") => SagaTerminal::Failed { reason },
500        // Other Err: by convention, our sagas drive compensation
501        // in their inner future before returning Err (each
502        // ctx.compensate call already marked its target). Classify
503        // as Compensated so emit_terminal's bulk-mark cleans up any
504        // residual succeeded rows from 1:N compensation patterns
505        // (e.g. tear_off_block's single DeleteWorkspace undoing
506        // multiple CreateX steps). Sagas that abort without
507        // compensating should explicitly construct
508        // SagaTerminal::Failed instead of using this helper.
509        // (codex round 7 reversal of round 1's blanket-Failed.)
510        Err(reason) => SagaTerminal::Compensated { reason },
511    }
512}
513
514/// Run a saga's inner future under a 5 s timeout. The inner future
515/// is responsible for emitting `SagaStarted` (the saga itself, since
516/// it owns the saga_id allocation) and any compensation it needs;
517/// `run_saga` only enforces the timeout and emits the terminal
518/// `SagaCompleted` / `SagaFailed`.
519///
520/// Concrete usage (per saga):
521/// ```ignore
522/// pub async fn run(state: &AppState, ...) -> Result<Value, String> {
523///     let saga_id = alloc_saga_id(state);
524///     emit_saga_started(state, saga_id, "tear_off_tab", serde_json::json!({})).await;
525///     let ctx = SagaCtx::new(state, saga_id);
526///     let result = run_saga(run_inner(ctx, ...)).await;
527///     emit_terminal(state, saga_id, classify_run_saga_result(&result)).await;
528///     result
529/// }
530/// ```
531pub async fn run_saga<Fut>(name: &'static str, fut: Fut) -> Result<Value, String>
532where
533    Fut: std::future::Future<Output = Result<Value, String>>,
534{
535    match tokio::time::timeout(SAGA_TIMEOUT, fut).await {
536        Ok(r) => r,
537        Err(_) => Err(format!("saga '{}' timed out after {:?}", name, SAGA_TIMEOUT)),
538    }
539}
agentmux_srv\sagas/mod.rs

agentmux_srv\sagas/
mod.rs