├── .github ├── dependabot.yml ├── renovate.json └── workflows │ └── rust.yml ├── .gitignore ├── CHANGELOG.adoc ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.adoc ├── examples ├── demo-provision.rs └── trip.rs ├── notes.txt ├── release.toml ├── rust-toolchain.toml ├── rustfmt.toml ├── src ├── dag.rs ├── example_provision.rs ├── lib.rs ├── rust_features.rs ├── saga_action_error.rs ├── saga_action_func.rs ├── saga_action_generic.rs ├── saga_exec.rs ├── saga_log.rs ├── sec.rs └── store.rs └── tests ├── test_smoke.rs ├── test_smoke_dot.out ├── test_smoke_info.out ├── test_smoke_no_args.out ├── test_smoke_run_basic.out ├── test_smoke_run_error.out ├── test_smoke_run_recover_done.out ├── test_smoke_run_recover_fail_done.out ├── test_smoke_run_recover_fail_some.out ├── test_smoke_run_recover_some.out ├── test_smoke_run_recover_stuck_done.out ├── test_smoke_run_stuck.out └── test_unregistered_action.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Dependabot configuration file 3 | # 4 | 5 | version: 2 6 | updates: 7 | - package-ecosystem: "cargo" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["local>oxidecomputer/renovate-config"] 4 | } 5 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration for GitHub-based CI, based on the stock GitHub Rust config. 
3 | # 4 | name: Rust 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | check-style: 14 | runs-on: ubuntu-24.04 15 | steps: 16 | # actions/checkout@v2 17 | - uses: actions/checkout@09d2acae674a48949e3602304ab46fd20ae0c42f 18 | - name: Report cargo version 19 | run: cargo --version 20 | - name: Report rustfmt version 21 | run: cargo fmt -- --version 22 | - name: Check style 23 | run: cargo fmt -- --check 24 | 25 | clippy-lint: 26 | runs-on: ubuntu-24.04 27 | steps: 28 | # actions/checkout@v2 29 | - uses: actions/checkout@09d2acae674a48949e3602304ab46fd20ae0c42f 30 | - name: Report cargo version 31 | run: cargo --version 32 | - name: Report Clippy version 33 | run: cargo clippy -- --version 34 | - name: Run Clippy Lints 35 | # 36 | # Clippy's style nits are useful, but not worth keeping in CI. This 37 | # override belongs in src/lib.rs, and it is there, but that doesn't 38 | # reliably work due to rust-lang/rust-clippy#6610. 39 | # 40 | run: cargo clippy --all-targets -- --deny warnings --allow clippy::style 41 | 42 | build-and-test: 43 | runs-on: ${{ matrix.os }} 44 | strategy: 45 | matrix: 46 | os: [ ubuntu-24.04, windows-2022, macos-14 ] 47 | steps: 48 | # actions/checkout@v2 49 | - uses: actions/checkout@09d2acae674a48949e3602304ab46fd20ae0c42f 50 | - uses: actions-rs/toolchain@88dc2356392166efad76775c878094f4e83ff746 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Report cargo version 56 | run: cargo --version 57 | - name: Build 58 | run: cargo build --tests --verbose 59 | - name: Run tests 60 | run: cargo test --verbose 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | README.html 3 | -------------------------------------------------------------------------------- /CHANGELOG.adoc: 
-------------------------------------------------------------------------------- 1 | :showtitle: 2 | :toc: left 3 | :icons: font 4 | :toclevels: 1 5 | 6 | = Steno Changelog 7 | 8 | // WARNING: This file is modified programmatically by `cargo release` as 9 | // configured in release.toml. DO NOT change the format of the headers or the 10 | // list of raw commits. 11 | 12 | // cargo-release: next header goes here (do not change this line) 13 | 14 | == Unreleased changes (release date TBD) 15 | 16 | https://github.com/oxidecomputer/steno/compare/v0.4.1\...HEAD[Full list of commits] 17 | 18 | == 0.4.1 (released 2024-07-01) 19 | 20 | https://github.com/oxidecomputer/steno/compare/v0.4.0\...v0.4.1[Full list of commits] 21 | 22 | * https://github.com/oxidecomputer/steno/pull/286[#286] Steno now more consistently logs events in a saga lifetime: create/restore, start running, finish running. 23 | 24 | == 0.4.0 (released 2023-05-25) 25 | 26 | https://github.com/oxidecomputer/steno/compare/v0.3.1\...v0.4.0[Full list of commits] 27 | 28 | === Breaking changes 29 | 30 | * https://github.com/oxidecomputer/steno/pull/138[#138] Steno no longer panics when an undo action fails. `SagaResultErr` has a new optional field describing whether any undo action failed during unwinding. **You should check this.** If an undo action fails, then the program has failed to provide the usual guarantee that a saga either runs to completion or completely unwinds. What to do next is application-specific but in general this cannot be automatically recovered from. (If there are steps that can automatically recover in this case, the undo action that failed should probably do that instead.) 
31 | 32 | == 0.3.1 (released 2023-01-06) 33 | 34 | https://github.com/oxidecomputer/steno/compare/v0.3.0\...v0.3.1[Full list of commits] 35 | 36 | * https://github.com/oxidecomputer/steno/pull/88[#88] Add `SecClient::saga_inject_repeat` method to help with testing idempotency 37 | 38 | == 0.3.0 (released 2022-11-02) 39 | 40 | https://github.com/oxidecomputer/steno/compare/v0.2.0\...v0.3.0[Full list of commits] 41 | 42 | * https://github.com/oxidecomputer/steno/pull/40[#40] Add `Dag::builder` method 43 | * https://github.com/oxidecomputer/steno/pull/67[#67] Add methods for inspecting Saga DAG / https://github.com/oxidecomputer/steno/pull/73[#73] A minor extension to #67, expose indices 44 | 45 | === Breaking Changes 46 | 47 | None. 48 | 49 | == 0.2.0 (released 2022-08-05) 50 | 51 | Changes not documented. 52 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "steno" 3 | version = "0.4.2-dev" 4 | edition = "2021" 5 | license = "Apache-2.0" 6 | repository = "https://github.com/oxidecomputer/steno" 7 | description = "distributed saga implementation" 8 | 9 | [profile.dev] 10 | #panic = "abort" 11 | 12 | [dependencies] 13 | anyhow = "1.0.98" 14 | async-trait = "0.1.88" 15 | futures = "0.3.31" 16 | newtype_derive = "0.1.6" 17 | serde_json = "1.0" 18 | thiserror = "2.0" 19 | lazy_static = "1.5.0" 20 | 21 | [dependencies.petgraph] 22 | version = "0.8.2" 23 | features = [ "serde-1" ] 24 | 25 | [dependencies.chrono] 26 | version = "0.4" 27 | features = [ "clock", "serde", "std" ] 28 | default-features = false 29 | 30 | [dependencies.schemars] 31 | version = "0.8.22" 32 | features = [ "chrono", "uuid1" ] 33 | 34 | [dependencies.serde] 35 | version = "1.0" 36 | features = [ "derive", "rc" ] 37 | 38 | [dependencies.slog] 39 | version = "2.7" 40 | features = [ "max_level_trace", "release_max_level_debug" ] 41 | 42 | 
[dependencies.tokio] 43 | version = "1" 44 | features = [ "full" ] 45 | 46 | [dependencies.uuid] 47 | version = "1.16.0" 48 | features = [ "serde", "v4" ] 49 | 50 | [dev-dependencies] 51 | expectorate = "1.0" 52 | slog-async = "2.8" 53 | slog-term = "2.9" 54 | structopt = "0.3" 55 | subprocess = "0.2.6" 56 | proptest = "1.7.0" 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.adoc: -------------------------------------------------------------------------------- 1 | :showtitle: 2 | :toc: left 3 | :icons: font 4 | 5 | = steno 6 | 7 | This repo contains an in-progress prototype interface for sagas based on 8 | https://www.youtube.com/watch?v=0UTOLRTwOX0[Distributed Sagas as described by 9 | Caitie McCaffrey]. See the crate documentation for details. You can build the 10 | docs yourself with: 11 | 12 | ``` 13 | cargo doc 14 | ``` 15 | 16 | Sagas seek to decompose complex tasks into comparatively simple actions. Execution of the saga (dealing with unwinding, etc.) is implemented in one place with good mechanisms for observing progress, controlling execution, etc. That's what this crate provides. 17 | 18 | == Status 19 | 20 | This crate has usable interfaces for defining and executing sagas. 21 | 22 | Features: 23 | 24 | * Execution is recorded to a saga log. Consumers can impl `SecStore` to persist this to a database. 
Intermediate log states can be recovered, meaning that you can resume execution after a simulated crash. 25 | * Actions can share state using arbitrary serializable types (dynamically-checked, unfortunately). 26 | * Unwinding: if an action fails, all nodes are unwound (basically, undo actions are executed for nodes whose actions completed; it's more complicated for nodes whose actions _may_ have run) 27 | * Injecting errors into an arbitrary node 28 | * Fine-grained status reporting (status of each action) 29 | 30 | There's a demo program (`examples/demo-provision`) to exercise all of this with a toy saga that resembles VM provisioning. 31 | 32 | There are lots of caveats: 33 | 34 | * All experimentation and testing uses a toy saga that doesn't actually do anything. 35 | * The code is prototype-quality (i.e., mediocre). There's tremendous room for cleanup and improvement. 36 | * There's virtually no automated testing yet. 37 | * There are many important considerations not yet addressed. To start with: 38 | ** updates and versioning: how a saga's code gets packaged, updated, etc.; and how the code and state get versioned 39 | ** Subsagas: it's totally possible for saga actions to create other sagas, which is important because composability is important for our use case. However, doing so is not idempotent, and won't necessarily do the right thing in the event of failures. 40 | 41 | Major risks and open questions: 42 | 43 | * Does this abstraction make sense? Let's try prototyping it in oxide-api-prototype. 
44 | * Failover (or "high availability execution") 45 | 46 | Future feature ideas include: 47 | 48 | * control: pause/unpause, abort, concurrency limits, single-step, breakpoints 49 | * canarying 50 | * other policies around nodes with no undo actions (e.g., pause and notify an operator, then resume the saga if directed; fail-forward only) 51 | * a notion of "scope" or "blast radius" for a saga so that a larger system can schedule sagas in a way that preserves overall availability 52 | * better compile-time type checking, so that you cannot add a node to a saga graph that uses data not provided by one of its ancestors 53 | 54 | == Divergence from distributed sagas 55 | 56 | As mentioned above, this implementation is very heavily based on distributed sagas. There are a few important considerations not covered in the talk referenced above: 57 | 58 | * How do actions share state with one another? (If an early step allocates an IP address, how can a later step use that IP address to plumb up a network interface?) 59 | * How do you provide high-availability execution (at least, execution that automatically continues in the face of failure of the saga execution coordinator (SEC))? Equivalently: how do you ensure that two SEC instances aren't working concurrently on the same saga? 60 | 61 | We're also generalizing the idea in a few ways: 62 | 63 | * A node need not have an undo action (compensating request). We might provide policy that can cause the saga to pause and wait for an operator, or to only fail-forward. 64 | * See above: canarying, scope, blast radius, etc. 65 | 66 | The terminology used in the original talk seems to come from microservices and databases. 
We found some of these confusing and chose some different terms: 67 | 68 | [cols="1,2,1,2",options="header"] 69 | |=== 70 | |Our term 71 | |What it means 72 | |Distributed Sagas term 73 | |Why we picked another term 74 | 75 | |Action 76 | |A node in the saga graph, or (equivalently) the user-defined action taken when the executor "executes" that node of the graph 77 | |Request 78 | |"Request" suggests an RPC or an HTTP request. Our actions may involve neither of those or they may comprise many requests. 79 | 80 | |Undo action 81 | |The user-defined action taken for a node whose action needs to be logically reversed 82 | |Compensating request 83 | |See "Action" above. We could have called this "compensating action" but "undo" felt more evocative of what's happening. 84 | 85 | |Fail/Failed 86 | |The result of an action that was not successful 87 | |Abort/Aborted 88 | |"Abort" can be used to mean a bunch of things, like maybe that an action failed, or that it was cancelled while it was still running, or that it was undone. These are all different things so we chose different terms to avoid confusion. 89 | 90 | |Undo 91 | |What happens to a node whose action needs to be logically reversed. This might involve doing nothing (if the action never ran), executing the undo action (if the action previously succeeded), or something a bit more complicated. 92 | |Cancel/Cancelled 93 | |"Cancel" might suggest to a reader that we stopped an action while it was in progress. That's not what it means here. Plus, we avoid the awkward "canceled" vs. "cancelled" debate. 94 | 95 | |=== 96 | -------------------------------------------------------------------------------- /examples/demo-provision.rs: -------------------------------------------------------------------------------- 1 | //! 
Command-line tool for demo'ing saga interfaces 2 | 3 | use anyhow::anyhow; 4 | use anyhow::Context; 5 | use slog::Drain; 6 | use std::convert::TryFrom; 7 | use std::fs; 8 | use std::io; 9 | use std::path::Path; 10 | use std::path::PathBuf; 11 | use std::sync::Arc; 12 | use steno::load_example_actions; 13 | use steno::make_example_provision_dag; 14 | use steno::ActionRegistry; 15 | use steno::ExampleContext; 16 | use steno::ExampleParams; 17 | use steno::ExampleSagaType; 18 | use steno::SagaDag; 19 | use steno::SagaId; 20 | use steno::SagaLog; 21 | use steno::SagaResultErr; 22 | use steno::SagaSerialized; 23 | use structopt::StructOpt; 24 | use uuid::Uuid; 25 | 26 | #[tokio::main] 27 | async fn main() -> Result<(), anyhow::Error> { 28 | let subcmd = Demo::from_args(); 29 | match subcmd { 30 | Demo::Dot => cmd_dot().await, 31 | Demo::Info => cmd_info().await, 32 | Demo::PrintLog { ref print_log_args } => { 33 | cmd_print_log(print_log_args).await 34 | } 35 | Demo::Run { ref run_args } => cmd_run(run_args).await, 36 | } 37 | } 38 | 39 | /// Demo saga implementation 40 | #[derive(Debug, StructOpt)] 41 | #[structopt(no_version)] 42 | enum Demo { 43 | /// Dump a dot (graphviz) representation of the saga graph 44 | Dot, 45 | 46 | /// Dump information about the saga graph (not an execution) 47 | Info, 48 | 49 | /// Pretty-print the log from a previous execution 50 | PrintLog { 51 | #[structopt(flatten)] 52 | print_log_args: PrintLogArgs, 53 | }, 54 | 55 | /// Execute the saga 56 | Run { 57 | #[structopt(flatten)] 58 | run_args: RunArgs, 59 | }, 60 | } 61 | 62 | // We use a hardcoded SagaId for ease of automated testing. See the note in 63 | // demo_prov_server_alloc(). 
64 | fn make_saga_id() -> SagaId { 65 | SagaId(Uuid::parse_str("049b2522-308d-442e-bc65-9bfaef863597").unwrap()) 66 | } 67 | 68 | fn make_log() -> slog::Logger { 69 | let decorator = slog_term::TermDecorator::new().build(); 70 | let drain = slog_term::FullFormat::new(decorator).build().fuse(); 71 | let drain = slog::LevelFilter(drain, slog::Level::Warning).fuse(); 72 | let drain = slog_async::Async::new(drain).build().fuse(); 73 | slog::Logger::root(drain, slog::o!()) 74 | } 75 | 76 | fn make_sec(log: &slog::Logger) -> steno::SecClient { 77 | steno::sec(log.new(slog::o!()), Arc::new(steno::InMemorySecStore::new())) 78 | } 79 | 80 | fn reader_for_log_input( 81 | path: &Path, 82 | ) -> Result, anyhow::Error> { 83 | if *path == PathBuf::from("-") { 84 | Ok(Box::new(io::stdin())) 85 | } else { 86 | Ok(Box::new(fs::File::open(&path).with_context(|| { 87 | format!("open recovery log \"{}\"", path.display()) 88 | })?)) 89 | } 90 | } 91 | 92 | fn read_saga_state( 93 | reader: R, 94 | ) -> Result { 95 | serde_json::from_reader(reader).context("reading saga state") 96 | } 97 | 98 | fn make_example_action_registry() -> Arc> { 99 | let mut registry = ActionRegistry::new(); 100 | load_example_actions(&mut registry); 101 | Arc::new(registry) 102 | } 103 | 104 | // "dot" subcommand 105 | 106 | async fn cmd_dot() -> Result<(), anyhow::Error> { 107 | let params = ExampleParams { 108 | instance_name: "fake-o-instance".to_string(), 109 | number_of_instances: 1, 110 | }; 111 | let dag = make_example_provision_dag(params); 112 | println!("{}", dag.dot()); 113 | Ok(()) 114 | } 115 | 116 | // "info" subcommand 117 | 118 | async fn cmd_info() -> Result<(), anyhow::Error> { 119 | let log = make_log(); 120 | let sec = make_sec(&log); 121 | 122 | let registry = make_example_action_registry(); 123 | let params = ExampleParams { 124 | instance_name: "fake-o instance".to_string(), 125 | number_of_instances: 1, 126 | }; 127 | let dag = make_example_provision_dag(params); 128 | println!("*** saga 
dag definition ***"); 129 | println!("saga graph: "); 130 | println!("{}", dag.dot()); 131 | 132 | println!("*** initial state ***"); 133 | let saga_id = make_saga_id(); 134 | let uctx = Arc::new(ExampleContext {}); 135 | let _unused_future = sec.saga_create(saga_id, uctx, dag, registry).await?; 136 | 137 | let saga = sec.saga_get(saga_id).await.unwrap(); 138 | let status = saga.state.status(); 139 | println!("{}", status); 140 | 141 | Ok(()) 142 | } 143 | 144 | // "print-log" subcommand 145 | 146 | #[derive(Debug, StructOpt)] 147 | struct PrintLogArgs { 148 | /// path to the saga log to pretty-print 149 | input_log_path: PathBuf, 150 | } 151 | 152 | async fn cmd_print_log(args: &PrintLogArgs) -> Result<(), anyhow::Error> { 153 | let input_log_path = &args.input_log_path; 154 | let file = reader_for_log_input(&input_log_path)?; 155 | let saga_serialized = read_saga_state(file)?; 156 | let saga_log = SagaLog::try_from(saga_serialized)?; 157 | println!("{:?}", saga_log.pretty()); 158 | Ok(()) 159 | } 160 | 161 | // "run" subcommand 162 | 163 | #[derive(Debug, StructOpt)] 164 | struct RunArgs { 165 | /// simulate an error at the named saga node 166 | #[structopt(long)] 167 | inject_error: Vec, 168 | 169 | /// simulate an error at the named saga node's undo action 170 | #[structopt(long)] 171 | inject_undo_error: Vec, 172 | 173 | /// do not print to stdout 174 | #[structopt(long)] 175 | quiet: bool, 176 | 177 | /// upon completion, dump the workflog log to the named file 178 | #[structopt(long)] 179 | dump_to: Option, 180 | 181 | /// recover the saga log from the named file and resume execution 182 | #[structopt(long)] 183 | recover_from: Option, 184 | } 185 | 186 | async fn cmd_run(args: &RunArgs) -> Result<(), anyhow::Error> { 187 | let log = make_log(); 188 | let sec = make_sec(&log); 189 | let registry = make_example_action_registry(); 190 | let uctx = Arc::new(ExampleContext {}); 191 | let (saga_id, future, dag) = 192 | if let Some(input_log_path) = 
&args.recover_from { 193 | if !args.quiet { 194 | println!("recovering from log: {}", input_log_path.display()); 195 | } 196 | 197 | let file = reader_for_log_input(input_log_path)?; 198 | let saga_recovered = read_saga_state(file)?; 199 | let saga_id = saga_recovered.saga_id; 200 | let future = sec 201 | .saga_resume( 202 | saga_id, 203 | uctx, 204 | saga_recovered.dag.clone(), 205 | registry, 206 | saga_recovered.events, 207 | ) 208 | .await 209 | .context("resuming saga")?; 210 | let saga = sec.saga_get(saga_id).await.map_err(|_: ()| { 211 | anyhow!("failed to fetch newly-created saga") 212 | })?; 213 | if !args.quiet { 214 | print!("recovered state\n"); 215 | println!("{}", saga.state.status()); 216 | println!(""); 217 | } 218 | let dag: Arc = 219 | Arc::new(serde_json::from_value(saga_recovered.dag)?); 220 | (saga_id, future, dag) 221 | } else { 222 | let params = ExampleParams { 223 | instance_name: "fake-o instance".to_string(), 224 | number_of_instances: 1, 225 | }; 226 | let dag = make_example_provision_dag(params); 227 | let saga_id = make_saga_id(); 228 | let future = 229 | sec.saga_create(saga_id, uctx, dag.clone(), registry).await?; 230 | (saga_id, future, dag) 231 | }; 232 | 233 | for node_name in &args.inject_error { 234 | let node_id = dag.get_index(node_name).with_context(|| { 235 | format!("bad argument for --inject-error: {:?}", node_name) 236 | })?; 237 | sec.saga_inject_error(saga_id, node_id) 238 | .await 239 | .context("injecting error")?; 240 | if !args.quiet { 241 | println!("will inject error at node \"{}\"", node_name); 242 | } 243 | } 244 | 245 | for node_name in &args.inject_undo_error { 246 | let node_id = dag.get_index(node_name).with_context(|| { 247 | format!("bad argument for --inject-undo-error: {:?}", node_name) 248 | })?; 249 | sec.saga_inject_error_undo(saga_id, node_id) 250 | .await 251 | .context("injecting error")?; 252 | if !args.quiet { 253 | println!("will inject error at node \"{}\" undo action", node_name); 254 | } 255 
| } 256 | 257 | if !args.quiet { 258 | println!("*** running saga ***"); 259 | } 260 | 261 | sec.saga_start(saga_id).await.expect("failed to start saga"); 262 | let result = future.await; 263 | assert_eq!(saga_id, result.saga_id); 264 | 265 | let saga = sec 266 | .saga_get(saga_id) 267 | .await 268 | .map_err(|_: ()| anyhow!("failed to fetch saga after running it"))?; 269 | if !args.quiet { 270 | println!("*** finished saga ***"); 271 | println!("\n*** final state ***"); 272 | println!("{}", saga.state.status()); 273 | 274 | print!("result: "); 275 | match result.kind { 276 | Ok(success_case) => { 277 | println!("SUCCESS"); 278 | println!( 279 | "final output: {:?}", 280 | success_case.saga_output::().unwrap() 281 | ); 282 | } 283 | Err(SagaResultErr { 284 | error_node_name, 285 | error_source, 286 | undo_failure, 287 | }) => { 288 | println!("ACTION FAILURE"); 289 | println!("failed at node: {:?}", error_node_name); 290 | println!("failed with error: {:#}", error_source); 291 | if let Some((undo_node_name, undo_error)) = undo_failure { 292 | println!("FOLLOWED BY UNDO ACTION FAILURE"); 293 | println!("failed at node: {:?}", undo_node_name); 294 | println!("failed with error: {:#}", undo_error); 295 | } 296 | } 297 | } 298 | } 299 | 300 | if let Some(output_log_path) = &args.dump_to { 301 | let serialized = saga.serialized(); 302 | let (mut stdout_holder, mut file_holder); 303 | let (label, out): (String, &mut dyn io::Write) = if *output_log_path 304 | == PathBuf::from("-") 305 | { 306 | stdout_holder = io::stdout(); 307 | (String::from("stdout"), &mut stdout_holder) 308 | } else { 309 | file_holder = fs::OpenOptions::new() 310 | .create(true) 311 | .write(true) 312 | .truncate(true) 313 | .open(output_log_path) 314 | .with_context(|| { 315 | format!("open output log \"{}\"", output_log_path.display()) 316 | })?; 317 | (format!("\"{}\"", output_log_path.display()), &mut file_holder) 318 | }; 319 | serde_json::to_writer_pretty(out, &serialized) 320 | 
.with_context(|| format!("save output log {}", label))?; 321 | if !args.quiet { 322 | println!("dumped log to {}", label); 323 | } 324 | } 325 | 326 | Ok(()) 327 | } 328 | -------------------------------------------------------------------------------- /examples/trip.rs: -------------------------------------------------------------------------------- 1 | //! Example from the canonical distributed sagas talk: suppose you have existing 2 | //! functions to book a hotel, book a flight, book a car reservation, and charge 3 | //! a payment card. You also have functions to cancel a hotel, flight, or car 4 | //! reservation and refund a credit card charge. You want to implement a "book 5 | //! trip" function whose implementation makes sure that you ultimately wind up 6 | //! with all of these bookings (and having paid for it) or none (and having not 7 | //! paid for it). 8 | 9 | // Names are given here for clarity, even when they're not needed. 10 | #![allow(unused_variables)] 11 | 12 | use serde::Deserialize; 13 | use serde::Serialize; 14 | use slog::Drain; 15 | use std::sync::Arc; 16 | use steno::ActionContext; 17 | use steno::ActionError; 18 | use steno::ActionRegistry; 19 | use steno::DagBuilder; 20 | use steno::Node; 21 | use steno::SagaDag; 22 | use steno::SagaId; 23 | use steno::SagaName; 24 | use steno::SagaResultErr; 25 | use steno::SagaType; 26 | use steno::SecClient; 27 | use uuid::Uuid; 28 | 29 | // This is where we're going: this program will collect payment and book a whole 30 | // trip that includes a hotel, flight, and car. This will either all succeed or 31 | // any steps that ran will be undone. In a real example, you'd persist the saga 32 | // log and use the saga recovery interface to resume execution after a crash. 
33 | #[tokio::main] 34 | async fn main() { 35 | let log = { 36 | let decorator = slog_term::TermDecorator::new().build(); 37 | let drain = slog_term::FullFormat::new(decorator).build().fuse(); 38 | let drain = slog::LevelFilter(drain, slog::Level::Warning).fuse(); 39 | let drain = slog_async::Async::new(drain).build().fuse(); 40 | slog::Logger::root(drain, slog::o!()) 41 | }; 42 | let sec = steno::sec( 43 | log.new(slog::o!()), 44 | Arc::new(steno::InMemorySecStore::new()), 45 | ); 46 | let trip_context = Arc::new(TripContext {}); 47 | let params = TripParams { 48 | hotel_name: String::from("Springfield Palace Hotel"), 49 | flight_info: String::from("any flight"), 50 | car_info: String::from("1998 Canyonero"), 51 | charge_details: String::from("Moneybank Charge Card"), 52 | }; 53 | book_trip(sec, trip_context, params).await; 54 | } 55 | 56 | // Create a new "book trip" saga with the given parameters and then execute it. 57 | async fn book_trip( 58 | sec: SecClient, 59 | trip_context: Arc, 60 | params: TripParams, 61 | ) { 62 | // Register the actions to run during saga execution. 63 | // This is created once for all sagas. 64 | let registry = { 65 | let mut registry = ActionRegistry::new(); 66 | load_trip_actions(&mut registry); 67 | Arc::new(registry) 68 | }; 69 | 70 | // Build a saga DAG. The DAG describes the actions that are part of the 71 | // saga (including the functions to be invoked to do each of the steps) and 72 | // how they depend on each other. This can be dynamically created for each 73 | // trip. 74 | let dag = make_trip_dag(params); 75 | 76 | // Get ready to execute the saga. 77 | 78 | // Each execution needs a new unique id. 79 | let saga_id = SagaId(Uuid::new_v4()); 80 | 81 | // Create the saga. 82 | let saga_future = sec 83 | .saga_create(saga_id, Arc::new(trip_context), dag, registry) 84 | .await 85 | .expect("failed to create saga"); 86 | 87 | // Set it running. 
88 | sec.saga_start(saga_id).await.expect("failed to start saga running"); 89 | 90 | // Wait for the saga to finish running. This could take a while, depending 91 | // on what the saga does! This traverses the DAG of actions, executing each 92 | // one. If one fails, then it's all unwound: any actions that previously 93 | // completed will be undone. 94 | // 95 | // Note that the SEC will run all this regardless of whether you wait for it 96 | // here. This is just a handle for you to know when the saga has finished. 97 | let result = saga_future.await; 98 | 99 | // Print the results. 100 | match result.kind { 101 | Ok(success) => { 102 | println!( 103 | "hotel: {:?}", 104 | success.lookup_node_output::("hotel") 105 | ); 106 | println!( 107 | "flight: {:?}", 108 | success.lookup_node_output::("flight") 109 | ); 110 | println!( 111 | "car: {:?}", 112 | success.lookup_node_output::("car") 113 | ); 114 | println!( 115 | "payment: {:?}", 116 | success.lookup_node_output::("payment") 117 | ); 118 | println!("\nraw summary:\n{:?}", success.saga_output::()); 119 | } 120 | Err(SagaResultErr { error_node_name, error_source, undo_failure }) => { 121 | println!("action failed: {}", error_node_name.as_ref()); 122 | println!("error: {}", error_source); 123 | if let Some((undo_node_name, undo_error_source)) = undo_failure { 124 | println!("additionally:"); 125 | println!("undo action failed: {}", undo_node_name.as_ref()); 126 | println!("error: {}", undo_error_source); 127 | } 128 | } 129 | } 130 | } 131 | 132 | /// Define the actions as globals so that we can easily and type-safely access 133 | /// them during registration and DAG construction. 134 | mod actions { 135 | use super::TripSaga; 136 | use lazy_static::lazy_static; 137 | use std::sync::Arc; 138 | use steno::new_action_noop_undo; 139 | use steno::Action; 140 | use steno::ActionFunc; 141 | 142 | lazy_static! 
{ 143 | pub(super) static ref PAYMENT: Arc> = 144 | ActionFunc::new_action( 145 | "payment", 146 | super::saga_charge_card, 147 | super::saga_refund_card 148 | ); 149 | pub(super) static ref HOTEL: Arc> = 150 | ActionFunc::new_action( 151 | "hotel", 152 | super::saga_book_hotel, 153 | super::saga_cancel_hotel 154 | ); 155 | pub(super) static ref FLIGHT: Arc> = 156 | ActionFunc::new_action( 157 | "flight", 158 | super::saga_book_flight, 159 | super::saga_cancel_flight 160 | ); 161 | pub(super) static ref CAR: Arc> = 162 | ActionFunc::new_action( 163 | "car", 164 | super::saga_book_car, 165 | super::saga_cancel_car 166 | ); 167 | pub(super) static ref PRINT: Arc> = 168 | new_action_noop_undo("print", super::saga_print); 169 | } 170 | } 171 | 172 | /// Load our actions into an ActionRegistry 173 | /// 174 | /// This step is separate from building the DAG because if we implement saga 175 | /// recovery (i.e., resuming sagas after a crash), we need to register the 176 | /// actions but we don't need to build a new DAG. 177 | fn load_trip_actions(registry: &mut ActionRegistry) { 178 | registry.register(actions::PAYMENT.clone()); 179 | registry.register(actions::HOTEL.clone()); 180 | registry.register(actions::FLIGHT.clone()); 181 | registry.register(actions::CAR.clone()); 182 | registry.register(actions::PRINT.clone()); 183 | } 184 | 185 | /// Build the DAG for booking a trip 186 | fn make_trip_dag(params: TripParams) -> Arc { 187 | // The builder methods describes the actions that are part of the saga 188 | // (including the functions to be invoked to do each of the steps) and how 189 | // they depend on each other. 190 | let name = SagaName::new("book-trip"); 191 | let mut builder = DagBuilder::new(name); 192 | 193 | // Somewhat arbitrarily, we're choosing to charge the credit card first, 194 | // then make all the bookings in parallel. 
We could do these all in 195 | // parallel, or all sequentially, and the saga would still be correct, since 196 | // Steno guarantees that eventually either all actions will succeed or all 197 | // executed actions will be undone. 198 | builder.append(Node::action( 199 | // name of this action's output (can be used in subsequent actions) 200 | "payment", 201 | // human-readable label for the action 202 | "ChargeCreditCard", 203 | // The name of the action to run. This can be either a &dyn Action or a 204 | // literal `ActionName`. Either way, the named action must appear in 205 | // the action registry. 206 | actions::PAYMENT.as_ref(), 207 | )); 208 | 209 | builder.append_parallel(vec![ 210 | Node::action("hotel", "BookHotel", actions::HOTEL.as_ref()), 211 | Node::action("flight", "BookFlight", actions::FLIGHT.as_ref()), 212 | Node::action("car", "BookCar", actions::CAR.as_ref()), 213 | ]); 214 | 215 | builder.append(Node::action("output", "Print", actions::PRINT.as_ref())); 216 | 217 | Arc::new(SagaDag::new( 218 | builder.build().expect("DAG was unexpectedly invalid"), 219 | serde_json::to_value(params).unwrap(), 220 | )) 221 | } 222 | 223 | // Implementation of the trip saga 224 | 225 | // Saga parameters. Each trip will have a separate set of parameters. We use 226 | // plain strings here, but these can be any serializable / deserializable types. 227 | #[derive(Debug, Deserialize, Serialize)] 228 | struct TripParams { 229 | hotel_name: String, 230 | flight_info: String, 231 | car_info: String, 232 | charge_details: String, 233 | } 234 | 235 | // Application-specific context that we want to provide to every action in the 236 | // saga. This can be any object we want. We'll pass it to Steno when the saga 237 | // begins execution. Steno passes it back to us in each action. This makes it 238 | // easy for us to access application-specific state, like a logger, HTTP 239 | // clients, etc. 
240 | #[derive(Debug)] 241 | struct TripContext; 242 | 243 | // Steno uses several type parameters that you specify by impl'ing the SagaType 244 | // trait. 245 | #[derive(Debug)] 246 | struct TripSaga; 247 | impl SagaType for TripSaga { 248 | // Type for the application-specific context (see above) 249 | type ExecContextType = Arc; 250 | } 251 | 252 | // Data types emitted by various saga actions. These must be serializable and 253 | // deserializable. This is the only supported way to share data between 254 | // actions in the same saga. 255 | 256 | #[derive(Debug, Deserialize, Serialize)] 257 | struct HotelReservation(String); 258 | #[derive(Debug, Deserialize, Serialize)] 259 | struct FlightReservation(String); 260 | #[derive(Debug, Deserialize, Serialize)] 261 | struct CarReservation(String); 262 | #[derive(Debug, Deserialize, Serialize)] 263 | struct PaymentConfirmation(String); 264 | #[derive(Debug, Deserialize, Serialize)] 265 | struct Summary { 266 | car: CarReservation, 267 | flight: FlightReservation, 268 | hotel: HotelReservation, 269 | payment: PaymentConfirmation, 270 | } 271 | 272 | // Saga action implementations 273 | 274 | async fn saga_charge_card( 275 | action_context: ActionContext, 276 | ) -> Result { 277 | let trip_context = action_context.user_data(); 278 | let params = action_context.saga_params::()?; 279 | let charge_details = ¶ms.charge_details; 280 | // ... (make request to another service) 281 | Ok(PaymentConfirmation(String::from("123"))) 282 | } 283 | 284 | async fn saga_refund_card( 285 | action_context: ActionContext, 286 | ) -> Result<(), anyhow::Error> { 287 | // Fetch the payment confirmation. The undo function is only ever invoked 288 | // after the action function has succeeded. This node is called "payment", 289 | // so we fetch our own action's output by looking up the data for "payment". 290 | let trip_context = action_context.user_data(); 291 | let p: PaymentConfirmation = action_context.lookup("payment")?; 292 | // ... 
(make request to another service -- must not fail) 293 | Ok(()) 294 | } 295 | 296 | async fn saga_book_hotel( 297 | action_context: ActionContext, 298 | ) -> Result { 299 | // ... 300 | let trip_context = action_context.user_data(); 301 | let params = action_context.saga_params::()?; 302 | let hotel_name = ¶ms.hotel_name; 303 | // ... (make request to another service) 304 | Ok(HotelReservation(String::from("123"))) 305 | } 306 | 307 | async fn saga_cancel_hotel( 308 | action_context: ActionContext, 309 | ) -> Result<(), anyhow::Error> { 310 | // ... 311 | let trip_context = action_context.user_data(); 312 | let confirmation: HotelReservation = action_context.lookup("hotel")?; 313 | // ... (make request to another service -- must not fail) 314 | Ok(()) 315 | } 316 | 317 | async fn saga_book_flight( 318 | action_context: ActionContext, 319 | ) -> Result { 320 | // ... 321 | let trip_context = action_context.user_data(); 322 | let params = action_context.saga_params::()?; 323 | let flight_info = ¶ms.flight_info; 324 | // ... (make request to another service) 325 | Ok(FlightReservation(String::from("123"))) 326 | } 327 | 328 | async fn saga_cancel_flight( 329 | action_context: ActionContext, 330 | ) -> Result<(), anyhow::Error> { 331 | // ... 332 | let trip_context = action_context.user_data(); 333 | let confirmation: FlightReservation = action_context.lookup("flight")?; 334 | // ... (make request to another service -- must not fail) 335 | Ok(()) 336 | } 337 | 338 | async fn saga_book_car( 339 | action_context: ActionContext, 340 | ) -> Result { 341 | // ... 342 | let trip_context = action_context.user_data(); 343 | let params = action_context.saga_params::()?; 344 | let car_info = ¶ms.car_info; 345 | // ... (make request to another service) 346 | Ok(CarReservation(String::from("123"))) 347 | } 348 | 349 | async fn saga_cancel_car( 350 | action_context: ActionContext, 351 | ) -> Result<(), anyhow::Error> { 352 | // ... 
353 | let trip_context = action_context.user_data(); 354 | let confirmation: CarReservation = action_context.lookup("car")?; 355 | // ... (make request to another service -- must not fail) 356 | Ok(()) 357 | } 358 | 359 | async fn saga_print( 360 | action_context: ActionContext, 361 | ) -> Result { 362 | Ok(Summary { 363 | car: action_context.lookup("car")?, 364 | flight: action_context.lookup("flight")?, 365 | hotel: action_context.lookup("hotel")?, 366 | payment: action_context.lookup("payment")?, 367 | }) 368 | } 369 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | Notes for future work 2 | ----------------------- 3 | 4 | Breakpoints, pausing, etc: create a common "wait" function. Arguments describe 5 | the event (e.g., starting action for node X). In pause mode, it checks a 6 | condition and decides whether to wait. In "step" mode, it decrements a 7 | semaphore. In normal mode, it does nothing.) Could add CLI tool that runs up 8 | to a breakpoint and exits (optionally dumps log). Could then add a "step" 9 | version. 10 | 11 | Subsagas and recovery: this might "just work" if saga actions that 12 | create subsagas were idempotent. But I don't think they are today. Maybe 13 | if we first-class `ActionSaga`, the framework can ensure that this is 14 | done idempotently. Relatedly, if we add policies around whether each node has 15 | an "undo" action, for this type of node, we might need that policy to differ 16 | depending on the ActionSaga (and not the Saga behind it). 17 | 18 | Static typing in the construction and execution of the graph: 19 | 20 | - Probably: this isn't so much input/output for each function, but a set of 21 | state depended-on by the function and a set of state produced by the 22 | function. 
(The difference is that the input of one does not need to 23 | match the output of the previous one, like I was previously assuming). 24 | - starting to see how a macro could allow you to say field X has type Y and 25 | insert into the beginning of your function the appropriate calls to 26 | lookup(), though that still wouldn't be statically type-checked. 27 | - Would one piece of this be to use macros on the action functions that 28 | also generate input and output types specific to that function? Would 29 | they also need to generate glue code from previous and subsequent nodes? 30 | - I think we want the Graph data structure to erase the specific 31 | input/output types as we do today. But maybe when it's still in the 32 | builder stage, we keep track of these types so that we can fail at 33 | compile time when constructing an invalid graph. 34 | 35 | Even if we skip the static typing: the interface for constructing a saga DAG 36 | feels like it needs work. 37 | -------------------------------------------------------------------------------- /release.toml: -------------------------------------------------------------------------------- 1 | # This file is used by cargo-release. 2 | # This version is cribbed from Dropshot and updated for cargo-release 0.21.1 3 | 4 | # Update the change log to reflect the new release and set us up for the next release. 5 | pre-release-replacements = [ 6 | # First, replace the current "Unreleased changes" header with one reflecting the new release version and date. 7 | {file="CHANGELOG.adoc", search="Unreleased changes \\(release date TBD\\)", replace="{{version}} (released {{date}})", exactly=1}, 8 | # Update the link to the list of raw commits in the formerly "Unreleased changes" section. It should end at the tag for the newly-released version. 9 | {file="CHANGELOG.adoc", search="\\\\.\\.\\.HEAD", replace="\\...{{tag_name}}", exactly=1}, 10 | # Next, append a new "Unreleased changes" header beneath the sentinel line. 
11 | {file="CHANGELOG.adoc", search="// cargo-release: next header goes here \\(do not change this line\\)", replace="// cargo-release: next header goes here (do not change this line)\n\n== Unreleased changes (release date TBD)\n\nhttps://github.com/oxidecomputer/steno/compare/{{tag_name}}\\...HEAD[Full list of commits]", exactly=1}, 12 | ] 13 | 14 | push = false 15 | pre-release-commit-message = "release {{crate_name}} {{version}}" 16 | post-release-commit-message = "starting {{crate_name}} {{next_version}} after releasing {{version}}" 17 | tag-message = "release {{crate_name}} {{version}}" 18 | tag-prefix = "" 19 | dev-version = true 20 | dev-version-ext = "dev" 21 | allow-branch = [ "main" ] 22 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | # We use a specific toolchain revision to ensure consistent behavior among 2 | # developers and test environments. The intent is to keep this updated as new 3 | # stable versions are released. 4 | [toolchain] 5 | channel = "1.87.0" 6 | profile = "default" 7 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------------------- 2 | # Stable features that we customize locally 3 | # --------------------------------------------------------------------------- 4 | max_width = 80 5 | use_small_heuristics = "max" 6 | edition = "2018" 7 | -------------------------------------------------------------------------------- /src/dag.rs: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at https://mozilla.org/MPL/2.0/. 4 | 5 | //! Saga and saga DAG construction 6 | //! 7 | //! A _saga_ is primarily made up of a directed acyclic graph (DAG) of saga 8 | //! _nodes_, most of which are _actions_. The facilities in this module are 9 | //! used to build up sagas. It looks like this: 10 | //! 11 | //! At the lowest layer, we have [`Node`]s, which usually describe an action. 12 | //! Use [`DagBuilder`] to assemble these into a [`Dag`]. The resulting `Dag` 13 | //! can be used in one of two ways: 14 | //! 15 | //! 1. When combined with parameters for the saga, the `Dag` becomes a 16 | //! [`SagaDag`] and can be executed using the [`crate::sec()`]. 17 | //! 2. Alternatively, the `Dag` can be used as a _subsaga_ of some other saga. 18 | //! To do this, use [`Node::subsaga`] to create a subsaga _node_ containing 19 | //! the subsaga `Dag`, then append this to the [`DagBuilder`] that's used to 20 | //! construct the outer saga. 21 | // Note: The graph-related types here don't implement JSON schema because of 22 | // Graph and NodeIndex. 23 | 24 | use crate::saga_action_generic::Action; 25 | use crate::SagaType; 26 | use anyhow::anyhow; 27 | use petgraph::dot; 28 | use petgraph::graph::NodeIndex; 29 | use petgraph::Directed; 30 | use petgraph::Graph; 31 | use schemars::JsonSchema; 32 | use serde::Deserialize; 33 | use serde::Serialize; 34 | use std::collections::BTreeMap; 35 | use std::collections::BTreeSet; 36 | use std::fmt; 37 | use std::sync::Arc; 38 | use thiserror::Error; 39 | use uuid::Uuid; 40 | 41 | /// Unique identifier for a Saga (an execution of a saga template) 42 | #[derive( 43 | Clone, 44 | Copy, 45 | Deserialize, 46 | Eq, 47 | JsonSchema, 48 | Ord, 49 | PartialEq, 50 | PartialOrd, 51 | Serialize, 52 | )] 53 | #[serde(transparent)] 54 | pub struct SagaId(pub Uuid); 55 | // TODO-cleanup figure out how to use custom_derive here? 56 | NewtypeDebug!
{ () pub struct SagaId(Uuid); } 57 | // TODO-design In the Oxide consumer, we probably want to have the serialized 58 | // form of ids have a prefix describing the type. This seems consumer-specific, 59 | // though. Is there a good way to support that? Maybe the best way to do 60 | // this is to have the consumer have their own enum or trait that impls Display 61 | // using the various ids provided by consumers. 62 | NewtypeDisplay! { () pub struct SagaId(Uuid); } 63 | NewtypeFrom! { () pub struct SagaId(Uuid); } 64 | 65 | /// Unique name for a saga [`Action`] 66 | /// 67 | /// Each action requires a string name that's unique within an 68 | /// [`ActionRegistry`]. During normal execution and when recovering sagas after 69 | /// a crash, the name is used to link each node in the DAG with the 70 | /// [`Action`] implementation. 71 | #[derive( 72 | Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, JsonSchema, 73 | )] 74 | pub struct ActionName(String); 75 | 76 | impl ActionName { 77 | pub fn new>(name: S) -> ActionName { 78 | ActionName(name.as_ref().to_string()) 79 | } 80 | } 81 | 82 | impl fmt::Debug for ActionName { 83 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 84 | f.write_fmt(format_args!("{:?}", self.0)) 85 | } 86 | } 87 | 88 | impl From for ActionName 89 | where 90 | S: AsRef, 91 | { 92 | fn from(s: S) -> Self { 93 | ActionName::new(s) 94 | } 95 | } 96 | 97 | /// Unique name for a saga [`Node`] 98 | /// 99 | /// Each node requires a string name that's unique within its DAG. The name is 100 | /// used to identify its output. Nodes that depend on a given node (either 101 | /// directly or indirectly) can access the node's output using its name.
102 | #[derive( 103 | Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, JsonSchema, 104 | )] 105 | pub struct NodeName(String); 106 | 107 | impl NodeName { 108 | pub fn new>(name: S) -> NodeName { 109 | NodeName(name.as_ref().to_string()) 110 | } 111 | } 112 | 113 | impl AsRef for NodeName { 114 | fn as_ref(&self) -> &str { 115 | &self.0 116 | } 117 | } 118 | 119 | impl fmt::Debug for NodeName { 120 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 121 | f.write_fmt(format_args!("{:?}", self.0)) 122 | } 123 | } 124 | 125 | /// Human-readable name for a particular saga 126 | /// 127 | /// Steno makes no assumptions about the semantics of this name. Consumers may 128 | /// wish to use this as a unique identifier. 129 | #[derive( 130 | Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, JsonSchema, 131 | )] 132 | #[serde(transparent)] 133 | pub struct SagaName(String); 134 | 135 | NewtypeDisplay! { () pub struct SagaName(String); } 136 | 137 | impl SagaName { 138 | pub fn new(name: &str) -> SagaName { 139 | SagaName(name.to_string()) 140 | } 141 | } 142 | 143 | impl fmt::Debug for SagaName { 144 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 145 | f.write_fmt(format_args!("{:?}", self.0)) 146 | } 147 | } 148 | 149 | /// An error returned from [`ActionRegistry::get()`] 150 | #[derive(Debug)] 151 | pub enum ActionRegistryError { 152 | /// No action has been registered with the specified name 153 | NotFound, 154 | } 155 | 156 | /// A registry of saga actions that can be used across multiple sagas. 157 | /// 158 | /// Actions are identified by their [`ActionName`]. 159 | // Actions can exist at multiple nodes in each saga DAG. Since saga construction 160 | // is dynamic and based upon user input, we need to allow a way to insert 161 | // actions at runtime into the DAG. 
While this could be achieved by referencing 162 | // the action during saga construction, this is not possible when reloading a 163 | // saga from persistent storage. In this case, the concrete type of the Action 164 | // is erased and the only mechanism we have to recover it is an `ActionName`. We 165 | // therefore have all users register their actions for use across sagas so we 166 | // can dynamically construct and restore sagas. 167 | #[derive(Debug)] 168 | pub struct ActionRegistry { 169 | actions: BTreeMap>>, 170 | } 171 | 172 | impl ActionRegistry { 173 | pub fn new() -> ActionRegistry { 174 | ActionRegistry { actions: BTreeMap::new() } 175 | } 176 | 177 | pub fn register(&mut self, action: Arc>) { 178 | let already_inserted = self.actions.insert(action.name(), action); 179 | assert!(already_inserted.is_none()); 180 | } 181 | 182 | pub fn get( 183 | &self, 184 | name: &ActionName, 185 | ) -> Result>, ActionRegistryError> { 186 | self.actions.get(name).cloned().ok_or(ActionRegistryError::NotFound) 187 | } 188 | } 189 | 190 | impl Default for ActionRegistry { 191 | fn default() -> Self { 192 | Self::new() 193 | } 194 | } 195 | 196 | /// Describes a node in the saga DAG 197 | /// 198 | /// There are three kinds of nodes you can add to a graph: 199 | /// 200 | /// * an _action_ (see [`Node::action`]), which executes a particular [`Action`] 201 | /// with an associated undo action 202 | /// * a _constant_ (see [`Node::constant`]), which is like an action that 203 | /// outputs a value that's known when the DAG is constructed 204 | /// * a _subsaga_ (see [`Node::subsaga`]), which executes another DAG in the 205 | /// context of this saga 206 | /// 207 | /// Each of these node types has a `node_name` and produces an output. 
Other 208 | /// nodes that depend on this node (directly or indirectly) can access the 209 | /// output by looking it up by the node name using 210 | /// [`crate::ActionContext::lookup`]: 211 | /// 212 | /// * The output of an action node is emitted by the action itself. 213 | /// * The output of a constant node is the value provided when the node was 214 | /// created (see [`Node::constant`]). 215 | /// * The output of a subsaga node is the output of the subsaga itself. Note 216 | /// that the output of individual nodes from the subsaga DAG is _not_ 217 | /// available to other nodes in this DAG. Only the final output is available. 218 | #[derive(Debug, Clone)] 219 | pub struct Node { 220 | node_name: NodeName, 221 | kind: NodeKind, 222 | } 223 | 224 | #[derive(Debug, Clone)] 225 | enum NodeKind { 226 | Action { label: String, action_name: ActionName }, 227 | Constant { value: serde_json::Value }, 228 | Subsaga { params_node_name: NodeName, dag: Dag }, 229 | } 230 | 231 | impl Node { 232 | /// Make a new action node (see [`Node`]) 233 | /// 234 | /// This node is used to execute the given action. The action's output will 235 | /// be available to dependent nodes by looking up the name `node_name`. See 236 | /// [`Action`] for more information. 237 | pub fn action, L: AsRef, A: SagaType>( 238 | node_name: N, 239 | label: L, 240 | action: &dyn Action, 241 | ) -> Node { 242 | Node { 243 | node_name: NodeName::new(node_name), 244 | kind: NodeKind::Action { 245 | label: label.as_ref().to_string(), 246 | action_name: action.name(), 247 | }, 248 | } 249 | } 250 | 251 | /// Make a new constant node (see [`Node`]) 252 | /// 253 | /// This node immediately emits `value`. Why would you want this? Suppose 254 | /// you're working with some saga action that expects input to come from 255 | /// some previous saga node. But in your case, you know the input up front. 256 | /// You can use this to provide the value to the downstream action. 
257 | pub fn constant>( 258 | node_name: N, 259 | value: serde_json::Value, 260 | ) -> Node { 261 | Node { 262 | node_name: NodeName::new(node_name), 263 | kind: NodeKind::Constant { value }, 264 | } 265 | } 266 | 267 | /// Make a new subsaga node (see [`Node`]) 268 | /// 269 | /// This is used to insert a subsaga into another saga. The output of the 270 | /// subsaga will have name `node_name` in the outer saga. The subsaga's DAG 271 | /// is described by `dag`. Its input parameters will come from node 272 | /// `params_node_name` in the outer saga. 273 | pub fn subsaga, N2: AsRef>( 274 | node_name: N1, 275 | dag: Dag, 276 | params_node_name: N2, 277 | ) -> Node { 278 | Node { 279 | node_name: NodeName::new(node_name), 280 | kind: NodeKind::Subsaga { 281 | params_node_name: NodeName::new(params_node_name), 282 | dag, 283 | }, 284 | } 285 | } 286 | } 287 | 288 | /// A Node in the saga DAG (internal representation) 289 | /// 290 | /// Since sagas are constructed dynamically at runtime, we don't know the 291 | /// shape of the graph ahead of time. We need to maintain enough information 292 | /// to reconstruct the saga when loaded from persistent storage. The easiest 293 | /// way to do that is to store the graph itself with enough information in 294 | /// each node that allows us to recreate the Saga. Note that we don't store 295 | /// the execution state of the saga here. That continues to reside in saga log 296 | /// consisting of `SagaNodeEvent`s. 297 | #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] 298 | pub(crate) enum InternalNode { 299 | Start { params: Arc }, 300 | End, 301 | Action { name: NodeName, label: String, action_name: ActionName }, 302 | Constant { name: NodeName, value: Arc }, 303 | SubsagaStart { saga_name: SagaName, params_node_name: NodeName }, 304 | SubsagaEnd { name: NodeName }, 305 | } 306 | 307 | impl InternalNode { 308 | pub fn node_name(&self) -> Option<&NodeName> { 309 | match self { 310 | InternalNode::Start { .. 
} 311 | | InternalNode::End 312 | | InternalNode::SubsagaStart { .. } => None, 313 | InternalNode::Action { name, .. } => Some(&name), 314 | InternalNode::Constant { name, .. } => Some(&name), 315 | InternalNode::SubsagaEnd { name, .. } => Some(&name), 316 | } 317 | } 318 | 319 | pub fn label(&self) -> String { 320 | match self { 321 | InternalNode::Start { .. } => String::from("(start node)"), 322 | InternalNode::End => String::from("(end node)"), 323 | InternalNode::Action { label, .. } => label.clone(), 324 | InternalNode::Constant { value, .. } => { 325 | let value_as_json = serde_json::to_string(value) 326 | .unwrap_or_else(|e| { 327 | format!("(failed to serialize constant value: {:#})", e) 328 | }); 329 | format!("(constant = {})", value_as_json) 330 | } 331 | InternalNode::SubsagaStart { saga_name, .. } => { 332 | format!("(subsaga start: {:?})", saga_name) 333 | } 334 | InternalNode::SubsagaEnd { .. } => String::from("(subsaga end)"), 335 | } 336 | } 337 | } 338 | 339 | /// A named, user-visible node in a saga graph. 340 | pub struct NodeEntry<'a> { 341 | internal: &'a InternalNode, 342 | index: NodeIndex, 343 | } 344 | 345 | impl NodeEntry<'_> { 346 | pub fn name(&self) -> &NodeName { 347 | self.internal.node_name().unwrap() 348 | } 349 | 350 | pub fn label(&self) -> String { 351 | self.internal.label() 352 | } 353 | 354 | pub fn index(&self) -> NodeIndex { 355 | self.index 356 | } 357 | } 358 | 359 | /// A [`Dag`] plus saga input parameters that together can be used to execute a 360 | /// saga 361 | /// 362 | /// The output of the saga is the output of the last node in the DAG. 363 | #[derive(Debug, Clone, Serialize, Deserialize)] 364 | pub struct SagaDag { 365 | /// name of the saga (intended primarily for use by the consumer) 366 | pub(crate) saga_name: SagaName, 367 | /// the actual DAG representation 368 | /// 369 | /// Unlike [`Dag`], [`SagaDag`]'s graph can contain any type of [`Node`]. 
370 | /// There is always exactly one [`InternalNode::Start`] node and exactly 371 | /// one [`InternalNode::End`] node. The graph can contain subsagas, 372 | /// which are always bracketed by [`InternalNode::SubsagaStart`] and 373 | /// [`InternalNode::SubsagaEnd`] nodes. 374 | pub(crate) graph: Graph, 375 | /// the index of the [`InternalNode::Start`] node for this Saga 376 | pub(crate) start_node: NodeIndex, 377 | /// the index of the [`InternalNode::End`] node for this Saga 378 | pub(crate) end_node: NodeIndex, 379 | } 380 | 381 | /// An [`Iterator`] over all named nodes in the DAG. 382 | pub struct SagaDagIterator<'a> { 383 | dag: &'a SagaDag, 384 | index: NodeIndex, 385 | } 386 | 387 | impl<'a> Iterator for SagaDagIterator<'a> { 388 | type Item = NodeEntry<'a>; 389 | 390 | fn next(&mut self) -> Option { 391 | while let Some(node) = self.dag.get(self.index) { 392 | let index = self.index; 393 | self.index = NodeIndex::new(self.index.index() + 1); 394 | match node { 395 | InternalNode::Action { .. } 396 | | InternalNode::Constant { .. } 397 | | InternalNode::SubsagaEnd { .. } => { 398 | return Some(NodeEntry { internal: node, index }) 399 | } 400 | _ => (), 401 | } 402 | } 403 | None 404 | } 405 | } 406 | 407 | impl SagaDag { 408 | /// Make a [`SagaDag`] from the given DAG and input parameters 409 | pub fn new(dagfrag: Dag, params: serde_json::Value) -> SagaDag { 410 | // Wrap the DAG with a Start node (which stores the parameters) and an 411 | // end node so that we can easily tell when the saga has completed. 412 | let mut graph = dagfrag.graph; 413 | let start_node = 414 | graph.add_node(InternalNode::Start { params: Arc::new(params) }); 415 | let end_node = graph.add_node(InternalNode::End); 416 | 417 | // The first-added nodes in the graph depend on the "start" node. 418 | for first_node in &dagfrag.first_nodes { 419 | graph.add_edge(start_node, *first_node, ()); 420 | } 421 | 422 | // The "end" node depends on the last-added nodes in the DAG. 
423 | for last_node in &dagfrag.last_nodes { 424 | graph.add_edge(*last_node, end_node, ()); 425 | } 426 | 427 | SagaDag { 428 | saga_name: dagfrag.saga_name, 429 | graph: graph, 430 | start_node, 431 | end_node, 432 | } 433 | } 434 | 435 | pub fn saga_name(&self) -> &SagaName { 436 | &self.saga_name 437 | } 438 | 439 | /// Return a node given its index 440 | pub(crate) fn get(&self, node_index: NodeIndex) -> Option<&InternalNode> { 441 | self.graph.node_weight(node_index) 442 | } 443 | 444 | /// Return the index for a given node name 445 | pub fn get_index(&self, name: &str) -> Result { 446 | self.graph 447 | .node_indices() 448 | .find(|i| { 449 | self.graph[*i] 450 | .node_name() 451 | .map(|n| n.as_ref() == name) 452 | .unwrap_or(false) 453 | }) 454 | .ok_or_else(|| anyhow!("saga has no node named \"{}\"", name)) 455 | } 456 | 457 | /// Returns an iterator over all named nodes in the saga DAG. 458 | pub fn get_nodes(&self) -> SagaDagIterator<'_> { 459 | SagaDagIterator { dag: self, index: NodeIndex::new(0) } 460 | } 461 | 462 | /// Returns an object that can be used to print a graphviz-format 463 | /// representation of the underlying DAG 464 | pub fn dot(&self) -> DagDot<'_> { 465 | DagDot(&self.graph) 466 | } 467 | } 468 | 469 | /// Graphviz-formatted view of a saga DAG 470 | /// 471 | /// Use the `Display` impl to print a representation suitable as input to 472 | /// the `dot` command. You could put this into a file `graph.out` and run 473 | /// something like `dot -Tpng -o graph.png graph.out` to produce `graph.png`, a 474 | /// visual representation of the saga graph. 
475 | pub struct DagDot<'a>(&'a Graph); 476 | impl fmt::Display for DagDot<'_> { 477 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 478 | let config = &[dot::Config::EdgeNoLabel]; 479 | let dot = dot::Dot::with_config(&self.0, config); 480 | write!(f, "{:?}", dot) 481 | } 482 | } 483 | 484 | /// Describes a directed acyclic graph (DAG) to be used as a saga or subsaga 485 | /// 486 | /// If you want to run this `Dag` as a saga, you need to create a [`SagaDag`] 487 | /// (which requires providing input parameters). 488 | /// 489 | /// If you want to insert this `Dag` into another saga (as a subsaga), use 490 | /// [`Node::subsaga()`] to create a subsaga _node_ and append this to the outer 491 | /// saga's [`DagBuilder`]. 492 | /// 493 | /// This type is built with [`DagBuilder`]. 494 | #[derive(Debug, Clone, Serialize, Deserialize)] 495 | pub struct Dag { 496 | /// name of the saga (intended primarily for use by the consumer) 497 | saga_name: SagaName, 498 | 499 | /// the actual DAG representation 500 | /// 501 | /// This graph does *not* contain a [`InternalNode::Start`] or 502 | /// [`InternalNode::End`] node. Those only make sense for `Dag`s that will 503 | /// become top-level sagas (as opposed to subsagas). Instead, we keep 504 | /// track of the first group of DAG (root nodes) and the last group of 505 | /// DAG nodes (leaf nodes). Later, we'll wrap this `Dag` in either 506 | /// [`SagaDag`] (for use as a top-level saga), in which case we'll add 507 | /// the start and end nodes, or we'll use it as a subsaga, in which 508 | /// case we'll add SubsagaStart and SubsagaEnd nodes. 
509 | graph: Graph, 510 | 511 | /// the initial nodes (root nodes) of the DAG 512 | first_nodes: Vec, 513 | 514 | /// the last nodes (leaf nodes) of the DAG 515 | last_nodes: Vec, 516 | } 517 | 518 | /// Used to build a [`Dag`] that can then be executed as either a saga or 519 | /// subsaga 520 | /// 521 | /// Use [`DagBuilder::append()`] and [`DagBuilder::append_parallel()`] to add 522 | /// nodes to the graph. Use [`DagBuilder::build()`] to finish construction and 523 | /// build a [`Dag`]. 524 | #[derive(Debug)] 525 | pub struct DagBuilder { 526 | /// name of the saga (intended primarily for use by the consumer) 527 | saga_name: SagaName, 528 | /// the actual DAG representation 529 | /// 530 | /// This looks the same as [`Dag`]'s `graph`. 531 | graph: Graph, 532 | 533 | /// the initial set of nodes (root nodes), if any have been added 534 | first_added: Option>, 535 | 536 | /// the most-recently-added set of nodes (current leaf nodes) 537 | /// 538 | /// Callers use the builder by appending a sequence of nodes (or subsagas). 539 | /// Some of these may run concurrently. 540 | /// 541 | /// The `append()`/`append_parallel()` functions are public. As the names 542 | /// imply, they append new nodes to the graph. They also update 543 | /// "last_added" so that the next set of nodes will depend on the ones that 544 | /// were just added. 545 | /// 546 | /// The `add_*()` functions are private, for use only by 547 | /// `append()`/`append_parallel()`. These functions have a consistent 548 | /// pattern: they add nodes to the graph, they create dependencies from 549 | /// each node in "last_added" to each of the new nodes, and they return the 550 | /// index of the last node they added. They do _not_ update "last_added" 551 | /// themselves. 
552 | last_added: Vec, 553 | 554 | /// names of nodes added so far 555 | node_names: BTreeSet, 556 | 557 | /// error from any builder operation (returned by `build()`) 558 | error: Option, 559 | } 560 | 561 | #[derive(Clone, Debug, Eq, Error, PartialEq)] 562 | #[error("building saga \"{saga_name}\": {kind:#}")] 563 | pub struct DagBuilderError { 564 | saga_name: SagaName, 565 | 566 | #[source] 567 | kind: DagBuilderErrorKind, 568 | } 569 | 570 | #[derive(Clone, Debug, Eq, Error, PartialEq)] 571 | enum DagBuilderErrorKind { 572 | #[error("saga must end with exactly one node")] 573 | /// The saga ended with zero nodes (if it was empty) or more than one node. 574 | /// Sagas are required to end with a single node that emits the output for 575 | /// the saga itself. 576 | BadOutputNode, 577 | 578 | #[error( 579 | "subsaga node {0:?} has parameters that come from node {1:?}, but it \ 580 | does not depend on any such node" 581 | )] 582 | /// A subsaga was appended whose parameters were supposed to come from a 583 | /// node that does not exist or that the subsaga does not depend on. 584 | BadSubsagaParams(NodeName, NodeName), 585 | 586 | #[error("name was used multiple times in the same Dag: {0:?}")] 587 | /// The same name was given to multiple nodes in the same DAG. This is not 588 | /// allowed. Node names must be unique because they are used to identify 589 | /// outputs. 590 | DuplicateName(NodeName), 591 | 592 | #[error("attempted to append 0 nodes in parallel")] 593 | /// It's not allowed to append 0 nodes in parallel. 594 | EmptyStage, 595 | } 596 | 597 | impl Dag { 598 | /// Creates a new [`DagBuilder`] instance. 
599 | pub fn builder(saga_name: SagaName) -> DagBuilder { 600 | DagBuilder::new(saga_name) 601 | } 602 | } 603 | 604 | impl DagBuilder { 605 | /// Begin building a DAG for a saga or subsaga 606 | pub fn new(saga_name: SagaName) -> DagBuilder { 607 | DagBuilder { 608 | saga_name, 609 | graph: Graph::new(), 610 | first_added: None, 611 | last_added: vec![], 612 | node_names: BTreeSet::new(), 613 | error: None, 614 | } 615 | } 616 | 617 | /// Adds a new node to the graph to be run after the most-recently-appended 618 | /// node(s) 619 | /// 620 | /// The new node will depend on completion of all actions that were added in 621 | /// the last call to `append` or `append_parallel`. The idea is to `append` 622 | /// a sequence of steps that run one after another. 623 | pub fn append(&mut self, user_node: Node) { 624 | self.append_parallel(vec![user_node]) 625 | } 626 | 627 | /// Adds a set of nodes to the graph that depend on the 628 | /// most-recently-appended node(s) but that can be executed concurrently 629 | /// with each other 630 | /// 631 | /// The new nodes will individually depend on completion of all actions that 632 | /// were added in the last call to `append()` or `append_parallel()`. 633 | pub fn append_parallel(&mut self, user_nodes: Vec) { 634 | // If we've encountered an error already, don't do anything. We'll 635 | // report this when the user invokes `build()`. 636 | if self.error.is_some() { 637 | return; 638 | } 639 | 640 | // It's not allowed to have an empty stage. It's not clear what would 641 | // be intended by this. With the current implementation, you'd wind up 642 | // creating two separate connected components of the DAG, which violates 643 | // all kinds of assumptions. 644 | if user_nodes.len() == 0 { 645 | self.error = Some(DagBuilderErrorKind::EmptyStage); 646 | return; 647 | } 648 | 649 | // Validate that if we're appending a subsaga, then the node from which 650 | // it gets its parameters has already been appended. 
If it hasn't been 651 | // appended, this is definitely invalid because we'd have no way to get 652 | // these parameters when we need them. As long as the parameters node 653 | // has been appended already, then the new subsaga node will depend on 654 | // that node (either directly or indirectly). 655 | // 656 | // It might seem more natural to validate this in `add_subsaga()`. But 657 | // if we're appending multiple nodes in parallel here, then by the time 658 | // we get to `add_subsaga()`, it's possible that we've added nodes to 659 | // the graph on which the subsaga does _not_ depend. That would cause 660 | // us to erroneously think this is valid when it's not. 661 | // 662 | // If this fails, we won't report it until the user calls `build()`. In 663 | // the meantime, we proceed with the building. The problem here doesn't 664 | // make that any harder. 665 | for node in &user_nodes { 666 | if let NodeKind::Subsaga { params_node_name, .. } = &node.kind { 667 | if !self.node_names.contains(¶ms_node_name) 668 | && self.error.is_none() 669 | { 670 | self.error = Some(DagBuilderErrorKind::BadSubsagaParams( 671 | node.node_name.clone(), 672 | params_node_name.clone(), 673 | )); 674 | return; 675 | } 676 | } 677 | } 678 | 679 | // Now validate that the names of these new nodes are unique. It's 680 | // important that we do this after the above check because otherwise if 681 | // the user added a subsaga node whose parameters came from a parallel 682 | // node (added in the same call), we wouldn't catch the problem. 683 | for node in &user_nodes { 684 | if !self.node_names.insert(node.node_name.clone()) { 685 | self.error = Some(DagBuilderErrorKind::DuplicateName( 686 | node.node_name.clone(), 687 | )); 688 | return; 689 | } 690 | } 691 | 692 | // Now we can proceed with adding the nodes. 
693 | let newnodes: Vec = user_nodes 694 | .into_iter() 695 | .map(|user_node| self.add_node(user_node)) 696 | .collect(); 697 | 698 | if self.first_added.is_none() { 699 | self.first_added = Some(newnodes.clone()); 700 | } 701 | 702 | // TODO-design For this exploration, we assume that any nodes appended 703 | // after a parallel set are intended to depend on _all_ nodes in the 704 | // parallel set. This doesn't have to be the case in general, but if 705 | // you wanted to do something else, you probably would need pretty 706 | // fine-grained control over the construction of the graph. This is 707 | // mostly a question of how to express the construction of the graph, 708 | // not the graph itself nor how it gets processed, so let's defer for 709 | // now. 710 | // 711 | // Even given that, it might make more sense to implement this by 712 | // creating an intermediate node that all the parallel nodes have edges 713 | // to, and then edges from this intermediate node to the next set of 714 | // parallel nodes. 715 | self.set_last(&newnodes); 716 | } 717 | 718 | // Implementation note: the add_* functions here add nodes to the DAG, add 719 | // dependencies from `self.last` (the last set of nodes added), but do NOT 720 | // set `self.last`. They return the `NodeIndex` of the last thing they 721 | // added. 
722 | 723 | /// Adds any kind of [`Node`] to the graph 724 | fn add_node(&mut self, user_node: Node) -> NodeIndex { 725 | match user_node.kind { 726 | NodeKind::Action { label, action_name } => { 727 | self.add_simple(InternalNode::Action { 728 | name: user_node.node_name, 729 | label, 730 | action_name, 731 | }) 732 | } 733 | NodeKind::Constant { value } => { 734 | self.add_simple(InternalNode::Constant { 735 | name: user_node.node_name, 736 | value: Arc::new(value), 737 | }) 738 | } 739 | NodeKind::Subsaga { params_node_name, dag } => { 740 | self.add_subsaga(user_node.node_name, dag, params_node_name) 741 | } 742 | } 743 | } 744 | 745 | /// Adds a `InternalNode::Constant` or `InternalNode::Action` to the graph 746 | fn add_simple(&mut self, node: InternalNode) -> NodeIndex { 747 | assert!(matches!( 748 | node, 749 | InternalNode::Constant { .. } | InternalNode::Action { .. } 750 | )); 751 | let newnode = self.graph.add_node(node); 752 | self.depends_on_last(newnode); 753 | newnode 754 | } 755 | 756 | /// Adds another DAG to this one as a subsaga 757 | /// 758 | /// This isn't quite the same as inserting the given DAG into the DAG that 759 | /// we're building. Subsaga nodes live in a separate namespace of node 760 | /// names. We do this by adding the SubsagaStart and SubsagaEnd nodes 761 | /// around the given DAG. 762 | fn add_subsaga( 763 | &mut self, 764 | name: NodeName, 765 | subsaga_dag: Dag, 766 | params_node_name: NodeName, 767 | ) -> NodeIndex { 768 | let node_start = InternalNode::SubsagaStart { 769 | saga_name: subsaga_dag.saga_name.clone(), 770 | params_node_name: NodeName::new(params_node_name), 771 | }; 772 | let subsaga_start = self.graph.add_node(node_start); 773 | self.depends_on_last(subsaga_start); 774 | 775 | // Insert all the nodes of the subsaga into this saga. 
776 | let subgraph = &subsaga_dag.graph; 777 | let mut subsaga_idx_to_saga_idx = BTreeMap::new(); 778 | for child_node_index in 0..subgraph.node_count() { 779 | let child_node_index = NodeIndex::from(child_node_index as u32); 780 | let node = subgraph.node_weight(child_node_index).unwrap().clone(); 781 | 782 | // Dags are not allowed to have Start/End nodes. These are only 783 | // added to `SagaDag`s. Given that, we can copy the rest of the 784 | // nodes directly into the parent graph. 785 | match node { 786 | InternalNode::Start { .. } | InternalNode::End => { 787 | panic!("subsaga Dag contained unexpected node: {:?}", node); 788 | } 789 | InternalNode::Action { .. } 790 | | InternalNode::Constant { .. } 791 | | InternalNode::SubsagaStart { .. } 792 | | InternalNode::SubsagaEnd { .. } => (), 793 | }; 794 | 795 | // We already appended the start node 796 | let parent_node_index = self.graph.add_node(node); 797 | 798 | assert!(subsaga_idx_to_saga_idx 799 | .insert(child_node_index, parent_node_index) 800 | .is_none()); 801 | 802 | // For any incoming edges for this node in the subgraph, create a 803 | // corresponding edge in the new graph. 804 | for ancestor_child_node_index in subgraph 805 | .neighbors_directed(child_node_index, petgraph::Incoming) 806 | { 807 | let ancestor_parent_node_index = subsaga_idx_to_saga_idx 808 | .get(&ancestor_child_node_index) 809 | .expect("graph was not a DAG"); 810 | self.graph.add_edge( 811 | *ancestor_parent_node_index, 812 | parent_node_index, 813 | (), 814 | ); 815 | } 816 | } 817 | 818 | // The initial nodes of the subsaga DAG must depend on the SubsagaStart 819 | // node that we added. 820 | for child_first_node in &subsaga_dag.first_nodes { 821 | let parent_first_node = 822 | subsaga_idx_to_saga_idx.get(&child_first_node).unwrap(); 823 | self.graph.add_edge(subsaga_start, *parent_first_node, ()); 824 | } 825 | 826 | // Add a SubsagaEnd node that depends on the last nodes of the subsaga 827 | // DAG. 
828 | let subsaga_end = 829 | self.graph.add_node(InternalNode::SubsagaEnd { name }); 830 | for child_last_node in &subsaga_dag.last_nodes { 831 | let parent_last_node = 832 | subsaga_idx_to_saga_idx.get(&child_last_node).unwrap(); 833 | self.graph.add_edge(*parent_last_node, subsaga_end, ()); 834 | } 835 | 836 | subsaga_end 837 | } 838 | 839 | /// Record that node `newnode` depends on the last set of nodes that were 840 | /// appended 841 | fn depends_on_last(&mut self, newnode: NodeIndex) { 842 | for node in &self.last_added { 843 | self.graph.add_edge(*node, newnode, ()); 844 | } 845 | } 846 | 847 | /// Record that the nodes in `nodes` should be ancestors of whatever nodes 848 | /// get added next. 849 | fn set_last(&mut self, nodes: &[NodeIndex]) { 850 | self.last_added = nodes.to_vec(); 851 | } 852 | 853 | /// Return the constructed DAG 854 | pub fn build(self) -> Result { 855 | // If we ran into a problem along the way, report it now. 856 | if let Some(error) = self.error { 857 | return Err(DagBuilderError { 858 | saga_name: self.saga_name.clone(), 859 | kind: error, 860 | }); 861 | } 862 | 863 | // Every saga must end in exactly one leaf node. 
864 | if self.last_added.len() != 1 { 865 | return Err(DagBuilderError { 866 | saga_name: self.saga_name.clone(), 867 | kind: DagBuilderErrorKind::BadOutputNode, 868 | }); 869 | } 870 | 871 | Ok(Dag { 872 | saga_name: self.saga_name, 873 | graph: self.graph, 874 | first_nodes: self.first_added.unwrap_or_else(|| Vec::new()), 875 | last_nodes: self.last_added, 876 | }) 877 | } 878 | } 879 | 880 | #[cfg(test)] 881 | mod test { 882 | use super::DagBuilder; 883 | use super::DagBuilderErrorKind; 884 | use super::Node; 885 | use super::NodeName; 886 | use super::SagaName; 887 | 888 | #[test] 889 | fn test_saga_names_and_label() { 890 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 891 | builder.append(Node::constant("a", serde_json::Value::Null)); 892 | let dag = crate::SagaDag::new( 893 | builder.build().expect("Should have built DAG"), 894 | serde_json::Value::Null, 895 | ); 896 | 897 | let mut nodes = dag.get_nodes(); 898 | 899 | let node = nodes.next().unwrap(); 900 | assert_eq!("a", node.name().as_ref()); 901 | assert_eq!("(constant = null)", node.label()); 902 | 903 | assert!(nodes.next().is_none()); 904 | } 905 | 906 | #[test] 907 | fn test_builder_bad_output_nodes() { 908 | // error case: totally empty DAG 909 | let builder = DagBuilder::new(SagaName::new("test-saga")); 910 | let result = builder.build(); 911 | println!("{:?}", result); 912 | match result { 913 | Ok(_) => panic!("unexpected success"), 914 | Err(error) => { 915 | assert_eq!(error.saga_name.to_string(), "test-saga"); 916 | assert!(matches!( 917 | error.kind, 918 | DagBuilderErrorKind::BadOutputNode 919 | )); 920 | assert_eq!( 921 | error.to_string(), 922 | "building saga \"test-saga\": saga must end with exactly \ 923 | one node" 924 | ); 925 | } 926 | }; 927 | 928 | // error case: a DAG that ends with two nodes 929 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 930 | builder.append_parallel(vec![ 931 | Node::constant("a", serde_json::Value::Null), 932 | 
Node::constant("b", serde_json::Value::Null), 933 | ]); 934 | let result = builder.build(); 935 | println!("{:?}", result); 936 | assert!(matches!( 937 | result.unwrap_err().kind, 938 | DagBuilderErrorKind::BadOutputNode 939 | )); 940 | } 941 | 942 | #[test] 943 | fn test_builder_empty_stage() { 944 | // error case: a DAG with a 0-node stage in it 945 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 946 | builder.append_parallel(vec![]); 947 | let result = builder.build(); 948 | println!("{:?}", result); 949 | let error = result.unwrap_err(); 950 | assert!(matches!(error.kind, DagBuilderErrorKind::EmptyStage)); 951 | assert_eq!( 952 | error.to_string(), 953 | "building saga \"test-saga\": attempted to append 0 nodes in \ 954 | parallel" 955 | ); 956 | } 957 | 958 | #[test] 959 | fn test_builder_duplicate_names() { 960 | // error case: a DAG that duplicates names (direct ancestor) 961 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 962 | builder.append(Node::constant("a", serde_json::Value::Null)); 963 | builder.append(Node::constant("a", serde_json::Value::Null)); 964 | let error = builder.build().unwrap_err(); 965 | println!("{:?}", error); 966 | assert_eq!( 967 | error.kind, 968 | DagBuilderErrorKind::DuplicateName(NodeName::new("a")) 969 | ); 970 | assert_eq!( 971 | error.to_string(), 972 | "building saga \"test-saga\": name was used multiple times in the \ 973 | same Dag: \"a\"" 974 | ); 975 | 976 | // error case: a DAG that duplicates names (indirect ancestor) 977 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 978 | builder.append(Node::constant("a", serde_json::Value::Null)); 979 | builder.append_parallel(vec![ 980 | Node::constant("b", serde_json::Value::Null), 981 | Node::constant("c", serde_json::Value::Null), 982 | ]); 983 | builder.append(Node::constant("a", serde_json::Value::Null)); 984 | let error = builder.build().unwrap_err(); 985 | println!("{:?}", error); 986 | assert_eq!( 987 | error.kind, 988 | 
DagBuilderErrorKind::DuplicateName(NodeName::new("a")) 989 | ); 990 | 991 | // error case: a DAG that duplicates names (parallel) 992 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 993 | builder.append(Node::constant("a", serde_json::Value::Null)); 994 | builder.append_parallel(vec![ 995 | Node::constant("b", serde_json::Value::Null), 996 | Node::constant("b", serde_json::Value::Null), 997 | ]); 998 | let error = builder.build().unwrap_err(); 999 | println!("{:?}", error); 1000 | assert_eq!( 1001 | error.kind, 1002 | DagBuilderErrorKind::DuplicateName(NodeName::new("b")) 1003 | ); 1004 | 1005 | // success case: a DAG that uses the same name for a node outside and 1006 | // inside a subsaga 1007 | let mut inner_builder = DagBuilder::new(SagaName::new("inner-saga")); 1008 | inner_builder.append(Node::constant("a", serde_json::Value::Null)); 1009 | let inner_dag = inner_builder.build().unwrap(); 1010 | let mut outer_builder = DagBuilder::new(SagaName::new("outer-saga")); 1011 | outer_builder.append(Node::constant("a", serde_json::Value::Null)); 1012 | outer_builder.append(Node::subsaga("b", inner_dag, "a")); 1013 | let _ = outer_builder.build().unwrap(); 1014 | } 1015 | 1016 | #[test] 1017 | fn test_builder_bad_subsaga_params() { 1018 | let mut subsaga_builder = DagBuilder::new(SagaName::new("inner-saga")); 1019 | subsaga_builder.append(Node::constant("a", serde_json::Value::Null)); 1020 | let subsaga_dag = subsaga_builder.build().unwrap(); 1021 | 1022 | // error case: subsaga depends on params node that doesn't exist 1023 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 1024 | builder.append(Node::constant("a", serde_json::Value::Null)); 1025 | builder.append(Node::subsaga("b", subsaga_dag.clone(), "barf")); 1026 | let error = builder.build().unwrap_err(); 1027 | println!("{:?}", error); 1028 | assert_eq!( 1029 | error.kind, 1030 | DagBuilderErrorKind::BadSubsagaParams( 1031 | NodeName::new("b"), 1032 | NodeName::new("barf") 1033 | ) 
1034 | ); 1035 | assert_eq!( 1036 | error.to_string(), 1037 | "building saga \"test-saga\": subsaga node \"b\" has parameters \ 1038 | that come from node \"barf\", but it does not depend on any such \ 1039 | node" 1040 | ); 1041 | 1042 | // error case: subsaga depends on params node that doesn't exist 1043 | // (itself) 1044 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 1045 | builder.append(Node::constant("a", serde_json::Value::Null)); 1046 | builder.append(Node::subsaga("b", subsaga_dag.clone(), "b")); 1047 | let error = builder.build().unwrap_err(); 1048 | println!("{:?}", error); 1049 | assert_eq!( 1050 | error.kind, 1051 | DagBuilderErrorKind::BadSubsagaParams( 1052 | NodeName::new("b"), 1053 | NodeName::new("b") 1054 | ) 1055 | ); 1056 | 1057 | // error case: subsaga depends on params node that doesn't exist 1058 | // (added in parallel) 1059 | let mut builder = DagBuilder::new(SagaName::new("test-saga")); 1060 | builder.append(Node::constant("a", serde_json::Value::Null)); 1061 | builder.append_parallel(vec![ 1062 | Node::constant("c", serde_json::Value::Null), 1063 | Node::subsaga("b", subsaga_dag, "c"), 1064 | ]); 1065 | let error = builder.build().unwrap_err(); 1066 | println!("{:?}", error); 1067 | assert_eq!( 1068 | error.kind, 1069 | DagBuilderErrorKind::BadSubsagaParams( 1070 | NodeName::new("b"), 1071 | NodeName::new("c") 1072 | ) 1073 | ); 1074 | } 1075 | } 1076 | -------------------------------------------------------------------------------- /src/example_provision.rs: -------------------------------------------------------------------------------- 1 | //! 
Common code shared by examples 2 | 3 | use crate::ActionContext; 4 | use crate::ActionError; 5 | use crate::ActionFuncResult; 6 | use crate::ActionRegistry; 7 | use crate::Dag; 8 | use crate::DagBuilder; 9 | use crate::Node; 10 | use crate::SagaDag; 11 | use crate::SagaName; 12 | use crate::SagaType; 13 | use serde::Deserialize; 14 | use serde::Serialize; 15 | use std::sync::Arc; 16 | use thiserror::Error; 17 | 18 | // Demo provision saga: 19 | // 20 | // create instance (database) 21 | // | | | 22 | // +------+ + +-------------+ 23 | // | | | 24 | // v v v 25 | // alloc IP create volume pick server 26 | // | | | 27 | // +------+--+ v 28 | // | allocate server resources 29 | // | | 30 | // +-------------------+ 31 | // | 32 | // v 33 | // configure instance (server) 34 | // | 35 | // v 36 | // attach volume 37 | // | 38 | // v 39 | // boot instance 40 | 41 | #[doc(hidden)] 42 | #[derive(Debug)] 43 | pub struct ExampleSagaType {} 44 | impl SagaType for ExampleSagaType { 45 | type ExecContextType = ExampleContext; 46 | } 47 | 48 | #[doc(hidden)] 49 | #[derive(Debug, Deserialize, Serialize)] 50 | pub struct ExampleParams { 51 | pub instance_name: String, 52 | pub number_of_instances: u16, 53 | } 54 | 55 | #[doc(hidden)] 56 | #[derive(Debug, Default)] 57 | pub struct ExampleContext; 58 | 59 | type SagaExampleContext = ActionContext; 60 | 61 | #[derive(Debug, Deserialize, Error, Serialize)] 62 | enum ExampleError { 63 | #[error("example error")] 64 | AnError, 65 | } 66 | 67 | type ExFuncResult = ActionFuncResult; 68 | 69 | #[derive(Debug)] 70 | struct ExampleSubsagaType {} 71 | impl SagaType for ExampleSubsagaType { 72 | type ExecContextType = ExampleContext; 73 | } 74 | 75 | #[derive(Debug, Deserialize, Serialize)] 76 | struct ExampleSubsagaParams { 77 | number_of_things: u32, 78 | } 79 | 80 | #[derive(Debug, Deserialize, Serialize)] 81 | struct ServerAllocResult { 82 | server_id: u64, 83 | } 84 | 85 | // TODO-cleanup can we implement this generically? 
86 | impl From for ActionError { 87 | fn from(t: ExampleError) -> ActionError { 88 | ActionError::action_failed(t) 89 | } 90 | } 91 | 92 | mod actions { 93 | use super::ExampleSagaType; 94 | use crate::new_action_noop_undo; 95 | use crate::Action; 96 | use lazy_static::lazy_static; 97 | use std::sync::Arc; 98 | 99 | lazy_static! { 100 | pub static ref INSTANCE_CREATE: Arc> = 101 | new_action_noop_undo( 102 | "instance_create", 103 | super::demo_prov_instance_create, 104 | ); 105 | pub static ref VPC_ALLOC_IP: Arc> = 106 | new_action_noop_undo("vpc_alloc_ip", super::demo_prov_vpc_alloc_ip); 107 | pub static ref VOLUME_CREATE: Arc> = 108 | new_action_noop_undo( 109 | "volume_create", 110 | super::demo_prov_volume_create, 111 | ); 112 | pub static ref INSTANCE_CONFIGURE: Arc> = 113 | new_action_noop_undo( 114 | "instance_configure", 115 | super::demo_prov_instance_configure, 116 | ); 117 | pub static ref VOLUME_ATTACH: Arc> = 118 | new_action_noop_undo( 119 | "volume_attach", 120 | super::demo_prov_volume_attach, 121 | ); 122 | pub static ref INSTANCE_BOOT: Arc> = 123 | new_action_noop_undo( 124 | "instance_boot", 125 | super::demo_prov_instance_boot, 126 | ); 127 | pub static ref PRINT: Arc> = 128 | new_action_noop_undo("print", super::demo_prov_print); 129 | pub static ref SERVER_PICK: Arc> = 130 | new_action_noop_undo("server_pick", super::demo_prov_server_pick); 131 | pub static ref SERVER_RESERVE: Arc> = 132 | new_action_noop_undo( 133 | "server_reserve", 134 | super::demo_prov_server_reserve, 135 | ); 136 | } 137 | } 138 | 139 | /// Load our actions into an ActionRegistry 140 | #[doc(hidden)] 141 | pub fn load_example_actions(registry: &mut ActionRegistry) { 142 | registry.register(actions::INSTANCE_CREATE.clone()); 143 | registry.register(actions::VPC_ALLOC_IP.clone()); 144 | registry.register(actions::VOLUME_CREATE.clone()); 145 | registry.register(actions::INSTANCE_CONFIGURE.clone()); 146 | registry.register(actions::VOLUME_ATTACH.clone()); 147 | 
registry.register(actions::INSTANCE_BOOT.clone()); 148 | registry.register(actions::PRINT.clone()); 149 | registry.register(actions::SERVER_PICK.clone()); 150 | registry.register(actions::SERVER_RESERVE.clone()); 151 | } 152 | 153 | /// Create a subsaga for server allocation 154 | fn server_alloc_subsaga() -> Dag { 155 | let name = SagaName::new("server-alloc"); 156 | let mut d = DagBuilder::new(name); 157 | d.append(Node::action( 158 | "server_id", 159 | "ServerPick", 160 | actions::SERVER_PICK.as_ref(), 161 | )); 162 | d.append(Node::action( 163 | "server_reserve", 164 | "ServerReserve", 165 | actions::SERVER_RESERVE.as_ref(), 166 | )); 167 | 168 | d.build().unwrap() 169 | } 170 | 171 | /// Create a dag that describes a demo "VM Provision" Saga 172 | /// 173 | /// The actions in this saga do essentially nothing. They print out what 174 | /// node is running, they produce some data, and they consume some data 175 | /// from previous nodes. The intent is just to exercise the API. You can 176 | /// interact with this using the `demo-provision` example. 
177 | #[doc(hidden)] 178 | pub fn make_example_provision_dag(params: ExampleParams) -> Arc { 179 | let name = SagaName::new("DemoVmProvision"); 180 | let mut d = DagBuilder::new(name); 181 | 182 | d.append(Node::action( 183 | "instance_id", 184 | "InstanceCreate", 185 | actions::INSTANCE_CREATE.as_ref(), 186 | )); 187 | 188 | let subsaga_params = ExampleSubsagaParams { number_of_things: 1 }; 189 | d.append(Node::constant( 190 | "server_alloc_params", 191 | serde_json::to_value(subsaga_params).unwrap(), 192 | )); 193 | 194 | d.append_parallel(vec![ 195 | Node::action( 196 | "instance_ip", 197 | "VpcAllocIp", 198 | actions::VPC_ALLOC_IP.as_ref(), 199 | ), 200 | Node::action( 201 | "volume_id", 202 | "VolumeCreate", 203 | actions::VOLUME_CREATE.as_ref(), 204 | ), 205 | Node::subsaga( 206 | "server_alloc", 207 | server_alloc_subsaga(), 208 | "server_alloc_params", 209 | ), 210 | ]); 211 | 212 | // Append nodes that will run after the subsaga completes 213 | d.append(Node::action( 214 | "instance_configure", 215 | "InstanceConfigure", 216 | actions::INSTANCE_CONFIGURE.as_ref(), 217 | )); 218 | d.append(Node::action( 219 | "volume_attach", 220 | "VolumeAttach", 221 | actions::VOLUME_ATTACH.as_ref(), 222 | )); 223 | d.append(Node::action( 224 | "instance_boot", 225 | "InstanceBoot", 226 | actions::INSTANCE_BOOT.as_ref(), 227 | )); 228 | d.append(Node::action("print", "Print", actions::PRINT.as_ref())); 229 | 230 | Arc::new(SagaDag::new( 231 | d.build().unwrap(), 232 | serde_json::to_value(params).unwrap(), 233 | )) 234 | } 235 | 236 | async fn demo_prov_instance_create( 237 | sgctx: SagaExampleContext, 238 | ) -> ExFuncResult { 239 | let params = sgctx.saga_params::()?; 240 | eprintln!( 241 | "running action: {} (instance name: {})", 242 | sgctx.node_label(), 243 | params.instance_name 244 | ); 245 | // exercise saga parameters 246 | // make up an instance ID 247 | let instance_id = 1211u64; 248 | Ok(instance_id) 249 | } 250 | 251 | async fn demo_prov_vpc_alloc_ip( 252 | 
sgctx: SagaExampleContext, 253 | ) -> ExFuncResult { 254 | eprintln!("running action: {}", sgctx.node_label()); 255 | // exercise using some data from a previous node 256 | let instance_id = sgctx.lookup::("instance_id")?; 257 | assert_eq!(instance_id, 1211); 258 | // make up an IP (simulate allocation) 259 | let ip = String::from("10.120.121.122"); 260 | Ok(ip) 261 | } 262 | 263 | // Another subsaga action 264 | async fn demo_prov_server_pick(sgctx: SagaExampleContext) -> ExFuncResult { 265 | eprintln!("running action: {}", sgctx.node_label()); 266 | let params = sgctx.saga_params::()?; 267 | // exercise subsaga parameters 268 | assert_eq!(params.number_of_things, 1); 269 | // make up ("allocate") a new server id 270 | let server_id = 1212u64; 271 | Ok(server_id) 272 | } 273 | 274 | // The last subsaga action 275 | async fn demo_prov_server_reserve( 276 | sgctx: SagaExampleContext, 277 | ) -> ExFuncResult { 278 | eprintln!("running action: {}", sgctx.node_label()); 279 | let params = sgctx.saga_params::()?; 280 | 281 | // exercise subsaga parameters 282 | assert_eq!(params.number_of_things, 1); 283 | // exercise using data from previous nodes 284 | let server_id = sgctx.lookup::("server_id")?; 285 | assert_eq!(server_id, 1212); 286 | // package this up for downstream consumers 287 | Ok(ServerAllocResult { server_id }) 288 | } 289 | 290 | async fn demo_prov_volume_create( 291 | sgctx: SagaExampleContext, 292 | ) -> ExFuncResult { 293 | eprintln!("running action: {}", sgctx.node_label()); 294 | // exercise using data from previous nodes 295 | assert_eq!(sgctx.lookup::("instance_id")?, 1211); 296 | // make up ("allocate") a volume id 297 | let volume_id = 1213u64; 298 | Ok(volume_id) 299 | } 300 | 301 | async fn demo_prov_instance_configure( 302 | sgctx: SagaExampleContext, 303 | ) -> ExFuncResult<()> { 304 | eprintln!("running action: {}", sgctx.node_label()); 305 | // exercise using data from previous nodes 306 | assert_eq!(sgctx.lookup::("instance_id")?, 1211); 
307 | 308 | let params = sgctx.saga_params::()?; 309 | assert_eq!(params.number_of_instances, 1); 310 | assert_eq!( 311 | sgctx.lookup::("server_alloc")?.server_id, 312 | 1212 313 | ); 314 | 315 | assert_eq!(sgctx.lookup::("volume_id")?, 1213); 316 | Ok(()) 317 | } 318 | async fn demo_prov_volume_attach( 319 | sgctx: SagaExampleContext, 320 | ) -> ExFuncResult<()> { 321 | eprintln!("running action: {}", sgctx.node_label()); 322 | // exercise using data from previous nodes 323 | assert_eq!(sgctx.lookup::("instance_id")?, 1211); 324 | assert_eq!(sgctx.lookup::("volume_id")?, 1213); 325 | 326 | assert_eq!( 327 | sgctx.lookup::("server_alloc")?.server_id, 328 | 1212 329 | ); 330 | Ok(()) 331 | } 332 | async fn demo_prov_instance_boot( 333 | sgctx: SagaExampleContext, 334 | ) -> ExFuncResult<()> { 335 | eprintln!("running action: {}", sgctx.node_label()); 336 | // exercise using data from previous nodes 337 | assert_eq!(sgctx.lookup::("instance_id")?, 1211); 338 | assert_eq!(sgctx.lookup::("volume_id")?, 1213); 339 | 340 | // We know there is only one instance of the subsaga that created a server 341 | // id 342 | assert_eq!( 343 | sgctx.lookup::("server_alloc")?.server_id, 344 | 1212 345 | ); 346 | Ok(()) 347 | } 348 | 349 | async fn demo_prov_print(sgctx: SagaExampleContext) -> ExFuncResult { 350 | eprintln!("running action: {}", sgctx.node_label()); 351 | eprintln!("printing final state:"); 352 | let vm_instance_id = sgctx.lookup::("instance_id")?; 353 | eprintln!(" instance id: {}", vm_instance_id); 354 | let ip = sgctx.lookup::("instance_ip")?; 355 | eprintln!(" IP address: {}", ip); 356 | let volume_id = sgctx.lookup::("volume_id")?; 357 | eprintln!(" volume id: {}", volume_id); 358 | let server_id = 359 | sgctx.lookup::("server_alloc")?.server_id; 360 | eprintln!(" server id: {}", server_id); 361 | Ok(String::from("it worked")) 362 | } 363 | -------------------------------------------------------------------------------- /src/lib.rs: 
-------------------------------------------------------------------------------- 1 | //! Steno is an in-progress prototype implementation of distributed sagas. 2 | //! Sagas orchestrate the execution of a set of asynchronous tasks that can 3 | //! fail. The saga pattern provides useful semantics for unwinding the whole 4 | //! operation when any task fails. For more on distributed sagas, see [this 5 | //! 2017 JOTB talk by Caitie McCaffrey][1]. 6 | //! 7 | //! [1]: https://www.youtube.com/watch?v=0UTOLRTwOX0 8 | //! 9 | //! ## Overview 10 | //! 11 | //! * Write some functions that will be used as _actions_ and _undo actions_ for 12 | //! your saga. Package these up with [`ActionFunc::new_action()`]. 13 | //! * Add these actions to an [`ActionRegistry`] 14 | //! * Use [`DagBuilder`] to construct a graph of these actions. Wrap this up in 15 | //! a [`SagaDag`]. 16 | //! * Construct a saga execution coordinator with [`sec()`] and use that to run 17 | //! the saga. You can start with an [`InMemorySecStore`] or impl your own 18 | //! [`SecStore`]. 19 | //! 20 | //! This crate is necessarily somewhat complex to use. **For a detailed, 21 | //! documented example, see examples/trip.rs.** 22 | 23 | #![deny(elided_lifetimes_in_paths)] 24 | // We disable the warning for unstable name collisions because we deliberately 25 | // have some conflicts in rust_features.rs (corresponding to backports of 26 | // unstable features). If and when these features are stabilized, we should see 27 | // warnings that our backported versions are unused and we can remove them. 28 | #![allow(unstable_name_collisions)] 29 | 30 | mod dag; 31 | mod example_provision; 32 | mod rust_features; 33 | mod saga_action_error; 34 | mod saga_action_func; 35 | mod saga_action_generic; 36 | mod saga_exec; 37 | mod saga_log; 38 | mod sec; 39 | mod store; 40 | 41 | // TODO-cleanup The example_provision stuff should probably be in a separate 42 | // crate that depends on "steno". 
That would ensure it only uses public 43 | // interfaces. However, the "steno" crate wants to have an example that uses 44 | // this crate, hence our problem. 45 | pub use example_provision::load_example_actions; 46 | pub use example_provision::make_example_provision_dag; 47 | pub use example_provision::ExampleContext; 48 | pub use example_provision::ExampleParams; 49 | pub use example_provision::ExampleSagaType; 50 | 51 | pub use dag::ActionName; 52 | pub use dag::ActionRegistry; 53 | pub use dag::ActionRegistryError; 54 | pub use dag::Dag; 55 | pub use dag::DagBuilder; 56 | pub use dag::DagBuilderError; 57 | pub use dag::Node; 58 | pub use dag::NodeName; 59 | pub use dag::SagaDag; 60 | pub use dag::SagaId; 61 | pub use dag::SagaName; 62 | pub use saga_action_error::ActionError; 63 | pub use saga_action_error::UndoActionError; 64 | pub use saga_action_func::new_action_noop_undo; 65 | pub use saga_action_func::ActionFunc; 66 | pub use saga_action_func::ActionFuncResult; 67 | pub use saga_action_generic::Action; 68 | pub use saga_action_generic::ActionData; 69 | pub use saga_action_generic::ActionResult; 70 | pub use saga_action_generic::SagaType; 71 | pub use saga_action_generic::UndoResult; 72 | pub use saga_exec::ActionContext; 73 | pub use saga_exec::SagaExecStatus; 74 | pub use saga_exec::SagaResult; 75 | pub use saga_exec::SagaResultErr; 76 | pub use saga_exec::SagaResultOk; 77 | pub use saga_log::SagaLog; 78 | pub use saga_log::SagaNodeEvent; 79 | pub use saga_log::SagaNodeEventType; 80 | pub use saga_log::SagaNodeId; 81 | pub use sec::sec; 82 | pub use sec::RepeatInjected; 83 | pub use sec::SagaSerialized; 84 | pub use sec::SagaStateView; 85 | pub use sec::SagaView; 86 | pub use sec::SecClient; 87 | pub use store::InMemorySecStore; 88 | pub use store::SagaCachedState; 89 | pub use store::SagaCreateParams; 90 | pub use store::SecStore; 91 | 92 | // TODO-cleanup This ought not to be exposed. 
It's here because we expose 93 | // SagaTemplateGeneric, which is important, and it has a function that uses this 94 | // type. This ought to be a sealed trait where this function is private or 95 | // something. 96 | pub use sec::SecExecClient; 97 | 98 | #[macro_use] 99 | extern crate slog; 100 | #[macro_use] 101 | extern crate newtype_derive; 102 | -------------------------------------------------------------------------------- /src/rust_features.rs: -------------------------------------------------------------------------------- 1 | //! Backports of useful unstable Rust features. 2 | 3 | // feature(option_expect_none) 4 | pub trait ExpectNone { 5 | fn expect_none(self, message: &'static str); 6 | } 7 | 8 | impl ExpectNone for Option { 9 | fn expect_none(self, message: &'static str) { 10 | assert!(self.is_none(), "{}", message); 11 | } 12 | } 13 | 14 | #[cfg(test)] 15 | mod test { 16 | use super::ExpectNone; 17 | 18 | #[test] 19 | fn test_some() { 20 | let x: Option<()> = None; 21 | x.expect_none("hello"); 22 | } 23 | 24 | #[test] 25 | #[should_panic(expected = "boom")] 26 | fn test_none() { 27 | let x = Some(()); 28 | x.expect_none("boom"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/saga_action_error.rs: -------------------------------------------------------------------------------- 1 | //! Error types produced by saga actions 2 | 3 | use crate::saga_action_generic::ActionData; 4 | use schemars::JsonSchema; 5 | use serde::Deserialize; 6 | use serde::Serialize; 7 | use std::fmt::Display; 8 | use thiserror::Error; 9 | 10 | /// An error produced by a saga action 11 | /// 12 | /// On failure, actions always return an `ActionError`. This type can represent 13 | /// a failure from Steno itself or a failure produced by the consumer (e.g., an 14 | /// action whose body fails for some reason). The various specific errors are 15 | /// documented below. 
16 | /// 17 | /// You can use your own error type with [`ActionError`]. As long as it meets 18 | /// the requirements of [`ActionData`], you can wrap your error in an 19 | /// [`ActionError::ActionFailed`] variant using 20 | /// [`ActionError::action_failed()`]. Given an [`ActionError::ActionFailed`] 21 | /// variant, you can get your specific type back out again using 22 | /// [`ActionError::convert()`]. 23 | /// 24 | /// Note that the conversion back to your specific error type can fail! This 25 | /// looks like a downcast, but it's not. `ActionError`s are typically recorded 26 | /// in the saga log and interpreted later, possibly after a crash and recovery. 27 | /// Whether there was an intervening crash or not, the conversion here 28 | /// deserializes the error from the log into your custom error type. This won't 29 | /// work if your error type is incompatible with the one that was used to 30 | /// serialize the error in the first place. 31 | /// 32 | /// # Example 33 | /// 34 | /// ```rust 35 | /// use serde::Deserialize; 36 | /// use serde::Serialize; 37 | /// use steno::ActionError; 38 | /// 39 | /// #[derive(Debug, Deserialize, Serialize)] 40 | /// struct MyError { message: String } 41 | /// 42 | /// fn my_func_that_fails() -> Result<(), ActionError> { 43 | /// Err(ActionError::action_failed(MyError { message: "boom!".to_owned() })) 44 | /// } 45 | /// 46 | /// fn handle_error(error: ActionError) { 47 | /// match error.convert::() { 48 | /// Ok(my_error) => { 49 | /// eprintln!("my action failed because: {}", my_error.message); 50 | /// } 51 | /// Err(other_error) => { 52 | /// eprintln!( 53 | /// "my action failed because the framework had a problem: {}", 54 | /// other_error.to_string() 55 | /// ); 56 | /// } 57 | /// } 58 | /// } 59 | /// ``` 60 | #[derive(Clone, Debug, Deserialize, Error, JsonSchema, Serialize)] 61 | pub enum ActionError { 62 | /// Action failed due to a consumer-specific error 63 | #[error("action failed")] 64 | ActionFailed { 
source_error: serde_json::Value }, 65 | 66 | /// The framework failed to deserialize the saga parameters, an action's 67 | /// successful result, or an action's error. 68 | #[error("deserialize failed: {message}")] 69 | DeserializeFailed { message: String }, 70 | 71 | /// The consumer requested that an error be injected instead of running a 72 | /// particular action's node. 73 | #[error("error injected")] 74 | InjectedError, 75 | 76 | /// The framework failed to serialize the saga parameters, an action's 77 | /// successful result, or an action's error. 78 | #[error("serialize failed: {message}")] 79 | SerializeFailed { message: String }, 80 | 81 | /// The framework failed to create the requested subsaga 82 | #[error("failed to create subsaga")] 83 | SubsagaCreateFailed { message: String }, 84 | } 85 | 86 | impl ActionError { 87 | /// Wrap a consumer-provided error in an [`ActionError`] 88 | // TODO-design Is there a way for us to provide this implementation 89 | // automatically? It would be nice if a consumer could use their own error 90 | // type, use `?` in the body of their function, and then have that get 91 | // wrapped in an ActionError. We'd like to provide a blanket impl for any 92 | // supported error type to convert it to ActionError. But ActionError is 93 | // itself a supported error type (not necessarily by design), so this 94 | // doesn't work. 95 | pub fn action_failed(user_error: E) -> ActionError { 96 | match serde_json::to_value(user_error) { 97 | Ok(source_error) => ActionError::ActionFailed { source_error }, 98 | Err(serialize_error) => ActionError::new_serialize(serialize_error), 99 | } 100 | } 101 | 102 | /// Try to convert the error to a specific consumer error 103 | /// 104 | /// This function streamlines the most common use case by decomposing the 105 | /// error into one of three cases: 106 | /// 107 | /// 1. 
If the error can be converted to the specific error type `E` (which 108 | /// means that this is the `ActionError::ActionFailed` variant and the 109 | /// wrapped error could be deserialized to `E`), this function returns 110 | /// `Ok(E)`. 111 | /// 112 | /// 2. If the error is the `ActionError::ActionFailed` variant but could not 113 | /// be converted to type `E`, this function returns `Err(ActionError)` 114 | /// where the error is the `ActionError::DeserializeFailed`. This is 115 | /// either a bug in the current program or an unexpected operational 116 | /// error, as might happen if incompatible versions of the saga executor 117 | /// are deployed. Most consumers will propagate this error up and 118 | /// eventually abandon the saga. 119 | /// 120 | /// 3. If the error is any other variant, the error itself is returned as 121 | /// `Err(ActionError)`. Most consumers will propagate this error up. 122 | pub fn convert(self) -> Result { 123 | match self { 124 | ActionError::ActionFailed { source_error } => { 125 | serde_json::from_value(source_error) 126 | .map_err(ActionError::new_deserialize) 127 | } 128 | _ => Err(self), 129 | } 130 | } 131 | 132 | pub fn new_serialize(source: serde_json::Error) -> ActionError { 133 | ActionError::SerializeFailed { message: source.to_string() } 134 | } 135 | 136 | pub fn new_deserialize(source: E) -> ActionError { 137 | ActionError::DeserializeFailed { message: format!("{:#}", source) } 138 | } 139 | 140 | pub fn new_subsaga(source: anyhow::Error) -> ActionError { 141 | let message = format!("{:#}", source); 142 | ActionError::SubsagaCreateFailed { message } 143 | } 144 | } 145 | 146 | /// An error produced by a failed undo action 147 | /// 148 | /// **Returning an error from an undo action should be avoided if at all 149 | /// possible.** If undo actions experience transient issues, they should 150 | /// generally retry until the undo action completes successfully. 
That's 151 | /// because by definition, failure of an undo action means that the saga's 152 | /// actions cannot be unwound. The system cannot move forward to the desired 153 | /// saga end state nor backward to the initial state. It's left forever in some 154 | /// partially-updated state. This should really only happen because of a bug. 155 | /// It should be expected that human intervention will be required to repair the 156 | /// result of an undo action that has failed. 157 | #[derive(Clone, Debug, Deserialize, Error, JsonSchema, Serialize)] 158 | pub enum UndoActionError { 159 | /// Undo action failed due to a consumer-specific error 160 | #[error("undo action failed permanently: {source_error:#}")] 161 | PermanentFailure { source_error: serde_json::Value }, 162 | } 163 | -------------------------------------------------------------------------------- /src/saga_action_func.rs: -------------------------------------------------------------------------------- 1 | //! Saga actions implemented using a pair of functions 2 | //! 3 | //! This is the primary interface for actions that we expose to users. It's a 4 | //! little fancier than the generic [`Action`] trait. 5 | 6 | use crate::saga_action_error::ActionError; 7 | use crate::saga_action_generic::Action; 8 | use crate::saga_action_generic::ActionData; 9 | use crate::saga_action_generic::ActionResult; 10 | use crate::saga_action_generic::SagaType; 11 | use crate::saga_action_generic::UndoResult; 12 | use crate::saga_exec::ActionContext; 13 | use crate::ActionName; 14 | use futures::future::BoxFuture; 15 | use std::any::type_name; 16 | use std::fmt; 17 | use std::fmt::Debug; 18 | use std::future::Future; 19 | use std::sync::Arc; 20 | 21 | /// Result of a function that implements a saga action 22 | // This differs from [`ActionResult`] because [`ActionResult`] returns a generic 23 | // type. 
The function-oriented interface allows you to return more specific 24 | // types as long as they implement the [`ActionData`] trait. 25 | // 26 | // TODO-design There's no reason that the generic interface couldn't also look 27 | // like this. We have this mechanism here to allow `ActionFunc` functions 28 | // to return specific types while storing the generic thing inside the 29 | // framework. We do this translation in the impl of `ActionFunc`. But we 30 | // could instead create another layer above `Action` that does this. This gets 31 | // complicated and doesn't seem especially useful yet. 32 | pub type ActionFuncResult = Result; 33 | 34 | /// Trait that expresses the requirements for async functions to be used with 35 | /// `ActionFunc`. This exists just to express the relationships between the 36 | /// types involved in the function, so that they don't have to be repeated 37 | /// everywhere. You don't need to implement it yourself -- a blanket 38 | /// implementation is provided. 39 | pub trait ActionFn<'c, S: SagaType>: Send + Sync + 'static { 40 | /// Type returned when the future finally resolves. 41 | type Output; 42 | /// Type of the future returned when the function is called. 43 | type Future: Future + Send + 'c; 44 | /// Call the function. 
45 | fn act(&'c self, ctx: ActionContext) -> Self::Future; 46 | } 47 | 48 | // Blanket impl for Fn types returning futures 49 | impl<'c, F, S, FF> ActionFn<'c, S> for F 50 | where 51 | S: SagaType, 52 | F: Fn(ActionContext) -> FF + Send + Sync + 'static, 53 | FF: std::future::Future + Send + 'c, 54 | { 55 | type Future = FF; 56 | type Output = FF::Output; 57 | fn act(&'c self, ctx: ActionContext) -> Self::Future { 58 | self(ctx) 59 | } 60 | } 61 | 62 | /// Implementation of [`Action`] that uses ordinary functions for the action and 63 | /// undo action 64 | pub struct ActionFunc { 65 | name: ActionName, 66 | action_func: ActionFuncType, 67 | undo_func: UndoFuncType, 68 | } 69 | 70 | impl ActionFunc { 71 | /// Construct an [`Action`] from a pair of functions, using `action_func` 72 | /// for the action and `undo_func` for the undo action 73 | /// 74 | /// The result is returned as `Arc` so that it can be used 75 | /// directly where `Action`s are expected. (The struct `ActionFunc` has no 76 | /// interfaces of its own so there's generally no need to have the specific 77 | /// type.) 78 | pub fn new_action( 79 | name: Name, 80 | action_func: ActionFuncType, 81 | undo_func: UndoFuncType, 82 | ) -> Arc> 83 | where 84 | Name: AsRef, 85 | UserType: SagaType, 86 | for<'c> ActionFuncType: ActionFn< 87 | 'c, 88 | UserType, 89 | Output = ActionFuncResult, 90 | >, 91 | ActionFuncOutput: ActionData, 92 | for<'c> UndoFuncType: ActionFn<'c, UserType, Output = UndoResult>, 93 | { 94 | Arc::new(ActionFunc { 95 | name: ActionName::new(name.as_ref()), 96 | action_func, 97 | undo_func, 98 | }) 99 | } 100 | } 101 | 102 | // TODO-cleanup why can't new_action_noop_undo live in the Action namespace? 103 | 104 | /// Given a function `f`, return an `ActionFunc` that uses `f` as the action and 105 | /// provides a no-op undo function (which does nothing and always succeeds). 
106 | pub fn new_action_noop_undo( 107 | name: Name, 108 | f: ActionFuncType, 109 | ) -> Arc> 110 | where 111 | Name: AsRef, 112 | UserType: SagaType, 113 | for<'c> ActionFuncType: ActionFn< 114 | 'c, 115 | UserType, 116 | Output = ActionFuncResult, 117 | >, 118 | ActionFuncOutput: ActionData, 119 | { 120 | // TODO-log 121 | ActionFunc::new_action(name, f, |_| async { Ok(()) }) 122 | } 123 | 124 | impl Action 125 | for ActionFunc 126 | where 127 | UserType: SagaType, 128 | for<'c> ActionFuncType: ActionFn< 129 | 'c, 130 | UserType, 131 | Output = ActionFuncResult, 132 | >, 133 | ActionFuncOutput: ActionData, 134 | for<'c> UndoFuncType: ActionFn<'c, UserType, Output = UndoResult>, 135 | { 136 | fn do_it( 137 | &self, 138 | sgctx: ActionContext, 139 | ) -> BoxFuture<'_, ActionResult> { 140 | Box::pin(async move { 141 | let fut = self.action_func.act(sgctx); 142 | // Execute the caller's function and translate its type into the 143 | // generic JsonValue that the framework uses to store action 144 | // outputs. 145 | fut.await 146 | .and_then(|func_output| { 147 | serde_json::to_value(func_output) 148 | .map_err(ActionError::new_serialize) 149 | }) 150 | .map(Arc::new) 151 | }) 152 | } 153 | 154 | fn undo_it( 155 | &self, 156 | sgctx: ActionContext, 157 | ) -> BoxFuture<'_, UndoResult> { 158 | Box::pin(self.undo_func.act(sgctx)) 159 | } 160 | 161 | fn name(&self) -> ActionName { 162 | self.name.clone() 163 | } 164 | } 165 | 166 | impl Debug 167 | for ActionFunc 168 | { 169 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 170 | // The type name for a function includes its name, so it's a handy 171 | // summary for debugging. 172 | f.write_str(&type_name::()) 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/saga_action_generic.rs: -------------------------------------------------------------------------------- 1 | //! Saga actions, core implementations, and related facilities 2 | //! 3 | //! 
This file contains a generic [`Action`] trait that is not intended to be 4 | //! used by Steno consumers, though it is exposed. Users are expected to 5 | //! use [`crate::ActionFunc`] in saga_action_func.rs. 6 | 7 | use crate::saga_action_error::ActionError; 8 | use crate::saga_exec::ActionContext; 9 | use crate::ActionName; 10 | use futures::future::BoxFuture; 11 | use serde::de::DeserializeOwned; 12 | use serde::Serialize; 13 | use std::fmt::Debug; 14 | use std::sync::Arc; 15 | 16 | /// Collection of consumer-provided types, effectively defining the type 17 | /// signature of a saga 18 | /// 19 | /// This trait bundles a bunch of consumer-provided types that are used 20 | /// throughout Steno to avoid a sprawl of type parameters and duplicated trait 21 | /// bounds. 22 | pub trait SagaType: Debug + 'static { 23 | /// Type for the consumer's context object 24 | /// 25 | /// When beginning execution of a saga with 26 | /// [`crate::SecClient::saga_create()`] or resuming a previous execution 27 | /// with [`crate::SecClient::saga_resume()`], consumers provide a 28 | /// context object with this type. This object is not persistent. 29 | /// Rather, it provides programming interfaces the consumer wants 30 | /// available from within actions. For example, this could include HTTP 31 | /// clients that will be used by the action to make requests to 32 | /// dependent services. This object is made available to actions via 33 | /// [`crate::ActionContext::user_data()`]. There's one context for the 34 | /// life of each saga's execution. 35 | type ExecContextType: Debug + Send + Sync + 'static; 36 | } 37 | 38 | /// Data produced by the consumer that may need to be serialized to the saga log 39 | /// 40 | /// This type is used for saga parameters and the output data and errors from an 41 | /// individual action. It's essentially a synonym for `Debug + DeserializeOwned 42 | /// + Serialize + Send + Sync`. Consumers are not expected to impl this 43 | /// directly. 
44 | pub trait ActionData: 45 | Debug + DeserializeOwned + Serialize + Send + Sync + 'static 46 | { 47 | } 48 | impl ActionData 49 | for T 50 | { 51 | } 52 | 53 | /// Result of a saga action 54 | /// 55 | /// In this generic Action interface, actions return a pretty generic 56 | /// `serde_json::Value`. This is something that we can store uniformly, 57 | /// serialize to the log, and deserialize into a more specific type when the 58 | /// consumer asks for that. (By contrast, the `ActionFunc` impl is a little 59 | /// fancier. It allows consumers to return anything that _can_ be serialized. 60 | /// That's why consumers should prefer that interface and not this one.) 61 | // TODO-cleanup can we drop this Arc? 62 | pub type ActionResult = Result, ActionError>; 63 | 64 | /// Result of a saga undo action 65 | // TODO-design what should the error type here be? Maybe something that can 66 | // encompass "general framework error"? This might put the saga into a "needs 67 | // attention" state? 68 | pub type UndoResult = Result<(), anyhow::Error>; 69 | 70 | /// Building blocks of sagas 71 | /// 72 | /// Each node in a saga graph is represented with some kind of `Action`, 73 | /// which provides entry points to asynchronously execute an action and its 74 | /// corresponding undo action. A saga is essentially a directed acyclic graph 75 | /// of these actions with dependencies between them. Each action consumes an 76 | /// [`ActionContext`] and asynchronously produces an [`ActionResult`]. The 77 | /// primary implementor for most consumers is [`crate::ActionFunc`]. 78 | /// 79 | /// Actions should be stateless. Any state is supposed to be stored via the 80 | /// framework. So it should be easy to make Actions Send and Sync. This is 81 | /// important because we want to be able to have multiple references to the same 82 | /// Action in multiple threads -- as might happen if the same action appeared 83 | /// multiple times in the saga or in different sagas. 
84 | pub trait Action: Debug + Send + Sync { 85 | /// Executes the action for this saga node, whatever that is. Actions 86 | /// function like requests in distributed sagas: critically, they must be 87 | /// idempotent. This means that multiple calls to the action have the 88 | /// same result on the system as a single call, although the action is not 89 | /// necessarily required to return the same result. 90 | /// 91 | /// As an example, generating a UUID to represent an object is a common saga 92 | /// action: if called repeatedly, it may generate different results, but it 93 | /// has no side effects on the rest of the system. Similarly, using a 94 | /// generated UUID in a subsequent action to create an object may help 95 | /// ensure that the side effects appear the same, regardless of how many 96 | /// times the action has been invoked. 97 | /// 98 | /// Actions should be very careful in using interfaces outside of 99 | /// [`ActionContext`] -- we want them to be as self-contained as possible to 100 | /// ensure idempotence and to minimize versioning issues. 101 | /// 102 | /// On success, this function produces a serialized output. This output 103 | /// will be stored persistently, keyed by the _name_ of the current saga 104 | /// node. Subsequent stages can access this data with 105 | /// [`ActionContext::lookup`]. This is the _only_ supported means of 106 | /// sharing state across actions within a saga. 107 | /// 108 | /// The output of the last node in the DAG becomes the output of the saga. 109 | fn do_it( 110 | &self, 111 | sgctx: ActionContext, 112 | ) -> BoxFuture<'_, ActionResult>; 113 | 114 | /// Executes the undo action for this saga node, whatever that is. 
115 | fn undo_it( 116 | &self, 117 | sgctx: ActionContext, 118 | ) -> BoxFuture<'_, UndoResult>; 119 | 120 | /// Return the name of the action used as the key in the ActionRegistry 121 | fn name(&self) -> ActionName; 122 | } 123 | 124 | // Special action implementations 125 | 126 | /// [`Action`] impl that emits a value known when the DAG is created 127 | /// 128 | /// This is used to implement [`UserNode::Constant`]. 129 | #[derive(Debug)] 130 | pub struct ActionConstant { 131 | value: Arc, 132 | } 133 | 134 | impl ActionConstant { 135 | pub fn new(value: Arc) -> ActionConstant { 136 | ActionConstant { value } 137 | } 138 | } 139 | 140 | impl Action for ActionConstant 141 | where 142 | UserType: SagaType, 143 | { 144 | fn do_it(&self, _: ActionContext) -> BoxFuture<'_, ActionResult> { 145 | Box::pin(futures::future::ok(self.value.clone())) 146 | } 147 | 148 | fn undo_it(&self, _: ActionContext) -> BoxFuture<'_, UndoResult> { 149 | Box::pin(futures::future::ok(())) 150 | } 151 | 152 | fn name(&self) -> ActionName { 153 | ActionName::new("ActionConstant") 154 | } 155 | } 156 | 157 | /// Simulates an error at a given spot in the saga graph 158 | #[derive(Debug)] 159 | pub struct ActionInjectError {} 160 | 161 | impl Action for ActionInjectError { 162 | fn do_it(&self, _: ActionContext) -> BoxFuture<'_, ActionResult> { 163 | // TODO-log 164 | Box::pin(futures::future::err(ActionError::InjectedError)) 165 | } 166 | 167 | fn undo_it(&self, _: ActionContext) -> BoxFuture<'_, UndoResult> { 168 | // We should never undo an action that failed. But this same impl is 169 | // plugged into a saga when an "undo action" error is injected. 
170 | Box::pin(futures::future::err(anyhow::anyhow!("error injected"))) 171 | } 172 | 173 | fn name(&self) -> ActionName { 174 | ActionName::new("InjectError") 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/saga_log.rs: -------------------------------------------------------------------------------- 1 | //! Persistent state for sagas 2 | 3 | use crate::saga_action_error::ActionError; 4 | use crate::saga_action_error::UndoActionError; 5 | use crate::SagaId; 6 | use anyhow::anyhow; 7 | use anyhow::Context; 8 | use schemars::JsonSchema; 9 | use serde::Deserialize; 10 | use serde::Serialize; 11 | use std::collections::BTreeMap; 12 | use std::fmt; 13 | use std::sync::Arc; 14 | use thiserror::Error; 15 | 16 | /// Unique identifier for a saga node 17 | // We use a newtype for SagaNodeId for the usual reasons. What about the 18 | // underlying representation? The Omicron consumer is going to store these in 19 | // CockroachDB, which makes `i64` the most natural numeric type. There's no 20 | // need for signed values here, so we choose `u32` as large enough for our 21 | // purposes, unsigned, and can be infallibly converted to an `i64`. 22 | // 23 | // TODO-cleanup figure out how to use custom_derive here? 24 | #[derive( 25 | Deserialize, 26 | Clone, 27 | Copy, 28 | Eq, 29 | Ord, 30 | JsonSchema, 31 | PartialEq, 32 | PartialOrd, 33 | Serialize, 34 | )] 35 | #[serde(transparent)] 36 | pub struct SagaNodeId(u32); 37 | NewtypeDebug! { () pub struct SagaNodeId(u32); } 38 | NewtypeDisplay! { () pub struct SagaNodeId(u32); } 39 | NewtypeFrom! 
{ () pub struct SagaNodeId(u32); } 40 | 41 | #[derive(Debug, Clone, Error)] 42 | pub enum SagaLogError { 43 | #[error( 44 | "event type {event_type} is illegal with current load status \ 45 | {current_status:?}" 46 | )] 47 | IllegalEventForState { 48 | current_status: SagaNodeLoadStatus, 49 | event_type: SagaNodeEventType, 50 | }, 51 | } 52 | 53 | /// An entry in the saga log 54 | #[derive(Clone, Deserialize, Serialize)] 55 | pub struct SagaNodeEvent { 56 | /// id of the saga 57 | pub saga_id: SagaId, 58 | /// id of the saga node 59 | pub node_id: SagaNodeId, 60 | /// what's indicated by this event 61 | pub event_type: SagaNodeEventType, 62 | } 63 | 64 | impl fmt::Debug for SagaNodeEvent { 65 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 66 | write!(f, "N{:0>3} {}", self.node_id, self.event_type) 67 | } 68 | } 69 | 70 | /// Event types that may be found in the log for a particular action 71 | /// 72 | /// (This is not a general-purpose debug log, but more like an intent log for 73 | /// recovering the action's state in the event of an executor crash. That 74 | /// doesn't mean we can't put debugging information here, though.) 
75 | #[derive(Clone, Debug, Deserialize, Serialize)] 76 | #[serde(rename_all = "snake_case")] 77 | pub enum SagaNodeEventType { 78 | /// The action has started running 79 | Started, 80 | /// The action completed successfully (with output data) 81 | Succeeded(Arc), 82 | /// The action failed 83 | Failed(ActionError), 84 | /// The undo action has started running 85 | UndoStarted, 86 | /// The undo action has finished 87 | UndoFinished, 88 | /// The undo action has failed 89 | UndoFailed(UndoActionError), 90 | } 91 | 92 | impl fmt::Display for SagaNodeEventType { 93 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 94 | f.write_str(self.label()) 95 | } 96 | } 97 | 98 | impl SagaNodeEventType { 99 | pub fn label(&self) -> &'static str { 100 | match self { 101 | SagaNodeEventType::Started => "started", 102 | SagaNodeEventType::Succeeded(_) => "succeeded", 103 | SagaNodeEventType::Failed(_) => "failed", 104 | SagaNodeEventType::UndoStarted => "undo_started", 105 | SagaNodeEventType::UndoFinished => "undo_finished", 106 | SagaNodeEventType::UndoFailed(_) => "undo_failed", 107 | } 108 | } 109 | } 110 | 111 | /// Persistent status for a saga node 112 | /// 113 | /// The events present in the log determine the _persistent status_ of the node. 114 | /// You can think of this like a single summary of the state of this action, 115 | /// based solely on the persistent state. When recovering from a crash, the 116 | /// saga executor uses this status to determine what to do next. We also 117 | /// maintain this for each SagaLog to identify illegal transitions at runtime. 118 | /// 119 | /// A node's status is very nearly identified by the type of the last event 120 | /// seen. It's cleaner to have a first-class summary here. 
121 | #[derive(Clone, Debug)] 122 | pub enum SagaNodeLoadStatus { 123 | /// The action never started running 124 | NeverStarted, 125 | /// The action has started running 126 | Started, 127 | /// The action completed successfully (with output data) 128 | Succeeded(Arc), 129 | /// The action failed 130 | Failed(ActionError), 131 | /// The undo action has started running (with output data from success) 132 | UndoStarted(Arc), 133 | /// The undo action has finished successfully 134 | UndoFinished, 135 | /// The undo action has failed 136 | UndoFailed(UndoActionError), 137 | } 138 | 139 | impl SagaNodeLoadStatus { 140 | /// Returns the new status for a node after recording the given event. 141 | fn next_status( 142 | &self, 143 | event_type: &SagaNodeEventType, 144 | ) -> Result { 145 | match (self, event_type) { 146 | (SagaNodeLoadStatus::NeverStarted, SagaNodeEventType::Started) => { 147 | Ok(SagaNodeLoadStatus::Started) 148 | } 149 | ( 150 | SagaNodeLoadStatus::Started, 151 | SagaNodeEventType::Succeeded(out), 152 | ) => Ok(SagaNodeLoadStatus::Succeeded(Arc::clone(out))), 153 | (SagaNodeLoadStatus::Started, SagaNodeEventType::Failed(e)) => { 154 | Ok(SagaNodeLoadStatus::Failed(e.clone())) 155 | } 156 | ( 157 | SagaNodeLoadStatus::Succeeded(out), 158 | SagaNodeEventType::UndoStarted, 159 | ) => Ok(SagaNodeLoadStatus::UndoStarted(Arc::clone(out))), 160 | ( 161 | SagaNodeLoadStatus::UndoStarted(_), 162 | SagaNodeEventType::UndoFinished, 163 | ) => Ok(SagaNodeLoadStatus::UndoFinished), 164 | ( 165 | SagaNodeLoadStatus::UndoStarted(_), 166 | SagaNodeEventType::UndoFailed(e), 167 | ) => Ok(SagaNodeLoadStatus::UndoFailed(e.clone())), 168 | _ => Err(SagaLogError::IllegalEventForState { 169 | current_status: self.clone(), 170 | event_type: event_type.clone(), 171 | }), 172 | } 173 | } 174 | } 175 | 176 | /// Write to a saga's log 177 | #[derive(Clone, Debug)] 178 | pub struct SagaLog { 179 | saga_id: SagaId, 180 | unwinding: bool, 181 | events: Vec, 182 | node_status: 
BTreeMap, 183 | } 184 | 185 | impl SagaLog { 186 | pub fn new_empty(saga_id: SagaId) -> SagaLog { 187 | SagaLog { 188 | saga_id, 189 | events: Vec::new(), 190 | node_status: BTreeMap::new(), 191 | unwinding: false, 192 | } 193 | } 194 | 195 | pub fn new_recover( 196 | saga_id: SagaId, 197 | mut events: Vec, 198 | ) -> Result { 199 | let mut log = Self::new_empty(saga_id); 200 | 201 | // Sort the events by the event type. This ensures that if there's at 202 | // least one valid sequence of events, then we'll replay the events in a 203 | // valid sequence. Thus, if we fail to replay below, then the log is 204 | // corrupted somehow. (Remember, the wall timestamp is never used for 205 | // correctness.) For debugging purposes, this is a little disappointing: 206 | // most likely, the events are already in a valid order that reflects 207 | // when they actually happened. However, there's nothing to guarantee 208 | // that unless we make it so, and our simple approach for doing so here 209 | // destroys the sequential order. This should only really matter for a 210 | // person looking at the sequence of entries (as they appear in memory) 211 | // for debugging. 212 | events.sort_by_key(|f| match f.event_type { 213 | // TODO-cleanup Is there a better way to do this? We want to sort 214 | // by the event type, where event types are compared by the order 215 | // they're defined in SagaNodeEventType. We could almost use derived 216 | // PartialOrd and PartialEq implementations for SagaNodeEventType, 217 | // except that one variant has a payload that does _not_ 218 | // necessarily implement PartialEq or PartialOrd. It 219 | // seems like that means we have to implement this by 220 | // hand. 
221 | SagaNodeEventType::Started => 1, 222 | SagaNodeEventType::Succeeded(_) => 2, 223 | SagaNodeEventType::Failed(_) => 3, 224 | SagaNodeEventType::UndoStarted => 4, 225 | SagaNodeEventType::UndoFinished => 5, 226 | SagaNodeEventType::UndoFailed(_) => 6, 227 | }); 228 | 229 | // Replay the events for this saga. 230 | for event in events { 231 | if event.saga_id != saga_id { 232 | return Err(anyhow!( 233 | "found an event in the log for a different saga ({}) than \ 234 | requested ({})", 235 | event.saga_id, 236 | saga_id, 237 | )); 238 | } 239 | 240 | log.record(&event).with_context(|| "SagaLog::new_recover")?; 241 | } 242 | 243 | Ok(log) 244 | } 245 | 246 | pub fn record( 247 | &mut self, 248 | event: &SagaNodeEvent, 249 | ) -> Result<(), SagaLogError> { 250 | let current_status = self.load_status_for_node(event.node_id); 251 | let next_status = current_status.next_status(&event.event_type)?; 252 | 253 | match next_status { 254 | SagaNodeLoadStatus::Failed(_) 255 | | SagaNodeLoadStatus::UndoStarted(_) 256 | | SagaNodeLoadStatus::UndoFinished => { 257 | self.unwinding = true; 258 | } 259 | _ => (), 260 | }; 261 | 262 | self.node_status.insert(event.node_id, next_status); 263 | self.events.push(event.clone()); 264 | Ok(()) 265 | } 266 | 267 | pub fn unwinding(&self) -> bool { 268 | self.unwinding 269 | } 270 | 271 | pub fn load_status_for_node( 272 | &self, 273 | node_id: SagaNodeId, 274 | ) -> &SagaNodeLoadStatus { 275 | self.node_status 276 | .get(&node_id) 277 | .unwrap_or(&SagaNodeLoadStatus::NeverStarted) 278 | } 279 | 280 | pub fn events(&self) -> &[SagaNodeEvent] { 281 | &self.events 282 | } 283 | 284 | pub fn pretty(&self) -> SagaLogPretty<'_> { 285 | SagaLogPretty { log: self } 286 | } 287 | } 288 | 289 | /// Handle for pretty-printing a SagaLog (using the `fmt::Debug` trait) 290 | pub struct SagaLogPretty<'a> { 291 | log: &'a SagaLog, 292 | } 293 | 294 | impl fmt::Debug for SagaLogPretty<'_> { 295 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result 
{ 296 | write!(f, "SAGA LOG:\n")?; 297 | write!(f, "saga id: {}\n", self.log.saga_id)?; 298 | write!( 299 | f, 300 | "direction: {}\n", 301 | if !self.log.unwinding { "forward" } else { "unwinding" } 302 | )?; 303 | write!(f, "events ({} total):\n", self.log.events.len())?; 304 | write!(f, "\n")?; 305 | for (i, event) in self.log.events.iter().enumerate() { 306 | write!(f, "{:0>3} {:?}\n", i + 1, event)?; 307 | } 308 | Ok(()) 309 | } 310 | } 311 | 312 | // TODO-testing lots of automated tests are possible here, but let's see if the 313 | // abstraction makes any sense first. 314 | // 315 | -------------------------------------------------------------------------------- /src/store.rs: -------------------------------------------------------------------------------- 1 | //! [`SecStore`] trait, related types, and built-in implementations 2 | 3 | use crate::SagaId; 4 | use crate::SagaName; 5 | use crate::SagaNodeEvent; 6 | use anyhow::Context; 7 | use async_trait::async_trait; 8 | use schemars::JsonSchema; 9 | use serde::Deserialize; 10 | use serde::Serialize; 11 | use std::convert::TryFrom; 12 | use std::fmt; 13 | 14 | /// Interfaces implemented by the Steno consumer for storing saga state and saga 15 | /// log state persistently 16 | /// 17 | /// Correct implementation of these interfaces is critical for crash recovery. 18 | #[async_trait] 19 | pub trait SecStore: fmt::Debug + Send + Sync { 20 | /// Create a record for a newly created saga 21 | /// 22 | /// Once this step has completed, the saga will be discovered and recovered 23 | /// upon startup. Until this step has completed, the saga has not finished 24 | /// being created (since it won't be recovered on startup). 
25 | async fn saga_create( 26 | &self, 27 | create_params: SagaCreateParams, 28 | ) -> Result<(), anyhow::Error>; 29 | 30 | /// Write a record to a saga's persistent log 31 | async fn record_event(&self, event: SagaNodeEvent); 32 | 33 | /// Update the cached runtime state of the saga 34 | /// 35 | /// Steno invokes this function when the saga has reached one of the states 36 | /// described by [`SagaCachedState`] (like "Done"). This allows consumers 37 | /// to persistently record this information for easy access. This step 38 | /// is not strictly required for correctness, since the saga log 39 | /// contains all the information needed to determine this state. But by 40 | /// recording when a saga has finished, for example, the consumer can 41 | /// avoid having to read the saga's log altogether when it next starts 42 | /// up since there's no need to recover the saga. 43 | async fn saga_update(&self, id: SagaId, update: SagaCachedState); 44 | } 45 | 46 | /// Describes what an impl of [`SecStore`] needs to store for a persistent saga 47 | /// record. 48 | #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] 49 | pub struct SagaCreateParams { 50 | pub id: SagaId, 51 | // The saga name doesn't strictly speaking need to be a separate field here 52 | // because it's contained within `dag`. However, the name is useful to the 53 | // consumer. And they're not supposed to be picking apart `dag`. So we 54 | // pull it out for them. 55 | pub name: SagaName, 56 | pub dag: serde_json::Value, 57 | pub state: SagaCachedState, 58 | } 59 | 60 | /// Describes the cacheable state of the saga 61 | /// 62 | /// See [`SecStore::saga_update`]. 
63 | #[derive( 64 | Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize, JsonSchema, 65 | )] 66 | #[serde(rename_all = "snake_case")] 67 | pub enum SagaCachedState { 68 | Running, 69 | Unwinding, 70 | Done, 71 | } 72 | 73 | impl fmt::Display for SagaCachedState { 74 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 75 | write!(f, "{}", <&str>::from(self)) 76 | } 77 | } 78 | 79 | impl TryFrom<&str> for SagaCachedState { 80 | type Error = anyhow::Error; 81 | fn try_from(value: &str) -> Result { 82 | // Round-tripping through serde is a little absurd, but has the benefit 83 | // of always staying in sync with the real definition. (The initial 84 | // serialization is necessary to correctly handle any quotes or the like 85 | // in the input string.) 86 | let json = serde_json::to_string(value).unwrap(); 87 | serde_json::from_str(&json).context("parsing saga state") 88 | } 89 | } 90 | 91 | impl<'a> From<&'a SagaCachedState> for &'a str { 92 | fn from(s: &'a SagaCachedState) -> &'a str { 93 | match s { 94 | SagaCachedState::Running => "running", 95 | SagaCachedState::Unwinding => "unwinding", 96 | SagaCachedState::Done => "done", 97 | } 98 | } 99 | } 100 | 101 | /// Implementation of [`SecStore`] that doesn't store any state persistently 102 | /// 103 | /// Sagas created using this store will not be recovered after the program 104 | /// crashes. 105 | #[derive(Debug)] 106 | pub struct InMemorySecStore {} 107 | 108 | impl InMemorySecStore { 109 | pub fn new() -> InMemorySecStore { 110 | InMemorySecStore {} 111 | } 112 | } 113 | 114 | #[async_trait] 115 | impl SecStore for InMemorySecStore { 116 | async fn saga_create( 117 | &self, 118 | _create_params: SagaCreateParams, 119 | ) -> Result<(), anyhow::Error> { 120 | // Nothing to do. 121 | Ok(()) 122 | } 123 | 124 | async fn record_event(&self, _event: SagaNodeEvent) { 125 | // Nothing to do. 126 | } 127 | 128 | async fn saga_update(&self, _id: SagaId, _update: SagaCachedState) { 129 | // Nothing to do. 
130 | } 131 | } 132 | -------------------------------------------------------------------------------- /tests/test_smoke.rs: -------------------------------------------------------------------------------- 1 | //! Smoke tests for steno. These aren't close to exhaustive, but they check that 2 | //! it's not completely broken. 3 | 4 | use expectorate::assert_contents; 5 | use std::env::current_exe; 6 | use std::path::PathBuf; 7 | use steno::SagaSerialized; 8 | use subprocess::Exec; 9 | use subprocess::Redirection; 10 | 11 | fn example_bin() -> PathBuf { 12 | // This is unfortunate, but it's the best way I know to run one of the 13 | // examples out of our project. 14 | let mut my_path = current_exe().expect("failed to find test program"); 15 | my_path.pop(); 16 | assert_eq!(my_path.file_name().unwrap(), "deps"); 17 | my_path.pop(); 18 | my_path.push("examples"); 19 | my_path.push("demo-provision"); 20 | my_path 21 | } 22 | 23 | fn run_example(test_name: &str, config_fn: impl Fn(Exec) -> Exec) -> String { 24 | let config = config_fn(Exec::cmd(example_bin()).stdout(Redirection::Pipe)); 25 | let cmdline = config.to_cmdline_lossy(); 26 | eprintln!("test \"{}\": run: {}", test_name, cmdline); 27 | config.capture().expect("failed to execute command").stdout_str() 28 | } 29 | 30 | #[test] 31 | fn no_args() { 32 | assert_contents( 33 | "tests/test_smoke_no_args.out", 34 | &run_example("no_args", |exec| exec.stderr(Redirection::Merge)), 35 | ); 36 | } 37 | 38 | #[test] 39 | fn cmd_info() { 40 | assert_contents( 41 | "tests/test_smoke_info.out", 42 | &run_example("info", |exec| { 43 | exec.stderr(Redirection::Merge).arg("info") 44 | }), 45 | ); 46 | } 47 | 48 | #[test] 49 | fn cmd_dot() { 50 | assert_contents( 51 | "tests/test_smoke_dot.out", 52 | &run_example("dot", |exec| exec.stderr(Redirection::Merge).arg("dot")), 53 | ); 54 | } 55 | 56 | #[test] 57 | fn cmd_run_basic() { 58 | assert_contents( 59 | "tests/test_smoke_run_basic.out", 60 | &run_example("run_basic", |exec| 
exec.arg("run")), 61 | ); 62 | } 63 | 64 | #[test] 65 | fn cmd_run_error() { 66 | assert_contents( 67 | "tests/test_smoke_run_error.out", 68 | &run_example("run_error", |exec| { 69 | exec.arg("run").arg("--inject-error=instance_boot") 70 | }), 71 | ); 72 | } 73 | 74 | #[test] 75 | fn cmd_run_stuck() { 76 | assert_contents( 77 | "tests/test_smoke_run_stuck.out", 78 | &run_example("run_stuck", |exec| { 79 | exec.arg("run") 80 | .arg("--inject-error=instance_boot") 81 | .arg("--inject-undo-error=instance_id") 82 | }), 83 | ); 84 | } 85 | 86 | #[test] 87 | fn cmd_run_recover() { 88 | // Do a normal run and save the log so we can try recovering from it. 89 | let log = run_example("recover1", |exec| { 90 | exec.arg("run").arg("--dump-to=-").arg("--quiet") 91 | }); 92 | 93 | // First, try recovery without having changed anything. 94 | let recovery_done = run_example("recover2", |exec| { 95 | exec.arg("run").arg("--recover-from=-").stdin(log.as_str()) 96 | }); 97 | assert_contents("tests/test_smoke_run_recover_done.out", &recovery_done); 98 | 99 | // Now try lopping off the last handful of records so there's work to do. The subtraction saturates at zero, so a log shorter than that is emptied instead of underflowing. 100 | let mut log_parsed: SagaSerialized = 101 | serde_json::from_str(&log).expect("failed to parse generated log"); 102 | log_parsed.events.truncate( 103 | log_parsed.events.len().saturating_sub(5), 104 | ); 105 | let log_shortened = serde_json::to_string(&log_parsed).unwrap(); 106 | assert_contents( 107 | "tests/test_smoke_run_recover_some.out", 108 | &run_example("recover3", |exec| { 109 | exec.arg("run") 110 | .arg("--recover-from=-") 111 | .stdin(log_shortened.as_str()) 112 | }), 113 | ); 114 | } 115 | 116 | #[test] 117 | fn cmd_run_recover_unwind() { 118 | // Do a failed run and save the log so we can try recovering from it. 
119 | let log = run_example("recover_fail1", |exec| { 120 | exec.arg("run") 121 | .arg("--dump-to=-") 122 | .arg("--quiet") 123 | .arg("--inject-error=instance_boot") 124 | }); 125 | 126 | // First, try recovery without having changed anything. 127 | let recovery_done = run_example("recover_fail2", |exec| { 128 | exec.arg("run").arg("--recover-from=-").stdin(log.as_str()) 129 | }); 130 | assert_contents( 131 | "tests/test_smoke_run_recover_fail_done.out", 132 | &recovery_done, 133 | ); 134 | 135 | // Now try lopping off the last handful of records so there's work to do. The subtraction saturates at zero, so a log shorter than that is emptied instead of underflowing. 136 | let mut log_parsed: SagaSerialized = 137 | serde_json::from_str(&log).expect("failed to parse generated log"); 138 | log_parsed.events.truncate( 139 | log_parsed.events.len().saturating_sub(3), 140 | ); 141 | let log_shortened = serde_json::to_string(&log_parsed).unwrap(); 142 | assert_contents( 143 | "tests/test_smoke_run_recover_fail_some.out", 144 | &run_example("recover_fail3", |exec| { 145 | exec.arg("run") 146 | .arg("--recover-from=-") 147 | .stdin(log_shortened.as_str()) 148 | }), 149 | ); 150 | } 151 | 152 | #[test] 153 | fn cmd_run_recover_stuck() { 154 | // Do a failed run and save the log so we can try recovering from it. 155 | let log = run_example("recover_stuck1", |exec| { 156 | exec.arg("run") 157 | .arg("--dump-to=-") 158 | .arg("--quiet") 159 | .arg("--inject-error=instance_boot") 160 | .arg("--inject-undo-error=instance_id") 161 | }); 162 | 163 | // First, try recovery without having changed anything. 
164 | let recovery_done = run_example("recover_stuck2", |exec| { 165 | exec.arg("run").arg("--recover-from=-").stdin(log.as_str()) 166 | }); 167 | assert_contents( 168 | "tests/test_smoke_run_recover_stuck_done.out", 169 | &recovery_done, 170 | ); 171 | } 172 | -------------------------------------------------------------------------------- /tests/test_smoke_dot.out: -------------------------------------------------------------------------------- 1 | digraph { 2 | 0 [ label = "Action { name: \"instance_id\", label: \"InstanceCreate\", action_name: \"instance_create\" }" ] 3 | 1 [ label = "Constant { name: \"server_alloc_params\", value: Object {\"number_of_things\": Number(1)} }" ] 4 | 2 [ label = "Action { name: \"instance_ip\", label: \"VpcAllocIp\", action_name: \"vpc_alloc_ip\" }" ] 5 | 3 [ label = "Action { name: \"volume_id\", label: \"VolumeCreate\", action_name: \"volume_create\" }" ] 6 | 4 [ label = "SubsagaStart { saga_name: \"server-alloc\", params_node_name: \"server_alloc_params\" }" ] 7 | 5 [ label = "Action { name: \"server_id\", label: \"ServerPick\", action_name: \"server_pick\" }" ] 8 | 6 [ label = "Action { name: \"server_reserve\", label: \"ServerReserve\", action_name: \"server_reserve\" }" ] 9 | 7 [ label = "SubsagaEnd { name: \"server_alloc\" }" ] 10 | 8 [ label = "Action { name: \"instance_configure\", label: \"InstanceConfigure\", action_name: \"instance_configure\" }" ] 11 | 9 [ label = "Action { name: \"volume_attach\", label: \"VolumeAttach\", action_name: \"volume_attach\" }" ] 12 | 10 [ label = "Action { name: \"instance_boot\", label: \"InstanceBoot\", action_name: \"instance_boot\" }" ] 13 | 11 [ label = "Action { name: \"print\", label: \"Print\", action_name: \"print\" }" ] 14 | 12 [ label = "Start { params: Object {\"instance_name\": String(\"fake-o-instance\"), \"number_of_instances\": Number(1)} }" ] 15 | 13 [ label = "End" ] 16 | 0 -> 1 [ ] 17 | 1 -> 2 [ ] 18 | 1 -> 3 [ ] 19 | 1 -> 4 [ ] 20 | 5 -> 6 [ ] 21 | 4 -> 5 [ ] 22 | 6 
-> 7 [ ] 23 | 2 -> 8 [ ] 24 | 3 -> 8 [ ] 25 | 7 -> 8 [ ] 26 | 8 -> 9 [ ] 27 | 9 -> 10 [ ] 28 | 10 -> 11 [ ] 29 | 12 -> 0 [ ] 30 | 11 -> 13 [ ] 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tests/test_smoke_info.out: -------------------------------------------------------------------------------- 1 | *** saga dag definition *** 2 | saga graph: 3 | digraph { 4 | 0 [ label = "Action { name: \"instance_id\", label: \"InstanceCreate\", action_name: \"instance_create\" }" ] 5 | 1 [ label = "Constant { name: \"server_alloc_params\", value: Object {\"number_of_things\": Number(1)} }" ] 6 | 2 [ label = "Action { name: \"instance_ip\", label: \"VpcAllocIp\", action_name: \"vpc_alloc_ip\" }" ] 7 | 3 [ label = "Action { name: \"volume_id\", label: \"VolumeCreate\", action_name: \"volume_create\" }" ] 8 | 4 [ label = "SubsagaStart { saga_name: \"server-alloc\", params_node_name: \"server_alloc_params\" }" ] 9 | 5 [ label = "Action { name: \"server_id\", label: \"ServerPick\", action_name: \"server_pick\" }" ] 10 | 6 [ label = "Action { name: \"server_reserve\", label: \"ServerReserve\", action_name: \"server_reserve\" }" ] 11 | 7 [ label = "SubsagaEnd { name: \"server_alloc\" }" ] 12 | 8 [ label = "Action { name: \"instance_configure\", label: \"InstanceConfigure\", action_name: \"instance_configure\" }" ] 13 | 9 [ label = "Action { name: \"volume_attach\", label: \"VolumeAttach\", action_name: \"volume_attach\" }" ] 14 | 10 [ label = "Action { name: \"instance_boot\", label: \"InstanceBoot\", action_name: \"instance_boot\" }" ] 15 | 11 [ label = "Action { name: \"print\", label: \"Print\", action_name: \"print\" }" ] 16 | 12 [ label = "Start { params: Object {\"instance_name\": String(\"fake-o instance\"), \"number_of_instances\": Number(1)} }" ] 17 | 13 [ label = "End" ] 18 | 0 -> 1 [ ] 19 | 1 -> 2 [ ] 20 | 1 -> 3 [ ] 21 | 1 -> 4 [ ] 22 | 5 -> 6 [ ] 23 | 4 -> 5 [ ] 24 | 6 -> 7 [ ] 25 | 2 -> 8 [ ] 26 | 3 -> 8 [ ] 27 | 7 -> 8 
[ ] 28 | 8 -> 9 [ ] 29 | 9 -> 10 [ ] 30 | 10 -> 11 [ ] 31 | 12 -> 0 [ ] 32 | 11 -> 13 [ ] 33 | } 34 | 35 | *** initial state *** 36 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 37 | +-- queued-todo: (start node) 38 | +-- blocked: InstanceCreate (produces "instance_id") 39 | +-- blocked: (constant = {"number_of_things":1}) (produces "server_alloc_params") 40 | +-- (parallel actions): 41 | +-- blocked: VpcAllocIp (produces "instance_ip") 42 | +-- blocked: VolumeCreate (produces "volume_id") 43 | +-- blocked: (subsaga start: "server-alloc") 44 | +-- blocked: ServerPick (produces "server_id") 45 | +-- blocked: ServerReserve (produces "server_reserve") 46 | +-- blocked: (subsaga end) (produces "server_alloc") 47 | +-- blocked: InstanceConfigure (produces "instance_configure") 48 | +-- blocked: VolumeAttach (produces "volume_attach") 49 | +-- blocked: InstanceBoot (produces "instance_boot") 50 | +-- blocked: Print (produces "print") 51 | +-- blocked: (end node) 52 | 53 | -------------------------------------------------------------------------------- /tests/test_smoke_no_args.out: -------------------------------------------------------------------------------- 1 | steno 2 | Demo saga implementation 3 | 4 | USAGE: 5 | demo-provision 6 | 7 | FLAGS: 8 | -h, --help Prints help information 9 | -V, --version Prints version information 10 | 11 | SUBCOMMANDS: 12 | dot Dump a dot (graphviz) representation of the saga graph 13 | help Prints this message or the help of the given subcommand(s) 14 | info Dump information about the saga graph (not an execution) 15 | print-log Pretty-print the log from a previous execution 16 | run Execute the saga 17 | -------------------------------------------------------------------------------- /tests/test_smoke_run_basic.out: -------------------------------------------------------------------------------- 1 | *** running saga *** 2 | *** finished saga *** 3 | 4 | *** final state *** 5 | + saga execution: 
049b2522-308d-442e-bc65-9bfaef863597 6 | +-- done: (start node) 7 | +-- done: InstanceCreate (produces "instance_id") 8 | +-- done: (constant = {"number_of_things":1}) (produces "server_alloc_params") 9 | +-- (parallel actions): 10 | +-- done: VpcAllocIp (produces "instance_ip") 11 | +-- done: VolumeCreate (produces "volume_id") 12 | +-- done: (subsaga start: "server-alloc") 13 | +-- done: ServerPick (produces "server_id") 14 | +-- done: ServerReserve (produces "server_reserve") 15 | +-- done: (subsaga end) (produces "server_alloc") 16 | +-- done: InstanceConfigure (produces "instance_configure") 17 | +-- done: VolumeAttach (produces "volume_attach") 18 | +-- done: InstanceBoot (produces "instance_boot") 19 | +-- done: Print (produces "print") 20 | +-- done: (end node) 21 | 22 | result: SUCCESS 23 | final output: "it worked" 24 | -------------------------------------------------------------------------------- /tests/test_smoke_run_error.out: -------------------------------------------------------------------------------- 1 | will inject error at node "instance_boot" 2 | *** running saga *** 3 | *** finished saga *** 4 | 5 | *** final state *** 6 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 7 | +-- undone: (start node) 8 | +-- undone: InstanceCreate (produces "instance_id") 9 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 10 | +-- (parallel actions): 11 | +-- undone: VpcAllocIp (produces "instance_ip") 12 | +-- undone: VolumeCreate (produces "volume_id") 13 | +-- undone: (subsaga start: "server-alloc") 14 | +-- undone: ServerPick (produces "server_id") 15 | +-- undone: ServerReserve (produces "server_reserve") 16 | +-- undone: (subsaga end) (produces "server_alloc") 17 | +-- undone: InstanceConfigure (produces "instance_configure") 18 | +-- undone: VolumeAttach (produces "volume_attach") 19 | +-- failed: InstanceBoot (produces "instance_boot") 20 | +-- abandoned: Print (produces "print") 21 | +-- abandoned: (end node) 
22 | 23 | result: ACTION FAILURE 24 | failed at node: "instance_boot" 25 | failed with error: error injected 26 | -------------------------------------------------------------------------------- /tests/test_smoke_run_recover_done.out: -------------------------------------------------------------------------------- 1 | recovering from log: - 2 | recovered state 3 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 4 | +-- done: (start node) 5 | +-- done: InstanceCreate (produces "instance_id") 6 | +-- done: (constant = {"number_of_things":1}) (produces "server_alloc_params") 7 | +-- (parallel actions): 8 | +-- done: VpcAllocIp (produces "instance_ip") 9 | +-- done: VolumeCreate (produces "volume_id") 10 | +-- done: (subsaga start: "server-alloc") 11 | +-- done: ServerPick (produces "server_id") 12 | +-- done: ServerReserve (produces "server_reserve") 13 | +-- done: (subsaga end) (produces "server_alloc") 14 | +-- done: InstanceConfigure (produces "instance_configure") 15 | +-- done: VolumeAttach (produces "volume_attach") 16 | +-- done: InstanceBoot (produces "instance_boot") 17 | +-- done: Print (produces "print") 18 | +-- done: (end node) 19 | 20 | 21 | *** running saga *** 22 | *** finished saga *** 23 | 24 | *** final state *** 25 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 26 | +-- done: (start node) 27 | +-- done: InstanceCreate (produces "instance_id") 28 | +-- done: (constant = {"number_of_things":1}) (produces "server_alloc_params") 29 | +-- (parallel actions): 30 | +-- done: VpcAllocIp (produces "instance_ip") 31 | +-- done: VolumeCreate (produces "volume_id") 32 | +-- done: (subsaga start: "server-alloc") 33 | +-- done: ServerPick (produces "server_id") 34 | +-- done: ServerReserve (produces "server_reserve") 35 | +-- done: (subsaga end) (produces "server_alloc") 36 | +-- done: InstanceConfigure (produces "instance_configure") 37 | +-- done: VolumeAttach (produces "volume_attach") 38 | +-- done: InstanceBoot (produces "instance_boot") 39 
| +-- done: Print (produces "print") 40 | +-- done: (end node) 41 | 42 | result: SUCCESS 43 | final output: "it worked" 44 | -------------------------------------------------------------------------------- /tests/test_smoke_run_recover_fail_done.out: -------------------------------------------------------------------------------- 1 | recovering from log: - 2 | recovered state 3 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 4 | +-- undone: (start node) 5 | +-- undone: InstanceCreate (produces "instance_id") 6 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 7 | +-- (parallel actions): 8 | +-- undone: VpcAllocIp (produces "instance_ip") 9 | +-- undone: VolumeCreate (produces "volume_id") 10 | +-- undone: (subsaga start: "server-alloc") 11 | +-- undone: ServerPick (produces "server_id") 12 | +-- undone: ServerReserve (produces "server_reserve") 13 | +-- undone: (subsaga end) (produces "server_alloc") 14 | +-- undone: InstanceConfigure (produces "instance_configure") 15 | +-- undone: VolumeAttach (produces "volume_attach") 16 | +-- failed: InstanceBoot (produces "instance_boot") 17 | +-- abandoned: Print (produces "print") 18 | +-- abandoned: (end node) 19 | 20 | 21 | *** running saga *** 22 | *** finished saga *** 23 | 24 | *** final state *** 25 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 26 | +-- undone: (start node) 27 | +-- undone: InstanceCreate (produces "instance_id") 28 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 29 | +-- (parallel actions): 30 | +-- undone: VpcAllocIp (produces "instance_ip") 31 | +-- undone: VolumeCreate (produces "volume_id") 32 | +-- undone: (subsaga start: "server-alloc") 33 | +-- undone: ServerPick (produces "server_id") 34 | +-- undone: ServerReserve (produces "server_reserve") 35 | +-- undone: (subsaga end) (produces "server_alloc") 36 | +-- undone: InstanceConfigure (produces "instance_configure") 37 | +-- undone: VolumeAttach (produces 
"volume_attach") 38 | +-- failed: InstanceBoot (produces "instance_boot") 39 | +-- abandoned: Print (produces "print") 40 | +-- abandoned: (end node) 41 | 42 | result: ACTION FAILURE 43 | failed at node: "instance_boot" 44 | failed with error: error injected 45 | -------------------------------------------------------------------------------- /tests/test_smoke_run_recover_fail_some.out: -------------------------------------------------------------------------------- 1 | recovering from log: - 2 | recovered state 3 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 4 | +-- done: (start node) 5 | +-- queued-undo: InstanceCreate (produces "instance_id") 6 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 7 | +-- (parallel actions): 8 | +-- undone: VpcAllocIp (produces "instance_ip") 9 | +-- undone: VolumeCreate (produces "volume_id") 10 | +-- undone: (subsaga start: "server-alloc") 11 | +-- undone: ServerPick (produces "server_id") 12 | +-- undone: ServerReserve (produces "server_reserve") 13 | +-- undone: (subsaga end) (produces "server_alloc") 14 | +-- undone: InstanceConfigure (produces "instance_configure") 15 | +-- undone: VolumeAttach (produces "volume_attach") 16 | +-- failed: InstanceBoot (produces "instance_boot") 17 | +-- abandoned: Print (produces "print") 18 | +-- abandoned: (end node) 19 | 20 | 21 | *** running saga *** 22 | *** finished saga *** 23 | 24 | *** final state *** 25 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 26 | +-- undone: (start node) 27 | +-- undone: InstanceCreate (produces "instance_id") 28 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 29 | +-- (parallel actions): 30 | +-- undone: VpcAllocIp (produces "instance_ip") 31 | +-- undone: VolumeCreate (produces "volume_id") 32 | +-- undone: (subsaga start: "server-alloc") 33 | +-- undone: ServerPick (produces "server_id") 34 | +-- undone: ServerReserve (produces "server_reserve") 35 | +-- undone: 
(subsaga end) (produces "server_alloc") 36 | +-- undone: InstanceConfigure (produces "instance_configure") 37 | +-- undone: VolumeAttach (produces "volume_attach") 38 | +-- failed: InstanceBoot (produces "instance_boot") 39 | +-- abandoned: Print (produces "print") 40 | +-- abandoned: (end node) 41 | 42 | result: ACTION FAILURE 43 | failed at node: "instance_boot" 44 | failed with error: error injected 45 | -------------------------------------------------------------------------------- /tests/test_smoke_run_recover_some.out: -------------------------------------------------------------------------------- 1 | recovering from log: - 2 | recovered state 3 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 4 | +-- done: (start node) 5 | +-- done: InstanceCreate (produces "instance_id") 6 | +-- done: (constant = {"number_of_things":1}) (produces "server_alloc_params") 7 | +-- (parallel actions): 8 | +-- done: VpcAllocIp (produces "instance_ip") 9 | +-- done: VolumeCreate (produces "volume_id") 10 | +-- done: (subsaga start: "server-alloc") 11 | +-- done: ServerPick (produces "server_id") 12 | +-- done: ServerReserve (produces "server_reserve") 13 | +-- done: (subsaga end) (produces "server_alloc") 14 | +-- done: InstanceConfigure (produces "instance_configure") 15 | +-- done: VolumeAttach (produces "volume_attach") 16 | +-- queued-todo: InstanceBoot (produces "instance_boot") 17 | +-- blocked: Print (produces "print") 18 | +-- blocked: (end node) 19 | 20 | 21 | *** running saga *** 22 | *** finished saga *** 23 | 24 | *** final state *** 25 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 26 | +-- done: (start node) 27 | +-- done: InstanceCreate (produces "instance_id") 28 | +-- done: (constant = {"number_of_things":1}) (produces "server_alloc_params") 29 | +-- (parallel actions): 30 | +-- done: VpcAllocIp (produces "instance_ip") 31 | +-- done: VolumeCreate (produces "volume_id") 32 | +-- done: (subsaga start: "server-alloc") 33 | +-- done: ServerPick 
(produces "server_id") 34 | +-- done: ServerReserve (produces "server_reserve") 35 | +-- done: (subsaga end) (produces "server_alloc") 36 | +-- done: InstanceConfigure (produces "instance_configure") 37 | +-- done: VolumeAttach (produces "volume_attach") 38 | +-- done: InstanceBoot (produces "instance_boot") 39 | +-- done: Print (produces "print") 40 | +-- done: (end node) 41 | 42 | result: SUCCESS 43 | final output: "it worked" 44 | -------------------------------------------------------------------------------- /tests/test_smoke_run_recover_stuck_done.out: -------------------------------------------------------------------------------- 1 | recovering from log: - 2 | recovered state 3 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 4 | +-- done: (start node) 5 | +-- undo-failed: InstanceCreate (produces "instance_id") 6 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 7 | +-- (parallel actions): 8 | +-- undone: VpcAllocIp (produces "instance_ip") 9 | +-- undone: VolumeCreate (produces "volume_id") 10 | +-- undone: (subsaga start: "server-alloc") 11 | +-- undone: ServerPick (produces "server_id") 12 | +-- undone: ServerReserve (produces "server_reserve") 13 | +-- undone: (subsaga end) (produces "server_alloc") 14 | +-- undone: InstanceConfigure (produces "instance_configure") 15 | +-- undone: VolumeAttach (produces "volume_attach") 16 | +-- failed: InstanceBoot (produces "instance_boot") 17 | +-- abandoned: Print (produces "print") 18 | +-- abandoned: (end node) 19 | 20 | 21 | *** running saga *** 22 | *** finished saga *** 23 | 24 | *** final state *** 25 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 26 | +-- done: (start node) 27 | +-- undo-failed: InstanceCreate (produces "instance_id") 28 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 29 | +-- (parallel actions): 30 | +-- undone: VpcAllocIp (produces "instance_ip") 31 | +-- undone: VolumeCreate (produces "volume_id") 32 | 
+-- undone: (subsaga start: "server-alloc") 33 | +-- undone: ServerPick (produces "server_id") 34 | +-- undone: ServerReserve (produces "server_reserve") 35 | +-- undone: (subsaga end) (produces "server_alloc") 36 | +-- undone: InstanceConfigure (produces "instance_configure") 37 | +-- undone: VolumeAttach (produces "volume_attach") 38 | +-- failed: InstanceBoot (produces "instance_boot") 39 | +-- abandoned: Print (produces "print") 40 | +-- abandoned: (end node) 41 | 42 | result: ACTION FAILURE 43 | failed at node: "instance_boot" 44 | failed with error: error injected 45 | FOLLOWED BY UNDO ACTION FAILURE 46 | failed at node: "instance_id" 47 | failed with error: undo action failed permanently: { 48 | "message": "undo action attempt 1: error injected" 49 | } 50 | -------------------------------------------------------------------------------- /tests/test_smoke_run_stuck.out: -------------------------------------------------------------------------------- 1 | will inject error at node "instance_boot" 2 | will inject error at node "instance_id" undo action 3 | *** running saga *** 4 | *** finished saga *** 5 | 6 | *** final state *** 7 | + saga execution: 049b2522-308d-442e-bc65-9bfaef863597 8 | +-- done: (start node) 9 | +-- undo-failed: InstanceCreate (produces "instance_id") 10 | +-- undone: (constant = {"number_of_things":1}) (produces "server_alloc_params") 11 | +-- (parallel actions): 12 | +-- undone: VpcAllocIp (produces "instance_ip") 13 | +-- undone: VolumeCreate (produces "volume_id") 14 | +-- undone: (subsaga start: "server-alloc") 15 | +-- undone: ServerPick (produces "server_id") 16 | +-- undone: ServerReserve (produces "server_reserve") 17 | +-- undone: (subsaga end) (produces "server_alloc") 18 | +-- undone: InstanceConfigure (produces "instance_configure") 19 | +-- undone: VolumeAttach (produces "volume_attach") 20 | +-- failed: InstanceBoot (produces "instance_boot") 21 | +-- abandoned: Print (produces "print") 22 | +-- abandoned: (end node) 23 | 24 
| result: ACTION FAILURE 25 | failed at node: "instance_boot" 26 | failed with error: error injected 27 | FOLLOWED BY UNDO ACTION FAILURE 28 | failed at node: "instance_id" 29 | failed with error: undo action failed permanently: { 30 | "message": "undo action attempt 1: error injected" 31 | } 32 | -------------------------------------------------------------------------------- /tests/test_unregistered_action.rs: -------------------------------------------------------------------------------- 1 | //! Tests what happens when running a saga with an unregistered action 2 | 3 | use slog::Drain; 4 | use std::sync::Arc; 5 | use steno::ActionContext; 6 | use steno::ActionError; 7 | use steno::ActionRegistry; 8 | use steno::DagBuilder; 9 | use steno::Node; 10 | use steno::SagaDag; 11 | use steno::SagaId; 12 | use steno::SagaName; 13 | use steno::SagaType; 14 | use uuid::Uuid; 15 | 16 | fn new_log() -> slog::Logger { 17 | let decorator = slog_term::TermDecorator::new().build(); 18 | let drain = slog_term::FullFormat::new(decorator).build().fuse(); 19 | let drain = slog::LevelFilter(drain, slog::Level::Warning).fuse(); 20 | let drain = slog_async::Async::new(drain).build().fuse(); 21 | slog::Logger::root(drain, slog::o!()) 22 | } 23 | 24 | // Tests what happens when running a saga with an unregistered action 25 | #[tokio::test] 26 | async fn unregistered_action() { 27 | #[derive(Debug)] 28 | struct TestSaga; 29 | impl SagaType for TestSaga { 30 | type ExecContextType = (); 31 | } 32 | async fn my_action_func( 33 | _: ActionContext, 34 | ) -> Result<(), ActionError> { 35 | Ok(()) 36 | } 37 | let action = steno::new_action_noop_undo("my_action", my_action_func); 38 | let registry: ActionRegistry = ActionRegistry::new(); 39 | let mut builder = DagBuilder::new(SagaName::new("my-saga")); 40 | builder.append(Node::action("my_node", "my_node", &*action)); 41 | let saga = SagaDag::new( 42 | builder.build().expect("failed to build saga"), 43 | serde_json::Value::Null, 44 | ); 45 | 46 
| let log = new_log(); 47 | let sec = steno::sec(log.clone(), Arc::new(steno::InMemorySecStore::new())); 48 | let saga_id = SagaId(Uuid::new_v4()); 49 | let context = Arc::new(()); 50 | let result = sec 51 | .saga_create( 52 | saga_id, 53 | Arc::clone(&context), 54 | Arc::new(saga), 55 | Arc::new(registry), 56 | ) 57 | .await; 58 | if let Err(error) = result { 59 | assert_eq!( 60 | format!("{:#}", error), 61 | "validating saga \"my-saga\": action for node \"my_node\" not \ 62 | registered: \"my_action\"" 63 | ); 64 | } else { 65 | panic!("expected failure to create saga with unregistered action"); 66 | } 67 | } 68 | --------------------------------------------------------------------------------