├── .formatter.exs ├── .github └── workflows │ └── elixir.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── lib ├── application.ex ├── flame.ex └── flame │ ├── backend.ex │ ├── code_sync.ex │ ├── fly_backend.ex │ ├── local_backend.ex │ ├── parent.ex │ ├── parser │ └── json.ex │ ├── pool.ex │ ├── pool │ ├── cleaner.ex │ └── supervisor.ex │ ├── queue.ex │ ├── runner.ex │ ├── terminator.ex │ ├── terminator │ └── supervisor.ex │ └── trackable.ex ├── mix.exs ├── mix.lock └── test ├── code_sync_test.exs ├── flame_test.exs ├── fly_backend_test.exs ├── parser └── json_test.exs ├── queue_test.exs ├── runner_test.exs ├── support ├── code_sync_mock.ex └── trackable.ex └── test_helper.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/workflows/elixir.yml: -------------------------------------------------------------------------------- 1 | name: Elixir CI 2 | on: 3 | pull_request: 4 | push: 5 | 6 | jobs: 7 | main: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | include: 13 | - elixir_version: 1.15.7 14 | otp_version: 26.1.2 15 | lint: true 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: erlef/setup-beam@v1 19 | with: 20 | otp-version: ${{matrix.otp_version}} 21 | elixir-version: ${{matrix.elixir_version}} 22 | - run: mix deps.get 23 | - run: mix format --check-formatted 24 | if: ${{ matrix.lint }} 25 | - run: mix deps.unlock --check-unused 26 | if: ${{ matrix.lint }} 27 | - run: mix deps.compile 28 | - run: mix compile --warnings-as-errors 29 | if: ${{ matrix.lint }} 30 | - run: mix test 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where third-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 23 | flame-*.tar 24 | 25 | # Temporary files, for example, from tests. 26 | /tmp/ 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.5.2 (2024-12-06) 4 | 5 | ### Enhancements 6 | - Clean up parent tmp code_sync artifacts on pool shutdown 7 | 8 | ## 0.5.1 (2024-09-19) 9 | 10 | ### Enhancements 11 | - Add basic rate limiting retries to the FlyBackend to abide by Fly's rate limits of 1 request per second, with 3 requests per second burst. 12 | - Add basic retries for `no capacity` errors in the FlyBackend 13 | 14 | ## 0.5.0 (2024-09-11) 15 | 16 | ### Enhancements 17 | - Add `copy_apps` option to `code_sync` to copy all apps in the code path, which is 18 | set to true when `start_apps` is true. 
19 | - Support `copy_paths` for arbitrary paths unrelated to beams apps for copying arbitrary 20 | files on boot to the runner. 21 | 22 | ### Deprecations 23 | - `copy_paths: true` has been deprecated in favor of `start_apps: true`, to copy 24 | all apps and start them. You can also pass `copy_paths: true` to copy all apps 25 | without starting them. Now `copy_paths` is reserved for copying arbitrary paths 26 | unrelated to beams apps. 27 | 28 | ## 0.4.4 (2024-09-03) 29 | 30 | ### Bug Fixes 31 | - Fix idle shutdown running before code sync, causing long code syncs to shut runners down prematurely 32 | 33 | ## 0.4.3 (2024-09-02) 34 | 35 | ### Bug Fixes 36 | - Fix `:compress` to `:code_sync` raise invalid option error 37 | 38 | ## 0.4.2 (2024-08-27) 39 | 40 | ### Enhancements 41 | - Support `:compress` option to `code_sync` to control compression of `:copy_paths` and `:sync_beams`. 42 | 43 | ## 0.4.1 (2024-08-27) 44 | 45 | ### Bug Fixes 46 | - Fix beam files not being copied on first sync 47 | 48 | ## 0.4.0 (2024-08-27) 49 | 50 | ### Bug Fixes 51 | - Forward `:boot_timeout` to backend options 52 | 53 | ### Enhancements 54 | - Optimize concurrent runner booting 55 | 56 | ## 0.3.0 (2024-07-26) 57 | 58 | ### Bug Fixes 59 | - Copy sym links in `:copy_paths` and `:sync_beams` 60 | - Fix function error caused by anonymous functions in `:copy_paths` and `:sync_beams` 61 | 62 | ### Enhancements 63 | - Use OTP 27's `:json` if available 64 | - Introduce `FLAME.Trackable` protocol for tracking resources 65 | - Introduce `FLAME.track_resources/3` to recursively track resources 66 | on a given node 67 | 68 | ## 0.2.0 (2024-06-17) 69 | 70 | ### Backwards incompatible changes 71 | - For backend implementations, the `FLAME.Parent` encoded format has changed to include more information about the parent and child. See `FLAME.Parent` moduledoc for more information. 72 | 73 | ### Enhancements 74 | - Add `:code_sync` pool configuration for syncing beam files and code paths to flames 75 | 76 | ## 0.1.12 (2024-03-14) 77 | - Support `link: false` on `FLAME.call/3`, `FLAME.cast/3`, and `FLAME.place_child/3` for opt-in allowance of long-running FLAME operations (up to `:shutdown_timeout`) regardless of what happens to the caller process or caller node. 
78 | 79 | ## 0.1.11 (2024-02-22) 80 | - Add ability to configure custom metadata for launch FlyBackend machine 81 | 82 | ## 0.1.10 (2024-02-21) 83 | - Fix `FLAME.cast/2` defaulting to boot timeout for executions 84 | 85 | ## 0.1.9 (2024-02-20) 86 | - Fix `FLAME.cast/2` allowing more than allowed max_concurrency operations 87 | - Explicitly prefer local region in `FlyBackend` 88 | 89 | ## 0.1.8 (2024-01-02) 90 | - Fix Pool supervisor name collisions 91 | 92 | ## 0.1.7 (2023-12-15) 93 | - Fix error on concurrent calls when runners are pending 94 | 95 | ## 0.1.6 (2023-12-11) 96 | - Fix references to incorrectly named FLAME_PARENT export 97 | 98 | ## 0.1.5 (2023-12-07) 99 | - Allow passing fly guest options to configure cpus, cpu_kind, gpu_kind, and memory_mb 100 | 101 | ## 0.1.4 (2023-12-06) 102 | 103 | Public release 🔥 104 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 Chris McCord 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/phoenixframework/flame/elixir.yml)](https://github.com/phoenixframework/flame/actions/workflows/elixir.yml) [![Hex.pm](https://img.shields.io/hexpm/v/flame.svg)](https://hex.pm/packages/flame) [![Documentation](https://img.shields.io/badge/documentation-gray)](https://hexdocs.pm/flame) 2 | 3 | Imagine if we could auto scale simply by wrapping any existing app code in a function and have that block of code run in a temporary copy of the app. 4 | 5 | Enter the FLAME pattern. 6 | 7 | > FLAME - Fleeting Lambda Application for Modular Execution 8 | 9 | With FLAME, you treat your *entire application* as a lambda, where modular parts can be executed on short-lived infrastructure. 
10 | 11 | Check the screencast to see it in action: 12 | 13 | [![Video](https://img.youtube.com/vi/l1xt_rkWdic/maxresdefault.jpg)](https://www.youtube.com/watch?v=l1xt_rkWdic) 14 | 15 | ## Setup 16 | 17 | First add `:flame` as a dependency: 18 | 19 | ```elixir 20 | defp deps do 21 | [ 22 | # For Erlang/OTP 26 and earlier, you also need Jason 23 | # {:jason, ">= 0.0.0"}, 24 | {:flame, "~> 0.5"} 25 | ] 26 | end 27 | ``` 28 | 29 | Then start a FLAME pool in your supervision tree, typically on `application.ex`. The example below uses [Fly.io](https://fly.io/)'s backend: 30 | 31 | ```elixir 32 | children = [ 33 | {FLAME.Pool, 34 | name: MyApp.SamplePool, 35 | backend: FLAME.FlyBackend, 36 | min: 0, 37 | max: 10, 38 | max_concurrency: 5, 39 | idle_shutdown_after: 30_000, 40 | log: :debug} 41 | ] 42 | ``` 43 | 44 | Now you can wrap any block of code in a `FLAME.call` and it will find or boot a copy of the app, execute the work there, and return the results: 45 | 46 | ```elixir 47 | def generate_thumbnails(%Video{} = vid, interval) do 48 | FLAME.call(MyApp.FFMpegRunner, fn -> 49 | # I'm runner on a short-lived, temporary server 50 | tmp_dir = Path.join(System.tmp_dir!(), Ecto.UUID.generate()) 51 | File.mkdir!(tmp_dir) 52 | System.cmd("ffmpeg", ~w(-i #{vid.url} -vf fps=1/#{interval} #{tmp_dir}/%02d.png)) 53 | urls = VideoStore.put_thumbnails(vid, Path.wildcard(tmp_dir <> "/*.png")) 54 | Repo.insert_all(Thumbnail, Enum.map(urls, &%{video_id: vid.id, url: &1})) 55 | end) 56 | end 57 | ``` 58 | 59 | Here we wrapped up our CPU expensive `ffmpeg` operation in a `FLAME.call/2`. FLAME accepts a function and any variables that the function closes over. In this example, the `%Video{}` struct and `interval` are passed along automatically. The work happens in a temporary copy of the app. We can do any work inside the FLAME call because we are running the *entire application*, database connection(s) and all. 60 | 61 | `FLAME` provides the following interfaces for elastically scaled operations: 62 | 63 | * `FLAME.call/3` - used for synchronous calls 64 | * `FLAME.cast/3` - used for async casts where you don't need to wait on the results 65 | * `FLAME.place_child/3` – used for placing a child spec somewhere to run, in place of `DynamicSupervisor.start_child`, `Task.Supervisor.start_child`, etc 66 | 67 | The `FLAME.Pool` handles elastically scaling runners up and down, as well as remote monitoring of resources. Check the moduledoc for example usage. 68 | -------------------------------------------------------------------------------- /lib/application.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Application do 2 | @moduledoc false 3 | use Application 4 | 5 | @impl true 6 | def start(_type, _args) do 7 | opts = Application.get_env(:flame, :terminator, []) 8 | shutdown = Keyword.get(opts, :shutdown_timeout, 30_000) 9 | 10 | opts = Keyword.put(opts, :name, FLAME.Terminator) 11 | 12 | children = [ 13 | Supervisor.child_spec({FLAME.Terminator, opts}, shutdown: shutdown) 14 | ] 15 | 16 | opts = [strategy: :one_for_one, name: FLAME.Supervisor] 17 | Supervisor.start_link(children, opts) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/flame.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME do 2 | @moduledoc ~S""" 3 | FLAME remotely executes your application code on ephemeral nodes. 
4 | 5 | FLAME allows you to scale your application operations on a granular 6 | level **without rewriting your code**. For example, imagine the following function 7 | in your application that transcodes a video, saves the result to video storage, 8 | and updates the database: 9 | 10 | def resize_video_quality(%Video{} = vid) do 11 | path = "#{vid.id}_720p.mp4" 12 | System.cmd("ffmpeg", ~w(-i #{vid.url} -s 720x480 -c:a copy #{path})) 13 | VideoStore.put_file!("videos/#{path}", path) 14 | {1, _} = Repo.update_all(from v in Video, where v.id == ^vid.id, set: [file_720p: path]) 15 | {:ok, path} 16 | end 17 | 18 | This works great locally and in production under no load, but video transcoding 19 | is necessarily an expensive CPU bound operation. In production, only a 20 | few concurrent users can saturate your CPU and cause your entire application, 21 | web requests, etc, to come to crawl. This is where folks typically reach for 22 | FaaS or external service solutions, but FLAME gives you a better way. 23 | 24 | Simply wrap your existing code in a FLAME function and it will be executed 25 | on a newly spawned, ephemeral node. Using Elixir and Erlang's built-in distribution 26 | features, entire function closures, including any state they close over, can be sent 27 | and executed on a remote node: 28 | 29 | def resize_video_quality(%Video{} = vid) do 30 | FLAME.call(MyApp.FFMpegRunner, fn -> 31 | path = "#{vid.id}_720p.mp4" 32 | System.cmd("ffmpeg", ~w(-i #{vid.url} -s 720x480 -c:a copy #{path})) 33 | VideoStore.put_file!("videos/#{path}", path) 34 | {1, _} = Repo.update_all(from v in Video, where v.id == ^vid.id, set: [file_720p: path]) 35 | {:ok, path} 36 | end) 37 | end 38 | 39 | That's it! The `%Video{}` struct in this example is captured inside the function 40 | and everything executes on the remotely spawned node, returning the result back to the 41 | parent node when it completes. Repo calls Just Work because the new node booted 42 | your entire application, including the database Repo. As soon as the function is done 43 | executing, the ephemeral node is terminated. This means you can elastically scale 44 | your app as load increases, and only pay for the resources you need at the time. 45 | 46 | To support your FLAME calls, you'll need to add a named `FLAME.Pool` to your 47 | application's supervision tree, which we'll discuss next. 48 | 49 | ## Pools 50 | 51 | A `FLAME.Pool` provides elastic runner scaling, allowing a minimum and 52 | maximum number of runners to be configured, and idled down as load decreases. 53 | 54 | Pools give you elastic scale that maximizes the newly spawned hardware. 55 | At the same time, you also want to avoid spawning unbound resources. You also 56 | want to keep spawned nodes alive for a period of time to avoid the overhead 57 | of booting new ones before idling them down. The following pool configuration 58 | takes care of all of this for you: 59 | 60 | children = [ 61 | ..., 62 | {FLAME.Pool, 63 | name: MyApp.FFMpegRunner, 64 | min: 0, 65 | max: 10, 66 | max_concurrency: 5, 67 | idle_shutdown_after: 30_000}, 68 | ] 69 | 70 | Here we add a `FLAME.Pool` to our application supervision tree, configuring 71 | a minimum of 0 and maximum of 10 runners. This achieves "scale to zero" behavior 72 | while also allowing the pool to scale up to 10 runners when load increases. 73 | Each runner in the case will be able to execute up to 5 concurrent functions. 74 | The runners will shut down after 30 seconds of inactivity. 
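Pools use the globally configured backend by default (see the Backends section below), and each pool may also pass its own `:backend` option. A minimal sketch, reusing the pool above with an illustrative per-pool Fly backend and env value:

    children = [
      ...,
      {FLAME.Pool,
       name: MyApp.FFMpegRunner,
       backend: {FLAME.FlyBackend, env: %{"POOL_SIZE" => "1"}},
       min: 0,
       max: 10,
       max_concurrency: 5,
       idle_shutdown_after: 30_000},
    ]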
75 | 76 | Calling a pool is as simple as passing its name to the FLAME functions: 77 | 78 | FLAME.call(MyApp.FFMpegRunner, fn -> :operation1 end) 79 | 80 | You'll also often want to enable or disable other application services based on whether 81 | your application is being started as child FLAME runner or being run directly. 82 | See the next `Deployment Considerations` section below for details. 83 | 84 | ## Deployment Considerations 85 | 86 | FLAME nodes effectively clone and start your entire application. This is great 87 | because all application services and dependencies are ready to go and be used to 88 | support your FLAME calls; however, You'll also often want to enable or disable 89 | services based on whether your node is running as a FLAME child or not. 90 | For example, there's usually no need to serve your Phoenix endpoint within a FLAME. 91 | You also likely only need a single or small number of database connections instead of 92 | your existing pool size. 93 | 94 | To accomplish these you can use `FLAME.Parent.get/0` to conditionally enable or 95 | disable processes in your `application.ex` file: 96 | 97 | def start(_type, _args) do 98 | flame_parent = FLAME.Parent.get() 99 | 100 | children = [ 101 | ..., 102 | {FLAME.Pool, 103 | name: Thumbs.FFMpegRunner, 104 | min: 0, 105 | max: 10, 106 | max_concurrency: 5, 107 | idle_shutdown_after: 30_000}, 108 | !flame_parent && ThumbsWeb.Endpoint 109 | ] 110 | |> Enum.filter(& &1) 111 | 112 | opts = [strategy: :one_for_one, name: Thumbs.Supervisor] 113 | Supervisor.start_link(children, opts) 114 | end 115 | 116 | Here we filter the Phoenix endpoint from being started when running as a FLAME 117 | child because we have no need to handle web requests in this case. 118 | 119 | Or you can use `FLAME.Parent.get/0` to configure your database pool size: 120 | 121 | pool_size = 122 | if FLAME.Parent.get() do 123 | 1 124 | else 125 | String.to_integer(System.get_env("POOL_SIZE") || "10") 126 | end 127 | 128 | config :thumbs, Thumbs.Repo, 129 | ..., 130 | pool_size: pool_size 131 | 132 | ## Backends 133 | 134 | The `FLAME.Backend` behavior defines an interface for spawning remote 135 | application nodes and sending functions to them. By default, the 136 | `FLAME.LocalBackend` is used, which is great for development and test 137 | environments, as you can have your code simply execute locally in most cases 138 | and worry about scaling the operation only in production. 139 | 140 | For production, FLAME provides the `FLAME.FlyBackend`, which uses 141 | [Fly.io](https://fly.io). Because Fly deploys a containerized machine of 142 | your application, a single Fly API call can boot a machine running your 143 | exact Docker deployment image, allowing closures to be executed across 144 | distributed nodes. 145 | 146 | Default backends can be configured in your `config/runtime.exs`: 147 | 148 | if config_env() == :prod do 149 | config :flame, :backend, FLAME.FlyBackend 150 | config :flame, FLAME.FlyBackend, token: System.fetch_env!("FLY_API_TOKEN") 151 | ... 152 | end 153 | 154 | ## Termination and remote links 155 | 156 | FLAME runs a termination process to allow remotely spawned functions time to 157 | complete before the node is terminated. This process is started automatically 158 | with the library. 
The shutdown timeout by default is 30s, but can be configured 159 | in your application configuration, such as `config/runtime.exs`: 160 | 161 | config :flame, :terminator, shutdown_timeout: :timer.seconds(10) 162 | 163 | *Note*: By default `call/3`, `cast/3`, and `place_child/3` will link the caller 164 | to the remote process to prevent orphaned resources when the caller or the caller's node 165 | is terminated. This can be disabled by passing `link: false` to the options, which is 166 | useful for cases where you want to allow long-running work to complete within the 167 | `:shutdown_timeout` of the remote runner, regardless of what happens to the parent caller 168 | process and/or the parent caller node, such as a new cold deploy, a caller crash, etc. 169 | """ 170 | require Logger 171 | 172 | @doc """ 173 | Calls a function in a remote runner for the given `FLAME.Pool`. 174 | 175 | ## Options 176 | 177 | * `:timeout` - The timeout the caller is willing to wait for a response before an 178 | exit with `:timeout`. Defaults to the configured timeout of the pool. 179 | The executed function will also be terminated on the remote flame if 180 | the timeout is reached. 181 | 182 | * `:link` – Whether the caller should be linked to the remote call process 183 | to prevent long-running orphaned resources. Defaults to `true`. Set to `false` to 184 | support long-running work that you want to complete within the `:shutdown_timeout` 185 | of the remote runner, even when the parent process or node is terminated. 186 | *Note*: even when `link: false` is used, an exit in the remote process will raise 187 | an error on the caller. The caller will need to try/catch the call if they wish 188 | to handle the error. 189 | 190 | * `:track_resources` - When true, traverses the returned result looking for 191 | resources that implement the `FLAME.Trackable` protocol and make sure the 192 | FLAME node does not terminate until the tracked resources are removed. 193 | 194 | ## Examples 195 | 196 | def my_expensive_thing(arg) do 197 | FLAME.call(MyApp.Runner, fn -> 198 | # I'm now doing expensive work inside a new node 199 | # pubsub and repo access all just work 200 | Phoenix.PubSub.broadcast(MyApp.PubSub, "topic", result) 201 | 202 | # can return awaitable results back to caller 203 | result 204 | end) 205 | end 206 | 207 | When the caller exits, the remote runner will be terminated. 208 | """ 209 | def call(pool, func, opts \\ []) 210 | when is_atom(pool) and is_function(func, 0) and is_list(opts) do 211 | FLAME.Pool.call(pool, func, opts) 212 | end 213 | 214 | @doc """ 215 | Casts a function to a remote runner for the given `FLAME.Pool`. 216 | 217 | ## Options 218 | 219 | * `:link` – Whether the caller should be linked to the remote cast process 220 | to prevent long-running orphaned resources. Defaults to `true`. Set to `false` to 221 | support long-running work that you want to complete within the `:shutdown_timeout` 222 | of the remote runner, even when the parent process or node is terminated. 223 | """ 224 | def cast(pool, func, opts \\ []) 225 | when is_atom(pool) and is_function(func, 0) and is_list(opts) do 226 | FLAME.Pool.cast(pool, func, opts) 227 | end 228 | 229 | @doc """ 230 | Places a child process on a remote runner for the given `FLAME.Pool`. 231 | 232 | Any child process can be placed on the remote node and it will occupy a space 233 | in the runner's `max_concurrency` allowance. This is useful for long running 234 | workloads that you want to run asynchronously from the parent caller. 
235 | 236 | *Note*: The placed child process is linked to the caller and will only survive 237 | as long as the caller does. This is to ensure that the child process is never 238 | orphaned permanently on the remote node. 239 | 240 | *Note*: The child spec will be rewritten to use a temporary restart strategy 241 | to ensure that the child process is never restarted on the remote node when it 242 | exits. If you want restart behavior, you need to monitor on the parent node and 243 | replace the child yourself. 244 | 245 | ## Options 246 | 247 | * `:timeout` - The timeout the caller is willing to wait for a response before an 248 | exit with `:timeout`. Defaults to the configured timeout of the pool. 249 | The executed function will also be terminated on the remote flame if 250 | the timeout is reached. 251 | 252 | * `:link` – Whether the caller should be linked to the remote child process 253 | to prevent long-running orphaned resources. Defaults to `true`. Set to `false` to 254 | support long-running work that you want to complete within the `:shutdown_timeout` 255 | of the remote runner, even when the parent process or node is terminated. 256 | 257 | Accepts any child spec. 258 | 259 | ## Examples 260 | 261 | {:ok, pid} = FLAME.place_child(MyRunner, {MyWorker, []}) 262 | """ 263 | def place_child(pool, child_spec, opts \\ []) when is_atom(pool) and is_list(opts) do 264 | FLAME.Pool.place_child(pool, child_spec, opts) 265 | end 266 | 267 | @doc """ 268 | Callback invoked to recursively track resources 269 | on a given node. 270 | 271 | Sometimes we may want to allocate long lived resources 272 | in a FLAME but, because FLAME nodes are temporary, the 273 | node would terminate shortly after. The `:track_resources` 274 | option tells `FLAME` to look for resources which implement 275 | the `FLAME.Trackable` protocol. Those resources can then 276 | spawn PIDs in the remote node and tell FLAME to track them. 277 | Once all PIDs terminate, the FLAME node will terminate too. 278 | 279 | The `data` is any data type, `acc` is a list of PIDs 280 | (typicalling starts as an empty list), and the `node` 281 | we have received the resources from. See `FLAME.Trackable` 282 | for customization. 283 | """ 284 | def track_resources(data, acc, node) 285 | 286 | def track_resources(tuple, acc, node) when is_tuple(tuple) do 287 | {list, acc} = tuple |> Tuple.to_list() |> track_resources(acc, node) 288 | {List.to_tuple(list), acc} 289 | end 290 | 291 | def track_resources(list, acc, node) when is_list(list) do 292 | Enum.map_reduce(list, acc, &track_resources(&1, &2, node)) 293 | end 294 | 295 | def track_resources(%_{} = other, acc, node) do 296 | FLAME.Trackable.track(other, acc, node) 297 | end 298 | 299 | def track_resources(%{} = map, acc, node) do 300 | {pairs, acc} = 301 | Enum.map_reduce(map, acc, fn {k, v}, acc -> 302 | {k, acc} = track_resources(k, acc, node) 303 | {v, acc} = track_resources(v, acc, node) 304 | {{k, v}, acc} 305 | end) 306 | 307 | {Map.new(pairs), acc} 308 | end 309 | 310 | def track_resources(other, acc, _node) do 311 | {other, acc} 312 | end 313 | end 314 | -------------------------------------------------------------------------------- /lib/flame/backend.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Backend do 2 | @moduledoc """ 3 | Defines a behavior for a FLAME backend. 
4 | 5 | A FLAME backend is responsible for booting remote compute resources, 6 | connecting them back to the parent node, and executing functions on them. 7 | 8 | The default `FLAME.LocalBackend` simply runs your code locally, allowing 9 | you to develop and test your application using `FLAME.call/3` without 10 | running an external backend. 11 | 12 | ## Messaging 13 | 14 | The `FLAME.Terminator` process runs on remote nodes automatically and is 15 | responsible for connecting back to the parent node, notifying the parent, and 16 | handling termination of remote processes started via `FLAME.call/3`, `FLAME.cast/3`, 17 | and `FLAME.place_child/3`. When the terminator starts on a newly booted remote 18 | node, it sends the following message to the parent runner process: 19 | 20 | {ref, {:remote_up, remote_terminator_pid}} 21 | 22 | Where ref is the reference generated by the backend and encoded into the 23 | `FLAME.Parent.encode/1` string. 24 | 25 | When the remote terminator is going away gracefully, it sends the following message: 26 | 27 | {ref, {:remote_shutdown, :idle}} 28 | 29 | Backend implementations can react to these messages to handle the remotely 30 | provisioned instance booting up or shutting down. 31 | 32 | See `FLAME.FlyBackend` for an example implementation of this behavior. 33 | """ 34 | @callback init(opts :: Keyword.t()) :: {:ok, state :: term()} | {:error, term()} 35 | @callback remote_spawn_monitor(state :: term, func :: function() | term) :: 36 | {:ok, {pid, reference()}} | {:error, reason :: term} 37 | @callback system_shutdown() :: no_return() 38 | @callback remote_boot(state :: term) :: 39 | {:ok, remote_terminator_pid :: pid(), new_state :: term} | {:error, term} 40 | @callback handle_info(msg :: term, state :: term) :: 41 | {:noreply, new_state :: term} | {:stop, term, new_state :: term} 42 | 43 | @optional_callbacks handle_info: 2 44 | 45 | def init(opts), do: impl().init(opts) 46 | 47 | def remote_spawn_monitor(state, func) do 48 | impl().remote_spawn_monitor(state, func) 49 | end 50 | 51 | def system_shutdown do 52 | impl().system_shutdown() 53 | end 54 | 55 | def remote_boot(state) do 56 | impl().remote_boot(state) 57 | end 58 | 59 | def handle_info(msg, state) do 60 | impl().handle_info(msg, state) 61 | end 62 | 63 | def impl, do: Application.get_env(:flame, :backend, FLAME.LocalBackend) 64 | end 65 | -------------------------------------------------------------------------------- /lib/flame/code_sync.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.CodeSync.PackagedStream do 2 | @moduledoc false 3 | defstruct stream: nil, 4 | id: nil, 5 | extract_dir: nil, 6 | tmp_dir: nil, 7 | apps_to_start: [], 8 | changed_paths: [], 9 | sync_beam_hashes: %{}, 10 | deleted_paths: [], 11 | purge_modules: [], 12 | verbose: false, 13 | compress: false, 14 | chunk_size: 64_000 15 | end 16 | 17 | defmodule FLAME.CodeSync do 18 | @moduledoc false 19 | require Logger 20 | 21 | alias FLAME.CodeSync 22 | alias FLAME.CodeSync.PackagedStream 23 | 24 | defstruct id: nil, 25 | get_path: nil, 26 | sync_beam_hashes: %{}, 27 | copy_apps: nil, 28 | copy_paths: nil, 29 | sync_beams: nil, 30 | extract_dir: nil, 31 | tmp_dir: nil, 32 | start_apps: true, 33 | apps_to_start: [], 34 | changed_paths: [], 35 | deleted_paths: [], 36 | purge_modules: [], 37 | verbose: false, 38 | compress: false, 39 | chunk_size: 64_000 40 | 41 | def new(opts \\ []) do 42 | Keyword.validate!(opts, [ 43 | :get_path, 44 | :tmp_dir, 45 | :extract_dir, 46 | 
:copy_apps, 47 | :copy_paths, 48 | :sync_beams, 49 | :start_apps, 50 | :verbose, 51 | :compress, 52 | :chunk_size 53 | ]) 54 | 55 | start_apps = Keyword.get(opts, :start_apps, true) 56 | 57 | compute_start_apps(%CodeSync{ 58 | id: System.unique_integer([:positive]), 59 | get_path: Keyword.get(opts, :get_path, &:code.get_path/0), 60 | start_apps: start_apps, 61 | copy_apps: Keyword.get(opts, :copy_apps, start_apps), 62 | copy_paths: Keyword.get(opts, :copy_paths, false), 63 | sync_beams: Keyword.get(opts, :sync_beams, []), 64 | tmp_dir: Keyword.get(opts, :tmp_dir, {System, :tmp_dir!, []}), 65 | extract_dir: Keyword.get(opts, :extract_dir, {Function, :identity, ["/"]}), 66 | verbose: Keyword.get(opts, :verbose, false), 67 | compress: Keyword.get(opts, :compress, true), 68 | chunk_size: Keyword.get(opts, :chunk_size, 64_000) 69 | }) 70 | end 71 | 72 | defp compute_start_apps(%CodeSync{} = code) do 73 | apps_to_start = 74 | case code.start_apps do 75 | true -> 76 | Enum.map(Application.started_applications(), fn {app, _desc, _vsn} -> app end) 77 | 78 | false -> 79 | [] 80 | 81 | apps when is_list(apps) -> 82 | apps 83 | end 84 | 85 | %{code | apps_to_start: apps_to_start} 86 | end 87 | 88 | def compute_sync_beams(%CodeSync{} = code) do 89 | sync_beams_files = lookup_sync_beams_files(code.sync_beams) 90 | 91 | beam_hashes = 92 | for path <- sync_beams_files, 93 | into: %{}, 94 | do: {path, :erlang.md5(File.read!(path))} 95 | 96 | %{ 97 | code 98 | | sync_beam_hashes: beam_hashes, 99 | changed_paths: Enum.uniq(code.changed_paths ++ sync_beams_files) 100 | } 101 | end 102 | 103 | def compute_changed_paths(%CodeSync{} = code) do 104 | copy_apps = 105 | case code.copy_apps do 106 | true -> lookup_apps_files(code) 107 | false -> [] 108 | end 109 | 110 | changed_paths = 111 | case code.copy_paths do 112 | paths when is_list(paths) -> 113 | Enum.uniq(lookup_copy_paths_files(paths) ++ copy_apps) 114 | 115 | false -> 116 | copy_apps 117 | 118 | true -> 119 | IO.warn( 120 | "copy_paths: true is deprecated. Passing start_apps: true, now automatically copies all apps. \n" <> 121 | "You can also pass copy_apps: true to copy all apps without starting them." 
122 | ) 123 | 124 | lookup_apps_files(code) 125 | end 126 | 127 | %{code | changed_paths: Enum.uniq(code.changed_paths ++ changed_paths)} 128 | end 129 | 130 | def changed?(%CodeSync{} = code) do 131 | code.changed_paths != [] or code.deleted_paths != [] or code.purge_modules != [] 132 | end 133 | 134 | def diff(%CodeSync{sync_beam_hashes: prev_hashes} = prev) do 135 | current = 136 | prev 137 | |> compute_start_apps() 138 | |> compute_sync_beams() 139 | 140 | changed = 141 | for kv <- current.sync_beam_hashes, 142 | {path, current_hash} = kv, 143 | current_hash != prev_hashes[path], 144 | do: path 145 | 146 | deleted_paths = 147 | for kv <- prev.sync_beam_hashes, 148 | {path, _prev_hash} = kv, 149 | not Map.has_key?(current.sync_beam_hashes, path), 150 | do: path 151 | 152 | module_to_purge = 153 | for path <- deleted_paths, 154 | do: path |> Path.basename(".beam") |> String.to_atom() 155 | 156 | %{ 157 | current 158 | | changed_paths: changed, 159 | deleted_paths: deleted_paths, 160 | purge_modules: module_to_purge, 161 | apps_to_start: [] 162 | } 163 | end 164 | 165 | def package_to_stream(%CodeSync{} = code) do 166 | compressed = if code.compress, do: [:compressed], else: [] 167 | 168 | verbose = 169 | if code.verbose do 170 | if !Enum.empty?(code.changed_paths), 171 | do: log_verbose("packaging changed_paths: #{inspect(code.changed_paths)}") 172 | 173 | if !Enum.empty?(code.apps_to_start), 174 | do: log_verbose("sending apps_to_start: #{inspect(code.apps_to_start)}") 175 | 176 | [:verbose] 177 | else 178 | [] 179 | end 180 | 181 | out_stream = 182 | if code.changed_paths != [] do 183 | out_path = Path.join([mfa(code.tmp_dir), "flame_parent_code_sync_#{code.id}.tar.gz"]) 184 | dirs = for path <- code.changed_paths, uniq: true, do: String.to_charlist(path) 185 | {:ok, tar} = :erl_tar.open(out_path, [:write] ++ compressed) 186 | 187 | for dir <- dirs, 188 | do: :erl_tar.add(tar, dir, trim_leading_slash(dir), [:dereference | verbose]) 189 | 190 | :ok = :erl_tar.close(tar) 191 | 192 | if code.verbose do 193 | log_verbose("packaged size: #{File.stat!(out_path).size / (1024 * 1024)}mb") 194 | end 195 | 196 | # TODO: Change to File.stream!(out_path, code.chunk_size) once we require Elixir v1.16+ 197 | File.stream!(out_path, [], code.chunk_size) 198 | end 199 | 200 | %PackagedStream{ 201 | id: code.id, 202 | tmp_dir: code.tmp_dir, 203 | extract_dir: code.extract_dir, 204 | sync_beam_hashes: code.sync_beam_hashes, 205 | changed_paths: code.changed_paths, 206 | deleted_paths: code.deleted_paths, 207 | purge_modules: code.purge_modules, 208 | apps_to_start: code.apps_to_start, 209 | stream: out_stream, 210 | verbose: code.verbose, 211 | compress: code.compress, 212 | chunk_size: code.chunk_size 213 | } 214 | end 215 | 216 | defp trim_leading_slash([?/ | path]), do: path 217 | defp trim_leading_slash([_ | _] = path), do: path 218 | 219 | def extract_packaged_stream(%PackagedStream{} = pkg) do 220 | extract_dir = 221 | if pkg.stream do 222 | verbose = if pkg.verbose, do: [:verbose], else: [] 223 | compressed = if pkg.compress, do: [:compressed], else: [] 224 | extract_dir = mfa(pkg.extract_dir) 225 | target_tmp_path = Path.join([mfa(pkg.tmp_dir), "flame_child_code_sync_#{pkg.id}.tar.gz"]) 226 | 227 | flame_stream = File.stream!(target_tmp_path) 228 | Enum.into(pkg.stream, flame_stream) 229 | 230 | :ok = :erl_tar.extract(target_tmp_path, [{:cwd, extract_dir}] ++ compressed ++ verbose) 231 | :ok = add_code_paths_from_tar(pkg, extract_dir) 232 | 233 | File.rm(target_tmp_path) 234 | 235 | # purge any 
deleted modules 236 | for mod <- pkg.purge_modules do 237 | if pkg.verbose && !Enum.empty?(pkg.purge_modules), 238 | do: log_verbose("purging #{inspect(pkg.purge_modules)}") 239 | 240 | :code.purge(mod) 241 | :code.delete(mod) 242 | end 243 | 244 | # delete any deleted code paths, and prune empty dirs 245 | for del_path <- pkg.deleted_paths do 246 | File.rm(del_path) 247 | ebin_dir = Path.dirname(del_path) 248 | 249 | if File.ls!(ebin_dir) == [] do 250 | if pkg.verbose, do: log_verbose("deleting path #{ebin_dir}") 251 | File.rm_rf(ebin_dir) 252 | :code.del_path(String.to_charlist(ebin_dir)) 253 | end 254 | end 255 | 256 | extract_dir 257 | end 258 | 259 | # start any synced apps 260 | if !Enum.empty?(pkg.apps_to_start) do 261 | {:ok, started} = Application.ensure_all_started(pkg.apps_to_start) 262 | if pkg.verbose, do: log_verbose("started #{inspect(started)}") 263 | end 264 | 265 | extract_dir 266 | end 267 | 268 | def rm_packaged_stream(%PackagedStream{} = pkg) do 269 | if pkg.stream, do: File.rm(pkg.stream.path) 270 | :ok 271 | end 272 | 273 | defp lookup_sync_beams_files(paths) do 274 | paths 275 | |> Enum.flat_map(&Path.wildcard(Path.join(&1, "**/*.beam"))) 276 | |> Enum.uniq() 277 | end 278 | 279 | defp lookup_apps_files(%CodeSync{get_path: get_path}) do 280 | otp_lib = to_string(:code.lib_dir()) 281 | 282 | reject_apps = 283 | for app <- [:flame, :eex, :elixir, :ex_unit, :iex, :logger, :mix], 284 | lib_dir = :code.lib_dir(app), 285 | is_list(lib_dir), 286 | do: to_string(:filename.join(lib_dir, ~c"ebin")) 287 | 288 | get_path.() 289 | |> Enum.map(&to_string/1) 290 | |> Kernel.--(["." | reject_apps]) 291 | |> Stream.reject(fn path -> String.starts_with?(path, otp_lib) end) 292 | |> Stream.map(fn parent_dir -> 293 | # include ebin's parent if basename is ebin (will include priv) 294 | case Path.basename(parent_dir) do 295 | "ebin" -> Path.join(Path.dirname(parent_dir), "**/*") 296 | _ -> Path.join(parent_dir, "*") 297 | end 298 | end) 299 | |> Stream.uniq() 300 | |> Stream.flat_map(fn glob -> Path.wildcard(glob) end) 301 | |> Stream.uniq() 302 | |> Enum.filter(fn path -> File.regular?(path, [:raw]) end) 303 | end 304 | 305 | defp lookup_copy_paths_files(paths) do 306 | paths 307 | |> Stream.map(fn parent_dir -> 308 | if File.regular?(parent_dir, [:raw]) do 309 | parent_dir 310 | else 311 | Path.join(parent_dir, "*") 312 | end 313 | end) 314 | |> Stream.uniq() 315 | |> Stream.flat_map(fn glob -> Path.wildcard(glob) end) 316 | |> Stream.uniq() 317 | |> Enum.filter(fn path -> File.regular?(path, [:raw]) end) 318 | end 319 | 320 | defp add_code_paths_from_tar(%PackagedStream{} = pkg, extract_dir) do 321 | init = {_consolidated = [], _regular = [], _beams = [], _reload = [], _seen = MapSet.new()} 322 | 323 | Enum.reduce(pkg.changed_paths, init, fn rel_path, {cons, reg, beams, reload, seen} -> 324 | new_seen = MapSet.put(seen, rel_path) 325 | dir = extract_dir |> Path.join(rel_path) |> Path.dirname() 326 | 327 | new_reload = 328 | case rel_path |> Path.basename() |> String.split(".beam") do 329 | [mod_str, ""] -> 330 | mod = Module.concat([mod_str]) 331 | :code.purge(mod) 332 | :code.delete(mod) 333 | [mod | reload] 334 | 335 | _ -> 336 | reload 337 | end 338 | 339 | cond do 340 | # purge consolidated protocols 341 | # we only need to track new reloads for protocols as other module 342 | # references will reload on demand 343 | MapSet.member?(seen, rel_path) -> 344 | {cons, reg, beams, new_reload, seen} 345 | 346 | Path.basename(dir) == "consolidated" -> 347 | {[dir | cons], reg, beams, 
new_reload, new_seen} 348 | 349 | pkg.sync_beam_hashes[rel_path] -> 350 | {cons, reg, [dir | beams], reload, new_seen} 351 | 352 | true -> 353 | {cons, [dir | reg], beams, reload, new_seen} 354 | end 355 | end) 356 | |> then(fn {consolidated, regular, sync_beams, reload, _seen} -> 357 | # paths already in reverse order, which is what we want for prepend 358 | if pkg.verbose do 359 | if !Enum.empty?(consolidated), 360 | do: log_verbose("prepending consolidated paths: #{inspect(consolidated)}") 361 | 362 | if !Enum.empty?(regular), 363 | do: log_verbose("appending code paths: #{inspect(regular)}") 364 | 365 | if !Enum.empty?(sync_beams), 366 | do: log_verbose("reloading code paths: #{inspect(sync_beams)}") 367 | end 368 | 369 | Code.prepend_paths(regular, cache: true) 370 | Code.prepend_paths(consolidated, cache: true) 371 | # don't cache for sync_beams 372 | Code.prepend_paths(sync_beams) 373 | 374 | if pkg.verbose && !Enum.empty?(reload), do: log_verbose("reloading #{inspect(reload)}") 375 | for mod <- reload, do: :code.load_file(mod) 376 | 377 | :ok 378 | end) 379 | end 380 | 381 | defp log_verbose(msg) do 382 | Logger.info("[CodeSync #{inspect(node())}] #{msg}") 383 | end 384 | 385 | defp mfa({mod, func, args}), do: apply(mod, func, args) 386 | end 387 | -------------------------------------------------------------------------------- /lib/flame/fly_backend.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.FlyBackend do 2 | @moduledoc """ 3 | A `FLAME.Backend` using [Fly.io](https://fly.io) machines. 4 | 5 | The only required configuration is telling FLAME to use the 6 | `FLAME.FlyBackend` by default and the `:token` which is your Fly.io API 7 | token. These can be set via application configuration in your `config/runtime.exs` 8 | withing a `:prod` block: 9 | 10 | if config_env() == :prod do 11 | config :flame, :backend, FLAME.FlyBackend 12 | config :flame, FLAME.FlyBackend, token: System.fetch_env!("FLY_API_TOKEN") 13 | ... 14 | end 15 | 16 | To set your `FLY_API_TOKEN` secret, you can run the following commands locally: 17 | 18 | ```bash 19 | $ fly secrets set FLY_API_TOKEN="$(fly auth token)" 20 | ``` 21 | 22 | The following backend options are supported, and mirror the 23 | [Fly.io machines create API](https://fly.io/docs/machines/api/machines-resource/#machine-config-object-properties): 24 | 25 | * `:cpu_kind` - The size of the runner CPU. Defaults to `"performance"`. 26 | 27 | * `:cpus` - The number of runner CPUs. Defaults to `System.schedulers_online()` 28 | for the number of cores of the running parent app. 29 | 30 | * `:memory_mb` - The memory of the runner. Must be a 1024 multiple. Defaults to `4096`. 31 | 32 | * `:gpu_kind` - The type of GPU reservation to make. 33 | 34 | * `:gpus` - The number of runner GPUs. Defaults to `1` if `:gpu_kind` is set. 35 | 36 | * `:boot_timeout` - The boot timeout. Defaults to `30_000`. 37 | 38 | * `:app` – The name of the otp app. Defaults to `System.get_env("FLY_APP_NAME")`, 39 | 40 | * `:image` – The URL of the docker image to pass to the machines create endpoint. 41 | Defaults to `System.get_env("FLY_IMAGE_REF")` which is the image of your running app. 42 | 43 | * `:token` – The Fly API token. Defaults to `System.get_env("FLY_API_TOKEN")`. 44 | 45 | * `:host` – The host of the Fly API. Defaults to `"https://api.machines.dev"`. 46 | 47 | * `:init` – The init object to pass to the machines create endpoint. Defaults to `%{}`. 
48 | Possible values include: 49 | 50 | * `:cmd` – list of strings for the command 51 | * `:entrypoint` – list strings for the entrypoint command 52 | * `:exec` – list of strings for the exec command 53 | * `:kernel_args` - list of strings 54 | * `:swap_size_mb` – integer value in megabytes for the swap size 55 | * `:tty` – boolean 56 | 57 | * `:services` - The optional services to run on the machine. Defaults to `[]`. 58 | 59 | * `:metadata` - The optional map of metadata to set for the machine. Defaults to `%{}`. 60 | 61 | ## Environment Variables 62 | 63 | The FLAME Fly machines *do not* inherit the environment variables of the parent. 64 | You must explicit provide the environment that you would like to forward to the 65 | machine. For example, if your FLAME's are starting your Ecto repos, you can copy 66 | the env from the parent: 67 | 68 | ```elixir 69 | config :flame, FLAME.FlyBackend, 70 | token: System.fetch_env!("FLY_API_TOKEN"), 71 | env: %{ 72 | "DATABASE_URL" => System.fetch_env!("DATABASE_URL"), 73 | "POOL_SIZE" => "1" 74 | } 75 | ``` 76 | 77 | Or pass the env to each pool: 78 | 79 | ```elixir 80 | {FLAME.Pool, 81 | name: MyRunner, 82 | backend: {FLAME.FlyBackend, env: %{"DATABASE_URL" => System.fetch_env!("DATABASE_URL")}} 83 | } 84 | ``` 85 | """ 86 | @behaviour FLAME.Backend 87 | 88 | alias FLAME.FlyBackend 89 | alias FLAME.Parser.JSON 90 | 91 | require Logger 92 | 93 | @derive {Inspect, 94 | only: [ 95 | :host, 96 | :init, 97 | :cpu_kind, 98 | :cpus, 99 | :memory_mb, 100 | :gpu_kind, 101 | :gpus, 102 | :image, 103 | :app, 104 | :runner_id, 105 | :local_ip, 106 | :remote_terminator_pid, 107 | :runner_instance_id, 108 | :runner_private_ip, 109 | :runner_node_base, 110 | :runner_node_name, 111 | :boot_timeout 112 | ]} 113 | defstruct host: nil, 114 | init: %{}, 115 | local_ip: nil, 116 | env: %{}, 117 | region: nil, 118 | cpu_kind: nil, 119 | cpus: nil, 120 | memory_mb: nil, 121 | gpu_kind: nil, 122 | gpus: nil, 123 | image: nil, 124 | services: [], 125 | metadata: %{}, 126 | app: nil, 127 | token: nil, 128 | boot_timeout: nil, 129 | runner_id: nil, 130 | remote_terminator_pid: nil, 131 | parent_ref: nil, 132 | runner_instance_id: nil, 133 | runner_private_ip: nil, 134 | runner_node_base: nil, 135 | runner_node_name: nil, 136 | log: nil 137 | 138 | @retry 10 139 | 140 | @valid_opts [ 141 | :app, 142 | :region, 143 | :image, 144 | :token, 145 | :host, 146 | :init, 147 | :cpu_kind, 148 | :cpus, 149 | :memory_mb, 150 | :gpu_kind, 151 | :gpus, 152 | :boot_timeout, 153 | :env, 154 | :terminator_sup, 155 | :log, 156 | :services, 157 | :metadata 158 | ] 159 | 160 | @impl true 161 | def init(opts) do 162 | conf = Application.get_env(:flame, __MODULE__) || [] 163 | [_node_base, ip] = node() |> to_string() |> String.split("@") 164 | 165 | default = %FlyBackend{ 166 | app: System.get_env("FLY_APP_NAME"), 167 | region: System.get_env("FLY_REGION"), 168 | image: System.get_env("FLY_IMAGE_REF"), 169 | token: System.get_env("FLY_API_TOKEN"), 170 | host: "https://api.machines.dev", 171 | cpu_kind: "performance", 172 | cpus: System.schedulers_online(), 173 | memory_mb: 4096, 174 | boot_timeout: 30_000, 175 | services: [], 176 | metadata: %{}, 177 | init: %{}, 178 | log: Keyword.get(conf, :log, false) 179 | } 180 | 181 | provided_opts = 182 | conf 183 | |> Keyword.merge(opts) 184 | |> Keyword.validate!(@valid_opts) 185 | 186 | %FlyBackend{} = state = Map.merge(default, Map.new(provided_opts)) 187 | 188 | for key <- [:token, :image, :host, :app] do 189 | unless Map.get(state, key) do 190 | 
raise ArgumentError, "missing :#{key} config for #{inspect(__MODULE__)}" 191 | end 192 | end 193 | 194 | state = %{state | runner_node_base: "#{state.app}-flame-#{rand_id(20)}"} 195 | parent_ref = make_ref() 196 | 197 | encoded_parent = 198 | parent_ref 199 | |> FLAME.Parent.new(self(), __MODULE__, state.runner_node_base, "FLY_PRIVATE_IP") 200 | |> FLAME.Parent.encode() 201 | 202 | new_env = 203 | %{"PHX_SERVER" => "false", "FLAME_PARENT" => encoded_parent} 204 | |> Map.merge(state.env) 205 | |> then(fn env -> 206 | if flags = System.get_env("ERL_AFLAGS") do 207 | Map.put_new(env, "ERL_AFLAGS", flags) 208 | else 209 | env 210 | end 211 | end) 212 | |> then(fn env -> 213 | if flags = System.get_env("ERL_ZFLAGS") do 214 | Map.put_new(env, "ERL_ZFLAGS", flags) 215 | else 216 | env 217 | end 218 | end) 219 | 220 | new_state = %{state | env: new_env, parent_ref: parent_ref, local_ip: ip} 221 | 222 | {:ok, new_state} 223 | end 224 | 225 | @impl true 226 | # TODO explore spawn_request 227 | def remote_spawn_monitor(%FlyBackend{} = state, term) do 228 | case term do 229 | func when is_function(func, 0) -> 230 | {pid, ref} = Node.spawn_monitor(state.runner_node_name, func) 231 | {:ok, {pid, ref}} 232 | 233 | {mod, fun, args} when is_atom(mod) and is_atom(fun) and is_list(args) -> 234 | {pid, ref} = Node.spawn_monitor(state.runner_node_name, mod, fun, args) 235 | {:ok, {pid, ref}} 236 | 237 | other -> 238 | raise ArgumentError, 239 | "expected a null arity function or {mod, func, args}. Got: #{inspect(other)}" 240 | end 241 | end 242 | 243 | @impl true 244 | def system_shutdown do 245 | System.stop() 246 | end 247 | 248 | def with_elapsed_ms(func) when is_function(func, 0) do 249 | {micro, result} = :timer.tc(func) 250 | {result, div(micro, 1000)} 251 | end 252 | 253 | @impl true 254 | def remote_boot(%FlyBackend{parent_ref: parent_ref} = state) do 255 | {resp, req_connect_time} = 256 | with_elapsed_ms(fn -> 257 | http_post!("#{state.host}/v1/apps/#{state.app}/machines", @retry, 258 | content_type: "application/json", 259 | headers: [ 260 | {"Content-Type", "application/json"}, 261 | {"Authorization", "Bearer #{state.token}"} 262 | ], 263 | connect_timeout: state.boot_timeout, 264 | body: 265 | JSON.encode!(%{ 266 | name: state.runner_node_base, 267 | region: state.region, 268 | config: %{ 269 | image: state.image, 270 | init: state.init, 271 | guest: %{ 272 | cpu_kind: state.cpu_kind, 273 | cpus: state.cpus, 274 | memory_mb: state.memory_mb, 275 | gpu_kind: state.gpu_kind, 276 | gpus: if(state.gpu_kind, do: state.gpus || 1) 277 | }, 278 | auto_destroy: true, 279 | restart: %{policy: "no"}, 280 | env: state.env, 281 | services: state.services, 282 | metadata: Map.put(state.metadata, :flame_parent_ip, state.local_ip) 283 | } 284 | }) 285 | ) 286 | end) 287 | 288 | if state.log do 289 | Logger.log( 290 | state.log, 291 | "#{inspect(__MODULE__)} #{inspect(node())} machine create #{req_connect_time}ms" 292 | ) 293 | end 294 | 295 | remaining_connect_window = state.boot_timeout - req_connect_time 296 | 297 | case resp do 298 | %{"id" => id, "instance_id" => instance_id, "private_ip" => ip} -> 299 | new_state = 300 | %{ 301 | state 302 | | runner_id: id, 303 | runner_instance_id: instance_id, 304 | runner_private_ip: ip 305 | } 306 | 307 | remote_terminator_pid = 308 | receive do 309 | {^parent_ref, {:remote_up, remote_terminator_pid}} -> 310 | remote_terminator_pid 311 | after 312 | remaining_connect_window -> 313 | Logger.error("failed to connect to fly machine within #{state.boot_timeout}ms") 314 | 
exit(:timeout) 315 | end 316 | 317 | new_state = %{ 318 | new_state 319 | | remote_terminator_pid: remote_terminator_pid, 320 | runner_node_name: node(remote_terminator_pid) 321 | } 322 | 323 | {:ok, remote_terminator_pid, new_state} 324 | 325 | other -> 326 | {:error, other} 327 | end 328 | end 329 | 330 | defp rand_id(len) do 331 | len 332 | |> :crypto.strong_rand_bytes() 333 | |> Base.encode16(case: :lower) 334 | |> binary_part(0, len) 335 | end 336 | 337 | defp http_post!(url, remaining_tries, opts) do 338 | Keyword.validate!(opts, [:headers, :body, :connect_timeout, :content_type]) 339 | 340 | headers = 341 | for {field, val} <- Keyword.fetch!(opts, :headers), 342 | do: {String.to_charlist(field), val} 343 | 344 | body = Keyword.fetch!(opts, :body) 345 | connect_timeout = Keyword.fetch!(opts, :connect_timeout) 346 | content_type = Keyword.fetch!(opts, :content_type) 347 | 348 | http_opts = [ 349 | ssl: 350 | [ 351 | verify: :verify_peer, 352 | depth: 2, 353 | customize_hostname_check: [ 354 | match_fun: :public_key.pkix_verify_hostname_match_fun(:https) 355 | ] 356 | ] ++ cacerts_options(), 357 | connect_timeout: connect_timeout 358 | ] 359 | 360 | case :httpc.request(:post, {url, headers, ~c"#{content_type}", body}, http_opts, 361 | body_format: :binary 362 | ) do 363 | {:ok, {{_, 200, _}, _, response_body}} -> 364 | JSON.decode!(response_body) 365 | 366 | # 429 Too Many Requests (rate limited) 367 | # 412 Precondition Failed (can't find capacity) 368 | # 409 Conflict (the flyd tried ending up not having capacity) 369 | # 422 Unprocessable Entity (could not find capcity for volume workloads) 370 | {:ok, {{_, status, _}, _, _response_body}} 371 | when status in [429, 412, 409, 422] and remaining_tries > 0 -> 372 | Process.sleep(1000) 373 | http_post!(url, remaining_tries - 1, opts) 374 | 375 | {:ok, {{_, status, reason}, _, resp_body}} -> 376 | raise "failed POST #{url} with #{inspect(status)} (#{inspect(reason)}): #{inspect(resp_body)} #{inspect(headers)}" 377 | 378 | {:error, reason} -> 379 | raise "failed POST #{url} with #{inspect(reason)} #{inspect(headers)}" 380 | end 381 | end 382 | 383 | defp cacerts_options do 384 | cond do 385 | certs = otp_cacerts() -> 386 | [cacerts: certs] 387 | 388 | Application.spec(:castore, :vsn) -> 389 | [cacertfile: Application.app_dir(:castore, "priv/cacerts.pem")] 390 | 391 | true -> 392 | IO.warn(""" 393 | No certificate trust store was found. 394 | 395 | A certificate trust store is required in 396 | order to download locales for your configuration. 397 | Since elixir_make could not detect a system 398 | installed certificate trust store one of the 399 | following actions may be taken: 400 | 401 | 1. Use OTP 25+ on an OS that has built-in certificate 402 | trust store. 403 | 404 | 2. Install the hex package `castore`. It will 405 | be automatically detected after recompilation. 406 | 407 | """) 408 | 409 | [] 410 | end 411 | end 412 | 413 | if System.otp_release() >= "25" do 414 | defp otp_cacerts do 415 | :public_key.cacerts_get() 416 | rescue 417 | _ -> nil 418 | end 419 | else 420 | defp otp_cacerts, do: nil 421 | end 422 | end 423 | -------------------------------------------------------------------------------- /lib/flame/local_backend.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.LocalBackend do 2 | @moduledoc """ 3 | A `FLAME.Backend` useful for development and testing. 
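It is the default backend when no `:flame, :backend` is configured, so no setup is usually required for dev and test. To be explicit, a minimal configuration sketch:

    # config/dev.exs and config/test.exs
    config :flame, :backend, FLAME.LocalBackend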
4 | """ 5 | 6 | @behaviour FLAME.Backend 7 | 8 | @impl true 9 | def init(opts) do 10 | defaults = 11 | Application.get_env(:flame, __MODULE__) || [] 12 | 13 | _terminator_sup = Keyword.fetch!(opts, :terminator_sup) 14 | 15 | {:ok, 16 | defaults 17 | |> Keyword.merge(opts) 18 | |> Enum.into(%{})} 19 | end 20 | 21 | @impl true 22 | def remote_spawn_monitor(_state, term) do 23 | case term do 24 | func when is_function(func, 0) -> 25 | {pid, ref} = spawn_monitor(func) 26 | {:ok, {pid, ref}} 27 | 28 | {mod, fun, args} when is_atom(mod) and is_atom(fun) and is_list(args) -> 29 | {pid, ref} = spawn_monitor(mod, fun, args) 30 | {:ok, {pid, ref}} 31 | 32 | other -> 33 | raise ArgumentError, 34 | "expected a null arity function or {mod, func, args}. Got: #{inspect(other)}" 35 | end 36 | end 37 | 38 | @impl true 39 | def system_shutdown, do: :noop 40 | 41 | @impl true 42 | def remote_boot(state) do 43 | parent = FLAME.Parent.new(make_ref(), self(), __MODULE__, "nonode", nil) 44 | name = Module.concat(state.terminator_sup, to_string(System.unique_integer([:positive]))) 45 | opts = [name: name, parent: parent, log: state.log] 46 | 47 | spec = Supervisor.child_spec({FLAME.Terminator, opts}, restart: :temporary) 48 | {:ok, _sup_pid} = DynamicSupervisor.start_child(state.terminator_sup, spec) 49 | 50 | case Process.whereis(name) do 51 | terminator_pid when is_pid(terminator_pid) -> {:ok, terminator_pid, state} 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/flame/parent.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Parent do 2 | @moduledoc """ 3 | Conveniences for looking up FLAME parent information. 4 | 5 | ## Parent Information 6 | 7 | When a FLAME child is started, it contains the `FLAME_PARENT` environment 8 | variable that holds the parent node's information base 64 encoded into a 9 | map, with the following keys: 10 | 11 | * `:ref` - The parent node's reference. 12 | * `:pid` - The parent node's Pid. 13 | * `:backend` - The FLAME backend in use. 14 | * `:flame_vsn` - The FLAME version running on the parent. 15 | * `:backend_app` - The FLAME backend application running on the parent. 16 | * `:backend_vsn` - The FLAME backend version running on the parent. 17 | * `:node_base` - The node basename the parent generated for the runner. 18 | * `:host_env` - The environment variable name on the runner to use to 19 | lookup the runner's hostname for the runner's longname. 20 | """ 21 | 22 | @flame_vsn Keyword.fetch!(Mix.Project.config(), :version) 23 | 24 | defstruct pid: nil, 25 | ref: nil, 26 | backend: nil, 27 | node_base: nil, 28 | flame_vsn: nil, 29 | backend_vsn: nil, 30 | backend_app: nil, 31 | host_env: nil 32 | 33 | @doc """ 34 | Gets the `%FLAME.Parent{}` struct from the system environment. 35 | 36 | Returns `nil` if no parent is set. 37 | 38 | When booting a FLAME node, the `FLAME.Backend` is required to 39 | export the `FLAME_PARENT` environment variable for the provisioned 40 | instance. This value holds required information about the parent node 41 | and can be set using the `encode/1` function. 
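The parent lookup is commonly used to branch in `application.ex` when the app boots. A sketch, assuming illustrative `MyApp.Repo` and `MyAppWeb.Endpoint` modules:

    flame_parent = FLAME.Parent.get()

    children =
      [
        MyApp.Repo,
        # only start the endpoint when running as the parent
        !flame_parent && MyAppWeb.Endpoint
      ]
      |> Enum.filter(& &1)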
42 | """ 43 | def get do 44 | with {:ok, encoded} <- System.fetch_env("FLAME_PARENT"), 45 | %{ref: ref, pid: pid, backend: backend, host_env: host_env, node_base: node_base} = 46 | encoded |> Base.decode64!() |> :erlang.binary_to_term() do 47 | new(ref, pid, backend, node_base, host_env) 48 | else 49 | _ -> nil 50 | end 51 | end 52 | 53 | @doc """ 54 | Returns a new `%FLAME.Parent{}` struct. 55 | 56 | The `pid` is the parent node's `FLAME.Runner` process started by 57 | the `FLAME.Pool`. 58 | """ 59 | def new(ref, pid, backend, node_base, host_env) 60 | when is_reference(ref) and is_pid(pid) and is_atom(backend) do 61 | {backend_app, backend_vsn} = 62 | case :application.get_application(backend) do 63 | {:ok, app} -> {app, to_string(Application.spec(app, :vsn))} 64 | :undefined -> {nil, nil} 65 | end 66 | 67 | %__MODULE__{ 68 | pid: pid, 69 | ref: ref, 70 | backend: backend, 71 | node_base: node_base, 72 | host_env: host_env, 73 | flame_vsn: @flame_vsn, 74 | backend_app: backend_app, 75 | backend_vsn: backend_vsn 76 | } 77 | end 78 | 79 | @doc """ 80 | Encodes a `%FLAME.Parent{}` struct into string. 81 | """ 82 | def encode(%__MODULE__{} = parent) do 83 | info = 84 | parent 85 | |> Map.from_struct() 86 | |> Map.take([ 87 | :ref, 88 | :pid, 89 | :backend, 90 | :flame_vsn, 91 | :backend_app, 92 | :backend_vsn, 93 | :node_base, 94 | :host_env 95 | ]) 96 | 97 | info |> :erlang.term_to_binary() |> Base.encode64() 98 | end 99 | end 100 | -------------------------------------------------------------------------------- /lib/flame/parser/json.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Parser.JSON do 2 | @moduledoc false 3 | if Code.ensure_loaded?(:json) do 4 | def encode!(data) do 5 | data 6 | |> :json.encode(&encoder/2) 7 | |> IO.iodata_to_binary() 8 | end 9 | 10 | def decode!(data) do 11 | data 12 | |> :json.decode(:ok, %{null: nil}) 13 | |> handle_decode() 14 | end 15 | 16 | def json_parser, do: :json 17 | 18 | defp encoder(nil, _encoder), do: "null" 19 | defp encoder(term, encoder), do: :json.encode_value(term, encoder) 20 | 21 | defp handle_decode({data, :ok, ""}), do: data 22 | else 23 | def encode!(data), do: Jason.encode!(data) 24 | def decode!(data), do: Jason.decode!(data) 25 | 26 | def json_parser, do: Jason 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/flame/pool.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Pool.RunnerState do 2 | @moduledoc false 3 | 4 | defstruct count: nil, pid: nil, monitor_ref: nil 5 | end 6 | 7 | defmodule FLAME.Pool.WaitingState do 8 | @moduledoc false 9 | 10 | defstruct from: nil, monitor_ref: nil, deadline: nil 11 | end 12 | 13 | defmodule FLAME.Pool.Caller do 14 | @moduledoc false 15 | 16 | defstruct checkout_ref: nil, monitor_ref: nil, runner_ref: nil 17 | end 18 | 19 | defmodule FLAME.Pool do 20 | @moduledoc """ 21 | Manages a pool of `FLAME.Runner` processes. 22 | 23 | Pools support elastic growth and shrinking of the number of runners. 24 | 25 | ## Examples 26 | 27 | children = [ 28 | ..., 29 | {FLAME.Pool, name: MyRunner, min: 1, max: 10, max_concurrency: 100} 30 | ] 31 | 32 | See `start_link/1` for supported options. 
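Once the pool is started, pass its name to the `FLAME` functions to run work on it, for example (the return values here are placeholders):

    FLAME.call(MyRunner, fn -> :work end)
    FLAME.cast(MyRunner, fn -> :fire_and_forget end)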
33 | 34 | ## TODO 35 | [ ] interface to configure min/max at runtime 36 | 37 | """ 38 | use GenServer 39 | 40 | alias FLAME.{Pool, Runner, Queue, CodeSync} 41 | alias FLAME.Pool.{RunnerState, WaitingState, Caller} 42 | 43 | @default_max_concurrency 100 44 | @boot_timeout 30_000 45 | @idle_shutdown_after 30_000 46 | @async_boot_debounce 1_000 47 | 48 | defstruct name: nil, 49 | runner_sup: nil, 50 | task_sup: nil, 51 | terminator_sup: nil, 52 | child_placement_sup: nil, 53 | boot_timeout: nil, 54 | idle_shutdown_after: nil, 55 | min_idle_shutdown_after: nil, 56 | min: nil, 57 | max: nil, 58 | max_concurrency: nil, 59 | callers: %{}, 60 | waiting: Queue.new(), 61 | runners: %{}, 62 | pending_runners: %{}, 63 | runner_opts: [], 64 | on_grow_start: nil, 65 | on_grow_end: nil, 66 | on_shrink: nil, 67 | async_boot_timer: nil, 68 | track_resources: false, 69 | base_sync_stream: nil 70 | 71 | def child_spec(opts) do 72 | %{ 73 | id: {__MODULE__, Keyword.fetch!(opts, :name)}, 74 | start: {FLAME.Pool.Supervisor, :start_link, [opts]}, 75 | type: :supervisor 76 | } 77 | end 78 | 79 | @doc """ 80 | Starts a pool of runners. 81 | 82 | ## Options 83 | 84 | * `:name` - The name of the pool, for example: `MyApp.FFMPegRunner` 85 | 86 | * `:min` - The minimum number of runners to keep in the pool at all times. 87 | For "scale to zero" behavior you may pass `0`. When starting as a flame child, 88 | the `:min` will be forced to zero to avoid recursively starting backend resources. 89 | 90 | * `:max` - The maximum number of runners to elastically grow to in the pool. 91 | 92 | * `:max_concurrency` - The maximum number of concurrent executions per runner before 93 | booting new runners or queueing calls. Defaults to `100`. 94 | 95 | * `:single_use` - if `true`, runners will be terminated after each call completes. 96 | Defaults `false`. 97 | 98 | * `:backend` - The backend to use. Defaults to the configured `:flame, :backend` or 99 | `FLAME.LocalBackend` if not configured. 100 | 101 | * `:log` - The log level to use for verbose logging. Defaults to `false`. 102 | 103 | * `:timeout` - The time to allow functions to execute on a remote node. Defaults to 30 seconds. 104 | This value is also used as the default `FLAME.call/3` timeout for the caller. 105 | 106 | * `:boot_timeout` - The time to allow for booting and connecting to a remote node. 107 | Defaults to 30 seconds. 108 | 109 | * `:shutdown_timeout` - The time to allow for graceful shutdown on the remote node. 110 | Defaults to 30 seconds. 111 | 112 | * `:idle_shutdown_after` - The amount of time after which to idle a remote node 113 | down following a period of inactivity. Defaults to 30 seconds. A `{timeout, check_function}` 114 | tuple may also be passed to check a specific condition, for example: 115 | 116 | {10_000, fn -> Supervisor.which_children(MySup) == [] end} 117 | 118 | * `:min_idle_shutdown_after` - The same behavior as `:idle_shutdown_after`, but applied 119 | to the `:min` pool runners. Defaults to `:infinity`. 120 | 121 | * `:on_grow_start` - The optional function to be called when the pool starts booting a new 122 | runner beyond the configured `:min`. The function receives a map with the following metadata: 123 | 124 | * `:name` - The name of the pool 125 | * `:count` - The number of runners the pool is attempting to grow to 126 | * `:pid` - The pid of the async process that is booting the new runner 127 | 128 | * `:on_grow_end` - The optional 2-arity function to be called when the pool growth process completes. 
129 | The 2-arity function receives either `:ok` or `{:exit, reason}`, and a map with the following metadata: 130 | 131 | * `:name` - The name of the pool 132 | * `:count` - The number of runners the pool is now at 133 | * `:pid` - The pid of the async process that attempted to boot the new runner 134 | 135 | * `:on_shrink` - The optional function to be called when the pool shrinks. 136 | The function receives a map with the following metadata: 137 | 138 | * `:name` - The name of the pool 139 | * `:count` - The number of runners the pool is attempting to shrink to 140 | 141 | * `:track_resources` - When `true`, traverses the returned results from FLAME 142 | operations looking for resources that implement the `FLAME.Trackable` protocol 143 | and makes sure the FLAME node does not terminate until the tracked resources are removed. 144 | Defaults `false`. 145 | 146 | * `:code_sync` – The optional list of options to enable copying and syncing code paths 147 | from the parent node to the runner node. Disabled by default. The options are: 148 | 149 | * `:start_apps` – Either a boolean or a list of specific OTP application names to start 150 | when the runner boots. When `true`, all applications currently running on the parent node 151 | are sent to the runner node to be started. Defaults to `false`. When set to `true`, 152 | `copy_apps` will also be set to `true` if not explicitly set to `false`. 153 | 154 | * `:copy_apps` – The boolean flag to copy all the application artifacts and their beam 155 | files from the parent node to the runner node on boot. Defaults `false`. 156 | When passing `start_apps: true`, automatically sets `copy_apps: true`. 157 | 158 | * `:copy_paths` – The list of arbitrary paths to copy from the parent node to the runner 159 | node on boot. Defaults to `[]`. 160 | 161 | * `:sync_beams` – A list of specific beam code paths to sync to the runner node. Useful 162 | when you want to sync specific beam code paths from the parent after sending all code 163 | paths from `:copy_apps` on initial boot. For example, with `copy_apps: true`, 164 | and `sync_beams: ["/home/app/.cache/.../ebin"]`, all the code from the parent will be 165 | copied on boot, but only the specific beam files will be synced on subsequent calls. 166 | With `copy_apps: false`, and `sync_beams: ["/home/app/.cache/.../ebin"]`, 167 | only the specific beam files will be synced on boot and for subsequent calls. 168 | Defaults to `[]`. 169 | 170 | * `:verbose` – If `true`, the pool will log verbose information about the code sync process. 171 | Defaults to `false`. 172 | 173 | * `:compress` – If `true`, the `:copy_apps`, `:copy_paths`, and `:sync_beams` artifacts will be compressed 174 | before sending. Provides savings in network payload size at the cost of CPU time. 175 | Defaults to `true`. 
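In an ordinary application supervision tree, a code-synced pool can be configured by passing `:code_sync` alongside the usual pool options (a minimal sketch; the pool name and sizes are illustrative):

    children = [
      {FLAME.Pool,
       name: MyApp.SyncedRunner,
       min: 0,
       max: 4,
       max_concurrency: 20,
       code_sync: [start_apps: true]}
    ]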
176 | 177 | For example, in [Livebook](https://livebook.dev/), to start a pool with code sync enabled: 178 | 179 | Mix.install([:kino, :flame]) 180 | 181 | Kino.start_child!( 182 | {FLAME.Pool, 183 | name: :my_flame, 184 | code_sync: [ 185 | start_apps: true, 186 | sync_beams: [Path.join(System.tmp_dir!(), "livebook_runtime")] 187 | ], 188 | min: 1, 189 | max: 1, 190 | max_concurrency: 10, 191 | backend: {FLAME.FlyBackend, 192 | cpu_kind: "performance", cpus: 4, memory_mb: 8192, 193 | token: System.fetch_env!("FLY_API_TOKEN"), 194 | env: Map.take(System.get_env(), ["LIVEBOOK_COOKIE"]), 195 | }, 196 | idle_shutdown_after: :timer.minutes(5)} 197 | ) 198 | """ 199 | def start_link(opts) do 200 | Keyword.validate!(opts, [ 201 | :name, 202 | :runner_sup, 203 | :task_sup, 204 | :cleaner, 205 | :terminator_sup, 206 | :child_placement_sup, 207 | :idle_shutdown_after, 208 | :min_idle_shutdown_after, 209 | :min, 210 | :max, 211 | :max_concurrency, 212 | :backend, 213 | :log, 214 | :single_use, 215 | :timeout, 216 | :boot_timeout, 217 | :shutdown_timeout, 218 | :on_grow_start, 219 | :on_grow_end, 220 | :on_shrink, 221 | :code_sync, 222 | :track_resources 223 | ]) 224 | 225 | Keyword.validate!(opts[:code_sync] || [], [ 226 | :get_path, 227 | :extract_dir, 228 | :tmp_dir, 229 | :copy_apps, 230 | :copy_paths, 231 | :sync_beams, 232 | :start_apps, 233 | :verbose, 234 | :compress, 235 | :chunk_size 236 | ]) 237 | 238 | GenServer.start_link(__MODULE__, opts, name: Keyword.fetch!(opts, :name)) 239 | end 240 | 241 | @doc """ 242 | Calls a function in a remote runner for the given `FLAME.Pool`. 243 | 244 | See `FLAME.call/3` for more information. 245 | """ 246 | def call(name, func, opts \\ []) when is_function(func, 0) and is_list(opts) do 247 | caller_pid = self() 248 | do_call(name, func, caller_pid, opts) 249 | end 250 | 251 | defp do_call(name, func, caller_pid, opts) when is_pid(caller_pid) do 252 | caller_checkout!(name, opts, :call, [name, func, opts], fn runner_pid, 253 | remaining_timeout, 254 | track_resources -> 255 | opts = 256 | opts 257 | |> Keyword.put_new(:timeout, remaining_timeout) 258 | |> Keyword.put_new(:track_resources, track_resources) 259 | 260 | {:cancel, :ok, Runner.call(runner_pid, caller_pid, func, opts)} 261 | end) 262 | end 263 | 264 | @doc """ 265 | Casts a function to a remote runner for the given `FLAME.Pool`. 266 | 267 | See `FLAME.cast/3` for more information. 268 | """ 269 | def cast(name, func, opts) when is_function(func, 0) and is_list(opts) do 270 | %{task_sup: task_sup} = lookup_meta(name) 271 | 272 | caller_pid = self() 273 | opts = Keyword.put_new(opts, :timeout, :infinity) 274 | 275 | # we don't care about the result so don't copy it back to the caller 276 | wrapped = fn -> 277 | func.() 278 | :ok 279 | end 280 | 281 | {:ok, _pid} = 282 | Task.Supervisor.start_child(task_sup, fn -> do_call(name, wrapped, caller_pid, opts) end) 283 | 284 | :ok 285 | end 286 | 287 | @doc """ 288 | See `FLAME.place_child/3` for more information. 
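For example (a sketch; the pool name and the `MyApp.Worker` child spec are illustrative):

    # starts MyApp.Worker under a runner in the pool; the returned pid lives on the remote node
    {:ok, pid} = FLAME.place_child(MyRunner, {MyApp.Worker, :some_arg}, link: true)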
289 | """ 290 | def place_child(name, child_spec, opts) do 291 | caller_checkout!(name, opts, :place_child, [name, child_spec, opts], fn runner_pid, 292 | remaining_timeout, 293 | track_resources -> 294 | place_opts = 295 | opts 296 | |> Keyword.put(:track_resources, track_resources) 297 | |> Keyword.put_new(:timeout, remaining_timeout) 298 | |> Keyword.put_new(:link, true) 299 | 300 | case Runner.place_child(runner_pid, child_spec, place_opts) do 301 | {{:ok, child_pid}, _trackable_pids = []} = result -> 302 | # we are placing the link back on the parent node, but we are protected 303 | # from racing the link on the child FLAME because the terminator on 304 | # the remote flame is monitoring the caller and will terminator the child 305 | # if we go away 306 | if Keyword.fetch!(place_opts, :link), do: Process.link(child_pid) 307 | {:cancel, {:replace, [child_pid]}, result} 308 | 309 | {:error, _reason} = result -> 310 | {:cancel, :ok, result} 311 | end 312 | end) 313 | end 314 | 315 | defp caller_checkout!(name, opts, fun_name, args, func) do 316 | %{boot_timeout: boot_timeout, track_resources: track_resources} = lookup_meta(name) 317 | timeout = opts[:timeout] || boot_timeout 318 | track_resources = Keyword.get(opts, :track_resources, track_resources) 319 | pid = Process.whereis(name) || exit({:noproc, {__MODULE__, fun_name, args}}) 320 | ref = Process.monitor(pid) 321 | {start_time, deadline} = deadline(timeout) 322 | 323 | # Manually implement call to avoid double monitor. 324 | # Auto-connect is asynchronous. But we still use :noconnect to make sure 325 | # we send on the monitored connection, and not trigger a new auto-connect. 326 | Process.send(pid, {:"$gen_call", {self(), ref}, {:checkout, deadline}}, [:noconnect]) 327 | 328 | receive do 329 | {^ref, runner_pid} -> 330 | try do 331 | Process.demonitor(ref, [:flush]) 332 | remaining_timeout = remaining_timeout(opts, start_time) 333 | func.(runner_pid, remaining_timeout, track_resources) 334 | catch 335 | kind, reason -> 336 | send_cancel(pid, ref, :catch) 337 | :erlang.raise(kind, reason, __STACKTRACE__) 338 | else 339 | {:cancel, :ok, {result, [_ | _] = trackable_pids}} -> 340 | send_cancel(pid, ref, {:replace, trackable_pids}) 341 | result 342 | 343 | {:cancel, reason, {result, [] = _trackable_pids}} -> 344 | send_cancel(pid, ref, reason) 345 | result 346 | end 347 | 348 | {:DOWN, ^ref, _, _, reason} -> 349 | exit({reason, {__MODULE__, fun_name, args}}) 350 | after 351 | timeout -> 352 | send_cancel(pid, ref, :timeout) 353 | Process.demonitor(ref, [:flush]) 354 | exit({:timeout, {__MODULE__, fun_name, args}}) 355 | end 356 | end 357 | 358 | defp send_cancel(pid, ref, reason) when is_pid(pid) and is_reference(ref) do 359 | send(pid, {:cancel, ref, self(), reason}) 360 | end 361 | 362 | defp remaining_timeout(opts, mono_start) do 363 | case Keyword.fetch(opts, :timeout) do 364 | {:ok, :infinity = inf} -> 365 | inf 366 | 367 | {:ok, nil} -> 368 | nil 369 | 370 | {:ok, ms} when is_integer(ms) -> 371 | elapsed_ms = 372 | System.convert_time_unit(System.monotonic_time() - mono_start, :native, :millisecond) 373 | 374 | ms - elapsed_ms 375 | 376 | :error -> 377 | nil 378 | end 379 | end 380 | 381 | defp lookup_meta(name) do 382 | :ets.lookup_element(name, :meta, 2) 383 | end 384 | 385 | @impl true 386 | def init(opts) do 387 | name = Keyword.fetch!(opts, :name) 388 | task_sup = Keyword.fetch!(opts, :task_sup) 389 | boot_timeout = Keyword.get(opts, :boot_timeout, @boot_timeout) 390 | track_resources = Keyword.get(opts, :track_resources, 
false) 391 | :ets.new(name, [:set, :public, :named_table, read_concurrency: true]) 392 | 393 | :ets.insert( 394 | name, 395 | {:meta, %{boot_timeout: boot_timeout, task_sup: task_sup, track_resources: track_resources}} 396 | ) 397 | 398 | terminator_sup = Keyword.fetch!(opts, :terminator_sup) 399 | cleaner = Keyword.fetch!(opts, :cleaner) 400 | child_placement_sup = Keyword.fetch!(opts, :child_placement_sup) 401 | runner_opts = runner_opts(opts, terminator_sup) 402 | min = Keyword.fetch!(opts, :min) 403 | 404 | # we must avoid recursively booting remote runners if we are a child 405 | min = 406 | if FLAME.Parent.get() do 407 | 0 408 | else 409 | min 410 | end 411 | 412 | base_sync_stream = 413 | if code_sync_opts = opts[:code_sync] do 414 | code_sync = 415 | code_sync_opts 416 | |> CodeSync.new() 417 | |> CodeSync.compute_changed_paths() 418 | 419 | %CodeSync.PackagedStream{} = parent_stream = CodeSync.package_to_stream(code_sync) 420 | 421 | :ok = FLAME.Pool.Cleaner.watch_path(cleaner, parent_stream.stream.path) 422 | 423 | parent_stream 424 | end 425 | 426 | state = %Pool{ 427 | runner_sup: Keyword.fetch!(opts, :runner_sup), 428 | task_sup: task_sup, 429 | terminator_sup: terminator_sup, 430 | child_placement_sup: child_placement_sup, 431 | name: name, 432 | min: min, 433 | max: Keyword.fetch!(opts, :max), 434 | boot_timeout: boot_timeout, 435 | idle_shutdown_after: Keyword.get(opts, :idle_shutdown_after, @idle_shutdown_after), 436 | min_idle_shutdown_after: Keyword.get(opts, :min_idle_shutdown_after, :infinity), 437 | max_concurrency: Keyword.get(opts, :max_concurrency, @default_max_concurrency), 438 | on_grow_start: opts[:on_grow_start], 439 | on_grow_end: opts[:on_grow_end], 440 | on_shrink: opts[:on_shrink], 441 | track_resources: track_resources, 442 | runner_opts: runner_opts, 443 | base_sync_stream: base_sync_stream 444 | } 445 | 446 | {:ok, boot_runners(state)} 447 | end 448 | 449 | defp runner_opts(opts, terminator_sup) do 450 | defaults = [terminator_sup: terminator_sup, log: Keyword.get(opts, :log, false)] 451 | 452 | runner_opts = 453 | Keyword.take( 454 | opts, 455 | [ 456 | :backend, 457 | :log, 458 | :single_use, 459 | :timeout, 460 | :boot_timeout, 461 | :shutdown_timeout, 462 | :idle_shutdown_after, 463 | :code_sync 464 | ] 465 | ) 466 | 467 | case Keyword.fetch(opts, :backend) do 468 | {:ok, {backend, opts}} -> 469 | Keyword.put(runner_opts, :backend, {backend, Keyword.merge(opts, defaults)}) 470 | 471 | {:ok, backend} -> 472 | Keyword.put(runner_opts, :backend, {backend, defaults}) 473 | 474 | :error -> 475 | backend = FLAME.Backend.impl() 476 | backend_opts = Application.get_env(:flame, backend) || [] 477 | Keyword.put(runner_opts, :backend, {backend, Keyword.merge(backend_opts, defaults)}) 478 | end 479 | end 480 | 481 | @impl true 482 | def handle_info({:DOWN, _ref, :process, _pid, _reason} = msg, %Pool{} = state) do 483 | {:noreply, handle_down(state, msg)} 484 | end 485 | 486 | def handle_info({ref, {:ok, pid}}, %Pool{} = state) when is_reference(ref) do 487 | {:noreply, handle_runner_async_up(state, pid, ref)} 488 | end 489 | 490 | def handle_info(:async_boot_continue, %Pool{} = state) do 491 | {:noreply, async_boot_runner(%{state | async_boot_timer: nil})} 492 | end 493 | 494 | def handle_info({:cancel, ref, caller_pid, reason}, state) do 495 | case reason do 496 | {:replace, child_pids} -> 497 | {:noreply, replace_caller(state, ref, caller_pid, child_pids)} 498 | 499 | reason when reason in [:ok, :timeout, :catch] -> 500 | {:noreply, checkin_runner(state, ref, 
caller_pid, reason)} 501 | end 502 | end 503 | 504 | @impl true 505 | def handle_call({:checkout, deadline}, from, state) do 506 | {:noreply, checkout_runner(state, deadline, from)} 507 | end 508 | 509 | defp runner_count(state) do 510 | map_size(state.runners) + map_size(state.pending_runners) 511 | end 512 | 513 | defp waiting_count(%Pool{waiting: %Queue{} = waiting}) do 514 | Queue.size(waiting) 515 | end 516 | 517 | defp min_runner(state) do 518 | if map_size(state.runners) == 0 do 519 | nil 520 | else 521 | {_ref, min} = Enum.min_by(state.runners, fn {_, %RunnerState{count: count}} -> count end) 522 | min 523 | end 524 | end 525 | 526 | defp await_downs(child_pids) do 527 | if MapSet.size(child_pids) == 0 do 528 | :ok 529 | else 530 | receive do 531 | {:DOWN, _ref, :process, pid, _reason} -> await_downs(MapSet.delete(child_pids, pid)) 532 | end 533 | end 534 | end 535 | 536 | defp replace_caller(%Pool{} = state, checkout_ref, caller_pid, [_ | _] = child_pids) do 537 | # replace caller with child pid and do not inc concurrency counts since we are replacing 538 | %{^caller_pid => %Caller{checkout_ref: ^checkout_ref} = caller} = state.callers 539 | Process.demonitor(caller.monitor_ref, [:flush]) 540 | 541 | # if we have more than 1 child pid, such as for multiple trackables returned for a single 542 | # call, we monitor all of them under a new process and the new process takes the slot in the 543 | # pool. When all trackables are finished, the new process goes down and frees the slot. 544 | child_pid = 545 | case child_pids do 546 | [child_pid] -> 547 | child_pid 548 | 549 | [_ | _] -> 550 | {:ok, child_pid} = 551 | Task.Supervisor.start_child(state.task_sup, fn -> 552 | Enum.each(child_pids, &Process.monitor(&1)) 553 | await_downs(MapSet.new(child_pids)) 554 | end) 555 | 556 | child_pid 557 | end 558 | 559 | new_caller = %Caller{ 560 | checkout_ref: checkout_ref, 561 | monitor_ref: Process.monitor(child_pid), 562 | runner_ref: caller.runner_ref 563 | } 564 | 565 | new_callers = 566 | state.callers 567 | |> Map.delete(caller_pid) 568 | |> Map.put(child_pid, new_caller) 569 | 570 | %{state | callers: new_callers} 571 | end 572 | 573 | defp checkin_runner(state, ref, caller_pid, reason) 574 | when is_reference(ref) and is_pid(caller_pid) do 575 | case state.callers do 576 | %{^caller_pid => %Caller{checkout_ref: ^ref} = caller} -> 577 | Process.demonitor(caller.monitor_ref, [:flush]) 578 | drop_caller(state, caller_pid, caller) 579 | 580 | # the only way to race a checkin is if the caller has expired while still in the 581 | # waiting state and checks in on the timeout before we lease it a runner. 
582 | %{} when reason == :timeout -> 583 | maybe_drop_waiting(state, caller_pid) 584 | 585 | %{} -> 586 | raise ArgumentError, 587 | "expected to checkin runner for #{inspect(caller_pid)} that does not exist" 588 | end 589 | end 590 | 591 | defp checkout_runner(%Pool{} = state, deadline, from, monitor_ref \\ nil) do 592 | min_runner = min_runner(state) 593 | runner_count = runner_count(state) 594 | 595 | cond do 596 | min_runner && min_runner.count < state.max_concurrency -> 597 | reply_runner_checkout(state, min_runner, from, monitor_ref) 598 | 599 | runner_count < state.max -> 600 | if state.async_boot_timer || 601 | map_size(state.pending_runners) * state.max_concurrency > waiting_count(state) do 602 | waiting_in(state, deadline, from) 603 | else 604 | state 605 | |> async_boot_runner() 606 | |> waiting_in(deadline, from) 607 | end 608 | 609 | true -> 610 | waiting_in(state, deadline, from) 611 | end 612 | end 613 | 614 | defp reply_runner_checkout(state, %RunnerState{} = runner, from, monitor_ref) do 615 | # we pass monitor_ref down from waiting so we don't need to remonitor if already monitoring 616 | {from_pid, checkout_ref} = from 617 | 618 | caller_monitor_ref = 619 | if monitor_ref do 620 | monitor_ref 621 | else 622 | Process.monitor(from_pid) 623 | end 624 | 625 | GenServer.reply(from, runner.pid) 626 | 627 | new_caller = %Caller{ 628 | checkout_ref: checkout_ref, 629 | monitor_ref: caller_monitor_ref, 630 | runner_ref: runner.monitor_ref 631 | } 632 | 633 | new_state = %{state | callers: Map.put(state.callers, from_pid, new_caller)} 634 | 635 | inc_runner_count(new_state, runner.monitor_ref) 636 | end 637 | 638 | defp waiting_in(%Pool{} = state, deadline, {pid, _tag} = from) do 639 | ref = Process.monitor(pid) 640 | waiting = %WaitingState{from: from, monitor_ref: ref, deadline: deadline} 641 | %{state | waiting: Queue.insert(state.waiting, waiting, pid)} 642 | end 643 | 644 | defp boot_runners(%Pool{} = state) do 645 | if state.min > 0 do 646 | # start min runners, and do not idle them down regardless of idle configuration 647 | # unless `:min_idle_shutdown_after` not infinity 648 | # TODO: allow % threshold of failed min's to continue startup? 
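      # Note: the `:min` runners below boot concurrently (up to 10 at a time) within
      # `:boot_timeout`; any boot failure raises and aborts pool startup entirely.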
649 | 0..(state.min - 1) 650 | |> Task.async_stream( 651 | fn _ -> start_child_runner(state, idle_shutdown_after: state.min_idle_shutdown_after) end, 652 | max_concurrency: 10, 653 | timeout: state.boot_timeout 654 | ) 655 | |> Enum.reduce(state, fn 656 | {:ok, {:ok, pid}}, acc -> 657 | {_runner, new_acc} = put_runner(acc, pid) 658 | new_acc 659 | 660 | {:exit, reason}, _acc -> 661 | raise "failed to boot runner: #{inspect(reason)}" 662 | end) 663 | else 664 | state 665 | end 666 | end 667 | 668 | defp schedule_async_boot_runner(%Pool{} = state) do 669 | if state.async_boot_timer, do: Process.cancel_timer(state.async_boot_timer) 670 | timer = Process.send_after(self(), :async_boot_continue, @async_boot_debounce) 671 | %{state | async_boot_timer: timer} 672 | end 673 | 674 | defp async_boot_runner(%Pool{on_grow_start: on_grow_start, name: name} = state) do 675 | new_count = runner_count(state) + 1 676 | 677 | task = 678 | Task.Supervisor.async_nolink(state.task_sup, fn -> 679 | if on_grow_start, do: on_grow_start.(%{count: new_count, name: name, pid: self()}) 680 | 681 | start_child_runner(state) 682 | end) 683 | 684 | new_pending = Map.put(state.pending_runners, task.ref, task.pid) 685 | %{state | pending_runners: new_pending} 686 | end 687 | 688 | defp start_child_runner(%Pool{} = state, runner_opts \\ []) do 689 | opts = Keyword.merge(state.runner_opts, runner_opts) 690 | name = Module.concat(state.name, "Runner#{map_size(state.runners) + 1}") 691 | 692 | spec = %{ 693 | id: name, 694 | start: {FLAME.Runner, :start_link, [opts]}, 695 | restart: :temporary 696 | } 697 | 698 | {:ok, pid} = DynamicSupervisor.start_child(state.runner_sup, spec) 699 | 700 | try do 701 | case Runner.remote_boot(pid, state.base_sync_stream) do 702 | :ok -> {:ok, pid} 703 | {:error, reason} -> {:error, reason} 704 | end 705 | catch 706 | {:exit, reason} -> {:error, {:exit, reason}} 707 | end 708 | end 709 | 710 | defp put_runner(%Pool{} = state, pid) when is_pid(pid) do 711 | ref = Process.monitor(pid) 712 | runner = %RunnerState{count: 0, pid: pid, monitor_ref: ref} 713 | new_state = %{state | runners: Map.put(state.runners, runner.monitor_ref, runner)} 714 | {runner, new_state} 715 | end 716 | 717 | defp inc_runner_count(%Pool{} = state, ref) do 718 | new_runners = 719 | Map.update!(state.runners, ref, fn %RunnerState{} = runner -> 720 | %{runner | count: runner.count + 1} 721 | end) 722 | 723 | %{state | runners: new_runners} 724 | end 725 | 726 | defp dec_runner_count(%Pool{} = state, ref) do 727 | new_runners = 728 | Map.update!(state.runners, ref, fn %RunnerState{} = runner -> 729 | %{runner | count: runner.count - 1} 730 | end) 731 | 732 | %{state | runners: new_runners} 733 | end 734 | 735 | defp drop_child_runner(%Pool{} = state, runner_ref) when is_reference(runner_ref) do 736 | %{^runner_ref => %RunnerState{}} = state.runners 737 | Process.demonitor(runner_ref, [:flush]) 738 | 739 | # kill all callers that still had a checkout for this runner 740 | new_state = 741 | Enum.reduce(state.callers, state, fn 742 | {caller_pid, %Caller{monitor_ref: ref, runner_ref: ^runner_ref}}, acc -> 743 | Process.demonitor(ref, [:flush]) 744 | Process.exit(caller_pid, :kill) 745 | %{acc | callers: Map.delete(acc.callers, caller_pid)} 746 | 747 | {_caller_pid, %Caller{}}, acc -> 748 | acc 749 | end) 750 | 751 | maybe_on_shrink(%{new_state | runners: Map.delete(new_state.runners, runner_ref)}) 752 | end 753 | 754 | defp drop_caller(%Pool{} = state, caller_pid, %Caller{} = caller) when is_pid(caller_pid) do 755 | new_state 
= %{state | callers: Map.delete(state.callers, caller_pid)} 756 | 757 | new_state 758 | |> dec_runner_count(caller.runner_ref) 759 | |> call_next_waiting_caller() 760 | end 761 | 762 | defp maybe_drop_waiting(%Pool{} = state, caller_pid) when is_pid(caller_pid) do 763 | %{state | waiting: Queue.delete_by_key(state.waiting, caller_pid)} 764 | end 765 | 766 | defp pop_next_waiting_caller(%Pool{} = state) do 767 | result = 768 | Queue.pop_until(state.waiting, fn _pid, %WaitingState{} = waiting -> 769 | %WaitingState{from: {pid, _}, monitor_ref: ref, deadline: deadline} = waiting 770 | # we don't need to reply to waiting callers because they will either have died 771 | # or execeeded their own deadline handled by receive + after 772 | if Process.alive?(pid) and not deadline_expired?(deadline) do 773 | true 774 | else 775 | Process.demonitor(ref, [:flush]) 776 | false 777 | end 778 | end) 779 | 780 | case result do 781 | {nil, %Queue{} = new_waiting} -> {nil, %{state | waiting: new_waiting}} 782 | {{_pid, %WaitingState{} = first}, %Queue{} = rest} -> {first, %{state | waiting: rest}} 783 | end 784 | end 785 | 786 | defp call_next_waiting_caller(%Pool{} = state) do 787 | case pop_next_waiting_caller(state) do 788 | {nil, new_state} -> 789 | new_state 790 | 791 | {%WaitingState{} = first, new_state} -> 792 | # checkout_runner will borrow already running monitor 793 | checkout_runner(new_state, first.deadline, first.from, first.monitor_ref) 794 | end 795 | end 796 | 797 | defp handle_down(%Pool{} = state, {:DOWN, ref, :process, pid, reason}) do 798 | state = maybe_drop_waiting(state, pid) 799 | 800 | state = 801 | case state.callers do 802 | %{^pid => %Caller{monitor_ref: ^ref} = caller} -> 803 | drop_caller(state, pid, caller) 804 | 805 | %{} -> 806 | state 807 | end 808 | 809 | state = 810 | case state.runners do 811 | %{^ref => _} -> drop_child_runner(state, ref) 812 | %{} -> state 813 | end 814 | 815 | case state.pending_runners do 816 | %{^ref => _} -> 817 | state = %{state | pending_runners: Map.delete(state.pending_runners, ref)} 818 | # we rate limit this to avoid many failed async boot attempts 819 | if has_unmet_servicable_demand?(state) do 820 | state 821 | |> maybe_on_grow_end(pid, {:exit, reason}) 822 | |> schedule_async_boot_runner() 823 | else 824 | maybe_on_grow_end(state, pid, {:exit, reason}) 825 | end 826 | 827 | %{} -> 828 | state 829 | end 830 | end 831 | 832 | defp maybe_on_grow_end(%Pool{on_grow_end: on_grow_end} = state, pid, result) do 833 | new_count = runner_count(state) 834 | meta = %{count: new_count, name: state.name, pid: pid} 835 | 836 | case result do 837 | :ok -> if on_grow_end, do: on_grow_end.(:ok, meta) 838 | {:exit, reason} -> if on_grow_end, do: on_grow_end.({:exit, reason}, meta) 839 | end 840 | 841 | state 842 | end 843 | 844 | defp maybe_on_shrink(%Pool{} = state) do 845 | new_count = runner_count(state) 846 | if state.on_shrink, do: state.on_shrink.(%{count: new_count, name: state.name}) 847 | 848 | state 849 | end 850 | 851 | defp has_unmet_servicable_demand?(%Pool{} = state) do 852 | waiting_count(state) > map_size(state.pending_runners) * state.max_concurrency and 853 | runner_count(state) < state.max 854 | end 855 | 856 | defp handle_runner_async_up(%Pool{} = state, pid, ref) when is_pid(pid) and is_reference(ref) do 857 | %{^ref => task_pid} = state.pending_runners 858 | Process.demonitor(ref, [:flush]) 859 | 860 | new_state = %{state | pending_runners: Map.delete(state.pending_runners, ref)} 861 | {runner, new_state} = put_runner(new_state, pid) 862 
| new_state = maybe_on_grow_end(new_state, task_pid, :ok) 863 | 864 | # pop waiting callers up to max_concurrency, but we must handle: 865 | # 1. the case where we have no waiting callers 866 | # 2. the case where we process a DOWN for the new runner as we pop DOWNs 867 | # looking for fresh waiting 868 | # 3. if we still have waiting callers at the end, boot more runners if we have capacity 869 | Enum.reduce_while(1..state.max_concurrency, new_state, fn i, acc -> 870 | with {:ok, %RunnerState{} = runner} <- Map.fetch(acc.runners, runner.monitor_ref), 871 | true <- i <= acc.max_concurrency do 872 | case pop_next_waiting_caller(acc) do 873 | {%WaitingState{} = next, acc} -> 874 | {:cont, reply_runner_checkout(acc, runner, next.from, next.monitor_ref)} 875 | 876 | {nil, acc} -> 877 | {:halt, acc} 878 | end 879 | else 880 | _ -> {:halt, acc} 881 | end 882 | end) 883 | end 884 | 885 | defp deadline(timeout) when is_integer(timeout) do 886 | t1 = System.monotonic_time() 887 | {t1, t1 + System.convert_time_unit(timeout, :millisecond, :native)} 888 | end 889 | 890 | defp deadline(:infinity) do 891 | {System.monotonic_time(), :infinity} 892 | end 893 | 894 | defp deadline_expired?(deadline) when is_integer(deadline) do 895 | System.monotonic_time() >= deadline 896 | end 897 | 898 | defp deadline_expired?(:infinity), do: false 899 | end 900 | -------------------------------------------------------------------------------- /lib/flame/pool/cleaner.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Pool.Cleaner do 2 | @moduledoc false 3 | use GenServer 4 | 5 | def start_link(opts) do 6 | GenServer.start_link(__MODULE__, opts, name: Keyword.fetch!(opts, :name)) 7 | end 8 | 9 | def watch_path(server, path) do 10 | GenServer.call(server, {:watch, path}) 11 | end 12 | 13 | def list_paths(server) do 14 | GenServer.call(server, :list) 15 | end 16 | 17 | def init(_opts) do 18 | Process.flag(:trap_exit, true) 19 | {:ok, %{paths: []}} 20 | end 21 | 22 | def handle_call({:watch, path}, _from, state) do 23 | {:reply, :ok, %{state | paths: [path | state.paths]}} 24 | end 25 | 26 | def handle_call(:list, _from, state) do 27 | {:reply, state.paths, state} 28 | end 29 | 30 | def terminate(_reason, state) do 31 | for path <- state.paths, do: File.rm!(path) 32 | 33 | :ok 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/flame/pool/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Pool.Supervisor do 2 | @moduledoc false 3 | 4 | use Supervisor 5 | 6 | def start_link(opts) do 7 | name = Keyword.fetch!(opts, :name) 8 | pool_sup = Module.concat(name, "PoolSup") 9 | 10 | Supervisor.start_link(__MODULE__, opts, name: pool_sup) 11 | end 12 | 13 | def init(opts) do 14 | name = Keyword.fetch!(opts, :name) 15 | runner_sup = Module.concat(name, "RunnerSup") 16 | cleaner = Module.concat(name, "Cleaner") 17 | terminator_sup = Module.concat(name, "TerminatorSup") 18 | task_sup = Module.concat(name, "TaskSup") 19 | 20 | child_placement_sup = 21 | Keyword.get(opts, :child_placement_sup, FLAME.ChildPlacementSup) 22 | 23 | pool_opts = 24 | Keyword.merge(opts, 25 | task_sup: task_sup, 26 | cleaner: cleaner, 27 | runner_sup: runner_sup, 28 | terminator_sup: terminator_sup, 29 | child_placement_sup: child_placement_sup 30 | ) 31 | 32 | children = 33 | [ 34 | {FLAME.Pool.Cleaner, name: cleaner}, 35 | {Task.Supervisor, name: task_sup, strategy: :one_for_one}, 36 | 
{DynamicSupervisor, name: runner_sup, strategy: :one_for_one}, 37 | {DynamicSupervisor, name: terminator_sup, strategy: :one_for_one}, 38 | %{ 39 | id: {FLAME.Pool, Keyword.fetch!(opts, :name)}, 40 | start: {FLAME.Pool, :start_link, [pool_opts]}, 41 | type: :worker 42 | } 43 | ] 44 | 45 | Supervisor.init(children, strategy: :one_for_all) 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /lib/flame/queue.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Queue do 2 | @moduledoc false 3 | # Provides a FIFO queue with secondary key lookup/delete support. 4 | 5 | defstruct tree: :gb_trees.empty(), keys: %{}, idx: 0 6 | 7 | alias FLAME.Queue 8 | 9 | @doc """ 10 | Builds a new queue. 11 | """ 12 | def new, do: %FLAME.Queue{} 13 | 14 | @doc """ 15 | Returns the size of the queue. 16 | """ 17 | def size(%Queue{} = queue), do: :gb_trees.size(queue.tree) 18 | 19 | @doc """ 20 | Inserts a new item into the queue with a secondary key. 21 | """ 22 | def insert(%Queue{idx: idx} = queue, item, key) do 23 | new_tree = :gb_trees.insert(idx, {key, item}, queue.tree) 24 | new_keys = Map.put(queue.keys, key, idx) 25 | %{queue | tree: new_tree, keys: new_keys, idx: idx + 1} 26 | end 27 | 28 | @doc """ 29 | Pops an item from the queue returning the key/item pair. 30 | 31 | Returns `{nil, new_queue}` when the queue is empty. 32 | 33 | ## Examples 34 | 35 | iex> queue = Queue.insert(Queue.new(), "item1", :key1) 36 | iex> {{:key1, "item1"}, %Queue{} = new_queue} = Queue.pop(queue) 37 | iex> {nil, %Queue{} = new_queue} = Queue.pop(queue) 38 | """ 39 | def pop(%Queue{tree: tree, keys: keys, idx: idx} = queue) do 40 | if size(queue) > 0 do 41 | {_smallest_idx, {key, val}, new_tree} = :gb_trees.take_smallest(tree) 42 | new_keys = Map.delete(keys, key) 43 | new_idx = if :gb_trees.is_empty(new_tree), do: 0, else: idx 44 | {{key, val}, %{queue | tree: new_tree, keys: new_keys, idx: new_idx}} 45 | else 46 | {nil, queue} 47 | end 48 | end 49 | 50 | @doc """ 51 | Pops items from the queue until the function returns true. 52 | 53 | Returns the first key/item pair for which the function returns true, and the new queue. 54 | """ 55 | def pop_until(%Queue{} = queue, func) when is_function(func, 2) do 56 | case pop(queue) do 57 | {nil, %Queue{} = new_queue} -> 58 | {nil, new_queue} 59 | 60 | {{key, item}, %Queue{} = new_queue} -> 61 | if func.(key, item) do 62 | {{key, item}, new_queue} 63 | else 64 | pop_until(new_queue, func) 65 | end 66 | end 67 | end 68 | 69 | @doc """ 70 | Looks up an item by key. 71 | 72 | Returns `nil` for unknown keys. 73 | 74 | ## Examples 75 | 76 | queue = Queue.insert(Queue.new(), "item1", :key1) 77 | "item1" = Queue.get_by_key(queue, :key1) 78 | """ 79 | def get_by_key(%Queue{} = queue, key) do 80 | case queue.keys do 81 | %{^key => idx} -> 82 | {:value, {^key, item}} = :gb_trees.lookup(idx, queue.tree) 83 | item 84 | 85 | %{} -> 86 | nil 87 | end 88 | end 89 | 90 | @doc """ 91 | Deletes an item by key. 92 | 93 | Unknown keys are ignored. 
94 | 95 | ## Examples 96 | 97 | queue = Queue.insert(Queue.new(), "item1", :key1) 98 | new_queue = Queue.delete_by_key(queue, :key1) 99 | """ 100 | def delete_by_key(%Queue{tree: tree, keys: keys} = queue, key) do 101 | case keys do 102 | %{^key => index} -> 103 | new_tree = :gb_trees.delete_any(index, tree) 104 | new_keys = Map.delete(keys, key) 105 | new_idx = if :gb_trees.is_empty(new_tree), do: 0, else: queue.idx 106 | %{queue | tree: new_tree, keys: new_keys, idx: new_idx} 107 | 108 | %{} -> 109 | queue 110 | end 111 | end 112 | end 113 | -------------------------------------------------------------------------------- /lib/flame/runner.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Runner do 2 | @moduledoc false 3 | # ## Runners 4 | 5 | # In practice, users utilize the `FLAME.call/3` and `FLAME.cast/3` functions 6 | # to accomplish their work. These functions are backed by a `FLAME.Pool` of 7 | # `FLAME.Runner`'s 8 | # 9 | # A `FLAME.Runner` is responsible for booting a new node, and executing concurrent 10 | # functions on it. For example: 11 | # 12 | # {:ok, runner} = Runner.start_link(backend: FLAME.FlyBackend) 13 | # :ok = Runner.remote_boot(runner) 14 | # Runner.call(runner, fn -> :operation1 end) 15 | # Runner.shutdown(runner) 16 | # 17 | # When a caller exits or crashes, the remote node will automatically be terminated. 18 | # For distributed erlang backends, like `FLAME.FlyBackend`, this will be 19 | # accomplished automatically by the `FLAME.Terminator`, but other methods 20 | # are possible. 21 | 22 | use GenServer 23 | require Logger 24 | 25 | alias FLAME.{Runner, Terminator, CodeSync} 26 | 27 | @derive {Inspect, 28 | only: [ 29 | :id, 30 | :backend, 31 | :terminator, 32 | :instance_id, 33 | :private_ip, 34 | :node_name, 35 | :single_use, 36 | :timeout, 37 | :status, 38 | :log, 39 | :boot_timeout, 40 | :idle_shutdown_after, 41 | :idle_shutdown_check 42 | ]} 43 | 44 | defstruct id: nil, 45 | instance_id: nil, 46 | private_ip: nil, 47 | backend: nil, 48 | terminator: nil, 49 | backend_init: nil, 50 | node_name: nil, 51 | single_use: false, 52 | timeout: 30_000, 53 | status: nil, 54 | log: :info, 55 | boot_timeout: 10_000, 56 | shutdown_timeout: 5_000, 57 | idle_shutdown_after: nil, 58 | idle_shutdown_check: nil, 59 | code_sync_opts: false, 60 | code_sync: nil 61 | 62 | @doc """ 63 | Starts a runner. 64 | 65 | ## Options 66 | 67 | `:backend` - The `Flame.Backend` implementation to use 68 | `:log` - The log level, or `false` 69 | `:single_use` - The boolean on whether to terminate the runner after it's first call 70 | `:timeout` - The execution timeout of calls 71 | `:boot_timeout` - The boot timeout of the runner 72 | `:shutdown_timeout` - The shutdown timeout 73 | `:idle_shutdown_after` - The idle shutdown time 74 | `:code_sync` - The code sync options. See the `FLAME.Pool` module for more information. 75 | """ 76 | def start_link(opts \\ []) do 77 | GenServer.start_link(__MODULE__, opts) 78 | end 79 | 80 | def shutdown(runner, timeout \\ nil) when is_pid(runner) do 81 | GenServer.call(runner, {:runner_shutdown, timeout}) 82 | end 83 | 84 | @doc """ 85 | Boots the remote runner using the `FLAME.Backend`. 86 | """ 87 | def remote_boot(pid, base_sync_stream, timeout \\ nil) when is_pid(pid) do 88 | GenServer.call(pid, {:remote_boot, base_sync_stream, timeout}, timeout || :infinity) 89 | end 90 | 91 | @doc """ 92 | Places a child process on the remote node. 
93 | 94 | The started child spec will be rewritten to use the `:temporary` restart strategy 95 | to ensure that the child is not restarted if it exits. If you want restart 96 | behavior, you must monitor the process yourself on the parent node and replace it. 97 | """ 98 | def place_child(runner_pid, child_spec, opts) 99 | when is_pid(runner_pid) and is_list(opts) do 100 | # we must rewrite :temporary restart strategy for the spec to avoid restarting placed children 101 | new_spec = Supervisor.child_spec(child_spec, restart: :temporary) 102 | caller_pid = self() 103 | link? = Keyword.get(opts, :link, true) 104 | 105 | call( 106 | runner_pid, 107 | caller_pid, 108 | fn terminator -> 109 | Terminator.place_child(terminator, caller_pid, link?, new_spec) 110 | end, 111 | opts 112 | ) 113 | end 114 | 115 | @doc """ 116 | Calls a function on the remote node. 117 | """ 118 | def call(runner_pid, caller_pid, func, opts \\ []) 119 | when is_pid(runner_pid) and is_pid(caller_pid) and is_function(func) and is_list(opts) do 120 | link? = Keyword.get(opts, :link, true) 121 | track_resources? = Keyword.get(opts, :track_resources, false) 122 | {ref, %Runner{} = runner, backend_state} = checkout(runner_pid) 123 | %Runner{terminator: terminator} = runner 124 | call_timeout = opts[:timeout] || runner.timeout 125 | 126 | result = 127 | remote_call(runner, backend_state, call_timeout, track_resources?, fn -> 128 | if link?, do: Process.link(caller_pid) 129 | :ok = Terminator.deadline_me(terminator, call_timeout) 130 | if is_function(func, 1), do: func.(terminator), else: func.() 131 | end) 132 | 133 | case result do 134 | {:ok, {value, trackable_pids}} -> 135 | :ok = checkin(runner_pid, ref, trackable_pids) 136 | {value, trackable_pids} 137 | 138 | {:exit, reason} -> 139 | :ok = checkin(runner_pid, ref, []) 140 | exit(reason) 141 | end 142 | end 143 | 144 | defp checkout(runner_pid) do 145 | GenServer.call(runner_pid, :checkout) 146 | end 147 | 148 | defp checkin(runner_pid, ref, trackable_pids) do 149 | GenServer.call(runner_pid, {:checkin, ref, trackable_pids}) 150 | end 151 | 152 | @impl true 153 | def init(opts) do 154 | runner = new(opts) 155 | 156 | case runner.backend_init do 157 | {:ok, backend_state} -> 158 | state = %{ 159 | runner: runner, 160 | checkouts: %{}, 161 | backend_state: backend_state, 162 | otp_app: if(otp_app = System.get_env("RELEASE_NAME"), do: String.to_atom(otp_app)) 163 | } 164 | 165 | {:ok, state} 166 | 167 | {:error, reason} -> 168 | {:stop, reason} 169 | end 170 | end 171 | 172 | @impl true 173 | def handle_info({:DOWN, ref, :process, pid, reason} = msg, state) do 174 | %{runner: %Runner{} = runner} = state 175 | 176 | case runner do 177 | %Runner{terminator: ^pid} -> 178 | {:stop, reason, state} 179 | 180 | %Runner{terminator: _} -> 181 | case state.checkouts do 182 | %{^ref => _from_pid} -> 183 | new_state = drop_checkout(state, ref) 184 | 185 | if runner.single_use do 186 | {:stop, reason, new_state} 187 | else 188 | {:noreply, new_state} 189 | end 190 | 191 | %{} -> 192 | {:noreply, maybe_backend_handle_info(state, msg)} 193 | end 194 | end 195 | end 196 | 197 | def handle_info({_ref, {:remote_shutdown, reason}}, state) do 198 | {:stop, {:shutdown, reason}, state} 199 | end 200 | 201 | def handle_info(msg, state) do 202 | {:noreply, maybe_backend_handle_info(state, msg)} 203 | end 204 | 205 | defp maybe_backend_handle_info(state, msg) do 206 | %Runner{backend: backend} = state.runner 207 | 208 | if function_exported?(backend, :handle_info, 2) do 209 | case 
backend.handle_info(msg, state.backend_state) do 210 | {:noreply, new_backend_state} -> 211 | %{state | backend_state: new_backend_state} 212 | 213 | other -> 214 | raise ArgumentError, 215 | "expected #{inspect(backend)}.handle_info/2 to return {:noreply, state}, got: #{inspect(other)}" 216 | end 217 | else 218 | state 219 | end 220 | end 221 | 222 | @impl true 223 | def handle_call({:runner_shutdown, timeout}, _from, state) do 224 | %{runner: runner} = state 225 | timeout = timeout || runner.shutdown_timeout 226 | ref = make_ref() 227 | parent = self() 228 | %Runner{terminator: terminator} = runner 229 | 230 | state = drain_checkouts(state, timeout) 231 | 232 | {:ok, {remote_pid, remote_monitor_ref}} = 233 | runner.backend.remote_spawn_monitor(state.backend_state, fn -> 234 | :ok = Terminator.system_shutdown(terminator) 235 | send(parent, {ref, :ok}) 236 | end) 237 | 238 | receive do 239 | {^ref, :ok} -> 240 | {:stop, :normal, :ok, state} 241 | 242 | {:DOWN, ^remote_monitor_ref, :process, ^remote_pid, reason} -> 243 | {:stop, {:shutdown, reason}, {:error, reason}, state} 244 | after 245 | timeout -> exit(:timeout) 246 | end 247 | end 248 | 249 | def handle_call(:checkout, {from_pid, _tag}, state) do 250 | state = 251 | case maybe_diff_code_paths(state) do 252 | {new_state, nil} -> 253 | new_state 254 | 255 | {new_state, %CodeSync.PackagedStream{} = parent_pkg} -> 256 | terminator = state.runner.terminator 257 | 258 | remote_call!(state.runner, state.backend_state, state.runner.boot_timeout, false, fn -> 259 | if extract_dir = CodeSync.extract_packaged_stream(parent_pkg) do 260 | FLAME.Terminator.watch_path(terminator, extract_dir) 261 | end 262 | end) 263 | 264 | CodeSync.rm_packaged_stream(parent_pkg) 265 | 266 | new_state 267 | end 268 | 269 | {new_state, ref} = put_checkout(state, from_pid) 270 | {:reply, {ref, new_state.runner, new_state.backend_state}, new_state} 271 | end 272 | 273 | def handle_call({:checkin, ref, trackable_pids}, _from, state) do 274 | Process.demonitor(ref, [:flush]) 275 | 276 | new_state = 277 | Enum.reduce(trackable_pids, state, fn pid, acc -> 278 | {acc, _ref} = put_checkout(acc, pid) 279 | acc 280 | end) 281 | 282 | {:reply, :ok, drop_checkout(new_state, ref)} 283 | end 284 | 285 | def handle_call({:remote_boot, base_sync_stream, _timeout}, _from, state) do 286 | %{runner: runner, backend_state: backend_state, otp_app: otp_app} = state 287 | 288 | case runner.status do 289 | :booted -> 290 | {:reply, {:error, :already_booted}, state} 291 | 292 | :awaiting_boot -> 293 | time(runner, "runner connect", fn -> 294 | case runner.backend.remote_boot(backend_state) do 295 | {:ok, remote_terminator_pid, new_backend_state} when is_pid(remote_terminator_pid) -> 296 | Process.monitor(remote_terminator_pid) 297 | new_runner = %{runner | terminator: remote_terminator_pid, status: :booted} 298 | new_state = %{state | runner: new_runner, backend_state: new_backend_state} 299 | {new_state, beams_stream} = maybe_stream_code_paths(new_state) 300 | 301 | %Runner{ 302 | single_use: single_use, 303 | idle_shutdown_after: idle_after, 304 | idle_shutdown_check: idle_check, 305 | terminator: term 306 | } = new_runner 307 | 308 | {:ok, _} = 309 | remote_call!(runner, new_backend_state, runner.boot_timeout, false, fn -> 310 | # ensure app is fully started if parent connects before up 311 | if otp_app, do: {:ok, _} = Application.ensure_all_started(otp_app) 312 | 313 | if extract_dir = 314 | base_sync_stream && CodeSync.extract_packaged_stream(base_sync_stream) do 315 | 
FLAME.Terminator.watch_path(term, extract_dir) 316 | end 317 | 318 | if extract_dir = beams_stream && CodeSync.extract_packaged_stream(beams_stream) do 319 | FLAME.Terminator.watch_path(term, extract_dir) 320 | end 321 | 322 | :ok = 323 | Terminator.schedule_idle_shutdown(term, idle_after, idle_check, single_use) 324 | 325 | :ok 326 | end) 327 | 328 | {:reply, :ok, new_state} 329 | 330 | {:error, reason} -> 331 | {:stop, {:shutdown, reason}, state} 332 | 333 | other -> 334 | raise ArgumentError, 335 | "expected #{inspect(runner.backend)}.remote_boot/1 to return {:ok, remote_terminator_pid, new_state} | {:error, reason}, got: #{inspect(other)}" 336 | end 337 | end) 338 | end 339 | end 340 | 341 | @doc false 342 | def new(opts) when is_list(opts) do 343 | opts = 344 | Keyword.validate!(opts, [ 345 | :backend, 346 | :log, 347 | :single_use, 348 | :timeout, 349 | :boot_timeout, 350 | :shutdown_timeout, 351 | :idle_shutdown_after, 352 | :code_sync 353 | ]) 354 | 355 | Keyword.validate!(opts[:code_sync] || [], [ 356 | :get_path, 357 | :copy_apps, 358 | :copy_paths, 359 | :sync_beams, 360 | :start_apps, 361 | :tmp_dir, 362 | :extract_dir, 363 | :verbose, 364 | :compress, 365 | :chunk_size 366 | ]) 367 | 368 | {idle_shutdown_after_ms, idle_check} = 369 | case Keyword.fetch(opts, :idle_shutdown_after) do 370 | {:ok, :infinity} -> {:infinity, fn -> false end} 371 | {:ok, ms} when is_integer(ms) -> {ms, fn -> true end} 372 | {:ok, {ms, func}} when is_integer(ms) and is_function(func, 0) -> {ms, func} 373 | other when other in [{:ok, nil}, :error] -> {30_000, fn -> true end} 374 | end 375 | 376 | runner = 377 | %Runner{ 378 | status: :awaiting_boot, 379 | backend: :pending, 380 | backend_init: :pending, 381 | log: Keyword.get(opts, :log, false), 382 | single_use: Keyword.get(opts, :single_use, false), 383 | timeout: opts[:timeout] || 30_000, 384 | boot_timeout: opts[:boot_timeout] || 30_000, 385 | shutdown_timeout: opts[:shutdown_timeout] || 30_000, 386 | idle_shutdown_after: idle_shutdown_after_ms, 387 | idle_shutdown_check: idle_check, 388 | terminator: nil, 389 | code_sync_opts: Keyword.get(opts, :code_sync, false) 390 | } 391 | 392 | base_backend_opts = Keyword.take(opts, [:boot_timeout]) 393 | 394 | {backend, backend_init} = 395 | case Keyword.fetch!(opts, :backend) do 396 | backend when is_atom(backend) -> 397 | backend_opts = 398 | Keyword.merge(base_backend_opts, Application.get_env(:flame, backend) || []) 399 | 400 | {backend, backend.init(backend_opts)} 401 | 402 | {backend, backend_opts} when is_atom(backend) and is_list(backend_opts) -> 403 | {backend, backend.init(Keyword.merge(base_backend_opts, backend_opts))} 404 | end 405 | 406 | %{runner | backend: backend, backend_init: backend_init} 407 | end 408 | 409 | defp time(%Runner{log: false} = _runner, _label, func) do 410 | func.() 411 | end 412 | 413 | # TODO move this to telemetry 414 | defp time(%Runner{log: level}, label, func) do 415 | Logger.log(level, "#{label}: start") 416 | {elapsed_micro, result} = :timer.tc(func) 417 | millisec = System.convert_time_unit(elapsed_micro, :microsecond, :millisecond) 418 | Logger.log(level, "#{label}: completed in #{millisec}ms") 419 | result 420 | end 421 | 422 | defp put_checkout(state, from_pid) when is_pid(from_pid) do 423 | ref = Process.monitor(from_pid) 424 | {%{state | checkouts: Map.put(state.checkouts, ref, from_pid)}, ref} 425 | end 426 | 427 | defp drop_checkout(state, ref) when is_reference(ref) do 428 | %{^ref => _from_pid} = state.checkouts 429 | %{state | checkouts: 
Map.delete(state.checkouts, ref)} 430 | end 431 | 432 | defp remote_call!(%Runner{} = runner, backend_state, timeout, track_resources?, func) do 433 | case remote_call(runner, backend_state, timeout, track_resources?, func) do 434 | {:ok, value} -> value 435 | {:exit, reason} -> exit(reason) 436 | end 437 | end 438 | 439 | defp remote_call(%Runner{} = runner, backend_state, timeout, track_resources?, func) do 440 | %{terminator: terminator} = runner 441 | parent_ref = make_ref() 442 | parent = self() 443 | 444 | {:ok, {remote_pid, remote_monitor_ref}} = 445 | runner.backend.remote_spawn_monitor(backend_state, fn -> 446 | # This runs on the remote node 447 | result = func.() 448 | send(parent, {parent_ref, result}) 449 | 450 | if track_resources? do 451 | monitor_ref = Process.monitor(parent) 452 | 453 | receive do 454 | {^parent_ref, [_ | _] = to_watch} -> 455 | Terminator.watch(terminator, to_watch) 456 | # Hold the result until here so they are not premature garbage collected 457 | __MODULE__.identity(result) 458 | 459 | {^parent_ref, []} -> 460 | :ok 461 | 462 | {:DOWN, ^monitor_ref, _, _, _} -> 463 | :ok 464 | end 465 | end 466 | 467 | :ok 468 | end) 469 | 470 | receive do 471 | {^parent_ref, result} -> 472 | Process.demonitor(remote_monitor_ref, [:flush]) 473 | 474 | if track_resources? do 475 | {result, pids} = FLAME.track_resources(result, [], node(remote_pid)) 476 | send(remote_pid, {parent_ref, pids}) 477 | {:ok, {result, pids}} 478 | else 479 | {:ok, {result, []}} 480 | end 481 | 482 | {:DOWN, ^remote_monitor_ref, :process, ^remote_pid, reason} -> 483 | case reason do 484 | :killed -> {:exit, :timeout} 485 | other -> {:exit, other} 486 | end 487 | 488 | {:EXIT, ^remote_pid, reason} -> 489 | {:exit, reason} 490 | after 491 | timeout -> 492 | {:exit, :timeout} 493 | end 494 | end 495 | 496 | @doc """ 497 | Used to avoid garbage collection of remote terms. 
498 | """ 499 | def identity(term), do: term 500 | 501 | @drain_timeout :drain_timeout 502 | defp drain_checkouts(state, timeout) do 503 | case state.checkouts do 504 | checkouts when checkouts == %{} -> 505 | state 506 | 507 | checkouts -> 508 | Process.send_after(self(), @drain_timeout, timeout) 509 | 510 | Enum.reduce(checkouts, state, fn {ref, _from_pid}, acc -> 511 | receive do 512 | {:checkin, ^ref} -> drop_checkout(acc, ref) 513 | {:DOWN, ^ref, :process, _pid, _reason} -> drop_checkout(acc, ref) 514 | @drain_timeout -> exit(:timeout) 515 | end 516 | end) 517 | end 518 | end 519 | 520 | defp maybe_stream_code_paths(%{runner: %Runner{} = runner} = state) do 521 | if code_sync_opts = runner.code_sync_opts do 522 | code_sync = 523 | code_sync_opts 524 | |> CodeSync.new() 525 | |> CodeSync.compute_sync_beams() 526 | 527 | %CodeSync.PackagedStream{} = parent_stream = CodeSync.package_to_stream(code_sync) 528 | new_runner = %{runner | code_sync: code_sync} 529 | {%{state | runner: new_runner}, parent_stream} 530 | else 531 | {state, nil} 532 | end 533 | end 534 | 535 | defp maybe_diff_code_paths(%{runner: %Runner{} = runner} = state) do 536 | if runner.code_sync do 537 | diffed_code = CodeSync.diff(runner.code_sync) 538 | new_runner = %{runner | code_sync: diffed_code} 539 | new_state = %{state | runner: new_runner} 540 | 541 | if CodeSync.changed?(diffed_code) do 542 | %CodeSync.PackagedStream{} = parent_stream = CodeSync.package_to_stream(diffed_code) 543 | {new_state, parent_stream} 544 | else 545 | {new_state, nil} 546 | end 547 | else 548 | {state, nil} 549 | end 550 | end 551 | end 552 | -------------------------------------------------------------------------------- /lib/flame/terminator.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Terminator.Caller do 2 | @moduledoc false 3 | 4 | defstruct from_pid: nil, timer: nil, placed_child_ref: nil, placed_caller_ref: nil, link?: false 5 | end 6 | 7 | defmodule FLAME.Terminator do 8 | @moduledoc false 9 | # The terminator is responsible for ensuring RPC deadlines and parent monitoring. 10 | 11 | # All FLAME calls are deadlined with a timeout. The runners will spawn a 12 | # function on a remote node, check in with the terminator and ask to be deadlined 13 | # with a timeout, and then perform their work. If the process exists beyond the 14 | # deadline, it is forcefully killed by the terminator. The termintor also ensures 15 | # a configured shutdown timeout to give existing RPC calls time to finish when 16 | # the system is shutting down. 17 | 18 | # The Terminator also handles connecting back to the parent node and monitoring 19 | # it when the node is started as FLAME child. If the connection is not 20 | # established with a failsafe time, or connection is lost, the system is shut 21 | # down by the terminator. 
22 | use GenServer 23 | 24 | require Logger 25 | 26 | alias FLAME.{Terminator, Parent} 27 | alias FLAME.Terminator.Caller 28 | 29 | defstruct parent: nil, 30 | parent_monitor_ref: nil, 31 | child_placement_sup: nil, 32 | single_use: false, 33 | calls: %{}, 34 | watchers: %{}, 35 | paths: [], 36 | log: false, 37 | status: nil, 38 | failsafe_timer: nil, 39 | connect_timer: nil, 40 | connect_attempts: 0, 41 | idle_shutdown_after: nil, 42 | idle_shutdown_check: nil, 43 | idle_shutdown_timer: nil 44 | 45 | def child_spec(opts) do 46 | %{ 47 | id: {FLAME.Terminator.Supervisor, Keyword.fetch!(opts, :name)}, 48 | start: {FLAME.Terminator.Supervisor, :start_link, [opts]}, 49 | type: :supervisor 50 | } 51 | end 52 | 53 | @doc """ 54 | Starts the Terminator. 55 | 56 | ## Options 57 | 58 | * `:name` – The optional name of the GenServer. 59 | 60 | * `:parent` – The `%FLAME.Parent{}` of the parent runner. 61 | Defaults to lookup from `FLAME.Parent.get/0`. 62 | 63 | * `:failsafe_timeout` - The time to wait for a connection to the parent node 64 | before shutting down the system. Defaults to 2 seconds. 65 | 66 | * `:log` - The optional logging level. Defaults `false`. 67 | """ 68 | def start_link(opts) do 69 | Keyword.validate!(opts, [:name, :parent, :child_placement_sup, :failsafe_timeout, :log]) 70 | GenServer.start_link(__MODULE__, opts, name: opts[:name]) 71 | end 72 | 73 | def watch(terminator, pids) do 74 | GenServer.call(terminator, {:watch, pids}) 75 | end 76 | 77 | def watch_path(terminator, path) do 78 | GenServer.call(terminator, {:watch_path, path}) 79 | end 80 | 81 | def deadline_me(terminator, timeout) do 82 | GenServer.call(terminator, {:deadline, timeout}) 83 | end 84 | 85 | def schedule_idle_shutdown(terminator, idle_shutdown, idle_check, single_use?) do 86 | GenServer.call(terminator, {:schedule_idle_shutdown, idle_shutdown, idle_check, single_use?}) 87 | end 88 | 89 | def system_shutdown(terminator) when is_pid(terminator) do 90 | GenServer.call(terminator, :system_shutdown) 91 | end 92 | 93 | def place_child(terminator, caller, link?, child_spec) 94 | when is_pid(caller) and is_boolean(link?) do 95 | dynamic_sup = FLAME.Terminator.Supervisor.child_placement_sup_name(terminator) 96 | %{start: start} = child_spec = Supervisor.child_spec(child_spec, []) 97 | gl = Process.group_leader() 98 | 99 | rewritten_start = 100 | {__MODULE__, :start_child_inside_sup, [start, terminator, caller, link?, gl]} 101 | 102 | wrapped_child_spec = %{child_spec | start: rewritten_start} 103 | DynamicSupervisor.start_child(dynamic_sup, wrapped_child_spec) 104 | end 105 | 106 | # This runs inside the supervisor 107 | # We rewrite the child spec in place_child/3 to call this function which starts 108 | # the DynamicSupervisor child inside the child placement supervisor, and notifies the 109 | # terminator via the {:placed_child, caller, child_pid} message. 110 | # This approach allows the caller to place the child outside of terminator, safely. 
111 | def start_child_inside_sup({mod, fun, args}, terminator, caller, link?, gl) do 112 | # We switch the group leader, so that the newly started 113 | # process gets the same group leader as the caller 114 | initial_gl = Process.group_leader() 115 | Process.group_leader(self(), gl) 116 | 117 | try do 118 | {resp, pid} = 119 | case apply(mod, fun, args) do 120 | {:ok, pid} = resp -> {resp, pid} 121 | {:ok, pid, _info} = resp -> {resp, pid} 122 | resp -> {resp, nil} 123 | end 124 | 125 | if pid, do: GenServer.call(terminator, {:placed_child, caller, pid, link?}) 126 | 127 | resp 128 | after 129 | Process.group_leader(self(), initial_gl) 130 | end 131 | end 132 | 133 | @impl true 134 | def init(opts) do 135 | Process.flag(:trap_exit, true) 136 | failsafe_timeout = Keyword.get(opts, :failsafe_timeout, 20_000) 137 | log = Keyword.get(opts, :log, false) 138 | 139 | case opts[:parent] || FLAME.Parent.get() do 140 | nil -> 141 | if log, do: Logger.log(log, "no parent found, :ignore") 142 | :ignore 143 | 144 | %FLAME.Parent{} = parent -> 145 | :global_group.monitor_nodes(true) 146 | failsafe_timer = Process.send_after(self(), :failsafe_shutdown, failsafe_timeout) 147 | 148 | child_placement_sup = 149 | case Keyword.fetch!(opts, :child_placement_sup) do 150 | pid when is_pid(pid) -> pid 151 | name when is_atom(name) -> Process.whereis(name) 152 | end 153 | 154 | state = %Terminator{ 155 | status: :connecting, 156 | child_placement_sup: child_placement_sup, 157 | parent: parent, 158 | calls: %{}, 159 | log: log, 160 | failsafe_timer: failsafe_timer, 161 | idle_shutdown_timer: {nil, nil} 162 | } 163 | 164 | log(state, "starting with parent #{inspect(parent)}") 165 | 166 | {:ok, state, {:continue, :connect}} 167 | end 168 | end 169 | 170 | @impl true 171 | def handle_continue(:connect, %Terminator{} = state) do 172 | {:noreply, connect(state)} 173 | end 174 | 175 | @impl true 176 | def handle_info(:connect, state) do 177 | if state.parent_monitor_ref do 178 | {:noreply, state} 179 | else 180 | {:noreply, connect(state)} 181 | end 182 | end 183 | 184 | def handle_info({:timeout, ref}, state) do 185 | # we can't rely on the ref to be there as Process.cancel_timer may still have delivered 186 | case state.calls do 187 | %{^ref => %Caller{} = caller} -> 188 | Process.demonitor(ref, []) 189 | Process.exit(caller.from_pid, :kill) 190 | {:noreply, drop_caller(state, ref)} 191 | 192 | %{} -> 193 | {:noreply, state} 194 | end 195 | end 196 | 197 | def handle_info({:DOWN, ref, :process, pid, reason}, %Terminator{} = state) do 198 | case state do 199 | %{parent: %{pid: ^pid}} -> 200 | message = "parent pid #{inspect(pid)} went away #{inspect(reason)}. 
Going down"
201 |         {:noreply, system_stop(state, message)}
202 | 
203 |       %{watchers: %{^ref => _} = watchers} ->
204 |         state = %{state | watchers: Map.delete(watchers, ref)}
205 |         {:noreply, maybe_schedule_shutdown(state)}
206 | 
207 |       %{} ->
208 |         {:noreply, drop_caller(state, ref)}
209 |     end
210 |   end
211 | 
212 |   def handle_info({:nodeup, who}, %Terminator{parent: parent} = state) do
213 |     if !state.parent_monitor_ref && who === node(parent.pid) do
214 |       {:noreply, connect(state)}
215 |     else
216 |       {:noreply, state}
217 |     end
218 |   end
219 | 
220 |   def handle_info({:nodedown, who}, %Terminator{parent: parent} = state) do
221 |     if who === node(parent.pid) do
222 |       new_state = system_stop(state, "nodedown #{inspect(who)}")
223 |       {:noreply, new_state}
224 |     else
225 |       {:noreply, state}
226 |     end
227 |   end
228 | 
229 |   def handle_info(:failsafe_shutdown, %Terminator{} = state) do
230 |     new_state = system_stop(state, "failsafe connect timeout")
231 |     {:noreply, new_state}
232 |   end
233 | 
234 |   def handle_info({:idle_shutdown, timer_ref}, %Terminator{parent: parent} = state) do
235 |     {_current_timer, current_timer_ref} = state.idle_shutdown_timer
236 | 
237 |     if timer_ref == current_timer_ref && state.idle_shutdown_check.() do
238 |       send_parent(parent, {:remote_shutdown, :idle})
239 |       new_state = system_stop(state, "idle shutdown")
240 |       {:noreply, new_state}
241 |     else
242 |       {:noreply, schedule_idle_shutdown(state)}
243 |     end
244 |   end
245 | 
246 |   @impl true
247 |   def handle_call({:placed_child, caller, child_pid, link?}, _from, %Terminator{} = state) do
248 |     {child_ref, new_state} = deadline_caller(state, child_pid, :infinity)
249 |     {caller_ref, new_state} = deadline_caller(new_state, caller, :infinity)
250 | 
251 |     new_state =
252 |       new_state
253 |       |> update_caller(child_ref, fn child ->
254 |         %{child | placed_caller_ref: caller_ref, link?: link?}
255 |       end)
256 |       |> update_caller(caller_ref, fn caller ->
257 |         %{caller | placed_child_ref: child_ref, link?: link?}
258 |       end)
259 | 
260 |     {:reply, {:ok, child_pid}, new_state}
261 |   end
262 | 
263 |   def handle_call({:watch, pids}, _from, %Terminator{watchers: watchers} = state) do
264 |     watchers =
265 |       Enum.reduce(pids, watchers, fn pid, acc -> Map.put(acc, Process.monitor(pid), []) end)
266 | 
267 |     state = %{state | watchers: watchers}
268 |     {:reply, :ok, cancel_idle_shutdown(state)}
269 |   end
270 | 
271 |   def handle_call({:watch_path, path}, _from, %Terminator{paths: paths} = state) do
272 |     {:reply, :ok, %{state | paths: [path | paths]}}
273 |   end
274 | 
275 |   def handle_call(:system_shutdown, _from, %Terminator{} = state) do
276 |     {:reply, :ok,
277 |      system_stop(state, "system shutdown instructed from parent #{inspect(state.parent.pid)}")}
278 |   end
279 | 
280 |   def handle_call({:deadline, timeout}, {from_pid, _tag}, %Terminator{} = state) do
281 |     {_ref, new_state} = deadline_caller(state, from_pid, timeout)
282 |     {:reply, :ok, new_state}
283 |   end
284 | 
285 |   def handle_call(
286 |         {:schedule_idle_shutdown, idle_after, idle_check, single_use?},
287 |         _from,
288 |         %Terminator{} = state
289 |       ) do
290 |     new_state = %{
291 |       state
292 |       | single_use: single_use?,
293 |         idle_shutdown_after: idle_after,
294 |         idle_shutdown_check: idle_check
295 |     }
296 | 
297 |     {:reply, :ok, schedule_idle_shutdown(new_state)}
298 |   end
299 | 
300 |   defp clean_up_paths(paths) do
301 |     for path <- paths do
302 |       File.rm_rf(path)
303 |     end
304 |   end
305 | 
306 |   @impl true
307 |   def terminate(_reason, %Terminator{} = state) do
308 |     state =
309 |       state
310 |       |> 
cancel_idle_shutdown() 311 | |> system_stop("terminating") 312 | 313 | # clean up any paths that were watched before waiting to not be killed 314 | clean_up_paths(state.paths) 315 | 316 | # supervisor will force kill us if we take longer than configured shutdown_timeout 317 | Enum.each(state.calls, fn 318 | # skip callers that placed a child since they are on the remote node 319 | {_ref, %Caller{placed_child_ref: ref}} when not is_nil(ref) -> 320 | :ok 321 | 322 | {ref, %Caller{}} -> 323 | receive do 324 | {:DOWN, ^ref, :process, _pid, _reason} -> :ok 325 | end 326 | end) 327 | end 328 | 329 | defp update_caller(%Terminator{} = state, ref, func) 330 | when is_reference(ref) and is_function(func, 1) do 331 | %{state | calls: Map.update!(state.calls, ref, func)} 332 | end 333 | 334 | defp deadline_caller(%Terminator{} = state, from_pid, timeout) 335 | when is_pid(from_pid) and 336 | (is_integer(timeout) or timeout == :infinity) do 337 | ref = Process.monitor(from_pid) 338 | 339 | timer = 340 | case timeout do 341 | :infinity -> nil 342 | ms when is_integer(ms) -> Process.send_after(self(), {:timeout, ref}, ms) 343 | end 344 | 345 | caller = %Caller{from_pid: from_pid, timer: timer} 346 | new_state = %{state | calls: Map.put(state.calls, ref, caller)} 347 | {ref, cancel_idle_shutdown(new_state)} 348 | end 349 | 350 | defp drop_caller(%Terminator{} = state, ref) when is_reference(ref) do 351 | %{^ref => %Caller{} = caller} = state.calls 352 | if caller.timer, do: Process.cancel_timer(caller.timer) 353 | state = %{state | calls: Map.delete(state.calls, ref)} 354 | 355 | # if the caller going down was one that placed a child, and the child is still tracked: 356 | # - if the child is not linked (link: false), do nothing 357 | # - if the child is linked, terminate the child. there is no need to notify the og caller, 358 | # as they linked themselves. 359 | # 360 | # Note: there is also a race where we can't rely on the link to have happened to so we 361 | # must monitor in the terminator even with the remote link 362 | state = 363 | with placed_child_ref <- caller.placed_child_ref, 364 | true <- is_reference(placed_child_ref), 365 | %{^placed_child_ref => %Caller{} = placed_child} <- state.calls, 366 | true <- placed_child.link? do 367 | if placed_child.timer, do: Process.cancel_timer(placed_child.timer) 368 | Process.demonitor(placed_child_ref, [:flush]) 369 | DynamicSupervisor.terminate_child(state.child_placement_sup, placed_child.from_pid) 370 | %{state | calls: Map.delete(state.calls, placed_child_ref)} 371 | else 372 | _ -> state 373 | end 374 | 375 | # if the caller going down was a placed child, clean up the placed caller ref 376 | state = 377 | with placed_caller_ref <- caller.placed_caller_ref, 378 | true <- is_reference(placed_caller_ref), 379 | %{^placed_caller_ref => %Caller{} = placed_caller} <- state.calls do 380 | if placed_caller.timer, do: Process.cancel_timer(placed_caller.timer) 381 | Process.demonitor(placed_caller_ref, [:flush]) 382 | %{state | calls: Map.delete(state.calls, placed_caller_ref)} 383 | else 384 | _ -> state 385 | end 386 | 387 | state = 388 | if state.single_use do 389 | system_stop(state, "single use completed. 
Going down") 390 | else 391 | state 392 | end 393 | 394 | maybe_schedule_shutdown(state) 395 | end 396 | 397 | defp maybe_schedule_shutdown(%{calls: calls, watchers: watchers} = state) do 398 | if map_size(calls) == 0 and map_size(watchers) == 0 do 399 | schedule_idle_shutdown(state) 400 | else 401 | state 402 | end 403 | end 404 | 405 | defp schedule_idle_shutdown(%Terminator{} = state) do 406 | state = cancel_idle_shutdown(state) 407 | 408 | case state.idle_shutdown_after do 409 | time when time in [nil, :infinity] -> 410 | %{state | idle_shutdown_timer: {nil, make_ref()}} 411 | 412 | time when is_integer(time) -> 413 | timer_ref = make_ref() 414 | timer = Process.send_after(self(), {:idle_shutdown, timer_ref}, time) 415 | %{state | idle_shutdown_timer: {timer, timer_ref}} 416 | end 417 | end 418 | 419 | defp cancel_idle_shutdown(%Terminator{} = state) do 420 | {timer, _ref} = state.idle_shutdown_timer 421 | if timer, do: Process.cancel_timer(timer) 422 | %{state | idle_shutdown_timer: {nil, make_ref()}} 423 | end 424 | 425 | defp connect(%Terminator{parent: %Parent{} = parent} = state) do 426 | new_attempts = state.connect_attempts + 1 427 | state.connect_timer && Process.cancel_timer(state.connect_timer) 428 | connected? = Node.connect(node(parent.pid)) 429 | 430 | log(state, "connect (#{new_attempts}) #{inspect(node(parent.pid))}: #{inspect(connected?)}") 431 | 432 | if connected? do 433 | state.failsafe_timer && Process.cancel_timer(state.failsafe_timer) 434 | ref = Process.monitor(parent.pid) 435 | 436 | send_parent(parent, {:remote_up, self()}) 437 | 438 | %{ 439 | state 440 | | status: :connected, 441 | parent_monitor_ref: ref, 442 | failsafe_timer: nil, 443 | connect_timer: nil, 444 | connect_attempts: new_attempts 445 | } 446 | else 447 | %{ 448 | state 449 | | connect_timer: Process.send_after(self(), :connect, 100), 450 | connect_attempts: new_attempts 451 | } 452 | end 453 | end 454 | 455 | defp system_stop(%Terminator{parent: parent} = state, log) do 456 | if state.status != :stopping do 457 | log(state, "#{inspect(__MODULE__)}.system_stop: #{log}") 458 | parent.backend.system_shutdown() 459 | end 460 | 461 | %{state | status: :stopping} 462 | end 463 | 464 | defp log(%Terminator{log: level}, message) do 465 | if level do 466 | Logger.log(level, message) 467 | end 468 | end 469 | 470 | defp send_parent(%Parent{} = parent, msg) do 471 | send(parent.pid, {parent.ref, msg}) 472 | end 473 | end 474 | -------------------------------------------------------------------------------- /lib/flame/terminator/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Terminator.Supervisor do 2 | @moduledoc false 3 | 4 | use Supervisor 5 | 6 | def start_link(opts) do 7 | sup_name = opts |> Keyword.fetch!(:name) |> Module.concat("Supervisor") 8 | Supervisor.start_link(__MODULE__, opts, name: sup_name) 9 | end 10 | 11 | def child_placement_sup_name(terminator_pid) when is_pid(terminator_pid) do 12 | {:registered_name, name} = Process.info(terminator_pid, :registered_name) 13 | child_placement_sup_name(name) 14 | end 15 | 16 | def child_placement_sup_name(terminator_name) when is_atom(terminator_name) do 17 | Module.concat(terminator_name, "ChildPlacementSup") 18 | end 19 | 20 | def init(opts) do 21 | {shutdown_timeout, opts} = Keyword.pop(opts, :shutdown_timeout, 30_000) 22 | name = Keyword.fetch!(opts, :name) 23 | child_placement_sup = child_placement_sup_name(name) 24 | terminator_opts = Keyword.merge(opts, child_placement_sup: 
child_placement_sup) 25 | 26 | children = 27 | [ 28 | {DynamicSupervisor, name: child_placement_sup, strategy: :one_for_one}, 29 | %{ 30 | id: FLAME.Terminator, 31 | start: {FLAME.Terminator, :start_link, [terminator_opts]}, 32 | type: :worker, 33 | shutdown: shutdown_timeout 34 | } 35 | ] 36 | 37 | Supervisor.init(children, strategy: :one_for_all, max_restarts: 0) 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/flame/trackable.ex: -------------------------------------------------------------------------------- 1 | defprotocol FLAME.Trackable do 2 | @moduledoc """ 3 | A protocol called to track resources. 4 | 5 | This is invoked by FLAME from `FLAME.track_resources/3`, 6 | which is invoked when the `:track_resources` option is 7 | set to true. 8 | 9 | Sometimes we may want to allocate long lived resources 10 | in a FLAME but, because FLAME nodes are temporary, the 11 | node would terminate shortly after. The `:track_resources` 12 | option tells `FLAME` to look for resources which implement 13 | the `FLAME.Trackable` protocol. Those resources can then 14 | spawn PIDs in the remote node and tell FLAME to track them. 15 | Once all PIDs terminate, the FLAME will terminate too. 16 | 17 | Implementations of the protocol will receive the data type, 18 | a list of pids as `acc`, and the `node`. It must return the 19 | updated data type and an updated list of pids. If you need 20 | to traverse recursively, you may call `FLAME.track_resources/3`. 21 | """ 22 | 23 | @fallback_to_any true 24 | 25 | @doc """ 26 | The entry point for tracking. 27 | 28 | See the module docs. 29 | """ 30 | def track(data, acc, node) 31 | end 32 | 33 | defimpl FLAME.Trackable, for: Any do 34 | def track(data, acc, _node), do: {data, acc} 35 | end 36 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Runner.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :flame, 7 | version: "0.5.2", 8 | elixir: "~> 1.15", 9 | elixirc_paths: elixirc_paths(Mix.env()), 10 | start_permanent: Mix.env() == :prod, 11 | deps: deps(), 12 | package: package(), 13 | source_url: "https://github.com/phoenixframework/flame", 14 | homepage_url: "http://www.phoenixframework.org", 15 | description: """ 16 | Treat your entire application as a lambda, where modular parts can be executed on short-lived infrastructure. 17 | """ 18 | ] 19 | end 20 | 21 | defp package do 22 | [ 23 | maintainers: ["Chris McCord", "Jason Stiebs"], 24 | licenses: ["MIT"], 25 | links: %{ 26 | GitHub: "https://github.com/phoenixframework/flame" 27 | }, 28 | files: ~w(lib CHANGELOG.md LICENSE.md mix.exs README.md) 29 | ] 30 | end 31 | 32 | # Run "mix help compile.app" to learn about applications. 33 | def application do 34 | [ 35 | mod: {FLAME.Application, []}, 36 | extra_applications: [:logger, inets: :optional, ssl: :optional] 37 | ] 38 | end 39 | 40 | defp elixirc_paths(:test), do: ["lib", "test/support"] 41 | defp elixirc_paths(_), do: ["lib"] 42 | 43 | # Run "mix help deps" to learn about dependencies. 
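  # Illustrative only (not part of this file): a consuming application depends
  # on this package from its own deps/0 rather than using the list below, e.g.:
  #
  #     {:flame, "~> 0.5.2"}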
44 | defp deps do 45 | [ 46 | {:jason, ">= 0.0.0", optional: true}, 47 | {:castore, ">= 0.0.0", optional: true}, 48 | {:mox, "~> 1.1.0", only: :test}, 49 | {:ex_doc, ">= 0.0.0", only: :dev, runtime: false} 50 | ] 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "castore": {:hex, :castore, "1.0.7", "b651241514e5f6956028147fe6637f7ac13802537e895a724f90bf3e36ddd1dd", [:mix], [], "hexpm", "da7785a4b0d2a021cd1292a60875a784b6caef71e76bf4917bdee1f390455cf5"}, 3 | "earmark_parser": {:hex, :earmark_parser, "1.4.43", "34b2f401fe473080e39ff2b90feb8ddfeef7639f8ee0bbf71bb41911831d77c5", [:mix], [], "hexpm", "970a3cd19503f5e8e527a190662be2cee5d98eed1ff72ed9b3d1a3d466692de8"}, 4 | "ex_doc": {:hex, :ex_doc, "0.37.0", "970f92b39e62c460aa8a367508e938f5e4da6e2ff3eaed3f8530b25870f45471", [:mix], [{:earmark_parser, "~> 1.4.42", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "b0ee7f17373948e0cf471e59c3a0ee42f3bd1171c67d91eb3626456ef9c6202c"}, 5 | "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"}, 6 | "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, 7 | "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, 8 | "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"}, 9 | "mox": {:hex, :mox, "1.1.0", "0f5e399649ce9ab7602f72e718305c0f9cdc351190f72844599545e4996af73c", [:mix], [], "hexpm", "d44474c50be02d5b72131070281a5d3895c0e7a95c780e90bc0cfe712f633a13"}, 10 | "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, 11 | } 12 | -------------------------------------------------------------------------------- /test/code_sync_test.exs: -------------------------------------------------------------------------------- 1 | defmodule FLAME.CodeSyncTest do 2 | use ExUnit.Case, async: false 3 | alias FLAME.CodeSync 4 | alias FLAME.Test.CodeSyncMock 5 | 6 | def rel(%CodeSyncMock{} = mock, paths) do 7 | Enum.map(paths, &Path.relative_to(&1, Path.join([File.cwd!(), "tmp", "#{mock.id}"]))) 8 | end 9 | 10 | def 
started_apps do 11 | Enum.map(Application.started_applications(), fn {app, _desc, _vsn} -> app end) 12 | end 13 | 14 | setup do 15 | Application.ensure_started(:logger) 16 | end 17 | 18 | describe "new/0" do 19 | test "creates a new struct with change tracking" do 20 | mock = CodeSyncMock.new() 21 | 22 | code_sync = 23 | mock.opts 24 | |> CodeSync.new() 25 | |> CodeSync.compute_changed_paths() 26 | 27 | assert %CodeSync{ 28 | sync_beam_hashes: %{}, 29 | changed_paths: changed_paths, 30 | deleted_paths: [], 31 | purge_modules: [] 32 | } = code_sync 33 | 34 | assert code_sync.apps_to_start == started_apps() 35 | 36 | assert rel(mock, changed_paths) == [ 37 | "one/ebin/Elixir.FLAME.Test.CodeSyncMock.Mod1.beam", 38 | "two/ebin/Elixir.FLAME.Test.CodeSyncMock.Mod2.beam" 39 | ] 40 | end 41 | end 42 | 43 | test "identifies changed, added, and deleted beams" do 44 | mock = CodeSyncMock.new() 45 | 46 | previous = 47 | mock.opts 48 | |> CodeSync.new() 49 | |> CodeSync.compute_sync_beams() 50 | 51 | # simulate change to mod1, new mod3, and deleted mod2 52 | :ok = CodeSyncMock.simulate_changes(mock) 53 | 54 | current = CodeSync.diff(previous) 55 | 56 | assert rel(mock, current.changed_paths) == [ 57 | "one/ebin/Elixir.FLAME.Test.CodeSyncMock.Mod1.beam", 58 | "one/ebin/Elixir.FLAME.Test.CodeSyncMock.Mod3.beam" 59 | ] 60 | 61 | assert rel(mock, current.deleted_paths) == [ 62 | "two/ebin/Elixir.FLAME.Test.CodeSyncMock.Mod2.beam" 63 | ] 64 | 65 | assert current.purge_modules == [FLAME.Test.CodeSyncMock.Mod2] 66 | 67 | # new diff should have no changes 68 | current = CodeSync.diff(current) 69 | assert current.changed_paths == [] 70 | assert current.deleted_paths == [] 71 | assert current.purge_modules == [] 72 | assert current.apps_to_start == [] 73 | end 74 | 75 | test "start_apps: false, does not sync started apps" do 76 | # cheap way to ensure apps are started on extract. Note async: false is required 77 | Application.stop(:logger) 78 | refute :logger in started_apps() 79 | mock = CodeSyncMock.new(start_apps: false) 80 | previous = CodeSync.new(mock.opts) 81 | assert previous.apps_to_start == [] 82 | 83 | Application.ensure_started(:logger) 84 | current = CodeSync.diff(previous) 85 | assert current.apps_to_start == [] 86 | end 87 | 88 | test "start_apps with a list syncs listed apps" do 89 | # cheap way to ensure apps are started on extract. Note async: false is required 90 | Application.stop(:logger) 91 | refute :logger in started_apps() 92 | mock = CodeSyncMock.new(start_apps: [:logger]) 93 | previous = CodeSync.new(mock.opts) 94 | assert previous.apps_to_start == [:logger] 95 | 96 | Application.ensure_started(:logger) 97 | current = CodeSync.diff(previous) 98 | assert current.apps_to_start == [] 99 | end 100 | 101 | test "compute_changed_paths packages and extracts packaged code and starts apps" do 102 | assert :logger in started_apps() 103 | mock = CodeSyncMock.new() 104 | 105 | code = 106 | mock.opts 107 | |> CodeSync.new() 108 | |> CodeSync.compute_changed_paths() 109 | 110 | assert %FLAME.CodeSync.PackagedStream{} = pkg = CodeSync.package_to_stream(code) 111 | assert File.exists?(pkg.stream.path) 112 | 113 | # cheap way to ensure apps are started on extract. 
Note async: false is required 114 | Application.stop(:logger) 115 | refute :logger in started_apps() 116 | 117 | assert CodeSync.extract_packaged_stream(pkg) == mock.extract_dir 118 | 119 | assert CodeSyncMock.extracted_rel_paths(mock) == [ 120 | "one/ebin/Elixir.FLAME.Test.CodeSyncMock.Mod1.beam", 121 | "two/ebin/Elixir.FLAME.Test.CodeSyncMock.Mod2.beam" 122 | ] 123 | 124 | assert :logger in started_apps() 125 | end 126 | end 127 | -------------------------------------------------------------------------------- /test/flame_test.exs: -------------------------------------------------------------------------------- 1 | defmodule FLAME.FLAMETest do 2 | use ExUnit.Case, async: true 3 | 4 | alias FLAME.Pool 5 | 6 | defp sim_long_running(pool, time \\ 1_000) do 7 | ref = make_ref() 8 | parent = self() 9 | 10 | task = 11 | Task.start_link(fn -> 12 | FLAME.call(pool, fn -> 13 | send(parent, {ref, :called}) 14 | Process.sleep(time) 15 | end) 16 | end) 17 | 18 | receive do 19 | {^ref, :called} -> task 20 | end 21 | end 22 | 23 | setup config do 24 | case config do 25 | %{runner: runner_opts} -> 26 | runner_sup = Module.concat(config.test, "RunnerSup") 27 | pool_pid = start_supervised!({Pool, Keyword.merge(runner_opts, name: config.test)}) 28 | 29 | {:ok, runner_sup: runner_sup, pool_pid: pool_pid} 30 | 31 | %{} -> 32 | :ok 33 | end 34 | end 35 | 36 | @tag runner: [min: 1, max: 2, max_concurrency: 2] 37 | test "init boots min runners synchronously and grows on demand", 38 | %{runner_sup: runner_sup} = config do 39 | min_pool = Supervisor.which_children(runner_sup) 40 | assert [{:undefined, _pid, :worker, [FLAME.Runner]}] = min_pool 41 | # execute against single runner 42 | assert FLAME.call(config.test, fn -> :works end) == :works 43 | 44 | # dynamically grows to max 45 | _task1 = sim_long_running(config.test) 46 | assert FLAME.call(config.test, fn -> :works end) == :works 47 | # max concurrency still below threshold 48 | assert Supervisor.which_children(runner_sup) == min_pool 49 | # max concurrency above threshold boots new runner 50 | _task2 = sim_long_running(config.test) 51 | assert FLAME.call(config.test, fn -> :works end) == :works 52 | new_pool = Supervisor.which_children(runner_sup) 53 | refute new_pool == min_pool 54 | assert length(new_pool) == 2 55 | # caller is now queued while waiting for available runner 56 | _task3 = sim_long_running(config.test) 57 | _task4 = sim_long_running(config.test) 58 | # task is queued and times out 59 | queued = spawn(fn -> FLAME.call(config.test, fn -> :queued end, timeout: 100) end) 60 | ref = Process.monitor(queued) 61 | assert_receive {:DOWN, ^ref, :process, _, {:timeout, _}}, 1000 62 | assert FLAME.call(config.test, fn -> :queued end) == :queued 63 | assert new_pool == Supervisor.which_children(runner_sup) 64 | end 65 | 66 | @tag runner: [min: 0, max: 1, max_concurrency: 2] 67 | test "concurrent calls on fully pending runners", 68 | %{runner_sup: runner_sup} = config do 69 | assert Supervisor.which_children(runner_sup) == [] 70 | parent = self() 71 | 72 | Task.start_link(fn -> 73 | FLAME.call(config.test, fn -> 74 | send(parent, :called) 75 | Process.sleep(:infinity) 76 | end) 77 | end) 78 | 79 | Task.start_link(fn -> 80 | FLAME.call(config.test, fn -> 81 | send(parent, :called) 82 | Process.sleep(:infinity) 83 | end) 84 | end) 85 | 86 | assert_receive :called 87 | assert_receive :called 88 | end 89 | 90 | def on_grow_start(meta) do 91 | send(:failure_test, {:grow_start, meta}) 92 | 93 | if Agent.get_and_update(:failure_test_counter, &{&1 + 1, &1 + 1}) 
<= 1 do 94 | raise "boom" 95 | end 96 | end 97 | 98 | def on_grow_end(result, meta) do 99 | send(:failure_test, {:grow_start_end, result, meta}) 100 | end 101 | 102 | @tag runner: [ 103 | min: 1, 104 | max: 2, 105 | max_concurrency: 1, 106 | on_grow_start: &__MODULE__.on_grow_start/1, 107 | on_grow_end: &__MODULE__.on_grow_end/2 108 | ] 109 | test "failure of pending async runner bootup", %{runner_sup: runner_sup} = config do 110 | parent = self() 111 | 112 | ExUnit.CaptureLog.capture_log(fn -> 113 | start_supervised!( 114 | {Agent, 115 | fn -> 116 | Process.register(self(), :failure_test_counter) 117 | 0 118 | end} 119 | ) 120 | 121 | Process.register(self(), :failure_test) 122 | assert [{:undefined, _pid, :worker, [FLAME.Runner]}] = Supervisor.which_children(runner_sup) 123 | # max concurrency above threshold tries to boot new runner 124 | _task2 = sim_long_running(config.test, :infinity) 125 | 126 | spawn_link(fn -> 127 | FLAME.cast(config.test, fn -> send(parent, :fullfilled) end) 128 | Process.sleep(:infinity) 129 | end) 130 | 131 | # first attempt fails 132 | refute_receive :fullfilled 133 | assert_receive {:grow_start, %{count: 2, pid: pid}} 134 | assert_receive {:grow_start_end, {:exit, _}, %{pid: ^pid, count: 1}} 135 | assert length(Supervisor.which_children(runner_sup)) == 1 136 | 137 | # retry attempt succeeds 138 | assert_receive {:grow_start, %{count: 2, pid: pid}}, 1000 139 | assert_receive {:grow_start_end, :ok, %{pid: ^pid, count: 2}} 140 | # queued og caller is now fullfilled from retried runner boot 141 | assert_receive :fullfilled 142 | assert FLAME.call(config.test, fn -> :works end) == :works 143 | assert length(Supervisor.which_children(runner_sup)) == 2 144 | end) 145 | end 146 | 147 | @tag runner: [min: 1, max: 2, max_concurrency: 2, idle_shutdown_after: 500] 148 | test "idle shutdown", %{runner_sup: runner_sup} = config do 149 | sim_long_running(config.test, 100) 150 | sim_long_running(config.test, 100) 151 | sim_long_running(config.test, 100) 152 | 153 | # we've scaled from min 1 to max 2 at this point 154 | assert [ 155 | {:undefined, runner1, :worker, [FLAME.Runner]}, 156 | {:undefined, runner2, :worker, [FLAME.Runner]} 157 | ] = Supervisor.which_children(runner_sup) 158 | 159 | Process.monitor(runner1) 160 | Process.monitor(runner2) 161 | assert_receive {:DOWN, _ref, :process, ^runner2, {:shutdown, :idle}}, 1000 162 | refute_receive {:DOWN, _ref, :process, ^runner1, {:shutdown, :idle}} 163 | 164 | assert [{:undefined, ^runner1, :worker, [FLAME.Runner]}] = 165 | Supervisor.which_children(runner_sup) 166 | end 167 | 168 | @tag runner: [min: 1, max: 1, max_concurrency: 2, idle_shutdown_after: 500] 169 | test "pool runner DOWN exits any active checkouts", %{runner_sup: runner_sup} = config do 170 | {:ok, active_checkout} = sim_long_running(config.test, 10_000) 171 | Process.unlink(active_checkout) 172 | Process.monitor(active_checkout) 173 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = Supervisor.which_children(runner_sup) 174 | Process.exit(runner, :brutal_kill) 175 | assert_receive {:DOWN, _ref, :process, ^active_checkout, :killed} 176 | end 177 | 178 | @tag runner: [min: 0, max: 1, max_concurrency: 2, idle_shutdown_after: 50] 179 | test "call links", %{runner_sup: runner_sup} = config do 180 | ExUnit.CaptureLog.capture_log(fn -> 181 | parent = self() 182 | # links by defaults 183 | Process.flag(:trap_exit, true) 184 | 185 | caught = 186 | try do 187 | FLAME.call( 188 | config.test, 189 | fn -> 190 | send(parent, {:called, self()}) 191 | 
Process.exit(self(), :kill) 192 | end 193 | ) 194 | catch 195 | kind, reason -> {kind, reason} 196 | end 197 | 198 | [{:undefined, runner, :worker, [FLAME.Runner]}] = Supervisor.which_children(runner_sup) 199 | Process.monitor(runner) 200 | assert {:exit, :killed} = caught 201 | assert_receive {:called, _flame_pid} 202 | assert_receive {:DOWN, _ref, :process, ^runner, {:shutdown, :idle}} 203 | 204 | # link: false 205 | Process.flag(:trap_exit, false) 206 | assert Supervisor.which_children(runner_sup) == [] 207 | parent = self() 208 | 209 | caught = 210 | try do 211 | FLAME.call( 212 | config.test, 213 | fn -> 214 | send(parent, {:called, self()}) 215 | raise "boom" 216 | end, 217 | link: false 218 | ) 219 | catch 220 | kind, reason -> {kind, reason} 221 | end 222 | 223 | [{:undefined, runner_pid, :worker, [FLAME.Runner]}] = Supervisor.which_children(runner_sup) 224 | Process.monitor(runner_pid) 225 | assert {:exit, {%RuntimeError{message: "boom"}, _}} = caught 226 | assert_receive {:called, flame_pid} 227 | Process.monitor(flame_pid) 228 | assert_receive {:DOWN, _ref, :process, ^flame_pid, :noproc} 229 | assert_receive {:DOWN, _ref, :process, ^runner_pid, {:shutdown, :idle}} 230 | assert Supervisor.which_children(runner_sup) == [] 231 | end) 232 | end 233 | 234 | @tag runner: [min: 0, max: 1, max_concurrency: 2, idle_shutdown_after: 50] 235 | test "cast with link false", %{runner_sup: runner_sup} = config do 236 | ExUnit.CaptureLog.capture_log(fn -> 237 | assert Supervisor.which_children(runner_sup) == [] 238 | parent = self() 239 | 240 | FLAME.cast( 241 | config.test, 242 | fn -> 243 | send(parent, {:called, self()}) 244 | raise "boom" 245 | end, 246 | link: false 247 | ) 248 | 249 | assert_receive {:called, flame_pid} 250 | Process.monitor(flame_pid) 251 | [{:undefined, runner_pid, :worker, [FLAME.Runner]}] = Supervisor.which_children(runner_sup) 252 | assert_receive {:DOWN, _ref, :process, ^flame_pid, :noproc} 253 | Process.monitor(runner_pid) 254 | assert_receive {:DOWN, _ref, :process, ^runner_pid, {:shutdown, :idle}} 255 | assert Supervisor.which_children(runner_sup) == [] 256 | end) 257 | end 258 | 259 | describe "cast" do 260 | @tag runner: [min: 1, max: 2, max_concurrency: 2, idle_shutdown_after: 500] 261 | test "normal execution", %{} = config do 262 | sim_long_running(config.test, 100) 263 | parent = self() 264 | 265 | assert FLAME.cast(config.test, fn -> 266 | send(parent, {:ran, self()}) 267 | 268 | receive do 269 | :continue -> :ok 270 | end 271 | end) == :ok 272 | 273 | assert_receive {:ran, cast_pid} 274 | Process.monitor(cast_pid) 275 | send(cast_pid, :continue) 276 | assert_receive {:DOWN, _ref, :process, ^cast_pid, :normal} 277 | end 278 | 279 | def growth_grow_start(meta) do 280 | send(Process.whereis(:current_test), {:grow_start, meta}) 281 | end 282 | 283 | @tag runner: [ 284 | min: 0, 285 | max: 2, 286 | max_concurrency: 1, 287 | on_grow_start: &__MODULE__.growth_grow_start/1 288 | ] 289 | test "pool growth", %{} = config do 290 | Process.register(self(), :current_test) 291 | parent = self() 292 | 293 | for i <- [1, 2, 3] do 294 | assert FLAME.cast(config.test, fn -> 295 | send(parent, {:ran, i, self()}) 296 | Process.sleep(500) 297 | end) == :ok 298 | end 299 | 300 | for i <- [1, 2, 3] do 301 | assert_receive {:ran, ^i, cast_pid} 302 | Process.monitor(cast_pid) 303 | assert_receive {:DOWN, _ref, :process, ^cast_pid, _}, 1000 304 | end 305 | 306 | assert_receive {:grow_start, %{count: 1}}, 1000 307 | assert_receive {:grow_start, %{count: 2}}, 1000 308 | 
refute_receive {:grow_start, _}, 1000 309 | end 310 | 311 | @tag runner: [min: 1, max: 2, max_concurrency: 2, idle_shutdown_after: 500] 312 | test "with exit and default link", %{} = config do 313 | ExUnit.CaptureLog.capture_log(fn -> 314 | Process.flag(:trap_exit, true) 315 | sim_long_running(config.test, 100) 316 | parent = self() 317 | 318 | assert FLAME.cast(config.test, fn -> 319 | send(parent, {:ran, self()}) 320 | 321 | receive do 322 | :continue -> exit(:boom) 323 | end 324 | end) == :ok 325 | 326 | assert_receive {:ran, cast_pid} 327 | Process.monitor(cast_pid) 328 | send(cast_pid, :continue) 329 | assert_receive {:EXIT, ^cast_pid, :boom} 330 | end) 331 | end 332 | end 333 | 334 | describe "process placement" do 335 | @tag runner: [min: 0, max: 2, max_concurrency: 2, idle_shutdown_after: 100] 336 | test "place_child/2", %{runner_sup: runner_sup} = config do 337 | assert [] = Supervisor.which_children(runner_sup) 338 | assert {:ok, pid} = FLAME.place_child(config.test, {Agent, fn -> 1 end}) 339 | Process.monitor(pid) 340 | 341 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 342 | Supervisor.which_children(runner_sup) 343 | 344 | Process.monitor(runner) 345 | assert Agent.get(pid, & &1) == 1 346 | # does not idle down runner or actively placed children 347 | refute_receive {:DOWN, _ref, :process, _, _}, 1000 348 | # active caller to prevent idle down 349 | assert FLAME.cast(config.test, fn -> 350 | Process.sleep(1_000) 351 | end) == :ok 352 | 353 | Agent.stop(pid) 354 | assert_receive {:DOWN, _ref, :process, ^pid, _}, 100 355 | 356 | # runner does not idle down with active checkout from cast 357 | refute_receive {:DOWN, _ref, :process, ^runner, _}, 1000 358 | 359 | # runner idles down now that placed child and cast callers are gone 360 | assert_receive {:DOWN, _ref, :process, ^runner, _}, 1000 361 | end 362 | 363 | @tag runner: [min: 0, max: 2, max_concurrency: 2, idle_shutdown_after: 100] 364 | test "place_child links", %{runner_sup: runner_sup} = config do 365 | # links by default 366 | Process.flag(:trap_exit, true) 367 | assert {:ok, pid} = FLAME.place_child(config.test, {Agent, fn -> 1 end}) 368 | 369 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 370 | Supervisor.which_children(runner_sup) 371 | 372 | Process.monitor(runner) 373 | 374 | Process.exit(pid, :kill) 375 | assert_receive {:EXIT, ^pid, :killed}, 100 376 | 377 | # runner idles down now that placed child and cast callers are gone 378 | assert_receive {:DOWN, _ref, :process, ^runner, _}, 1000 379 | 380 | # with explicit link: false 381 | Process.flag(:trap_exit, false) 382 | assert {:ok, pid} = FLAME.place_child(config.test, {Agent, fn -> 1 end}, link: false) 383 | Process.monitor(pid) 384 | 385 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 386 | Supervisor.which_children(runner_sup) 387 | 388 | Process.monitor(runner) 389 | 390 | Process.exit(pid, :kill) 391 | assert_receive {:DOWN, _ref, :process, ^pid, :killed}, 100 392 | 393 | # runner idles down now that placed child and cast callers are gone 394 | assert_receive {:DOWN, _ref, :process, ^runner, _}, 1000 395 | end 396 | 397 | @tag runner: [min: 0, max: 2, max_concurrency: 2, idle_shutdown_after: 100] 398 | test "place_child when caller exits", %{runner_sup: runner_sup} = config do 399 | # links by default 400 | parent = self() 401 | 402 | caller = 403 | spawn(fn -> 404 | {:ok, pid} = FLAME.place_child(config.test, {Agent, fn -> 1 end}) 405 | send(parent, {:child, pid}) 406 | Process.sleep(:infinity) 407 | end) 408 | 409 | 
assert_receive {:child, placed_child} 410 | 411 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 412 | Supervisor.which_children(runner_sup) 413 | 414 | Process.monitor(runner) 415 | Process.monitor(placed_child) 416 | 417 | Process.exit(caller, :kill) 418 | 419 | assert_receive {:DOWN, _ref, :process, ^placed_child, _} 420 | # runner idles down now that placed child and cast callers are gone 421 | assert_receive {:DOWN, _ref, :process, ^runner, _}, 1000 422 | 423 | # with link: false 424 | caller = 425 | spawn(fn -> 426 | {:ok, pid} = FLAME.place_child(config.test, {Agent, fn -> 1 end}, link: false) 427 | send(parent, {:child, pid}) 428 | Process.sleep(:infinity) 429 | end) 430 | 431 | assert_receive {:child, placed_child} 432 | 433 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 434 | Supervisor.which_children(runner_sup) 435 | 436 | Process.monitor(runner) 437 | Process.monitor(placed_child) 438 | Process.exit(caller, :kill) 439 | 440 | refute_receive {:DOWN, _ref, :process, ^placed_child, _} 441 | # runner does not idle down when caller goes away since placed child still running 442 | refute_receive {:DOWN, _ref, :process, ^runner, _}, 1000 443 | 444 | Process.exit(placed_child, :kill) 445 | assert_receive {:DOWN, _ref, :process, ^placed_child, _} 446 | # runner idles down now that placed child and cast callers are gone 447 | assert_receive {:DOWN, _ref, :process, ^runner, _}, 1000 448 | end 449 | end 450 | 451 | describe "resource tracking" do 452 | @tag runner: [min: 0, max: 1] 453 | test "local", config do 454 | name = :"#{config.test}_trackable" 455 | ref = make_ref() 456 | trackable = %MyTrackable{name: name, ref: ref} 457 | non_trackable = URI.new!("/") 458 | 459 | {[{map}], [pid]} = 460 | FLAME.track_resources([{%{"yes" => trackable, "no" => non_trackable}}], [], node()) 461 | 462 | assert map_size(map) == 2 463 | assert ^non_trackable = map["no"] 464 | assert %MyTrackable{name: ^name, ref: ^ref, pid: ^pid} = map["yes"] 465 | assert Process.whereis(name) == pid 466 | 467 | monitor_ref = Process.monitor(pid) 468 | send(pid, {ref, :stop}) 469 | assert_receive {:DOWN, ^monitor_ref, _, _, :normal} 470 | end 471 | 472 | @tag runner: [min: 0, max: 2, max_concurrency: 2, idle_shutdown_after: 100] 473 | test "remote without tracking", config do 474 | name = :"#{config.test}_trackable" 475 | non_trackable = URI.new!("/") 476 | 477 | [{map}] = 478 | FLAME.call(config.test, fn -> 479 | ref = make_ref() 480 | trackable = %MyTrackable{name: name, ref: ref} 481 | [{%{"yes" => trackable, "no" => non_trackable}}] 482 | end) 483 | 484 | assert map_size(map) == 2 485 | assert ^non_trackable = map["no"] 486 | assert %MyTrackable{pid: nil} = map["yes"] 487 | end 488 | 489 | @tag runner: [min: 0, max: 2, max_concurrency: 2, idle_shutdown_after: 100] 490 | test "remote with tracking", %{runner_sup: runner_sup} = config do 491 | name = :"#{config.test}_trackable" 492 | non_trackable = URI.new!("/") 493 | 494 | [{map}] = 495 | FLAME.call( 496 | config.test, 497 | fn -> 498 | ref = make_ref() 499 | trackable = %MyTrackable{name: name, ref: ref} 500 | [{%{"yes" => trackable, "no" => non_trackable}}] 501 | end, 502 | track_resources: true 503 | ) 504 | 505 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 506 | Supervisor.which_children(runner_sup) 507 | 508 | Process.monitor(runner) 509 | assert map_size(map) == 2 510 | assert ^non_trackable = map["no"] 511 | assert %MyTrackable{pid: pid} = trackable = map["yes"] 512 | assert Process.alive?(pid) 513 | refute_receive {:DOWN, 
_, _, ^runner, _}, 1000 514 | send(pid, {trackable.ref, :stop}) 515 | assert_receive {:DOWN, _, _, ^runner, {:shutdown, :idle}}, 1000 516 | end 517 | 518 | @tag runner: [ 519 | min: 0, 520 | max: 2, 521 | max_concurrency: 2, 522 | idle_shutdown_after: 100, 523 | track_resources: true 524 | ] 525 | test "remote with tracking enabled at pool level", %{runner_sup: runner_sup} = config do 526 | name = :"#{config.test}_trackable" 527 | non_trackable = URI.new!("/") 528 | 529 | [{map}] = 530 | FLAME.call( 531 | config.test, 532 | fn -> 533 | ref = make_ref() 534 | trackable = %MyTrackable{name: name, ref: ref} 535 | [{%{"yes" => trackable, "no" => non_trackable}}] 536 | end 537 | ) 538 | 539 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 540 | Supervisor.which_children(runner_sup) 541 | 542 | Process.monitor(runner) 543 | assert map_size(map) == 2 544 | assert ^non_trackable = map["no"] 545 | assert %MyTrackable{pid: pid} = trackable = map["yes"] 546 | assert Process.alive?(pid) 547 | refute_receive {:DOWN, _, _, ^runner, _}, 1000 548 | send(pid, {trackable.ref, :stop}) 549 | assert_receive {:DOWN, _, _, ^runner, {:shutdown, :idle}}, 1000 550 | end 551 | 552 | @tag runner: [ 553 | min: 0, 554 | max: 1, 555 | max_concurrency: 1, 556 | idle_shutdown_after: 100, 557 | track_resources: true 558 | ] 559 | test "remote with tracking max concurrency", %{runner_sup: runner_sup} = config do 560 | non_trackable = URI.new!("/") 561 | 562 | call = fn count -> 563 | ref = make_ref() 564 | 565 | trackables = 566 | for _ <- 1..count, 567 | do: %MyTrackable{ 568 | name: :"#{config.test}_trackable_#{System.unique_integer()}", 569 | ref: ref 570 | } 571 | 572 | [{%{"yes" => trackables, "no" => non_trackable}}] 573 | end 574 | 575 | [{map}] = FLAME.call(config.test, fn -> call.(2) end) 576 | 577 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 578 | Supervisor.which_children(runner_sup) 579 | 580 | Process.monitor(runner) 581 | assert map_size(map) == 2 582 | assert ^non_trackable = map["no"] 583 | assert [%MyTrackable{} = trackable1, %MyTrackable{} = trackable2] = map["yes"] 584 | 585 | # original trackables still occupies the slots 586 | assert Process.alive?(trackable1.pid) 587 | assert Process.alive?(trackable2.pid) 588 | refute_receive {:DOWN, _, _, ^runner, _}, 1000 589 | 590 | # check in the trackable 1 591 | send(trackable1.pid, {trackable1.ref, :stop}) 592 | 593 | # no idle down because second trackable still alive 594 | refute_receive {:DOWN, _, _, ^runner, _}, 1000 595 | 596 | # trackable2 occupies the only available slot, so next call times out 597 | caught = 598 | try do 599 | FLAME.call(config.test, fn -> call.(1) end, timeout: 1000) 600 | catch 601 | kind, reason -> {kind, reason} 602 | end 603 | 604 | assert {:exit, {:timeout, _}} = caught 605 | 606 | # check in the trackable 2 607 | send(trackable2.pid, {trackable2.ref, :stop}) 608 | 609 | # runner is now free for more work on open slot 610 | [{map}] = FLAME.call(config.test, fn -> call.(1) end) 611 | 612 | assert [{:undefined, runner, :worker, [FLAME.Runner]}] = 613 | Supervisor.which_children(runner_sup) 614 | 615 | Process.monitor(runner) 616 | assert map_size(map) == 2 617 | assert ^non_trackable = map["no"] 618 | assert [%MyTrackable{pid: pid} = trackable] = map["yes"] 619 | 620 | # check in the trackable 621 | send(pid, {trackable.ref, :stop}) 622 | 623 | # runner idles down 624 | assert_receive {:DOWN, _, _, ^runner, {:shutdown, :idle}}, 1000 625 | end 626 | end 627 | 628 | test "code_sync artifact cleaner", config do 
629 | mock = FLAME.Test.CodeSyncMock.new() 630 | 631 | cleaner = Module.concat(config.test, "Cleaner") 632 | 633 | pool_pid = 634 | start_supervised!( 635 | {Pool, min: 1, max: 1, max_concurrency: 1, name: config.test, code_sync: mock.opts} 636 | ) 637 | 638 | assert [artifact] = FLAME.Pool.Cleaner.list_paths(cleaner) 639 | assert File.exists?(artifact) 640 | assert FLAME.call(config.test, fn -> :works end) == :works 641 | Supervisor.stop(pool_pid) 642 | refute File.exists?(artifact) 643 | end 644 | end 645 | -------------------------------------------------------------------------------- /test/fly_backend_test.exs: -------------------------------------------------------------------------------- 1 | defmodule FLAME.FlyBackendTest do 2 | use ExUnit.Case, async: false 3 | 4 | alias FLAME.{Runner, FlyBackend} 5 | 6 | def new({backend, opts}) do 7 | Runner.new(backend: {backend, Keyword.merge([terminator_sup: __MODULE__], opts)}) 8 | end 9 | 10 | setup do 11 | Application.delete_env(:flame, :backend) 12 | Application.delete_env(:flame, FlyBackend) 13 | end 14 | 15 | test "explicit backend" do 16 | assert_raise ArgumentError, ~r/missing :token/, fn -> 17 | new({FlyBackend, []}) 18 | end 19 | 20 | assert_raise ArgumentError, ~r/missing :image/, fn -> 21 | new({FlyBackend, token: "123"}) 22 | end 23 | 24 | assert_raise ArgumentError, ~r/missing :app/, fn -> 25 | new({FlyBackend, token: "123", image: "img"}) 26 | end 27 | 28 | assert_raise ArgumentError, ~r/missing :app/, fn -> 29 | new({FlyBackend, token: "123", image: "img", boot_timeout: 55123}) 30 | end 31 | 32 | assert new({FlyBackend, token: "123", image: "img", app: "app"}) 33 | end 34 | 35 | test "extended opts" do 36 | opts = [ 37 | token: "123", 38 | image: "img", 39 | app: "app", 40 | host: "foo.local", 41 | env: %{"ONE" => "1"}, 42 | cpu_kind: "performance", 43 | cpus: 1, 44 | memory_mb: 256, 45 | gpu_kind: "a100-pcie-40gb" 46 | ] 47 | 48 | runner = new({FlyBackend, opts}) 49 | assert {:ok, init} = runner.backend_init 50 | assert init.host == "foo.local" 51 | assert init.cpu_kind == "performance" 52 | assert init.cpus == 1 53 | assert init.memory_mb == 256 54 | assert init.gpu_kind == "a100-pcie-40gb" 55 | 56 | assert %{ 57 | "ONE" => "1", 58 | "FLAME_PARENT" => _, 59 | "PHX_SERVER" => "false" 60 | } = init.env 61 | end 62 | 63 | test "global configured backend" do 64 | assert_raise ArgumentError, ~r/missing :token/, fn -> 65 | Application.put_env(:flame, FLAME.FlyBackend, []) 66 | Runner.new(backend: FLAME.FlyBackend) 67 | end 68 | 69 | assert_raise ArgumentError, ~r/missing :image/, fn -> 70 | Application.put_env(:flame, FLAME.FlyBackend, token: "123") 71 | Runner.new(backend: FLAME.FlyBackend) 72 | end 73 | 74 | assert_raise ArgumentError, ~r/missing :app/, fn -> 75 | Application.put_env(:flame, FLAME.FlyBackend, token: "123", image: "img") 76 | Runner.new(backend: FLAME.FlyBackend) 77 | end 78 | 79 | Application.put_env(:flame, :backend, FLAME.FlyBackend) 80 | Application.put_env(:flame, FLAME.FlyBackend, token: "123", image: "img", app: "app") 81 | 82 | assert Runner.new(backend: FLAME.FlyBackend) 83 | end 84 | 85 | test "parent backend attributes" do 86 | assert %FLAME.Parent{ 87 | pid: _, 88 | ref: _, 89 | backend: FLAME.FlyBackend, 90 | flame_vsn: vsn, 91 | backend_vsn: vsn, 92 | backend_app: :flame 93 | } = 94 | FLAME.Parent.new( 95 | make_ref(), 96 | self(), 97 | FLAME.FlyBackend, 98 | "app-flame-1", 99 | "FLY_PRIVATE_IP" 100 | ) 101 | end 102 | end 103 | 
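# Illustrative sketch (not part of the test suite): the options exercised above
# mirror what an application would set globally for the Fly backend, e.g. in
# config/runtime.exs (the environment variable names here are placeholders):
#
#     config :flame, :backend, FLAME.FlyBackend
#
#     config :flame, FLAME.FlyBackend,
#       token: System.fetch_env!("FLY_API_TOKEN"),
#       image: System.fetch_env!("FLY_IMAGE_REF"),
#       app: System.fetch_env!("FLY_APP_NAME"),
#       cpu_kind: "performance",
#       cpus: 1,
#       memory_mb: 256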
-------------------------------------------------------------------------------- /test/parser/json_test.exs: -------------------------------------------------------------------------------- 1 | defmodule FLAME.Parser.JSONTest do 2 | use ExUnit.Case, async: false 3 | 4 | alias FLAME.Parser.JSON 5 | 6 | describe "encode!/1" do 7 | test "should encode string" do 8 | assert JSON.encode!("foo") == "\"foo\"" 9 | end 10 | 11 | test "should encode atom" do 12 | assert JSON.encode!(:FLAME) == "\"FLAME\"" 13 | end 14 | 15 | test "should encode string maps" do 16 | assert JSON.encode!(%{"foo" => "bar"}) == "{\"foo\":\"bar\"}" 17 | end 18 | 19 | test "should encode atom maps" do 20 | assert JSON.encode!(%{foo: "bar"}) == "{\"foo\":\"bar\"}" 21 | end 22 | 23 | test "should encode nested maps" do 24 | assert JSON.encode!(%{foo: "bar", bar: %{baz: nil}}) in [ 25 | "{\"foo\":\"bar\",\"bar\":{\"baz\":null}}", 26 | "{\"bar\":{\"baz\":null},\"foo\":\"bar\"}" 27 | ] 28 | end 29 | 30 | test "should encode list" do 31 | assert JSON.encode!([%{foo: "bar"}]) == "[{\"foo\":\"bar\"}]" 32 | assert JSON.encode!([%{foo: "bar"}, %{bar: nil}]) == "[{\"foo\":\"bar\"},{\"bar\":null}]" 33 | end 34 | 35 | test "should encode nullable values" do 36 | assert JSON.encode!(%{foo: nil}) == "{\"foo\":null}" 37 | assert JSON.encode!(%{"foo" => nil}) == "{\"foo\":null}" 38 | assert JSON.encode!(nil) == "null" 39 | end 40 | end 41 | 42 | describe "decode!/1" do 43 | test "should decode string" do 44 | assert JSON.decode!("\"foo\"") == "foo" 45 | end 46 | 47 | test "should decode maps" do 48 | assert JSON.decode!("{\"foo\":\"bar\"}") == %{"foo" => "bar"} 49 | end 50 | 51 | test "should decode nested maps" do 52 | assert JSON.decode!("{\"bar\":{\"baz\":null},\"foo\":\"bar\"}") == %{ 53 | "foo" => "bar", 54 | "bar" => %{"baz" => nil} 55 | } 56 | end 57 | 58 | test "should decode list" do 59 | assert JSON.decode!("[{\"foo\":\"bar\"}]") == [%{"foo" => "bar"}] 60 | 61 | assert JSON.decode!("[{\"foo\":\"bar\"}, {\"bar\":null}]") == [ 62 | %{"foo" => "bar"}, 63 | %{"bar" => nil} 64 | ] 65 | end 66 | 67 | test "should decode nullable values" do 68 | assert JSON.decode!("{\"foo\":null}") == %{"foo" => nil} 69 | assert JSON.decode!("null") == nil 70 | end 71 | end 72 | 73 | describe "json parser" do 74 | test "correct json parser based on erlang json availability" do 75 | if Code.ensure_loaded?(:json) do 76 | assert JSON.json_parser() == :json 77 | else 78 | assert JSON.json_parser() == Jason 79 | end 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /test/queue_test.exs: -------------------------------------------------------------------------------- 1 | defmodule FLAME.QueueTest do 2 | use ExUnit.Case 3 | 4 | alias FLAME.Queue 5 | 6 | describe "new/0" do 7 | test "creates a new Queue" do 8 | assert %Queue{} = Queue.new() 9 | end 10 | end 11 | 12 | describe "insert/3" do 13 | test "inserts a new item into the queue" do 14 | queue = Queue.insert(Queue.new(), "item1", :key1) 15 | assert Queue.size(queue) == 1 16 | assert Queue.get_by_key(queue, :key1) == "item1" 17 | end 18 | end 19 | 20 | describe "pop/1" do 21 | test "pops the first item from the queue" do 22 | queue = 23 | Queue.new() 24 | |> Queue.insert("item1", :key1) 25 | |> Queue.insert("item2", :key2) 26 | 27 | assert queue.keys == %{key1: 0, key2: 1} 28 | 29 | {popped_item, %Queue{} = queue} = Queue.pop(queue) 30 | 31 | assert popped_item == {:key1, "item1"} 32 | assert Queue.size(queue) == 1 33 | assert queue.keys == 
%{key2: 1} 34 | 35 | assert {{:key2, "item2"}, %Queue{} = queue} = Queue.pop(queue) 36 | assert Queue.size(queue) == 0 37 | assert queue.idx == 0 38 | end 39 | 40 | test "returns an error when the queue is empty" do 41 | assert {nil, %Queue{}} == Queue.pop(Queue.new()) 42 | end 43 | end 44 | 45 | describe "pop_until/2" do 46 | test "pops until function returns true" do 47 | queue = Queue.new() 48 | assert Queue.pop_until(queue, fn _, _ -> true end) == {nil, queue} 49 | 50 | queue = 51 | Queue.new() 52 | |> Queue.insert(10, :key1) 53 | |> Queue.insert(11, :key2) 54 | |> Queue.insert(20, :key3) 55 | |> Queue.insert(30, :key4) 56 | 57 | assert {{:key3, 20}, %Queue{} = queue} = Queue.pop_until(queue, fn _key, i -> i >= 20 end) 58 | assert Queue.size(queue) == 1 59 | assert queue.keys == %{key4: 3} 60 | end 61 | end 62 | 63 | describe "access" do 64 | test "retrieves an item by index" do 65 | queue = 66 | Queue.new() 67 | |> Queue.insert("item1", :key1) 68 | |> Queue.insert("item2", :key2) 69 | 70 | assert Queue.get_by_key(queue, :key1) == "item1" 71 | assert Queue.get_by_key(queue, :key2) == "item2" 72 | end 73 | 74 | test "returns nil for un unknown index or key" do 75 | queue = Queue.new() 76 | assert Queue.get_by_key(queue, :nope) == nil 77 | end 78 | end 79 | 80 | describe "delete_by_key/2" do 81 | test "deletes an item by its secondary key" do 82 | queue = 83 | Queue.new() 84 | |> Queue.insert("item1", :key1) 85 | |> Queue.insert("item2", :key2) 86 | 87 | queue = Queue.delete_by_key(queue, :key1) 88 | assert Queue.get_by_key(queue, :key1) == nil 89 | assert Queue.get_by_key(queue, :key2) == "item2" 90 | assert Queue.size(queue) == 1 91 | assert queue.idx == 2 92 | queue = Queue.delete_by_key(queue, :key2) 93 | assert Queue.size(queue) == 0 94 | assert queue.idx == 0 95 | 96 | queue = Queue.insert(queue, "item3", :key3) 97 | assert Queue.get_by_key(queue, :key3) == "item3" 98 | assert Queue.size(queue) == 1 99 | assert queue.idx == 1 100 | end 101 | 102 | test "non-existent key" do 103 | queue = Queue.new() 104 | assert queue == Queue.delete_by_key(queue, :key1) 105 | end 106 | end 107 | end 108 | -------------------------------------------------------------------------------- /test/runner_test.exs: -------------------------------------------------------------------------------- 1 | defmodule FLAME.RunnerTest do 2 | use ExUnit.Case, async: false 3 | 4 | import Mox 5 | 6 | alias FLAME.{Runner, MockBackend} 7 | alias FLAME.Test.CodeSyncMock 8 | 9 | # Make sure mocks are verified when the test exits 10 | setup :set_mox_global 11 | setup :verify_on_exit! 
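  # Note: :set_mox_global is used because the Runner and Terminator processes
  # started by these tests (not the test process itself) invoke the MockBackend
  # callbacks; global mode lets expectations defined here be satisfied from any
  # process.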
12 | 13 | setup do 14 | term_sup = 15 | start_supervised!({DynamicSupervisor, name: __MODULE__.TermSup, strategy: :one_for_one}) 16 | 17 | {:ok, term_sup: term_sup} 18 | end 19 | 20 | @post_success %{ 21 | "id" => "app", 22 | "instance_id" => "iad-app", 23 | "private_ip" => node() |> to_string() |> String.split("@") |> Enum.at(-1) 24 | } 25 | 26 | defp remote_boot(state) do 27 | parent = FLAME.Parent.new(make_ref(), self(), MockBackend, "app-flame-1", "MY_HOST_IP") 28 | name = Module.concat(FLAME.TerminatorTest, to_string(System.unique_integer([:positive]))) 29 | opts = [name: name, parent: parent] 30 | spec = Supervisor.child_spec({FLAME.Terminator, opts}, restart: :temporary) 31 | {:ok, _sup_pid} = DynamicSupervisor.start_child(__MODULE__.TermSup, spec) 32 | 33 | case Process.whereis(name) do 34 | terminator_pid when is_pid(terminator_pid) -> {:ok, terminator_pid, state} 35 | end 36 | end 37 | 38 | def mock_successful_runner(executions, runner_opts \\ []) do 39 | test_pid = self() 40 | 41 | MockBackend 42 | |> expect(:init, fn _opts -> {:ok, :state} end) 43 | |> expect(:remote_boot, fn :state -> remote_boot(@post_success) end) 44 | |> expect(:handle_info, fn {_ref, {:remote_up, _pid}}, state -> {:noreply, state} end) 45 | |> expect(:remote_spawn_monitor, executions, fn @post_success = _state, func -> 46 | {:ok, spawn_monitor(func)} 47 | end) 48 | # we need to send and assert_receive to avoid the race of going down before mox verify 49 | |> expect(:system_shutdown, fn -> send(test_pid, :stopped) end) 50 | 51 | Runner.start_link( 52 | Keyword.merge( 53 | [backend: {MockBackend, image: "my-imag", app_name: "test", api_token: "secret"}], 54 | runner_opts 55 | ) 56 | ) 57 | end 58 | 59 | def wrap_exit(runner, func) do 60 | prev_trap = Process.flag(:trap_exit, true) 61 | Process.unlink(runner) 62 | ref = make_ref() 63 | 64 | ExUnit.CaptureLog.capture_log(fn -> 65 | error = 66 | try do 67 | func.() 68 | catch 69 | kind, reason -> {kind, reason} 70 | end 71 | 72 | send(self(), {ref, error}) 73 | end) 74 | 75 | receive do 76 | {^ref, error} -> 77 | Process.flag(:trap_exit, prev_trap) 78 | error 79 | end 80 | end 81 | 82 | setup_all do 83 | Mox.defmock(MockBackend, for: FLAME.Backend) 84 | Application.put_env(:flame, :backend, FLAME.MockBackend) 85 | :ok 86 | end 87 | 88 | test "backend success single_use" do 89 | test_pid = self() 90 | 91 | MockBackend 92 | |> expect(:init, fn _opts -> {:ok, :state} end) 93 | |> expect(:remote_boot, fn :state -> remote_boot(@post_success) end) 94 | |> expect(:handle_info, fn {_ref, {:remote_up, _pid}}, state -> {:noreply, state} end) 95 | |> expect(:remote_spawn_monitor, 2, fn @post_success = _state, func -> 96 | {:ok, spawn_monitor(func)} 97 | end) 98 | |> expect(:system_shutdown, fn -> send(test_pid, :stopped) end) 99 | 100 | {:ok, runner} = 101 | Runner.start_link( 102 | single_use: true, 103 | backend: {MockBackend, image: "my-imag", app_name: "test", api_token: "secret"} 104 | ) 105 | 106 | assert Runner.remote_boot(runner, nil) == :ok 107 | assert Runner.call(runner, self(), fn -> :works end) == {:works, []} 108 | assert_receive :stopped 109 | end 110 | 111 | test "backend success multi use" do 112 | {:ok, runner} = mock_successful_runner(4) 113 | 114 | assert Runner.remote_boot(runner, nil) == :ok 115 | assert Runner.remote_boot(runner, nil) == {:error, :already_booted} 116 | assert Runner.call(runner, self(), fn -> :works end) == {:works, []} 117 | refute_receive :stopped 118 | assert Runner.call(runner, self(), fn -> :still_works end) == {:still_works, 
[]} 119 | ref = Process.monitor(runner) 120 | assert Runner.shutdown(runner) == :ok 121 | assert_receive :stopped 122 | assert_receive {:DOWN, ^ref, :process, ^runner, :normal} 123 | end 124 | 125 | test "backend runner spawn connect failure" do 126 | MockBackend 127 | |> expect(:init, fn _opts -> {:ok, :state} end) 128 | |> expect(:remote_boot, fn :state -> {:error, :invalid_authentication} end) 129 | 130 | {:ok, runner} = 131 | Runner.start_link( 132 | backend: {MockBackend, image: "my-imag", app_name: "test", api_token: "secret"} 133 | ) 134 | 135 | assert {:exit, {{:shutdown, :invalid_authentication}, _}} = 136 | wrap_exit(runner, fn -> Runner.remote_boot(runner, nil) end) 137 | end 138 | 139 | test "backend runner boot failure" do 140 | MockBackend 141 | |> expect(:init, fn _opts -> {:ok, :state} end) 142 | |> expect(:remote_boot, fn :state -> {:error, :nxdomain} end) 143 | 144 | {:ok, runner} = 145 | Runner.start_link( 146 | backend: {MockBackend, image: "my-imag", app_name: "test", api_token: "secret"} 147 | ) 148 | 149 | assert {:exit, {{:shutdown, :nxdomain}, _}} = 150 | wrap_exit(runner, fn -> Runner.remote_boot(runner, nil) end) 151 | end 152 | 153 | describe "execution failure" do 154 | test "single use" do 155 | {:ok, runner} = mock_successful_runner(3, single_use: true) 156 | assert Runner.remote_boot(runner, nil) == :ok 157 | error = wrap_exit(runner, fn -> Runner.call(runner, self(), fn -> raise "boom" end) end) 158 | assert {:exit, {%RuntimeError{message: "boom"}, _}} = error 159 | assert_receive :stopped 160 | assert Runner.shutdown(runner) == :ok 161 | end 162 | 163 | test "multi use" do 164 | {:ok, runner} = mock_successful_runner(4) 165 | Process.monitor(runner) 166 | assert Runner.remote_boot(runner, nil) == :ok 167 | 168 | error = wrap_exit(runner, fn -> Runner.call(runner, self(), fn -> raise "boom" end) end) 169 | assert {:exit, {%RuntimeError{message: "boom"}, _}} = error 170 | refute_receive :stopped 171 | refute_receive {:DOWN, _ref, :process, ^runner, _} 172 | assert Runner.call(runner, self(), fn -> :works end) == {:works, []} 173 | assert Runner.shutdown(runner) == :ok 174 | end 175 | end 176 | 177 | describe "execution timeout" do 178 | test "single use" do 179 | timeout = 100 180 | {:ok, runner} = mock_successful_runner(3, timeout: timeout, single_use: true) 181 | 182 | Process.monitor(runner) 183 | assert Runner.remote_boot(runner, nil) == :ok 184 | 185 | error = 186 | wrap_exit(runner, fn -> 187 | Runner.call(runner, self(), fn -> Process.sleep(timeout * 2) end) 188 | end) 189 | 190 | assert error == {:exit, :timeout} 191 | 192 | assert_receive :stopped 193 | assert_receive {:DOWN, _ref, :process, _, :killed} 194 | assert Runner.shutdown(runner) == :ok 195 | end 196 | 197 | test "multi use" do 198 | timeout = 100 199 | {:ok, runner} = mock_successful_runner(4, timeout: timeout) 200 | 201 | Process.monitor(runner) 202 | assert Runner.remote_boot(runner, nil) == :ok 203 | 204 | error = 205 | wrap_exit(runner, fn -> 206 | Runner.call(runner, self(), fn -> Process.sleep(timeout * 2) end) 207 | end) 208 | 209 | assert error == {:exit, :timeout} 210 | 211 | refute_receive :stopped 212 | refute_receive {:DOWN, _ref, :process, ^runner, _} 213 | assert Runner.call(runner, self(), fn -> :works end, timeout: 1234) == {:works, []} 214 | assert Runner.shutdown(runner) == :ok 215 | end 216 | end 217 | 218 | describe "idle shutdown" do 219 | test "with time" do 220 | timeout = 500 221 | {:ok, runner} = mock_successful_runner(1, idle_shutdown_after: timeout) 222 | 223 | 
223 |       Process.unlink(runner)
224 |       Process.monitor(runner)
225 |       assert Runner.remote_boot(runner, nil) == :ok
226 | 
227 |       assert_receive :stopped, timeout * 2
228 |       assert_receive {:DOWN, _ref, :process, ^runner, _}
229 | 
230 |       {:ok, runner} = mock_successful_runner(2, idle_shutdown_after: timeout)
231 |       Process.unlink(runner)
232 |       Process.monitor(runner)
233 |       assert Runner.remote_boot(runner, nil) == :ok
234 |       assert Runner.call(runner, self(), fn -> :works end) == {:works, []}
235 |       assert_receive :stopped, timeout * 2
236 |       assert_receive {:DOWN, _ref, :process, ^runner, _}
237 |     end
238 | 
239 |     test "with timed check" do
240 |       agent = start_supervised!({Agent, fn -> false end})
241 |       timeout = 500
242 |       idle_after = {timeout, fn -> Agent.get(agent, & &1) end}
243 |       {:ok, runner} = mock_successful_runner(1, idle_shutdown_after: idle_after)
244 | 
245 |       Process.unlink(runner)
246 |       Process.monitor(runner)
247 |       assert Runner.remote_boot(runner, nil) == :ok
248 | 
249 |       refute_receive {:DOWN, _ref, :process, ^runner, _}, timeout * 2
250 |       Agent.update(agent, fn _ -> true end)
251 |       assert_receive :stopped, timeout * 2
252 |       assert_receive {:DOWN, _ref, :process, ^runner, _}
253 |     end
254 |   end
255 | 
256 |   describe "code_sync" do
257 |     test "copy_paths: true, copies the code paths and extracts on boot" do
258 |       mock = CodeSyncMock.new()
259 |       # the 4th invocation is the rpc to diff code paths
260 |       code_sync = FLAME.CodeSync.new(mock.opts)
261 |       stream = FLAME.CodeSync.package_to_stream(code_sync)
262 | 
263 |       {:ok, runner} = mock_successful_runner(4, code_sync: mock.opts)
264 | 
265 |       Process.monitor(runner)
266 |       assert Runner.remote_boot(runner, stream) == :ok
267 |       assert Runner.call(runner, self(), fn -> :works end, timeout: 1234) == {:works, []}
268 |       assert Runner.shutdown(runner) == :ok
269 |       # called on remote boot
270 |       assert_receive {CodeSyncMock, {_mock, :extract}}
271 | 
272 |       # called on :works call
273 |       assert_receive {CodeSyncMock, {_mock, :extract}}
274 |     end
275 | 
276 |     test "noops by default" do
277 |       {:ok, runner} = mock_successful_runner(3)
278 | 
279 |       Process.monitor(runner)
280 |       assert Runner.remote_boot(runner, nil) == :ok
281 |       assert Runner.call(runner, self(), fn -> :works end, timeout: 1234) == {:works, []}
282 |       assert Runner.shutdown(runner) == :ok
283 |       refute_receive {CodeSyncMock, _}
284 |     end
285 |   end
286 | end
287 | 
--------------------------------------------------------------------------------
/test/support/code_sync_mock.ex:
--------------------------------------------------------------------------------
1 | defmodule FLAME.Test.CodeSyncMock.Mod1 do
2 | end
3 | 
4 | defmodule FLAME.Test.CodeSyncMock.Mod1Modified do
5 | end
6 | 
7 | defmodule FLAME.Test.CodeSyncMock.Mod2 do
8 | end
9 | 
10 | defmodule FLAME.Test.CodeSyncMock.Mod3 do
11 | end
12 | 
13 | defmodule FLAME.Test.CodeSyncMock do
14 |   defstruct opts: nil, id: nil, extract_dir: nil
15 |   alias FLAME.Test.CodeSyncMock
16 | 
17 |   def new(opts \\ []) do
18 |     test_pid = self()
19 |     id = System.unique_integer([:positive])
20 |     tmp_dir = File.cwd!() |> Path.join("tmp") |> Path.expand()
21 |     working_dir = tmp_dir |> Path.join("#{id}") |> Path.expand()
22 |     File.rm_rf!(working_dir)
23 |     mod1_dir = Path.join([working_dir, "one", "ebin"])
24 |     mod2_dir = Path.join([working_dir, "two", "ebin"])
25 |     File.mkdir_p!(mod1_dir)
26 |     File.mkdir_p!(mod2_dir)
27 | 
28 |     File.write!(
29 |       Path.join(mod1_dir, "Elixir.FLAME.Test.CodeSyncMock.Mod1.beam"),
30 |       obj_code(FLAME.Test.CodeSyncMock.Mod1)
31 |     )
32 | 
33 |     File.write!(
34 |       Path.join(mod2_dir, "Elixir.FLAME.Test.CodeSyncMock.Mod2.beam"),
35 |       obj_code(FLAME.Test.CodeSyncMock.Mod2)
36 |     )
37 | 
38 |     extract_dir = Path.join([tmp_dir, "#{id}", "extracted_code"])
39 |     File.mkdir_p!(extract_dir)
40 | 
41 |     get_path =
42 |       fn ->
43 |         working_dir
44 |         |> Path.join("*/ebin")
45 |         |> Path.wildcard()
46 |       end
47 | 
48 |     default_opts = [
49 |       start_apps: true,
50 |       sync_beams: [working_dir],
51 |       tmp_dir: {Function, :identity, [tmp_dir]},
52 |       extract_dir: {__MODULE__, :extract_dir, [id, test_pid, extract_dir]},
53 |       get_path: get_path
54 |     ]
55 | 
56 |     %CodeSyncMock{id: id, opts: Keyword.merge(default_opts, opts), extract_dir: extract_dir}
57 |   end
58 | 
59 |   def extract_dir(id, test_pid, extract_dir) do
60 |     send(test_pid, {CodeSyncMock, {id, :extract}})
61 |     extract_dir
62 |   end
63 | 
64 |   def simulate_changes(%CodeSyncMock{id: id} = mock) do
65 |     # mod1 is modified
66 |     mod1_dir = Path.join([mfa(mock.opts[:tmp_dir]), "#{id}", "one", "ebin"])
67 |     mod2_dir = Path.join([mfa(mock.opts[:tmp_dir]), "#{id}", "two", "ebin"])
68 | 
69 |     File.write!(
70 |       Path.join(mod1_dir, "Elixir.FLAME.Test.CodeSyncMock.Mod1.beam"),
71 |       obj_code(FLAME.Test.CodeSyncMock.Mod1Modified)
72 |     )
73 | 
74 |     # mod2 is deleted
75 |     File.rm!(Path.join(mod2_dir, "Elixir.FLAME.Test.CodeSyncMock.Mod2.beam"))
76 | 
77 |     # mod3 is added
78 |     File.write!(
79 |       Path.join(mod1_dir, "Elixir.FLAME.Test.CodeSyncMock.Mod3.beam"),
80 |       obj_code(FLAME.Test.CodeSyncMock.Mod3)
81 |     )
82 | 
83 |     :ok
84 |   end
85 | 
86 |   defp mfa({mod, func, args}), do: apply(mod, func, args)
87 | 
88 |   def extracted_rel_paths(%CodeSyncMock{} = mock) do
89 |     extracted_beams = Path.wildcard(Path.join(mfa(mock.opts[:extract_dir]), "**/*.beam"))
90 | 
91 |     Enum.map(extracted_beams, fn path ->
92 |       path
93 |       |> Path.relative_to_cwd()
94 |       |> Path.relative_to(Path.join(["tmp", "#{mock.id}", "extracted_code", File.cwd!()]))
95 |       |> Path.relative_to(Path.join(["tmp", "#{mock.id}"]))
96 |     end)
97 |   end
98 | 
99 |   defp obj_code(mod) do
100 |     {^mod, beam_code, _path} = :code.get_object_code(mod)
101 |     beam_code
102 |   end
103 | end
104 | 
--------------------------------------------------------------------------------
/test/support/trackable.ex:
--------------------------------------------------------------------------------
1 | defmodule MyTrackable do
2 |   defstruct [:pid, :ref, :name]
3 | 
4 |   defimpl FLAME.Trackable do
5 |     def track(%{ref: ref, name: name} = data, acc, node) do
6 |       ^node = node(ref)
7 |       parent = self()
8 | 
9 |       {pid, monitor_ref} =
10 |         Node.spawn_monitor(node, fn ->
11 |           Process.register(self(), name)
12 |           send(parent, {ref, :started})
13 | 
14 |           receive do
15 |             {^ref, :stop} -> :ok
16 |           end
17 |         end)
18 | 
19 |       receive do
20 |         {^ref, :started} ->
21 |           Process.demonitor(monitor_ref)
22 |           {%{data | pid: pid}, [pid | acc]}
23 | 
24 |         {:DOWN, ^monitor_ref, _, _, reason} ->
25 |           exit(reason)
26 |       end
27 |     end
28 |   end
29 | end
30 | 
--------------------------------------------------------------------------------
/test/test_helper.exs:
--------------------------------------------------------------------------------
1 | ExUnit.start()
2 | 
--------------------------------------------------------------------------------