├── .formatter.exs ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── guides ├── examples │ ├── amazon-sqs.md │ ├── apache-kafka.md │ ├── custom-producers.md │ ├── google-cloud-pubsub.md │ ├── introduction.md │ └── rabbitmq.md └── internals │ └── architecture.md ├── lib ├── broadway.ex └── broadway │ ├── acknowledger.ex │ ├── application.ex │ ├── batch_info.ex │ ├── caller_acknowledger.ex │ ├── config_storage.ex │ ├── config_storage │ ├── ets.ex │ └── persistent_term.ex │ ├── dummy_producer.ex │ ├── message.ex │ ├── noop_acknowledger.ex │ ├── options.ex │ ├── producer.ex │ ├── topology.ex │ └── topology │ ├── batch_processor_stage.ex │ ├── batcher_stage.ex │ ├── processor_stage.ex │ ├── producer_stage.ex │ ├── rate_limiter.ex │ ├── subscriber.ex │ └── terminator.ex ├── mix.exs ├── mix.lock └── test ├── broadway ├── acknowledger_test.exs ├── config_storage_test.exs ├── dummy_producer_test.exs └── topology │ ├── batcher_stage_test.exs │ ├── processor_stage_test.exs │ └── producer_stage_test.exs ├── broadway_test.exs └── test_helper.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-20.04 12 | env: 13 | MIX_ENV: test 14 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | include: 19 | # Earliest-supported versions. 20 | - elixir: "1.7.4" 21 | otp: "21.3.8.17" 22 | 23 | # Latest versions. 24 | - elixir: "1.18" 25 | otp: "27.2" 26 | lint: lint 27 | coverage: coverage 28 | steps: 29 | - name: Check out this repository 30 | uses: actions/checkout@v4 31 | 32 | - name: Set up Erlang and Elixir 33 | uses: erlef/setup-beam@v1 34 | with: 35 | otp-version: ${{matrix.otp}} 36 | elixir-version: ${{matrix.elixir}} 37 | 38 | - name: Cache Mix dependencies 39 | uses: actions/cache@v3 40 | id: cache-deps 41 | with: 42 | path: | 43 | deps 44 | _build 45 | key: | 46 | mix-${{ runner.os }}-${{matrix.elixir}}-${{matrix.otp}}-${{ hashFiles('**/mix.lock') }} 47 | restore-keys: | 48 | mix-${{ runner.os }}-${{matrix.elixir}}-${{matrix.otp}}- 49 | 50 | - run: mix do deps.get --check-locked, deps.compile 51 | if: steps.cache-deps.outputs.cache-hit != 'true' 52 | 53 | - run: mix format --check-formatted 54 | if: ${{ matrix.lint }} 55 | 56 | - run: mix deps.unlock --check-unused 57 | if: ${{ matrix.lint }} 58 | 59 | - run: mix compile --warnings-as-errors 60 | if: ${{ matrix.lint }} 61 | 62 | - run: mix test 63 | if: ${{!matrix.coverage}} 64 | 65 | - run: mix coveralls.github 66 | if: ${{matrix.coverage}} 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 
11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 23 | broadway-*.tar 24 | 25 | .elixir_ls 26 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v1.2.1 (2025-02-12) 4 | 5 | * Run `setup` callback on Broadway startup 6 | 7 | ## v1.2.0 (2025-02-10) 8 | 9 | * Do not clean up persistent terms on shutdown 10 | * Add format_discarded callback 11 | * Allow different config storages 12 | 13 | ## v1.1.0 (2024-06-21) 14 | 15 | ### Bug fix 16 | 17 | * No longer set demand to `:accumulate` when draining, for compatibility with GenStage v1.2+. This means that any polling implementation must implement the `prepare_for_draining` callback and stop polling messages. You can check how [BroadwaySQS](https://github.com/dashbitco/broadway_sqs/commit/5b8f18a78e4760b5fcc839ad576be8c63345add0) tackles this problem as an example 18 | 19 | ### Enhancements 20 | 21 | * Log leaked trapped exits 22 | 23 | ## v1.0.7 (2023-04-22) 24 | 25 | ### Enhancements 26 | 27 | * Relax `nimble_options` dependency 28 | * Improve documentation and error messages 29 | 30 | ## v1.0.6 (2023-01-19) 31 | 32 | ### Bug fixes 33 | 34 | * Remove the assumption a cancelled timer has been delivered 35 | 36 | ## v1.0.5 (2022-11-06) 37 | 38 | ### Bug fixes 39 | 40 | * Fix NoopAcknowledger metadata name 41 | 42 | ## v1.0.4 (2022-11-05) 43 | 44 | ### Enhancements 45 | 46 | * Add `init` convenience function to acknowledgers 47 | * Allow `:reset` option when calling `update_rate_limiting`. When set to true, the rate limit counter and interval is immediately reset 48 | * Add the producer to the telemetry metadata 49 | * Support custom function in `:batch_size` for customized batch splitting logic 50 | 51 | ## v1.0.3 (2022-03-18) 52 | 53 | ### Bug fixes 54 | 55 | * Move the `process_name/2` callback to the `Broadway` behaviour where it belongs 56 | 57 | ## v1.0.2 (2022-01-12) 58 | 59 | ### Enhancements 60 | 61 | * Also allow `nimble_options ~> 0.4.0` 62 | 63 | ## v1.0.1 (2021-10-12) 64 | 65 | ### Bug fixes 66 | 67 | * Make sure `processors` and `batch_processors` demands are properly shuffled 68 | * Ensure proper messages metadata for telemetry events on `Broadway.Message.failed/2` 69 | 70 | ## v1.0.0 (2021-08-30) 71 | 72 | Broadway v1.0 requires Erlang/OTP 21.3+. 
73 | 74 | ### Backwards incompatible changes 75 | 76 | * Remove `Broadway.TermStorage` now that we have Broadway topology information on the producer init callback 77 | * Rename `:events` to `:messages` in batcher telemetry event 78 | * Remove `:time` from "stop" telemetry event measurements 79 | * Rename `:time` to `:system_time` in telemetry event measurements 80 | * Rename `[:broadway, :consumer, *]` to `[:broadway, :batch_processor, *]` in telemetry event 81 | 82 | ### Enhancements 83 | 84 | * Add `Broadway.Message.put_data/2` 85 | * Add `Broadway.stop/1` 86 | * Add `Broadway.all_running/0` 87 | * Add `Broadway.topology/1` 88 | * Add ack configuration to `Broadway.test_message/3` and `Broadway.test_batch/3` 89 | * Allow Broadway :via tuples as broadway names 90 | * Enrich telemetry events with metadata 91 | 92 | ## v0.6.2 (2020-08-17) 93 | 94 | * Make `Broadway.Producer` public 95 | * Add optional `prepare_messages` callback 96 | 97 | ## v0.6.1 (2020-06-02) 98 | 99 | * Rename `:failure` Telemetry event to `:exception` so it conforms to the telemetry specification 100 | * Deprecate `Broadway.test_messages/3` in favor of `Broadway.test_message/3` and `Broadway.test_batch/3` 101 | 102 | ## v0.6.0 (2020-02-13) 103 | 104 | * Deprecate `:stages` in favor of `:concurrency` for clarity 105 | * Do not validate `:batcher` if message failed 106 | * Add support for rate limiting producers 107 | * Support returning state in `c:Broadway.Producer.prepare_for_draining/1` 108 | * Emit telemetry events 109 | * Add Kafka guide 110 | 111 | ## v0.5.0 (2019-11-04) 112 | 113 | * Deprecate `:producers` in favor of a single `:producer` key 114 | * Add `Broadway.Message.configure_ack/3` 115 | * Add `Broadway.Message.ack_immediately/1` 116 | * Add `Broadway.producer_names/1` 117 | * Add the `c:Broadway.handle_failed/2` optional callback which is invoked with failed messages 118 | * Add `:crash_reason` to Logger reports metadata 119 | * Add `c:Broadway.Producer.prepare_for_start/2` optional callback which allows producers to customize the topology 120 | * Support `partition_by` in processors and batchers 121 | * Log if `handle_batch` returns less messages than expected 122 | 123 | ## v0.4.0 (2019-08-05) 124 | 125 | * Add `:batch_mode` to `Broadway.test_messages/3` to control how test messages are flushed 126 | * Add `Broadway.DummyProducer` for testing 127 | * Append .Broadway to module prefixes to avoid potential naming conflicts 128 | 129 | ## v0.3.0 (2019-04-26) 130 | 131 | * Add `metadata` field to the `Message` struct so clients can append extra information 132 | 133 | ## v0.2.0 (2019-04-04) 134 | 135 | * `Broadway.Message.put_partition/2` has been renamed to `Broadway.Message.put_batch_key/2` 136 | * Allow `Broadway.Producer` to `prepare_for_draining/1` 137 | * Allow pipelines without batchers 138 | 139 | ## v0.1.0 (2019-02-19) 140 | 141 | * Initial release 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Plataformatec 2 | Copyright 2020 Dashbit 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 
14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. 
Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 
137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | 181 | APPENDIX: How to apply the Apache License to your work. 182 | 183 | To apply the Apache License to your work, attach the following 184 | boilerplate notice, with the fields enclosed by brackets "[]" 185 | replaced with your own identifying information. (Don't include 186 | the brackets!) The text should be enclosed in the appropriate 187 | comment syntax for the file format. We also recommend that a 188 | file or class name and description of purpose be included on the 189 | same "printed page" as the copyright notice for easier 190 | identification within third-party archives. 191 | 192 | Copyright [yyyy] [name of copyright owner] 193 | 194 | Licensed under the Apache License, Version 2.0 (the "License"); 195 | you may not use this file except in compliance with the License. 
196 | You may obtain a copy of the License at 197 | 198 | http://www.apache.org/licenses/LICENSE-2.0 199 | 200 | Unless required by applicable law or agreed to in writing, software 201 | distributed under the License is distributed on an "AS IS" BASIS, 202 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 203 | See the License for the specific language governing permissions and 204 | limitations under the License. 205 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Broadway 2 | 3 | [![CI](https://github.com/dashbitco/broadway/actions/workflows/ci.yml/badge.svg)](https://github.com/dashbitco/broadway/actions/workflows/ci.yml) 4 | 5 | Build concurrent and multi-stage data ingestion and data processing pipelines with Elixir. Broadway allows developers to consume data efficiently from different sources, known as producers, such as Amazon SQS, Apache Kafka, Google Cloud PubSub, RabbitMQ, and others. Broadway pipelines are long-lived, concurrent, and robust, thanks to the Erlang VM and its actors. 6 | 7 | Broadway takes its name from the famous [Broadway street](https://en.wikipedia.org/wiki/Broadway_theatre) in New York City, renowned for its stages, actors, and producers. 8 | 9 | To learn more and get started, check out [our official website](https://elixir-broadway.org) and [our guides and docs](https://hexdocs.pm/broadway). 10 | 11 | ![Broadway Logo](https://user-images.githubusercontent.com/9582/117824616-ed298500-b26e-11eb-8ded-0fb7e608bf70.png) 12 | 13 | ## Built-in features 14 | 15 | Broadway takes the burden of defining concurrent GenStage topologies and provides a simple configuration API that automatically defines concurrent producers, concurrent processing, batch handling, and more, leading to both time and cost efficient ingestion and processing of data. It features: 16 | 17 | * Back-pressure 18 | * Automatic acknowledgements at the end of the pipeline 19 | * Batching 20 | * Fault tolerance 21 | * Graceful shutdown 22 | * Built-in testing 23 | * Custom failure handling 24 | * Ordering and partitioning 25 | * Rate-limiting 26 | * Metrics 27 | 28 | ### Producers 29 | 30 | There are several producers that you can use to integrate with existing services and technologies. [See the docs for detailed how-tos and supported producers](https://hexdocs.pm/broadway/introduction.html#official-producers). 
31 | 32 | ## Installation 33 | 34 | Add `:broadway` to the list of dependencies in `mix.exs`: 35 | 36 | ```elixir 37 | def deps do 38 | [ 39 | {:broadway, "~> 1.0"} 40 | ] 41 | end 42 | ``` 43 | 44 | ## A quick example: SQS integration 45 | 46 | Assuming you have added [`broadway_sqs`](https://github.com/dashbitco/broadway_sqs) as a dependency and configured your SQS credentials accordingly, you can consume Amazon SQS events in only 20 LOCs: 47 | 48 | ```elixir 49 | defmodule MyBroadway do 50 | use Broadway 51 | 52 | alias Broadway.Message 53 | 54 | def start_link(_opts) do 55 | Broadway.start_link(__MODULE__, 56 | name: __MODULE__, 57 | producer: [ 58 | module: {BroadwaySQS.Producer, queue_url: "https://us-east-2.queue.amazonaws.com/100000000001/my_queue"} 59 | ], 60 | processors: [ 61 | default: [concurrency: 50] 62 | ], 63 | batchers: [ 64 | s3: [concurrency: 5, batch_size: 10, batch_timeout: 1000] 65 | ] 66 | ) 67 | end 68 | 69 | def handle_message(_processor_name, message, _context) do 70 | message 71 | |> Message.update_data(&process_data/1) 72 | |> Message.put_batcher(:s3) 73 | end 74 | 75 | def handle_batch(:s3, messages, _batch_info, _context) do 76 | # Send batch of messages to S3 77 | end 78 | 79 | defp process_data(data) do 80 | # Do some calculations, generate a JSON representation, process images. 81 | end 82 | end 83 | ``` 84 | 85 | Once your Broadway module is defined, you just need to add it as a child of your application supervision tree as `{MyBroadway, []}`. 86 | 87 | ## Comparison to Flow 88 | 89 | You may also be interested in [Flow by Dashbit](https://github.com/dashbitco/flow). Both Broadway and Flow are built on top of GenStage. Flow is a more general abstraction than Broadway that focuses on data as a whole, providing features like aggregation, joins, windows, etc. Broadway focuses on events and on operational features, such as metrics, automatic acknowledgements, failure handling, and so on. Broadway is recommended for continuous, long-running pipelines. Flow works with short- and long-lived data processing. 90 | 91 | ## License 92 | 93 | Copyright 2019 Plataformatec\ 94 | Copyright 2020 Dashbit 95 | 96 | Licensed under the Apache License, Version 2.0 (the "License"); 97 | you may not use this file except in compliance with the License. 98 | You may obtain a copy of the License at 99 | 100 | http://www.apache.org/licenses/LICENSE-2.0 101 | 102 | Unless required by applicable law or agreed to in writing, software 103 | distributed under the License is distributed on an "AS IS" BASIS, 104 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 105 | See the License for the specific language governing permissions and 106 | limitations under the License. 107 | -------------------------------------------------------------------------------- /guides/examples/amazon-sqs.md: -------------------------------------------------------------------------------- 1 | # Amazon SQS 2 | 3 | Amazon Simple Queue Service (SQS) is a highly scalable distributed message 4 | queuing service provided by Amazon.com. AWS SQS offers two types of message 5 | queues: 6 | 7 | * Standard 8 | * Nearly unlimited throughput 9 | * Best-effort ordering 10 | * At-least-once delivery 11 | 12 | * FIFO 13 | * Limited number of transactions per second (TPS). 14 | See [Amazon SQS FIFO](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/FIFO-queues.html) 15 | developer guide for more information on limits. 
16 | * Order in which messages are sent/received is strictly preserved 17 | * Exactly-once delivery 18 | 19 | Broadway can work seamlessly with both Standard and FIFO queues. 20 | 21 | ## Getting Started 22 | 23 | In order to use Broadway with SQS, we need to: 24 | 25 | 1. Create a SQS queue (or use an existing one) 26 | 1. Configure our Elixir project to use Broadway 27 | 1. Define your pipeline configuration 28 | 1. Implement Broadway callbacks 29 | 1. Run the Broadway pipeline 30 | 1. Tune the configuration (Optional) 31 | 32 | ## Create a SQS queue 33 | 34 | Amazon provides a comprehensive [Step-by-step Guide](https://aws.amazon.com/getting-started/tutorials/send-messages-distributed-applications/) 35 | on creating SQS queues. In case you don't have an AWS account and want to 36 | test Broadway locally, you can easily install [ElasticMQ](https://github.com/softwaremill/elasticmq), 37 | which is a message queue system that offers a SQS-compatible query interface. 38 | 39 | ## Configure the project 40 | 41 | In this guide we're going to use [BroadwaySQS](https://github.com/dashbitco/broadway_sqs), 42 | which is a Broadway SQS Connector provided by [Dashbit](https://dashbit.co/). 43 | 44 | ### Starting a new project 45 | 46 | If you plan to start a new project, just run: 47 | 48 | $ mix new my_app --sup 49 | 50 | The `--sup` flag instructs Elixir to generate an application with a supervision tree. 51 | 52 | ### Setting up dependencies 53 | 54 | Add `:broadway_sqs` to the list of dependencies in `mix.exs` along with the HTTP 55 | client of your choice (defaults to `:hackney`): 56 | 57 | def deps do 58 | [ 59 | ... 60 | {:broadway_sqs, "~> 0.7"}, 61 | {:hackney, "~> 1.9"}, 62 | ] 63 | end 64 | 65 | Don't forget to check for the latest version of dependencies. 66 | 67 | ## Define the pipeline configuration 68 | 69 | Broadway is a process-based behaviour and to define a Broadway 70 | pipeline, we need to define three functions: `start_link/1`, 71 | `handle_message/3` and `handle_batch/4`. We will cover `start_link/1` 72 | in this section and the `handle_` callbacks in the next one. 73 | 74 | Similar to other process-based behaviours, `start_link/1` simply 75 | delegates to `Broadway.start_link/2`, which should define the 76 | producers, processors, and batchers in the Broadway pipeline. 77 | Assuming we want to consume messages from a queue called 78 | `my_queue`, the minimal configuration would be: 79 | 80 | defmodule MyBroadway do 81 | use Broadway 82 | 83 | alias Broadway.Message 84 | 85 | def start_link(_opts) do 86 | Broadway.start_link(__MODULE__, 87 | name: __MODULE__, 88 | producer: [ 89 | module: {BroadwaySQS.Producer, 90 | queue_url: "https://us-east-2.queue.amazonaws.com/100000000001/my_queue"} 91 | ], 92 | processors: [ 93 | default: [] 94 | ], 95 | batchers: [ 96 | default: [ 97 | batch_size: 10, 98 | batch_timeout: 2000 99 | ] 100 | ] 101 | ) 102 | end 103 | 104 | ...callbacks... 105 | end 106 | 107 | The above configuration also assumes that you have the AWS credentials 108 | set up in your environment, for instance, by having the `AWS_ACCESS_KEY_ID` 109 | and `AWS_SECRET_ACCESS_KEY` environment variables set. If that's 110 | not the case, you will need to pass that information to the client so it 111 | can properly connect to the AWS servers. Here is how you can do it: 112 | 113 | ...
114 | producer: [ 115 | module: 116 | {BroadwaySQS.Producer, 117 | queue_url: "https://us-east-2.queue.amazonaws.com/100000000001/my_queue", 118 | config: [ 119 | access_key_id: "YOUR_AWS_ACCESS_KEY_ID", 120 | secret_access_key: "YOUR_AWS_SECRET_ACCESS_KEY" 121 | ]} 122 | ] 123 | ... 124 | 125 | For a full list of options for `BroadwaySQS.Producer`, please see 126 | [BroadwaySQS](https://hexdocs.pm/broadway_sqs/) documentation. 127 | 128 | For general information about setting up Broadway, see `Broadway` 129 | module docs as well as `Broadway.start_link/2`. 130 | 131 | > Note: Even though batching is optional since Broadway v0.2, we recommend that all Amazon SQS 132 | > pipelines have at least a default batcher. This lets you control the exact batch 133 | > size and frequency that messages are acknowledged to Amazon SQS, often leading to 134 | > pipelines that are more cost and time efficient. 135 | 136 | ## Implement Broadway callbacks 137 | 138 | In order to process incoming messages, we need to implement the 139 | required callbacks. For the sake of simplicity, we're considering that 140 | all messages received from the queue are just numbers: 141 | 142 | defmodule MyBroadway do 143 | use Broadway 144 | 145 | alias Broadway.Message 146 | 147 | ...start_link... 148 | 149 | @impl true 150 | def handle_message(_, %Message{data: data} = message, _) do 151 | message 152 | |> Message.update_data(fn data -> data * data end) 153 | end 154 | 155 | @impl true 156 | def handle_batch(_, messages, _, _) do 157 | list = messages |> Enum.map(fn e -> e.data end) 158 | IO.inspect(list, label: "Got batch of finished jobs from processors, sending ACKs to SQS as a batch.") 159 | messages 160 | end 161 | end 162 | 163 | We are not doing anything fancy here, but it should be enough for our 164 | purpose. First we update the message's data individually inside 165 | `handle_message/3` and then we print each batch inside `handle_batch/4`. 166 | 167 | For more information, see `c:Broadway.handle_message/3` and 168 | `c:Broadway.handle_batch/4`. 169 | 170 | ## Run the Broadway pipeline 171 | 172 | To run your `Broadway` pipeline, you just need to add it as a child in 173 | a supervision tree. Most applications have a supervision tree defined 174 | at `lib/my_app/application.ex`. You can add Broadway as a child to a 175 | supervisor as follows: 176 | 177 | children = [ 178 | {MyBroadway, []} 179 | ] 180 | 181 | Supervisor.start_link(children, strategy: :one_for_one) 182 | 183 | Now the Broadway pipeline should be started when your application starts. 184 | Also, if your Broadway has any dependency (for example, it needs to talk 185 | to the database), make sure that Broadway is listed *after* its dependencies 186 | in the supervision tree. 187 | 188 | ## Tuning the configuration 189 | 190 | Some of the configuration options available for Broadway come already with a 191 | "reasonable" default value. However, those values might not suit your 192 | requirements. Depending on the number of messages you get, how much processing 193 | they need and how much IO work is going to take place, you might need completely 194 | different values to optimize the flow of your pipeline. The `concurrency` option 195 | available for every set of producers, processors and batchers, along with 196 | `max_demand`, `batch_size`, and `batch_timeout` can give you a great deal 197 | of flexibility. 198 | 199 | The `concurrency` option controls the concurrency level in each layer of 200 | the pipeline.
201 | See the notes on [`Producer concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-producer-concurrency) 202 | and [`Batcher concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-batcher-concurrency) 203 | for details. 204 | 205 | Here's an example on how you could tune them according to 206 | your needs. 207 | 208 | defmodule MyBroadway do 209 | use Broadway 210 | 211 | def start_link(_opts) do 212 | Broadway.start_link(__MODULE__, 213 | name: __MODULE__, 214 | producer: [ 215 | ... 216 | concurrency: 10, 217 | ], 218 | processors: [ 219 | default: [ 220 | concurrency: 100, 221 | max_demand: 1, 222 | ] 223 | ], 224 | batchers: [ 225 | default: [ 226 | batch_size: 10, 227 | concurrency: 10, 228 | ] 229 | ] 230 | ) 231 | end 232 | 233 | ...callbacks... 234 | end 235 | 236 | In order to get a good set of configurations for your pipeline, it's 237 | important to respect the limitations of the servers you're running, 238 | as well as the limitations of the services you're providing/consuming 239 | data to/from. Broadway comes with telemetry, so you can measure your 240 | pipeline and help ensure your changes are effective. 241 | -------------------------------------------------------------------------------- /guides/examples/apache-kafka.md: -------------------------------------------------------------------------------- 1 | # Apache Kafka 2 | 3 | Kafka is a distributed streaming platform that has three key capabilities: 4 | 5 | * Publish and subscribe to streams of records 6 | * Store streams of records in a fault-tolerant durable way 7 | * Process streams of records as they occur 8 | 9 | ## Getting Started 10 | 11 | In order to use Broadway with Kafka, we need to: 12 | 13 | 1. Create a stream of records (or use an existing one) 14 | 1. Configure your Elixir project to use Broadway 15 | 1. Define your pipeline configuration 16 | 1. Implement Broadway callbacks 17 | 1. Run the Broadway pipeline 18 | 19 | ## Create a stream of records (or use an existing one) 20 | 21 | In case you don't have Kafka installed yet, please follow the instructions on Kafka's 22 | [Quickstart](https://kafka.apache.org/quickstart) for a clean installation. After 23 | initializing Kafka, you can create a new stream by running: 24 | 25 | $ kafka-topics --create --zookeeper localhost:2181 --partitions 3 --topic test 26 | 27 | ## Configure your Elixir project to use Broadway 28 | 29 | This guide describes the steps necessary to integrate Broadway with Kafka using 30 | [BroadwayKafka](https://github.com/dashbitco/broadway_kafka), 31 | which is a Broadway Kafka Connector provided by [Dashbit](https://dashbit.co/). 32 | 33 | BroadwayKafka can subscribe to one or more topics and process streams of records 34 | using Kafka's [Consumer API](https://kafka.apache.org/documentation.html#consumerapi). 35 | 36 | Each GenStage producer initialized by BroadwayKafka will be available as a consumer, 37 | all registered using the same self-labeled **consumer group**. Each record published to a 38 | topic/partition will be delivered to one consumer instance within each consumer group. 39 | 40 | Bear in mind that a topic/partition can be assigned to any consumer instance that has 41 | been subscribed using the same consumer group, i.e, any Broadway instance or application 42 | running on any machine connected to the Kafka cluster. 
43 | 44 | ### Starting a new project 45 | 46 | Create a new project running: 47 | 48 | $ mix new my_app --sup 49 | 50 | The `--sup` flag instructs Elixir to generate an application with a supervision tree. 51 | 52 | ### Setting up dependencies 53 | 54 | Add `:broadway_kafka` to the list of dependencies in `mix.exs`: 55 | 56 | def deps do 57 | [ 58 | ... 59 | {:broadway_kafka, "~> 0.3"} 60 | ] 61 | end 62 | 63 | Don't forget to check for the latest version of dependencies. 64 | 65 | ## Define the pipeline configuration 66 | 67 | Broadway is a process-based behaviour and to define a Broadway pipeline, 68 | we need to define three functions: `start_link/1`, `handle_message/3` 69 | and optionally `handle_batch/4`. We will cover `start_link/1` in this 70 | section and the `handle_` callbacks in the next one. 71 | 72 | Similar to other process-based behaviours, `start_link/1` simply 73 | delegates to `Broadway.start_link/2`, which should define the 74 | producers, processors, and batchers in the Broadway pipeline. 75 | Assuming we want to consume messages from a topic called 76 | `test`, one possible configuration would be: 77 | 78 | defmodule MyBroadway do 79 | use Broadway 80 | 81 | alias Broadway.Message 82 | 83 | def start_link(_opts) do 84 | Broadway.start_link(__MODULE__, 85 | name: __MODULE__, 86 | producer: [ 87 | module: 88 | {BroadwayKafka.Producer, 89 | [ 90 | hosts: [localhost: 9092], 91 | group_id: "group_1", 92 | topics: ["test"] 93 | ]}, 94 | concurrency: 1 95 | ], 96 | processors: [ 97 | default: [ 98 | concurrency: 10 99 | ] 100 | ], 101 | batchers: [ 102 | default: [ 103 | batch_size: 100, 104 | batch_timeout: 200, 105 | concurrency: 10 106 | ] 107 | ] 108 | ) 109 | end 110 | 111 | ...callbacks... 112 | end 113 | 114 | > **Note**: Pipelines built on top of BroadwayKafka are automatically partitioned. 115 | So even though there are multiple processes (stages), these processes will preserve 116 | Kafka's ordering semantics when it comes to topics/partitions. Internally, this is 117 | achieved by making sure all messages from the same topic/partition will always be 118 | forwarded to the same processor and batch processor. 119 | 120 | For a full list of options for `BroadwayKafka.Producer`, refer to the 121 | official [BroadwayKafka](https://hexdocs.pm/broadway_kafka/) documentation. 122 | 123 | For general information about setting up Broadway, see `Broadway` 124 | module docs as well as `Broadway.start_link/2`. 125 | 126 | ## Implement Broadway callbacks 127 | 128 | In order to process incoming messages, we need to implement the 129 | required callbacks. For the sake of simplicity, we're considering that 130 | all messages received from the topic are just numbers: 131 | 132 | defmodule MyBroadway do 133 | use Broadway 134 | 135 | alias Broadway.Message 136 | 137 | ...start_link... 138 | 139 | @impl true 140 | def handle_message(_, message, _) do 141 | message 142 | |> Message.update_data(fn data -> {data, String.to_integer(data) * 2} end) 143 | end 144 | 145 | @impl true 146 | def handle_batch(_, messages, _, _) do 147 | list = messages |> Enum.map(fn e -> e.data end) 148 | IO.inspect(list, label: "Got batch") 149 | messages 150 | end 151 | end 152 | 153 | We are not doing anything fancy here, but it should be enough for our 154 | purpose. First, we update the message's data individually inside 155 | `handle_message/3` and then we print each batch inside `handle_batch/4`. 156 | 157 | For more information, see `c:Broadway.handle_message/3` and 158 | `c:Broadway.handle_batch/4`. 
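If some records may not be parseable, a variation of the `handle_message/3` callback above (not part of the original example, shown here only as a hedged sketch) could mark them as failed instead of letting `String.to_integer/1` raise:

    @impl true
    def handle_message(_, message, _) do
      case Integer.parse(message.data) do
        {number, ""} ->
          Message.update_data(message, fn data -> {data, number * 2} end)

        _ ->
          # Illustrative assumption: flag unparseable records so they can be
          # inspected in an optional handle_failed/2 callback.
          Message.failed(message, "expected an integer payload")
      end
    end

Failed messages are still acknowledged by `broadway_kafka`, as discussed in the "Handling failed messages" section later in this guide.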
159 | 160 | > Note: Since Broadway v0.2, batching is optional. In case you don't need to 161 | > group messages as batches for further processing/publishing, you can remove 162 | > the `:batchers` configuration along with the `handle_batch/4` callback. 163 | 164 | ## Run the Broadway pipeline 165 | 166 | To run your `Broadway` pipeline, you just need to add it as a child in 167 | a supervision tree. Most applications have a supervision tree defined 168 | at `lib/my_app/application.ex`. You can add Broadway as a child to a 169 | supervisor as follows: 170 | 171 | children = [ 172 | {MyBroadway, []} 173 | ] 174 | 175 | Supervisor.start_link(children, strategy: :one_for_one) 176 | 177 | Now the Broadway pipeline should be started when your application starts. 178 | Also, if your Broadway has any dependency (for example, it needs to talk 179 | to the database), make sure that Broadway is listed *after* its dependencies 180 | in the supervision tree. 181 | 182 | You can now test your pipeline by entering an `iex` session: 183 | 184 | $ iex -S mix 185 | 186 | If everything went fine, you should see lots of `info` log messages like this 187 | one coming from the `:brod` supervisors: 188 | 189 | 15:14:04.356 [info] [supervisor: {:local, :brod_sup}, started: [pid: #PID<0.251.0>, id: :test_client, mfargs: {:brod_client, :start_link, [[localhost: 9092], :test_client, []]}, restart_type: {:permanent, 10}, shutdown: 5000, child_type: :worker]] 190 | 191 | [Brod](https://github.com/klarna/brod/) is the client that BroadwayKafka uses 192 | under the hood to communicate with Kafka. 193 | 194 | ### Sending messages to Kafka 195 | 196 | Finally, we can send some sample messages to Kafka using `:brod` with the following snippet: 197 | 198 | topic = "test" 199 | client_id = :my_client 200 | hosts = [localhost: 9092] 201 | 202 | :ok = :brod.start_client(hosts, client_id, _client_config=[]) 203 | :ok = :brod.start_producer(client_id, topic, _producer_config = []) 204 | 205 | Enum.each(1..1000, fn i -> 206 | partition = rem(i, 3) 207 | :ok = :brod.produce_sync(client_id, topic, partition, _key="", "#{i}") 208 | end) 209 | 210 | You should see the output showing the generated batches: 211 | 212 | Got batch: [ 213 | {"2", 4}, 214 | {"5", 10}, 215 | {"8", 16}, 216 | {"11", 22}, 217 | {"14", 28}, 218 | ... 219 | ] 220 | Got batch: [ 221 | {"3", 6}, 222 | {"6", 12}, 223 | {"9", 18}, 224 | {"12", 24}, 225 | {"15", 30}, 226 | ... 227 | ] 228 | 229 | ## Tuning the configuration 230 | 231 | Some of the configuration options available for Broadway come already with a 232 | "reasonable" default value. However, those values might not suit your 233 | requirements. Depending on the number of records you get, how much processing 234 | they need and how much IO work is going to take place, you might need completely 235 | different values to optimize the flow of your pipeline. The `concurrency` option 236 | available for every set of producers, processors and batchers, along with 237 | `batch_size` and `batch_timeout` can give you a great deal of flexibility. 238 | See the notes on [`Producer concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-producer-concurrency) 239 | and [`Batcher concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-batcher-concurrency) 240 | for details. 241 | 242 | By setting the `concurrency` option, you define the number of concurrent processes 243 | that will be started by Broadway, allowing you to have full control over the 244 | concurrency level in each layer of the pipeline.
Keep in mind that since the 245 | concurrency model provided by **Kafka** is based on **partitioning**, in order to take 246 | full advantage of this model, you need to set the `concurrency` option for 247 | your processors and batchers accordingly. Having less concurrency than topic/partitions 248 | assigned will result in individual processors handling more than one partition, 249 | decreasing the overall level of concurrency. Therefore, if you want to always be able 250 | to process messages at maximum concurrency (assuming you have enough resources to do it), 251 | you should increase the concurrency up front to make sure you have enough processors to 252 | handle the extra records received from new partitions assigned. 253 | 254 | > **Note**: Even if you don't plan to add more partitions to a Kafka topic, your pipeline 255 | can still receive more assignments than planned. For instance, if another consumer crashes, 256 | the server will reassign all its topics/partitions to other available consumers, including 257 | any Broadway producer subscribed to the same topic. 258 | 259 | There are other options that you may want to take a closer look at when tuning your configuration. 260 | The `:max_bytes` option, for instance, belongs to the `:fetch_config` group and defines the 261 | maximum amount of data to be fetched at a time from a single partition. The default is 262 | 1048576 (1 MiB). Setting greater values can improve throughput at the cost of more 263 | memory consumption. For more information and other fetch options, please refer to the 264 | "Fetch config options" in the official [BroadwayKafka](https://hexdocs.pm/broadway_kafka/) 265 | documentation. 266 | 267 | Two other important options are `:offset_commit_interval_seconds` and `:offset_commit_on_ack`. 268 | Both belong to the main configuration and they can make a huge impact on performance. 269 | 270 | The `:offset_commit_interval_seconds` defines the time interval between two 271 | OffsetCommitRequest messages. The default is 5s. 272 | 273 | The `:offset_commit_on_ack`, when set to `true`, tells Broadway to send an 274 | OffsetCommitRequest immediately after each acknowledgement, bypassing any 275 | interval defined in `:offset_commit_interval_seconds`. Setting this option to 276 | `false` can increase performance since any commit requests will start respecting 277 | the `:offset_commit_interval_seconds` option. This will usually result in fewer 278 | requests to be sent to the server. However, setting long commit intervals might 279 | lead to a large number of duplicated records to be processed after a server 280 | restart or connection loss. Since it is always possible that duplicate messages 281 | will be received by consumers, make sure your logic is idempotent when consuming 282 | records to avoid inconsistencies. Also, bear in mind that the negative 283 | performance impact might be insignificant if you're using batchers since only 284 | one commit request will be performed per batch. As a basic rule, always take 285 | into account the values of `batch_size` and `batch_timeout` whenever you're 286 | tuning `:offset_commit_interval_seconds` and `:offset_commit_on_ack`. 287 | 288 | ## Handling failed messages 289 | 290 | `broadway_kafka` never stops the flow of the stream, i.e. it will **always ack** the messages 291 | even when they fail. Unlike queue-based connectors, where you can mark a single message as failed, 292 | in Kafka that's not possible due to its single offset per topic/partition ack strategy.
If you 293 | want to reprocess failed messages, you need to roll your own strategy. A possible way to do that 294 | is to implement `handle_failed/2` and send failed messages to a separated stream or queue for 295 | later processing. 296 | -------------------------------------------------------------------------------- /guides/examples/custom-producers.md: -------------------------------------------------------------------------------- 1 | # Custom Producers 2 | 3 | If you want to use Broadway but there is no existing Broadway producer 4 | for the technology of your choice, you can integrate any existing GenStage 5 | producer into the pipeline with relative ease. 6 | 7 | ## Example 8 | 9 | In general, producers must generate `%Broadway.Message{}` structs in order 10 | to be processed by Broadway. In case you need to use an existing GenStage 11 | producer and you don't want to change its original implementation, 12 | you'll have to set the producer's `:transformer` option to translate the 13 | generated events into Broadway messages. 14 | 15 | In the following example the producer is a regular GenStage, i.e., it 16 | produces plain events that cannot be processed by Broadway directly: 17 | 18 | defmodule Counter do 19 | use GenStage 20 | 21 | def start_link(number) do 22 | GenStage.start_link(Counter, number) 23 | end 24 | 25 | def init(counter) do 26 | {:producer, counter} 27 | end 28 | 29 | def handle_demand(demand, counter) when demand > 0 do 30 | events = Enum.to_list(counter..counter+demand-1) 31 | {:noreply, events, counter + demand} 32 | end 33 | end 34 | 35 | By using a transformer, you can tell Broadway to transform all events 36 | generated by the producer into proper Broadway messages: 37 | 38 | defmodule MyBroadway do 39 | use Broadway 40 | 41 | @behaviour Broadway.Acknowledger 42 | 43 | alias Broadway.Message 44 | 45 | def start_link(_opts) do 46 | Broadway.start_link(__MODULE__, 47 | name: __MODULE__, 48 | producer: [ 49 | module: {Counter, 1}, 50 | transformer: {__MODULE__, :transform, []} 51 | ], 52 | processors: [ 53 | default: [concurrency: 10] 54 | ], 55 | batchers: [ 56 | default: [concurrency: 2, batch_size: 5], 57 | ] 58 | ) 59 | end 60 | 61 | ...callbacks... 62 | 63 | def transform(event, _opts) do 64 | %Message{ 65 | data: event, 66 | acknowledger: {__MODULE__, :ack_id, :ack_data} 67 | } 68 | end 69 | 70 | @impl Broadway.Acknowledger 71 | def ack(:ack_id, successful, failed) do 72 | # Write ack code here 73 | :ok 74 | end 75 | end 76 | 77 | Notice that you need to pass two options to the producer: 78 | 79 | * `:module` - a tuple representing the GenStage producer as `{mod, arg}`. 80 | Where `mod` is module that implements the GenStage behaviour and `arg` 81 | the argument that will be given to the `init` callback of the GenStage. 82 | It is very important to note that Broadway **will not call** the 83 | `child_spec/1` or `start_link/1` function of the producer. That's 84 | because Broadway wraps the producer to augment it with extra features. 85 | 86 | * `:transformer` - a module-function-args tuple that will be invoked 87 | inside the producer, for every producer message, that should create 88 | a `Broadway.Message` struct with the `data` and `acknowledger` fields. 89 | 90 | See the `Broadway.Acknowledger` module for more information on defining 91 | and setting up acknowledgements. 
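The `...callbacks...` placeholder in the example above stands for the usual `handle_message/3` and `handle_batch/4` implementations. A minimal sketch, assuming we simply want to double each counter value (the doubling is purely illustrative), might be:

    @impl true
    def handle_message(_processor, message, _context) do
      # Illustrative transformation: double the number emitted by Counter.
      Message.update_data(message, fn data -> data * 2 end)
    end

    @impl true
    def handle_batch(_batcher, messages, _batch_info, _context) do
      # Returning the messages marks them as successful, which eventually
      # invokes the ack/3 callback defined in this module.
      messages
    end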
92 | -------------------------------------------------------------------------------- /guides/examples/google-cloud-pubsub.md: -------------------------------------------------------------------------------- 1 | # Google Cloud Pub/Sub 2 | 3 | Cloud Pub/Sub is a fully-managed real-time messaging service provided by Google. 4 | 5 | ## Getting Started 6 | 7 | In order to use Broadway with Cloud Pub/Sub you need to: 8 | 9 | 1. Setup a Cloud Pub/Sub project 10 | 1. Configure your Elixir project to use Broadway 11 | 1. Define your pipeline configuration 12 | 1. Implement Broadway callbacks 13 | 1. Run the Broadway pipeline 14 | 1. Tune the configuration (Optional) 15 | 16 | If you are just getting familiar with Google Pub/Sub, refer to [the documentation](https://cloud.google.com/pubsub/docs/) 17 | to get started. Instead of testing against a live environment, you may also consider using the 18 | [emulator](https://cloud.google.com/pubsub/docs/emulator) to simulate integrating with Cloud 19 | Pub/Sub. 20 | 21 | If you have an existing project, topic, subscription, and credentials, you can skip [step 22 | 1](#setup-cloud-pub-sub-project) and jump to [Configure the project](#configure-the-project) 23 | section. 24 | 25 | ## Setup Cloud Pub/Sub project 26 | 27 | In this tutorial we'll use the [`gcloud`](https://cloud.google.com/sdk/gcloud/) command-line tool 28 | to set everything up in Google Cloud. Alternatively, you can roughly follow this guide by using 29 | [Cloud Console](https://console.cloud.google.com). 30 | 31 | To install `gcloud` follow the [documentation](https://cloud.google.com/sdk/gcloud/). If you are 32 | on macOS you may consider installing it with Homebrew: 33 | 34 | $ brew install --cask google-cloud-sdk 35 | 36 | Now, authenticate the CLI: 37 | 38 | $ gcloud auth login 39 | 40 | Then, create a new project: 41 | 42 | $ gcloud projects create test-pubsub 43 | 44 | A new topic: 45 | 46 | $ gcloud pubsub topics create test-topic --project test-pubsub 47 | Created topic [projects/test-pubsub/topics/test-topic]. 48 | 49 | > Note: If you run this command immediately after creating a new Google Cloud project, you may receive an error indicating that your project's organization policy is still being provisioned. Just wait a couple minutes and try again. 50 | 51 | And a new subscription: 52 | 53 | $ gcloud pubsub subscriptions create test-subscription --project test-pubsub --topic test-topic 54 | Created subscription [projects/test-pubsub/subscriptions/test-subscription]. 55 | 56 | We also need a [service account](https://cloud.google.com/iam/docs/service-accounts), an IAM 57 | policy, as well as API credentials in order to programmatically work with the service. First, let's 58 | create the service account: 59 | 60 | $ gcloud iam service-accounts create test-account --project test-pubsub 61 | Created service account [test-account]. 62 | 63 | Then the policy. For simplicity we add the general role `roles/editor`, but make sure to 64 | examine the [available roles](https://cloud.google.com/iam/docs/understanding-roles#pubsub-roles) 65 | and choose the one that best suits your use case: 66 | 67 | $ gcloud projects add-iam-policy-binding test-pubsub \ 68 | --member serviceAccount:test-account@test-pubsub.iam.gserviceaccount.com \ 69 | --role roles/editor 70 | Updated IAM policy for project [test-pubsub]. 71 | (...) 
72 | 73 | And now the credentials: 74 | 75 | $ gcloud iam service-accounts keys create credentials.json --iam-account=test-account@test-pubsub.iam.gserviceaccount.com 76 | created key [xxx] of type [json] as [key] for [test-account@test-pubsub.iam.gserviceaccount.com] 77 | 78 | This command generated a `credentials.json` file which will be useful later. Note, the IAM account 79 | pattern is `<service-account-name>@<project-id>.iam.gserviceaccount.com`. Run `gcloud iam service-accounts list --project test-pubsub` 80 | to see all service accounts associated with the given project. 81 | 82 | Finally, we need to enable Pub/Sub for our project: 83 | 84 | $ gcloud services enable pubsub --project test-pubsub 85 | Operation "operations/xxx" finished successfully. 86 | 87 | ## Configure the project 88 | 89 | In this guide we're going to use [BroadwayCloudPubSub](https://github.com/dashbitco/broadway_cloud_pub_sub), 90 | which is a Broadway Cloud Pub/Sub Connector provided by [Dashbit](https://dashbit.co/). 91 | 92 | ### Starting a new project 93 | 94 | If you plan to start a new project, just run: 95 | 96 | $ mix new my_app --sup 97 | 98 | The `--sup` flag instructs Elixir to generate an application with a supervision tree. 99 | 100 | ### Setting up dependencies 101 | 102 | Add `:broadway_cloud_pub_sub` to the list of dependencies in `mix.exs`, along with the Google 103 | Cloud authentication library of your choice (defaults to `:goth`): 104 | 105 | defp deps() do 106 | [ 107 | ... 108 | {:broadway_cloud_pub_sub, "~> 0.7"}, 109 | {:goth, "~> 1.0"} 110 | ] 111 | end 112 | 113 | Don't forget to check for the latest version of dependencies. 114 | 115 | ## Define the pipeline configuration 116 | 117 | Broadway is a process-based behaviour and to define a Broadway pipeline, we need to define three 118 | functions: `start_link/1`, `handle_message/3` and `handle_batch/4`. We will cover `start_link/1` 119 | in this section and the `handle_` callbacks in the next one. 120 | 121 | Similar to other process-based behaviours, `start_link/1` simply delegates to 122 | `Broadway.start_link/2`, which should define the producers, processors, and batchers in the 123 | Broadway pipeline. Assuming we want to consume messages from the `test-subscription`, the minimal 124 | configuration would be: 125 | 126 | defmodule MyBroadway do 127 | use Broadway 128 | 129 | alias Broadway.Message 130 | 131 | def start_link(_opts) do 132 | Broadway.start_link(__MODULE__, 133 | name: __MODULE__, 134 | producer: [ 135 | module: 136 | {BroadwayCloudPubSub.Producer, 137 | subscription: "projects/test-pubsub/subscriptions/test-subscription"} 138 | ], 139 | processors: [ 140 | default: [] 141 | ], 142 | batchers: [ 143 | default: [ 144 | batch_size: 10, 145 | batch_timeout: 2_000 146 | ] 147 | ] 148 | ) 149 | end 150 | 151 | ...callbacks... 152 | end 153 | 154 | For a full list of options for `BroadwayCloudPubSub.Producer`, please see [the 155 | documentation](https://hexdocs.pm/broadway_cloud_pub_sub). 156 | 157 | For general information about setting up Broadway, see `Broadway` module docs as well as 158 | `Broadway.start_link/2`. 159 | 160 | > Note: Even though batching is optional since Broadway v0.2, we recommend that all Cloud Pub/Sub 161 | > pipelines have at least a default batcher, as that allows you to control the exact batch 162 | > size and frequency that messages are acknowledged to Cloud Pub/Sub, which often leads to 163 | > pipelines that are more cost and time efficient.
164 | 165 | ## Implement Broadway callbacks 166 | 167 | In order to process incoming messages, we need to implement the required callbacks. For the sake 168 | of simplicity, we're considering that all messages received from the queue are strings and our 169 | processor calls `String.upcase/1` on them: 170 | 171 | defmodule MyBroadway do 172 | use Broadway 173 | 174 | alias Broadway.Message 175 | 176 | ...start_link... 177 | 178 | def handle_message(_, %Message{data: data} = message, _) do 179 | message 180 | |> Message.update_data(fn data -> String.upcase(data) end) 181 | end 182 | 183 | def handle_batch(_, messages, _, _) do 184 | list = messages |> Enum.map(fn e -> e.data end) 185 | IO.inspect(list, label: "Got batch of finished jobs from processors, sending ACKs to Pub/Sub as a batch.") 186 | messages 187 | end 188 | end 189 | 190 | We are not doing anything fancy here, but it should be enough for our purpose. First we update the 191 | message's data individually inside `handle_message/3` and then we print each batch inside 192 | `handle_batch/4`. 193 | 194 | For more information, see `c:Broadway.handle_message/3` and `c:Broadway.handle_batch/4`. 195 | 196 | ## Run the Broadway pipeline 197 | 198 | To run your `Broadway` pipeline, you need to add it as a child in a supervision tree. Most 199 | applications have a supervision tree defined at `lib/my_app/application.ex`. You can add Broadway 200 | as a child to a supervisor as follows: 201 | 202 | children = [ 203 | {MyBroadway, []} 204 | ] 205 | 206 | Supervisor.start_link(children, strategy: :one_for_one) 207 | 208 | The final step is to configure credentials. You can set the following environment variable: 209 | 210 | export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json 211 | 212 | See [Goth](https://github.com/peburrows/goth) documentation for alternative ways of authenticating 213 | with the API. 214 | 215 | Now the Broadway pipeline should be started when your application starts. Also, if your Broadway 216 | pipeline has any dependency (for example, it needs to talk to the database), make sure that 217 | it is listed *after* its dependencies in the supervision tree. 218 | 219 | If you followed the previous section about setting the project with `gcloud`, you can now test the 220 | the pipeline. In one terminal tab start the application: 221 | 222 | $ iex -S mix 223 | 224 | And in another tab, send a couple of test messages to Pub/Sub: 225 | 226 | $ gcloud pubsub topics publish projects/test-pubsub/topics/test-topic --message "test 1" 227 | messageIds: 228 | - '651428033718119' 229 | 230 | gcloud pubsub topics publish projects/test-pubsub/topics/test-topic --message "test 2" 231 | messageIds: 232 | - '651427034966696' 233 | 234 | Now, In the first tab, you should see output similar to: 235 | 236 | ``` 237 | Got batch of finished jobs from processors, sending ACKs to Pub/Sub as a batch.: ["TEST 1", "TEST 2"] 238 | ``` 239 | 240 | ## Tuning the configuration 241 | 242 | Some of the configuration options available for Broadway come already with a 243 | "reasonable" default value. However those values might not suit your 244 | requirements. Depending on the number of messages you get, how much processing 245 | they need and how much IO work is going to take place, you might need completely 246 | different values to optimize the flow of your pipeline. 
The `concurrency` option 247 | available for every set of producers, processors and batchers, among with 248 | `max_demand`, `batch_size`, and `batch_timeout` can give you a great deal 249 | of flexibility. 250 | 251 | The `concurrency` option controls the concurrency level in each layer of 252 | the pipeline. 253 | See the notes on [`Producer concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-producer-concurrency) 254 | and [`Batcher concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-batcher-concurrency) 255 | for details. 256 | 257 | Here's an example on how you could tune them according to 258 | your needs. 259 | 260 | defmodule MyBroadway do 261 | use Broadway 262 | 263 | def start_link(_opts) do 264 | Broadway.start_link(__MODULE__, 265 | name: __MODULE__, 266 | producer: [ 267 | ... 268 | concurrency: 10, 269 | ], 270 | processors: [ 271 | default: [ 272 | concurrency: 100, 273 | max_demand: 1, 274 | ] 275 | ], 276 | batchers: [ 277 | default: [ 278 | batch_size: 10, 279 | concurrency: 10, 280 | ] 281 | ] 282 | ) 283 | end 284 | 285 | ...callbacks... 286 | end 287 | 288 | In order to get a good set of configurations for your pipeline, it's 289 | important to respect the limitations of the servers you're running, 290 | as well as the limitations of the services you're providing/consuming 291 | data to/from. Broadway comes with telemetry, so you can measure your 292 | pipeline and help ensure your changes are effective. 293 | -------------------------------------------------------------------------------- /guides/examples/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `Broadway` is a library for building concurrent and multi-stage data ingestion and data processing pipelines with Elixir. Broadway pipelines are concurrent and robust, thanks to the Erlang VM and its actors. It features: 4 | 5 | * Back-pressure 6 | * Automatic acknowledgements at the end of the pipeline 7 | * Batching 8 | * Fault tolerance 9 | * Graceful shutdown 10 | * Built-in testing 11 | * Custom failure handling 12 | * Ordering and partitioning 13 | * Rate-limiting 14 | * Metrics 15 | 16 | ## Official Producers 17 | 18 | Currently we officially support four Broadway producers: 19 | 20 | * Amazon SQS: [Source](https://github.com/dashbitco/broadway_sqs) - [Guide](amazon-sqs.md) 21 | * Apache Kafka: [Source](https://github.com/dashbitco/broadway_kafka) - [Guide](apache-kafka.md) 22 | * Google Cloud Pub/Sub: [Source](https://github.com/dashbitco/broadway_cloud_pub_sub) - [Guide](google-cloud-pubsub.md) 23 | * RabbitMQ: [Source](https://github.com/dashbitco/broadway_rabbitmq) - [Guide](rabbitmq.md) 24 | 25 | The guides links above will help you get started with your adapter of choice. For API reference, you can check out the `Broadway` module. 26 | 27 | ## Non-official (Off-Broadway) Producers 28 | 29 | For those interested in rolling their own Broadway Producers (which we actively encourage!), we recommend using the `OffBroadway` namespace, mirroring the [Off-Broadway theaters](https://en.wikipedia.org/wiki/Off-Broadway). For example, if you want to publish your own integration with Amazon SQS, you can package it as `off_broadway_sqs`, which uses the `OffBroadway.SQS` namespace. 
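To give a rough idea of what rolling your own producer involves, here is a deliberately minimal sketch. The module name and the `fetch_events/2` helper are hypothetical, and a real integration would implement a proper `Broadway.Acknowledger` instead of the no-op one used here:

    defmodule OffBroadway.MyQueue.Producer do
      use GenStage

      @behaviour Broadway.Producer

      @impl true
      def init(opts) do
        {:producer, opts}
      end

      @impl true
      def handle_demand(demand, state) do
        # Fetch up to `demand` events from the external system (hypothetical helper)
        # and wrap each one in a Broadway.Message struct, which requires the
        # :data and :acknowledger fields to be set.
        messages =
          for data <- fetch_events(demand, state) do
            %Broadway.Message{
              data: data,
              acknowledger: Broadway.NoopAcknowledger.init()
            }
          end

        {:noreply, messages, state}
      end

      defp fetch_events(_demand, _state), do: []
    end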
30 | 31 | The following Off-Broadway libraries are available (feel free to send a PR adding your own in alphabetical order): 32 | 33 | * [off_broadway_amqp10](https://github.com/highmobility/off_broadway_amqp10): [Guide](https://hexdocs.pm/off_broadway_amqp10/) 34 | * [off_broadway_elasticsearch](https://github.com/jonlunsford/off_broadway_elasticsearch): [Guide](https://hexdocs.pm/off_broadway_elasticsearch/) 35 | * [off_broadway_kafka](https://github.com/bbalser/off_broadway_kafka): [Guide](https://hexdocs.pm/off_broadway_kafka/) 36 | * [off_broadway_memory](https://github.com/elliotekj/off_broadway_memory): [Guide](https://hexdocs.pm/off_broadway_memory/) 37 | * [off_broadway_redis](https://github.com/amokan/off_broadway_redis): [Guide](https://hexdocs.pm/off_broadway_redis/) 38 | * [off_broadway_redis_stream](https://github.com/akash-akya/off_broadway_redis_stream): [Guide](https://hexdocs.pm/off_broadway_redis_stream/) 39 | * [off_broadway_splunk](https://github.com/Intility/off_broadway_splunk): [Guide](https://hexdocs.pm/off_broadway_splunk/) 40 | -------------------------------------------------------------------------------- /guides/examples/rabbitmq.md: -------------------------------------------------------------------------------- 1 | # RabbitMQ 2 | 3 | RabbitMQ is an open source message broker designed to be highly scalable and 4 | distributed. It supports multiple protocols including the Advanced Message 5 | Queuing Protocol (AMQP). 6 | 7 | ## Getting Started 8 | 9 | In order to use Broadway with RabbitMQ, we need to: 10 | 11 | 1. [Create a queue](#create-a-queue) (or use an existing one) 12 | 1. [Configure our Elixir project to use Broadway](#configure-the-project) 13 | 1. [Define your pipeline configuration](#define-the-pipeline-configuration) 14 | 1. [Implement Broadway callbacks](#implement-broadway-callbacks) 15 | 1. [Run the Broadway pipeline](#run-the-broadway-pipeline) 16 | 1. [Tuning the configuration](#tuning-the-configuration) (Optional) 17 | 18 | In case you want to work with an existing queue, you can skip [step 1](#create-a-queue) 19 | and jump to [Configure the project](#configure-the-project). 20 | 21 | > Note: `BroadwayRabbitMQ` does not automatically create any queue. If you 22 | configure a pipeline with a non-existent queue, the producers will crash, 23 | bringing down the pipeline. 24 | 25 | ## Create a queue 26 | 27 | RabbitMQ runs on many operating systems. Please see 28 | [Downloading and Installing RabbitMQ](https://www.rabbitmq.com/download.html) for 29 | further information. Also, make sure you have the 30 | [Management](https://www.rabbitmq.com/management.html) plugin enabled, which ships 31 | with the command line tool, `rabbitmqadmin`. 32 | 33 | After successfully installing RabbitMQ, you can declare a new queue with the 34 | following command: 35 | 36 | $ rabbitmqadmin declare queue name=my_queue durable=true 37 | 38 | You can list all declared queues to see our the one we've just created: 39 | 40 | $ rabbitmqctl list_queues 41 | Timeout: 60.0 seconds ... 42 | Listing queues for vhost / ... 43 | name messages 44 | my_queue 0 45 | 46 | ## Configure the project 47 | 48 | In this guide, we're going to use [BroadwayRabbitMQ](https://github.com/dashbitco/broadway_rabbitmq), 49 | which is a Broadway RabbitMQ Connector provided by [Dashbit](https://dashbit.co/). 
50 | 51 | ### Starting a new project 52 | 53 | If you're creating a new project, run: 54 | 55 | $ mix new my_app --sup 56 | 57 | The `--sup` flag instructs Elixir to generate an application with a supervision tree. 58 | 59 | ### Setting up dependencies 60 | 61 | Add `:broadway_rabbitmq` to the list of dependencies in `mix.exs`: 62 | 63 | def deps do 64 | [ 65 | ... 66 | {:broadway_rabbitmq, "~> 0.7"}, 67 | ] 68 | end 69 | 70 | Don't forget to check for the latest version of dependencies. 71 | 72 | ## Define the pipeline configuration 73 | 74 | Broadway is a process-based behaviour and to define a Broadway pipeline, 75 | we need to define three functions: `start_link/1`, `handle_message/3` 76 | and optionally `handle_batch/4`. We will cover `start_link/1` in this 77 | section and the `handle_` callbacks in the next one. 78 | 79 | Similar to other process-based behaviours, `start_link/1` simply 80 | delegates to `Broadway.start_link/2`, which should define the 81 | producers, processors, and batchers in the Broadway pipeline. 82 | Assuming we want to consume messages from a queue called 83 | `my_queue`, one possible configuration would be: 84 | 85 | defmodule MyBroadway do 86 | use Broadway 87 | 88 | alias Broadway.Message 89 | 90 | def start_link(_opts) do 91 | Broadway.start_link(__MODULE__, 92 | name: MyBroadway, 93 | producer: [ 94 | module: {BroadwayRabbitMQ.Producer, 95 | queue: "my_queue", 96 | qos: [ 97 | prefetch_count: 50, 98 | ] 99 | }, 100 | concurrency: 1 101 | ], 102 | processors: [ 103 | default: [ 104 | concurrency: 50 105 | ] 106 | ], 107 | batchers: [ 108 | default: [ 109 | batch_size: 10, 110 | batch_timeout: 1500, 111 | concurrency: 5 112 | ] 113 | ] 114 | ) 115 | end 116 | 117 | ...callbacks... 118 | end 119 | 120 | If you're consuming data from an existing broker that requires authorization, 121 | you'll need to provide your credentials using the `connection` option: 122 | 123 | ... 124 | producer: [ 125 | module: {BroadwayRabbitMQ.Producer, 126 | queue: "my_queue", 127 | connection: [ 128 | username: "user", 129 | password: "password", 130 | ] 131 | ... 132 | } 133 | ] 134 | ... 135 | 136 | For the full list of `connection` options, please see 137 | [`AMQP.Connection.open/1`](https://hexdocs.pm/amqp/1.1.1/AMQP.Connection.html#open/1) 138 | 139 | For general information about setting up Broadway, see `Broadway` 140 | module docs as well as `Broadway.start_link/2`. 141 | 142 | ## Implement Broadway callbacks 143 | 144 | In order to process incoming messages, we need to implement the 145 | required callbacks. For the sake of simplicity, we're considering that 146 | all messages received from the queue are just numbers: 147 | 148 | defmodule MyBroadway do 149 | use Broadway 150 | 151 | alias Broadway.Message 152 | 153 | ...start_link... 154 | 155 | @impl true 156 | def handle_message(_, message, _) do 157 | message 158 | |> Message.update_data(fn data -> {data, String.to_integer(data) * 2} end) 159 | end 160 | 161 | @impl true 162 | def handle_batch(_, messages, _, _) do 163 | list = messages |> Enum.map(fn e -> e.data end) 164 | IO.inspect(list, label: "Got batch") 165 | messages 166 | end 167 | end 168 | 169 | We are not doing anything fancy here, but it should be enough for our 170 | purpose. First, we update the message's data individually inside 171 | `handle_message/3` and then we print each batch inside `handle_batch/4`. 172 | 173 | For more information, see `c:Broadway.handle_message/3` and 174 | `c:Broadway.handle_batch/4`. 
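Keep in mind that `String.to_integer/1` raises if a message carries anything other than digits. When a callback raises, Broadway marks the message as failed for you, but you can also do it explicitly and attach your own reason with `Broadway.Message.failed/2`. One possible variation of the callback above, shown only as a sketch:

    @impl true
    def handle_message(_, message, _) do
      case Integer.parse(message.data) do
        {number, ""} ->
          Message.put_data(message, {message.data, number * 2})

        _ ->
          Message.failed(message, "data is not an integer")
      end
    end

Failed messages are not forwarded to the batchers; they go straight to acknowledgement, and you can react to them by implementing the optional `c:Broadway.handle_failed/2` callback.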
175 | 176 | > Note: Since Broadway v0.2, batching is optional. In case you don't need to 177 | > group messages as batches for further processing/publishing, you can remove 178 | > the `:batchers` configuration along with the `handle_batch/4` callback. This 179 | > is perfectly fine for RabbitMQ, where messages are acknowledged individually 180 | > and never as a batch. 181 | 182 | ## Run the Broadway pipeline 183 | 184 | To run your `Broadway` pipeline, you just need to add as a child in 185 | a supervision tree. Most applications have a supervision tree defined 186 | at `lib/my_app/application.ex`. You can add Broadway as a child to a 187 | supervisor as follows: 188 | 189 | children = [ 190 | {MyBroadway, []} 191 | ] 192 | 193 | Supervisor.start_link(children, strategy: :one_for_one) 194 | 195 | Now the Broadway pipeline should be started when your application starts. 196 | Also, if your Broadway has any dependency (for example, it needs to talk 197 | to the database), make sure that Broadway is listed *after* its dependencies 198 | in the supervision tree. 199 | 200 | You can now test your pipeline by entering an `iex` session: 201 | 202 | $ iex -S mix 203 | 204 | If everything went fine, you should see lots of `info` log messages from the `amqp` 205 | supervisors. If you think that's too verbose and want to do something 206 | about it, please take a look at the _"Log related to amqp supervisors are too verbose"_ 207 | subsection in the `amqp`'s [Troubleshooting](https://hexdocs.pm/amqp/readme.html#troubleshooting) 208 | documentation. 209 | 210 | Finally, let's generate some sample messages to be consumed by Broadway with the 211 | following code: 212 | 213 | {:ok, connection} = AMQP.Connection.open 214 | {:ok, channel} = AMQP.Channel.open(connection) 215 | AMQP.Queue.declare(channel, "my_queue", durable: true) 216 | 217 | Enum.each(1..5000, fn i -> 218 | AMQP.Basic.publish(channel, "", "my_queue", "#{i}") 219 | end) 220 | AMQP.Connection.close(connection) 221 | 222 | You should see the output showing the generated batches: 223 | 224 | Got batch: [ 225 | {"7", 14}, 226 | {"5", 10}, 227 | {"8", 16}, 228 | {"98", 196}, 229 | {"6", 12}, 230 | {"97", 194}, 231 | {"9", 18}, 232 | {"99", 198}, 233 | {"10", 20}, 234 | {"100", 200} 235 | ] 236 | Got batch: [ 237 | {"29", 58}, 238 | {"32", 64}, 239 | ... 240 | ] 241 | 242 | ## Tuning the configuration 243 | 244 | Some of the configuration options available for Broadway come already with a 245 | "reasonable" default value. However, those values might not suit your 246 | requirements. Depending on the number of messages you get, how much processing 247 | they need and how much IO work is going to take place, you might need completely 248 | different values to optimize the flow of your pipeline. The `concurrency` option 249 | available for every set of producers, processors and batchers, among with 250 | `max_demand`, `batch_size`, and `batch_timeout` can give you a great deal 251 | of flexibility. The `concurrency` option controls the concurrency level in 252 | each layer of the pipeline. 253 | See the notes on [`Producer concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-producer-concurrency) 254 | and [`Batcher concurrency`](https://hexdocs.pm/broadway/Broadway.html#module-batcher-concurrency) 255 | for details. 256 | 257 | Another important option to take into account is the `:prefetch_count`. 258 | RabbitMQ will continually push new messages to Broadway as it receives them. 
259 | The `:prefetch_count` setting provides back-pressure by instructing RabbitMQ to [limit the number of unacknowledged messages a consumer will have at a given moment](https://www.rabbitmq.com/consumer-prefetch.html). 260 | See the ["Back-pressure and :prefetch_count"](https://hexdocs.pm/broadway_rabbitmq/BroadwayRabbitMQ.Producer.html#module-back-pressure-and-prefetch_count) 261 | section of the `BroadwayRabbitMQ` documentation for details. 262 | 263 | In order to get a good set of configurations for your pipeline, it's 264 | important to respect the limitations of the servers you're running, 265 | as well as the limitations of the services you're providing/consuming 266 | data to/from. Broadway comes with telemetry, so you can measure your 267 | pipeline and help ensure your changes are effective. 268 | -------------------------------------------------------------------------------- /guides/internals/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | Broadway's architecture is built on top of GenStage. That means we structure 4 | our processing units as independent stages that are responsible for one 5 | individual task in the pipeline. By implementing the `Broadway` behaviour, 6 | we define a `GenServer` process that wraps a `Supervisor` to manage and 7 | own our pipeline. 8 | 9 | ## The pipeline model 10 | 11 | ```asciidoc 12 | [producers] <- pulls data from SQS, RabbitMQ, etc. 13 | | 14 | | (demand dispatcher) 15 | | 16 | handle_message/3 and ----------> [processors] 17 | prepare_messages/2 run here / \ 18 | / \ (partition dispatcher) 19 | / \ 20 | [batcher] [batcher] <- one for each batcher key 21 | | | 22 | | | (demand dispatcher) 23 | | | 24 | handle_batch/4 runs here -> [batch processor][batch processor] 25 | ``` 26 | 27 | ## Internal stages 28 | 29 | * `Broadway.Producer` - A wrapper around the actual producer defined by 30 | the user. It serves as the source of the pipeline. 31 | * `Broadway.Processor` - This is where messages are processed, e.g. do 32 | calculations, convert data into a custom json format etc. Here is where 33 | the code from `handle_message/3` runs. 34 | * `Broadway.Batcher` - Creates batches of messages based on the 35 | batcher's key. One batcher for each key will be created. 36 | * `Broadway.BatchProcessor` - This is where the code from `handle_batch/4` runs. 37 | 38 | ## The supervision tree 39 | 40 | Broadway was designed to always go back to a working state in case 41 | of failures thanks to the use of supervisors. Our supervision tree 42 | is designed as follows: 43 | 44 | ```asciidoc 45 | [Broadway GenServer] 46 | | 47 | | 48 | | 49 | [Broadway Pipeline Supervisor] 50 | / / (:rest_for_one) \ \ 51 | / | | \ 52 | / | | \ 53 | / | | \ 54 | / | | \ 55 | / | | \ 56 | [ProducerSupervisor] [ProcessorSupervisor] [BatchersSupervisor] [Terminator] 57 | (:one_for_one) (:one_for_all) (:one_for_one) 58 | / \ / \ / \ 59 | / \ / \ / \ 60 | / \ / \ / \ 61 | / \ / \ / \ 62 | [Producer_1] ... [Processor_1] ... [BatcherSupervisor_1] ... 63 | (:rest_for_one) 64 | / \ 65 | / \ 66 | / \ 67 | [Batcher] [BatchProcessorSupervisor] 68 | (:one_for_all) 69 | / \ 70 | / \ 71 | / \ 72 | [BatchProcessor_1] ... 73 | ``` 74 | 75 | 76 | Both `ProcessorSupervisor` and `BatchProcessorSupervisor` are set with 77 | `max_restarts` to 0. The idea is that if any process fails, we want 78 | to restart the rest of the tree. 
Since Broadway callbacks are 79 | stateless, we can handle errors and provide reports without crashing 80 | processes. This means that the supervision tree will only shutdown 81 | in case of unforeseen errors in Broadway's implementation. 82 | 83 | The only exception are the producers, which contain external code 84 | and are expected to fail. If a producer crashes, it will be restarted 85 | by its supervisor without cascading failures until its max restarts 86 | is reached. Broadway automatically handles those failures by making 87 | processors automatically resubscribe to producers in case of crashes. 88 | 89 | ## Graceful shutdowns 90 | 91 | The cascading failures aspect also provides safe semantics for graceful 92 | shutdown. We know that either all processes are running OR they are all 93 | being shutdown. Therefore, to gracefully shutdown the supervision tree, 94 | a terminator process is activated, which starts the following steps: 95 | 96 | 1. It notifies the first layer of processors that they should not 97 | resubscribe to producers once they exit 98 | 99 | 2. It tells all producers to no longer accept demand, flush all 100 | current events, and then shutdown 101 | 102 | 3. It then monitors and waits for a confirmation message from batch 103 | processors. At this point, the terminator is effectively blocking 104 | the supervisor until all events have been processed 105 | 106 | This triggers a cascade effect where processors notice all of its producers 107 | have been cancelled, causing them to flush their own events and cancels the 108 | stages downstream, and so on and so on. This happens until batch processors 109 | notice all of their producers have been cancelled, effectively notifying the 110 | terminator to shutdown, allowing the outer most supervisor to go on and fully 111 | terminate all stages, which at this point have flushed all events. 112 | -------------------------------------------------------------------------------- /lib/broadway/acknowledger.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Acknowledger do 2 | @moduledoc """ 3 | A behaviour used to acknowledge that the received messages 4 | were successfully processed or failed. 5 | 6 | When implementing a new connector for Broadway, you should 7 | implement this behaviour and consider how the technology 8 | you're working with handles message acknowledgement. 9 | 10 | The `c:ack/3` callback must be implemented in order to notify 11 | the origin of the data that a message can be safely removed 12 | after been successfully processed and published. In case of 13 | failed messages or messages without acknowledgement, depending 14 | on the technology chosen, the messages can be either moved back 15 | in the queue or, alternatively, moved to a *dead-letter queue*. 16 | """ 17 | 18 | alias Broadway.Message 19 | 20 | require Logger 21 | 22 | @doc """ 23 | Invoked to acknowledge successful and failed messages. 24 | 25 | * `ack_ref` is a term that uniquely identifies how messages 26 | should be grouped and sent for acknowledgement. Imagine 27 | you have a scenario where messages are coming from 28 | different producers. Broadway will use this information 29 | to correctly identify the acknowledger and pass it among 30 | with the messages so you can properly communicate with 31 | the source of the data for acknowledgement. `ack_ref` is 32 | part of `t:Broadway.Message.acknowledger/0`. 
33 | 34 | * `successful` is the list of messages that were 35 | successfully processed and published. 36 | 37 | * `failed` is the list of messages that, for some reason, 38 | could not be processed or published. 39 | 40 | """ 41 | @callback ack(ack_ref :: term, successful :: [Message.t()], failed :: [Message.t()]) :: 42 | :ok 43 | 44 | @doc """ 45 | Configures the acknowledger with new `options`. 46 | 47 | Every acknowledger can decide how to incorporate the given `options` into its 48 | `ack_data`. The `ack_data` is the current acknowledger's data. The return value 49 | of this function is `{:ok, new_ack_data}` where `new_ack_data` is the updated 50 | data for the acknowledger. 51 | 52 | Note that `options` are different for every acknowledger, as the acknowledger 53 | is what specifies what are the supported options. Check the documentation for the 54 | acknowledger you're using to see the supported options. 55 | 56 | `ack_ref` and `ack_data` are part of `t:Broadway.Message.acknowledger/0`. 57 | """ 58 | @callback configure(ack_ref :: term, ack_data :: term, options :: keyword) :: 59 | {:ok, new_ack_data :: term} 60 | 61 | @optional_callbacks [configure: 3] 62 | 63 | @doc false 64 | @spec ack_messages([Message.t()], [Message.t()]) :: no_return 65 | def ack_messages(successful, failed) do 66 | %{} 67 | |> group_by_acknowledger(successful, :successful) 68 | |> group_by_acknowledger(failed, :failed) 69 | |> Enum.each(&call_ack/1) 70 | end 71 | 72 | defp group_by_acknowledger(ackers, messages, key) do 73 | Enum.reduce(messages, ackers, fn %{acknowledger: {acknowledger, ack_ref, _}} = msg, acc -> 74 | ack_info = {acknowledger, ack_ref} 75 | pdict_key = {ack_info, key} 76 | Process.put(pdict_key, [msg | Process.get(pdict_key, [])]) 77 | Map.put(acc, ack_info, true) 78 | end) 79 | end 80 | 81 | defp call_ack({{acknowledger, ack_ref} = ack_info, true}) do 82 | successful = Process.delete({ack_info, :successful}) || [] 83 | failed = Process.delete({ack_info, :failed}) || [] 84 | acknowledger.ack(ack_ref, Enum.reverse(successful), Enum.reverse(failed)) 85 | end 86 | 87 | @doc false 88 | # Builds a crash reason used in Logger reporting. 89 | def crash_reason(:throw, reason, stack), do: {{:nocatch, reason}, stack} 90 | def crash_reason(:error, reason, stack), do: {Exception.normalize(:error, reason, stack), stack} 91 | def crash_reason(:exit, reason, stack), do: {reason, stack} 92 | 93 | # Used by the processor and the batcher to maybe call c:handle_failed/2 94 | # on failed messages. 95 | @doc false 96 | def maybe_handle_failed_messages(messages, module, context) do 97 | if function_exported?(module, :handle_failed, 2) and messages != [] do 98 | handle_failed_messages(messages, module, context) 99 | else 100 | messages 101 | end 102 | end 103 | 104 | defp handle_failed_messages(messages, module, context) do 105 | module.handle_failed(messages, context) 106 | catch 107 | kind, reason -> 108 | Logger.error(Exception.format(kind, reason, __STACKTRACE__), 109 | crash_reason: crash_reason(kind, reason, __STACKTRACE__) 110 | ) 111 | 112 | messages 113 | else 114 | return_messages when is_list(return_messages) -> 115 | size = length(messages) 116 | return_size = length(return_messages) 117 | 118 | if return_size != size do 119 | Logger.error( 120 | "#{inspect(module)}.handle_failed/2 received #{size} messages and " <> 121 | "returned only #{return_size}. 
All messages given to handle_failed/2 " <> 122 | "must be returned" 123 | ) 124 | end 125 | 126 | return_messages 127 | 128 | _other -> 129 | Logger.error( 130 | "#{inspect(module)}.handle_failed/2 didn't return a list of messages, " <> 131 | "so ignoring its return value" 132 | ) 133 | 134 | messages 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /lib/broadway/application.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Application do 2 | use Application 3 | 4 | def start(_type, _args) do 5 | config_storage = Broadway.ConfigStorage.get_module() 6 | 7 | if Code.ensure_loaded?(config_storage) and function_exported?(config_storage, :setup, 0) do 8 | config_storage.setup() 9 | end 10 | 11 | opts = [strategy: :one_for_one, name: Broadway.Supervisor] 12 | Supervisor.start_link([], opts) 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/broadway/batch_info.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.BatchInfo do 2 | @moduledoc """ 3 | A struct used to hold information about a generated batch. 4 | 5 | An instance of this struct containing the related info will 6 | be passed to the `c:Broadway.handle_batch/4` callback of the 7 | module implementing the `Broadway` behaviour. 8 | 9 | See the documentation for [`%Broadway.BatchInfo{}`](`__struct__/0`) 10 | for information on the fields. 11 | """ 12 | 13 | @typedoc """ 14 | The type for a batch info struct. 15 | """ 16 | @type t :: %__MODULE__{ 17 | batcher: atom, 18 | batch_key: term, 19 | partition: non_neg_integer | nil, 20 | size: pos_integer, 21 | trigger: atom 22 | } 23 | 24 | @doc """ 25 | The batch info struct. 26 | 27 | The fields are: 28 | 29 | * `:batcher` - is the key that defined the batcher. This value can 30 | be set in the `c:Broadway.handle_message/3` callback using 31 | `Broadway.Message.put_batcher/2`. 32 | 33 | * `:batch_key` - identifies the batch key for this batch. 34 | See `Broadway.Message.put_batch_key/2`. 35 | 36 | * `:partition` - the partition, if present. 37 | 38 | * `:size` - the number of messages in the batch. 39 | 40 | * `:trigger` - the trigger that generated the batch, like `:timeout` 41 | or `:flush`. 42 | 43 | """ 44 | defstruct [ 45 | :batcher, 46 | :batch_key, 47 | :partition, 48 | :size, 49 | :trigger 50 | ] 51 | end 52 | -------------------------------------------------------------------------------- /lib/broadway/caller_acknowledger.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.CallerAcknowledger do 2 | @moduledoc """ 3 | A simple acknowledger that sends a message back to a caller. 4 | 5 | If you want to use this acknowledger in messages produced by your 6 | `Broadway.Producer`, you can get its configuration by calling 7 | the `init/0` function. For example, you can use it in 8 | `Broadway.test_message/3`: 9 | 10 | some_ref = make_ref() 11 | 12 | Broadway.test_message( 13 | MyPipeline, 14 | "some data", 15 | acknowledger: Broadway.CallerAcknowledger.init({self(), some_ref}, :ignored) 16 | ) 17 | 18 | The first parameter is a tuple with the PID to receive the messages 19 | and a unique identifier (usually a reference). Such unique identifier 20 | is then included in the messages sent to the PID. The second parameter, 21 | which is per message, is ignored. 
22 | 23 | It sends a message in the format: 24 | 25 | {:ack, ref, successful_messages, failed_messages} 26 | 27 | If `Broadway.Message.configure_ack/2` is called on a message that 28 | uses this acknowledger, then the following message is sent: 29 | 30 | {:configure, ref, options} 31 | 32 | """ 33 | 34 | @behaviour Broadway.Acknowledger 35 | 36 | @doc """ 37 | Returns the acknowledger metadata. 38 | 39 | See the module documentation. 40 | """ 41 | @spec init({pid, ref :: term}, ignored_term :: term) :: Broadway.Message.acknowledger() 42 | def init({pid, ref} = _pid_and_ref, ignored_term) when is_pid(pid) do 43 | {__MODULE__, {pid, ref}, ignored_term} 44 | end 45 | 46 | @impl true 47 | def ack({pid, ref}, successful, failed) do 48 | send(pid, {:ack, ref, successful, failed}) 49 | end 50 | 51 | @impl true 52 | def configure({pid, ref}, ack_data, options) do 53 | send(pid, {:configure, ref, options}) 54 | {:ok, ack_data} 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/broadway/config_storage.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.ConfigStorage do 2 | @moduledoc false 3 | 4 | @doc """ 5 | Optional setup for the configuration storage. 6 | 7 | Invoked when Broadway boots. 8 | """ 9 | @callback setup() :: :ok 10 | 11 | @doc """ 12 | Lists all broadway names in the config storage. 13 | """ 14 | @callback list() :: [term()] 15 | 16 | @doc """ 17 | Puts the given key value pair in the underlying storage. 18 | """ 19 | @callback put(server :: term(), value :: %Broadway.Topology{}) :: term() 20 | 21 | @doc """ 22 | Retrieves a configuration from the underlying storage. 23 | """ 24 | @callback get(server :: term()) :: term() 25 | 26 | @doc """ 27 | Deletes a configuration from the underlying storage. 28 | """ 29 | @callback delete(server :: term()) :: boolean() 30 | 31 | @optional_callbacks setup: 0 32 | 33 | @doc """ 34 | Retrieves the configured module based on the `:config_storage` key. 35 | """ 36 | @spec get_module() :: module() 37 | def get_module() do 38 | case Application.fetch_env!(:broadway, :config_storage) do 39 | :ets -> Broadway.ConfigStorage.ETS 40 | :persistent_term -> Broadway.ConfigStorage.PersistentTerm 41 | mod -> mod 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/broadway/config_storage/ets.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.ConfigStorage.ETS do 2 | @moduledoc false 3 | 4 | @behaviour Broadway.ConfigStorage 5 | 6 | @table __MODULE__ 7 | 8 | # Used in tests. 
9 | def table, do: @table 10 | 11 | @impl true 12 | def setup do 13 | :ets.new(@table, [:named_table, :public, :set, {:read_concurrency, true}]) 14 | :ok 15 | end 16 | 17 | @impl true 18 | def list do 19 | :ets.select(@table, [{{:"$1", :_}, [], [:"$1"]}]) 20 | end 21 | 22 | @impl true 23 | def get(server) do 24 | case :ets.match(@table, {server, :"$1"}) do 25 | [[topology]] -> topology 26 | _ -> nil 27 | end 28 | end 29 | 30 | @impl true 31 | def put(server, topology) do 32 | :ets.insert(@table, {server, topology}) 33 | end 34 | 35 | @impl true 36 | def delete(server) do 37 | :ets.delete(@table, server) 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/broadway/config_storage/persistent_term.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.ConfigStorage.PersistentTerm do 2 | @moduledoc false 3 | @behaviour Broadway.ConfigStorage 4 | 5 | @impl true 6 | def setup do 7 | if not Code.ensure_loaded?(:persistent_term) do 8 | require Logger 9 | Logger.error("Broadway requires Erlang/OTP 21.3+") 10 | raise "Broadway requires Erlang/OTP 21.3+" 11 | end 12 | 13 | :ok 14 | end 15 | 16 | @impl true 17 | def list do 18 | for {{Broadway, name}, %Broadway.Topology{}} <- :persistent_term.get() do 19 | name 20 | end 21 | end 22 | 23 | @impl true 24 | def get(server) do 25 | :persistent_term.get({Broadway, server}, nil) 26 | end 27 | 28 | @impl true 29 | def put(server, topology) do 30 | :persistent_term.put({Broadway, server}, topology) 31 | end 32 | 33 | @impl true 34 | def delete(_server) do 35 | # We don't delete from persistent term on purpose. Since the process is 36 | # named, we can assume it does not start dynamically, so it will either 37 | # restart or the amount of memory it uses is negligibla to justify the 38 | # process purging done by persistent_term. If the repo is restarted and 39 | # stores the same metadata, then no purging happens either. 40 | # :persistent_term.erase({Broadway, server}) 41 | true 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/broadway/dummy_producer.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.DummyProducer do 2 | @moduledoc """ 3 | A producer that does nothing, used mostly for testing. 4 | 5 | See "Testing" section in `Broadway` module documentation for more information. 6 | """ 7 | 8 | use GenStage 9 | @behaviour Broadway.Producer 10 | 11 | @impl true 12 | def init(_args) do 13 | {:producer, []} 14 | end 15 | 16 | @impl true 17 | def handle_demand(_demand, state) do 18 | {:noreply, [], state} 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/broadway/message.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Message do 2 | @moduledoc """ 3 | This struct holds all information about a message. 4 | 5 | A message is first created by the producers. It is then 6 | sent downstream and gets updated multiple times, either 7 | by a module implementing the `Broadway` behaviour 8 | through the `c:Broadway.handle_message/3` callback 9 | or internally by one of the built-in stages of Broadway. 10 | 11 | Instead of modifying the struct directly, you should use the functions 12 | provided by this module to manipulate messages. 
However, if you are implementing 13 | a `Broadway.Producer` of your own, see `t:t/0` to see what fields you should set. 14 | """ 15 | 16 | alias __MODULE__, as: Message 17 | alias Broadway.{Acknowledger, NoopAcknowledger} 18 | 19 | @typedoc """ 20 | The acknowledger of the message. 21 | 22 | This tuple contains: 23 | 24 | * A module implementing the `Broadway.Acknowledger` behaviour. 25 | 26 | * An ack reference that is passed to the `c:Broadway.Acknowledger.ack/3` 27 | callback. See `c:Broadway.Acknowledger.ack/3` for more information. 28 | 29 | * An arbitrary term that is passed to the optional 30 | `c:Broadway.Acknowledger.configure/3` callback. 31 | 32 | """ 33 | @typedoc since: "1.1.0" 34 | @type acknowledger :: {module, ack_ref :: term, data :: term} 35 | 36 | @typedoc """ 37 | The Broadway message struct. 38 | 39 | Most of these fields are manipulated by Broadway itself. You can 40 | *read* the `:metadata` field, and you can use the functions in this 41 | module to update most of the other fields. If you are implementing 42 | your own producer, see the `Broadway.Producer` documentation 43 | for more information on how to create and manipulate message structs. 44 | """ 45 | @type t :: %Message{ 46 | data: term, 47 | metadata: %{optional(atom) => term}, 48 | acknowledger: acknowledger, 49 | batcher: atom, 50 | batch_key: term, 51 | batch_mode: :bulk | :flush, 52 | status: 53 | :ok 54 | | {:failed, reason :: term} 55 | | {:throw | :error | :exit, term, Exception.stacktrace()} 56 | } 57 | 58 | @enforce_keys [:data, :acknowledger] 59 | defstruct data: nil, 60 | metadata: %{}, 61 | acknowledger: nil, 62 | batcher: :default, 63 | batch_key: :default, 64 | batch_mode: :bulk, 65 | status: :ok 66 | 67 | @doc """ 68 | Updates the data in the message. 69 | 70 | This function is usually used inside the `c:Broadway.handle_message/3` implementation 71 | to update data with new processed data. 72 | """ 73 | @spec update_data(message :: Message.t(), fun :: (term -> term)) :: Message.t() 74 | def update_data(%Message{} = message, fun) when is_function(fun, 1) do 75 | %{message | data: fun.(message.data)} 76 | end 77 | 78 | @doc """ 79 | Stores the given data in the message. 80 | 81 | This function is usually used inside the `c:Broadway.handle_message/3` implementation 82 | to replace data with new processed data. 83 | """ 84 | @doc since: "1.0.0" 85 | @spec put_data(message :: Message.t(), term) :: Message.t() 86 | def put_data(%Message{} = message, data) do 87 | %{message | data: data} 88 | end 89 | 90 | @doc """ 91 | Defines the target batcher which the message should be forwarded to. 92 | """ 93 | @spec put_batcher(message :: Message.t(), batcher :: atom) :: Message.t() 94 | def put_batcher(%Message{} = message, batcher) when is_atom(batcher) do 95 | %{message | batcher: batcher} 96 | end 97 | 98 | @doc """ 99 | Defines the message batch key. 100 | 101 | The batch key identifies the batch the message belongs to, within 102 | a given batcher. Each batcher then groups batches with the same 103 | `batch_key`, with size of at most `batch_size` within period 104 | `batch_timeout`. Both `batch_size` and `batch_timeout` are managed 105 | per batch key, so a batcher is capable of grouping multiple batch 106 | keys at the same time, regardless of the concurrency level. 107 | 108 | If a given batcher has multiple batch processors (concurrency > 1), 109 | all messages with the same batch key are routed to the same processor. 
110 | So different batch keys may run concurrently but the same batch key 111 | is always run serially and in the same batcher processor. 112 | """ 113 | @spec put_batch_key(message :: Message.t(), batch_key :: term) :: Message.t() 114 | def put_batch_key(%Message{} = message, batch_key) do 115 | %{message | batch_key: batch_key} 116 | end 117 | 118 | @doc """ 119 | Sets the batching mode for the message. 120 | 121 | When the mode is `:bulk`, the batch that the message is in is delivered after 122 | the batch size or batch timeout is reached. 123 | 124 | When the mode is `:flush`, the batch that the message is in is delivered 125 | immediately after processing. Note it doesn't mean the batch contains only a single element 126 | but rather that all messages received from the processor are delivered without waiting. 127 | 128 | The default mode for messages is `:bulk`. 129 | """ 130 | @spec put_batch_mode(message :: Message.t(), mode :: :bulk | :flush) :: Message.t() 131 | def put_batch_mode(%Message{} = message, mode) when mode in [:bulk, :flush] do 132 | %{message | batch_mode: mode} 133 | end 134 | 135 | @doc """ 136 | Configures the acknowledger of this message. 137 | 138 | This function calls the `c:Broadway.Acknowledger.configure/3` callback to 139 | change the configuration of the acknowledger for the given `message`. 140 | 141 | This function can only be called if the acknowledger implements the `configure/3` 142 | callback. If it doesn't, an error is raised. 143 | """ 144 | @doc since: "0.5.0" 145 | @spec configure_ack(message :: Message.t(), options :: keyword) :: Message.t() 146 | def configure_ack(%Message{} = message, options) when is_list(options) do 147 | %{acknowledger: {module, ack_ref, ack_data}} = message 148 | 149 | if Code.ensure_loaded?(module) and function_exported?(module, :configure, 3) do 150 | {:ok, ack_data} = module.configure(ack_ref, ack_data, options) 151 | %{message | acknowledger: {module, ack_ref, ack_data}} 152 | else 153 | raise "the configure/3 callback is not defined by acknowledger #{inspect(module)}" 154 | end 155 | end 156 | 157 | @doc """ 158 | Mark a message as failed. 159 | 160 | Failed messages are sent directly to the related acknowledger at the end 161 | of this step and therefore they're not forwarded to the next step in the 162 | pipeline. 163 | 164 | Failing a message does not emit any log but it does trigger the 165 | `c:Broadway.handle_failed/2` callback. 166 | """ 167 | @spec failed(message :: Message.t(), reason :: term) :: Message.t() 168 | def failed(%Message{} = message, reason) do 169 | %{message | status: {:failed, reason}} 170 | end 171 | 172 | @doc """ 173 | Immediately acknowledges the given message or list of messages. 174 | 175 | This function can be used to acknowledge a message (or list of messages) 176 | immediately without waiting for the rest of the pipeline. 177 | 178 | Acknowledging a message sets that message's acknowledger to a no-op 179 | acknowledger so that it's safe to ack at the end of the pipeline. 180 | 181 | Returns the updated acked message if a message is passed in, 182 | or the updated list of acked messages if a list of messages is passed in. 183 | """ 184 | @doc since: "0.5.0" 185 | @spec ack_immediately(message :: Message.t()) :: Message.t() 186 | @spec ack_immediately(messages :: [Message.t(), ...]) :: [Message.t(), ...] 
187 | def ack_immediately(message_or_messages) 188 | 189 | def ack_immediately(%Message{} = message) do 190 | [message] = ack_immediately([message]) 191 | message 192 | end 193 | 194 | def ack_immediately(messages) when is_list(messages) and messages != [] do 195 | {successful, failed} = Enum.split_with(messages, &(&1.status == :ok)) 196 | _ = Acknowledger.ack_messages(successful, failed) 197 | 198 | for message <- messages do 199 | %{message | acknowledger: NoopAcknowledger.init()} 200 | end 201 | end 202 | end 203 | -------------------------------------------------------------------------------- /lib/broadway/noop_acknowledger.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.NoopAcknowledger do 2 | @moduledoc """ 3 | An acknowledger that does nothing. 4 | 5 | If you want to use this acknowledger in messages produced by your 6 | `Broadway.Producer`, you can get its configuration by calling 7 | the `init/0` function. For example, you can use it in 8 | `Broadway.test_message/3`: 9 | 10 | Broadway.test_message(MyPipeline, "some data", acknowledger: Broadway.NoopAcknowledger.init()) 11 | 12 | Broadway sets this acknowledger automatically on messages that have been acked 13 | via `Broadway.Message.ack_immediately/1`. 14 | """ 15 | 16 | @behaviour Broadway.Acknowledger 17 | 18 | @doc """ 19 | Returns the acknowledger metadata. 20 | """ 21 | @spec init() :: Broadway.Message.acknowledger() 22 | def init do 23 | {__MODULE__, _ack_ref = nil, _data = nil} 24 | end 25 | 26 | @impl true 27 | def ack(_ack_ref = nil, _successful, _failed) do 28 | :ok 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/broadway/options.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Options do 2 | @moduledoc false 3 | 4 | definition = [ 5 | name: [ 6 | required: true, 7 | type: {:custom, __MODULE__, :validate_name, []}, 8 | doc: """ 9 | Used for name registration. When an atom, all processes/stages 10 | created will be named using this value as prefix. 11 | """ 12 | ], 13 | shutdown: [ 14 | type: :pos_integer, 15 | default: 30000, 16 | doc: """ 17 | Optional. The time in milliseconds given for Broadway to 18 | gracefully shutdown without discarding events. 19 | """ 20 | ], 21 | max_restarts: [type: :non_neg_integer, default: 3], 22 | max_seconds: [type: :pos_integer, default: 5], 23 | resubscribe_interval: [ 24 | type: :non_neg_integer, 25 | default: 100, 26 | doc: """ 27 | The interval in milliseconds that 28 | processors wait until they resubscribe to a failed producers. 29 | """ 30 | ], 31 | context: [ 32 | type: :any, 33 | default: :context_not_set, 34 | doc: """ 35 | A user defined data structure that will be passed to handle_message/3 and handle_batch/4. 36 | """ 37 | ], 38 | producer: [ 39 | required: true, 40 | type: :non_empty_keyword_list, 41 | doc: """ 42 | A keyword list of options. See ["Producers options"](#start_link/2-producers-options) 43 | section below. Only a single producer is allowed. 44 | """, 45 | subsection: """ 46 | ### Producers options 47 | 48 | The producer options allow users to set up the producer. 49 | 50 | The available options are: 51 | """, 52 | keys: [ 53 | module: [ 54 | required: true, 55 | type: :mod_arg, 56 | doc: """ 57 | A tuple representing a GenStage producer. 
58 | The tuple format should be `{mod, arg}`, where `mod` is the module 59 | that implements the GenStage behaviour and `arg` the argument that will 60 | be passed to the `init/1` callback of the producer. See `Broadway.Producer` 61 | for more information. 62 | """ 63 | ], 64 | concurrency: [ 65 | type: :pos_integer, 66 | default: 1, 67 | doc: """ 68 | The number of concurrent producers that 69 | will be started by Broadway. Use this option to control the concurrency 70 | level of each set of producers. 71 | """ 72 | ], 73 | transformer: [ 74 | type: :mfa, 75 | default: nil, 76 | doc: """ 77 | A tuple representing a transformer that translates a produced GenStage event into a 78 | `%Broadway.Message{}`. The tuple format should be `{mod, fun, opts}` and the function 79 | should have the following spec `(event :: term, opts :: term) :: Broadway.Message.t` 80 | This function must be used sparingly and exclusively to convert regular 81 | messages into `Broadway.Message`. That's because a failure in the 82 | `:transformer` callback will cause the whole producer to terminate, 83 | possibly leaving unacknowledged messages along the way. 84 | """ 85 | ], 86 | spawn_opt: [ 87 | type: :keyword_list, 88 | doc: """ 89 | Overrides the top-level `:spawn_opt`. 90 | """ 91 | ], 92 | hibernate_after: [ 93 | type: :pos_integer, 94 | doc: """ 95 | Overrides the top-level `:hibernate_after`. 96 | """ 97 | ], 98 | rate_limiting: [ 99 | type: :non_empty_keyword_list, 100 | doc: """ 101 | A list of options to enable and configure rate limiting for producing. 102 | If this option is present, rate limiting is enabled, otherwise it isn't. 103 | Rate limiting refers to the rate at which producers will forward 104 | messages to the rest of the pipeline. The rate limiting is applied to 105 | and shared by all producers within the time limit. 106 | The following options are supported: 107 | """, 108 | keys: [ 109 | allowed_messages: [ 110 | required: true, 111 | type: :pos_integer, 112 | doc: """ 113 | An integer that describes how many messages are allowed in the specified interval. 114 | """ 115 | ], 116 | interval: [ 117 | required: true, 118 | type: :pos_integer, 119 | doc: """ 120 | An integer that describes the interval (in milliseconds) 121 | during which the number of allowed messages is allowed. 122 | If the producer produces more than `allowed_messages` 123 | in `interval`, only `allowed_messages` will be published until 124 | the end of `interval`, and then more messages will be published. 125 | """ 126 | ] 127 | ] 128 | ] 129 | ] 130 | ], 131 | processors: [ 132 | required: true, 133 | type: :non_empty_keyword_list, 134 | doc: """ 135 | A keyword list of named processors where the key is an atom as identifier and 136 | the value is another keyword list of options. 137 | See ["Processors options"](#start_link/2-processors-options) 138 | section below. Currently only a single processor is allowed. 139 | """, 140 | subsection: """ 141 | ### Processors options 142 | 143 | > #### You don't need multiple processors {: .info} 144 | > 145 | > A common misconception is that, if your data requires multiple 146 | > transformations, each with a different concern, then you must 147 | > have several processors. 148 | > 149 | > However, that's not quite true. Separation of concerns is modeled 150 | > by defining several modules and functions, not processors. Processors 151 | > are ultimately about moving data around and you should only do it 152 | > when necessary. 
Using processors for code organization purposes would 153 | > lead to inefficient pipelines. 154 | 155 | """, 156 | keys: [ 157 | *: [ 158 | type: :keyword_list, 159 | keys: [ 160 | concurrency: [ 161 | type: :pos_integer, 162 | doc: """ 163 | The number of concurrent process that will 164 | be started by Broadway. Use this option to control the concurrency level 165 | of the processors. The default value is `System.schedulers_online() * 2`. 166 | """ 167 | ], 168 | min_demand: [ 169 | type: :non_neg_integer, 170 | doc: """ 171 | Set the minimum demand of all processors stages. 172 | """ 173 | ], 174 | max_demand: [ 175 | type: :non_neg_integer, 176 | default: 10, 177 | doc: """ 178 | Set the maximum demand of all processors stages. 179 | """ 180 | ], 181 | partition_by: [ 182 | type: {:fun, 1}, 183 | doc: """ 184 | Overrides the top-level `:partition_by`. 185 | """ 186 | ], 187 | spawn_opt: [ 188 | type: :keyword_list, 189 | doc: """ 190 | Overrides the top-level `:spawn_opt`. 191 | """ 192 | ], 193 | hibernate_after: [ 194 | type: :pos_integer, 195 | doc: """ 196 | Overrides the top-level `:hibernate_after`. 197 | """ 198 | ] 199 | ] 200 | ] 201 | ] 202 | ], 203 | batchers: [ 204 | default: [], 205 | type: :keyword_list, 206 | doc: """ 207 | A keyword list of named batchers 208 | where the key is an atom as identifier and the value is another 209 | keyword list of options. See ["Batchers options"](#start_link/2-batchers-options) 210 | section below. 211 | """, 212 | subsection: """ 213 | ### Batchers options 214 | 215 | """, 216 | keys: [ 217 | *: [ 218 | type: :keyword_list, 219 | keys: [ 220 | concurrency: [ 221 | type: :pos_integer, 222 | default: 1, 223 | doc: """ 224 | The number of concurrent batch processors 225 | that will be started by Broadway. Use this option to control the 226 | concurrency level. Note that this only sets the numbers of batch 227 | processors for each batcher group, not the number of batchers. 228 | The number of batchers will always be one for each batcher key 229 | defined. 230 | """ 231 | ], 232 | batch_size: [ 233 | type: {:custom, __MODULE__, :validate_batch_size, []}, 234 | default: 100, 235 | doc: """ 236 | The size of the generated batches. Default value is `100`. It is typically an 237 | integer but it can also be tuple of `{init_acc, fun}` 238 | where `fun` receives two arguments: a `Broadway.Message` and 239 | an `acc`. The function must return either `{:emit, acc}` to indicate 240 | all batched messages must be emitted or `{:cont, acc}` to continue 241 | batching. `init_acc` is the initial accumulator used on the first call. You can 242 | consider that setting the accumulator to an integer is the equivalent to custom 243 | batching function of: 244 | 245 | {batch_size, 246 | fn 247 | _message, 1 -> {:emit, batch_size} 248 | _message, count -> {:cont, count - 1} 249 | end} 250 | 251 | We start with the batch size as the accumulator, and then we go down for every 252 | event. When we get down to `1`, we emit the batch and *reset* the accumulator 253 | to the batch size. That's because when returning `{:emit, acc}`, `acc` is 254 | used for the next call to the `:batch_size` function. 255 | 256 | > #### When is this called {: .info} 257 | > 258 | > If you pass a function as the batch size, that function is invoked *after* 259 | > `c:handle_message/3`. 260 | 261 | """ 262 | ], 263 | max_demand: [ 264 | type: :pos_integer, 265 | doc: """ 266 | Sets the maximum demand of batcher stages. 
267 | By default it is set to `:batch_size`, if `:batch_size` is an integer. 268 | Must be set if the `:batch_size` is a function. 269 | """ 270 | ], 271 | batch_timeout: [ 272 | type: :pos_integer, 273 | default: 1000, 274 | doc: """ 275 | The time, in milliseconds, that the batcher waits before flushing 276 | the list of messages. When this timeout is reached, a new batch 277 | is generated and sent downstream, no matter if the `:batch_size` 278 | has been reached or not. 279 | """ 280 | ], 281 | partition_by: [ 282 | type: {:fun, 1}, 283 | doc: """ 284 | Optional. Overrides the top-level `:partition_by`. 285 | """ 286 | ], 287 | spawn_opt: [ 288 | type: :keyword_list, 289 | doc: """ 290 | Overrides the top-level `:spawn_opt`. 291 | """ 292 | ], 293 | hibernate_after: [ 294 | type: :pos_integer, 295 | doc: """ 296 | Overrides the top-level `:hibernate_after`. 297 | """ 298 | ] 299 | ] 300 | ] 301 | ] 302 | ], 303 | partition_by: [ 304 | type: {:fun, 1}, 305 | doc: """ 306 | A function that controls how data is 307 | partitioned across all processors and batchers. It receives a 308 | `Broadway.Message` and it must return a non-negative integer, 309 | starting with zero, that will be mapped to one of the existing 310 | processors. See ["Ordering and Partitioning"](#module-ordering-and-partitioning) 311 | in the module docs for more information and known pitfalls. 312 | """ 313 | ], 314 | spawn_opt: [ 315 | type: :keyword_list, 316 | doc: """ 317 | Low-level options given when starting a 318 | process. Applies to producers, processors, and batchers. 319 | See `erlang:spawn_opt/2` for more information. 320 | """ 321 | ], 322 | hibernate_after: [ 323 | type: :pos_integer, 324 | default: 15_000, 325 | doc: """ 326 | If a process does not receive any message within this interval, it will hibernate, 327 | compacting memory. Applies to producers, processors, and batchers. 328 | Defaults to `15_000` (millisecond). 329 | """ 330 | ] 331 | ] 332 | 333 | @definition NimbleOptions.new!(definition) 334 | 335 | def definition() do 336 | @definition 337 | end 338 | 339 | def validate_name(name) when is_atom(name), do: {:ok, name} 340 | 341 | def validate_name({:via, module, _term} = via) when is_atom(module), do: {:ok, via} 342 | 343 | def validate_name(name) do 344 | {:error, 345 | "expected :name to be an atom or a {:via, module, term} tuple, got: #{inspect(name)}"} 346 | end 347 | 348 | def validate_batch_size(size) when is_integer(size) and size > 0, do: {:ok, size} 349 | 350 | def validate_batch_size({_acc, func} = batch_splitter) when is_function(func) do 351 | if is_function(func, 2) do 352 | {:ok, batch_splitter} 353 | else 354 | {:error, "expected `:batch_size` to include a function of 2 arity, got: #{inspect(func)}\n"} 355 | end 356 | end 357 | 358 | def validate_batch_size(batch_size) do 359 | {:error, 360 | "expected :batch_size to be a positive integer or a {acc, &fun/2} tuple, got: #{inspect(batch_size)}\n"} 361 | end 362 | end 363 | -------------------------------------------------------------------------------- /lib/broadway/producer.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Producer do 2 | @moduledoc """ 3 | A Broadway producer is a `GenStage` producer that emits 4 | `Broadway.Message` structs as events. 
5 | 6 | The `Broadway.Producer` is declared in a Broadway topology 7 | via the `:module` option (see `Broadway.start_link/2`): 8 | 9 | producer: [ 10 | module: {MyProducer, options} 11 | ] 12 | 13 | Once declared, `MyProducer` is expected to implement and 14 | behave as a `GenStage` producer. When Broadway starts, 15 | the `c:GenStage.init/1` callback will be invoked directly with the 16 | given `options`. 17 | 18 | ## Injected Broadway configuration 19 | 20 | If `options` is a keyword list, Broadway injects a `:broadway` option 21 | into the keyword list. This option contains the configuration for the 22 | complete Broadway topology (see `Broadway.start_link/2`). For example, 23 | you can use `options[:broadway][:name]` to uniquely identify the topology. 24 | 25 | The `:broadway` configuration also has an `:index` key. This 26 | is the index of the producer in its supervision tree (starting 27 | from `0`). This allows features such as having even producers 28 | connect to some server while odd producers connect to another. 29 | 30 | If `options` is any other term, it is passed as is to the `c:GenStage.init/1` 31 | callback. All other functions behave precisely as in `GenStage` 32 | with the requirements that all emitted events must be `Broadway.Message` 33 | structs. 34 | 35 | ## Optional callbacks 36 | 37 | A `Broadway.Producer` can implement two optional Broadway callbacks, 38 | `c:prepare_for_start/2` and `c:prepare_for_draining/1`, which are useful 39 | for booting up and shutting down Broadway topologies respectively. 40 | 41 | ## Producing Broadway messages 42 | 43 | You should generally modify `Broadway.Message` structs by using the functions 44 | in the `Broadway.Message` module. However, if you are implementing your 45 | own producer, you **can manipulate** some of the struct's fields directly. 46 | 47 | These fields are: 48 | 49 | * `:data` (required) - the data of the message. Even though the function 50 | `Broadway.Message.put_data/2` exists, when creating a `%Broadway.Message{}` 51 | struct from scratch you will have to pass in the `:data` field directly. 52 | 53 | * `:acknowledger` (required) - the acknowledger of the message, of type 54 | `t:Broadway.Message.acknowledger/0`. 55 | 56 | * `:metadata` (optional) - metadata about the message that your producer 57 | can attach to the message. This is useful when you want to add some metadata 58 | to messages, and document it for users to use in their pipelines. 59 | 60 | For example, a producer could create a message by doing something like this: 61 | 62 | %Broadway.Message{ 63 | data: "some data here", 64 | acknowledger: Broadway.NoopAcknowledger.init() 65 | } 66 | 67 | """ 68 | 69 | @doc """ 70 | Invoked once by Broadway during `Broadway.start_link/2`. 71 | 72 | The goal of this callback is to manipulate the general topology options, 73 | if necessary at all, and introduce any new child specs that will be 74 | started **before** the producers' supervisor in Broadway's supervision tree. 75 | Broadway's supervision tree is a `rest_for_one` supervisor (see the documentation 76 | for `Supervisor`), which means that if the children returned from this callback 77 | crash they will bring down the rest of the pipeline before being restarted. 78 | 79 | This callback is guaranteed to be invoked inside the Broadway main process. 80 | 81 | `module` is the Broadway module passed as the first argument to 82 | `Broadway.start_link/2`. `options` is all of Broadway topology options passed 83 | as the second argument to `Broadway.start_link/2`. 
84 | 85 | The return value of this callback is a tuple `{child_specs, updated_options}`. `child_specs` 86 | is the list of child specs to be started under Broadway's supervision tree. 87 | `updated_options` is a potentially-updated list of Broadway options 88 | that will be used instead of the ones passed to `Broadway.start_link/2`. This can be 89 | used to modify the characteristics of the Broadway topology to accommodate 90 | the children started here. 91 | 92 | ## Examples 93 | 94 | defmodule MyProducer do 95 | @behaviour Broadway.Producer 96 | 97 | # other callbacks... 98 | 99 | @impl true 100 | def prepare_for_start(_module, broadway_options) do 101 | children = [ 102 | {DynamicSupervisor, strategy: :one_for_one, name: MyApp.DynamicSupervisor} 103 | ] 104 | updated_options = put_in(broadway_options, [:producer, :rate_limiting], [interval: 1000, allowed_messages: 10]) 105 | 106 | {children, updated_options} 107 | end 108 | end 109 | 110 | """ 111 | @doc since: "0.5.0" 112 | @callback prepare_for_start(module :: atom, options :: keyword) :: 113 | {[child_spec], updated_options :: keyword} 114 | when child_spec: :supervisor.child_spec() | {module, any} | module 115 | 116 | @doc """ 117 | Invoked by the terminator right before Broadway starts draining in-flight 118 | messages during shutdown. 119 | 120 | This callback should be implemented by producers that need to do additional 121 | work before shutting down. That includes active producers like RabbitMQ that 122 | must ask the data provider to stop sending messages. It will be invoked for 123 | each producer stage. 124 | 125 | `state` is the current state of the producer. 126 | """ 127 | @callback prepare_for_draining(state :: any) :: 128 | {:noreply, [event], new_state} 129 | | {:noreply, [event], new_state, :hibernate} 130 | | {:stop, reason :: term, new_state} 131 | when new_state: term, event: term 132 | 133 | @optional_callbacks prepare_for_start: 2, prepare_for_draining: 1 134 | end 135 | -------------------------------------------------------------------------------- /lib/broadway/topology.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology do 2 | @moduledoc false 3 | @behaviour GenServer 4 | 5 | alias Broadway.Topology.{ 6 | ProducerStage, 7 | ProcessorStage, 8 | BatcherStage, 9 | BatchProcessorStage, 10 | Terminator, 11 | RateLimiter 12 | } 13 | 14 | alias Broadway.ConfigStorage 15 | 16 | defstruct [:context, :topology, :producer_names, :batchers_names, :rate_limiter_name] 17 | 18 | def start_link(module, opts) do 19 | GenServer.start_link(__MODULE__, {module, opts}, opts) 20 | end 21 | 22 | def producer_names(server) do 23 | config(server).producer_names 24 | end 25 | 26 | def get_rate_limiter(server) do 27 | if name = config(server).rate_limiter_name do 28 | {:ok, name} 29 | else 30 | {:error, :rate_limiting_not_enabled} 31 | end 32 | end 33 | 34 | def topology(server) do 35 | config(server).topology 36 | end 37 | 38 | defp config(server) do 39 | config_storage = ConfigStorage.get_module() 40 | 41 | config_storage.get(server) || 42 | exit({:noproc, {__MODULE__, :config, [server]}}) 43 | end 44 | 45 | ## Callbacks 46 | 47 | @impl true 48 | def init({module, opts}) do 49 | Process.flag(:trap_exit, true) 50 | config_storage = ConfigStorage.get_module() 51 | 52 | # We want to invoke this as early as possible otherwise the 53 | # stacktrace gets deeper and deeper in case of errors.
54 | {child_specs, opts} = prepare_for_start(module, opts) 55 | 56 | config = init_config(module, opts) 57 | {:ok, supervisor_pid} = start_supervisor(child_specs, config, opts) 58 | 59 | emit_init_event(opts, supervisor_pid) 60 | 61 | config_storage.put(config.name, %__MODULE__{ 62 | context: config.context, 63 | topology: build_topology_details(config), 64 | producer_names: process_names(config, "Producer", config.producer_config), 65 | batchers_names: 66 | Enum.map(config.batchers_config, &process_name(config, "Batcher", elem(&1, 0))), 67 | rate_limiter_name: config.rate_limiter 68 | }) 69 | 70 | {:ok, 71 | %{ 72 | supervisor_pid: supervisor_pid, 73 | terminator: config.terminator, 74 | name: config.name 75 | }} 76 | end 77 | 78 | @impl true 79 | def handle_info({:EXIT, supervisor_pid, reason}, %{supervisor_pid: supervisor_pid} = state) do 80 | {:stop, reason, state} 81 | end 82 | 83 | def handle_info(_, state) do 84 | {:noreply, state} 85 | end 86 | 87 | @impl true 88 | def terminate(reason, %{name: name, supervisor_pid: supervisor_pid, terminator: terminator}) do 89 | Broadway.Topology.Terminator.trap_exit(terminator) 90 | ref = Process.monitor(supervisor_pid) 91 | Process.exit(supervisor_pid, reason_to_signal(reason)) 92 | 93 | receive do 94 | {:DOWN, ^ref, _, _, _} -> 95 | config_storage = ConfigStorage.get_module() 96 | config_storage.delete(name) 97 | :ok 98 | end 99 | 100 | :ok 101 | end 102 | 103 | defp reason_to_signal(:killed), do: :kill 104 | defp reason_to_signal(other), do: other 105 | 106 | defp prepare_for_start(module, opts) do 107 | {producer_mod, _producer_opts} = opts[:producer][:module] 108 | 109 | if Code.ensure_loaded?(producer_mod) and 110 | function_exported?(producer_mod, :prepare_for_start, 2) do 111 | case producer_mod.prepare_for_start(module, opts) do 112 | {child_specs, opts} when is_list(child_specs) -> 113 | {child_specs, NimbleOptions.validate!(opts, Broadway.Options.definition())} 114 | 115 | other -> 116 | raise ArgumentError, 117 | "expected #{Exception.format_mfa(producer_mod, :prepare_for_start, 2)} " <> 118 | "to return {child_specs, options}, got: #{inspect(other)}" 119 | end 120 | else 121 | {[], opts} 122 | end 123 | end 124 | 125 | defp start_supervisor(child_specs, config, opts) do 126 | {producers_names, producers_specs} = build_producers_specs(config, opts) 127 | {processors_names, processors_specs} = build_processors_specs(config, producers_names) 128 | 129 | children = 130 | [ 131 | build_rate_limiter_spec(config, producers_names), 132 | build_producer_supervisor_spec(config, producers_specs), 133 | build_processor_supervisor_spec(config, processors_specs) 134 | ] ++ 135 | build_batchers_supervisor_and_terminator_specs(config, producers_names, processors_names) 136 | 137 | supervisor_opts = [ 138 | name: process_name(config, "Supervisor"), 139 | max_restarts: config.max_restarts, 140 | max_seconds: config.max_seconds, 141 | strategy: :rest_for_one 142 | ] 143 | 144 | Supervisor.start_link(child_specs ++ children, supervisor_opts) 145 | end 146 | 147 | defp init_config(module, opts) do 148 | %{ 149 | name: opts[:name], 150 | module: module, 151 | producer_config: opts[:producer], 152 | processors_config: init_processors_config(opts[:processors]), 153 | batchers_config: opts[:batchers], 154 | context: opts[:context], 155 | max_restarts: opts[:max_restarts], 156 | max_seconds: opts[:max_seconds], 157 | shutdown: opts[:shutdown], 158 | resubscribe_interval: opts[:resubscribe_interval], 159 | terminator: nil, 160 | rate_limiter: nil 161 | } 162 
| |> put_terminator() 163 | |> put_rate_limiter(opts) 164 | end 165 | 166 | defp put_terminator(config) do 167 | Map.put(config, :terminator, process_name(config, "Terminator")) 168 | end 169 | 170 | defp put_rate_limiter(config, opts) do 171 | if opts[:producer][:rate_limiting] do 172 | Map.put(config, :rate_limiter, process_name(config, "RateLimiter")) 173 | else 174 | config 175 | end 176 | end 177 | 178 | defp init_processors_config(config) do 179 | Enum.map(config, fn {key, opts} -> 180 | {key, Keyword.put_new(opts, :concurrency, System.schedulers_online() * 2)} 181 | end) 182 | end 183 | 184 | defp emit_init_event(user_config, supervisor_pid) do 185 | measurements = %{system_time: System.monotonic_time()} 186 | 187 | metadata = %{ 188 | config: user_config, 189 | supervisor_pid: supervisor_pid 190 | } 191 | 192 | :telemetry.execute([:broadway, :topology, :init], measurements, metadata) 193 | end 194 | 195 | defp start_options(name, config) do 196 | [name: name] ++ Keyword.take(config, [:spawn_opt, :hibernate_after]) 197 | end 198 | 199 | defp build_rate_limiter_spec(config, producers_names) do 200 | %{producer_config: producer_config} = config 201 | 202 | opts = [ 203 | name: process_name(config, "RateLimiter"), 204 | rate_limiting: producer_config[:rate_limiting], 205 | producers_names: producers_names 206 | ] 207 | 208 | {RateLimiter, opts} 209 | end 210 | 211 | defp build_producers_specs(config, opts) do 212 | %{ 213 | producer_config: producer_config, 214 | processors_config: processors_config, 215 | shutdown: shutdown, 216 | rate_limiter: rate_limiter 217 | } = config 218 | 219 | n_producers = producer_config[:concurrency] 220 | [{_, processor_config} | _other_processors] = processors_config 221 | 222 | # The partition of the producer depends on the processor, so we handle it here. 223 | dispatcher = 224 | case processor_config[:partition_by] do 225 | nil -> 226 | {GenStage.DemandDispatcher, shuffle_demands_on_first_dispatch: true} 227 | 228 | func -> 229 | n_processors = processor_config[:concurrency] 230 | hash_func = fn msg -> {msg, rem(func.(msg), n_processors)} end 231 | {GenStage.PartitionDispatcher, partitions: 0..(n_processors - 1), hash: hash_func} 232 | end 233 | 234 | args = [broadway: opts, dispatcher: dispatcher, rate_limiter: rate_limiter] ++ producer_config 235 | 236 | names_and_specs = 237 | for index <- 0..(n_producers - 1) do 238 | name = process_name(config, "Producer", index) 239 | start_options = start_options(name, producer_config) 240 | 241 | spec = %{ 242 | start: {ProducerStage, :start_link, [args, index, start_options]}, 243 | id: name, 244 | shutdown: shutdown 245 | } 246 | 247 | {name, spec} 248 | end 249 | 250 | # We want to return {names, specs} here. 
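# `Enum.unzip/1` turns `[{name_0, spec_0}, {name_1, spec_1}, ...]` into `{[name_0, name_1, ...], [spec_0, spec_1, ...]}`.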
251 | Enum.unzip(names_and_specs) 252 | end 253 | 254 | defp build_processors_specs(config, producers) do 255 | %{ 256 | name: topology_name, 257 | module: module, 258 | processors_config: processors_config, 259 | context: context, 260 | batchers_config: batchers_config, 261 | resubscribe_interval: resubscribe_interval, 262 | terminator: terminator, 263 | shutdown: shutdown, 264 | producer_config: producer_config 265 | } = config 266 | 267 | [{key, processor_config} | other_processors] = processors_config 268 | 269 | if other_processors != [] do 270 | raise "Only one set of processors is allowed for now" 271 | end 272 | 273 | names = process_names(config, "Processor_#{key}", processor_config) 274 | 275 | # The partition of the processor depends on the next processor or the batcher, 276 | # so we handle it here. 277 | {type, dispatcher, batchers} = 278 | case Keyword.keys(batchers_config) do 279 | [] -> 280 | {:consumer, nil, :none} 281 | 282 | [_] = batchers -> 283 | {:producer_consumer, 284 | {GenStage.DemandDispatcher, shuffle_demands_on_first_dispatch: true}, batchers} 285 | 286 | [_ | _] = batchers -> 287 | {:producer_consumer, 288 | {GenStage.PartitionDispatcher, partitions: batchers, hash: &{&1, &1.batcher}}, 289 | batchers} 290 | end 291 | 292 | args = [ 293 | topology_name: topology_name, 294 | type: type, 295 | resubscribe: resubscribe_interval, 296 | terminator: terminator, 297 | module: module, 298 | context: context, 299 | dispatcher: dispatcher, 300 | processor_key: key, 301 | processor_config: processor_config, 302 | producers: producers, 303 | producer: producer_config[:module], 304 | batchers: batchers 305 | ] 306 | 307 | specs = 308 | for {name, index} <- Enum.with_index(names) do 309 | start_options = start_options(name, processor_config) 310 | args = [name: name, partition: index] ++ args 311 | 312 | %{ 313 | start: {ProcessorStage, :start_link, [args, start_options]}, 314 | id: name, 315 | shutdown: shutdown 316 | } 317 | end 318 | 319 | {names, specs} 320 | end 321 | 322 | defp build_batchers_supervisor_and_terminator_specs(config, producers_names, processors_names) do 323 | if config.batchers_config == [] do 324 | [build_terminator_spec(config, producers_names, processors_names, processors_names)] 325 | else 326 | {batch_processors_names, batcher_supervisors_specs} = 327 | build_batcher_supervisors_specs(config, processors_names) 328 | 329 | [ 330 | build_batchers_supervisor_spec(config, batcher_supervisors_specs), 331 | build_terminator_spec(config, producers_names, processors_names, batch_processors_names) 332 | ] 333 | end 334 | end 335 | 336 | defp build_batcher_supervisors_specs(config, processors) do 337 | names_and_specs = 338 | for {key, _} = batcher_config <- config.batchers_config do 339 | {batcher, batcher_spec} = build_batcher_spec(config, batcher_config, processors) 340 | 341 | {consumers_names, consumers_specs} = 342 | build_batch_processors_specs(config, batcher_config, batcher) 343 | 344 | children = [ 345 | batcher_spec, 346 | build_batch_processor_supervisor_spec(config, consumers_specs, key) 347 | ] 348 | 349 | {consumers_names, build_batcher_supervisor_spec(config, children, key)} 350 | end 351 | 352 | {names, specs} = Enum.unzip(names_and_specs) 353 | {Enum.concat(names), specs} 354 | end 355 | 356 | defp build_batcher_spec(config, batcher_config, processors) do 357 | %{terminator: terminator, shutdown: shutdown} = config 358 | {key, options} = batcher_config 359 | name = process_name(config, "Batcher", key) 360 | 361 | args = 362 | [ 363 | 
topology_name: config.name, 364 | name: name, 365 | resubscribe: :never, 366 | terminator: terminator, 367 | batcher: key, 368 | partition: key, 369 | processors: processors, 370 | context: config[:context], 371 | # Partitioning is handled inside the batcher since the batcher 372 | # needs to associate the partition with the batcher key. 373 | partition_by: options[:partition_by], 374 | concurrency: options[:concurrency] 375 | ] ++ options 376 | 377 | opts = start_options(name, options) 378 | 379 | spec = %{ 380 | start: {BatcherStage, :start_link, [args, opts]}, 381 | id: name, 382 | shutdown: shutdown 383 | } 384 | 385 | {name, spec} 386 | end 387 | 388 | defp build_batch_processors_specs(config, {key, batcher_config}, batcher) do 389 | %{ 390 | name: broadway_name, 391 | module: module, 392 | context: context, 393 | terminator: terminator, 394 | shutdown: shutdown, 395 | producer_config: producer_config 396 | } = config 397 | 398 | names = process_names(config, "BatchProcessor_#{key}", batcher_config) 399 | 400 | args = [ 401 | topology_name: broadway_name, 402 | resubscribe: :never, 403 | terminator: terminator, 404 | module: module, 405 | context: context, 406 | batcher: batcher, 407 | producer: producer_config[:module] 408 | ] 409 | 410 | specs = 411 | for {name, index} <- Enum.with_index(names) do 412 | start_options = start_options(name, batcher_config) 413 | 414 | %{ 415 | start: 416 | {BatchProcessorStage, :start_link, 417 | [[name: name, partition: index] ++ args, start_options]}, 418 | id: name, 419 | shutdown: shutdown 420 | } 421 | end 422 | 423 | {names, specs} 424 | end 425 | 426 | defp build_terminator_spec(config, producers, first, last) do 427 | %{ 428 | terminator: name, 429 | shutdown: shutdown 430 | } = config 431 | 432 | args = [ 433 | producers: producers, 434 | first: first, 435 | last: last 436 | ] 437 | 438 | start_options = [name: name] 439 | 440 | %{ 441 | start: {Terminator, :start_link, [args, start_options]}, 442 | id: name, 443 | shutdown: shutdown 444 | } 445 | end 446 | 447 | defp build_topology_details(config) do 448 | [ 449 | producers: [ 450 | %{ 451 | name: process_name(config, "Producer"), 452 | concurrency: config.producer_config[:concurrency] 453 | } 454 | ], 455 | processors: 456 | Enum.map(config.processors_config, fn {name, processor_config} -> 457 | %{ 458 | name: process_name(config, "Processor", name), 459 | processor_key: name, 460 | concurrency: processor_config[:concurrency] 461 | } 462 | end), 463 | batchers: 464 | Enum.map(config.batchers_config, fn {name, batcher_config} -> 465 | %{ 466 | batcher_name: process_name(config, "Batcher", name), 467 | batcher_key: name, 468 | name: process_name(config, "BatchProcessor", name), 469 | concurrency: batcher_config[:concurrency] 470 | } 471 | end) 472 | ] 473 | end 474 | 475 | defp process_name(config, base_name, suffix) do 476 | process_name(config, "#{base_name}_#{suffix}") 477 | end 478 | 479 | defp process_name(%{module: module, name: broadway_name} = _config, base_name) do 480 | if function_exported?(module, :process_name, 2) do 481 | module.process_name(broadway_name, base_name) 482 | else 483 | default_process_name(broadway_name, base_name) 484 | end 485 | end 486 | 487 | defp default_process_name(broadway_name, base_name) when is_atom(broadway_name) do 488 | :"#{broadway_name}.Broadway.#{base_name}" 489 | end 490 | 491 | defp default_process_name(broadway_name, _base_name) do 492 | raise ArgumentError, """ 493 | expected Broadway to be started with an atom :name, got: 
#{inspect(broadway_name)} 494 | 495 | If starting Broadway with a :name that is not an atom, you must define the \ 496 | process_name/2 callback in the module which calls "use Broadway" (see the documentation). 497 | """ 498 | end 499 | 500 | defp process_names(config, type, processor_config) do 501 | for index <- 0..(processor_config[:concurrency] - 1) do 502 | process_name(config, type, index) 503 | end 504 | end 505 | 506 | defp build_producer_supervisor_spec(config, children) do 507 | name = process_name(config, "ProducerSupervisor") 508 | children_count = length(children) 509 | 510 | # TODO: Allow max_restarts and max_seconds as configuration 511 | # options as well as shutdown and restart for each child. 512 | build_supervisor_spec(children, name, 513 | strategy: :one_for_one, 514 | max_restarts: 2 * children_count, 515 | max_seconds: children_count 516 | ) 517 | end 518 | 519 | defp build_processor_supervisor_spec(config, children) do 520 | build_supervisor_spec( 521 | children, 522 | process_name(config, "ProcessorSupervisor"), 523 | strategy: :one_for_all, 524 | max_restarts: 0 525 | ) 526 | end 527 | 528 | defp build_batchers_supervisor_spec(config, children) do 529 | children_count = length(children) 530 | 531 | build_supervisor_spec( 532 | children, 533 | process_name(config, "BatchersSupervisor"), 534 | strategy: :one_for_one, 535 | max_restarts: 2 * children_count, 536 | max_seconds: children_count 537 | ) 538 | end 539 | 540 | defp build_batcher_supervisor_spec(config, children, key) do 541 | build_supervisor_spec( 542 | children, 543 | process_name(config, "BatcherSupervisor", key), 544 | strategy: :rest_for_one, 545 | max_restarts: 4, 546 | max_seconds: 2 547 | ) 548 | end 549 | 550 | defp build_batch_processor_supervisor_spec(config, children, key) do 551 | build_supervisor_spec( 552 | children, 553 | process_name(config, "BatchProcessorSupervisor", key), 554 | strategy: :one_for_all, 555 | max_restarts: 0 556 | ) 557 | end 558 | 559 | defp build_supervisor_spec(children, name, opts) do 560 | %{ 561 | id: make_ref(), 562 | start: {Supervisor, :start_link, [children, [name: name] ++ opts]}, 563 | type: :supervisor 564 | } 565 | end 566 | end 567 | -------------------------------------------------------------------------------- /lib/broadway/topology/batch_processor_stage.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.BatchProcessorStage do 2 | @moduledoc false 3 | use GenStage 4 | require Logger 5 | alias Broadway.{Acknowledger, Message} 6 | @subscription_options [max_demand: 1, min_demand: 0] 7 | 8 | @spec start_link(term, GenServer.options()) :: GenServer.on_start() 9 | def start_link(args, stage_options) do 10 | Broadway.Topology.Subscriber.start_link( 11 | __MODULE__, 12 | [args[:batcher]], 13 | args, 14 | @subscription_options, 15 | stage_options 16 | ) 17 | end 18 | 19 | @impl true 20 | def init(args) do 21 | Process.flag(:trap_exit, true) 22 | 23 | state = %{ 24 | topology_name: args[:topology_name], 25 | name: args[:name], 26 | partition: args[:partition], 27 | module: args[:module], 28 | context: args[:context], 29 | producer: args[:producer] 30 | } 31 | 32 | {:consumer, state, []} 33 | end 34 | 35 | @impl true 36 | def handle_info({:EXIT, pid, reason}, state) when reason not in [:normal, :shutdown] do 37 | Logger.error( 38 | "Batch processor received a trapped exit from #{inspect(pid)} with reason: " <> 39 | Exception.format_exit(reason) 40 | ) 41 | 42 | {:noreply, [], state} 43 | end 44 | 45 | 
def handle_info(_msg, state) do 46 | {:noreply, [], state} 47 | end 48 | 49 | @impl true 50 | def handle_events(events, _from, state) do 51 | [{messages, batch_info}] = events 52 | %Broadway.BatchInfo{batcher: batcher, size: size} = batch_info 53 | 54 | :telemetry.span( 55 | [:broadway, :batch_processor], 56 | %{ 57 | topology_name: state.topology_name, 58 | name: state.name, 59 | index: state.partition, 60 | messages: messages, 61 | batch_info: batch_info, 62 | context: state.context, 63 | producer: state.producer 64 | }, 65 | fn -> 66 | {successful_messages, failed_messages, returned} = 67 | handle_batch(batcher, messages, batch_info, state) 68 | 69 | failed_messages = 70 | Acknowledger.maybe_handle_failed_messages( 71 | failed_messages, 72 | state.module, 73 | state.context 74 | ) 75 | 76 | if returned != size do 77 | Logger.error( 78 | "#{inspect(state.module)}.handle_batch/4 received #{size} messages and " <> 79 | "returned only #{returned}. All messages given to handle_batch/4 " <> 80 | "must be returned" 81 | ) 82 | end 83 | 84 | try do 85 | Acknowledger.ack_messages(successful_messages, failed_messages) 86 | catch 87 | kind, reason -> 88 | Logger.error(Exception.format(kind, reason, __STACKTRACE__), 89 | crash_reason: Acknowledger.crash_reason(kind, reason, __STACKTRACE__) 90 | ) 91 | end 92 | 93 | {{:noreply, [], state}, 94 | %{ 95 | topology_name: state.topology_name, 96 | name: state.name, 97 | index: state.partition, 98 | successful_messages: successful_messages, 99 | failed_messages: failed_messages, 100 | batch_info: batch_info, 101 | context: state.context, 102 | producer: state.producer 103 | }} 104 | end 105 | ) 106 | end 107 | 108 | defp handle_batch(batcher, messages, batch_info, state) do 109 | %{module: module, context: context} = state 110 | 111 | try do 112 | module.handle_batch(batcher, messages, batch_info, context) 113 | |> split_by_status([], [], 0) 114 | catch 115 | kind, reason -> 116 | reason = Exception.normalize(kind, reason, __STACKTRACE__) 117 | 118 | Logger.error(Exception.format(kind, reason, __STACKTRACE__), 119 | crash_reason: Acknowledger.crash_reason(kind, reason, __STACKTRACE__) 120 | ) 121 | 122 | messages = Enum.map(messages, &%{&1 | status: {kind, reason, __STACKTRACE__}}) 123 | {[], messages, batch_info.size} 124 | end 125 | end 126 | 127 | defp split_by_status([], successful, failed, count) do 128 | {Enum.reverse(successful), Enum.reverse(failed), count} 129 | end 130 | 131 | defp split_by_status([%Message{status: :ok} = message | rest], successful, failed, count) do 132 | split_by_status(rest, [message | successful], failed, count + 1) 133 | end 134 | 135 | defp split_by_status([%Message{} = message | rest], successful, failed, count) do 136 | split_by_status(rest, successful, [message | failed], count + 1) 137 | end 138 | 139 | defp split_by_status([other | _rest], _successful, _failed, _count) do 140 | raise "handle_batch/4 must return a list of %Broadway.Message{} structs, " <> 141 | "but one element was: #{inspect(other)}" 142 | end 143 | 144 | defp split_by_status(other, _successful, _failed, _count) do 145 | raise "handle_batch/4 must return a list of %Broadway.Message{} structs, got: #{inspect(other)}" 146 | end 147 | end 148 | -------------------------------------------------------------------------------- /lib/broadway/topology/batcher_stage.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.BatcherStage do 2 | @moduledoc false 3 | use GenStage 4 | alias 
Broadway.BatchInfo 5 | 6 | @all_batches __MODULE__.All 7 | 8 | @spec start_link(term, GenServer.options()) :: GenServer.on_start() 9 | def start_link(args, stage_options) do 10 | Broadway.Topology.Subscriber.start_link( 11 | __MODULE__, 12 | args[:processors], 13 | args, 14 | [max_demand: args[:max_demand] || args[:batch_size]], 15 | stage_options 16 | ) 17 | end 18 | 19 | @impl true 20 | def init(args) do 21 | Process.put(@all_batches, %{}) 22 | 23 | {dispatcher, partition_by} = 24 | case args[:partition_by] do 25 | nil -> 26 | {GenStage.DemandDispatcher, nil} 27 | 28 | func -> 29 | concurrency = args[:concurrency] 30 | hash_fun = fn {_, %{partition: partition}} = payload -> {payload, partition} end 31 | 32 | dispatcher = 33 | {GenStage.PartitionDispatcher, partitions: 0..(concurrency - 1), hash: hash_fun} 34 | 35 | {dispatcher, fn msg -> rem(func.(msg), concurrency) end} 36 | end 37 | 38 | state = %{ 39 | topology_name: args[:topology_name], 40 | name: args[:name], 41 | batcher: args[:batcher], 42 | batch_size: args[:batch_size], 43 | batch_timeout: args[:batch_timeout], 44 | partition_by: partition_by, 45 | context: args[:context] 46 | } 47 | 48 | {:producer_consumer, state, dispatcher: dispatcher} 49 | end 50 | 51 | @impl true 52 | def handle_events(events, _from, state) do 53 | batches = 54 | :telemetry.span( 55 | [:broadway, :batcher], 56 | %{ 57 | topology_name: state.topology_name, 58 | name: state.name, 59 | batcher_key: state.batcher, 60 | messages: events, 61 | context: state.context 62 | }, 63 | fn -> 64 | {handle_events_per_batch_key(events, [], state), 65 | %{ 66 | topology_name: state.topology_name, 67 | name: state.name, 68 | batcher_key: state.batcher, 69 | context: state.context 70 | }} 71 | end 72 | ) 73 | 74 | {:noreply, batches, state} 75 | end 76 | 77 | @impl true 78 | def handle_info({:timeout, _timer, ref}, state) do 79 | case all_batches() do 80 | %{^ref => batch_key} -> 81 | {current, _, _, _, _} = delete_batch(batch_key, ref) 82 | {:noreply, [wrap_for_delivery(batch_key, current, :timeout, state)], state} 83 | 84 | %{} -> 85 | {:noreply, [], state} 86 | end 87 | end 88 | 89 | def handle_info(:cancel_consumers, state) do 90 | events = 91 | for {ref, batch_key} <- all_batches() do 92 | {current, _, _, timer, _} = delete_batch(batch_key, ref) 93 | cancel_batch_timeout(timer) 94 | wrap_for_delivery(batch_key, current, :flush, state) 95 | end 96 | 97 | {:noreply, events, state} 98 | end 99 | 100 | def handle_info(_msg, state) do 101 | {:noreply, [], state} 102 | end 103 | 104 | ## Default batch handling 105 | 106 | defp handle_events_per_batch_key([], acc, _state) do 107 | Enum.reverse(acc) 108 | end 109 | 110 | defp handle_events_per_batch_key([event | _] = events, acc, state) do 111 | %{partition_by: partition_by} = state 112 | batch_key = batch_key(event, partition_by) 113 | {current, batch_state, batch_splitter, timer, ref} = init_or_get_batch(batch_key, state) 114 | 115 | {current, batch_state, events, flush} = 116 | split_counting( 117 | batch_key, 118 | events, 119 | batch_state, 120 | batch_splitter, 121 | nil, 122 | current, 123 | partition_by 124 | ) 125 | 126 | acc = 127 | if flush do 128 | deliver_batch(batch_key, current, flush, timer, ref, acc, state) 129 | else 130 | put_batch(batch_key, {current, batch_state, batch_splitter, timer, ref}) 131 | acc 132 | end 133 | 134 | handle_events_per_batch_key(events, acc, state) 135 | end 136 | 137 | defp split_counting(_batch_key, [], count, _batch_splitter, flush?, acc, _partition_by) do 138 | {acc, count, [], 
flush?} 139 | end 140 | 141 | defp split_counting( 142 | batch_key, 143 | [event | remained] = events, 144 | batch_state, 145 | batch_splitter, 146 | flush, 147 | acc, 148 | partition_by 149 | ) do 150 | event_batch_key = batch_key(event, partition_by) 151 | 152 | # Switch to a different batch key 153 | if event_batch_key != batch_key do 154 | {acc, batch_state, events, flush} 155 | else 156 | case batch_splitter.(event, batch_state) do 157 | # Batch splitter indicates a full batch 158 | {:emit, next_state} -> 159 | {[event | acc], next_state, remained, :size} 160 | 161 | # Same batch key but not fulfill one batch size yet 162 | {:cont, next_state} -> 163 | split_counting( 164 | batch_key, 165 | remained, 166 | next_state, 167 | batch_splitter, 168 | flush || flush_batch(event), 169 | [event | acc], 170 | partition_by 171 | ) 172 | end 173 | end 174 | end 175 | 176 | defp flush_batch(%{batch_mode: :flush}), do: :flush 177 | defp flush_batch(%{}), do: nil 178 | 179 | defp deliver_batch(batch_key, current, trigger, timer, ref, acc, state) do 180 | delete_batch(batch_key, ref) 181 | cancel_batch_timeout(timer) 182 | [wrap_for_delivery(batch_key, current, trigger, state) | acc] 183 | end 184 | 185 | ## General batch handling 186 | 187 | @compile {:inline, batch_key: 2} 188 | 189 | defp batch_key(%{batch_key: batch_key}, nil), 190 | do: batch_key 191 | 192 | defp batch_key(%{batch_key: batch_key} = event, partition_by), 193 | do: [batch_key | partition_by.(event)] 194 | 195 | defp init_or_get_batch(batch_key, state) do 196 | if batch = Process.get(batch_key) do 197 | batch 198 | else 199 | %{batch_size: batch_size, batch_timeout: batch_timeout} = state 200 | 201 | {batch_state, batch_splitter} = get_batch_splitter(batch_size) 202 | {timer, ref} = schedule_batch_timeout(batch_timeout) 203 | update_all_batches(&Map.put(&1, ref, batch_key)) 204 | {[], batch_state, batch_splitter, timer, ref} 205 | end 206 | end 207 | 208 | defp get_batch_splitter(batch_size) do 209 | if is_number(batch_size) do 210 | {batch_size, 211 | fn 212 | _message, 1 -> {:emit, batch_size} 213 | _message, count -> {:cont, count - 1} 214 | end} 215 | else 216 | # Customized batch splitter with initial state and function 217 | batch_size 218 | end 219 | end 220 | 221 | defp put_batch(batch_key, {_, _, _, _, _} = batch) do 222 | Process.put(batch_key, batch) 223 | end 224 | 225 | defp delete_batch(batch_key, ref) do 226 | update_all_batches(&Map.delete(&1, ref)) 227 | Process.delete(batch_key) 228 | end 229 | 230 | defp all_batches do 231 | Process.get(@all_batches) 232 | end 233 | 234 | defp update_all_batches(fun) do 235 | Process.put(@all_batches, fun.(Process.get(@all_batches))) 236 | end 237 | 238 | defp schedule_batch_timeout(batch_timeout) do 239 | ref = make_ref() 240 | {:erlang.start_timer(batch_timeout, self(), ref), ref} 241 | end 242 | 243 | defp cancel_batch_timeout(timer) do 244 | case :erlang.cancel_timer(timer) do 245 | false -> 246 | receive do 247 | {:timeout, ^timer, _} -> :ok 248 | after 249 | 0 -> :ok 250 | end 251 | 252 | _ -> 253 | :ok 254 | end 255 | end 256 | 257 | defp wrap_for_delivery(batch_key, reversed_events, trigger, %{partition_by: nil} = state) do 258 | wrap_for_delivery(batch_key, nil, reversed_events, trigger, state) 259 | end 260 | 261 | defp wrap_for_delivery([batch_key | partition], reversed_events, trigger, state) do 262 | wrap_for_delivery(batch_key, partition, reversed_events, trigger, state) 263 | end 264 | 265 | defp wrap_for_delivery(batch_key, partition, reversed_events, trigger, 
state) do 266 | %{batcher: batcher} = state 267 | 268 | batch_info = %BatchInfo{ 269 | batcher: batcher, 270 | batch_key: batch_key, 271 | partition: partition, 272 | size: length(reversed_events), 273 | trigger: trigger 274 | } 275 | 276 | {Enum.reverse(reversed_events), batch_info} 277 | end 278 | end 279 | -------------------------------------------------------------------------------- /lib/broadway/topology/processor_stage.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.ProcessorStage do 2 | @moduledoc false 3 | use GenStage 4 | 5 | require Logger 6 | alias Broadway.{Message, Acknowledger} 7 | 8 | @spec start_link(term, GenServer.options()) :: GenServer.on_start() 9 | def start_link(args, stage_options) do 10 | Broadway.Topology.Subscriber.start_link( 11 | __MODULE__, 12 | args[:producers], 13 | args, 14 | Keyword.take(args[:processor_config], [:min_demand, :max_demand]), 15 | stage_options 16 | ) 17 | end 18 | 19 | @impl true 20 | def init(args) do 21 | Process.flag(:trap_exit, true) 22 | type = args[:type] 23 | 24 | state = %{ 25 | topology_name: args[:topology_name], 26 | name: args[:name], 27 | partition: args[:partition], 28 | type: type, 29 | module: args[:module], 30 | context: args[:context], 31 | processor_key: args[:processor_key], 32 | batchers: args[:batchers], 33 | producer: args[:producer] 34 | } 35 | 36 | case type do 37 | :consumer -> 38 | {:consumer, state, []} 39 | 40 | :producer_consumer -> 41 | {:producer_consumer, state, dispatcher: args[:dispatcher]} 42 | end 43 | end 44 | 45 | @impl true 46 | def handle_info({:EXIT, pid, reason}, state) when reason not in [:normal, :shutdown] do 47 | Logger.error( 48 | "Processor received a trapped exit from #{inspect(pid)} with reason: " <> 49 | Exception.format_exit(reason) 50 | ) 51 | 52 | {:noreply, [], state} 53 | end 54 | 55 | def handle_info(_msg, state) do 56 | {:noreply, [], state} 57 | end 58 | 59 | @impl true 60 | def handle_events(messages, _from, state) do 61 | :telemetry.span( 62 | [:broadway, :processor], 63 | %{ 64 | topology_name: state.topology_name, 65 | name: state.name, 66 | index: state.partition, 67 | processor_key: state.processor_key, 68 | messages: messages, 69 | context: state.context, 70 | producer: state.producer 71 | }, 72 | fn -> 73 | {prepared_messages, prepared_failed_messages} = maybe_prepare_messages(messages, state) 74 | {successful_messages, failed_messages} = handle_messages(prepared_messages, [], [], state) 75 | failed_messages = prepared_failed_messages ++ failed_messages 76 | 77 | {successful_messages_to_forward, successful_messages_to_ack} = 78 | case state do 79 | %{type: :consumer} -> 80 | {[], successful_messages} 81 | 82 | %{} -> 83 | {successful_messages, []} 84 | end 85 | 86 | failed_messages = 87 | Acknowledger.maybe_handle_failed_messages( 88 | failed_messages, 89 | state.module, 90 | state.context 91 | ) 92 | 93 | try do 94 | Acknowledger.ack_messages(successful_messages_to_ack, failed_messages) 95 | catch 96 | kind, reason -> 97 | Logger.error(Exception.format(kind, reason, __STACKTRACE__), 98 | crash_reason: Acknowledger.crash_reason(kind, reason, __STACKTRACE__) 99 | ) 100 | end 101 | 102 | {{:noreply, successful_messages_to_forward, state}, 103 | %{ 104 | topology_name: state.topology_name, 105 | name: state.name, 106 | index: state.partition, 107 | successful_messages_to_ack: successful_messages_to_ack, 108 | successful_messages_to_forward: successful_messages_to_forward, 109 | processor_key: 
state.processor_key, 110 | failed_messages: failed_messages, 111 | context: state.context, 112 | producer: state.producer 113 | }} 114 | end 115 | ) 116 | end 117 | 118 | defp maybe_prepare_messages(messages, state) do 119 | %{module: module, context: context} = state 120 | 121 | if function_exported?(module, :prepare_messages, 2) do 122 | try do 123 | prepared_messages = 124 | messages 125 | |> module.prepare_messages(context) 126 | |> validate_prepared_messages(messages) 127 | 128 | {prepared_messages, []} 129 | catch 130 | kind, reason -> 131 | reason = Exception.normalize(kind, reason, __STACKTRACE__) 132 | 133 | Logger.error(Exception.format(kind, reason, __STACKTRACE__), 134 | crash_reason: Acknowledger.crash_reason(kind, reason, __STACKTRACE__) 135 | ) 136 | 137 | messages = Enum.map(messages, &%{&1 | status: {kind, reason, __STACKTRACE__}}) 138 | {[], messages} 139 | end 140 | else 141 | {messages, []} 142 | end 143 | end 144 | 145 | defp handle_messages([message | messages], successful, failed, state) do 146 | %{ 147 | module: module, 148 | context: context, 149 | processor_key: processor_key, 150 | batchers: batchers 151 | } = state 152 | 153 | {successful, failed} = 154 | try do 155 | :telemetry.span( 156 | [:broadway, :processor, :message], 157 | %{ 158 | processor_key: state.processor_key, 159 | topology_name: state.topology_name, 160 | index: state.partition, 161 | name: state.name, 162 | message: message, 163 | context: state.context 164 | }, 165 | fn -> 166 | updated_message = 167 | processor_key 168 | |> module.handle_message(message, context) 169 | |> validate_message(batchers) 170 | 171 | {updated_message, 172 | %{ 173 | processor_key: state.processor_key, 174 | topology_name: state.topology_name, 175 | index: state.partition, 176 | name: state.name, 177 | message: updated_message, 178 | context: state.context 179 | }} 180 | end 181 | ) 182 | catch 183 | kind, reason -> 184 | reason = Exception.normalize(kind, reason, __STACKTRACE__) 185 | 186 | Logger.error(Exception.format(kind, reason, __STACKTRACE__), 187 | crash_reason: Acknowledger.crash_reason(kind, reason, __STACKTRACE__) 188 | ) 189 | 190 | message = %{message | status: {kind, reason, __STACKTRACE__}} 191 | {successful, [message | failed]} 192 | else 193 | %{status: :ok} = message -> 194 | {[message | successful], failed} 195 | 196 | %{status: {:failed, _}} = message -> 197 | {successful, [message | failed]} 198 | end 199 | 200 | handle_messages(messages, successful, failed, state) 201 | end 202 | 203 | defp handle_messages([], successful, failed, _state) do 204 | {Enum.reverse(successful), Enum.reverse(failed)} 205 | end 206 | 207 | defp validate_message(%Message{batcher: batcher, status: status} = message, batchers) do 208 | if status == :ok and batchers != :none and batcher not in batchers do 209 | raise "message was set to unknown batcher #{inspect(batcher)}. 
" <> 210 | "The known batchers are #{inspect(batchers)}" 211 | end 212 | 213 | message 214 | end 215 | 216 | defp validate_message(message, _batchers) do 217 | raise "expected a Broadway.Message from handle_message/3, got #{inspect(message)}" 218 | end 219 | 220 | defp validate_prepared_messages(prepared_messages, messages) do 221 | if length(prepared_messages) != length(messages) do 222 | raise "expected all messages to be returned from prepared_messages/2" 223 | end 224 | 225 | prepared_messages 226 | end 227 | end 228 | -------------------------------------------------------------------------------- /lib/broadway/topology/producer_stage.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.ProducerStage do 2 | @moduledoc false 3 | use GenStage 4 | 5 | alias Broadway.Message 6 | alias Broadway.Topology.RateLimiter 7 | 8 | @spec start_link(term, non_neg_integer, GenServer.options()) :: GenServer.on_start() 9 | def start_link(args, index, opts \\ []) do 10 | GenStage.start_link(__MODULE__, {args, index}, opts) 11 | end 12 | 13 | @spec push_messages(GenServer.server(), [Message.t()]) :: :ok 14 | def push_messages(producer, messages) do 15 | GenStage.call(producer, {__MODULE__, :push_messages, messages}) 16 | end 17 | 18 | @spec drain(GenServer.server()) :: :ok 19 | def drain(producer) do 20 | GenStage.cast(producer, {__MODULE__, :prepare_for_draining}) 21 | GenStage.async_info(producer, {__MODULE__, :cancel_consumers}) 22 | end 23 | 24 | @impl true 25 | def init({args, index}) do 26 | {module, arg} = args[:module] 27 | transformer = args[:transformer] 28 | dispatcher = args[:dispatcher] 29 | rate_limiter = args[:rate_limiter] 30 | 31 | # Inject the topology index only if the args are a keyword list. 32 | arg = 33 | if Keyword.keyword?(arg) do 34 | Keyword.put(arg, :broadway, Keyword.put(args[:broadway], :index, index)) 35 | else 36 | arg 37 | end 38 | 39 | rate_limiting_state = 40 | if rate_limiter do 41 | rate_limiter_ref = RateLimiter.get_rate_limiter_ref(rate_limiter) 42 | 43 | %{ 44 | state: :open, 45 | draining?: false, 46 | rate_limiter: rate_limiter_ref, 47 | # A queue of "batches" of messages that we buffered. 48 | message_buffer: :queue.new(), 49 | # A queue of demands (integers) that we buffered. 
50 | demand_buffer: :queue.new() 51 | } 52 | else 53 | nil 54 | end 55 | 56 | state = %{ 57 | module: module, 58 | module_state: nil, 59 | transformer: transformer, 60 | consumers: [], 61 | rate_limiting: rate_limiting_state 62 | } 63 | 64 | case module.init(arg) do 65 | {:producer, module_state} -> 66 | {:producer, %{state | module_state: module_state}, dispatcher: dispatcher} 67 | 68 | {:producer, module_state, options} -> 69 | if options[:dispatcher] && options[:dispatcher] != dispatcher do 70 | raise "#{inspect(module)} is setting dispatcher to #{inspect(options[:dispatcher])}, " <> 71 | "which is different from dispatcher #{inspect(dispatcher)} expected by Broadway" 72 | end 73 | 74 | {:producer, %{state | module_state: module_state}, [dispatcher: dispatcher] ++ options} 75 | 76 | return_value -> 77 | {:stop, {:bad_return_value, return_value}} 78 | end 79 | end 80 | 81 | @impl true 82 | def handle_subscribe(:consumer, _, from, state) do 83 | {:automatic, update_in(state.consumers, &[from | &1])} 84 | end 85 | 86 | @impl true 87 | def handle_cancel(_, from, state) do 88 | {:noreply, [], update_in(state.consumers, &List.delete(&1, from))} 89 | end 90 | 91 | # If we're rate limited, we store the demand in the buffer instead of forwarding it. 92 | # We'll forward it once the rate limit is lifted. 93 | @impl true 94 | def handle_demand(demand, %{rate_limiting: %{state: :closed}} = state) do 95 | state = update_in(state.rate_limiting.demand_buffer, &:queue.in(demand, &1)) 96 | {:noreply, [], state} 97 | end 98 | 99 | def handle_demand(demand, state) do 100 | %{module: module, module_state: module_state} = state 101 | handle_no_reply(module.handle_demand(demand, module_state), state) 102 | end 103 | 104 | @impl true 105 | def handle_call({__MODULE__, :push_messages, messages}, _from, state) do 106 | {:reply, :ok, messages, state} 107 | end 108 | 109 | def handle_call(message, from, state) do 110 | %{module: module, module_state: module_state} = state 111 | 112 | message 113 | |> module.handle_call(from, module_state) 114 | |> case do 115 | {:reply, reply, events, new_module_state} -> 116 | messages = transform_events(events, state.transformer) 117 | {state, messages} = maybe_rate_limit_and_buffer_messages(state, messages) 118 | {:reply, reply, messages, %{state | module_state: new_module_state}} 119 | 120 | {:reply, reply, events, new_module_state, :hibernate} -> 121 | messages = transform_events(events, state.transformer) 122 | {state, messages} = maybe_rate_limit_and_buffer_messages(state, messages) 123 | {:reply, reply, messages, %{state | module_state: new_module_state}, :hibernate} 124 | 125 | {:stop, reason, reply, new_module_state} -> 126 | {:stop, reason, reply, %{state | module_state: new_module_state}} 127 | 128 | other -> 129 | handle_no_reply(other, state) 130 | end 131 | end 132 | 133 | @impl true 134 | def handle_cast({__MODULE__, :prepare_for_draining}, state) do 135 | %{module: module, module_state: module_state} = state 136 | 137 | if function_exported?(module, :prepare_for_draining, 1) do 138 | module_state 139 | |> module.prepare_for_draining() 140 | |> handle_no_reply(state) 141 | else 142 | {:noreply, [], state} 143 | end 144 | end 145 | 146 | def handle_cast(message, state) do 147 | %{module: module, module_state: module_state} = state 148 | 149 | message 150 | |> module.handle_cast(module_state) 151 | |> handle_no_reply(state) 152 | end 153 | 154 | @impl true 155 | def handle_info({__MODULE__, :cancel_consumers}, %{rate_limiting: %{} = rate_limiting} = state) do 156 | 
rate_limiting = %{rate_limiting | draining?: true} 157 | 158 | if :queue.is_empty(rate_limiting.message_buffer) do 159 | cancel_consumers(state) 160 | end 161 | 162 | {:noreply, [], %{state | rate_limiting: rate_limiting}} 163 | end 164 | 165 | def handle_info({__MODULE__, :cancel_consumers}, state) do 166 | cancel_consumers(state) 167 | {:noreply, [], state} 168 | end 169 | 170 | # Don't forward buffered demand when we're draining or when the rate limiting is closed. 171 | def handle_info( 172 | {__MODULE__, :handle_next_demand}, 173 | %{rate_limiting: %{draining?: draining?, state: rl_state}} = state 174 | ) 175 | when draining? or rl_state == :closed do 176 | {:noreply, [], state} 177 | end 178 | 179 | def handle_info({__MODULE__, :handle_next_demand}, state) do 180 | case get_and_update_in(state.rate_limiting.demand_buffer, &:queue.out/1) do 181 | {{:value, demand}, state} -> 182 | case handle_demand(demand, state) do 183 | {:noreply, messages, state} -> 184 | schedule_next_handle_demand_if_any(state) 185 | {:noreply, messages, state} 186 | 187 | {:noreply, messages, state, :hibernate} -> 188 | schedule_next_handle_demand_if_any(state) 189 | {:noreply, messages, state, :hibernate} 190 | 191 | {:stop, reason, state} -> 192 | {:stop, reason, state} 193 | end 194 | 195 | {:empty, state} -> 196 | {:noreply, [], state} 197 | end 198 | end 199 | 200 | # If the rate limit is lifted but our rate limiting state was "open", 201 | # we don't need to do anything since we don't have anything in the buffer. 202 | def handle_info({RateLimiter, :reset_rate_limiting}, %{rate_limiting: %{state: :open}} = state) do 203 | {:noreply, [], state} 204 | end 205 | 206 | def handle_info({RateLimiter, :reset_rate_limiting}, state) do 207 | state = put_in(state.rate_limiting.state, :open) 208 | 209 | {state, messages} = rate_limit_and_buffer_messages(state) 210 | 211 | # We'll schedule to handle the buffered demand regardless of 212 | # the state of rate limiting. We'll check if we can forward it 213 | # when handling the message. 
214 | schedule_next_handle_demand_if_any(state) 215 | 216 | {:noreply, messages, state} 217 | end 218 | 219 | def handle_info(message, state) do 220 | %{module: module, module_state: module_state} = state 221 | 222 | message 223 | |> module.handle_info(module_state) 224 | |> handle_no_reply(state) 225 | end 226 | 227 | @impl true 228 | def format_discarded(discarded, state) do 229 | %{module: module, module_state: module_state} = state 230 | 231 | if function_exported?(module, :format_discarded, 2) do 232 | module.format_discarded(discarded, module_state) 233 | else 234 | true 235 | end 236 | end 237 | 238 | @impl true 239 | def terminate(reason, %{module: module, module_state: module_state}) do 240 | if function_exported?(module, :terminate, 2) do 241 | module.terminate(reason, module_state) 242 | else 243 | :ok 244 | end 245 | end 246 | 247 | defp handle_no_reply(reply, %{transformer: transformer} = state) do 248 | case reply do 249 | {:noreply, events, new_module_state} when is_list(events) -> 250 | messages = transform_events(events, transformer) 251 | {state, messages} = maybe_rate_limit_and_buffer_messages(state, messages) 252 | {:noreply, messages, %{state | module_state: new_module_state}} 253 | 254 | {:noreply, events, new_module_state, :hibernate} -> 255 | messages = transform_events(events, transformer) 256 | {state, messages} = maybe_rate_limit_and_buffer_messages(state, messages) 257 | {:noreply, messages, %{state | module_state: new_module_state}, :hibernate} 258 | 259 | {:stop, reason, new_module_state} -> 260 | {:stop, reason, %{state | module_state: new_module_state}} 261 | end 262 | end 263 | 264 | defp transform_events(events, nil) do 265 | case events do 266 | [] -> :ok 267 | [message | _] -> validate_message(message) 268 | end 269 | 270 | events 271 | end 272 | 273 | defp transform_events(events, {m, f, opts}) do 274 | for event <- events do 275 | message = apply(m, f, [event, opts]) 276 | validate_message(message) 277 | end 278 | end 279 | 280 | defp validate_message(%Message{} = message) do 281 | message 282 | end 283 | 284 | defp validate_message(_message) do 285 | raise "the produced message is invalid. All messages must be a %Broadway.Message{} " <> 286 | "struct. In case you're using a standard GenStage producer, please set the " <> 287 | ":transformer option to transform produced events into message structs" 288 | end 289 | 290 | defp maybe_rate_limit_and_buffer_messages(state, messages) do 291 | if state.rate_limiting && messages != [] do 292 | state = update_in(state.rate_limiting.message_buffer, &enqueue_batch(&1, messages)) 293 | rate_limit_and_buffer_messages(state) 294 | else 295 | {state, messages} 296 | end 297 | end 298 | 299 | defp rate_limit_and_buffer_messages(%{rate_limiting: %{state: :closed}} = state) do 300 | {state, []} 301 | end 302 | 303 | defp rate_limit_and_buffer_messages(%{rate_limiting: rate_limiting} = state) do 304 | %{message_buffer: buffer, rate_limiter: rate_limiter, draining?: draining?} = rate_limiting 305 | 306 | {rate_limiting, messages_to_emit} = 307 | case RateLimiter.get_currently_allowed(rate_limiter) do 308 | # No point in trying to emit messages if no messages are allowed. In that case, 309 | # we close the rate limiting and don't emit anything. 
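# Note that `allowed` can even be negative here: `rate_limit/2` decrements the counter with `:atomics.sub_get/3`, which may take it below zero when a previous batch overshot the allowance.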
310 | allowed when allowed <= 0 -> 311 | {%{rate_limiting | state: :closed}, []} 312 | 313 | allowed -> 314 | {allowed_left, probably_emittable, buffer} = dequeue_many(buffer, allowed, []) 315 | 316 | {rate_limiting_state, messages_to_emit, messages_to_buffer} = 317 | rate_limit_messages( 318 | rate_limiter, 319 | probably_emittable, 320 | _probably_emittable_count = allowed - allowed_left 321 | ) 322 | 323 | new_buffer = enqueue_batch_r(buffer, messages_to_buffer) 324 | 325 | rate_limiting = %{ 326 | rate_limiting 327 | | message_buffer: new_buffer, 328 | state: rate_limiting_state 329 | } 330 | 331 | if draining? and :queue.is_empty(new_buffer) do 332 | cancel_consumers(state) 333 | end 334 | 335 | {rate_limiting, messages_to_emit} 336 | end 337 | 338 | {%{state | rate_limiting: rate_limiting}, messages_to_emit} 339 | end 340 | 341 | defp reverse_split_demand(rest, 0, acc) do 342 | {0, acc, rest} 343 | end 344 | 345 | defp reverse_split_demand([], demand, acc) do 346 | {demand, acc, []} 347 | end 348 | 349 | defp reverse_split_demand([head | tail], demand, acc) do 350 | reverse_split_demand(tail, demand - 1, [head | acc]) 351 | end 352 | 353 | defp dequeue_many(queue, demand, acc) do 354 | case :queue.out(queue) do 355 | {{:value, list}, queue} -> 356 | case reverse_split_demand(list, demand, acc) do 357 | {0, acc, []} -> 358 | {0, Enum.reverse(acc), queue} 359 | 360 | {0, acc, rest} -> 361 | {0, Enum.reverse(acc), :queue.in_r(rest, queue)} 362 | 363 | {demand, acc, []} -> 364 | dequeue_many(queue, demand, acc) 365 | end 366 | 367 | {:empty, queue} -> 368 | {demand, Enum.reverse(acc), queue} 369 | end 370 | end 371 | 372 | defp enqueue_batch(queue, _list = []), do: queue 373 | defp enqueue_batch(queue, list), do: :queue.in(list, queue) 374 | 375 | defp enqueue_batch_r(queue, _list = []), do: queue 376 | defp enqueue_batch_r(queue, list), do: :queue.in_r(list, queue) 377 | 378 | defp rate_limit_messages(_state, [], _count) do 379 | {:open, [], []} 380 | end 381 | 382 | defp rate_limit_messages(rate_limiter, messages, message_count) do 383 | case RateLimiter.rate_limit(rate_limiter, message_count) do 384 | # If no more messages are allowed, we're rate limited but we're able 385 | # to emit all messages that we have. 386 | 0 -> 387 | {:closed, messages, _to_buffer = []} 388 | 389 | # We were able to emit all messages and still more messages are allowed, 390 | # so the rate limiting is "open". 391 | left when left > 0 -> 392 | {:open, messages, _to_buffer = []} 393 | 394 | # We went over the rate limit, so we split (on negative index) the messages 395 | # we were able to emit and close the rate limiting. 
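# For example, if three messages were emittable and the limiter returns an overflow of -1, `Enum.split(messages, -1)` yields `{[m1, m2], [m3]}`: the first two are emitted and the last one goes back into the buffer.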
396 | overflow when overflow < 0 -> 397 | {emittable, to_buffer} = Enum.split(messages, overflow) 398 | {:closed, emittable, to_buffer} 399 | end 400 | end 401 | 402 | defp schedule_next_handle_demand_if_any(state) do 403 | if not :queue.is_empty(state.rate_limiting.demand_buffer) do 404 | send(self(), {__MODULE__, :handle_next_demand}) 405 | end 406 | end 407 | 408 | defp cancel_consumers(state) do 409 | for from <- state.consumers do 410 | send(self(), {:"$gen_producer", from, {:cancel, :shutdown}}) 411 | end 412 | end 413 | end 414 | -------------------------------------------------------------------------------- /lib/broadway/topology/rate_limiter.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.RateLimiter do 2 | @moduledoc false 3 | 4 | use GenServer 5 | 6 | @atomics_index 1 7 | 8 | def start_link(opts) do 9 | case Keyword.fetch!(opts, :rate_limiting) do 10 | # If we don't have rate limiting options, we don't even need to start this rate 11 | # limiter process. 12 | nil -> 13 | :ignore 14 | 15 | rate_limiting_opts -> 16 | name = Keyword.fetch!(opts, :name) 17 | producers_names = Keyword.fetch!(opts, :producers_names) 18 | args = {rate_limiting_opts, producers_names} 19 | GenServer.start_link(__MODULE__, args, name: name) 20 | end 21 | end 22 | 23 | def rate_limit(counter, amount) 24 | when is_reference(counter) and is_integer(amount) and amount > 0 do 25 | :atomics.sub_get(counter, @atomics_index, amount) 26 | end 27 | 28 | def get_currently_allowed(counter) when is_reference(counter) do 29 | :atomics.get(counter, @atomics_index) 30 | end 31 | 32 | def update_rate_limiting(rate_limiter, opts) do 33 | GenServer.call(rate_limiter, {:update_rate_limiting, opts}) 34 | end 35 | 36 | def get_rate_limiting(rate_limiter) do 37 | GenServer.call(rate_limiter, :get_rate_limiting) 38 | end 39 | 40 | def get_rate_limiter_ref(rate_limiter) do 41 | GenServer.call(rate_limiter, :get_rate_limiter_ref) 42 | end 43 | 44 | @impl true 45 | def init({rate_limiting_opts, producers_names}) do 46 | interval = Keyword.fetch!(rate_limiting_opts, :interval) 47 | allowed = Keyword.fetch!(rate_limiting_opts, :allowed_messages) 48 | 49 | counter = :atomics.new(@atomics_index, []) 50 | :atomics.put(counter, @atomics_index, allowed) 51 | 52 | timer = schedule_next_reset(interval) 53 | 54 | state = %{ 55 | interval: interval, 56 | allowed: allowed, 57 | producers_names: producers_names, 58 | counter: counter, 59 | reset_timer: timer 60 | } 61 | 62 | {:ok, state} 63 | end 64 | 65 | @impl true 66 | def handle_call({:update_rate_limiting, opts}, _from, state) do 67 | %{interval: interval, allowed: allowed, reset_timer: prev_timer} = state 68 | new_interval = Keyword.get(opts, :interval, interval) 69 | new_allowed = Keyword.get(opts, :allowed_messages, allowed) 70 | 71 | state = %{state | interval: new_interval, allowed: new_allowed} 72 | 73 | if Keyword.get(opts, :reset, false) do 74 | cancel_reset_limit_timer(prev_timer) 75 | timer = schedule_next_reset(0) 76 | {:reply, :ok, %{state | reset_timer: timer}} 77 | else 78 | {:reply, :ok, state} 79 | end 80 | end 81 | 82 | def handle_call(:get_rate_limiting, _from, state) do 83 | %{interval: interval, allowed: allowed} = state 84 | {:reply, %{interval: interval, allowed_messages: allowed}, state} 85 | end 86 | 87 | def handle_call(:get_rate_limiter_ref, _from, %{counter: counter} = state) do 88 | {:reply, counter, state} 89 | end 90 | 91 | @impl true 92 | def handle_info(:reset_limit, state) do 93 | 
%{producers_names: producers_names, interval: interval, allowed: allowed, counter: counter} =
94 |       state
95 |
96 |     :atomics.put(counter, @atomics_index, allowed)
97 |
98 |     for name <- producers_names,
99 |         pid = GenServer.whereis(name),
100 |         is_pid(pid),
101 |         do: send(pid, {__MODULE__, :reset_rate_limiting})
102 |
103 |     timer = schedule_next_reset(interval)
104 |
105 |     {:noreply, %{state | reset_timer: timer}}
106 |   end
107 |
108 |   defp schedule_next_reset(interval) do
109 |     Process.send_after(self(), :reset_limit, interval)
110 |   end
111 |
112 |   defp cancel_reset_limit_timer(timer) do
113 |     case Process.cancel_timer(timer) do
114 |       false ->
115 |         receive do
116 |           :reset_limit -> :ok
117 |         after
118 |           0 -> raise "unknown timer #{inspect(timer)}"
119 |         end
120 |
121 |       _ ->
122 |         :ok
123 |     end
124 |   end
125 | end
126 |
--------------------------------------------------------------------------------
/lib/broadway/topology/subscriber.ex:
--------------------------------------------------------------------------------
1 | defmodule Broadway.Topology.Subscriber do
2 |   # This module defines conveniences for subscribing to producers
3 |   # and how to resubscribe to them in case of crashes.
4 |   #
5 |   # In practice, only the first layer resubscribes in case of crashes,
6 |   # as the remaining ones are shut down via the supervision tree, which
7 |   # is set to one_for_all with max_restarts of 0 for the innermost
8 |   # supervisor, while the outermost one is rest_for_one. This guarantees
9 |   # that either all processes are running or none of them.
10 |   #
11 |   # For graceful shutdowns, we rely on cancellations with the help
12 |   # of the terminator.
13 |   @moduledoc false
14 |   @behaviour GenStage
15 |
16 |   def start_link(module, names, options, subscriptions_options, stage_options) do
17 |     GenStage.start_link(
18 |       __MODULE__,
19 |       {module, names, options, subscriptions_options},
20 |       stage_options
21 |     )
22 |   end
23 |
24 |   @impl true
25 |   def init({module, names, options, subscription_options}) do
26 |     {type, state, init_options} = module.init(options)
27 |
28 |     terminator = Keyword.fetch!(options, :terminator)
29 |     resubscribe = Keyword.fetch!(options, :resubscribe)
30 |     partition = Keyword.fetch!(options, :partition)
31 |
32 |     subscription_options =
33 |       subscription_options
34 |       |> Keyword.put(:partition, partition)
35 |       |> Keyword.put_new(:cancel, :temporary)
36 |
37 |     state =
38 |       Map.merge(state, %{
39 |         callback: module,
40 |         terminator: if(type == :consumer, do: terminator),
41 |         resubscribe: resubscribe,
42 |         producers: %{},
43 |         consumers: [],
44 |         subscription_options: subscription_options
45 |       })
46 |
47 |     Enum.each(names, &subscribe(&1, state))
48 |
49 |     extra_options = if type == :consumer, do: [], else: [buffer_size: :infinity]
50 |     {type, state, extra_options ++ init_options}
51 |   end
52 |
53 |   @impl true
54 |   def handle_events(events, from, %{callback: callback} = state) do
55 |     callback.handle_events(events, from, state)
56 |   end
57 |
58 |   @impl true
59 |   def handle_subscribe(:producer, opts, {_, ref}, state) do
60 |     process_name = Keyword.fetch!(opts, :name)
61 |     {:automatic, put_in(state.producers[ref], process_name)}
62 |   end
63 |
64 |   def handle_subscribe(:consumer, _, from, state) do
65 |     {:automatic, update_in(state.consumers, &[from | &1])}
66 |   end
67 |
68 |   @impl true
69 |   def handle_cancel(_, {_, ref} = from, state) do
70 |     case pop_in(state.producers[ref]) do
71 |       {nil, _} ->
72 |         {:noreply, [], update_in(state.consumers, &List.delete(&1, from))}
73 |
74 |       {process_name, state} ->
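        # A producer we were subscribed to cancelled or crashed: schedule a
        # resubscription attempt (when :resubscribe is an interval in ms) and,
        # if we will never resubscribe and no producers remain, start
        # cancelling our own consumers so the shutdown propagates downstream.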
75 | maybe_resubscribe(process_name, state) 76 | maybe_cancel(state) 77 | {:noreply, [], state} 78 | end 79 | end 80 | 81 | @impl true 82 | def handle_info(:will_terminate, state) do 83 | state = %{state | resubscribe: :never} 84 | maybe_cancel(state) 85 | {:noreply, [], state} 86 | end 87 | 88 | def handle_info(:cancel_consumers, %{terminator: terminator} = state) when terminator != nil do 89 | if pid = GenServer.whereis(terminator) do 90 | send(pid, {:done, self()}) 91 | end 92 | 93 | {:noreply, [], state} 94 | end 95 | 96 | def handle_info(:cancel_consumers, %{callback: callback} = state) do 97 | case callback.handle_info(:cancel_consumers, state) do 98 | # If there are no events to emit we are done 99 | {:noreply, [], state} -> 100 | for from <- state.consumers do 101 | send(self(), {:"$gen_producer", from, {:cancel, :shutdown}}) 102 | end 103 | 104 | {:noreply, [], state} 105 | 106 | # Otherwise we will try again later 107 | other -> 108 | GenStage.async_info(self(), :cancel_consumers) 109 | other 110 | end 111 | end 112 | 113 | def handle_info({:resubscribe, process_name}, state) do 114 | subscribe(process_name, state) 115 | {:noreply, [], state} 116 | end 117 | 118 | def handle_info(message, %{callback: callback} = state) do 119 | callback.handle_info(message, state) 120 | end 121 | 122 | ## Helpers 123 | 124 | defp subscribe(process_name, state) do 125 | if pid = GenServer.whereis(process_name) do 126 | opts = [to: pid, name: process_name] ++ state.subscription_options 127 | GenStage.async_subscribe(self(), opts) 128 | true 129 | else 130 | maybe_resubscribe(process_name, state) 131 | false 132 | end 133 | end 134 | 135 | defp maybe_resubscribe(process_name, %{resubscribe: integer}) when is_integer(integer) do 136 | Process.send_after(self(), {:resubscribe, process_name}, integer) 137 | true 138 | end 139 | 140 | defp maybe_resubscribe(_, _), do: false 141 | 142 | defp maybe_cancel(%{resubscribe: :never, producers: producers}) when producers == %{} do 143 | GenStage.async_info(self(), :cancel_consumers) 144 | true 145 | end 146 | 147 | defp maybe_cancel(_), do: false 148 | end 149 | -------------------------------------------------------------------------------- /lib/broadway/topology/terminator.ex: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.Terminator do 2 | @moduledoc false 3 | use GenServer 4 | 5 | @spec start_link(term, GenServer.options()) :: GenServer.on_start() 6 | def start_link(args, opts) do 7 | GenServer.start_link(__MODULE__, args, opts) 8 | end 9 | 10 | @spec trap_exit(GenServer.server()) :: :ok 11 | def trap_exit(terminator) do 12 | GenServer.cast(terminator, :trap_exit) 13 | end 14 | 15 | @impl true 16 | def init(args) do 17 | state = %{ 18 | producers: args[:producers], 19 | first: args[:first], 20 | last: args[:last] 21 | } 22 | 23 | {:ok, state} 24 | end 25 | 26 | @impl true 27 | def handle_cast(:trap_exit, state) do 28 | Process.flag(:trap_exit, true) 29 | {:noreply, state} 30 | end 31 | 32 | @impl true 33 | def handle_info(_, state) do 34 | {:noreply, state} 35 | end 36 | 37 | @impl true 38 | def terminate(_, state) do 39 | for name <- state.first, pid = GenServer.whereis(name) do 40 | send(pid, :will_terminate) 41 | end 42 | 43 | for name <- state.producers, pid = GenServer.whereis(name) do 44 | Broadway.Topology.ProducerStage.drain(pid) 45 | end 46 | 47 | for name <- state.last, pid = GenServer.whereis(name) do 48 | ref = Process.monitor(pid) 49 | 50 | receive do 51 | {:done, ^pid} -> :ok 52 
| {:DOWN, ^ref, _, _, _} -> :ok 53 | end 54 | end 55 | 56 | :ok 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Broadway.MixProject do 2 | use Mix.Project 3 | 4 | @version "1.2.1" 5 | @description "Build concurrent and multi-stage data ingestion and data processing pipelines" 6 | 7 | def project do 8 | [ 9 | app: :broadway, 10 | version: @version, 11 | elixir: "~> 1.7", 12 | name: "Broadway", 13 | description: @description, 14 | deps: deps(), 15 | docs: docs(), 16 | package: package(), 17 | test_coverage: [tool: ExCoveralls], 18 | preferred_cli_env: [docs: :docs] 19 | ] 20 | end 21 | 22 | def application do 23 | [ 24 | extra_applications: [:logger], 25 | env: [config_storage: :persistent_term], 26 | mod: {Broadway.Application, []} 27 | ] 28 | end 29 | 30 | defp deps do 31 | [ 32 | {:gen_stage, "~> 1.0"}, 33 | {:nimble_options, "~> 0.3.7 or ~> 0.4 or ~> 1.0"}, 34 | {:telemetry, "~> 0.4.3 or ~> 1.0"}, 35 | 36 | # Dev/test dependencies. 37 | {:castore, "~> 1.0", only: :test}, 38 | {:ex_doc, ">= 0.19.0", only: :docs}, 39 | {:excoveralls, "~> 0.18.0", only: :test} 40 | ] 41 | end 42 | 43 | defp docs do 44 | [ 45 | main: "introduction", 46 | source_ref: "v#{@version}", 47 | source_url: "https://github.com/dashbitco/broadway", 48 | extra_section: "Guides", 49 | extras: [ 50 | "guides/examples/introduction.md", 51 | "guides/examples/amazon-sqs.md", 52 | "guides/examples/apache-kafka.md", 53 | "guides/examples/google-cloud-pubsub.md", 54 | "guides/examples/rabbitmq.md", 55 | "guides/examples/custom-producers.md", 56 | "guides/internals/architecture.md" 57 | ], 58 | groups_for_extras: [ 59 | Examples: Path.wildcard("guides/examples/*.md"), 60 | Internals: Path.wildcard("guides/internals/*.md") 61 | ], 62 | groups_for_modules: [ 63 | # Ungrouped Modules: 64 | # 65 | # Broadway 66 | # Broadway.Message 67 | # Broadway.BatchInfo 68 | 69 | Acknowledgement: [ 70 | Broadway.Acknowledger, 71 | Broadway.CallerAcknowledger, 72 | Broadway.NoopAcknowledger 73 | ], 74 | Producers: [ 75 | Broadway.Producer, 76 | Broadway.DummyProducer 77 | ] 78 | ] 79 | ] 80 | end 81 | 82 | defp package do 83 | %{ 84 | licenses: ["Apache-2.0"], 85 | maintainers: ["Marlus Saraiva", "José Valim"], 86 | links: %{"GitHub" => "https://github.com/dashbitco/broadway"} 87 | } 88 | end 89 | end 90 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "castore": {:hex, :castore, "1.0.7", "b651241514e5f6956028147fe6637f7ac13802537e895a724f90bf3e36ddd1dd", [:mix], [], "hexpm", "da7785a4b0d2a021cd1292a60875a784b6caef71e76bf4917bdee1f390455cf5"}, 3 | "earmark_parser": {:hex, :earmark_parser, "1.4.43", "34b2f401fe473080e39ff2b90feb8ddfeef7639f8ee0bbf71bb41911831d77c5", [:mix], [], "hexpm", "970a3cd19503f5e8e527a190662be2cee5d98eed1ff72ed9b3d1a3d466692de8"}, 4 | "ex_doc": {:hex, :ex_doc, "0.37.1", "65ca30d242082b95aa852b3b73c9d9914279fff56db5dc7b3859be5504417980", [:mix], [{:earmark_parser, "~> 1.4.42", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: 
:makeup_html, repo: "hexpm", optional: true]}], "hexpm", "6774f75477733ea88ce861476db031f9399c110640752ca2b400dbbb50491224"}, 5 | "excoveralls": {:hex, :excoveralls, "0.18.1", "a6f547570c6b24ec13f122a5634833a063aec49218f6fff27de9df693a15588c", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "d65f79db146bb20399f23046015974de0079668b9abb2f5aac074d078da60b8d"}, 6 | "gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"}, 7 | "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"}, 8 | "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, 9 | "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, 10 | "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"}, 11 | "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, 12 | "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, 13 | "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"}, 14 | } 15 | -------------------------------------------------------------------------------- /test/broadway/acknowledger_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Broadway.AcknowledgerTest do 2 | use ExUnit.Case 3 | 4 | describe "crash_reason/3" do 5 | test "exceptions" do 6 | {kind, reason, stack} = kind_reason_stack(fn -> raise "oops" end) 7 | 8 | assert {%RuntimeError{message: "oops"}, [_ | _]} = 9 | Broadway.Acknowledger.crash_reason(kind, reason, stack) 10 | end 11 | 12 | test "exits" do 13 | {kind, reason, stack} = kind_reason_stack(fn -> exit(:fatal_error) end) 14 | 15 | assert {:fatal_error, [_ | _]} = Broadway.Acknowledger.crash_reason(kind, reason, stack) 16 | end 17 | 18 | test "throws" do 19 | {kind, reason, stack} = kind_reason_stack(fn -> throw(:basketball) end) 20 | 21 | assert {{:nocatch, :basketball}, [_ | _]} = 22 | Broadway.Acknowledger.crash_reason(kind, reason, stack) 23 | end 24 | 25 | test "Erlang errors" do 26 | {kind, reason, stack} = 
kind_reason_stack(fn -> :erlang.error(:boom) end) 27 | 28 | assert {%ErlangError{original: :boom}, [_ | _]} = 29 | Broadway.Acknowledger.crash_reason(kind, reason, stack) 30 | end 31 | end 32 | 33 | defp kind_reason_stack(fun) do 34 | fun.() 35 | catch 36 | kind, reason -> 37 | {kind, reason, __STACKTRACE__} 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /test/broadway/config_storage_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Broadway.ConfigStorageTest do 2 | use ExUnit.Case, async: false 3 | 4 | alias Broadway.ConfigStorage.ETS 5 | 6 | setup do 7 | prev = Application.fetch_env!(:broadway, :config_storage) 8 | 9 | on_exit(fn -> 10 | Application.put_env(:broadway, :config_storage, prev) 11 | end) 12 | end 13 | 14 | test "ets default options" do 15 | Application.put_env(:broadway, :config_storage, :ets) 16 | ETS.setup() 17 | assert [] = ETS.list() 18 | 19 | assert ETS.put("some name", %Broadway.Topology{}) 20 | assert ["some name"] = ETS.list() 21 | assert %Broadway.Topology{} = ETS.get("some name") 22 | assert :ets.info(ETS.table(), :size) == 1 23 | 24 | ETS.delete("some name") 25 | assert :ets.info(ETS.table(), :size) == 0 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /test/broadway/dummy_producer_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Broadway.DummyProducerTest do 2 | use ExUnit.Case, async: true 3 | 4 | defmodule Handler do 5 | use Broadway 6 | 7 | def handle_message(_processor, message, _context) do 8 | message 9 | end 10 | end 11 | 12 | test "send message through", c do 13 | {:ok, _} = 14 | Broadway.start_link(Handler, 15 | name: c.test, 16 | producer: [ 17 | module: {Broadway.DummyProducer, []} 18 | ], 19 | processors: [ 20 | default: [ 21 | concurrency: 1 22 | ] 23 | ] 24 | ) 25 | 26 | ref = Broadway.test_batch(c.test, [1, 2]) 27 | assert_receive {:ack, ^ref, [%{status: :ok}, %{status: :ok}], []} 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /test/broadway/topology/batcher_stage_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.BatcherStageTest do 2 | use ExUnit.Case, async: true 3 | 4 | test "max_demand defaults to batch_size" do 5 | {:ok, pid} = 6 | Broadway.Topology.BatcherStage.start_link( 7 | [ 8 | module: __MODULE__, 9 | context: %{}, 10 | type: :producer_consumer, 11 | terminator: __MODULE__, 12 | resubscribe: :never, 13 | batcher: :default, 14 | processors: [:some_processor], 15 | batch_size: 123, 16 | batch_timeout: 1000, 17 | partition: 0 18 | ], 19 | [] 20 | ) 21 | 22 | %{state: state} = :sys.get_state(pid) 23 | assert state.subscription_options[:max_demand] == 123 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /test/broadway/topology/processor_stage_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.ProcessorStageTest do 2 | use ExUnit.Case, async: true 3 | 4 | test "set custom min and max demand" do 5 | {:ok, pid} = 6 | Broadway.Topology.ProcessorStage.start_link( 7 | [ 8 | module: __MODULE__, 9 | context: %{}, 10 | type: :producer_consumer, 11 | terminator: __MODULE__, 12 | resubscribe: :never, 13 | processor_config: [min_demand: 3, max_demand: 6], 14 | producers: [:sample], 
15 | partition: 0, 16 | dispatcher: GenStage.DemandDispatcher 17 | ], 18 | [] 19 | ) 20 | 21 | %{state: state} = :sys.get_state(pid) 22 | assert state.subscription_options[:min_demand] == 3 23 | assert state.subscription_options[:max_demand] == 6 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /test/broadway/topology/producer_stage_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Broadway.Topology.ProducerStageTest do 2 | use ExUnit.Case, async: true 3 | 4 | alias Broadway.Message 5 | alias Broadway.Topology.ProducerStage 6 | 7 | defmodule FakeProducer do 8 | use GenStage 9 | 10 | def init(_), do: {:producer, nil} 11 | 12 | def handle_demand(demand, :return_no_reply) do 13 | {:noreply, [wrap_message(demand)], :new_module_state} 14 | end 15 | 16 | def handle_demand(demand, :return_no_reply_with_hibernate) do 17 | {:noreply, [wrap_message(demand)], :new_module_state, :hibernate} 18 | end 19 | 20 | def handle_demand(demand, :return_stop) do 21 | {:stop, "error_on_demand_#{demand}", :new_module_state} 22 | end 23 | 24 | def handle_demand(demand, :do_not_wrap_messages) do 25 | {:noreply, [demand], :new_module_state} 26 | end 27 | 28 | def handle_info(message, :return_no_reply) do 29 | {:noreply, [wrap_message(message)], :new_module_state} 30 | end 31 | 32 | def handle_info(message, :return_no_reply_with_hibernate) do 33 | {:noreply, [wrap_message(message)], :new_module_state, :hibernate} 34 | end 35 | 36 | def handle_info(message, :return_stop) do 37 | {:stop, "error_on_#{message}", :new_module_state} 38 | end 39 | 40 | def handle_info(message, :do_not_wrap_messages) do 41 | {:noreply, [message], :new_module_state} 42 | end 43 | 44 | def terminate(reason, state) do 45 | {reason, state} 46 | end 47 | 48 | def transformer(event, concat: text) do 49 | %Message{data: "#{event}#{text}", acknowledger: {__MODULE__, event}} 50 | end 51 | 52 | defp wrap_message(data) do 53 | %Message{data: data, acknowledger: {__MODULE__, data}} 54 | end 55 | end 56 | 57 | defmodule ProducerWithOutTerminate do 58 | use GenStage 59 | 60 | def init(_), do: {:producer, nil} 61 | end 62 | 63 | defmodule ProducerWithBadReturn do 64 | use GenStage 65 | 66 | def init(_), do: {:consumer, nil} 67 | end 68 | 69 | setup do 70 | %{ 71 | state: %{ 72 | module: FakeProducer, 73 | transformer: nil, 74 | module_state: nil, 75 | rate_limiting: nil 76 | } 77 | } 78 | end 79 | 80 | test "init with bad return" do 81 | args = %{module: {ProducerWithBadReturn, []}, broadway: []} 82 | 83 | assert ProducerStage.init({args, _index = 0}) == 84 | {:stop, {:bad_return_value, {:consumer, nil}}} 85 | end 86 | 87 | describe "wrap handle_demand" do 88 | test "returning {:noreply, [event], new_state}", %{state: state} do 89 | state = %{state | module_state: :return_no_reply} 90 | new_state = %{state | module_state: :new_module_state} 91 | 92 | assert {:noreply, [%Message{data: 10}], ^new_state} = ProducerStage.handle_demand(10, state) 93 | end 94 | 95 | test "returning {:noreply, [event], new_state, :hibernate}", %{state: state} do 96 | state = %{state | module_state: :return_no_reply_with_hibernate} 97 | new_state = %{state | module_state: :new_module_state} 98 | 99 | assert {:noreply, [%Message{data: 10}], ^new_state, :hibernate} = 100 | ProducerStage.handle_demand(10, state) 101 | end 102 | 103 | test "returning {:stop, reason, new_state}", %{state: state} do 104 | state = %{state | module_state: :return_stop} 105 | new_state = %{state | 
module_state: :new_module_state} 106 | 107 | assert ProducerStage.handle_demand(10, state) == {:stop, "error_on_demand_10", new_state} 108 | end 109 | 110 | test "raise an error if a message is not a %Message{}", %{state: state} do 111 | state = %{state | module_state: :do_not_wrap_messages} 112 | 113 | assert_raise RuntimeError, 114 | ~r/the produced message is invalid/, 115 | fn -> ProducerStage.handle_demand(10, state) end 116 | end 117 | 118 | test "transform events into %Message{} structs using a transformer", %{state: state} do 119 | transformer = {FakeProducer, :transformer, [concat: " ok"]} 120 | state = %{state | module_state: :do_not_wrap_messages, transformer: transformer} 121 | new_state = %{state | module_state: :new_module_state} 122 | 123 | assert {:noreply, [%Message{data: "10 ok"}], ^new_state} = 124 | ProducerStage.handle_demand(10, state) 125 | end 126 | end 127 | 128 | describe "wrap handle_info" do 129 | test "returning {:noreply, [event], new_state}", %{state: state} do 130 | state = %{state | module_state: :return_no_reply} 131 | new_state = %{state | module_state: :new_module_state} 132 | 133 | assert {:noreply, [%Message{data: :a_message}], ^new_state} = 134 | ProducerStage.handle_info(:a_message, state) 135 | end 136 | 137 | test "returning {:noreply, [event], new_state, :hibernate}", %{state: state} do 138 | state = %{state | module_state: :return_no_reply_with_hibernate} 139 | new_state = %{state | module_state: :new_module_state} 140 | 141 | assert {:noreply, [%Message{data: :a_message}], ^new_state, :hibernate} = 142 | ProducerStage.handle_info(:a_message, state) 143 | end 144 | 145 | test "returning {:stop, reason, new_state}", %{state: state} do 146 | state = %{state | module_state: :return_stop} 147 | new_state = %{state | module_state: :new_module_state} 148 | 149 | assert ProducerStage.handle_info(:a_message, state) == 150 | {:stop, "error_on_a_message", new_state} 151 | end 152 | 153 | test "raise an error if a message is not a %Message{}", %{state: state} do 154 | state = %{state | module_state: :do_not_wrap_messages} 155 | 156 | assert_raise RuntimeError, 157 | ~r/the produced message is invalid/, 158 | fn -> ProducerStage.handle_info(:not_a_message, state) end 159 | end 160 | 161 | test "transform events into %Message{} structs using a transformer", %{state: state} do 162 | transformer = {FakeProducer, :transformer, [concat: " ok"]} 163 | state = %{state | module_state: :do_not_wrap_messages, transformer: transformer} 164 | new_state = %{state | module_state: :new_module_state} 165 | 166 | assert {:noreply, [%Message{data: "10 ok"}], ^new_state} = 167 | ProducerStage.handle_info(10, state) 168 | end 169 | end 170 | 171 | describe "wrap terminate" do 172 | test "forward call to wrapped module" do 173 | state = %{module: FakeProducer, module_state: :module_state} 174 | 175 | assert ProducerStage.terminate(:normal, state) == {:normal, :module_state} 176 | 177 | assert ProducerStage.terminate({:shutdown, :a_term}, state) == 178 | {{:shutdown, :a_term}, :module_state} 179 | end 180 | 181 | test "returns :ok when the wrapped module doesn't define a terminate/2 callback" do 182 | state = %{module: ProducerWithOutTerminate, module_state: :module_state} 183 | 184 | assert ProducerStage.terminate(:normal, state) == :ok 185 | end 186 | end 187 | end 188 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | 
ExUnit.start(capture_log: true, assert_receive_timeout: 2000) 2 | Logger.remove_backend(:console) 3 | --------------------------------------------------------------------------------
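For context, here is a minimal, illustrative sketch of how the rate-limiting
internals above are exercised from user code. The module name MyApp.MyBroadway
is hypothetical; the producer, processor, and rate_limiting options follow
Broadway's documented configuration, and Broadway.DummyProducer is the test
producer shipped in this repository.

    defmodule MyApp.MyBroadway do
      use Broadway

      def start_link(_opts) do
        Broadway.start_link(__MODULE__,
          name: __MODULE__,
          producer: [
            module: {Broadway.DummyProducer, []},
            # Allow at most 100 messages every 5 seconds across all producers.
            rate_limiting: [allowed_messages: 100, interval: 5_000]
          ],
          processors: [default: [concurrency: 2]]
        )
      end

      @impl true
      def handle_message(_processor, message, _context) do
        message
      end
    end

    # Limits can be changed at runtime; :reset applies the new allowance
    # immediately instead of waiting for the current interval to elapse:
    # Broadway.update_rate_limiting(MyApp.MyBroadway, allowed_messages: 50, reset: true)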