├── examples ├── erlang_processor │ ├── config │ │ ├── vm.args │ │ └── sys.config │ ├── .gitignore │ ├── Makefile │ ├── src │ │ ├── erlang_processor.app.src │ │ ├── erlang_processor_sup.erl │ │ ├── erlang_processor_app.erl │ │ └── erlang_processor.erl │ ├── rebar.config │ └── README.md └── elixir_processor │ ├── config │ └── config.exs │ ├── .formatter.exs │ ├── Makefile │ ├── mix.exs │ ├── .gitignore │ ├── README.md │ └── lib │ ├── elixir_processor.ex │ └── elixir_processor │ └── application.ex ├── CHANGELOG.md ├── img ├── elixir-mld-pipeline.png └── erlang-mld-workers.png ├── Makefile ├── .gitignore ├── mix.exs ├── config └── config.exs ├── README.md ├── LICENSE ├── mix.lock └── lib ├── exmld ├── kinesis_stage.ex └── kinesis_worker.ex └── exmld.ex /examples/erlang_processor/config/vm.args: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | See the [Releases](../../releases) page. 2 | -------------------------------------------------------------------------------- /examples/erlang_processor/config/sys.config: -------------------------------------------------------------------------------- 1 | % -*- mode: erlang -*- 2 | []. 3 | -------------------------------------------------------------------------------- /img/elixir-mld-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/exmld/HEAD/img/elixir-mld-pipeline.png -------------------------------------------------------------------------------- /img/erlang-mld-workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/exmld/HEAD/img/erlang-mld-workers.png -------------------------------------------------------------------------------- /examples/elixir_processor/config/config.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | 3 | config :logger, :console, metadata: :all 4 | -------------------------------------------------------------------------------- /examples/elixir_processor/.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docs 2 | 3 | all: build 4 | 5 | build: 6 | mix compile 7 | 8 | docs: build 9 | @# run `mix escript.install hex ex_doc` first 10 | ex_doc "exmld" "git" _build/dev/lib/exmld/ebin 11 | -------------------------------------------------------------------------------- /examples/erlang_processor/.gitignore: -------------------------------------------------------------------------------- 1 | .rebar3 2 | _* 3 | .eunit 4 | *.o 5 | *.beam 6 | *.plt 7 | *.swp 8 | *.swo 9 | .erlang.cookie 10 | ebin 11 | log 12 | erl_crash.dump 13 | .rebar 14 | logs 15 | _build 16 | .idea 17 | *.iml 18 | rebar3.crashdump 19 | elixir_libs 20 | -------------------------------------------------------------------------------- /examples/elixir_processor/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: deps 2 | 3 | all: build 4 | 5 | 
deps: 6 | mix do deps.unlock --all, deps.get 7 | 8 | build: deps 9 | mix compile 10 | make jars 11 | 12 | jars: 13 | ./_build/dev/lib/erlmld/priv/download.sh 14 | 15 | clean: 16 | mix clean 17 | -------------------------------------------------------------------------------- /examples/erlang_processor/Makefile: -------------------------------------------------------------------------------- 1 | all: build 2 | 3 | deps: 4 | rebar3 do deps unlock, deps upgrade 5 | 6 | build: deps 7 | rebar3 compile 8 | make jars 9 | 10 | jars: 11 | _build/default/lib/erlmld/priv/download.sh 12 | 13 | release: build 14 | rebar3 tar 15 | 16 | clean: 17 | rebar3 clean 18 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor.app.src: -------------------------------------------------------------------------------- 1 | {application, erlang_processor, 2 | [{description, "An OTP application"}, 3 | {vsn, "0.1.0"}, 4 | {registered, []}, 5 | {mod, { erlang_processor_app, []}}, 6 | {applications, 7 | [kernel, 8 | stdlib, 9 | erlexec, 10 | elixir, 11 | logger 12 | ]}, 13 | {included_applications, 14 | [erlmld, 15 | exmld 16 | ]}, 17 | {env,[]}, 18 | {modules, []}, 19 | 20 | {maintainers, []}, 21 | {licenses, ["BSD 3-Clause"]}, 22 | {links, []} 23 | ]}. 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | -------------------------------------------------------------------------------- /examples/elixir_processor/mix.exs: -------------------------------------------------------------------------------- 1 | defmodule ElixirProcessor.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :elixir_processor, 7 | version: "0.1.0", 8 | elixir: "~> 1.12", 9 | start_permanent: Mix.env() == :prod, 10 | deps: deps() 11 | ] 12 | end 13 | 14 | def application do 15 | [ 16 | extra_applications: [:logger, :erlexec], 17 | included_applications: [:erlmld, :exmld], 18 | mod: {ElixirProcessor.Application, []} 19 | ] 20 | end 21 | 22 | defp deps do 23 | [ 24 | {:exmld, "~> 1.0.2"} 25 | ] 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /examples/elixir_processor/.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 
17 | erl_crash.dump
18 | 
19 | # Also ignore archive artifacts (built via "mix archive.build").
20 | *.ez
21 | 
22 | # Ignore package tarball (built via "mix hex.build").
23 | elixir_processor-*.tar
24 | 
--------------------------------------------------------------------------------
/mix.exs:
--------------------------------------------------------------------------------
1 | defmodule Exmld.Mixfile do
2 |   use Mix.Project
3 | 
4 |   @version "1.0.4"
5 |   @name "exmld"
6 |   @repo "https://github.com/AdRoll/#{@name}"
7 | 
8 |   def project do
9 |     [
10 |       app: :exmld,
11 |       version: @version,
12 |       elixir: "~> 1.12",
13 |       start_permanent: Mix.env() == :prod,
14 |       deps: deps(),
15 |       package: package(),
16 |       docs: [source_ref: "v#{@version}",
17 |              source_url: @repo],
18 |       description: "An Elixir library for processing multiple Kinesis and " <>
19 |         "DynamoDB streams and shards in a single node using the Kinesis " <>
20 |         "Client Library and MultiLangDaemon."
21 |     ]
22 |   end
23 | 
24 |   def application do
25 |     [
26 |       extra_applications: [:logger],
27 |     ]
28 |   end
29 | 
30 |   defp deps do
31 |     [
32 |       {:flow, "~> 1.2"},
33 |       {:erlmld, "~> 1.0.2"},
34 |       {:ex_doc, "~> 0.28", only: :dev, runtime: false}
35 |     ]
36 |   end
37 | 
38 |   defp package do
39 |     %{
40 |       name: @name,
41 |       licenses: ["BSD-3-Clause"],
42 |       maintainers: ["AdRoll RTB team"],
43 |       links: %{"GitHub" => @repo}
44 |     }
45 |   end
46 | end
47 | 
--------------------------------------------------------------------------------
/config/config.exs:
--------------------------------------------------------------------------------
1 | # This file is responsible for configuring your application
2 | # and its dependencies with the aid of the Config module.
3 | import Config
4 | 
5 | # This configuration is loaded before any dependency and is restricted
6 | # to this project. If another project depends on this project, this
7 | # file won't be loaded nor affect the parent project. For this reason,
8 | # if you want to provide default values for your application for
9 | # 3rd-party users, it should be done in your "mix.exs" file.
10 | 
11 | # You can configure your application as:
12 | #
13 | #     config :exmld, key: :value
14 | #
15 | # and access this configuration in your application as:
16 | #
17 | #     Application.get_env(:exmld, :key)
18 | #
19 | # You can also configure a 3rd-party app:
20 | #
21 | #     config :logger, level: :info
22 | #
23 | 
24 | config :logger, :console,
25 |   metadata: :all
26 | 
27 | # It is also possible to import configuration files, relative to this
28 | # directory. For example, you can emulate configuration per environment
29 | # by uncommenting the line below and defining dev.exs, test.exs and such.
30 | # Configuration from the imported file will override the ones defined
31 | # here (which is why it is important to import them last).
32 | #
33 | # import_config "#{Mix.env}.exs"
34 | 
--------------------------------------------------------------------------------
/examples/erlang_processor/rebar.config:
--------------------------------------------------------------------------------
1 | %% -*- mode: erlang -*-
2 | {erl_opts, [debug_info]}.
3 | 
4 | {deps, [
5 |         {exmld,
6 |          {elixir, "exmld", "1.0.2"}},
7 |         {erlmld, "1.0.2"},
8 | 
9 |         %% these are deps of exmld. they need to be here so `rebar3
10 |         %% shell` works properly:
11 |         {flow,
12 |          {elixir, "flow", "1.2"}},
13 |         {gen_stage,
14 |          {elixir, "gen_stage", "1.1.12"}}
15 | ]}.
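%% note: the non-standard `{elixir, Name, Vsn}` dependency entries above are
%% resolved by the rebar3_elixir_compile plugin declared below.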
16 | 17 | {relx, [{release, { erlang_processor, "0.1.0" }, 18 | [ 19 | erlang_processor, 20 | runtime_tools, 21 | sasl, 22 | tools 23 | ]}, 24 | 25 | {sys_config, "config/sys.config"}, 26 | {vm_args, "config/vm.args"}, 27 | 28 | {dev_mode, true}, 29 | {include_erts, false}, 30 | 31 | {extended_start_script, true}]}. 32 | 33 | {plugins, [ 34 | {rebar3_lint, "0.1.10"}, 35 | {rebar3_elixir_compile, 36 | {git, "https://github.com/barrel-db/rebar3_elixir_compile.git", 37 | {ref, "4afc7a887dcf8e9abe3613cafd50e5f8d912e342"}}} 38 | ]}. 39 | 40 | {provider_hooks, [ 41 | {pre, [{compile, {ex, compile}}]}, 42 | {pre, [{release, {ex, compile}}]} 43 | ]}. 44 | 45 | {elixir_opts, [ 46 | {env, prod} 47 | ]}. 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # exmld 2 | 3 | This application allows Kinesis and DynamoDB streams to be processed using Elixir or 4 | Erlang (by way of the KCL MultiLangDaemon). It's particularly useful when aggregate 5 | records are being used and items can be processed in approximate order (as opposed to 6 | strict order within each shard), but that isn't a requirement. 7 | 8 | Using [erlmld](https://github.com/AdRoll/erlmld), a normal Erlang Kinesis processing 9 | application looks like this: 10 | 11 | ![Erlang - MultiLangDaemon processing](img/erlang-mld-workers.png) 12 | 13 | Using this Elixir library (which uses erlmld), a processing application looks like this: 14 | 15 | ![Elixir - MultiLangDaemon processing](img/elixir-mld-pipeline.png) 16 | 17 | This is done using the [Flow](https://hexdocs.pm/flow/Flow.html) framework to set up a 18 | MapReduce-style processing pipeline within a single BEAM node. 19 | 20 | By virtue of using the KCL, processing applications can horizontally scale across a group 21 | of ([homogenous](https://github.com/awslabs/amazon-kinesis-client/issues/103)) worker 22 | instances. 23 | 24 | Unlike most applications using the KCL's MultiLangDaemon, an Erlang or Elixir processing 25 | application using this library can easily make full use of a worker's processing power 26 | (even if the stream contains a single shard) due to use of the Flow framework. 27 | 28 | # Examples 29 | 30 | See: 31 | 32 | 1. [example erlang processor](examples/erlang_processor/) 33 | 2. [example elixir processor](examples/elixir_processor/) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, AdRoll 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor_sup.erl: -------------------------------------------------------------------------------- 1 | -module(erlang_processor_sup). 2 | 3 | -behaviour(supervisor). 4 | 5 | -export([start_link/1]). 6 | 7 | -export([init/1]). 8 | 9 | -define(SERVER, ?MODULE). 10 | 11 | start_link(Opts) -> 12 | supervisor:start_link({local, ?SERVER}, ?MODULE, Opts). 13 | 14 | init(#{stage_names := StageNames, 15 | flow_spec := FlowSpec, 16 | producers := ProducerConfigs}) -> 17 | 18 | SupFlags = #{strategy => one_for_all, 19 | intensity => 2, 20 | period => 10}, 21 | 22 | Producers = 23 | [#{id => {mld_producer, N}, 24 | type => supervisor, 25 | shutdown => infinity, 26 | start => {erlmld_sup, start_link, [ProducerConfig]}} 27 | || {N, ProducerConfig} <- lists:zip(lists:seq(1, length(ProducerConfigs)), 28 | ProducerConfigs)], 29 | 30 | %% the stages must be associated with live processes at the time the flow is started. 31 | %% if the stages are restarted, the flow should also be restarted. thus the 32 | %% one_for_all restart strategy. 33 | FlowWorker = #{id => flow, 34 | type => worker, 35 | shutdown => 5000, 36 | start => {'Elixir.Exmld', start_link, [FlowSpec]}}, 37 | 38 | Stages = [#{id => StageName, 39 | type => worker, 40 | shutdown => 5000, 41 | start => {'Elixir.Exmld.KinesisStage', start_link, [[{name, StageName}]]}} 42 | || StageName <- StageNames], 43 | 44 | {ok, {SupFlags, Stages ++ [FlowWorker | Producers]}}. 45 | -------------------------------------------------------------------------------- /examples/elixir_processor/README.md: -------------------------------------------------------------------------------- 1 | # ElixirProcessor 2 | 3 | An example elixir kinesis / dynamodb streams processing application using `exmld`. 4 | 5 | Note: running this example as-is will incur new costs in your AWS account of ~$11/mo (two 6 | new dynamodb KCL state tables with default read/write capacity of 10/10). Change the 7 | capacity of each table to 1/1 to reduce to ~$1.20/mo. 8 | 9 | 10 | ## Edit 11 | 12 | Edit the following variables in 13 | [lib/elixir_processor/application.ex](lib/elixir_processor/application.ex) according to 14 | the resources in your account / desired testing: 15 | 16 | 1. `stream_name` - a kinesis stream name 17 | 2. `stream_region` - region of the stream 18 | 3. `table_stream_arn` - a dynamodb table stream ARN 19 | 4. `table_region` - region of the table stream 20 | 5. 
`producer_configs` - list of producers to run (e.g., to test only kinesis or dynamo) 21 | 22 | 23 | ## Build 24 | 25 | $ make 26 | 27 | 28 | ## Run 29 | 30 | $ iex -S mix 31 | iex(1)> :observer.start() 32 | 33 | 34 | ## Disable KCL logspam 35 | 36 | iex(2)> Application.put_env(:erlmld, :log_kcl_spam, false) 37 | :ok 38 | 39 | 40 | ## Observe 41 | 42 | 11:36:45.688 pid=<0.199.0> full_batch_counter=9 counter=18 items=[%ElixirProcessor.Item{ 43 | token: %ElixirProcessor.Token{ 44 | sequence_number: {:sequence_number, 00000000000000000000000000000000, :undefined, 0, 1}, 45 | stage: #PID<0.189.0>, 46 | worker: #PID<0.212.0>}, 47 | value: {:stream_record, :undefined, :undefined, :undefined, 48 | {:sequence_number, 00000000000000000000000000000000, :undefined, :undefined, :undefined}, 49 | "{\"eventID\":\"00000000000000000000000000000000\",\"eventName\":\"REMOVE\", 50 | \"eventVersion\":\"1.1\",\"eventSource\":\"aws:dynamodb\",\"awsRegion\":\"us-west-2\", 51 | \"dynamodb\": ... event data ... }"}}] 52 | line=96 function=flush/1 module=ElixirProcessor 53 | file=lib/elixir_processor.ex application=elixir_processor [info] processing batch 54 | -------------------------------------------------------------------------------- /examples/erlang_processor/README.md: -------------------------------------------------------------------------------- 1 | erlang_processor 2 | ===== 3 | 4 | An example erlang kinesis / dynamodb streams processing application using `exmld`. 5 | 6 | Note: running this example as-is will incur new costs in your AWS account of ~$11/mo (two 7 | new dynamodb KCL state tables with default read/write capacity of 10/10). Change the 8 | capacity of each table to 1/1 to reduce to ~$1.20/mo. 9 | 10 | Edit 11 | ----- 12 | 13 | Edit the following variables in 14 | [src/erlang_processor_app.erl](src/erlang_processor_app.erl) 15 | according to the resources in your account / desired testing: 16 | 17 | 1. `StreamName` - a kinesis stream name 18 | 2. `StreamRegion` - region of the stream 19 | 3. `TableStreamArn` - a dynamodb table stream ARN 20 | 4. `TableRegion` - region of the table stream 21 | 5. `ProducerConfigs` - list of producers to run (e.g., to test only kinesis or dynamo) 22 | 23 | Build 24 | ----- 25 | 26 | $ make 27 | 28 | Run 29 | ----- 30 | 31 | $ rebar3 shell 32 | 1> observer:start(). 33 | 34 | Disable KCL logspam 35 | ----- 36 | 37 | 2> application:set_env(erlmld, log_kcl_spam, false). 38 | ok 39 | 40 | Observe 41 | ----- 42 | 43 | <0.434.0> processing items: [{item, 44 | #{'__struct__' => 45 | 'Elixir.Exmld.KinesisWorker.Datum', 46 | opaque => some_opaque_value, 47 | shard_id => 48 | <<"shardId-00000001537808865642-00000000">>, 49 | stream_record => 50 | {stream_record,undefined,undefined, 51 | undefined, 52 | {sequence_number, 53 | 2027497200000000000664507565,undefined, 54 | undefined,undefined}, 55 | <<"{\"eventID\":\"00000000000000000000000000000000\",\"eventName\":\"INSERT\",\"eventVersion\":\"1.1\",\"eventSource\":\"aws:dynamodb\",\"awsRegion\":\"us-west-2\",\"dynamodb\":{\"ApproximateCreationDateTime\":1537821240000,\"Keys\": ... },\"SequenceNumber\":\"00000000000000000000000000000000\",\"SizeBytes\":1234,\"StreamViewType\":\"KEYS_ONLY\"}}">>}}, 56 | {flow_token,<0.424.0>,<0.459.0>, 57 | {sequence_number,00000000000000000000000000000000, 58 | undefined,0,1}}}, 59 | ... 60 | 61 | 62 | Make a release 63 | ----- 64 | 65 | $ make release 66 | ... (unpack release somewhere) ... 
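    e.g. (illustrative; rebar3 tar writes the tarball under _build/default/rel/):
    $ mkdir -p /tmp/erlang_processor
    $ tar -xzf _build/default/rel/erlang_processor/erlang_processor-0.1.0.tar.gz -C /tmp/erlang_processor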
67 | in release dir: 68 | $ ./bin/erlang_processor foreground 69 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "earmark_parser": {:hex, :earmark_parser, "1.4.25", "2024618731c55ebfcc5439d756852ec4e85978a39d0d58593763924d9a15916f", [:mix], [], "hexpm", "56749c5e1c59447f7b7a23ddb235e4b3defe276afc220a6227237f3efe83f51e"}, 3 | "erlexec": {:hex, :erlexec, "1.10.9", "3cbb3476f942bfb8b68b85721c21c1835061cf6dd35f5285c2362e85b100ddc7", [:rebar3], [], "hexpm", "271e5b5f2d91cdb9887efe74d89026c199bfc69f074cade0d08dab60993fa14e"}, 4 | "erlmld": {:hex, :erlmld, "1.0.2", "a3dad389e0f07d3ad0bc9e99ead5ab1e3527365a9eb7169945098c1e8e504d6b", [:rebar3], [{:erlexec, "1.10.9", [hex: :erlexec, repo: "hexpm", optional: false]}, {:jiffy, "1.1.1", [hex: :jiffy, repo: "hexpm", optional: false]}], "hexpm", "c35739e93864da3321f6f316771b8e9712bc590fb6f67dd08403d3c901076aa3"}, 5 | "ex_doc": {:hex, :ex_doc, "0.28.4", "001a0ea6beac2f810f1abc3dbf4b123e9593eaa5f00dd13ded024eae7c523298", [:mix], [{:earmark_parser, "~> 1.4.19", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "bf85d003dd34911d89c8ddb8bda1a958af3471a274a4c2150a9c01c78ac3f8ed"}, 6 | "flow": {:hex, :flow, "1.2.0", "515e03aa3d056cecc3e3f1e80f6ca4bbf5f45b13c88dee5db880b2f3f24f1caa", [:mix], [{:gen_stage, "~> 1.0", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "1b45bfc8a9202c5ec80b077c21df133561e56c56189ba4082dddccb6b5762525"}, 7 | "gen_stage": {:hex, :gen_stage, "1.1.2", "b1656cd4ba431ed02c5656fe10cb5423820847113a07218da68eae5d6a260c23", [:mix], [], "hexpm", "9e39af23140f704e2b07a3e29d8f05fd21c2aaf4088ff43cb82be4b9e3148d02"}, 8 | "jiffy": {:hex, :jiffy, "1.1.1", "aca10f47aa91697bf24ab9582c74e00e8e95474c7ef9f76d4f1a338d0f5de21b", [:rebar3], [], "hexpm", "62e1f0581c3c19c33a725c781dfa88410d8bff1bbafc3885a2552286b4785c4c"}, 9 | "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, 10 | "makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"}, 11 | "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, 12 | "nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"}, 13 | } 14 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor_app.erl: -------------------------------------------------------------------------------- 1 | -module(erlang_processor_app). 
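%% example application callback module: builds the producer and flow
%% configuration shown below and starts the erlang_processor_sup tree.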
2 | 3 | -behaviour(application). 4 | 5 | -export([start/2, stop/1]). 6 | 7 | start(_StartType, _StartArgs) -> 8 | %% note: in a real application many of these values would be in a common app env 9 | %% configuration, but are shown here for illustration. A key `asdf_xyz` appearing in 10 | %% a producer config map will be used to substitute the variable "${ASDF_XYZ}" 11 | %% appearing in erlmld/priv/mld.properties.in. 12 | ErlMldConfig = maps:from_list(application:get_all_env(erlmld)), 13 | 14 | %% emit kcl spam to console. in a real application, this could be configured using a 15 | %% lager-compatible module for logging to a file. 16 | application:set_env(erlmld, log_kcl_spam, true), 17 | 18 | %% id of this worker instance; it should be unique per beam node. if not supplied, it 19 | %% will be auto-generated by the KCL. two different nodes using the same worker id 20 | %% will clobber each other's state. 21 | WorkerId = <<"example worker">>, 22 | 23 | %% name and region of the kinesis stream being processed. you could create this stream 24 | %% with the following command: 25 | %% 26 | %% aws kinesis create-stream --region us-west-2 --shard-count 2 \ 27 | %% --stream-name erlang-processor-test-stream 28 | %% 29 | StreamName = <<"erlang-processor-test-stream">>, 30 | StreamRegion = <<"us-west-2">>, 31 | 32 | %% ARN and region of the dynamo stream being processed. `erlmld` does not yet support 33 | %% obtaining ARNs from table names. you can obtain the ARN of an existing table 34 | %% stream with the following command: 35 | %% 36 | %% aws dynamodbstreams list-streams --region us-west-2 \ 37 | %% --table-name erlang-processor-test-table \ 38 | %% --query 'Streams[0].StreamArn' --output text 39 | %% 40 | TableStreamArn = <<"arn:aws:dynamodb:REGION:ACCOUNT-ID:table/TABLE-NAME/stream/TIMESTAMP">>, 41 | TableRegion = StreamRegion, 42 | 43 | %% in this example application, all source streams can be processed the same way, so 44 | %% we set up a single flow and set of stages. if data from different streams should 45 | %% be handled differently, separate flows should be used. 46 | %% 47 | %% these are the registered names of the GenStages which will receive kinesis records 48 | %% from each owned shard (round-robin). the actual stages will be owned by a 49 | %% supervisor, but the names are needed now due to how the flusher module and flow are 50 | %% configured. 
they could also be pids, but using names allows them to be restarted 51 | %% without restarting everything else (and be started later): 52 | StageNames = [binary_to_atom(<<"stage_", (integer_to_binary(I))/binary>>, 53 | utf8) 54 | || I <- lists:seq(1, erlang:system_info(schedulers_online))], 55 | 56 | ConcurrencyFactor = 1, % increase if processing work is IO-bound 57 | NumReducers = erlang:system_info(schedulers_online) * ConcurrencyFactor, 58 | 59 | %% size of each batch to be "flushed" (i.e., collect this many items before processing 60 | %% them all in a batch): 61 | BatchSize = 10, 62 | 63 | %% attempt to flush batches every 10s even if batch size not reached (relies on 64 | %% heartbeat mechanic): 65 | FlushInterval = 10000, 66 | 67 | %% checkpoint every 60s: 68 | CheckpointInterval = 60000, 69 | 70 | %% fail if a worker stalls for 600s: 71 | WatchdogTimeout = 600000, 72 | 73 | %% max number of in-flight items for each kinesis shard worker: 74 | MaxPending = 1024, 75 | 76 | %% flow demand parameters; see flow documentation: 77 | MinDemand = 1, 78 | MaxDemand = 1024, 79 | 80 | FlowOptions = [{num_stages, NumReducers}, 81 | {min_demand, MinDemand}, 82 | {max_demand, MaxDemand}], 83 | FlowSpec = erlang_processor:flow_spec(StageNames, BatchSize, FlushInterval, FlowOptions), 84 | 85 | %% retrieve this many records with each api call (max: 10000 (kinesis), 1000 86 | %% (dynamo)): 87 | MaxRecords = 1000, 88 | 89 | CommonConfig = maps:merge( 90 | ErlMldConfig, 91 | #{ 92 | record_processor => erlmld_batch_processor, 93 | record_processor_data => 94 | #{flusher_mod => 'Elixir.Exmld.KinesisWorker', 95 | flusher_mod_data => 96 | [{stages, StageNames}, 97 | {opaque, some_opaque_value}, 98 | {max_pending, MaxPending}], 99 | flush_interval_ms => FlushInterval, 100 | checkpoint_interval_ms => CheckpointInterval, 101 | watchdog_timeout_ms => WatchdogTimeout, 102 | on_checkpoint => fun on_checkpoint/2, 103 | description => "description goes here"}, 104 | 105 | worker_id => WorkerId, 106 | 107 | %% initial starting position if no shard checkpoint exists; LATEST is 108 | %% most recent, TRIM_HORIZON is earliest available: 109 | initial_position => <<"TRIM_HORIZON">>, 110 | 111 | max_records => MaxRecords, 112 | 113 | %% reduce cloudwatch metric spam: 114 | metrics_level => <<"NONE">> 115 | }), 116 | 117 | %% a kinesis stream processor: 118 | KinesisProducer = #{ 119 | %% required if processing multiple source streams within a single beam node (any 120 | %% atom, used as a registered name suffix and local filename component): 121 | app_suffix => k, 122 | 123 | %% this name will be used to name the dynamodb state table used by the KCL. if 124 | %% it doesn't exist, it will be created. the table is used for coordinating 125 | %% leases held and checkpoints made by workers cooperating as part of an 126 | %% application. if two erlang nodes are running using the same value for this 127 | %% name, they are considered as two workers in a single processing application. 128 | %% a single beam node processing multiple different streams needs a unique value 129 | %% for each stream: 130 | %% 131 | %% this is the same name as the corresponding elixir_processor example application; 132 | %% if both the elixir and erlang versions are run at the same time, both will 133 | %% cooperate in processing the stream. 
134 | kcl_appname => <<"erlang-processor-kinesis-test">>, 135 | 136 | stream_name => StreamName, 137 | stream_region => StreamRegion, 138 | 139 | %% the stream type; 'kinesis' for kinesis streams, 'dynamo' for dynamodb 140 | %% streams: 141 | stream_type => kinesis 142 | }, 143 | 144 | %% a dynamo stream processor: 145 | DynamoProducer = #{ 146 | app_suffix => d, 147 | kcl_appname => <<"erlang-processor-dynamo-test">>, 148 | stream_name => TableStreamArn, 149 | stream_region => TableRegion, 150 | stream_type => dynamo 151 | }, 152 | 153 | ProducerConfigs = [maps:merge(CommonConfig, ProducerConfig) 154 | || ProducerConfig <- [KinesisProducer]],%, DynamoProducer]], 155 | erlang_processor_sup:start_link(#{stage_names => StageNames, 156 | flow_spec => FlowSpec, 157 | producers => ProducerConfigs}). 158 | 159 | stop(_State) -> 160 | ok. 161 | 162 | 163 | on_checkpoint(OpaqueData, ShardId) -> 164 | io:format("~p checkpointed (~p)~n", [OpaqueData, ShardId]). 165 | -------------------------------------------------------------------------------- /examples/elixir_processor/lib/elixir_processor.ex: -------------------------------------------------------------------------------- 1 | defmodule ElixirProcessor do 2 | @moduledoc """ 3 | Record processor example implementation. 4 | """ 5 | 6 | require Logger 7 | require Record 8 | require Exmld 9 | 10 | defstruct [:batch_size, :pending_items, full_batch_counter: 0, flush_counter: 0] 11 | 12 | defmodule Token do 13 | defstruct [:stage, :worker, :sequence_number] 14 | end 15 | 16 | defmodule Item do 17 | defstruct [:value, :token] 18 | end 19 | 20 | @doc """ 21 | Return a flow spec which can be used to set up a processing pipeline; see exmld.ex. 22 | 23 | The pipeline definition is similar but not identical to the version in 24 | `erlang_processor`: here, we supply a window with a time-based trigger and modify the 25 | flow using the `:append` option. 26 | """ 27 | def flow_spec(stage_names, flow_options, opts \\ []) do 28 | window = 29 | Flow.Window.global() 30 | |> Flow.Window.trigger_periodically(opts[:flush_interval] || 10000, :millisecond) 31 | 32 | %{ 33 | stages: stage_names, 34 | extract_items_fn: &flow_extract/1, 35 | partition_key: {:elem, 0}, 36 | state0: fn -> 37 | %__MODULE__{ 38 | batch_size: opts[:batch_size] || 10, 39 | pending_items: [] 40 | } 41 | end, 42 | process_fn: &flow_add_event/2, 43 | flow_opts: 44 | flow_options ++ 45 | [ 46 | window: window, 47 | append: fn flow -> 48 | flow 49 | |> Flow.on_trigger(&flow_flush/1) 50 | end 51 | ] 52 | } 53 | end 54 | 55 | # flow_extract/1 is called to extract sub-items from a kinesis or dynamo stream record. 56 | # this allows handling of both KPL-aggregated records and custom aggregation schemes. 57 | # the output of this function should be a list of 2-tuples ({key, value}) to be passed 58 | # to flow_add_event/2 for handling in a reducer. 59 | # 60 | # items seen by the extract function generally look like this: 61 | # 62 | # %Exmld.KinesisStage.Event{ 63 | # event: %Exmld.KinesisWorker.Datum{ 64 | # opaque: {"us-west-2", "erlang-processor-kinesis-test"}, 65 | # shard_id: "shardId-000000000001", 66 | # stream_record: {:stream_record, "12345", 946684800, 67 | # {:sequence_number, 12345, 0, :undefined, :undefined}, 68 | # " .. record data .. 
"}}, 69 | # stage: #PID<0.136.0>, 70 | # worker: #PID<0.862.0>} 71 | # 72 | defp flow_extract(%Exmld.KinesisStage.Event{ 73 | event: %Exmld.KinesisWorker.Datum{stream_record: record}, 74 | stage: stage, 75 | worker: worker 76 | }) do 77 | case record do 78 | # handle a heartbeat. the second element of the tuple will vary so heartbeats get 79 | # distributed among reducers, so the elements must be swapped since we're using the 80 | # first element as a partition key. 81 | {:heartbeat, x} -> 82 | [{x, :heartbeat}] 83 | 84 | # in a real application, sub-records could be extracted from Event here. if using a 85 | # custom non-KPL aggregation scheme, this should associate each sub-record with a 86 | # faked sequence number having the same base as the parent record, and appropriate 87 | # 'user_sub' (sub-record index) and 'user_total' (total number of extracted 88 | # sub-records) fields. then when later notifying exmld of record disposition, it 89 | # can properly track sub-record processing and advance the checkpoint beyond the 90 | # parent record if all of its sub-records were processed. 91 | # 92 | # two records having the same key (first tuple element) here will be handled by the 93 | # same reducer. in general, the key should be consistently derived from some 94 | # attribute of the record/item being processed. 95 | _ when Record.is_record(record, :stream_record) -> 96 | sn = 97 | Exmld.stream_record(record, :sequence_number) 98 | |> Exmld.sequence_number(user_sub: 0) 99 | |> Exmld.sequence_number(user_total: 1) 100 | 101 | item = %Item{ 102 | value: record, 103 | token: %Token{stage: stage, worker: worker, sequence_number: sn} 104 | } 105 | 106 | [{:erlang.phash2(item), item}] 107 | end 108 | end 109 | 110 | # handle an item extracted from a record (or a heartbeat). this occurs in a reducer 111 | # whose initial state is given by 'state0' in flow_spec/3 above. it returns an updated 112 | # state after possibly processing the event (and possibly flushing/updating the state 113 | # accordingly). here, we simply add non-heartbeat items to the current batch and flush 114 | # the batch if it has reached the target size (we don't use an event based window which 115 | # would count heartbeats). if enough time elapses, a flush will be separately triggered 116 | # by the flow window. 117 | defp flow_add_event({_key, item}, %__MODULE__{pending_items: pending} = state) do 118 | case item do 119 | :heartbeat -> state 120 | _ -> %{state | pending_items: [item | pending]} 121 | end 122 | |> maybe_flush() 123 | end 124 | 125 | # possibly process the current pending batch of records if of the appropriate size: 126 | defp maybe_flush( 127 | %__MODULE__{ 128 | pending_items: pending, 129 | batch_size: batch_size, 130 | full_batch_counter: c 131 | } = state 132 | ) 133 | when length(pending) >= batch_size do 134 | elem(flow_flush(%{state | full_batch_counter: c + 1}), 1) 135 | end 136 | 137 | defp maybe_flush(state) do 138 | state 139 | end 140 | 141 | # process the current pending batch of records, notify upstream of processing 142 | # disposition, and return the events to emit downstream (the current state) and the new 143 | # reducer accumulator (the updated state). nothing in the current example makes use of 144 | # the emitted value. 145 | defp flow_flush(state) do 146 | orig = state 147 | {:ok, state, tokens} = flush(state) 148 | :ok = notify_dispositions(tokens, :ok) 149 | {[orig], state} 150 | end 151 | 152 | # process a batch of items which have been collected, returning {:ok, state, tokens}. 
153 | # 
154 | # `tokens` is a list of tokens used by notify_dispositions/2 to inform upstream workers
155 | # of the status of processing. this is needed because a single reducer will potentially
156 | # receive records from multiple different kinesis shards. with this disposition scheme,
157 | # a kinesis worker can correctly checkpoint based on how far along downstream processing
158 | # has come (instead of for example automatically checkpointing based on time, which
159 | # could lose records).
160 |   defp flush(
161 |          %__MODULE__{pending_items: pending, full_batch_counter: fc, flush_counter: c} = state
162 |        ) do
163 |     Logger.info("processing batch", items: inspect(pending), counter: c, full_batch_counter: fc)
164 |     :timer.sleep(100 * length(pending))
165 |     tokens = for %Item{token: token} <- pending, do: token
166 |     {:ok, %{state | pending_items: [], flush_counter: c + 1}, tokens}
167 |   end
168 | 
169 |   # group item processing disposition by origin stage and worker, informing each stage of
170 |   # the records (sequence numbers) from its workers which have been processed. this
171 |   # allows upstream kinesis workers to safely checkpoint only fully processed data.
172 |   defp notify_dispositions(tokens, status) do
173 |     prepend = fn x -> &[x | &1] end
174 | 
175 |     List.foldl(tokens, %{}, fn %Token{stage: stage, worker: worker, sequence_number: sn}, acc ->
176 |       d = %Exmld.KinesisWorker.Disposition{sequence_number: sn, status: status}
177 |       Map.update(acc, stage, %{worker => [d]}, &Map.update(&1, worker, [d], prepend.(d)))
178 |     end)
179 |     |> Enum.reduce(:ok, fn {stage, worker_map}, :ok ->
180 |       Exmld.KinesisStage.disposition(stage, worker_map)
181 |     end)
182 |   end
183 | end
184 | 
--------------------------------------------------------------------------------
/lib/exmld/kinesis_stage.ex:
--------------------------------------------------------------------------------
1 | defmodule Exmld.KinesisStage do
2 |   use GenStage
3 |   require Logger
4 | 
5 |   @moduledoc """
6 |   A `GenStage` stage for use in processing data produced by `Exmld.KinesisWorker`s.
7 | 
8 |   This module acts as a GenStage producer. Subscribers will receive
9 |   `Exmld.KinesisStage.Event`s, which each wrap an underlying `Exmld.KinesisWorker.Datum`
10 |   along with information about the producing worker and stage. Downstream processors
11 |   should eventually call `disposition/2` with the disposition of processing so that
12 |   originating workers can checkpoint.
13 | 
14 |   The expected use and workflow is:
15 | 
16 |   1. Create a stage or set of stages using this module for each distinct processing
17 |      pipeline. A set of stages could be used by more than one Kinesis stream if the
18 |      processing being done is the same for all of them.
19 | 
20 |   2. Create a flow using `Flow.from_stages/2`.
21 | 
22 |   3. Configure the flow using `Exmld.flow/6`.
23 | 
24 |   4. Run the flow, which should run forever.
25 | 
26 |   5. Configure an erlmld supervision tree with a set of `Exmld.KinesisWorker`s using the
27 |      stage(s) created in (1).
28 |   """
29 | 
30 |   defstruct [{:counter, 0},
31 |              {:queue, :queue.new()},
32 |              {:demand, 0},
33 |              {:disposition, %{}}] # pid => {ref, [term]}
34 | 
35 |   defmodule Event do
36 |     @moduledoc """
37 |     Struct for events provided to an `Exmld.KinesisStage`.
38 | 
39 |     Records the stage and worker identifiers associated with an event.
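
    A delivered event looks roughly like this (pids and datum contents are
    illustrative):

        %Exmld.KinesisStage.Event{
          stage: #PID<0.136.0>,
          worker: #PID<0.862.0>,
          event: %Exmld.KinesisWorker.Datum{shard_id: "shardId-000000000001", ...}}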
40 | 41 | ## Fields 42 | 43 | * `:stage` - identifier of the `Exmld.KinesisStage` which handled the event 44 | * `:worker` - identifier of the `Exmld.KinesisWorker` which produced the event 45 | * `:event` - an `Exmld.KinesisWorker.Datum` 46 | """ 47 | defstruct stage: nil, worker: nil, event: nil 48 | @type t :: %Event{stage: pid, worker: term, event: Exmld.KinesisWorker.Datum.t} 49 | end 50 | 51 | def start_link(opts \\ []) do 52 | GenStage.start_link(__MODULE__, [], opts) 53 | end 54 | 55 | @doc """ 56 | Notify `stage` of a new Kinesis record available for processing. 57 | 58 | A new event is available for processing by `stage`. The caller will be monitored and 59 | associated with the new event, and will be blocked until after the event has been used 60 | to satisfy some downstream demand. The return value will be the disposition 61 | (success/failure) of zero or more records which were previously processed. 62 | """ 63 | @spec notify(GenStage.stage, 64 | Exmld.KinesisWorker.Datum, 65 | :infinity | non_neg_integer) :: {:disposition, [Exmld.KinesisWorker.Disposition.t]} 66 | def notify(stage, datum, timeout \\ :infinity) do 67 | GenStage.call(stage, {:notify, datum}, timeout) 68 | end 69 | 70 | @doc """ 71 | Notify `stage` of the disposition of processing some items. 72 | 73 | An attempt has been made to process some data extracted from a Kinesis record by a 74 | downstream processor. `stage` will look up the originating producer and record the 75 | disposition of processing in the next batch of data to be returned to that producer. 76 | """ 77 | @spec disposition(GenStage.stage, %{optional(pid) => [Exmld.KinesisWorker.Disposition.t]}) :: :ok 78 | def disposition(stage, disposition, timeout \\ :infinity) do 79 | GenStage.call(stage, {:disposition, disposition}, timeout) 80 | end 81 | 82 | ## GenStage callbacks 83 | 84 | def init([]) do 85 | {:producer, %__MODULE__{}} 86 | end 87 | 88 | # retrieve any buffered events from state and try to serve any pending demand. associate 89 | # each event with the process which produced it. defer reply and block the caller until 90 | # the event is used to fulfill some demand. monitor caller. 91 | def handle_call({:notify, event}, from, state) do 92 | state 93 | |> monitor_sender(from) 94 | |> enqueue_event(event, from) 95 | |> dispatch_events([]) 96 | end 97 | 98 | # a record processor stage is informing us of the disposition of some items which were 99 | # extracted from a source record. forward that information back to the originating 100 | # workers by updating the next reply state for each pid if it's still monitored. 
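  # e.g., a downstream processor typically calls (shape is illustrative):
  #
  #   Exmld.KinesisStage.disposition(stage,
  #     %{worker_pid => [%Exmld.KinesisWorker.Disposition{sequence_number: sn, status: :ok}]})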
101 | def handle_call({:disposition, worker_values}, _from, 102 | %__MODULE__{counter: counter, disposition: disposition} = state) do 103 | # worker_values: %{worker_pid => [disposition]} 104 | update = fn({worker_pid, worker_dispositions}, map) -> 105 | map 106 | |> Map.get_and_update(worker_pid, 107 | fn 108 | (nil) -> 109 | {nil, nil} 110 | ({mref, x}) -> 111 | {nil, {mref, worker_dispositions ++ x}} 112 | end) 113 | |> elem(1) 114 | end 115 | disposition = Enum.reduce(worker_values, disposition, update) 116 | counter = Enum.reduce(worker_values, counter, fn ({_, d}, n) -> n + length(d) end) 117 | {:reply, :ok, [], %{state | counter: counter, disposition: disposition}} 118 | end 119 | 120 | # a monitored process has exited; discard any saved item disposition: 121 | def handle_info({:DOWN, mref, :process, pid, _reason}, 122 | %__MODULE__{disposition: disposition} = state) do 123 | {{^mref, _}, disposition} = Map.pop(disposition, pid) 124 | {:noreply, [], %{state | disposition: disposition}} 125 | end 126 | 127 | @doc """ 128 | Handle subscriber demand. 129 | 130 | Return up to `incoming_demand + pending_demand` events, fetching (from state) as needed, 131 | and storing in state any excess. If not enough events are available, record unsatisfied 132 | demand in state, and then return those events when answering a subsequent call. See the 133 | `QueueBroadcaster` example in `GenStage` for an explanation of this demand queueing 134 | behavior. 135 | """ 136 | def handle_demand(incoming_demand, %__MODULE__{demand: pending_demand} = state) do 137 | dispatch_events(%{state | demand: incoming_demand + pending_demand}, []) 138 | end 139 | 140 | ## Internal functions 141 | 142 | defp monitor_sender(%__MODULE__{disposition: disposition} = state, {pid, _}) do 143 | case Map.has_key?(disposition, pid) do 144 | true -> 145 | state 146 | false -> 147 | mref = Process.monitor(pid) 148 | %{state | disposition: Map.put(disposition, pid, {mref, []})} 149 | end 150 | end 151 | 152 | defp enqueue_event(%__MODULE__{queue: queue} = state, event, from) do 153 | %{state | queue: :queue.in({from, event}, queue)} 154 | end 155 | 156 | defp dispatch_events(%__MODULE__{demand: 0} = state, events) do 157 | {:noreply, Enum.reverse(events), state} 158 | end 159 | 160 | defp dispatch_events(%__MODULE__{queue: queue, demand: demand} = state, events) do 161 | case :queue.out(queue) do 162 | {{:value, {from, event}}, queue} -> 163 | {pid, _} = from 164 | %{state | queue: queue, demand: demand - 1} 165 | |> inform_of_disposition(from) 166 | |> dispatch_events([%__MODULE__.Event{stage: self(), worker: pid, event: event} | events]) 167 | 168 | {:empty, queue} -> 169 | {:noreply, Enum.reverse(events), %{state | queue: queue}} 170 | end 171 | end 172 | 173 | # look up the pid and send it the next disposition batch as a reply to its latest 174 | # notification. 175 | defp inform_of_disposition(%__MODULE__{disposition: disposition} = state, 176 | {pid, _ref} = from) do 177 | {value, disposition} = 178 | disposition 179 | |> Map.get_and_update(pid, 180 | fn 181 | (nil) -> 182 | {[], nil} 183 | ({mref, x}) -> 184 | {x, {mref, []}} 185 | end) 186 | :ok = GenStage.reply(from, {:disposition, value}) 187 | %{state | disposition: disposition} 188 | end 189 | end 190 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor.erl: -------------------------------------------------------------------------------- 1 | %% 2 | %% record processor example implementation. 
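%% (an elixir counterpart lives in examples/elixir_processor/lib/elixir_processor.ex.)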
3 | %%
4 | 
5 | -module(erlang_processor).
6 | 
7 | -export([flow_spec/4]).
8 | 
9 | -record(state, {batch_size,
10 |                 next_flush_time = os:timestamp(),
11 |                 flush_interval = 10000,
12 |                 pending_items = []}).
13 | 
14 | -record(flow_token, {stage, worker, sequence_number}).
15 | 
16 | -record(item, {value, token}).
17 | 
18 | -include_lib("erlmld/include/erlmld.hrl").
19 | 
20 | 
21 | %% return a flow spec which can be used to set up a processing pipeline; see exmld.ex.
22 | flow_spec(StageNames, BatchSize, FlushInterval, FlowOptions) ->
23 |     #{stages => StageNames,
24 |       extract_items_fn => fun flow_extract/1,
25 |       partition_key => {elem, 0}, % elixir uses 0-indexing
26 |       state0 => fun () ->
27 |                         #state{flush_interval = FlushInterval,
28 |                                batch_size = BatchSize}
29 |                 end,
30 |       process_fn => fun flow_process_event/2,
31 |       flow_opts => FlowOptions}.
32 | 
33 | 
34 | %% flow_extract/1 is called to extract sub-items from a kinesis or dynamo stream record.
35 | %% this allows handling of both KPL-aggregated records and custom aggregation schemes.
36 | %% the output of this function should be a list of 2-tuples ({key, value}) to be passed to
37 | %% flow_process_event/2 for processing in a reducer.
38 | %%
39 | %% items seen by the extract function generally look like this:
40 | %%
41 | %%   #{'__struct__' => 'Elixir.Exmld.KinesisStage.Event',
42 | %%     event =>
43 | %%         #{'__struct__' => 'Elixir.Exmld.KinesisWorker.Datum',
44 | %%           opaque => {<<"us-west-2">>, <<"erlang-processor-kinesis-test">>},
45 | %%           shard_id => <<"shardId-000000000001">>,
46 | %%           stream_record =>
47 | %%               #stream_record{partition_key = <<"12345">>,
48 | %%                              timestamp = 946684800,
49 | %%                              sequence_number =
50 | %%                                  #sequence_number{base = 12345, sub = 0},
51 | %%                              data = << .. record data .. >>}},
52 | %%     stage => <0.136.0>,
53 | %%     worker => <0.862.0>}
54 | %%
55 | flow_extract(#{'__struct__' := 'Elixir.Exmld.KinesisStage.Event',
56 |                event := #{'__struct__' := 'Elixir.Exmld.KinesisWorker.Datum',
57 |                           stream_record := {heartbeat, X}}}) ->
58 |     %% handle a heartbeat. the second element of the tuple will vary so heartbeats get
59 |     %% distributed among reducers, so the elements must be swapped since we're using the
60 |     %% first element as a partition key.
61 |     [{X, heartbeat}];
62 | flow_extract(#{'__struct__' := 'Elixir.Exmld.KinesisStage.Event',
63 |                stage := Stage,
64 |                worker := Worker,
65 |                event := Item}) ->
66 |     %% in a real application, sub-records could be extracted from Event here. if using a
67 |     %% custom non-KPL aggregation scheme, this should associate each sub-record with a
68 |     %% faked sequence number having the same base as the parent record, and appropriate
69 |     %% 'user_sub' (sub-record index) and 'user_total' (total number of extracted
70 |     %% sub-records) fields. then when later notifying exmld of record disposition, it can
71 |     %% properly track sub-record processing and advance the checkpoint beyond the parent
72 |     %% record if all of its sub-records were processed.
73 |     %%
74 |     %% two records having the same key (first tuple element) here will be handled by the
75 |     %% same reducer. in general, the key should be consistently derived from some
76 |     %% attribute of the record/item being processed.
77 |     #{'__struct__' := 'Elixir.Exmld.KinesisWorker.Datum',
78 |       stream_record := #stream_record{sequence_number = SN}} = Item,
79 |     [{erlang:phash2(Item), #item{value = Item,
80 |                                  token = #flow_token{stage = Stage,
81 |                                                      worker = Worker,
82 |                                                      sequence_number = SN#sequence_number{user_sub = 0,
83 |                                                                                           user_total = 1}}}}].
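
%% for a non-heartbeat record, the extracted result is a single-element list
%% shaped like this (illustrative):
%%
%%   [{Key, #item{value = Datum,
%%                token = #flow_token{stage = Stage,
%%                                    worker = Worker,
%%                                    sequence_number = SN}}}]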
84 | 85 | 86 | %% process an item extracted from a record (or a heartbeat). this occurs in a reducer 87 | %% whose initial state is given by 'state0' in flow_spec/4 above. it returns an updated 88 | %% state after processing the event (and possibly flushing/updating the state 89 | %% accordingly). here, we simply add the item to the current batch and possibly flush the 90 | %% batch. 91 | flow_process_event({_Key, Item}, #state{} = State) -> 92 | maybe_flush(flow_add_record(Item, State)). 93 | 94 | 95 | flow_add_record(heartbeat, State) -> 96 | State; 97 | flow_add_record(Item, #state{pending_items = Pending} = State) -> 98 | State#state{pending_items = [Item | Pending]}. 99 | 100 | 101 | %% possibly process the current pending batch of records if of the appropriate size or 102 | %% enough time has elapsed: 103 | maybe_flush(State) -> 104 | case should_flush(State) of 105 | true -> 106 | {ok, NState, Tokens} = flush(State), 107 | ok = notify_dispositions(Tokens, ok), 108 | note_flush(NState); 109 | false -> 110 | State 111 | end. 112 | 113 | 114 | should_flush(#state{pending_items = Pending, 115 | batch_size = BatchSize, 116 | next_flush_time = NextFlush}) -> 117 | length(Pending) >= BatchSize 118 | orelse elapsed_ms(NextFlush) >= 0. 119 | 120 | 121 | note_flush(#state{flush_interval = FlushInterval} = State) -> 122 | {Mega, Sec, Micros} = os:timestamp(), 123 | NextFlush = {Mega, Sec, Micros + trunc(FlushInterval * 1.0e3)}, 124 | State#state{next_flush_time = NextFlush}. 125 | 126 | 127 | elapsed_ms(When) -> 128 | trunc(timer:now_diff(os:timestamp(), When)/1.0e3). 129 | 130 | 131 | %% process a batch of items which have been collected, returning {ok, NState, Tokens}. 132 | %% 133 | %% Tokens is a list of tokens used by notify_dispositions/2 to inform upstream workers of 134 | %% the status of processing. this is needed because a single reducer will potentially 135 | %% receive records from multiple different kinesis shards. with this disposition scheme, 136 | %% a kinesis worker can correctly checkpoint based on how far along downstream processing 137 | %% has come (instead of for example automatically checkpointing based on time, which could 138 | %% lose records). 139 | flush(#state{pending_items = Pending} = State) -> 140 | io:format("~p processing items: ~p~n", [self(), Pending]), 141 | timer:sleep(100 * length(Pending)), 142 | Tokens = [Item#item.token || Item <- Pending], 143 | {ok, State#state{pending_items = []}, Tokens}. 144 | 145 | 146 | %% group item processing disposition by origin stage and worker, informing each stage of 147 | %% the records (sequence numbers) from its workers which have been processed. this allows 148 | %% upstream kinesis workers to safely checkpoint only fully processed data. 149 | notify_dispositions(Tokens, Status) -> 150 | RecipientMap = 151 | lists:foldl( 152 | fun (#flow_token{stage = Stage, 153 | worker = Worker, 154 | sequence_number = SN}, Acc) -> 155 | This = disposition(SN, Status), 156 | maps:update_with( 157 | Stage, 158 | fun (WAcc) -> 159 | maps:update_with( 160 | Worker, 161 | fun (DAcc) -> 162 | [This | DAcc] 163 | end, 164 | [This], 165 | WAcc) 166 | end, 167 | #{Worker => [This]}, 168 | Acc) 169 | end, #{}, Tokens), 170 | maps:fold(fun (Stage, WorkerMap, ok) -> 171 | 'Elixir.Exmld.KinesisStage':disposition(Stage, WorkerMap) 172 | end, ok, RecipientMap). 173 | 174 | 175 | disposition(SN, Status) -> 176 | #{'__struct__' => 'Elixir.Exmld.KinesisWorker.Disposition', 177 | sequence_number => SN, 178 | status => Status}. 
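
%% e.g. disposition(SN, ok) builds the erlang-side equivalent of the elixir
%% struct %Exmld.KinesisWorker.Disposition{sequence_number: SN, status: :ok}.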
179 | -------------------------------------------------------------------------------- /examples/elixir_processor/lib/elixir_processor/application.ex: -------------------------------------------------------------------------------- 1 | defmodule ElixirProcessor.Application do 2 | @moduledoc false 3 | use Application 4 | require Logger 5 | 6 | defmodule Producer do 7 | def child_spec({config, n}) do 8 | %{ 9 | id: {:mld_producer, n}, 10 | type: :supervisor, 11 | shutdown: :infinity, 12 | start: {:erlmld_sup, :start_link, [config]} 13 | } 14 | end 15 | end 16 | 17 | def start(_type, _args) do 18 | # emit kcl spam to console. in a real application, this could be configured using a 19 | # lager-compatible module for logging to a file. 20 | Application.put_env(:erlmld, :log_kcl_spam, true) 21 | 22 | %{stage_names: stage_names, producer_configs: producer_configs, flow_spec: flow_spec} = 23 | prepare_config() 24 | 25 | flow_worker = {Exmld, flow_spec} 26 | 27 | stages = 28 | stage_names 29 | |> Enum.map(&%{id: &1, start: {Exmld.KinesisStage, :start_link, [[name: &1]]}}) 30 | 31 | producers = 32 | producer_configs 33 | |> Enum.with_index() 34 | |> Enum.map(&{Producer, &1}) 35 | 36 | children = stages ++ [flow_worker | producers] 37 | 38 | opts = [strategy: :one_for_all, intensity: 2, period: 10, name: ElixirProcessor.Supervisor] 39 | Supervisor.start_link(children, opts) 40 | end 41 | 42 | defp prepare_config do 43 | # note: in a real application many of these values would be in a common app env 44 | # configuration, but are shown here for illustration. A key :asdf_xyz appearing in a 45 | # producer config map will be used to substitute the variable "${ASDF_XYZ}" appearing 46 | # in erlmld/priv/mld.properties.in. 47 | erlmld_config = 48 | Application.get_all_env(:erlmld) 49 | |> Enum.into(%{}) 50 | 51 | # id of this worker instance; it should be unique per beam node. if not supplied, it 52 | # will be auto-generated by the KCL. two different nodes using the same worker id 53 | # will clobber each other's state. 54 | worker_id = "example worker" 55 | 56 | # name and region of the kinesis stream being processed. you could create this stream 57 | # with the following command: 58 | # 59 | # aws kinesis create-stream --region us-west-2 --shard-count 2 \ 60 | # --stream-name erlang-processor-test-stream 61 | # 62 | stream_name = "erlang-processor-test-stream" 63 | stream_region = "us-west-2" 64 | 65 | # ARN and region of the dynamo stream being processed. `erlmld` does not yet support 66 | # obtaining ARNs from table names. you can obtain the ARN of an existing table 67 | # stream with the following command: 68 | # 69 | # aws dynamodbstreams list-streams --region us-west-2 \ 70 | # --table-name erlang-processor-test-table \ 71 | # --query 'Streams[0].StreamArn' --output text 72 | # 73 | table_stream_arn = "arn:aws:dynamodb:REGION:ACCOUNT-ID:table/TABLE-NAME/stream/TIMESTAMP" 74 | table_region = stream_region 75 | 76 | # in this example application, all source streams can be processed the same way, so we 77 | # set up a single flow and set of stages. if data from different streams should be 78 | # handled differently, separate flows should be used. 79 | # 80 | # these are the registered names of the GenStages which will receive kinesis records 81 | # from each owned shard (round-robin). the actual stages will be owned by a 82 | # supervisor, but the names are needed now due to how the flusher module and flow are 83 | # configured. 
they could also be pids, but using names allows them to be restarted 84 | # without restarting everything else (and be started later): 85 | stage_names = 86 | 1..System.schedulers_online() 87 | |> Enum.map(&:erlang.binary_to_atom("stage_#{&1}", :utf8)) 88 | 89 | # increase if processing work is IO-bound 90 | concurrency_factor = 1 91 | num_reducers = System.schedulers_online() * concurrency_factor 92 | 93 | # size of each batch to be "flushed" (i.e., collect this many items before processing 94 | # them all in a batch): 95 | batch_size = 10 96 | 97 | # attempt to flush batches every 10s even if the batch size hasn't been reached 98 | # (relies on the heartbeat mechanism): 99 | flush_interval = 10_000 100 | 101 | # checkpoint every 60s: 102 | checkpoint_interval = 60_000 103 | 104 | # fail if a worker stalls for 600s: 105 | watchdog_timeout = 600_000 106 | 107 | # max number of in-flight items for each kinesis shard worker: 108 | max_pending = 1024 109 | 110 | # flow demand parameters; see flow documentation: 111 | min_demand = 1 112 | max_demand = 1024 113 | 114 | flow_options = [num_stages: num_reducers, min_demand: min_demand, max_demand: max_demand] 115 | 116 | flow_spec = 117 | ElixirProcessor.flow_spec(stage_names, flow_options, 118 | batch_size: batch_size, 119 | flush_interval: flush_interval 120 | ) 121 | 122 | # retrieve this many records with each api call (max: 10000 for kinesis, 1000 123 | # for dynamo): 124 | max_records = 1000 125 | 126 | common_config = 127 | Map.merge( 128 | erlmld_config, 129 | %{ 130 | record_processor: :erlmld_batch_processor, 131 | record_processor_data: %{ 132 | flusher_mod: Exmld.KinesisWorker, 133 | flusher_mod_data: [ 134 | stages: stage_names, 135 | opaque: :some_opaque_value, 136 | max_pending: max_pending 137 | ], 138 | flush_interval_ms: flush_interval, 139 | checkpoint_interval_ms: checkpoint_interval, 140 | watchdog_timeout_ms: watchdog_timeout, 141 | on_checkpoint: &on_checkpoint/2, 142 | description: "description goes here" 143 | }, 144 | worker_id: worker_id, 145 | 146 | # initial starting position if no shard checkpoint exists; LATEST is 147 | # most recent, TRIM_HORIZON is earliest available: 148 | initial_position: "TRIM_HORIZON", 149 | max_records: max_records, 150 | 151 | # reduce cloudwatch metric spam: 152 | metrics_level: "NONE" 153 | } 154 | ) 155 | 156 | # a kinesis stream processor: 157 | kinesis_producer = %{ 158 | # required if processing multiple source streams within a single beam node (any 159 | # atom, used as a registered name suffix and local filename component): 160 | app_suffix: :k, 161 | 162 | # this name will be used to name the dynamodb state table used by the KCL. if it 163 | # doesn't exist, it will be created. the table is used for coordinating leases held 164 | # and checkpoints made by workers cooperating as part of an application. if two 165 | # erlang nodes are running using the same value for this name, they are treated 166 | # as two workers in a single processing application. a single beam node processing 167 | # multiple different streams needs a unique value for each stream. 168 | # 169 | # this is the same name as the corresponding erlang_processor example application; 170 | # if both the elixir and erlang versions are run at the same time, both will 171 | # cooperate in processing the stream.
172 | kcl_appname: "erlang-processor-kinesis-test", 173 | stream_name: stream_name, 174 | stream_region: stream_region, 175 | 176 | # the stream type; 'kinesis' for kinesis streams, 'dynamo' for dynamodb streams: 177 | stream_type: :kinesis 178 | } 179 | 180 | # a dynamo stream processor: 181 | dynamo_producer = %{ 182 | app_suffix: :d, 183 | kcl_appname: "erlang-processor-dynamo-test", 184 | stream_name: table_stream_arn, 185 | stream_region: table_region, 186 | stream_type: :dynamo 187 | } 188 | 189 | producer_configs = 190 | [kinesis_producer, dynamo_producer] 191 | |> Enum.map(&Map.merge(common_config, &1)) 192 | 193 | %{ 194 | stage_names: stage_names, 195 | producer_configs: producer_configs, 196 | batch_size: batch_size, 197 | flush_interval: flush_interval, 198 | flow_spec: flow_spec 199 | } 200 | end 201 | 202 | defp on_checkpoint(opaque, shard_id) do 203 | Logger.info("checkpointed", opaque: opaque, shard_id: shard_id) 204 | end 205 | end 206 | -------------------------------------------------------------------------------- /lib/exmld.ex: -------------------------------------------------------------------------------- 1 | defmodule Exmld do 2 | @moduledoc ~S""" 3 | This allows items extracted from Kinesis stream records (or sub-records in a [KPL 4 | aggregate record](https://github.com/AdRoll/erlmld/blob/HEAD/proto/kpl_agg.proto)) to 5 | be processed by a pipeline of workers which may differ in number from the number of 6 | shards owned by the current node (which is the normal processing model offered by 7 | [erlmld](https://github.com/AdRoll/erlmld)). 8 | 9 | This is beneficial when using aggregate records which can be processed in approximate 10 | order according to their partition keys as opposed to strict ordering based on the 11 | shards they arrived on. For example, suppose the following two Kinesis records are 12 | received on two different shards: 13 | 14 | Record 1 (a KPL aggregate record) 15 | - partition key: "xyzzy" 16 | - subrecord a: 17 | - partition key: "asdf" 18 | - value: "12345" 19 | - subrecord b: 20 | - partition key: "fdsa" 21 | - value: "54321" 22 | 23 | Record 2 (a KPL aggregate record) 24 | - partition key: "qwer" 25 | - subrecord a: 26 | - partition key: "asdf" 27 | - value: "23456" 28 | - subrecord b: 29 | - partition key: "z" 30 | - value: "0" 31 | 32 | 33 | Using the normal Kinesis processing paradigm, each shard will be processed in order. 34 | `erlmld` supports this by spawning a process for each owned shard, which handles each 35 | record seen on the shard in sequence: 36 | 37 | Worker 1: 38 | 1. handle record "xyzzy" 39 | a. handle sub-record "asdf" 40 | b. handle sub-record "fdsa" 41 | 42 | Worker 2: 43 | 1. handle record "qwer" 44 | a. handle sub-record "asdf" 45 | b. handle sub-record "z" 46 | 47 | 48 | This can fail to make use of all available resources since the maximum concurrency is 49 | limited by the number of owned shards. If the application can tolerate the handling of 50 | sub-records in a non-strict order, it can use a `Flow`-based MapReduce-style scheme: 51 | 52 | [Worker 1] [Worker 2] (processes which produce Kinesis records) 53 | | | 54 | v v 55 | [Exmld.KinesisStage, ...] (stages receiving Exmld.KinesisWorker.Datums) 56 | | 57 | v 58 | [M1] .... [Mn] (mappers which extract items) 59 | |\ /| 60 | | \ / | 61 | | \ / | 62 | | \ / | 63 | | \ | 64 | | / \ | 65 | | / \ | 66 | | / \ | 67 | |/ \| 68 | [R1] .... 
[Rn] (reducers which handle extracted items) 69 | 70 | The number of reducers is configurable and defaults to the number of schedulers online. 71 | The processing application will specify a means of extracting a partition key from each 72 | extracted item; these will be used to consistently map items to reducers (which is where 73 | the actual application work occurs). 74 | 75 | Using the above example and specifying a sub-record's partition key as an item key: 76 | 77 | 1. Worker 1 will produce the "asdf" and "fdsa" sub-records from outer record "xyzzy" 78 | and send them to a pre-configured `Exmld.KinesisStage` (or round-robin to a list of 79 | such stages). 80 | 81 | 2. Worker 2 will similarly produce the "asdf" and "z" sub-records from outer record 82 | "qwer". 83 | 84 | 3. Each receiving stage will wrap and forward these sub-records for handling by the 85 | flow. 86 | 87 | 4. The application will have provided an "identity" item extraction function since KPL 88 | aggregation is being used here (or otherwise a function accepting one record and 89 | returning a list containing a single item). 90 | 91 | 5. The application will have provided a partition key extraction function which 92 | returns an appropriate partition key to be used in consistently mapping items to 93 | reducers. 94 | 95 | 6. The first received "asdf" sub-record is provided to some reducer `Rx`. The second 96 | received "asdf" sub-record is provided to the same reducer since its extracted key has 97 | the same hash. 98 | 99 | 7. The "fdsa" and "z" sub-records are similarly provided to some reducer `Ry` and/or 100 | `Rz` based on the hash of their partition keys. 101 | 102 | 8. The application-provided reducer function notifies each originating stage of the 103 | disposition of processing for items received from it as processing progresses. 104 | 105 | 9. Eventually, processing disposition is provided back to the originating workers, 106 | which can decide whether or not (and where) to checkpoint. 107 | 108 | """ 109 | 110 | require Record 111 | Record.defrecord(:sequence_number, Record.extract(:sequence_number, 112 | from_lib: "erlmld/include/erlmld.hrl")) 113 | Record.defrecord(:checkpoint, Record.extract(:checkpoint, 114 | from_lib: "erlmld/include/erlmld.hrl")) 115 | Record.defrecord(:stream_record, Record.extract(:stream_record, 116 | from_lib: "erlmld/include/erlmld.hrl")) 117 | 118 | @type sequence_number :: record(:sequence_number) 119 | @type checkpoint :: record(:checkpoint) 120 | @type stream_record :: record(:stream_record) 121 | @type shard_id :: binary 122 | 123 | @type item :: any 124 | @type partition_key :: any 125 | @type reducer_state :: any 126 | 127 | @doc """ 128 | Accepts a flow producing `Exmld.KinesisWorker.Datum`s (e.g., a flow created from 129 | `Exmld.KinesisStage`s) and returns another flow. 130 | """ 131 | # each stream record should be associated with the genstage which received it and the 132 | # worker which produced it. each item extracted from a stream record should indicate 133 | # the record it came from, the item id within the record, and the total number of items 134 | # in the record. the extraction and processing functions should correctly handle 135 | # heartbeats. the processing function should process as much data as possible, and 136 | # periodically inform the source genstages of all the item ids which have been 137 | # (successfully or not) processed.
those genstages in turn will maintain information 138 | # about what has been successfully processed, which the producing kinesis workers can 139 | # use when checkpointing. 140 | @spec flow(# a flow which produces `Datum`s: 141 | flow :: Flow.t, 142 | # arity-1 function mapping a datum to list of zero or more items: 143 | extract_items_fn :: ((Exmld.KinesisWorker.Datum) -> [item]), 144 | # arity-1 function or flow partition key shortcut for partitioning items: 145 | partition_key :: {:elem, non_neg_integer} 146 | | {:key, atom} 147 | | ((item) -> partition_key), 148 | # arity-0 function returning initial reducer state: 149 | state0 :: (() -> reducer_state), 150 | # arity-2 function accepting item being processed and reducer state: 151 | process_fn :: ((item, reducer_state) -> reducer_state), 152 | opts :: keyword) :: Flow.t 153 | def flow(flow, 154 | extract_items_fn, 155 | partition_key, 156 | state0, 157 | process_fn, 158 | opts \\ []) do 159 | extra = opts[:append] || &(&1) 160 | flow 161 | |> Flow.flat_map(extract_items_fn) 162 | |> Flow.partition(key: partition_key, 163 | stages: opts[:num_stages] || System.schedulers_online(), 164 | min_demand: opts[:min_demand] || 1, 165 | max_demand: opts[:max_demand] || 500, 166 | window: opts[:window] || Flow.Window.global()) 167 | |> Flow.reduce(state0, process_fn) 168 | |> extra.() 169 | end 170 | 171 | @doc """ 172 | Builds a flow from the `Exmld.KinesisStage`s named in `opts.stages` and runs it through `flow/6` using the other values in `opts`; `start_link/1` uses this to start the resulting flow under a supervisor. 173 | """ 174 | def from_stages(opts) do 175 | Flow.from_stages(opts.stages) 176 | |> flow(opts.extract_items_fn, opts.partition_key, 177 | opts.state0, opts.process_fn, opts.flow_opts) 178 | end 179 | 180 | def start_link(opts) do 181 | from_stages(opts) |> Flow.start_link() 182 | end 183 | 184 | def child_spec(opts) do 185 | %{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}} 186 | end 187 | end 188 | -------------------------------------------------------------------------------- /lib/exmld/kinesis_worker.ex: -------------------------------------------------------------------------------- 1 | defmodule Exmld.KinesisWorker do 2 | require Logger 3 | require Exmld 4 | @behaviour :erlmld_flusher 5 | 6 | defstruct [{:stages, []}, 7 | {:shard_id, nil}, 8 | {:opaque, nil}, 9 | {:counter, 0}, 10 | {:heartbeats, 0}, 11 | {:errors, 0}, 12 | {:error_callback, nil}, 13 | {:skip_errors, false}, 14 | {:done, []}, 15 | {:max_pending, 1000}, 16 | {:await_sleep_interval, 1000}, 17 | {:pending, %{}}, 18 | {:on_duplicate, :exit}] 19 | 20 | @type flusher_token :: any 21 | @type t :: %__MODULE__{# list of identifiers which can be `GenStage.call/3`ed: 22 | stages: [any], 23 | shard_id: Exmld.shard_id, 24 | opaque: any, 25 | counter: non_neg_integer, 26 | heartbeats: non_neg_integer, 27 | errors: non_neg_integer, 28 | error_callback: ((t, [Exmld.KinesisWorker.Disposition.t]) -> any) 29 | | nil, 30 | skip_errors: boolean, 31 | done: [flusher_token], 32 | max_pending: pos_integer, 33 | await_sleep_interval: non_neg_integer, 34 | # map from a {shard id, sequence number} pair to a token or a tuple of a 35 | # token and a list of {sub, total} values known so far. 36 | # 37 | # if we expect only a single disposition for a record, it was a 38 | # kpl sub-record received from upstream and we store a value 39 | # consisting of the associated token here. 40 | # 41 | # otherwise, we store {token, []} and await all outstanding 42 | # dispositions for items extracted from the originating record.
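#
# for illustration (hypothetical shard ids, sequence numbers and tokens): a kpl
# sub-record maps directly to its stored token, while a record whose items are
# extracted downstream accumulates {user_sub, user_total} pairs until all are seen:
#
#   %{{"shardId-000000000000", sn_a} => {:t, 17},
#     {"shardId-000000000001", sn_b} => {token_b, [{0, 3}, {2, 3}]}}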
43 | pending: %{optional({Exmld.shard_id, Exmld.sequence_number}) => 44 | flusher_token | {flusher_token, [{non_neg_integer, 45 | non_neg_integer}]}}, 46 | # what to do when we encounter a duplicate sequence number: 47 | on_duplicate: :exit | :skip} 48 | 49 | @moduledoc """ 50 | An [erlmld_flusher](https://github.com/AdRoll/erlmld/blob/HEAD/src/erlmld_flusher.erl) 51 | which can interface with an `Exmld.KinesisStage` data source. 52 | 53 | This implements an `erlmld_flusher` which can be used by `erlmld_batch_processor`. 54 | Unlike a typical `erlmld_flusher`, it has a different notion of fullness: if 55 | `:max_pending` or more items are in flight, the worker waits for all pending items before 56 | emitting any more for downstream processing. A periodic flush interval should be 57 | configured in the batch processor options. Similarly, the downstream stage processing 58 | pipeline should not require any kind of "full" condition and should periodically make 59 | progress (i.e., emit/flush output) even if no more records are sent. 60 | 61 | Heartbeat items are sent while the worker is waiting for pending items to be completed; 62 | these include varying counters to allow them to be automatically distributed among 63 | downstream reducers. 64 | 65 | One worker process will exist for each stream shard owned by the current node. Each 66 | such process will have been configured with a set of downstream `Exmld.KinesisStage`s 67 | which can receive records from it (actually `Exmld.KinesisWorker.Datum`s); those stages 68 | will be part of a data processing `Flow.t`. Eventually, the disposition of each 69 | record's processing will propagate back to the originating worker (as return values from 70 | `GenStage.call/3`). 71 | 72 | Periodically, `erlmld_batch_processor` will request a flush. If the flush kind is 73 | `:partial`, we return the tokens associated with the records which have already been 74 | fully processed. Otherwise, the flush kind is `:full` and we await the disposition of 75 | every outstanding record before returning. 76 | 77 | If processing of any record (or item extracted therefrom) fails, the worker will crash 78 | unless it's configured to ignore processing errors. 79 | 80 | Records presented to this worker may be ordinary records or sub-records extracted from a 81 | containing KPL-aggregated record. If KPL aggregation is not being used, but smaller 82 | sub-items are later extracted by the stage processing pipeline, the pipeline should 83 | create fake sub-record sequence numbers to track the disposition of those items (and 84 | sub-record checkpointing should be turned off). 85 | 86 | Periodically (which should be at some multiple of the periodic flush interval), 87 | `erlmld_batch_processor` will checkpoint based on the records which have so far been 88 | successfully processed (those whose tokens have been returned from `flush/2`). 89 | """ 90 | 91 | defmodule Datum do 92 | @moduledoc """ 93 | Struct for annotating stream records (or heartbeats) with additional data. 94 | 95 | To allow a single downstream processing pipeline to be used with multiple source 96 | streams, we annotate Kinesis stream records with additional data before providing them 97 | to the stage(s) processing them. (Example: `:opaque` could be a 2-tuple naming the 98 | source stream and region or otherwise indicate how to specially process the record).
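For example (hypothetical values), a datum wrapping a record received from a Kinesis
stream in us-west-2 might look like:

    %Exmld.KinesisWorker.Datum{opaque: {:kinesis, "us-west-2"},
                               shard_id: "shardId-000000000000",
                               stream_record: record}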
99 | 100 | ## Fields 101 | 102 | * `:opaque` - the opaque term provided at worker init time 103 | * `:shard_id` - name of the shard the worker is processing 104 | * `:stream_record` - a record from the stream or `{:heartbeat, _}` 105 | """ 106 | defstruct opaque: nil, shard_id: nil, stream_record: nil 107 | @type t :: %Datum{opaque: any, 108 | shard_id: Exmld.shard_id, 109 | stream_record: Exmld.stream_record | {:heartbeat, any}} 110 | end 111 | 112 | defmodule Disposition do 113 | @moduledoc """ 114 | Struct for event processing disposition. 115 | 116 | Tracks whether processing succeeded or failed for a specific record or item extracted 117 | therefrom. 118 | 119 | ## Fields 120 | 121 | * `:sequence_number` - `Exmld.sequence_number()` of the subject record. If the 122 | subject is an item extracted from a containing aggregate record, the `user_sub` and 123 | `user_total` fields should be populated (whether KPL aggregation was used or not). 124 | * `:status` - processing status 125 | """ 126 | defstruct sequence_number: nil, status: nil 127 | @type t :: %Disposition{sequence_number: Exmld.sequence_number, 128 | status: :ok | {:error, term}} 129 | end 130 | 131 | @doc """ 132 | Initialize worker state with a shard id and a set of options. 133 | 134 | An `erlmld_batch_processor` is initializing processing on `shard_id` and providing the 135 | `flusher_mod_data` which was passed to it, which should be a keyword list 136 | containing the following options; we return a flusher state to be used in subsequent 137 | operations. 138 | 139 | ## Options 140 | 141 | All optional unless marked required: 142 | 143 | * `:stages` - (required) list of `GenStage`s (values usable as the first argument to 144 | `GenStage.call/3`) which can receive `Exmld.KinesisWorker.Datum`s 145 | * `:opaque` - opaque term passed in each `Exmld.KinesisWorker.Datum` 146 | * `:skip_errors` - boolean indicating whether errors are non-fatal (if false, crash on 147 | error). 148 | * `:max_pending` - maximum number of pending items which can be in flight. 149 | * `:await_sleep_interval` - sleep time in ms between checks while awaiting pending items. 150 | * `:error_callback` - `nil` or an arity-2 function called with state and failure 151 | dispositions when processing failures occur. 152 | """ 153 | def init(shard_id, opts) do 154 | unless length(opts[:stages] || []) > 0 do 155 | exit(:no_stages_configured) 156 | end 157 | Logger.metadata(shard_id: shard_id, opaque: opts[:opaque]) 158 | struct(%__MODULE__{error_callback: &(log_errors(&1, &2)), 159 | shard_id: shard_id}, Map.new(opts)) 160 | end 161 | 162 | @doc """ 163 | Submit a new Kinesis record to the downstream pipeline for processing. 164 | 165 | A new Kinesis record is available for processing, and `erlmld_batch_processor` is 166 | instructing us to add it to the current batch. Since we really have no notion of a 167 | batch, we immediately choose a downstream stage and notify it of a new 168 | `Exmld.KinesisWorker.Datum` containing the record and make a note of it being in-flight. 169 | That call will block until a further-downstream consumer receives the record as a flow 170 | event. 171 | 172 | The result of that call will be an updated list of item dispositions. Unless configured 173 | to skip records which failed to be processed, we crash if any failed. Otherwise we 174 | update the set of done/pending items and return an updated state.
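For illustration only, `erlmld_batch_processor` invokes this callback (after `init/2`)
roughly as follows; the shard id, stage names and options here are hypothetical:

    state = Exmld.KinesisWorker.init("shardId-000000000000",
                                     stages: [:stage_1, :stage_2],
                                     max_pending: 1024)
    {:ok, state} = Exmld.KinesisWorker.add_record(state, record, token)
    {:ok, state, done_tokens} = Exmld.KinesisWorker.flush(state, :partial)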
175 | """ 176 | def add_record(%__MODULE__{max_pending: max_pending, 177 | pending: pending} = state, record, token) 178 | when map_size(pending) >= max_pending do 179 | Logger.info("#{state.shard_id} has too many pending items, awaiting...") 180 | add_record(await_pending(state), record, token) 181 | end 182 | def add_record(state, record, token) do 183 | state = 184 | state 185 | |> incr(:counter, 1) 186 | |> note_pending(record, token) 187 | |> notify_downstream(record) 188 | |> update_pending() 189 | {:ok, state} 190 | end 191 | 192 | @doc """ 193 | Return a list of tokens corresponding to records which have been fully processed, along 194 | with the latest state. 195 | 196 | If the flush kind is `:full`, we await the disposition of all outstanding records before 197 | returning. Otherwise, it's `:partial` and we return immediately (possibly with an 198 | empty result). 199 | 200 | If doing a full flush and any records fail to be successfully processed, we crash unless 201 | configured to skip failed records. 202 | """ 203 | def flush(state, kind) do 204 | %__MODULE__{done: done} = state = case kind do 205 | :partial -> 206 | state 207 | :full -> 208 | await_pending(state) 209 | end 210 | {:ok, %{state | done: []}, done} 211 | end 212 | 213 | @doc """ 214 | The batch processor has received a possibly-empty set of records from the 215 | MultiLangDaemon and is informing the flusher (us). Send a heartbeat downstream and 216 | return any completed tokens. This allows progress to be made even if no more records 217 | appear on the stream. 218 | """ 219 | def heartbeat(state) do 220 | %__MODULE__{done: done} = state = do_heartbeat(state) 221 | {:ok, %{state | done: []}, done} 222 | end 223 | 224 | # a record and token have been provided. if the record is a kpl sub-record, it has 225 | # base, user_sub, and user_total fields populated, and we expect a single disposition for it. this 226 | # is also the case for erlmld's custom kpl-like aggregated records (erlmld will make sure to 227 | # extract the sub-records and populate user_sub and user_total). 228 | # otherwise, it's a normal record (non-kpl) which will later have items extracted from it (we 229 | # don't know how many), and we'll expect multiple dispositions for it (each containing a 230 | # faked sequence number with base, user_sub, and user_total populated); once we receive 231 | # all of those, it's done. 232 | defp note_pending(%__MODULE__{pending: pending, shard_id: shard_id} = state, record, token) do 233 | sn = Exmld.stream_record(record, :sequence_number) 234 | note_pending(state, token, sn, Map.has_key?(pending, {shard_id, sn})) 235 | end 236 | defp note_pending(%__MODULE__{on_duplicate: :exit, shard_id: shard_id}, _token, sn, true) do 237 | # we received the same sequence number for two records; this should not happen.
238 | Logger.warning("duplicate sequence number #{inspect(sn)} on shard #{shard_id}; exiting...") 239 | exit({:duplicate_seqno, sn}) 240 | end 241 | defp note_pending(%__MODULE__{on_duplicate: :skip, shard_id: shard_id} = state, _token, sn, true) do 242 | Logger.warning("duplicate sequence number #{inspect(sn)} on shard #{shard_id}; skipping...") 243 | state 244 | end 245 | defp note_pending(%__MODULE__{pending: pending, shard_id: shard_id} = state, token, sn, false) do 246 | expect_multiple = :undefined == Exmld.sequence_number(sn, :user_sub) 247 | stored_token = maybe_standard_token(token, sn) 248 | %{state | pending: Map.put(pending, {shard_id, sn}, case expect_multiple do 249 | true -> 250 | {stored_token, []} 251 | false -> 252 | stored_token 253 | end)} 254 | end 255 | 256 | # take advantage of the normal (but opaque and currently unsupported) {N, SN} 257 | # representation of tokens to avoid storing the sequence number twice: 258 | defp maybe_standard_token({n, sn}, sn_) when sn == sn_ do 259 | {:t, n} 260 | end 261 | defp maybe_standard_token(t, _) do 262 | t 263 | end 264 | 265 | defp maybe_wrap_token({:t, n}, sn) do 266 | {n, sn} 267 | end 268 | defp maybe_wrap_token(t, _) do 269 | t 270 | end 271 | 272 | # a list of finished sequence numbers has been provided. either: 273 | # 274 | # 1. we received a kpl sub-record from upstream and it was passed to a reducer. we are 275 | # now receiving a sequence number with base, user_sub, and user_total fields populated. that 276 | # sub-record would have been associated with one flusher token, which is now done. 277 | # 278 | # or: 279 | # 280 | # 2. we received a normal record from upstream and sub-records were later extracted by 281 | # the application. the application should have assigned sequence numbers with user_sub 282 | # and user_total fields populated when informing us of disposition. once all such items 283 | # are done, we can consider the token associated with the original parent record as 284 | # done. 285 | defp update_pending({state, completed_sequence_numbers}) do 286 | completed_sequence_numbers 287 | |> Enum.reduce(state, &update_pending_1/2) 288 | end 289 | 290 | defp update_pending_1(sn, %__MODULE__{pending: pending, done: done, shard_id: shard_id} = state) do 291 | case Map.pop(pending, {shard_id, sn}) do 292 | {nil, pending} -> 293 | # the sequence number doesn't exist in pending. this will happen if the sequence 294 | # number has user_sub and user_total fields populated and a non-aggregate record was 295 | # received from upstream. that non-aggregate record's sequence number (lacking 296 | # user_sub/user_total fields) was used as the key, and the value will be {token, [..]}. 297 | sub = Exmld.sequence_number(sn, :user_sub) 298 | total = Exmld.sequence_number(sn, :user_total) 299 | if :undefined == sub do 300 | exit({:missing_pending, sn}) 301 | end 302 | key = {shard_id, Exmld.sequence_number(sn, user_sub: :undefined, user_total: :undefined)} 303 | {{token, seen}, pending} = Map.pop(pending, key) 304 | seen = [{sub, total} | seen] 305 | # if all expected items have been received, move the token to done.
otherwise, 306 | # continue building the seen list. 307 | case all_done(seen) do 308 | true -> 309 | %{state | pending: pending, done: [maybe_wrap_token(token, sn) | done]} 310 | false -> 311 | %{state | pending: Map.put(pending, key, {token, seen})} 312 | end 313 | {token, pending} -> 314 | %{state | pending: pending, done: [maybe_wrap_token(token, sn) | done]} 315 | end 316 | end 317 | 318 | defp all_done([]) do 319 | false 320 | end 321 | defp all_done([{_sub, total} | _] = values) do 322 | case length(values) do 323 | ^total -> 324 | # every item must have the same total value, and each sub must be unique and cover 325 | # the range 0..user_total-1. 326 | expected = MapSet.new(0..(total-1)) 327 | actual = MapSet.new(Enum.map(values, &(elem(&1, 0)))) 328 | if not MapSet.equal?(expected, actual) do 329 | exit({:unexpected_sub_ids, expected, actual}) 330 | end 331 | true 332 | _ -> 333 | false 334 | end 335 | end 336 | 337 | # while awaiting pending items, we spew heartbeats to all downstream stages so we can 338 | # obtain disposition of prior items. if this happens frequently, the downstream 339 | # pipeline can't keep up with the producer, so its parameters should be tuned. 340 | defp await_pending(%__MODULE__{await_sleep_interval: sleep_interval, 341 | pending: pending} = state) when map_size(pending) > 0 do 342 | Logger.debug("#{state.shard_id} awaiting #{inspect map_size(pending)} items...") 343 | :timer.sleep(sleep_interval) 344 | state 345 | |> do_heartbeat() 346 | |> await_pending() 347 | end 348 | defp await_pending(state) do 349 | state 350 | end 351 | 352 | # send a heartbeat downstream and note any returned item dispositions. 353 | defp do_heartbeat(state) do 354 | state 355 | |> incr(:heartbeats, 1) 356 | |> notify_downstream(:heartbeat) 357 | |> update_pending() 358 | end 359 | 360 | # notify a downstream processing stage of a record or heartbeat and handle any returned 361 | # item dispositions, returning an updated state and a list of processed sequence 362 | # numbers. 363 | # 364 | # if processing of an item has failed and we aren't configured to skip failed records, 365 | # we crash. otherwise we call any configured error callback and skip the failed items. 366 | defp notify_downstream(%__MODULE__{shard_id: shard_id, 367 | opaque: opaque, 368 | stages: stages} = state, :heartbeat) do 369 | {state, dispositions} = 370 | stages 371 | |> Enum.reduce({0, {:disposition, []}}, 372 | fn (stage, {n, x}) -> 373 | datum = %Datum{shard_id: shard_id, 374 | opaque: opaque, 375 | stream_record: {:heartbeat, {state.counter, state.heartbeats, n}}} 376 | {:disposition, y} = Exmld.KinesisStage.notify(stage, datum) 377 | {n + 1, put_elem(x, 1, y ++ elem(x, 1))} 378 | end) 379 | |> elem(1) 380 | |> handle_errors(state) 381 | 382 | {state, Enum.map(dispositions, &(&1.sequence_number))} 383 | end 384 | defp notify_downstream(%__MODULE__{shard_id: shard_id, 385 | opaque: opaque} = state, thing) do 386 | {state, dispositions} = 387 | choose_stage(state) 388 | |> Exmld.KinesisStage.notify(%Datum{shard_id: shard_id, 389 | opaque: opaque, 390 | stream_record: thing}) 391 | |> handle_errors(state) 392 | 393 | # we either had no errors, exited, or called a configured error callback for an error.
394 | # at this point, consider all items as successfully processed and return their 395 | # sequence numbers: 396 | {state, Enum.map(dispositions, &(&1.sequence_number))} 397 | end 398 | 399 | defp choose_stage(%__MODULE__{stages: stages, heartbeats: heartbeats, counter: count}) do 400 | Enum.at(stages, rem(count + heartbeats, length(stages))) 401 | end 402 | 403 | defp handle_errors({:disposition, prior_dispositions}, 404 | %__MODULE__{shard_id: shard_id, 405 | opaque: opaque, 406 | error_callback: cb, 407 | skip_errors: skip_errors} = state) do 408 | failed = prior_dispositions 409 | |> Enum.reject(fn (%Disposition{status: status}) -> 410 | status == :ok 411 | end) 412 | 413 | result = {incr(state, :errors, length(failed)), prior_dispositions} 414 | 415 | case failed do 416 | [] -> 417 | result 418 | _ -> 419 | if cb do 420 | cb.(state, failed) 421 | end 422 | 423 | case skip_errors do 424 | true -> 425 | result 426 | _ -> 427 | exit({:processing_failed, 428 | shard_id: shard_id, opaque: opaque, failures: failed}) 429 | end 430 | end 431 | end 432 | 433 | defp log_errors(_state, failed) do 434 | Logger.error("processing failed: #{inspect failed}") 435 | end 436 | 437 | defp incr(state, field, n) do 438 | Map.put(state, field, n + Map.fetch!(state, field)) 439 | end 440 | end 441 | --------------------------------------------------------------------------------