├── examples ├── erlang_processor │ ├── config │ │ ├── vm.args │ │ └── sys.config │ ├── .gitignore │ ├── Makefile │ ├── src │ │ ├── erlang_processor.app.src │ │ ├── erlang_processor_sup.erl │ │ ├── erlang_processor_app.erl │ │ └── erlang_processor.erl │ ├── rebar.config │ └── README.md └── elixir_processor │ ├── config │ └── config.exs │ ├── .formatter.exs │ ├── Makefile │ ├── mix.exs │ ├── .gitignore │ ├── README.md │ └── lib │ ├── elixir_processor.ex │ └── elixir_processor │ └── application.ex ├── CHANGELOG.md ├── img ├── elixir-mld-pipeline.png └── erlang-mld-workers.png ├── Makefile ├── .gitignore ├── mix.exs ├── config └── config.exs ├── README.md ├── LICENSE ├── mix.lock └── lib ├── exmld ├── kinesis_stage.ex └── kinesis_worker.ex └── exmld.ex /examples/erlang_processor/config/vm.args: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | See the [Releases](../../releases) page. 2 | -------------------------------------------------------------------------------- /examples/erlang_processor/config/sys.config: -------------------------------------------------------------------------------- 1 | % -*- mode: erlang -*- 2 | []. 3 | -------------------------------------------------------------------------------- /img/elixir-mld-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/exmld/HEAD/img/elixir-mld-pipeline.png -------------------------------------------------------------------------------- /img/erlang-mld-workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/exmld/HEAD/img/erlang-mld-workers.png -------------------------------------------------------------------------------- /examples/elixir_processor/config/config.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | 3 | config :logger, :console, metadata: :all 4 | -------------------------------------------------------------------------------- /examples/elixir_processor/.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docs 2 | 3 | all: build 4 | 5 | build: 6 | mix compile 7 | 8 | docs: build 9 | @# run `mix escript.install hex ex_doc` first 10 | ex_doc "exmld" "git" _build/dev/lib/exmld/ebin 11 | -------------------------------------------------------------------------------- /examples/erlang_processor/.gitignore: -------------------------------------------------------------------------------- 1 | .rebar3 2 | _* 3 | .eunit 4 | *.o 5 | *.beam 6 | *.plt 7 | *.swp 8 | *.swo 9 | .erlang.cookie 10 | ebin 11 | log 12 | erl_crash.dump 13 | .rebar 14 | logs 15 | _build 16 | .idea 17 | *.iml 18 | rebar3.crashdump 19 | elixir_libs 20 | -------------------------------------------------------------------------------- /examples/elixir_processor/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: deps 2 | 3 | all: build 4 | 5 | 
deps: 6 | mix do deps.unlock --all, deps.get 7 | 8 | build: deps 9 | mix compile 10 | make jars 11 | 12 | jars: 13 | ./_build/dev/lib/erlmld/priv/download.sh 14 | 15 | clean: 16 | mix clean 17 | -------------------------------------------------------------------------------- /examples/erlang_processor/Makefile: -------------------------------------------------------------------------------- 1 | all: build 2 | 3 | deps: 4 | rebar3 do deps unlock, deps upgrade 5 | 6 | build: deps 7 | rebar3 compile 8 | make jars 9 | 10 | jars: 11 | _build/default/lib/erlmld/priv/download.sh 12 | 13 | release: build 14 | rebar3 tar 15 | 16 | clean: 17 | rebar3 clean 18 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor.app.src: -------------------------------------------------------------------------------- 1 | {application, erlang_processor, 2 | [{description, "An OTP application"}, 3 | {vsn, "0.1.0"}, 4 | {registered, []}, 5 | {mod, { erlang_processor_app, []}}, 6 | {applications, 7 | [kernel, 8 | stdlib, 9 | erlexec, 10 | elixir, 11 | logger 12 | ]}, 13 | {included_applications, 14 | [erlmld, 15 | exmld 16 | ]}, 17 | {env,[]}, 18 | {modules, []}, 19 | 20 | {maintainers, []}, 21 | {licenses, ["BSD 3-Clause"]}, 22 | {links, []} 23 | ]}. 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | -------------------------------------------------------------------------------- /examples/elixir_processor/mix.exs: -------------------------------------------------------------------------------- 1 | defmodule ElixirProcessor.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :elixir_processor, 7 | version: "0.1.0", 8 | elixir: "~> 1.12", 9 | start_permanent: Mix.env() == :prod, 10 | deps: deps() 11 | ] 12 | end 13 | 14 | def application do 15 | [ 16 | extra_applications: [:logger, :erlexec], 17 | included_applications: [:erlmld, :exmld], 18 | mod: {ElixirProcessor.Application, []} 19 | ] 20 | end 21 | 22 | defp deps do 23 | [ 24 | {:exmld, "~> 1.0.2"} 25 | ] 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /examples/elixir_processor/.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 
17 | erl_crash.dump
18 | 
19 | # Also ignore archive artifacts (built via "mix archive.build").
20 | *.ez
21 | 
22 | # Ignore package tarball (built via "mix hex.build").
23 | elixir_processor-*.tar
24 | 
--------------------------------------------------------------------------------
/mix.exs:
--------------------------------------------------------------------------------
1 | defmodule Exmld.Mixfile do
2 |   use Mix.Project
3 | 
4 |   @version "1.0.4"
5 |   @name "exmld"
6 |   @repo "https://github.com/AdRoll/#{@name}"
7 | 
8 |   def project do
9 |     [
10 |       app: :exmld,
11 |       version: @version,
12 |       elixir: "~> 1.12",
13 |       start_permanent: Mix.env() == :prod,
14 |       deps: deps(),
15 |       package: package(),
16 |       docs: [source_ref: "v#{@version}",
17 |              source_url: @repo],
18 |       description: "An Elixir library for processing multiple Kinesis and " <>
19 |         "DynamoDB streams and shards in a single node using the Kinesis " <>
20 |         "Client Library and MultiLangDaemon."
21 |     ]
22 |   end
23 | 
24 |   def application do
25 |     [
26 |       extra_applications: [:logger],
27 |     ]
28 |   end
29 | 
30 |   defp deps do
31 |     [
32 |       {:flow, "~> 1.2"},
33 |       {:erlmld, "~> 1.0.2"},
34 |       {:ex_doc, "~> 0.28", only: :dev, runtime: false}
35 |     ]
36 |   end
37 | 
38 |   defp package do
39 |     %{
40 |       name: @name,
41 |       licenses: ["BSD-3-Clause"],
42 |       maintainers: ["AdRoll RTB team"],
43 |       links: %{"GitHub" => @repo}
44 |     }
45 |   end
46 | end
47 | 
--------------------------------------------------------------------------------
/config/config.exs:
--------------------------------------------------------------------------------
1 | # This file is responsible for configuring your application
2 | # and its dependencies with the aid of the Config module.
3 | import Config
4 | 
5 | # This configuration is loaded before any dependency and is restricted
6 | # to this project. If another project depends on this project, this
7 | # file won't be loaded nor affect the parent project. For this reason,
8 | # if you want to provide default values for your application for
9 | # 3rd-party users, it should be done in your "mix.exs" file.
10 | 
11 | # You can configure your application as:
12 | #
13 | #     config :exmld, key: :value
14 | #
15 | # and access this configuration in your application as:
16 | #
17 | #     Application.get_env(:exmld, :key)
18 | #
19 | # You can also configure a 3rd-party app:
20 | #
21 | #     config :logger, level: :info
22 | #
23 | 
24 | config :logger, :console,
25 |   metadata: :all
26 | 
27 | # It is also possible to import configuration files, relative to this
28 | # directory. For example, you can emulate configuration per environment
29 | # by uncommenting the line below and defining dev.exs, test.exs and such.
30 | # Configuration from the imported file will override the ones defined
31 | # here (which is why it is important to import them last).
32 | #
33 | # import_config "#{Mix.env}.exs"
34 | 
--------------------------------------------------------------------------------
/examples/erlang_processor/rebar.config:
--------------------------------------------------------------------------------
1 | %% -*- mode: erlang -*-
2 | {erl_opts, [debug_info]}.
3 | 
4 | {deps, [
5 |         {exmld,
6 |          {elixir, "exmld", "1.0.2"}},
7 |         {erlmld, "1.0.2"},
8 | 
9 |         %% these are deps of exmld. they need to be here so `rebar3
10 |         %% shell` works properly:
11 |         {flow,
12 |          {elixir, "flow", "1.2"}},
13 |         {gen_stage,
14 |          {elixir, "gen_stage", "1.1.12"}}
15 | ]}.
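%% note: the non-standard `{elixir, Name, Vsn}` dependency entries above are
%% resolved by the rebar3_elixir_compile plugin declared below.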
16 | 17 | {relx, [{release, { erlang_processor, "0.1.0" }, 18 | [ 19 | erlang_processor, 20 | runtime_tools, 21 | sasl, 22 | tools 23 | ]}, 24 | 25 | {sys_config, "config/sys.config"}, 26 | {vm_args, "config/vm.args"}, 27 | 28 | {dev_mode, true}, 29 | {include_erts, false}, 30 | 31 | {extended_start_script, true}]}. 32 | 33 | {plugins, [ 34 | {rebar3_lint, "0.1.10"}, 35 | {rebar3_elixir_compile, 36 | {git, "https://github.com/barrel-db/rebar3_elixir_compile.git", 37 | {ref, "4afc7a887dcf8e9abe3613cafd50e5f8d912e342"}}} 38 | ]}. 39 | 40 | {provider_hooks, [ 41 | {pre, [{compile, {ex, compile}}]}, 42 | {pre, [{release, {ex, compile}}]} 43 | ]}. 44 | 45 | {elixir_opts, [ 46 | {env, prod} 47 | ]}. 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # exmld 2 | 3 | This application allows Kinesis and DynamoDB streams to be processed using Elixir or 4 | Erlang (by way of the KCL MultiLangDaemon). It's particularly useful when aggregate 5 | records are being used and items can be processed in approximate order (as opposed to 6 | strict order within each shard), but that isn't a requirement. 7 | 8 | Using [erlmld](https://github.com/AdRoll/erlmld), a normal Erlang Kinesis processing 9 | application looks like this: 10 | 11 | ![Erlang - MultiLangDaemon processing](img/erlang-mld-workers.png) 12 | 13 | Using this Elixir library (which uses erlmld), a processing application looks like this: 14 | 15 | ![Elixir - MultiLangDaemon processing](img/elixir-mld-pipeline.png) 16 | 17 | This is done using the [Flow](https://hexdocs.pm/flow/Flow.html) framework to set up a 18 | MapReduce-style processing pipeline within a single BEAM node. 19 | 20 | By virtue of using the KCL, processing applications can horizontally scale across a group 21 | of ([homogenous](https://github.com/awslabs/amazon-kinesis-client/issues/103)) worker 22 | instances. 23 | 24 | Unlike most applications using the KCL's MultiLangDaemon, an Erlang or Elixir processing 25 | application using this library can easily make full use of a worker's processing power 26 | (even if the stream contains a single shard) due to use of the Flow framework. 27 | 28 | # Examples 29 | 30 | See: 31 | 32 | 1. [example erlang processor](examples/erlang_processor/) 33 | 2. [example elixir processor](examples/elixir_processor/) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, AdRoll 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor_sup.erl: -------------------------------------------------------------------------------- 1 | -module(erlang_processor_sup). 2 | 3 | -behaviour(supervisor). 4 | 5 | -export([start_link/1]). 6 | 7 | -export([init/1]). 8 | 9 | -define(SERVER, ?MODULE). 10 | 11 | start_link(Opts) -> 12 | supervisor:start_link({local, ?SERVER}, ?MODULE, Opts). 13 | 14 | init(#{stage_names := StageNames, 15 | flow_spec := FlowSpec, 16 | producers := ProducerConfigs}) -> 17 | 18 | SupFlags = #{strategy => one_for_all, 19 | intensity => 2, 20 | period => 10}, 21 | 22 | Producers = 23 | [#{id => {mld_producer, N}, 24 | type => supervisor, 25 | shutdown => infinity, 26 | start => {erlmld_sup, start_link, [ProducerConfig]}} 27 | || {N, ProducerConfig} <- lists:zip(lists:seq(1, length(ProducerConfigs)), 28 | ProducerConfigs)], 29 | 30 | %% the stages must be associated with live processes at the time the flow is started. 31 | %% if the stages are restarted, the flow should also be restarted. thus the 32 | %% one_for_all restart strategy. 33 | FlowWorker = #{id => flow, 34 | type => worker, 35 | shutdown => 5000, 36 | start => {'Elixir.Exmld', start_link, [FlowSpec]}}, 37 | 38 | Stages = [#{id => StageName, 39 | type => worker, 40 | shutdown => 5000, 41 | start => {'Elixir.Exmld.KinesisStage', start_link, [[{name, StageName}]]}} 42 | || StageName <- StageNames], 43 | 44 | {ok, {SupFlags, Stages ++ [FlowWorker | Producers]}}. 45 | -------------------------------------------------------------------------------- /examples/elixir_processor/README.md: -------------------------------------------------------------------------------- 1 | # ElixirProcessor 2 | 3 | An example elixir kinesis / dynamodb streams processing application using `exmld`. 4 | 5 | Note: running this example as-is will incur new costs in your AWS account of ~$11/mo (two 6 | new dynamodb KCL state tables with default read/write capacity of 10/10). Change the 7 | capacity of each table to 1/1 to reduce to ~$1.20/mo. 8 | 9 | 10 | ## Edit 11 | 12 | Edit the following variables in 13 | [lib/elixir_processor/application.ex](lib/elixir_processor/application.ex) according to 14 | the resources in your account / desired testing: 15 | 16 | 1. `stream_name` - a kinesis stream name 17 | 2. `stream_region` - region of the stream 18 | 3. `table_stream_arn` - a dynamodb table stream ARN 19 | 4. `table_region` - region of the table stream 20 | 5. 
`producer_configs` - list of producers to run (e.g., to test only kinesis or dynamo) 21 | 22 | 23 | ## Build 24 | 25 | $ make 26 | 27 | 28 | ## Run 29 | 30 | $ iex -S mix 31 | iex(1)> :observer.start() 32 | 33 | 34 | ## Disable KCL logspam 35 | 36 | iex(2)> Application.put_env(:erlmld, :log_kcl_spam, false) 37 | :ok 38 | 39 | 40 | ## Observe 41 | 42 | 11:36:45.688 pid=<0.199.0> full_batch_counter=9 counter=18 items=[%ElixirProcessor.Item{ 43 | token: %ElixirProcessor.Token{ 44 | sequence_number: {:sequence_number, 00000000000000000000000000000000, :undefined, 0, 1}, 45 | stage: #PID<0.189.0>, 46 | worker: #PID<0.212.0>}, 47 | value: {:stream_record, :undefined, :undefined, :undefined, 48 | {:sequence_number, 00000000000000000000000000000000, :undefined, :undefined, :undefined}, 49 | "{\"eventID\":\"00000000000000000000000000000000\",\"eventName\":\"REMOVE\", 50 | \"eventVersion\":\"1.1\",\"eventSource\":\"aws:dynamodb\",\"awsRegion\":\"us-west-2\", 51 | \"dynamodb\": ... event data ... }"}}] 52 | line=96 function=flush/1 module=ElixirProcessor 53 | file=lib/elixir_processor.ex application=elixir_processor [info] processing batch 54 | -------------------------------------------------------------------------------- /examples/erlang_processor/README.md: -------------------------------------------------------------------------------- 1 | erlang_processor 2 | ===== 3 | 4 | An example erlang kinesis / dynamodb streams processing application using `exmld`. 5 | 6 | Note: running this example as-is will incur new costs in your AWS account of ~$11/mo (two 7 | new dynamodb KCL state tables with default read/write capacity of 10/10). Change the 8 | capacity of each table to 1/1 to reduce to ~$1.20/mo. 9 | 10 | Edit 11 | ----- 12 | 13 | Edit the following variables in 14 | [src/erlang_processor_app.erl](src/erlang_processor_app.erl) 15 | according to the resources in your account / desired testing: 16 | 17 | 1. `StreamName` - a kinesis stream name 18 | 2. `StreamRegion` - region of the stream 19 | 3. `TableStreamArn` - a dynamodb table stream ARN 20 | 4. `TableRegion` - region of the table stream 21 | 5. `ProducerConfigs` - list of producers to run (e.g., to test only kinesis or dynamo) 22 | 23 | Build 24 | ----- 25 | 26 | $ make 27 | 28 | Run 29 | ----- 30 | 31 | $ rebar3 shell 32 | 1> observer:start(). 33 | 34 | Disable KCL logspam 35 | ----- 36 | 37 | 2> application:set_env(erlmld, log_kcl_spam, false). 38 | ok 39 | 40 | Observe 41 | ----- 42 | 43 | <0.434.0> processing items: [{item, 44 | #{'__struct__' => 45 | 'Elixir.Exmld.KinesisWorker.Datum', 46 | opaque => some_opaque_value, 47 | shard_id => 48 | <<"shardId-00000001537808865642-00000000">>, 49 | stream_record => 50 | {stream_record,undefined,undefined, 51 | undefined, 52 | {sequence_number, 53 | 2027497200000000000664507565,undefined, 54 | undefined,undefined}, 55 | <<"{\"eventID\":\"00000000000000000000000000000000\",\"eventName\":\"INSERT\",\"eventVersion\":\"1.1\",\"eventSource\":\"aws:dynamodb\",\"awsRegion\":\"us-west-2\",\"dynamodb\":{\"ApproximateCreationDateTime\":1537821240000,\"Keys\": ... },\"SequenceNumber\":\"00000000000000000000000000000000\",\"SizeBytes\":1234,\"StreamViewType\":\"KEYS_ONLY\"}}">>}}, 56 | {flow_token,<0.424.0>,<0.459.0>, 57 | {sequence_number,00000000000000000000000000000000, 58 | undefined,0,1}}}, 59 | ... 60 | 61 | 62 | Make a release 63 | ----- 64 | 65 | $ make release 66 | ... (unpack release somewhere) ... 
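    e.g. (illustrative; rebar3 tar writes the tarball under _build/default/rel/):
    $ mkdir -p /tmp/erlang_processor
    $ tar -xzf _build/default/rel/erlang_processor/erlang_processor-0.1.0.tar.gz -C /tmp/erlang_processor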
67 | in release dir: 68 | $ ./bin/erlang_processor foreground 69 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "earmark_parser": {:hex, :earmark_parser, "1.4.25", "2024618731c55ebfcc5439d756852ec4e85978a39d0d58593763924d9a15916f", [:mix], [], "hexpm", "56749c5e1c59447f7b7a23ddb235e4b3defe276afc220a6227237f3efe83f51e"}, 3 | "erlexec": {:hex, :erlexec, "1.10.9", "3cbb3476f942bfb8b68b85721c21c1835061cf6dd35f5285c2362e85b100ddc7", [:rebar3], [], "hexpm", "271e5b5f2d91cdb9887efe74d89026c199bfc69f074cade0d08dab60993fa14e"}, 4 | "erlmld": {:hex, :erlmld, "1.0.2", "a3dad389e0f07d3ad0bc9e99ead5ab1e3527365a9eb7169945098c1e8e504d6b", [:rebar3], [{:erlexec, "1.10.9", [hex: :erlexec, repo: "hexpm", optional: false]}, {:jiffy, "1.1.1", [hex: :jiffy, repo: "hexpm", optional: false]}], "hexpm", "c35739e93864da3321f6f316771b8e9712bc590fb6f67dd08403d3c901076aa3"}, 5 | "ex_doc": {:hex, :ex_doc, "0.28.4", "001a0ea6beac2f810f1abc3dbf4b123e9593eaa5f00dd13ded024eae7c523298", [:mix], [{:earmark_parser, "~> 1.4.19", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "bf85d003dd34911d89c8ddb8bda1a958af3471a274a4c2150a9c01c78ac3f8ed"}, 6 | "flow": {:hex, :flow, "1.2.0", "515e03aa3d056cecc3e3f1e80f6ca4bbf5f45b13c88dee5db880b2f3f24f1caa", [:mix], [{:gen_stage, "~> 1.0", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "1b45bfc8a9202c5ec80b077c21df133561e56c56189ba4082dddccb6b5762525"}, 7 | "gen_stage": {:hex, :gen_stage, "1.1.2", "b1656cd4ba431ed02c5656fe10cb5423820847113a07218da68eae5d6a260c23", [:mix], [], "hexpm", "9e39af23140f704e2b07a3e29d8f05fd21c2aaf4088ff43cb82be4b9e3148d02"}, 8 | "jiffy": {:hex, :jiffy, "1.1.1", "aca10f47aa91697bf24ab9582c74e00e8e95474c7ef9f76d4f1a338d0f5de21b", [:rebar3], [], "hexpm", "62e1f0581c3c19c33a725c781dfa88410d8bff1bbafc3885a2552286b4785c4c"}, 9 | "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, 10 | "makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"}, 11 | "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, 12 | "nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"}, 13 | } 14 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor_app.erl: -------------------------------------------------------------------------------- 1 | -module(erlang_processor_app). 
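%% example application callback module: builds the producer and flow
%% configuration shown below and starts the erlang_processor_sup tree.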
2 | 3 | -behaviour(application). 4 | 5 | -export([start/2, stop/1]). 6 | 7 | start(_StartType, _StartArgs) -> 8 | %% note: in a real application many of these values would be in a common app env 9 | %% configuration, but are shown here for illustration. A key `asdf_xyz` appearing in 10 | %% a producer config map will be used to substitute the variable "${ASDF_XYZ}" 11 | %% appearing in erlmld/priv/mld.properties.in. 12 | ErlMldConfig = maps:from_list(application:get_all_env(erlmld)), 13 | 14 | %% emit kcl spam to console. in a real application, this could be configured using a 15 | %% lager-compatible module for logging to a file. 16 | application:set_env(erlmld, log_kcl_spam, true), 17 | 18 | %% id of this worker instance; it should be unique per beam node. if not supplied, it 19 | %% will be auto-generated by the KCL. two different nodes using the same worker id 20 | %% will clobber each other's state. 21 | WorkerId = <<"example worker">>, 22 | 23 | %% name and region of the kinesis stream being processed. you could create this stream 24 | %% with the following command: 25 | %% 26 | %% aws kinesis create-stream --region us-west-2 --shard-count 2 \ 27 | %% --stream-name erlang-processor-test-stream 28 | %% 29 | StreamName = <<"erlang-processor-test-stream">>, 30 | StreamRegion = <<"us-west-2">>, 31 | 32 | %% ARN and region of the dynamo stream being processed. `erlmld` does not yet support 33 | %% obtaining ARNs from table names. you can obtain the ARN of an existing table 34 | %% stream with the following command: 35 | %% 36 | %% aws dynamodbstreams list-streams --region us-west-2 \ 37 | %% --table-name erlang-processor-test-table \ 38 | %% --query 'Streams[0].StreamArn' --output text 39 | %% 40 | TableStreamArn = <<"arn:aws:dynamodb:REGION:ACCOUNT-ID:table/TABLE-NAME/stream/TIMESTAMP">>, 41 | TableRegion = StreamRegion, 42 | 43 | %% in this example application, all source streams can be processed the same way, so 44 | %% we set up a single flow and set of stages. if data from different streams should 45 | %% be handled differently, separate flows should be used. 46 | %% 47 | %% these are the registered names of the GenStages which will receive kinesis records 48 | %% from each owned shard (round-robin). the actual stages will be owned by a 49 | %% supervisor, but the names are needed now due to how the flusher module and flow are 50 | %% configured. 
they could also be pids, but using names allows them to be restarted 51 | %% without restarting everything else (and be started later): 52 | StageNames = [binary_to_atom(<<"stage_", (integer_to_binary(I))/binary>>, 53 | utf8) 54 | || I <- lists:seq(1, erlang:system_info(schedulers_online))], 55 | 56 | ConcurrencyFactor = 1, % increase if processing work is IO-bound 57 | NumReducers = erlang:system_info(schedulers_online) * ConcurrencyFactor, 58 | 59 | %% size of each batch to be "flushed" (i.e., collect this many items before processing 60 | %% them all in a batch): 61 | BatchSize = 10, 62 | 63 | %% attempt to flush batches every 10s even if batch size not reached (relies on 64 | %% heartbeat mechanic): 65 | FlushInterval = 10000, 66 | 67 | %% checkpoint every 60s: 68 | CheckpointInterval = 60000, 69 | 70 | %% fail if a worker stalls for 600s: 71 | WatchdogTimeout = 600000, 72 | 73 | %% max number of in-flight items for each kinesis shard worker: 74 | MaxPending = 1024, 75 | 76 | %% flow demand parameters; see flow documentation: 77 | MinDemand = 1, 78 | MaxDemand = 1024, 79 | 80 | FlowOptions = [{num_stages, NumReducers}, 81 | {min_demand, MinDemand}, 82 | {max_demand, MaxDemand}], 83 | FlowSpec = erlang_processor:flow_spec(StageNames, BatchSize, FlushInterval, FlowOptions), 84 | 85 | %% retrieve this many records with each api call (max: 10000 (kinesis), 1000 86 | %% (dynamo)): 87 | MaxRecords = 1000, 88 | 89 | CommonConfig = maps:merge( 90 | ErlMldConfig, 91 | #{ 92 | record_processor => erlmld_batch_processor, 93 | record_processor_data => 94 | #{flusher_mod => 'Elixir.Exmld.KinesisWorker', 95 | flusher_mod_data => 96 | [{stages, StageNames}, 97 | {opaque, some_opaque_value}, 98 | {max_pending, MaxPending}], 99 | flush_interval_ms => FlushInterval, 100 | checkpoint_interval_ms => CheckpointInterval, 101 | watchdog_timeout_ms => WatchdogTimeout, 102 | on_checkpoint => fun on_checkpoint/2, 103 | description => "description goes here"}, 104 | 105 | worker_id => WorkerId, 106 | 107 | %% initial starting position if no shard checkpoint exists; LATEST is 108 | %% most recent, TRIM_HORIZON is earliest available: 109 | initial_position => <<"TRIM_HORIZON">>, 110 | 111 | max_records => MaxRecords, 112 | 113 | %% reduce cloudwatch metric spam: 114 | metrics_level => <<"NONE">> 115 | }), 116 | 117 | %% a kinesis stream processor: 118 | KinesisProducer = #{ 119 | %% required if processing multiple source streams within a single beam node (any 120 | %% atom, used as a registered name suffix and local filename component): 121 | app_suffix => k, 122 | 123 | %% this name will be used to name the dynamodb state table used by the KCL. if 124 | %% it doesn't exist, it will be created. the table is used for coordinating 125 | %% leases held and checkpoints made by workers cooperating as part of an 126 | %% application. if two erlang nodes are running using the same value for this 127 | %% name, they are considered as two workers in a single processing application. 128 | %% a single beam node processing multiple different streams needs a unique value 129 | %% for each stream: 130 | %% 131 | %% this is the same name as the corresponding elixir_processor example application; 132 | %% if both the elixir and erlang versions are run at the same time, both will 133 | %% cooperate in processing the stream. 
134 | kcl_appname => <<"erlang-processor-kinesis-test">>, 135 | 136 | stream_name => StreamName, 137 | stream_region => StreamRegion, 138 | 139 | %% the stream type; 'kinesis' for kinesis streams, 'dynamo' for dynamodb 140 | %% streams: 141 | stream_type => kinesis 142 | }, 143 | 144 | %% a dynamo stream processor: 145 | DynamoProducer = #{ 146 | app_suffix => d, 147 | kcl_appname => <<"erlang-processor-dynamo-test">>, 148 | stream_name => TableStreamArn, 149 | stream_region => TableRegion, 150 | stream_type => dynamo 151 | }, 152 | 153 | ProducerConfigs = [maps:merge(CommonConfig, ProducerConfig) 154 | || ProducerConfig <- [KinesisProducer]],%, DynamoProducer]], 155 | erlang_processor_sup:start_link(#{stage_names => StageNames, 156 | flow_spec => FlowSpec, 157 | producers => ProducerConfigs}). 158 | 159 | stop(_State) -> 160 | ok. 161 | 162 | 163 | on_checkpoint(OpaqueData, ShardId) -> 164 | io:format("~p checkpointed (~p)~n", [OpaqueData, ShardId]). 165 | -------------------------------------------------------------------------------- /examples/elixir_processor/lib/elixir_processor.ex: -------------------------------------------------------------------------------- 1 | defmodule ElixirProcessor do 2 | @moduledoc """ 3 | Record processor example implementation. 4 | """ 5 | 6 | require Logger 7 | require Record 8 | require Exmld 9 | 10 | defstruct [:batch_size, :pending_items, full_batch_counter: 0, flush_counter: 0] 11 | 12 | defmodule Token do 13 | defstruct [:stage, :worker, :sequence_number] 14 | end 15 | 16 | defmodule Item do 17 | defstruct [:value, :token] 18 | end 19 | 20 | @doc """ 21 | Return a flow spec which can be used to set up a processing pipeline; see exmld.ex. 22 | 23 | The pipeline definition is similar but not identical to the version in 24 | `erlang_processor`: here, we supply a window with a time-based trigger and modify the 25 | flow using the `:append` option. 26 | """ 27 | def flow_spec(stage_names, flow_options, opts \\ []) do 28 | window = 29 | Flow.Window.global() 30 | |> Flow.Window.trigger_periodically(opts[:flush_interval] || 10000, :millisecond) 31 | 32 | %{ 33 | stages: stage_names, 34 | extract_items_fn: &flow_extract/1, 35 | partition_key: {:elem, 0}, 36 | state0: fn -> 37 | %__MODULE__{ 38 | batch_size: opts[:batch_size] || 10, 39 | pending_items: [] 40 | } 41 | end, 42 | process_fn: &flow_add_event/2, 43 | flow_opts: 44 | flow_options ++ 45 | [ 46 | window: window, 47 | append: fn flow -> 48 | flow 49 | |> Flow.on_trigger(&flow_flush/1) 50 | end 51 | ] 52 | } 53 | end 54 | 55 | # flow_extract/1 is called to extract sub-items from a kinesis or dynamo stream record. 56 | # this allows handling of both KPL-aggregated records and custom aggregation schemes. 57 | # the output of this function should be a list of 2-tuples ({key, value}) to be passed 58 | # to flow_add_event/2 for handling in a reducer. 59 | # 60 | # items seen by the extract function generally look like this: 61 | # 62 | # %Exmld.KinesisStage.Event{ 63 | # event: %Exmld.KinesisWorker.Datum{ 64 | # opaque: {"us-west-2", "erlang-processor-kinesis-test"}, 65 | # shard_id: "shardId-000000000001", 66 | # stream_record: {:stream_record, "12345", 946684800, 67 | # {:sequence_number, 12345, 0, :undefined, :undefined}, 68 | # " .. record data .. 
"}}, 69 | # stage: #PID<0.136.0>, 70 | # worker: #PID<0.862.0>} 71 | # 72 | defp flow_extract(%Exmld.KinesisStage.Event{ 73 | event: %Exmld.KinesisWorker.Datum{stream_record: record}, 74 | stage: stage, 75 | worker: worker 76 | }) do 77 | case record do 78 | # handle a heartbeat. the second element of the tuple will vary so heartbeats get 79 | # distributed among reducers, so the elements must be swapped since we're using the 80 | # first element as a partition key. 81 | {:heartbeat, x} -> 82 | [{x, :heartbeat}] 83 | 84 | # in a real application, sub-records could be extracted from Event here. if using a 85 | # custom non-KPL aggregation scheme, this should associate each sub-record with a 86 | # faked sequence number having the same base as the parent record, and appropriate 87 | # 'user_sub' (sub-record index) and 'user_total' (total number of extracted 88 | # sub-records) fields. then when later notifying exmld of record disposition, it 89 | # can properly track sub-record processing and advance the checkpoint beyond the 90 | # parent record if all of its sub-records were processed. 91 | # 92 | # two records having the same key (first tuple element) here will be handled by the 93 | # same reducer. in general, the key should be consistently derived from some 94 | # attribute of the record/item being processed. 95 | _ when Record.is_record(record, :stream_record) -> 96 | sn = 97 | Exmld.stream_record(record, :sequence_number) 98 | |> Exmld.sequence_number(user_sub: 0) 99 | |> Exmld.sequence_number(user_total: 1) 100 | 101 | item = %Item{ 102 | value: record, 103 | token: %Token{stage: stage, worker: worker, sequence_number: sn} 104 | } 105 | 106 | [{:erlang.phash2(item), item}] 107 | end 108 | end 109 | 110 | # handle an item extracted from a record (or a heartbeat). this occurs in a reducer 111 | # whose initial state is given by 'state0' in flow_spec/3 above. it returns an updated 112 | # state after possibly processing the event (and possibly flushing/updating the state 113 | # accordingly). here, we simply add non-heartbeat items to the current batch and flush 114 | # the batch if it has reached the target size (we don't use an event based window which 115 | # would count heartbeats). if enough time elapses, a flush will be separately triggered 116 | # by the flow window. 117 | defp flow_add_event({_key, item}, %__MODULE__{pending_items: pending} = state) do 118 | case item do 119 | :heartbeat -> state 120 | _ -> %{state | pending_items: [item | pending]} 121 | end 122 | |> maybe_flush() 123 | end 124 | 125 | # possibly process the current pending batch of records if of the appropriate size: 126 | defp maybe_flush( 127 | %__MODULE__{ 128 | pending_items: pending, 129 | batch_size: batch_size, 130 | full_batch_counter: c 131 | } = state 132 | ) 133 | when length(pending) >= batch_size do 134 | elem(flow_flush(%{state | full_batch_counter: c + 1}), 1) 135 | end 136 | 137 | defp maybe_flush(state) do 138 | state 139 | end 140 | 141 | # process the current pending batch of records, notify upstream of processing 142 | # disposition, and return the events to emit downstream (the current state) and the new 143 | # reducer accumulator (the updated state). nothing in the current example makes use of 144 | # the emitted value. 145 | defp flow_flush(state) do 146 | orig = state 147 | {:ok, state, tokens} = flush(state) 148 | :ok = notify_dispositions(tokens, :ok) 149 | {[orig], state} 150 | end 151 | 152 | # process a batch of items which have been collected, returning {:ok, state, tokens}. 
153 | # 
154 | # `tokens` is a list of tokens used by notify_dispositions/2 to inform upstream workers
155 | # of the status of processing. this is needed because a single reducer will potentially
156 | # receive records from multiple different kinesis shards. with this disposition scheme,
157 | # a kinesis worker can correctly checkpoint based on how far along downstream processing
158 | # has come (instead of for example automatically checkpointing based on time, which
159 | # could lose records).
160 |   defp flush(
161 |          %__MODULE__{pending_items: pending, full_batch_counter: fc, flush_counter: c} = state
162 |        ) do
163 |     Logger.info("processing batch", items: inspect(pending), counter: c, full_batch_counter: fc)
164 |     :timer.sleep(100 * length(pending))
165 |     tokens = for %Item{token: token} <- pending, do: token
166 |     {:ok, %{state | pending_items: [], flush_counter: c + 1}, tokens}
167 |   end
168 | 
169 |   # group item processing disposition by origin stage and worker, informing each stage of
170 |   # the records (sequence numbers) from its workers which have been processed. this
171 |   # allows upstream kinesis workers to safely checkpoint only fully processed data.
172 |   defp notify_dispositions(tokens, status) do
173 |     prepend = fn x -> &[x | &1] end
174 | 
175 |     List.foldl(tokens, %{}, fn %Token{stage: stage, worker: worker, sequence_number: sn}, acc ->
176 |       d = %Exmld.KinesisWorker.Disposition{sequence_number: sn, status: status}
177 |       Map.update(acc, stage, %{worker => [d]}, &Map.update(&1, worker, [d], prepend.(d)))
178 |     end)
179 |     |> Enum.reduce(:ok, fn {stage, worker_map}, :ok ->
180 |       Exmld.KinesisStage.disposition(stage, worker_map)
181 |     end)
182 |   end
183 | end
184 | 
--------------------------------------------------------------------------------
/lib/exmld/kinesis_stage.ex:
--------------------------------------------------------------------------------
1 | defmodule Exmld.KinesisStage do
2 |   use GenStage
3 |   require Logger
4 | 
5 |   @moduledoc """
6 |   A `GenStage` stage for use in processing data produced by `Exmld.KinesisWorker`s.
7 | 
8 |   This module acts as a GenStage producer. Subscribers will receive
9 |   `Exmld.KinesisStage.Event`s, which each wrap an underlying `Exmld.KinesisWorker.Datum`
10 |   along with information about the producing worker and stage. Downstream processors
11 |   should eventually call `disposition/2` with the disposition of processing so that
12 |   originating workers can checkpoint.
13 | 
14 |   The expected use and workflow is:
15 | 
16 |   1. Create a stage or set of stages using this module for each distinct processing
17 |      pipeline. A set of stages could be used by more than one Kinesis stream if the
18 |      processing being done is the same for all of them.
19 | 
20 |   2. Create a flow using `Flow.from_stages/2`.
21 | 
22 |   3. Configure the flow using `Exmld.flow/6`.
23 | 
24 |   4. Run the flow, which should run forever.
25 | 
26 |   5. Configure an erlmld supervision tree with a set of `Exmld.KinesisWorker`s using the
27 |      stage(s) created in (1).
28 |   """
29 | 
30 |   defstruct [{:counter, 0},
31 |              {:queue, :queue.new()},
32 |              {:demand, 0},
33 |              {:disposition, %{}}] # pid => {ref, [term]}
34 | 
35 |   defmodule Event do
36 |     @moduledoc """
37 |     Struct for events provided to an `Exmld.KinesisStage`.
38 | 
39 |     Records the stage and worker identifiers associated with an event.
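
    A delivered event looks roughly like this (pids and datum contents are
    illustrative):

        %Exmld.KinesisStage.Event{
          stage: #PID<0.136.0>,
          worker: #PID<0.862.0>,
          event: %Exmld.KinesisWorker.Datum{shard_id: "shardId-000000000001", ...}}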
40 | 41 | ## Fields 42 | 43 | * `:stage` - identifier of the `Exmld.KinesisStage` which handled the event 44 | * `:worker` - identifier of the `Exmld.KinesisWorker` which produced the event 45 | * `:event` - an `Exmld.KinesisWorker.Datum` 46 | """ 47 | defstruct stage: nil, worker: nil, event: nil 48 | @type t :: %Event{stage: pid, worker: term, event: Exmld.KinesisWorker.Datum.t} 49 | end 50 | 51 | def start_link(opts \\ []) do 52 | GenStage.start_link(__MODULE__, [], opts) 53 | end 54 | 55 | @doc """ 56 | Notify `stage` of a new Kinesis record available for processing. 57 | 58 | A new event is available for processing by `stage`. The caller will be monitored and 59 | associated with the new event, and will be blocked until after the event has been used 60 | to satisfy some downstream demand. The return value will be the disposition 61 | (success/failure) of zero or more records which were previously processed. 62 | """ 63 | @spec notify(GenStage.stage, 64 | Exmld.KinesisWorker.Datum, 65 | :infinity | non_neg_integer) :: {:disposition, [Exmld.KinesisWorker.Disposition.t]} 66 | def notify(stage, datum, timeout \\ :infinity) do 67 | GenStage.call(stage, {:notify, datum}, timeout) 68 | end 69 | 70 | @doc """ 71 | Notify `stage` of the disposition of processing some items. 72 | 73 | An attempt has been made to process some data extracted from a Kinesis record by a 74 | downstream processor. `stage` will look up the originating producer and record the 75 | disposition of processing in the next batch of data to be returned to that producer. 76 | """ 77 | @spec disposition(GenStage.stage, %{optional(pid) => [Exmld.KinesisWorker.Disposition.t]}) :: :ok 78 | def disposition(stage, disposition, timeout \\ :infinity) do 79 | GenStage.call(stage, {:disposition, disposition}, timeout) 80 | end 81 | 82 | ## GenStage callbacks 83 | 84 | def init([]) do 85 | {:producer, %__MODULE__{}} 86 | end 87 | 88 | # retrieve any buffered events from state and try to serve any pending demand. associate 89 | # each event with the process which produced it. defer reply and block the caller until 90 | # the event is used to fulfill some demand. monitor caller. 91 | def handle_call({:notify, event}, from, state) do 92 | state 93 | |> monitor_sender(from) 94 | |> enqueue_event(event, from) 95 | |> dispatch_events([]) 96 | end 97 | 98 | # a record processor stage is informing us of the disposition of some items which were 99 | # extracted from a source record. forward that information back to the originating 100 | # workers by updating the next reply state for each pid if it's still monitored. 
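  # e.g., a downstream processor typically calls (shape is illustrative):
  #
  #   Exmld.KinesisStage.disposition(stage,
  #     %{worker_pid => [%Exmld.KinesisWorker.Disposition{sequence_number: sn, status: :ok}]})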
101 | def handle_call({:disposition, worker_values}, _from, 102 | %__MODULE__{counter: counter, disposition: disposition} = state) do 103 | # worker_values: %{worker_pid => [disposition]} 104 | update = fn({worker_pid, worker_dispositions}, map) -> 105 | map 106 | |> Map.get_and_update(worker_pid, 107 | fn 108 | (nil) -> 109 | {nil, nil} 110 | ({mref, x}) -> 111 | {nil, {mref, worker_dispositions ++ x}} 112 | end) 113 | |> elem(1) 114 | end 115 | disposition = Enum.reduce(worker_values, disposition, update) 116 | counter = Enum.reduce(worker_values, counter, fn ({_, d}, n) -> n + length(d) end) 117 | {:reply, :ok, [], %{state | counter: counter, disposition: disposition}} 118 | end 119 | 120 | # a monitored process has exited; discard any saved item disposition: 121 | def handle_info({:DOWN, mref, :process, pid, _reason}, 122 | %__MODULE__{disposition: disposition} = state) do 123 | {{^mref, _}, disposition} = Map.pop(disposition, pid) 124 | {:noreply, [], %{state | disposition: disposition}} 125 | end 126 | 127 | @doc """ 128 | Handle subscriber demand. 129 | 130 | Return up to `incoming_demand + pending_demand` events, fetching (from state) as needed, 131 | and storing in state any excess. If not enough events are available, record unsatisfied 132 | demand in state, and then return those events when answering a subsequent call. See the 133 | `QueueBroadcaster` example in `GenStage` for an explanation of this demand queueing 134 | behavior. 135 | """ 136 | def handle_demand(incoming_demand, %__MODULE__{demand: pending_demand} = state) do 137 | dispatch_events(%{state | demand: incoming_demand + pending_demand}, []) 138 | end 139 | 140 | ## Internal functions 141 | 142 | defp monitor_sender(%__MODULE__{disposition: disposition} = state, {pid, _}) do 143 | case Map.has_key?(disposition, pid) do 144 | true -> 145 | state 146 | false -> 147 | mref = Process.monitor(pid) 148 | %{state | disposition: Map.put(disposition, pid, {mref, []})} 149 | end 150 | end 151 | 152 | defp enqueue_event(%__MODULE__{queue: queue} = state, event, from) do 153 | %{state | queue: :queue.in({from, event}, queue)} 154 | end 155 | 156 | defp dispatch_events(%__MODULE__{demand: 0} = state, events) do 157 | {:noreply, Enum.reverse(events), state} 158 | end 159 | 160 | defp dispatch_events(%__MODULE__{queue: queue, demand: demand} = state, events) do 161 | case :queue.out(queue) do 162 | {{:value, {from, event}}, queue} -> 163 | {pid, _} = from 164 | %{state | queue: queue, demand: demand - 1} 165 | |> inform_of_disposition(from) 166 | |> dispatch_events([%__MODULE__.Event{stage: self(), worker: pid, event: event} | events]) 167 | 168 | {:empty, queue} -> 169 | {:noreply, Enum.reverse(events), %{state | queue: queue}} 170 | end 171 | end 172 | 173 | # look up the pid and send it the next disposition batch as a reply to its latest 174 | # notification. 175 | defp inform_of_disposition(%__MODULE__{disposition: disposition} = state, 176 | {pid, _ref} = from) do 177 | {value, disposition} = 178 | disposition 179 | |> Map.get_and_update(pid, 180 | fn 181 | (nil) -> 182 | {[], nil} 183 | ({mref, x}) -> 184 | {x, {mref, []}} 185 | end) 186 | :ok = GenStage.reply(from, {:disposition, value}) 187 | %{state | disposition: disposition} 188 | end 189 | end 190 | -------------------------------------------------------------------------------- /examples/erlang_processor/src/erlang_processor.erl: -------------------------------------------------------------------------------- 1 | %% 2 | %% record processor example implementation. 
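%% (an elixir counterpart lives in examples/elixir_processor/lib/elixir_processor.ex.)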
3 | %%
4 | 
5 | -module(erlang_processor).
6 | 
7 | -export([flow_spec/4]).
8 | 
9 | -record(state, {batch_size,
10 |                 next_flush_time = os:timestamp(),
11 |                 flush_interval = 10000,
12 |                 pending_items = []}).
13 | 
14 | -record(flow_token, {stage, worker, sequence_number}).
15 | 
16 | -record(item, {value, token}).
17 | 
18 | -include_lib("erlmld/include/erlmld.hrl").
19 | 
20 | 
21 | %% return a flow spec which can be used to set up a processing pipeline; see exmld.ex.
22 | flow_spec(StageNames, BatchSize, FlushInterval, FlowOptions) ->
23 |     #{stages => StageNames,
24 |       extract_items_fn => fun flow_extract/1,
25 |       partition_key => {elem, 0}, % elixir uses 0-indexing
26 |       state0 => fun () ->
27 |                         #state{flush_interval = FlushInterval,
28 |                                batch_size = BatchSize}
29 |                 end,
30 |       process_fn => fun flow_process_event/2,
31 |       flow_opts => FlowOptions}.
32 | 
33 | 
34 | %% flow_extract/1 is called to extract sub-items from a kinesis or dynamo stream record.
35 | %% this allows handling of both KPL-aggregated records and custom aggregation schemes.
36 | %% the output of this function should be a list of 2-tuples ({key, value}) to be passed to
37 | %% flow_process_event/2 for processing in a reducer.
38 | %%
39 | %% items seen by the extract function generally look like this:
40 | %%
41 | %%   #{'__struct__' => 'Elixir.Exmld.KinesisStage.Event',
42 | %%     event =>
43 | %%         #{'__struct__' => 'Elixir.Exmld.KinesisWorker.Datum',
44 | %%           opaque => {<<"us-west-2">>, <<"erlang-processor-kinesis-test">>},
45 | %%           shard_id => <<"shardId-000000000001">>,
46 | %%           stream_record =>
47 | %%               #stream_record{partition_key = <<"12345">>,
48 | %%                              timestamp = 946684800,
49 | %%                              sequence_number =
50 | %%                                  #sequence_number{base = 12345, sub = 0},
51 | %%                              data = << .. record data .. >>}},
52 | %%     stage => <0.136.0>,
53 | %%     worker => <0.862.0>}
54 | %%
55 | flow_extract(#{'__struct__' := 'Elixir.Exmld.KinesisStage.Event',
56 |                event := #{'__struct__' := 'Elixir.Exmld.KinesisWorker.Datum',
57 |                           stream_record := {heartbeat, X}}}) ->
58 |     %% handle a heartbeat. the second element of the tuple will vary so heartbeats get
59 |     %% distributed among reducers, so the elements must be swapped since we're using the
60 |     %% first element as a partition key.
61 |     [{X, heartbeat}];
62 | flow_extract(#{'__struct__' := 'Elixir.Exmld.KinesisStage.Event',
63 |                stage := Stage,
64 |                worker := Worker,
65 |                event := Item}) ->
66 |     %% in a real application, sub-records could be extracted from Event here. if using a
67 |     %% custom non-KPL aggregation scheme, this should associate each sub-record with a
68 |     %% faked sequence number having the same base as the parent record, and appropriate
69 |     %% 'user_sub' (sub-record index) and 'user_total' (total number of extracted
70 |     %% sub-records) fields. then when later notifying exmld of record disposition, it can
71 |     %% properly track sub-record processing and advance the checkpoint beyond the parent
72 |     %% record if all of its sub-records were processed.
73 |     %%
74 |     %% two records having the same key (first tuple element) here will be handled by the
75 |     %% same reducer. in general, the key should be consistently derived from some
76 |     %% attribute of the record/item being processed.
77 |     #{'__struct__' := 'Elixir.Exmld.KinesisWorker.Datum',
78 |       stream_record := #stream_record{sequence_number = SN}} = Item,
79 |     [{erlang:phash2(Item), #item{value = Item,
80 |                                  token = #flow_token{stage = Stage,
81 |                                                      worker = Worker,
82 |                                                      sequence_number = SN#sequence_number{user_sub = 0,
83 |                                                                                           user_total = 1}}}}].
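
%% for a non-heartbeat record, the extracted result is a single-element list
%% shaped like this (illustrative):
%%
%%   [{Key, #item{value = Datum,
%%                token = #flow_token{stage = Stage,
%%                                    worker = Worker,
%%                                    sequence_number = SN}}}]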
84 | 85 | 86 | %% process an item extracted from a record (or a heartbeat). this occurs in a reducer 87 | %% whose initial state is given by 'state0' in flow_spec/4 above. it returns an updated 88 | %% state after processing the event (and possibly flushing/updating the state 89 | %% accordingly). here, we simply add the item to the current batch and possibly flush the 90 | %% batch. 91 | flow_process_event({_Key, Item}, #state{} = State) -> 92 | maybe_flush(flow_add_record(Item, State)). 93 | 94 | 95 | flow_add_record(heartbeat, State) -> 96 | State; 97 | flow_add_record(Item, #state{pending_items = Pending} = State) -> 98 | State#state{pending_items = [Item | Pending]}. 99 | 100 | 101 | %% possibly process the current pending batch of records if of the appropriate size or 102 | %% enough time has elapsed: 103 | maybe_flush(State) -> 104 | case should_flush(State) of 105 | true -> 106 | {ok, NState, Tokens} = flush(State), 107 | ok = notify_dispositions(Tokens, ok), 108 | note_flush(NState); 109 | false -> 110 | State 111 | end. 112 | 113 | 114 | should_flush(#state{pending_items = Pending, 115 | batch_size = BatchSize, 116 | next_flush_time = NextFlush}) -> 117 | length(Pending) >= BatchSize 118 | orelse elapsed_ms(NextFlush) >= 0. 119 | 120 | 121 | note_flush(#state{flush_interval = FlushInterval} = State) -> 122 | {Mega, Sec, Micros} = os:timestamp(), 123 | NextFlush = {Mega, Sec, Micros + trunc(FlushInterval * 1.0e3)}, 124 | State#state{next_flush_time = NextFlush}. 125 | 126 | 127 | elapsed_ms(When) -> 128 | trunc(timer:now_diff(os:timestamp(), When)/1.0e3). 129 | 130 | 131 | %% process a batch of items which have been collected, returning {ok, NState, Tokens}. 132 | %% 133 | %% Tokens is a list of tokens used by notify_dispositions/2 to inform upstream workers of 134 | %% the status of processing. this is needed because a single reducer will potentially 135 | %% receive records from multiple different kinesis shards. with this disposition scheme, 136 | %% a kinesis worker can correctly checkpoint based on how far along downstream processing 137 | %% has come (instead of for example automatically checkpointing based on time, which could 138 | %% lose records). 139 | flush(#state{pending_items = Pending} = State) -> 140 | io:format("~p processing items: ~p~n", [self(), Pending]), 141 | timer:sleep(100 * length(Pending)), 142 | Tokens = [Item#item.token || Item <- Pending], 143 | {ok, State#state{pending_items = []}, Tokens}. 144 | 145 | 146 | %% group item processing disposition by origin stage and worker, informing each stage of 147 | %% the records (sequence numbers) from its workers which have been processed. this allows 148 | %% upstream kinesis workers to safely checkpoint only fully processed data. 149 | notify_dispositions(Tokens, Status) -> 150 | RecipientMap = 151 | lists:foldl( 152 | fun (#flow_token{stage = Stage, 153 | worker = Worker, 154 | sequence_number = SN}, Acc) -> 155 | This = disposition(SN, Status), 156 | maps:update_with( 157 | Stage, 158 | fun (WAcc) -> 159 | maps:update_with( 160 | Worker, 161 | fun (DAcc) -> 162 | [This | DAcc] 163 | end, 164 | [This], 165 | WAcc) 166 | end, 167 | #{Worker => [This]}, 168 | Acc) 169 | end, #{}, Tokens), 170 | maps:fold(fun (Stage, WorkerMap, ok) -> 171 | 'Elixir.Exmld.KinesisStage':disposition(Stage, WorkerMap) 172 | end, ok, RecipientMap). 173 | 174 | 175 | disposition(SN, Status) -> 176 | #{'__struct__' => 'Elixir.Exmld.KinesisWorker.Disposition', 177 | sequence_number => SN, 178 | status => Status}. 
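
%% e.g. disposition(SN, ok) builds the erlang-side equivalent of the elixir
%% struct %Exmld.KinesisWorker.Disposition{sequence_number: SN, status: :ok}.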
179 | -------------------------------------------------------------------------------- /examples/elixir_processor/lib/elixir_processor/application.ex: -------------------------------------------------------------------------------- 1 | defmodule ElixirProcessor.Application do 2 | @moduledoc false 3 | use Application 4 | require Logger 5 | 6 | defmodule Producer do 7 | def child_spec({config, n}) do 8 | %{ 9 | id: {:mld_producer, n}, 10 | type: :supervisor, 11 | shutdown: :infinity, 12 | start: {:erlmld_sup, :start_link, [config]} 13 | } 14 | end 15 | end 16 | 17 | def start(_type, _args) do 18 | # emit kcl spam to console. in a real application, this could be configured using a 19 | # lager-compatible module for logging to a file. 20 | Application.put_env(:erlmld, :log_kcl_spam, true) 21 | 22 | %{stage_names: stage_names, producer_configs: producer_configs, flow_spec: flow_spec} = 23 | prepare_config() 24 | 25 | flow_worker = {Exmld, flow_spec} 26 | 27 | stages = 28 | stage_names 29 | |> Enum.map(&%{id: &1, start: {Exmld.KinesisStage, :start_link, [[name: &1]]}}) 30 | 31 | producers = 32 | producer_configs 33 | |> Enum.with_index() 34 | |> Enum.map(&{Producer, &1}) 35 | 36 | children = stages ++ [flow_worker | producers] 37 | 38 | opts = [strategy: :one_for_all, intensity: 2, period: 10, name: ElixirProcessor.Supervisor] 39 | Supervisor.start_link(children, opts) 40 | end 41 | 42 | defp prepare_config do 43 | # note: in a real application many of these values would be in a common app env 44 | # configuration, but are shown here for illustration. A key :asdf_xyz appearing in a 45 | # producer config map will be used to substitute the variable "${ASDF_XYZ}" appearing 46 | # in erlmld/priv/mld.properties.in. 47 | erlmld_config = 48 | Application.get_all_env(:erlmld) 49 | |> Enum.into(%{}) 50 | 51 | # id of this worker instance; it should be unique per beam node. if not supplied, it 52 | # will be auto-generated by the KCL. two different nodes using the same worker id 53 | # will clobber each other's state. 54 | worker_id = "example worker" 55 | 56 | # name and region of the kinesis stream being processed. you could create this stream 57 | # with the following command: 58 | # 59 | # aws kinesis create-stream --region us-west-2 --shard-count 2 \ 60 | # --stream-name erlang-processor-test-stream 61 | # 62 | stream_name = "erlang-processor-test-stream" 63 | stream_region = "us-west-2" 64 | 65 | # ARN and region of the dynamo stream being processed. `erlmld` does not yet support 66 | # obtaining ARNs from table names. you can obtain the ARN of an existing table 67 | # stream with the following command: 68 | # 69 | # aws dynamodbstreams list-streams --region us-west-2 \ 70 | # --table-name erlang-processor-test-table \ 71 | # --query 'Streams[0].StreamArn' --output text 72 | # 73 | table_stream_arn = "arn:aws:dynamodb:REGION:ACCOUNT-ID:table/TABLE-NAME/stream/TIMESTAMP" 74 | table_region = stream_region 75 | 76 | # in this example application, all source streams can be processed the same way, so we 77 | # set up a single flow and set of stages. if data from different streams should be 78 | # handled differently, separate flows should be used. 79 | # 80 | # these are the registered names of the GenStages which will receive kinesis records 81 | # from each owned shard (round-robin). the actual stages will be owned by a 82 | # supervisor, but the names are needed now due to how the flusher module and flow are 83 | # configured. 
they could also be pids, but using names allows them to be restarted 84 | # without restarting everything else (and be started later): 85 | stage_names = 86 | 1..System.schedulers_online() 87 | |> Enum.map(&:erlang.binary_to_atom("stage_#{&1}", :utf8)) 88 | 89 | # increase if processing work is IO-bound 90 | concurrency_factor = 1 91 | num_reducers = System.schedulers_online() * concurrency_factor 92 | 93 | # size of each batch to be "flushed" (i.e., collect this many items before processing 94 | # them all in a batch): 95 | batch_size = 10 96 | 97 | # attempt to flush batches every 10s even if the batch size hasn't been reached 98 | # (relies on the heartbeat mechanism): 99 | flush_interval = 10_000 100 | 101 | # checkpoint every 60s: 102 | checkpoint_interval = 60_000 103 | 104 | # fail if a worker stalls for 600s: 105 | watchdog_timeout = 600_000 106 | 107 | # max number of in-flight items for each kinesis shard worker: 108 | max_pending = 1024 109 | 110 | # flow demand parameters; see flow documentation: 111 | min_demand = 1 112 | max_demand = 1024 113 | 114 | flow_options = [num_stages: num_reducers, min_demand: min_demand, max_demand: max_demand] 115 | 116 | flow_spec = 117 | ElixirProcessor.flow_spec(stage_names, flow_options, 118 | batch_size: batch_size, 119 | flush_interval: flush_interval 120 | ) 121 | 122 | # retrieve this many records with each api call (max: 10000 for kinesis, 1000 123 | # for dynamo): 124 | max_records = 1000 125 | 126 | common_config = 127 | Map.merge( 128 | erlmld_config, 129 | %{ 130 | record_processor: :erlmld_batch_processor, 131 | record_processor_data: %{ 132 | flusher_mod: Exmld.KinesisWorker, 133 | flusher_mod_data: [ 134 | stages: stage_names, 135 | opaque: :some_opaque_value, 136 | max_pending: max_pending 137 | ], 138 | flush_interval_ms: flush_interval, 139 | checkpoint_interval_ms: checkpoint_interval, 140 | watchdog_timeout_ms: watchdog_timeout, 141 | on_checkpoint: &on_checkpoint/2, 142 | description: "description goes here" 143 | }, 144 | worker_id: worker_id, 145 | 146 | # initial starting position if no shard checkpoint exists; LATEST is 147 | # most recent, TRIM_HORIZON is earliest available: 148 | initial_position: "TRIM_HORIZON", 149 | max_records: max_records, 150 | 151 | # reduce cloudwatch metric spam: 152 | metrics_level: "NONE" 153 | } 154 | ) 155 | 156 | # a kinesis stream processor: 157 | kinesis_producer = %{ 158 | # required if processing multiple source streams within a single beam node (any 159 | # atom, used as a registered name suffix and local filename component): 160 | app_suffix: :k, 161 | 162 | # this name will be used to name the dynamodb state table used by the KCL. if it 163 | # doesn't exist, it will be created. the table is used for coordinating leases held 164 | # and checkpoints made by workers cooperating as part of an application. if two 165 | # erlang nodes are running using the same value for this name, they are treated 166 | # as two workers in a single processing application. a single beam node processing 167 | # multiple different streams needs a unique value for each stream. 168 | # 169 | # this is the same name as the corresponding erlang_processor example application; 170 | # if both the elixir and erlang versions are run at the same time, both will 171 | # cooperate in processing the stream.
172 | kcl_appname: "erlang-processor-kinesis-test", 173 | stream_name: stream_name, 174 | stream_region: stream_region, 175 | 176 | # the stream type; 'kinesis' for kinesis streams, 'dynamo' for dynamodb streams: 177 | stream_type: :kinesis 178 | } 179 | 180 | # a dynamo stream processor: 181 | dynamo_producer = %{ 182 | app_suffix: :d, 183 | kcl_appname: "erlang-processor-dynamo-test", 184 | stream_name: table_stream_arn, 185 | stream_region: table_region, 186 | stream_type: :dynamo 187 | } 188 | 189 | producer_configs = 190 | [kinesis_producer, dynamo_producer] 191 | |> Enum.map(&Map.merge(common_config, &1)) 192 | 193 | %{ 194 | stage_names: stage_names, 195 | producer_configs: producer_configs, 196 | batch_size: batch_size, 197 | flush_interval: flush_interval, 198 | flow_spec: flow_spec 199 | } 200 | end 201 | 202 | defp on_checkpoint(opaque, shard_id) do 203 | Logger.info("checkpointed", opaque: opaque, shard_id: shard_id) 204 | end 205 | end 206 | -------------------------------------------------------------------------------- /lib/exmld.ex: -------------------------------------------------------------------------------- 1 | defmodule Exmld do 2 | @moduledoc ~S""" 3 | This allows items extracted from Kinesis stream records (or sub-records in a [KPL 4 | aggregate record](https://github.com/AdRoll/erlmld/blob/HEAD/proto/kpl_agg.proto)) to 5 | be processed by a pipeline of workers which may differ in number from the number of 6 | shards owned by the current node (which is the normal processing model offered by 7 | [erlmld](https://github.com/AdRoll/erlmld)). 8 | 9 | This is beneficial when using aggregate records which can be processed in approximate 10 | order according to their partition keys as opposed to strict ordering based on the 11 | shards they arrived on. For example, suppose the following two Kinesis records are 12 | received on two different shards: 13 | 14 | Record 1 (a KPL aggregate record) 15 | - partition key: "xyzzy" 16 | - subrecord a: 17 | - partition key: "asdf" 18 | - value: "12345" 19 | - subrecord b: 20 | - partition key: "fdsa" 21 | - value: "54321" 22 | 23 | Record 2 (a KPL aggregate record) 24 | - partition key: "qwer" 25 | - subrecord a: 26 | - partition key: "asdf" 27 | - value: "23456" 28 | - subrecord b: 29 | - partition key: "z" 30 | - value: "0" 31 | 32 | 33 | Using the normal Kinesis processing paradigm, each shard will be processed in order. 34 | `erlmld` supports this by spawning a process for each owned shard, which handles each 35 | record seen on the shard in sequence: 36 | 37 | Worker 1: 38 | 1. handle record "xyzzy" 39 | a. handle sub-record "asdf" 40 | b. handle sub-record "fdsa" 41 | 42 | Worker 2: 43 | 1. handle record "qwer" 44 | a. handle sub-record "asdf" 45 | b. handle sub-record "z" 46 | 47 | 48 | This can fail to make use of all available resources since the maximum concurrency is 49 | limited by the number of owned shards. If the application can tolerate the handling of 50 | sub-records in a non-strict order, it can use a `Flow`-based MapReduce-style scheme: 51 | 52 | [Worker 1] [Worker 2] (processes which produce Kinesis records) 53 | | | 54 | v v 55 | [Exmld.KinesisStage, ...] (stages receiving Exmld.KinesisWorker.Datums) 56 | | 57 | v 58 | [M1] .... [Mn] (mappers which extract items) 59 | |\ /| 60 | | \ / | 61 | | \ / | 62 | | \ / | 63 | | \ | 64 | | / \ | 65 | | / \ | 66 | | / \ | 67 | |/ \| 68 | [R1] .... 
[Rn] (reducers which handle extracted items) 69 | 70 | The number of reducers is configurable and defaults to the number of schedulers online. 71 | The processing application will specify a means of extracting a partition key from each 72 | extracted item; these will be used to consistently map items to reducers (which is where 73 | the actual application work occurs). 74 | 75 | Using the above example and specifying a sub-record's partition key as an item key: 76 | 77 | 1. Worker 1 will produce the "asdf" and "fdsa" sub-records from outer record "xyzzy" 78 | and send them to a pre-configured `Exmld.KinesisStage` (or round-robin to a list of 79 | such stages). 80 | 81 | 2. Worker 2 will similarly produce the "asdf" and "z" sub-records from outer record 82 | "qwer". 83 | 84 | 3. Each receiving stage will wrap and forward these sub-records for handling by the 85 | flow. 86 | 87 | 4. The application will have provided an "identity" item extraction function since KPL 88 | aggregation is being used here (or otherwise a function accepting one record and 89 | returning a list containing a single item). 90 | 91 | 5. The application will have provided a partition key extraction function which 92 | returns an appropriate partition key to be used in consistently mapping items to 93 | reducers. 94 | 95 | 6. The first received "asdf" sub-record is provided to some reducer `Rx`. The second 96 | received "asdf" sub-record is provided to the same reducer since its extracted key has 97 | the same hash. 98 | 99 | 7. The "fdsa" and "z" sub-records are similarly provided to some reducer `Ry` and/or 100 | `Rz` based on the hash of their partition keys. 101 | 102 | 8. The application-provided reducer function notifies each originating stage of the 103 | disposition of processing for items received from it as processing progresses. 104 | 105 | 9. Eventually, processing disposition is provided back to the originating workers, 106 | which can decide whether or not (and where) to checkpoint. 107 | 108 | """ 109 | 110 | require Record 111 | Record.defrecord(:sequence_number, Record.extract(:sequence_number, 112 | from_lib: "erlmld/include/erlmld.hrl")) 113 | Record.defrecord(:checkpoint, Record.extract(:checkpoint, 114 | from_lib: "erlmld/include/erlmld.hrl")) 115 | Record.defrecord(:stream_record, Record.extract(:stream_record, 116 | from_lib: "erlmld/include/erlmld.hrl")) 117 | 118 | @type sequence_number :: record(:sequence_number) 119 | @type checkpoint :: record(:checkpoint) 120 | @type stream_record :: record(:stream_record) 121 | @type shard_id :: binary 122 | 123 | @type item :: any 124 | @type partition_key :: any 125 | @type reducer_state :: any 126 | 127 | @doc """ 128 | Accepts a flow producing `Exmld.KinesisWorker.Datum`s (e.g., a flow created from 129 | `Exmld.KinesisStage`s) and returns another flow. 130 | """ 131 | # each stream record should be associated with the genstage which received it and the 132 | # worker which produced it. each item extracted from a stream record should indicate 133 | # the record it came from, the item id within the record, and the total number of items 134 | # in the record. the extraction and processing functions should correctly handle 135 | # heartbeats. the processing function should process as much data as possible, and 136 | # periodically inform the source genstages of all the item ids which have been 137 | # (successfully or not) processed.
those genstages in turn will maintain information 138 | # about what has been successfully processed, which the producing kinesis workers can 139 | # use when checkpointing. 140 | @spec flow(# a flow which produces `Datum`s: 141 | flow :: Flow.t, 142 | # arity-1 function mapping a datum to list of zero or more items: 143 | extract_items_fn :: ((Exmld.KinesisWorker.Datum) -> [item]), 144 | # arity-1 function or flow partition key shortcut for partitioning items: 145 | partition_key :: {:elem, non_neg_integer} 146 | | {:key, atom} 147 | | ((item) -> partition_key), 148 | # arity-0 function returning initial reducer state: 149 | state0 :: (() -> reducer_state), 150 | # arity-2 function accepting item being processed and reducer state: 151 | process_fn :: ((item, reducer_state) -> reducer_state), 152 | opts :: keyword) :: Flow.t 153 | def flow(flow, 154 | extract_items_fn, 155 | partition_key, 156 | state0, 157 | process_fn, 158 | opts \\ []) do 159 | extra = opts[:append] || &(&1) 160 | flow 161 | |> Flow.flat_map(extract_items_fn) 162 | |> Flow.partition(key: partition_key, 163 | stages: opts[:num_stages] || System.schedulers_online(), 164 | min_demand: opts[:min_demand] || 1, 165 | max_demand: opts[:max_demand] || 500, 166 | window: opts[:window] || Flow.Window.global()) 167 | |> Flow.reduce(state0, process_fn) 168 | |> extra.() 169 | end 170 | 171 | @doc """ 172 | Builds a flow from the `Exmld.KinesisStage`s named in `opts.stages` and runs it through `flow/6` using the other values in `opts`; `start_link/1` uses this to start the resulting flow under a supervisor. 173 | """ 174 | def from_stages(opts) do 175 | Flow.from_stages(opts.stages) 176 | |> flow(opts.extract_items_fn, opts.partition_key, 177 | opts.state0, opts.process_fn, opts.flow_opts) 178 | end 179 | 180 | def start_link(opts) do 181 | from_stages(opts) |> Flow.start_link() 182 | end 183 | 184 | def child_spec(opts) do 185 | %{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}} 186 | end 187 | end 188 | -------------------------------------------------------------------------------- /lib/exmld/kinesis_worker.ex: -------------------------------------------------------------------------------- 1 | defmodule Exmld.KinesisWorker do 2 | require Logger 3 | require Exmld 4 | @behaviour :erlmld_flusher 5 | 6 | defstruct [{:stages, []}, 7 | {:shard_id, nil}, 8 | {:opaque, nil}, 9 | {:counter, 0}, 10 | {:heartbeats, 0}, 11 | {:errors, 0}, 12 | {:error_callback, nil}, 13 | {:skip_errors, false}, 14 | {:done, []}, 15 | {:max_pending, 1000}, 16 | {:await_sleep_interval, 1000}, 17 | {:pending, %{}}, 18 | {:on_duplicate, :exit}] 19 | 20 | @type flusher_token :: any 21 | @type t :: %__MODULE__{# list of identifiers which can be `GenStage.call/3`ed: 22 | stages: [any], 23 | shard_id: Exmld.shard_id, 24 | opaque: any, 25 | counter: non_neg_integer, 26 | heartbeats: non_neg_integer, 27 | errors: non_neg_integer, 28 | error_callback: ((t, [Exmld.KinesisWorker.Disposition.t]) -> any) 29 | | nil, 30 | skip_errors: boolean, 31 | done: [flusher_token], 32 | max_pending: pos_integer, 33 | await_sleep_interval: non_neg_integer, 34 | # map from a {shard id, sequence number} pair to a token or a tuple of a 35 | # token and a list of {sub, total} values known so far. 36 | # 37 | # if we expect only a single disposition for a record, it was a 38 | # kpl sub-record received from upstream and we store a value 39 | # consisting of the associated token here. 40 | # 41 | # otherwise, we store {token, []} and await all outstanding 42 | # dispositions for items extracted from the originating record.
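#
# for illustration (hypothetical shard ids, sequence numbers and tokens): a kpl
# sub-record maps directly to its stored token, while a record whose items are
# extracted downstream accumulates {user_sub, user_total} pairs until all are seen:
#
#   %{{"shardId-000000000000", sn_a} => {:t, 17},
#     {"shardId-000000000001", sn_b} => {token_b, [{0, 3}, {2, 3}]}}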
43 | pending: %{optional({Exmld.shard_id, Exmld.sequence_number}) => 44 | flusher_token | {flusher_token, [{non_neg_integer, 45 | non_neg_integer}]}}, 46 | # what to do when we encounter a duplicate sequence number: 47 | on_duplicate: :exit | :skip} 48 | 49 | @moduledoc """ 50 | An [erlmld_flusher](https://github.com/AdRoll/erlmld/blob/HEAD/src/erlmld_flusher.erl) 51 | which can interface with an `Exmld.KinesisStage` data source. 52 | 53 | This implements an `erlmld_flusher` which can be used by `erlmld_batch_processor`. 54 | Unlike a typical `erlmld_flusher`, it has a different notion of fullness: if 55 | `:max_pending` or more items are in flight, the worker waits for all pending items before 56 | emitting any more for downstream processing. A periodic flush interval should be 57 | configured in the batch processor options. Similarly, the downstream stage processing 58 | pipeline should not require any kind of "full" condition and should periodically make 59 | progress (i.e., emit/flush output) even if no more records are sent. 60 | 61 | Heartbeat items are sent while the worker is waiting for pending items to be completed; 62 | these include varying counters to allow them to be automatically distributed among 63 | downstream reducers. 64 | 65 | One worker process will exist for each stream shard owned by the current node. Each 66 | such process will have been configured with a set of downstream `Exmld.KinesisStage`s 67 | which can receive records from it (actually `Exmld.KinesisWorker.Datum`s); those stages 68 | will be part of a data processing `Flow.t`. Eventually, the disposition of each 69 | record's processing will propagate back to the originating worker (as return values from 70 | `GenStage.call/3`). 71 | 72 | Periodically, `erlmld_batch_processor` will request a flush. If the flush kind is 73 | `:partial`, we return the tokens associated with the records which have already been 74 | fully processed. Otherwise, the flush kind is `:full` and we await the disposition of 75 | every outstanding record before returning. 76 | 77 | If processing of any record (or item extracted therefrom) fails, the worker will crash 78 | unless it's configured to ignore processing errors. 79 | 80 | Records presented to this worker may be ordinary records or sub-records extracted from a 81 | containing KPL-aggregated record. If KPL aggregation is not being used, but smaller 82 | sub-items are later extracted by the stage processing pipeline, the pipeline should 83 | create fake sub-record sequence numbers to track the disposition of those items (and 84 | sub-record checkpointing should be turned off). 85 | 86 | Periodically (which should be at some multiple of the periodic flush interval), 87 | `erlmld_batch_processor` will checkpoint based on the records which have so far been 88 | successfully processed (those whose tokens have been returned from `flush/2`). 89 | """ 90 | 91 | defmodule Datum do 92 | @moduledoc """ 93 | Struct for annotating stream records (or heartbeats) with additional data. 94 | 95 | To allow a single downstream processing pipeline to be used with multiple source 96 | streams, we annotate Kinesis stream records with additional data before providing them 97 | to the stage(s) processing them. (Example: `:opaque` could be a 2-tuple naming the 98 | source stream and region or otherwise indicate how to specially process the record).
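For example (hypothetical values), a datum wrapping a record received from a Kinesis
stream in us-west-2 might look like:

    %Exmld.KinesisWorker.Datum{opaque: {:kinesis, "us-west-2"},
                               shard_id: "shardId-000000000000",
                               stream_record: record}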
99 | 100 | ## Fields 101 | 102 | * `:opaque` - the opaque term provided at worker init time 103 | * `:shard_id` - name of the shard the worker is processing 104 | * `:stream_record` - a record from the stream or `{:heartbeat, _}` 105 | """ 106 | defstruct opaque: nil, shard_id: nil, stream_record: nil 107 | @type t :: %Datum{opaque: any, 108 | shard_id: Exmld.shard_id, 109 | stream_record: Exmld.stream_record | {:heartbeat, any}} 110 | end 111 | 112 | defmodule Disposition do 113 | @moduledoc """ 114 | Struct for event processing disposition. 115 | 116 | Tracks whether processing succeeded or failed for a specific record or item extracted 117 | therefrom. 118 | 119 | ## Fields 120 | 121 | * `:sequence_number` - `Exmld.sequence_number()` of the subject record. If the 122 | subject is an item extracted from a containing aggregate record, the `user_sub` and 123 | `user_total` fields should be populated (whether KPL aggregation was used or not). 124 | * `:status` - processing status 125 | """ 126 | defstruct sequence_number: nil, status: nil 127 | @type t :: %Disposition{sequence_number: Exmld.sequence_number, 128 | status: :ok | {:error, term}} 129 | end 130 | 131 | @doc """ 132 | Initialize worker state with a shard id and a set of options. 133 | 134 | An `erlmld_batch_processor` is initializing processing on `shard_id` and providing the 135 | `flusher_mod_data` which was passed to it, which should be a keyword list 136 | containing the following options; we return a flusher state to be used in subsequent 137 | operations. 138 | 139 | ## Options 140 | 141 | All optional unless marked required: 142 | 143 | * `:stages` - (required) list of `GenStage`s (values usable as the first argument to 144 | `GenStage.call/3`) which can receive `Exmld.KinesisWorker.Datum`s 145 | * `:opaque` - opaque term passed in each `Exmld.KinesisWorker.Datum` 146 | * `:skip_errors` - boolean indicating whether errors are non-fatal (if false, crash on 147 | error). 148 | * `:max_pending` - maximum number of pending items which can be in flight. 149 | * `:await_sleep_interval` - sleep time in ms between checks while awaiting pending items. 150 | * `:error_callback` - `nil` or an arity-2 function called with state and failure 151 | dispositions when processing failures occur. 152 | """ 153 | def init(shard_id, opts) do 154 | unless length(opts[:stages] || []) > 0 do 155 | exit(:no_stages_configured) 156 | end 157 | Logger.metadata(shard_id: shard_id, opaque: opts[:opaque]) 158 | struct(%__MODULE__{error_callback: &(log_errors(&1, &2)), 159 | shard_id: shard_id}, Map.new(opts)) 160 | end 161 | 162 | @doc """ 163 | Submit a new Kinesis record to the downstream pipeline for processing. 164 | 165 | A new Kinesis record is available for processing, and `erlmld_batch_processor` is 166 | instructing us to add it to the current batch. Since we really have no notion of a 167 | batch, we immediately choose a downstream stage and notify it of a new 168 | `Exmld.KinesisWorker.Datum` containing the record and make a note of it being in-flight. 169 | That call will block until a further-downstream consumer receives the record as a flow 170 | event. 171 | 172 | The result of that call will be an updated list of item dispositions. Unless configured 173 | to skip records which failed to be processed, we crash if any failed. Otherwise we 174 | update the set of done/pending items and return an updated state.
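For illustration only, `erlmld_batch_processor` invokes this callback (after `init/2`)
roughly as follows; the shard id, stage names and options here are hypothetical:

    state = Exmld.KinesisWorker.init("shardId-000000000000",
                                     stages: [:stage_1, :stage_2],
                                     max_pending: 1024)
    {:ok, state} = Exmld.KinesisWorker.add_record(state, record, token)
    {:ok, state, done_tokens} = Exmld.KinesisWorker.flush(state, :partial)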
175 | """ 176 | def add_record(%__MODULE__{max_pending: max_pending, 177 | pending: pending} = state, record, token) 178 | when map_size(pending) >= max_pending do 179 | Logger.info("#{state.shard_id} has too many pending items, awaiting...") 180 | add_record(await_pending(state), record, token) 181 | end 182 | def add_record(state, record, token) do 183 | state = 184 | state 185 | |> incr(:counter, 1) 186 | |> note_pending(record, token) 187 | |> notify_downstream(record) 188 | |> update_pending() 189 | {:ok, state} 190 | end 191 | 192 | @doc """ 193 | Return a list of tokens corresponding to records which have been fully processed, along 194 | with the latest state. 195 | 196 | If the flush kind is `:full`, we await the disposition of all outstanding records before 197 | returning. Otherwise, it's `:partial` and we return immediately (possibly with an 198 | empty result). 199 | 200 | If doing a full flush and any records fail to be successfully processed, we crash unless 201 | configured to skip failed records. 202 | """ 203 | def flush(state, kind) do 204 | %__MODULE__{done: done} = state = case kind do 205 | :partial -> 206 | state 207 | :full -> 208 | await_pending(state) 209 | end 210 | {:ok, %{state | done: []}, done} 211 | end 212 | 213 | @doc """ 214 | The batch processor has received a possibly-empty set of records from the 215 | MultiLangDaemon and is informing the flusher (us). Send a heartbeat downstream and 216 | return any completed tokens. This allows progress to be made even if no more records 217 | appear on the stream. 218 | """ 219 | def heartbeat(state) do 220 | %__MODULE__{done: done} = state = do_heartbeat(state) 221 | {:ok, %{state | done: []}, done} 222 | end 223 | 224 | # a record and token have been provided. if the record is a kpl sub-record, it has 225 | # base, user_sub, and user_total fields populated, and we expect a single disposition for it. this 226 | # is also the case for erlmld's custom kpl-like aggregated records (erlmld will make sure to 227 | # extract the sub-records and populate user_sub and user_total). 228 | # otherwise, it's a normal record (non-kpl) which will later have items extracted from it (we 229 | # don't know how many), and we'll expect multiple dispositions for it (each containing a 230 | # faked sequence number with base, user_sub, and user_total populated); once we receive 231 | # all of those, it's done. 232 | defp note_pending(%__MODULE__{pending: pending, shard_id: shard_id} = state, record, token) do 233 | sn = Exmld.stream_record(record, :sequence_number) 234 | note_pending(state, token, sn, Map.has_key?(pending, {shard_id, sn})) 235 | end 236 | defp note_pending(%__MODULE__{on_duplicate: :exit, shard_id: shard_id}, _token, sn, true) do 237 | # we received the same sequence number for two records; this should not happen.
238 | Logger.warning("duplicate sequence number #{inspect(sn)} on shard #{shard_id}; exiting...") 239 | exit({:duplicate_seqno, sn}) 240 | end 241 | defp note_pending(%__MODULE__{on_duplicate: :skip, shard_id: shard_id} = state, _token, sn, true) do 242 | Logger.warning("duplicate sequence number #{inspect(sn)} on shard #{shard_id}; skipping...") 243 | state 244 | end 245 | defp note_pending(%__MODULE__{pending: pending, shard_id: shard_id} = state, token, sn, false) do 246 | expect_multiple = :undefined == Exmld.sequence_number(sn, :user_sub) 247 | stored_token = maybe_standard_token(token, sn) 248 | %{state | pending: Map.put(pending, {shard_id, sn}, case expect_multiple do 249 | true -> 250 | {stored_token, []} 251 | false -> 252 | stored_token 253 | end)} 254 | end 255 | 256 | # take advantage of the normal (but opaque and currently unsupported) {N, SN} 257 | # representation of tokens to avoid storing the sequence number twice: 258 | defp maybe_standard_token({n, sn}, sn_) when sn == sn_ do 259 | {:t, n} 260 | end 261 | defp maybe_standard_token(t, _) do 262 | t 263 | end 264 | 265 | defp maybe_wrap_token({:t, n}, sn) do 266 | {n, sn} 267 | end 268 | defp maybe_wrap_token(t, _) do 269 | t 270 | end 271 | 272 | # a list of finished sequence numbers has been provided. either: 273 | # 274 | # 1. we received a kpl sub-record from upstream and it was passed to a reducer. we are 275 | # now receiving a sequence number with base, user_sub, and user_total fields populated. that 276 | # sub-record would have been associated with one flusher token, which is now done. 277 | # 278 | # or: 279 | # 280 | # 2. we received a normal record from upstream and sub-records were later extracted by 281 | # the application. the application should have assigned sequence numbers with user_sub 282 | # and user_total fields populated when informing us of disposition. once all such items 283 | # are done, we can consider the token associated with the original parent record as 284 | # done. 285 | defp update_pending({state, completed_sequence_numbers}) do 286 | completed_sequence_numbers 287 | |> Enum.reduce(state, &update_pending_1/2) 288 | end 289 | 290 | defp update_pending_1(sn, %__MODULE__{pending: pending, done: done, shard_id: shard_id} = state) do 291 | case Map.pop(pending, {shard_id, sn}) do 292 | {nil, pending} -> 293 | # the sequence number doesn't exist in pending. this will happen if the sequence 294 | # number has user_sub and user_total fields populated and a non-aggregate record was 295 | # received from upstream. that non-aggregate record's sequence number (lacking 296 | # user_sub/user_total fields) was used as the key, and the value will be {token, [..]}. 297 | sub = Exmld.sequence_number(sn, :user_sub) 298 | total = Exmld.sequence_number(sn, :user_total) 299 | if :undefined == sub do 300 | exit({:missing_pending, sn}) 301 | end 302 | key = {shard_id, Exmld.sequence_number(sn, user_sub: :undefined, user_total: :undefined)} 303 | {{token, seen}, pending} = Map.pop(pending, key) 304 | seen = [{sub, total} | seen] 305 | # if all expected items have been received, move the token to done.
otherwise, 306 | # continue building the seen list. 307 | case all_done(seen) do 308 | true -> 309 | %{state | pending: pending, done: [maybe_wrap_token(token, sn) | done]} 310 | false -> 311 | %{state | pending: Map.put(pending, key, {token, seen})} 312 | end 313 | {token, pending} -> 314 | %{state | pending: pending, done: [maybe_wrap_token(token, sn) | done]} 315 | end 316 | end 317 | 318 | defp all_done([]) do 319 | false 320 | end 321 | defp all_done([{_sub, total} | _] = values) do 322 | case length(values) do 323 | ^total -> 324 | # every item must have the same total value, and each sub must be unique and cover 325 | # the range 0..user_total-1. 326 | expected = MapSet.new(0..(total-1)) 327 | actual = MapSet.new(Enum.map(values, &(elem(&1, 0)))) 328 | if not MapSet.equal?(expected, actual) do 329 | exit({:unexpected_sub_ids, expected, actual}) 330 | end 331 | true 332 | _ -> 333 | false 334 | end 335 | end 336 | 337 | # while awaiting pending items, we spew heartbeats to all downstream stages so we can 338 | # obtain disposition of prior items. if this happens frequently, the downstream 339 | # pipeline can't keep up with the producer, so its parameters should be tuned. 340 | defp await_pending(%__MODULE__{await_sleep_interval: sleep_interval, 341 | pending: pending} = state) when map_size(pending) > 0 do 342 | Logger.debug("#{state.shard_id} awaiting #{inspect map_size(pending)} items...") 343 | :timer.sleep(sleep_interval) 344 | state 345 | |> do_heartbeat() 346 | |> await_pending() 347 | end 348 | defp await_pending(state) do 349 | state 350 | end 351 | 352 | # send a heartbeat downstream and note any returned item dispositions. 353 | defp do_heartbeat(state) do 354 | state 355 | |> incr(:heartbeats, 1) 356 | |> notify_downstream(:heartbeat) 357 | |> update_pending() 358 | end 359 | 360 | # notify a downstream processing stage of a record or heartbeat and handle any returned 361 | # item dispositions, returning an updated state and a list of processed sequence 362 | # numbers. 363 | # 364 | # if processing of an item has failed and we aren't configured to skip failed records, 365 | # we crash. otherwise we call any configured error callback and skip the failed items. 366 | defp notify_downstream(%__MODULE__{shard_id: shard_id, 367 | opaque: opaque, 368 | stages: stages} = state, :heartbeat) do 369 | {state, dispositions} = 370 | stages 371 | |> Enum.reduce({0, {:disposition, []}}, 372 | fn (stage, {n, x}) -> 373 | datum = %Datum{shard_id: shard_id, 374 | opaque: opaque, 375 | stream_record: {:heartbeat, {state.counter, state.heartbeats, n}}} 376 | {:disposition, y} = Exmld.KinesisStage.notify(stage, datum) 377 | {n + 1, put_elem(x, 1, y ++ elem(x, 1))} 378 | end) 379 | |> elem(1) 380 | |> handle_errors(state) 381 | 382 | {state, Enum.map(dispositions, &(&1.sequence_number))} 383 | end 384 | defp notify_downstream(%__MODULE__{shard_id: shard_id, 385 | opaque: opaque} = state, thing) do 386 | {state, dispositions} = 387 | choose_stage(state) 388 | |> Exmld.KinesisStage.notify(%Datum{shard_id: shard_id, 389 | opaque: opaque, 390 | stream_record: thing}) 391 | |> handle_errors(state) 392 | 393 | # we either had no errors, exited, or called a configured error callback for an error.
394 | # at this point, consider all items as successfully processed and return their 395 | # sequence numbers: 396 | {state, Enum.map(dispositions, &(&1.sequence_number))} 397 | end 398 | 399 | defp choose_stage(%__MODULE__{stages: stages, heartbeats: heartbeats, counter: count}) do 400 | Enum.at(stages, rem(count + heartbeats, length(stages))) 401 | end 402 | 403 | defp handle_errors({:disposition, prior_dispositions}, 404 | %__MODULE__{shard_id: shard_id, 405 | opaque: opaque, 406 | error_callback: cb, 407 | skip_errors: skip_errors} = state) do 408 | failed = prior_dispositions 409 | |> Enum.reject(fn (%Disposition{status: status}) -> 410 | status == :ok 411 | end) 412 | 413 | result = {incr(state, :errors, length(failed)), prior_dispositions} 414 | 415 | case failed do 416 | [] -> 417 | result 418 | _ -> 419 | if cb do 420 | cb.(state, failed) 421 | end 422 | 423 | case skip_errors do 424 | true -> 425 | result 426 | _ -> 427 | exit({:processing_failed, 428 | shard_id: shard_id, opaque: opaque, failures: failed}) 429 | end 430 | end 431 | end 432 | 433 | defp log_errors(_state, failed) do 434 | Logger.error("processing failed: #{inspect failed}") 435 | end 436 | 437 | defp incr(state, field, n) do 438 | Map.put(state, field, n + Map.fetch!(state, field)) 439 | end 440 | end 441 | --------------------------------------------------------------------------------