├── LICENSE ├── README.md ├── pipeline.py └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 datastack.tv 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎥 Apache Beam Explained in 12 Minutes 2 | 3 | Source code for the YouTube video, Apache Beam explained in 12 minutes. Watch the video [here]()! 4 | 5 | ## Install dependencies 6 | 7 | ```bash 8 | pip install -r requirements.txt 9 | ``` 10 | 11 | ## Run pipeline locally 12 | 13 | ```bash 14 | python pipeline.py 15 | ``` 16 | 17 | This command produces a text file called `output-00000-of-00001`. 
# pipeline.py -- count clicks per user within session windows using Apache Beam.
#
# Requires the `apache-beam` package (see requirements.txt). Run locally with
# `python pipeline.py`; this produces a text file called `output-00000-of-00001`
# whose contents look like the following:
#
#     ('Andy', 2)
#     ('Andy', 1)
#     ('Sam', 1)
import apache_beam as beam
from apache_beam.transforms.window import (
    TimestampedValue,
    Sessions,
    Duration,
)
from apache_beam.io.textio import WriteToText


class AddTimestampDoFn(beam.DoFn):
    """Convert a click-event dict into a timestamped ``(userId, click)`` tuple.

    The event's ``timestamp`` field is attached to the emitted element as
    metadata so that window functions applied later can use it to group
    elements into windows.
    """

    def process(self, element):
        """Emit one timestamped key/value pair for ``element``.

        Args:
            element: Mapping with the keys ``"userId"``, ``"click"`` and
                ``"timestamp"`` (a Unix timestamp).

        Yields:
            A ``TimestampedValue`` wrapping ``(userId, click)`` and carrying
            the event's timestamp.
        """
        event_time = element["timestamp"]
        yield TimestampedValue((element["userId"], element["click"]), event_time)


# Sample click events fed into the pipeline. The trailing comments give the
# clock time each Unix timestamp corresponds to; note that the last event is
# earlier than the one listed before it.
# fmt: off
CLICK_EVENTS = [
    {"userId": "Andy", "click": 1, "timestamp": 1603112520},  # Event time: 13:02
    {"userId": "Sam", "click": 1, "timestamp": 1603113240},  # Event time: 13:14
    {"userId": "Andy", "click": 1, "timestamp": 1603115820},  # Event time: 13:57
    {"userId": "Andy", "click": 1, "timestamp": 1603113600},  # Event time: 13:20
]
# fmt: on

# Sessions must be separated by a time gap of at least 30 minutes (1800 sec).
SESSION_GAP_SECONDS = 30 * 60

# Allowed lateness for late-arriving data: 1 day, expressed in seconds.
ALLOWED_LATENESS_SECONDS = 1 * 24 * 60 * 60


with beam.Pipeline() as p:
    events = p | beam.Create(CLICK_EVENTS)

    # Copy each event's timestamp into the element metadata so that Beam's
    # window functions can access it when grouping events.
    timestamped_events = events | "AddTimestamp" >> beam.ParDo(AddTimestampDoFn())

    session_windowing = beam.WindowInto(
        Sessions(gap_size=SESSION_GAP_SECONDS),
        # Triggers decide when the aggregated result of a window is emitted.
        # The default trigger outputs the result once it estimates that all
        # data has arrived and discards subsequent data for that window.
        trigger=None,
        # A trigger can fire more than once; the accumulation mode controls
        # whether window panes are accumulated across firings or discarded.
        accumulation_mode=None,
        # Policy for combining the timestamps that fall within a window. It
        # only matters when a grouping operation is applied to the windows.
        timestamp_combiner=None,
        # Setting allowed_lateness lets the pipeline handle late data: with it
        # set, the default trigger emits new results immediately whenever
        # late data arrives.
        allowed_lateness=Duration(seconds=ALLOWED_LATENESS_SECONDS),
    )
    windowed_events = timestamped_events | session_windowing

    # CombinePerKey with the predefined sum function adds up all click values
    # for each key (user) within each window.
    sum_clicks = windowed_events | beam.CombinePerKey(sum)

    # WriteToText writes the results to a simple text file.
    sum_clicks | WriteToText(file_path_prefix="output")