37 | # 1. Extract a key/value pair from each GameEvent corresponding to the given
38 | # field ('user' or 'team') and the score.
39 | # 2. Compute the sum of the scores for each key.
40 | # 3. Run your pipeline on the Dataflow service.
41 | return (p
42 | | 'extract_field' >> beam.Map(ChangeMeFunction)
43 | # Select the appropriate transform to compute the sum over each key.
44 | | ChangeMeTransform()
45 | )
46 | # [END EXERCISE 1]
47 |
48 |
49 | def FormatUserScoreSum(element):
50 | """Format a KV of user and their score to a BigQuery TableRow."""
51 | user, total_score = element
52 | return {'user': user, 'total_score': total_score}
53 |
54 |
55 | def Run(argv=None):
56 | known_args, pipeline_args = ParseArgs(argv)
57 | pipeline_options = PipelineOptions(pipeline_args)
58 | pipeline_options.view_as(SetupOptions).save_main_session = True
59 | p = beam.Pipeline(options=pipeline_options)
60 |
61 | project = pipeline_options.view_as(GoogleCloudOptions).project
62 | # Read events from a CSV file and parse them.
63 | _ = (p
64 | | 'read' >> ReadFromText(known_args.input)
65 | | 'parse' >> beam.FlatMap(ParseEvent)
66 | | 'extract_user_score' >> ExtractAndSumScore('user')
67 | | 'format_user_score_sum' >> beam.Map(FormatUserScoreSum)
68 | | beam.io.WriteToBigQuery(known_args.output_tablename,
69 | known_args.output_dataset, project, SCHEMA)
70 | )
71 |
72 | p.run().wait_until_finish()
73 |
74 |
75 | if __name__ == '__main__':
76 | logging.getLogger().setLevel(logging.INFO)
77 | Run()
78 |
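Note on the EXERCISE 1 placeholders: the intended shape is the
extract-then-combine pattern shown in /py/solutions/exercise1.py later in
this tree. A minimal sketch of the completed block:

    # Map each GameEvent to a (key, score) pair for the configured field,
    # then sum the scores per key.
    return (p
            | 'extract_field' >> beam.Map(
                lambda x: (vars(x)[self.field], x.score))
            | beam.CombinePerKey(sum)
            )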
--------------------------------------------------------------------------------
/py/exercises/exercise1.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/exercises/exercise1.pyc
--------------------------------------------------------------------------------
/py/exercises/exercise2.py:
--------------------------------------------------------------------------------
1 | # This batch pipeline calculates the sum of scores per team per hour, over an
2 | # entire batch of gaming data and writes the per-team sums to BigQuery.
3 | from __future__ import absolute_import
4 |
5 | import logging
6 | import re
7 |
8 | import apache_beam as beam
9 | from apache_beam.io import ReadFromText
10 | from apache_beam.io import WriteToText
11 | from apache_beam.metrics import Metrics
12 | from apache_beam.metrics.metric import MetricsFilter
13 | from apache_beam.options.pipeline_options import PipelineOptions
14 | from apache_beam.options.pipeline_options import SetupOptions
15 | from apache_beam.options.pipeline_options import GoogleCloudOptions
16 | from util.util import GameEvent
17 | from util.util import ParseEvent
18 | from util.util import ParseArgs
19 | import apache_beam.transforms.window as window
20 |
21 | # Defines the BigQuery schema.
22 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
23 |
24 |
25 | class ExtractAndSumScore(beam.PTransform):
26 | def __init__(self, field):
27 | super(ExtractAndSumScore, self).__init__()
28 | self.field = field
29 |
30 | def expand(self, p):
31 | return (p
32 | |'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score))
33 | | beam.CombinePerKey(sum)
34 | )
35 |
36 |
37 | class WindowedTeamScore(beam.PTransform):
38 | """A transform to compute the WindowedTeamScore."""
39 | def __init__(self, duration):
40 | super(WindowedTeamScore, self).__init__()
41 | self.duration = duration
42 |
43 | def expand(self, p):
44 | # [START EXERCISE 2]:
45 | # Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
46 | # Also: https://cloud.google.com/dataflow/model/windowing
47 | return (p
48 | # beam.WindowInto takes a WindowFn and returns a PTransform that applies windowing.
49 | # window.FixedWindows returns a WindowFn that assigns elements into fixed-size
50 | # windows. Use these methods to apply windows of size self.duration.
51 | | 'window' >> ChangeMeTransform()
52 | # Use the ExtractAndSumScore to compute the 'team' sum.
53 | | 'extract_team_score' >> ChangeMeTransform()
54 | )
55 | # [END EXERCISE 2]
56 |
57 |
58 | class FormatTeamScoreSum(beam.DoFn):
59 | """Format a KV of user and their score to a BigQuery TableRow."""
60 | def process(self, team_score, window=beam.DoFn.WindowParam):
61 | team, score = team_score
62 | start = int(window.start)
63 | yield {
64 | 'team': team,
65 | 'total_score': score,
66 | 'window_start': start,
67 | }
68 |
69 |
70 | def Run(argv=None):
71 | known_args, pipeline_args = ParseArgs(argv)
72 | pipeline_options = PipelineOptions(pipeline_args)
73 | pipeline_options.view_as(SetupOptions).save_main_session = True
74 | p = beam.Pipeline(options=pipeline_options)
75 |
76 | project = pipeline_options.view_as(GoogleCloudOptions).project
77 | _ = (p
78 | | 'read' >> ReadFromText(known_args.input)
79 | | 'parse' >> beam.FlatMap(ParseEvent)
80 | | 'add_event_timestamps' >> beam.Map(
81 | lambda x: beam.window.TimestampedValue(x, x.timestamp))
82 | | 'windowed_team_score' >> WindowedTeamScore(60 * 60)
83 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
84 | | beam.io.WriteToBigQuery(known_args.output_tablename,
85 | known_args.output_dataset, project, SCHEMA)
86 | )
87 | p.run().wait_until_finish()
88 |
89 |
90 | if __name__ == '__main__':
91 | logging.getLogger().setLevel(logging.INFO)
92 | Run()
93 |
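Note on the EXERCISE 2 placeholders: both reduce to standard transforms. A
minimal sketch of the completed expand, consistent with
/py/solutions/exercise2.py:

    # Assign events to fixed windows of self.duration seconds, then reuse
    # the per-key sum keyed on 'team'.
    return (p
            | 'window' >> beam.WindowInto(window.FixedWindows(self.duration))
            | 'extract_team_score' >> ExtractAndSumScore('team')
            )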
--------------------------------------------------------------------------------
/py/exercises/exercise3.py:
--------------------------------------------------------------------------------
1 | # This pipeline calculates the sum of scores per team per hour and writes the
2 | # per-team sums to BigQuery. The pipeline can be run in either batch or
3 | # streaming mode, reading from either a data file or Pub/Sub topic.
4 | #
5 | # You will need to create a Pub/Sub topic and run the Java Injector
6 | # in order to get game events over Pub/Sub. Please refer to the instructions
7 | # here: https://github.com/malo-denielou/DataflowSME
8 | from __future__ import absolute_import
9 |
10 | import logging
11 | import re
12 |
13 | import apache_beam as beam
14 | from apache_beam.io import ReadFromText
15 | from apache_beam.io import ReadFromPubSub
16 | from apache_beam.io import WriteToText
17 | from apache_beam.metrics import Metrics
18 | from apache_beam.metrics.metric import MetricsFilter
19 | from apache_beam.options.pipeline_options import PipelineOptions
20 | from apache_beam.options.pipeline_options import SetupOptions
21 | from apache_beam.options.pipeline_options import StandardOptions
22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
23 | from util.util import GameEvent
24 | from util.util import ParseEvent
25 | from util.util import ParseEventFn
26 | from util.util import ParseArgs
27 | import apache_beam.transforms.window as window
28 | from solutions.exercise1 import ExtractAndSumScore
29 |
30 | # Defines the BigQuery schema.
31 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
32 |
33 |
34 | class ExtractAndSumScore(beam.PTransform):
35 | def __init__(self, field):
36 | super(ExtractAndSumScore, self).__init__()
37 | self.field = field
38 |
39 | def expand(self, p):
40 | return (p
41 | | 'extract_field' >> beam.Map(
42 | lambda x: (vars(x)[self.field], x.score))
43 | | beam.CombinePerKey(sum)
44 | )
45 |
46 |
47 | class WindowedTeamScore(beam.PTransform):
48 | """A transform to compute a windowed team score."""
49 | def __init__(self, duration):
50 | super(WindowedTeamScore, self).__init__()
51 | self.duration = duration
52 |
53 | def expand(self, p):
54 | return (p
55 | | 'window' >> beam.WindowInto(
56 | window.FixedWindows(self.duration))
57 | | 'extract_team_score' >> ExtractAndSumScore('team')
58 | )
59 |
60 |
61 | class FormatTeamScoreSum(beam.DoFn):
62 | """Format a KV of user and their score to a BigQuery TableRow."""
63 | def process(self, team_score, window=beam.DoFn.WindowParam):
64 | team, score = team_score
65 | start = int(window.start)
66 | yield {
67 | 'team': team,
68 | 'total_score': score,
69 | 'window_start': start,
70 | }
71 |
72 |
73 | def Run(argv=None):
74 | known_args, pipeline_args = ParseArgs(argv)
75 | pipeline_options = PipelineOptions(pipeline_args)
76 | pipeline_options.view_as(SetupOptions).save_main_session = True
77 | p = beam.Pipeline(options=pipeline_options)
78 | window_duration = 1 * 60 # 1 minute windows.
79 | if known_args.topic:
80 | pipeline_options.view_as(StandardOptions).streaming = True
81 |
82 | project = pipeline_options.view_as(GoogleCloudOptions).project
83 | timestamp_attribute = 'timestamp_ms'
84 | events = None
85 | if (not known_args.topic):
86 | events = (p
87 | | 'read' >> ReadFromText(known_args.input)
88 | | 'parse' >> beam.FlatMap(ParseEventFn())
89 | | 'add_event_timestamps' >> beam.Map(
90 | lambda x: beam.window.TimestampedValue(x, x.timestamp))
91 | )
92 | else:
93 | # [START EXERCISE 3]:
94 | # Read game events from the Pub/Sub topic using custom timestamps,
95 | # which are in an attribute labeled 'timestamp_ms'.
96 | # Use beam.io.ReadFromPubSub to read from the topic.
97 | # https://beam.apache.org/releases/pydoc/2.8.0/apache_beam.io.gcp.pubsub.html
98 | events = (p
99 | | 'read' >> ChangeMe()
100 | | 'decode' >> beam.ParDo(ParseEventFn())
101 | )
102 |
103 | _ = (events
104 | | 'windowed_team_score' >> WindowedTeamScore(window_duration)
105 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
106 | | beam.io.WriteToBigQuery(known_args.output_tablename,
107 | known_args.output_dataset, project, SCHEMA)
108 | )
109 | p.run().wait_until_finish()
110 |
111 |
112 | if __name__ == '__main__':
113 | logging.getLogger().setLevel(logging.INFO)
114 | Run()
115 |
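Note on the EXERCISE 3 placeholder: ReadFromPubSub accepts a
timestamp_attribute argument, so element timestamps can be drawn from the
'timestamp_ms' message attribute. A minimal sketch, consistent with
/py/solutions/exercise3.py:

    # Read from the topic, taking event time from the 'timestamp_ms'
    # attribute, then parse each message into a GameEvent.
    events = (p
              | 'read' >> ReadFromPubSub(topic=known_args.topic,
                                         timestamp_attribute='timestamp_ms')
              | 'decode' >> beam.ParDo(ParseEventFn())
              )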
--------------------------------------------------------------------------------
/py/exercises/exercise4.py:
--------------------------------------------------------------------------------
1 | # This pipeline calculates the sum of scores per team per hour and writes the
2 | # per-team sums to BigQuery. Additionally computes running user scores (e.g.,
3 | # as a leaderboard) and updates them regularly.
4 |
5 | # The pipeline can be run in either batch or streaming mode, reading from
6 | # either a data file or Pub/Sub topic.
7 | from __future__ import absolute_import
8 |
9 | import logging
10 | import re
11 | import time
12 |
13 | import apache_beam as beam
14 | from apache_beam.io import ReadFromText
15 | from apache_beam.io import ReadFromPubSub
16 | from apache_beam.io import WriteToText
17 | from apache_beam.metrics import Metrics
18 | from apache_beam.metrics.metric import MetricsFilter
19 | from apache_beam.options.pipeline_options import PipelineOptions
20 | from apache_beam.options.pipeline_options import SetupOptions
21 | from apache_beam.options.pipeline_options import StandardOptions
22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
23 | from apache_beam.transforms import trigger
24 | from util.util import GameEvent
25 | from util.util import ParseEvent
26 | from util.util import ParseEventFn
27 | from util.util import ParseArgs
28 | import apache_beam.transforms.window as window
29 | from solutions.exercise1 import ExtractAndSumScore
30 |
31 | # Defines the BigQuery schemas.
32 | USER_SCHEMA = ('user:STRING,'
33 | 'total_score:INTEGER,'
34 | 'processing_time:TIMESTAMP')
35 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
36 |
37 |
38 | class ExtractAndSumScore(beam.PTransform):
39 | def __init__(self, field):
40 | super(ExtractAndSumScore, self).__init__()
41 | self.field = field
42 |
43 | def expand(self, p):
44 | return (p | 'extract_field' >>
45 | beam.Map(lambda x: (vars(x)[self.field], x.score)) |
46 | beam.CombinePerKey(sum))
47 |
48 |
49 | class RunningUserScores(beam.PTransform):
50 | """Extract user/score pairs via global windowing and emit perioidic updates
51 | on all users' running scores.
52 | """
53 | def __init__(self, allowed_lateness=0):
54 | super(RunningUserScores, self).__init__()
55 |
56 | def expand(self, p):
57 | # NOTE: allowed_lateness is not yet available in Python FixedWindows.
58 | # NOTE: AfterProcessingTime not yet available in Python.
59 | # [START EXERCISE 4.1]:
60 | # Compute a leaderboard by windowing user scores into the global window.
61 | # Since we will want to see running results, trigger the window early,
62 | # after every 100 elements. Make sure to accumulate fired panes.
63 | # https://beam.apache.org/documentation/programming-guide/#triggers
64 | return (p
65 | | 'window' >> ChangeMe()
66 | | 'extract_user_score' >> ExtractAndSumScore('user')
67 | )
68 | # [END EXERCISE 4.1]
69 |
70 |
71 | class WindowedTeamScore(beam.PTransform):
72 | """Calculates scores for each team within the configured window duration"""
73 |
74 | def __init__(self, duration):
75 | super(WindowedTeamScore, self).__init__()
76 | self.duration = duration
77 |
78 | def expand(self, p):
79 | # [START EXERCISE 4.2]:
80 | # Window team scores into windows of fixed duration. Trigger these windows
81 | # on-time with the watermark, but also speculatively every 100 elements.
82 | # Ensure correct totals for the watermark-triggered pane by accumulating
83 | # over all data.
84 | return (p
85 | | 'window' >> ChangeMe()
86 | | 'extract_team_score' >> ExtractAndSumScore('team')
87 | )
88 | # [END EXERCISE 4.2]
89 |
90 |
91 | class FormatTeamScoreSum(beam.DoFn):
92 | """Format a KV of team and its score to a BigQuery TableRow."""
93 | def process(self, team_score, window=beam.DoFn.WindowParam):
94 | team, score = team_score
95 | start = int(window.start)
96 | yield {
97 | 'team': team,
98 | 'total_score': score,
99 | 'window_start': start,
100 | }
101 |
102 |
103 | class FormatUserScoreSum(beam.DoFn):
104 | """Format a KV of user and their score to a BigQuery TableRow."""
105 | def process(self, user_score, window=beam.DoFn.WindowParam):
106 | user, score = user_score
107 | yield {
108 | 'user': user,
109 | 'total_score': score,
110 | 'processing_time': time.time(),
111 | }
112 |
113 |
114 | def Run(argv=None):
115 | known_args, pipeline_args = ParseArgs(argv)
116 | pipeline_options = PipelineOptions(pipeline_args)
117 | pipeline_options.view_as(SetupOptions).save_main_session = True
118 | p = beam.Pipeline(options=pipeline_options)
119 | window_duration = 1 * 60 # 1 minute windows.
120 | if known_args.topic:
121 | pipeline_options.view_as(StandardOptions).streaming = True
122 |
123 | project = pipeline_options.view_as(GoogleCloudOptions).project
124 | timestamp_attribute = 'timestamp_ms'
125 | events = None
126 | if (not known_args.topic):
127 | events = (p
128 | | 'read' >> ReadFromText(known_args.input)
129 | | 'parse' >> beam.FlatMap(ParseEventFn())
130 | | 'add_event_timestamps' >> beam.Map(
131 | lambda x: beam.window.TimestampedValue(x, x.timestamp))
132 | )
133 | else:
134 | events = (p
135 | | 'read' >> ReadFromPubSub(topic=known_args.topic,
136 | timestamp_attribute='timestamp_ms')
137 | | 'decode' >> beam.ParDo(ParseEventFn())
138 | )
139 |
140 |   # Window team scores and write them to BigQuery.
141 | _ = (events
142 | | 'windowed_team_score' >> WindowedTeamScore(window_duration)
143 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
144 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
145 | known_args.output_tablename + '_team', known_args.output_dataset,
146 | project, TEAM_SCHEMA)
147 | )
148 |
149 | # Write leaderboards to BigQuery.
150 | _ = (events
151 | | 'running_user_score' >> RunningUserScores()
152 | | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum())
153 | | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery(
154 | known_args.output_tablename + '_user', known_args.output_dataset,
155 | project, USER_SCHEMA)
156 | )
157 |
158 | p.run().wait_until_finish()
159 |
160 |
161 | if __name__ == '__main__':
162 | logging.getLogger().setLevel(logging.INFO)
163 | Run()
164 |
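Note on the EXERCISE 4 placeholders: both are beam.WindowInto calls with
triggers. The 4.1 shape appears verbatim in /py/solutions/exercise4.py; the
4.2 shape below is a sketch under the same trigger API, since its exact
solution is not reproduced in this tree:

    # 4.1 (RunningUserScores.expand): global window, fire early after every
    # 100 elements, and accumulate panes so each firing is a running total.
    return (p
            | 'window' >> beam.WindowInto(
                beam.window.GlobalWindows(),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(100)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
            | 'extract_user_score' >> ExtractAndSumScore('user')
            )

    # 4.2 (WindowedTeamScore.expand, sketch): fixed windows with the same
    # speculative early firings; accumulating keeps the watermark-triggered
    # pane a correct total over all data seen so far.
    return (p
            | 'window' >> beam.WindowInto(
                window.FixedWindows(self.duration),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(100)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
            | 'extract_team_score' >> ExtractAndSumScore('team')
            )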
--------------------------------------------------------------------------------
/py/exercises/exercise5.py:
--------------------------------------------------------------------------------
1 | # Filter 'cheating' or 'spammy' users from the game results.
2 | # Computes the global mean score and filters users that are
3 | # some threshold above that score.
4 | from __future__ import absolute_import
5 |
6 | import logging
7 | import re
8 | import time
9 |
10 | import apache_beam as beam
11 | from apache_beam.io import ReadFromText
12 | from apache_beam.io import ReadFromPubSub
13 | from apache_beam.io import WriteToText
14 | from apache_beam.metrics import Metrics
15 | from apache_beam.metrics.metric import MetricsFilter
16 | from apache_beam.options.pipeline_options import PipelineOptions
17 | from apache_beam.options.pipeline_options import SetupOptions
18 | from apache_beam.options.pipeline_options import StandardOptions
19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
20 | from apache_beam.transforms import trigger
21 | from util.util import GameEvent
22 | from util.util import ParseEvent
23 | from util.util import ParseEventFn
24 | from util.util import ParseArgs
25 | import apache_beam.transforms.window as window
26 |
27 | # Defines the BigQuery schemas.
28 | USER_SCHEMA = ('user:STRING,'
29 | 'total_score:INTEGER,'
30 | 'processing_time:TIMESTAMP')
31 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
32 |
33 |
34 | class ExtractAndSumScore(beam.PTransform):
35 | def __init__(self, field):
36 | super(ExtractAndSumScore, self).__init__()
37 | self.field = field
38 |
39 | def expand(self, p):
40 | return (p
41 | | 'extract_field' >> beam.Map(
42 | lambda x: (vars(x)[self.field], x.score))
43 | | beam.CombinePerKey(sum)
44 | )
45 |
46 |
47 | class WindowedUserScores(beam.PTransform):
48 | """Extract user/score pairs via in fixed windows."""
49 | def __init__(self, duration):
50 | super(WindowedUserScores, self).__init__()
51 | self.duration = duration
52 |
53 | def expand(self, p):
54 | return (p
55 | | 'window' >> beam.WindowInto(
56 | window.FixedWindows(self.duration))
57 | | 'extract_user_score' >> ExtractAndSumScore('user')
58 | )
59 |
60 |
61 | class FilterUser(beam.DoFn):
62 | """Filter a user if their score * score_weight > avg_score."""
63 | def __init__(self, score_weight):
64 | super(FilterUser, self).__init__()
65 | self.score_weight = score_weight
66 | self.num_spammy_users = Metrics.counter(self.__class__,
67 | 'num_spammy_users')
68 |
69 | def process(self, user_score, avg_score=beam.DoFn.SideInputParam):
70 | user, score = user_score
71 | if score * self.score_weight > avg_score:
72 | logging.error('User %s filtered as spammy', user)
73 | self.num_spammy_users.inc()
74 | yield user
75 |
76 |
77 | class ComputeSpammyUsers(beam.PTransform):
78 | """Compute users with a high clickrate, which we will consider spammy.
79 | We do this by finding the mean total score per user and filter out
80 | those with scores that are greater than the mean * score_weight
81 | """
82 | def __init__(self, score_weight):
83 | super(ComputeSpammyUsers, self).__init__()
84 | self.score_weight = score_weight
85 |
86 | def expand(self, p):
87 | # [START EXERCISE 5.1]:
88 | # Extract the score for each user, and compute the mean.
89 | # Create a singleton PCollection view to be used in
90 | # compute_spammers.
91 | # https://beam.apache.org/documentation/programming-guide/#combine
92 | avg_score = (p
93 | | 'extract_score' >> ChangeMe()
94 | | 'compute_mean' >> ChangeMe()
95 | )
96 | # [END EXERCISE 5.1]
97 | return (p
98 | | 'compute_spammers' >> beam.ParDo(
99 | FilterUser(self.score_weight), avg_score=avg_score)
100 | )
101 |
102 |
103 | class FilterSpammers(beam.DoFn):
104 | """Remove users found in the spam list."""
105 | def __init__(self):
106 | super(FilterSpammers, self).__init__()
107 | self.filtered_scores = Metrics.counter(self.__class__,
108 | 'filtered_scores')
109 |
110 | def process(self, elem, spammers=beam.DoFn.SideInputParam):
111 | user = elem.user
112 | if user not in spammers:
113 | yield elem
114 | else:
115 | self.filtered_scores.inc()
116 |
117 |
118 | class WindowedTeamScore(beam.PTransform):
119 | """Calculates scores for each team within the configured window duration"""
120 | def __init__(self, duration, spammers):
121 | super(WindowedTeamScore, self).__init__()
122 | self.duration = duration
123 | self.spammers = spammers
124 |
125 | def expand(self, p):
126 | return (p
127 | | 'window' >> beam.WindowInto(
128 | window.FixedWindows(self.duration))
129 | | 'filter_spammers' >> beam.ParDo(
130 | FilterSpammers(), spammers=self.spammers)
131 | | 'extract_team_score' >> ExtractAndSumScore('team')
132 | )
133 |
134 |
135 | class FormatTeamScoreSum(beam.DoFn):
136 | def process(self, team_score, window=beam.DoFn.WindowParam):
137 | team, score = team_score
138 | start = int(window.start)
139 | yield {
140 | 'team': team,
141 | 'total_score': score,
142 | 'window_start': start,
143 | }
144 |
145 |
146 | class FormatUserScoreSum(beam.DoFn):
147 | def process(self, user_score, window=beam.DoFn.WindowParam):
148 | user, score = user_score
149 | yield {
150 | 'user': user,
151 | 'total_score': score,
152 | 'processing_time': time.time(),
153 | }
154 |
155 |
156 | def Run(argv=None):
157 | known_args, pipeline_args = ParseArgs(argv)
158 | pipeline_options = PipelineOptions(pipeline_args)
159 | pipeline_options.view_as(SetupOptions).save_main_session = True
160 | p = beam.Pipeline(options=pipeline_options)
161 | window_duration = 1 * 60 # 1 minute windows.
162 | if known_args.topic:
163 | pipeline_options.view_as(StandardOptions).streaming = True
164 |
165 | project = pipeline_options.view_as(GoogleCloudOptions).project
166 | timestamp_attribute = 'timestamp_ms'
167 | events = None
168 | if (not known_args.topic):
169 | events = (p
170 | | 'read' >> ReadFromText(known_args.input)
171 | | 'parse' >> beam.FlatMap(ParseEventFn())
172 | | 'add_event_timestamps' >> beam.Map(
173 | lambda x: beam.window.TimestampedValue(x, x.timestamp)))
174 | else:
175 | events = (p
176 | | 'read' >> ReadFromPubSub(
177 | topic=known_args.topic,
178 | timestamp_attribute='timestamp_ms')
179 | | 'decode' >> beam.ParDo(ParseEventFn()))
180 |
181 | user_scores = (events
182 | | 'window_user_scores' >> WindowedUserScores(window_duration))
183 | spammers = beam.pvalue.AsList(user_scores
184 | | 'compute_spammers' >> ComputeSpammyUsers(2.5))
185 |
186 | _ = (events
187 | | 'windowed_team_score' >> WindowedTeamScore(window_duration, spammers)
188 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
189 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
190 | known_args.output_tablename, known_args.output_dataset, project,
191 | TEAM_SCHEMA)
192 | )
193 |
194 | p.run().wait_until_finish()
195 |
196 |
197 | if __name__ == '__main__':
198 | logging.getLogger().setLevel(logging.INFO)
199 | Run()
200 |
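Note on the EXERCISE 5.1 placeholders: the mean is computed globally over the
per-user scores and materialized as a singleton view. A minimal sketch,
consistent with /py/solutions/exercise5.py:

    # Drop the user keys, compute the global mean score, and expose it as a
    # singleton side input for the FilterUser ParDo.
    avg_score = (p
                 | 'extract_score' >> beam.Values()
                 | 'compute_mean' >> beam.CombineGlobally(
                     beam.combiners.MeanCombineFn()).as_singleton_view()
                 )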
--------------------------------------------------------------------------------
/py/exercises/exercise6.py:
--------------------------------------------------------------------------------
1 | # This pipeline computes the average duration of user sessions. The
2 | # averages are windowed, to reflect durations differing over time.
3 | from __future__ import absolute_import
4 |
5 | import logging
6 | import re
7 | import time
8 |
9 | import apache_beam as beam
10 | import apache_beam.transforms.window as window
11 | from apache_beam.io import ReadFromText
12 | from apache_beam.io import ReadFromPubSub
13 | from apache_beam.io import WriteToText
14 | from apache_beam.metrics import Metrics
15 | from apache_beam.metrics.metric import MetricsFilter
16 | from apache_beam.options.pipeline_options import PipelineOptions
17 | from apache_beam.options.pipeline_options import SetupOptions
18 | from apache_beam.options.pipeline_options import StandardOptions
19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
20 | from apache_beam.transforms import trigger
21 | from util.util import GameEvent
22 | from util.util import ParseEvent
23 | from util.util import ParseEventFn
24 | from util.util import ParseArgs
25 |
26 | # Defines the BigQuery schemas.
27 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
28 |
29 |
30 | class UserSessionActivity(beam.DoFn):
31 | """Compute the duration of a user's session."""
32 | def process(self,
33 | elem,
34 | timestamp=beam.DoFn.TimestampParam,
35 | window=beam.DoFn.WindowParam):
36 | duration = int(window.end) - int(window.start)
37 | yield duration
38 |
39 |
40 | class FormatSessionMeans(beam.DoFn):
41 | """Format session means for output to BQ"""
42 | def process(self, elem, window=beam.DoFn.WindowParam):
43 | yield {'window_start': int(window.start), 'mean_duration': elem}
44 |
45 |
46 | def Run(argv=None):
47 | known_args, pipeline_args = ParseArgs(argv)
48 | pipeline_options = PipelineOptions(pipeline_args)
49 | pipeline_options.view_as(SetupOptions).save_main_session = True
50 | p = beam.Pipeline(options=pipeline_options)
51 | if known_args.topic:
52 | pipeline_options.view_as(StandardOptions).streaming = True
53 |
54 | project = pipeline_options.view_as(GoogleCloudOptions).project
55 | timestamp_attribute = 'timestamp_ms'
56 | events = None
57 | if (not known_args.topic):
58 | events = (p
59 | | 'read' >> ReadFromText(known_args.input)
60 | | 'parse' >> beam.FlatMap(ParseEventFn())
61 | | 'add_event_timestamps' >> beam.Map(
62 | lambda x: beam.window.TimestampedValue(x, x.timestamp)))
63 | else:
64 | events = (p
65 | | 'read' >> ReadFromPubSub(
66 | topic=known_args.topic,
67 | timestamp_attribute='timestamp_ms')
68 | | 'parse' >> beam.ParDo(ParseEventFn()))
69 |
70 | # [START EXERCISE 6]
71 | _ = (events
72 | | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
73 | # Extract sessions of user data, using known_args.session_gap as the
74 | # gap duration.
75 | # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
76 | | 'sessionize' >> ChangeMe()
77 | | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
78 | | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
79 | # Re-window into fixed windows of size user_activity_window in order
80 | # to compute the mean session duration for that window of activity.
81 | | 'window_of_sessions' >> ChangeMe()
82 | | 'session_mean' >> ChangeMe()
83 | # [END EXERCISE 6]
84 | | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
85 | | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
86 | known_args.output_tablename, known_args.output_dataset, project,
87 | SESSION_SCHEMA)
88 | )
89 |
90 | p.run().wait_until_finish()
91 |
92 |
93 | if __name__ == '__main__':
94 | logging.getLogger().setLevel(logging.INFO)
95 | Run()
96 |
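Note on the EXERCISE 6 placeholders: no solution for this exercise is
reproduced in this tree, so the following is a sketch assuming the standard
Sessions and FixedWindows WindowFns:

    | 'sessionize' >> beam.WindowInto(
        window.Sessions(known_args.session_gap))
    | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
    | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
    # Re-window the session durations into fixed windows and average them;
    # without_defaults() is required because CombineGlobally runs under
    # non-global windowing here.
    | 'window_of_sessions' >> beam.WindowInto(
        window.FixedWindows(known_args.user_activity_window))
    | 'session_mean' >> beam.CombineGlobally(
        beam.combiners.MeanCombineFn()).without_defaults()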
--------------------------------------------------------------------------------
/py/exercises/exercise7.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import logging
4 | import re
5 | import time
6 |
7 | import apache_beam as beam
8 | import apache_beam.transforms.window as window
9 | from apache_beam.io import ReadFromText
10 | from apache_beam.io import ReadFromPubSub
11 | from apache_beam.io import WriteToText
12 | from apache_beam.metrics import Metrics
13 | from apache_beam.metrics.metric import MetricsFilter
14 | from apache_beam.options.pipeline_options import PipelineOptions
15 | from apache_beam.options.pipeline_options import SetupOptions
16 | from apache_beam.options.pipeline_options import StandardOptions
17 | from apache_beam.options.pipeline_options import GoogleCloudOptions
18 | from apache_beam.transforms import trigger
19 | from util.util import GameEvent
20 | from util.util import ParseEvent
21 | from util.util import ParseEventFn
22 | from util.util import ParsePlayEventFn
23 | from util.util import ParseArgs
24 |
25 | # Defines the BigQuery schemas.
26 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
27 |
28 |
29 | class ComputeLatency(beam.DoFn):
30 | def __init__(self):
31 | super(ComputeLatency, self).__init__()
32 | self.dropped_sessions_no_events = Metrics.counter(
33 | self.__class__, 'dropped_sessions_no_events')
34 | self.dropped_sessions_too_many_events = Metrics.counter(
35 | self.__class__, 'dropped_sessions_too_many_events')
36 | self.dropped_sessions_no_play_events = Metrics.counter(
37 | self.__class__, 'dropped_sessions_no_play_events')
38 |
39 | def process(self, elem):
40 | _, vals = elem
41 | plays = vals['plays']
42 | events = vals['events']
43 |
44 | play_count = 0
45 | max_play_ts = 0
46 | for play in plays:
47 | play_count += 1
48 | max_play_ts = max(max_play_ts, long(play.timestamp))
49 |
50 | event_count = 0
51 | an_event = None
52 | for event in events:
53 | an_event = event
54 | event_count += 1
55 |
56 | if event_count == 0:
57 | self.dropped_sessions_no_events.inc()
58 | elif event_count > 1:
59 | self.dropped_sessions_too_many_events.inc()
60 | elif play_count == 0:
61 | self.dropped_sessions_no_play_events.inc()
62 | else:
63 | min_latency = long(an_event.timestamp) - max_play_ts
64 | yield (an_event.user, min_latency)
65 |
66 |
67 | class DetectBadUsers(beam.DoFn):
68 | def process(self, elem, mean_latency=beam.DoFn.SideInputParam):
69 | user, latency = elem
70 |     # Naive heuristic: flag users whose latency is less than one
71 |     # fifth of the mean.
72 |     if latency < mean_latency / 5:
73 | yield user
74 |
75 |
76 | def Run(argv=None):
77 | known_args, pipeline_args = ParseArgs(argv)
78 | pipeline_options = PipelineOptions(pipeline_args)
79 | pipeline_options.view_as(SetupOptions).save_main_session = True
80 | p = beam.Pipeline(options=pipeline_options)
81 | if known_args.topic:
82 | pipeline_options.view_as(StandardOptions).streaming = True
83 |
84 | project = pipeline_options.view_as(GoogleCloudOptions).project
85 | timestamp_attribute = 'timestamp_ms'
86 | events = None
87 | if (not known_args.topic or not known_args.play_topic):
88 | logging.fatal('topic and play_topic are required.')
89 |
90 | # [START EXERCISE 7]:
91 | # 1. Read game events with message id and timestamp.
92 | # 2. Parse events.
93 | events = (p
94 | | 'read_events' >> ChangeMe()
95 | | 'parse_events' >> ChangeMe()
96 | )
97 |
98 | # 1. Read play events with message id and timestamp.
99 | # 2. Parse events.
100 | play_events = (p
101 | | 'read_play_events' >> ChangeMe()
102 | | 'parse_play_events' >> ChangeMe()
103 | )
104 |
105 | # 1. Key events by event id.
106 | # 2. Sessionize.
107 | sessionized_events = (events
108 | | 'key_events_by_id' >> ChangeMe()
109 |       | 'sessionize_events' >> ChangeMe())
110 |
111 | # 1. Key play events by event id.
112 | # 2. Sessionize.
113 | sessionized_plays = (play_events
114 | | 'key_plays_by_id' >> ChangeMe()
115 |       | 'sessionize_plays' >> ChangeMe())
116 |
117 | # 1. Join events using CoGroupByKey
118 | # 2. Compute latency using ComputeLatency
119 | per_user_latency = (
120 |       {'plays': ChangeMe(), 'events': ChangeMe()}
121 | | 'cbk' >> ChangeMe()
122 |       | 'compute_latency' >> ChangeMe())
123 |
124 | # 1. Get values of per user latencies
125 | # 2. Re-window into GlobalWindows that triggers repeatedly after 1000 new elements.
126 | # 3. Compute the global mean to be used as a side input.
127 | mean_latency = (per_user_latency
128 | | 'extract_latencies' >> ChangeMe()
129 | | 'global_window' >> ChangeMe()
130 | | 'compute_mean' >> ChangeMe()
131 | )
132 | # [END EXERCISE 7]
133 |
134 | # Filter out bad users.
135 | _ = (per_user_latency
136 | | 'detect_bad_users' >> beam.ParDo(
137 | DetectBadUsers(), mean_latency=mean_latency)
138 | | 'filter_duplicates' >> beam.WindowInto(
139 | window.GlobalWindows(), trigger=trigger.AfterCount(1),
140 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
141 | | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
142 | | 'write_bad_users' >> beam.io.WriteToBigQuery(
143 |         known_args.output_tablename, known_args.output_dataset, project, 'user:STRING')
144 | )
145 |
146 | p.run().wait_until_finish()
147 |
148 |
149 | if __name__ == '__main__':
150 | logging.getLogger().setLevel(logging.INFO)
151 | Run()
152 |
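Note on the EXERCISE 7 placeholders: no solution for this exercise appears in
this tree, so the following is a sketch. The 'unique_id' attribute name and
the event_id field on parsed events are assumptions for illustration, not
confirmed by the repo:

    # Read with a de-duplication id and a custom timestamp attribute.
    events = (p
              | 'read_events' >> ReadFromPubSub(
                  topic=known_args.topic,
                  id_label='unique_id',  # assumed attribute name
                  timestamp_attribute='timestamp_ms')
              | 'parse_events' >> beam.ParDo(ParseEventFn())
              )

    # Key by the shared event id and sessionize before joining; the play
    # branch is symmetric, using ParsePlayEventFn.
    sessionized_events = (events
        | 'key_events_by_id' >> beam.Map(lambda e: (e.event_id, e))  # assumed field
        | 'sessionize_events' >> beam.WindowInto(
            window.Sessions(known_args.session_gap))
        )

    # Join plays and events per key; the dict keys must match what
    # ComputeLatency reads out of vals.
    per_user_latency = (
        {'plays': sessionized_plays, 'events': sessionized_events}
        | 'cbk' >> beam.CoGroupByKey()
        | 'compute_latency' >> beam.ParDo(ComputeLatency())
        )

    # Global mean latency, refreshed after every 1000 elements and wrapped
    # as a singleton side input for DetectBadUsers.
    mean_latency = beam.pvalue.AsSingleton(per_user_latency
        | 'extract_latencies' >> beam.Values()
        | 'global_window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'compute_mean' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).without_defaults()
        )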
--------------------------------------------------------------------------------
/py/run0.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise0 --input gs://sme-training/game/small.csv \
2 | --output_dataset sme \
3 | --output_tablename exercise0 \
4 | --runner DataflowRunner \
5 | --project YOUR_PROJECT \
6 | --temp_location gs://YOUR_BUCKET/staging \
7 | --setup_file ./setup.py
8 |
--------------------------------------------------------------------------------
/py/run1.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise1 --input gs://sme-training/game/small.csv \
2 | --output_dataset sme \
3 | --output_tablename exercise1 \
4 | --runner DataflowRunner \
5 | --project YOUR_PROJECT \
6 | --temp_location gs://YOUR_BUCKET/tmp/ \
7 | --setup_file ./setup.py
8 |
--------------------------------------------------------------------------------
/py/run2.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise2 --input gs://sme-training/game/small.csv \
2 | --output_dataset sme \
3 | --output_tablename exercise2 \
4 | --runner DataflowRunner \
5 | --project YOUR_PROJECT \
6 | --temp_location gs://YOUR_BUCKET/tmp/ \
7 | --setup_file ./setup.py
8 |
--------------------------------------------------------------------------------
/py/run3.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise3 \
2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 | --output_dataset sme \
4 | --output_tablename exercise3 \
5 | --runner DataflowRunner \
6 | --project YOUR_PROJECT \
7 | --temp_location gs://YOUR_BUCKET/staging \
8 | --setup_file ./setup.py
9 |
--------------------------------------------------------------------------------
/py/run4.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise4 \
2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 | --output_dataset sme \
4 | --output_tablename exercise4 \
5 | --runner DataflowRunner \
6 | --project YOUR_PROJECT \
7 | --temp_location gs://YOUR_BUCKET/staging \
8 | --setup_file ./setup.py
9 |
--------------------------------------------------------------------------------
/py/run5.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise5 \
2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 | --output_dataset sme \
4 | --output_tablename exercise5 \
5 | --runner DataflowRunner \
6 | --project YOUR_PROJECT \
7 | --temp_location gs://YOUR_BUCKET/staging \
8 | --setup_file ./setup.py
9 |
--------------------------------------------------------------------------------
/py/run6.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise6 \
2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 | --output_dataset sme \
4 | --output_tablename exercise6 \
5 | --runner DataflowRunner \
6 | --project YOUR_PROJECT \
7 | --user_activity_window 240 \
8 | --session_gap 60 \
9 | --temp_location gs://YOUR_BUCKET/staging \
10 | --setup_file ./setup.py
11 |
--------------------------------------------------------------------------------
/py/run7.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise7 \
2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 | --play_topic projects/YOUR_PROJECT/topics/YOUR_TOPIC-play \
4 | --output_dataset sme \
5 | --output_tablename exercise7 \
6 | --runner DataflowRunner \
7 | --project YOUR_PROJECT \
8 | --session_gap 20 \
9 | --temp_location gs://YOUR_BUCKET/staging \
10 | --setup_file ./setup.py
11 |
--------------------------------------------------------------------------------
/py/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | setuptools.setup(
3 | name='sme-training',
4 | version='1.0',
5 | install_requires=[],
6 | packages=setuptools.find_packages(),
7 | )
8 |
--------------------------------------------------------------------------------
/py/solutions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/__init__.py
--------------------------------------------------------------------------------
/py/solutions/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/__init__.pyc
--------------------------------------------------------------------------------
/py/solutions/exercise0.py:
--------------------------------------------------------------------------------
1 | # This batch pipeline imports game events from CSV to BigQuery.
2 | from __future__ import absolute_import
3 |
4 | import logging
5 | import re
6 |
7 | import apache_beam as beam
8 | from apache_beam.io import ReadFromText
9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 |
19 | # Defines the BigQuery schema.
20 | SCHEMA = ('user:STRING,' 'team:STRING,' 'score:INTEGER,' 'timestamp:TIMESTAMP')
21 |
22 |
23 | def FormatEvent(element):
24 | """Format a GameEvent to a BigQuery TableRow."""
25 | return {
26 | 'user': element.user,
27 | 'team': element.team,
28 | 'score': element.score,
29 | 'timestamp': element.timestamp
30 | }
31 |
32 |
33 | def Run(argv=None):
34 | """Run a batch pipeline."""
35 | known_args, pipeline_args = ParseArgs(argv)
36 | pipeline_options = PipelineOptions(pipeline_args)
37 | pipeline_options.view_as(SetupOptions).save_main_session = True
38 | p = beam.Pipeline(options=pipeline_options)
39 |
40 | project = pipeline_options.view_as(GoogleCloudOptions).project
41 | # Read events from a CSV file, parse them and write (import) them to BigQuery.
42 | _ = (p
43 | | 'read' >> ReadFromText(known_args.input)
44 | | 'parse' >> beam.FlatMap(ParseEvent)
45 | | 'format' >> beam.Map(FormatEvent)
46 | | beam.io.WriteToBigQuery(known_args.output_tablename,
47 | known_args.output_dataset, project, SCHEMA)
48 | )
49 | p.run().wait_until_finish()
50 |
51 |
52 | if __name__ == '__main__':
53 | logging.getLogger().setLevel(logging.INFO)
54 | Run()
55 |
--------------------------------------------------------------------------------
/py/solutions/exercise1.py:
--------------------------------------------------------------------------------
1 | # This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data and writes the sums to BigQuery.
2 | from __future__ import absolute_import
3 |
4 | import logging
5 | import re
6 |
7 | import apache_beam as beam
8 | from apache_beam.io import ReadFromText
9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 |
19 | # Defines the BigQuery schema.
20 | SCHEMA = ('user:STRING,' 'total_score:INTEGER')
21 |
22 |
23 | class ExtractAndSumScore(beam.PTransform):
24 | """A transform to extract key/score information from GameEvent, and sum
25 | the scores. The constructor arg determines whether 'team' or 'user' info is
26 | extracted."""
27 | def __init__(self, field):
28 | super(ExtractAndSumScore, self).__init__()
29 | self.field = field
30 |
31 | def expand(self, p):
32 | return (p
33 | | 'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score))
34 | | beam.CombinePerKey(sum)
35 | )
36 |
37 |
38 | def FormatUserScoreSum(element):
39 | """Format a KV of user and their score to a BigQuery TableRow."""
40 | user, total_score = element
41 | return {'user': user, 'total_score': total_score}
42 |
43 |
44 | def Run(argv=None):
45 | known_args, pipeline_args = ParseArgs(argv)
46 | pipeline_options = PipelineOptions(pipeline_args)
47 | pipeline_options.view_as(SetupOptions).save_main_session = True
48 | p = beam.Pipeline(options=pipeline_options)
49 |
50 | project = pipeline_options.view_as(GoogleCloudOptions).project
51 | # Read events from a CSV file and parse them.
52 | _ = (p
53 | | 'read' >> ReadFromText(known_args.input)
54 | | 'parse' >> beam.FlatMap(ParseEvent)
55 | | 'extract_user_score' >> ExtractAndSumScore('user')
56 | | 'format_user_score_sum' >> beam.Map(FormatUserScoreSum)
57 | | beam.io.WriteToBigQuery(known_args.output_tablename,
58 | known_args.output_dataset, project, SCHEMA)
59 | )
60 |
61 | p.run().wait_until_finish()
62 |
63 |
64 | if __name__ == '__main__':
65 | logging.getLogger().setLevel(logging.INFO)
66 | Run()
67 |
--------------------------------------------------------------------------------
/py/solutions/exercise1.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/exercise1.pyc
--------------------------------------------------------------------------------
/py/solutions/exercise2.py:
--------------------------------------------------------------------------------
1 | # This batch pipeline calculates the sum of scores per team per hour, over an entire batch of gaming data and writes the per-team sums to BigQuery.
2 | from __future__ import absolute_import
3 |
4 | import logging
5 | import re
6 |
7 | import apache_beam as beam
8 | from apache_beam.io import ReadFromText
9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 | import apache_beam.transforms.window as window
19 |
20 | # Defines the BigQuery schema.
21 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
22 |
23 |
24 | class ExtractAndSumScore(beam.PTransform):
25 | def __init__(self, field):
26 | super(ExtractAndSumScore, self).__init__()
27 | self.field = field
28 |
29 | def expand(self, p):
30 | return (p
31 | |'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score))
32 | | beam.CombinePerKey(sum)
33 | )
34 |
35 |
36 | class WindowedTeamScore(beam.PTransform):
37 | """A transform to compute the WindowedTeamScore."""
38 | def __init__(self, duration):
39 | super(WindowedTeamScore, self).__init__()
40 | self.duration = duration
41 |
42 | def expand(self, p):
43 | return (p
44 | | 'window' >> beam.WindowInto(window.FixedWindows(self.duration))
45 | | 'extract_team_score' >> ExtractAndSumScore('team')
46 | )
47 |
48 |
49 | class FormatTeamScoreSum(beam.DoFn):
50 | """Format a KV of user and their score to a BigQuery TableRow."""
51 | def process(self, team_score, window=beam.DoFn.WindowParam): ##????
52 | team, score = team_score
53 | start = int(window.start)
54 | yield {
55 | 'team': team,
56 | 'total_score': score,
57 | 'window_start': start,
58 | }
59 |
60 |
61 | def Run(argv=None):
62 | known_args, pipeline_args = ParseArgs(argv)
63 | pipeline_options = PipelineOptions(pipeline_args)
64 | pipeline_options.view_as(SetupOptions).save_main_session = True
65 | p = beam.Pipeline(options=pipeline_options)
66 |
67 | project = pipeline_options.view_as(GoogleCloudOptions).project
68 | _ = (p
69 | | 'read' >> ReadFromText(known_args.input)
70 | | 'parse' >> beam.FlatMap(ParseEvent)
71 | | 'add_event_timestamps' >> beam.Map(
72 | lambda x: beam.window.TimestampedValue(x, x.timestamp))
73 | | 'windowed_team_score' >> WindowedTeamScore(60 * 60)
74 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
75 | | beam.io.WriteToBigQuery(known_args.output_tablename,
76 | known_args.output_dataset, project, SCHEMA)
77 | )
78 | p.run().wait_until_finish()
79 |
80 |
81 | if __name__ == '__main__':
82 | logging.getLogger().setLevel(logging.INFO)
83 | Run()
84 |
--------------------------------------------------------------------------------
/py/solutions/exercise3.py:
--------------------------------------------------------------------------------
1 | # This pipeline calculates the sum of scores per team per hour and writes the
2 | # per-team sums to BigQuery. The pipeline can be run in either batch or
3 | # streaming mode, reading from either a data file or Pub/Sub topic.
4 | #
5 | # You will need to create a Pub/Sub topic and run the Java Injector
6 | # in order to get game events over Pub/Sub. Please refer to the instructions
7 | # here: https://github.com/malo-denielou/DataflowSME
8 | from __future__ import absolute_import
9 |
10 | import logging
11 | import re
12 |
13 | import apache_beam as beam
14 | from apache_beam.io import ReadFromText
15 | from apache_beam.io import ReadFromPubSub
16 | from apache_beam.io import WriteToText
17 | from apache_beam.metrics import Metrics
18 | from apache_beam.metrics.metric import MetricsFilter
19 | from apache_beam.options.pipeline_options import PipelineOptions
20 | from apache_beam.options.pipeline_options import SetupOptions
21 | from apache_beam.options.pipeline_options import StandardOptions
22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
23 | from util.util import GameEvent
24 | from util.util import ParseEvent
25 | from util.util import ParseEventFn
26 | from util.util import ParseArgs
27 | import apache_beam.transforms.window as window
28 | from solutions.exercise1 import ExtractAndSumScore
29 |
30 | # Defines the BigQuery schema.
31 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
32 |
33 |
34 | class ExtractAndSumScore(beam.PTransform):
35 | def __init__(self, field):
36 | super(ExtractAndSumScore, self).__init__()
37 | self.field = field
38 |
39 | def expand(self, p):
40 | return (p
41 | | 'extract_field' >> beam.Map(
42 | lambda x: (vars(x)[self.field], x.score))
43 | | beam.CombinePerKey(sum)
44 | )
45 |
46 |
47 | class WindowedTeamScore(beam.PTransform):
48 | """A transform to compute a windowed team score."""
49 | def __init__(self, duration):
50 | super(WindowedTeamScore, self).__init__()
51 | self.duration = duration
52 |
53 | def expand(self, p):
54 | return (p
55 | | 'window' >> beam.WindowInto(
56 | window.FixedWindows(self.duration))
57 | | 'extract_team_score' >> ExtractAndSumScore('team')
58 | )
59 |
60 |
61 | class FormatTeamScoreSum(beam.DoFn):
62 | """Format a KV of user and their score to a BigQuery TableRow."""
63 | def process(self, team_score, window=beam.DoFn.WindowParam):
64 | team, score = team_score
65 | start = int(window.start)
66 | yield {
67 | 'team': team,
68 | 'total_score': score,
69 | 'window_start': start,
70 | }
71 |
72 |
73 | def Run(argv=None):
74 | known_args, pipeline_args = ParseArgs(argv)
75 | pipeline_options = PipelineOptions(pipeline_args)
76 | pipeline_options.view_as(SetupOptions).save_main_session = True
77 | p = beam.Pipeline(options=pipeline_options)
78 | window_duration = 1 * 60 # 1 minute windows.
79 | if known_args.topic:
80 | pipeline_options.view_as(StandardOptions).streaming = True
81 |
82 | project = pipeline_options.view_as(GoogleCloudOptions).project
83 | timestamp_attribute = 'timestamp_ms'
84 | events = None
85 | if (not known_args.topic):
86 | events = (p
87 | | 'read' >> ReadFromText(known_args.input)
88 | | 'parse' >> beam.FlatMap(ParseEventFn())
89 | | 'add_event_timestamps' >> beam.Map(
90 | lambda x: beam.window.TimestampedValue(x, x.timestamp))
91 | )
92 | else:
93 | events = (p
94 | | 'read' >> ReadFromPubSub(topic=known_args.topic,
95 | timestamp_attribute='timestamp_ms')
96 | | 'decode' >> beam.ParDo(ParseEventFn())
97 | )
98 |
99 | _ = (events
100 | | 'windowed_team_score' >> WindowedTeamScore(window_duration)
101 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
102 | | beam.io.WriteToBigQuery(known_args.output_tablename,
103 | known_args.output_dataset, project, SCHEMA)
104 | )
105 | p.run().wait_until_finish()
106 |
107 |
108 | if __name__ == '__main__':
109 | logging.getLogger().setLevel(logging.INFO)
110 | Run()
111 |
--------------------------------------------------------------------------------
/py/solutions/exercise4.py:
--------------------------------------------------------------------------------
1 | # This pipeline calculates the sum of scores per team per hour and writes the
2 | # per-team sums to BigQuery. Additionally computes running user scores (e.g.,
3 | # as a leaderboard) and updates them regularly.
4 |
5 | # The pipeline can be run in either batch or streaming mode, reading from
6 | # either a data file or Pub/Sub topic.
7 | from __future__ import absolute_import
8 |
9 | import logging
10 | import re
11 | import time
12 |
13 | import apache_beam as beam
14 | from apache_beam.io import ReadFromText
15 | from apache_beam.io import ReadFromPubSub
16 | from apache_beam.io import WriteToText
17 | from apache_beam.metrics import Metrics
18 | from apache_beam.metrics.metric import MetricsFilter
19 | from apache_beam.options.pipeline_options import PipelineOptions
20 | from apache_beam.options.pipeline_options import SetupOptions
21 | from apache_beam.options.pipeline_options import StandardOptions
22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
23 | from apache_beam.transforms import trigger
24 | from util.util import GameEvent
25 | from util.util import ParseEvent
26 | from util.util import ParseEventFn
27 | from util.util import ParseArgs
28 | import apache_beam.transforms.window as window
29 | from solutions.exercise1 import ExtractAndSumScore
30 |
31 | # Defines the BigQuery schemas.
32 | USER_SCHEMA = ('user:STRING,'
33 | 'total_score:INTEGER,'
34 | 'processing_time:TIMESTAMP')
35 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
36 |
37 |
38 | class ExtractAndSumScore(beam.PTransform):
39 | def __init__(self, field):
40 | super(ExtractAndSumScore, self).__init__()
41 | self.field = field
42 |
43 | def expand(self, p):
44 | return (p | 'extract_field' >>
45 | beam.Map(lambda x: (vars(x)[self.field], x.score)) |
46 | beam.CombinePerKey(sum))
47 |
48 |
49 | class RunningUserScores(beam.PTransform):
50 | """Extract user/score pairs via global windowing and emit perioidic updates
51 | on all users' running scores.
52 | """
53 | def __init__(self, allowed_lateness=0):
54 | super(RunningUserScores, self).__init__()
55 |
56 | def expand(self, p):
57 | # NOTE: allowed_lateness is not yet available in Python FixedWindows.
58 | # NOTE: AfterProcessingTime not yet available in Python.
59 | return (p
60 | | 'window' >> beam.WindowInto(
61 | beam.window.GlobalWindows(),
62 | trigger=trigger.AfterWatermark(early=trigger.AfterCount(100)),
63 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
64 | | 'extract_user_score' >> ExtractAndSumScore('user')
65 | )
66 |
67 |
68 | class WindowedTeamScore(beam.PTransform):
69 | """Calculates scores for each team within the configured window duration"""
70 | def __init__(self, duration):
71 | super(WindowedTeamScore, self).__init__()
72 | self.duration = duration
73 |
74 | def expand(self, p):
75 | return (p
76 | | 'window' >> beam.WindowInto(window.FixedWindows(self.duration))
77 | | 'extract_team_score' >> ExtractAndSumScore('team')
78 | )
79 |
80 |
81 | class FormatTeamScoreSum(beam.DoFn):
82 | """Format a KV of team and its score to a BigQuery TableRow."""
83 | def process(self, team_score, window=beam.DoFn.WindowParam):
84 | team, score = team_score
85 | start = int(window.start)
86 | yield {
87 | 'team': team,
88 | 'total_score': score,
89 | 'window_start': start,
90 | }
91 |
92 |
93 | class FormatUserScoreSum(beam.DoFn):
94 | """Format a KV of user and their score to a BigQuery TableRow."""
95 | def process(self, user_score, window=beam.DoFn.WindowParam):
96 | user, score = user_score
97 | yield {
98 | 'user': user,
99 | 'total_score': score,
100 | 'processing_time': time.time(),
101 | }
102 |
103 |
104 | def Run(argv=None):
105 | known_args, pipeline_args = ParseArgs(argv)
106 | pipeline_options = PipelineOptions(pipeline_args)
107 | pipeline_options.view_as(SetupOptions).save_main_session = True
108 | p = beam.Pipeline(options=pipeline_options)
109 | window_duration = 1 * 60 # 1 minute windows.
110 | if known_args.topic:
111 | pipeline_options.view_as(StandardOptions).streaming = True
112 |
113 | project = pipeline_options.view_as(GoogleCloudOptions).project
114 | timestamp_attribute = 'timestamp_ms'
115 | events = None
116 | if (not known_args.topic):
117 | events = (p
118 | | 'read' >> ReadFromText(known_args.input)
119 | | 'parse' >> beam.FlatMap(ParseEventFn())
120 | | 'add_event_timestamps' >> beam.Map(
121 | lambda x: beam.window.TimestampedValue(x, x.timestamp))
122 | )
123 | else:
124 | events = (p
125 | | 'read' >> ReadFromPubSub(topic=known_args.topic,
126 | timestamp_attribute='timestamp_ms')
127 | | 'decode' >> beam.ParDo(ParseEventFn())
128 | )
129 |
130 |   # Window team scores and write them to BigQuery.
131 | _ = (events
132 | | 'windowed_team_score' >> WindowedTeamScore(window_duration)
133 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
134 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
135 | known_args.output_tablename + '_team', known_args.output_dataset,
136 | project, TEAM_SCHEMA)
137 | )
138 |
139 | # Write leaderboards to BigQuery.
140 | _ = (events
141 | | 'running_user_score' >> RunningUserScores()
142 | | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum())
143 | | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery(
144 | known_args.output_tablename + '_user', known_args.output_dataset,
145 | project, USER_SCHEMA)
146 | )
147 |
148 | p.run().wait_until_finish()
149 |
150 |
151 | if __name__ == '__main__':
152 | logging.getLogger().setLevel(logging.INFO)
153 | Run()
154 |
--------------------------------------------------------------------------------
/py/solutions/exercise5.py:
--------------------------------------------------------------------------------
1 | # Filter 'cheating' or 'spammy' users from the game results.
2 | # Computes the global mean score and filters users that are
3 | # some threshold above that score.
4 | from __future__ import absolute_import
5 |
6 | import logging
7 | import re
8 | import time
9 |
10 | import apache_beam as beam
11 | from apache_beam.io import ReadFromText
12 | from apache_beam.io import ReadFromPubSub
13 | from apache_beam.io import WriteToText
14 | from apache_beam.metrics import Metrics
15 | from apache_beam.metrics.metric import MetricsFilter
16 | from apache_beam.options.pipeline_options import PipelineOptions
17 | from apache_beam.options.pipeline_options import SetupOptions
18 | from apache_beam.options.pipeline_options import StandardOptions
19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
20 | from apache_beam.transforms import trigger
21 | from util.util import GameEvent
22 | from util.util import ParseEvent
23 | from util.util import ParseEventFn
24 | from util.util import ParseArgs
25 | import apache_beam.transforms.window as window
26 |
27 | # Defines the BigQuery schemas.
28 | USER_SCHEMA = ('user:STRING,'
29 | 'total_score:INTEGER,'
30 | 'processing_time:TIMESTAMP')
31 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
32 |
33 |
34 | class ExtractAndSumScore(beam.PTransform):
35 | def __init__(self, field):
36 | super(ExtractAndSumScore, self).__init__()
37 | self.field = field
38 |
39 | def expand(self, p):
40 | return (p
41 | | 'extract_field' >> beam.Map(
42 | lambda x: (vars(x)[self.field], x.score))
43 | | beam.CombinePerKey(sum)
44 | )
45 |
46 |
47 | class WindowedUserScores(beam.PTransform):
48 | """Extract user/score pairs via in fixed windows."""
49 | def __init__(self, duration):
50 | super(WindowedUserScores, self).__init__()
51 | self.duration = duration
52 |
53 | def expand(self, p):
54 | return (p
55 | | 'window' >> beam.WindowInto(
56 | window.FixedWindows(self.duration))
57 | | 'extract_user_score' >> ExtractAndSumScore('user')
58 | )
59 |
60 |
61 | class FilterUser(beam.DoFn):
62 |   """Flag a user as spammy if their score > avg_score * score_weight."""
63 | def __init__(self, score_weight):
64 | super(FilterUser, self).__init__()
65 | self.score_weight = score_weight
66 | self.num_spammy_users = Metrics.counter(self.__class__,
67 | 'num_spammy_users')
68 |
69 | def process(self, user_score, avg_score=beam.DoFn.SideInputParam):
70 | user, score = user_score
71 |     if score > avg_score * self.score_weight:
72 | logging.error('User %s filtered as spammy', user)
73 | self.num_spammy_users.inc()
74 | yield user
75 |
76 |
77 | class ComputeSpammyUsers(beam.PTransform):
78 | """Compute users with a high clickrate, which we will consider spammy.
79 |   We do this by finding the mean total score per user and flagging
80 |   those whose score is greater than the mean * score_weight.
81 | """
82 | def __init__(self, score_weight):
83 | super(ComputeSpammyUsers, self).__init__()
84 | self.score_weight = score_weight
85 |
86 | def expand(self, p):
87 | avg_score = (p
88 | | beam.Values()
89 | | beam.CombineGlobally(
90 | beam.combiners.MeanCombineFn()).as_singleton_view()
91 | )
92 | return (p
93 | | 'compute_spammers' >> beam.ParDo(
94 | FilterUser(self.score_weight), avg_score=avg_score)
95 | )
96 |
97 |
98 | class FilterSpammers(beam.DoFn):
99 | """Remove users found in the spam list."""
100 | def __init__(self):
101 | super(FilterSpammers, self).__init__()
102 | self.filtered_scores = Metrics.counter(self.__class__,
103 | 'filtered_scores')
104 |
105 | def process(self, elem, spammers=beam.DoFn.SideInputParam):
106 | user = elem.user
107 | if user not in spammers:
108 | yield elem
109 | else:
110 | self.filtered_scores.inc()
111 |
112 |
113 | class WindowedTeamScore(beam.PTransform):
114 |   """Calculates scores for each team within the configured window duration."""
115 | def __init__(self, duration, spammers):
116 | super(WindowedTeamScore, self).__init__()
117 | self.duration = duration
118 | self.spammers = spammers
119 |
120 | def expand(self, p):
121 | return (p
122 | | 'window' >> beam.WindowInto(
123 | window.FixedWindows(self.duration))
124 | | 'filter_spammers' >> beam.ParDo(
125 | FilterSpammers(), spammers=self.spammers)
126 | | 'extract_team_score' >> ExtractAndSumScore('team')
127 | )
128 |
129 |
130 | class FormatTeamScoreSum(beam.DoFn):
131 | def process(self, team_score, window=beam.DoFn.WindowParam):
132 | team, score = team_score
133 | start = int(window.start)
134 | yield {
135 | 'team': team,
136 | 'total_score': score,
137 | 'window_start': start,
138 | }
139 |
140 |
141 | class FormatUserScoreSum(beam.DoFn):
142 | def process(self, user_score, window=beam.DoFn.WindowParam):
143 | user, score = user_score
144 | yield {
145 | 'user': user,
146 | 'total_score': score,
147 | 'processing_time': time.time(),
148 | }
149 |
150 |
151 | def Run(argv=None):
152 | known_args, pipeline_args = ParseArgs(argv)
153 | pipeline_options = PipelineOptions(pipeline_args)
154 | pipeline_options.view_as(SetupOptions).save_main_session = True
155 | p = beam.Pipeline(options=pipeline_options)
156 | window_duration = 1 * 60 # 1 minute windows.
157 | if known_args.topic:
158 | pipeline_options.view_as(StandardOptions).streaming = True
159 |
160 | project = pipeline_options.view_as(GoogleCloudOptions).project
161 | timestamp_attribute = 'timestamp_ms'
162 | events = None
163 |   if not known_args.topic:
164 | events = (p
165 | | 'read' >> ReadFromText(known_args.input)
166 |         | 'parse' >> beam.ParDo(ParseEventFn())
167 | | 'add_event_timestamps' >> beam.Map(
168 | lambda x: beam.window.TimestampedValue(x, x.timestamp)))
169 | else:
170 | events = (p
171 | | 'read' >> ReadFromPubSub(
172 | topic=known_args.topic,
173 | timestamp_attribute='timestamp_ms')
174 | | 'decode' >> beam.ParDo(ParseEventFn()))
175 |
176 | user_scores = (events
177 | | 'window_user_scores' >> WindowedUserScores(window_duration))
178 | spammers = beam.pvalue.AsList(user_scores
179 | | 'compute_spammers' >> ComputeSpammyUsers(2.5))
180 |
181 | _ = (events
182 | | 'windowed_team_score' >> WindowedTeamScore(window_duration, spammers)
183 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
184 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
185 | known_args.output_tablename, known_args.output_dataset, project,
186 | TEAM_SCHEMA)
187 | )
188 |
189 | p.run().wait_until_finish()
190 |
191 |
192 | if __name__ == '__main__':
193 | logging.getLogger().setLevel(logging.INFO)
194 | Run()
195 |
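The machinery worth studying in exercise5 is the chain of side inputs: the global mean score is computed once as a singleton view, each per-user score is compared against that view to flag outliers, and the flagged users are fed back as a list side input that filters the main stream. A condensed sketch of the same pattern on hypothetical in-memory scores (the names and the 2.5 weight are illustrative, not taken from the exercise):

    from __future__ import print_function

    import apache_beam as beam

    SCORE_WEIGHT = 2.5  # illustrative threshold multiplier


    def flag_spammers(user_score, avg):
        user, score = user_score
        # Flag users whose total score exceeds the weighted global mean.
        if score > avg * SCORE_WEIGHT:
            yield user


    with beam.Pipeline() as p:
        scores = p | beam.Create([('a', 5), ('b', 120), ('c', 7)])
        # Singleton side input holding the global mean score (44 here).
        avg = (scores
               | beam.Values()
               | beam.CombineGlobally(
                   beam.combiners.MeanCombineFn()).as_singleton_view())
        # List side input holding the flagged users ('b' in this data).
        spammers = beam.pvalue.AsList(
            scores | beam.FlatMap(flag_spammers, avg))
        _ = (scores
             | beam.Filter(lambda kv, spam: kv[0] not in spam, spammers)
             | beam.Map(print))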
--------------------------------------------------------------------------------
/py/solutions/exercise6.py:
--------------------------------------------------------------------------------
1 | # This pipeline computes the average duration of user sessions. The
2 | # averages are windowed, to reflect durations differing over time.
3 | from __future__ import absolute_import
4 |
5 | import logging
6 | import re
7 | import time
8 |
9 | import apache_beam as beam
10 | import apache_beam.transforms.window as window
11 | from apache_beam.io import ReadFromText
12 | from apache_beam.io import ReadFromPubSub
13 | from apache_beam.io import WriteToText
14 | from apache_beam.metrics import Metrics
15 | from apache_beam.metrics.metric import MetricsFilter
16 | from apache_beam.options.pipeline_options import PipelineOptions
17 | from apache_beam.options.pipeline_options import SetupOptions
18 | from apache_beam.options.pipeline_options import StandardOptions
19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
20 | from apache_beam.transforms import trigger
21 | from util.util import GameEvent
22 | from util.util import ParseEvent
23 | from util.util import ParseEventFn
24 | from util.util import ParseArgs
25 |
26 | # Defines the BigQuery schemas.
27 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
28 |
29 |
30 | class UserSessionActivity(beam.DoFn):
31 | """Compute the duration of a user's session."""
32 | def process(self,
33 | elem,
34 | timestamp=beam.DoFn.TimestampParam,
35 | window=beam.DoFn.WindowParam):
36 | duration = int(window.end) - int(window.start)
37 | yield duration
38 |
39 |
40 | class FormatSessionMeans(beam.DoFn):
41 | """Format session means for output to BQ"""
42 | def process(self, elem, window=beam.DoFn.WindowParam):
43 | yield {'window_start': int(window.start), 'mean_duration': elem}
44 |
45 |
46 | def Run(argv=None):
47 | known_args, pipeline_args = ParseArgs(argv)
48 | pipeline_options = PipelineOptions(pipeline_args)
49 | pipeline_options.view_as(SetupOptions).save_main_session = True
50 | p = beam.Pipeline(options=pipeline_options)
51 | if known_args.topic:
52 | pipeline_options.view_as(StandardOptions).streaming = True
53 |
54 | project = pipeline_options.view_as(GoogleCloudOptions).project
55 | timestamp_attribute = 'timestamp_ms'
56 | events = None
57 |   if not known_args.topic:
58 | events = (p
59 | | 'read' >> ReadFromText(known_args.input)
60 |         | 'parse' >> beam.ParDo(ParseEventFn())
61 | | 'add_event_timestamps' >> beam.Map(
62 | lambda x: beam.window.TimestampedValue(x, x.timestamp)))
63 | else:
64 | events = (p
65 | | 'read' >> ReadFromPubSub(
66 | topic=known_args.topic,
67 | timestamp_attribute='timestamp_ms')
68 | | 'parse' >> beam.ParDo(ParseEventFn()))
69 |
70 | _ = (events
71 | | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
72 | | 'sessionize' >> beam.WindowInto(
73 | window.Sessions(float(known_args.session_gap)))
74 | | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
75 | | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
76 | | 'window_of_sessions' >> beam.WindowInto(
77 | window.FixedWindows(int(known_args.user_activity_window)))
78 | | 'session_mean' >> beam.CombineGlobally(
79 | beam.combiners.MeanCombineFn()).without_defaults()
80 | | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
81 | | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
82 | known_args.output_tablename, known_args.output_dataset, project,
83 | SESSION_SCHEMA)
84 | )
85 |
86 | p.run().wait_until_finish()
87 |
88 |
89 | if __name__ == '__main__':
90 | logging.getLogger().setLevel(logging.INFO)
91 | Run()
92 |
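The pipeline above does two windowing passes: Sessions(gap) groups each user's events into bursts, CombinePerKey collapses every burst to one element, the window bounds yield each session's duration, and a second fixed windowing then computes periodic means. A runnable sketch of the duration step alone, assuming a 5-minute gap and invented event times:

    from __future__ import print_function

    import apache_beam as beam
    import apache_beam.transforms.window as window


    class SessionDuration(beam.DoFn):
        def process(self, elem, win=beam.DoFn.WindowParam):
            # A Sessions window runs from the first event to the last event
            # plus the gap, so its length is the session duration in seconds.
            yield int(win.end) - int(win.start)


    with beam.Pipeline() as p:
        _ = (p
             | beam.Create([
                 window.TimestampedValue(('user0', 1), 0),
                 window.TimestampedValue(('user0', 1), 60),
                 window.TimestampedValue(('user0', 1), 1200),  # a new session
             ])
             | beam.WindowInto(window.Sessions(300))
             | beam.CombinePerKey(lambda scores: 0)  # one element per session
             | beam.ParDo(SessionDuration())
             | beam.Map(print))  # prints 360 and 300 (order not guaranteed)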
--------------------------------------------------------------------------------
/py/solutions/exercise7.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import logging
4 | import re
5 | import time
6 |
7 | import apache_beam as beam
8 | import apache_beam.transforms.window as window
9 | from apache_beam.io import ReadFromText
10 | from apache_beam.io import ReadFromPubSub
11 | from apache_beam.io import WriteToText
12 | from apache_beam.metrics import Metrics
13 | from apache_beam.metrics.metric import MetricsFilter
14 | from apache_beam.options.pipeline_options import PipelineOptions
15 | from apache_beam.options.pipeline_options import SetupOptions
16 | from apache_beam.options.pipeline_options import StandardOptions
17 | from apache_beam.options.pipeline_options import GoogleCloudOptions
18 | from apache_beam.transforms import trigger
19 | from util.util import GameEvent
20 | from util.util import ParseEvent
21 | from util.util import ParseEventFn
22 | from util.util import ParsePlayEventFn
23 | from util.util import ParseArgs
24 |
25 | # Defines the BigQuery schemas.
26 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
27 |
28 |
29 | class ComputeLatency(beam.DoFn):
30 | def __init__(self):
31 | super(ComputeLatency, self).__init__()
32 | self.dropped_sessions_no_events = Metrics.counter(
33 | self.__class__, 'dropped_sessions_no_events')
34 | self.dropped_sessions_too_many_events = Metrics.counter(
35 | self.__class__, 'dropped_sessions_too_many_events')
36 | self.dropped_sessions_no_play_events = Metrics.counter(
37 | self.__class__, 'dropped_sessions_no_play_events')
38 |
39 | def process(self, elem):
40 | _, vals = elem
41 | plays = vals['plays']
42 | events = vals['events']
43 |
44 | play_count = 0
45 | max_play_ts = 0
46 | for play in plays:
47 | play_count += 1
48 | max_play_ts = max(max_play_ts, long(play.timestamp))
49 |
50 | event_count = 0
51 | an_event = None
52 | for event in events:
53 | an_event = event
54 | event_count += 1
55 |
56 | if event_count == 0:
57 | self.dropped_sessions_no_events.inc()
58 | elif event_count > 1:
59 | self.dropped_sessions_too_many_events.inc()
60 | elif play_count == 0:
61 | self.dropped_sessions_no_play_events.inc()
62 | else:
63 | min_latency = long(an_event.timestamp) - max_play_ts
64 | yield (an_event.user, min_latency)
65 |
66 |
67 | class DetectBadUsers(beam.DoFn):
68 | def process(self, elem, mean_latency=beam.DoFn.SideInputParam):
69 | user, latency = elem
70 |     # Naive heuristic: flag users whose latency is less than
71 |     # one fifth of the mean.
72 |     if latency < mean_latency / 5:
73 | yield user
74 |
75 |
76 | def Run(argv=None):
77 | known_args, pipeline_args = ParseArgs(argv)
78 | pipeline_options = PipelineOptions(pipeline_args)
79 | pipeline_options.view_as(SetupOptions).save_main_session = True
80 | p = beam.Pipeline(options=pipeline_options)
81 | if known_args.topic:
82 | pipeline_options.view_as(StandardOptions).streaming = True
83 |
84 | project = pipeline_options.view_as(GoogleCloudOptions).project
85 | timestamp_attribute = 'timestamp_ms'
86 | events = None
87 |   if not known_args.topic or not known_args.play_topic:
88 | logging.fatal('topic and play_topic are required.')
89 |
90 | events = (p
91 | | 'read_events' >> ReadFromPubSub(
92 | topic=known_args.topic,
93 | timestamp_attribute='timestamp_ms')
94 | | 'parse_events' >> beam.ParDo(ParseEventFn())
95 | )
96 |
97 | play_events = (p
98 | | 'read_play_events' >> ReadFromPubSub(
99 | topic=known_args.play_topic,
100 | timestamp_attribute='timestamp_ms')
101 | | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn())
102 | )
103 |
104 | sessionized_events = (events
105 | | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
106 | | 'sessionize_events' >> beam.WindowInto(
107 | window.Sessions(float(known_args.session_gap))))
108 |
109 | sessionized_plays = (play_events
110 | | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
111 | | 'sessionize_plays' >> beam.WindowInto(
112 | window.Sessions(float(known_args.session_gap))))
113 |
114 | per_user_latency = (
115 | {'plays': sessionized_plays, 'events': sessionized_events}
116 | | 'cbk' >> beam.CoGroupByKey()
117 | | 'compute_latency' >> beam.ParDo(ComputeLatency()))
118 |
119 | mean_latency = (per_user_latency
120 | | 'extract_latencies' >> beam.Values()
121 | | 'global_window' >> beam.WindowInto(
122 | window.GlobalWindows(),
123 | trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
124 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
125 | | 'compute_mean' >> beam.CombineGlobally(
126 | beam.combiners.MeanCombineFn()).with_fanout(16).as_singleton_view()
127 | )
128 |
129 | _ = (per_user_latency
130 | | 'detect_bad_users' >> beam.ParDo(
131 | DetectBadUsers(), mean_latency=mean_latency)
132 | | 'filter_duplicates' >> beam.WindowInto(
133 | window.GlobalWindows(), trigger=trigger.AfterCount(1),
134 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
135 | | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
136 | | 'write_bad_users' >> beam.io.WriteToBigQuery(
137 |           known_args.output_tablename, known_args.output_dataset, project, 'user:STRING')
138 | )
139 |
140 | p.run().wait_until_finish()
141 |
142 |
143 | if __name__ == '__main__':
144 | logging.getLogger().setLevel(logging.INFO)
145 | Run()
146 |
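The join at the heart of exercise7 is CoGroupByKey: both streams are keyed by event_id, and each output element carries a dict with one list of values per tagged input, which ComputeLatency then iterates over. The join shape in isolation, with invented IDs and timestamps:

    from __future__ import print_function

    import apache_beam as beam

    with beam.Pipeline() as p:
        plays = p | 'plays' >> beam.Create([('e1', 1000), ('e1', 1005)])
        events = p | 'events' >> beam.Create([('e1', 1010), ('e2', 1020)])
        _ = ({'plays': plays, 'events': events}
             | beam.CoGroupByKey()
             # Each element is (key, {'plays': [...], 'events': [...]}), e.g.
             # ('e1', {'plays': [1000, 1005], 'events': [1010]}) and
             # ('e2', {'plays': [], 'events': [1020]}).
             | beam.Map(print))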
--------------------------------------------------------------------------------
/py/util/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['util']
2 |
--------------------------------------------------------------------------------
/py/util/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/util/__init__.pyc
--------------------------------------------------------------------------------
/py/util/util.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import argparse
4 | import collections
5 | import logging
6 |
7 | import apache_beam as beam
8 | from apache_beam.metrics.metric import Metrics
9 |
10 | GameEvent = collections.namedtuple(
11 | 'GameEvent', ['user', 'team', 'score', 'timestamp', 'event_id'])
12 | PlayEvent = collections.namedtuple('PlayEvent',
13 | ['user', 'timestamp', 'event_id'])
14 |
15 |
16 | class ParseEventFn(beam.DoFn):
17 | """Parses an event.
18 | [user,team,score,timestamp,readable_timestamp,event_id]
19 | """
20 | def __init__(self):
21 | super(ParseEventFn, self).__init__()
22 | self.num_parse_errors = Metrics.counter(self.__class__,
23 | 'num_event_parse_errors')
24 |
25 | def process(self, elem):
26 | try:
27 | parts = [x.strip() for x in elem.split(',')]
28 | user, team, score, timestamp = parts[:4]
29 | score = int(score)
30 | timestamp = long(timestamp)
31 | if len(parts) >= 6:
32 | event_id = parts[5]
33 | else:
34 | event_id = 'none'
35 | yield GameEvent(user, team, score, timestamp, event_id)
36 | except Exception as e:
37 | self.num_parse_errors.inc()
38 | logging.error('Parse error on "%s": %s', elem, str(e))
39 |
40 |
41 | class ParsePlayEventFn(beam.DoFn):
42 | """Parses a play event: [user,timestamp,readable_timestamp,event_id]"""
43 | def __init__(self):
44 | super(ParsePlayEventFn, self).__init__()
45 | self.num_parse_errors = Metrics.counter(self.__class__,
46 | 'num_play_parse_errors')
47 |
48 | def process(self, elem):
49 | try:
50 | parts = [x.strip() for x in elem.split(',')]
51 |       user, timestamp, _, event_id = parts[:4]
52 | yield PlayEvent(user, timestamp, event_id)
53 | except Exception as e:
54 | self.num_parse_errors.inc()
55 | logging.error('Parse error on "%s": %s', elem, str(e))
56 |
57 |
58 | def ParseEvent(element):
59 | try:
60 | parts = [x.strip() for x in element.split(',')]
61 | user, team, score, timestamp = parts[:4]
62 | score = int(score)
63 | timestamp = long(timestamp)
64 | if len(parts) >= 6:
65 | event_id = parts[5]
66 | else:
67 | event_id = 'none'
68 | return [GameEvent(user, team, score, timestamp, event_id)]
69 |   except Exception:
70 | return []
71 |
72 | def ParseArgs(argv):
73 | parser = argparse.ArgumentParser()
74 | parser.add_argument('--input', dest='input', help='Input file to process.')
75 | parser.add_argument(
76 | '--topic', dest='topic', help='Input topic to read from.')
77 | parser.add_argument(
78 | '--play_topic',
79 | dest='play_topic',
80 | help='Input topic to read for play events.')
81 | parser.add_argument(
82 | '--output_dataset',
83 | dest='output_dataset',
84 | required=True,
85 |       help='BigQuery dataset to write results to.')
86 | parser.add_argument(
87 | '--output_tablename',
88 | dest='output_tablename',
89 | required=True,
90 |       help='BigQuery table name to write results to.')
91 | parser.add_argument(
92 | '--session_gap',
93 | dest='session_gap',
94 | help='Gap between user sessions, in seconds.')
95 | parser.add_argument(
96 | '--user_activity_window',
97 | dest='user_activity_window',
98 | help=
99 |       'Value of fixed window for finding mean of session duration, in seconds.'
100 | )
101 | return parser.parse_known_args(argv)
102 |
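For reference, ParseEvent and ParseEventFn both expect the CSV layout [user,team,score,timestamp,readable_timestamp,event_id] and swallow malformed rows rather than failing the pipeline (ParseEvent returns an empty list; ParseEventFn increments a counter). An illustrative call on an invented row:

    from util.util import ParseEvent

    # A well-formed row yields one GameEvent; a malformed row yields [].
    row = 'user0,team0,18,1447719060,2015-11-16 16:11:00.000,event0'
    print(ParseEvent(row))        # [GameEvent(user='user0', team='team0', ...)]
    print(ParseEvent('garbage'))  # []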
--------------------------------------------------------------------------------
/py/util/util.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/util/util.pyc
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise0.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game;
18 |
19 | import com.google.api.services.bigquery.model.TableFieldSchema;
20 | import com.google.api.services.bigquery.model.TableReference;
21 | import com.google.api.services.bigquery.model.TableRow;
22 | import com.google.api.services.bigquery.model.TableSchema;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
26 | import org.apache.beam.examples.complete.game.utils.Options;
27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
28 | import org.apache.beam.sdk.Pipeline;
29 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
30 | import org.apache.beam.sdk.io.TextIO;
31 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
34 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
35 | import org.apache.beam.sdk.transforms.DoFn;
36 | import org.apache.beam.sdk.transforms.ParDo;
37 |
38 | /**
39 | * Zeroth (no code changes necessary) in a series of exercises in a gaming domain.
40 | *
41 | * This batch pipeline imports game events from CSV to BigQuery.
42 | *
43 | * See README.md for details.
44 | */
45 | public class Exercise0 {
46 |
47 | /**
48 | * Format a GameEvent to a BigQuery TableRow.
49 | */
50 |   static class FormatGameEventFn extends DoFn<GameEvent, TableRow> {
51 |
52 | @ProcessElement
53 | public void processElement(ProcessContext c) {
54 | GameEvent event = c.element();
55 | TableRow row = new TableRow()
56 | .set("user", event.getUser())
57 | .set("team", event.getTeam())
58 | .set("score", event.getScore())
59 | .set("timestamp", event.getTimestamp() / 1000);
60 | c.output(row);
61 | }
62 |
63 | /**
64 | * Defines the BigQuery schema.
65 | */
66 | static TableSchema getSchema() {
67 |       List<TableFieldSchema> fields = new ArrayList<>();
68 | fields.add(new TableFieldSchema().setName("user").setType("STRING"));
69 | fields.add(new TableFieldSchema().setName("team").setType("STRING"));
70 | fields.add(new TableFieldSchema().setName("score").setType("INTEGER"));
71 | fields.add(new TableFieldSchema().setName("timestamp").setType("TIMESTAMP"));
72 | return new TableSchema().setFields(fields);
73 | }
74 | }
75 |
76 | /**
77 | * Run a batch pipeline.
78 | */
79 | public static void main(String[] args) throws Exception {
80 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
81 | Pipeline pipeline = Pipeline.create(options);
82 |
83 | TableReference tableRef = new TableReference();
84 | tableRef.setDatasetId(options.getOutputDataset());
85 | tableRef.setProjectId(options.as(GcpOptions.class).getProject());
86 | tableRef.setTableId(options.getOutputTableName());
87 |
88 | // Read events from a CSV file, parse them and write (import) them to BigQuery.
89 | pipeline
90 | .apply(TextIO.read().from(options.getInput()))
91 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
92 | .apply("FormatGameEvent", ParDo.of(new FormatGameEventFn()))
93 | .apply(
94 | BigQueryIO.writeTableRows().to(tableRef)
95 | .withSchema(FormatGameEventFn.getSchema())
96 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
97 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
98 |
99 | pipeline.run();
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise1.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game;
18 |
19 | import com.google.api.services.bigquery.model.TableFieldSchema;
20 | import com.google.api.services.bigquery.model.TableReference;
21 | import com.google.api.services.bigquery.model.TableRow;
22 | import com.google.api.services.bigquery.model.TableSchema;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
27 | import org.apache.beam.examples.complete.game.utils.Options;
28 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
29 | import org.apache.beam.sdk.Pipeline;
30 | import org.apache.beam.sdk.PipelineResult;
31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
32 | import org.apache.beam.sdk.io.TextIO;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
36 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
37 | import org.apache.beam.sdk.transforms.DoFn;
38 | import org.apache.beam.sdk.transforms.PTransform;
39 | import org.apache.beam.sdk.transforms.ParDo;
40 | import org.apache.beam.sdk.values.KV;
41 | import org.apache.beam.sdk.values.PCollection;
42 |
43 | /**
44 | * First in a series of coding exercises in a gaming domain.
45 | *
46 | * This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data
47 | * and writes the sums to BigQuery.
48 | *
49 | * See README.md for details.
50 | */
51 | public class Exercise1 {
52 |
53 | /**
54 | * A transform to extract key/score information from GameEvent, and sum
55 | * the scores. The constructor arg determines whether 'team' or 'user' info is
56 | * extracted.
57 | */
58 | public static class ExtractAndSumScore
59 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
60 |
61 | private final String field;
62 |
63 | public ExtractAndSumScore(String field) {
64 | this.field = field;
65 | }
66 |
67 | @Override
68 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> gameEvents) {
69 | // [START EXERCISE 1]:
70 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/
71 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#transforms-pardo
72 | // Also: https://cloud.google.com/dataflow/model/par-do
73 | //
74 | // Fill in the code to:
75 | // 1. Extract a KV from each GameEvent corresponding to the given
76 | // field and the score.
77 | // 2. Compute the sum of the scores for each key.
78 | // 3. Run your pipeline on the Dataflow service.
79 | return gameEvents
80 |           .apply(ParDo.of(new DoFn<GameEvent, KV<String, Integer>>() {
81 | @ProcessElement
82 | public void processElement(ProcessContext c) {
83 | // 1. Creates key-value pairs, using the KeyField as the key and
84 | // the score as the value. KV.of(key, value) creates a key-value pair.
85 | /* TODO: YOUR CODE GOES HERE */
86 | }
87 | }))
88 | // 2. Sum is a family of PTransforms for computing the sum of elements in a PCollection.
89 | // Select the appropriate method to compute the sum over each key.
90 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
91 | // [END EXERCISE 1]:
92 | }
93 | }
94 |
95 | /**
96 | * Format a KV of user and their score to a BigQuery TableRow.
97 | */
98 |   static class FormatUserScoreSumsFn extends DoFn<KV<String, Integer>, TableRow> {
99 |
100 | @ProcessElement
101 | public void processElement(ProcessContext c) {
102 | TableRow row = new TableRow()
103 | .set("user", c.element().getKey())
104 | .set("total_score", c.element().getValue());
105 | c.output(row);
106 | }
107 |
108 | /**
109 | * Defines the BigQuery schema.
110 | */
111 | static TableSchema getSchema() {
112 |       List<TableFieldSchema> fields = new ArrayList<>();
113 | fields.add(new TableFieldSchema().setName("user").setType("STRING"));
114 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
115 | return new TableSchema().setFields(fields);
116 | }
117 | }
118 |
119 | /**
120 | * Run a batch pipeline.
121 | */
122 | public static void main(String[] args) throws Exception {
123 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
124 | Pipeline pipeline = Pipeline.create(options);
125 |
126 | TableReference tableRef = new TableReference();
127 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
128 | tableRef.setProjectId(options.as(GcpOptions.class).getProject());
129 | tableRef.setTableId(options.getOutputTableName());
130 |
131 | // Read events from a CSV file and parse them.
132 | pipeline
133 | .apply(TextIO.read().from(options.getInput()))
134 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
135 | // Extract and sum username/score pairs from the event data.
136 | .apply("ExtractUserScore", new ExtractAndSumScore("user"))
137 | // Write the results to BigQuery.
138 | .apply("FormatUserScoreSums", ParDo.of(new FormatUserScoreSumsFn()))
139 | .apply(
140 | BigQueryIO.writeTableRows().to(tableRef)
141 | .withSchema(FormatUserScoreSumsFn.getSchema())
142 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
143 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
144 |
145 | PipelineResult result = pipeline.run();
146 | result.waitUntilFinish();
147 | }
148 | }
149 |
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise2.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game;
18 |
19 | import com.google.api.services.bigquery.model.TableFieldSchema;
20 | import com.google.api.services.bigquery.model.TableReference;
21 | import com.google.api.services.bigquery.model.TableRow;
22 | import com.google.api.services.bigquery.model.TableSchema;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
27 | import org.apache.beam.examples.complete.game.utils.Options;
28 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
29 | import org.apache.beam.sdk.Pipeline;
30 | import org.apache.beam.sdk.PipelineResult;
31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
32 | import org.apache.beam.sdk.io.TextIO;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
36 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
37 | import org.apache.beam.sdk.transforms.DoFn;
38 | import org.apache.beam.sdk.transforms.PTransform;
39 | import org.apache.beam.sdk.transforms.ParDo;
40 | import org.apache.beam.sdk.transforms.WithTimestamps;
41 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
42 | import org.apache.beam.sdk.values.KV;
43 | import org.apache.beam.sdk.values.PCollection;
44 | import org.joda.time.Duration;
45 | import org.joda.time.Instant;
46 |
47 | /**
48 | * Second in a series of coding exercises in a gaming domain.
49 | *
50 | * This batch pipeline calculates the sum of scores per team per hour, over an entire batch of
51 | * gaming data and writes the per-team sums to BigQuery.
52 | *
53 | * See README.md for details.
54 | */
55 | public class Exercise2 {
56 |
57 | /**
58 | * A transform to compute the WindowedTeamScore.
59 | */
60 | public static class WindowedTeamScore
61 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
62 | // Developer Docs for composite transforms:
63 | // https://beam.apache.org/documentation/programming-guide/#transforms-composite
64 |
65 | private Duration duration;
66 |
67 | public WindowedTeamScore(Duration duration) {
68 | this.duration = duration;
69 | }
70 |
71 | @Override
72 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
73 | // [START EXERCISE 2]:
74 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/
75 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
76 | // Also: https://cloud.google.com/dataflow/model/windowing
77 | //
78 | return input
79 | // Window.into() takes a WindowFn and returns a PTransform that
80 | // applies windowing to the PCollection. FixedWindows.of() returns a
81 | // WindowFn that assigns elements to windows of a fixed size. Use
82 | // these methods to apply fixed windows of size
83 | // this.duration to the PCollection.
84 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
85 | // Remember the ExtractAndSumScore PTransform from Exercise 1? We
86 | // parameterized it over the key field. Use it here to compute the "team"
87 | // scores (recall it is a public static method of Exercise1).
88 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
89 | // [END EXERCISE 2]
90 | }
91 | }
92 |
93 | /**
94 | * Format a KV of team and their score to a BigQuery TableRow.
95 | */
96 |   public static class FormatTeamScoreSumsFn extends DoFn<KV<String, Integer>, TableRow> {
97 |
98 | @ProcessElement
99 | public void processElement(ProcessContext c, IntervalWindow window) {
100 | TableRow row =
101 | new TableRow()
102 | .set("team", c.element().getKey())
103 | .set("total_score", c.element().getValue())
104 | .set("window_start", window.start().getMillis() / 1000);
105 | c.output(row);
106 | }
107 |
108 | /**
109 | * Defines the BigQuery schema.
110 | */
111 | public static TableSchema getSchema() {
112 |       List<TableFieldSchema> fields = new ArrayList<>();
113 | fields.add(new TableFieldSchema().setName("team").setType("STRING"));
114 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
115 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
116 | return new TableSchema().setFields(fields);
117 | }
118 | }
119 |
120 | /**
121 | * Run a batch pipeline.
122 | */
123 | public static void main(String[] args) throws Exception {
124 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
125 | Pipeline pipeline = Pipeline.create(options);
126 |
127 | TableReference tableRef = new TableReference();
128 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
129 | tableRef.setProjectId(options.as(GcpOptions.class).getProject());
130 | tableRef.setTableId(options.getOutputTableName());
131 |
132 | // Read events from a CSV file and parse them.
133 | pipeline
134 | .apply(TextIO.read().from(options.getInput()))
135 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
136 | .apply(
137 | "AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())))
138 | .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60)))
139 | // Write the results to BigQuery.
140 | .apply("FormatTeamScoreSums", ParDo.of(new FormatTeamScoreSumsFn()))
141 | .apply(
142 | BigQueryIO.writeTableRows().to(tableRef)
143 | .withSchema(FormatTeamScoreSumsFn.getSchema())
144 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
145 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
146 |
147 | PipelineResult result = pipeline.run();
148 | result.waitUntilFinish();
149 | }
150 | }
151 |
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise3.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game;
18 |
19 | import com.google.api.services.bigquery.model.TableReference;
20 | import org.apache.beam.sdk.Pipeline;
21 | import org.apache.beam.sdk.PipelineResult;
22 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
23 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
24 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
25 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
26 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
27 | import org.apache.beam.sdk.transforms.PTransform;
28 | import org.apache.beam.sdk.transforms.ParDo;
29 | import org.apache.beam.sdk.values.PBegin;
30 | import org.apache.beam.sdk.values.PCollection;
31 | import org.apache.beam.examples.complete.game.solutions.Exercise2;
32 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
33 | import org.apache.beam.examples.complete.game.utils.GameEvent;
34 | import org.apache.beam.examples.complete.game.utils.Options;
35 | import org.joda.time.Duration;
36 |
37 | /**
38 | * Third in a series of coding exercises in a gaming domain.
39 | *
40 | * This is the same pipeline as in Exercise 2, but can run in either batch or streaming mode.
41 | *
42 | * See README.md for details.
43 | */
44 | public class Exercise3 {
45 |
46 | /**
47 | * A transform to read the game events from either text files or Pub/Sub topic.
48 | */
49 |   public static class ReadGameEvents extends PTransform<PBegin, PCollection<GameEvent>> {
50 |
51 | private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
52 |
53 | private Options options;
54 |
55 | public ReadGameEvents(Options options) {
56 | this.options = options;
57 | }
58 |
59 | @Override
60 |     public PCollection<GameEvent> expand(PBegin begin) {
61 | // [START EXERCISE 3]:
62 | // Javadoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.html
63 | // Developer Docs (1.x): https://cloud.google.com/dataflow/model/pubsub-io
64 | //
65 | // Determine whether to use files or topic based on options.
66 | if (options.getInput() != null && !options.getInput().isEmpty()) {
67 | return begin
68 | .getPipeline()
69 | // Read game events from files. See main() in Exercise2. Don't forget to parse events or
70 | // to include WithTimestamps transform to assign timestamps to events.
71 | // https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/WithTimestamps.html
72 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
73 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
74 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
75 | } else {
76 | return begin
77 | .getPipeline()
78 | // Read game events from Pub/Sub topic options.getTopic() using custom timestamps, which
79 | // are extracted from the pubsub attribute TIMESTAMP_ATTRIBUTE.
80 | // Use PubsubIO.readStrings() with withTimestampAttribute() and fromTopic().
81 | // https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.html
82 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
83 | // Parse the messages the same way as when they come from the text file. Note that we no
84 | // longer have to run WithTimestamps transform, as the timestamps are already set by
85 | // PubsubIO. (In streaming, changing timestamps must be done carefully to avoid
86 |             // violating the guarantees necessary for watermarks.)
87 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
88 | }
89 | // [END EXERCISE 3]
90 | }
91 | }
92 |
93 | /**
94 | * Run a batch or streaming pipeline.
95 | */
96 | public static void main(String[] args) throws Exception {
97 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
98 |
99 | Pipeline pipeline = Pipeline.create(options);
100 |
101 | TableReference tableRef = new TableReference();
102 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
103 | tableRef.setProjectId(options.as(GcpOptions.class).getProject());
104 | tableRef.setTableId(options.getOutputTableName());
105 |
106 | // Read events from either a CSV file or PubSub stream.
107 | pipeline
108 | .apply(new ReadGameEvents(options))
109 | .apply("WindowedTeamScore", new Exercise2.WindowedTeamScore(Duration.standardMinutes(60)))
110 | // Write the results to BigQuery.
111 | .apply("FormatTeamScoreSums", ParDo.of(new Exercise2.FormatTeamScoreSumsFn()))
112 | .apply(
113 | BigQueryIO.writeTableRows().to(tableRef)
114 | .withSchema(Exercise2.FormatTeamScoreSumsFn.getSchema())
115 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
116 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
117 |
118 | PipelineResult result = pipeline.run();
119 | result.waitUntilFinish();
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise4.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game;
18 |
19 | import com.google.api.services.bigquery.model.TableFieldSchema;
20 | import com.google.api.services.bigquery.model.TableReference;
21 | import com.google.api.services.bigquery.model.TableRow;
22 | import com.google.api.services.bigquery.model.TableSchema;
23 | import com.google.common.annotations.VisibleForTesting;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 | import org.apache.beam.examples.complete.game.solutions.Exercise1;
27 | import org.apache.beam.examples.complete.game.solutions.Exercise3;
28 | import org.apache.beam.examples.complete.game.utils.GameEvent;
29 | import org.apache.beam.examples.complete.game.utils.Options;
30 | import org.apache.beam.runners.dataflow.DataflowRunner;
31 | import org.apache.beam.sdk.Pipeline;
32 | import org.apache.beam.sdk.PipelineResult;
33 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
36 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
37 | import org.apache.beam.sdk.options.Default;
38 | import org.apache.beam.sdk.options.Description;
39 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
40 | import org.apache.beam.sdk.options.StreamingOptions;
41 | import org.apache.beam.sdk.transforms.DoFn;
42 | import org.apache.beam.sdk.transforms.PTransform;
43 | import org.apache.beam.sdk.transforms.ParDo;
44 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
45 | import org.apache.beam.sdk.values.KV;
46 | import org.apache.beam.sdk.values.PCollection;
47 | import org.joda.time.Duration;
48 | import org.joda.time.Instant;
49 |
50 | /**
51 | * Fourth in a series of coding exercises in a gaming domain.
52 | *
53 | * This streaming pipeline calculates user and team scores for a window of time and writes them
54 | * to BigQuery.
55 | *
56 | * See README.md for details.
57 | */
58 | public class Exercise4 {
59 |
60 | static final Duration TEN_SECONDS = Duration.standardSeconds(10);
61 | static final Duration THIRTY_SECONDS = Duration.standardSeconds(30);
62 |
63 | /**
64 | * Exercise4Options supported by {@link Exercise4}.
65 | */
66 | interface Exercise4Options extends Options, StreamingOptions {
67 |
68 | @Description("Numeric value of fixed window duration for team analysis, in minutes")
69 | @Default.Integer(1)
70 | Integer getTeamWindowDuration();
71 |
72 | void setTeamWindowDuration(Integer value);
73 |
74 | @Description("Numeric value of allowed data lateness, in minutes")
75 | @Default.Integer(2)
76 | Integer getAllowedLateness();
77 |
78 | void setAllowedLateness(Integer value);
79 | }
80 |
81 | /**
82 | * Extract user/score pairs from the event stream using processing time, via global windowing. Get
83 | * periodic updates on all users' running scores.
84 | */
85 | @VisibleForTesting
86 | static class CalculateUserScores
87 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
88 |
89 | private final Duration allowedLateness;
90 |
91 | CalculateUserScores(Duration allowedLateness) {
92 | this.allowedLateness = allowedLateness;
93 | }
94 |
95 | @Override
96 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
97 | // [START EXERCISE 4 PART 1]:
98 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html
99 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
100 | //
101 | // Fill in the code to:
102 | // 1. Window the incoming input into global windows
103 | // 2. that trigger every thirty seconds to emit speculative results,
104 | // 3. allow late data with allowedLateness,
105 |       // 4. and don't forget to accumulate over the entire window.
106 | return input
107 | /* TODO: SOLUTION CODE HERE */
108 | // Extract and sum username/score pairs from the event data.
109 | .apply("ExtractUserScore", new Exercise1.ExtractAndSumScore("user"));
110 | // [END EXERCISE 4 PART 1]:
111 | }
112 | }
113 |
114 | /**
115 | * Calculates scores for each team within the configured window duration.
116 | */
117 | // Extract team/score pairs from the event stream, using hour-long windows by default.
118 | @VisibleForTesting
119 | static class CalculateTeamScores
120 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
121 |
122 | private final Duration teamWindowDuration;
123 | private final Duration allowedLateness;
124 |
125 | CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) {
126 | this.teamWindowDuration = teamWindowDuration;
127 | this.allowedLateness = allowedLateness;
128 | }
129 |
130 | @Override
131 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> infos) {
132 | // [START EXERCISE 4 PART 2]:
133 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html
134 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
135 | //
136 | // Fill in the code to:
137 | // 1. Window the incoming input into fixed windows of team window duration,
138 | // 2. trigger on time results at the watermark,
139 | // 3. trigger speculative results every ten seconds,
140 | // 4. trigger late data results with a delay of thirty seconds,
141 | // 5. don't forget to set the allowedLateness,
142 | // 6. and ensure that we continue to accumulate over all data in the window.
143 | return infos
144 | /* TODO: SOLUTION CODE HERE */
145 | // Extract and sum teamname/score pairs from the event data.
146 | .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"));
147 | // [END EXERCISE 4 PART 2]:
148 | }
149 | }
150 |
151 | public static void main(String[] args) throws Exception {
152 | Exercise4Options options =
153 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise4Options.class);
154 | // Enforce that this pipeline is always run in streaming mode.
155 | options.setStreaming(true);
156 | options.setRunner(DataflowRunner.class);
157 | Pipeline pipeline = Pipeline.create(options);
158 |
159 | TableReference teamTable = new TableReference();
160 | teamTable.setDatasetId(options.getOutputDataset());
161 | teamTable.setProjectId(options.as(GcpOptions.class).getProject());
162 | teamTable.setTableId(options.getOutputTableName() + "_team");
163 |
164 | TableReference userTable = new TableReference();
165 | userTable.setDatasetId(options.getOutputDataset());
166 | userTable.setProjectId(options.as(GcpOptions.class).getProject());
167 | userTable.setTableId(options.getOutputTableName() + "_user");
168 |
169 |     PCollection<GameEvent> gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));
170 |
171 | gameEvents
172 | .apply(
173 | "CalculateTeamScores",
174 | new CalculateTeamScores(
175 | Duration.standardMinutes(options.getTeamWindowDuration()),
176 | Duration.standardMinutes(options.getAllowedLateness())))
177 | // Write the results to BigQuery.
178 | .apply("FormatTeamScores", ParDo.of(new FormatTeamScoreFn()))
179 | .apply(
180 | BigQueryIO.writeTableRows().to(teamTable)
181 | .withSchema(FormatTeamScoreFn.getSchema())
182 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
183 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
184 |
185 | gameEvents
186 | .apply(
187 | "CalculateUserScores",
188 | new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
189 | // Write the results to BigQuery.
190 | .apply("FormatUserScores", ParDo.of(new FormatUserScoreFn()))
191 | .apply(
192 | BigQueryIO.writeTableRows().to(userTable)
193 | .withSchema(FormatUserScoreFn.getSchema())
194 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
195 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
196 |
197 | PipelineResult result = pipeline.run();
198 | result.waitUntilFinish();
199 | }
200 |
201 | /**
202 | * Format a KV of team and associated properties to a BigQuery TableRow.
203 | */
204 |   protected static class FormatTeamScoreFn extends DoFn<KV<String, Integer>, TableRow> {
205 |
206 | @ProcessElement
207 | public void processElement(ProcessContext c, IntervalWindow window) {
208 | TableRow row =
209 | new TableRow()
210 | .set("team", c.element().getKey())
211 | .set("total_score", c.element().getValue())
212 | .set("window_start", window.start().getMillis() / 1000)
213 | .set("processing_time", Instant.now().getMillis() / 1000)
214 | .set("timing", c.pane().getTiming().toString());
215 | c.output(row);
216 | }
217 |
218 | static TableSchema getSchema() {
219 |       List<TableFieldSchema> fields = new ArrayList<>();
220 | fields.add(new TableFieldSchema().setName("team").setType("STRING"));
221 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
222 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
223 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
224 | fields.add(new TableFieldSchema().setName("timing").setType("STRING"));
225 | return new TableSchema().setFields(fields);
226 | }
227 | }
228 |
229 | /**
230 | * Format a KV of user and associated properties to a BigQuery TableRow.
231 | */
232 |   static class FormatUserScoreFn extends DoFn<KV<String, Integer>, TableRow> {
233 |
234 | @ProcessElement
235 | public void processElement(ProcessContext c) {
236 | TableRow row =
237 | new TableRow()
238 | .set("user", c.element().getKey())
239 | .set("total_score", c.element().getValue())
240 | .set("processing_time", Instant.now().getMillis() / 1000);
241 | c.output(row);
242 | }
243 |
244 | static TableSchema getSchema() {
245 |       List<TableFieldSchema> fields = new ArrayList<>();
246 | fields.add(new TableFieldSchema().setName("user").setType("STRING"));
247 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
248 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
249 | return new TableSchema().setFields(fields);
250 | }
251 | }
252 | }
253 |
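Both TODO blocks in Exercise4 are trigger exercises. One plausible shape of each, sketched in the Python SDK rather than Java (durations are in seconds; the allowed_lateness keyword assumes a Python SDK recent enough to support it, and this is a sketch, not the repo's canonical solution):

    import apache_beam as beam
    import apache_beam.transforms.window as window
    from apache_beam.transforms import trigger


    def calculate_user_scores(events):
        """Global window, speculative results every 30s, accumulating."""
        return (events
                | beam.WindowInto(
                    window.GlobalWindows(),
                    trigger=trigger.Repeatedly(trigger.AfterProcessingTime(30)),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
                | beam.Map(lambda e: (e.user, e.score))
                | beam.CombinePerKey(sum))


    def calculate_team_scores(events, window_secs, allowed_lateness_secs):
        """Fixed windows: on time at the watermark, early every 10s,
        late once per late element, accumulating across firings."""
        return (events
                | beam.WindowInto(
                    window.FixedWindows(window_secs),
                    trigger=trigger.AfterWatermark(
                        early=trigger.AfterProcessingTime(10),
                        late=trigger.AfterCount(1)),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    allowed_lateness=allowed_lateness_secs)
                | beam.Map(lambda e: (e.team, e.score))
                | beam.CombinePerKey(sum))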
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise6.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 | package org.apache.beam.examples.complete.game;
17 |
18 | import com.google.api.services.bigquery.model.TableFieldSchema;
19 | import com.google.api.services.bigquery.model.TableReference;
20 | import com.google.api.services.bigquery.model.TableRow;
21 | import com.google.api.services.bigquery.model.TableSchema;
22 | import java.util.ArrayList;
23 | import java.util.List;
24 | import org.apache.beam.examples.complete.game.solutions.Exercise3.ReadGameEvents;
25 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
27 | import org.apache.beam.examples.complete.game.utils.Options;
28 | import org.apache.beam.runners.dataflow.DataflowRunner;
29 | import org.apache.beam.sdk.Pipeline;
30 | import org.apache.beam.sdk.PipelineResult;
31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
35 | import org.apache.beam.sdk.options.Default;
36 | import org.apache.beam.sdk.options.Description;
37 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
38 | import org.apache.beam.sdk.options.StreamingOptions;
39 | import org.apache.beam.sdk.transforms.Combine;
40 | import org.apache.beam.sdk.transforms.DoFn;
41 | import org.apache.beam.sdk.transforms.MapElements;
42 | import org.apache.beam.sdk.transforms.Mean;
43 | import org.apache.beam.sdk.transforms.ParDo;
44 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
45 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
46 | import org.apache.beam.sdk.values.KV;
47 | import org.apache.beam.sdk.values.PCollection;
48 | import org.apache.beam.sdk.values.TypeDescriptors;
49 | import org.joda.time.Duration;
50 | import org.slf4j.Logger;
51 | import org.slf4j.LoggerFactory;
52 |
53 | /**
54 | * Sixth in a series of coding exercises in a gaming domain.
55 | *
56 | * This exercise introduces session windows.
57 | *
58 | * See README.md for details.
59 | */
60 | public class Exercise6 {
61 |
62 | private static final Logger LOG = LoggerFactory.getLogger(Exercise6.class);
63 |
64 | /**
65 | * Calculate and output an element's session duration.
66 | */
67 |   private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer> {
68 |
69 | @ProcessElement
70 | public void processElement(ProcessContext c, BoundedWindow window) {
71 | IntervalWindow w = (IntervalWindow) window;
72 | int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes();
73 | c.output(duration);
74 | }
75 | }
76 |
77 | /**
78 | * Options supported by {@link Exercise6}.
79 | */
80 | interface Exercise6Options extends Options, StreamingOptions {
81 |
82 | @Description("Numeric value of gap between user sessions, in minutes")
83 | @Default.Integer(1)
84 | Integer getSessionGap();
85 |
86 | void setSessionGap(Integer value);
87 |
88 | @Description(
89 | "Numeric value of fixed window for finding mean of user session duration, " + "in minutes")
90 | @Default.Integer(5)
91 | Integer getUserActivityWindowDuration();
92 |
93 | void setUserActivityWindowDuration(Integer value);
94 | }
95 |
96 | public static void main(String[] args) throws Exception {
97 |
98 | Exercise6Options options =
99 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise6Options.class);
100 | // Enforce that this pipeline is always run in streaming mode.
101 | options.setStreaming(true);
102 | options.setRunner(DataflowRunner.class);
103 | Pipeline pipeline = Pipeline.create(options);
104 |
105 | TableReference sessionsTable = new TableReference();
106 | sessionsTable.setDatasetId(options.getOutputDataset());
107 | sessionsTable.setProjectId(options.as(GcpOptions.class).getProject());
108 | sessionsTable.setTableId(options.getOutputTableName());
109 |
110 |     PCollection<GameEvent> rawEvents = pipeline.apply(new ReadGameEvents(options));
111 |
112 | // Extract username/score pairs from the event stream
113 |     PCollection<KV<String, Integer>> userEvents =
114 | rawEvents.apply(
115 | "ExtractUserScore",
116 | MapElements
117 | .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
118 | .via((GameEvent gInfo) -> KV.of(gInfo.getUser(),
119 | gInfo.getScore())));
120 |
121 | // [START EXERCISE 6]:
122 | // Detect user sessions -- that is, a burst of activity separated by a gap from further
123 | // activity. Find and record the mean session lengths.
124 | // This information could help the game designers track the changing user engagement
125 | // as their set of games changes.
126 | userEvents
127 | // Window the user events into sessions, with a gap of options.getSessionGap() minutes. Make
128 | // sure to use a timestamp combiner that sets the output timestamp to the end of the window.
129 | // This will allow us to compute means over sessions based on their end times, rather than
130 | // their start times.
131 | // JavaDoc:
132 | // - https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Sessions.html
133 | // - https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html
134 | // Note: Pay attention to the withTimestampCombiner method on Window.
135 | .apply("WindowIntoSessions",
136 | /* TODO: YOUR CODE GOES HERE */
137 | new ChangeMe<PCollection<KV<String, Integer>>, KV<String, Integer>>())
138 | // For this use, we care only about the existence of the session, not any particular
139 | // information aggregated over it, so the following is an efficient way to do that.
140 | .apply(Combine.perKey(x -> 0))
141 | // Get the duration per session.
142 | .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
143 | // Note that the output of the previous transform is a PCollection of session durations
144 | // (PCollection<Integer>) where the timestamp of elements is the end of the window.
145 | //
146 | // Re-window to process groups of session sums according to when the sessions complete.
147 | // In streaming, we don't just ask "what is the mean value?"; we must ask "what is the mean
148 | // value for some window of time?". To compute periodic means of session durations, we
149 | // re-window the session durations.
150 | .apply("WindowToExtractSessionMean",
151 | /* TODO: YOUR CODE GOES HERE */
152 | new ChangeMe<PCollection<Integer>, Integer>())
153 | // Find the mean session duration in each window.
154 | .apply(Mean.globally().withoutDefaults())
155 | // Write this info to a BigQuery table.
156 | .apply("FormatSessions", ParDo.of(new FormatSessionWindowFn()))
157 | .apply(
158 | BigQueryIO.writeTableRows().to(sessionsTable)
159 | .withSchema(FormatSessionWindowFn.getSchema())
160 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
161 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
162 | // [END EXERCISE 6]:
163 |
164 | PipelineResult result = pipeline.run();
165 | result.waitUntilFinish();
166 | }
167 |
168 | /**
169 | * Format a KV of session and associated properties to a BigQuery TableRow.
170 | */
171 | static class FormatSessionWindowFn extends DoFn<Double, TableRow> {
172 |
173 | @ProcessElement
174 | public void processElement(ProcessContext c, BoundedWindow window) {
175 | IntervalWindow w = (IntervalWindow) window;
176 | TableRow row =
177 | new TableRow()
178 | .set("window_start", w.start().getMillis() / 1000)
179 | .set("mean_duration", c.element());
180 | c.output(row);
181 | }
182 |
183 | static TableSchema getSchema() {
184 | List<TableFieldSchema> fields = new ArrayList<>();
185 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
186 | fields.add(new TableFieldSchema().setName("mean_duration").setType("FLOAT"));
187 | return new TableSchema().setFields(fields);
188 | }
189 | }
190 | }
191 |
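The two TODOs above ask for a session window and a fixed re-window. The sketch below shows plausible fill-ins, assuming Beam 2.x's `Sessions`, `FixedWindows`, `Window.withTimestampCombiner`, and `TimestampCombiner.END_OF_WINDOW`; it is illustrative, not necessarily the official solution.

```java
// Sketch only: one plausible shape for the two windowing TODOs in Exercise6.
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Sessions;
import org.apache.beam.sdk.transforms.windowing.TimestampCombiner;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.joda.time.Duration;

class Exercise6WindowingSketch {
  /** "WindowIntoSessions": per-key session windows, stamped at the session's end. */
  static Window<KV<String, Integer>> sessionWindow(int gapMinutes) {
    return Window.<KV<String, Integer>>into(
            Sessions.withGapDuration(Duration.standardMinutes(gapMinutes)))
        .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW);
  }

  /** "WindowToExtractSessionMean": fixed windows so Mean.globally() emits one mean per window. */
  static Window<Integer> meanWindow(int windowMinutes) {
    return Window.<Integer>into(FixedWindows.of(Duration.standardMinutes(windowMinutes)));
  }
}
```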
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/injector/InjectorUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game.injector;
18 |
19 | import static com.google.common.base.Preconditions.checkNotNull;
20 |
21 | import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
22 | import com.google.api.client.googleapis.json.GoogleJsonResponseException;
23 | import com.google.api.client.googleapis.util.Utils;
24 | import com.google.api.client.http.HttpRequestInitializer;
25 | import com.google.api.client.http.HttpStatusCodes;
26 | import com.google.api.client.http.HttpTransport;
27 | import com.google.api.client.json.JsonFactory;
28 | import com.google.api.services.pubsub.Pubsub;
29 | import com.google.api.services.pubsub.PubsubScopes;
30 | import com.google.api.services.pubsub.model.Topic;
31 | import java.io.IOException;
32 |
33 | class InjectorUtils {
34 |
35 | private static final String APP_NAME = "injector";
36 |
37 | /**
38 | * Builds a new Pubsub client and returns it.
39 | */
40 | public static Pubsub getClient(final HttpTransport httpTransport, final JsonFactory jsonFactory)
41 | throws IOException {
42 | checkNotNull(httpTransport);
43 | checkNotNull(jsonFactory);
44 | GoogleCredential credential =
45 | GoogleCredential.getApplicationDefault(httpTransport, jsonFactory);
46 | if (credential.createScopedRequired()) {
47 | credential = credential.createScoped(PubsubScopes.all());
48 | }
49 | if (credential.getClientAuthentication() != null) {
50 | System.out.println(
51 | "\n***Warning! You are not using service account credentials to "
52 | + "authenticate.\nYou need to use service account credentials for this example,"
53 | + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run "
54 | + "out of PubSub quota very quickly.\nSee "
55 | + "https://developers.google.com/identity/protocols/application-default-credentials.");
56 | System.exit(1);
57 | }
58 | HttpRequestInitializer initializer = new RetryHttpInitializerWrapper(credential);
59 | return new Pubsub.Builder(httpTransport, jsonFactory, initializer)
60 | .setApplicationName(APP_NAME)
61 | .build();
62 | }
63 |
64 | /**
65 | * Builds a new Pubsub client with default HttpTransport and JsonFactory and returns it.
66 | */
67 | public static Pubsub getClient() throws IOException {
68 | return getClient(Utils.getDefaultTransport(), Utils.getDefaultJsonFactory());
69 | }
70 |
71 | /**
72 | * Returns the fully qualified topic name for Pub/Sub.
73 | */
74 | public static String getFullyQualifiedTopicName(final String project, final String topic) {
75 | return String.format("projects/%s/topics/%s", project, topic);
76 | }
77 |
78 | /**
79 | * Create a topic if it doesn't exist.
80 | */
81 | public static void createTopic(Pubsub client, String fullTopicName) throws IOException {
82 | try {
83 | client.projects().topics().get(fullTopicName).execute();
84 | } catch (GoogleJsonResponseException e) {
85 | if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) {
86 | Topic topic = client.projects().topics().create(fullTopicName, new Topic()).execute();
87 | System.out.printf("Topic %s was created.\n", topic.getName());
88 | } else { throw e; } // Surface errors other than NOT_FOUND instead of swallowing them.
89 | }
90 | }
91 | }
92 |
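`createTopic` above is idempotent: it probes for the topic and creates it only on a 404. A hypothetical caller looks like this ("my-project" and "game-events" are placeholders; the class must live in the same package, since `InjectorUtils` is package-private):

```java
// Hypothetical usage of InjectorUtils; project and topic names are made up.
package org.apache.beam.examples.complete.game.injector;

import com.google.api.services.pubsub.Pubsub;
import java.io.IOException;

class InjectorUtilsDemo {
  public static void main(String[] args) throws IOException {
    Pubsub client = InjectorUtils.getClient();  // application-default credentials
    String topic = InjectorUtils.getFullyQualifiedTopicName("my-project", "game-events");
    InjectorUtils.createTopic(client, topic);   // no-op if the topic already exists
  }
}
```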
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/injector/RetryHttpInitializerWrapper.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5 | * in compliance with the License. You may obtain a copy of the License at
6 | *
7 | * http://www.apache.org/licenses/LICENSE-2.0
8 | *
9 | * Unless required by applicable law or agreed to in writing, software distributed under the License
10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 | * or implied. See the License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 |
15 | package org.apache.beam.examples.complete.game.injector;
16 |
17 | import static com.google.common.base.Preconditions.checkNotNull;
18 |
19 | import com.google.api.client.auth.oauth2.Credential;
20 | import com.google.api.client.http.HttpBackOffIOExceptionHandler;
21 | import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler;
22 | import com.google.api.client.http.HttpRequest;
23 | import com.google.api.client.http.HttpRequestInitializer;
24 | import com.google.api.client.http.HttpResponse;
25 | import com.google.api.client.http.HttpUnsuccessfulResponseHandler;
26 | import com.google.api.client.util.ExponentialBackOff;
27 | import com.google.api.client.util.Sleeper;
28 | import java.io.IOException;
29 | import java.util.logging.Logger;
30 |
31 | /**
32 | * RetryHttpInitializerWrapper will automatically retry upon RPC failures, preserving the
33 | * auto-refresh behavior of the Google Credentials.
34 | */
35 | public class RetryHttpInitializerWrapper implements HttpRequestInitializer {
36 |
37 | /**
38 | * A private logger.
39 | */
40 | private static final Logger LOG = Logger.getLogger(RetryHttpInitializerWrapper.class.getName());
41 |
42 | /**
43 | * One minute in milliseconds.
44 | */
45 | private static final int ONE_MINUTE_MILLIS = 60000;
46 |
47 | /**
48 | * Intercepts the request for filling in the "Authorization" header field, as well as recovering
49 | * from certain unsuccessful error codes wherein the Credential must refresh its token for a
50 | * retry.
51 | */
52 | private final Credential wrappedCredential;
53 |
54 | /**
55 | * A sleeper; you can replace it with a mock in your test.
56 | */
57 | private final Sleeper sleeper;
58 |
59 | /**
60 | * A constructor.
61 | *
62 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header.
63 | */
64 | public RetryHttpInitializerWrapper(final Credential wrappedCredential) {
65 | this(wrappedCredential, Sleeper.DEFAULT);
66 | }
67 |
68 | /**
69 | * A protected constructor only for testing.
70 | *
71 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header.
72 | * @param sleeper Sleeper for easy testing.
73 | */
74 | RetryHttpInitializerWrapper(final Credential wrappedCredential, final Sleeper sleeper) {
75 | this.wrappedCredential = checkNotNull(wrappedCredential);
76 | this.sleeper = sleeper;
77 | }
78 |
79 | /**
80 | * Initializes the given request.
81 | */
82 | @Override
83 | public final void initialize(final HttpRequest request) {
84 | request.setReadTimeout(2 * ONE_MINUTE_MILLIS); // 2-minute read timeout
85 | final HttpUnsuccessfulResponseHandler backoffHandler =
86 | new HttpBackOffUnsuccessfulResponseHandler(new ExponentialBackOff()).setSleeper(sleeper);
87 | request.setInterceptor(wrappedCredential);
88 | request.setUnsuccessfulResponseHandler(
89 | new HttpUnsuccessfulResponseHandler() {
90 | @Override
91 | public boolean handleResponse(
92 | final HttpRequest request, final HttpResponse response, final boolean supportsRetry)
93 | throws IOException {
94 | if (wrappedCredential.handleResponse(request, response, supportsRetry)) {
95 | // If credential decides it can handle it,
96 | // the return code or message indicated
97 | // something specific to authentication,
98 | // and no backoff is desired.
99 | return true;
100 | } else if (backoffHandler.handleResponse(request, response, supportsRetry)) {
101 | // Otherwise, we defer to the judgement of
102 | // our internal backoff handler.
103 | LOG.info("Retrying " + request.getUrl().toString());
104 | return true;
105 | } else {
106 | return false;
107 | }
108 | }
109 | });
110 | request.setIOExceptionHandler(
111 | new HttpBackOffIOExceptionHandler(new ExponentialBackOff()).setSleeper(sleeper));
112 | }
113 | }
114 |
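The wrapper composes three pieces per request: the credential as interceptor (fills in the Authorization header), a response handler that lets the credential deal with auth errors before falling back to exponential backoff, and an I/O-exception handler with its own backoff. Wiring it into any google-api-client service builder looks roughly like the sketch below (`InjectorUtils.getClient` above does the same thing):

```java
// Sketch: plug the retrying initializer into a google-api-client builder.
import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
import com.google.api.client.googleapis.util.Utils;
import com.google.api.services.pubsub.Pubsub;
import com.google.api.services.pubsub.PubsubScopes;
import java.io.IOException;

class RetryWrapperDemo {
  static Pubsub newRetryingClient() throws IOException {
    GoogleCredential credential =
        GoogleCredential.getApplicationDefault().createScoped(PubsubScopes.all());
    return new Pubsub.Builder(
            Utils.getDefaultTransport(),
            Utils.getDefaultJsonFactory(),
            new RetryHttpInitializerWrapper(credential))
        .setApplicationName("injector")
        .build();
  }
}
```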
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise1.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game.solutions;
18 |
19 | import com.google.api.services.bigquery.model.TableFieldSchema;
20 | import com.google.api.services.bigquery.model.TableReference;
21 | import com.google.api.services.bigquery.model.TableRow;
22 | import com.google.api.services.bigquery.model.TableSchema;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
26 | import org.apache.beam.examples.complete.game.utils.Options;
27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
28 | import org.apache.beam.sdk.Pipeline;
29 | import org.apache.beam.sdk.PipelineResult;
30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
31 | import org.apache.beam.sdk.io.TextIO;
32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
35 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
36 | import org.apache.beam.sdk.transforms.DoFn;
37 | import org.apache.beam.sdk.transforms.MapElements;
38 | import org.apache.beam.sdk.transforms.PTransform;
39 | import org.apache.beam.sdk.transforms.ParDo;
40 | import org.apache.beam.sdk.transforms.Sum;
41 | import org.apache.beam.sdk.values.KV;
42 | import org.apache.beam.sdk.values.PCollection;
43 | import org.apache.beam.sdk.values.TypeDescriptors;
44 |
45 | /**
46 | * First in a series of coding exercises in a gaming domain.
47 | *
48 | * This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data
49 | * and writes the sums to BigQuery.
50 | *
51 | * <p>See README.md for details.
52 | */
53 | public class Exercise1 {
54 |
55 | /**
56 | * A transform to extract key/score information from GameEvent, and sum
57 | * the scores. The constructor arg determines whether 'team' or 'user' info is
58 | * extracted.
59 | */
60 | public static class ExtractAndSumScore
61 | extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
62 |
63 | private final String field;
64 |
65 | public ExtractAndSumScore(String field) {
66 | this.field = field;
67 | }
68 |
69 | @Override
70 | public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> gameEvents) {
71 | return gameEvents
72 | .apply(ParDo.of(new DoFn<GameEvent, KV<String, Integer>>() {
73 | @ProcessElement
74 | public void processElement(ProcessContext c) {
75 | GameEvent event = c.element();
76 | c.output(KV.of(event.getKey(field), event.getScore()));
77 | }
78 | }))
79 | /*
80 | // alternate implementation
81 | .apply(MapElements
82 | .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
83 | .via((GameEvent event) -> KV.of(event.getKey(field),
84 | event.getScore()))) */
85 | .apply(Sum.integersPerKey());
86 | }
87 | }
88 |
89 | /**
90 | * Format a KV of user and their score to a BigQuery TableRow.
91 | */
92 | static class FormatUserScoreSumsFn extends DoFn<KV<String, Integer>, TableRow> {
93 |
94 | @ProcessElement
95 | public void processElement(ProcessContext c) {
96 | TableRow row = new TableRow()
97 | .set("user", c.element().getKey())
98 | .set("total_score", c.element().getValue());
99 | c.output(row);
100 | }
101 |
102 | /**
103 | * Defines the BigQuery schema.
104 | */
105 | static TableSchema getSchema() {
106 | List<TableFieldSchema> fields = new ArrayList<>();
107 | fields.add(new TableFieldSchema().setName("user").setType("STRING"));
108 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
109 | return new TableSchema().setFields(fields);
110 | }
111 | }
112 |
113 | /**
114 | * Run a batch pipeline.
115 | */
116 | public static void main(String[] args) throws Exception {
117 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
118 | Pipeline pipeline = Pipeline.create(options);
119 |
120 | TableReference tableRef = new TableReference();
121 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
122 | tableRef.setProjectId(options.as(GcpOptions.class).getProject());
123 | tableRef.setTableId(options.getOutputTableName());
124 |
125 | // Read events from a CSV file and parse them.
126 | pipeline
127 | .apply(TextIO.read().from(options.getInput()))
128 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
129 | // Extract and sum username/score pairs from the event data.
130 | .apply("ExtractUserScore", new ExtractAndSumScore("user"))
131 | // Write the results to BigQuery.
132 | .apply("FormatUserScoreSums", ParDo.of(new FormatUserScoreSumsFn()))
133 | .apply(
134 | BigQueryIO.writeTableRows().to(tableRef)
135 | .withSchema(FormatUserScoreSumsFn.getSchema())
136 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
137 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
138 |
139 | PipelineResult result = pipeline.run();
140 | result.waitUntilFinish();
141 | }
142 | }
143 |
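The heart of `ExtractAndSumScore` is `Sum.integersPerKey()`, which groups a `PCollection<KV<K, Integer>>` by key and sums the values. A self-contained sketch of that behavior on the DirectRunner (no GCP resources needed; the data is made up):

```java
// Runs locally; the KVs stand in for (user, score) pairs.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

class SumPerKeyDemo {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    PCollection<KV<String, Integer>> sums =
        p.apply(Create.of(KV.of("alice", 5), KV.of("bob", 3), KV.of("alice", 7)))
         .apply(Sum.integersPerKey());
    // alice's two scores collapse into one sum; bob keeps his single score.
    PAssert.that(sums).containsInAnyOrder(KV.of("alice", 12), KV.of("bob", 3));
    p.run().waitUntilFinish();
  }
}
```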
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise2.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game.solutions;
18 |
19 | import com.google.api.services.bigquery.model.TableFieldSchema;
20 | import com.google.api.services.bigquery.model.TableReference;
21 | import com.google.api.services.bigquery.model.TableRow;
22 | import com.google.api.services.bigquery.model.TableSchema;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
26 | import org.apache.beam.examples.complete.game.utils.Options;
27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
28 | import org.apache.beam.sdk.Pipeline;
29 | import org.apache.beam.sdk.PipelineResult;
30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
31 | import org.apache.beam.sdk.io.TextIO;
32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
35 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
36 | import org.apache.beam.sdk.transforms.DoFn;
37 | import org.apache.beam.sdk.transforms.PTransform;
38 | import org.apache.beam.sdk.transforms.ParDo;
39 | import org.apache.beam.sdk.transforms.WithTimestamps;
40 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
41 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
42 | import org.apache.beam.sdk.transforms.windowing.Window;
43 | import org.apache.beam.sdk.values.KV;
44 | import org.apache.beam.sdk.values.PCollection;
45 | import org.joda.time.Duration;
46 | import org.joda.time.Instant;
47 |
48 | /**
49 | * Second in a series of coding exercises in a gaming domain.
50 | *
51 | * This batch pipeline calculates the sum of scores per team per hour, over an entire batch of
52 | * gaming data and writes the per-team sums to BigQuery.
53 | *
54 | * <p>See README.md for details.
55 | */
56 | public class Exercise2 {
57 |
58 | /**
59 | * A transform to compute the WindowedTeamScore.
60 | */
61 | public static class WindowedTeamScore
62 | extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
63 |
64 | private Duration duration;
65 |
66 | public WindowedTeamScore(Duration duration) {
67 | this.duration = duration;
68 | }
69 |
70 | @Override
71 | public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
72 | return input
73 | .apply(Window.into(FixedWindows.of(duration)))
74 | .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"));
75 | }
76 | }
77 |
78 | /**
79 | * Format a KV of team and their score to a BigQuery TableRow.
80 | */
81 | public static class FormatTeamScoreSumsFn extends DoFn<KV<String, Integer>, TableRow> {
82 |
83 | @ProcessElement
84 | public void processElement(ProcessContext c, IntervalWindow window) {
85 | TableRow row =
86 | new TableRow()
87 | .set("team", c.element().getKey())
88 | .set("total_score", c.element().getValue())
89 | .set("window_start", window.start().getMillis() / 1000);
90 | c.output(row);
91 | }
92 |
93 | /**
94 | * Defines the BigQuery schema.
95 | */
96 | public static TableSchema getSchema() {
97 | List<TableFieldSchema> fields = new ArrayList<>();
98 | fields.add(new TableFieldSchema().setName("team").setType("STRING"));
99 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
100 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
101 | return new TableSchema().setFields(fields);
102 | }
103 | }
104 |
105 | /**
106 | * Run a batch pipeline.
107 | */
108 | public static void main(String[] args) throws Exception {
109 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
110 | Pipeline pipeline = Pipeline.create(options);
111 |
112 | TableReference tableRef = new TableReference();
113 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
114 | tableRef.setProjectId(options.as(GcpOptions.class).getProject());
115 | tableRef.setTableId(options.getOutputTableName());
116 |
117 | // Read events from a CSV file and parse them.
118 | pipeline
119 | .apply(TextIO.read().from(options.getInput()))
120 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
121 | .apply(
122 | "AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())))
123 | .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60)))
124 | // Write the results to BigQuery.
125 | .apply("FormatTeamScoreSums", ParDo.of(new FormatTeamScoreSumsFn()))
126 | .apply(
127 | BigQueryIO.writeTableRows().to(tableRef)
128 | .withSchema(FormatTeamScoreSumsFn.getSchema())
129 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
130 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
131 |
132 | PipelineResult result = pipeline.run();
133 | result.waitUntilFinish();
134 | }
135 | }
136 |
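`WindowedTeamScore` relies on `Window.into(FixedWindows.of(duration))` to partition event time into `[start, start + duration)` intervals before summing, so one key can produce one sum per hour. A sketch of that behavior with hand-timestamped elements (the timestamps and scores are made up):

```java
// Sketch: fixed one-hour windows split the same key's sum by event time.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TimestampedValue;
import org.joda.time.Duration;
import org.joda.time.Instant;

class FixedWindowDemo {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    Instant base = new Instant(0);
    PCollection<KV<String, Integer>> sums =
        p.apply(Create.timestamped(
                TimestampedValue.of(KV.of("red", 3), base),
                TimestampedValue.of(KV.of("red", 4), base.plus(Duration.standardMinutes(10))),
                // This event falls in the next hour, so it is summed separately.
                TimestampedValue.of(KV.of("red", 5), base.plus(Duration.standardMinutes(70)))))
            .apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.standardHours(1))))
            .apply(Sum.integersPerKey());
    PAssert.that(sums).containsInAnyOrder(KV.of("red", 7), KV.of("red", 5));
    p.run().waitUntilFinish();
  }
}
```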
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise3.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game.solutions;
18 |
19 | import com.google.api.services.bigquery.model.TableReference;
20 | import org.apache.beam.examples.complete.game.utils.GameEvent;
21 | import org.apache.beam.examples.complete.game.utils.Options;
22 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
23 | import org.apache.beam.sdk.Pipeline;
24 | import org.apache.beam.sdk.PipelineResult;
25 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
26 | import org.apache.beam.sdk.io.TextIO;
27 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
28 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
29 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
30 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
31 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
32 | import org.apache.beam.sdk.transforms.PTransform;
33 | import org.apache.beam.sdk.transforms.ParDo;
34 | import org.apache.beam.sdk.transforms.WithTimestamps;
35 | import org.apache.beam.sdk.values.PBegin;
36 | import org.apache.beam.sdk.values.PCollection;
37 | import org.joda.time.Duration;
38 | import org.joda.time.Instant;
39 |
40 | /**
41 | * Third in a series of coding exercises in a gaming domain.
42 | *
43 | * This is the same pipeline as in Exercise 2, but can run in either batch or streaming mode.
44 | *
45 | * <p>See README.md for details.
46 | */
47 | public class Exercise3 {
48 |
49 | /**
50 | * A transform to read the game events from either text files or Pub/Sub topic.
51 | */
52 | public static class ReadGameEvents extends PTransform<PBegin, PCollection<GameEvent>> {
53 |
54 | private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
55 |
56 | private Options options;
57 |
58 | public ReadGameEvents(Options options) {
59 | this.options = options;
60 | }
61 |
62 | @Override
63 | public PCollection<GameEvent> expand(PBegin begin) {
64 | if (options.getInput() != null && !options.getInput().isEmpty()) {
65 | return begin
66 | .getPipeline()
67 | .apply(TextIO.read().from(options.getInput()))
68 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
69 | .apply(
70 | "AddEventTimestamps",
71 | WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())));
72 | } else {
73 | return begin
74 | .getPipeline()
75 | .apply(PubsubIO.readStrings().withTimestampAttribute(TIMESTAMP_ATTRIBUTE)
76 | .fromTopic(options.getTopic()))
77 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));
78 | }
79 | }
80 | }
81 |
82 | /**
83 | * Run a batch or streaming pipeline.
84 | */
85 | public static void main(String[] args) throws Exception {
86 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
87 |
88 | Pipeline pipeline = Pipeline.create(options);
89 |
90 | TableReference tableRef = new TableReference();
91 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
92 | tableRef.setProjectId(options.as(GcpOptions.class).getProject());
93 | tableRef.setTableId(options.getOutputTableName());
94 |
95 | // Read events from either a CSV file or PubSub stream.
96 | pipeline
97 | .apply(new ReadGameEvents(options))
98 | .apply("WindowedTeamScore", new Exercise2.WindowedTeamScore(Duration.standardMinutes(5)))
99 | // Write the results to BigQuery.
100 | .apply("FormatTeamScoreSums", ParDo.of(new Exercise2.FormatTeamScoreSumsFn()))
101 | .apply(
102 | BigQueryIO.writeTableRows().to(tableRef)
103 | .withSchema(Exercise2.FormatTeamScoreSumsFn.getSchema())
104 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
105 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
106 |
107 | PipelineResult result = pipeline.run();
108 | result.waitUntilFinish();
109 | }
110 | }
111 |
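In the streaming branch, `withTimestampAttribute(TIMESTAMP_ATTRIBUTE)` tells `PubsubIO` to take each element's event time from a message attribute rather than from the publish time. The producer therefore has to stamp that attribute on every message. A sketch of the implied contract, using the same google-api-services-pubsub model classes as the injector (the helper name is hypothetical):

```java
// Sketch: a Pub/Sub message whose "timestamp_ms" attribute carries event time,
// matching PubsubIO.readStrings().withTimestampAttribute("timestamp_ms").
import com.google.api.services.pubsub.model.PubsubMessage;
import com.google.common.collect.ImmutableMap;
import java.nio.charset.StandardCharsets;

class TimestampedMessageSketch {
  static PubsubMessage gameEventMessage(String csvLine, long eventTimeMillis) {
    return new PubsubMessage()
        .encodeData(csvLine.getBytes(StandardCharsets.UTF_8))
        .setAttributes(ImmutableMap.of("timestamp_ms", Long.toString(eventTimeMillis)));
  }
}
```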
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise4.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 |
17 | package org.apache.beam.examples.complete.game.solutions;
18 |
19 | import com.google.api.services.bigquery.model.TableFieldSchema;
20 | import com.google.api.services.bigquery.model.TableReference;
21 | import com.google.api.services.bigquery.model.TableRow;
22 | import com.google.api.services.bigquery.model.TableSchema;
23 | import com.google.common.annotations.VisibleForTesting;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
27 | import org.apache.beam.examples.complete.game.utils.Options;
28 | import org.apache.beam.runners.dataflow.DataflowRunner;
29 | import org.apache.beam.sdk.Pipeline;
30 | import org.apache.beam.sdk.PipelineResult;
31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
35 | import org.apache.beam.sdk.options.Default;
36 | import org.apache.beam.sdk.options.Description;
37 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
38 | import org.apache.beam.sdk.options.StreamingOptions;
39 | import org.apache.beam.sdk.transforms.DoFn;
40 | import org.apache.beam.sdk.transforms.PTransform;
41 | import org.apache.beam.sdk.transforms.ParDo;
42 | import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
43 | import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
44 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
45 | import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
46 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
47 | import org.apache.beam.sdk.transforms.windowing.Repeatedly;
48 | import org.apache.beam.sdk.transforms.windowing.Window;
49 | import org.apache.beam.sdk.values.KV;
50 | import org.apache.beam.sdk.values.PCollection;
51 | import org.joda.time.Duration;
52 | import org.joda.time.Instant;
53 |
54 | /**
55 | * Fourth in a series of coding exercises in a gaming domain.
56 | *
57 | * This streaming pipeline calculates user and team scores for a window of time and writes them
58 | * to BigQuery.
59 | *
60 | * <p>See README.md for details.
61 | */
62 | public class Exercise4 {
63 |
64 | static final Duration TEN_SECONDS = Duration.standardSeconds(10);
65 | static final Duration THIRTY_SECONDS = Duration.standardSeconds(30);
66 |
67 | /**
68 | * Exercise4Options supported by {@link Exercise4}.
69 | */
70 | interface Exercise4Options extends Options, StreamingOptions {
71 |
72 | @Description("Numeric value of fixed window duration for team analysis, in minutes")
73 | @Default.Integer(1)
74 | Integer getTeamWindowDuration();
75 |
76 | void setTeamWindowDuration(Integer value);
77 |
78 | @Description("Numeric value of allowed data lateness, in minutes")
79 | @Default.Integer(2)
80 | Integer getAllowedLateness();
81 |
82 | void setAllowedLateness(Integer value);
83 | }
84 |
85 | /**
86 | * Extract user/score pairs from the event stream using processing time, via global windowing. Get
87 | * periodic updates on all users' running scores.
88 | */
89 | @VisibleForTesting
90 | static class CalculateUserScores
91 | extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
92 |
93 | private final Duration allowedLateness;
94 |
95 | CalculateUserScores(Duration allowedLateness) {
96 | this.allowedLateness = allowedLateness;
97 | }
98 |
99 | @Override
100 | public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
101 | return input
102 | .apply(
103 | "LeaderboardUserGlobalWindow",
104 | Window.into(new GlobalWindows())
105 | // Get periodic results every 30 seconds.
106 | .triggering(
107 | Repeatedly.forever(
108 | AfterProcessingTime.pastFirstElementInPane().plusDelayOf(THIRTY_SECONDS)))
109 | .accumulatingFiredPanes()
110 | .withAllowedLateness(allowedLateness))
111 | // Extract and sum username/score pairs from the event data.
112 | .apply("ExtractUserScore", new Exercise1.ExtractAndSumScore("user"));
113 | }
114 | }
115 |
116 | /**
117 | * Calculates scores for each team within the configured window duration.
118 | */
119 | // Extract team/score pairs from the event stream, using hour-long windows by default.
120 | @VisibleForTesting
121 | static class CalculateTeamScores
122 | extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
123 |
124 | private final Duration teamWindowDuration;
125 | private final Duration allowedLateness;
126 |
127 | CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) {
128 | this.teamWindowDuration = teamWindowDuration;
129 | this.allowedLateness = allowedLateness;
130 | }
131 |
132 | @Override
133 | public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> infos) {
134 | return infos
135 | .apply(
136 | "LeaderboardTeamFixedWindows",
137 | Window.into(FixedWindows.of(teamWindowDuration))
138 | // We will get early (speculative) results as well as cumulative
139 | // processing of late data.
140 | .triggering(
141 | AfterWatermark.pastEndOfWindow()
142 | .withEarlyFirings(
143 | AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_SECONDS))
144 | .withLateFirings(
145 | AfterProcessingTime.pastFirstElementInPane()
146 | .plusDelayOf(THIRTY_SECONDS)))
147 | .withAllowedLateness(allowedLateness)
148 | .accumulatingFiredPanes())
149 | // Extract and sum teamname/score pairs from the event data.
150 | .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"));
151 | }
152 | }
153 |
154 | public static void main(String[] args) throws Exception {
155 | Exercise4Options options =
156 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise4Options.class);
157 | // Enforce that this pipeline is always run in streaming mode.
158 | options.setStreaming(true);
159 | // For example purposes, allow the pipeline to be easily cancelled instead of running
160 | // continuously.
161 | options.setRunner(DataflowRunner.class);
162 | Pipeline pipeline = Pipeline.create(options);
163 |
164 | TableReference teamTable = new TableReference();
165 | teamTable.setDatasetId(options.getOutputDataset());
166 | teamTable.setProjectId(options.as(GcpOptions.class).getProject());
167 | teamTable.setTableId(options.getOutputTableName() + "_team");
168 |
169 | TableReference userTable = new TableReference();
170 | userTable.setDatasetId(options.getOutputDataset());
171 | userTable.setProjectId(options.as(GcpOptions.class).getProject());
172 | userTable.setTableId(options.getOutputTableName() + "_user");
173 |
174 | PCollection<GameEvent> gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));
175 |
176 | gameEvents
177 | .apply(
178 | "CalculateTeamScores",
179 | new CalculateTeamScores(
180 | Duration.standardMinutes(options.getTeamWindowDuration()),
181 | Duration.standardMinutes(options.getAllowedLateness())))
182 | // Write the results to BigQuery.
183 | .apply("FormatTeamScores", ParDo.of(new FormatTeamScoreFn()))
184 | .apply(
185 | BigQueryIO.writeTableRows().to(teamTable)
186 | .withSchema(FormatTeamScoreFn.getSchema())
187 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
188 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
189 |
190 | gameEvents
191 | .apply(
192 | "CalculateUserScores",
193 | new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
194 | // Write the results to BigQuery.
195 | .apply("FormatUserScores", ParDo.of(new FormatUserScoreFn()))
196 | .apply(
197 | BigQueryIO.writeTableRows().to(userTable)
198 | .withSchema(FormatUserScoreFn.getSchema())
199 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
200 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
201 |
202 | PipelineResult result = pipeline.run();
203 | result.waitUntilFinish();
204 | }
205 |
206 | /**
207 | * Format a KV of team and associated properties to a BigQuery TableRow.
208 | */
209 | protected static class FormatTeamScoreFn extends DoFn<KV<String, Integer>, TableRow> {
210 |
211 | @ProcessElement
212 | public void processElement(ProcessContext c, IntervalWindow window) {
213 | TableRow row =
214 | new TableRow()
215 | .set("team", c.element().getKey())
216 | .set("total_score", c.element().getValue())
217 | .set("window_start", window.start().getMillis() / 1000)
218 | .set("processing_time", Instant.now().getMillis() / 1000)
219 | .set("timing", c.pane().getTiming().toString());
220 | c.output(row);
221 | }
222 |
223 | static TableSchema getSchema() {
224 | List<TableFieldSchema> fields = new ArrayList<>();
225 | fields.add(new TableFieldSchema().setName("team").setType("STRING"));
226 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
227 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
228 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
229 | fields.add(new TableFieldSchema().setName("timing").setType("STRING"));
230 | return new TableSchema().setFields(fields);
231 | }
232 | }
233 |
234 | /**
235 | * Format a KV of user and associated properties to a BigQuery TableRow.
236 | */
237 | static class FormatUserScoreFn extends DoFn<KV<String, Integer>, TableRow> {
238 |
239 | @ProcessElement
240 | public void processElement(ProcessContext c) {
241 | TableRow row =
242 | new TableRow()
243 | .set("user", c.element().getKey())
244 | .set("total_score", c.element().getValue())
245 | .set("processing_time", Instant.now().getMillis() / 1000);
246 | c.output(row);
247 | }
248 |
249 | static TableSchema getSchema() {
250 | List<TableFieldSchema> fields = new ArrayList<>();
251 | fields.add(new TableFieldSchema().setName("user").setType("STRING"));
252 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
253 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
254 | return new TableSchema().setFields(fields);
255 | }
256 | }
257 | }
258 |
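Both transforms above use `accumulatingFiredPanes()`, so every early, on-time, or late firing reports the cumulative sum for the window so far. The alternative is `discardingFiredPanes()`, where each pane carries only the delta since the previous firing. A sketch of the same trigger in that mode (not what this exercise uses; durations are placeholders):

```java
// Sketch: CalculateTeamScores' trigger, but emitting per-pane deltas.
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.joda.time.Duration;

class DiscardingPanesSketch {
  static Window<KV<String, Integer>> deltasPerPane(
      Duration windowDuration, Duration allowedLateness) {
    return Window.<KV<String, Integer>>into(FixedWindows.of(windowDuration))
        .triggering(
            AfterWatermark.pastEndOfWindow()
                .withEarlyFirings(
                    AfterProcessingTime.pastFirstElementInPane()
                        .plusDelayOf(Duration.standardSeconds(10)))
                .withLateFirings(
                    AfterProcessingTime.pastFirstElementInPane()
                        .plusDelayOf(Duration.standardSeconds(30))))
        .withAllowedLateness(allowedLateness)
        // Each firing reports only scores that arrived since the last pane.
        .discardingFiredPanes();
  }
}
```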
--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise5.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2015 Google Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 | * use this file except in compliance with the License. You may obtain a copy of
6 | * the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | * License for the specific language governing permissions and limitations under
14 | * the License.
15 | */
16 | package org.apache.beam.examples.complete.game.solutions;
17 |
18 | import com.google.api.services.bigquery.model.TableFieldSchema;
19 | import com.google.api.services.bigquery.model.TableReference;
20 | import com.google.api.services.bigquery.model.TableRow;
21 | import com.google.api.services.bigquery.model.TableSchema;
22 | import java.util.ArrayList;
23 | import java.util.List;
24 | import java.util.Map;
25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
26 | import org.apache.beam.examples.complete.game.utils.Options;
27 | import org.apache.beam.runners.dataflow.DataflowRunner;
28 | import org.apache.beam.sdk.Pipeline;
29 | import org.apache.beam.sdk.PipelineResult;
30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
31 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
34 | import org.apache.beam.sdk.metrics.Counter;
35 | import org.apache.beam.sdk.metrics.Metrics;
36 | import org.apache.beam.sdk.options.Default;
37 | import org.apache.beam.sdk.options.Description;
38 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
39 | import org.apache.beam.sdk.options.StreamingOptions;
40 | import org.apache.beam.sdk.transforms.DoFn;
41 | import org.apache.beam.sdk.transforms.MapElements;
42 | import org.apache.beam.sdk.transforms.Mean;
43 | import org.apache.beam.sdk.transforms.PTransform;
44 | import org.apache.beam.sdk.transforms.ParDo;
45 | import org.apache.beam.sdk.transforms.Sum;
46 | import org.apache.beam.sdk.transforms.Values;
47 | import org.apache.beam.sdk.transforms.View;
48 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
49 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
50 | import org.apache.beam.sdk.transforms.windowing.Window;
51 | import org.apache.beam.sdk.values.KV;
52 | import org.apache.beam.sdk.values.PCollection;
53 | import org.apache.beam.sdk.values.PCollectionView;
54 | import org.apache.beam.sdk.values.TypeDescriptors;
55 | import org.joda.time.Duration;
56 | import org.joda.time.Instant;
57 | import org.slf4j.Logger;
58 | import org.slf4j.LoggerFactory;
59 |
60 | /**
61 | * Fifth in a series of coding exercises in a gaming domain.
62 | *
63 | * This exercise introduces side inputs.
64 | *
65 | * <p>See README.md for details.
66 | */
67 | public class Exercise5 {
68 |
69 | private static final Logger LOG = LoggerFactory.getLogger(Exercise5.class);
70 |
71 | /**
72 | * Filter out all but those users with a high click rate, which we will consider 'spammy'
73 | * users. We do this by finding the mean total score per user, then using that information as
74 | * a side input to filter out all but those user scores that are > (mean * SCORE_WEIGHT).
75 | */
76 | public static class CalculateSpammyUsers
77 | extends PTransform<PCollection<KV<String, Integer>>, PCollection<KV<String, Integer>>> {
78 |
79 | private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class);
80 | private static final double SCORE_WEIGHT = 2.5;
81 |
82 | @Override
83 | public PCollection<KV<String, Integer>> expand(PCollection<KV<String, Integer>> userScores) {
84 |
85 | // Get the sum of scores for each user.
86 | PCollection<KV<String, Integer>> sumScores =
87 | userScores.apply("UserSum", Sum.integersPerKey());
88 |
89 | // Extract the score from each element, and use it to find the global mean.
90 | final PCollectionView<Double> globalMeanScore =
91 | sumScores
92 | .apply(Values.create())
93 | .apply(Mean.globally().asSingletonView());
94 |
95 | // Filter the user sums using the global mean.
96 | PCollection> filtered =
97 | sumScores.apply("ProcessAndFilter",
98 | ParDo
99 | // use the derived mean total score as a side input
100 | .of(
101 | new DoFn<KV<String, Integer>, KV<String, Integer>>() {
102 | private final Counter numSpammerUsers = Metrics
103 | .counter("main", "SpammerUsers");
104 |
105 | @ProcessElement
106 | public void processElement(ProcessContext c) {
107 | Integer score = c.element().getValue();
108 | Double gmc = c.sideInput(globalMeanScore);
109 | if (score > (gmc * SCORE_WEIGHT)) {
110 | LOG.info(
111 | "user "
112 | + c.element().getKey()
113 | + " spammer score "
114 | + score
115 | + " with mean "
116 | + gmc);
117 | numSpammerUsers.inc();
118 | c.output(c.element());
119 | }
120 | }
121 | })
122 | .withSideInputs(globalMeanScore));
123 | return filtered;
124 | }
125 | }
126 |
127 | /**
128 | * Calculate and output an element's session duration.
129 | */
130 | private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer> {
131 |
132 | @ProcessElement
133 | public void processElement(ProcessContext c, IntervalWindow w) {
134 | int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes();
135 | c.output(duration);
136 | }
137 | }
138 |
139 | /**
140 | * Options supported by {@link Exercise5}.
141 | */
142 | interface Exercise5Options extends Options, StreamingOptions {
143 |
144 | @Description("Numeric value of fixed window duration for user analysis, in minutes")
145 | @Default.Integer(5)
146 | Integer getFixedWindowDuration();
147 |
148 | void setFixedWindowDuration(Integer value);
149 | }
150 |
151 | public static void main(String[] args) throws Exception {
152 |
153 | Exercise5Options options =
154 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise5Options.class);
155 | // Enforce that this pipeline is always run in streaming mode.
156 | options.setStreaming(true);
157 | options.setRunner(DataflowRunner.class);
158 | Pipeline pipeline = Pipeline.create(options);
159 |
160 | TableReference teamTable = new TableReference();
161 | teamTable.setDatasetId(options.getOutputDataset());
162 | teamTable.setProjectId(options.as(GcpOptions.class).getProject());
163 | teamTable.setTableId(options.getOutputTableName());
164 |
165 | PCollection rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));
166 |
167 | // Extract username/score pairs from the event stream
168 | PCollection<KV<String, Integer>> userEvents =
169 | rawEvents.apply(
170 | "ExtractUserScore",
171 | MapElements
172 | .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
173 | .via((GameEvent gInfo) -> KV.of(gInfo.getUser(),
174 | gInfo.getScore())));
175 |
176 | // Calculate the total score per user over fixed windows, and
177 | // cumulative updates for late data.
178 | final PCollectionView