├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── pom.xml ├── py ├── exercises │ ├── __init__.py │ ├── __init__.pyc │ ├── exercise0.py │ ├── exercise1.py │ ├── exercise1.pyc │ ├── exercise2.py │ ├── exercise3.py │ ├── exercise4.py │ ├── exercise5.py │ ├── exercise6.py │ └── exercise7.py ├── run0.sh ├── run1.sh ├── run2.sh ├── run3.sh ├── run4.sh ├── run5.sh ├── run6.sh ├── run7.sh ├── setup.py ├── solutions │ ├── __init__.py │ ├── __init__.pyc │ ├── exercise0.py │ ├── exercise1.py │ ├── exercise1.pyc │ ├── exercise2.py │ ├── exercise3.py │ ├── exercise4.py │ ├── exercise5.py │ ├── exercise6.py │ └── exercise7.py └── util │ ├── __init__.py │ ├── __init__.pyc │ ├── util.py │ └── util.pyc └── src └── main └── java8 └── org └── apache └── beam └── examples └── complete └── game ├── Exercise0.java ├── Exercise1.java ├── Exercise2.java ├── Exercise3.java ├── Exercise4.java ├── Exercise5.java ├── Exercise6.java ├── Exercise7.java ├── injector ├── Injector.java ├── InjectorUtils.java └── RetryHttpInitializerWrapper.java ├── solutions ├── Exercise1.java ├── Exercise2.java ├── Exercise3.java ├── Exercise4.java ├── Exercise5.java ├── Exercise6.java └── Exercise7.java └── utils ├── ChangeMe.java ├── GameEvent.java ├── Options.java ├── ParseEventFn.java ├── ParsePlayEventFn.java └── PlayEvent.java /.gitattributes: -------------------------------------------------------------------------------- 1 | # The default behavior, which overrides 'core.autocrlf', is to use Git's 2 | # built-in heuristics to determine whether a particular file is text or binary. 3 | # Text files are automatically normalized to the user's platforms. 4 | * text=auto 5 | 6 | # Explicitly declare text files that should always be normalized and converted 7 | # to native line endings. 
8 | .gitattributes text 9 | .gitignore text 10 | LICENSE text 11 | *.avsc text 12 | *.html text 13 | *.java text 14 | *.md text 15 | *.properties text 16 | *.proto text 17 | *.py text 18 | *.sh text 19 | *.xml text 20 | *.yml text 21 | 22 | # Declare files that will always have CRLF line endings on checkout. 23 | # *.sln text eol=crlf 24 | 25 | # Explicitly denote all files that are truly binary and should not be modified. 26 | # *.jpg binary 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | 3 | # Ignore IntelliJ files. 4 | .idea/ 5 | *.iml 6 | *.ipr 7 | *.iws 8 | 9 | # Ignore Eclipse files. 10 | .classpath 11 | .project 12 | .settings/ 13 | 14 | # The build process generates the dependency-reduced POM, but it shouldn't be 15 | # committed. 16 | dependency-reduced-pom.xml 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 20 | 4.0.0 21 | 22 | 23 | com.google.cloud.dataflow 24 | google-cloud-dataflow-java-sdk-parent 25 | 2.4.0 26 | 27 | 28 | com.google.cloud.dataflow 29 | DataflowTutorials 30 | Google Cloud Dataflow Java Tutorials 31 | 32 | 1.0.0 33 | 34 | jar 35 | 36 | 37 | 38 | java8 39 | 40 | [1.8,) 41 | 42 | 43 | 44 | 45 | 46 | 47 | org.codehaus.mojo 48 | build-helper-maven-plugin 49 | 3.0.0 50 | 51 | 52 | add-java8-main-source 53 | initialize 54 | 55 | add-source 56 | 57 | 58 | 59 | ${project.basedir}/src/main/java8 60 | 61 | 62 | 63 | 64 | 65 | add-java8-test-source 66 | initialize 67 | 68 | add-test-source 69 | 70 | 71 | 72 | ${project.basedir}/src/test/java8 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | org.apache.maven.plugins 81 | maven-compiler-plugin 82 | 83 | 84 | 85 | 86 | default-compile 87 | compile 88 | 89 | compile 90 | 91 | 92 | 1.8 93 | 1.8 94 | 8 95 | 96 | 98 | **/*Java8*.java 99 | **/game/**/*.java 100 | 101 | 102 | -Werror 103 | -Xlint:all 104 
| -Xlint:-cast 105 | -Xlint:-deprecation 106 | -Xlint:-processing 107 | -Xlint:-rawtypes 108 | -Xlint:-serial 109 | -Xlint:-try 110 | -Xlint:-unchecked 111 | -Xlint:-varargs 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | maven-shade-plugin 126 | 2.4.1 127 | 128 | 129 | package 130 | 131 | shade 132 | 133 | 134 | ${project.artifactId}-bundled-${project.version} 135 | 136 | 137 | *:* 138 | 139 | 140 | 141 | 142 | *:* 143 | 144 | META-INF/*.SF 145 | META-INF/*.DSA 146 | META-INF/*.RSA 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | org.apache.maven.plugins 156 | maven-compiler-plugin 157 | 158 | 1.8 159 | 1.8 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | com.google.cloud.dataflow 168 | google-cloud-dataflow-java-sdk-all 169 | 2.4.0 170 | 171 | 172 | 173 | com.google.api-client 174 | google-api-client 175 | 176 | 1.22.0 177 | 178 | 180 | 181 | com.google.guava 182 | guava-jdk5 183 | 184 | 185 | 186 | 187 | 188 | org.apache.beam 189 | beam-runners-google-cloud-dataflow-java 190 | 2.4.0 191 | 192 | 193 | 194 | org.apache.avro 195 | avro 196 | 1.8.2 197 | 198 | 199 | 200 | com.google.apis 201 | google-api-services-pubsub 202 | 203 | v1-rev383-1.22.0 204 | 205 | 207 | 208 | com.google.guava 209 | guava-jdk5 210 | 211 | 212 | 213 | 214 | 215 | com.google.guava 216 | guava 217 | 24.1-jre 218 | 219 | 220 | 221 | joda-time 222 | joda-time 223 | 2.9.9 224 | 225 | 226 | 227 | org.slf4j 228 | slf4j-api 229 | 1.7.25 230 | 231 | 232 | 233 | org.slf4j 234 | slf4j-jdk14 235 | 1.7.25 236 | runtime 237 | 238 | 239 | 240 | javax.servlet 241 | javax.servlet-api 242 | 4.0.1 243 | 244 | 245 | 247 | 248 | 249 | org.hamcrest 250 | hamcrest-all 251 | 1.3 252 | 253 | 254 | 255 | junit 256 | junit 257 | 4.12 258 | 259 | 260 | 261 | org.mockito 262 | mockito-all 263 | 1.10.19 264 | test 265 | 266 | 267 | 268 | -------------------------------------------------------------------------------- /py/exercises/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/exercises/__init__.py -------------------------------------------------------------------------------- /py/exercises/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/exercises/__init__.pyc -------------------------------------------------------------------------------- /py/exercises/exercise0.py: -------------------------------------------------------------------------------- 1 | # This batch pipeline imports game events from CSV to BigQuery. 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | import re 6 | 7 | import apache_beam as beam 8 | from apache_beam.io import ReadFromText 9 | from apache_beam.io import WriteToText 10 | from apache_beam.metrics import Metrics 11 | from apache_beam.metrics.metric import MetricsFilter 12 | from apache_beam.options.pipeline_options import PipelineOptions 13 | from apache_beam.options.pipeline_options import SetupOptions 14 | from apache_beam.options.pipeline_options import GoogleCloudOptions 15 | from util.util import GameEvent 16 | from util.util import ParseEvent 17 | from util.util import ParseArgs 18 | 19 | # Defines the BigQuery schema. 
20 | SCHEMA = ('user:STRING,' 'team:STRING,' 'score:INTEGER,' 'timestamp:TIMESTAMP') 21 | 22 | 23 | def FormatEvent(element): 24 | """Format a GameEvent to a BigQuery TableRow.""" 25 | return { 26 | 'user': element.user, 27 | 'team': element.team, 28 | 'score': element.score, 29 | 'timestamp': element.timestamp 30 | } 31 | 32 | 33 | def Run(argv=None): 34 | """Run a batch pipeline.""" 35 | known_args, pipeline_args = ParseArgs(argv) 36 | pipeline_options = PipelineOptions(pipeline_args) 37 | pipeline_options.view_as(SetupOptions).save_main_session = True 38 | p = beam.Pipeline(options=pipeline_options) 39 | 40 | project = pipeline_options.view_as(GoogleCloudOptions).project 41 | # Read events from a CSV file, parse them and write (import) them to BigQuery. 42 | _ = (p 43 | | 'read' >> ReadFromText(known_args.input) 44 | | 'parse' >> beam.FlatMap(ParseEvent) 45 | | 'format' >> beam.Map(FormatEvent) 46 | | beam.io.WriteToBigQuery(known_args.output_tablename, 47 | known_args.output_dataset, project, SCHEMA) 48 | ) 49 | p.run().wait_until_finish() 50 | 51 | 52 | if __name__ == '__main__': 53 | logging.getLogger().setLevel(logging.INFO) 54 | Run() 55 | -------------------------------------------------------------------------------- /py/exercises/exercise1.py: -------------------------------------------------------------------------------- 1 | # This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data and writes the sums to BigQuery. 
2 | from __future__ import absolute_import 3 | 4 | import logging 5 | import re 6 | 7 | import apache_beam as beam 8 | from apache_beam.io import ReadFromText 9 | from apache_beam.io import WriteToText 10 | from apache_beam.metrics import Metrics 11 | from apache_beam.metrics.metric import MetricsFilter 12 | from apache_beam.options.pipeline_options import PipelineOptions 13 | from apache_beam.options.pipeline_options import SetupOptions 14 | from apache_beam.options.pipeline_options import GoogleCloudOptions 15 | from util.util import GameEvent 16 | from util.util import ParseEvent 17 | from util.util import ParseArgs 18 | 19 | # Defines the BigQuery schema. 20 | SCHEMA = ('user:STRING,' 'total_score:INTEGER') 21 | 22 | 23 | class ExtractAndSumScore(beam.PTransform): 24 | """A transform to extract key/score information from GameEvent, and sum 25 | the scores. The constructor arg determines whether 'team' or 'user' info is 26 | extracted.""" 27 | def __init__(self, field): 28 | super(ExtractAndSumScore, self).__init__() 29 | self.field = field 30 | 31 | def expand(self, p): 32 | # [START EXERCISE 1]: 33 | # Developer Docs: https:#beam.apache.org/documentation/programming-guide/#transforms-pardo 34 | # Also: https:#cloud.google.com/dataflow/model/par-do 35 | # 36 | # Fill in the code to: 37 | # 1. Extract a KV from each GameEvent corresponding to the given 38 | # field('user' or 'team') and the score. 39 | # 2. Compute the sum of the scores for each key. 40 | # 3. Run your pipeline on the Dataflow service. 41 | return (p 42 | | 'extract_field' >> beam.Map(ChangeMeFunction) 43 | # Select the appropriate transform to compute the sum over each key. 
44 | | ChangeMeTransform() 45 | ) 46 | # [END EXERCISE 1] 47 | 48 | 49 | def FormatUserScoreSum(element): 50 | """Format a KV of user and their score to a BigQuery TableRow.""" 51 | user, total_score = element 52 | return {'user': user, 'total_score': total_score} 53 | 54 | 55 | def Run(argv=None): 56 | known_args, pipeline_args = ParseArgs(argv) 57 | pipeline_options = PipelineOptions(pipeline_args) 58 | pipeline_options.view_as(SetupOptions).save_main_session = True 59 | p = beam.Pipeline(options=pipeline_options) 60 | 61 | project = pipeline_options.view_as(GoogleCloudOptions).project 62 | # Read events from a CSV file and parse them. 63 | _ = (p 64 | | 'read' >> ReadFromText(known_args.input) 65 | | 'parse' >> beam.FlatMap(ParseEvent) 66 | | 'extract_user_score' >> ExtractAndSumScore('user') 67 | | 'format_user_score_sum' >> beam.Map(FormatUserScoreSum) 68 | | beam.io.WriteToBigQuery(known_args.output_tablename, 69 | known_args.output_dataset, project, SCHEMA) 70 | ) 71 | 72 | p.run().wait_until_finish() 73 | 74 | 75 | if __name__ == '__main__': 76 | logging.getLogger().setLevel(logging.INFO) 77 | Run() 78 | -------------------------------------------------------------------------------- /py/exercises/exercise1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/exercises/exercise1.pyc -------------------------------------------------------------------------------- /py/exercises/exercise2.py: -------------------------------------------------------------------------------- 1 | # This batch pipeline calculates the sum of scores per team per hour, over an 2 | # entire batch of gaming data and writes the per-team sums to BigQuery. 
3 | from __future__ import absolute_import 4 | 5 | import logging 6 | import re 7 | 8 | import apache_beam as beam 9 | from apache_beam.io import ReadFromText 10 | from apache_beam.io import WriteToText 11 | from apache_beam.metrics import Metrics 12 | from apache_beam.metrics.metric import MetricsFilter 13 | from apache_beam.options.pipeline_options import PipelineOptions 14 | from apache_beam.options.pipeline_options import SetupOptions 15 | from apache_beam.options.pipeline_options import GoogleCloudOptions 16 | from util.util import GameEvent 17 | from util.util import ParseEvent 18 | from util.util import ParseArgs 19 | import apache_beam.transforms.window as window 20 | 21 | # Defines the BigQuery schema. 22 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 23 | 24 | 25 | class ExtractAndSumScore(beam.PTransform): 26 | def __init__(self, field): 27 | super(ExtractAndSumScore, self).__init__() 28 | self.field = field 29 | 30 | def expand(self, p): 31 | return (p 32 | |'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score)) 33 | | beam.CombinePerKey(sum) 34 | ) 35 | 36 | 37 | class WindowedTeamScore(beam.PTransform): 38 | """A transform to compute the WindowedTeamScore.""" 39 | def __init__(self, duration): 40 | super(WindowedTeamScore, self).__init__() 41 | self.duration = duration 42 | 43 | def expand(self, p): 44 | # [START EXERCISE 2]: 45 | # Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing 46 | # Also: https://cloud.google.com/dataflow/model/windowing 47 | return (p 48 | # beam.WindowInto takes a WindowFn and returns a PTransform that applies windowing. 49 | # window.FixedWindows returns a WindowFn that assigns elements into fixed-size 50 | # windows. Use these methods to apply windows of size self.duration. 51 | | 'window' >> ChangeMeTransform() 52 | # Use the ExtractAndSumScore to compute the 'team' sum. 
53 | | 'extract_team_score' >> ChangeMeTransform() 54 | ) 55 | # [END EXERCISE 2] 56 | 57 | 58 | class FormatTeamScoreSum(beam.DoFn): 59 | """Format a KV of user and their score to a BigQuery TableRow.""" 60 | def process(self, team_score, window=beam.DoFn.WindowParam): 61 | team, score = team_score 62 | start = int(window.start) 63 | yield { 64 | 'team': team, 65 | 'total_score': score, 66 | 'window_start': start, 67 | } 68 | 69 | 70 | def Run(argv=None): 71 | known_args, pipeline_args = ParseArgs(argv) 72 | pipeline_options = PipelineOptions(pipeline_args) 73 | pipeline_options.view_as(SetupOptions).save_main_session = True 74 | p = beam.Pipeline(options=pipeline_options) 75 | 76 | project = pipeline_options.view_as(GoogleCloudOptions).project 77 | _ = (p 78 | | 'read' >> ReadFromText(known_args.input) 79 | | 'parse' >> beam.FlatMap(ParseEvent) 80 | | 'add_event_timestamps' >> beam.Map( 81 | lambda x: beam.window.TimestampedValue(x, x.timestamp)) 82 | | 'windowed_team_score' >> WindowedTeamScore(60 * 60) 83 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 84 | | beam.io.WriteToBigQuery(known_args.output_tablename, 85 | known_args.output_dataset, project, SCHEMA) 86 | ) 87 | p.run().wait_until_finish() 88 | 89 | 90 | if __name__ == '__main__': 91 | logging.getLogger().setLevel(logging.INFO) 92 | Run() 93 | -------------------------------------------------------------------------------- /py/exercises/exercise3.py: -------------------------------------------------------------------------------- 1 | # This pipeline calculates the sum of scores per team per hour and writes the 2 | # per-team sums to BigQuery. The pipeline can be run in either batch or 3 | # streaming mode, reading from either a data file or Pub/Sub topic. 4 | # 5 | # You will need to create a Pub/Sub topic and run the Java Injector 6 | # in order to get game events over Pub/Sub. 
Please refer to the instructions 7 | # here: https://github.com/malo-denielou/DataflowSME 8 | from __future__ import absolute_import 9 | 10 | import logging 11 | import re 12 | 13 | import apache_beam as beam 14 | from apache_beam.io import ReadFromText 15 | from apache_beam.io import ReadFromPubSub 16 | from apache_beam.io import WriteToText 17 | from apache_beam.metrics import Metrics 18 | from apache_beam.metrics.metric import MetricsFilter 19 | from apache_beam.options.pipeline_options import PipelineOptions 20 | from apache_beam.options.pipeline_options import SetupOptions 21 | from apache_beam.options.pipeline_options import StandardOptions 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions 23 | from util.util import GameEvent 24 | from util.util import ParseEvent 25 | from util.util import ParseEventFn 26 | from util.util import ParseArgs 27 | import apache_beam.transforms.window as window 28 | from solutions.exercise1 import ExtractAndSumScore 29 | 30 | # Defines the BigQuery schema. 
31 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 32 | 33 | 34 | class ExtractAndSumScore(beam.PTransform): 35 | def __init__(self, field): 36 | super(ExtractAndSumScore, self).__init__() 37 | self.field = field 38 | 39 | def expand(self, p): 40 | return (p 41 | | 'extract_field' >> beam.Map( 42 | lambda x: (vars(x)[self.field], x.score)) 43 | | beam.CombinePerKey(sum) 44 | ) 45 | 46 | 47 | class WindowedTeamScore(beam.PTransform): 48 | """A transform to compute a windowed team score.""" 49 | def __init__(self, duration): 50 | super(WindowedTeamScore, self).__init__() 51 | self.duration = duration 52 | 53 | def expand(self, p): 54 | return (p 55 | | 'window' >> beam.WindowInto( 56 | window.FixedWindows(self.duration)) 57 | | 'extract_team_score' >> ExtractAndSumScore('team') 58 | ) 59 | 60 | 61 | class FormatTeamScoreSum(beam.DoFn): 62 | """Format a KV of user and their score to a BigQuery TableRow.""" 63 | def process(self, team_score, window=beam.DoFn.WindowParam): 64 | team, score = team_score 65 | start = int(window.start) 66 | yield { 67 | 'team': team, 68 | 'total_score': score, 69 | 'window_start': start, 70 | } 71 | 72 | 73 | def Run(argv=None): 74 | known_args, pipeline_args = ParseArgs(argv) 75 | pipeline_options = PipelineOptions(pipeline_args) 76 | pipeline_options.view_as(SetupOptions).save_main_session = True 77 | p = beam.Pipeline(options=pipeline_options) 78 | window_duration = 1 * 60 # 1 minute windows. 
79 | if known_args.topic: 80 | pipeline_options.view_as(StandardOptions).streaming = True 81 | 82 | project = pipeline_options.view_as(GoogleCloudOptions).project 83 | timestamp_attribute = 'timestamp_ms' 84 | events = None 85 | if (not known_args.topic): 86 | events = (p 87 | | 'read' >> ReadFromText(known_args.input) 88 | | 'parse' >> beam.FlatMap(ParseEventFn()) 89 | | 'add_event_timestamps' >> beam.Map( 90 | lambda x: beam.window.TimestampedValue(x, x.timestamp)) 91 | ) 92 | else: 93 | # [START EXERCISE 3]: 94 | # Read game events from the Pub/Sub topic using custom timestamps, 95 | # which are in an attribute labeled 'timestamp_ms'. 96 | # Use beam.io.ReadFromPubSub to read from the topic. 97 | # https://beam.apache.org/releases/pydoc/2.8.0/apache_beam.io.gcp.pubsub.html 98 | events = (p 99 | | 'read' >> ChangeMe() 100 | | 'decode' >> beam.ParDo(ParseEventFn()) 101 | ) 102 | 103 | _ = (events 104 | | 'windowed_team_score' >> WindowedTeamScore(window_duration) 105 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 106 | | beam.io.WriteToBigQuery(known_args.output_tablename, 107 | known_args.output_dataset, project, SCHEMA) 108 | ) 109 | p.run().wait_until_finish() 110 | 111 | 112 | if __name__ == '__main__': 113 | logging.getLogger().setLevel(logging.INFO) 114 | Run() 115 | -------------------------------------------------------------------------------- /py/exercises/exercise4.py: -------------------------------------------------------------------------------- 1 | # This pipeline calculates the sum of scores per team per hour and writes the 2 | # per-team sums to BigQuery. Additionally computes running user scores (e.g., 3 | # as a leaderboard) and updates them regularly. 4 | 5 | # The pipeline can be run in either batch or streaming mode, reading from 6 | # either a data file or Pub/Sub topic. 
7 | from __future__ import absolute_import 8 | 9 | import logging 10 | import re 11 | import time 12 | 13 | import apache_beam as beam 14 | from apache_beam.io import ReadFromText 15 | from apache_beam.io import ReadFromPubSub 16 | from apache_beam.io import WriteToText 17 | from apache_beam.metrics import Metrics 18 | from apache_beam.metrics.metric import MetricsFilter 19 | from apache_beam.options.pipeline_options import PipelineOptions 20 | from apache_beam.options.pipeline_options import SetupOptions 21 | from apache_beam.options.pipeline_options import StandardOptions 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions 23 | from apache_beam.transforms import trigger 24 | from util.util import GameEvent 25 | from util.util import ParseEvent 26 | from util.util import ParseEventFn 27 | from util.util import ParseArgs 28 | import apache_beam.transforms.window as window 29 | from solutions.exercise1 import ExtractAndSumScore 30 | 31 | # Defines the BigQuery schemas. 32 | USER_SCHEMA = ('user:STRING,' 33 | 'total_score:INTEGER,' 34 | 'processing_time:TIMESTAMP') 35 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 36 | 37 | 38 | class ExtractAndSumScore(beam.PTransform): 39 | def __init__(self, field): 40 | super(ExtractAndSumScore, self).__init__() 41 | self.field = field 42 | 43 | def expand(self, p): 44 | return (p | 'extract_field' >> 45 | beam.Map(lambda x: (vars(x)[self.field], x.score)) | 46 | beam.CombinePerKey(sum)) 47 | 48 | 49 | class RunningUserScores(beam.PTransform): 50 | """Extract user/score pairs via global windowing and emit perioidic updates 51 | on all users' running scores. 52 | """ 53 | def __init__(self, allowed_lateness=0): 54 | super(RunningUserScores, self).__init__() 55 | 56 | def expand(self, p): 57 | # NOTE: allowed_lateness is not yet available in Python FixedWindows. 58 | # NOTE: AfterProcessingTime not yet available in Python. 
59 | # [START EXERCISE 4.1]: 60 | # Compute a leaderboard by windowing user scores into the global window. 61 | # Since we will want to see running results, trigger the window early, 62 | # after every 100 elements. Make sure to accumulate fired panes. 63 | # https://beam.apache.org/documentation/programming-guide/#triggers 64 | return (p 65 | | 'window' >> ChangeMe() 66 | | 'extract_user_score' >> ExtractAndSumScore('user') 67 | ) 68 | # [END EXERCISE 4.1] 69 | 70 | 71 | class WindowedTeamScore(beam.PTransform): 72 | """Calculates scores for each team within the configured window duration""" 73 | 74 | def __init__(self, duration): 75 | super(WindowedTeamScore, self).__init__() 76 | self.duration = duration 77 | 78 | def expand(self, p): 79 | # [START EXERCISE 4.2]: 80 | # Window team scores into windows of fixed duration. Trigger these windows 81 | # on-time with the watermark, but also speculatively every 100 elements. 82 | # Ensure correct totals for the watermark-triggered pane by accumulating 83 | # over all data. 
84 | return (p 85 | | 'window' >> ChangeMe() 86 | | 'extract_team_score' >> ExtractAndSumScore('team') 87 | ) 88 | # [END EXERCISE 4.2] 89 | 90 | 91 | class FormatTeamScoreSum(beam.DoFn): 92 | """Format a KV of team and its score to a BigQuery TableRow.""" 93 | def process(self, team_score, window=beam.DoFn.WindowParam): 94 | team, score = team_score 95 | start = int(window.start) 96 | yield { 97 | 'team': team, 98 | 'total_score': score, 99 | 'window_start': start, 100 | } 101 | 102 | 103 | class FormatUserScoreSum(beam.DoFn): 104 | """Format a KV of user and their score to a BigQuery TableRow.""" 105 | def process(self, user_score, window=beam.DoFn.WindowParam): 106 | user, score = user_score 107 | yield { 108 | 'user': user, 109 | 'total_score': score, 110 | 'processing_time': time.time(), 111 | } 112 | 113 | 114 | def Run(argv=None): 115 | known_args, pipeline_args = ParseArgs(argv) 116 | pipeline_options = PipelineOptions(pipeline_args) 117 | pipeline_options.view_as(SetupOptions).save_main_session = True 118 | p = beam.Pipeline(options=pipeline_options) 119 | window_duration = 1 * 60 # 1 minute windows. 120 | if known_args.topic: 121 | pipeline_options.view_as(StandardOptions).streaming = True 122 | 123 | project = pipeline_options.view_as(GoogleCloudOptions).project 124 | timestamp_attribute = 'timestamp_ms' 125 | events = None 126 | if (not known_args.topic): 127 | events = (p 128 | | 'read' >> ReadFromText(known_args.input) 129 | | 'parse' >> beam.FlatMap(ParseEventFn()) 130 | | 'add_event_timestamps' >> beam.Map( 131 | lambda x: beam.window.TimestampedValue(x, x.timestamp)) 132 | ) 133 | else: 134 | events = (p 135 | | 'read' >> ReadFromPubSub(topic=known_args.topic, 136 | timestamp_attribute='timestamp_ms') 137 | | 'decode' >> beam.ParDo(ParseEventFn()) 138 | ) 139 | 140 | # Window team scores and write them BigQuery. 
141 | _ = (events 142 | | 'windowed_team_score' >> WindowedTeamScore(window_duration) 143 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 144 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery( 145 | known_args.output_tablename + '_team', known_args.output_dataset, 146 | project, TEAM_SCHEMA) 147 | ) 148 | 149 | # Write leaderboards to BigQuery. 150 | _ = (events 151 | | 'running_user_score' >> RunningUserScores() 152 | | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum()) 153 | | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery( 154 | known_args.output_tablename + '_user', known_args.output_dataset, 155 | project, USER_SCHEMA) 156 | ) 157 | 158 | p.run().wait_until_finish() 159 | 160 | 161 | if __name__ == '__main__': 162 | logging.getLogger().setLevel(logging.INFO) 163 | Run() 164 | -------------------------------------------------------------------------------- /py/exercises/exercise5.py: -------------------------------------------------------------------------------- 1 | # Filter 'cheating' or 'spammy' users from the game results. 2 | # Computes the global mean score and filters users that are 3 | # some threshold above that score. 
4 | from __future__ import absolute_import 5 | 6 | import logging 7 | import re 8 | import time 9 | 10 | import apache_beam as beam 11 | from apache_beam.io import ReadFromText 12 | from apache_beam.io import ReadFromPubSub 13 | from apache_beam.io import WriteToText 14 | from apache_beam.metrics import Metrics 15 | from apache_beam.metrics.metric import MetricsFilter 16 | from apache_beam.options.pipeline_options import PipelineOptions 17 | from apache_beam.options.pipeline_options import SetupOptions 18 | from apache_beam.options.pipeline_options import StandardOptions 19 | from apache_beam.options.pipeline_options import GoogleCloudOptions 20 | from apache_beam.transforms import trigger 21 | from util.util import GameEvent 22 | from util.util import ParseEvent 23 | from util.util import ParseEventFn 24 | from util.util import ParseArgs 25 | import apache_beam.transforms.window as window 26 | 27 | # Defines the BigQuery schemas. 28 | USER_SCHEMA = ('user:STRING,' 29 | 'total_score:INTEGER,' 30 | 'processing_time:TIMESTAMP') 31 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 32 | 33 | 34 | class ExtractAndSumScore(beam.PTransform): 35 | def __init__(self, field): 36 | super(ExtractAndSumScore, self).__init__() 37 | self.field = field 38 | 39 | def expand(self, p): 40 | return (p 41 | | 'extract_field' >> beam.Map( 42 | lambda x: (vars(x)[self.field], x.score)) 43 | | beam.CombinePerKey(sum) 44 | ) 45 | 46 | 47 | class WindowedUserScores(beam.PTransform): 48 | """Extract user/score pairs via in fixed windows.""" 49 | def __init__(self, duration): 50 | super(WindowedUserScores, self).__init__() 51 | self.duration = duration 52 | 53 | def expand(self, p): 54 | return (p 55 | | 'window' >> beam.WindowInto( 56 | window.FixedWindows(self.duration)) 57 | | 'extract_user_score' >> ExtractAndSumScore('user') 58 | ) 59 | 60 | 61 | class FilterUser(beam.DoFn): 62 | """Filter a user if their score * score_weight > avg_score.""" 63 | def 
__init__(self, score_weight): 64 | super(FilterUser, self).__init__() 65 | self.score_weight = score_weight 66 | self.num_spammy_users = Metrics.counter(self.__class__, 67 | 'num_spammy_users') 68 | 69 | def process(self, user_score, avg_score=beam.DoFn.SideInputParam): 70 | user, score = user_score 71 | if score * self.score_weight > avg_score: 72 | logging.error('User %s filtered as spammy', user) 73 | self.num_spammy_users.inc() 74 | yield user 75 | 76 | 77 | class ComputeSpammyUsers(beam.PTransform): 78 | """Compute users with a high clickrate, which we will consider spammy. 79 | We do this by finding the mean total score per user and filter out 80 | those with scores that are greater than the mean * score_weight 81 | """ 82 | def __init__(self, score_weight): 83 | super(ComputeSpammyUsers, self).__init__() 84 | self.score_weight = score_weight 85 | 86 | def expand(self, p): 87 | # [START EXERCISE 5.1]: 88 | # Extract the score for each user, and compute the mean. 89 | # Create a singleton PCollection view to be used in 90 | # compute_spammers. 
91 | # https://beam.apache.org/documentation/programming-guide/#combine 92 | avg_score = (p 93 | | 'extract_score' >> ChangeMe() 94 | | 'compute_mean' >> ChangeMe() 95 | ) 96 | # [END EXERCISE 5.1] 97 | return (p 98 | | 'compute_spammers' >> beam.ParDo( 99 | FilterUser(self.score_weight), avg_score=avg_score) 100 | ) 101 | 102 | 103 | class FilterSpammers(beam.DoFn): 104 | """Remove users found in the spam list.""" 105 | def __init__(self): 106 | super(FilterSpammers, self).__init__() 107 | self.filtered_scores = Metrics.counter(self.__class__, 108 | 'filtered_scores') 109 | 110 | def process(self, elem, spammers=beam.DoFn.SideInputParam): 111 | user = elem.user 112 | if user not in spammers: 113 | yield elem 114 | else: 115 | self.filtered_scores.inc() 116 | 117 | 118 | class WindowedTeamScore(beam.PTransform): 119 | """Calculates scores for each team within the configured window duration""" 120 | def __init__(self, duration, spammers): 121 | super(WindowedTeamScore, self).__init__() 122 | self.duration = duration 123 | self.spammers = spammers 124 | 125 | def expand(self, p): 126 | return (p 127 | | 'window' >> beam.WindowInto( 128 | window.FixedWindows(self.duration)) 129 | | 'filter_spammers' >> beam.ParDo( 130 | FilterSpammers(), spammers=self.spammers) 131 | | 'extract_team_score' >> ExtractAndSumScore('team') 132 | ) 133 | 134 | 135 | class FormatTeamScoreSum(beam.DoFn): 136 | def process(self, team_score, window=beam.DoFn.WindowParam): 137 | team, score = team_score 138 | start = int(window.start) 139 | yield { 140 | 'team': team, 141 | 'total_score': score, 142 | 'window_start': start, 143 | } 144 | 145 | 146 | class FormatUserScoreSum(beam.DoFn): 147 | def process(self, user_score, window=beam.DoFn.WindowParam): 148 | user, score = user_score 149 | yield { 150 | 'user': user, 151 | 'total_score': score, 152 | 'processing_time': time.time(), 153 | } 154 | 155 | 156 | def Run(argv=None): 157 | known_args, pipeline_args = ParseArgs(argv) 158 | 
pipeline_options = PipelineOptions(pipeline_args) 159 | pipeline_options.view_as(SetupOptions).save_main_session = True 160 | p = beam.Pipeline(options=pipeline_options) 161 | window_duration = 1 * 60 # 1 minute windows. 162 | if known_args.topic: 163 | pipeline_options.view_as(StandardOptions).streaming = True 164 | 165 | project = pipeline_options.view_as(GoogleCloudOptions).project 166 | timestamp_attribute = 'timestamp_ms' 167 | events = None 168 | if (not known_args.topic): 169 | events = (p 170 | | 'read' >> ReadFromText(known_args.input) 171 | | 'parse' >> beam.FlatMap(ParseEventFn()) 172 | | 'add_event_timestamps' >> beam.Map( 173 | lambda x: beam.window.TimestampedValue(x, x.timestamp))) 174 | else: 175 | events = (p 176 | | 'read' >> ReadFromPubSub( 177 | topic=known_args.topic, 178 | timestamp_attribute='timestamp_ms') 179 | | 'decode' >> beam.ParDo(ParseEventFn())) 180 | 181 | user_scores = (events 182 | | 'window_user_scores' >> WindowedUserScores(window_duration)) 183 | spammers = beam.pvalue.AsList(user_scores 184 | | 'compute_spammers' >> ComputeSpammyUsers(2.5)) 185 | 186 | _ = (events 187 | | 'windowed_team_score' >> WindowedTeamScore(window_duration, spammers) 188 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 189 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery( 190 | known_args.output_tablename, known_args.output_dataset, project, 191 | TEAM_SCHEMA) 192 | ) 193 | 194 | p.run().wait_until_finish() 195 | 196 | 197 | if __name__ == '__main__': 198 | logging.getLogger().setLevel(logging.INFO) 199 | Run() 200 | -------------------------------------------------------------------------------- /py/exercises/exercise6.py: -------------------------------------------------------------------------------- 1 | # This pipeline computes the average duration of user sessions. The 2 | # averages are windowed, to reflect durations differing over time. 
3 | from __future__ import absolute_import 4 | 5 | import logging 6 | import re 7 | import time 8 | 9 | import apache_beam as beam 10 | import apache_beam.transforms.window as window 11 | from apache_beam.io import ReadFromText 12 | from apache_beam.io import ReadFromPubSub 13 | from apache_beam.io import WriteToText 14 | from apache_beam.metrics import Metrics 15 | from apache_beam.metrics.metric import MetricsFilter 16 | from apache_beam.options.pipeline_options import PipelineOptions 17 | from apache_beam.options.pipeline_options import SetupOptions 18 | from apache_beam.options.pipeline_options import StandardOptions 19 | from apache_beam.options.pipeline_options import GoogleCloudOptions 20 | from apache_beam.transforms import trigger 21 | from util.util import GameEvent 22 | from util.util import ParseEvent 23 | from util.util import ParseEventFn 24 | from util.util import ParseArgs 25 | 26 | # Defines the BigQuery schemas. 27 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT') 28 | 29 | 30 | class UserSessionActivity(beam.DoFn): 31 | """Compute the duration of a user's session.""" 32 | def process(self, 33 | elem, 34 | timestamp=beam.DoFn.TimestampParam, 35 | window=beam.DoFn.WindowParam): 36 | duration = int(window.end) - int(window.start) 37 | yield duration 38 | 39 | 40 | class FormatSessionMeans(beam.DoFn): 41 | """Format session means for output to BQ""" 42 | def process(self, elem, window=beam.DoFn.WindowParam): 43 | yield {'window_start': int(window.start), 'mean_duration': elem} 44 | 45 | 46 | def Run(argv=None): 47 | known_args, pipeline_args = ParseArgs(argv) 48 | pipeline_options = PipelineOptions(pipeline_args) 49 | pipeline_options.view_as(SetupOptions).save_main_session = True 50 | p = beam.Pipeline(options=pipeline_options) 51 | if known_args.topic: 52 | pipeline_options.view_as(StandardOptions).streaming = True 53 | 54 | project = pipeline_options.view_as(GoogleCloudOptions).project 55 | timestamp_attribute = 'timestamp_ms' 
56 | events = None 57 | if (not known_args.topic): 58 | events = (p 59 | | 'read' >> ReadFromText(known_args.input) 60 | | 'parse' >> beam.FlatMap(ParseEventFn()) 61 | | 'add_event_timestamps' >> beam.Map( 62 | lambda x: beam.window.TimestampedValue(x, x.timestamp))) 63 | else: 64 | events = (p 65 | | 'read' >> ReadFromPubSub( 66 | topic=known_args.topic, 67 | timestamp_attribute='timestamp_ms') 68 | | 'parse' >> beam.ParDo(ParseEventFn())) 69 | 70 | # [START EXERCISE 6] 71 | _ = (events 72 | | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score)) 73 | # Extract sessions of user data, using known_args.session_gap as the 74 | # gap duration. 75 | # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions 76 | | 'sessionize' >> ChangeMe() 77 | | 'drop_scores' >> beam.CombinePerKey(lambda x: 0) 78 | | 'convert_to_activity' >> beam.ParDo(UserSessionActivity()) 79 | # Re-window into fixed windows of size user_activity_window in order 80 | # to compute the mean session duration for that window of activity. 
81 | | 'window_of_sessions' >> ChangeMe() 82 | | 'session_mean' >> ChangeMe() 83 | # [END EXERCISE 6] 84 | | 'format_sessions' >> beam.ParDo(FormatSessionMeans()) 85 | | 'write_to_bigquery' >> beam.io.WriteToBigQuery( 86 | known_args.output_tablename, known_args.output_dataset, project, 87 | SESSION_SCHEMA) 88 | ) 89 | 90 | p.run().wait_until_finish() 91 | 92 | 93 | if __name__ == '__main__': 94 | logging.getLogger().setLevel(logging.INFO) 95 | Run() 96 | -------------------------------------------------------------------------------- /py/exercises/exercise7.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import logging 4 | import re 5 | import time 6 | 7 | import apache_beam as beam 8 | import apache_beam.transforms.window as window 9 | from apache_beam.io import ReadFromText 10 | from apache_beam.io import ReadFromPubSub 11 | from apache_beam.io import WriteToText 12 | from apache_beam.metrics import Metrics 13 | from apache_beam.metrics.metric import MetricsFilter 14 | from apache_beam.options.pipeline_options import PipelineOptions 15 | from apache_beam.options.pipeline_options import SetupOptions 16 | from apache_beam.options.pipeline_options import StandardOptions 17 | from apache_beam.options.pipeline_options import GoogleCloudOptions 18 | from apache_beam.transforms import trigger 19 | from util.util import GameEvent 20 | from util.util import ParseEvent 21 | from util.util import ParseEventFn 22 | from util.util import ParsePlayEventFn 23 | from util.util import ParseArgs 24 | 25 | # Defines the BigQuery schemas. 
26 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT') 27 | 28 | 29 | class ComputeLatency(beam.DoFn): 30 | def __init__(self): 31 | super(ComputeLatency, self).__init__() 32 | self.dropped_sessions_no_events = Metrics.counter( 33 | self.__class__, 'dropped_sessions_no_events') 34 | self.dropped_sessions_too_many_events = Metrics.counter( 35 | self.__class__, 'dropped_sessions_too_many_events') 36 | self.dropped_sessions_no_play_events = Metrics.counter( 37 | self.__class__, 'dropped_sessions_no_play_events') 38 | 39 | def process(self, elem): 40 | _, vals = elem 41 | plays = vals['plays'] 42 | events = vals['events'] 43 | 44 | play_count = 0 45 | max_play_ts = 0 46 | for play in plays: 47 | play_count += 1 48 | max_play_ts = max(max_play_ts, long(play.timestamp)) 49 | 50 | event_count = 0 51 | an_event = None 52 | for event in events: 53 | an_event = event 54 | event_count += 1 55 | 56 | if event_count == 0: 57 | self.dropped_sessions_no_events.inc() 58 | elif event_count > 1: 59 | self.dropped_sessions_too_many_events.inc() 60 | elif play_count == 0: 61 | self.dropped_sessions_no_play_events.inc() 62 | else: 63 | min_latency = long(an_event.timestamp) - max_play_ts 64 | yield (an_event.user, min_latency) 65 | 66 | 67 | class DetectBadUsers(beam.DoFn): 68 | def process(self, elem, mean_latency=beam.DoFn.SideInputParam): 69 | user, latency = elem 70 | # Naive: compute bad users are users 5 times less than 71 | # the mean. 
72 | if latency < mean / 5: 73 | yield user 74 | 75 | 76 | def Run(argv=None): 77 | known_args, pipeline_args = ParseArgs(argv) 78 | pipeline_options = PipelineOptions(pipeline_args) 79 | pipeline_options.view_as(SetupOptions).save_main_session = True 80 | p = beam.Pipeline(options=pipeline_options) 81 | if known_args.topic: 82 | pipeline_options.view_as(StandardOptions).streaming = True 83 | 84 | project = pipeline_options.view_as(GoogleCloudOptions).project 85 | timestamp_attribute = 'timestamp_ms' 86 | events = None 87 | if (not known_args.topic or not known_args.play_topic): 88 | logging.fatal('topic and play_topic are required.') 89 | 90 | # [START EXERCISE 7]: 91 | # 1. Read game events with message id and timestamp. 92 | # 2. Parse events. 93 | events = (p 94 | | 'read_events' >> ChangeMe() 95 | | 'parse_events' >> ChangeMe() 96 | ) 97 | 98 | # 1. Read play events with message id and timestamp. 99 | # 2. Parse events. 100 | play_events = (p 101 | | 'read_play_events' >> ChangeMe() 102 | | 'parse_play_events' >> ChangeMe() 103 | ) 104 | 105 | # 1. Key events by event id. 106 | # 2. Sessionize. 107 | sessionized_events = (events 108 | | 'key_events_by_id' >> ChangeMe() 109 | | 'sessionize_events' >> ChangeMe() 110 | 111 | # 1. Key play events by event id. 112 | # 2. Sessionize. 113 | sessionized_plays = (play_events 114 | | 'key_plays_by_id' >> ChangeMe() 115 | | 'sessionize_plays' >> ChangeMe() 116 | 117 | # 1. Join events using CoGroupByKey 118 | # 2. Compute latency using ComputeLatency 119 | per_user_latency = ( 120 | {'change':me, 'me':change} 121 | | 'cbk' >> ChangeMe() 122 | | 'compute_latency' >> ChangeMe() 123 | 124 | # 1. Get values of per user latencies 125 | # 2. Re-window into GlobalWindows that triggers repeatedly after 1000 new elements. 126 | # 3. Compute the global mean to be used as a side input. 
127 | mean_latency = (per_user_latency 128 | | 'extract_latencies' >> ChangeMe() 129 | | 'global_window' >> ChangeMe() 130 | | 'compute_mean' >> ChangeMe() 131 | ) 132 | # [END EXERCISE 7] 133 | 134 | # Filter out bad users. 135 | _ = (per_user_latency 136 | | 'detect_bad_users' >> beam.ParDo( 137 | DetectBadUsers(), mean_latency=mean_latency) 138 | | 'filter_duplicates' >> beam.WindowInto( 139 | window.GlobalWindows(), trigger=trigger.AfterCount(1), 140 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING) 141 | | 'to_bq_schema' >> beam.Map(lambda x: {'user': x}) 142 | | 'write_bad_users' >> beam.io.WriteToBigQuery( 143 | known_args.output_tablename, known_args.output_dataset, project, ('user:string')) 144 | ) 145 | 146 | p.run().wait_until_finish() 147 | 148 | 149 | if __name__ == '__main__': 150 | logging.getLogger().setLevel(logging.INFO) 151 | Run() 152 | -------------------------------------------------------------------------------- /py/run0.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise0 --input gs://sme-training/game/small.csv \ 2 | --output_dataset sme \ 3 | --output_tablename exercise0 \ 4 | --runner DataflowRunner \ 5 | --project YOUR_PROJECT \ 6 | --temp_location gs://YOUR_BUCKET/staging \ 7 | --setup_file ./setup.py 8 | -------------------------------------------------------------------------------- /py/run1.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise1 --input gs://sme-training/game/small.csv \ 2 | --output_dataset sme \ 3 | --output_tablename exercise1 \ 4 | --runner DataflowRunner \ 5 | --project YOUR_PROJECT \ 6 | --temp_location gs://YOUR_BUCKET/tmp/ \ 7 | --setup_file ./setup.py 8 | -------------------------------------------------------------------------------- /py/run2.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise2 --input 
gs://sme-training/game/small.csv \ 2 | --output_dataset sme \ 3 | --output_tablename exercise2 \ 4 | --runner DataflowRunner \ 5 | --project YOUR_PROJECT \ 6 | --temp_location gs://YOUR_BUCKET/tmp/ \ 7 | --setup_file ./setup.py 8 | -------------------------------------------------------------------------------- /py/run3.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise3 \ 2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \ 3 | --output_dataset sme \ 4 | --output_tablename exercise3 \ 5 | --runner DataflowRunner \ 6 | --project YOUR_PROJECT \ 7 | --temp_location gs://YOUR_BUCKET/staging \ 8 | --setup_file ./setup.py 9 | -------------------------------------------------------------------------------- /py/run4.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise4 \ 2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \ 3 | --output_dataset sme \ 4 | --output_tablename exercise4 \ 5 | --runner DataflowRunner \ 6 | --project YOUR_PROJECT \ 7 | --temp_location gs://YOUR_BUCKET/staging \ 8 | --setup_file ./setup.py 9 | -------------------------------------------------------------------------------- /py/run5.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise5 \ 2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \ 3 | --output_dataset sme \ 4 | --output_tablename exercise5 \ 5 | --runner DataflowRunner \ 6 | --project YOUR_PROJECT \ 7 | --temp_location gs://YOUR_BUCKET/staging \ 8 | --setup_file ./setup.py 9 | -------------------------------------------------------------------------------- /py/run6.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise6 \ 2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \ 3 | --output_dataset sme \ 4 | --output_tablename exercise6 \ 5 | --runner DataflowRunner \ 6 | --project 
YOUR_PROJECT \ 7 | --user_activity_window 240 \ 8 | --session_gap 60 \ 9 | --temp_location gs://YOUR_BUCKET/staging \ 10 | --setup_file ./setup.py 11 | -------------------------------------------------------------------------------- /py/run7.sh: -------------------------------------------------------------------------------- 1 | python -m exercises.exercise7 \ 2 | --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \ 3 | --play_topic projects/YOUR_PROJECT/topics/YOUR_TOPIC-play \ 4 | --output_dataset sme \ 5 | --output_tablename exercise7 \ 6 | --runner DataflowRunner \ 7 | --project YOUR_PROJECT \ 8 | --session_gap 20 \ 9 | --temp_location gs://YOUR_BUCKET/staging \ 10 | --setup_file ./setup.py 11 | -------------------------------------------------------------------------------- /py/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | setuptools.setup( 3 | name='sme-training', 4 | version='1.0', 5 | install_requires=[], 6 | packages=setuptools.find_packages(), 7 | ) 8 | -------------------------------------------------------------------------------- /py/solutions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/__init__.py -------------------------------------------------------------------------------- /py/solutions/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/__init__.pyc -------------------------------------------------------------------------------- /py/solutions/exercise0.py: -------------------------------------------------------------------------------- 1 | # This batch pipeline imports game events from CSV to BigQuery. 
2 | from __future__ import absolute_import 3 | 4 | import logging 5 | import re 6 | 7 | import apache_beam as beam 8 | from apache_beam.io import ReadFromText 9 | from apache_beam.io import WriteToText 10 | from apache_beam.metrics import Metrics 11 | from apache_beam.metrics.metric import MetricsFilter 12 | from apache_beam.options.pipeline_options import PipelineOptions 13 | from apache_beam.options.pipeline_options import SetupOptions 14 | from apache_beam.options.pipeline_options import GoogleCloudOptions 15 | from util.util import GameEvent 16 | from util.util import ParseEvent 17 | from util.util import ParseArgs 18 | 19 | # Defines the BigQuery schema. 20 | SCHEMA = ('user:STRING,' 'team:STRING,' 'score:INTEGER,' 'timestamp:TIMESTAMP') 21 | 22 | 23 | def FormatEvent(element): 24 | """Format a GameEvent to a BigQuery TableRow.""" 25 | return { 26 | 'user': element.user, 27 | 'team': element.team, 28 | 'score': element.score, 29 | 'timestamp': element.timestamp 30 | } 31 | 32 | 33 | def Run(argv=None): 34 | """Run a batch pipeline.""" 35 | known_args, pipeline_args = ParseArgs(argv) 36 | pipeline_options = PipelineOptions(pipeline_args) 37 | pipeline_options.view_as(SetupOptions).save_main_session = True 38 | p = beam.Pipeline(options=pipeline_options) 39 | 40 | project = pipeline_options.view_as(GoogleCloudOptions).project 41 | # Read events from a CSV file, parse them and write (import) them to BigQuery. 
42 | _ = (p 43 | | 'read' >> ReadFromText(known_args.input) 44 | | 'parse' >> beam.FlatMap(ParseEvent) 45 | | 'format' >> beam.Map(FormatEvent) 46 | | beam.io.WriteToBigQuery(known_args.output_tablename, 47 | known_args.output_dataset, project, SCHEMA) 48 | ) 49 | p.run().wait_until_finish() 50 | 51 | 52 | if __name__ == '__main__': 53 | logging.getLogger().setLevel(logging.INFO) 54 | Run() 55 | -------------------------------------------------------------------------------- /py/solutions/exercise1.py: -------------------------------------------------------------------------------- 1 | # This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data and writes the sums to BigQuery. 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | import re 6 | 7 | import apache_beam as beam 8 | from apache_beam.io import ReadFromText 9 | from apache_beam.io import WriteToText 10 | from apache_beam.metrics import Metrics 11 | from apache_beam.metrics.metric import MetricsFilter 12 | from apache_beam.options.pipeline_options import PipelineOptions 13 | from apache_beam.options.pipeline_options import SetupOptions 14 | from apache_beam.options.pipeline_options import GoogleCloudOptions 15 | from util.util import GameEvent 16 | from util.util import ParseEvent 17 | from util.util import ParseArgs 18 | 19 | # Defines the BigQuery schema. 20 | SCHEMA = ('user:STRING,' 'total_score:INTEGER') 21 | 22 | 23 | class ExtractAndSumScore(beam.PTransform): 24 | """A transform to extract key/score information from GameEvent, and sum 25 | the scores. 
The constructor arg determines whether 'team' or 'user' info is 26 | extracted.""" 27 | def __init__(self, field): 28 | super(ExtractAndSumScore, self).__init__() 29 | self.field = field 30 | 31 | def expand(self, p): 32 | return (p 33 | | 'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score)) 34 | | beam.CombinePerKey(sum) 35 | ) 36 | 37 | 38 | def FormatUserScoreSum(element): 39 | """Format a KV of user and their score to a BigQuery TableRow.""" 40 | user, total_score = element 41 | return {'user': user, 'total_score': total_score} 42 | 43 | 44 | def Run(argv=None): 45 | known_args, pipeline_args = ParseArgs(argv) 46 | pipeline_options = PipelineOptions(pipeline_args) 47 | pipeline_options.view_as(SetupOptions).save_main_session = True 48 | p = beam.Pipeline(options=pipeline_options) 49 | 50 | project = pipeline_options.view_as(GoogleCloudOptions).project 51 | # Read events from a CSV file and parse them. 52 | _ = (p 53 | | 'read' >> ReadFromText(known_args.input) 54 | | 'parse' >> beam.FlatMap(ParseEvent) 55 | | 'extract_user_score' >> ExtractAndSumScore('user') 56 | | 'format_user_score_sum' >> beam.Map(FormatUserScoreSum) 57 | | beam.io.WriteToBigQuery(known_args.output_tablename, 58 | known_args.output_dataset, project, SCHEMA) 59 | ) 60 | 61 | p.run().wait_until_finish() 62 | 63 | 64 | if __name__ == '__main__': 65 | logging.getLogger().setLevel(logging.INFO) 66 | Run() 67 | -------------------------------------------------------------------------------- /py/solutions/exercise1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/exercise1.pyc -------------------------------------------------------------------------------- /py/solutions/exercise2.py: -------------------------------------------------------------------------------- 1 | # This batch pipeline calculates the sum of scores per team per 
hour, over an entire batch of gaming data and writes the per-team sums to BigQuery. 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | import re 6 | 7 | import apache_beam as beam 8 | from apache_beam.io import ReadFromText 9 | from apache_beam.io import WriteToText 10 | from apache_beam.metrics import Metrics 11 | from apache_beam.metrics.metric import MetricsFilter 12 | from apache_beam.options.pipeline_options import PipelineOptions 13 | from apache_beam.options.pipeline_options import SetupOptions 14 | from apache_beam.options.pipeline_options import GoogleCloudOptions 15 | from util.util import GameEvent 16 | from util.util import ParseEvent 17 | from util.util import ParseArgs 18 | import apache_beam.transforms.window as window 19 | 20 | # Defines the BigQuery schema. 21 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 22 | 23 | 24 | class ExtractAndSumScore(beam.PTransform): 25 | def __init__(self, field): 26 | super(ExtractAndSumScore, self).__init__() 27 | self.field = field 28 | 29 | def expand(self, p): 30 | return (p 31 | |'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score)) 32 | | beam.CombinePerKey(sum) 33 | ) 34 | 35 | 36 | class WindowedTeamScore(beam.PTransform): 37 | """A transform to compute the WindowedTeamScore.""" 38 | def __init__(self, duration): 39 | super(WindowedTeamScore, self).__init__() 40 | self.duration = duration 41 | 42 | def expand(self, p): 43 | return (p 44 | | 'window' >> beam.WindowInto(window.FixedWindows(self.duration)) 45 | | 'extract_team_score' >> ExtractAndSumScore('team') 46 | ) 47 | 48 | 49 | class FormatTeamScoreSum(beam.DoFn): 50 | """Format a KV of user and their score to a BigQuery TableRow.""" 51 | def process(self, team_score, window=beam.DoFn.WindowParam): ##???? 
52 | team, score = team_score 53 | start = int(window.start) 54 | yield { 55 | 'team': team, 56 | 'total_score': score, 57 | 'window_start': start, 58 | } 59 | 60 | 61 | def Run(argv=None): 62 | known_args, pipeline_args = ParseArgs(argv) 63 | pipeline_options = PipelineOptions(pipeline_args) 64 | pipeline_options.view_as(SetupOptions).save_main_session = True 65 | p = beam.Pipeline(options=pipeline_options) 66 | 67 | project = pipeline_options.view_as(GoogleCloudOptions).project 68 | _ = (p 69 | | 'read' >> ReadFromText(known_args.input) 70 | | 'parse' >> beam.FlatMap(ParseEvent) 71 | | 'add_event_timestamps' >> beam.Map( 72 | lambda x: beam.window.TimestampedValue(x, x.timestamp)) 73 | | 'windowed_team_score' >> WindowedTeamScore(60 * 60) 74 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 75 | | beam.io.WriteToBigQuery(known_args.output_tablename, 76 | known_args.output_dataset, project, SCHEMA) 77 | ) 78 | p.run().wait_until_finish() 79 | 80 | 81 | if __name__ == '__main__': 82 | logging.getLogger().setLevel(logging.INFO) 83 | Run() 84 | -------------------------------------------------------------------------------- /py/solutions/exercise3.py: -------------------------------------------------------------------------------- 1 | # This pipeline calculates the sum of scores per team per hour and writes the 2 | # per-team sums to BigQuery. The pipeline can be run in either batch or 3 | # streaming mode, reading from either a data file or Pub/Sub topic. 4 | # 5 | # You will need to create a Pub/Sub topic and run the Java Injector 6 | # in order to get game events over Pub/Sub. 
Please refer to the instructions 7 | # here: https://github.com/malo-denielou/DataflowSME 8 | from __future__ import absolute_import 9 | 10 | import logging 11 | import re 12 | 13 | import apache_beam as beam 14 | from apache_beam.io import ReadFromText 15 | from apache_beam.io import ReadFromPubSub 16 | from apache_beam.io import WriteToText 17 | from apache_beam.metrics import Metrics 18 | from apache_beam.metrics.metric import MetricsFilter 19 | from apache_beam.options.pipeline_options import PipelineOptions 20 | from apache_beam.options.pipeline_options import SetupOptions 21 | from apache_beam.options.pipeline_options import StandardOptions 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions 23 | from util.util import GameEvent 24 | from util.util import ParseEvent 25 | from util.util import ParseEventFn 26 | from util.util import ParseArgs 27 | import apache_beam.transforms.window as window 28 | from solutions.exercise1 import ExtractAndSumScore 29 | 30 | # Defines the BigQuery schema. 
31 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 32 | 33 | 34 | class ExtractAndSumScore(beam.PTransform): 35 | def __init__(self, field): 36 | super(ExtractAndSumScore, self).__init__() 37 | self.field = field 38 | 39 | def expand(self, p): 40 | return (p 41 | | 'extract_field' >> beam.Map( 42 | lambda x: (vars(x)[self.field], x.score)) 43 | | beam.CombinePerKey(sum) 44 | ) 45 | 46 | 47 | class WindowedTeamScore(beam.PTransform): 48 | """A transform to compute a windowed team score.""" 49 | def __init__(self, duration): 50 | super(WindowedTeamScore, self).__init__() 51 | self.duration = duration 52 | 53 | def expand(self, p): 54 | return (p 55 | | 'window' >> beam.WindowInto( 56 | window.FixedWindows(self.duration)) 57 | | 'extract_team_score' >> ExtractAndSumScore('team') 58 | ) 59 | 60 | 61 | class FormatTeamScoreSum(beam.DoFn): 62 | """Format a KV of user and their score to a BigQuery TableRow.""" 63 | def process(self, team_score, window=beam.DoFn.WindowParam): 64 | team, score = team_score 65 | start = int(window.start) 66 | yield { 67 | 'team': team, 68 | 'total_score': score, 69 | 'window_start': start, 70 | } 71 | 72 | 73 | def Run(argv=None): 74 | known_args, pipeline_args = ParseArgs(argv) 75 | pipeline_options = PipelineOptions(pipeline_args) 76 | pipeline_options.view_as(SetupOptions).save_main_session = True 77 | p = beam.Pipeline(options=pipeline_options) 78 | window_duration = 1 * 60 # 1 minute windows. 
79 | if known_args.topic: 80 | pipeline_options.view_as(StandardOptions).streaming = True 81 | 82 | project = pipeline_options.view_as(GoogleCloudOptions).project 83 | timestamp_attribute = 'timestamp_ms' 84 | events = None 85 | if (not known_args.topic): 86 | events = (p 87 | | 'read' >> ReadFromText(known_args.input) 88 | | 'parse' >> beam.ParDo(ParseEventFn()) 89 | | 'add_event_timestamps' >> beam.Map( 90 | lambda x: beam.window.TimestampedValue(x, x.timestamp)) 91 | ) 92 | else: 93 | events = (p 94 | | 'read' >> ReadFromPubSub(topic=known_args.topic, 95 | timestamp_attribute='timestamp_ms') 96 | | 'decode' >> beam.ParDo(ParseEventFn()) 97 | ) 98 | 99 | _ = (events 100 | | 'windowed_team_score' >> WindowedTeamScore(window_duration) 101 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 102 | | beam.io.WriteToBigQuery(known_args.output_tablename, 103 | known_args.output_dataset, project, SCHEMA) 104 | ) 105 | p.run().wait_until_finish() 106 | 107 | 108 | if __name__ == '__main__': 109 | logging.getLogger().setLevel(logging.INFO) 110 | Run() 111 | -------------------------------------------------------------------------------- /py/solutions/exercise4.py: -------------------------------------------------------------------------------- 1 | # This pipeline calculates the sum of scores per team per hour and writes the 2 | # per-team sums to BigQuery. Additionally computes running user scores (e.g., 3 | # as a leaderboard) and updates them regularly. 4 | 5 | # The pipeline can be run in either batch or streaming mode, reading from 6 | # either a data file or Pub/Sub topic.
7 | from __future__ import absolute_import 8 | 9 | import logging 10 | import re 11 | import time 12 | 13 | import apache_beam as beam 14 | from apache_beam.io import ReadFromText 15 | from apache_beam.io import ReadFromPubSub 16 | from apache_beam.io import WriteToText 17 | from apache_beam.metrics import Metrics 18 | from apache_beam.metrics.metric import MetricsFilter 19 | from apache_beam.options.pipeline_options import PipelineOptions 20 | from apache_beam.options.pipeline_options import SetupOptions 21 | from apache_beam.options.pipeline_options import StandardOptions 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions 23 | from apache_beam.transforms import trigger 24 | from util.util import GameEvent 25 | from util.util import ParseEvent 26 | from util.util import ParseEventFn 27 | from util.util import ParseArgs 28 | import apache_beam.transforms.window as window 29 | from solutions.exercise1 import ExtractAndSumScore 30 | 31 | # Defines the BigQuery schemas. 32 | USER_SCHEMA = ('user:STRING,' 33 | 'total_score:INTEGER,' 34 | 'processing_time:TIMESTAMP') 35 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 36 | 37 | 38 | class ExtractAndSumScore(beam.PTransform): 39 | def __init__(self, field): 40 | super(ExtractAndSumScore, self).__init__() 41 | self.field = field 42 | 43 | def expand(self, p): 44 | return (p | 'extract_field' >> 45 | beam.Map(lambda x: (vars(x)[self.field], x.score)) | 46 | beam.CombinePerKey(sum)) 47 | 48 | 49 | class RunningUserScores(beam.PTransform): 50 | """Extract user/score pairs via global windowing and emit perioidic updates 51 | on all users' running scores. 52 | """ 53 | def __init__(self, allowed_lateness=0): 54 | super(RunningUserScores, self).__init__() 55 | 56 | def expand(self, p): 57 | # NOTE: allowed_lateness is not yet available in Python FixedWindows. 58 | # NOTE: AfterProcessingTime not yet available in Python. 
59 | return (p 60 | | 'window' >> beam.WindowInto( 61 | beam.window.GlobalWindows(), 62 | trigger=trigger.AfterWatermark(early=trigger.AfterCount(100)), 63 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING) 64 | | 'extract_user_score' >> ExtractAndSumScore('user') 65 | ) 66 | 67 | 68 | class WindowedTeamScore(beam.PTransform): 69 | """Calculates scores for each team within the configured window duration""" 70 | def __init__(self, duration): 71 | super(WindowedTeamScore, self).__init__() 72 | self.duration = duration 73 | 74 | def expand(self, p): 75 | return (p 76 | | 'window' >> beam.WindowInto(window.FixedWindows(self.duration)) 77 | | 'extract_team_score' >> ExtractAndSumScore('team') 78 | ) 79 | 80 | 81 | class FormatTeamScoreSum(beam.DoFn): 82 | """Format a KV of team and its score to a BigQuery TableRow.""" 83 | def process(self, team_score, window=beam.DoFn.WindowParam): 84 | team, score = team_score 85 | start = int(window.start) 86 | yield { 87 | 'team': team, 88 | 'total_score': score, 89 | 'window_start': start, 90 | } 91 | 92 | 93 | class FormatUserScoreSum(beam.DoFn): 94 | """Format a KV of user and their score to a BigQuery TableRow.""" 95 | def process(self, user_score, window=beam.DoFn.WindowParam): 96 | user, score = user_score 97 | yield { 98 | 'user': user, 99 | 'total_score': score, 100 | 'processing_time': time.time(), 101 | } 102 | 103 | 104 | def Run(argv=None): 105 | known_args, pipeline_args = ParseArgs(argv) 106 | pipeline_options = PipelineOptions(pipeline_args) 107 | pipeline_options.view_as(SetupOptions).save_main_session = True 108 | p = beam.Pipeline(options=pipeline_options) 109 | window_duration = 1 * 60 # 1 minute windows. 
110 | if known_args.topic: 111 | pipeline_options.view_as(StandardOptions).streaming = True 112 | 113 | project = pipeline_options.view_as(GoogleCloudOptions).project 114 | timestamp_attribute = 'timestamp_ms' 115 | events = None 116 | if (not known_args.topic): 117 | events = (p 118 | | 'read' >> ReadFromText(known_args.input) 119 | | 'parse' >> beam.ParDo(ParseEventFn()) 120 | | 'add_event_timestamps' >> beam.Map( 121 | lambda x: beam.window.TimestampedValue(x, x.timestamp)) 122 | ) 123 | else: 124 | events = (p 125 | | 'read' >> ReadFromPubSub(topic=known_args.topic, 126 | timestamp_attribute='timestamp_ms') 127 | | 'decode' >> beam.ParDo(ParseEventFn()) 128 | ) 129 | 130 | # Window team scores and write them to BigQuery. 131 | _ = (events 132 | | 'windowed_team_score' >> WindowedTeamScore(window_duration) 133 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 134 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery( 135 | known_args.output_tablename + '_team', known_args.output_dataset, 136 | project, TEAM_SCHEMA) 137 | ) 138 | 139 | # Write leaderboards to BigQuery. 140 | _ = (events 141 | | 'running_user_score' >> RunningUserScores() 142 | | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum()) 143 | | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery( 144 | known_args.output_tablename + '_user', known_args.output_dataset, 145 | project, USER_SCHEMA) 146 | ) 147 | 148 | p.run().wait_until_finish() 149 | 150 | 151 | if __name__ == '__main__': 152 | logging.getLogger().setLevel(logging.INFO) 153 | Run() 154 | -------------------------------------------------------------------------------- /py/solutions/exercise5.py: -------------------------------------------------------------------------------- 1 | # Filter 'cheating' or 'spammy' users from the game results. 2 | # Computes the global mean score and filters users that are 3 | # some threshold above that score.
4 | from __future__ import absolute_import 5 | 6 | import logging 7 | import re 8 | import time 9 | 10 | import apache_beam as beam 11 | from apache_beam.io import ReadFromText 12 | from apache_beam.io import ReadFromPubSub 13 | from apache_beam.io import WriteToText 14 | from apache_beam.metrics import Metrics 15 | from apache_beam.metrics.metric import MetricsFilter 16 | from apache_beam.options.pipeline_options import PipelineOptions 17 | from apache_beam.options.pipeline_options import SetupOptions 18 | from apache_beam.options.pipeline_options import StandardOptions 19 | from apache_beam.options.pipeline_options import GoogleCloudOptions 20 | from apache_beam.transforms import trigger 21 | from util.util import GameEvent 22 | from util.util import ParseEvent 23 | from util.util import ParseEventFn 24 | from util.util import ParseArgs 25 | import apache_beam.transforms.window as window 26 | 27 | # Defines the BigQuery schemas. 28 | USER_SCHEMA = ('user:STRING,' 29 | 'total_score:INTEGER,' 30 | 'processing_time:TIMESTAMP') 31 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP') 32 | 33 | 34 | class ExtractAndSumScore(beam.PTransform): 35 | def __init__(self, field): 36 | super(ExtractAndSumScore, self).__init__() 37 | self.field = field 38 | 39 | def expand(self, p): 40 | return (p 41 | | 'extract_field' >> beam.Map( 42 | lambda x: (vars(x)[self.field], x.score)) 43 | | beam.CombinePerKey(sum) 44 | ) 45 | 46 | 47 | class WindowedUserScores(beam.PTransform): 48 | """Extract user/score pairs via in fixed windows.""" 49 | def __init__(self, duration): 50 | super(WindowedUserScores, self).__init__() 51 | self.duration = duration 52 | 53 | def expand(self, p): 54 | return (p 55 | | 'window' >> beam.WindowInto( 56 | window.FixedWindows(self.duration)) 57 | | 'extract_user_score' >> ExtractAndSumScore('user') 58 | ) 59 | 60 | 61 | class FilterUser(beam.DoFn): 62 | """Filter a user if their score * score_weight > avg_score.""" 63 | def 
__init__(self, score_weight): 64 | super(FilterUser, self).__init__() 65 | self.score_weight = score_weight 66 | self.num_spammy_users = Metrics.counter(self.__class__, 67 | 'num_spammy_users') 68 | 69 | def process(self, user_score, avg_score=beam.DoFn.SideInputParam): 70 | user, score = user_score 71 | if score * self.score_weight > avg_score: 72 | logging.error('User %s filtered as spammy', user) 73 | self.num_spammy_users.inc() 74 | yield user 75 | 76 | 77 | class ComputeSpammyUsers(beam.PTransform): 78 | """Compute users with a high clickrate, which we will consider spammy. 79 | We do this by finding the mean total score per user and filter out 80 | those with scores that are greater than the mean * score_weight 81 | """ 82 | def __init__(self, score_weight): 83 | super(ComputeSpammyUsers, self).__init__() 84 | self.score_weight = score_weight 85 | 86 | def expand(self, p): 87 | avg_score = (p 88 | | beam.Values() 89 | | beam.CombineGlobally( 90 | beam.combiners.MeanCombineFn()).as_singleton_view() 91 | ) 92 | return (p 93 | | 'compute_spammers' >> beam.ParDo( 94 | FilterUser(self.score_weight), avg_score=avg_score) 95 | ) 96 | 97 | 98 | class FilterSpammers(beam.DoFn): 99 | """Remove users found in the spam list.""" 100 | def __init__(self): 101 | super(FilterSpammers, self).__init__() 102 | self.filtered_scores = Metrics.counter(self.__class__, 103 | 'filtered_scores') 104 | 105 | def process(self, elem, spammers=beam.DoFn.SideInputParam): 106 | user = elem.user 107 | if user not in spammers: 108 | yield elem 109 | else: 110 | self.filtered_scores.inc() 111 | 112 | 113 | class WindowedTeamScore(beam.PTransform): 114 | """Calculates scores for each team within the configured window duration""" 115 | def __init__(self, duration, spammers): 116 | super(WindowedTeamScore, self).__init__() 117 | self.duration = duration 118 | self.spammers = spammers 119 | 120 | def expand(self, p): 121 | return (p 122 | | 'window' >> beam.WindowInto( 123 | 
window.FixedWindows(self.duration)) 124 | | 'filter_spammers' >> beam.ParDo( 125 | FilterSpammers(), spammers=self.spammers) 126 | | 'extract_team_score' >> ExtractAndSumScore('team') 127 | ) 128 | 129 | 130 | class FormatTeamScoreSum(beam.DoFn): 131 | def process(self, team_score, window=beam.DoFn.WindowParam): 132 | team, score = team_score 133 | start = int(window.start) 134 | yield { 135 | 'team': team, 136 | 'total_score': score, 137 | 'window_start': start, 138 | } 139 | 140 | 141 | class FormatUserScoreSum(beam.DoFn): 142 | def process(self, user_score, window=beam.DoFn.WindowParam): 143 | user, score = user_score 144 | yield { 145 | 'user': user, 146 | 'total_score': score, 147 | 'processing_time': time.time(), 148 | } 149 | 150 | 151 | def Run(argv=None): 152 | known_args, pipeline_args = ParseArgs(argv) 153 | pipeline_options = PipelineOptions(pipeline_args) 154 | pipeline_options.view_as(SetupOptions).save_main_session = True 155 | p = beam.Pipeline(options=pipeline_options) 156 | window_duration = 1 * 60 # 1 minute windows. 
157 | if known_args.topic: 158 | pipeline_options.view_as(StandardOptions).streaming = True 159 | 160 | project = pipeline_options.view_as(GoogleCloudOptions).project 161 | timestamp_attribute = 'timestamp_ms' 162 | events = None 163 | if (not known_args.topic): 164 | events = (p 165 | | 'read' >> ReadFromText(known_args.input) 166 | | 'parse' >> beam.ParDo(ParseEventFn()) 167 | | 'add_event_timestamps' >> beam.Map( 168 | lambda x: beam.window.TimestampedValue(x, x.timestamp))) 169 | else: 170 | events = (p 171 | | 'read' >> ReadFromPubSub( 172 | topic=known_args.topic, 173 | timestamp_attribute='timestamp_ms') 174 | | 'decode' >> beam.ParDo(ParseEventFn())) 175 | 176 | user_scores = (events 177 | | 'window_user_scores' >> WindowedUserScores(window_duration)) 178 | spammers = beam.pvalue.AsList(user_scores 179 | | 'compute_spammers' >> ComputeSpammyUsers(2.5)) 180 | 181 | _ = (events 182 | | 'windowed_team_score' >> WindowedTeamScore(window_duration, spammers) 183 | | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum()) 184 | | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery( 185 | known_args.output_tablename, known_args.output_dataset, project, 186 | TEAM_SCHEMA) 187 | ) 188 | 189 | p.run().wait_until_finish() 190 | 191 | 192 | if __name__ == '__main__': 193 | logging.getLogger().setLevel(logging.INFO) 194 | Run() 195 | -------------------------------------------------------------------------------- /py/solutions/exercise6.py: -------------------------------------------------------------------------------- 1 | # This pipeline computes the average duration of user sessions. The 2 | # averages are windowed, to reflect durations differing over time.
3 | from __future__ import absolute_import 4 | 5 | import logging 6 | import re 7 | import time 8 | 9 | import apache_beam as beam 10 | import apache_beam.transforms.window as window 11 | from apache_beam.io import ReadFromText 12 | from apache_beam.io import ReadFromPubSub 13 | from apache_beam.io import WriteToText 14 | from apache_beam.metrics import Metrics 15 | from apache_beam.metrics.metric import MetricsFilter 16 | from apache_beam.options.pipeline_options import PipelineOptions 17 | from apache_beam.options.pipeline_options import SetupOptions 18 | from apache_beam.options.pipeline_options import StandardOptions 19 | from apache_beam.options.pipeline_options import GoogleCloudOptions 20 | from apache_beam.transforms import trigger 21 | from util.util import GameEvent 22 | from util.util import ParseEvent 23 | from util.util import ParseEventFn 24 | from util.util import ParseArgs 25 | 26 | # Defines the BigQuery schemas. 27 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT') 28 | 29 | 30 | class UserSessionActivity(beam.DoFn): 31 | """Compute the duration of a user's session.""" 32 | def process(self, 33 | elem, 34 | timestamp=beam.DoFn.TimestampParam, 35 | window=beam.DoFn.WindowParam): 36 | duration = int(window.end) - int(window.start) 37 | yield duration 38 | 39 | 40 | class FormatSessionMeans(beam.DoFn): 41 | """Format session means for output to BQ""" 42 | def process(self, elem, window=beam.DoFn.WindowParam): 43 | yield {'window_start': int(window.start), 'mean_duration': elem} 44 | 45 | 46 | def Run(argv=None): 47 | known_args, pipeline_args = ParseArgs(argv) 48 | pipeline_options = PipelineOptions(pipeline_args) 49 | pipeline_options.view_as(SetupOptions).save_main_session = True 50 | p = beam.Pipeline(options=pipeline_options) 51 | if known_args.topic: 52 | pipeline_options.view_as(StandardOptions).streaming = True 53 | 54 | project = pipeline_options.view_as(GoogleCloudOptions).project 55 | timestamp_attribute = 'timestamp_ms' 
56 | events = None 57 | if (not known_args.topic): 58 | events = (p 59 | | 'read' >> ReadFromText(known_args.input) 60 | | 'parse' >> beam.ParDo(ParseEventFn()) 61 | | 'add_event_timestamps' >> beam.Map( 62 | lambda x: beam.window.TimestampedValue(x, x.timestamp))) 63 | else: 64 | events = (p 65 | | 'read' >> ReadFromPubSub( 66 | topic=known_args.topic, 67 | timestamp_attribute='timestamp_ms') 68 | | 'parse' >> beam.ParDo(ParseEventFn())) 69 | 70 | _ = (events 71 | | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score)) 72 | | 'sessionize' >> beam.WindowInto( 73 | window.Sessions(float(known_args.session_gap))) 74 | | 'drop_scores' >> beam.CombinePerKey(lambda x: 0) 75 | | 'convert_to_activity' >> beam.ParDo(UserSessionActivity()) 76 | | 'window_of_sessions' >> beam.WindowInto( 77 | window.FixedWindows(int(known_args.user_activity_window))) 78 | | 'session_mean' >> beam.CombineGlobally( 79 | beam.combiners.MeanCombineFn()).without_defaults() 80 | | 'format_sessions' >> beam.ParDo(FormatSessionMeans()) 81 | | 'write_to_bigquery' >> beam.io.WriteToBigQuery( 82 | known_args.output_tablename, known_args.output_dataset, project, 83 | SESSION_SCHEMA) 84 | ) 85 | 86 | p.run().wait_until_finish() 87 | 88 | 89 | if __name__ == '__main__': 90 | logging.getLogger().setLevel(logging.INFO) 91 | Run() 92 | -------------------------------------------------------------------------------- /py/solutions/exercise7.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import logging 4 | import re 5 | import time 6 | 7 | import apache_beam as beam 8 | import apache_beam.transforms.window as window 9 | from apache_beam.io import ReadFromText 10 | from apache_beam.io import ReadFromPubSub 11 | from apache_beam.io import WriteToText 12 | from apache_beam.metrics import Metrics 13 | from apache_beam.metrics.metric import MetricsFilter 14 | from apache_beam.options.pipeline_options import
PipelineOptions 15 | from apache_beam.options.pipeline_options import SetupOptions 16 | from apache_beam.options.pipeline_options import StandardOptions 17 | from apache_beam.options.pipeline_options import GoogleCloudOptions 18 | from apache_beam.transforms import trigger 19 | from util.util import GameEvent 20 | from util.util import ParseEvent 21 | from util.util import ParseEventFn 22 | from util.util import ParsePlayEventFn 23 | from util.util import ParseArgs 24 | 25 | # Defines the BigQuery schemas. 26 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT') 27 | 28 | 29 | class ComputeLatency(beam.DoFn): 30 | def __init__(self): 31 | super(ComputeLatency, self).__init__() 32 | self.dropped_sessions_no_events = Metrics.counter( 33 | self.__class__, 'dropped_sessions_no_events') 34 | self.dropped_sessions_too_many_events = Metrics.counter( 35 | self.__class__, 'dropped_sessions_too_many_events') 36 | self.dropped_sessions_no_play_events = Metrics.counter( 37 | self.__class__, 'dropped_sessions_no_play_events') 38 | 39 | def process(self, elem): 40 | _, vals = elem 41 | plays = vals['plays'] 42 | events = vals['events'] 43 | 44 | play_count = 0 45 | max_play_ts = 0 46 | for play in plays: 47 | play_count += 1 48 | max_play_ts = max(max_play_ts, long(play.timestamp)) 49 | 50 | event_count = 0 51 | an_event = None 52 | for event in events: 53 | an_event = event 54 | event_count += 1 55 | 56 | if event_count == 0: 57 | self.dropped_sessions_no_events.inc() 58 | elif event_count > 1: 59 | self.dropped_sessions_too_many_events.inc() 60 | elif play_count == 0: 61 | self.dropped_sessions_no_play_events.inc() 62 | else: 63 | min_latency = long(an_event.timestamp) - max_play_ts 64 | yield (an_event.user, min_latency) 65 | 66 | 67 | class DetectBadUsers(beam.DoFn): 68 | def process(self, elem, mean_latency=beam.DoFn.SideInputParam): 69 | user, latency = elem 70 | # Naive: compute bad users are users 5 times less than 71 | # the mean. 
72 | if latency < mean_latency / 5: 73 | yield user 74 | 75 | 76 | def Run(argv=None): 77 | known_args, pipeline_args = ParseArgs(argv) 78 | pipeline_options = PipelineOptions(pipeline_args) 79 | pipeline_options.view_as(SetupOptions).save_main_session = True 80 | p = beam.Pipeline(options=pipeline_options) 81 | if known_args.topic: 82 | pipeline_options.view_as(StandardOptions).streaming = True 83 | 84 | project = pipeline_options.view_as(GoogleCloudOptions).project 85 | timestamp_attribute = 'timestamp_ms' 86 | events = None 87 | if (not known_args.topic or not known_args.play_topic): 88 | logging.fatal('topic and play_topic are required.') 89 | 90 | events = (p 91 | | 'read_events' >> ReadFromPubSub( 92 | topic=known_args.topic, 93 | timestamp_attribute='timestamp_ms') 94 | | 'parse_events' >> beam.ParDo(ParseEventFn()) 95 | ) 96 | 97 | play_events = (p 98 | | 'read_play_events' >> ReadFromPubSub( 99 | topic=known_args.play_topic, 100 | timestamp_attribute='timestamp_ms') 101 | | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()) 102 | ) 103 | 104 | sessionized_events = (events 105 | | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x)) 106 | | 'sessionize_events' >> beam.WindowInto( 107 | window.Sessions(float(known_args.session_gap)))) 108 | 109 | sessionized_plays = (play_events 110 | | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x)) 111 | | 'sessionize_plays' >> beam.WindowInto( 112 | window.Sessions(float(known_args.session_gap)))) 113 | 114 | per_user_latency = ( 115 | {'plays': sessionized_plays, 'events': sessionized_events} 116 | | 'cbk' >> beam.CoGroupByKey() 117 | | 'compute_latency' >> beam.ParDo(ComputeLatency())) 118 | 119 | mean_latency = (per_user_latency 120 | | 'extract_latencies' >> beam.Values() 121 | | 'global_window' >> beam.WindowInto( 122 | window.GlobalWindows(), 123 | trigger=trigger.Repeatedly(trigger.AfterCount(1000)), 124 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING) 125 | | 'compute_mean' >>
beam.CombineGlobally( 126 | beam.combiners.MeanCombineFn()).with_fanout(16).as_singleton_view() 127 | ) 128 | 129 | _ = (per_user_latency 130 | | 'detect_bad_users' >> beam.ParDo( 131 | DetectBadUsers(), mean_latency=mean_latency) 132 | | 'filter_duplicates' >> beam.WindowInto( 133 | window.GlobalWindows(), trigger=trigger.AfterCount(1), 134 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING) 135 | | 'to_bq_schema' >> beam.Map(lambda x: {'user': x}) 136 | | 'write_bad_users' >> beam.io.WriteToBigQuery( 137 | known_args.output_tablename, known_args.output_dataset, project, ('user:string')) 138 | ) 139 | 140 | p.run().wait_until_finish() 141 | 142 | 143 | if __name__ == '__main__': 144 | logging.getLogger().setLevel(logging.INFO) 145 | Run() 146 | -------------------------------------------------------------------------------- /py/util/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['util'] 2 | -------------------------------------------------------------------------------- /py/util/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/util/__init__.pyc -------------------------------------------------------------------------------- /py/util/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import argparse 4 | import collections 5 | import logging 6 | 7 | import apache_beam as beam 8 | from apache_beam.metrics.metric import Metrics 9 | 10 | GameEvent = collections.namedtuple( 11 | 'GameEvent', ['user', 'team', 'score', 'timestamp', 'event_id']) 12 | PlayEvent = collections.namedtuple('PlayEvent', 13 | ['user', 'timestamp', 'event_id']) 14 | 15 | 16 | class ParseEventFn(beam.DoFn): 17 | """Parses an event. 
18 | [user,team,score,timestamp,readable_timestamp,event_id] 19 | """ 20 | def __init__(self): 21 | super(ParseEventFn, self).__init__() 22 | self.num_parse_errors = Metrics.counter(self.__class__, 23 | 'num_event_parse_errors') 24 | 25 | def process(self, elem): 26 | try: 27 | parts = [x.strip() for x in elem.split(',')] 28 | user, team, score, timestamp = parts[:4] 29 | score = int(score) 30 | timestamp = long(timestamp) 31 | if len(parts) >= 6: 32 | event_id = parts[5] 33 | else: 34 | event_id = 'none' 35 | yield GameEvent(user, team, score, timestamp, event_id) 36 | except Exception as e: 37 | self.num_parse_errors.inc() 38 | logging.error('Parse error on "%s": %s', elem, str(e)) 39 | 40 | 41 | class ParsePlayEventFn(beam.DoFn): 42 | """Parses a play event: [user,timestamp,readable_timestamp,event_id]""" 43 | def __init__(self): 44 | super(ParsePlayEventFn, self).__init__() 45 | self.num_parse_errors = Metrics.counter(self.__class__, 46 | 'num_play_parse_errors') 47 | 48 | def process(self, elem): 49 | try: 50 | parts = [x.strip() for x in elem.split(',')] 51 | user, timestamp, _, event_id = parts[:4] 52 | yield PlayEvent(user, timestamp, event_id) 53 | except Exception as e: 54 | self.num_parse_errors.inc() 55 | logging.error('Parse error on "%s": %s', elem, str(e)) 56 | 57 | 58 | def ParseEvent(element): 59 | try: 60 | parts = [x.strip() for x in element.split(',')] 61 | user, team, score, timestamp = parts[:4] 62 | score = int(score) 63 | timestamp = long(timestamp) 64 | if len(parts) >= 6: 65 | event_id = parts[5] 66 | else: 67 | event_id = 'none' 68 | return [GameEvent(user, team, score, timestamp, event_id)] 69 | except Exception: 70 | return [] 71 | 72 | def ParseArgs(argv): 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--input', dest='input', help='Input file to process.') 75 | parser.add_argument( 76 | '--topic', dest='topic', help='Input topic to read from.') 77 | parser.add_argument( 78 | '--play_topic', 79 | dest='play_topic', 80 |
help='Input topic to read for play events.') 81 | parser.add_argument( 82 | '--output_dataset', 83 | dest='output_dataset', 84 | required=True, 85 | help='Output file to write results to.') 86 | parser.add_argument( 87 | '--output_tablename', 88 | dest='output_tablename', 89 | required=True, 90 | help='Output file to write results to.') 91 | parser.add_argument( 92 | '--session_gap', 93 | dest='session_gap', 94 | help='Gap between user sessions, in seconds.') 95 | parser.add_argument( 96 | '--user_activity_window', 97 | dest='user_activity_window', 98 | help= 99 | 'Value of fixed window for finding mean of session duration, in second.' 100 | ) 101 | return parser.parse_known_args(argv) 102 | -------------------------------------------------------------------------------- /py/util/util.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/util/util.pyc -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/Exercise0.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableReference; 21 | import com.google.api.services.bigquery.model.TableRow; 22 | import com.google.api.services.bigquery.model.TableSchema; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import org.apache.beam.examples.complete.game.utils.GameEvent; 26 | import org.apache.beam.examples.complete.game.utils.Options; 27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn; 28 | import org.apache.beam.sdk.Pipeline; 29 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 30 | import org.apache.beam.sdk.io.TextIO; 31 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 34 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 35 | import org.apache.beam.sdk.transforms.DoFn; 36 | import org.apache.beam.sdk.transforms.ParDo; 37 | 38 | /** 39 | * Zeroth (no code changes necessary) in a series of exercises in a gaming domain. 40 | * 41 | *

This batch pipeline imports game events from CSV to BigQuery. 42 | * 43 | *

See README.md for details. 44 | */ 45 | public class Exercise0 { 46 | 47 | /** 48 | * Format a GameEvent to a BigQuery TableRow. 49 | */ 50 | static class FormatGameEventFn extends DoFn { 51 | 52 | @ProcessElement 53 | public void processElement(ProcessContext c) { 54 | GameEvent event = c.element(); 55 | TableRow row = new TableRow() 56 | .set("user", event.getUser()) 57 | .set("team", event.getTeam()) 58 | .set("score", event.getScore()) 59 | .set("timestamp", event.getTimestamp() / 1000); 60 | c.output(row); 61 | } 62 | 63 | /** 64 | * Defines the BigQuery schema. 65 | */ 66 | static TableSchema getSchema() { 67 | List fields = new ArrayList<>(); 68 | fields.add(new TableFieldSchema().setName("user").setType("STRING")); 69 | fields.add(new TableFieldSchema().setName("team").setType("STRING")); 70 | fields.add(new TableFieldSchema().setName("score").setType("INTEGER")); 71 | fields.add(new TableFieldSchema().setName("timestamp").setType("TIMESTAMP")); 72 | return new TableSchema().setFields(fields); 73 | } 74 | } 75 | 76 | /** 77 | * Run a batch pipeline. 78 | */ 79 | public static void main(String[] args) throws Exception { 80 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 81 | Pipeline pipeline = Pipeline.create(options); 82 | 83 | TableReference tableRef = new TableReference(); 84 | tableRef.setDatasetId(options.getOutputDataset()); 85 | tableRef.setProjectId(options.as(GcpOptions.class).getProject()); 86 | tableRef.setTableId(options.getOutputTableName()); 87 | 88 | // Read events from a CSV file, parse them and write (import) them to BigQuery. 
89 | pipeline 90 | .apply(TextIO.read().from(options.getInput())) 91 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 92 | .apply("FormatGameEvent", ParDo.of(new FormatGameEventFn())) 93 | .apply( 94 | BigQueryIO.writeTableRows().to(tableRef) 95 | .withSchema(FormatGameEventFn.getSchema()) 96 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 97 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 98 | 99 | pipeline.run(); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/Exercise1.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableReference; 21 | import com.google.api.services.bigquery.model.TableRow; 22 | import com.google.api.services.bigquery.model.TableSchema; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import org.apache.beam.examples.complete.game.utils.ChangeMe; 26 | import org.apache.beam.examples.complete.game.utils.GameEvent; 27 | import org.apache.beam.examples.complete.game.utils.Options; 28 | import org.apache.beam.examples.complete.game.utils.ParseEventFn; 29 | import org.apache.beam.sdk.Pipeline; 30 | import org.apache.beam.sdk.PipelineResult; 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 32 | import org.apache.beam.sdk.io.TextIO; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 36 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 37 | import org.apache.beam.sdk.transforms.DoFn; 38 | import org.apache.beam.sdk.transforms.PTransform; 39 | import org.apache.beam.sdk.transforms.ParDo; 40 | import org.apache.beam.sdk.values.KV; 41 | import org.apache.beam.sdk.values.PCollection; 42 | 43 | /** 44 | * First in a series of coding exercises in a gaming domain. 45 | * 46 | *

This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data 47 | * and writes the sums to BigQuery. 48 | * 49 | *

See README.md for details. 50 | */ 51 | public class Exercise1 { 52 | 53 | /** 54 | * A transform to extract key/score information from GameEvent, and sum 55 | * the scores. The constructor arg determines whether 'team' or 'user' info is 56 | * extracted. 57 | */ 58 | public static class ExtractAndSumScore 59 | extends PTransform, PCollection>> { 60 | 61 | private final String field; 62 | 63 | public ExtractAndSumScore(String field) { 64 | this.field = field; 65 | } 66 | 67 | @Override 68 | public PCollection> expand(PCollection gameEvents) { 69 | // [START EXERCISE 1]: 70 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/ 71 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#transforms-pardo 72 | // Also: https://cloud.google.com/dataflow/model/par-do 73 | // 74 | // Fill in the code to: 75 | // 1. Extract a KV from each GameEvent corresponding to the given 76 | // field and the score. 77 | // 2. Compute the sum of the scores for each key. 78 | // 3. Run your pipeline on the Dataflow service. 79 | return gameEvents 80 | .apply(ParDo.of(new DoFn>(){ 81 | @ProcessElement 82 | public void processElement(ProcessContext c) { 83 | // 1. Creates key-value pairs, using the KeyField as the key and 84 | // the score as the value. KV.of(key, value) creates a key-value pair. 85 | /* TODO: YOUR CODE GOES HERE */ 86 | } 87 | })) 88 | // 2. Sum is a family of PTransforms for computing the sum of elements in a PCollection. 89 | // Select the appropriate method to compute the sum over each key. 90 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */); 91 | // [END EXERCISE 1]: 92 | } 93 | } 94 | 95 | /** 96 | * Format a KV of user and their score to a BigQuery TableRow. 
97 | */ 98 | static class FormatUserScoreSumsFn extends DoFn, TableRow> { 99 | 100 | @ProcessElement 101 | public void processElement(ProcessContext c) { 102 | TableRow row = new TableRow() 103 | .set("user", c.element().getKey()) 104 | .set("total_score", c.element().getValue()); 105 | c.output(row); 106 | } 107 | 108 | /** 109 | * Defines the BigQuery schema. 110 | */ 111 | static TableSchema getSchema() { 112 | List fields = new ArrayList<>(); 113 | fields.add(new TableFieldSchema().setName("user").setType("STRING")); 114 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 115 | return new TableSchema().setFields(fields); 116 | } 117 | } 118 | 119 | /** 120 | * Run a batch pipeline. 121 | */ 122 | public static void main(String[] args) throws Exception { 123 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 124 | Pipeline pipeline = Pipeline.create(options); 125 | 126 | TableReference tableRef = new TableReference(); 127 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset()); 128 | tableRef.setProjectId(options.as(GcpOptions.class).getProject()); 129 | tableRef.setTableId(options.getOutputTableName()); 130 | 131 | // Read events from a CSV file and parse them. 132 | pipeline 133 | .apply(TextIO.read().from(options.getInput())) 134 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 135 | // Extract and sum username/score pairs from the event data. 136 | .apply("ExtractUserScore", new ExtractAndSumScore("user")) 137 | // Write the results to BigQuery. 
138 | .apply("FormatUserScoreSums", ParDo.of(new FormatUserScoreSumsFn())) 139 | .apply( 140 | BigQueryIO.writeTableRows().to(tableRef) 141 | .withSchema(FormatUserScoreSumsFn.getSchema()) 142 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 143 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 144 | 145 | PipelineResult result = pipeline.run(); 146 | result.waitUntilFinish(); 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/Exercise2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableReference; 21 | import com.google.api.services.bigquery.model.TableRow; 22 | import com.google.api.services.bigquery.model.TableSchema; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import org.apache.beam.examples.complete.game.utils.ChangeMe; 26 | import org.apache.beam.examples.complete.game.utils.GameEvent; 27 | import org.apache.beam.examples.complete.game.utils.Options; 28 | import org.apache.beam.examples.complete.game.utils.ParseEventFn; 29 | import org.apache.beam.sdk.Pipeline; 30 | import org.apache.beam.sdk.PipelineResult; 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 32 | import org.apache.beam.sdk.io.TextIO; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 36 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 37 | import org.apache.beam.sdk.transforms.DoFn; 38 | import org.apache.beam.sdk.transforms.PTransform; 39 | import org.apache.beam.sdk.transforms.ParDo; 40 | import org.apache.beam.sdk.transforms.WithTimestamps; 41 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 42 | import org.apache.beam.sdk.values.KV; 43 | import org.apache.beam.sdk.values.PCollection; 44 | import org.joda.time.Duration; 45 | import org.joda.time.Instant; 46 | 47 | /** 48 | * Second in a series of coding exercises in a gaming domain. 49 | * 50 | *

This batch pipeline calculates the sum of scores per team per hour, over an entire batch of 51 | * gaming data and writes the per-team sums to BigQuery. 52 | * 53 | *

See README.md for details. 54 | */ 55 | public class Exercise2 { 56 | 57 | /** 58 | * A transform to compute the WindowedTeamScore. 59 | */ 60 | public static class WindowedTeamScore 61 | extends PTransform, PCollection>> { 62 | // Developer Docs for composite transforms: 63 | // https://beam.apache.org/documentation/programming-guide/#transforms-composite 64 | 65 | private Duration duration; 66 | 67 | public WindowedTeamScore(Duration duration) { 68 | this.duration = duration; 69 | } 70 | 71 | @Override 72 | public PCollection> expand(PCollection input) { 73 | // [START EXERCISE 2]: 74 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/ 75 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing 76 | // Also: https://cloud.google.com/dataflow/model/windowing 77 | // 78 | return input 79 | // Window.into() takes a WindowFn and returns a PTransform that 80 | // applies windowing to the PCollection. FixedWindows.of() returns a 81 | // WindowFn that assigns elements to windows of a fixed size. Use 82 | // these methods to apply fixed windows of size 83 | // this.duration to the PCollection. 84 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */) 85 | // Remember the ExtractAndSumScore PTransform from Exercise 1? We 86 | // parameterized it over the key field. Use it here to compute the "team" 87 | // scores (recall it is a public static method of Exercise1). 88 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */); 89 | // [END EXERCISE 2] 90 | } 91 | } 92 | 93 | /** 94 | * Format a KV of team and their score to a BigQuery TableRow. 
95 | */ 96 | public static class FormatTeamScoreSumsFn extends DoFn, TableRow>{ 97 | 98 | @ProcessElement 99 | public void processElement(ProcessContext c, IntervalWindow window) { 100 | TableRow row = 101 | new TableRow() 102 | .set("team", c.element().getKey()) 103 | .set("total_score", c.element().getValue()) 104 | .set("window_start", window.start().getMillis() / 1000); 105 | c.output(row); 106 | } 107 | 108 | /** 109 | * Defines the BigQuery schema. 110 | */ 111 | public static TableSchema getSchema() { 112 | List fields = new ArrayList<>(); 113 | fields.add(new TableFieldSchema().setName("team").setType("STRING")); 114 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 115 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); 116 | return new TableSchema().setFields(fields); 117 | } 118 | } 119 | 120 | /** 121 | * Run a batch pipeline. 122 | */ 123 | public static void main(String[] args) throws Exception { 124 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 125 | Pipeline pipeline = Pipeline.create(options); 126 | 127 | TableReference tableRef = new TableReference(); 128 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset()); 129 | tableRef.setProjectId(options.as(GcpOptions.class).getProject()); 130 | tableRef.setTableId(options.getOutputTableName()); 131 | 132 | // Read events from a CSV file and parse them. 133 | pipeline 134 | .apply(TextIO.read().from(options.getInput())) 135 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 136 | .apply( 137 | "AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp()))) 138 | .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60))) 139 | // Write the results to BigQuery. 
140 | .apply("FormatTeamScoreSums", ParDo.of(new FormatTeamScoreSumsFn())) 141 | .apply( 142 | BigQueryIO.writeTableRows().to(tableRef) 143 | .withSchema(FormatTeamScoreSumsFn.getSchema()) 144 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 145 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 146 | 147 | PipelineResult result = pipeline.run(); 148 | result.waitUntilFinish(); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/Exercise3.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game; 18 | 19 | import com.google.api.services.bigquery.model.TableReference; 20 | import org.apache.beam.sdk.Pipeline; 21 | import org.apache.beam.sdk.PipelineResult; 22 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 23 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 24 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 25 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 26 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 27 | import org.apache.beam.sdk.transforms.PTransform; 28 | import org.apache.beam.sdk.transforms.ParDo; 29 | import org.apache.beam.sdk.values.PBegin; 30 | import org.apache.beam.sdk.values.PCollection; 31 | import org.apache.beam.examples.complete.game.solutions.Exercise2; 32 | import org.apache.beam.examples.complete.game.utils.ChangeMe; 33 | import org.apache.beam.examples.complete.game.utils.GameEvent; 34 | import org.apache.beam.examples.complete.game.utils.Options; 35 | import org.joda.time.Duration; 36 | 37 | /** 38 | * Third in a series of coding exercises in a gaming domain. 39 | * 40 | *

This is the same pipeline as in Exercise 2, but can run in either batch or streaming mode. 41 | * 42 | *

See README.md for details. 43 | */ 44 | public class Exercise3 { 45 | 46 | /** 47 | * A transform to read the game events from either text files or Pub/Sub topic. 48 | */ 49 | public static class ReadGameEvents extends PTransform> { 50 | 51 | private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; 52 | 53 | private Options options; 54 | 55 | public ReadGameEvents(Options options) { 56 | this.options = options; 57 | } 58 | 59 | @Override 60 | public PCollection expand(PBegin begin) { 61 | // [START EXERCISE 3]: 62 | // Javadoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.html 63 | // Developer Docs (1.x): https://cloud.google.com/dataflow/model/pubsub-io 64 | // 65 | // Determine whether to use files or topic based on options. 66 | if (options.getInput() != null && !options.getInput().isEmpty()) { 67 | return begin 68 | .getPipeline() 69 | // Read game events from files. See main() in Exercise2. Don't forget to parse events or 70 | // to include WithTimestamps transform to assign timestamps to events. 71 | // https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/WithTimestamps.html 72 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */) 73 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */) 74 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */); 75 | } else { 76 | return begin 77 | .getPipeline() 78 | // Read game events from Pub/Sub topic options.getTopic() using custom timestamps, which 79 | // are extracted from the pubsub attribute TIMESTAMP_ATTRIBUTE. 80 | // Use PubsubIO.readStrings() with withTimestampAttribute() and fromTopic(). 81 | // https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.html 82 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */) 83 | // Parse the messages the same way as when they come from the text file. 
Note that we no 84 | // longer have to run WithTimestamps transform, as the timestamps are already set by 85 | // PubsubIO. (In streaming, changing timestamps must be done carefully to avoid 86 | // guarantees necessary for watermarks.) 87 | .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */); 88 | } 89 | // [END EXERCISE 3] 90 | } 91 | } 92 | 93 | /** 94 | * Run a batch or streaming pipeline. 95 | */ 96 | public static void main(String[] args) throws Exception { 97 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 98 | 99 | Pipeline pipeline = Pipeline.create(options); 100 | 101 | TableReference tableRef = new TableReference(); 102 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset()); 103 | tableRef.setProjectId(options.as(GcpOptions.class).getProject()); 104 | tableRef.setTableId(options.getOutputTableName()); 105 | 106 | // Read events from either a CSV file or PubSub stream. 107 | pipeline 108 | .apply(new ReadGameEvents(options)) 109 | .apply("WindowedTeamScore", new Exercise2.WindowedTeamScore(Duration.standardMinutes(60))) 110 | // Write the results to BigQuery. 111 | .apply("FormatTeamScoreSums", ParDo.of(new Exercise2.FormatTeamScoreSumsFn())) 112 | .apply( 113 | BigQueryIO.writeTableRows().to(tableRef) 114 | .withSchema(Exercise2.FormatTeamScoreSumsFn.getSchema()) 115 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 116 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 117 | 118 | PipelineResult result = pipeline.run(); 119 | result.waitUntilFinish(); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/Exercise4.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableReference; 21 | import com.google.api.services.bigquery.model.TableRow; 22 | import com.google.api.services.bigquery.model.TableSchema; 23 | import com.google.common.annotations.VisibleForTesting; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import org.apache.beam.examples.complete.game.solutions.Exercise1; 27 | import org.apache.beam.examples.complete.game.solutions.Exercise3; 28 | import org.apache.beam.examples.complete.game.utils.GameEvent; 29 | import org.apache.beam.examples.complete.game.utils.Options; 30 | import org.apache.beam.runners.dataflow.DataflowRunner; 31 | import org.apache.beam.sdk.Pipeline; 32 | import org.apache.beam.sdk.PipelineResult; 33 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 36 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 37 | import org.apache.beam.sdk.options.Default; 38 | import org.apache.beam.sdk.options.Description; 39 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 40 | import org.apache.beam.sdk.options.StreamingOptions; 41 
| import org.apache.beam.sdk.transforms.DoFn; 42 | import org.apache.beam.sdk.transforms.PTransform; 43 | import org.apache.beam.sdk.transforms.ParDo; 44 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 45 | import org.apache.beam.sdk.values.KV; 46 | import org.apache.beam.sdk.values.PCollection; 47 | import org.joda.time.Duration; 48 | import org.joda.time.Instant; 49 | 50 | /** 51 | * Fourth in a series of coding exercises in a gaming domain. 52 | * 53 | *

This streaming pipeline calculates user and team scores for a window of time and writes them 54 | * to BigQuery. 55 | * 56 | *

See README.md for details. 57 | */ 58 | public class Exercise4 { 59 | 60 | static final Duration TEN_SECONDS = Duration.standardSeconds(10); 61 | static final Duration THIRTY_SECONDS = Duration.standardSeconds(30); 62 | 63 | /** 64 | * Exercise4Options supported by {@link Exercise4}. 65 | */ 66 | interface Exercise4Options extends Options, StreamingOptions { 67 | 68 | @Description("Numeric value of fixed window duration for team analysis, in minutes") 69 | @Default.Integer(1) 70 | Integer getTeamWindowDuration(); 71 | 72 | void setTeamWindowDuration(Integer value); 73 | 74 | @Description("Numeric value of allowed data lateness, in minutes") 75 | @Default.Integer(2) 76 | Integer getAllowedLateness(); 77 | 78 | void setAllowedLateness(Integer value); 79 | } 80 | 81 | /** 82 | * Extract user/score pairs from the event stream using processing time, via global windowing. Get 83 | * periodic updates on all users' running scores. 84 | */ 85 | @VisibleForTesting 86 | static class CalculateUserScores 87 | extends PTransform, PCollection>> { 88 | 89 | private final Duration allowedLateness; 90 | 91 | CalculateUserScores(Duration allowedLateness) { 92 | this.allowedLateness = allowedLateness; 93 | } 94 | 95 | @Override 96 | public PCollection> expand(PCollection input) { 97 | // [START EXERCISE 4 PART 1]: 98 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html 99 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing 100 | // 101 | // Fill in the code to: 102 | // 1. Window the incoming input into global windows 103 | // 2. that trigger every thirty seconds to emit speculative results, 104 | // 3. allow late data with allowedLateness, 105 | // 3. and don't forget to accumulate over the entire window. 106 | return input 107 | /* TODO: SOLUTION CODE HERE */ 108 | // Extract and sum username/score pairs from the event data. 
109 | .apply("ExtractUserScore", new Exercise1.ExtractAndSumScore("user")); 110 | // [END EXERCISE 4 PART 1]: 111 | } 112 | } 113 | 114 | /** 115 | * Calculates scores for each team within the configured window duration. 116 | */ 117 | // Extract team/score pairs from the event stream, using hour-long windows by default. 118 | @VisibleForTesting 119 | static class CalculateTeamScores 120 | extends PTransform, PCollection>> { 121 | 122 | private final Duration teamWindowDuration; 123 | private final Duration allowedLateness; 124 | 125 | CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) { 126 | this.teamWindowDuration = teamWindowDuration; 127 | this.allowedLateness = allowedLateness; 128 | } 129 | 130 | @Override 131 | public PCollection> expand(PCollection infos) { 132 | // [START EXERCISE 4 PART 2]: 133 | // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html 134 | // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing 135 | // 136 | // Fill in the code to: 137 | // 1. Window the incoming input into fixed windows of team window duration, 138 | // 2. trigger on time results at the watermark, 139 | // 3. trigger speculative results every ten seconds, 140 | // 4. trigger late data results with a delay of thirty seconds, 141 | // 5. don't forget to set the allowedLateness, 142 | // 6. and ensure that we continue to accumulate over all data in the window. 143 | return infos 144 | /* TODO: SOLUTION CODE HERE */ 145 | // Extract and sum teamname/score pairs from the event data. 
146 | .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team")); 147 | // [END EXERCISE 4 PART 2]: 148 | } 149 | } 150 | 151 | public static void main(String[] args) throws Exception { 152 | Exercise4Options options = 153 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise4Options.class); 154 | // Enforce that this pipeline is always run in streaming mode. 155 | options.setStreaming(true); 156 | options.setRunner(DataflowRunner.class); 157 | Pipeline pipeline = Pipeline.create(options); 158 | 159 | TableReference teamTable = new TableReference(); 160 | teamTable.setDatasetId(options.getOutputDataset()); 161 | teamTable.setProjectId(options.as(GcpOptions.class).getProject()); 162 | teamTable.setTableId(options.getOutputTableName() + "_team"); 163 | 164 | TableReference userTable = new TableReference(); 165 | userTable.setDatasetId(options.getOutputDataset()); 166 | userTable.setProjectId(options.as(GcpOptions.class).getProject()); 167 | userTable.setTableId(options.getOutputTableName() + "_user"); 168 | 169 | PCollection gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options)); 170 | 171 | gameEvents 172 | .apply( 173 | "CalculateTeamScores", 174 | new CalculateTeamScores( 175 | Duration.standardMinutes(options.getTeamWindowDuration()), 176 | Duration.standardMinutes(options.getAllowedLateness()))) 177 | // Write the results to BigQuery. 178 | .apply("FormatTeamScores", ParDo.of(new FormatTeamScoreFn())) 179 | .apply( 180 | BigQueryIO.writeTableRows().to(teamTable) 181 | .withSchema(FormatTeamScoreFn.getSchema()) 182 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 183 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 184 | 185 | gameEvents 186 | .apply( 187 | "CalculateUserScores", 188 | new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness()))) 189 | // Write the results to BigQuery. 
190 | .apply("FormatUserScores", ParDo.of(new FormatUserScoreFn())) 191 | .apply( 192 | BigQueryIO.writeTableRows().to(userTable) 193 | .withSchema(FormatUserScoreFn.getSchema()) 194 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 195 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 196 | 197 | PipelineResult result = pipeline.run(); 198 | result.waitUntilFinish(); 199 | } 200 | 201 | /** 202 | * Format a KV of team and associated properties to a BigQuery TableRow. 203 | */ 204 | protected static class FormatTeamScoreFn extends DoFn, TableRow> { 205 | 206 | @ProcessElement 207 | public void processElement(ProcessContext c, IntervalWindow window) { 208 | TableRow row = 209 | new TableRow() 210 | .set("team", c.element().getKey()) 211 | .set("total_score", c.element().getValue()) 212 | .set("window_start", window.start().getMillis() / 1000) 213 | .set("processing_time", Instant.now().getMillis() / 1000) 214 | .set("timing", c.pane().getTiming().toString()); 215 | c.output(row); 216 | } 217 | 218 | static TableSchema getSchema() { 219 | List fields = new ArrayList<>(); 220 | fields.add(new TableFieldSchema().setName("team").setType("STRING")); 221 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 222 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); 223 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); 224 | fields.add(new TableFieldSchema().setName("timing").setType("STRING")); 225 | return new TableSchema().setFields(fields); 226 | } 227 | } 228 | 229 | /** 230 | * Format a KV of user and associated properties to a BigQuery TableRow. 
231 | */ 232 | static class FormatUserScoreFn extends DoFn, TableRow> { 233 | 234 | @ProcessElement 235 | public void processElement(ProcessContext c) { 236 | TableRow row = 237 | new TableRow() 238 | .set("user", c.element().getKey()) 239 | .set("total_score", c.element().getValue()) 240 | .set("processing_time", Instant.now().getMillis() / 1000); 241 | c.output(row); 242 | } 243 | 244 | static TableSchema getSchema() { 245 | List fields = new ArrayList<>(); 246 | fields.add(new TableFieldSchema().setName("user").setType("STRING")); 247 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 248 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); 249 | return new TableSchema().setFields(fields); 250 | } 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/Exercise6.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | package org.apache.beam.examples.complete.game; 17 | 18 | import com.google.api.services.bigquery.model.TableFieldSchema; 19 | import com.google.api.services.bigquery.model.TableReference; 20 | import com.google.api.services.bigquery.model.TableRow; 21 | import com.google.api.services.bigquery.model.TableSchema; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | import org.apache.beam.examples.complete.game.solutions.Exercise3.ReadGameEvents; 25 | import org.apache.beam.examples.complete.game.utils.ChangeMe; 26 | import org.apache.beam.examples.complete.game.utils.GameEvent; 27 | import org.apache.beam.examples.complete.game.utils.Options; 28 | import org.apache.beam.runners.dataflow.DataflowRunner; 29 | import org.apache.beam.sdk.Pipeline; 30 | import org.apache.beam.sdk.PipelineResult; 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 35 | import org.apache.beam.sdk.options.Default; 36 | import org.apache.beam.sdk.options.Description; 37 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 38 | import org.apache.beam.sdk.options.StreamingOptions; 39 | import org.apache.beam.sdk.transforms.Combine; 40 | import org.apache.beam.sdk.transforms.DoFn; 41 | import org.apache.beam.sdk.transforms.MapElements; 42 | import org.apache.beam.sdk.transforms.Mean; 43 | import org.apache.beam.sdk.transforms.ParDo; 44 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow; 45 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 46 | import org.apache.beam.sdk.values.KV; 47 | import org.apache.beam.sdk.values.PCollection; 48 | import org.apache.beam.sdk.values.TypeDescriptors; 49 | import org.joda.time.Duration; 50 | import org.slf4j.Logger; 51 | import org.slf4j.LoggerFactory; 52 | 53 
| /** 54 | * Sixth in a series of coding exercises in a gaming domain. 55 | * 56 | *

This exercise introduces session windows. 57 | * 58 | *

See README.md for details. 59 | */ 60 | public class Exercise6 { 61 | 62 | private static final Logger LOG = LoggerFactory.getLogger(Exercise6.class); 63 | 64 | /** 65 | * Calculate and output an element's session duration. 66 | */ 67 | private static class UserSessionInfoFn extends DoFn, Integer> { 68 | 69 | @ProcessElement 70 | public void processElement(ProcessContext c, BoundedWindow window) { 71 | IntervalWindow w = (IntervalWindow) window; 72 | int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes(); 73 | c.output(duration); 74 | } 75 | } 76 | 77 | /** 78 | * Options supported by {@link Exercise6}. 79 | */ 80 | interface Exercise6Options extends Options, StreamingOptions { 81 | 82 | @Description("Numeric value of gap between user sessions, in minutes") 83 | @Default.Integer(1) 84 | Integer getSessionGap(); 85 | 86 | void setSessionGap(Integer value); 87 | 88 | @Description( 89 | "Numeric value of fixed window for finding mean of user session duration, " + "in minutes") 90 | @Default.Integer(5) 91 | Integer getUserActivityWindowDuration(); 92 | 93 | void setUserActivityWindowDuration(Integer value); 94 | } 95 | 96 | public static void main(String[] args) throws Exception { 97 | 98 | Exercise6Options options = 99 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise6Options.class); 100 | // Enforce that this pipeline is always run in streaming mode. 
101 | options.setStreaming(true); 102 | options.setRunner(DataflowRunner.class); 103 | Pipeline pipeline = Pipeline.create(options); 104 | 105 | TableReference sessionsTable = new TableReference(); 106 | sessionsTable.setDatasetId(options.getOutputDataset()); 107 | sessionsTable.setProjectId(options.as(GcpOptions.class).getProject()); 108 | sessionsTable.setTableId(options.getOutputTableName()); 109 | 110 | PCollection rawEvents = pipeline.apply(new ReadGameEvents(options)); 111 | 112 | // Extract username/score pairs from the event stream 113 | PCollection> userEvents = 114 | rawEvents.apply( 115 | "ExtractUserScore", 116 | MapElements 117 | .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) 118 | .via((GameEvent gInfo) -> KV.of(gInfo.getUser(), 119 | gInfo.getScore()))); 120 | 121 | // [START EXERCISE 6]: 122 | // Detect user sessions-- that is, a burst of activity separated by a gap from further 123 | // activity. Find and record the mean session lengths. 124 | // This information could help the game designers track the changing user engagement 125 | // as their set of games changes. 126 | userEvents 127 | // Window the user events into sessions with gap options.getSessionGap() minutes. Make sure 128 | // to use an outputTimeFn that sets the output timestamp to the end of the window. This will 129 | // allow us to compute means on sessions based on their end times, rather than their start 130 | // times. 131 | // JavaDoc: 132 | // - https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Sessions.html 133 | // - https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html 134 | // Note: Pay attention to the withTimestampCombiner method on Window. 
135 | .apply("WindowIntoSessions", 136 | /* TODO: YOUR CODE GOES HERE */ 137 | new ChangeMe>, KV>()) 138 | // For this use, we care only about the existence of the session, not any particular 139 | // information aggregated over it, so the following is an efficient way to do that. 140 | .apply(Combine.perKey(x -> 0)) 141 | // Get the duration per session. 142 | .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())) 143 | // Note that the output of the previous transform is a PCollection of session durations 144 | // (PCollection) where the timestamp of elements is the end of the window. 145 | // 146 | // Re-window to process groups of session sums according to when the sessions complete. 147 | // In streaming we don't just ask "what is the mean value" we must ask "what is the mean 148 | // value for some window of time". To compute periodic means of session durations, we 149 | // re-window the session durations. 150 | .apply("WindowToExtractSessionMean", 151 | /* TODO: YOUR CODE GOES HERE */ 152 | new ChangeMe, Integer>()) 153 | // Find the mean session duration in each window. 154 | .apply(Mean.globally().withoutDefaults()) 155 | // Write this info to a BigQuery table. 156 | .apply("FormatSessions", ParDo.of(new FormatSessionWindowFn())) 157 | .apply( 158 | BigQueryIO.writeTableRows().to(sessionsTable) 159 | .withSchema(FormatSessionWindowFn.getSchema()) 160 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 161 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 162 | // [END EXERCISE 6]: 163 | 164 | PipelineResult result = pipeline.run(); 165 | result.waitUntilFinish(); 166 | } 167 | 168 | /** 169 | * Format a KV of session and associated properties to a BigQuery TableRow. 
170 | */ 171 | static class FormatSessionWindowFn extends DoFn { 172 | 173 | @ProcessElement 174 | public void processElement(ProcessContext c, BoundedWindow window) { 175 | IntervalWindow w = (IntervalWindow) window; 176 | TableRow row = 177 | new TableRow() 178 | .set("window_start", w.start().getMillis() / 1000) 179 | .set("mean_duration", c.element()); 180 | c.output(row); 181 | } 182 | 183 | static TableSchema getSchema() { 184 | List fields = new ArrayList<>(); 185 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); 186 | fields.add(new TableFieldSchema().setName("mean_duration").setType("FLOAT")); 187 | return new TableSchema().setFields(fields); 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/injector/InjectorUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.injector; 18 | 19 | import static com.google.common.base.Preconditions.checkNotNull; 20 | 21 | import com.google.api.client.googleapis.auth.oauth2.GoogleCredential; 22 | import com.google.api.client.googleapis.json.GoogleJsonResponseException; 23 | import com.google.api.client.googleapis.util.Utils; 24 | import com.google.api.client.http.HttpRequestInitializer; 25 | import com.google.api.client.http.HttpStatusCodes; 26 | import com.google.api.client.http.HttpTransport; 27 | import com.google.api.client.json.JsonFactory; 28 | import com.google.api.services.pubsub.Pubsub; 29 | import com.google.api.services.pubsub.PubsubScopes; 30 | import com.google.api.services.pubsub.model.Topic; 31 | import java.io.IOException; 32 | 33 | class InjectorUtils { 34 | 35 | private static final String APP_NAME = "injector"; 36 | 37 | /** 38 | * Builds a new Pubsub client and returns it. 39 | */ 40 | public static Pubsub getClient(final HttpTransport httpTransport, final JsonFactory jsonFactory) 41 | throws IOException { 42 | checkNotNull(httpTransport); 43 | checkNotNull(jsonFactory); 44 | GoogleCredential credential = 45 | GoogleCredential.getApplicationDefault(httpTransport, jsonFactory); 46 | if (credential.createScopedRequired()) { 47 | credential = credential.createScoped(PubsubScopes.all()); 48 | } 49 | if (credential.getClientAuthentication() != null) { 50 | System.out.println( 51 | "\n***Warning! 
You are not using service account credentials to " 52 | + "authenticate.\nYou need to use service account credentials for this example," 53 | + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run " 54 | + "out of PubSub quota very quickly.\nSee " 55 | + "https://developers.google.com/identity/protocols/application-default-credentials."); 56 | System.exit(1); 57 | } 58 | HttpRequestInitializer initializer = new RetryHttpInitializerWrapper(credential); 59 | return new Pubsub.Builder(httpTransport, jsonFactory, initializer) 60 | .setApplicationName(APP_NAME) 61 | .build(); 62 | } 63 | 64 | /** 65 | * Builds a new Pubsub client with default HttpTransport and JsonFactory and returns it. 66 | */ 67 | public static Pubsub getClient() throws IOException { 68 | return getClient(Utils.getDefaultTransport(), Utils.getDefaultJsonFactory()); 69 | } 70 | 71 | /** 72 | * Returns the fully qualified topic name for Pub/Sub. 73 | */ 74 | public static String getFullyQualifiedTopicName(final String project, final String topic) { 75 | return String.format("projects/%s/topics/%s", project, topic); 76 | } 77 | 78 | /** 79 | * Create a topic if it doesn't exist. 80 | */ 81 | public static void createTopic(Pubsub client, String fullTopicName) throws IOException { 82 | try { 83 | client.projects().topics().get(fullTopicName).execute(); 84 | } catch (GoogleJsonResponseException e) { 85 | if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) { 86 | Topic topic = client.projects().topics().create(fullTopicName, new Topic()).execute(); 87 | System.out.printf("Topic %s was created.\n", topic.getName()); 88 | } 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/injector/RetryHttpInitializerWrapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | package org.apache.beam.examples.complete.game.injector; 16 | 17 | import static com.google.common.base.Preconditions.checkNotNull; 18 | 19 | import com.google.api.client.auth.oauth2.Credential; 20 | import com.google.api.client.http.HttpBackOffIOExceptionHandler; 21 | import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler; 22 | import com.google.api.client.http.HttpRequest; 23 | import com.google.api.client.http.HttpRequestInitializer; 24 | import com.google.api.client.http.HttpResponse; 25 | import com.google.api.client.http.HttpUnsuccessfulResponseHandler; 26 | import com.google.api.client.util.ExponentialBackOff; 27 | import com.google.api.client.util.Sleeper; 28 | import java.io.IOException; 29 | import java.util.logging.Logger; 30 | 31 | /** 32 | * RetryHttpInitializerWrapper will automatically retry upon RPC failures, preserving the 33 | * auto-refresh behavior of the Google Credentials. 34 | */ 35 | public class RetryHttpInitializerWrapper implements HttpRequestInitializer { 36 | 37 | /** 38 | * A private logger. 39 | */ 40 | private static final Logger LOG = Logger.getLogger(RetryHttpInitializerWrapper.class.getName()); 41 | 42 | /** 43 | * One minutes in miliseconds. 
44 | */ 45 | private static final int ONEMINITUES = 60000; 46 | 47 | /** 48 | * Intercepts the request for filling in the "Authorization" header field, as well as recovering 49 | * from certain unsuccessful error codes wherein the Credential must refresh its token for a 50 | * retry. 51 | */ 52 | private final Credential wrappedCredential; 53 | 54 | /** 55 | * A sleeper; you can replace it with a mock in your test. 56 | */ 57 | private final Sleeper sleeper; 58 | 59 | /** 60 | * A constructor. 61 | * 62 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header. 63 | */ 64 | public RetryHttpInitializerWrapper(final Credential wrappedCredential) { 65 | this(wrappedCredential, Sleeper.DEFAULT); 66 | } 67 | 68 | /** 69 | * A protected constructor only for testing. 70 | * 71 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header. 72 | * @param sleeper Sleeper for easy testing. 73 | */ 74 | RetryHttpInitializerWrapper(final Credential wrappedCredential, final Sleeper sleeper) { 75 | this.wrappedCredential = checkNotNull(wrappedCredential); 76 | this.sleeper = sleeper; 77 | } 78 | 79 | /** 80 | * Initializes the given request. 
81 | */ 82 | @Override 83 | public final void initialize(final HttpRequest request) { 84 | request.setReadTimeout(2 * ONEMINITUES); // 2 minutes read timeout 85 | final HttpUnsuccessfulResponseHandler backoffHandler = 86 | new HttpBackOffUnsuccessfulResponseHandler(new ExponentialBackOff()).setSleeper(sleeper); 87 | request.setInterceptor(wrappedCredential); 88 | request.setUnsuccessfulResponseHandler( 89 | new HttpUnsuccessfulResponseHandler() { 90 | @Override 91 | public boolean handleResponse( 92 | final HttpRequest request, final HttpResponse response, final boolean supportsRetry) 93 | throws IOException { 94 | if (wrappedCredential.handleResponse(request, response, supportsRetry)) { 95 | // If credential decides it can handle it, 96 | // the return code or message indicated 97 | // something specific to authentication, 98 | // and no backoff is desired. 99 | return true; 100 | } else if (backoffHandler.handleResponse(request, response, supportsRetry)) { 101 | // Otherwise, we defer to the judgement of 102 | // our internal backoff handler. 103 | LOG.info("Retrying " + request.getUrl().toString()); 104 | return true; 105 | } else { 106 | return false; 107 | } 108 | } 109 | }); 110 | request.setIOExceptionHandler( 111 | new HttpBackOffIOExceptionHandler(new ExponentialBackOff()).setSleeper(sleeper)); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise1.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.solutions; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableReference; 21 | import com.google.api.services.bigquery.model.TableRow; 22 | import com.google.api.services.bigquery.model.TableSchema; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import org.apache.beam.examples.complete.game.utils.GameEvent; 26 | import org.apache.beam.examples.complete.game.utils.Options; 27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn; 28 | import org.apache.beam.sdk.Pipeline; 29 | import org.apache.beam.sdk.PipelineResult; 30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 31 | import org.apache.beam.sdk.io.TextIO; 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 35 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 36 | import org.apache.beam.sdk.transforms.DoFn; 37 | import org.apache.beam.sdk.transforms.MapElements; 38 | import org.apache.beam.sdk.transforms.PTransform; 39 | import org.apache.beam.sdk.transforms.ParDo; 40 | import org.apache.beam.sdk.transforms.Sum; 41 | import org.apache.beam.sdk.values.KV; 42 | import org.apache.beam.sdk.values.PCollection; 43 | import org.apache.beam.sdk.values.TypeDescriptors; 44 | 45 | /** 46 | * First in a series of 
coding exercises in a gaming domain. 47 | * 48 | *

This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data 49 | * and writes the sums to BigQuery. 50 | * 51 | *

See README.md for details. 52 | */ 53 | public class Exercise1 { 54 | 55 | /** 56 | * A transform to extract key/score information from GameEvent, and sum 57 | * the scores. The constructor arg determines whether 'team' or 'user' info is 58 | * extracted. 59 | */ 60 | public static class ExtractAndSumScore 61 | extends PTransform, PCollection>> { 62 | 63 | private final String field; 64 | 65 | public ExtractAndSumScore(String field) { 66 | this.field = field; 67 | } 68 | 69 | @Override 70 | public PCollection> expand(PCollection gameEvents) { 71 | return gameEvents 72 | .apply(ParDo.of(new DoFn>(){ 73 | @ProcessElement 74 | public void processElement(ProcessContext c) { 75 | GameEvent event = c.element(); 76 | c.output(KV.of(event.getKey(field), event.getScore())); 77 | } 78 | })) 79 | /* 80 | // alternate implementation 81 | .apply(MapElements 82 | .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) 83 | .via((GameEvent event) -> KV.of(event.getKey(field), 84 | event.getScore()))) */ 85 | .apply(Sum.integersPerKey()); 86 | } 87 | } 88 | 89 | /** 90 | * Format a KV of user and their score to a BigQuery TableRow. 91 | */ 92 | static class FormatUserScoreSumsFn extends DoFn, TableRow> { 93 | 94 | @ProcessElement 95 | public void processElement(ProcessContext c) { 96 | TableRow row = new TableRow() 97 | .set("user", c.element().getKey()) 98 | .set("total_score", c.element().getValue()); 99 | c.output(row); 100 | } 101 | 102 | /** 103 | * Defines the BigQuery schema. 104 | */ 105 | static TableSchema getSchema() { 106 | List fields = new ArrayList<>(); 107 | fields.add(new TableFieldSchema().setName("user").setType("STRING")); 108 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 109 | return new TableSchema().setFields(fields); 110 | } 111 | } 112 | 113 | /** 114 | * Run a batch pipeline. 
115 | */ 116 | public static void main(String[] args) throws Exception { 117 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 118 | Pipeline pipeline = Pipeline.create(options); 119 | 120 | TableReference tableRef = new TableReference(); 121 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset()); 122 | tableRef.setProjectId(options.as(GcpOptions.class).getProject()); 123 | tableRef.setTableId(options.getOutputTableName()); 124 | 125 | // Read events from a CSV file and parse them. 126 | pipeline 127 | .apply(TextIO.read().from(options.getInput())) 128 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 129 | // Extract and sum username/score pairs from the event data. 130 | .apply("ExtractUserScore", new ExtractAndSumScore("user")) 131 | // Write the results to BigQuery. 132 | .apply("FormatUserScoreSums", ParDo.of(new FormatUserScoreSumsFn())) 133 | .apply( 134 | BigQueryIO.writeTableRows().to(tableRef) 135 | .withSchema(FormatUserScoreSumsFn.getSchema()) 136 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 137 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 138 | 139 | PipelineResult result = pipeline.run(); 140 | result.waitUntilFinish(); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.solutions; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableReference; 21 | import com.google.api.services.bigquery.model.TableRow; 22 | import com.google.api.services.bigquery.model.TableSchema; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import org.apache.beam.examples.complete.game.utils.GameEvent; 26 | import org.apache.beam.examples.complete.game.utils.Options; 27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn; 28 | import org.apache.beam.sdk.Pipeline; 29 | import org.apache.beam.sdk.PipelineResult; 30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 31 | import org.apache.beam.sdk.io.TextIO; 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 35 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 36 | import org.apache.beam.sdk.transforms.DoFn; 37 | import org.apache.beam.sdk.transforms.PTransform; 38 | import org.apache.beam.sdk.transforms.ParDo; 39 | import org.apache.beam.sdk.transforms.WithTimestamps; 40 | import org.apache.beam.sdk.transforms.windowing.FixedWindows; 41 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 42 | import org.apache.beam.sdk.transforms.windowing.Window; 43 | import org.apache.beam.sdk.values.KV; 
44 | import org.apache.beam.sdk.values.PCollection; 45 | import org.joda.time.Duration; 46 | import org.joda.time.Instant; 47 | 48 | /** 49 | * Second in a series of coding exercises in a gaming domain. 50 | * 51 | *

This batch pipeline calculates the sum of scores per team per hour, over an entire batch of 52 | * gaming data and writes the per-team sums to BigQuery. 53 | * 54 | *

See README.md for details. 55 | */ 56 | public class Exercise2 { 57 | 58 | /** 59 | * A transform to compute the WindowedTeamScore. 60 | */ 61 | public static class WindowedTeamScore 62 | extends PTransform, PCollection>> { 63 | 64 | private Duration duration; 65 | 66 | public WindowedTeamScore(Duration duration) { 67 | this.duration = duration; 68 | } 69 | 70 | @Override 71 | public PCollection> expand(PCollection input) { 72 | return input 73 | .apply(Window.into(FixedWindows.of(duration))) 74 | .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team")); 75 | } 76 | } 77 | 78 | /** 79 | * Format a KV of team and their score to a BigQuery TableRow. 80 | */ 81 | public static class FormatTeamScoreSumsFn extends DoFn, TableRow> { 82 | 83 | @ProcessElement 84 | public void processElement(ProcessContext c, IntervalWindow window) { 85 | TableRow row = 86 | new TableRow() 87 | .set("team", c.element().getKey()) 88 | .set("total_score", c.element().getValue()) 89 | .set("window_start", window.start().getMillis() / 1000); 90 | c.output(row); 91 | } 92 | 93 | /** 94 | * Defines the BigQuery schema. 95 | */ 96 | public static TableSchema getSchema() { 97 | List fields = new ArrayList<>(); 98 | fields.add(new TableFieldSchema().setName("team").setType("STRING")); 99 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 100 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); 101 | return new TableSchema().setFields(fields); 102 | } 103 | } 104 | 105 | /** 106 | * Run a batch pipeline. 
107 | */ 108 | public static void main(String[] args) throws Exception { 109 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 110 | Pipeline pipeline = Pipeline.create(options); 111 | 112 | TableReference tableRef = new TableReference(); 113 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset()); 114 | tableRef.setProjectId(options.as(GcpOptions.class).getProject()); 115 | tableRef.setTableId(options.getOutputTableName()); 116 | 117 | // Read events from a CSV file and parse them. 118 | pipeline 119 | .apply(TextIO.read().from(options.getInput())) 120 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 121 | .apply( 122 | "AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp()))) 123 | .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60))) 124 | // Write the results to BigQuery. 125 | .apply("FormatTeamScoreSums", ParDo.of(new FormatTeamScoreSumsFn())) 126 | .apply( 127 | BigQueryIO.writeTableRows().to(tableRef) 128 | .withSchema(FormatTeamScoreSumsFn.getSchema()) 129 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 130 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 131 | 132 | PipelineResult result = pipeline.run(); 133 | result.waitUntilFinish(); 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise3.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.solutions; 18 | 19 | import com.google.api.services.bigquery.model.TableReference; 20 | import org.apache.beam.examples.complete.game.utils.GameEvent; 21 | import org.apache.beam.examples.complete.game.utils.Options; 22 | import org.apache.beam.examples.complete.game.utils.ParseEventFn; 23 | import org.apache.beam.sdk.Pipeline; 24 | import org.apache.beam.sdk.PipelineResult; 25 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 26 | import org.apache.beam.sdk.io.TextIO; 27 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 28 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 29 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 30 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; 31 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 32 | import org.apache.beam.sdk.transforms.PTransform; 33 | import org.apache.beam.sdk.transforms.ParDo; 34 | import org.apache.beam.sdk.transforms.WithTimestamps; 35 | import org.apache.beam.sdk.values.PBegin; 36 | import org.apache.beam.sdk.values.PCollection; 37 | import org.joda.time.Duration; 38 | import org.joda.time.Instant; 39 | 40 | /** 41 | * Third in a series of coding exercises in a gaming domain. 42 | * 43 | *

This is the same pipeline as in Exercise 2, but can run in either batch or streaming mode. 44 | * 45 | *

See README.md for details. 46 | */ 47 | public class Exercise3 { 48 | 49 | /** 50 | * A transform to read the game events from either text files or Pub/Sub topic. 51 | */ 52 | public static class ReadGameEvents extends PTransform> { 53 | 54 | private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; 55 | 56 | private Options options; 57 | 58 | public ReadGameEvents(Options options) { 59 | this.options = options; 60 | } 61 | 62 | @Override 63 | public PCollection expand(PBegin begin) { 64 | if (options.getInput() != null && !options.getInput().isEmpty()) { 65 | return begin 66 | .getPipeline() 67 | .apply(TextIO.read().from(options.getInput())) 68 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 69 | .apply( 70 | "AddEventTimestamps", 71 | WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp()))); 72 | } else { 73 | return begin 74 | .getPipeline() 75 | .apply(PubsubIO.readStrings().withTimestampAttribute(TIMESTAMP_ATTRIBUTE) 76 | .fromTopic(options.getTopic())) 77 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); 78 | } 79 | } 80 | } 81 | 82 | /** 83 | * Run a batch or streaming pipeline. 84 | */ 85 | public static void main(String[] args) throws Exception { 86 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 87 | 88 | Pipeline pipeline = Pipeline.create(options); 89 | 90 | TableReference tableRef = new TableReference(); 91 | tableRef.setDatasetId(options.as(Options.class).getOutputDataset()); 92 | tableRef.setProjectId(options.as(GcpOptions.class).getProject()); 93 | tableRef.setTableId(options.getOutputTableName()); 94 | 95 | // Read events from either a CSV file or PubSub stream. 96 | pipeline 97 | .apply(new ReadGameEvents(options)) 98 | .apply("WindowedTeamScore", new Exercise2.WindowedTeamScore(Duration.standardMinutes(5))) 99 | // Write the results to BigQuery. 
100 | .apply("FormatTeamScoreSums", ParDo.of(new Exercise2.FormatTeamScoreSumsFn())) 101 | .apply( 102 | BigQueryIO.writeTableRows().to(tableRef) 103 | .withSchema(Exercise2.FormatTeamScoreSumsFn.getSchema()) 104 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 105 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 106 | 107 | PipelineResult result = pipeline.run(); 108 | result.waitUntilFinish(); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise4.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.solutions; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableReference; 21 | import com.google.api.services.bigquery.model.TableRow; 22 | import com.google.api.services.bigquery.model.TableSchema; 23 | import com.google.common.annotations.VisibleForTesting; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import org.apache.beam.examples.complete.game.utils.GameEvent; 27 | import org.apache.beam.examples.complete.game.utils.Options; 28 | import org.apache.beam.runners.dataflow.DataflowRunner; 29 | import org.apache.beam.sdk.Pipeline; 30 | import org.apache.beam.sdk.PipelineResult; 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 35 | import org.apache.beam.sdk.options.Default; 36 | import org.apache.beam.sdk.options.Description; 37 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 38 | import org.apache.beam.sdk.options.StreamingOptions; 39 | import org.apache.beam.sdk.transforms.DoFn; 40 | import org.apache.beam.sdk.transforms.PTransform; 41 | import org.apache.beam.sdk.transforms.ParDo; 42 | import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; 43 | import org.apache.beam.sdk.transforms.windowing.AfterWatermark; 44 | import org.apache.beam.sdk.transforms.windowing.FixedWindows; 45 | import org.apache.beam.sdk.transforms.windowing.GlobalWindows; 46 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 47 | import org.apache.beam.sdk.transforms.windowing.Repeatedly; 48 | import org.apache.beam.sdk.transforms.windowing.Window; 49 | import org.apache.beam.sdk.values.KV; 50 | import org.apache.beam.sdk.values.PCollection; 51 | 
import org.joda.time.Duration; 52 | import org.joda.time.Instant; 53 | 54 | /** 55 | * Fourth in a series of coding exercises in a gaming domain. 56 | * 57 | *

This streaming pipeline calculates user and team scores for a window of time and writes them 58 | * to BigQuery. 59 | * 60 | *

See README.md for details. 61 | */ 62 | public class Exercise4 { 63 | 64 | static final Duration TEN_SECONDS = Duration.standardSeconds(10); 65 | static final Duration THIRTY_SECONDS = Duration.standardSeconds(30); 66 | 67 | /** 68 | * Exercise4Options supported by {@link Exercise4}. 69 | */ 70 | interface Exercise4Options extends Options, StreamingOptions { 71 | 72 | @Description("Numeric value of fixed window duration for team analysis, in minutes") 73 | @Default.Integer(1) 74 | Integer getTeamWindowDuration(); 75 | 76 | void setTeamWindowDuration(Integer value); 77 | 78 | @Description("Numeric value of allowed data lateness, in minutes") 79 | @Default.Integer(2) 80 | Integer getAllowedLateness(); 81 | 82 | void setAllowedLateness(Integer value); 83 | } 84 | 85 | /** 86 | * Extract user/score pairs from the event stream using processing time, via global windowing. Get 87 | * periodic updates on all users' running scores. 88 | */ 89 | @VisibleForTesting 90 | static class CalculateUserScores 91 | extends PTransform, PCollection>> { 92 | 93 | private final Duration allowedLateness; 94 | 95 | CalculateUserScores(Duration allowedLateness) { 96 | this.allowedLateness = allowedLateness; 97 | } 98 | 99 | @Override 100 | public PCollection> expand(PCollection input) { 101 | return input 102 | .apply( 103 | "LeaderboardUserGlobalWindow", 104 | Window.into(new GlobalWindows()) 105 | // Get periodic results every 30 seconds. 106 | .triggering( 107 | Repeatedly.forever( 108 | AfterProcessingTime.pastFirstElementInPane().plusDelayOf(THIRTY_SECONDS))) 109 | .accumulatingFiredPanes() 110 | .withAllowedLateness(allowedLateness)) 111 | // Extract and sum username/score pairs from the event data. 112 | .apply("ExtractUserScore", new Exercise1.ExtractAndSumScore("user")); 113 | } 114 | } 115 | 116 | /** 117 | * Calculates scores for each team within the configured window duration. 118 | */ 119 | // Extract team/score pairs from the event stream, using hour-long windows by default. 
120 | @VisibleForTesting 121 | static class CalculateTeamScores 122 | extends PTransform, PCollection>> { 123 | 124 | private final Duration teamWindowDuration; 125 | private final Duration allowedLateness; 126 | 127 | CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) { 128 | this.teamWindowDuration = teamWindowDuration; 129 | this.allowedLateness = allowedLateness; 130 | } 131 | 132 | @Override 133 | public PCollection> expand(PCollection infos) { 134 | return infos 135 | .apply( 136 | "LeaderboardTeamFixedWindows", 137 | Window.into(FixedWindows.of(teamWindowDuration)) 138 | // We will get early (speculative) results as well as cumulative 139 | // processing of late data. 140 | .triggering( 141 | AfterWatermark.pastEndOfWindow() 142 | .withEarlyFirings( 143 | AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_SECONDS)) 144 | .withLateFirings( 145 | AfterProcessingTime.pastFirstElementInPane() 146 | .plusDelayOf(THIRTY_SECONDS))) 147 | .withAllowedLateness(allowedLateness) 148 | .accumulatingFiredPanes()) 149 | // Extract and sum teamname/score pairs from the event data. 150 | .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team")); 151 | } 152 | } 153 | 154 | public static void main(String[] args) throws Exception { 155 | Exercise4Options options = 156 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise4Options.class); 157 | // Enforce that this pipeline is always run in streaming mode. 158 | options.setStreaming(true); 159 | // For example purposes, allow the pipeline to be easily cancelled instead of running 160 | // continuously. 
161 | options.setRunner(DataflowRunner.class); 162 | Pipeline pipeline = Pipeline.create(options); 163 | 164 | TableReference teamTable = new TableReference(); 165 | teamTable.setDatasetId(options.getOutputDataset()); 166 | teamTable.setProjectId(options.as(GcpOptions.class).getProject()); 167 | teamTable.setTableId(options.getOutputTableName() + "_team"); 168 | 169 | TableReference userTable = new TableReference(); 170 | userTable.setDatasetId(options.getOutputDataset()); 171 | userTable.setProjectId(options.as(GcpOptions.class).getProject()); 172 | userTable.setTableId(options.getOutputTableName() + "_user"); 173 | 174 | PCollection gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options)); 175 | 176 | gameEvents 177 | .apply( 178 | "CalculateTeamScores", 179 | new CalculateTeamScores( 180 | Duration.standardMinutes(options.getTeamWindowDuration()), 181 | Duration.standardMinutes(options.getAllowedLateness()))) 182 | // Write the results to BigQuery. 183 | .apply("FormatTeamScores", ParDo.of(new FormatTeamScoreFn())) 184 | .apply( 185 | BigQueryIO.writeTableRows().to(teamTable) 186 | .withSchema(FormatTeamScoreFn.getSchema()) 187 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 188 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 189 | 190 | gameEvents 191 | .apply( 192 | "CalculateUserScores", 193 | new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness()))) 194 | // Write the results to BigQuery. 195 | .apply("FormatUserScores", ParDo.of(new FormatUserScoreFn())) 196 | .apply( 197 | BigQueryIO.writeTableRows().to(userTable) 198 | .withSchema(FormatUserScoreFn.getSchema()) 199 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 200 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 201 | 202 | PipelineResult result = pipeline.run(); 203 | result.waitUntilFinish(); 204 | } 205 | 206 | /** 207 | * Format a KV of team and associated properties to a BigQuery TableRow. 
208 | */ 209 | protected static class FormatTeamScoreFn extends DoFn, TableRow> { 210 | 211 | @ProcessElement 212 | public void processElement(ProcessContext c, IntervalWindow window) { 213 | TableRow row = 214 | new TableRow() 215 | .set("team", c.element().getKey()) 216 | .set("total_score", c.element().getValue()) 217 | .set("window_start", window.start().getMillis() / 1000) 218 | .set("processing_time", Instant.now().getMillis() / 1000) 219 | .set("timing", c.pane().getTiming().toString()); 220 | c.output(row); 221 | } 222 | 223 | static TableSchema getSchema() { 224 | List fields = new ArrayList<>(); 225 | fields.add(new TableFieldSchema().setName("team").setType("STRING")); 226 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 227 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); 228 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); 229 | fields.add(new TableFieldSchema().setName("timing").setType("STRING")); 230 | return new TableSchema().setFields(fields); 231 | } 232 | } 233 | 234 | /** 235 | * Format a KV of user and associated properties to a BigQuery TableRow. 
236 | */ 237 | static class FormatUserScoreFn extends DoFn, TableRow> { 238 | 239 | @ProcessElement 240 | public void processElement(ProcessContext c) { 241 | TableRow row = 242 | new TableRow() 243 | .set("user", c.element().getKey()) 244 | .set("total_score", c.element().getValue()) 245 | .set("processing_time", Instant.now().getMillis() / 1000); 246 | c.output(row); 247 | } 248 | 249 | static TableSchema getSchema() { 250 | List fields = new ArrayList<>(); 251 | fields.add(new TableFieldSchema().setName("user").setType("STRING")); 252 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 253 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); 254 | return new TableSchema().setFields(fields); 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise5.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | package org.apache.beam.examples.complete.game.solutions; 17 | 18 | import com.google.api.services.bigquery.model.TableFieldSchema; 19 | import com.google.api.services.bigquery.model.TableReference; 20 | import com.google.api.services.bigquery.model.TableRow; 21 | import com.google.api.services.bigquery.model.TableSchema; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | import java.util.Map; 25 | import org.apache.beam.examples.complete.game.utils.GameEvent; 26 | import org.apache.beam.examples.complete.game.utils.Options; 27 | import org.apache.beam.runners.dataflow.DataflowRunner; 28 | import org.apache.beam.sdk.Pipeline; 29 | import org.apache.beam.sdk.PipelineResult; 30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 31 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 34 | import org.apache.beam.sdk.metrics.Counter; 35 | import org.apache.beam.sdk.metrics.Metrics; 36 | import org.apache.beam.sdk.options.Default; 37 | import org.apache.beam.sdk.options.Description; 38 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 39 | import org.apache.beam.sdk.options.StreamingOptions; 40 | import org.apache.beam.sdk.transforms.DoFn; 41 | import org.apache.beam.sdk.transforms.MapElements; 42 | import org.apache.beam.sdk.transforms.Mean; 43 | import org.apache.beam.sdk.transforms.PTransform; 44 | import org.apache.beam.sdk.transforms.ParDo; 45 | import org.apache.beam.sdk.transforms.Sum; 46 | import org.apache.beam.sdk.transforms.Values; 47 | import org.apache.beam.sdk.transforms.View; 48 | import org.apache.beam.sdk.transforms.windowing.FixedWindows; 49 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 50 | import org.apache.beam.sdk.transforms.windowing.Window; 51 | import org.apache.beam.sdk.values.KV; 52 | import 
org.apache.beam.sdk.values.PCollection; 53 | import org.apache.beam.sdk.values.PCollectionView; 54 | import org.apache.beam.sdk.values.TypeDescriptors; 55 | import org.joda.time.Duration; 56 | import org.joda.time.Instant; 57 | import org.slf4j.Logger; 58 | import org.slf4j.LoggerFactory; 59 | 60 | /** 61 | * Fifth in a series of coding exercises in a gaming domain. 62 | * 63 | *

This exercise introduces side inputs. 64 | * 65 | *

See README.md for details. 66 | */ 67 | public class Exercise5 { 68 | 69 | private static final Logger LOG = LoggerFactory.getLogger(Exercise5.class); 70 | 71 | /** 72 | * Filter out all but those users with a high clickrate, which we will consider as 'spammy' users. 73 | * We do this by finding the mean total score per user, then using that information as a side 74 | * input to filter out all but those user scores that are > (mean * SCORE_WEIGHT) 75 | */ 76 | public static class CalculateSpammyUsers 77 | extends PTransform>, PCollection>> { 78 | 79 | private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class); 80 | private static final double SCORE_WEIGHT = 2.5; 81 | 82 | @Override 83 | public PCollection> expand(PCollection> userScores) { 84 | 85 | // Get the sum of scores for each user. 86 | PCollection> sumScores = 87 | userScores.apply("UserSum", Sum.integersPerKey()); 88 | 89 | // Extract the score from each element, and use it to find the global mean. 90 | final PCollectionView globalMeanScore = 91 | sumScores 92 | .apply(Values.create()) 93 | .apply(Mean.globally().asSingletonView()); 94 | 95 | // Filter the user sums using the global mean. 
96 | PCollection> filtered = 97 | sumScores.apply("ProcessAndFilter", 98 | ParDo 99 | // use the derived mean total score as a side input 100 | .of( 101 | new DoFn, KV>() { 102 | private final Counter numSpammerUsers = Metrics 103 | .counter("main", "SpammerUsers"); 104 | 105 | @ProcessElement 106 | public void processElement(ProcessContext c) { 107 | Integer score = c.element().getValue(); 108 | Double gmc = c.sideInput(globalMeanScore); 109 | if (score > (gmc * SCORE_WEIGHT)) { 110 | LOG.info( 111 | "user " 112 | + c.element().getKey() 113 | + " spammer score " 114 | + score 115 | + " with mean " 116 | + gmc); 117 | numSpammerUsers.inc(); 118 | c.output(c.element()); 119 | } 120 | } 121 | }) 122 | .withSideInputs(globalMeanScore)); 123 | return filtered; 124 | } 125 | } 126 | 127 | /** 128 | * Calculate and output an element's session duration. 129 | */ 130 | private static class UserSessionInfoFn extends DoFn, Integer> { 131 | 132 | @ProcessElement 133 | public void processElement(ProcessContext c, IntervalWindow w) { 134 | int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes(); 135 | c.output(duration); 136 | } 137 | } 138 | 139 | /** 140 | * Options supported by {@link Exercise5}. 141 | */ 142 | interface Exercise5Options extends Options, StreamingOptions { 143 | 144 | @Description("Numeric value of fixed window duration for user analysis, in minutes") 145 | @Default.Integer(5) 146 | Integer getFixedWindowDuration(); 147 | 148 | void setFixedWindowDuration(Integer value); 149 | } 150 | 151 | public static void main(String[] args) throws Exception { 152 | 153 | Exercise5Options options = 154 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise5Options.class); 155 | // Enforce that this pipeline is always run in streaming mode. 
156 | options.setStreaming(true); 157 | options.setRunner(DataflowRunner.class); 158 | Pipeline pipeline = Pipeline.create(options); 159 | 160 | TableReference teamTable = new TableReference(); 161 | teamTable.setDatasetId(options.getOutputDataset()); 162 | teamTable.setProjectId(options.as(GcpOptions.class).getProject()); 163 | teamTable.setTableId(options.getOutputTableName()); 164 | 165 | PCollection rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options)); 166 | 167 | // Extract username/score pairs from the event stream 168 | PCollection> userEvents = 169 | rawEvents.apply( 170 | "ExtractUserScore", 171 | MapElements 172 | .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) 173 | .via((GameEvent gInfo) -> KV.of(gInfo.getUser(), 174 | gInfo.getScore()))); 175 | 176 | // Calculate the total score per user over fixed windows, and 177 | // cumulative updates for late data. 178 | final PCollectionView> spammersView = 179 | userEvents 180 | .apply("FixedWindowsUser", 181 | Window.>into( 182 | FixedWindows.of( 183 | Duration.standardMinutes(options.getFixedWindowDuration())))) 184 | 185 | // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. 186 | // These might be robots/spammers. 187 | .apply("CalculateSpammyUsers", new CalculateSpammyUsers()) 188 | // Derive a view from the collection of spammer users. It will be used as a side input 189 | // in calculating the team score sums, below. 190 | .apply("CreateSpammersView", View.asMap()); 191 | 192 | // Calculate the total score per team over fixed windows, 193 | // and emit cumulative updates for late data. Uses the side input derived above-- the set of 194 | // suspected robots-- to filter out scores from those users from the sum. 195 | // Write the results to BigQuery. 
196 | rawEvents 197 | .apply("WindowIntoFixedWindows", 198 | Window 199 | .into( 200 | FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) 201 | // Filter out the detected spammer users, using the side input derived above. 202 | .apply("FilterOutSpammers", 203 | ParDo 204 | .of( 205 | new DoFn() { 206 | @ProcessElement 207 | public void processElement(ProcessContext c) { 208 | // If the user is not in the spammers Map, output the data element. 209 | if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) { 210 | c.output(c.element()); 211 | } 212 | } 213 | }) 214 | .withSideInputs(spammersView)) 215 | // Extract and sum teamname/score pairs from the event data. 216 | .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team")) 217 | // Write the result to BigQuery 218 | .apply("FormatTeamWindows", ParDo.of(new FormatTeamWindowFn())) 219 | .apply( 220 | BigQueryIO.writeTableRows().to(teamTable) 221 | .withSchema(FormatTeamWindowFn.getSchema()) 222 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 223 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 224 | 225 | PipelineResult result = pipeline.run(); 226 | result.waitUntilFinish(); 227 | } 228 | 229 | /** 230 | * Format a KV of team and associated properties to a BigQuery TableRow. 
231 | */ 232 | protected static class FormatTeamWindowFn extends DoFn, TableRow> { 233 | 234 | @ProcessElement 235 | public void processElement(ProcessContext c, IntervalWindow window) { 236 | TableRow row = 237 | new TableRow() 238 | .set("team", c.element().getKey()) 239 | .set("total_score", c.element().getValue()) 240 | .set("window_start", window.start().getMillis() / 1000) 241 | .set("processing_time", Instant.now().getMillis() / 1000); 242 | c.output(row); 243 | } 244 | 245 | static TableSchema getSchema() { 246 | List fields = new ArrayList<>(); 247 | fields.add(new TableFieldSchema().setName("team").setType("STRING")); 248 | fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); 249 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); 250 | fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); 251 | return new TableSchema().setFields(fields); 252 | } 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise6.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | package org.apache.beam.examples.complete.game.solutions; 17 | 18 | import static org.apache.beam.sdk.transforms.windowing.TimestampCombiner.END_OF_WINDOW; 19 | 20 | import com.google.api.services.bigquery.model.TableFieldSchema; 21 | import com.google.api.services.bigquery.model.TableReference; 22 | import com.google.api.services.bigquery.model.TableRow; 23 | import com.google.api.services.bigquery.model.TableSchema; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import org.apache.beam.examples.complete.game.utils.GameEvent; 27 | import org.apache.beam.examples.complete.game.utils.Options; 28 | import org.apache.beam.runners.dataflow.DataflowRunner; 29 | import org.apache.beam.sdk.Pipeline; 30 | import org.apache.beam.sdk.PipelineResult; 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 35 | import org.apache.beam.sdk.options.Default; 36 | import org.apache.beam.sdk.options.Description; 37 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 38 | import org.apache.beam.sdk.options.StreamingOptions; 39 | import org.apache.beam.sdk.transforms.Combine; 40 | import org.apache.beam.sdk.transforms.DoFn; 41 | import org.apache.beam.sdk.transforms.MapElements; 42 | import org.apache.beam.sdk.transforms.Mean; 43 | import org.apache.beam.sdk.transforms.ParDo; 44 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow; 45 | import org.apache.beam.sdk.transforms.windowing.FixedWindows; 46 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 47 | import org.apache.beam.sdk.transforms.windowing.Sessions; 48 | import org.apache.beam.sdk.transforms.windowing.Window; 49 | import org.apache.beam.sdk.values.KV; 50 | import org.apache.beam.sdk.values.PCollection; 51 | import 
org.apache.beam.sdk.values.TypeDescriptors; 52 | import org.joda.time.Duration; 53 | import org.slf4j.Logger; 54 | import org.slf4j.LoggerFactory; 55 | 56 | /** 57 | * Sixth in a series of coding exercises in a gaming domain. 58 | * 59 | *

This exercise introduces session windows. 60 | * 61 | *

See README.md for details. 62 | */ 63 | public class Exercise6 { 64 | 65 | private static final Logger LOG = LoggerFactory.getLogger(Exercise6.class); 66 | 67 | /** 68 | * Calculate and output an element's session duration. 69 | */ 70 | private static class UserSessionInfoFn extends DoFn, Integer> { 71 | 72 | @ProcessElement 73 | public void processElement(ProcessContext c, BoundedWindow window) { 74 | IntervalWindow w = (IntervalWindow) window; 75 | int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes(); 76 | c.output(duration); 77 | } 78 | } 79 | 80 | /** 81 | * Options supported by {@link Exercise6}. 82 | */ 83 | interface Exercise6Options extends Options, StreamingOptions { 84 | 85 | @Description("Numeric value of gap between user sessions, in minutes") 86 | @Default.Integer(1) 87 | Integer getSessionGap(); 88 | 89 | void setSessionGap(Integer value); 90 | 91 | @Description( 92 | "Numeric value of fixed window for finding mean of user session duration, " + "in minutes") 93 | @Default.Integer(5) 94 | Integer getUserActivityWindowDuration(); 95 | 96 | void setUserActivityWindowDuration(Integer value); 97 | } 98 | 99 | public static void main(String[] args) throws Exception { 100 | 101 | Exercise6Options options = 102 | PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise6Options.class); 103 | // Enforce that this pipeline is always run in streaming mode. 
104 | options.setStreaming(true); 105 | options.setRunner(DataflowRunner.class); 106 | Pipeline pipeline = Pipeline.create(options); 107 | 108 | TableReference sessionsTable = new TableReference(); 109 | sessionsTable.setDatasetId(options.getOutputDataset()); 110 | sessionsTable.setProjectId(options.as(GcpOptions.class).getProject()); 111 | sessionsTable.setTableId(options.getOutputTableName()); 112 | 113 | PCollection rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options)); 114 | 115 | // Extract username/score pairs from the event stream 116 | PCollection> userEvents = 117 | rawEvents.apply( 118 | "ExtractUserScore", 119 | MapElements 120 | .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) 121 | .via((GameEvent gInfo) -> KV.of(gInfo.getUser(), 122 | gInfo.getScore()))); 123 | 124 | // Detect user sessions-- that is, a burst of activity separated by a gap from further 125 | // activity. Find and record the mean session lengths. 126 | // This information could help the game designers track the changing user engagement 127 | // as their set of games changes. 128 | userEvents 129 | .apply("WindowIntoSessions", 130 | Window 131 | .>into( 132 | Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap()))) 133 | .withTimestampCombiner(END_OF_WINDOW)) 134 | // For this use, we care only about the existence of the session, not any particular 135 | // information aggregated over it, so the following is an efficient way to do that. 136 | .apply(Combine.perKey(x -> 0)) 137 | // Get the duration per session. 138 | .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())) 139 | // Re-window to process groups of session sums according to when the sessions complete. 140 | .apply("WindowToExtractSessionMean", 141 | Window 142 | .into( 143 | FixedWindows.of( 144 | Duration.standardMinutes(options.getUserActivityWindowDuration())))) 145 | // Find the mean session duration in each window. 
146 | .apply(Mean.globally().withoutDefaults()) 147 | // Write this info to a BigQuery table. 148 | .apply("FormatSessions", ParDo.of(new FormatSessionWindowFn())) 149 | .apply( 150 | BigQueryIO.writeTableRows().to(sessionsTable) 151 | .withSchema(FormatSessionWindowFn.getSchema()) 152 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 153 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 154 | 155 | PipelineResult result = pipeline.run(); 156 | result.waitUntilFinish(); 157 | } 158 | 159 | /** 160 | * Format a KV of session and associated properties to a BigQuery TableRow. 161 | */ 162 | static class FormatSessionWindowFn extends DoFn { 163 | 164 | @ProcessElement 165 | public void processElement(ProcessContext c, BoundedWindow window) { 166 | IntervalWindow w = (IntervalWindow) window; 167 | TableRow row = 168 | new TableRow() 169 | .set("window_start", w.start().getMillis() / 1000) 170 | .set("mean_duration", c.element()); 171 | c.output(row); 172 | } 173 | 174 | static TableSchema getSchema() { 175 | List fields = new ArrayList<>(); 176 | fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); 177 | fields.add(new TableFieldSchema().setName("mean_duration").setType("FLOAT")); 178 | return new TableSchema().setFields(fields); 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/utils/ChangeMe.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.utils; 18 | 19 | import org.apache.beam.sdk.transforms.PTransform; 20 | import org.apache.beam.sdk.values.PCollection; 21 | import org.apache.beam.sdk.values.PInput; 22 | 23 | /** 24 | * PTransform that crashes at runtime used as a placeholder in tutorials. 25 | */ 26 | public class ChangeMe 27 | extends PTransform> { 28 | 29 | @Override 30 | public PCollection expand(InputT input) { 31 | throw new RuntimeException("Not implemented"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/utils/GameEvent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.utils; 18 | 19 | import org.apache.avro.reflect.Nullable; 20 | import org.apache.beam.sdk.coders.AvroCoder; 21 | import org.apache.beam.sdk.coders.DefaultCoder; 22 | 23 | /** 24 | * Class to hold info about a game event. 25 | */ 26 | @DefaultCoder(AvroCoder.class) 27 | public class GameEvent { 28 | 29 | @Nullable 30 | String user; 31 | @Nullable 32 | String team; 33 | @Nullable 34 | Integer score; 35 | @Nullable 36 | Long timestamp; 37 | @Nullable 38 | String eventId; 39 | 40 | public GameEvent() { 41 | } 42 | 43 | public GameEvent(String user, String team, Integer score, Long timestamp, String eventId) { 44 | this.user = user; 45 | this.team = team; 46 | this.score = score; 47 | this.timestamp = timestamp; 48 | this.eventId = eventId; 49 | } 50 | 51 | public String getUser() { 52 | return this.user; 53 | } 54 | 55 | public String getTeam() { 56 | return this.team; 57 | } 58 | 59 | public Integer getScore() { 60 | return this.score; 61 | } 62 | 63 | public String getKey(String keyname) { 64 | if (keyname.equals("team")) { 65 | return this.team; 66 | } else { // return username as default 67 | return this.user; 68 | } 69 | } 70 | 71 | public Long getTimestamp() { 72 | return this.timestamp; 73 | } 74 | 75 | public String getEventId() { 76 | return this.eventId; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/utils/Options.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.utils; 18 | 19 | import org.apache.beam.sdk.options.Description; 20 | import org.apache.beam.sdk.options.PipelineOptions; 21 | import org.apache.beam.sdk.options.Validation; 22 | 23 | /** 24 | * Options supported by the exercise pipelines. 25 | */ 26 | public interface Options extends PipelineOptions { 27 | 28 | @Description("Path to the data file(s) containing game data.") 29 | String getInput(); 30 | 31 | void setInput(String value); 32 | 33 | @Description("Pub/Sub topic to read from. Used if --input is empty.") 34 | String getTopic(); 35 | 36 | void setTopic(String value); 37 | 38 | @Description("BigQuery Dataset to write tables to. Must already exist.") 39 | @Validation.Required 40 | String getOutputDataset(); 41 | 42 | void setOutputDataset(String value); 43 | 44 | @Description("The BigQuery table name. Should not already exist.") 45 | @Validation.Required 46 | String getOutputTableName(); 47 | 48 | void setOutputTableName(String value); 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/utils/ParseEventFn.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.utils; 18 | 19 | import org.apache.beam.sdk.metrics.Counter; 20 | import org.apache.beam.sdk.metrics.Metrics; 21 | import org.apache.beam.sdk.transforms.DoFn; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | /** 26 | * Parses the raw game event info into GameEvent objects. Each event line has the following 27 | * format: username,teamname,score,timestamp_in_ms,readable_time,event_id 28 | * e.g.: 29 | * user2_AsparagusPig,AsparagusPig,10,1445230923951, 30 | * 2015-11-02 09:09:28.224,e8018d7d-18a6-4265-ba7e-55666b898b6f 31 | * The human-readable time string is not used here. 32 | */ 33 | public class ParseEventFn extends DoFn { 34 | 35 | // Log and count parse errors. 36 | private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class); 37 | private final Counter numParseErrors = Metrics.counter("main", "ParseErrors"); 38 | 39 | @ProcessElement 40 | public void processElement(ProcessContext c) { 41 | String[] components = c.element().split(","); 42 | try { 43 | String user = components[0].trim(); 44 | String team = components[1].trim(); 45 | Integer score = Integer.parseInt(components[2].trim()); 46 | Long timestamp = Long.parseLong(components[3].trim()); 47 | String eventId = components.length >= 6 ? 
components[5].trim() : "none"; 48 | GameEvent gInfo = new GameEvent(user, team, score, timestamp, eventId); 49 | c.output(gInfo); 50 | } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) { 51 | numParseErrors.inc(); 52 | LOG.info("Parse error on " + c.element() + ", " + e.getMessage()); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/utils/ParsePlayEventFn.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.utils; 18 | 19 | import org.apache.beam.sdk.metrics.Counter; 20 | import org.apache.beam.sdk.metrics.Metrics; 21 | import org.apache.beam.sdk.transforms.DoFn; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | /** 26 | * Parses the raw play event info into PlayEvent objects. Each play event line has the following 27 | * format: username,timestamp_in_ms,readable_time,event_id 28 | * e.g.: 29 | * user2_AsparagusPig,AsparagusPig,10,1445230923951, 30 | * 2015-11-02 09:09:28.224,e8018d7d-18a6-4265-ba7e-55666b898b6f 31 | * The human-readable time string is not used here. 32 | */ 33 | public class ParsePlayEventFn extends DoFn { 34 | 35 | // Log and count parse errors. 
36 | private static final Logger LOG = LoggerFactory.getLogger(ParsePlayEventFn.class); 37 | private final Counter numParseErrors = Metrics.counter("main", "ParseErrors"); 38 | 39 | @ProcessElement 40 | public void processElement(ProcessContext c) { 41 | String[] components = c.element().split(","); 42 | try { 43 | String user = components[0].trim(); 44 | Long timestamp = Long.parseLong(components[1].trim()); 45 | String eventId = components[3].trim(); 46 | PlayEvent play = new PlayEvent(user, timestamp, eventId); 47 | c.output(play); 48 | } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) { 49 | numParseErrors.inc(); 50 | LOG.info("Parse error on " + c.element() + ", " + e.getMessage()); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java8/org/apache/beam/examples/complete/game/utils/PlayEvent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package org.apache.beam.examples.complete.game.utils; 18 | 19 | import org.apache.avro.reflect.Nullable; 20 | import org.apache.beam.sdk.coders.AvroCoder; 21 | import org.apache.beam.sdk.coders.DefaultCoder; 22 | 23 | /** 24 | * Class to hold info about a game play event 25 | */ 26 | @DefaultCoder(AvroCoder.class) 27 | public class PlayEvent { 28 | 29 | @Nullable 30 | String user; 31 | @Nullable 32 | Long timestamp; 33 | @Nullable 34 | String eventId; 35 | 36 | public PlayEvent() { 37 | } 38 | 39 | public PlayEvent(String user, Long timestamp, String eventId) { 40 | this.user = user; 41 | this.timestamp = timestamp; 42 | this.eventId = eventId; 43 | } 44 | 45 | public String getUser() { 46 | return this.user; 47 | } 48 | 49 | public String getKey() { 50 | return this.user; 51 | } 52 | 53 | public Long getTimestamp() { 54 | return this.timestamp; 55 | } 56 | 57 | public String getEventId() { 58 | return this.eventId; 59 | } 60 | } 61 | --------------------------------------------------------------------------------