├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
├── py
    ├── exercises
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   ├── exercise0.py
    │   ├── exercise1.py
    │   ├── exercise1.pyc
    │   ├── exercise2.py
    │   ├── exercise3.py
    │   ├── exercise4.py
    │   ├── exercise5.py
    │   ├── exercise6.py
    │   └── exercise7.py
    ├── run0.sh
    ├── run1.sh
    ├── run2.sh
    ├── run3.sh
    ├── run4.sh
    ├── run5.sh
    ├── run6.sh
    ├── run7.sh
    ├── setup.py
    ├── solutions
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   ├── exercise0.py
    │   ├── exercise1.py
    │   ├── exercise1.pyc
    │   ├── exercise2.py
    │   ├── exercise3.py
    │   ├── exercise4.py
    │   ├── exercise5.py
    │   ├── exercise6.py
    │   └── exercise7.py
    └── util
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   ├── util.py
    │   └── util.pyc
└── src
    └── main
        └── java8
            └── org
                └── apache
                    └── beam
                        └── examples
                            └── complete
                                └── game
                                    ├── Exercise0.java
                                    ├── Exercise1.java
                                    ├── Exercise2.java
                                    ├── Exercise3.java
                                    ├── Exercise4.java
                                    ├── Exercise5.java
                                    ├── Exercise6.java
                                    ├── Exercise7.java
                                    ├── injector
                                        ├── Injector.java
                                        ├── InjectorUtils.java
                                        └── RetryHttpInitializerWrapper.java
                                    ├── solutions
                                        ├── Exercise1.java
                                        ├── Exercise2.java
                                        ├── Exercise3.java
                                        ├── Exercise4.java
                                        ├── Exercise5.java
                                        ├── Exercise6.java
                                        └── Exercise7.java
                                    └── utils
                                        ├── ChangeMe.java
                                        ├── GameEvent.java
                                        ├── Options.java
                                        ├── ParseEventFn.java
                                        ├── ParsePlayEventFn.java
                                        └── PlayEvent.java


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # The default behavior, which overrides 'core.autocrlf', is to use Git's
 2 | # built-in heuristics to determine whether a particular file is text or binary.
 3 | # Text files are automatically normalized to the user's platforms.
 4 | * text=auto
 5 | 
 6 | # Explicitly declare text files that should always be normalized and converted
 7 | # to native line endings.
 8 | .gitattributes text
 9 | .gitignore text
10 | LICENSE text
11 | *.avsc text
12 | *.html text
13 | *.java text
14 | *.md text
15 | *.properties text
16 | *.proto text
17 | *.py text
18 | *.sh text
19 | *.xml text
20 | *.yml text
21 | 
22 | # Declare files that will always have CRLF line endings on checkout.
23 | # *.sln text eol=crlf
24 | 
25 | # Explicitly denote all files that are truly binary and should not be modified.
26 | # *.jpg binary
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | target/
 2 | 
 3 | # Ignore IntelliJ files.
 4 | .idea/
 5 | *.iml
 6 | *.ipr
 7 | *.iws
 8 | 
 9 | # Ignore Eclipse files.
10 | .classpath
11 | .project
12 | .settings/
13 | 
14 | # The build process generates the dependency-reduced POM, but it shouldn't be
15 | # committed.
16 | dependency-reduced-pom.xml
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  3 |   ~ Copyright (C) 2015 Google Inc.
  4 |   ~
  5 |   ~ Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6 |   ~ use this file except in compliance with the License. You may obtain a copy of
  7 |   ~ the License at
  8 |   ~
  9 |   ~ http://www.apache.org/licenses/LICENSE-2.0
 10 |   ~
 11 |   ~ Unless required by applicable law or agreed to in writing, software
 12 |   ~ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 |   ~ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 |   ~ License for the specific language governing permissions and limitations under
 15 |   ~ the License.
 16 |   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~-->
 17 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 18 |     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 19 |     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 20 |   <modelVersion>4.0.0</modelVersion>
 21 | 
 22 |   <parent>
 23 |     <groupId>com.google.cloud.dataflow</groupId>
 24 |     <artifactId>google-cloud-dataflow-java-sdk-parent</artifactId>
 25 |     <version>2.4.0</version>
 26 |   </parent>
 27 | 
 28 |   <groupId>com.google.cloud.dataflow</groupId>
 29 |   <artifactId>DataflowTutorials</artifactId>
 30 |   <name>Google Cloud Dataflow Java Tutorials</name>
 31 | 
 32 |   <version>1.0.0</version>
 33 | 
 34 |   <packaging>jar</packaging>
 35 | 
 36 |   <profiles>
 37 |     <profile>
 38 |       <id>java8</id>
 39 |       <activation>
 40 |         <jdk>[1.8,)</jdk>
 41 |       </activation>
 42 | 
 43 |       <build>
 44 |         <plugins>
 45 |           <!-- Tells Maven about the Java 8 main and test source root. -->
 46 |           <plugin>
 47 |             <groupId>org.codehaus.mojo</groupId>
 48 |             <artifactId>build-helper-maven-plugin</artifactId>
 49 |             <version>3.0.0</version>
 50 |             <executions>
 51 |               <execution>
 52 |                 <id>add-java8-main-source</id>
 53 |                 <phase>initialize</phase>
 54 |                 <goals>
 55 |                   <goal>add-source</goal>
 56 |                 </goals>
 57 |                 <configuration>
 58 |                   <sources>
 59 |                     <source>${project.basedir}/src/main/java8</source>
 60 |                   </sources>
 61 |                 </configuration>
 62 |               </execution>
 63 | 
 64 |               <execution>
 65 |                 <id>add-java8-test-source</id>
 66 |                 <phase>initialize</phase>
 67 |                 <goals>
 68 |                   <goal>add-test-source</goal>
 69 |                 </goals>
 70 |                 <configuration>
 71 |                   <sources>
 72 |                     <source>${project.basedir}/src/test/java8</source>
 73 |                   </sources>
 74 |                 </configuration>
 75 |               </execution>
 76 |             </executions>
 77 |           </plugin>
 78 | 
 79 |           <plugin>
 80 |             <groupId>org.apache.maven.plugins</groupId>
 81 |             <artifactId>maven-compiler-plugin</artifactId>
 82 |             <executions>
 83 | 
 84 |               <!-- Set `-source 1.8 -target 1.8` for Java 8 examples -->
 85 |               <execution>
 86 |                 <id>default-compile</id>
 87 |                 <phase>compile</phase>
 88 |                 <goals>
 89 |                   <goal>compile</goal>
 90 |                 </goals>
 91 |                 <configuration>
 92 |                   <source>1.8</source>
 93 |                   <target>1.8</target>
 94 |                   <release>8</release>
 95 |                   <includes>
 96 |                     <!-- This pattern is brittle; we would prefer to filter on the directory
 97 |                          but that seems to be unavailable to us. -->
 98 |                     <include>**/*Java8*.java</include>
 99 |                     <include>**/game/**/*.java</include>
100 |                   </includes>
101 |                   <compilerArgs>
102 |                     <arg>-Werror</arg>
103 |                     <arg>-Xlint:all</arg>
104 |                     <arg>-Xlint:-cast</arg>
105 |                     <arg>-Xlint:-deprecation</arg>
106 |                     <arg>-Xlint:-processing</arg>
107 |                     <arg>-Xlint:-rawtypes</arg>
108 |                     <arg>-Xlint:-serial</arg>
109 |                     <arg>-Xlint:-try</arg>
110 |                     <arg>-Xlint:-unchecked</arg>
111 |                     <arg>-Xlint:-varargs</arg>
112 |                   </compilerArgs>
113 |                 </configuration>
114 |               </execution>
115 |             </executions>
116 |           </plugin>
117 |         </plugins>
118 |       </build>
119 |     </profile>
120 |   </profiles>
121 | 
122 |   <build>
123 |     <plugins>
124 |       <plugin>
125 |         <artifactId>maven-shade-plugin</artifactId>
126 |         <version>2.4.1</version>
127 |         <executions>
128 |           <execution>
129 |             <phase>package</phase>
130 |             <goals>
131 |               <goal>shade</goal>
132 |             </goals>
133 |             <configuration>
134 |               <finalName>${project.artifactId}-bundled-${project.version}</finalName>
135 |               <artifactSet>
136 |                 <includes>
137 |                   <include>*:*</include>
138 |                 </includes>
139 |               </artifactSet>
140 |               <filters>
141 |                 <filter>
142 |                   <artifact>*:*</artifact>
143 |                   <excludes>
144 |                     <exclude>META-INF/*.SF</exclude>
145 |                     <exclude>META-INF/*.DSA</exclude>
146 |                     <exclude>META-INF/*.RSA</exclude>
147 |                   </excludes>
148 |                 </filter>
149 |               </filters>
150 |             </configuration>
151 |           </execution>
152 |         </executions>
153 |       </plugin>
154 |       <plugin>
155 |         <groupId>org.apache.maven.plugins</groupId>
156 |         <artifactId>maven-compiler-plugin</artifactId>
157 |         <configuration>
158 |           <source>1.8</source>
159 |           <target>1.8</target>
160 |         </configuration>
161 |       </plugin>
162 |     </plugins>
163 |   </build>
164 | 
165 |   <dependencies>
166 |     <dependency>
167 |       <groupId>com.google.cloud.dataflow</groupId>
168 |       <artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
169 |       <version>2.4.0</version>
170 |     </dependency>
171 | 
172 |     <dependency>
173 |       <groupId>com.google.api-client</groupId>
174 |       <artifactId>google-api-client</artifactId>
175 |       <!-- DO _NOT_ UPGRADE TO 1.23.0. It has a bug that causes it to fail when staging files -->
176 |       <version>1.22.0</version>
177 |       <exclusions>
178 |         <!-- Exclude an old version of guava that is being pulled
179 |              in by a transitive dependency of google-api-client -->
180 |         <exclusion>
181 |           <groupId>com.google.guava</groupId>
182 |           <artifactId>guava-jdk5</artifactId>
183 |         </exclusion>
184 |       </exclusions>
185 |     </dependency>
186 | 
187 |     <dependency>
188 |       <groupId>org.apache.beam</groupId>
189 |       <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
190 |       <version>2.4.0</version>
191 |     </dependency>
192 | 
193 |     <dependency>
194 |       <groupId>org.apache.avro</groupId>
195 |       <artifactId>avro</artifactId>
196 |       <version>1.8.2</version>
197 |     </dependency>
198 | 
199 |     <dependency>
200 |       <groupId>com.google.apis</groupId>
201 |       <artifactId>google-api-services-pubsub</artifactId>
202 |       <!-- DO NOT UPGRADE TO 1.23. Version 1.23 won't work with google-api-client v1.22 -->
203 |       <version>v1-rev383-1.22.0</version>
204 |       <exclusions>
205 |         <!-- Exclude an old version of guava that is being pulled
206 |              in by a transitive dependency of google-api-client -->
207 |         <exclusion>
208 |           <groupId>com.google.guava</groupId>
209 |           <artifactId>guava-jdk5</artifactId>
210 |         </exclusion>
211 |       </exclusions>
212 |     </dependency>
213 | 
214 |     <dependency>
215 |       <groupId>com.google.guava</groupId>
216 |       <artifactId>guava</artifactId>
217 |       <version>24.1-jre</version>
218 |     </dependency>
219 | 
220 |     <dependency>
221 |       <groupId>joda-time</groupId>
222 |       <artifactId>joda-time</artifactId>
223 |       <version>2.9.9</version>
224 |     </dependency>
225 | 
226 |     <dependency>
227 |       <groupId>org.slf4j</groupId>
228 |       <artifactId>slf4j-api</artifactId>
229 |       <version>1.7.25</version>
230 |     </dependency>
231 | 
232 |     <dependency>
233 |       <groupId>org.slf4j</groupId>
234 |       <artifactId>slf4j-jdk14</artifactId>
235 |       <version>1.7.25</version>
236 |       <scope>runtime</scope>
237 |     </dependency>
238 | 
239 |     <dependency>
240 |       <groupId>javax.servlet</groupId>
241 |       <artifactId>javax.servlet-api</artifactId>
242 |       <version>4.0.1</version>
243 |     </dependency>
244 | 
245 |     <!-- Hamcrest and JUnit are required dependencies of DataflowAssert,
246 |          which is used in the main code of DebuggingWordCount example. -->
247 | 
248 |     <dependency>
249 |       <groupId>org.hamcrest</groupId>
250 |       <artifactId>hamcrest-all</artifactId>
251 |       <version>1.3</version>
252 |     </dependency>
253 | 
254 |     <dependency>
255 |       <groupId>junit</groupId>
256 |       <artifactId>junit</artifactId>
257 |       <version>4.12</version>
258 |     </dependency>
259 | 
260 |     <dependency>
261 |       <groupId>org.mockito</groupId>
262 |       <artifactId>mockito-all</artifactId>
263 |       <version>1.10.19</version>
264 |       <scope>test</scope>
265 |     </dependency>
266 |   </dependencies>
267 | </project>
268 | 


--------------------------------------------------------------------------------
/py/exercises/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/exercises/__init__.py


--------------------------------------------------------------------------------
/py/exercises/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/exercises/__init__.pyc


--------------------------------------------------------------------------------
/py/exercises/exercise0.py:
--------------------------------------------------------------------------------
 1 | # This batch pipeline imports game events from CSV to BigQuery.
 2 | from __future__ import absolute_import
 3 | 
 4 | import logging
 5 | import re
 6 | 
 7 | import apache_beam as beam
 8 | from apache_beam.io import ReadFromText
 9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 | 
19 | # Defines the BigQuery schema.
20 | SCHEMA = ('user:STRING,' 'team:STRING,' 'score:INTEGER,' 'timestamp:TIMESTAMP')
21 | 
22 | 
23 | def FormatEvent(element):
24 |     """Format a GameEvent to a BigQuery TableRow."""
25 |     return {
26 |         'user': element.user,
27 |         'team': element.team,
28 |         'score': element.score,
29 |         'timestamp': element.timestamp
30 |     }
31 | 
32 | 
33 | def Run(argv=None):
34 |     """Run a batch pipeline."""
35 |     known_args, pipeline_args = ParseArgs(argv)
36 |     pipeline_options = PipelineOptions(pipeline_args)
37 |     pipeline_options.view_as(SetupOptions).save_main_session = True
38 |     p = beam.Pipeline(options=pipeline_options)
39 | 
40 |     project = pipeline_options.view_as(GoogleCloudOptions).project
41 |     # Read events from a CSV file, parse them and write (import) them to BigQuery.
42 |     _ = (p
43 |         | 'read' >> ReadFromText(known_args.input)
44 |         | 'parse' >> beam.FlatMap(ParseEvent)
45 |         | 'format' >> beam.Map(FormatEvent)
46 |         | beam.io.WriteToBigQuery(known_args.output_tablename,
47 |             known_args.output_dataset, project, SCHEMA)
48 |         )
49 |     p.run().wait_until_finish()
50 | 
51 | 
52 | if __name__ == '__main__':
53 |     logging.getLogger().setLevel(logging.INFO)
54 |     Run()
55 | 


--------------------------------------------------------------------------------
/py/exercises/exercise1.py:
--------------------------------------------------------------------------------
 1 | # This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data and writes the sums to BigQuery.
 2 | from __future__ import absolute_import
 3 | 
 4 | import logging
 5 | import re
 6 | 
 7 | import apache_beam as beam
 8 | from apache_beam.io import ReadFromText
 9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 | 
19 | # Defines the BigQuery schema.
20 | SCHEMA = ('user:STRING,' 'total_score:INTEGER')
21 | 
22 | 
23 | class ExtractAndSumScore(beam.PTransform):
24 |     """A transform to extract key/score information from GameEvent, and sum
25 |      the scores. The constructor arg determines whether 'team' or 'user' info is
26 |      extracted."""
27 |     def __init__(self, field):
28 |         super(ExtractAndSumScore, self).__init__()
29 |         self.field = field
30 | 
31 |     def expand(self, p):
32 |         # [START EXERCISE 1]:
33 |         # Developer Docs: https:#beam.apache.org/documentation/programming-guide/#transforms-pardo
34 |         # Also: https:#cloud.google.com/dataflow/model/par-do
35 |         #
36 |         # Fill in the code to:
37 |         #   1. Extract a KV<String, Integer> from each GameEvent corresponding to the given
38 |         #      field('user' or 'team') and the score.
39 |         #   2. Compute the sum of the scores for each key.
40 |         #   3. Run your pipeline on the Dataflow service.
41 |         return (p
42 |                 | 'extract_field' >> beam.Map(ChangeMeFunction)
43 | 		# Select the appropriate transform to compute the sum over each key.
44 |                 | ChangeMeTransform()
45 |                 )
46 | 	# [END EXERCISE 1]
47 | 
48 | 
49 | def FormatUserScoreSum(element):
50 |     """Format a KV of user and their score to a BigQuery TableRow."""
51 |     user, total_score = element
52 |     return {'user': user, 'total_score': total_score}
53 | 
54 | 
55 | def Run(argv=None):
56 |     known_args, pipeline_args = ParseArgs(argv)
57 |     pipeline_options = PipelineOptions(pipeline_args)
58 |     pipeline_options.view_as(SetupOptions).save_main_session = True
59 |     p = beam.Pipeline(options=pipeline_options)
60 | 
61 |     project = pipeline_options.view_as(GoogleCloudOptions).project
62 |     # Read events from a CSV file and parse them.
63 |     _ = (p
64 |          | 'read' >> ReadFromText(known_args.input)
65 |          | 'parse' >> beam.FlatMap(ParseEvent)
66 |          | 'extract_user_score' >> ExtractAndSumScore('user')
67 |          | 'format_user_score_sum' >> beam.Map(FormatUserScoreSum)
68 |          | beam.io.WriteToBigQuery(known_args.output_tablename,
69 |              known_args.output_dataset, project, SCHEMA)
70 |          )
71 | 
72 |     p.run().wait_until_finish()
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     logging.getLogger().setLevel(logging.INFO)
77 |     Run()
78 | 


--------------------------------------------------------------------------------
/py/exercises/exercise1.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/exercises/exercise1.pyc


--------------------------------------------------------------------------------
/py/exercises/exercise2.py:
--------------------------------------------------------------------------------
 1 | # This batch pipeline calculates the sum of scores per team per hour, over an
 2 | # entire batch of gaming data and writes the per-team sums to BigQuery.
 3 | from __future__ import absolute_import
 4 | 
 5 | import logging
 6 | import re
 7 | 
 8 | import apache_beam as beam
 9 | from apache_beam.io import ReadFromText
10 | from apache_beam.io import WriteToText
11 | from apache_beam.metrics import Metrics
12 | from apache_beam.metrics.metric import MetricsFilter
13 | from apache_beam.options.pipeline_options import PipelineOptions
14 | from apache_beam.options.pipeline_options import SetupOptions
15 | from apache_beam.options.pipeline_options import GoogleCloudOptions
16 | from util.util import GameEvent
17 | from util.util import ParseEvent
18 | from util.util import ParseArgs
19 | import apache_beam.transforms.window as window
20 | 
21 | # Defines the BigQuery schema.
22 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
23 | 
24 | 
25 | class ExtractAndSumScore(beam.PTransform):
26 |     def __init__(self, field):
27 |         super(ExtractAndSumScore, self).__init__()
28 |         self.field = field
29 | 
30 |     def expand(self, p):
31 |         return (p
32 |                 |'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score))
33 |                 | beam.CombinePerKey(sum)
34 |                 )
35 | 
36 | 
37 | class WindowedTeamScore(beam.PTransform):
38 |     """A transform to compute the WindowedTeamScore."""
39 |     def __init__(self, duration):
40 |         super(WindowedTeamScore, self).__init__()
41 |         self.duration = duration
42 | 
43 |     def expand(self, p):
44 |         # [START EXERCISE 2]:
45 |         # Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
46 |         # Also: https://cloud.google.com/dataflow/model/windowing
47 |         return (p
48 |                 # beam.WindowInto takes a WindowFn and returns a PTransform that applies windowing.
49 |                 # window.FixedWindows returns a WindowFn that assigns elements into fixed-size
50 |                 # windows. Use these methods to apply windows of size self.duration.
51 |                 | 'window' >> ChangeMeTransform()
52 |                 # Use the ExtractAndSumScore to compute the 'team' sum.
53 |                 | 'extract_team_score' >> ChangeMeTransform()
54 |                 )
55 |         # [END EXERCISE 2]
56 | 
57 | 
58 | class FormatTeamScoreSum(beam.DoFn):
59 |     """Format a KV of user and their score to a BigQuery TableRow."""
60 |     def process(self, team_score, window=beam.DoFn.WindowParam):
61 |         team, score = team_score
62 |         start = int(window.start)
63 |         yield {
64 |             'team': team,
65 |             'total_score': score,
66 |             'window_start': start,
67 |         }
68 | 
69 | 
70 | def Run(argv=None):
71 |     known_args, pipeline_args = ParseArgs(argv)
72 |     pipeline_options = PipelineOptions(pipeline_args)
73 |     pipeline_options.view_as(SetupOptions).save_main_session = True
74 |     p = beam.Pipeline(options=pipeline_options)
75 | 
76 |     project = pipeline_options.view_as(GoogleCloudOptions).project
77 |     _ = (p
78 |          | 'read' >> ReadFromText(known_args.input)
79 |          | 'parse' >> beam.FlatMap(ParseEvent)
80 |          | 'add_event_timestamps' >> beam.Map(
81 |              lambda x: beam.window.TimestampedValue(x, x.timestamp))
82 |          | 'windowed_team_score' >> WindowedTeamScore(60 * 60)
83 |          | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
84 |          | beam.io.WriteToBigQuery(known_args.output_tablename,
85 |              known_args.output_dataset, project, SCHEMA)
86 |          )
87 |     p.run().wait_until_finish()
88 | 
89 | 
90 | if __name__ == '__main__':
91 |     logging.getLogger().setLevel(logging.INFO)
92 |     Run()
93 | 


--------------------------------------------------------------------------------
/py/exercises/exercise3.py:
--------------------------------------------------------------------------------
  1 | # This pipeline calculates the sum of scores per team per hour and writes the
  2 | # per-team sums to BigQuery. The pipeline can be run in either batch or
  3 | # streaming mode, reading from either a data file or Pub/Sub topic.
  4 | #
  5 | # You will need to create a Pub/Sub topic and run the Java Injector
  6 | # in order to get game events over Pub/Sub. Please refer to the instructions
  7 | # here: https://github.com/malo-denielou/DataflowSME
  8 | from __future__ import absolute_import
  9 | 
 10 | import logging
 11 | import re
 12 | 
 13 | import apache_beam as beam
 14 | from apache_beam.io import ReadFromText
 15 | from apache_beam.io import ReadFromPubSub
 16 | from apache_beam.io import WriteToText
 17 | from apache_beam.metrics import Metrics
 18 | from apache_beam.metrics.metric import MetricsFilter
 19 | from apache_beam.options.pipeline_options import PipelineOptions
 20 | from apache_beam.options.pipeline_options import SetupOptions
 21 | from apache_beam.options.pipeline_options import StandardOptions
 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 23 | from util.util import GameEvent
 24 | from util.util import ParseEvent
 25 | from util.util import ParseEventFn
 26 | from util.util import ParseArgs
 27 | import apache_beam.transforms.window as window
 28 | from solutions.exercise1 import ExtractAndSumScore
 29 | 
 30 | # Defines the BigQuery schema.
 31 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
 32 | 
 33 | 
 34 | class ExtractAndSumScore(beam.PTransform):
 35 |     def __init__(self, field):
 36 |         super(ExtractAndSumScore, self).__init__()
 37 |         self.field = field
 38 | 
 39 |     def expand(self, p):
 40 |         return (p
 41 |                 | 'extract_field' >> beam.Map(
 42 |                     lambda x: (vars(x)[self.field], x.score))
 43 |                 | beam.CombinePerKey(sum)
 44 |                 )
 45 | 
 46 | 
 47 | class WindowedTeamScore(beam.PTransform):
 48 |     """A transform to compute a windowed team score."""
 49 |     def __init__(self, duration):
 50 |         super(WindowedTeamScore, self).__init__()
 51 |         self.duration = duration
 52 | 
 53 |     def expand(self, p):
 54 |         return (p
 55 |                 | 'window' >> beam.WindowInto(
 56 |                     window.FixedWindows(self.duration))
 57 |                 | 'extract_team_score' >> ExtractAndSumScore('team')
 58 |                 )
 59 | 
 60 | 
 61 | class FormatTeamScoreSum(beam.DoFn):
 62 |     """Format a KV of user and their score to a BigQuery TableRow."""
 63 |     def process(self, team_score, window=beam.DoFn.WindowParam):
 64 |         team, score = team_score
 65 |         start = int(window.start)
 66 |         yield {
 67 |             'team': team,
 68 |             'total_score': score,
 69 |             'window_start': start,
 70 |         }
 71 | 
 72 | 
 73 | def Run(argv=None):
 74 |     known_args, pipeline_args = ParseArgs(argv)
 75 |     pipeline_options = PipelineOptions(pipeline_args)
 76 |     pipeline_options.view_as(SetupOptions).save_main_session = True
 77 |     p = beam.Pipeline(options=pipeline_options)
 78 |     window_duration = 1 * 60  # 1 minute windows.
 79 |     if known_args.topic:
 80 |         pipeline_options.view_as(StandardOptions).streaming = True
 81 | 
 82 |     project = pipeline_options.view_as(GoogleCloudOptions).project
 83 |     timestamp_attribute = 'timestamp_ms'
 84 |     events = None
 85 |     if (not known_args.topic):
 86 |         events = (p
 87 |                 | 'read' >> ReadFromText(known_args.input)
 88 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
 89 |                 | 'add_event_timestamps' >> beam.Map(
 90 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp))
 91 |                 )
 92 |     else:
 93 |         # [START EXERCISE 3]:
 94 |         # Read game events from the Pub/Sub topic using custom timestamps,
 95 |         # which are in an attribute labeled 'timestamp_ms'.
 96 |         # Use beam.io.ReadFromPubSub to read from the topic.
 97 |         # https://beam.apache.org/releases/pydoc/2.8.0/apache_beam.io.gcp.pubsub.html
 98 |         events = (p
 99 |                 | 'read' >> ChangeMe()
100 |                 | 'decode' >> beam.ParDo(ParseEventFn())
101 |                 )
102 | 
103 |     _ = (events
104 |             | 'windowed_team_score' >> WindowedTeamScore(window_duration)
105 |             | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
106 |             | beam.io.WriteToBigQuery(known_args.output_tablename,
107 |                 known_args.output_dataset, project, SCHEMA)
108 |             )
109 |     p.run().wait_until_finish()
110 | 
111 | 
112 | if __name__ == '__main__':
113 |     logging.getLogger().setLevel(logging.INFO)
114 |     Run()
115 | 


--------------------------------------------------------------------------------
/py/exercises/exercise4.py:
--------------------------------------------------------------------------------
  1 | # This pipeline calculates the sum of scores per team per hour and writes the
  2 | # per-team sums to BigQuery. Additionally computes running user scores (e.g.,
  3 | # as a leaderboard) and updates them regularly.
  4 | 
  5 | # The pipeline can be run in either batch or streaming mode, reading from
  6 | # either a data file or Pub/Sub topic.
  7 | from __future__ import absolute_import
  8 | 
  9 | import logging
 10 | import re
 11 | import time
 12 | 
 13 | import apache_beam as beam
 14 | from apache_beam.io import ReadFromText
 15 | from apache_beam.io import ReadFromPubSub
 16 | from apache_beam.io import WriteToText
 17 | from apache_beam.metrics import Metrics
 18 | from apache_beam.metrics.metric import MetricsFilter
 19 | from apache_beam.options.pipeline_options import PipelineOptions
 20 | from apache_beam.options.pipeline_options import SetupOptions
 21 | from apache_beam.options.pipeline_options import StandardOptions
 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 23 | from apache_beam.transforms import trigger
 24 | from util.util import GameEvent
 25 | from util.util import ParseEvent
 26 | from util.util import ParseEventFn
 27 | from util.util import ParseArgs
 28 | import apache_beam.transforms.window as window
 29 | from solutions.exercise1 import ExtractAndSumScore
 30 | 
 31 | # Defines the BigQuery schemas.
 32 | USER_SCHEMA = ('user:STRING,'
 33 |                'total_score:INTEGER,'
 34 |                'processing_time:TIMESTAMP')
 35 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
 36 | 
 37 | 
 38 | class ExtractAndSumScore(beam.PTransform):
 39 |     def __init__(self, field):
 40 |         super(ExtractAndSumScore, self).__init__()
 41 |         self.field = field
 42 | 
 43 |     def expand(self, p):
 44 |         return (p | 'extract_field' >>
 45 |                 beam.Map(lambda x: (vars(x)[self.field], x.score)) |
 46 |                 beam.CombinePerKey(sum))
 47 | 
 48 | 
 49 | class RunningUserScores(beam.PTransform):
 50 |     """Extract user/score pairs via global windowing and emit perioidic updates
 51 |      on all users' running scores.
 52 |     """
 53 |     def __init__(self, allowed_lateness=0):
 54 |         super(RunningUserScores, self).__init__()
 55 | 
 56 |     def expand(self, p):
 57 |         # NOTE: allowed_lateness is not yet available in Python FixedWindows.
 58 |         # NOTE: AfterProcessingTime not yet available in Python.
 59 |         # [START EXERCISE 4.1]:
 60 |         # Compute a leaderboard by windowing user scores into the global window.
 61 |         # Since we will want to see running results, trigger the window early,
 62 |         # after every 100 elements. Make sure to accumulate fired panes.
 63 |         # https://beam.apache.org/documentation/programming-guide/#triggers
 64 |         return (p
 65 |                 | 'window' >> ChangeMe()
 66 |                 | 'extract_user_score' >> ExtractAndSumScore('user')
 67 |                 )
 68 |         # [END EXERCISE 4.1]
 69 | 
 70 | 
 71 | class WindowedTeamScore(beam.PTransform):
 72 |     """Calculates scores for each team within the configured window duration"""
 73 | 
 74 |     def __init__(self, duration):
 75 |         super(WindowedTeamScore, self).__init__()
 76 |         self.duration = duration
 77 | 
 78 |     def expand(self, p):
 79 |         # [START EXERCISE 4.2]:
 80 |         # Window team scores into windows of fixed duration. Trigger these windows
 81 |         # on-time with the watermark, but also speculatively every 100 elements.
 82 |         # Ensure correct totals for the watermark-triggered pane by accumulating
 83 |         # over all data.
 84 |         return (p
 85 |                 | 'window' >> ChangeMe()
 86 |                 | 'extract_team_score' >> ExtractAndSumScore('team')
 87 |                 )
 88 |         # [END EXERCISE 4.2]
 89 | 
 90 | 
 91 | class FormatTeamScoreSum(beam.DoFn):
 92 |     """Format a KV of team and its score to a BigQuery TableRow."""
 93 |     def process(self, team_score, window=beam.DoFn.WindowParam):
 94 |         team, score = team_score
 95 |         start = int(window.start)
 96 |         yield {
 97 |             'team': team,
 98 |             'total_score': score,
 99 |             'window_start': start,
100 |         }
101 | 
102 | 
103 | class FormatUserScoreSum(beam.DoFn):
104 |     """Format a KV of user and their score to a BigQuery TableRow."""
105 |     def process(self, user_score, window=beam.DoFn.WindowParam):
106 |         user, score = user_score
107 |         yield {
108 |             'user': user,
109 |             'total_score': score,
110 |             'processing_time': time.time(),
111 |         }
112 | 
113 | 
114 | def Run(argv=None):
115 |     known_args, pipeline_args = ParseArgs(argv)
116 |     pipeline_options = PipelineOptions(pipeline_args)
117 |     pipeline_options.view_as(SetupOptions).save_main_session = True
118 |     p = beam.Pipeline(options=pipeline_options)
119 |     window_duration = 1 * 60  # 1 minute windows.
120 |     if known_args.topic:
121 |         pipeline_options.view_as(StandardOptions).streaming = True
122 | 
123 |     project = pipeline_options.view_as(GoogleCloudOptions).project
124 |     timestamp_attribute = 'timestamp_ms'
125 |     events = None
126 |     if (not known_args.topic):
127 |         events = (p
128 |                 | 'read' >> ReadFromText(known_args.input)
129 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
130 |                 | 'add_event_timestamps' >> beam.Map(
131 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp))
132 |                 )
133 |     else:
134 |         events = (p
135 |                 | 'read' >> ReadFromPubSub(topic=known_args.topic,
136 |                     timestamp_attribute='timestamp_ms')
137 |                 | 'decode' >> beam.ParDo(ParseEventFn())
138 |                 )
139 | 
140 |     # Window team scores and write them BigQuery.
141 |     _ = (events
142 |          | 'windowed_team_score' >> WindowedTeamScore(window_duration)
143 |          | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
144 |          | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
145 |              known_args.output_tablename + '_team', known_args.output_dataset,
146 |              project, TEAM_SCHEMA)
147 |          )
148 | 
149 |     # Write leaderboards to BigQuery.
150 |     _ = (events
151 |          | 'running_user_score' >> RunningUserScores()
152 |          | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum())
153 |          | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery(
154 |              known_args.output_tablename + '_user', known_args.output_dataset,
155 |              project, USER_SCHEMA)
156 |          )
157 | 
158 |     p.run().wait_until_finish()
159 | 
160 | 
161 | if __name__ == '__main__':
162 |     logging.getLogger().setLevel(logging.INFO)
163 |     Run()
164 | 


--------------------------------------------------------------------------------
/py/exercises/exercise5.py:
--------------------------------------------------------------------------------
  1 | # Filter 'cheating' or 'spammy' users from the game results.
  2 | # Computes the global mean score and filters users that are 
  3 | # some threshold above that score.
  4 | from __future__ import absolute_import
  5 | 
  6 | import logging
  7 | import re
  8 | import time
  9 | 
 10 | import apache_beam as beam
 11 | from apache_beam.io import ReadFromText
 12 | from apache_beam.io import ReadFromPubSub
 13 | from apache_beam.io import WriteToText
 14 | from apache_beam.metrics import Metrics
 15 | from apache_beam.metrics.metric import MetricsFilter
 16 | from apache_beam.options.pipeline_options import PipelineOptions
 17 | from apache_beam.options.pipeline_options import SetupOptions
 18 | from apache_beam.options.pipeline_options import StandardOptions
 19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 20 | from apache_beam.transforms import trigger
 21 | from util.util import GameEvent
 22 | from util.util import ParseEvent
 23 | from util.util import ParseEventFn
 24 | from util.util import ParseArgs
 25 | import apache_beam.transforms.window as window
 26 | 
 27 | # Defines the BigQuery schemas.
 28 | USER_SCHEMA = ('user:STRING,'
 29 |                'total_score:INTEGER,'
 30 |                'processing_time:TIMESTAMP')
 31 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
 32 | 
 33 | 
 34 | class ExtractAndSumScore(beam.PTransform):
 35 |     def __init__(self, field):
 36 |         super(ExtractAndSumScore, self).__init__()
 37 |         self.field = field
 38 | 
 39 |     def expand(self, p):
 40 |         return (p
 41 |                 | 'extract_field' >> beam.Map(
 42 |                     lambda x: (vars(x)[self.field], x.score))
 43 |                 | beam.CombinePerKey(sum)
 44 |                 )
 45 | 
 46 | 
 47 | class WindowedUserScores(beam.PTransform):
 48 |     """Extract user/score pairs via in fixed windows."""
 49 |     def __init__(self, duration):
 50 |         super(WindowedUserScores, self).__init__()
 51 |         self.duration = duration
 52 | 
 53 |     def expand(self, p):
 54 |         return (p
 55 |                 | 'window' >> beam.WindowInto(
 56 |                     window.FixedWindows(self.duration))
 57 |                 | 'extract_user_score' >> ExtractAndSumScore('user')
 58 |                 )
 59 | 
 60 | 
 61 | class FilterUser(beam.DoFn):
 62 |     """Filter a user if their score * score_weight > avg_score."""
 63 |     def __init__(self, score_weight):
 64 |         super(FilterUser, self).__init__()
 65 |         self.score_weight = score_weight
 66 |         self.num_spammy_users = Metrics.counter(self.__class__,
 67 |                                                 'num_spammy_users')
 68 | 
 69 |     def process(self, user_score, avg_score=beam.DoFn.SideInputParam):
 70 |         user, score = user_score
 71 |         if score * self.score_weight > avg_score:
 72 |             logging.error('User %s filtered as spammy', user)
 73 |             self.num_spammy_users.inc()
 74 |             yield user
 75 | 
 76 | 
 77 | class ComputeSpammyUsers(beam.PTransform):
 78 |     """Compute users with a high clickrate, which we will consider spammy.
 79 |      We do this by finding the mean total score per user and filter out
 80 |      those with scores that are greater than the mean * score_weight
 81 |     """
 82 |     def __init__(self, score_weight):
 83 |         super(ComputeSpammyUsers, self).__init__()
 84 |         self.score_weight = score_weight
 85 | 
 86 |     def expand(self, p):
 87 |         # [START EXERCISE 5.1]:
 88 |         # Extract the score for each user, and compute the mean.
 89 |         # Create a singleton PCollection view to be used in
 90 |         # compute_spammers.
 91 |         # https://beam.apache.org/documentation/programming-guide/#combine
 92 |         avg_score = (p
 93 |                 | 'extract_score' >> ChangeMe()
 94 |                 | 'compute_mean' >> ChangeMe()
 95 |                 )
 96 |         # [END EXERCISE 5.1]
 97 |         return (p
 98 |                 | 'compute_spammers' >> beam.ParDo(
 99 |                     FilterUser(self.score_weight), avg_score=avg_score)
100 |                 )
101 | 
102 | 
103 | class FilterSpammers(beam.DoFn):
104 |     """Remove users found in the spam list."""
105 |     def __init__(self):
106 |         super(FilterSpammers, self).__init__()
107 |         self.filtered_scores = Metrics.counter(self.__class__,
108 |                                                'filtered_scores')
109 | 
110 |     def process(self, elem, spammers=beam.DoFn.SideInputParam):
111 |         user = elem.user
112 |         if user not in spammers:
113 |             yield elem
114 |         else:
115 |             self.filtered_scores.inc()
116 | 
117 | 
118 | class WindowedTeamScore(beam.PTransform):
119 |     """Calculates scores for each team within the configured window duration"""
120 |     def __init__(self, duration, spammers):
121 |         super(WindowedTeamScore, self).__init__()
122 |         self.duration = duration
123 |         self.spammers = spammers
124 | 
125 |     def expand(self, p):
126 |         return (p
127 |                 | 'window' >> beam.WindowInto(
128 |                     window.FixedWindows(self.duration))
129 |                 | 'filter_spammers' >> beam.ParDo(
130 |                     FilterSpammers(), spammers=self.spammers)
131 |                 | 'extract_team_score' >> ExtractAndSumScore('team')
132 |                 )
133 | 
134 | 
135 | class FormatTeamScoreSum(beam.DoFn):
136 |     def process(self, team_score, window=beam.DoFn.WindowParam):
137 |         team, score = team_score
138 |         start = int(window.start)
139 |         yield {
140 |             'team': team,
141 |             'total_score': score,
142 |             'window_start': start,
143 |         }
144 | 
145 | 
146 | class FormatUserScoreSum(beam.DoFn):
147 |     def process(self, user_score, window=beam.DoFn.WindowParam):
148 |         user, score = user_score
149 |         yield {
150 |             'user': user,
151 |             'total_score': score,
152 |             'processing_time': time.time(),
153 |         }
154 | 
155 | 
156 | def Run(argv=None):
157 |     known_args, pipeline_args = ParseArgs(argv)
158 |     pipeline_options = PipelineOptions(pipeline_args)
159 |     pipeline_options.view_as(SetupOptions).save_main_session = True
160 |     p = beam.Pipeline(options=pipeline_options)
161 |     window_duration = 1 * 60  # 1 minute windows.
162 |     if known_args.topic:
163 |         pipeline_options.view_as(StandardOptions).streaming = True
164 | 
165 |     project = pipeline_options.view_as(GoogleCloudOptions).project
166 |     timestamp_attribute = 'timestamp_ms'
167 |     events = None
168 |     if (not known_args.topic):
169 |         events = (p
170 |                 | 'read' >> ReadFromText(known_args.input)
171 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
172 |                 | 'add_event_timestamps' >> beam.Map(
173 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp)))
174 |     else:
175 |         events = (p
176 |                 | 'read' >> ReadFromPubSub(
177 |                     topic=known_args.topic,
178 |                     timestamp_attribute='timestamp_ms')
179 |                 | 'decode' >> beam.ParDo(ParseEventFn()))
180 | 
181 |     user_scores = (events
182 |             | 'window_user_scores' >> WindowedUserScores(window_duration))
183 |     spammers = beam.pvalue.AsList(user_scores
184 |             | 'compute_spammers' >> ComputeSpammyUsers(2.5))
185 | 
186 |     _ = (events
187 |          | 'windowed_team_score' >> WindowedTeamScore(window_duration, spammers)
188 |          | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
189 |          | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
190 |              known_args.output_tablename, known_args.output_dataset, project,
191 |              TEAM_SCHEMA)
192 |          )
193 | 
194 |     p.run().wait_until_finish()
195 | 
196 | 
197 | if __name__ == '__main__':
198 |     logging.getLogger().setLevel(logging.INFO)
199 |     Run()
200 | 


--------------------------------------------------------------------------------
/py/exercises/exercise6.py:
--------------------------------------------------------------------------------
 1 | # This pipeline computes the average duration of user sessions. The
 2 | # averages are windowed, to reflect durations differing over time.
 3 | from __future__ import absolute_import
 4 | 
 5 | import logging
 6 | import re
 7 | import time
 8 | 
 9 | import apache_beam as beam
10 | import apache_beam.transforms.window as window
11 | from apache_beam.io import ReadFromText
12 | from apache_beam.io import ReadFromPubSub
13 | from apache_beam.io import WriteToText
14 | from apache_beam.metrics import Metrics
15 | from apache_beam.metrics.metric import MetricsFilter
16 | from apache_beam.options.pipeline_options import PipelineOptions
17 | from apache_beam.options.pipeline_options import SetupOptions
18 | from apache_beam.options.pipeline_options import StandardOptions
19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
20 | from apache_beam.transforms import trigger
21 | from util.util import GameEvent
22 | from util.util import ParseEvent
23 | from util.util import ParseEventFn
24 | from util.util import ParseArgs
25 | 
26 | # Defines the BigQuery schemas.
27 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
28 | 
29 | 
30 | class UserSessionActivity(beam.DoFn):
31 |     """Compute the duration of a user's session."""
32 |     def process(self,
33 |                 elem,
34 |                 timestamp=beam.DoFn.TimestampParam,
35 |                 window=beam.DoFn.WindowParam):
36 |         duration = int(window.end) - int(window.start)
37 |         yield duration
38 | 
39 | 
40 | class FormatSessionMeans(beam.DoFn):
41 |     """Format session means for output to BQ"""
42 |     def process(self, elem, window=beam.DoFn.WindowParam):
43 |         yield {'window_start': int(window.start), 'mean_duration': elem}
44 | 
45 | 
46 | def Run(argv=None):
47 |     known_args, pipeline_args = ParseArgs(argv)
48 |     pipeline_options = PipelineOptions(pipeline_args)
49 |     pipeline_options.view_as(SetupOptions).save_main_session = True
50 |     p = beam.Pipeline(options=pipeline_options)
51 |     if known_args.topic:
52 |         pipeline_options.view_as(StandardOptions).streaming = True
53 | 
54 |     project = pipeline_options.view_as(GoogleCloudOptions).project
55 |     timestamp_attribute = 'timestamp_ms'
56 |     events = None
57 |     if (not known_args.topic):
58 |         events = (p
59 |                 | 'read' >> ReadFromText(known_args.input)
60 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
61 |                 | 'add_event_timestamps' >> beam.Map(
62 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp)))
63 |     else:
64 |         events = (p
65 |                 | 'read' >> ReadFromPubSub(
66 |                     topic=known_args.topic,
67 |                     timestamp_attribute='timestamp_ms')
68 |                 | 'parse' >> beam.ParDo(ParseEventFn()))
69 | 
70 |     # [START EXERCISE 6]
71 |     _ = (events
72 |          | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
73 |          # Extract sessions of user data, using known_args.session_gap as the
74 |          # gap duration.
75 |          # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
76 |          | 'sessionize' >> ChangeMe()
77 |          | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
78 |          | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
79 |          # Re-window into fixed windows of size user_activity_window in order
80 |          # to compute the mean session duration for that window of activity.
81 |          | 'window_of_sessions' >> ChangeMe()
82 |          | 'session_mean' >> ChangeMe()
83 |          # [END EXERCISE 6]
84 |          | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
85 |          | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
86 |              known_args.output_tablename, known_args.output_dataset, project,
87 |              SESSION_SCHEMA)
88 |          )
89 | 
90 |     p.run().wait_until_finish()
91 | 
92 | 
93 | if __name__ == '__main__':
94 |     logging.getLogger().setLevel(logging.INFO)
95 |     Run()
96 | 


--------------------------------------------------------------------------------
/py/exercises/exercise7.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | 
  3 | import logging
  4 | import re
  5 | import time
  6 | 
  7 | import apache_beam as beam
  8 | import apache_beam.transforms.window as window
  9 | from apache_beam.io import ReadFromText
 10 | from apache_beam.io import ReadFromPubSub
 11 | from apache_beam.io import WriteToText
 12 | from apache_beam.metrics import Metrics
 13 | from apache_beam.metrics.metric import MetricsFilter
 14 | from apache_beam.options.pipeline_options import PipelineOptions
 15 | from apache_beam.options.pipeline_options import SetupOptions
 16 | from apache_beam.options.pipeline_options import StandardOptions
 17 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 18 | from apache_beam.transforms import trigger
 19 | from util.util import GameEvent
 20 | from util.util import ParseEvent
 21 | from util.util import ParseEventFn
 22 | from util.util import ParsePlayEventFn
 23 | from util.util import ParseArgs
 24 | 
 25 | # Defines the BigQuery schemas.
 26 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
 27 | 
 28 | 
 29 | class ComputeLatency(beam.DoFn):
 30 |     def __init__(self):
 31 |         super(ComputeLatency, self).__init__()
 32 |         self.dropped_sessions_no_events = Metrics.counter(
 33 |             self.__class__, 'dropped_sessions_no_events')
 34 |         self.dropped_sessions_too_many_events = Metrics.counter(
 35 |             self.__class__, 'dropped_sessions_too_many_events')
 36 |         self.dropped_sessions_no_play_events = Metrics.counter(
 37 |             self.__class__, 'dropped_sessions_no_play_events')
 38 | 
 39 |     def process(self, elem):
 40 |         _, vals = elem
 41 |         plays = vals['plays']
 42 |         events = vals['events']
 43 | 
 44 |         play_count = 0
 45 |         max_play_ts = 0
 46 |         for play in plays:
 47 |             play_count += 1
 48 |             max_play_ts = max(max_play_ts, long(play.timestamp))
 49 | 
 50 |         event_count = 0
 51 |         an_event = None
 52 |         for event in events:
 53 |             an_event = event
 54 |             event_count += 1
 55 | 
 56 |         if event_count == 0:
 57 |             self.dropped_sessions_no_events.inc()
 58 |         elif event_count > 1:
 59 |             self.dropped_sessions_too_many_events.inc()
 60 |         elif play_count == 0:
 61 |             self.dropped_sessions_no_play_events.inc()
 62 |         else:
 63 |             min_latency = long(an_event.timestamp) - max_play_ts
 64 |             yield (an_event.user, min_latency)
 65 | 
 66 | 
 67 | class DetectBadUsers(beam.DoFn):
 68 |     def process(self, elem, mean_latency=beam.DoFn.SideInputParam):
 69 |         user, latency = elem
 70 |         # Naive: compute bad users are users 5 times less than
 71 |         # the mean.
 72 |         if latency < mean / 5:
 73 |             yield user
 74 | 
 75 | 
 76 | def Run(argv=None):
 77 |     known_args, pipeline_args = ParseArgs(argv)
 78 |     pipeline_options = PipelineOptions(pipeline_args)
 79 |     pipeline_options.view_as(SetupOptions).save_main_session = True
 80 |     p = beam.Pipeline(options=pipeline_options)
 81 |     if known_args.topic:
 82 |         pipeline_options.view_as(StandardOptions).streaming = True
 83 | 
 84 |     project = pipeline_options.view_as(GoogleCloudOptions).project
 85 |     timestamp_attribute = 'timestamp_ms'
 86 |     events = None
 87 |     if (not known_args.topic or not known_args.play_topic):
 88 |         logging.fatal('topic and play_topic are required.')
 89 | 
 90 |     # [START EXERCISE 7]:
 91 |     # 1. Read game events with message id and timestamp.
 92 |     # 2. Parse events.
 93 |     events = (p
 94 |             | 'read_events' >> ChangeMe()
 95 |             | 'parse_events' >> ChangeMe()
 96 |             )
 97 | 
 98 |     # 1. Read play events with message id and timestamp.
 99 |     # 2. Parse events.
100 |     play_events = (p
101 |             | 'read_play_events' >> ChangeMe()
102 |             | 'parse_play_events' >> ChangeMe()
103 |             )
104 | 
105 |     # 1. Key events by event id.
106 |     # 2. Sessionize.
107 |     sessionized_events = (events
108 |             | 'key_events_by_id' >> ChangeMe()
109 |             | 'sessionize_events' >> ChangeMe()
110 | 
111 |     # 1. Key play events by event id.
112 |     # 2. Sessionize.
113 |     sessionized_plays = (play_events
114 |             | 'key_plays_by_id' >> ChangeMe()
115 |             | 'sessionize_plays' >> ChangeMe()
116 | 
117 |     # 1. Join events using CoGroupByKey
118 |     # 2. Compute latency using ComputeLatency
119 |     per_user_latency = (
120 |             {'change':me, 'me':change}
121 |             | 'cbk' >> ChangeMe()
122 |             | 'compute_latency' >> ChangeMe()
123 | 
124 |     # 1. Get values of per user latencies
125 |     # 2. Re-window into GlobalWindows that triggers repeatedly after 1000 new elements.
126 |     # 3. Compute the global mean to be used as a side input.
127 |     mean_latency = (per_user_latency
128 |             | 'extract_latencies' >> ChangeMe()
129 |             | 'global_window' >> ChangeMe()
130 |             | 'compute_mean' >> ChangeMe()
131 |             )
132 |     # [END EXERCISE 7]
133 | 
134 |     # Filter out bad users.
135 |     _ = (per_user_latency
136 |             | 'detect_bad_users' >> beam.ParDo(
137 |                 DetectBadUsers(), mean_latency=mean_latency)
138 |             | 'filter_duplicates' >> beam.WindowInto(
139 |                 window.GlobalWindows(), trigger=trigger.AfterCount(1),
140 |                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
141 |             | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
142 |             | 'write_bad_users' >> beam.io.WriteToBigQuery(
143 |                 known_args.output_tablename, known_args.output_dataset, project, ('user:string'))
144 |             )
145 | 
146 |     p.run().wait_until_finish()
147 | 
148 | 
149 | if __name__ == '__main__':
150 |     logging.getLogger().setLevel(logging.INFO)
151 |     Run()
152 | 


--------------------------------------------------------------------------------
/py/run0.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise0 --input gs://sme-training/game/small.csv \
2 |   --output_dataset sme \
3 |   --output_tablename exercise0 \
4 |   --runner DataflowRunner \
5 |   --project YOUR_PROJECT \
6 |   --temp_location gs://YOUR_BUCKET/staging \
7 |   --setup_file ./setup.py
8 | 


--------------------------------------------------------------------------------
/py/run1.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise1 --input gs://sme-training/game/small.csv \
2 |   --output_dataset sme \
3 |   --output_tablename exercise1 \
4 |   --runner DataflowRunner \
5 |   --project YOUR_PROJECT \
6 |   --temp_location gs://YOUR_BUCKET/tmp/ \
7 |   --setup_file ./setup.py
8 | 


--------------------------------------------------------------------------------
/py/run2.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise2 --input gs://sme-training/game/small.csv \
2 |   --output_dataset sme \
3 |   --output_tablename exercise2 \
4 |   --runner DataflowRunner \
5 |   --project YOUR_PROJECT \
6 |   --temp_location gs://YOUR_BUCKET/tmp/ \
7 |   --setup_file ./setup.py
8 | 


--------------------------------------------------------------------------------
/py/run3.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise3 \
2 |   --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 |   --output_dataset sme \
4 |   --output_tablename exercise3 \
5 |   --runner DataflowRunner \
6 |   --project YOUR_PROJECT \
7 |   --temp_location gs://YOUR_BUCKET/staging \
8 |   --setup_file ./setup.py
9 | 


--------------------------------------------------------------------------------
/py/run4.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise4 \
2 |   --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 |   --output_dataset sme \
4 |   --output_tablename exercise4 \
5 |   --runner DataflowRunner \
6 |   --project YOUR_PROJECT \
7 |   --temp_location gs://YOUR_BUCKET/staging \
8 |   --setup_file ./setup.py
9 | 


--------------------------------------------------------------------------------
/py/run5.sh:
--------------------------------------------------------------------------------
1 | python -m exercises.exercise5 \
2 |   --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
3 |   --output_dataset sme \
4 |   --output_tablename exercise5 \
5 |   --runner DataflowRunner \
6 |   --project YOUR_PROJECT \
7 |   --temp_location gs://YOUR_BUCKET/staging \
8 |   --setup_file ./setup.py
9 | 


--------------------------------------------------------------------------------
/py/run6.sh:
--------------------------------------------------------------------------------
 1 | python -m exercises.exercise6 \
 2 |   --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
 3 |   --output_dataset sme \
 4 |   --output_tablename exercise6 \
 5 |   --runner DataflowRunner \
 6 |   --project YOUR_PROJECT \
 7 |   --user_activity_window 240 \
 8 |   --session_gap 60 \
 9 |   --temp_location gs://YOUR_BUCKET/staging \
10 |   --setup_file ./setup.py
11 | 


--------------------------------------------------------------------------------
/py/run7.sh:
--------------------------------------------------------------------------------
 1 | python -m exercises.exercise7 \
 2 |   --topic projects/YOUR_PROJECT/topics/YOUR_TOPIC \
 3 |   --play_topic projects/YOUR_PROJECT/topics/YOUR_TOPIC-play \
 4 |   --output_dataset sme \
 5 |   --output_tablename exercise7 \
 6 |   --runner DataflowRunner \
 7 |   --project YOUR_PROJECT \
 8 |   --session_gap 20 \
 9 |   --temp_location gs://YOUR_BUCKET/staging \
10 |   --setup_file ./setup.py
11 | 


--------------------------------------------------------------------------------
/py/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | setuptools.setup(
3 | 	name='sme-training',
4 | 	version='1.0',
5 | 	install_requires=[],
6 | 	packages=setuptools.find_packages(),
7 | )
8 | 


--------------------------------------------------------------------------------
/py/solutions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/__init__.py


--------------------------------------------------------------------------------
/py/solutions/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/__init__.pyc


--------------------------------------------------------------------------------
/py/solutions/exercise0.py:
--------------------------------------------------------------------------------
 1 | # This batch pipeline imports game events from CSV to BigQuery.
 2 | from __future__ import absolute_import
 3 | 
 4 | import logging
 5 | import re
 6 | 
 7 | import apache_beam as beam
 8 | from apache_beam.io import ReadFromText
 9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 | 
19 | # Defines the BigQuery schema.
20 | SCHEMA = ('user:STRING,' 'team:STRING,' 'score:INTEGER,' 'timestamp:TIMESTAMP')
21 | 
22 | 
23 | def FormatEvent(element):
24 |     """Format a GameEvent to a BigQuery TableRow."""
25 |     return {
26 |         'user': element.user,
27 |         'team': element.team,
28 |         'score': element.score,
29 |         'timestamp': element.timestamp
30 |     }
31 | 
32 | 
33 | def Run(argv=None):
34 |     """Run a batch pipeline."""
35 |     known_args, pipeline_args = ParseArgs(argv)
36 |     pipeline_options = PipelineOptions(pipeline_args)
37 |     pipeline_options.view_as(SetupOptions).save_main_session = True
38 |     p = beam.Pipeline(options=pipeline_options)
39 | 
40 |     project = pipeline_options.view_as(GoogleCloudOptions).project
41 |     # Read events from a CSV file, parse them and write (import) them to BigQuery.
42 |     _ = (p
43 |         | 'read' >> ReadFromText(known_args.input)
44 |         | 'parse' >> beam.FlatMap(ParseEvent)
45 |         | 'format' >> beam.Map(FormatEvent)
46 |         | beam.io.WriteToBigQuery(known_args.output_tablename,
47 |             known_args.output_dataset, project, SCHEMA)
48 |         )
49 |     p.run().wait_until_finish()
50 | 
51 | 
52 | if __name__ == '__main__':
53 |     logging.getLogger().setLevel(logging.INFO)
54 |     Run()
55 | 


--------------------------------------------------------------------------------
/py/solutions/exercise1.py:
--------------------------------------------------------------------------------
 1 | # This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data and writes the sums to BigQuery.
 2 | from __future__ import absolute_import
 3 | 
 4 | import logging
 5 | import re
 6 | 
 7 | import apache_beam as beam
 8 | from apache_beam.io import ReadFromText
 9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 | 
19 | # Defines the BigQuery schema.
20 | SCHEMA = ('user:STRING,' 'total_score:INTEGER')
21 | 
22 | 
23 | class ExtractAndSumScore(beam.PTransform):
24 |     """A transform to extract key/score information from GameEvent, and sum
25 |      the scores. The constructor arg determines whether 'team' or 'user' info is
26 |      extracted."""
27 |     def __init__(self, field):
28 |         super(ExtractAndSumScore, self).__init__()
29 |         self.field = field
30 | 
31 |     def expand(self, p):
32 |         return (p
33 |                 | 'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score))
34 |                 | beam.CombinePerKey(sum)
35 |                 )
36 | 
37 | 
38 | def FormatUserScoreSum(element):
39 |     """Format a KV of user and their score to a BigQuery TableRow."""
40 |     user, total_score = element
41 |     return {'user': user, 'total_score': total_score}
42 | 
43 | 
44 | def Run(argv=None):
45 |     known_args, pipeline_args = ParseArgs(argv)
46 |     pipeline_options = PipelineOptions(pipeline_args)
47 |     pipeline_options.view_as(SetupOptions).save_main_session = True
48 |     p = beam.Pipeline(options=pipeline_options)
49 | 
50 |     project = pipeline_options.view_as(GoogleCloudOptions).project
51 |     # Read events from a CSV file and parse them.
52 |     _ = (p
53 |          | 'read' >> ReadFromText(known_args.input)
54 |          | 'parse' >> beam.FlatMap(ParseEvent)
55 |          | 'extract_user_score' >> ExtractAndSumScore('user')
56 |          | 'format_user_score_sum' >> beam.Map(FormatUserScoreSum)
57 |          | beam.io.WriteToBigQuery(known_args.output_tablename,
58 |              known_args.output_dataset, project, SCHEMA)
59 |          )
60 | 
61 |     p.run().wait_until_finish()
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     logging.getLogger().setLevel(logging.INFO)
66 |     Run()
67 | 


--------------------------------------------------------------------------------
/py/solutions/exercise1.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/solutions/exercise1.pyc


--------------------------------------------------------------------------------
/py/solutions/exercise2.py:
--------------------------------------------------------------------------------
 1 | # This batch pipeline calculates the sum of scores per team per hour, over an entire batch of gaming data and writes the per-team sums to BigQuery.
 2 | from __future__ import absolute_import
 3 | 
 4 | import logging
 5 | import re
 6 | 
 7 | import apache_beam as beam
 8 | from apache_beam.io import ReadFromText
 9 | from apache_beam.io import WriteToText
10 | from apache_beam.metrics import Metrics
11 | from apache_beam.metrics.metric import MetricsFilter
12 | from apache_beam.options.pipeline_options import PipelineOptions
13 | from apache_beam.options.pipeline_options import SetupOptions
14 | from apache_beam.options.pipeline_options import GoogleCloudOptions
15 | from util.util import GameEvent
16 | from util.util import ParseEvent
17 | from util.util import ParseArgs
18 | import apache_beam.transforms.window as window
19 | 
20 | # Defines the BigQuery schema.
21 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
22 | 
23 | 
24 | class ExtractAndSumScore(beam.PTransform):
25 |     def __init__(self, field):
26 |         super(ExtractAndSumScore, self).__init__()
27 |         self.field = field
28 | 
29 |     def expand(self, p):
30 |         return (p
31 |                 |'extract_field' >> beam.Map(lambda x: (vars(x)[self.field], x.score))
32 |                 | beam.CombinePerKey(sum)
33 |                 )
34 | 
35 | 
36 | class WindowedTeamScore(beam.PTransform):
37 |     """A transform to compute the WindowedTeamScore."""
38 |     def __init__(self, duration):
39 |         super(WindowedTeamScore, self).__init__()
40 |         self.duration = duration
41 | 
42 |     def expand(self, p):
43 |         return (p
44 |                 | 'window' >> beam.WindowInto(window.FixedWindows(self.duration))
45 |                 | 'extract_team_score' >> ExtractAndSumScore('team')
46 |                 )
47 | 
48 | 
49 | class FormatTeamScoreSum(beam.DoFn):
50 |     """Format a KV of user and their score to a BigQuery TableRow."""
51 |     def process(self, team_score, window=beam.DoFn.WindowParam):  ##????
52 |         team, score = team_score
53 |         start = int(window.start)
54 |         yield {
55 |             'team': team,
56 |             'total_score': score,
57 |             'window_start': start,
58 |         }
59 | 
60 | 
61 | def Run(argv=None):
62 |     known_args, pipeline_args = ParseArgs(argv)
63 |     pipeline_options = PipelineOptions(pipeline_args)
64 |     pipeline_options.view_as(SetupOptions).save_main_session = True
65 |     p = beam.Pipeline(options=pipeline_options)
66 | 
67 |     project = pipeline_options.view_as(GoogleCloudOptions).project
68 |     _ = (p
69 |          | 'read' >> ReadFromText(known_args.input)
70 |          | 'parse' >> beam.FlatMap(ParseEvent)
71 |          | 'add_event_timestamps' >> beam.Map(
72 |              lambda x: beam.window.TimestampedValue(x, x.timestamp))
73 |          | 'windowed_team_score' >> WindowedTeamScore(60 * 60)
74 |          | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
75 |          | beam.io.WriteToBigQuery(known_args.output_tablename,
76 |              known_args.output_dataset, project, SCHEMA)
77 |          )
78 |     p.run().wait_until_finish()
79 | 
80 | 
81 | if __name__ == '__main__':
82 |     logging.getLogger().setLevel(logging.INFO)
83 |     Run()
84 | 


--------------------------------------------------------------------------------
/py/solutions/exercise3.py:
--------------------------------------------------------------------------------
  1 | # This pipeline calculates the sum of scores per team per hour and writes the
  2 | # per-team sums to BigQuery. The pipeline can be run in either batch or
  3 | # streaming mode, reading from either a data file or Pub/Sub topic.
  4 | #
  5 | # You will need to create a Pub/Sub topic and run the Java Injector
  6 | # in order to get game events over Pub/Sub. Please refer to the instructions
  7 | # here: https://github.com/malo-denielou/DataflowSME
  8 | from __future__ import absolute_import
  9 | 
 10 | import logging
 11 | import re
 12 | 
 13 | import apache_beam as beam
 14 | from apache_beam.io import ReadFromText
 15 | from apache_beam.io import ReadFromPubSub
 16 | from apache_beam.io import WriteToText
 17 | from apache_beam.metrics import Metrics
 18 | from apache_beam.metrics.metric import MetricsFilter
 19 | from apache_beam.options.pipeline_options import PipelineOptions
 20 | from apache_beam.options.pipeline_options import SetupOptions
 21 | from apache_beam.options.pipeline_options import StandardOptions
 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 23 | from util.util import GameEvent
 24 | from util.util import ParseEvent
 25 | from util.util import ParseEventFn
 26 | from util.util import ParseArgs
 27 | import apache_beam.transforms.window as window
 28 | from solutions.exercise1 import ExtractAndSumScore
 29 | 
 30 | # Defines the BigQuery schema.
 31 | SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
 32 | 
 33 | 
 34 | class ExtractAndSumScore(beam.PTransform):
 35 |     def __init__(self, field):
 36 |         super(ExtractAndSumScore, self).__init__()
 37 |         self.field = field
 38 | 
 39 |     def expand(self, p):
 40 |         return (p
 41 |                 | 'extract_field' >> beam.Map(
 42 |                     lambda x: (vars(x)[self.field], x.score))
 43 |                 | beam.CombinePerKey(sum)
 44 |                 )
 45 | 
 46 | 
 47 | class WindowedTeamScore(beam.PTransform):
 48 |     """A transform to compute a windowed team score."""
 49 |     def __init__(self, duration):
 50 |         super(WindowedTeamScore, self).__init__()
 51 |         self.duration = duration
 52 | 
 53 |     def expand(self, p):
 54 |         return (p
 55 |                 | 'window' >> beam.WindowInto(
 56 |                     window.FixedWindows(self.duration))
 57 |                 | 'extract_team_score' >> ExtractAndSumScore('team')
 58 |                 )
 59 | 
 60 | 
 61 | class FormatTeamScoreSum(beam.DoFn):
 62 |     """Format a KV of user and their score to a BigQuery TableRow."""
 63 |     def process(self, team_score, window=beam.DoFn.WindowParam):
 64 |         team, score = team_score
 65 |         start = int(window.start)
 66 |         yield {
 67 |             'team': team,
 68 |             'total_score': score,
 69 |             'window_start': start,
 70 |         }
 71 | 
 72 | 
 73 | def Run(argv=None):
 74 |     known_args, pipeline_args = ParseArgs(argv)
 75 |     pipeline_options = PipelineOptions(pipeline_args)
 76 |     pipeline_options.view_as(SetupOptions).save_main_session = True
 77 |     p = beam.Pipeline(options=pipeline_options)
 78 |     window_duration = 1 * 60  # 1 minute windows.
 79 |     if known_args.topic:
 80 |         pipeline_options.view_as(StandardOptions).streaming = True
 81 | 
 82 |     project = pipeline_options.view_as(GoogleCloudOptions).project
 83 |     timestamp_attribute = 'timestamp_ms'
 84 |     events = None
 85 |     if (not known_args.topic):
 86 |         events = (p
 87 |                 | 'read' >> ReadFromText(known_args.input)
 88 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
 89 |                 | 'add_event_timestamps' >> beam.Map(
 90 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp))
 91 |                 )
 92 |     else:
 93 |         events = (p
 94 |                 | 'read' >> ReadFromPubSub(topic=known_args.topic,
 95 |                     timestamp_attribute='timestamp_ms')
 96 |                 | 'decode' >> beam.ParDo(ParseEventFn())
 97 |                 )
 98 | 
 99 |     _ = (events
100 |             | 'windowed_team_score' >> WindowedTeamScore(window_duration)
101 |             | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
102 |             | beam.io.WriteToBigQuery(known_args.output_tablename,
103 |                 known_args.output_dataset, project, SCHEMA)
104 |             )
105 |     p.run().wait_until_finish()
106 | 
107 | 
108 | if __name__ == '__main__':
109 |     logging.getLogger().setLevel(logging.INFO)
110 |     Run()
111 | 


--------------------------------------------------------------------------------
/py/solutions/exercise4.py:
--------------------------------------------------------------------------------
  1 | # This pipeline calculates the sum of scores per team per hour and writes the
  2 | # per-team sums to BigQuery. Additionally computes running user scores (e.g.,
  3 | # as a leaderboard) and updates them regularly.
  4 | 
  5 | # The pipeline can be run in either batch or streaming mode, reading from
  6 | # either a data file or Pub/Sub topic.
  7 | from __future__ import absolute_import
  8 | 
  9 | import logging
 10 | import re
 11 | import time
 12 | 
 13 | import apache_beam as beam
 14 | from apache_beam.io import ReadFromText
 15 | from apache_beam.io import ReadFromPubSub
 16 | from apache_beam.io import WriteToText
 17 | from apache_beam.metrics import Metrics
 18 | from apache_beam.metrics.metric import MetricsFilter
 19 | from apache_beam.options.pipeline_options import PipelineOptions
 20 | from apache_beam.options.pipeline_options import SetupOptions
 21 | from apache_beam.options.pipeline_options import StandardOptions
 22 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 23 | from apache_beam.transforms import trigger
 24 | from util.util import GameEvent
 25 | from util.util import ParseEvent
 26 | from util.util import ParseEventFn
 27 | from util.util import ParseArgs
 28 | import apache_beam.transforms.window as window
 29 | from solutions.exercise1 import ExtractAndSumScore
 30 | 
 31 | # Defines the BigQuery schemas.
 32 | USER_SCHEMA = ('user:STRING,'
 33 |                'total_score:INTEGER,'
 34 |                'processing_time:TIMESTAMP')
 35 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
 36 | 
 37 | 
 38 | class ExtractAndSumScore(beam.PTransform):
 39 |     def __init__(self, field):
 40 |         super(ExtractAndSumScore, self).__init__()
 41 |         self.field = field
 42 | 
 43 |     def expand(self, p):
 44 |         return (p | 'extract_field' >>
 45 |                 beam.Map(lambda x: (vars(x)[self.field], x.score)) |
 46 |                 beam.CombinePerKey(sum))
 47 | 
 48 | 
 49 | class RunningUserScores(beam.PTransform):
 50 |     """Extract user/score pairs via global windowing and emit perioidic updates
 51 |      on all users' running scores.
 52 |     """
 53 |     def __init__(self, allowed_lateness=0):
 54 |         super(RunningUserScores, self).__init__()
 55 | 
 56 |     def expand(self, p):
 57 |         # NOTE: allowed_lateness is not yet available in Python FixedWindows.
 58 |         # NOTE: AfterProcessingTime not yet available in Python.
 59 |         return (p
 60 |                 | 'window' >> beam.WindowInto(
 61 |                     beam.window.GlobalWindows(),
 62 |                     trigger=trigger.AfterWatermark(early=trigger.AfterCount(100)),
 63 |                     accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
 64 |                 | 'extract_user_score' >> ExtractAndSumScore('user')
 65 |                 )
 66 | 
 67 | 
 68 | class WindowedTeamScore(beam.PTransform):
 69 |     """Calculates scores for each team within the configured window duration"""
 70 |     def __init__(self, duration):
 71 |         super(WindowedTeamScore, self).__init__()
 72 |         self.duration = duration
 73 | 
 74 |     def expand(self, p):
 75 |         return (p
 76 |                 | 'window' >> beam.WindowInto(window.FixedWindows(self.duration))
 77 |                 | 'extract_team_score' >> ExtractAndSumScore('team')
 78 |                 )
 79 | 
 80 | 
 81 | class FormatTeamScoreSum(beam.DoFn):
 82 |     """Format a KV of team and its score to a BigQuery TableRow."""
 83 |     def process(self, team_score, window=beam.DoFn.WindowParam):
 84 |         team, score = team_score
 85 |         start = int(window.start)
 86 |         yield {
 87 |             'team': team,
 88 |             'total_score': score,
 89 |             'window_start': start,
 90 |         }
 91 | 
 92 | 
 93 | class FormatUserScoreSum(beam.DoFn):
 94 |     """Format a KV of user and their score to a BigQuery TableRow."""
 95 |     def process(self, user_score, window=beam.DoFn.WindowParam):
 96 |         user, score = user_score
 97 |         yield {
 98 |             'user': user,
 99 |             'total_score': score,
100 |             'processing_time': time.time(),
101 |         }
102 | 
103 | 
104 | def Run(argv=None):
105 |     known_args, pipeline_args = ParseArgs(argv)
106 |     pipeline_options = PipelineOptions(pipeline_args)
107 |     pipeline_options.view_as(SetupOptions).save_main_session = True
108 |     p = beam.Pipeline(options=pipeline_options)
109 |     window_duration = 1 * 60  # 1 minute windows.
110 |     if known_args.topic:
111 |         pipeline_options.view_as(StandardOptions).streaming = True
112 | 
113 |     project = pipeline_options.view_as(GoogleCloudOptions).project
114 |     timestamp_attribute = 'timestamp_ms'
115 |     events = None
116 |     if (not known_args.topic):
117 |         events = (p
118 |                 | 'read' >> ReadFromText(known_args.input)
119 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
120 |                 | 'add_event_timestamps' >> beam.Map(
121 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp))
122 |                 )
123 |     else:
124 |         events = (p
125 |                 | 'read' >> ReadFromPubSub(topic=known_args.topic,
126 |                     timestamp_attribute='timestamp_ms')
127 |                 | 'decode' >> beam.ParDo(ParseEventFn())
128 |                 )
129 | 
130 |     # Window team scores and write them BigQuery.
131 |     _ = (events
132 |          | 'windowed_team_score' >> WindowedTeamScore(window_duration)
133 |          | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
134 |          | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
135 |              known_args.output_tablename + '_team', known_args.output_dataset,
136 |              project, TEAM_SCHEMA)
137 |          )
138 | 
139 |     # Write leaderboards to BigQuery.
140 |     _ = (events
141 |          | 'running_user_score' >> RunningUserScores()
142 |          | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum())
143 |          | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery(
144 |              known_args.output_tablename + '_user', known_args.output_dataset,
145 |              project, USER_SCHEMA)
146 |          )
147 | 
148 |     p.run().wait_until_finish()
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     logging.getLogger().setLevel(logging.INFO)
153 |     Run()
154 | 


--------------------------------------------------------------------------------
/py/solutions/exercise5.py:
--------------------------------------------------------------------------------
  1 | # Filter 'cheating' or 'spammy' users from the game results.
  2 | # Computes the global mean score and filters users that are 
  3 | # some threshold above that score.
  4 | from __future__ import absolute_import
  5 | 
  6 | import logging
  7 | import re
  8 | import time
  9 | 
 10 | import apache_beam as beam
 11 | from apache_beam.io import ReadFromText
 12 | from apache_beam.io import ReadFromPubSub
 13 | from apache_beam.io import WriteToText
 14 | from apache_beam.metrics import Metrics
 15 | from apache_beam.metrics.metric import MetricsFilter
 16 | from apache_beam.options.pipeline_options import PipelineOptions
 17 | from apache_beam.options.pipeline_options import SetupOptions
 18 | from apache_beam.options.pipeline_options import StandardOptions
 19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 20 | from apache_beam.transforms import trigger
 21 | from util.util import GameEvent
 22 | from util.util import ParseEvent
 23 | from util.util import ParseEventFn
 24 | from util.util import ParseArgs
 25 | import apache_beam.transforms.window as window
 26 | 
 27 | # Defines the BigQuery schemas.
 28 | USER_SCHEMA = ('user:STRING,'
 29 |                'total_score:INTEGER,'
 30 |                'processing_time:TIMESTAMP')
 31 | TEAM_SCHEMA = ('team:STRING,' 'total_score:INTEGER,' 'window_start:TIMESTAMP')
 32 | 
 33 | 
 34 | class ExtractAndSumScore(beam.PTransform):
 35 |     def __init__(self, field):
 36 |         super(ExtractAndSumScore, self).__init__()
 37 |         self.field = field
 38 | 
 39 |     def expand(self, p):
 40 |         return (p
 41 |                 | 'extract_field' >> beam.Map(
 42 |                     lambda x: (vars(x)[self.field], x.score))
 43 |                 | beam.CombinePerKey(sum)
 44 |                 )
 45 | 
 46 | 
 47 | class WindowedUserScores(beam.PTransform):
 48 |     """Extract user/score pairs via in fixed windows."""
 49 |     def __init__(self, duration):
 50 |         super(WindowedUserScores, self).__init__()
 51 |         self.duration = duration
 52 | 
 53 |     def expand(self, p):
 54 |         return (p
 55 |                 | 'window' >> beam.WindowInto(
 56 |                     window.FixedWindows(self.duration))
 57 |                 | 'extract_user_score' >> ExtractAndSumScore('user')
 58 |                 )
 59 | 
 60 | 
 61 | class FilterUser(beam.DoFn):
 62 |     """Filter a user if their score * score_weight > avg_score."""
 63 |     def __init__(self, score_weight):
 64 |         super(FilterUser, self).__init__()
 65 |         self.score_weight = score_weight
 66 |         self.num_spammy_users = Metrics.counter(self.__class__,
 67 |                                                 'num_spammy_users')
 68 | 
 69 |     def process(self, user_score, avg_score=beam.DoFn.SideInputParam):
 70 |         user, score = user_score
 71 |         if score * self.score_weight > avg_score:
 72 |             logging.error('User %s filtered as spammy', user)
 73 |             self.num_spammy_users.inc()
 74 |             yield user
 75 | 
 76 | 
 77 | class ComputeSpammyUsers(beam.PTransform):
 78 |     """Compute users with a high clickrate, which we will consider spammy.
 79 |      We do this by finding the mean total score per user and filter out
 80 |      those with scores that are greater than the mean * score_weight
 81 |     """
 82 |     def __init__(self, score_weight):
 83 |         super(ComputeSpammyUsers, self).__init__()
 84 |         self.score_weight = score_weight
 85 | 
 86 |     def expand(self, p):
 87 |         avg_score = (p
 88 |                 | beam.Values()
 89 |                 | beam.CombineGlobally(
 90 |                     beam.combiners.MeanCombineFn()).as_singleton_view()
 91 |                 )
 92 |         return (p
 93 |                 | 'compute_spammers' >> beam.ParDo(
 94 |                     FilterUser(self.score_weight), avg_score=avg_score)
 95 |                 )
 96 | 
 97 | 
 98 | class FilterSpammers(beam.DoFn):
 99 |     """Remove users found in the spam list."""
100 |     def __init__(self):
101 |         super(FilterSpammers, self).__init__()
102 |         self.filtered_scores = Metrics.counter(self.__class__,
103 |                                                'filtered_scores')
104 | 
105 |     def process(self, elem, spammers=beam.DoFn.SideInputParam):
106 |         user = elem.user
107 |         if user not in spammers:
108 |             yield elem
109 |         else:
110 |             self.filtered_scores.inc()
111 | 
112 | 
113 | class WindowedTeamScore(beam.PTransform):
114 |     """Calculates scores for each team within the configured window duration"""
115 |     def __init__(self, duration, spammers):
116 |         super(WindowedTeamScore, self).__init__()
117 |         self.duration = duration
118 |         self.spammers = spammers
119 | 
120 |     def expand(self, p):
121 |         return (p
122 |                 | 'window' >> beam.WindowInto(
123 |                     window.FixedWindows(self.duration))
124 |                 | 'filter_spammers' >> beam.ParDo(
125 |                     FilterSpammers(), spammers=self.spammers)
126 |                 | 'extract_team_score' >> ExtractAndSumScore('team')
127 |                 )
128 | 
129 | 
130 | class FormatTeamScoreSum(beam.DoFn):
131 |     def process(self, team_score, window=beam.DoFn.WindowParam):
132 |         team, score = team_score
133 |         start = int(window.start)
134 |         yield {
135 |             'team': team,
136 |             'total_score': score,
137 |             'window_start': start,
138 |         }
139 | 
140 | 
141 | class FormatUserScoreSum(beam.DoFn):
142 |     def process(self, user_score, window=beam.DoFn.WindowParam):
143 |         user, score = user_score
144 |         yield {
145 |             'user': user,
146 |             'total_score': score,
147 |             'processing_time': time.time(),
148 |         }
149 | 
150 | 
151 | def Run(argv=None):
152 |     known_args, pipeline_args = ParseArgs(argv)
153 |     pipeline_options = PipelineOptions(pipeline_args)
154 |     pipeline_options.view_as(SetupOptions).save_main_session = True
155 |     p = beam.Pipeline(options=pipeline_options)
156 |     window_duration = 1 * 60  # 1 minute windows.
157 |     if known_args.topic:
158 |         pipeline_options.view_as(StandardOptions).streaming = True
159 | 
160 |     project = pipeline_options.view_as(GoogleCloudOptions).project
161 |     timestamp_attribute = 'timestamp_ms'
162 |     events = None
163 |     if (not known_args.topic):
164 |         events = (p
165 |                 | 'read' >> ReadFromText(known_args.input)
166 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
167 |                 | 'add_event_timestamps' >> beam.Map(
168 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp)))
169 |     else:
170 |         events = (p
171 |                 | 'read' >> ReadFromPubSub(
172 |                     topic=known_args.topic,
173 |                     timestamp_attribute='timestamp_ms')
174 |                 | 'decode' >> beam.ParDo(ParseEventFn()))
175 | 
176 |     user_scores = (events
177 |             | 'window_user_scores' >> WindowedUserScores(window_duration))
178 |     spammers = beam.pvalue.AsList(user_scores
179 |             | 'compute_spammers' >> ComputeSpammyUsers(2.5))
180 | 
181 |     _ = (events
182 |          | 'windowed_team_score' >> WindowedTeamScore(window_duration, spammers)
183 |          | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
184 |          | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
185 |              known_args.output_tablename, known_args.output_dataset, project,
186 |              TEAM_SCHEMA)
187 |          )
188 | 
189 |     p.run().wait_until_finish()
190 | 
191 | 
192 | if __name__ == '__main__':
193 |     logging.getLogger().setLevel(logging.INFO)
194 |     Run()
195 | 


--------------------------------------------------------------------------------
/py/solutions/exercise6.py:
--------------------------------------------------------------------------------
 1 | # This pipeline computes the average duration of user sessions. The
 2 | # averages are windowed, to reflect durations differing over time.
 3 | from __future__ import absolute_import
 4 | 
 5 | import logging
 6 | import re
 7 | import time
 8 | 
 9 | import apache_beam as beam
10 | import apache_beam.transforms.window as window
11 | from apache_beam.io import ReadFromText
12 | from apache_beam.io import ReadFromPubSub
13 | from apache_beam.io import WriteToText
14 | from apache_beam.metrics import Metrics
15 | from apache_beam.metrics.metric import MetricsFilter
16 | from apache_beam.options.pipeline_options import PipelineOptions
17 | from apache_beam.options.pipeline_options import SetupOptions
18 | from apache_beam.options.pipeline_options import StandardOptions
19 | from apache_beam.options.pipeline_options import GoogleCloudOptions
20 | from apache_beam.transforms import trigger
21 | from util.util import GameEvent
22 | from util.util import ParseEvent
23 | from util.util import ParseEventFn
24 | from util.util import ParseArgs
25 | 
26 | # Defines the BigQuery schemas.
27 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
28 | 
29 | 
30 | class UserSessionActivity(beam.DoFn):
31 |     """Compute the duration of a user's session."""
32 |     def process(self,
33 |                 elem,
34 |                 timestamp=beam.DoFn.TimestampParam,
35 |                 window=beam.DoFn.WindowParam):
36 |         duration = int(window.end) - int(window.start)
37 |         yield duration
38 | 
39 | 
40 | class FormatSessionMeans(beam.DoFn):
41 |     """Format session means for output to BQ"""
42 |     def process(self, elem, window=beam.DoFn.WindowParam):
43 |         yield {'window_start': int(window.start), 'mean_duration': elem}
44 | 
45 | 
46 | def Run(argv=None):
47 |     known_args, pipeline_args = ParseArgs(argv)
48 |     pipeline_options = PipelineOptions(pipeline_args)
49 |     pipeline_options.view_as(SetupOptions).save_main_session = True
50 |     p = beam.Pipeline(options=pipeline_options)
51 |     if known_args.topic:
52 |         pipeline_options.view_as(StandardOptions).streaming = True
53 | 
54 |     project = pipeline_options.view_as(GoogleCloudOptions).project
55 |     timestamp_attribute = 'timestamp_ms'
56 |     events = None
57 |     if (not known_args.topic):
58 |         events = (p
59 |                 | 'read' >> ReadFromText(known_args.input)
60 |                 | 'parse' >> beam.FlatMap(ParseEventFn())
61 |                 | 'add_event_timestamps' >> beam.Map(
62 |                     lambda x: beam.window.TimestampedValue(x, x.timestamp)))
63 |     else:
64 |         events = (p
65 |                 | 'read' >> ReadFromPubSub(
66 |                     topic=known_args.topic,
67 |                     timestamp_attribute='timestamp_ms')
68 |                 | 'parse' >> beam.ParDo(ParseEventFn()))
69 | 
70 |     _ = (events
71 |          | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
72 |          | 'sessionize' >> beam.WindowInto(
73 |              window.Sessions(float(known_args.session_gap)))
74 |          | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
75 |          | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
76 |          | 'window_of_sessions' >> beam.WindowInto(
77 |              window.FixedWindows(int(known_args.user_activity_window)))
78 |          | 'session_mean' >> beam.CombineGlobally(
79 |              beam.combiners.MeanCombineFn()).without_defaults()
80 |          | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
81 |          | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
82 |              known_args.output_tablename, known_args.output_dataset, project,
83 |              SESSION_SCHEMA)
84 |          )
85 | 
86 |     p.run().wait_until_finish()
87 | 
88 | 
89 | if __name__ == '__main__':
90 |     logging.getLogger().setLevel(logging.INFO)
91 |     Run()
92 | 


--------------------------------------------------------------------------------
/py/solutions/exercise7.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | 
  3 | import logging
  4 | import re
  5 | import time
  6 | 
  7 | import apache_beam as beam
  8 | import apache_beam.transforms.window as window
  9 | from apache_beam.io import ReadFromText
 10 | from apache_beam.io import ReadFromPubSub
 11 | from apache_beam.io import WriteToText
 12 | from apache_beam.metrics import Metrics
 13 | from apache_beam.metrics.metric import MetricsFilter
 14 | from apache_beam.options.pipeline_options import PipelineOptions
 15 | from apache_beam.options.pipeline_options import SetupOptions
 16 | from apache_beam.options.pipeline_options import StandardOptions
 17 | from apache_beam.options.pipeline_options import GoogleCloudOptions
 18 | from apache_beam.transforms import trigger
 19 | from util.util import GameEvent
 20 | from util.util import ParseEvent
 21 | from util.util import ParseEventFn
 22 | from util.util import ParsePlayEventFn
 23 | from util.util import ParseArgs
 24 | 
 25 | # Defines the BigQuery schemas.
 26 | SESSION_SCHEMA = ('window_start:TIMESTAMP,' 'mean_duration:FLOAT')
 27 | 
 28 | 
 29 | class ComputeLatency(beam.DoFn):
 30 |     def __init__(self):
 31 |         super(ComputeLatency, self).__init__()
 32 |         self.dropped_sessions_no_events = Metrics.counter(
 33 |             self.__class__, 'dropped_sessions_no_events')
 34 |         self.dropped_sessions_too_many_events = Metrics.counter(
 35 |             self.__class__, 'dropped_sessions_too_many_events')
 36 |         self.dropped_sessions_no_play_events = Metrics.counter(
 37 |             self.__class__, 'dropped_sessions_no_play_events')
 38 | 
 39 |     def process(self, elem):
 40 |         _, vals = elem
 41 |         plays = vals['plays']
 42 |         events = vals['events']
 43 | 
 44 |         play_count = 0
 45 |         max_play_ts = 0
 46 |         for play in plays:
 47 |             play_count += 1
 48 |             max_play_ts = max(max_play_ts, long(play.timestamp))
 49 | 
 50 |         event_count = 0
 51 |         an_event = None
 52 |         for event in events:
 53 |             an_event = event
 54 |             event_count += 1
 55 | 
 56 |         if event_count == 0:
 57 |             self.dropped_sessions_no_events.inc()
 58 |         elif event_count > 1:
 59 |             self.dropped_sessions_too_many_events.inc()
 60 |         elif play_count == 0:
 61 |             self.dropped_sessions_no_play_events.inc()
 62 |         else:
 63 |             min_latency = long(an_event.timestamp) - max_play_ts
 64 |             yield (an_event.user, min_latency)
 65 | 
 66 | 
 67 | class DetectBadUsers(beam.DoFn):
 68 |     def process(self, elem, mean_latency=beam.DoFn.SideInputParam):
 69 |         user, latency = elem
 70 |         # Naive: compute bad users are users 5 times less than
 71 |         # the mean.
 72 |         if latency < mean / 5:
 73 |             yield user
 74 | 
 75 | 
 76 | def Run(argv=None):
 77 |     known_args, pipeline_args = ParseArgs(argv)
 78 |     pipeline_options = PipelineOptions(pipeline_args)
 79 |     pipeline_options.view_as(SetupOptions).save_main_session = True
 80 |     p = beam.Pipeline(options=pipeline_options)
 81 |     if known_args.topic:
 82 |         pipeline_options.view_as(StandardOptions).streaming = True
 83 | 
 84 |     project = pipeline_options.view_as(GoogleCloudOptions).project
 85 |     timestamp_attribute = 'timestamp_ms'
 86 |     events = None
 87 |     if (not known_args.topic or not known_args.play_topic):
 88 |         logging.fatal('topic and play_topic are required.')
 89 | 
 90 |     events = (p
 91 |             | 'read_events' >> ReadFromPubSub(
 92 |                 topic=known_args.topic,
 93 |                 timestamp_attribute='timestamp_ms')
 94 |             | 'parse_events' >> beam.ParDo(ParseEventFn())
 95 |             )
 96 | 
 97 |     play_events = (p
 98 |             | 'read_play_events' >> ReadFromPubSub(
 99 |                 topic=known_args.play_topic,
100 |                 timestamp_attribute='timestamp_ms')
101 |             | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn())
102 |             )
103 | 
104 |     sessionized_events = (events
105 |             | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
106 |             | 'sessionize_events' >> beam.WindowInto(
107 |                 window.Sessions(float(known_args.session_gap))))
108 | 
109 |     sessionized_plays = (play_events
110 |             | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
111 |             | 'sessionize_plays' >> beam.WindowInto(
112 |                 window.Sessions(float(known_args.session_gap))))
113 | 
114 |     per_user_latency = (
115 |             {'plays': sessionized_plays, 'events': sessionized_events}
116 |             | 'cbk' >> beam.CoGroupByKey()
117 |             | 'compute_latency' >> beam.ParDo(ComputeLatency()))
118 | 
119 |     mean_latency = (per_user_latency
120 |             | 'extract_latencies' >> beam.Values()
121 |             | 'global_window' >> beam.WindowInto(
122 |                 window.GlobalWindows(),
123 |                 trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
124 |                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
125 |             | 'compute_mean' >> beam.CombineGlobally(
126 |                 beam.combiners.MeanCombineFn()).with_fanout(16).as_singleton_view()
127 |             )
128 | 
129 |     _ = (per_user_latency
130 |             | 'detect_bad_users' >> beam.ParDo(
131 |                 DetectBadUsers(), mean_latency=mean_latency)
132 |             | 'filter_duplicates' >> beam.WindowInto(
133 |                 window.GlobalWindows(), trigger=trigger.AfterCount(1),
134 |                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
135 |             | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
136 |             | 'write_bad_users' >> beam.io.WriteToBigQuery(
137 |                 known_args.output_tablename, known_args.output_dataset, project, ('user:string'))
138 |             )
139 | 
140 |     p.run().wait_until_finish()
141 | 
142 | 
143 | if __name__ == '__main__':
144 |     logging.getLogger().setLevel(logging.INFO)
145 |     Run()
146 | 


--------------------------------------------------------------------------------
/py/util/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['util']
2 | 


--------------------------------------------------------------------------------
/py/util/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/util/__init__.pyc


--------------------------------------------------------------------------------
/py/util/util.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | 
  3 | import argparse
  4 | import collections
  5 | import logging
  6 | 
  7 | import apache_beam as beam
  8 | from apache_beam.metrics.metric import Metrics
  9 | 
 10 | GameEvent = collections.namedtuple(
 11 |     'GameEvent', ['user', 'team', 'score', 'timestamp', 'event_id'])
 12 | PlayEvent = collections.namedtuple('PlayEvent',
 13 |                                    ['user', 'timestamp', 'event_id'])
 14 | 
 15 | 
 16 | class ParseEventFn(beam.DoFn):
 17 |     """Parses an event.
 18 |     [user,team,score,timestamp,readable_timestamp,event_id]
 19 |     """
 20 |     def __init__(self):
 21 |         super(ParseEventFn, self).__init__()
 22 |         self.num_parse_errors = Metrics.counter(self.__class__,
 23 |                                                 'num_event_parse_errors')
 24 | 
 25 |     def process(self, elem):
 26 |         try:
 27 |             parts = [x.strip() for x in elem.split(',')]
 28 |             user, team, score, timestamp = parts[:4]
 29 |             score = int(score)
 30 |             timestamp = long(timestamp)
 31 |             if len(parts) >= 6:
 32 |                 event_id = parts[5]
 33 |             else:
 34 |                 event_id = 'none'
 35 |             yield GameEvent(user, team, score, timestamp, event_id)
 36 |         except Exception as e:
 37 |             self.num_parse_errors.inc()
 38 |             logging.error('Parse error on "%s": %s', elem, str(e))
 39 | 
 40 | 
 41 | class ParsePlayEventFn(beam.DoFn):
 42 |     """Parses a play event: [user,timestamp,readable_timestamp,event_id]"""
 43 |     def __init__(self):
 44 |         super(ParsePlayEventFn, self).__init__()
 45 |         self.num_parse_errors = Metrics.counter(self.__class__,
 46 |                                                 'num_play_parse_errors')
 47 | 
 48 |     def process(self, elem):
 49 |         try:
 50 |             parts = [x.strip() for x in elem.split(',')]
 51 |             user, timestamp, _, event_id = parts[:5]
 52 |             yield PlayEvent(user, timestamp, event_id)
 53 |         except Exception as e:
 54 |             self.num_parse_errors.inc()
 55 |             logging.error('Parse error on "%s": %s', elem, str(e))
 56 | 
 57 | 
 58 | def ParseEvent(element):
 59 |     try:
 60 |         parts = [x.strip() for x in element.split(',')]
 61 |         user, team, score, timestamp = parts[:4]
 62 |         score = int(score)
 63 |         timestamp = long(timestamp)
 64 |         if len(parts) >= 6:
 65 |             event_id = parts[5]
 66 |         else:
 67 |             event_id = 'none'
 68 |         return [GameEvent(user, team, score, timestamp, event_id)]
 69 |     except:
 70 |         return []
 71 | 
 72 | def ParseArgs(argv):
 73 |     parser = argparse.ArgumentParser()
 74 |     parser.add_argument('--input', dest='input', help='Input file to process.')
 75 |     parser.add_argument(
 76 |         '--topic', dest='topic', help='Input topic to read from.')
 77 |     parser.add_argument(
 78 |         '--play_topic',
 79 |         dest='play_topic',
 80 |         help='Input topic to read for play events.')
 81 |     parser.add_argument(
 82 |         '--output_dataset',
 83 |         dest='output_dataset',
 84 |         required=True,
 85 |         help='Output file to write results to.')
 86 |     parser.add_argument(
 87 |         '--output_tablename',
 88 |         dest='output_tablename',
 89 |         required=True,
 90 |         help='Output file to write results to.')
 91 |     parser.add_argument(
 92 |         '--session_gap',
 93 |         dest='session_gap',
 94 |         help='Gap between user sessions, in seconds.')
 95 |     parser.add_argument(
 96 |         '--user_activity_window',
 97 |         dest='user_activity_window',
 98 |         help=
 99 |         'Value of fixed window for finding mean of session duration, in second.'
100 |     )
101 |     return parser.parse_known_args(argv)
102 | 


--------------------------------------------------------------------------------
/py/util/util.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malo-denielou/DataflowSME/34d42a1b855cd10f7c12b3ffc1171288f72c90a8/py/util/util.pyc


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise0.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableFieldSchema;
 20 | import com.google.api.services.bigquery.model.TableReference;
 21 | import com.google.api.services.bigquery.model.TableRow;
 22 | import com.google.api.services.bigquery.model.TableSchema;
 23 | import java.util.ArrayList;
 24 | import java.util.List;
 25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 26 | import org.apache.beam.examples.complete.game.utils.Options;
 27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
 28 | import org.apache.beam.sdk.Pipeline;
 29 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 30 | import org.apache.beam.sdk.io.TextIO;
 31 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 34 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 35 | import org.apache.beam.sdk.transforms.DoFn;
 36 | import org.apache.beam.sdk.transforms.ParDo;
 37 | 
 38 | /**
 39 |  * Zeroth (no code changes necessary) in a series of exercises in a gaming domain.
 40 |  *
 41 |  * <p>This batch pipeline imports game events from CSV to BigQuery.
 42 |  *
 43 |  * <p>See README.md for details.
 44 |  */
 45 | public class Exercise0 {
 46 | 
 47 |   /**
 48 |    * Format a GameEvent to a BigQuery TableRow.
 49 |    */
 50 |   static class FormatGameEventFn extends DoFn<GameEvent, TableRow> {
 51 | 
 52 |     @ProcessElement
 53 |     public void processElement(ProcessContext c) {
 54 |       GameEvent event = c.element();
 55 |       TableRow row = new TableRow()
 56 |           .set("user", event.getUser())
 57 |           .set("team", event.getTeam())
 58 |           .set("score", event.getScore())
 59 |           .set("timestamp", event.getTimestamp() / 1000);
 60 |       c.output(row);
 61 |     }
 62 | 
 63 |     /**
 64 |      * Defines the BigQuery schema.
 65 |      */
 66 |     static TableSchema getSchema() {
 67 |       List<TableFieldSchema> fields = new ArrayList<>();
 68 |       fields.add(new TableFieldSchema().setName("user").setType("STRING"));
 69 |       fields.add(new TableFieldSchema().setName("team").setType("STRING"));
 70 |       fields.add(new TableFieldSchema().setName("score").setType("INTEGER"));
 71 |       fields.add(new TableFieldSchema().setName("timestamp").setType("TIMESTAMP"));
 72 |       return new TableSchema().setFields(fields);
 73 |     }
 74 |   }
 75 | 
 76 |   /**
 77 |    * Run a batch pipeline.
 78 |    */
 79 |   public static void main(String[] args) throws Exception {
 80 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
 81 |     Pipeline pipeline = Pipeline.create(options);
 82 | 
 83 |     TableReference tableRef = new TableReference();
 84 |     tableRef.setDatasetId(options.getOutputDataset());
 85 |     tableRef.setProjectId(options.as(GcpOptions.class).getProject());
 86 |     tableRef.setTableId(options.getOutputTableName());
 87 | 
 88 |     // Read events from a CSV file, parse them and write (import) them to BigQuery.
 89 |     pipeline
 90 |         .apply(TextIO.read().from(options.getInput()))
 91 |         .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
 92 |         .apply("FormatGameEvent", ParDo.of(new FormatGameEventFn()))
 93 |         .apply(
 94 |             BigQueryIO.writeTableRows().to(tableRef)
 95 |                 .withSchema(FormatGameEventFn.getSchema())
 96 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
 97 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
 98 | 
 99 |     pipeline.run();
100 |   }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise1.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableFieldSchema;
 20 | import com.google.api.services.bigquery.model.TableReference;
 21 | import com.google.api.services.bigquery.model.TableRow;
 22 | import com.google.api.services.bigquery.model.TableSchema;
 23 | import java.util.ArrayList;
 24 | import java.util.List;
 25 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
 26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 27 | import org.apache.beam.examples.complete.game.utils.Options;
 28 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
 29 | import org.apache.beam.sdk.Pipeline;
 30 | import org.apache.beam.sdk.PipelineResult;
 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 32 | import org.apache.beam.sdk.io.TextIO;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 36 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 37 | import org.apache.beam.sdk.transforms.DoFn;
 38 | import org.apache.beam.sdk.transforms.PTransform;
 39 | import org.apache.beam.sdk.transforms.ParDo;
 40 | import org.apache.beam.sdk.values.KV;
 41 | import org.apache.beam.sdk.values.PCollection;
 42 | 
 43 | /**
 44 |  * First in a series of coding exercises in a gaming domain.
 45 |  *
 46 |  * <p>This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data
 47 |  * and writes the sums to BigQuery.
 48 |  *
 49 |  * <p>See README.md for details.
 50 |  */
 51 | public class Exercise1 {
 52 | 
 53 |   /**
 54 |    * A transform to extract key/score information from GameEvent, and sum
 55 |    * the scores. The constructor arg determines whether 'team' or 'user' info is
 56 |    * extracted.
 57 |    */
 58 |   public static class ExtractAndSumScore
 59 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
 60 | 
 61 |     private final String field;
 62 | 
 63 |     public ExtractAndSumScore(String field) {
 64 |       this.field = field;
 65 |     }
 66 | 
 67 |     @Override
 68 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> gameEvents) {
 69 |       // [START EXERCISE 1]:
 70 |       // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/
 71 |       // Developer Docs: https://beam.apache.org/documentation/programming-guide/#transforms-pardo
 72 |       // Also: https://cloud.google.com/dataflow/model/par-do
 73 |       //
 74 |       // Fill in the code to:
 75 |       //   1. Extract a KV<String, Integer> from each GameEvent corresponding to the given
 76 |       //      field and the score.
 77 |       //   2. Compute the sum of the scores for each key.
 78 |       //   3. Run your pipeline on the Dataflow service.
 79 |       return gameEvents
 80 |           .apply(ParDo.of(new DoFn<GameEvent, KV<String, Integer>>(){
 81 |             @ProcessElement
 82 |             public void processElement(ProcessContext c) {
 83 |               // 1. Creates key-value pairs, using the KeyField as the key and
 84 |               // the score as the value. KV.of(key, value) creates a key-value pair.
 85 |               /* TODO: YOUR CODE GOES HERE */
 86 |             }
 87 |           })) 
 88 |           // 2. Sum is a family of PTransforms for computing the sum of elements in a PCollection.
 89 |           // Select the appropriate method to compute the sum over each key.
 90 |           .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
 91 |       // [END EXERCISE 1]:
 92 |     }
 93 |   }
 94 | 
 95 |   /**
 96 |    * Format a KV of user and their score to a BigQuery TableRow.
 97 |    */
 98 |   static class FormatUserScoreSumsFn extends DoFn<KV<String, Integer>, TableRow> {
 99 | 
100 |     @ProcessElement
101 |     public void processElement(ProcessContext c) {
102 |       TableRow row = new TableRow()
103 |           .set("user", c.element().getKey())
104 |           .set("total_score", c.element().getValue());
105 |       c.output(row);
106 |     }
107 | 
108 |     /**
109 |      * Defines the BigQuery schema.
110 |      */
111 |     static TableSchema getSchema() {
112 |       List<TableFieldSchema> fields = new ArrayList<>();
113 |       fields.add(new TableFieldSchema().setName("user").setType("STRING"));
114 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
115 |       return new TableSchema().setFields(fields);
116 |     }
117 |   }
118 | 
119 |   /**
120 |    * Run a batch pipeline.
121 |    */
122 |   public static void main(String[] args) throws Exception {
123 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
124 |     Pipeline pipeline = Pipeline.create(options);
125 | 
126 |     TableReference tableRef = new TableReference();
127 |     tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
128 |     tableRef.setProjectId(options.as(GcpOptions.class).getProject());
129 |     tableRef.setTableId(options.getOutputTableName());
130 | 
131 |     // Read events from a CSV file and parse them.
132 |     pipeline
133 |         .apply(TextIO.read().from(options.getInput()))
134 |         .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
135 |         // Extract and sum username/score pairs from the event data.
136 |         .apply("ExtractUserScore", new ExtractAndSumScore("user"))
137 |         // Write the results to BigQuery.
138 |         .apply("FormatUserScoreSums", ParDo.of(new FormatUserScoreSumsFn()))
139 |         .apply(
140 |             BigQueryIO.writeTableRows().to(tableRef)
141 |                 .withSchema(FormatUserScoreSumsFn.getSchema())
142 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
143 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
144 | 
145 |     PipelineResult result = pipeline.run();
146 |     result.waitUntilFinish();
147 |   }
148 | }
149 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise2.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableFieldSchema;
 20 | import com.google.api.services.bigquery.model.TableReference;
 21 | import com.google.api.services.bigquery.model.TableRow;
 22 | import com.google.api.services.bigquery.model.TableSchema;
 23 | import java.util.ArrayList;
 24 | import java.util.List;
 25 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
 26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 27 | import org.apache.beam.examples.complete.game.utils.Options;
 28 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
 29 | import org.apache.beam.sdk.Pipeline;
 30 | import org.apache.beam.sdk.PipelineResult;
 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 32 | import org.apache.beam.sdk.io.TextIO;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 36 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 37 | import org.apache.beam.sdk.transforms.DoFn;
 38 | import org.apache.beam.sdk.transforms.PTransform;
 39 | import org.apache.beam.sdk.transforms.ParDo;
 40 | import org.apache.beam.sdk.transforms.WithTimestamps;
 41 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 42 | import org.apache.beam.sdk.values.KV;
 43 | import org.apache.beam.sdk.values.PCollection;
 44 | import org.joda.time.Duration;
 45 | import org.joda.time.Instant;
 46 | 
 47 | /**
 48 |  * Second in a series of coding exercises in a gaming domain.
 49 |  *
 50 |  * <p>This batch pipeline calculates the sum of scores per team per hour, over an entire batch of
 51 |  * gaming data and writes the per-team sums to BigQuery.
 52 |  *
 53 |  * <p>See README.md for details.
 54 |  */
 55 | public class Exercise2 {
 56 | 
 57 |   /**
 58 |    * A transform to compute the WindowedTeamScore.
 59 |    */
 60 |   public static class WindowedTeamScore
 61 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
 62 |     // Developer Docs for composite transforms:
 63 |     //   https://beam.apache.org/documentation/programming-guide/#transforms-composite
 64 | 
 65 |     private Duration duration;
 66 | 
 67 |     public WindowedTeamScore(Duration duration) {
 68 |       this.duration = duration;
 69 |     }
 70 | 
 71 |     @Override
 72 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
 73 |       // [START EXERCISE 2]:
 74 |       // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/
 75 |       // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
 76 |       // Also: https://cloud.google.com/dataflow/model/windowing
 77 |       //
 78 |       return input
 79 |           // Window.into() takes a WindowFn and returns a PTransform that
 80 |           // applies windowing to the PCollection. FixedWindows.of() returns a
 81 |           // WindowFn that assigns elements to windows of a fixed size. Use
 82 |           // these methods to apply fixed windows of size
 83 |           // this.duration to the PCollection.
 84 |           .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
 85 |           // Remember the ExtractAndSumScore PTransform from Exercise 1? We
 86 |           // parameterized it over the key field. Use it here to compute the "team"
 87 |           // scores (recall it is a public static method of Exercise1).
 88 |           .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
 89 |       // [END EXERCISE 2]
 90 |     }
 91 |   }
 92 | 
 93 |   /**
 94 |    * Format a KV of team and their score to a BigQuery TableRow.
 95 |    */
 96 |   public static class FormatTeamScoreSumsFn extends DoFn<KV<String, Integer>, TableRow>{
 97 | 
 98 |     @ProcessElement
 99 |     public void processElement(ProcessContext c, IntervalWindow window) {
100 |       TableRow row =
101 |           new TableRow()
102 |               .set("team", c.element().getKey())
103 |               .set("total_score", c.element().getValue())
104 |               .set("window_start", window.start().getMillis() / 1000);
105 |       c.output(row);
106 |     }
107 | 
108 |     /**
109 |      * Defines the BigQuery schema.
110 |      */
111 |     public static TableSchema getSchema() {
112 |       List<TableFieldSchema> fields = new ArrayList<>();
113 |       fields.add(new TableFieldSchema().setName("team").setType("STRING"));
114 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
115 |       fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
116 |       return new TableSchema().setFields(fields);
117 |     }
118 |   }
119 | 
120 |   /**
121 |    * Run a batch pipeline.
122 |    */
123 |   public static void main(String[] args) throws Exception {
124 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
125 |     Pipeline pipeline = Pipeline.create(options);
126 | 
127 |     TableReference tableRef = new TableReference();
128 |     tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
129 |     tableRef.setProjectId(options.as(GcpOptions.class).getProject());
130 |     tableRef.setTableId(options.getOutputTableName());
131 | 
132 |     // Read events from a CSV file and parse them.
133 |     pipeline
134 |         .apply(TextIO.read().from(options.getInput()))
135 |         .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
136 |         .apply(
137 |             "AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())))
138 |         .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60)))
139 |         // Write the results to BigQuery.
140 |         .apply("FormatTeamScoreSums", ParDo.of(new FormatTeamScoreSumsFn()))
141 |         .apply(
142 |             BigQueryIO.writeTableRows().to(tableRef)
143 |                 .withSchema(FormatTeamScoreSumsFn.getSchema())
144 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
145 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
146 | 
147 |     PipelineResult result = pipeline.run();
148 |     result.waitUntilFinish();
149 |   }
150 | }
151 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise3.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableReference;
 20 | import org.apache.beam.sdk.Pipeline;
 21 | import org.apache.beam.sdk.PipelineResult;
 22 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 23 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 24 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 25 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 26 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 27 | import org.apache.beam.sdk.transforms.PTransform;
 28 | import org.apache.beam.sdk.transforms.ParDo;
 29 | import org.apache.beam.sdk.values.PBegin;
 30 | import org.apache.beam.sdk.values.PCollection;
 31 | import org.apache.beam.examples.complete.game.solutions.Exercise2;
 32 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
 33 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 34 | import org.apache.beam.examples.complete.game.utils.Options;
 35 | import org.joda.time.Duration;
 36 | 
 37 | /**
 38 |  * Third in a series of coding exercises in a gaming domain.
 39 |  *
 40 |  * <p>This is the same pipeline as in Exercise 2, but can run in either batch or streaming mode.
 41 |  *
 42 |  * <p>See README.md for details.
 43 |  */
 44 | public class Exercise3 {
 45 | 
 46 |   /**
 47 |    * A transform to read the game events from either text files or Pub/Sub topic.
 48 |    */
 49 |   public static class ReadGameEvents extends PTransform<PBegin, PCollection<GameEvent>> {
 50 | 
 51 |     private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
 52 | 
 53 |     private Options options;
 54 | 
 55 |     public ReadGameEvents(Options options) {
 56 |       this.options = options;
 57 |     }
 58 | 
 59 |     @Override
 60 |     public PCollection<GameEvent> expand(PBegin begin) {
 61 |       // [START EXERCISE 3]:
 62 |       // Javadoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.html
 63 |       // Developer Docs (1.x): https://cloud.google.com/dataflow/model/pubsub-io
 64 |       //
 65 |       // Determine whether to use files or topic based on options.
 66 |       if (options.getInput() != null && !options.getInput().isEmpty()) {
 67 |         return begin
 68 |             .getPipeline()
 69 |             // Read game events from files. See main() in Exercise2. Don't forget to parse events or
 70 |             // to include WithTimestamps transform to assign timestamps to events.
 71 |             // https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/WithTimestamps.html
 72 |             .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
 73 |             .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
 74 |             .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
 75 |       } else {
 76 |         return begin
 77 |             .getPipeline()
 78 |             // Read game events from Pub/Sub topic options.getTopic() using custom timestamps, which
 79 |             // are extracted from the pubsub attribute TIMESTAMP_ATTRIBUTE.
 80 |             // Use PubsubIO.readStrings() with withTimestampAttribute() and fromTopic().
 81 |             // https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.html
 82 |             .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
 83 |             // Parse the messages the same way as when they come from the text file. Note that we no
 84 |             // longer have to run WithTimestamps transform, as the timestamps are already set by
 85 |             // PubsubIO. (In streaming, changing timestamps must be done carefully to avoid
 86 |             // guarantees necessary for watermarks.)
 87 |             .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
 88 |       }
 89 |       // [END EXERCISE 3]
 90 |     }
 91 |   }
 92 | 
 93 |   /**
 94 |    * Run a batch or streaming pipeline.
 95 |    */
 96 |   public static void main(String[] args) throws Exception {
 97 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
 98 | 
 99 |     Pipeline pipeline = Pipeline.create(options);
100 | 
101 |     TableReference tableRef = new TableReference();
102 |     tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
103 |     tableRef.setProjectId(options.as(GcpOptions.class).getProject());
104 |     tableRef.setTableId(options.getOutputTableName());
105 | 
106 |     // Read events from either a CSV file or PubSub stream.
107 |     pipeline
108 |         .apply(new ReadGameEvents(options))
109 |         .apply("WindowedTeamScore", new Exercise2.WindowedTeamScore(Duration.standardMinutes(60)))
110 |         // Write the results to BigQuery.
111 |         .apply("FormatTeamScoreSums", ParDo.of(new Exercise2.FormatTeamScoreSumsFn()))
112 |         .apply(
113 |             BigQueryIO.writeTableRows().to(tableRef)
114 |                 .withSchema(Exercise2.FormatTeamScoreSumsFn.getSchema())
115 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
116 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
117 | 
118 |     PipelineResult result = pipeline.run();
119 |     result.waitUntilFinish();
120 |   }
121 | }
122 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise4.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableFieldSchema;
 20 | import com.google.api.services.bigquery.model.TableReference;
 21 | import com.google.api.services.bigquery.model.TableRow;
 22 | import com.google.api.services.bigquery.model.TableSchema;
 23 | import com.google.common.annotations.VisibleForTesting;
 24 | import java.util.ArrayList;
 25 | import java.util.List;
 26 | import org.apache.beam.examples.complete.game.solutions.Exercise1;
 27 | import org.apache.beam.examples.complete.game.solutions.Exercise3;
 28 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 29 | import org.apache.beam.examples.complete.game.utils.Options;
 30 | import org.apache.beam.runners.dataflow.DataflowRunner;
 31 | import org.apache.beam.sdk.Pipeline;
 32 | import org.apache.beam.sdk.PipelineResult;
 33 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 35 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 36 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 37 | import org.apache.beam.sdk.options.Default;
 38 | import org.apache.beam.sdk.options.Description;
 39 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 40 | import org.apache.beam.sdk.options.StreamingOptions;
 41 | import org.apache.beam.sdk.transforms.DoFn;
 42 | import org.apache.beam.sdk.transforms.PTransform;
 43 | import org.apache.beam.sdk.transforms.ParDo;
 44 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 45 | import org.apache.beam.sdk.values.KV;
 46 | import org.apache.beam.sdk.values.PCollection;
 47 | import org.joda.time.Duration;
 48 | import org.joda.time.Instant;
 49 | 
 50 | /**
 51 |  * Fourth in a series of coding exercises in a gaming domain.
 52 |  *
 53 |  * <p>This streaming pipeline calculates user and team scores for a window of time and writes them
 54 |  * to BigQuery.
 55 |  *
 56 |  * <p>See README.md for details.
 57 |  */
 58 | public class Exercise4 {
 59 | 
 60 |   static final Duration TEN_SECONDS = Duration.standardSeconds(10);
 61 |   static final Duration THIRTY_SECONDS = Duration.standardSeconds(30);
 62 | 
 63 |   /**
 64 |    * Exercise4Options supported by {@link Exercise4}.
 65 |    */
 66 |   interface Exercise4Options extends Options, StreamingOptions {
 67 | 
 68 |     @Description("Numeric value of fixed window duration for team analysis, in minutes")
 69 |     @Default.Integer(1)
 70 |     Integer getTeamWindowDuration();
 71 | 
 72 |     void setTeamWindowDuration(Integer value);
 73 | 
 74 |     @Description("Numeric value of allowed data lateness, in minutes")
 75 |     @Default.Integer(2)
 76 |     Integer getAllowedLateness();
 77 | 
 78 |     void setAllowedLateness(Integer value);
 79 |   }
 80 | 
 81 |   /**
 82 |    * Extract user/score pairs from the event stream using processing time, via global windowing. Get
 83 |    * periodic updates on all users' running scores.
 84 |    */
 85 |   @VisibleForTesting
 86 |   static class CalculateUserScores
 87 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
 88 | 
 89 |     private final Duration allowedLateness;
 90 | 
 91 |     CalculateUserScores(Duration allowedLateness) {
 92 |       this.allowedLateness = allowedLateness;
 93 |     }
 94 | 
 95 |     @Override
 96 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
 97 |       // [START EXERCISE 4 PART 1]:
 98 |       // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html
 99 |       // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
100 |       //
101 |       // Fill in the code to:
102 |       //   1. Window the incoming input into global windows
103 |       //   2. that trigger every thirty seconds to emit speculative results,
104 |       //   3. allow late data with allowedLateness,
105 |       //   3. and don't forget to accumulate over the entire window.
106 |       return input
107 |           /* TODO: SOLUTION CODE HERE */
108 |           // Extract and sum username/score pairs from the event data.
109 |           .apply("ExtractUserScore", new Exercise1.ExtractAndSumScore("user"));
110 |       // [END EXERCISE 4 PART 1]:
111 |     }
112 |   }
113 | 
114 |   /**
115 |    * Calculates scores for each team within the configured window duration.
116 |    */
117 |   // Extract team/score pairs from the event stream, using hour-long windows by default.
118 |   @VisibleForTesting
119 |   static class CalculateTeamScores
120 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
121 | 
122 |     private final Duration teamWindowDuration;
123 |     private final Duration allowedLateness;
124 | 
125 |     CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) {
126 |       this.teamWindowDuration = teamWindowDuration;
127 |       this.allowedLateness = allowedLateness;
128 |     }
129 | 
130 |     @Override
131 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> infos) {
132 |       // [START EXERCISE 4 PART 2]:
133 |       // JavaDoc: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html
134 |       // Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
135 |       //
136 |       // Fill in the code to:
137 |       //   1. Window the incoming input into fixed windows of team window duration,
138 |       //   2. trigger on time results at the watermark,
139 |       //   3. trigger speculative results every ten seconds,
140 |       //   4. trigger late data results with a delay of thirty seconds,
141 |       //   5. don't forget to set the allowedLateness,
142 |       //   6. and ensure that we continue to accumulate over all data in the window.
143 |       return infos
144 |           /* TODO: SOLUTION CODE HERE */
145 |           // Extract and sum teamname/score pairs from the event data.
146 |           .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"));
147 |       // [END EXERCISE 4 PART 2]:
148 |     }
149 |   }
150 | 
151 |   public static void main(String[] args) throws Exception {
152 |     Exercise4Options options =
153 |         PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise4Options.class);
154 |     // Enforce that this pipeline is always run in streaming mode.
155 |     options.setStreaming(true);
156 |     options.setRunner(DataflowRunner.class);
157 |     Pipeline pipeline = Pipeline.create(options);
158 | 
159 |     TableReference teamTable = new TableReference();
160 |     teamTable.setDatasetId(options.getOutputDataset());
161 |     teamTable.setProjectId(options.as(GcpOptions.class).getProject());
162 |     teamTable.setTableId(options.getOutputTableName() + "_team");
163 | 
164 |     TableReference userTable = new TableReference();
165 |     userTable.setDatasetId(options.getOutputDataset());
166 |     userTable.setProjectId(options.as(GcpOptions.class).getProject());
167 |     userTable.setTableId(options.getOutputTableName() + "_user");
168 | 
169 |     PCollection<GameEvent> gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));
170 | 
171 |     gameEvents
172 |         .apply(
173 |             "CalculateTeamScores",
174 |             new CalculateTeamScores(
175 |                 Duration.standardMinutes(options.getTeamWindowDuration()),
176 |                 Duration.standardMinutes(options.getAllowedLateness())))
177 |         // Write the results to BigQuery.
178 |         .apply("FormatTeamScores", ParDo.of(new FormatTeamScoreFn()))
179 |         .apply(
180 |             BigQueryIO.writeTableRows().to(teamTable)
181 |                 .withSchema(FormatTeamScoreFn.getSchema())
182 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
183 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
184 | 
185 |     gameEvents
186 |         .apply(
187 |             "CalculateUserScores",
188 |             new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
189 |         // Write the results to BigQuery.
190 |         .apply("FormatUserScores", ParDo.of(new FormatUserScoreFn()))
191 |         .apply(
192 |             BigQueryIO.writeTableRows().to(userTable)
193 |                 .withSchema(FormatUserScoreFn.getSchema())
194 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
195 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
196 | 
197 |     PipelineResult result = pipeline.run();
198 |     result.waitUntilFinish();
199 |   }
200 | 
201 |   /**
202 |    * Format a KV of team and associated properties to a BigQuery TableRow.
203 |    */
204 |   protected static class FormatTeamScoreFn extends DoFn<KV<String, Integer>, TableRow> {
205 | 
206 |     @ProcessElement
207 |     public void processElement(ProcessContext c, IntervalWindow window) {
208 |       TableRow row =
209 |           new TableRow()
210 |               .set("team", c.element().getKey())
211 |               .set("total_score", c.element().getValue())
212 |               .set("window_start", window.start().getMillis() / 1000)
213 |               .set("processing_time", Instant.now().getMillis() / 1000)
214 |               .set("timing", c.pane().getTiming().toString());
215 |       c.output(row);
216 |     }
217 | 
218 |     static TableSchema getSchema() {
219 |       List<TableFieldSchema> fields = new ArrayList<>();
220 |       fields.add(new TableFieldSchema().setName("team").setType("STRING"));
221 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
222 |       fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
223 |       fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
224 |       fields.add(new TableFieldSchema().setName("timing").setType("STRING"));
225 |       return new TableSchema().setFields(fields);
226 |     }
227 |   }
228 | 
229 |   /**
230 |    * Format a KV of user and associated properties to a BigQuery TableRow.
231 |    */
232 |   static class FormatUserScoreFn extends DoFn<KV<String, Integer>, TableRow> {
233 | 
234 |     @ProcessElement
235 |     public void processElement(ProcessContext c) {
236 |       TableRow row =
237 |           new TableRow()
238 |               .set("user", c.element().getKey())
239 |               .set("total_score", c.element().getValue())
240 |               .set("processing_time", Instant.now().getMillis() / 1000);
241 |       c.output(row);
242 |     }
243 | 
244 |     static TableSchema getSchema() {
245 |       List<TableFieldSchema> fields = new ArrayList<>();
246 |       fields.add(new TableFieldSchema().setName("user").setType("STRING"));
247 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
248 |       fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
249 |       return new TableSchema().setFields(fields);
250 |     }
251 |   }
252 | }
253 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/Exercise6.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | package org.apache.beam.examples.complete.game;
 17 | 
 18 | import com.google.api.services.bigquery.model.TableFieldSchema;
 19 | import com.google.api.services.bigquery.model.TableReference;
 20 | import com.google.api.services.bigquery.model.TableRow;
 21 | import com.google.api.services.bigquery.model.TableSchema;
 22 | import java.util.ArrayList;
 23 | import java.util.List;
 24 | import org.apache.beam.examples.complete.game.solutions.Exercise3.ReadGameEvents;
 25 | import org.apache.beam.examples.complete.game.utils.ChangeMe;
 26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 27 | import org.apache.beam.examples.complete.game.utils.Options;
 28 | import org.apache.beam.runners.dataflow.DataflowRunner;
 29 | import org.apache.beam.sdk.Pipeline;
 30 | import org.apache.beam.sdk.PipelineResult;
 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 35 | import org.apache.beam.sdk.options.Default;
 36 | import org.apache.beam.sdk.options.Description;
 37 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 38 | import org.apache.beam.sdk.options.StreamingOptions;
 39 | import org.apache.beam.sdk.transforms.Combine;
 40 | import org.apache.beam.sdk.transforms.DoFn;
 41 | import org.apache.beam.sdk.transforms.MapElements;
 42 | import org.apache.beam.sdk.transforms.Mean;
 43 | import org.apache.beam.sdk.transforms.ParDo;
 44 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
 45 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 46 | import org.apache.beam.sdk.values.KV;
 47 | import org.apache.beam.sdk.values.PCollection;
 48 | import org.apache.beam.sdk.values.TypeDescriptors;
 49 | import org.joda.time.Duration;
 50 | import org.slf4j.Logger;
 51 | import org.slf4j.LoggerFactory;
 52 | 
 53 | /**
 54 |  * Sixth in a series of coding exercises in a gaming domain.
 55 |  *
 56 |  * <p>This exercise introduces session windows.
 57 |  *
 58 |  * <p>See README.md for details.
 59 |  */
 60 | public class Exercise6 {
 61 | 
 62 |   private static final Logger LOG = LoggerFactory.getLogger(Exercise6.class);
 63 | 
 64 |   /**
 65 |    * Calculate and output an element's session duration.
 66 |    */
 67 |   private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer> {
 68 | 
 69 |     @ProcessElement
 70 |     public void processElement(ProcessContext c, BoundedWindow window) {
 71 |       IntervalWindow w = (IntervalWindow) window;
 72 |       int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes();
 73 |       c.output(duration);
 74 |     }
 75 |   }
 76 | 
 77 |   /**
 78 |    * Options supported by {@link Exercise6}.
 79 |    */
 80 |   interface Exercise6Options extends Options, StreamingOptions {
 81 | 
 82 |     @Description("Numeric value of gap between user sessions, in minutes")
 83 |     @Default.Integer(1)
 84 |     Integer getSessionGap();
 85 | 
 86 |     void setSessionGap(Integer value);
 87 | 
 88 |     @Description(
 89 |         "Numeric value of fixed window for finding mean of user session duration, " + "in minutes")
 90 |     @Default.Integer(5)
 91 |     Integer getUserActivityWindowDuration();
 92 | 
 93 |     void setUserActivityWindowDuration(Integer value);
 94 |   }
 95 | 
 96 |   public static void main(String[] args) throws Exception {
 97 | 
 98 |     Exercise6Options options =
 99 |         PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise6Options.class);
100 |     // Enforce that this pipeline is always run in streaming mode.
101 |     options.setStreaming(true);
102 |     options.setRunner(DataflowRunner.class);
103 |     Pipeline pipeline = Pipeline.create(options);
104 | 
105 |     TableReference sessionsTable = new TableReference();
106 |     sessionsTable.setDatasetId(options.getOutputDataset());
107 |     sessionsTable.setProjectId(options.as(GcpOptions.class).getProject());
108 |     sessionsTable.setTableId(options.getOutputTableName());
109 | 
110 |     PCollection<GameEvent> rawEvents = pipeline.apply(new ReadGameEvents(options));
111 | 
112 |     // Extract username/score pairs from the event stream
113 |     PCollection<KV<String, Integer>> userEvents =
114 |         rawEvents.apply(
115 |             "ExtractUserScore",
116 |             MapElements
117 |                 .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
118 |                 .via((GameEvent gInfo) -> KV.<String, Integer>of(gInfo.getUser(),
119 |                     gInfo.getScore())));
120 | 
121 |     // [START EXERCISE 6]:
122 |     // Detect user sessions-- that is, a burst of activity separated by a gap from further
123 |     // activity. Find and record the mean session lengths.
124 |     // This information could help the game designers track the changing user engagement
125 |     // as their set of games changes.
126 |     userEvents
127 |         // Window the user events into sessions with gap options.getSessionGap() minutes. Make sure
128 |         // to use an outputTimeFn that sets the output timestamp to the end of the window. This will
129 |         // allow us to compute means on sessions based on their end times, rather than their start
130 |         // times.
131 |         // JavaDoc:
132 |         //   - https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Sessions.html
133 |         //   - https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/windowing/Window.html
134 |         // Note: Pay attention to the withTimestampCombiner method on Window.
135 |         .apply("WindowIntoSessions",
136 |             /* TODO: YOUR CODE GOES HERE */
137 |             new ChangeMe<PCollection<KV<String, Integer>>, KV<String, Integer>>())
138 |         // For this use, we care only about the existence of the session, not any particular
139 |         // information aggregated over it, so the following is an efficient way to do that.
140 |         .apply(Combine.perKey(x -> 0))
141 |         // Get the duration per session.
142 |         .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
143 |         // Note that the output of the previous transform is a PCollection of session durations
144 |         // (PCollection<Integer>) where the timestamp of elements is the end of the window.
145 |         //
146 |         // Re-window to process groups of session sums according to when the sessions complete.
147 |         // In streaming we don't just ask "what is the mean value" we must ask "what is the mean
148 |         // value for some window of time". To compute periodic means of session durations, we
149 |         // re-window the session durations.
150 |         .apply("WindowToExtractSessionMean",
151 |             /* TODO: YOUR CODE GOES HERE */
152 |             new ChangeMe<PCollection<Integer>, Integer>())
153 |         // Find the mean session duration in each window.
154 |         .apply(Mean.<Integer>globally().withoutDefaults())
155 |         // Write this info to a BigQuery table.
156 |         .apply("FormatSessions", ParDo.of(new FormatSessionWindowFn()))
157 |         .apply(
158 |             BigQueryIO.writeTableRows().to(sessionsTable)
159 |                 .withSchema(FormatSessionWindowFn.getSchema())
160 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
161 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
162 |     // [END EXERCISE 6]:
163 | 
164 |     PipelineResult result = pipeline.run();
165 |     result.waitUntilFinish();
166 |   }
167 | 
168 |   /**
169 |    * Format a KV of session and associated properties to a BigQuery TableRow.
170 |    */
171 |   static class FormatSessionWindowFn extends DoFn<Double, TableRow> {
172 | 
173 |     @ProcessElement
174 |     public void processElement(ProcessContext c, BoundedWindow window) {
175 |       IntervalWindow w = (IntervalWindow) window;
176 |       TableRow row =
177 |           new TableRow()
178 |               .set("window_start", w.start().getMillis() / 1000)
179 |               .set("mean_duration", c.element());
180 |       c.output(row);
181 |     }
182 | 
183 |     static TableSchema getSchema() {
184 |       List<TableFieldSchema> fields = new ArrayList<>();
185 |       fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
186 |       fields.add(new TableFieldSchema().setName("mean_duration").setType("FLOAT"));
187 |       return new TableSchema().setFields(fields);
188 |     }
189 |   }
190 | }
191 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/injector/InjectorUtils.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2015 Google Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package org.apache.beam.examples.complete.game.injector;
18 | 
19 | import static com.google.common.base.Preconditions.checkNotNull;
20 | 
21 | import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
22 | import com.google.api.client.googleapis.json.GoogleJsonResponseException;
23 | import com.google.api.client.googleapis.util.Utils;
24 | import com.google.api.client.http.HttpRequestInitializer;
25 | import com.google.api.client.http.HttpStatusCodes;
26 | import com.google.api.client.http.HttpTransport;
27 | import com.google.api.client.json.JsonFactory;
28 | import com.google.api.services.pubsub.Pubsub;
29 | import com.google.api.services.pubsub.PubsubScopes;
30 | import com.google.api.services.pubsub.model.Topic;
31 | import java.io.IOException;
32 | 
33 | class InjectorUtils {
34 | 
35 |   private static final String APP_NAME = "injector";
36 | 
37 |   /**
38 |    * Builds a new Pubsub client and returns it.
39 |    */
40 |   public static Pubsub getClient(final HttpTransport httpTransport, final JsonFactory jsonFactory)
41 |       throws IOException {
42 |     checkNotNull(httpTransport);
43 |     checkNotNull(jsonFactory);
44 |     GoogleCredential credential =
45 |         GoogleCredential.getApplicationDefault(httpTransport, jsonFactory);
46 |     if (credential.createScopedRequired()) {
47 |       credential = credential.createScoped(PubsubScopes.all());
48 |     }
49 |     if (credential.getClientAuthentication() != null) {
50 |       System.out.println(
51 |           "\n***Warning! You are not using service account credentials to "
52 |               + "authenticate.\nYou need to use service account credentials for this example,"
53 |               + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run "
54 |               + "out of PubSub quota very quickly.\nSee "
55 |               + "https://developers.google.com/identity/protocols/application-default-credentials.");
56 |       System.exit(1);
57 |     }
58 |     HttpRequestInitializer initializer = new RetryHttpInitializerWrapper(credential);
59 |     return new Pubsub.Builder(httpTransport, jsonFactory, initializer)
60 |         .setApplicationName(APP_NAME)
61 |         .build();
62 |   }
63 | 
64 |   /**
65 |    * Builds a new Pubsub client with default HttpTransport and JsonFactory and returns it.
66 |    */
67 |   public static Pubsub getClient() throws IOException {
68 |     return getClient(Utils.getDefaultTransport(), Utils.getDefaultJsonFactory());
69 |   }
70 | 
71 |   /**
72 |    * Returns the fully qualified topic name for Pub/Sub.
73 |    */
74 |   public static String getFullyQualifiedTopicName(final String project, final String topic) {
75 |     return String.format("projects/%s/topics/%s", project, topic);
76 |   }
77 | 
78 |   /**
79 |    * Create a topic if it doesn't exist.
80 |    */
81 |   public static void createTopic(Pubsub client, String fullTopicName) throws IOException {
82 |     try {
83 |       client.projects().topics().get(fullTopicName).execute();
84 |     } catch (GoogleJsonResponseException e) {
85 |       if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) {
86 |         Topic topic = client.projects().topics().create(fullTopicName, new Topic()).execute();
87 |         System.out.printf("Topic %s was created.\n", topic.getName());
88 |       }
89 |     }
90 |   }
91 | }
92 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/injector/RetryHttpInitializerWrapper.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
  5 |  * in compliance with the License. You may obtain a copy of the License at
  6 |  *
  7 |  * http://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
 10 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 11 |  * or implied. See the License for the specific language governing permissions and limitations under
 12 |  * the License.
 13 |  */
 14 | 
 15 | package org.apache.beam.examples.complete.game.injector;
 16 | 
 17 | import static com.google.common.base.Preconditions.checkNotNull;
 18 | 
 19 | import com.google.api.client.auth.oauth2.Credential;
 20 | import com.google.api.client.http.HttpBackOffIOExceptionHandler;
 21 | import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler;
 22 | import com.google.api.client.http.HttpRequest;
 23 | import com.google.api.client.http.HttpRequestInitializer;
 24 | import com.google.api.client.http.HttpResponse;
 25 | import com.google.api.client.http.HttpUnsuccessfulResponseHandler;
 26 | import com.google.api.client.util.ExponentialBackOff;
 27 | import com.google.api.client.util.Sleeper;
 28 | import java.io.IOException;
 29 | import java.util.logging.Logger;
 30 | 
 31 | /**
 32 |  * RetryHttpInitializerWrapper will automatically retry upon RPC failures, preserving the
 33 |  * auto-refresh behavior of the Google Credentials.
 34 |  */
 35 | public class RetryHttpInitializerWrapper implements HttpRequestInitializer {
 36 | 
 37 |   /**
 38 |    * A private logger.
 39 |    */
 40 |   private static final Logger LOG = Logger.getLogger(RetryHttpInitializerWrapper.class.getName());
 41 | 
 42 |   /**
 43 |    * One minutes in miliseconds.
 44 |    */
 45 |   private static final int ONEMINITUES = 60000;
 46 | 
 47 |   /**
 48 |    * Intercepts the request for filling in the "Authorization" header field, as well as recovering
 49 |    * from certain unsuccessful error codes wherein the Credential must refresh its token for a
 50 |    * retry.
 51 |    */
 52 |   private final Credential wrappedCredential;
 53 | 
 54 |   /**
 55 |    * A sleeper; you can replace it with a mock in your test.
 56 |    */
 57 |   private final Sleeper sleeper;
 58 | 
 59 |   /**
 60 |    * A constructor.
 61 |    *
 62 |    * @param wrappedCredential Credential which will be wrapped and used for providing auth header.
 63 |    */
 64 |   public RetryHttpInitializerWrapper(final Credential wrappedCredential) {
 65 |     this(wrappedCredential, Sleeper.DEFAULT);
 66 |   }
 67 | 
 68 |   /**
 69 |    * A protected constructor only for testing.
 70 |    *
 71 |    * @param wrappedCredential Credential which will be wrapped and used for providing auth header.
 72 |    * @param sleeper Sleeper for easy testing.
 73 |    */
 74 |   RetryHttpInitializerWrapper(final Credential wrappedCredential, final Sleeper sleeper) {
 75 |     this.wrappedCredential = checkNotNull(wrappedCredential);
 76 |     this.sleeper = sleeper;
 77 |   }
 78 | 
 79 |   /**
 80 |    * Initializes the given request.
 81 |    */
 82 |   @Override
 83 |   public final void initialize(final HttpRequest request) {
 84 |     request.setReadTimeout(2 * ONEMINITUES); // 2 minutes read timeout
 85 |     final HttpUnsuccessfulResponseHandler backoffHandler =
 86 |         new HttpBackOffUnsuccessfulResponseHandler(new ExponentialBackOff()).setSleeper(sleeper);
 87 |     request.setInterceptor(wrappedCredential);
 88 |     request.setUnsuccessfulResponseHandler(
 89 |         new HttpUnsuccessfulResponseHandler() {
 90 |           @Override
 91 |           public boolean handleResponse(
 92 |               final HttpRequest request, final HttpResponse response, final boolean supportsRetry)
 93 |               throws IOException {
 94 |             if (wrappedCredential.handleResponse(request, response, supportsRetry)) {
 95 |               // If credential decides it can handle it,
 96 |               // the return code or message indicated
 97 |               // something specific to authentication,
 98 |               // and no backoff is desired.
 99 |               return true;
100 |             } else if (backoffHandler.handleResponse(request, response, supportsRetry)) {
101 |               // Otherwise, we defer to the judgement of
102 |               // our internal backoff handler.
103 |               LOG.info("Retrying " + request.getUrl().toString());
104 |               return true;
105 |             } else {
106 |               return false;
107 |             }
108 |           }
109 |         });
110 |     request.setIOExceptionHandler(
111 |         new HttpBackOffIOExceptionHandler(new ExponentialBackOff()).setSleeper(sleeper));
112 |   }
113 | }
114 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise1.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game.solutions;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableFieldSchema;
 20 | import com.google.api.services.bigquery.model.TableReference;
 21 | import com.google.api.services.bigquery.model.TableRow;
 22 | import com.google.api.services.bigquery.model.TableSchema;
 23 | import java.util.ArrayList;
 24 | import java.util.List;
 25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 26 | import org.apache.beam.examples.complete.game.utils.Options;
 27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
 28 | import org.apache.beam.sdk.Pipeline;
 29 | import org.apache.beam.sdk.PipelineResult;
 30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 31 | import org.apache.beam.sdk.io.TextIO;
 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 35 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 36 | import org.apache.beam.sdk.transforms.DoFn;
 37 | import org.apache.beam.sdk.transforms.MapElements;
 38 | import org.apache.beam.sdk.transforms.PTransform;
 39 | import org.apache.beam.sdk.transforms.ParDo;
 40 | import org.apache.beam.sdk.transforms.Sum;
 41 | import org.apache.beam.sdk.values.KV;
 42 | import org.apache.beam.sdk.values.PCollection;
 43 | import org.apache.beam.sdk.values.TypeDescriptors;
 44 | 
 45 | /**
 46 |  * First in a series of coding exercises in a gaming domain.
 47 |  *
 48 |  * <p>This batch pipeline calculates the sum of scores per user, over an entire batch of gaming data
 49 |  * and writes the sums to BigQuery.
 50 |  *
 51 |  * <p>See README.md for details.
 52 |  */
 53 | public class Exercise1 {
 54 | 
 55 |   /**
 56 |    * A transform to extract key/score information from GameEvent, and sum
 57 |    * the scores. The constructor arg determines whether 'team' or 'user' info is
 58 |    * extracted.
 59 |    */
 60 |   public static class ExtractAndSumScore
 61 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
 62 | 
 63 |     private final String field;
 64 | 
 65 |     public ExtractAndSumScore(String field) {
 66 |       this.field = field;
 67 |     }
 68 | 
 69 |     @Override
 70 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> gameEvents) {
 71 |       return gameEvents
 72 |           .apply(ParDo.of(new DoFn<GameEvent, KV<String, Integer>>(){
 73 |             @ProcessElement
 74 |             public void processElement(ProcessContext c) {
 75 |               GameEvent event = c.element();
 76 |               c.output(KV.of(event.getKey(field), event.getScore()));
 77 |             }
 78 |           }))
 79 |           /*
 80 |            // alternate implementation
 81 |           .apply(MapElements
 82 |               .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
 83 |               .via((GameEvent event) -> KV.<String, Integer>of(event.getKey(field),
 84 |                   event.getScore()))) */
 85 |           .apply(Sum.<String>integersPerKey());
 86 |     }
 87 |   }
 88 | 
 89 |   /**
 90 |    * Format a KV of user and their score to a BigQuery TableRow.
 91 |    */
 92 |   static class FormatUserScoreSumsFn extends DoFn<KV<String, Integer>, TableRow> {
 93 | 
 94 |     @ProcessElement
 95 |     public void processElement(ProcessContext c) {
 96 |       TableRow row = new TableRow()
 97 |           .set("user", c.element().getKey())
 98 |           .set("total_score", c.element().getValue());
 99 |       c.output(row);
100 |     }
101 | 
102 |     /**
103 |      * Defines the BigQuery schema.
104 |      */
105 |     static TableSchema getSchema() {
106 |       List<TableFieldSchema> fields = new ArrayList<>();
107 |       fields.add(new TableFieldSchema().setName("user").setType("STRING"));
108 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
109 |       return new TableSchema().setFields(fields);
110 |     }
111 |   }
112 | 
113 |   /**
114 |    * Run a batch pipeline.
115 |    */
116 |   public static void main(String[] args) throws Exception {
117 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
118 |     Pipeline pipeline = Pipeline.create(options);
119 | 
120 |     TableReference tableRef = new TableReference();
121 |     tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
122 |     tableRef.setProjectId(options.as(GcpOptions.class).getProject());
123 |     tableRef.setTableId(options.getOutputTableName());
124 | 
125 |     // Read events from a CSV file and parse them.
126 |     pipeline
127 |         .apply(TextIO.read().from(options.getInput()))
128 |         .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
129 |         // Extract and sum username/score pairs from the event data.
130 |         .apply("ExtractUserScore", new ExtractAndSumScore("user"))
131 |         // Write the results to BigQuery.
132 |         .apply("FormatUserScoreSums", ParDo.of(new FormatUserScoreSumsFn()))
133 |         .apply(
134 |             BigQueryIO.writeTableRows().to(tableRef)
135 |                 .withSchema(FormatUserScoreSumsFn.getSchema())
136 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
137 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
138 | 
139 |     PipelineResult result = pipeline.run();
140 |     result.waitUntilFinish();
141 |   }
142 | }
143 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise2.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game.solutions;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableFieldSchema;
 20 | import com.google.api.services.bigquery.model.TableReference;
 21 | import com.google.api.services.bigquery.model.TableRow;
 22 | import com.google.api.services.bigquery.model.TableSchema;
 23 | import java.util.ArrayList;
 24 | import java.util.List;
 25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 26 | import org.apache.beam.examples.complete.game.utils.Options;
 27 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
 28 | import org.apache.beam.sdk.Pipeline;
 29 | import org.apache.beam.sdk.PipelineResult;
 30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 31 | import org.apache.beam.sdk.io.TextIO;
 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 35 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 36 | import org.apache.beam.sdk.transforms.DoFn;
 37 | import org.apache.beam.sdk.transforms.PTransform;
 38 | import org.apache.beam.sdk.transforms.ParDo;
 39 | import org.apache.beam.sdk.transforms.WithTimestamps;
 40 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
 41 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 42 | import org.apache.beam.sdk.transforms.windowing.Window;
 43 | import org.apache.beam.sdk.values.KV;
 44 | import org.apache.beam.sdk.values.PCollection;
 45 | import org.joda.time.Duration;
 46 | import org.joda.time.Instant;
 47 | 
 48 | /**
 49 |  * Second in a series of coding exercises in a gaming domain.
 50 |  *
 51 |  * <p>This batch pipeline calculates the sum of scores per team per hour, over an entire batch of
 52 |  * gaming data and writes the per-team sums to BigQuery.
 53 |  *
 54 |  * <p>See README.md for details.
 55 |  */
 56 | public class Exercise2 {
 57 | 
 58 |   /**
 59 |    * A transform to compute the WindowedTeamScore.
 60 |    */
 61 |   public static class WindowedTeamScore
 62 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
 63 | 
 64 |     private Duration duration;
 65 | 
 66 |     public WindowedTeamScore(Duration duration) {
 67 |       this.duration = duration;
 68 |     }
 69 | 
 70 |     @Override
 71 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
 72 |       return input
 73 |           .apply(Window.into(FixedWindows.of(duration)))
 74 |           .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"));
 75 |     }
 76 |   }
 77 | 
 78 |   /**
 79 |    * Format a KV of team and their score to a BigQuery TableRow.
 80 |    */
 81 |   public static class FormatTeamScoreSumsFn extends DoFn<KV<String, Integer>, TableRow> {
 82 | 
 83 |     @ProcessElement
 84 |     public void processElement(ProcessContext c, IntervalWindow window) {
 85 |       TableRow row =
 86 |           new TableRow()
 87 |               .set("team", c.element().getKey())
 88 |               .set("total_score", c.element().getValue())
 89 |               .set("window_start", window.start().getMillis() / 1000);
 90 |       c.output(row);
 91 |     }
 92 | 
 93 |     /**
 94 |      * Defines the BigQuery schema.
 95 |      */
 96 |     public static TableSchema getSchema() {
 97 |       List<TableFieldSchema> fields = new ArrayList<>();
 98 |       fields.add(new TableFieldSchema().setName("team").setType("STRING"));
 99 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
100 |       fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
101 |       return new TableSchema().setFields(fields);
102 |     }
103 |   }
104 | 
105 |   /**
106 |    * Run a batch pipeline.
107 |    */
108 |   public static void main(String[] args) throws Exception {
109 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
110 |     Pipeline pipeline = Pipeline.create(options);
111 | 
112 |     TableReference tableRef = new TableReference();
113 |     tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
114 |     tableRef.setProjectId(options.as(GcpOptions.class).getProject());
115 |     tableRef.setTableId(options.getOutputTableName());
116 | 
117 |     // Read events from a CSV file and parse them.
118 |     pipeline
119 |         .apply(TextIO.read().from(options.getInput()))
120 |         .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
121 |         .apply(
122 |             "AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())))
123 |         .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60)))
124 |         // Write the results to BigQuery.
125 |         .apply("FormatTeamScoreSums", ParDo.of(new FormatTeamScoreSumsFn()))
126 |         .apply(
127 |             BigQueryIO.writeTableRows().to(tableRef)
128 |                 .withSchema(FormatTeamScoreSumsFn.getSchema())
129 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
130 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
131 | 
132 |     PipelineResult result = pipeline.run();
133 |     result.waitUntilFinish();
134 |   }
135 | }
136 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise3.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game.solutions;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableReference;
 20 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 21 | import org.apache.beam.examples.complete.game.utils.Options;
 22 | import org.apache.beam.examples.complete.game.utils.ParseEventFn;
 23 | import org.apache.beam.sdk.Pipeline;
 24 | import org.apache.beam.sdk.PipelineResult;
 25 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 26 | import org.apache.beam.sdk.io.TextIO;
 27 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 28 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 29 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 30 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
 31 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 32 | import org.apache.beam.sdk.transforms.PTransform;
 33 | import org.apache.beam.sdk.transforms.ParDo;
 34 | import org.apache.beam.sdk.transforms.WithTimestamps;
 35 | import org.apache.beam.sdk.values.PBegin;
 36 | import org.apache.beam.sdk.values.PCollection;
 37 | import org.joda.time.Duration;
 38 | import org.joda.time.Instant;
 39 | 
 40 | /**
 41 |  * Third in a series of coding exercises in a gaming domain.
 42 |  *
 43 |  * <p>This is the same pipeline as in Exercise 2, but can run in either batch or streaming mode.
 44 |  *
 45 |  * <p>See README.md for details.
 46 |  */
 47 | public class Exercise3 {
 48 | 
 49 |   /**
 50 |    * A transform to read the game events from either text files or Pub/Sub topic.
 51 |    */
 52 |   public static class ReadGameEvents extends PTransform<PBegin, PCollection<GameEvent>> {
 53 | 
 54 |     private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
 55 | 
 56 |     private Options options;
 57 | 
 58 |     public ReadGameEvents(Options options) {
 59 |       this.options = options;
 60 |     }
 61 | 
 62 |     @Override
 63 |     public PCollection<GameEvent> expand(PBegin begin) {
 64 |       if (options.getInput() != null && !options.getInput().isEmpty()) {
 65 |         return begin
 66 |             .getPipeline()
 67 |             .apply(TextIO.read().from(options.getInput()))
 68 |             .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
 69 |             .apply(
 70 |                 "AddEventTimestamps",
 71 |                 WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())));
 72 |       } else {
 73 |         return begin
 74 |             .getPipeline()
 75 |             .apply(PubsubIO.readStrings().withTimestampAttribute(TIMESTAMP_ATTRIBUTE)
 76 |                 .fromTopic(options.getTopic()))
 77 |             .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));
 78 |       }
 79 |     }
 80 |   }
 81 | 
 82 |   /**
 83 |    * Run a batch or streaming pipeline.
 84 |    */
 85 |   public static void main(String[] args) throws Exception {
 86 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
 87 | 
 88 |     Pipeline pipeline = Pipeline.create(options);
 89 | 
 90 |     TableReference tableRef = new TableReference();
 91 |     tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
 92 |     tableRef.setProjectId(options.as(GcpOptions.class).getProject());
 93 |     tableRef.setTableId(options.getOutputTableName());
 94 | 
 95 |     // Read events from either a CSV file or PubSub stream.
 96 |     pipeline
 97 |         .apply(new ReadGameEvents(options))
 98 |         .apply("WindowedTeamScore", new Exercise2.WindowedTeamScore(Duration.standardMinutes(5)))
 99 |         // Write the results to BigQuery.
100 |         .apply("FormatTeamScoreSums", ParDo.of(new Exercise2.FormatTeamScoreSumsFn()))
101 |         .apply(
102 |             BigQueryIO.writeTableRows().to(tableRef)
103 |                 .withSchema(Exercise2.FormatTeamScoreSumsFn.getSchema())
104 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
105 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
106 | 
107 |     PipelineResult result = pipeline.run();
108 |     result.waitUntilFinish();
109 |   }
110 | }
111 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise4.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package org.apache.beam.examples.complete.game.solutions;
 18 | 
 19 | import com.google.api.services.bigquery.model.TableFieldSchema;
 20 | import com.google.api.services.bigquery.model.TableReference;
 21 | import com.google.api.services.bigquery.model.TableRow;
 22 | import com.google.api.services.bigquery.model.TableSchema;
 23 | import com.google.common.annotations.VisibleForTesting;
 24 | import java.util.ArrayList;
 25 | import java.util.List;
 26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 27 | import org.apache.beam.examples.complete.game.utils.Options;
 28 | import org.apache.beam.runners.dataflow.DataflowRunner;
 29 | import org.apache.beam.sdk.Pipeline;
 30 | import org.apache.beam.sdk.PipelineResult;
 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 35 | import org.apache.beam.sdk.options.Default;
 36 | import org.apache.beam.sdk.options.Description;
 37 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 38 | import org.apache.beam.sdk.options.StreamingOptions;
 39 | import org.apache.beam.sdk.transforms.DoFn;
 40 | import org.apache.beam.sdk.transforms.PTransform;
 41 | import org.apache.beam.sdk.transforms.ParDo;
 42 | import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
 43 | import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
 44 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
 45 | import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
 46 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 47 | import org.apache.beam.sdk.transforms.windowing.Repeatedly;
 48 | import org.apache.beam.sdk.transforms.windowing.Window;
 49 | import org.apache.beam.sdk.values.KV;
 50 | import org.apache.beam.sdk.values.PCollection;
 51 | import org.joda.time.Duration;
 52 | import org.joda.time.Instant;
 53 | 
 54 | /**
 55 |  * Fourth in a series of coding exercises in a gaming domain.
 56 |  *
 57 |  * <p>This streaming pipeline calculates user and team scores for a window of time and writes them
 58 |  * to BigQuery.
 59 |  *
 60 |  * <p>See README.md for details.
 61 |  */
 62 | public class Exercise4 {
 63 | 
 64 |   static final Duration TEN_SECONDS = Duration.standardSeconds(10);
 65 |   static final Duration THIRTY_SECONDS = Duration.standardSeconds(30);
 66 | 
 67 |   /**
 68 |    * Exercise4Options supported by {@link Exercise4}.
 69 |    */
 70 |   interface Exercise4Options extends Options, StreamingOptions {
 71 | 
 72 |     @Description("Numeric value of fixed window duration for team analysis, in minutes")
 73 |     @Default.Integer(1)
 74 |     Integer getTeamWindowDuration();
 75 | 
 76 |     void setTeamWindowDuration(Integer value);
 77 | 
 78 |     @Description("Numeric value of allowed data lateness, in minutes")
 79 |     @Default.Integer(2)
 80 |     Integer getAllowedLateness();
 81 | 
 82 |     void setAllowedLateness(Integer value);
 83 |   }
 84 | 
 85 |   /**
 86 |    * Extract user/score pairs from the event stream using processing time, via global windowing. Get
 87 |    * periodic updates on all users' running scores.
 88 |    */
 89 |   @VisibleForTesting
 90 |   static class CalculateUserScores
 91 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
 92 | 
 93 |     private final Duration allowedLateness;
 94 | 
 95 |     CalculateUserScores(Duration allowedLateness) {
 96 |       this.allowedLateness = allowedLateness;
 97 |     }
 98 | 
 99 |     @Override
100 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> input) {
101 |       return input
102 |           .apply(
103 |               "LeaderboardUserGlobalWindow",
104 |               Window.<GameEvent>into(new GlobalWindows())
105 |                   // Get periodic results every 30 seconds.
106 |                   .triggering(
107 |                       Repeatedly.forever(
108 |                           AfterProcessingTime.pastFirstElementInPane().plusDelayOf(THIRTY_SECONDS)))
109 |                   .accumulatingFiredPanes()
110 |                   .withAllowedLateness(allowedLateness))
111 |           // Extract and sum username/score pairs from the event data.
112 |           .apply("ExtractUserScore", new Exercise1.ExtractAndSumScore("user"));
113 |     }
114 |   }
115 | 
116 |   /**
117 |    * Calculates scores for each team within the configured window duration.
118 |    */
119 |   // Extract team/score pairs from the event stream, using hour-long windows by default.
120 |   @VisibleForTesting
121 |   static class CalculateTeamScores
122 |       extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {
123 | 
124 |     private final Duration teamWindowDuration;
125 |     private final Duration allowedLateness;
126 | 
127 |     CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) {
128 |       this.teamWindowDuration = teamWindowDuration;
129 |       this.allowedLateness = allowedLateness;
130 |     }
131 | 
132 |     @Override
133 |     public PCollection<KV<String, Integer>> expand(PCollection<GameEvent> infos) {
134 |       return infos
135 |           .apply(
136 |               "LeaderboardTeamFixedWindows",
137 |               Window.<GameEvent>into(FixedWindows.of(teamWindowDuration))
138 |                   // We will get early (speculative) results as well as cumulative
139 |                   // processing of late data.
140 |                   .triggering(
141 |                       AfterWatermark.pastEndOfWindow()
142 |                           .withEarlyFirings(
143 |                               AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_SECONDS))
144 |                           .withLateFirings(
145 |                               AfterProcessingTime.pastFirstElementInPane()
146 |                                   .plusDelayOf(THIRTY_SECONDS)))
147 |                   .withAllowedLateness(allowedLateness)
148 |                   .accumulatingFiredPanes())
149 |           // Extract and sum teamname/score pairs from the event data.
150 |           .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"));
151 |     }
152 |   }
153 | 
154 |   public static void main(String[] args) throws Exception {
155 |     Exercise4Options options =
156 |         PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise4Options.class);
157 |     // Enforce that this pipeline is always run in streaming mode.
158 |     options.setStreaming(true);
159 |     // For example purposes, allow the pipeline to be easily cancelled instead of running
160 |     // continuously.
161 |     options.setRunner(DataflowRunner.class);
162 |     Pipeline pipeline = Pipeline.create(options);
163 | 
164 |     TableReference teamTable = new TableReference();
165 |     teamTable.setDatasetId(options.getOutputDataset());
166 |     teamTable.setProjectId(options.as(GcpOptions.class).getProject());
167 |     teamTable.setTableId(options.getOutputTableName() + "_team");
168 | 
169 |     TableReference userTable = new TableReference();
170 |     userTable.setDatasetId(options.getOutputDataset());
171 |     userTable.setProjectId(options.as(GcpOptions.class).getProject());
172 |     userTable.setTableId(options.getOutputTableName() + "_user");
173 | 
174 |     PCollection<GameEvent> gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));
175 | 
176 |     gameEvents
177 |         .apply(
178 |             "CalculateTeamScores",
179 |             new CalculateTeamScores(
180 |                 Duration.standardMinutes(options.getTeamWindowDuration()),
181 |                 Duration.standardMinutes(options.getAllowedLateness())))
182 |         // Write the results to BigQuery.
183 |         .apply("FormatTeamScores", ParDo.of(new FormatTeamScoreFn()))
184 |         .apply(
185 |             BigQueryIO.writeTableRows().to(teamTable)
186 |                 .withSchema(FormatTeamScoreFn.getSchema())
187 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
188 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
189 | 
190 |     gameEvents
191 |         .apply(
192 |             "CalculateUserScores",
193 |             new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
194 |         // Write the results to BigQuery.
195 |         .apply("FormatUserScores", ParDo.of(new FormatUserScoreFn()))
196 |         .apply(
197 |             BigQueryIO.writeTableRows().to(userTable)
198 |                 .withSchema(FormatUserScoreFn.getSchema())
199 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
200 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
201 | 
202 |     PipelineResult result = pipeline.run();
203 |     result.waitUntilFinish();
204 |   }
205 | 
206 |   /**
207 |    * Format a KV of team and associated properties to a BigQuery TableRow.
208 |    */
209 |   protected static class FormatTeamScoreFn extends DoFn<KV<String, Integer>, TableRow> {
210 | 
211 |     @ProcessElement
212 |     public void processElement(ProcessContext c, IntervalWindow window) {
213 |       TableRow row =
214 |           new TableRow()
215 |               .set("team", c.element().getKey())
216 |               .set("total_score", c.element().getValue())
217 |               .set("window_start", window.start().getMillis() / 1000)
218 |               .set("processing_time", Instant.now().getMillis() / 1000)
219 |               .set("timing", c.pane().getTiming().toString());
220 |       c.output(row);
221 |     }
222 | 
223 |     static TableSchema getSchema() {
224 |       List<TableFieldSchema> fields = new ArrayList<>();
225 |       fields.add(new TableFieldSchema().setName("team").setType("STRING"));
226 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
227 |       fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
228 |       fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
229 |       fields.add(new TableFieldSchema().setName("timing").setType("STRING"));
230 |       return new TableSchema().setFields(fields);
231 |     }
232 |   }
233 | 
234 |   /**
235 |    * Format a KV of user and associated properties to a BigQuery TableRow.
236 |    */
237 |   static class FormatUserScoreFn extends DoFn<KV<String, Integer>, TableRow> {
238 | 
239 |     @ProcessElement
240 |     public void processElement(ProcessContext c) {
241 |       TableRow row =
242 |           new TableRow()
243 |               .set("user", c.element().getKey())
244 |               .set("total_score", c.element().getValue())
245 |               .set("processing_time", Instant.now().getMillis() / 1000);
246 |       c.output(row);
247 |     }
248 | 
249 |     static TableSchema getSchema() {
250 |       List<TableFieldSchema> fields = new ArrayList<>();
251 |       fields.add(new TableFieldSchema().setName("user").setType("STRING"));
252 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
253 |       fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
254 |       return new TableSchema().setFields(fields);
255 |     }
256 |   }
257 | }
258 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise5.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | package org.apache.beam.examples.complete.game.solutions;
 17 | 
 18 | import com.google.api.services.bigquery.model.TableFieldSchema;
 19 | import com.google.api.services.bigquery.model.TableReference;
 20 | import com.google.api.services.bigquery.model.TableRow;
 21 | import com.google.api.services.bigquery.model.TableSchema;
 22 | import java.util.ArrayList;
 23 | import java.util.List;
 24 | import java.util.Map;
 25 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 26 | import org.apache.beam.examples.complete.game.utils.Options;
 27 | import org.apache.beam.runners.dataflow.DataflowRunner;
 28 | import org.apache.beam.sdk.Pipeline;
 29 | import org.apache.beam.sdk.PipelineResult;
 30 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 31 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 34 | import org.apache.beam.sdk.metrics.Counter;
 35 | import org.apache.beam.sdk.metrics.Metrics;
 36 | import org.apache.beam.sdk.options.Default;
 37 | import org.apache.beam.sdk.options.Description;
 38 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 39 | import org.apache.beam.sdk.options.StreamingOptions;
 40 | import org.apache.beam.sdk.transforms.DoFn;
 41 | import org.apache.beam.sdk.transforms.MapElements;
 42 | import org.apache.beam.sdk.transforms.Mean;
 43 | import org.apache.beam.sdk.transforms.PTransform;
 44 | import org.apache.beam.sdk.transforms.ParDo;
 45 | import org.apache.beam.sdk.transforms.Sum;
 46 | import org.apache.beam.sdk.transforms.Values;
 47 | import org.apache.beam.sdk.transforms.View;
 48 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
 49 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 50 | import org.apache.beam.sdk.transforms.windowing.Window;
 51 | import org.apache.beam.sdk.values.KV;
 52 | import org.apache.beam.sdk.values.PCollection;
 53 | import org.apache.beam.sdk.values.PCollectionView;
 54 | import org.apache.beam.sdk.values.TypeDescriptors;
 55 | import org.joda.time.Duration;
 56 | import org.joda.time.Instant;
 57 | import org.slf4j.Logger;
 58 | import org.slf4j.LoggerFactory;
 59 | 
 60 | /**
 61 |  * Fifth in a series of coding exercises in a gaming domain.
 62 |  *
 63 |  * <p>This exercise introduces side inputs.
 64 |  *
 65 |  * <p>See README.md for details.
 66 |  */
 67 | public class Exercise5 {
 68 | 
 69 |   private static final Logger LOG = LoggerFactory.getLogger(Exercise5.class);
 70 | 
 71 |   /**
 72 |    * Filter out all but those users with a high clickrate, which we will consider as 'spammy' users.
 73 |    * We do this by finding the mean total score per user, then using that information as a side
 74 |    * input to filter out all but those user scores that are > (mean * SCORE_WEIGHT)
 75 |    */
 76 |   public static class CalculateSpammyUsers
 77 |       extends PTransform<PCollection<KV<String, Integer>>, PCollection<KV<String, Integer>>> {
 78 | 
 79 |     private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class);
 80 |     private static final double SCORE_WEIGHT = 2.5;
 81 | 
 82 |     @Override
 83 |     public PCollection<KV<String, Integer>> expand(PCollection<KV<String, Integer>> userScores) {
 84 | 
 85 |       // Get the sum of scores for each user.
 86 |       PCollection<KV<String, Integer>> sumScores =
 87 |           userScores.apply("UserSum", Sum.<String>integersPerKey());
 88 | 
 89 |       // Extract the score from each element, and use it to find the global mean.
 90 |       final PCollectionView<Double> globalMeanScore =
 91 |           sumScores
 92 |               .apply(Values.<Integer>create())
 93 |               .apply(Mean.<Integer>globally().asSingletonView());
 94 | 
 95 |       // Filter the user sums using the global mean.
 96 |       PCollection<KV<String, Integer>> filtered =
 97 |           sumScores.apply("ProcessAndFilter",
 98 |               ParDo
 99 |                   // use the derived mean total score as a side input
100 |                   .of(
101 |                       new DoFn<KV<String, Integer>, KV<String, Integer>>() {
102 |                         private final Counter numSpammerUsers = Metrics
103 |                             .counter("main", "SpammerUsers");
104 | 
105 |                         @ProcessElement
106 |                         public void processElement(ProcessContext c) {
107 |                           Integer score = c.element().getValue();
108 |                           Double gmc = c.sideInput(globalMeanScore);
109 |                           if (score > (gmc * SCORE_WEIGHT)) {
110 |                             LOG.info(
111 |                                 "user "
112 |                                     + c.element().getKey()
113 |                                     + " spammer score "
114 |                                     + score
115 |                                     + " with mean "
116 |                                     + gmc);
117 |                             numSpammerUsers.inc();
118 |                             c.output(c.element());
119 |                           }
120 |                         }
121 |                       })
122 |                   .withSideInputs(globalMeanScore));
123 |       return filtered;
124 |     }
125 |   }
126 | 
127 |   /**
128 |    * Calculate and output an element's session duration.
129 |    */
130 |   private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer> {
131 | 
132 |     @ProcessElement
133 |     public void processElement(ProcessContext c, IntervalWindow w) {
134 |       int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes();
135 |       c.output(duration);
136 |     }
137 |   }
138 | 
139 |   /**
140 |    * Options supported by {@link Exercise5}.
141 |    */
142 |   interface Exercise5Options extends Options, StreamingOptions {
143 | 
144 |     @Description("Numeric value of fixed window duration for user analysis, in minutes")
145 |     @Default.Integer(5)
146 |     Integer getFixedWindowDuration();
147 | 
148 |     void setFixedWindowDuration(Integer value);
149 |   }
150 | 
151 |   public static void main(String[] args) throws Exception {
152 | 
153 |     Exercise5Options options =
154 |         PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise5Options.class);
155 |     // Enforce that this pipeline is always run in streaming mode.
156 |     options.setStreaming(true);
157 |     options.setRunner(DataflowRunner.class);
158 |     Pipeline pipeline = Pipeline.create(options);
159 | 
160 |     TableReference teamTable = new TableReference();
161 |     teamTable.setDatasetId(options.getOutputDataset());
162 |     teamTable.setProjectId(options.as(GcpOptions.class).getProject());
163 |     teamTable.setTableId(options.getOutputTableName());
164 | 
165 |     PCollection<GameEvent> rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));
166 | 
167 |     // Extract username/score pairs from the event stream
168 |     PCollection<KV<String, Integer>> userEvents =
169 |         rawEvents.apply(
170 |             "ExtractUserScore",
171 |             MapElements
172 |                 .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
173 |                 .via((GameEvent gInfo) -> KV.<String, Integer>of(gInfo.getUser(),
174 |                     gInfo.getScore())));
175 | 
176 |     // Calculate the total score per user over fixed windows, and
177 |     // cumulative updates for late data.
178 |     final PCollectionView<Map<String, Integer>> spammersView =
179 |         userEvents
180 |             .apply("FixedWindowsUser",
181 |                 Window.<KV<String, Integer>>into(
182 |                         FixedWindows.of(
183 |                             Duration.standardMinutes(options.getFixedWindowDuration()))))
184 | 
185 |             // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
186 |             // These might be robots/spammers.
187 |             .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
188 |             // Derive a view from the collection of spammer users. It will be used as a side input
189 |             // in calculating the team score sums, below.
190 |             .apply("CreateSpammersView", View.<String, Integer>asMap());
191 | 
192 |     // Calculate the total score per team over fixed windows,
193 |     // and emit cumulative updates for late data. Uses the side input derived above-- the set of
194 |     // suspected robots-- to filter out scores from those users from the sum.
195 |     // Write the results to BigQuery.
196 |     rawEvents
197 |         .apply("WindowIntoFixedWindows",
198 |             Window
199 |                 .<GameEvent>into(
200 |                     FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
201 |         // Filter out the detected spammer users, using the side input derived above.
202 |         .apply("FilterOutSpammers",
203 |             ParDo
204 |                 .of(
205 |                     new DoFn<GameEvent, GameEvent>() {
206 |                       @ProcessElement
207 |                       public void processElement(ProcessContext c) {
208 |                         // If the user is not in the spammers Map, output the data element.
209 |                         if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
210 |                           c.output(c.element());
211 |                         }
212 |                       }
213 |                     })
214 |                 .withSideInputs(spammersView))
215 |         // Extract and sum teamname/score pairs from the event data.
216 |         .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"))
217 |         // Write the result to BigQuery
218 |         .apply("FormatTeamWindows", ParDo.of(new FormatTeamWindowFn()))
219 |         .apply(
220 |             BigQueryIO.writeTableRows().to(teamTable)
221 |                 .withSchema(FormatTeamWindowFn.getSchema())
222 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
223 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
224 | 
225 |     PipelineResult result = pipeline.run();
226 |     result.waitUntilFinish();
227 |   }
228 | 
229 |   /**
230 |    * Format a KV of team and associated properties to a BigQuery TableRow.
231 |    */
232 |   protected static class FormatTeamWindowFn extends DoFn<KV<String, Integer>, TableRow> {
233 | 
234 |     @ProcessElement
235 |     public void processElement(ProcessContext c, IntervalWindow window) {
236 |       TableRow row =
237 |           new TableRow()
238 |               .set("team", c.element().getKey())
239 |               .set("total_score", c.element().getValue())
240 |               .set("window_start", window.start().getMillis() / 1000)
241 |               .set("processing_time", Instant.now().getMillis() / 1000);
242 |       c.output(row);
243 |     }
244 | 
245 |     static TableSchema getSchema() {
246 |       List<TableFieldSchema> fields = new ArrayList<>();
247 |       fields.add(new TableFieldSchema().setName("team").setType("STRING"));
248 |       fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
249 |       fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
250 |       fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
251 |       return new TableSchema().setFields(fields);
252 |     }
253 |   }
254 | }
255 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/solutions/Exercise6.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2015 Google Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | package org.apache.beam.examples.complete.game.solutions;
 17 | 
 18 | import static org.apache.beam.sdk.transforms.windowing.TimestampCombiner.END_OF_WINDOW;
 19 | 
 20 | import com.google.api.services.bigquery.model.TableFieldSchema;
 21 | import com.google.api.services.bigquery.model.TableReference;
 22 | import com.google.api.services.bigquery.model.TableRow;
 23 | import com.google.api.services.bigquery.model.TableSchema;
 24 | import java.util.ArrayList;
 25 | import java.util.List;
 26 | import org.apache.beam.examples.complete.game.utils.GameEvent;
 27 | import org.apache.beam.examples.complete.game.utils.Options;
 28 | import org.apache.beam.runners.dataflow.DataflowRunner;
 29 | import org.apache.beam.sdk.Pipeline;
 30 | import org.apache.beam.sdk.PipelineResult;
 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 32 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 33 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 34 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 35 | import org.apache.beam.sdk.options.Default;
 36 | import org.apache.beam.sdk.options.Description;
 37 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
 38 | import org.apache.beam.sdk.options.StreamingOptions;
 39 | import org.apache.beam.sdk.transforms.Combine;
 40 | import org.apache.beam.sdk.transforms.DoFn;
 41 | import org.apache.beam.sdk.transforms.MapElements;
 42 | import org.apache.beam.sdk.transforms.Mean;
 43 | import org.apache.beam.sdk.transforms.ParDo;
 44 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
 45 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
 46 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 47 | import org.apache.beam.sdk.transforms.windowing.Sessions;
 48 | import org.apache.beam.sdk.transforms.windowing.Window;
 49 | import org.apache.beam.sdk.values.KV;
 50 | import org.apache.beam.sdk.values.PCollection;
 51 | import org.apache.beam.sdk.values.TypeDescriptors;
 52 | import org.joda.time.Duration;
 53 | import org.slf4j.Logger;
 54 | import org.slf4j.LoggerFactory;
 55 | 
 56 | /**
 57 |  * Sixth in a series of coding exercises in a gaming domain.
 58 |  *
 59 |  * <p>This exercise introduces session windows.
 60 |  *
 61 |  * <p>See README.md for details.
 62 |  */
 63 | public class Exercise6 {
 64 | 
 65 |   private static final Logger LOG = LoggerFactory.getLogger(Exercise6.class);
 66 | 
 67 |   /**
 68 |    * Calculate and output an element's session duration.
 69 |    */
 70 |   private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer> {
 71 | 
 72 |     @ProcessElement
 73 |     public void processElement(ProcessContext c, BoundedWindow window) {
 74 |       IntervalWindow w = (IntervalWindow) window;
 75 |       int duration = new Duration(w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes();
 76 |       c.output(duration);
 77 |     }
 78 |   }
 79 | 
 80 |   /**
 81 |    * Options supported by {@link Exercise6}.
 82 |    */
 83 |   interface Exercise6Options extends Options, StreamingOptions {
 84 | 
 85 |     @Description("Numeric value of gap between user sessions, in minutes")
 86 |     @Default.Integer(1)
 87 |     Integer getSessionGap();
 88 | 
 89 |     void setSessionGap(Integer value);
 90 | 
 91 |     @Description(
 92 |         "Numeric value of fixed window for finding mean of user session duration, " + "in minutes")
 93 |     @Default.Integer(5)
 94 |     Integer getUserActivityWindowDuration();
 95 | 
 96 |     void setUserActivityWindowDuration(Integer value);
 97 |   }
 98 | 
 99 |   public static void main(String[] args) throws Exception {
100 | 
101 |     Exercise6Options options =
102 |         PipelineOptionsFactory.fromArgs(args).withValidation().as(Exercise6Options.class);
103 |     // Enforce that this pipeline is always run in streaming mode.
104 |     options.setStreaming(true);
105 |     options.setRunner(DataflowRunner.class);
106 |     Pipeline pipeline = Pipeline.create(options);
107 | 
108 |     TableReference sessionsTable = new TableReference();
109 |     sessionsTable.setDatasetId(options.getOutputDataset());
110 |     sessionsTable.setProjectId(options.as(GcpOptions.class).getProject());
111 |     sessionsTable.setTableId(options.getOutputTableName());
112 | 
113 |     PCollection<GameEvent> rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));
114 | 
115 |     // Extract username/score pairs from the event stream
116 |     PCollection<KV<String, Integer>> userEvents =
117 |         rawEvents.apply(
118 |             "ExtractUserScore",
119 |             MapElements
120 |                 .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
121 |                 .via((GameEvent gInfo) -> KV.<String, Integer>of(gInfo.getUser(),
122 |                     gInfo.getScore())));
123 | 
124 |     // Detect user sessions-- that is, a burst of activity separated by a gap from further
125 |     // activity. Find and record the mean session lengths.
126 |     // This information could help the game designers track the changing user engagement
127 |     // as their set of games changes.
128 |     userEvents
129 |         .apply("WindowIntoSessions",
130 |             Window
131 |                 .<KV<String, Integer>>into(
132 |                     Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
133 |                 .withTimestampCombiner(END_OF_WINDOW))
134 |         // For this use, we care only about the existence of the session, not any particular
135 |         // information aggregated over it, so the following is an efficient way to do that.
136 |         .apply(Combine.perKey(x -> 0))
137 |         // Get the duration per session.
138 |         .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
139 |         // Re-window to process groups of session sums according to when the sessions complete.
140 |         .apply("WindowToExtractSessionMean",
141 |             Window
142 |                 .<Integer>into(
143 |                     FixedWindows.of(
144 |                         Duration.standardMinutes(options.getUserActivityWindowDuration()))))
145 |         // Find the mean session duration in each window.
146 |         .apply(Mean.<Integer>globally().withoutDefaults())
147 |         // Write this info to a BigQuery table.
148 |         .apply("FormatSessions", ParDo.of(new FormatSessionWindowFn()))
149 |         .apply(
150 |             BigQueryIO.writeTableRows().to(sessionsTable)
151 |                 .withSchema(FormatSessionWindowFn.getSchema())
152 |                 .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
153 |                 .withWriteDisposition(WriteDisposition.WRITE_APPEND));
154 | 
155 |     PipelineResult result = pipeline.run();
156 |     result.waitUntilFinish();
157 |   }
158 | 
159 |   /**
160 |    * Format a KV of session and associated properties to a BigQuery TableRow.
161 |    */
162 |   static class FormatSessionWindowFn extends DoFn<Double, TableRow> {
163 | 
164 |     @ProcessElement
165 |     public void processElement(ProcessContext c, BoundedWindow window) {
166 |       IntervalWindow w = (IntervalWindow) window;
167 |       TableRow row =
168 |           new TableRow()
169 |               .set("window_start", w.start().getMillis() / 1000)
170 |               .set("mean_duration", c.element());
171 |       c.output(row);
172 |     }
173 | 
174 |     static TableSchema getSchema() {
175 |       List<TableFieldSchema> fields = new ArrayList<>();
176 |       fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
177 |       fields.add(new TableFieldSchema().setName("mean_duration").setType("FLOAT"));
178 |       return new TableSchema().setFields(fields);
179 |     }
180 |   }
181 | }
182 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/utils/ChangeMe.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2015 Google Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package org.apache.beam.examples.complete.game.utils;
18 | 
19 | import org.apache.beam.sdk.transforms.PTransform;
20 | import org.apache.beam.sdk.values.PCollection;
21 | import org.apache.beam.sdk.values.PInput;
22 | 
23 | /**
24 |  * PTransform that crashes at runtime used as a placeholder in tutorials.
25 |  */
26 | public class ChangeMe<InputT extends PInput, OutputT>
27 |     extends PTransform<InputT, PCollection<OutputT>> {
28 | 
29 |   @Override
30 |   public PCollection<OutputT> expand(InputT input) {
31 |     throw new RuntimeException("Not implemented");
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/utils/GameEvent.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2015 Google Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package org.apache.beam.examples.complete.game.utils;
18 | 
19 | import org.apache.avro.reflect.Nullable;
20 | import org.apache.beam.sdk.coders.AvroCoder;
21 | import org.apache.beam.sdk.coders.DefaultCoder;
22 | 
23 | /**
24 |  * Class to hold info about a game event.
25 |  */
26 | @DefaultCoder(AvroCoder.class)
27 | public class GameEvent {
28 | 
29 |   @Nullable
30 |   String user;
31 |   @Nullable
32 |   String team;
33 |   @Nullable
34 |   Integer score;
35 |   @Nullable
36 |   Long timestamp;
37 |   @Nullable
38 |   String eventId;
39 | 
40 |   public GameEvent() {
41 |   }
42 | 
43 |   public GameEvent(String user, String team, Integer score, Long timestamp, String eventId) {
44 |     this.user = user;
45 |     this.team = team;
46 |     this.score = score;
47 |     this.timestamp = timestamp;
48 |     this.eventId = eventId;
49 |   }
50 | 
51 |   public String getUser() {
52 |     return this.user;
53 |   }
54 | 
55 |   public String getTeam() {
56 |     return this.team;
57 |   }
58 | 
59 |   public Integer getScore() {
60 |     return this.score;
61 |   }
62 | 
63 |   public String getKey(String keyname) {
64 |     if (keyname.equals("team")) {
65 |       return this.team;
66 |     } else {  // return username as default
67 |       return this.user;
68 |     }
69 |   }
70 | 
71 |   public Long getTimestamp() {
72 |     return this.timestamp;
73 |   }
74 | 
75 |   public String getEventId() {
76 |     return this.eventId;
77 |   }
78 | }
79 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/utils/Options.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2015 Google Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package org.apache.beam.examples.complete.game.utils;
18 | 
19 | import org.apache.beam.sdk.options.Description;
20 | import org.apache.beam.sdk.options.PipelineOptions;
21 | import org.apache.beam.sdk.options.Validation;
22 | 
23 | /**
24 |  * Options supported by the exercise pipelines.
25 |  */
26 | public interface Options extends PipelineOptions {
27 | 
28 |   @Description("Path to the data file(s) containing game data.")
29 |   String getInput();
30 | 
31 |   void setInput(String value);
32 | 
33 |   @Description("Pub/Sub topic to read from. Used if --input is empty.")
34 |   String getTopic();
35 | 
36 |   void setTopic(String value);
37 | 
38 |   @Description("BigQuery Dataset to write tables to. Must already exist.")
39 |   @Validation.Required
40 |   String getOutputDataset();
41 | 
42 |   void setOutputDataset(String value);
43 | 
44 |   @Description("The BigQuery table name. Should not already exist.")
45 |   @Validation.Required
46 |   String getOutputTableName();
47 | 
48 |   void setOutputTableName(String value);
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/utils/ParseEventFn.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2015 Google Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package org.apache.beam.examples.complete.game.utils;
18 | 
19 | import org.apache.beam.sdk.metrics.Counter;
20 | import org.apache.beam.sdk.metrics.Metrics;
21 | import org.apache.beam.sdk.transforms.DoFn;
22 | import org.slf4j.Logger;
23 | import org.slf4j.LoggerFactory;
24 | 
25 | /**
26 |  * Parses the raw game event info into GameEvent objects. Each event line has the following
27 |  * format: username,teamname,score,timestamp_in_ms,readable_time,event_id
28 |  * e.g.:
29 |  * user2_AsparagusPig,AsparagusPig,10,1445230923951,
30 |  * 2015-11-02 09:09:28.224,e8018d7d-18a6-4265-ba7e-55666b898b6f
31 |  * The human-readable time string is not used here.
32 |  */
33 | public class ParseEventFn extends DoFn<String, GameEvent> {
34 | 
35 |   // Log and count parse errors.
36 |   private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class);
37 |   private final Counter numParseErrors = Metrics.counter("main", "ParseErrors");
38 | 
39 |   @ProcessElement
40 |   public void processElement(ProcessContext c) {
41 |     String[] components = c.element().split(",");
42 |     try {
43 |       String user = components[0].trim();
44 |       String team = components[1].trim();
45 |       Integer score = Integer.parseInt(components[2].trim());
46 |       Long timestamp = Long.parseLong(components[3].trim());
47 |       String eventId = components.length >= 6 ? components[5].trim() : "none";
48 |       GameEvent gInfo = new GameEvent(user, team, score, timestamp, eventId);
49 |       c.output(gInfo);
50 |     } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
51 |       numParseErrors.inc();
52 |       LOG.info("Parse error on " + c.element() + ", " + e.getMessage());
53 |     }
54 |   }
55 | }
56 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/utils/ParsePlayEventFn.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2015 Google Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package org.apache.beam.examples.complete.game.utils;
18 | 
19 | import org.apache.beam.sdk.metrics.Counter;
20 | import org.apache.beam.sdk.metrics.Metrics;
21 | import org.apache.beam.sdk.transforms.DoFn;
22 | import org.slf4j.Logger;
23 | import org.slf4j.LoggerFactory;
24 | 
25 | /**
26 |  * Parses the raw play event info into PlayEvent objects. Each play event line has the following
27 |  * format: username,timestamp_in_ms,readable_time,event_id
28 |  * e.g.:
29 |  * user2_AsparagusPig,AsparagusPig,10,1445230923951,
30 |  * 2015-11-02 09:09:28.224,e8018d7d-18a6-4265-ba7e-55666b898b6f
31 |  * The human-readable time string is not used here.
32 |  */
33 | public class ParsePlayEventFn extends DoFn<String, PlayEvent> {
34 | 
35 |   // Log and count parse errors.
36 |   private static final Logger LOG = LoggerFactory.getLogger(ParsePlayEventFn.class);
37 |   private final Counter numParseErrors = Metrics.counter("main", "ParseErrors");
38 | 
39 |   @ProcessElement
40 |   public void processElement(ProcessContext c) {
41 |     String[] components = c.element().split(",");
42 |     try {
43 |       String user = components[0].trim();
44 |       Long timestamp = Long.parseLong(components[1].trim());
45 |       String eventId = components[3].trim();
46 |       PlayEvent play = new PlayEvent(user, timestamp, eventId);
47 |       c.output(play);
48 |     } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
49 |       numParseErrors.inc();
50 |       LOG.info("Parse error on " + c.element() + ", " + e.getMessage());
51 |     }
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java8/org/apache/beam/examples/complete/game/utils/PlayEvent.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2015 Google Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package org.apache.beam.examples.complete.game.utils;
18 | 
19 | import org.apache.avro.reflect.Nullable;
20 | import org.apache.beam.sdk.coders.AvroCoder;
21 | import org.apache.beam.sdk.coders.DefaultCoder;
22 | 
23 | /**
24 |  * Class to hold info about a game play event
25 |  */
26 | @DefaultCoder(AvroCoder.class)
27 | public class PlayEvent {
28 | 
29 |   @Nullable
30 |   String user;
31 |   @Nullable
32 |   Long timestamp;
33 |   @Nullable
34 |   String eventId;
35 | 
36 |   public PlayEvent() {
37 |   }
38 | 
39 |   public PlayEvent(String user, Long timestamp, String eventId) {
40 |     this.user = user;
41 |     this.timestamp = timestamp;
42 |     this.eventId = eventId;
43 |   }
44 | 
45 |   public String getUser() {
46 |     return this.user;
47 |   }
48 | 
49 |   public String getKey() {
50 |     return this.user;
51 |   }
52 | 
53 |   public Long getTimestamp() {
54 |     return this.timestamp;
55 |   }
56 | 
57 |   public String getEventId() {
58 |     return this.eventId;
59 |   }
60 | }
61 | 


--------------------------------------------------------------------------------