├── .gitattributes ├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── course.md └── examples └── java8 ├── pom.xml └── src ├── main └── java │ └── org │ └── apache │ └── beam │ └── examples │ ├── DebuggingWordCount.java │ ├── LineCount.java │ ├── MinimalLineCount.java │ ├── MinimalLineCountArgs.java │ ├── MinimalLineCountLambda.java │ ├── MinimalWordCount.java │ ├── WindowedWordCount.java │ ├── WordCount.java │ ├── common │ ├── ExampleBigQueryTableOptions.java │ ├── ExampleOptions.java │ ├── ExamplePubsubTopicAndSubscriptionOptions.java │ ├── ExamplePubsubTopicOptions.java │ ├── ExampleUtils.java │ └── WriteOneFilePerWindow.java │ ├── complete │ └── game │ │ ├── GameStats.java │ │ ├── HourlyTeamScore.java │ │ ├── LeaderBoard.java │ │ ├── StatefulTeamScore.java │ │ ├── UserScore.java │ │ ├── injector │ │ ├── Injector.java │ │ ├── InjectorUtils.java │ │ └── RetryHttpInitializerWrapper.java │ │ └── utils │ │ ├── GameConstants.java │ │ ├── WriteToBigQuery.java │ │ ├── WriteToText.java │ │ └── WriteWindowedToBigQuery.java │ └── subprocess │ ├── ExampleEchoPipeline.java │ ├── SubProcessPipelineOptions.java │ ├── configuration │ └── SubProcessConfiguration.java │ ├── kernel │ ├── SubProcessCommandLineArgs.java │ ├── SubProcessIOFiles.java │ └── SubProcessKernel.java │ └── utils │ ├── CallingSubProcessUtils.java │ ├── ExecutableFile.java │ └── FileUtils.java └── test └── java └── org └── apache └── beam └── examples ├── DebuggingWordCountTest.java ├── MinimalWordCountTest.java ├── WordCountTest.java ├── complete └── game │ ├── GameStatsTest.java │ ├── HourlyTeamScoreTest.java │ ├── LeaderBoardTest.java │ ├── StatefulTeamScoreTest.java │ └── UserScoreTest.java └── subprocess └── ExampleEchoPipelineTest.java /.gitattributes: -------------------------------------------------------------------------------- 1 | # The default behavior, which overrides 'core.autocrlf', is to use Git's 2 | # built-in heuristics to determine whether a particular file is text or binary. 
3 | # Text files are automatically normalized to the user's platforms. 4 | * text=auto 5 | 6 | # Explicitly declare text files that should always be normalized and converted 7 | # to native line endings. 8 | .gitattributes text 9 | .gitignore text 10 | LICENSE text 11 | *.avsc text 12 | *.html text 13 | *.java text 14 | *.md text 15 | *.properties text 16 | *.proto text 17 | *.py text 18 | *.sh text 19 | *.xml text 20 | *.yml text 21 | 22 | # Declare files that will always have CRLF line endings on checkout. 23 | # *.sln text eol=crlf 24 | 25 | # Explicitly denote all files that are truly binary and should not be modified. 26 | # *.jpg binary 27 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Follow this checklist to help us incorporate your contribution quickly and easily: 2 | 3 | - [ ] Make sure there is a [JIRA issue](https://issues.apache.org/jira/projects/BEAM/issues/) filed for the change (usually before you start working on it). Trivial changes like typos do not require a JIRA issue. Your pull request should address just this issue, without pulling in other changes. 4 | - [ ] Each commit in the pull request should have a meaningful subject line and body. 5 | - [ ] Format the pull request title like `[BEAM-XXX] Fixes bug in ApproximateQuantiles`, where you replace `BEAM-XXX` with the appropriate JIRA issue. 6 | - [ ] Write a pull request description that is detailed enough to understand what the pull request does, how, and why. 7 | - [ ] Run `mvn clean verify` to make sure basic checks pass. A more thorough check will be performed on your pull request automatically. 8 | - [ ] If this contribution is large, please file an Apache [Individual Contributor License Agreement](https://www.apache.org/licenses/icla.pdf). 
9 | 10 | --- 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # NOTE: if you modify this file, you probably need to modify the file set that 2 | # is an input to 'maven-assembly-plugin' that generates source distribution. 3 | # This is typically in files named 'src.xml' throughout this repository. 4 | 5 | # Ignore files generated by the Maven build process. 6 | target/ 7 | bin/ 8 | 9 | # Ignore generated archetypes 10 | sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/ 11 | sdks/java/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/ 12 | 13 | # Ignore files generated by the Python build process. 14 | *.py[cod] 15 | *.egg-info/ 16 | .eggs/ 17 | .tox/ 18 | build/ 19 | dist/ 20 | distribute-* 21 | env/ 22 | sdks/python/**/*.c 23 | sdks/python/**/*.so 24 | sdks/python/**/*.egg 25 | sdks/python/LICENSE 26 | sdks/python/NOTICE 27 | sdks/python/README.md 28 | sdks/python/apache_beam/portability/api/*pb2*.* 29 | 30 | # Ignore IntelliJ files. 31 | .idea/ 32 | *.iml 33 | *.ipr 34 | *.iws 35 | 36 | # Ignore Eclipse files. 37 | .classpath 38 | .project 39 | .factorypath 40 | .checkstyle 41 | .fbExcludeFilterFile 42 | .apt_generated/ 43 | .settings/ 44 | 45 | # The build process generates the dependency-reduced POM, but it shouldn't be 46 | # committed. 47 | dependency-reduced-pom.xml 48 | 49 | # Hotspot VM leaves this log in a non-target directory when java crashes 50 | hs_err_pid*.log 51 | 52 | # Ignore files that end with '~', since they are most likely auto-save files 53 | # produced by a text editor. 54 | *~ 55 | 56 | # Ignore MacOSX files. 57 | .DS_Store 58 | 59 | # NOTE: if you modify this file, you probably need to modify the file set that 60 | # is an input to 'maven-assembly-plugin' that generates source distribution. 
61 | # This is typically in files named 'src.xml' throughout this repository. 62 | -------------------------------------------------------------------------------- /course.md: -------------------------------------------------------------------------------- 1 | # Introduction to Google Cloud Dataflow 2 | This file contains text you can copy and paste for the examples in Cloud Academy's _Introduction to Google Cloud Dataflow_ course. 3 | 4 | ### Building and Running a Pipeline 5 | Installing on your own computer: https://cloud.google.com/dataflow/docs/quickstarts 6 | Transforms: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/package-summary.html 7 | 8 | Note: Cloud Shell now uses Java 11 by default, so to get this demo to work, switch to Java 8 by running the following command. 9 | It will generate errors, but it will still work. 10 | ``` 11 | sudo update-java-alternatives -s java-1.8.0-openjdk-amd64 && export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 12 | ``` 13 | 14 | ``` 15 | git clone https://github.com/cloudacademy/beam.git 16 | cd beam/examples/java8 17 | mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.MinimalLineCount 18 | ``` 19 | ``` 20 | gsutil cat gs://dataflow-samples/shakespeare/kinglear.txt | wc 21 | ``` 22 | 23 | ### Deploying a Pipeline on Cloud Dataflow 24 | ``` 25 | nano ~/.profile 26 | PROJECT=[Your Project ID] 27 | BUCKET=gs://dataflow-$PROJECT 28 | gsutil mb $BUCKET 29 | cd ~/beam/examples/java8 30 | ``` 31 | ``` 32 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.MinimalLineCountArgs \ 33 | -Dexec.args="--runner=DataflowRunner \ 34 | --project=$PROJECT \ 35 | --tempLocation=$BUCKET/temp \ 36 | --region=us-central1" 37 | ``` 38 | ``` 39 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.LineCount \ 40 | -Dexec.args="--runner=DataflowRunner \ 41 | --project=$PROJECT \ 42 | --tempLocation=$BUCKET/temp \ 43 | 
--output=$BUCKET/linecount \ 44 | --region=us-central1" 45 | ``` 46 | 47 | ### Custom Transforms 48 | ``` 49 | cd ~/beam/examples/java8 50 | ``` 51 | ``` 52 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.MinimalWordCount \ 53 | -Dexec.args="--runner=DataflowRunner \ 54 | --project=$PROJECT \ 55 | --tempLocation=$BUCKET/temp \ 56 | --output=$BUCKET/wordcounts \ 57 | --region=us-central1" 58 | ``` 59 | 60 | ### Composite Transforms 61 | ``` 62 | cd ~/beam/examples/java8 63 | ``` 64 | ``` 65 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.UserScore \ 66 | -Dexec.args="--runner=DataflowRunner \ 67 | --project=$PROJECT \ 68 | --tempLocation=$BUCKET/temp/ \ 69 | --output=$BUCKET/scores \ 70 | --region=us-central1" 71 | ``` 72 | 73 | ### Windowing 74 | ``` 75 | cd ~/beam/examples/java8 76 | ``` 77 | ``` 78 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.HourlyTeamScore \ 79 | -Dexec.args="--runner=DataflowRunner \ 80 | --project=$PROJECT \ 81 | --tempLocation=$BUCKET/temp/ \ 82 | --output=$BUCKET/scores \ 83 | --startMin=2015-11-16-16-00 \ 84 | --stopMin=2015-11-17-16-00 \ 85 | --region=us-central1" 86 | ``` 87 | 88 | ### Running LeaderBoard 89 | ``` 90 | bq mk game 91 | ``` 92 | Note: You no longer need to use a credentials file to run this example. 
93 | ``` 94 | cd ~/beam/examples/java8 95 | ``` 96 | ``` 97 | mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector \ 98 | -Dexec.args="$PROJECT game none" 99 | ``` 100 | ``` 101 | cd ~/beam/examples/java8 102 | ``` 103 | ``` 104 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.LeaderBoard \ 105 | -Dexec.args="--runner=DataflowRunner \ 106 | --project=$PROJECT \ 107 | --tempLocation=$BUCKET/temp/ \ 108 | --dataset=game \ 109 | --topic=projects/$PROJECT/topics/game \ 110 | --region=us-central1" 111 | ``` 112 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/DebuggingWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import java.util.Arrays; 21 | import java.util.List; 22 | import java.util.regex.Pattern; 23 | import org.apache.beam.sdk.Pipeline; 24 | import org.apache.beam.sdk.io.TextIO; 25 | import org.apache.beam.sdk.metrics.Counter; 26 | import org.apache.beam.sdk.metrics.Metrics; 27 | import org.apache.beam.sdk.options.Default; 28 | import org.apache.beam.sdk.options.Description; 29 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 30 | import org.apache.beam.sdk.testing.PAssert; 31 | import org.apache.beam.sdk.transforms.DoFn; 32 | import org.apache.beam.sdk.transforms.ParDo; 33 | import org.apache.beam.sdk.values.KV; 34 | import org.apache.beam.sdk.values.PCollection; 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | /** 39 | * An example that verifies word counts in Shakespeare and includes Beam best practices. 40 | * 41 | *

This class, {@link DebuggingWordCount}, is the third in a series of four successively more 42 | * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount} and 43 | * {@link WordCount}. After you've looked at this example, then see the {@link WindowedWordCount} 44 | * pipeline, for introduction of additional concepts. 45 | * 46 | *

Basic concepts, also in the MinimalWordCount and WordCount examples: Reading text files; 47 | * counting a PCollection; executing a Pipeline both locally and using a selected runner; defining 48 | * DoFns. 49 | * 50 | *

New Concepts: 51 | * 52 | *

 53 |  *   1. Logging using SLF4J, even in a distributed environment
 54 |  *   2. Creating a custom metric (runners have varying levels of support)
 55 |  *   3. Testing your Pipeline via PAssert
 56 |  * 
57 | * 58 | *

To execute this pipeline locally, specify general pipeline configuration: 59 | * 60 | *

{@code
 61 |  * --project=YOUR_PROJECT_ID
 62 |  * }
63 | * 64 | *

To change the runner, specify: 65 | * 66 | *

{@code
 67 |  * --runner=YOUR_SELECTED_RUNNER
 68 |  * }
69 | * 70 | *

The input file defaults to a public data set containing the text of of King Lear, by William 71 | * Shakespeare. You can override it and choose your own input with {@code --inputFile}. 72 | */ 73 | public class DebuggingWordCount { 74 | /** A DoFn that filters for a specific key based upon a regular expression. */ 75 | public static class FilterTextFn extends DoFn, KV> { 76 | /** 77 | * Concept #1: The logger below uses the fully qualified class name of FilterTextFn as the 78 | * logger. Depending on your SLF4J configuration, log statements will likely be qualified by 79 | * this name. 80 | * 81 | *

Note that this is entirely standard SLF4J usage. Some runners may provide a default SLF4J 82 | * configuration that is most appropriate for their logging integration. 83 | */ 84 | private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class); 85 | 86 | private final Pattern filter; 87 | 88 | public FilterTextFn(String pattern) { 89 | filter = Pattern.compile(pattern); 90 | } 91 | 92 | /** 93 | * Concept #2: A custom metric can track values in your pipeline as it runs. Each runner 94 | * provides varying levels of support for metrics, and may expose them in a dashboard, etc. 95 | */ 96 | private final Counter matchedWords = Metrics.counter(FilterTextFn.class, "matchedWords"); 97 | 98 | private final Counter unmatchedWords = Metrics.counter(FilterTextFn.class, "unmatchedWords"); 99 | 100 | @ProcessElement 101 | public void processElement(ProcessContext c) { 102 | if (filter.matcher(c.element().getKey()).matches()) { 103 | // Log at the "DEBUG" level each element that we match. When executing this pipeline 104 | // these log lines will appear only if the log level is set to "DEBUG" or lower. 105 | LOG.debug("Matched: " + c.element().getKey()); 106 | matchedWords.inc(); 107 | c.output(c.element()); 108 | } else { 109 | // Log at the "TRACE" level each element that is not matched. Different log levels 110 | // can be used to control the verbosity of logging providing an effective mechanism 111 | // to filter less important information. 112 | LOG.trace("Did not match: " + c.element().getKey()); 113 | unmatchedWords.inc(); 114 | } 115 | } 116 | } 117 | 118 | /** 119 | * Options supported by {@link DebuggingWordCount}. 120 | * 121 | *

Inherits standard configuration options and all options defined in {@link 122 | * WordCount.WordCountOptions}. 123 | */ 124 | public interface WordCountOptions extends WordCount.WordCountOptions { 125 | 126 | @Description( 127 | "Regex filter pattern to use in DebuggingWordCount. " 128 | + "Only words matching this pattern will be counted.") 129 | @Default.String("Flourish|stomach") 130 | String getFilterPattern(); 131 | 132 | void setFilterPattern(String value); 133 | } 134 | 135 | static void runDebuggingWordCount(WordCountOptions options) { 136 | Pipeline p = Pipeline.create(options); 137 | 138 | PCollection> filteredWords = 139 | p.apply("ReadLines", TextIO.read().from(options.getInputFile())) 140 | .apply(new WordCount.CountWords()) 141 | .apply(ParDo.of(new FilterTextFn(options.getFilterPattern()))); 142 | 143 | /* 144 | * Concept #3: PAssert is a set of convenient PTransforms in the style of 145 | * Hamcrest's collection matchers that can be used when writing Pipeline level tests 146 | * to validate the contents of PCollections. PAssert is best used in unit tests 147 | * with small data sets but is demonstrated here as a teaching tool. 148 | * 149 | *

Below we verify that the set of filtered words matches our expected counts. Note 150 | * that PAssert does not provide any output and that successful completion of the 151 | * Pipeline implies that the expectations were met. Learn more at 152 | * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test 153 | * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test. 154 | */ 155 | List> expectedResults = 156 | Arrays.asList(KV.of("Flourish", 3L), KV.of("stomach", 1L)); 157 | PAssert.that(filteredWords).containsInAnyOrder(expectedResults); 158 | 159 | p.run().waitUntilFinish(); 160 | } 161 | 162 | public static void main(String[] args) { 163 | WordCountOptions options = 164 | PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class); 165 | 166 | runDebuggingWordCount(options); 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/LineCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import org.apache.beam.sdk.Pipeline; 21 | import org.apache.beam.sdk.io.TextIO; 22 | import org.apache.beam.sdk.options.Default; 23 | import org.apache.beam.sdk.options.Description; 24 | import org.apache.beam.sdk.options.PipelineOptions; 25 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 26 | import org.apache.beam.sdk.options.Validation.Required; 27 | import org.apache.beam.sdk.transforms.Count; 28 | import org.apache.beam.sdk.transforms.MapElements; 29 | import org.apache.beam.sdk.values.TypeDescriptors; 30 | 31 | public class LineCount { 32 | 33 | public interface LineCountOptions extends PipelineOptions { 34 | 35 | /** 36 | * By default, this example reads from a public dataset containing the text of 37 | * King Lear. Set this option to choose a different input file or glob. 38 | */ 39 | @Description("Path of the file to read from") 40 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") 41 | String getInputFile(); 42 | void setInputFile(String value); 43 | 44 | /** 45 | * Set this required option to specify where to write the output. 
46 | */ 47 | @Description("Path of the file to write to") 48 | @Required 49 | String getOutput(); 50 | void setOutput(String value); 51 | } 52 | 53 | public static void main(String[] args) { 54 | LineCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() 55 | .as(LineCountOptions.class); 56 | Pipeline p = Pipeline.create(options); 57 | 58 | p.apply(TextIO.read().from(options.getInputFile())) 59 | .apply(Count.globally()) 60 | .apply(MapElements.into(TypeDescriptors.strings()) 61 | .via((Long count) -> Long.toString(count))) 62 | .apply(TextIO.write().to(options.getOutput())); 63 | 64 | p.run().waitUntilFinish(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/MinimalLineCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import org.apache.beam.sdk.Pipeline; 21 | import org.apache.beam.sdk.io.TextIO; 22 | import org.apache.beam.sdk.options.PipelineOptions; 23 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 24 | import org.apache.beam.sdk.transforms.Count; 25 | import org.apache.beam.sdk.transforms.MapElements; 26 | import org.apache.beam.sdk.transforms.SimpleFunction; 27 | 28 | public class MinimalLineCount { 29 | 30 | public static void main(String[] args) { 31 | PipelineOptions options = PipelineOptionsFactory.create(); 32 | Pipeline p = Pipeline.create(options); 33 | 34 | p.apply(TextIO.read().from("gs://dataflow-samples/shakespeare/kinglear.txt")) 35 | .apply(Count.globally()) 36 | .apply(MapElements.via(new SimpleFunction() { 37 | public String apply(Long input) { 38 | return Long.toString(input); 39 | } 40 | })) 41 | .apply(TextIO.write().to("linecount")); 42 | 43 | p.run().waitUntilFinish(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/MinimalLineCountArgs.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import org.apache.beam.sdk.Pipeline; 21 | import org.apache.beam.sdk.io.TextIO; 22 | import org.apache.beam.sdk.options.PipelineOptions; 23 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 24 | import org.apache.beam.sdk.transforms.Count; 25 | import org.apache.beam.sdk.transforms.MapElements; 26 | import org.apache.beam.sdk.values.TypeDescriptors; 27 | 28 | public class MinimalLineCountArgs { 29 | 30 | public static void main(String[] args) { 31 | PipelineOptions options = PipelineOptionsFactory.fromArgs(args).as(PipelineOptions.class); 32 | Pipeline p = Pipeline.create(options); 33 | 34 | p.apply(TextIO.read().from("gs://dataflow-samples/shakespeare/kinglear.txt")) 35 | .apply(Count.globally()) 36 | .apply(MapElements.into(TypeDescriptors.strings()) 37 | .via((Long count) -> Long.toString(count))) 38 | .apply(TextIO.write().to("linecount")); 39 | 40 | p.run().waitUntilFinish(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/MinimalLineCountLambda.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import org.apache.beam.sdk.Pipeline; 21 | import org.apache.beam.sdk.io.TextIO; 22 | import org.apache.beam.sdk.options.PipelineOptions; 23 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 24 | import org.apache.beam.sdk.transforms.Count; 25 | import org.apache.beam.sdk.transforms.MapElements; 26 | import org.apache.beam.sdk.values.TypeDescriptors; 27 | 28 | public class MinimalLineCountLambda { 29 | 30 | public static void main(String[] args) { 31 | PipelineOptions options = PipelineOptionsFactory.create(); 32 | Pipeline p = Pipeline.create(options); 33 | 34 | p.apply(TextIO.read().from("gs://dataflow-samples/shakespeare/kinglear.txt")) 35 | .apply(Count.globally()) 36 | .apply(MapElements.into(TypeDescriptors.strings()) 37 | .via((Long count) -> Long.toString(count))) 38 | .apply(TextIO.write().to("linecount")); 39 | 40 | p.run().waitUntilFinish(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import org.apache.beam.examples.common.ExampleUtils; 21 | import org.apache.beam.sdk.Pipeline; 22 | import org.apache.beam.sdk.io.TextIO; 23 | import org.apache.beam.sdk.options.Default; 24 | import org.apache.beam.sdk.options.Description; 25 | import org.apache.beam.sdk.options.PipelineOptions; 26 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 27 | import org.apache.beam.sdk.options.Validation.Required; 28 | import org.apache.beam.sdk.transforms.Count; 29 | import org.apache.beam.sdk.transforms.DoFn; 30 | import org.apache.beam.sdk.transforms.MapElements; 31 | import org.apache.beam.sdk.transforms.ParDo; 32 | import org.apache.beam.sdk.transforms.SimpleFunction; 33 | import org.apache.beam.sdk.values.KV; 34 | 35 | 36 | /** 37 | * An example that counts words in Shakespeare. 38 | * 39 | *

This class, {@link MinimalWordCount}, is the first in a series of four successively more 40 | * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking, 41 | * and focus on construction of the pipeline, which chains together the application of core 42 | * transforms. 43 | * 44 | *

Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally the 45 | * {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional 46 | * concepts. 47 | * 48 | *

Concepts: 49 | * 50 | *

 51 |  *   1. Reading data from text files
 52 |  *   2. Specifying 'inline' transforms
 53 |  *   3. Counting items in a PCollection
 54 |  *   4. Writing data to text files
 55 |  * 
56 | * 57 | */ 58 | public class MinimalWordCount { 59 | 60 | public interface WordCountOptions extends PipelineOptions { 61 | 62 | /** 63 | * By default, this example reads from a public dataset containing the text of 64 | * King Lear. Set this option to choose a different input file or glob. 65 | */ 66 | @Description("Path of the file to read from") 67 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") 68 | String getInputFile(); 69 | void setInputFile(String value); 70 | 71 | /** 72 | * Set this required option to specify where to write the output. 73 | */ 74 | @Description("Path of the file to write to") 75 | @Required 76 | String getOutput(); 77 | void setOutput(String value); 78 | } 79 | 80 | public static void main(String[] args) { 81 | // Create a PipelineOptions object. This object lets us set various execution 82 | // options for our pipeline, such as the runner you wish to use. This example 83 | // will run with the DirectRunner by default, based on the class path configured 84 | // in its dependencies. 85 | WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() 86 | .as(WordCountOptions.class); 87 | 88 | // Create the Pipeline object with the options we defined above. 89 | Pipeline p = Pipeline.create(options); 90 | 91 | // Apply the pipeline's transforms. 92 | 93 | // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set 94 | // of input text files. TextIO.Read returns a PCollection where each element is one line from 95 | // the input text. 96 | 97 | p.apply(TextIO.read().from(options.getInputFile())) 98 | 99 | // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a 100 | // DoFn (defined in-line) on each element that tokenizes the text line into individual words. 101 | // The ParDo returns a PCollection, where each element is an individual word in 102 | // the input text. 
103 | .apply("ExtractWords", ParDo.of(new DoFn() { 104 | @ProcessElement 105 | public void processElement(ProcessContext c) { 106 | for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) { 107 | if (!word.isEmpty()) { 108 | c.output(word); 109 | } 110 | } 111 | } 112 | })) 113 | 114 | // Concept #3: Apply the Count transform to our PCollection of individual words. The Count 115 | // transform returns a new PCollection of key/value pairs, where each key represents a unique 116 | // word in the text. The associated value is the occurrence count for that word. 117 | .apply(Count.perElement()) 118 | 119 | // Apply a MapElements transform that formats our PCollection of word counts into a printable 120 | // string, suitable for writing to an output file. 121 | .apply("FormatResults", MapElements.via(new SimpleFunction, String>() { 122 | @Override 123 | public String apply(KV input) { 124 | return input.getKey() + ": " + input.getValue(); 125 | } 126 | })) 127 | 128 | // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline. 129 | // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of 130 | // formatted strings) to a series of text files. 131 | // 132 | // By default, it will write to a set of files with names like wordcount-00001-of-00005 133 | .apply(TextIO.write().to(options.getOutput())); 134 | 135 | // Run the pipeline. 136 | p.run().waitUntilFinish(); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/WindowedWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import java.io.IOException; 21 | import java.util.concurrent.ThreadLocalRandom; 22 | import org.apache.beam.examples.common.ExampleBigQueryTableOptions; 23 | import org.apache.beam.examples.common.ExampleOptions; 24 | import org.apache.beam.examples.common.WriteOneFilePerWindow; 25 | import org.apache.beam.sdk.Pipeline; 26 | import org.apache.beam.sdk.PipelineResult; 27 | import org.apache.beam.sdk.io.TextIO; 28 | import org.apache.beam.sdk.options.Default; 29 | import org.apache.beam.sdk.options.DefaultValueFactory; 30 | import org.apache.beam.sdk.options.Description; 31 | import org.apache.beam.sdk.options.PipelineOptions; 32 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 33 | import org.apache.beam.sdk.transforms.DoFn; 34 | import org.apache.beam.sdk.transforms.MapElements; 35 | import org.apache.beam.sdk.transforms.ParDo; 36 | import org.apache.beam.sdk.transforms.windowing.FixedWindows; 37 | import org.apache.beam.sdk.transforms.windowing.Window; 38 | import org.apache.beam.sdk.values.KV; 39 | import org.apache.beam.sdk.values.PCollection; 40 | import org.joda.time.Duration; 41 | import org.joda.time.Instant; 42 | 43 | /** 44 | * An example that counts words in text, and can run over either unbounded or bounded input 45 | * collections. 46 | * 47 | *

This class, {@link WindowedWordCount}, is the last in a series of four successively more 48 | * detailed 'word count' examples. First take a look at {@link MinimalWordCount}, {@link WordCount}, 49 | * and {@link DebuggingWordCount}. 50 | * 51 | *

Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples: 52 | * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally and 53 | * using a selected runner; defining DoFns; user-defined PTransforms; defining PipelineOptions. 54 | * 55 | *

New Concepts: 56 | * 57 | *

 58 |  *   1. Unbounded and bounded pipeline input modes
 59 |  *   2. Adding timestamps to data
 60 |  *   3. Windowing
 61 |  *   4. Re-using PTransforms over windowed PCollections
 62 |  *   5. Accessing the window of an element
 63 |  *   6. Writing data to per-window text files
 64 |  * 
65 | * 66 | *

By default, the examples will run with the {@code DirectRunner}. To change the runner, 67 | * specify: 68 | * 69 | *

{@code
 70 |  * --runner=YOUR_SELECTED_RUNNER
 71 |  * }
72 | * 73 | * See examples/java/README.md for instructions about how to configure different runners. 74 | * 75 | *

To execute this pipeline locally, specify a local output file (if using the {@code 76 | * DirectRunner}) or output prefix on a supported distributed file system. 77 | * 78 | *

{@code
 79 |  * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
 80 |  * }
81 | * 82 | *

The input file defaults to a public data set containing the text of King Lear, by William 83 | * Shakespeare. You can override it and choose your own input with {@code --inputFile}. 84 | * 85 | *

By default, the pipeline will do fixed windowing, on 10-minute windows. You can change this 86 | * interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=15} for 87 | * 15-minute windows. 88 | * 89 | *

The example will try to cancel the pipeline on the signal to terminate the process (CTRL-C). 90 | */ 91 | public class WindowedWordCount { 92 | static final int WINDOW_SIZE = 10; // Default window duration in minutes 93 | /** 94 | * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for this 95 | * example, for the bounded data case. 96 | * 97 | *

Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate 98 | * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a 99 | * 2-hour period. 100 | */ 101 | static class AddTimestampFn extends DoFn { 102 | private final Instant minTimestamp; 103 | private final Instant maxTimestamp; 104 | 105 | AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) { 106 | this.minTimestamp = minTimestamp; 107 | this.maxTimestamp = maxTimestamp; 108 | } 109 | 110 | @ProcessElement 111 | public void processElement(@Element String element, OutputReceiver receiver) { 112 | Instant randomTimestamp = 113 | new Instant( 114 | ThreadLocalRandom.current() 115 | .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis())); 116 | 117 | /* 118 | * Concept #2: Set the data element with that timestamp. 119 | */ 120 | receiver.outputWithTimestamp(element, randomTimestamp); 121 | } 122 | } 123 | 124 | /** A {@link DefaultValueFactory} that returns the current system time. */ 125 | public static class DefaultToCurrentSystemTime implements DefaultValueFactory { 126 | @Override 127 | public Long create(PipelineOptions options) { 128 | return System.currentTimeMillis(); 129 | } 130 | } 131 | 132 | /** A {@link DefaultValueFactory} that returns the minimum timestamp plus one hour. */ 133 | public static class DefaultToMinTimestampPlusOneHour implements DefaultValueFactory { 134 | @Override 135 | public Long create(PipelineOptions options) { 136 | return options.as(Options.class).getMinTimestampMillis() 137 | + Duration.standardHours(1).getMillis(); 138 | } 139 | } 140 | 141 | /** 142 | * Options for {@link WindowedWordCount}. 143 | * 144 | *

Inherits standard example configuration options, which allow specification of the runner, as 145 | * well as the {@link WordCount.WordCountOptions} support for specification of the input and 146 | * output files. 147 | */ 148 | public interface Options 149 | extends WordCount.WordCountOptions, ExampleOptions, ExampleBigQueryTableOptions { 150 | @Description("Fixed window duration, in minutes") 151 | @Default.Integer(WINDOW_SIZE) 152 | Integer getWindowSize(); 153 | 154 | void setWindowSize(Integer value); 155 | 156 | @Description("Minimum randomly assigned timestamp, in milliseconds-since-epoch") 157 | @Default.InstanceFactory(DefaultToCurrentSystemTime.class) 158 | Long getMinTimestampMillis(); 159 | 160 | void setMinTimestampMillis(Long value); 161 | 162 | @Description("Maximum randomly assigned timestamp, in milliseconds-since-epoch") 163 | @Default.InstanceFactory(DefaultToMinTimestampPlusOneHour.class) 164 | Long getMaxTimestampMillis(); 165 | 166 | void setMaxTimestampMillis(Long value); 167 | 168 | @Description("Fixed number of shards to produce per window") 169 | Integer getNumShards(); 170 | 171 | void setNumShards(Integer numShards); 172 | } 173 | 174 | static void runWindowedWordCount(Options options) throws IOException { 175 | final String output = options.getOutput(); 176 | final Instant minTimestamp = new Instant(options.getMinTimestampMillis()); 177 | final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis()); 178 | 179 | Pipeline pipeline = Pipeline.create(options); 180 | 181 | /* 182 | * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or 183 | * unbounded input source. 184 | */ 185 | PCollection input = 186 | pipeline 187 | /* Read from the GCS file. */ 188 | .apply(TextIO.read().from(options.getInputFile())) 189 | // Concept #2: Add an element timestamp, using an artificial time just to show 190 | // windowing. 191 | // See AddTimestampFn for more detail on this. 
192 | .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp))); 193 | 194 | /* 195 | * Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1 196 | * minute (you can change this with a command-line option). See the documentation for more 197 | * information on how fixed windows work, and for information on the other types of windowing 198 | * available (e.g., sliding windows). 199 | */ 200 | PCollection windowedWords = 201 | input.apply( 202 | Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize())))); 203 | 204 | /* 205 | * Concept #4: Re-use our existing CountWords transform that does not have knowledge of 206 | * windows over a PCollection containing windowed values. 207 | */ 208 | PCollection> wordCounts = windowedWords.apply(new WordCount.CountWords()); 209 | 210 | /* 211 | * Concept #5: Format the results and write to a sharded file partitioned by window, using a 212 | * simple ParDo operation. Because there may be failures followed by retries, the 213 | * writes must be idempotent, but the details of writing to files is elided here. 
214 | */ 215 | wordCounts 216 | .apply(MapElements.via(new WordCount.FormatAsTextFn())) 217 | .apply(new WriteOneFilePerWindow(output, options.getNumShards())); 218 | 219 | PipelineResult result = pipeline.run(); 220 | try { 221 | result.waitUntilFinish(); 222 | } catch (Exception exc) { 223 | result.cancel(); 224 | } 225 | } 226 | 227 | public static void main(String[] args) throws IOException { 228 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 229 | 230 | runWindowedWordCount(options); 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/WordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import org.apache.beam.examples.common.ExampleUtils; 21 | import org.apache.beam.sdk.Pipeline; 22 | import org.apache.beam.sdk.io.TextIO; 23 | import org.apache.beam.sdk.metrics.Counter; 24 | import org.apache.beam.sdk.metrics.Distribution; 25 | import org.apache.beam.sdk.metrics.Metrics; 26 | import org.apache.beam.sdk.options.Default; 27 | import org.apache.beam.sdk.options.Description; 28 | import org.apache.beam.sdk.options.PipelineOptions; 29 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 30 | import org.apache.beam.sdk.options.Validation.Required; 31 | import org.apache.beam.sdk.transforms.Count; 32 | import org.apache.beam.sdk.transforms.DoFn; 33 | import org.apache.beam.sdk.transforms.MapElements; 34 | import org.apache.beam.sdk.transforms.PTransform; 35 | import org.apache.beam.sdk.transforms.ParDo; 36 | import org.apache.beam.sdk.transforms.SimpleFunction; 37 | import org.apache.beam.sdk.values.KV; 38 | import org.apache.beam.sdk.values.PCollection; 39 | 40 | /** 41 | * An example that counts words in Shakespeare and includes Beam best practices. 42 | * 43 | *

This class, {@link WordCount}, is the second in a series of four successively more detailed 44 | * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After 45 | * you've looked at this example, then see the {@link DebuggingWordCount} pipeline, for introduction 46 | * of additional concepts. 47 | * 48 | *

For a detailed walkthrough of this example, see 50 | * https://beam.apache.org/get-started/wordcount-example/ 51 | * 52 | *

Basic concepts, also in the MinimalWordCount example: Reading text files; counting a 53 | * PCollection; writing to text files 54 | * 55 | *

New Concepts: 56 | * 57 | *

 58 |  *   1. Executing a Pipeline both locally and using the selected runner
 59 |  *   2. Using ParDo with static DoFns defined out-of-line
 60 |  *   3. Building a composite transform
 61 |  *   4. Defining your own pipeline options
 62 |  * 
63 | * 64 | *

Concept #1: you can execute this pipeline either locally or by selecting another runner. 65 | * These are now command-line options and not hard-coded as they were in the MinimalWordCount 66 | * example. 67 | * 68 | *

To change the runner, specify: 69 | * 70 | *

{@code
 71 |  * --runner=YOUR_SELECTED_RUNNER
 72 |  * }
73 | * 74 | *

To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or 75 | * output prefix on a supported distributed file system. 76 | * 77 | *

{@code
 78 |  * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
 79 |  * }
80 | * 81 | *

The input file defaults to a public data set containing the text of of King Lear, by William 82 | * Shakespeare. You can override it and choose your own input with {@code --inputFile}. 83 | */ 84 | public class WordCount { 85 | 86 | /** 87 | * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns 88 | * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to 89 | * a ParDo in the pipeline. 90 | */ 91 | static class ExtractWordsFn extends DoFn { 92 | private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); 93 | private final Distribution lineLenDist = 94 | Metrics.distribution(ExtractWordsFn.class, "lineLenDistro"); 95 | 96 | @ProcessElement 97 | public void processElement(@Element String element, OutputReceiver receiver) { 98 | lineLenDist.update(element.length()); 99 | if (element.trim().isEmpty()) { 100 | emptyLines.inc(); 101 | } 102 | 103 | // Split the line into words. 104 | String[] words = element.split(ExampleUtils.TOKENIZER_PATTERN, -1); 105 | 106 | // Output each word encountered into the output PCollection. 107 | for (String word : words) { 108 | if (!word.isEmpty()) { 109 | receiver.output(word); 110 | } 111 | } 112 | } 113 | } 114 | 115 | /** A SimpleFunction that converts a Word and Count into a printable string. */ 116 | public static class FormatAsTextFn extends SimpleFunction, String> { 117 | @Override 118 | public String apply(KV input) { 119 | return input.getKey() + ": " + input.getValue(); 120 | } 121 | } 122 | 123 | /** 124 | * A PTransform that converts a PCollection containing lines of text into a PCollection of 125 | * formatted word counts. 126 | * 127 | *

Concept #3: This is a custom composite transform that bundles two transforms (ParDo and 128 | * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse, 129 | * modular testing, and an improved monitoring experience. 130 | */ 131 | public static class CountWords 132 | extends PTransform, PCollection>> { 133 | @Override 134 | public PCollection> expand(PCollection lines) { 135 | 136 | // Convert lines of text into individual words. 137 | PCollection words = lines.apply(ParDo.of(new ExtractWordsFn())); 138 | 139 | // Count the number of times each word occurs. 140 | PCollection> wordCounts = words.apply(Count.perElement()); 141 | 142 | return wordCounts; 143 | } 144 | } 145 | 146 | /** 147 | * Options supported by {@link WordCount}. 148 | * 149 | *

Concept #4: Defining your own configuration options. Here, you can add your own arguments to 150 | * be processed by the command-line parser, and specify default values for them. You can then 151 | * access the options values in your pipeline code. 152 | * 153 | *

Inherits standard configuration options. 154 | */ 155 | public interface WordCountOptions extends PipelineOptions { 156 | 157 | /** 158 | * By default, this example reads from a public dataset containing the text of King Lear. Set 159 | * this option to choose a different input file or glob. 160 | */ 161 | @Description("Path of the file to read from") 162 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") 163 | String getInputFile(); 164 | 165 | void setInputFile(String value); 166 | 167 | /** Set this required option to specify where to write the output. */ 168 | @Description("Path of the file to write to") 169 | @Required 170 | String getOutput(); 171 | 172 | void setOutput(String value); 173 | } 174 | 175 | static void runWordCount(WordCountOptions options) { 176 | Pipeline p = Pipeline.create(options); 177 | 178 | // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the 179 | // static FormatAsTextFn() to the ParDo transform. 180 | p.apply("ReadLines", TextIO.read().from(options.getInputFile())) 181 | .apply(new CountWords()) 182 | .apply(MapElements.via(new FormatAsTextFn())) 183 | .apply("WriteCounts", TextIO.write().to(options.getOutput())); 184 | 185 | p.run().waitUntilFinish(); 186 | } 187 | 188 | public static void main(String[] args) { 189 | WordCountOptions options = 190 | PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class); 191 | 192 | runWordCount(options); 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/common/ExampleBigQueryTableOptions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.common; 19 | 20 | import com.google.api.services.bigquery.model.TableSchema; 21 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 22 | import org.apache.beam.sdk.options.Default; 23 | import org.apache.beam.sdk.options.DefaultValueFactory; 24 | import org.apache.beam.sdk.options.Description; 25 | import org.apache.beam.sdk.options.PipelineOptions; 26 | 27 | /** 28 | * Options that can be used to configure BigQuery tables in Beam examples. The project defaults to 29 | * the project being used to run the example. 30 | */ 31 | public interface ExampleBigQueryTableOptions extends GcpOptions { 32 | @Description("BigQuery dataset name") 33 | @Default.String("beam_examples") 34 | String getBigQueryDataset(); 35 | 36 | void setBigQueryDataset(String dataset); 37 | 38 | @Description("BigQuery table name") 39 | @Default.InstanceFactory(BigQueryTableFactory.class) 40 | String getBigQueryTable(); 41 | 42 | void setBigQueryTable(String table); 43 | 44 | @Description("BigQuery table schema") 45 | TableSchema getBigQuerySchema(); 46 | 47 | void setBigQuerySchema(TableSchema schema); 48 | 49 | /** Returns the job name as the default BigQuery table name. 
*/ 50 | class BigQueryTableFactory implements DefaultValueFactory { 51 | @Override 52 | public String create(PipelineOptions options) { 53 | return options.getJobName().replace('-', '_'); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/common/ExampleOptions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.common; 19 | 20 | import org.apache.beam.sdk.options.Default; 21 | import org.apache.beam.sdk.options.Description; 22 | import org.apache.beam.sdk.options.PipelineOptions; 23 | 24 | /** Options that can be used to configure the Beam examples. 
*/ 25 | public interface ExampleOptions extends PipelineOptions { 26 | @Description("Whether to keep jobs running after local process exit") 27 | @Default.Boolean(false) 28 | boolean getKeepJobsRunning(); 29 | 30 | void setKeepJobsRunning(boolean keepJobsRunning); 31 | 32 | @Description("Number of workers to use when executing the injector pipeline") 33 | @Default.Integer(1) 34 | int getInjectorNumWorkers(); 35 | 36 | void setInjectorNumWorkers(int numWorkers); 37 | } 38 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples.common; 19 | 20 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 21 | import org.apache.beam.sdk.options.Default; 22 | import org.apache.beam.sdk.options.DefaultValueFactory; 23 | import org.apache.beam.sdk.options.Description; 24 | import org.apache.beam.sdk.options.PipelineOptions; 25 | 26 | /** Options that can be used to configure Pub/Sub topic/subscription in Beam examples. */ 27 | public interface ExamplePubsubTopicAndSubscriptionOptions extends ExamplePubsubTopicOptions { 28 | @Description("Pub/Sub subscription") 29 | @Default.InstanceFactory(PubsubSubscriptionFactory.class) 30 | String getPubsubSubscription(); 31 | 32 | void setPubsubSubscription(String subscription); 33 | 34 | /** Returns a default Pub/Sub subscription based on the project and the job names. */ 35 | class PubsubSubscriptionFactory implements DefaultValueFactory { 36 | @Override 37 | public String create(PipelineOptions options) { 38 | return "projects/" 39 | + options.as(GcpOptions.class).getProject() 40 | + "/subscriptions/" 41 | + options.getJobName(); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/common/ExamplePubsubTopicOptions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.common; 19 | 20 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 21 | import org.apache.beam.sdk.options.Default; 22 | import org.apache.beam.sdk.options.DefaultValueFactory; 23 | import org.apache.beam.sdk.options.Description; 24 | import org.apache.beam.sdk.options.PipelineOptions; 25 | 26 | /** Options that can be used to configure Pub/Sub topic in Beam examples. */ 27 | public interface ExamplePubsubTopicOptions extends GcpOptions { 28 | @Description("Pub/Sub topic") 29 | @Default.InstanceFactory(PubsubTopicFactory.class) 30 | String getPubsubTopic(); 31 | 32 | void setPubsubTopic(String topic); 33 | 34 | /** Returns a default Pub/Sub topic based on the project and the job names. */ 35 | class PubsubTopicFactory implements DefaultValueFactory { 36 | @Override 37 | public String create(PipelineOptions options) { 38 | return "projects/" 39 | + options.as(GcpOptions.class).getProject() 40 | + "/topics/" 41 | + options.getJobName(); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/common/WriteOneFilePerWindow.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.common; 19 | 20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects.firstNonNull; 21 | 22 | import javax.annotation.Nullable; 23 | import org.apache.beam.sdk.io.FileBasedSink; 24 | import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; 25 | import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; 26 | import org.apache.beam.sdk.io.TextIO; 27 | import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; 28 | import org.apache.beam.sdk.io.fs.ResourceId; 29 | import org.apache.beam.sdk.transforms.DoFn; 30 | import org.apache.beam.sdk.transforms.PTransform; 31 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow; 32 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 33 | import org.apache.beam.sdk.transforms.windowing.PaneInfo; 34 | import org.apache.beam.sdk.values.PCollection; 35 | import org.apache.beam.sdk.values.PDone; 36 | import org.joda.time.format.DateTimeFormatter; 37 | import org.joda.time.format.ISODateTimeFormat; 38 | 39 | /** 40 | * A {@link DoFn} that writes elements to files with names deterministically derived from the lower 41 | * and upper bounds of their key (an {@link IntervalWindow}). 42 | * 43 | *

This is test utility code, not for end-users, so examples can be focused on their primary 44 | * lessons. 45 | */ 46 | public class WriteOneFilePerWindow extends PTransform, PDone> { 47 | private static final DateTimeFormatter FORMATTER = ISODateTimeFormat.hourMinute(); 48 | private String filenamePrefix; 49 | @Nullable private Integer numShards; 50 | 51 | public WriteOneFilePerWindow(String filenamePrefix, Integer numShards) { 52 | this.filenamePrefix = filenamePrefix; 53 | this.numShards = numShards; 54 | } 55 | 56 | @Override 57 | public PDone expand(PCollection input) { 58 | ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); 59 | TextIO.Write write = 60 | TextIO.write() 61 | .to(new PerWindowFiles(resource)) 62 | .withTempDirectory(resource.getCurrentDirectory()) 63 | .withWindowedWrites(); 64 | if (numShards != null) { 65 | write = write.withNumShards(numShards); 66 | } 67 | return input.apply(write); 68 | } 69 | 70 | /** 71 | * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data 72 | * being written. This always includes the shard number and the total number of shards. For 73 | * windowed writes, it also includes the window and pane index (a sequence number assigned to each 74 | * trigger firing). 75 | */ 76 | public static class PerWindowFiles extends FilenamePolicy { 77 | 78 | private final ResourceId baseFilename; 79 | 80 | public PerWindowFiles(ResourceId baseFilename) { 81 | this.baseFilename = baseFilename; 82 | } 83 | 84 | public String filenamePrefixForWindow(IntervalWindow window) { 85 | String prefix = 86 | baseFilename.isDirectory() ? 
"" : firstNonNull(baseFilename.getFilename(), ""); 87 | return String.format( 88 | "%s-%s-%s", prefix, FORMATTER.print(window.start()), FORMATTER.print(window.end())); 89 | } 90 | 91 | @Override 92 | public ResourceId windowedFilename( 93 | int shardNumber, 94 | int numShards, 95 | BoundedWindow window, 96 | PaneInfo paneInfo, 97 | OutputFileHints outputFileHints) { 98 | IntervalWindow intervalWindow = (IntervalWindow) window; 99 | String filename = 100 | String.format( 101 | "%s-%s-of-%s%s", 102 | filenamePrefixForWindow(intervalWindow), 103 | shardNumber, 104 | numShards, 105 | outputFileHints.getSuggestedFilenameSuffix()); 106 | return baseFilename 107 | .getCurrentDirectory() 108 | .resolve(filename, StandardResolveOptions.RESOLVE_FILE); 109 | } 110 | 111 | @Override 112 | public ResourceId unwindowedFilename( 113 | int shardNumber, int numShards, OutputFileHints outputFileHints) { 114 | throw new UnsupportedOperationException("Unsupported."); 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/HourlyTeamScore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.complete.game; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | import java.util.TimeZone; 23 | import org.apache.beam.examples.complete.game.utils.GameConstants; 24 | import org.apache.beam.examples.complete.game.utils.WriteToText; 25 | import org.apache.beam.sdk.Pipeline; 26 | import org.apache.beam.sdk.io.TextIO; 27 | import org.apache.beam.sdk.options.Default; 28 | import org.apache.beam.sdk.options.Description; 29 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 30 | import org.apache.beam.sdk.transforms.Filter; 31 | import org.apache.beam.sdk.transforms.ParDo; 32 | import org.apache.beam.sdk.transforms.WithTimestamps; 33 | import org.apache.beam.sdk.transforms.windowing.FixedWindows; 34 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 35 | import org.apache.beam.sdk.transforms.windowing.Window; 36 | import org.apache.beam.sdk.values.KV; 37 | import org.joda.time.DateTimeZone; 38 | import org.joda.time.Duration; 39 | import org.joda.time.Instant; 40 | import org.joda.time.format.DateTimeFormat; 41 | import org.joda.time.format.DateTimeFormatter; 42 | 43 | /** 44 | * This class is the second in a series of four pipelines that tell a story in a 'gaming' domain, 45 | * following {@link UserScore}. In addition to the concepts introduced in {@link UserScore}, new 46 | * concepts include: windowing and element timestamps; use of {@code Filter.by()}. 47 | * 48 | *

This pipeline processes data collected from gaming events in batch, building on {@link 49 | * UserScore} but using fixed windows. It calculates the sum of scores per team, for each window, 50 | * optionally allowing specification of two timestamps before and after which data is filtered out. 51 | * This allows a model where late data collected after the intended analysis window can be included, 52 | * and any late-arriving data prior to the beginning of the analysis window can be removed as well. 53 | * By using windowing and adding element timestamps, we can do finer-grained analysis than with the 54 | * {@link UserScore} pipeline. However, our batch processing is high-latency, in that we don't get 55 | * results from plays at the beginning of the batch's time period until the batch is processed. 56 | * 57 | *

To execute this pipeline, specify the pipeline configuration like this: 58 | * 59 | *

{@code
 60 |  * --tempLocation=YOUR_TEMP_DIRECTORY
 61 |  * --runner=YOUR_RUNNER
 62 |  * --output=YOUR_OUTPUT_DIRECTORY
 63 |  * (possibly options specific to your runner or permissions for your temp/output locations)
 64 |  * }
65 | * 66 | *

Optionally include {@code --input} to specify the batch input file path. To indicate a time 67 | * after which the data should be filtered out, include the {@code --stopMin} arg. E.g., {@code 68 | * --stopMin=2015-10-18-23-59} indicates that any data timestamped after 23:59 PST on 2015-10-18 69 | * should not be included in the analysis. To indicate a time before which data should be filtered 70 | * out, include the {@code --startMin} arg. If you're using the default input specified in {@link 71 | * UserScore}, "gs://apache-beam-samples/game/gaming_data*.csv", then {@code 72 | * --startMin=2015-11-16-16-10 --stopMin=2015-11-17-16-10} are good values. 73 | */ 74 | public class HourlyTeamScore extends UserScore { 75 | 76 | private static DateTimeFormatter minFmt = 77 | DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm") 78 | .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); 79 | 80 | /** Options supported by {@link HourlyTeamScore}. */ 81 | public interface Options extends UserScore.Options { 82 | 83 | @Description("Numeric value of fixed window duration, in minutes") 84 | @Default.Integer(60) 85 | Integer getWindowDuration(); 86 | 87 | void setWindowDuration(Integer value); 88 | 89 | @Description( 90 | "String representation of the first minute after which to generate results," 91 | + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST." 92 | + "Any input data timestamped prior to that minute won't be included in the sums.") 93 | @Default.String("1970-01-01-00-00") 94 | String getStartMin(); 95 | 96 | void setStartMin(String value); 97 | 98 | @Description( 99 | "String representation of the first minute for which to not generate results," 100 | + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST." 
101 | + "Any input data timestamped after that minute won't be included in the sums.") 102 | @Default.String("2100-01-01-00-00") 103 | String getStopMin(); 104 | 105 | void setStopMin(String value); 106 | } 107 | 108 | /** 109 | * Create a map of information that describes how to write pipeline output to text. This map is 110 | * passed to the {@link WriteToText} constructor to write team score sums and includes information 111 | * about window start time. 112 | */ 113 | protected static Map>> configureOutput() { 114 | Map>> config = new HashMap<>(); 115 | config.put("team", (c, w) -> c.element().getKey()); 116 | config.put("total_score", (c, w) -> c.element().getValue()); 117 | config.put( 118 | "window_start", 119 | (c, w) -> { 120 | IntervalWindow window = (IntervalWindow) w; 121 | return GameConstants.DATE_TIME_FORMATTER.print(window.start()); 122 | }); 123 | return config; 124 | } 125 | 126 | /** Run a batch pipeline to do windowed analysis of the data. */ 127 | // [START DocInclude_HTSMain] 128 | public static void main(String[] args) throws Exception { 129 | // Begin constructing a pipeline configured by commandline flags. 130 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 131 | Pipeline pipeline = Pipeline.create(options); 132 | 133 | final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin())); 134 | final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin())); 135 | 136 | // Read 'gaming' events from a text file. 137 | pipeline 138 | .apply(TextIO.read().from(options.getInput())) 139 | // Parse the incoming data. 140 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 141 | 142 | // Filter out data before and after the given times so that it is not included 143 | // in the calculations. 
As we collect data in batches (say, by day), the batch for the day 144 | // that we want to analyze could potentially include some late-arriving data from the 145 | // previous day. 146 | // If so, we want to weed it out. Similarly, if we include data from the following day 147 | // (to scoop up late-arriving events from the day we're analyzing), we need to weed out 148 | // events that fall after the time period we want to analyze. 149 | // [START DocInclude_HTSFilters] 150 | .apply( 151 | "FilterStartTime", 152 | Filter.by( 153 | (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) 154 | .apply( 155 | "FilterEndTime", 156 | Filter.by( 157 | (GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp.getMillis())) 158 | // [END DocInclude_HTSFilters] 159 | 160 | // [START DocInclude_HTSAddTsAndWindow] 161 | // Add an element timestamp based on the event log, and apply fixed windowing. 162 | .apply( 163 | "AddEventTimestamps", 164 | WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp()))) 165 | .apply( 166 | "FixedWindowsTeam", 167 | Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowDuration())))) 168 | // [END DocInclude_HTSAddTsAndWindow] 169 | 170 | // Extract and sum teamname/score pairs from the event data. 171 | .apply("ExtractTeamScore", new ExtractAndSumScore("team")) 172 | .apply( 173 | "WriteTeamScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), true)); 174 | 175 | pipeline.run().waitUntilFinish(); 176 | } 177 | // [END DocInclude_HTSMain] 178 | 179 | } 180 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/LeaderBoard.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.complete.game; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | import org.apache.beam.examples.common.ExampleOptions; 23 | import org.apache.beam.examples.common.ExampleUtils; 24 | import org.apache.beam.examples.complete.game.utils.GameConstants; 25 | import org.apache.beam.examples.complete.game.utils.WriteToBigQuery; 26 | import org.apache.beam.examples.complete.game.utils.WriteWindowedToBigQuery; 27 | import org.apache.beam.sdk.Pipeline; 28 | import org.apache.beam.sdk.PipelineResult; 29 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 30 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; 31 | import org.apache.beam.sdk.options.Default; 32 | import org.apache.beam.sdk.options.Description; 33 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 34 | import org.apache.beam.sdk.options.StreamingOptions; 35 | import org.apache.beam.sdk.options.Validation; 36 | import org.apache.beam.sdk.transforms.PTransform; 37 | import org.apache.beam.sdk.transforms.ParDo; 38 | import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; 39 | import org.apache.beam.sdk.transforms.windowing.AfterWatermark; 40 | import 
org.apache.beam.sdk.transforms.windowing.FixedWindows; 41 | import org.apache.beam.sdk.transforms.windowing.GlobalWindows; 42 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 43 | import org.apache.beam.sdk.transforms.windowing.Repeatedly; 44 | import org.apache.beam.sdk.transforms.windowing.Window; 45 | import org.apache.beam.sdk.values.KV; 46 | import org.apache.beam.sdk.values.PCollection; 47 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting; 48 | import org.joda.time.Duration; 49 | import org.joda.time.Instant; 50 | 51 | /** 52 | * This class is the third in a series of four pipelines that tell a story in a 'gaming' domain, 53 | * following {@link UserScore} and {@link HourlyTeamScore}. Concepts include: processing unbounded 54 | * data using fixed windows; use of custom timestamps and event-time processing; generation of 55 | * early/speculative results; using .accumulatingFiredPanes() to do cumulative processing of late- 56 | * arriving data. 57 | * 58 | *

This pipeline processes an unbounded stream of 'game events'. The calculation of the team 59 | * scores uses fixed windowing based on event time (the time of the game play event), not processing 60 | * time (the time that an event is processed by the pipeline). The pipeline calculates the sum of 61 | * scores per team, for each window. By default, the team scores are calculated using one-hour 62 | * windows. 63 | * 64 | *

In contrast-- to demo another windowing option-- the user scores are calculated using a global 65 | * window, which periodically (every ten minutes) emits cumulative user score sums. 66 | * 67 | *

In contrast to the previous pipelines in the series, which used static, finite input data, 68 | * here we're using an unbounded data source, which lets us provide speculative results, and allows 69 | * handling of late data, at much lower latency. We can use the early/speculative results to keep a 70 | * 'leaderboard' updated in near-realtime. Our handling of late data lets us generate correct 71 | * results, e.g. for 'team prizes'. We're now outputting window results as they're calculated, 72 | * giving us much lower latency than with the previous batch examples. 73 | * 74 | *

Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector 75 | * documentation provides more detail on how to do this. 76 | * 77 | *

To execute this pipeline, specify the pipeline configuration like this: 78 | * 79 | *

{@code
 80 |  * --project=YOUR_PROJECT_ID
 81 |  * --tempLocation=gs://YOUR_TEMP_DIRECTORY
 82 |  * --runner=YOUR_RUNNER
 83 |  * --dataset=YOUR-DATASET
 84 |  * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
 85 |  * }
86 | * 87 | *

The BigQuery dataset you specify must already exist. The PubSub topic you specify should be 88 | * the same topic to which the Injector is publishing. 89 | */ 90 | public class LeaderBoard extends HourlyTeamScore { 91 | 92 | static final Duration FIVE_MINUTES = Duration.standardMinutes(5); 93 | static final Duration TEN_MINUTES = Duration.standardMinutes(10); 94 | 95 | /** Options supported by {@link LeaderBoard}. */ 96 | public interface Options extends ExampleOptions, StreamingOptions { 97 | 98 | @Description("BigQuery Dataset to write tables to. Must already exist.") 99 | @Validation.Required 100 | String getDataset(); 101 | 102 | void setDataset(String value); 103 | 104 | @Description("Pub/Sub topic to read from") 105 | @Validation.Required 106 | String getTopic(); 107 | 108 | void setTopic(String value); 109 | 110 | @Description("Numeric value of fixed window duration for team analysis, in minutes") 111 | @Default.Integer(60) 112 | Integer getTeamWindowDuration(); 113 | 114 | void setTeamWindowDuration(Integer value); 115 | 116 | @Description("Numeric value of allowed data lateness, in minutes") 117 | @Default.Integer(120) 118 | Integer getAllowedLateness(); 119 | 120 | void setAllowedLateness(Integer value); 121 | 122 | @Description("Prefix used for the BigQuery table names") 123 | @Default.String("leaderboard") 124 | String getLeaderBoardTableName(); 125 | 126 | void setLeaderBoardTableName(String value); 127 | } 128 | 129 | /** 130 | * Create a map of information that describes how to write pipeline output to BigQuery. This map 131 | * is used to write team score sums and includes event timing information. 
132 | */ 133 | protected static Map>> 134 | configureWindowedTableWrite() { 135 | 136 | Map>> tableConfigure = 137 | new HashMap<>(); 138 | tableConfigure.put( 139 | "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); 140 | tableConfigure.put( 141 | "total_score", 142 | new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); 143 | tableConfigure.put( 144 | "window_start", 145 | new WriteWindowedToBigQuery.FieldInfo<>( 146 | "STRING", 147 | (c, w) -> { 148 | IntervalWindow window = (IntervalWindow) w; 149 | return GameConstants.DATE_TIME_FORMATTER.print(window.start()); 150 | })); 151 | tableConfigure.put( 152 | "processing_time", 153 | new WriteWindowedToBigQuery.FieldInfo<>( 154 | "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); 155 | tableConfigure.put( 156 | "timing", 157 | new WriteWindowedToBigQuery.FieldInfo<>( 158 | "STRING", (c, w) -> c.pane().getTiming().toString())); 159 | return tableConfigure; 160 | } 161 | 162 | /** 163 | * Create a map of information that describes how to write pipeline output to BigQuery. This map 164 | * is passed to the {@link WriteToBigQuery} constructor to write user score sums. 165 | */ 166 | protected static Map>> 167 | configureBigQueryWrite() { 168 | Map>> tableConfigure = new HashMap<>(); 169 | tableConfigure.put( 170 | "user", new WriteToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); 171 | tableConfigure.put( 172 | "total_score", 173 | new WriteToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); 174 | return tableConfigure; 175 | } 176 | 177 | /** 178 | * Create a map of information that describes how to write pipeline output to BigQuery. This map 179 | * is used to write user score sums. 
180 | */ 181 | protected static Map>> 182 | configureGlobalWindowBigQueryWrite() { 183 | 184 | Map>> tableConfigure = 185 | configureBigQueryWrite(); 186 | tableConfigure.put( 187 | "processing_time", 188 | new WriteToBigQuery.FieldInfo<>( 189 | "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); 190 | return tableConfigure; 191 | } 192 | 193 | public static void main(String[] args) throws Exception { 194 | 195 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 196 | // Enforce that this pipeline is always run in streaming mode. 197 | options.setStreaming(true); 198 | ExampleUtils exampleUtils = new ExampleUtils(options); 199 | Pipeline pipeline = Pipeline.create(options); 200 | 201 | // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub 202 | // data elements, and parse the data. 203 | PCollection gameEvents = 204 | pipeline 205 | .apply( 206 | PubsubIO.readStrings() 207 | .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) 208 | .fromTopic(options.getTopic())) 209 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); 210 | 211 | gameEvents 212 | .apply( 213 | "CalculateTeamScores", 214 | new CalculateTeamScores( 215 | Duration.standardMinutes(options.getTeamWindowDuration()), 216 | Duration.standardMinutes(options.getAllowedLateness()))) 217 | // Write the results to BigQuery. 218 | .apply( 219 | "WriteTeamScoreSums", 220 | new WriteWindowedToBigQuery<>( 221 | options.as(GcpOptions.class).getProject(), 222 | options.getDataset(), 223 | options.getLeaderBoardTableName() + "_team", 224 | configureWindowedTableWrite())); 225 | gameEvents 226 | .apply( 227 | "CalculateUserScores", 228 | new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness()))) 229 | // Write the results to BigQuery. 
230 | .apply( 231 | "WriteUserScoreSums", 232 | new WriteToBigQuery<>( 233 | options.as(GcpOptions.class).getProject(), 234 | options.getDataset(), 235 | options.getLeaderBoardTableName() + "_user", 236 | configureGlobalWindowBigQueryWrite())); 237 | 238 | // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the 239 | // command line. 240 | PipelineResult result = pipeline.run(); 241 | exampleUtils.waitToFinish(result); 242 | } 243 | 244 | /** Calculates scores for each team within the configured window duration. */ 245 | // [START DocInclude_WindowAndTrigger] 246 | // Extract team/score pairs from the event stream, using hour-long windows by default. 247 | @VisibleForTesting 248 | static class CalculateTeamScores 249 | extends PTransform, PCollection>> { 250 | private final Duration teamWindowDuration; 251 | private final Duration allowedLateness; 252 | 253 | CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) { 254 | this.teamWindowDuration = teamWindowDuration; 255 | this.allowedLateness = allowedLateness; 256 | } 257 | 258 | @Override 259 | public PCollection> expand(PCollection infos) { 260 | return infos 261 | .apply( 262 | "LeaderboardTeamFixedWindows", 263 | Window.into(FixedWindows.of(teamWindowDuration)) 264 | // We will get early (speculative) results as well as cumulative 265 | // processing of late data. 266 | .triggering( 267 | AfterWatermark.pastEndOfWindow() 268 | .withEarlyFirings( 269 | AfterProcessingTime.pastFirstElementInPane() 270 | .plusDelayOf(FIVE_MINUTES)) 271 | .withLateFirings( 272 | AfterProcessingTime.pastFirstElementInPane() 273 | .plusDelayOf(TEN_MINUTES))) 274 | .withAllowedLateness(allowedLateness) 275 | .accumulatingFiredPanes()) 276 | // Extract and sum teamname/score pairs from the event data. 
277 | .apply("ExtractTeamScore", new ExtractAndSumScore("team")); 278 | } 279 | } 280 | // [END DocInclude_WindowAndTrigger] 281 | 282 | // [START DocInclude_ProcTimeTrigger] 283 | /** 284 | * Extract user/score pairs from the event stream using processing time, via global windowing. Get 285 | * periodic updates on all users' running scores. 286 | */ 287 | @VisibleForTesting 288 | static class CalculateUserScores 289 | extends PTransform, PCollection>> { 290 | private final Duration allowedLateness; 291 | 292 | CalculateUserScores(Duration allowedLateness) { 293 | this.allowedLateness = allowedLateness; 294 | } 295 | 296 | @Override 297 | public PCollection> expand(PCollection input) { 298 | return input 299 | .apply( 300 | "LeaderboardUserGlobalWindow", 301 | Window.into(new GlobalWindows()) 302 | // Get periodic results every ten minutes. 303 | .triggering( 304 | Repeatedly.forever( 305 | AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_MINUTES))) 306 | .accumulatingFiredPanes() 307 | .withAllowedLateness(allowedLateness)) 308 | // Extract and sum username/score pairs from the event data. 309 | .apply("ExtractUserScore", new ExtractAndSumScore("user")); 310 | } 311 | } 312 | // [END DocInclude_ProcTimeTrigger] 313 | } 314 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/StatefulTeamScore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.complete.game; 19 | 20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects.firstNonNull; 21 | 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | import org.apache.beam.examples.common.ExampleUtils; 25 | import org.apache.beam.examples.complete.game.utils.GameConstants; 26 | import org.apache.beam.examples.complete.game.utils.WriteToBigQuery.FieldInfo; 27 | import org.apache.beam.examples.complete.game.utils.WriteWindowedToBigQuery; 28 | import org.apache.beam.sdk.Pipeline; 29 | import org.apache.beam.sdk.PipelineResult; 30 | import org.apache.beam.sdk.coders.VarIntCoder; 31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; 32 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; 33 | import org.apache.beam.sdk.options.Default; 34 | import org.apache.beam.sdk.options.Description; 35 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 36 | import org.apache.beam.sdk.state.StateSpec; 37 | import org.apache.beam.sdk.state.StateSpecs; 38 | import org.apache.beam.sdk.state.ValueState; 39 | import org.apache.beam.sdk.transforms.DoFn; 40 | import org.apache.beam.sdk.transforms.MapElements; 41 | import org.apache.beam.sdk.transforms.ParDo; 42 | import org.apache.beam.sdk.values.KV; 43 | import org.apache.beam.sdk.values.TypeDescriptor; 44 | import org.apache.beam.sdk.values.TypeDescriptors; 45 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting; 46 | import org.joda.time.Instant; 
47 | 48 | /** 49 | * This class is part of a series of pipelines that tell a story in a gaming domain. Concepts 50 | * include: stateful processing. 51 | * 52 | *

This pipeline processes an unbounded stream of 'game events'. It uses stateful processing to 53 | * aggregate team scores per team and outputs team name and it's total score every time the team 54 | * passes a new multiple of a threshold score. For example, multiples of the threshold could be the 55 | * corresponding scores required to pass each level of the game. By default, this threshold is set 56 | * to 5000. 57 | * 58 | *

Stateful processing allows us to write pipelines that output based on a runtime state (when a 59 | * team reaches a certain score, in every 100 game events etc) without time triggers. See 60 | * https://beam.apache.org/blog/2017/02/13/stateful-processing.html for more information on using 61 | * stateful processing. 62 | * 63 | *

Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector 64 | * documentation provides more detail on how to do this. 65 | * 66 | *

To execute this pipeline, specify the pipeline configuration like this: 67 | * 68 | *

{@code
 69 |  * --project=YOUR_PROJECT_ID
 70 |  * --tempLocation=gs://YOUR_TEMP_DIRECTORY
 71 |  * --runner=YOUR_RUNNER
 72 |  * --dataset=YOUR-DATASET
 73 |  * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
 74 |  * }
75 | * 76 | *

The BigQuery dataset you specify must already exist. The PubSub topic you specify should be 77 | * the same topic to which the Injector is publishing. 78 | */ 79 | @SuppressWarnings({ 80 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 81 | }) 82 | public class StatefulTeamScore extends LeaderBoard { 83 | 84 | /** Options supported by {@link StatefulTeamScore}. */ 85 | public interface Options extends LeaderBoard.Options { 86 | 87 | @Description("Numeric value, multiple of which is used as threshold for outputting team score.") 88 | @Default.Integer(5000) 89 | Integer getThresholdScore(); 90 | 91 | void setThresholdScore(Integer value); 92 | } 93 | 94 | /** 95 | * Create a map of information that describes how to write pipeline output to BigQuery. This map 96 | * is used to write team score sums. 97 | */ 98 | private static Map>> configureCompleteWindowedTableWrite() { 99 | 100 | Map>> tableConfigure = 101 | new HashMap<>(); 102 | tableConfigure.put( 103 | "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); 104 | tableConfigure.put( 105 | "total_score", 106 | new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); 107 | tableConfigure.put( 108 | "processing_time", 109 | new WriteWindowedToBigQuery.FieldInfo<>( 110 | "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); 111 | return tableConfigure; 112 | } 113 | 114 | public static void main(String[] args) throws Exception { 115 | 116 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 117 | // Enforce that this pipeline is always run in streaming mode. 118 | options.setStreaming(true); 119 | ExampleUtils exampleUtils = new ExampleUtils(options); 120 | Pipeline pipeline = Pipeline.create(options); 121 | 122 | pipeline 123 | // Read game events from Pub/Sub using custom timestamps, which are extracted from the 124 | // pubsub data elements, and parse the data. 
125 | .apply( 126 | PubsubIO.readStrings() 127 | .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) 128 | .fromTopic(options.getTopic())) 129 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 130 | // Create mapping. UpdateTeamScore uses team name as key. 131 | .apply( 132 | "MapTeamAsKey", 133 | MapElements.into( 134 | TypeDescriptors.kvs( 135 | TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class))) 136 | .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo))) 137 | // Outputs a team's score every time it passes a new multiple of the threshold. 138 | .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore()))) 139 | // Write the results to BigQuery. 140 | .apply( 141 | "WriteTeamLeaders", 142 | new WriteWindowedToBigQuery<>( 143 | options.as(GcpOptions.class).getProject(), 144 | options.getDataset(), 145 | options.getLeaderBoardTableName() + "_team_leader", 146 | configureCompleteWindowedTableWrite())); 147 | 148 | // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the 149 | // command line. 150 | PipelineResult result = pipeline.run(); 151 | exampleUtils.waitToFinish(result); 152 | } 153 | 154 | /** 155 | * Tracks each team's score separately in a single state cell and outputs the score every time it 156 | * passes a new multiple of a threshold. 157 | * 158 | *

We use stateful {@link DoFn} because: 159 | * 160 | *

    161 | *
  • State is key-partitioned. Therefore, the score is calculated per team. 162 | *
  • Stateful {@link DoFn} can determine when to output based on the state. This only allows 163 | * outputting when a team's score passes a given threshold. 164 | *
165 | */ 166 | @VisibleForTesting 167 | public static class UpdateTeamScoreFn 168 | extends DoFn, KV> { 169 | 170 | private static final String TOTAL_SCORE = "totalScore"; 171 | private final int thresholdScore; 172 | 173 | public UpdateTeamScoreFn(int thresholdScore) { 174 | this.thresholdScore = thresholdScore; 175 | } 176 | 177 | /** 178 | * Describes the state for storing team score. Let's break down this statement. 179 | * 180 | *

{@link StateSpec} configures the state cell, which is provided by a runner during pipeline 181 | * execution. 182 | * 183 | *

{@link org.apache.beam.sdk.transforms.DoFn.StateId} annotation assigns an identifier to 184 | * the state, which is used to refer the state in {@link 185 | * org.apache.beam.sdk.transforms.DoFn.ProcessElement}. 186 | * 187 | *

A {@link ValueState} stores single value per key and per window. Because our pipeline is 188 | * globally windowed in this example, this {@link ValueState} is just key partitioned, with one 189 | * score per team. Any other class that extends {@link org.apache.beam.sdk.state.State} can be 190 | * used. 191 | * 192 | *

In order to store the value, the state must be encoded. Therefore, we provide a coder, in 193 | * this case the {@link VarIntCoder}. If the coder is not provided as in {@code 194 | * StateSpecs.value()}, Beam's coder inference will try to provide a coder automatically. 195 | */ 196 | @StateId(TOTAL_SCORE) 197 | private final StateSpec> totalScoreSpec = 198 | StateSpecs.value(VarIntCoder.of()); 199 | 200 | /** 201 | * To use a state cell, annotate a parameter with {@link 202 | * org.apache.beam.sdk.transforms.DoFn.StateId} that matches the state declaration. The type of 203 | * the parameter should match the {@link StateSpec} type. 204 | */ 205 | @ProcessElement 206 | public void processElement( 207 | ProcessContext c, @StateId(TOTAL_SCORE) ValueState totalScore) { 208 | String teamName = c.element().getKey(); 209 | GameActionInfo gInfo = c.element().getValue(); 210 | 211 | // ValueState cells do not contain a default value. If the state is possibly not written, make 212 | // sure to check for null on read. 213 | int oldTotalScore = firstNonNull(totalScore.read(), 0); 214 | totalScore.write(oldTotalScore + gInfo.score); 215 | 216 | // Since there are no negative scores, the easiest way to check whether a team just passed a 217 | // new multiple of the threshold score is to compare the quotients of dividing total scores by 218 | // threshold before and after this aggregation. For example, if the total score was 1999, 219 | // the new total is 2002, and the threshold is 1000, 1999 / 1000 = 1, 2002 / 1000 = 2. 220 | // Therefore, this team passed the threshold. 
221 | if (oldTotalScore / this.thresholdScore < totalScore.read() / this.thresholdScore) { 222 | c.output(KV.of(teamName, totalScore.read())); 223 | } 224 | } 225 | } 226 | } 227 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/UserScore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples.complete.game; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | import java.util.Objects; 23 | import org.apache.avro.reflect.Nullable; 24 | import org.apache.beam.examples.complete.game.utils.WriteToText; 25 | import org.apache.beam.sdk.Pipeline; 26 | import org.apache.beam.sdk.coders.AvroCoder; 27 | import org.apache.beam.sdk.coders.DefaultCoder; 28 | import org.apache.beam.sdk.io.TextIO; 29 | import org.apache.beam.sdk.metrics.Counter; 30 | import org.apache.beam.sdk.metrics.Metrics; 31 | import org.apache.beam.sdk.options.Default; 32 | import org.apache.beam.sdk.options.Description; 33 | import org.apache.beam.sdk.options.PipelineOptions; 34 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 35 | import org.apache.beam.sdk.options.Validation; 36 | import org.apache.beam.sdk.transforms.DoFn; 37 | import org.apache.beam.sdk.transforms.MapElements; 38 | import org.apache.beam.sdk.transforms.PTransform; 39 | import org.apache.beam.sdk.transforms.ParDo; 40 | import org.apache.beam.sdk.transforms.Sum; 41 | import org.apache.beam.sdk.values.KV; 42 | import org.apache.beam.sdk.values.PCollection; 43 | import org.apache.beam.sdk.values.TypeDescriptors; 44 | import org.slf4j.Logger; 45 | import org.slf4j.LoggerFactory; 46 | 47 | /** 48 | * This class is the first in a series of four pipelines that tell a story in a 'gaming' domain. 49 | * Concepts: batch processing, reading input from text files, writing output to text files, using 50 | * standalone DoFns, use of the sum per key transform, and use of Java 8 lambda syntax. 51 | * 52 | *
<p>
In this gaming scenario, many users play, as members of different teams, over the course of a 53 | * day, and their actions are logged for processing. Some of the logged game events may be late- 54 | * arriving, if users play on mobile devices and go transiently offline for a period. 55 | * 56 | *
<p>
This pipeline does batch processing of data collected from gaming events. It calculates the 57 | * sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The 58 | * batch processing will not include any late data that arrives after the day's cutoff point. 59 | * 60 | *
<p>
To execute this pipeline, specify the pipeline configuration like this: 61 | * 62 | *
<pre>
{@code
 63 |  * --tempLocation=YOUR_TEMP_DIRECTORY
 64 |  * --runner=YOUR_RUNNER
 65 |  * --output=YOUR_OUTPUT_DIRECTORY
 66 |  * (possibly options specific to your runner or permissions for your temp/output locations)
 67 |  * }</pre>
68 | * 69 | *
<p>
Optionally include the --input argument to specify a batch input file. See the --input default 70 | * value for example batch data file, or use {@code injector.Injector} to generate your own batch 71 | * data. 72 | */ 73 | @SuppressWarnings({ 74 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 75 | }) 76 | public class UserScore { 77 | 78 | /** Class to hold info about a game event. */ 79 | @DefaultCoder(AvroCoder.class) 80 | static class GameActionInfo { 81 | @Nullable String user; 82 | @Nullable String team; 83 | @Nullable Integer score; 84 | @Nullable Long timestamp; 85 | 86 | public GameActionInfo() {} 87 | 88 | public GameActionInfo(String user, String team, Integer score, Long timestamp) { 89 | this.user = user; 90 | this.team = team; 91 | this.score = score; 92 | this.timestamp = timestamp; 93 | } 94 | 95 | public String getUser() { 96 | return this.user; 97 | } 98 | 99 | public String getTeam() { 100 | return this.team; 101 | } 102 | 103 | public Integer getScore() { 104 | return this.score; 105 | } 106 | 107 | public Long getTimestamp() { 108 | return this.timestamp; 109 | } 110 | 111 | public String getKey(String keyname) { 112 | if ("team".equals(keyname)) { 113 | return this.team; 114 | } else { // return username as default 115 | return this.user; 116 | } 117 | } 118 | 119 | @Override 120 | public boolean equals(Object o) { 121 | if (this == o) { 122 | return true; 123 | } 124 | if (o == null || o.getClass() != this.getClass()) { 125 | return false; 126 | } 127 | 128 | GameActionInfo gameActionInfo = (GameActionInfo) o; 129 | 130 | if (!this.getUser().equals(gameActionInfo.getUser())) { 131 | return false; 132 | } 133 | 134 | if (!this.getTeam().equals(gameActionInfo.getTeam())) { 135 | return false; 136 | } 137 | 138 | if (!this.getScore().equals(gameActionInfo.getScore())) { 139 | return false; 140 | } 141 | 142 | return this.getTimestamp().equals(gameActionInfo.getTimestamp()); 143 | } 144 | 145 | @Override 146 | public int 
hashCode() { 147 | return Objects.hash(user, team, score, timestamp); 148 | } 149 | } 150 | 151 | /** 152 | * Parses the raw game event info into GameActionInfo objects. Each event line has the following 153 | * format: username,teamname,score,timestamp_in_ms,readable_time e.g.: 154 | * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224 The human-readable 155 | * time string is not used here. 156 | */ 157 | static class ParseEventFn extends DoFn { 158 | 159 | // Log and count parse errors. 160 | private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class); 161 | private final Counter numParseErrors = Metrics.counter("main", "ParseErrors"); 162 | 163 | @ProcessElement 164 | public void processElement(ProcessContext c) { 165 | String[] components = c.element().split(",", -1); 166 | try { 167 | String user = components[0].trim(); 168 | String team = components[1].trim(); 169 | Integer score = Integer.parseInt(components[2].trim()); 170 | Long timestamp = Long.parseLong(components[3].trim()); 171 | GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp); 172 | c.output(gInfo); 173 | } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) { 174 | numParseErrors.inc(); 175 | LOG.info("Parse error on " + c.element() + ", " + e.getMessage()); 176 | } 177 | } 178 | } 179 | 180 | /** 181 | * A transform to extract key/score information from GameActionInfo, and sum the scores. The 182 | * constructor arg determines whether 'team' or 'user' info is extracted. 
183 | */ 184 | // [START DocInclude_USExtractXform] 185 | public static class ExtractAndSumScore 186 | extends PTransform, PCollection>> { 187 | 188 | private final String field; 189 | 190 | ExtractAndSumScore(String field) { 191 | this.field = field; 192 | } 193 | 194 | @Override 195 | public PCollection> expand(PCollection gameInfo) { 196 | 197 | return gameInfo 198 | .apply( 199 | MapElements.into( 200 | TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) 201 | .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore()))) 202 | .apply(Sum.integersPerKey()); 203 | } 204 | } 205 | // [END DocInclude_USExtractXform] 206 | 207 | /** Options supported by {@link UserScore}. */ 208 | public interface Options extends PipelineOptions { 209 | 210 | @Description("Path to the data file(s) containing game data.") 211 | /* The default maps to two large Google Cloud Storage files (each ~12GB) holding two subsequent 212 | day's worth (roughly) of data. 213 | 214 | Note: You may want to use a small sample dataset to test it locally/quickly : gs://apache-beam-samples/game/small/gaming_data.csv 215 | You can also download it via the command line gsutil cp gs://apache-beam-samples/game/small/gaming_data.csv ./destination_folder/gaming_data.csv */ 216 | @Default.String("gs://apache-beam-samples/game/gaming_data*.csv") 217 | String getInput(); 218 | 219 | void setInput(String value); 220 | 221 | // Set this required option to specify where to write the output. 222 | @Description("Path of the file to write to.") 223 | @Validation.Required 224 | String getOutput(); 225 | 226 | void setOutput(String value); 227 | } 228 | 229 | /** 230 | * Create a map of information that describes how to write pipeline output to text. This map is 231 | * passed to the {@link WriteToText} constructor to write user score sums. 
232 | */ 233 | protected static Map>> configureOutput() { 234 | Map>> config = new HashMap<>(); 235 | config.put("user", (c, w) -> c.element().getKey()); 236 | config.put("total_score", (c, w) -> c.element().getValue()); 237 | return config; 238 | } 239 | 240 | /** Run a batch pipeline. */ 241 | // [START DocInclude_USMain] 242 | public static void main(String[] args) throws Exception { 243 | // Begin constructing a pipeline configured by commandline flags. 244 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 245 | Pipeline pipeline = Pipeline.create(options); 246 | 247 | // Read events from a text file and parse them. 248 | pipeline 249 | .apply(TextIO.read().from(options.getInput())) 250 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) 251 | // Extract and sum username/score pairs from the event data. 252 | .apply("ExtractUserScore", new ExtractAndSumScore("user")) 253 | .apply( 254 | "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false)); 255 | 256 | // Run the batch pipeline. 257 | pipeline.run().waitUntilFinish(); 258 | } 259 | // [END DocInclude_USMain] 260 | } 261 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/injector/InjectorUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.complete.game.injector; 19 | 20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull; 21 | 22 | import com.google.api.client.googleapis.auth.oauth2.GoogleCredential; 23 | import com.google.api.client.googleapis.json.GoogleJsonResponseException; 24 | import com.google.api.client.googleapis.util.Utils; 25 | import com.google.api.client.http.HttpRequestInitializer; 26 | import com.google.api.client.http.HttpStatusCodes; 27 | import com.google.api.client.http.HttpTransport; 28 | import com.google.api.client.json.JsonFactory; 29 | import com.google.api.services.pubsub.Pubsub; 30 | import com.google.api.services.pubsub.PubsubScopes; 31 | import com.google.api.services.pubsub.model.Topic; 32 | import java.io.IOException; 33 | 34 | class InjectorUtils { 35 | 36 | private static final String APP_NAME = "injector"; 37 | 38 | /** Builds a new Pubsub client and returns it. */ 39 | public static Pubsub getClient(final HttpTransport httpTransport, final JsonFactory jsonFactory) 40 | throws IOException { 41 | checkNotNull(httpTransport); 42 | checkNotNull(jsonFactory); 43 | GoogleCredential credential = 44 | GoogleCredential.getApplicationDefault(httpTransport, jsonFactory); 45 | if (credential.createScopedRequired()) { 46 | credential = credential.createScoped(PubsubScopes.all()); 47 | } 48 | if (credential.getClientAuthentication() != null) { 49 | System.out.println( 50 | "\n***Warning! 
You are not using service account credentials to " 51 | + "authenticate.\nYou need to use service account credentials for this example," 52 | + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run " 53 | + "out of PubSub quota very quickly.\nSee " 54 | + "https://developers.google.com/identity/protocols/application-default-credentials."); 55 | System.exit(1); 56 | } 57 | HttpRequestInitializer initializer = new RetryHttpInitializerWrapper(credential); 58 | return new Pubsub.Builder(httpTransport, jsonFactory, initializer) 59 | .setApplicationName(APP_NAME) 60 | .build(); 61 | } 62 | 63 | /** Builds a new Pubsub client with default HttpTransport and JsonFactory and returns it. */ 64 | public static Pubsub getClient() throws IOException { 65 | return getClient(Utils.getDefaultTransport(), Utils.getDefaultJsonFactory()); 66 | } 67 | 68 | /** Returns the fully qualified topic name for Pub/Sub. */ 69 | public static String getFullyQualifiedTopicName(final String project, final String topic) { 70 | return String.format("projects/%s/topics/%s", project, topic); 71 | } 72 | 73 | /** Create a topic if it doesn't exist. 
*/ 74 | public static void createTopic(Pubsub client, String fullTopicName) throws IOException { 75 | System.out.println("fullTopicName " + fullTopicName); 76 | try { 77 | client.projects().topics().get(fullTopicName).execute(); 78 | } catch (GoogleJsonResponseException e) { 79 | if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) { 80 | Topic topic = client.projects().topics().create(fullTopicName, new Topic()).execute(); 81 | System.out.printf("Topic %s was created.%n", topic.getName()); 82 | } 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/injector/RetryHttpInitializerWrapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples.complete.game.injector; 19 | 20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull; 21 | 22 | import com.google.api.client.auth.oauth2.Credential; 23 | import com.google.api.client.http.HttpBackOffIOExceptionHandler; 24 | import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler; 25 | import com.google.api.client.http.HttpRequest; 26 | import com.google.api.client.http.HttpRequestInitializer; 27 | import com.google.api.client.http.HttpUnsuccessfulResponseHandler; 28 | import com.google.api.client.util.ExponentialBackOff; 29 | import com.google.api.client.util.Sleeper; 30 | import java.util.logging.Logger; 31 | 32 | /** 33 | * RetryHttpInitializerWrapper will automatically retry upon RPC failures, preserving the 34 | * auto-refresh behavior of the Google Credentials. 35 | */ 36 | public class RetryHttpInitializerWrapper implements HttpRequestInitializer { 37 | 38 | /** A private logger. */ 39 | private static final Logger LOG = Logger.getLogger(RetryHttpInitializerWrapper.class.getName()); 40 | 41 | /** One minutes in miliseconds. */ 42 | private static final int ONEMINITUES = 60000; 43 | 44 | /** 45 | * Intercepts the request for filling in the "Authorization" header field, as well as recovering 46 | * from certain unsuccessful error codes wherein the Credential must refresh its token for a 47 | * retry. 48 | */ 49 | private final Credential wrappedCredential; 50 | 51 | /** A sleeper; you can replace it with a mock in your test. */ 52 | private final Sleeper sleeper; 53 | 54 | /** 55 | * A constructor. 56 | * 57 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header. 58 | */ 59 | public RetryHttpInitializerWrapper(final Credential wrappedCredential) { 60 | this(wrappedCredential, Sleeper.DEFAULT); 61 | } 62 | 63 | /** 64 | * A protected constructor only for testing. 
65 | * 66 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header. 67 | * @param sleeper Sleeper for easy testing. 68 | */ 69 | RetryHttpInitializerWrapper(final Credential wrappedCredential, final Sleeper sleeper) { 70 | this.wrappedCredential = checkNotNull(wrappedCredential); 71 | this.sleeper = sleeper; 72 | } 73 | 74 | /** Initializes the given request. */ 75 | @Override 76 | public final void initialize(final HttpRequest request) { 77 | request.setReadTimeout(2 * ONEMINITUES); // 2 minutes read timeout 78 | final HttpUnsuccessfulResponseHandler backoffHandler = 79 | new HttpBackOffUnsuccessfulResponseHandler(new ExponentialBackOff()).setSleeper(sleeper); 80 | request.setInterceptor(wrappedCredential); 81 | request.setUnsuccessfulResponseHandler( 82 | (request1, response, supportsRetry) -> { 83 | if (wrappedCredential.handleResponse(request1, response, supportsRetry)) { 84 | // If credential decides it can handle it, the return code or message indicated 85 | // something specific to authentication, and no backoff is desired. 86 | return true; 87 | } else if (backoffHandler.handleResponse(request1, response, supportsRetry)) { 88 | // Otherwise, we defer to the judgement of our internal backoff handler. 89 | LOG.info("Retrying " + request1.getUrl().toString()); 90 | return true; 91 | } else { 92 | return false; 93 | } 94 | }); 95 | request.setIOExceptionHandler( 96 | new HttpBackOffIOExceptionHandler(new ExponentialBackOff()).setSleeper(sleeper)); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/GameConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.complete.game.utils; 19 | 20 | import java.util.TimeZone; 21 | import org.joda.time.DateTimeZone; 22 | import org.joda.time.format.DateTimeFormat; 23 | import org.joda.time.format.DateTimeFormatter; 24 | 25 | /** Shared constants between game series classes. */ 26 | public class GameConstants { 27 | 28 | public static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; 29 | 30 | public static final DateTimeFormatter DATE_TIME_FORMATTER = 31 | DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") 32 | .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); 33 | } 34 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/WriteToBigQuery.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.complete.game.utils; 19 | 20 | import com.google.api.services.bigquery.model.TableFieldSchema; 21 | import com.google.api.services.bigquery.model.TableReference; 22 | import com.google.api.services.bigquery.model.TableRow; 23 | import com.google.api.services.bigquery.model.TableSchema; 24 | import java.io.Serializable; 25 | import java.util.ArrayList; 26 | import java.util.List; 27 | import java.util.Map; 28 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 29 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 30 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 31 | import org.apache.beam.sdk.transforms.DoFn; 32 | import org.apache.beam.sdk.transforms.PTransform; 33 | import org.apache.beam.sdk.transforms.ParDo; 34 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow; 35 | import org.apache.beam.sdk.values.PCollection; 36 | import org.apache.beam.sdk.values.PDone; 37 | 38 | /** 39 | * Generate, format, and write BigQuery table row information. Use provided information about the 40 | * field names and types, as well as lambda functions that describe how to generate their values. 
41 | */ 42 | @SuppressWarnings({ 43 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 44 | }) 45 | public class WriteToBigQuery extends PTransform, PDone> { 46 | 47 | protected String projectId; 48 | protected String datasetId; 49 | protected String tableName; 50 | protected Map> fieldInfo; 51 | 52 | public WriteToBigQuery() {} 53 | 54 | public WriteToBigQuery( 55 | String projectId, 56 | String datasetId, 57 | String tableName, 58 | Map> fieldInfo) { 59 | this.projectId = projectId; 60 | this.datasetId = datasetId; 61 | this.tableName = tableName; 62 | this.fieldInfo = fieldInfo; 63 | } 64 | 65 | /** 66 | * A {@link Serializable} function from a {@link DoFn.ProcessContext} and {@link BoundedWindow} to 67 | * the value for that field. 68 | */ 69 | public interface FieldFn extends Serializable { 70 | Object apply(DoFn.ProcessContext context, BoundedWindow window); 71 | } 72 | 73 | /** Define a class to hold information about output table field definitions. */ 74 | public static class FieldInfo implements Serializable { 75 | // The BigQuery 'type' of the field 76 | private String fieldType; 77 | // A lambda function to generate the field value 78 | private FieldFn fieldFn; 79 | 80 | public FieldInfo(String fieldType, FieldFn fieldFn) { 81 | this.fieldType = fieldType; 82 | this.fieldFn = fieldFn; 83 | } 84 | 85 | String getFieldType() { 86 | return this.fieldType; 87 | } 88 | 89 | FieldFn getFieldFn() { 90 | return this.fieldFn; 91 | } 92 | } 93 | 94 | /** Convert each key/score pair into a BigQuery TableRow as specified by fieldFn. 
*/ 95 | protected class BuildRowFn extends DoFn { 96 | 97 | @ProcessElement 98 | public void processElement(ProcessContext c, BoundedWindow window) { 99 | 100 | TableRow row = new TableRow(); 101 | for (Map.Entry> entry : fieldInfo.entrySet()) { 102 | String key = entry.getKey(); 103 | FieldInfo fcnInfo = entry.getValue(); 104 | FieldFn fcn = fcnInfo.getFieldFn(); 105 | row.set(key, fcn.apply(c, window)); 106 | } 107 | c.output(row); 108 | } 109 | } 110 | 111 | /** Build the output table schema. */ 112 | protected TableSchema getSchema() { 113 | List fields = new ArrayList<>(); 114 | for (Map.Entry> entry : fieldInfo.entrySet()) { 115 | String key = entry.getKey(); 116 | FieldInfo fcnInfo = entry.getValue(); 117 | String bqType = fcnInfo.getFieldType(); 118 | fields.add(new TableFieldSchema().setName(key).setType(bqType)); 119 | } 120 | return new TableSchema().setFields(fields); 121 | } 122 | 123 | @Override 124 | public PDone expand(PCollection teamAndScore) { 125 | teamAndScore 126 | .apply("ConvertToRow", ParDo.of(new BuildRowFn())) 127 | .apply( 128 | BigQueryIO.writeTableRows() 129 | .to(getTable(projectId, datasetId, tableName)) 130 | .withSchema(getSchema()) 131 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 132 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 133 | return PDone.in(teamAndScore.getPipeline()); 134 | } 135 | 136 | /** Utility to construct an output table reference. 
*/ 137 | static TableReference getTable(String projectId, String datasetId, String tableName) { 138 | TableReference table = new TableReference(); 139 | table.setDatasetId(datasetId); 140 | table.setProjectId(projectId); 141 | table.setTableId(tableName); 142 | return table; 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/WriteToText.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples.complete.game.utils; 19 | 20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; 21 | 22 | import java.io.Serializable; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import java.util.Map; 26 | import java.util.TimeZone; 27 | import java.util.stream.Collectors; 28 | import org.apache.beam.sdk.io.FileBasedSink; 29 | import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; 30 | import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; 31 | import org.apache.beam.sdk.io.TextIO; 32 | import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; 33 | import org.apache.beam.sdk.io.fs.ResourceId; 34 | import org.apache.beam.sdk.transforms.DoFn; 35 | import org.apache.beam.sdk.transforms.PTransform; 36 | import org.apache.beam.sdk.transforms.ParDo; 37 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow; 38 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow; 39 | import org.apache.beam.sdk.transforms.windowing.PaneInfo; 40 | import org.apache.beam.sdk.values.PCollection; 41 | import org.apache.beam.sdk.values.PDone; 42 | import org.joda.time.DateTimeZone; 43 | import org.joda.time.format.DateTimeFormat; 44 | import org.joda.time.format.DateTimeFormatter; 45 | 46 | /** 47 | * Generate, format, and write rows. Use provided information about the field names and types, as 48 | * well as lambda functions that describe how to generate their values. 
49 | */ 50 | @SuppressWarnings({ 51 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 52 | }) 53 | public class WriteToText extends PTransform, PDone> { 54 | 55 | private static final DateTimeFormatter formatter = 56 | DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") 57 | .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); 58 | 59 | protected String filenamePrefix; 60 | protected Map> fieldFn; 61 | protected boolean windowed; 62 | 63 | public WriteToText() {} 64 | 65 | public WriteToText( 66 | String filenamePrefix, Map> fieldFn, boolean windowed) { 67 | this.filenamePrefix = filenamePrefix; 68 | this.fieldFn = fieldFn; 69 | this.windowed = windowed; 70 | } 71 | 72 | /** 73 | * A {@link Serializable} function from a {@link DoFn.ProcessContext} and {@link BoundedWindow} to 74 | * the value for that field. 75 | */ 76 | public interface FieldFn extends Serializable { 77 | Object apply(DoFn.ProcessContext context, BoundedWindow window); 78 | } 79 | 80 | /** Convert each key/score pair into a row as specified by fieldFn. */ 81 | protected class BuildRowFn extends DoFn { 82 | 83 | @ProcessElement 84 | public void processElement(ProcessContext c, BoundedWindow window) { 85 | List fields = new ArrayList<>(); 86 | for (Map.Entry> entry : fieldFn.entrySet()) { 87 | String key = entry.getKey(); 88 | FieldFn fcn = entry.getValue(); 89 | fields.add(key + ": " + fcn.apply(c, window)); 90 | } 91 | String result = fields.stream().collect(Collectors.joining(", ")); 92 | c.output(result); 93 | } 94 | } 95 | 96 | /** 97 | * A {@link DoFn} that writes elements to files with names deterministically derived from the 98 | * lower and upper bounds of their key (an {@link IntervalWindow}). 
99 | */ 100 | protected static class WriteOneFilePerWindow extends PTransform, PDone> { 101 | 102 | private final String filenamePrefix; 103 | 104 | public WriteOneFilePerWindow(String filenamePrefix) { 105 | this.filenamePrefix = filenamePrefix; 106 | } 107 | 108 | @Override 109 | public PDone expand(PCollection input) { 110 | // Verify that the input has a compatible window type. 111 | checkArgument( 112 | input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder()); 113 | 114 | ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); 115 | 116 | return input.apply( 117 | TextIO.write() 118 | .to(new PerWindowFiles(resource)) 119 | .withTempDirectory(resource.getCurrentDirectory()) 120 | .withWindowedWrites() 121 | .withNumShards(3)); 122 | } 123 | } 124 | 125 | /** 126 | * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data 127 | * being written. This always includes the shard number and the total number of shards. For 128 | * windowed writes, it also includes the window and pane index (a sequence number assigned to each 129 | * trigger firing). 130 | */ 131 | protected static class PerWindowFiles extends FilenamePolicy { 132 | 133 | private final ResourceId prefix; 134 | 135 | public PerWindowFiles(ResourceId prefix) { 136 | this.prefix = prefix; 137 | } 138 | 139 | public String filenamePrefixForWindow(IntervalWindow window) { 140 | String filePrefix = prefix.isDirectory() ? 
"" : prefix.getFilename(); 141 | return String.format( 142 | "%s-%s-%s", filePrefix, formatter.print(window.start()), formatter.print(window.end())); 143 | } 144 | 145 | @Override 146 | public ResourceId windowedFilename( 147 | int shardNumber, 148 | int numShards, 149 | BoundedWindow window, 150 | PaneInfo paneInfo, 151 | OutputFileHints outputFileHints) { 152 | IntervalWindow intervalWindow = (IntervalWindow) window; 153 | String filename = 154 | String.format( 155 | "%s-%s-of-%s%s", 156 | filenamePrefixForWindow(intervalWindow), 157 | shardNumber, 158 | numShards, 159 | outputFileHints.getSuggestedFilenameSuffix()); 160 | return prefix.getCurrentDirectory().resolve(filename, StandardResolveOptions.RESOLVE_FILE); 161 | } 162 | 163 | @Override 164 | public ResourceId unwindowedFilename( 165 | int shardNumber, int numShards, OutputFileHints outputFileHints) { 166 | throw new UnsupportedOperationException("Unsupported."); 167 | } 168 | } 169 | 170 | @Override 171 | public PDone expand(PCollection teamAndScore) { 172 | if (windowed) { 173 | teamAndScore 174 | .apply("ConvertToRow", ParDo.of(new BuildRowFn())) 175 | .apply(new WriteToText.WriteOneFilePerWindow(filenamePrefix)); 176 | } else { 177 | teamAndScore 178 | .apply("ConvertToRow", ParDo.of(new BuildRowFn())) 179 | .apply(TextIO.write().to(filenamePrefix)); 180 | } 181 | return PDone.in(teamAndScore.getPipeline()); 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/WriteWindowedToBigQuery.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.complete.game.utils; 19 | 20 | import com.google.api.services.bigquery.model.TableRow; 21 | import java.util.Map; 22 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 23 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; 24 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 25 | import org.apache.beam.sdk.transforms.DoFn; 26 | import org.apache.beam.sdk.transforms.ParDo; 27 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow; 28 | import org.apache.beam.sdk.values.PCollection; 29 | import org.apache.beam.sdk.values.PDone; 30 | 31 | /** 32 | * Generate, format, and write BigQuery table row information. Subclasses {@link WriteToBigQuery} to 33 | * require windowing; so this subclass may be used for writes that require access to the context's 34 | * window information. 35 | */ 36 | public class WriteWindowedToBigQuery extends WriteToBigQuery { 37 | 38 | public WriteWindowedToBigQuery( 39 | String projectId, String datasetId, String tableName, Map> fieldInfo) { 40 | super(projectId, datasetId, tableName, fieldInfo); 41 | } 42 | 43 | /** Convert each key/score pair into a BigQuery TableRow. 
*/ 44 | protected class BuildRowFn extends DoFn { 45 | @ProcessElement 46 | public void processElement(ProcessContext c, BoundedWindow window) { 47 | 48 | TableRow row = new TableRow(); 49 | for (Map.Entry> entry : fieldInfo.entrySet()) { 50 | String key = entry.getKey(); 51 | FieldInfo fcnInfo = entry.getValue(); 52 | row.set(key, fcnInfo.getFieldFn().apply(c, window)); 53 | } 54 | c.output(row); 55 | } 56 | } 57 | 58 | @Override 59 | public PDone expand(PCollection teamAndScore) { 60 | teamAndScore 61 | .apply("ConvertToRow", ParDo.of(new BuildRowFn())) 62 | .apply( 63 | BigQueryIO.writeTableRows() 64 | .to(getTable(projectId, datasetId, tableName)) 65 | .withSchema(getSchema()) 66 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 67 | .withWriteDisposition(WriteDisposition.WRITE_APPEND)); 68 | return PDone.in(teamAndScore.getPipeline()); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/ExampleEchoPipeline.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples.subprocess; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration; 23 | import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs; 24 | import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs.Command; 25 | import org.apache.beam.examples.subprocess.kernel.SubProcessKernel; 26 | import org.apache.beam.examples.subprocess.utils.CallingSubProcessUtils; 27 | import org.apache.beam.sdk.Pipeline; 28 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 29 | import org.apache.beam.sdk.transforms.Create; 30 | import org.apache.beam.sdk.transforms.DoFn; 31 | import org.apache.beam.sdk.transforms.ParDo; 32 | import org.apache.beam.sdk.values.KV; 33 | import org.slf4j.Logger; 34 | import org.slf4j.LoggerFactory; 35 | 36 | /** 37 | * In this example batch pipeline we will invoke a simple Echo C++ library within a DoFn The sample 38 | * makes use of a ExternalLibraryDoFn class which abstracts the setup and processing of the 39 | * executable, logs and results. For this example we are using commands passed to the library based 40 | * on ordinal position but for a production system you should use a mechanism like ProtoBuffers with 41 | * Base64 encoding to pass the parameters to the library To test this example you will need to build 42 | * the files Echo.cc and EchoAgain.cc in a linux env matching the runner that you are using (using 43 | * g++ with static option). 
Once built copy them to the SourcePath defined in {@link 44 | * SubProcessPipelineOptions} 45 | */ 46 | public class ExampleEchoPipeline { 47 | private static final Logger LOG = LoggerFactory.getLogger(ExampleEchoPipeline.class); 48 | 49 | public static void main(String[] args) throws Exception { 50 | 51 | // Read in the options for the pipeline 52 | SubProcessPipelineOptions options = 53 | PipelineOptionsFactory.fromArgs(args).withValidation().as(SubProcessPipelineOptions.class); 54 | 55 | Pipeline p = Pipeline.create(options); 56 | 57 | // Setup the Configuration option used with all transforms 58 | SubProcessConfiguration configuration = options.getSubProcessConfiguration(); 59 | 60 | // Create some sample data to be fed to our c++ Echo library 61 | List> sampleData = new ArrayList<>(); 62 | for (int i = 0; i < 10000; i++) { 63 | String str = String.valueOf(i); 64 | sampleData.add(KV.of(str, str)); 65 | } 66 | 67 | // Define the pipeline which is two transforms echoing the inputs out to Logs 68 | p.apply(Create.of(sampleData)) 69 | .apply("Echo inputs round 1", ParDo.of(new EchoInputDoFn(configuration, "Echo"))) 70 | .apply("Echo inputs round 2", ParDo.of(new EchoInputDoFn(configuration, "EchoAgain"))); 71 | 72 | p.run(); 73 | } 74 | 75 | /** Simple DoFn that echos the element, used as an example of running a C++ library. 
*/ 76 | @SuppressWarnings("serial") 77 | public static class EchoInputDoFn extends DoFn, KV> { 78 | 79 | private static final Logger LOG = LoggerFactory.getLogger(EchoInputDoFn.class); 80 | 81 | private SubProcessConfiguration configuration; 82 | private String binaryName; 83 | 84 | public EchoInputDoFn(SubProcessConfiguration configuration, String binary) { 85 | // Pass in configuration information the name of the filename of the sub-process and the level 86 | // of concurrency 87 | this.configuration = configuration; 88 | this.binaryName = binary; 89 | } 90 | 91 | @Setup 92 | public void setUp() throws Exception { 93 | CallingSubProcessUtils.setUp(configuration, binaryName); 94 | } 95 | 96 | @ProcessElement 97 | public void processElement(ProcessContext c) throws Exception { 98 | try { 99 | // Our Library takes a single command in position 0 which it will echo back in the result 100 | SubProcessCommandLineArgs commands = new SubProcessCommandLineArgs(); 101 | Command command = new Command(0, String.valueOf(c.element().getValue())); 102 | commands.putCommand(command); 103 | 104 | // The ProcessingKernel deals with the execution of the process 105 | SubProcessKernel kernel = new SubProcessKernel(configuration, binaryName); 106 | 107 | // Run the command and work through the results 108 | List results = kernel.exec(commands); 109 | for (String s : results) { 110 | c.output(KV.of(c.element().getKey(), s)); 111 | } 112 | } catch (Exception ex) { 113 | LOG.error("Error processing element ", ex); 114 | throw ex; 115 | } 116 | } 117 | } 118 | 119 | private static String getTestShellEcho() { 120 | return "#!/bin/sh\n" + "filename=$1;\n" + "echo $2 >> $filename;"; 121 | } 122 | 123 | private static String getTestShellEchoAgain() { 124 | return "#!/bin/sh\n" 125 | + "filename=$1;\n" 126 | + "echo \"You again? 
Well ok, here is your word again.\" >> $2 >> $filename;"; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/SubProcessPipelineOptions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.subprocess; 19 | 20 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration; 21 | import org.apache.beam.sdk.options.Default; 22 | import org.apache.beam.sdk.options.DefaultValueFactory; 23 | import org.apache.beam.sdk.options.Description; 24 | import org.apache.beam.sdk.options.PipelineOptions; 25 | import org.apache.beam.sdk.options.Validation.Required; 26 | 27 | /** Options for running a sub process within a DoFn. 
*/ 28 | public interface SubProcessPipelineOptions extends PipelineOptions { 29 | 30 | @Description("Source GCS directory where the C++ library is located gs://bucket/tests") 31 | @Required 32 | String getSourcePath(); 33 | 34 | void setSourcePath(String sourcePath); 35 | 36 | @Description("Working directory for the process I/O") 37 | @Default.String("/tmp/grid_working_files") 38 | String getWorkerPath(); 39 | 40 | void setWorkerPath(String workerPath); 41 | 42 | @Description("The maximum time to wait for the sub-process to complete") 43 | @Default.Integer(3600) 44 | Integer getWaitTime(); 45 | 46 | void setWaitTime(Integer waitTime); 47 | 48 | @Description("As sub-processes can be heavy weight define the level of concurrency level") 49 | @Required 50 | Integer getConcurrency(); 51 | 52 | void setConcurrency(Integer concurrency); 53 | 54 | @Description("Should log files only be uploaded if error.") 55 | @Default.Boolean(true) 56 | Boolean getOnlyUpLoadLogsOnError(); 57 | 58 | void setOnlyUpLoadLogsOnError(Boolean onlyUpLoadLogsOnError); 59 | 60 | @Default.InstanceFactory(SubProcessConfigurationFactory.class) 61 | SubProcessConfiguration getSubProcessConfiguration(); 62 | 63 | void setSubProcessConfiguration(SubProcessConfiguration configuration); 64 | 65 | /** Confirm Configuration and return a configuration object used in pipeline. 
*/ 66 | class SubProcessConfigurationFactory implements DefaultValueFactory { 67 | @Override 68 | public SubProcessConfiguration create(PipelineOptions options) { 69 | 70 | SubProcessPipelineOptions subProcessPipelineOptions = (SubProcessPipelineOptions) options; 71 | 72 | SubProcessConfiguration configuration = new SubProcessConfiguration(); 73 | 74 | if (subProcessPipelineOptions.getSourcePath() == null) { 75 | throw new IllegalStateException("Source path must be set"); 76 | } 77 | if (subProcessPipelineOptions.getConcurrency() == null 78 | || subProcessPipelineOptions.getConcurrency() == 0) { 79 | throw new IllegalStateException("Concurrency must be set and be > 0"); 80 | } 81 | configuration.setSourcePath(subProcessPipelineOptions.getSourcePath()); 82 | configuration.setWorkerPath(subProcessPipelineOptions.getWorkerPath()); 83 | configuration.setWaitTime(subProcessPipelineOptions.getWaitTime()); 84 | configuration.setOnlyUpLoadLogsOnError(subProcessPipelineOptions.getOnlyUpLoadLogsOnError()); 85 | configuration.concurrency = subProcessPipelineOptions.getConcurrency(); 86 | 87 | return configuration; 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/configuration/SubProcessConfiguration.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.subprocess.configuration; 19 | 20 | import java.io.Serializable; 21 | 22 | /** 23 | * Configuration file used to setup the Process kernel for execution of the external library Values 24 | * are copied from the Options to all them to be Serializable. 25 | */ 26 | @SuppressWarnings({"serial", "nullness"}) // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 27 | public class SubProcessConfiguration implements Serializable { 28 | 29 | // Source GCS directory where the C++ library is located gs://bucket/tests 30 | public String sourcePath; 31 | 32 | // Working directory for the process I/O 33 | public String workerPath; 34 | 35 | // The maximum time to wait for the sub-process to complete 36 | public Integer waitTime; 37 | 38 | // "As sub-processes can be heavy weight match the concurrency level to num cores on the machines" 39 | public Integer concurrency; 40 | 41 | // Should log files only be uploaded if error 42 | public Boolean onlyUpLoadLogsOnError; 43 | 44 | public Boolean getOnlyUpLoadLogsOnError() { 45 | return onlyUpLoadLogsOnError; 46 | } 47 | 48 | public void setOnlyUpLoadLogsOnError(Boolean onlyUpLoadLogsOnError) { 49 | this.onlyUpLoadLogsOnError = onlyUpLoadLogsOnError; 50 | } 51 | 52 | public String getSourcePath() { 53 | return sourcePath; 54 | } 55 | 56 | public void setSourcePath(String sourcePath) { 57 | this.sourcePath = sourcePath; 58 | } 59 | 60 | public String getWorkerPath() { 61 | return workerPath; 62 | } 63 | 64 | public void setWorkerPath(String workerPath) 
{ 65 | this.workerPath = workerPath; 66 | } 67 | 68 | public Integer getWaitTime() { 69 | return waitTime; 70 | } 71 | 72 | public void setWaitTime(Integer waitTime) { 73 | this.waitTime = waitTime; 74 | } 75 | 76 | public Integer getConcurrency() { 77 | return concurrency; 78 | } 79 | 80 | public void setConcurrency(Integer concurrency) { 81 | this.concurrency = concurrency; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/kernel/SubProcessCommandLineArgs.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.subprocess.kernel; 19 | 20 | import java.util.List; 21 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists; 22 | 23 | /** Parameters to the sub-process, has tuple of ordinal position and the value. 
*/ 24 | @SuppressWarnings({ 25 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 26 | }) 27 | public class SubProcessCommandLineArgs { 28 | 29 | // Parameters to pass to the sub-process 30 | private List parameters = Lists.newArrayList(); 31 | 32 | public void addCommand(Integer position, String value) { 33 | parameters.add(new Command(position, value)); 34 | } 35 | 36 | public void putCommand(Command command) { 37 | parameters.add(command); 38 | } 39 | 40 | public List getParameters() { 41 | return parameters; 42 | } 43 | 44 | /** Class used to store the SubProcces parameters. */ 45 | public static class Command { 46 | 47 | // The ordinal position of the command to pass to the sub-process 48 | int ordinalPosition; 49 | String value; 50 | 51 | @SuppressWarnings("unused") 52 | private Command() {} 53 | 54 | public Command(int ordinalPosition, String value) { 55 | this.ordinalPosition = ordinalPosition; 56 | this.value = value; 57 | } 58 | 59 | public int getKey() { 60 | return ordinalPosition; 61 | } 62 | 63 | public void setKey(int key) { 64 | this.ordinalPosition = key; 65 | } 66 | 67 | public String getValue() { 68 | return value; 69 | } 70 | 71 | public void setValue(String value) { 72 | this.value = value; 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/kernel/SubProcessIOFiles.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.subprocess.kernel; 19 | 20 | import java.io.Closeable; 21 | import java.io.IOException; 22 | import java.nio.file.Files; 23 | import java.nio.file.Path; 24 | import java.nio.file.Paths; 25 | import java.util.UUID; 26 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration; 27 | import org.apache.beam.examples.subprocess.utils.FileUtils; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | 31 | /** 32 | * All information generated from the process will be stored in output files. The local working 33 | * directory is used to generate three files with extension .err for standard error output .out for 34 | * standard out output .ret for storing the results from the called library. 
The files will have a 35 | * uuid created for them based on java.util.UUID 36 | */ 37 | public class SubProcessIOFiles implements Closeable { 38 | 39 | private static final Logger LOG = LoggerFactory.getLogger(SubProcessIOFiles.class); 40 | 41 | Path errFile; 42 | Path outFile; 43 | Path resultFile; 44 | Path base; 45 | 46 | String errFileLocation = ""; 47 | String outFileLocation = ""; 48 | String uuid; 49 | 50 | public String getErrFileLocation() { 51 | return errFileLocation; 52 | } 53 | 54 | public String getOutFileLocation() { 55 | return outFileLocation; 56 | } 57 | 58 | /** @param workerWorkingDirectory */ 59 | public SubProcessIOFiles(String workerWorkingDirectory) { 60 | 61 | this.uuid = UUID.randomUUID().toString(); 62 | base = Paths.get(workerWorkingDirectory); 63 | 64 | // Setup all the redirect handles, including the return file type 65 | errFile = Paths.get(base.toString(), uuid + ".err"); 66 | outFile = Paths.get(base.toString(), uuid + ".out"); 67 | resultFile = Paths.get(base.toString(), uuid + ".res"); 68 | } 69 | 70 | public Path getErrFile() { 71 | return errFile; 72 | } 73 | 74 | public Path getOutFile() { 75 | return outFile; 76 | } 77 | 78 | public Path getResultFile() { 79 | return resultFile; 80 | } 81 | 82 | /** 83 | * Clean up the files that have been created on the local worker file system. Without this expect 84 | * both performance issues and eventual failure 85 | */ 86 | @Override 87 | public void close() throws IOException { 88 | 89 | if (Files.exists(outFile)) { 90 | Files.delete(outFile); 91 | } 92 | 93 | if (Files.exists(errFile)) { 94 | Files.delete(errFile); 95 | } 96 | 97 | if (Files.exists(resultFile)) { 98 | Files.delete(resultFile); 99 | } 100 | } 101 | 102 | /** 103 | * Will copy the output files to the GCS path setup via the configuration. 
104 | * 105 | * @param configuration 106 | * @param params 107 | */ 108 | public void copyOutPutFilesToBucket(SubProcessConfiguration configuration, String params) { 109 | if (Files.exists(outFile) || Files.exists(errFile)) { 110 | try { 111 | outFileLocation = FileUtils.copyFileFromWorkerToGCS(configuration, outFile); 112 | } catch (Exception ex) { 113 | LOG.error("Error uploading log file to storage ", ex); 114 | } 115 | 116 | try { 117 | errFileLocation = FileUtils.copyFileFromWorkerToGCS(configuration, errFile); 118 | } catch (Exception ex) { 119 | LOG.error("Error uploading log file to storage ", ex); 120 | } 121 | 122 | LOG.info( 123 | String.format( 124 | "Log Files for process: %s outFile was: %s errFile was: %s", 125 | params, outFileLocation, errFileLocation)); 126 | } else { 127 | LOG.error(String.format("There was no output file or err file for process %s", params)); 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/kernel/SubProcessKernel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.subprocess.kernel; 19 | 20 | import java.io.IOException; 21 | import java.lang.ProcessBuilder.Redirect; 22 | import java.nio.file.Files; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import java.util.concurrent.TimeUnit; 26 | import java.util.stream.Stream; 27 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration; 28 | import org.apache.beam.examples.subprocess.utils.CallingSubProcessUtils; 29 | import org.apache.beam.examples.subprocess.utils.FileUtils; 30 | import org.slf4j.Logger; 31 | import org.slf4j.LoggerFactory; 32 | 33 | /** 34 | * This is the process kernel which deals with exec of the subprocess. It also deals with all I/O. 35 | */ 36 | @SuppressWarnings({ 37 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 38 | }) 39 | public class SubProcessKernel { 40 | 41 | private static final Logger LOG = LoggerFactory.getLogger(SubProcessKernel.class); 42 | 43 | private static final int MAX_SIZE_COMMAND_LINE_ARGS = 128 * 1024; 44 | 45 | SubProcessConfiguration configuration; 46 | ProcessBuilder processBuilder; 47 | 48 | private SubProcessKernel() {} 49 | 50 | /** 51 | * Creates the SubProcess Kernel ready for execution. 
Will deal with all input and outputs to the 52 | * SubProcess 53 | * 54 | * @param options 55 | * @param binaryName 56 | */ 57 | public SubProcessKernel(SubProcessConfiguration options, String binaryName) { 58 | this.configuration = options; 59 | this.processBuilder = new ProcessBuilder(binaryName); 60 | } 61 | 62 | public List exec(SubProcessCommandLineArgs commands) throws Exception { 63 | try (CallingSubProcessUtils.Permit permit = 64 | new CallingSubProcessUtils.Permit(processBuilder.command().get(0))) { 65 | 66 | List results = null; 67 | 68 | try (SubProcessIOFiles outputFiles = new SubProcessIOFiles(configuration.getWorkerPath())) { 69 | 70 | try { 71 | Process process = execBinary(processBuilder, commands, outputFiles); 72 | results = collectProcessResults(process, processBuilder, outputFiles); 73 | } catch (Exception ex) { 74 | LOG.error("Error running executable ", ex); 75 | throw ex; 76 | } 77 | } catch (IOException ex) { 78 | LOG.error( 79 | "Unable to delete the outputfiles. This can lead to performance issues and failure", 80 | ex); 81 | } 82 | return results; 83 | } 84 | } 85 | 86 | public byte[] execBinaryResult(SubProcessCommandLineArgs commands) throws Exception { 87 | try (CallingSubProcessUtils.Permit permit = 88 | new CallingSubProcessUtils.Permit(processBuilder.command().get(0))) { 89 | 90 | try (SubProcessIOFiles outputFiles = new SubProcessIOFiles(configuration.getWorkerPath())) { 91 | 92 | try { 93 | Process process = execBinary(processBuilder, commands, outputFiles); 94 | return collectProcessResultsBytes(process, processBuilder, outputFiles); 95 | } catch (Exception ex) { 96 | LOG.error("Error running executable ", ex); 97 | throw ex; 98 | } 99 | } catch (IOException ex) { 100 | LOG.error( 101 | "Unable to delete the outputfiles. 
This can lead to performance issues and failure", 102 | ex); 103 | } 104 | return new byte[0]; 105 | } 106 | } 107 | 108 | private ProcessBuilder prepareBuilder( 109 | ProcessBuilder builder, SubProcessCommandLineArgs commands, SubProcessIOFiles outPutFiles) 110 | throws IllegalStateException { 111 | 112 | // Check we are not over the max size of command line parameters 113 | if (getTotalCommandBytes(commands) > MAX_SIZE_COMMAND_LINE_ARGS) { 114 | throw new IllegalStateException("Command is over 2MB in size"); 115 | } 116 | 117 | appendExecutablePath(builder); 118 | 119 | // Add the result file path to the builder at position 1, 0 is reserved for the process itself 120 | builder.command().add(1, outPutFiles.resultFile.toString()); 121 | 122 | // Shift commands by 2 ordinal positions and load into the builder 123 | for (SubProcessCommandLineArgs.Command s : commands.getParameters()) { 124 | builder.command().add(s.ordinalPosition + 2, s.value); 125 | } 126 | 127 | builder.redirectError(Redirect.appendTo(outPutFiles.errFile.toFile())); 128 | builder.redirectOutput(Redirect.appendTo(outPutFiles.outFile.toFile())); 129 | 130 | return builder; 131 | } 132 | 133 | /** 134 | * Add up the total bytes used by the process. 
135 | * 136 | * @param commands 137 | * @return 138 | */ 139 | private int getTotalCommandBytes(SubProcessCommandLineArgs commands) { 140 | int size = 0; 141 | for (SubProcessCommandLineArgs.Command c : commands.getParameters()) { 142 | size += c.value.length(); 143 | } 144 | return size; 145 | } 146 | 147 | private Process execBinary( 148 | ProcessBuilder builder, SubProcessCommandLineArgs commands, SubProcessIOFiles outPutFiles) 149 | throws Exception { 150 | try { 151 | 152 | builder = prepareBuilder(builder, commands, outPutFiles); 153 | Process process = builder.start(); 154 | 155 | boolean timeout = !process.waitFor(configuration.getWaitTime(), TimeUnit.SECONDS); 156 | 157 | if (timeout) { 158 | String log = 159 | String.format( 160 | "Timeout waiting to run process with parameters %s . " 161 | + "Check to see if your timeout is long enough. Currently set at %s.", 162 | createLogEntryFromInputs(builder.command()), configuration.getWaitTime()); 163 | throw new Exception(log); 164 | } 165 | return process; 166 | 167 | } catch (Exception ex) { 168 | 169 | LOG.error( 170 | String.format( 171 | "Error running process with parameters %s error was %s ", 172 | createLogEntryFromInputs(builder.command()), ex.getMessage())); 173 | throw new Exception(ex); 174 | } 175 | } 176 | 177 | /** 178 | * TODO clean up duplicate with byte[] version collectBinaryProcessResults. 
179 | * 180 | * @param process 181 | * @param builder 182 | * @param outPutFiles 183 | * @return List of results 184 | * @throws Exception if process has non 0 value or no logs found then throw exception 185 | */ 186 | private List collectProcessResults( 187 | Process process, ProcessBuilder builder, SubProcessIOFiles outPutFiles) throws Exception { 188 | 189 | List results = new ArrayList<>(); 190 | 191 | try { 192 | 193 | LOG.debug(String.format("Executing process %s", createLogEntryFromInputs(builder.command()))); 194 | 195 | // If process exit value is not 0 then subprocess failed, record logs 196 | if (process.exitValue() != 0) { 197 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder)); 198 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles); 199 | throw new Exception(log); 200 | } 201 | 202 | // If no return file then either something went wrong or the binary is setup incorrectly for 203 | // the ret file either way throw error 204 | if (!Files.exists(outPutFiles.resultFile)) { 205 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles); 206 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder)); 207 | throw new Exception(log); 208 | } 209 | 210 | // Everything looks healthy return values 211 | try (Stream lines = Files.lines(outPutFiles.resultFile)) { 212 | for (String line : (Iterable) lines::iterator) { 213 | results.add(line); 214 | } 215 | } 216 | return results; 217 | } catch (Exception ex) { 218 | String log = 219 | String.format( 220 | "Unexpected error runnng process. %s error message was %s", 221 | createLogEntryFromInputs(builder.command()), ex.getMessage()); 222 | throw new Exception(log); 223 | } 224 | } 225 | 226 | /** 227 | * Used when the reault file contains binary data. 
228 | * 229 | * @param process 230 | * @param builder 231 | * @param outPutFiles 232 | * @return Binary results 233 | * @throws Exception if process has non 0 value or no logs found then throw exception 234 | */ 235 | private byte[] collectProcessResultsBytes( 236 | Process process, ProcessBuilder builder, SubProcessIOFiles outPutFiles) throws Exception { 237 | 238 | Byte[] results; 239 | 240 | try { 241 | 242 | LOG.debug(String.format("Executing process %s", createLogEntryFromInputs(builder.command()))); 243 | 244 | // If process exit value is not 0 then subprocess failed, record logs 245 | if (process.exitValue() != 0) { 246 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder)); 247 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles); 248 | throw new Exception(log); 249 | } 250 | 251 | // If no return file then either something went wrong or the binary is setup incorrectly for 252 | // the ret file either way throw error 253 | if (!Files.exists(outPutFiles.resultFile)) { 254 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles); 255 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder)); 256 | throw new Exception(log); 257 | } 258 | 259 | // Everything looks healthy return bytes 260 | return Files.readAllBytes(outPutFiles.resultFile); 261 | 262 | } catch (Exception ex) { 263 | String log = 264 | String.format( 265 | "Unexpected error runnng process. 
%s error message was %s", 266 | createLogEntryFromInputs(builder.command()), ex.getMessage()); 267 | throw new Exception(log); 268 | } 269 | } 270 | 271 | private static String createLogEntryForProcessFailure( 272 | Process process, List commands, SubProcessIOFiles files) { 273 | 274 | StringBuilder stringBuilder = new StringBuilder(); 275 | 276 | // Highlight when no result file is found vs standard process error 277 | if (process.exitValue() == 0) { 278 | stringBuilder.append(String.format("%nProcess succeded but no result file was found %n")); 279 | } else { 280 | stringBuilder.append( 281 | String.format("%nProcess error failed with exit value of %s %n", process.exitValue())); 282 | } 283 | 284 | stringBuilder.append( 285 | String.format("Command info was %s %n", createLogEntryFromInputs(commands))); 286 | 287 | stringBuilder.append( 288 | String.format( 289 | "First line of error file is %s %n", FileUtils.readLineOfLogFile(files.errFile))); 290 | 291 | stringBuilder.append( 292 | String.format( 293 | "First line of out file is %s %n", FileUtils.readLineOfLogFile(files.outFile))); 294 | 295 | stringBuilder.append( 296 | String.format( 297 | "First line of ret file is %s %n", FileUtils.readLineOfLogFile(files.resultFile))); 298 | 299 | return stringBuilder.toString(); 300 | } 301 | 302 | private static String createLogEntryFromInputs(List commands) { 303 | String params; 304 | if (commands != null) { 305 | params = String.join(",", commands); 306 | } else { 307 | params = "No-Commands"; 308 | } 309 | return params; 310 | } 311 | 312 | // Pass the Path of the binary to the SubProcess in Command position 0 313 | private ProcessBuilder appendExecutablePath(ProcessBuilder builder) { 314 | String executable = builder.command().get(0); 315 | if (executable == null) { 316 | throw new IllegalArgumentException( 317 | "No executable provided to the Process Builder... we will do... nothing... 
"); 318 | } 319 | builder 320 | .command() 321 | .set(0, FileUtils.getFileResourceId(configuration.getWorkerPath(), executable).toString()); 322 | return builder; 323 | } 324 | } 325 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/utils/CallingSubProcessUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples.subprocess.utils; 19 | 20 | import java.util.Map; 21 | import java.util.Set; 22 | import java.util.concurrent.ConcurrentHashMap; 23 | import java.util.concurrent.Semaphore; 24 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration; 25 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets; 26 | import org.slf4j.Logger; 27 | import org.slf4j.LoggerFactory; 28 | 29 | /** Utility class for dealing with concurrency and binary file copies to the worker. 
*/ 30 | public class CallingSubProcessUtils { 31 | 32 | // Prevent Instantiation 33 | private CallingSubProcessUtils() {} 34 | 35 | private static final Logger LOG = LoggerFactory.getLogger(CallingSubProcessUtils.class); 36 | 37 | static boolean initCompleted = false; 38 | 39 | // Allow multiple subclasses to create files, but only one thread per subclass can add the file to 40 | // the worker 41 | private static final Set downloadedFiles = Sets.newConcurrentHashSet(); 42 | 43 | // Limit the number of threads able to do work 44 | private static Map semaphores = new ConcurrentHashMap<>(); 45 | 46 | public static void setUp(SubProcessConfiguration configuration, String binaryName) 47 | throws Exception { 48 | 49 | if (!semaphores.containsKey(binaryName)) { 50 | initSemaphore(configuration.getConcurrency(), binaryName); 51 | } 52 | 53 | synchronized (downloadedFiles) { 54 | if (!downloadedFiles.contains(binaryName)) { 55 | // Create Directories if needed 56 | FileUtils.createDirectoriesOnWorker(configuration); 57 | LOG.info("Calling filesetup to move Executables to worker."); 58 | ExecutableFile executableFile = new ExecutableFile(configuration, binaryName); 59 | FileUtils.copyFileFromGCSToWorker(executableFile); 60 | downloadedFiles.add(binaryName); 61 | } 62 | } 63 | } 64 | 65 | public static synchronized void initSemaphore(Integer permits, String binaryName) { 66 | if (!semaphores.containsKey(binaryName)) { 67 | LOG.info(String.format(String.format("Initialized Semaphore for binary %s ", binaryName))); 68 | semaphores.put(binaryName, new Semaphore(permits)); 69 | } 70 | } 71 | 72 | private static void aquireSemaphore(String binaryName) throws IllegalStateException { 73 | if (!semaphores.containsKey(binaryName)) { 74 | throw new IllegalStateException("Semaphore is NULL, check init logic in @Setup."); 75 | } 76 | try { 77 | semaphores.get(binaryName).acquire(); 78 | } catch (InterruptedException ex) { 79 | LOG.error("Interupted during aquire", ex); 80 | } 81 | } 82 | 
83 | private static void releaseSemaphore(String binaryName) throws IllegalStateException { 84 | if (!semaphores.containsKey(binaryName)) { 85 | throw new IllegalStateException("Semaphore is NULL, check init logic in @Setup."); 86 | } 87 | semaphores.get(binaryName).release(); 88 | } 89 | 90 | /** Permit class for access to worker cpu resources. */ 91 | public static class Permit implements AutoCloseable { 92 | 93 | private String binaryName; 94 | 95 | public Permit(String binaryName) { 96 | this.binaryName = binaryName; 97 | CallingSubProcessUtils.aquireSemaphore(binaryName); 98 | } 99 | 100 | @Override 101 | public void close() { 102 | CallingSubProcessUtils.releaseSemaphore(binaryName); 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/utils/ExecutableFile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples.subprocess.utils; 19 | 20 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration; 21 | import org.apache.beam.sdk.coders.AvroCoder; 22 | import org.apache.beam.sdk.coders.DefaultCoder; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | /** Contains the configuration for the external library. */ 27 | @DefaultCoder(AvroCoder.class) 28 | @SuppressWarnings({ 29 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 30 | }) 31 | public class ExecutableFile { 32 | 33 | String fileName; 34 | 35 | private String sourceGCSLocation; 36 | private String destinationLocation; 37 | 38 | private static final Logger LOG = LoggerFactory.getLogger(ExecutableFile.class); 39 | 40 | public String getSourceGCSLocation() { 41 | return sourceGCSLocation; 42 | } 43 | 44 | public void setSourceGCSLocation(String sourceGCSLocation) { 45 | this.sourceGCSLocation = sourceGCSLocation; 46 | } 47 | 48 | public String getDestinationLocation() { 49 | return destinationLocation; 50 | } 51 | 52 | public void setDestinationLocation(String destinationLocation) { 53 | this.destinationLocation = destinationLocation; 54 | } 55 | 56 | public ExecutableFile(SubProcessConfiguration configuration, String fileName) 57 | throws IllegalStateException { 58 | if (configuration == null) { 59 | throw new IllegalStateException("Configuration can not be NULL"); 60 | } 61 | if (fileName == null) { 62 | throw new IllegalStateException("FileName can not be NULLt"); 63 | } 64 | this.fileName = fileName; 65 | setDestinationLocation(configuration); 66 | setSourceLocation(configuration); 67 | } 68 | 69 | private void setDestinationLocation(SubProcessConfiguration configuration) { 70 | this.sourceGCSLocation = 71 | FileUtils.getFileResourceId(configuration.getSourcePath(), fileName).toString(); 72 | } 73 | 74 | private void setSourceLocation(SubProcessConfiguration configuration) { 75 | this.destinationLocation 
= 76 | FileUtils.getFileResourceId(configuration.getWorkerPath(), fileName).toString(); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /examples/java8/src/main/java/org/apache/beam/examples/subprocess/utils/FileUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.beam.examples.subprocess.utils; 19 | 20 | import static java.nio.charset.StandardCharsets.UTF_8; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.nio.ByteBuffer; 25 | import java.nio.channels.ReadableByteChannel; 26 | import java.nio.channels.WritableByteChannel; 27 | import java.nio.file.FileAlreadyExistsException; 28 | import java.nio.file.Files; 29 | import java.nio.file.Path; 30 | import java.nio.file.Paths; 31 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration; 32 | import org.apache.beam.sdk.io.FileSystems; 33 | import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; 34 | import org.apache.beam.sdk.io.fs.ResourceId; 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | /** Utilities for dealing with movement of files from object stores and workers. */ 39 | @SuppressWarnings({ 40 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402) 41 | }) 42 | public class FileUtils { 43 | 44 | private static final Logger LOG = LoggerFactory.getLogger(FileUtils.class); 45 | 46 | public static ResourceId getFileResourceId(String directory, String fileName) { 47 | ResourceId resourceID = FileSystems.matchNewResource(directory, true); 48 | return resourceID.getCurrentDirectory().resolve(fileName, StandardResolveOptions.RESOLVE_FILE); 49 | } 50 | 51 | public static String toStringParams(ProcessBuilder builder) { 52 | return String.join(",", builder.command()); 53 | } 54 | 55 | public static String copyFileFromWorkerToGCS( 56 | SubProcessConfiguration configuration, Path fileToUpload) throws Exception { 57 | 58 | Path fileName; 59 | 60 | if ((fileName = fileToUpload.getFileName()) == null) { 61 | throw new IllegalArgumentException("FileName can not be null."); 62 | } 63 | 64 | ResourceId sourceFile = getFileResourceId(configuration.getWorkerPath(), fileName.toString()); 65 | 66 | LOG.info("Copying file from worker " + 
sourceFile); 67 | 68 | ResourceId destinationFile = 69 | getFileResourceId(configuration.getSourcePath(), fileName.toString()); 70 | // TODO currently not supported with different schemas for example GCS to local, else could use 71 | // FileSystems.copy(ImmutableList.of(sourceFile), ImmutableList.of(destinationFile)); 72 | try { 73 | return copyFile(sourceFile, destinationFile); 74 | } catch (Exception ex) { 75 | LOG.error( 76 | String.format("Error copying file from %s to %s", sourceFile, destinationFile), ex); 77 | throw ex; 78 | } 79 | } 80 | 81 | public static String copyFileFromGCSToWorker(ExecutableFile execuableFile) throws Exception { 82 | 83 | ResourceId sourceFile = 84 | FileSystems.matchNewResource(execuableFile.getSourceGCSLocation(), false); 85 | ResourceId destinationFile = 86 | FileSystems.matchNewResource(execuableFile.getDestinationLocation(), false); 87 | try { 88 | LOG.info( 89 | String.format( 90 | "Moving File %s to %s ", 91 | execuableFile.getSourceGCSLocation(), execuableFile.getDestinationLocation())); 92 | Path path = Paths.get(execuableFile.getDestinationLocation()); 93 | 94 | if (path.toFile().exists()) { 95 | LOG.warn( 96 | String.format( 97 | "Overwriting file %s, should only see this once per worker.", 98 | execuableFile.getDestinationLocation())); 99 | } 100 | copyFile(sourceFile, destinationFile); 101 | path.toFile().setExecutable(true); 102 | return path.toString(); 103 | 104 | } catch (Exception ex) { 105 | LOG.error(String.format("Error moving file : %s ", execuableFile.fileName), ex); 106 | throw ex; 107 | } 108 | } 109 | 110 | public static String copyFile(ResourceId sourceFile, ResourceId destinationFile) 111 | throws IOException { 112 | 113 | try (WritableByteChannel writeChannel = FileSystems.create(destinationFile, "text/plain")) { 114 | try (ReadableByteChannel readChannel = FileSystems.open(sourceFile)) { 115 | 116 | final ByteBuffer buffer = ByteBuffer.allocateDirect(16 * 1024); 117 | while (readChannel.read(buffer) != 
-1) { 118 | buffer.flip(); 119 | writeChannel.write(buffer); 120 | buffer.compact(); 121 | } 122 | buffer.flip(); 123 | while (buffer.hasRemaining()) { 124 | writeChannel.write(buffer); 125 | } 126 | } 127 | } 128 | 129 | return destinationFile.toString(); 130 | } 131 | 132 | /** 133 | * Create directories needed based on configuration. 134 | * 135 | * @param configuration 136 | * @throws IOException 137 | */ 138 | public static void createDirectoriesOnWorker(SubProcessConfiguration configuration) 139 | throws IOException { 140 | 141 | try { 142 | 143 | Path path = Paths.get(configuration.getWorkerPath()); 144 | 145 | if (!path.toFile().exists()) { 146 | Files.createDirectories(path); 147 | LOG.info(String.format("Created Folder %s ", path.toFile())); 148 | } 149 | } catch (FileAlreadyExistsException ex) { 150 | LOG.warn( 151 | String.format( 152 | " Tried to create folder %s which already existsed, this should not happen!", 153 | configuration.getWorkerPath()), 154 | ex); 155 | } 156 | } 157 | 158 | public static String readLineOfLogFile(Path path) { 159 | 160 | try (BufferedReader br = Files.newBufferedReader(Paths.get(path.toString()), UTF_8)) { 161 | return br.readLine(); 162 | } catch (IOException e) { 163 | LOG.error("Error reading the first line of file", e); 164 | } 165 | 166 | // `return empty string rather than NULL string as this data is often used in further logging 167 | return ""; 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /examples/java8/src/test/java/org/apache/beam/examples/DebuggingWordCountTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import java.io.File; 21 | import java.nio.charset.StandardCharsets; 22 | import org.apache.beam.examples.DebuggingWordCount.WordCountOptions; 23 | import org.apache.beam.sdk.testing.TestPipeline; 24 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.Files; 25 | import org.junit.Rule; 26 | import org.junit.Test; 27 | import org.junit.rules.TemporaryFolder; 28 | import org.junit.runner.RunWith; 29 | import org.junit.runners.JUnit4; 30 | 31 | /** Tests for {@link DebuggingWordCount}. 
*/ 32 | @RunWith(JUnit4.class) 33 | public class DebuggingWordCountTest { 34 | @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); 35 | 36 | private String getFilePath(String filePath) { 37 | if (filePath.contains(":")) { 38 | return filePath.replace("\\", "/").split(":", -1)[1]; 39 | } 40 | return filePath; 41 | } 42 | 43 | @Test 44 | public void testDebuggingWordCount() throws Exception { 45 | File inputFile = tmpFolder.newFile(); 46 | File outputFile = tmpFolder.newFile(); 47 | Files.write( 48 | "stomach secret Flourish message Flourish here Flourish", 49 | inputFile, 50 | StandardCharsets.UTF_8); 51 | WordCountOptions options = TestPipeline.testingPipelineOptions().as(WordCountOptions.class); 52 | options.setInputFile(getFilePath(inputFile.getAbsolutePath())); 53 | options.setOutput(getFilePath(outputFile.getAbsolutePath())); 54 | DebuggingWordCount.runDebuggingWordCount(options); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /examples/java8/src/test/java/org/apache/beam/examples/MinimalWordCountTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples;

import java.io.IOException;
import java.io.Serializable;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.junit.Rule;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.mockito.Mockito;

/**
 * To keep {@link MinimalWordCount} simple, it is not factored or testable. This test file should be
 * maintained with a copy of its code for a basic smoke test.
 */
@RunWith(JUnit4.class)
public class MinimalWordCountTest implements Serializable {

  // Abandoned-node enforcement is disabled because the pipeline is constructed but the
  // TextIO.write output is never verified by this smoke test.
  @Rule public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  /** A basic smoke test that ensures there is no crash at pipeline construction time.
 */
  @Test
  public void testMinimalWordCount() throws Exception {
    // Install a mocked GcsUtil so no real GCS access happens during construction.
    p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil());

    // Copy of the MinimalWordCount pipeline: split lines into words, drop empties, count,
    // format, and write. Only pipeline *construction* is exercised here.
    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
        .apply(
            FlatMapElements.into(TypeDescriptors.strings())
                .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))))
        .apply(Filter.by((String word) -> !word.isEmpty()))
        .apply(Count.perElement())
        .apply(
            MapElements.into(TypeDescriptors.strings())
                .via(
                    (KV<String, Long> wordCount) ->
                        wordCount.getKey() + ": " + wordCount.getValue()))
        .apply(TextIO.write().to("gs://your-output-bucket/and-output-prefix"));
  }

  /** Builds a GcsUtil mock whose open/expand calls never touch real GCS. */
  private GcsUtil buildMockGcsUtil() throws IOException {
    GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);

    // Any request to open gets a new bogus channel
    Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class)))
        .then(
            invocation ->
                FileChannel.open(
                    Files.createTempFile("channel-", ".tmp"),
                    StandardOpenOption.CREATE,
                    StandardOpenOption.DELETE_ON_CLOSE));

    // Any request for expansion returns a list containing the original GcsPath
    // This is required to pass validation that occurs in TextIO during apply()
    Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class)))
        .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));

    return mockGcsUtil;
  }
}
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/WordCountTest.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.beam.examples; 19 | 20 | import java.util.Arrays; 21 | import java.util.List; 22 | import org.apache.beam.examples.WordCount.CountWords; 23 | import org.apache.beam.examples.WordCount.ExtractWordsFn; 24 | import org.apache.beam.examples.WordCount.FormatAsTextFn; 25 | import org.apache.beam.sdk.coders.StringUtf8Coder; 26 | import org.apache.beam.sdk.testing.PAssert; 27 | import org.apache.beam.sdk.testing.TestPipeline; 28 | import org.apache.beam.sdk.testing.ValidatesRunner; 29 | import org.apache.beam.sdk.transforms.Create; 30 | import org.apache.beam.sdk.transforms.DoFn; 31 | import org.apache.beam.sdk.transforms.MapElements; 32 | import org.apache.beam.sdk.transforms.ParDo; 33 | import org.apache.beam.sdk.values.PCollection; 34 | import org.junit.Rule; 35 | import org.junit.Test; 36 | import org.junit.experimental.categories.Category; 37 | import org.junit.runner.RunWith; 38 | import org.junit.runners.JUnit4; 39 | 40 | /** Tests of WordCount. */ 41 | @RunWith(JUnit4.class) 42 | public class WordCountTest { 43 | 44 | /** Example test that tests a specific {@link DoFn}. 
*/ 45 | @Test 46 | public void testExtractWordsFn() throws Exception { 47 | List words = Arrays.asList(" some input words ", " ", " cool ", " foo", " bar"); 48 | PCollection output = 49 | p.apply(Create.of(words).withCoder(StringUtf8Coder.of())) 50 | .apply(ParDo.of(new ExtractWordsFn())); 51 | PAssert.that(output).containsInAnyOrder("some", "input", "words", "cool", "foo", "bar"); 52 | p.run().waitUntilFinish(); 53 | } 54 | 55 | static final String[] WORDS_ARRAY = 56 | new String[] { 57 | "hi there", "hi", "hi sue bob", 58 | "hi sue", "", "bob hi" 59 | }; 60 | 61 | static final List WORDS = Arrays.asList(WORDS_ARRAY); 62 | 63 | static final String[] COUNTS_ARRAY = new String[] {"hi: 5", "there: 1", "sue: 2", "bob: 2"}; 64 | 65 | @Rule public TestPipeline p = TestPipeline.create(); 66 | 67 | /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */ 68 | @Test 69 | @Category(ValidatesRunner.class) 70 | public void testCountWords() throws Exception { 71 | PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); 72 | 73 | PCollection output = 74 | input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn())); 75 | 76 | PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY); 77 | p.run().waitUntilFinish(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /examples/java8/src/test/java/org/apache/beam/examples/complete/game/GameStatsTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete.game;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import org.apache.beam.examples.complete.game.GameStats.CalculateSpammyUsers;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * Tests of GameStats. Because the pipeline was designed for easy readability and explanations, it
 * lacks good modularity for testing. See our testing documentation for better ideas:
 * https://beam.apache.org/documentation/pipelines/test-your-pipeline/
 */
@RunWith(JUnit4.class)
public class GameStatsTest implements Serializable {

  // User scores: (user, score) pairs. The two "Robot-*" users post far higher scores than the
  // rest, which is what CalculateSpammyUsers should flag.
  static final List<KV<String, Integer>> USER_SCORES =
      Arrays.asList(
          KV.of("Robot-2", 66),
          KV.of("Robot-1", 116),
          KV.of("user7_AndroidGreenKookaburra", 23),
          KV.of("user7_AndroidGreenKookaburra", 1),
          KV.of("user19_BisqueBilby", 14),
          KV.of("user13_ApricotQuokka", 15),
          KV.of("user18_BananaEmu", 25),
          KV.of("user6_AmberEchidna", 8),
          KV.of("user2_AmberQuokka", 6),
          KV.of("user0_MagentaKangaroo", 4),
          KV.of("user0_MagentaKangaroo", 3),
          KV.of("user2_AmberCockatoo", 13),
          KV.of("user7_AlmondWallaby", 15),
          KV.of("user6_AmberNumbat", 11),
          KV.of("user6_AmberQuokka", 4));

  // The expected list of 'spammers'.
  static final List<KV<String, Integer>> SPAMMERS =
      Arrays.asList(KV.of("Robot-2", 66), KV.of("Robot-1", 116));

  @Rule public TestPipeline p = TestPipeline.create();

  /** Test the calculation of 'spammy users'. */
  @Test
  @Category(ValidatesRunner.class)
  public void testCalculateSpammyUsers() throws Exception {
    PCollection<KV<String, Integer>> input = p.apply(Create.of(USER_SCORES));
    PCollection<KV<String, Integer>> output = input.apply(new CalculateSpammyUsers());

    // Check the set of spammers.
    PAssert.that(output).containsInAnyOrder(SPAMMERS);

    p.run().waitUntilFinish();
  }

  // Smoke test: the options interface can be instantiated by the factory.
  @Test
  public void testGameStatsOptions() {
    PipelineOptionsFactory.as(GameStats.Options.class);
  }
}
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/complete/game/HourlyTeamScoreTest.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete.game;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import org.apache.beam.examples.complete.game.UserScore.GameActionInfo;
import org.apache.beam.examples.complete.game.UserScore.ParseEventFn;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.joda.time.Instant;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * Tests of HourlyTeamScore. Because the pipeline was designed for easy readability and
 * explanations, it lacks good modularity for testing. See our testing documentation for better
 * ideas: https://beam.apache.org/documentation/pipelines/test-your-pipeline/
 */
@RunWith(JUnit4.class)
@SuppressWarnings({
  "rawtypes" // TODO(https://issues.apache.org/jira/browse/BEAM-10556)
})
public class HourlyTeamScoreTest implements Serializable {

  // Each row: user,team,score,epochMillisTimestamp,humanReadableTime. Two batches separated
  // by a time gap; the filter test cuts between them.
  static final String[] GAME_EVENTS_ARRAY =
      new String[] {
        "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
        "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
        "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
        "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
        "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
        "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
        "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
        "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444",
        // time gap...
        "user0_AndroidGreenEchidna,AndroidGreenEchidna,0,1447965690000,2015-11-19 12:41:31.053",
        "user0_MagentaKangaroo,MagentaKangaroo,4,1447965690000,2015-11-19 12:41:31.053",
        "user2_AmberCockatoo,AmberCockatoo,13,1447965690000,2015-11-19 12:41:31.053",
        "user18_BananaEmu,BananaEmu,7,1447965690000,2015-11-19 12:41:31.053",
        "user3_BananaEmu,BananaEmu,17,1447965690000,2015-11-19 12:41:31.053",
        "user18_BananaEmu,BananaEmu,1,1447965690000,2015-11-19 12:41:31.053",
        "user18_ApricotCaneToad,ApricotCaneToad,14,1447965690000,2015-11-19 12:41:31.053"
      };

  static final List<String> GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);

  // Used to check the filtering: only the second batch (after the time gap) should survive.
  // Raw KV is covered by the class-level "rawtypes" suppression.
  static final KV[] FILTERED_EVENTS =
      new KV[] {
        KV.of("user0_AndroidGreenEchidna", 0),
        KV.of("user0_MagentaKangaroo", 4),
        KV.of("user2_AmberCockatoo", 13),
        KV.of("user18_BananaEmu", 7),
        KV.of("user3_BananaEmu", 17),
        KV.of("user18_BananaEmu", 1),
        KV.of("user18_ApricotCaneToad", 14)
      };

  @Rule public TestPipeline p = TestPipeline.create();

  /** Test the filtering. */
  @Test
  @Category(ValidatesRunner.class)
  public void testUserScoresFilter() throws Exception {

    // Cutoff instant falls inside the "time gap" between the two event batches above.
    final Instant startMinTimestamp = new Instant(1447965680000L);

    PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));

    PCollection<KV<String, Integer>> output =
        input
            .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
            .apply(
                "FilterStartTime",
                Filter.by(
                    (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
            // run a map to access the fields in the result.
            .apply(
                MapElements.into(
                        TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                    .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

    PAssert.that(output).containsInAnyOrder(FILTERED_EVENTS);

    p.run().waitUntilFinish();
  }

  /** Smoke test: the {@code HourlyTeamScore.Options} interface can be instantiated. */
  @Test
  public void testUserScoreOptions() {
    PipelineOptionsFactory.as(HourlyTeamScore.Options.class);
  }
}
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/complete/game/StatefulTeamScoreTest.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.
The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete.game;

import org.apache.beam.examples.complete.game.StatefulTeamScore.UpdateTeamScoreFn;
import org.apache.beam.examples.complete.game.UserScore.GameActionInfo;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.TestStream;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TimestampedValue;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.junit.Rule;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/** Tests for {@link StatefulTeamScore}. */
@RunWith(JUnit4.class)
public class StatefulTeamScoreTest {

  // Epoch origin; every event timestamp is an offset from this instant (see event()).
  private Instant baseTime = new Instant(0);

  @Rule public TestPipeline p = TestPipeline.create();

  /** Some example users, on two separate teams. */
  private enum TestUser {
    RED_ONE("scarlet", "red"),
    RED_TWO("burgundy", "red"),
    BLUE_ONE("navy", "blue"),
    BLUE_TWO("sky", "blue");

    private final String userName;
    private final String teamName;

    TestUser(String userName, String teamName) {
      this.userName = userName;
      this.teamName = teamName;
    }

    public String getUser() {
      return userName;
    }

    public String getTeam() {
      return teamName;
    }
  }

  /**
   * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs
   * correctly for one team.
   */
  @Test
  public void testScoreUpdatesOneTeam() {

    // NOTE(review): the expected outputs suggest UpdateTeamScoreFn(100) emits the running team
    // total each time it crosses another multiple of 100 -- confirm against StatefulTeamScore.
    TestStream<KV<String, GameActionInfo>> createEvents =
        TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
            .advanceWatermarkTo(baseTime)
            .addElements(
                event(TestUser.RED_TWO, 99, Duration.standardSeconds(10)),
                event(TestUser.RED_ONE, 1, Duration.standardSeconds(20)),
                event(TestUser.RED_ONE, 0, Duration.standardSeconds(30)),
                event(TestUser.RED_TWO, 100, Duration.standardSeconds(40)),
                event(TestUser.RED_TWO, 201, Duration.standardSeconds(50)))
            .advanceWatermarkToInfinity();

    PCollection<KV<String, Integer>> teamScores =
        p.apply(createEvents).apply(ParDo.of(new UpdateTeamScoreFn(100)));

    String redTeam = TestUser.RED_ONE.getTeam();

    PAssert.that(teamScores)
        .inWindow(GlobalWindow.INSTANCE)
        .containsInAnyOrder(KV.of(redTeam, 100), KV.of(redTeam, 200), KV.of(redTeam, 401));

    p.run().waitUntilFinish();
  }

  /**
   * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs
   * correctly for multiple teams.
   */
  @Test
  public void testScoreUpdatesPerTeam() {

    TestStream<KV<String, GameActionInfo>> createEvents =
        TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
            .advanceWatermarkTo(baseTime)
            .addElements(
                event(TestUser.RED_ONE, 50, Duration.standardSeconds(10)),
                event(TestUser.RED_TWO, 50, Duration.standardSeconds(20)),
                event(TestUser.BLUE_ONE, 70, Duration.standardSeconds(30)),
                event(TestUser.BLUE_TWO, 80, Duration.standardSeconds(40)),
                event(TestUser.BLUE_TWO, 50, Duration.standardSeconds(50)))
            .advanceWatermarkToInfinity();

    PCollection<KV<String, Integer>> teamScores =
        p.apply(createEvents).apply(ParDo.of(new UpdateTeamScoreFn(100)));

    String redTeam = TestUser.RED_ONE.getTeam();
    String blueTeam = TestUser.BLUE_ONE.getTeam();

    PAssert.that(teamScores)
        .inWindow(GlobalWindow.INSTANCE)
        .containsInAnyOrder(KV.of(redTeam, 100), KV.of(blueTeam, 150), KV.of(blueTeam, 200));

    p.run().waitUntilFinish();
  }

  /**
   * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs
   * correctly per window and per key.
   */
  @Test
  public void testScoreUpdatesPerWindow() {

    // Events in minutes 1-3 land in the first 5-minute window; minutes 6-7 in the second.
    TestStream<KV<String, GameActionInfo>> createEvents =
        TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
            .advanceWatermarkTo(baseTime)
            .addElements(
                event(TestUser.RED_ONE, 50, Duration.standardMinutes(1)),
                event(TestUser.RED_TWO, 50, Duration.standardMinutes(2)),
                event(TestUser.RED_ONE, 50, Duration.standardMinutes(3)),
                event(TestUser.RED_ONE, 60, Duration.standardMinutes(6)),
                event(TestUser.RED_TWO, 60, Duration.standardMinutes(7)))
            .advanceWatermarkToInfinity();

    Duration teamWindowDuration = Duration.standardMinutes(5);

    PCollection<KV<String, Integer>> teamScores =
        p.apply(createEvents)
            .apply(Window.<KV<String, GameActionInfo>>into(FixedWindows.of(teamWindowDuration)))
            .apply(ParDo.of(new UpdateTeamScoreFn(100)));

    String redTeam = TestUser.RED_ONE.getTeam();

    IntervalWindow window1 = new IntervalWindow(baseTime, teamWindowDuration);
    IntervalWindow window2 = new IntervalWindow(window1.end(), teamWindowDuration);

    PAssert.that(teamScores).inWindow(window1).containsInAnyOrder(KV.of(redTeam, 100));

    PAssert.that(teamScores).inWindow(window2).containsInAnyOrder(KV.of(redTeam, 120));

    p.run().waitUntilFinish();
  }

  /** Builds a timestamped team-keyed event at baseTime + baseTimeOffset. */
  private TimestampedValue<KV<String, GameActionInfo>> event(
      TestUser user, int score, Duration baseTimeOffset) {
    return TimestampedValue.of(
        KV.of(
            user.getTeam(),
            new GameActionInfo(
                user.getUser(), user.getTeam(), score, baseTime.plus(baseTimeOffset).getMillis())),
        baseTime.plus(baseTimeOffset));
  }
}
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/complete/game/UserScoreTest.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF)
under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete.game;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import org.apache.beam.examples.complete.game.UserScore.ExtractAndSumScore;
import org.apache.beam.examples.complete.game.UserScore.GameActionInfo;
import org.apache.beam.examples.complete.game.UserScore.ParseEventFn;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/** Tests of UserScore. */
@RunWith(JUnit4.class)
public class UserScoreTest implements Serializable {

  // Mix of well-formed rows and deliberately malformed ones (non-numeric timestamp, missing
  // fields) used to verify that ParseEventFn drops unparseable input.
  static final String[] GAME_EVENTS_ARRAY =
      new String[] {
        "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
        "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
        "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
        "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
        "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
        "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
        "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
        "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
        "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
        "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444"
      };

  // Entirely malformed input; the pipeline should produce no output for these rows.
  static final String[] GAME_EVENTS_ARRAY2 =
      new String[] {
        "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
        "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
        "user13_BisqueBilby,BisqueBilby,xxx,1447955630000,2015-11-19 09:53:53.444"
      };

  static final List<String> GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);
  static final List<String> GAME_EVENTS2 = Arrays.asList(GAME_EVENTS_ARRAY2);

  // Expected parse results: GAME_EVENTS minus the two malformed rows.
  static final List<GameActionInfo> GAME_ACTION_INFO_LIST =
      Lists.newArrayList(
          new GameActionInfo("user0_MagentaKangaroo", "MagentaKangaroo", 3, 1447955630000L),
          new GameActionInfo("user13_ApricotQuokka", "ApricotQuokka", 15, 1447955630000L),
          new GameActionInfo("user6_AmberNumbat", "AmberNumbat", 11, 1447955630000L),
          new GameActionInfo("user7_AlmondWallaby", "AlmondWallaby", 15, 1447955630000L),
          new GameActionInfo(
              "user7_AndroidGreenKookaburra", "AndroidGreenKookaburra", 12, 1447955630000L),
          new GameActionInfo(
              "user7_AndroidGreenKookaburra", "AndroidGreenKookaburra", 11, 1447955630000L),
          new GameActionInfo("user19_BisqueBilby", "BisqueBilby", 6, 1447955630000L),
          new GameActionInfo("user19_BisqueBilby", "BisqueBilby", 8, 1447955630000L));

  // Expected per-user score totals.
  static final List<KV<String, Integer>> USER_SUMS =
      Arrays.asList(
          KV.of("user0_MagentaKangaroo", 3),
          KV.of("user13_ApricotQuokka", 15),
          KV.of("user6_AmberNumbat", 11),
          KV.of("user7_AlmondWallaby", 15),
          KV.of("user7_AndroidGreenKookaburra", 23),
          KV.of("user19_BisqueBilby", 14));

  // Expected per-team score totals.
  static final List<KV<String, Integer>> TEAM_SUMS =
      Arrays.asList(
          KV.of("MagentaKangaroo", 3),
          KV.of("ApricotQuokka", 15),
          KV.of("AmberNumbat", 11),
          KV.of("AlmondWallaby", 15),
          KV.of("AndroidGreenKookaburra", 23),
          KV.of("BisqueBilby", 14));

  @Rule public TestPipeline p = TestPipeline.create();

  /** Test the {@link ParseEventFn} {@link org.apache.beam.sdk.transforms.DoFn}. */
  @Test
  public void testParseEventFn() throws Exception {
    PCollection<String> input = p.apply(Create.of(GAME_EVENTS));
    PCollection<GameActionInfo> output = input.apply(ParDo.of(new ParseEventFn()));

    PAssert.that(output).containsInAnyOrder(GAME_ACTION_INFO_LIST);

    p.run().waitUntilFinish();
  }

  /** Tests ExtractAndSumScore("user"). */
  @Test
  @Category(ValidatesRunner.class)
  public void testUserScoreSums() throws Exception {

    PCollection<String> input = p.apply(Create.of(GAME_EVENTS));

    PCollection<KV<String, Integer>> output =
        input
            .apply(ParDo.of(new ParseEventFn()))
            // Extract and sum username/score pairs from the event data.
            .apply("ExtractUserScore", new ExtractAndSumScore("user"));

    // Check the user score sums.
    PAssert.that(output).containsInAnyOrder(USER_SUMS);

    p.run().waitUntilFinish();
  }

  /** Tests ExtractAndSumScore("team"). */
  @Test
  @Category(ValidatesRunner.class)
  public void testTeamScoreSums() throws Exception {

    PCollection<String> input = p.apply(Create.of(GAME_EVENTS));

    PCollection<KV<String, Integer>> output =
        input
            .apply(ParDo.of(new ParseEventFn()))
            // Extract and sum teamname/score pairs from the event data.
            .apply("ExtractTeamScore", new ExtractAndSumScore("team"));

    // Check the team score sums.
    PAssert.that(output).containsInAnyOrder(TEAM_SUMS);

    p.run().waitUntilFinish();
  }

  /** Test that bad input data is dropped appropriately. */
  @Test
  @Category(ValidatesRunner.class)
  public void testUserScoresBadInput() throws Exception {

    PCollection<String> input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of()));

    PCollection<KV<String, Integer>> extract =
        input
            .apply(ParDo.of(new ParseEventFn()))
            .apply(
                MapElements.into(
                        TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                    .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

    // All input rows are malformed, so the parsed collection must be empty.
    PAssert.that(extract).empty();

    p.run().waitUntilFinish();
  }
}
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/subprocess/ExampleEchoPipelineTest.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.subprocess;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs;
import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs.Command;
import org.apache.beam.examples.subprocess.kernel.SubProcessKernel;
import org.apache.beam.examples.subprocess.utils.CallingSubProcessUtils;
import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.junit.Rule;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * To keep {@link org.apache.beam.examples.subprocess.ExampleEchoPipeline} simple, it is not
 * factored or testable. This test file should be maintained with a copy of its code for a basic
 * smoke test.
 */
@RunWith(JUnit4.class)
public class ExampleEchoPipelineTest {

  private static final Logger LOG = LoggerFactory.getLogger(ExampleEchoPipelineTest.class);

  @Rule public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  /**
   * End-to-end smoke test: pipes 100 elements through two shell-script "echo" subprocess
   * transforms and asserts the elements come back unchanged.
   */
  @Test
  public void testExampleEchoPipeline() throws Exception {

    // Create two Bash files as tests for the binary files
    Path fileA = Files.createTempFile("test-Echo", ".sh");
    Path fileB = Files.createTempFile("test-EchoAgain", ".sh");

    Path workerTempFiles = Files.createTempDirectory("test-Echoo");

    try (SeekableByteChannel channel =
        FileChannel.open(fileA, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
      channel.write(ByteBuffer.wrap(getTestShellEcho().getBytes(UTF_8)));
    }

    try (SeekableByteChannel channel =
        FileChannel.open(fileB, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
      channel.write(ByteBuffer.wrap(getTestShellEchoAgain().getBytes(UTF_8)));
    }

    // Read in the options for the pipeline
    SubProcessPipelineOptions options = PipelineOptionsFactory.as(SubProcessPipelineOptions.class);

    options.setConcurrency(2);
    options.setSourcePath(fileA.getParent().toString());
    options.setWorkerPath(workerTempFiles.toAbsolutePath().toString());

    // Stub out GCS access so the test stays local.
    p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil());

    // Setup the Configuration option used with all transforms
    SubProcessConfiguration configuration = options.getSubProcessConfiguration();

    // Create some sample data to be fed to our c++ Echo library
    List<KV<String, String>> sampleData = new ArrayList<>();

    for (int i = 0; i < 100; i++) {
      String str = String.valueOf(i);
      sampleData.add(KV.of(str, str));
    }

    // Define the pipeline which is two transforms echoing the inputs out to Logs
    // For this use case we will make use of two shell files instead of the binary to make
    // testing easier
    PCollection<KV<String, String>> output =
        p.apply(Create.of(sampleData))
            .apply(
                "Echo inputs round 1",
                ParDo.of(new EchoInputDoFn(configuration, fileA.getFileName().toString())))
            .apply(
                "Echo inputs round 2",
                ParDo.of(new EchoInputDoFn(configuration, fileB.getFileName().toString())));

    PAssert.that(output).containsInAnyOrder(sampleData);

    // Block until the pipeline finishes so PAssert failures surface on non-blocking runners;
    // every sibling test in this module does the same.
    p.run().waitUntilFinish();
  }

  /** Simple DoFn that echos the element, used as an example of running a C++ library. */
  @SuppressWarnings("serial")
  private static class EchoInputDoFn extends DoFn<KV<String, String>, KV<String, String>> {

    private static final Logger LOG = LoggerFactory.getLogger(EchoInputDoFn.class);

    private SubProcessConfiguration configuration;
    private String binaryName;

    public EchoInputDoFn(SubProcessConfiguration configuration, String binary) {
      // Pass in configuration information the name of the filename of the sub-process and the level
      // of concurrency
      this.configuration = configuration;
      this.binaryName = binary;
    }

    @Setup
    public void setUp() throws Exception {
      CallingSubProcessUtils.setUp(configuration, binaryName);
    }

    @ProcessElement
    public void processElement(ProcessContext c) throws Exception {
      try {
        // Our Library takes a single command in position 0 which it will echo back in the result
        SubProcessCommandLineArgs commands = new SubProcessCommandLineArgs();
        Command command = new Command(0, String.valueOf(c.element().getValue()));
        commands.putCommand(command);

        // The ProcessingKernel deals with the execution of the process
        SubProcessKernel kernel = new SubProcessKernel(configuration, binaryName);

        // Run the command and work through the results
        List<String> results = kernel.exec(commands);
        for (String s : results) {
          c.output(KV.of(c.element().getKey(), s));
        }
      } catch (Exception ex) {
        LOG.error("Error processing element ", ex);
        throw ex;
      }
    }
  }

  // The two scripts are intentionally identical: they only differ in which temp file they are
  // written to, so each pipeline stage gets its own distinct "binary".
  private static String getTestShellEcho() {
    return "#!/bin/sh\n" + "filename=$1;\n" + "echo $2 >> $filename;";
  }

  private static String getTestShellEchoAgain() {
    return "#!/bin/sh\n" + "filename=$1;\n" + "echo $2 >> $filename;";
  }

  /** Builds a GcsUtil mock so no real GCS access happens during the test. */
  private GcsUtil buildMockGcsUtil() throws IOException {
    GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);

    // Any request to open gets a new bogus channel
    Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class)))
        .then(
            new Answer<SeekableByteChannel>() {

              @Override
              public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
                return FileChannel.open(
                    Files.createTempFile("channel-", ".tmp"),
                    StandardOpenOption.CREATE,
                    StandardOpenOption.DELETE_ON_CLOSE);
              }
            });

    // Any request for expansion returns a list containing the original GcsPath
    // This is required to pass validation that occurs in TextIO during apply()
    Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class)))
        .then(
            new Answer<List<GcsPath>>() {

              @Override
              public List<GcsPath> answer(InvocationOnMock invocation) throws Throwable {
                return ImmutableList.of((GcsPath) invocation.getArguments()[0]);
              }
            });

    return mockGcsUtil;
  }
}
--------------------------------------------------------------------------------