├── .gitattributes
├── .github
└── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── course.md
└── examples
└── java8
├── pom.xml
└── src
├── main
└── java
│ └── org
│ └── apache
│ └── beam
│ └── examples
│ ├── DebuggingWordCount.java
│ ├── LineCount.java
│ ├── MinimalLineCount.java
│ ├── MinimalLineCountArgs.java
│ ├── MinimalLineCountLambda.java
│ ├── MinimalWordCount.java
│ ├── WindowedWordCount.java
│ ├── WordCount.java
│ ├── common
│ ├── ExampleBigQueryTableOptions.java
│ ├── ExampleOptions.java
│ ├── ExamplePubsubTopicAndSubscriptionOptions.java
│ ├── ExamplePubsubTopicOptions.java
│ ├── ExampleUtils.java
│ └── WriteOneFilePerWindow.java
│ ├── complete
│ └── game
│ │ ├── GameStats.java
│ │ ├── HourlyTeamScore.java
│ │ ├── LeaderBoard.java
│ │ ├── StatefulTeamScore.java
│ │ ├── UserScore.java
│ │ ├── injector
│ │ ├── Injector.java
│ │ ├── InjectorUtils.java
│ │ └── RetryHttpInitializerWrapper.java
│ │ └── utils
│ │ ├── GameConstants.java
│ │ ├── WriteToBigQuery.java
│ │ ├── WriteToText.java
│ │ └── WriteWindowedToBigQuery.java
│ └── subprocess
│ ├── ExampleEchoPipeline.java
│ ├── SubProcessPipelineOptions.java
│ ├── configuration
│ └── SubProcessConfiguration.java
│ ├── kernel
│ ├── SubProcessCommandLineArgs.java
│ ├── SubProcessIOFiles.java
│ └── SubProcessKernel.java
│ └── utils
│ ├── CallingSubProcessUtils.java
│ ├── ExecutableFile.java
│ └── FileUtils.java
└── test
└── java
└── org
└── apache
└── beam
└── examples
├── DebuggingWordCountTest.java
├── MinimalWordCountTest.java
├── WordCountTest.java
├── complete
└── game
│ ├── GameStatsTest.java
│ ├── HourlyTeamScoreTest.java
│ ├── LeaderBoardTest.java
│ ├── StatefulTeamScoreTest.java
│ └── UserScoreTest.java
└── subprocess
└── ExampleEchoPipelineTest.java
/.gitattributes:
--------------------------------------------------------------------------------
1 | # The default behavior, which overrides 'core.autocrlf', is to use Git's
2 | # built-in heuristics to determine whether a particular file is text or binary.
3 | # Text files are automatically normalized to the user's platform.
4 | * text=auto
5 |
6 | # Explicitly declare text files that should always be normalized and converted
7 | # to native line endings.
8 | .gitattributes text
9 | .gitignore text
10 | LICENSE text
11 | *.avsc text
12 | *.html text
13 | *.java text
14 | *.md text
15 | *.properties text
16 | *.proto text
17 | *.py text
18 | *.sh text
19 | *.xml text
20 | *.yml text
21 |
22 | # Declare files that will always have CRLF line endings on checkout.
23 | # *.sln text eol=crlf
24 |
25 | # Explicitly denote all files that are truly binary and should not be modified.
26 | # *.jpg binary
27 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Follow this checklist to help us incorporate your contribution quickly and easily:
2 |
3 | - [ ] Make sure there is a [JIRA issue](https://issues.apache.org/jira/projects/BEAM/issues/) filed for the change (usually before you start working on it). Trivial changes like typos do not require a JIRA issue. Your pull request should address just this issue, without pulling in other changes.
4 | - [ ] Each commit in the pull request should have a meaningful subject line and body.
5 | - [ ] Format the pull request title like `[BEAM-XXX] Fixes bug in ApproximateQuantiles`, where you replace `BEAM-XXX` with the appropriate JIRA issue.
6 | - [ ] Write a pull request description that is detailed enough to understand what the pull request does, how, and why.
7 | - [ ] Run `mvn clean verify` to make sure basic checks pass. A more thorough check will be performed on your pull request automatically.
8 | - [ ] If this contribution is large, please file an Apache [Individual Contributor License Agreement](https://www.apache.org/licenses/icla.pdf).
9 |
10 | ---
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # NOTE: if you modify this file, you probably need to modify the file set that
2 | # is an input to 'maven-assembly-plugin' that generates source distribution.
3 | # This is typically in files named 'src.xml' throughout this repository.
4 |
5 | # Ignore files generated by the Maven build process.
6 | target/
7 | bin/
8 |
9 | # Ignore generated archetypes
10 | sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/
11 | sdks/java/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/
12 |
13 | # Ignore files generated by the Python build process.
14 | *.py[cod]
15 | *.egg-info/
16 | .eggs/
17 | .tox/
18 | build/
19 | dist/
20 | distribute-*
21 | env/
22 | sdks/python/**/*.c
23 | sdks/python/**/*.so
24 | sdks/python/**/*.egg
25 | sdks/python/LICENSE
26 | sdks/python/NOTICE
27 | sdks/python/README.md
28 | sdks/python/apache_beam/portability/api/*pb2*.*
29 |
30 | # Ignore IntelliJ files.
31 | .idea/
32 | *.iml
33 | *.ipr
34 | *.iws
35 |
36 | # Ignore Eclipse files.
37 | .classpath
38 | .project
39 | .factorypath
40 | .checkstyle
41 | .fbExcludeFilterFile
42 | .apt_generated/
43 | .settings/
44 |
45 | # The build process generates the dependency-reduced POM, but it shouldn't be
46 | # committed.
47 | dependency-reduced-pom.xml
48 |
49 | # Hotspot VM leaves this log in a non-target directory when java crashes
50 | hs_err_pid*.log
51 |
52 | # Ignore files that end with '~', since they are most likely auto-save files
53 | # produced by a text editor.
54 | *~
55 |
56 | # Ignore MacOSX files.
57 | .DS_Store
58 |
59 | # NOTE: if you modify this file, you probably need to modify the file set that
60 | # is an input to 'maven-assembly-plugin' that generates source distribution.
61 | # This is typically in files named 'src.xml' throughout this repository.
62 |
--------------------------------------------------------------------------------
/course.md:
--------------------------------------------------------------------------------
1 | # Introduction to Google Cloud Dataflow
2 | This file contains text you can copy and paste for the examples in Cloud Academy's _Introduction to Google Cloud Dataflow_ course.
3 |
4 | ### Building and Running a Pipeline
5 | Installing on your own computer: https://cloud.google.com/dataflow/docs/quickstarts
6 | Transforms: https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/transforms/package-summary.html
7 |
8 | Note: Cloud Shell now uses Java 11 by default, so to get this demo to work, switch to Java 8 by running the following command.
9 | It will generate errors, but it will still work.
10 | ```
11 | sudo update-java-alternatives -s java-1.8.0-openjdk-amd64 && export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
12 | ```
13 |
14 | ```
15 | git clone https://github.com/cloudacademy/beam.git
16 | cd beam/examples/java8
17 | mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.MinimalLineCount
18 | ```
19 | ```
20 | gsutil cat gs://dataflow-samples/shakespeare/kinglear.txt | wc
21 | ```
22 |
23 | ### Deploying a Pipeline on Cloud Dataflow
24 | ```
25 | nano ~/.profile
26 | PROJECT=[Your Project ID]
27 | BUCKET=gs://dataflow-$PROJECT
28 | gsutil mb $BUCKET
29 | cd ~/beam/examples/java8
30 | ```
31 | ```
32 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.MinimalLineCountArgs \
33 | -Dexec.args="--runner=DataflowRunner \
34 | --project=$PROJECT \
35 | --tempLocation=$BUCKET/temp \
36 | --region=us-central1"
37 | ```
38 | ```
39 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.LineCount \
40 | -Dexec.args="--runner=DataflowRunner \
41 | --project=$PROJECT \
42 | --tempLocation=$BUCKET/temp \
43 | --output=$BUCKET/linecount \
44 | --region=us-central1"
45 | ```
46 |
47 | ### Custom Transforms
48 | ```
49 | cd ~/beam/examples/java8
50 | ```
51 | ```
52 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.MinimalWordCount \
53 | -Dexec.args="--runner=DataflowRunner \
54 | --project=$PROJECT \
55 | --tempLocation=$BUCKET/temp \
56 | --output=$BUCKET/wordcounts \
57 | --region=us-central1"
58 | ```
59 |
60 | ### Composite Transforms
61 | ```
62 | cd ~/beam/examples/java8
63 | ```
64 | ```
65 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.UserScore \
66 | -Dexec.args="--runner=DataflowRunner \
67 | --project=$PROJECT \
68 | --tempLocation=$BUCKET/temp/ \
69 | --output=$BUCKET/scores \
70 | --region=us-central1"
71 | ```
72 |
73 | ### Windowing
74 | ```
75 | cd ~/beam/examples/java8
76 | ```
77 | ```
78 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.HourlyTeamScore \
79 | -Dexec.args="--runner=DataflowRunner \
80 | --project=$PROJECT \
81 | --tempLocation=$BUCKET/temp/ \
82 | --output=$BUCKET/scores \
83 | --startMin=2015-11-16-16-00 \
84 | --stopMin=2015-11-17-16-00 \
85 | --region=us-central1"
86 | ```
87 |
88 | ### Running LeaderBoard
89 | ```
90 | bq mk game
91 | ```
92 | Note: You no longer need to use a credentials file to run this example.
93 | ```
94 | cd ~/beam/examples/java8
95 | ```
96 | ```
97 | mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector \
98 | -Dexec.args="$PROJECT game none"
99 | ```
100 | ```
101 | cd ~/beam/examples/java8
102 | ```
103 | ```
104 | mvn -Pdataflow-runner compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.LeaderBoard \
105 | -Dexec.args="--runner=DataflowRunner \
106 | --project=$PROJECT \
107 | --tempLocation=$BUCKET/temp/ \
108 | --dataset=game \
109 | --topic=projects/$PROJECT/topics/game \
110 | --region=us-central1"
111 | ```
112 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/DebuggingWordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import java.util.Arrays;
21 | import java.util.List;
22 | import java.util.regex.Pattern;
23 | import org.apache.beam.sdk.Pipeline;
24 | import org.apache.beam.sdk.io.TextIO;
25 | import org.apache.beam.sdk.metrics.Counter;
26 | import org.apache.beam.sdk.metrics.Metrics;
27 | import org.apache.beam.sdk.options.Default;
28 | import org.apache.beam.sdk.options.Description;
29 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
30 | import org.apache.beam.sdk.testing.PAssert;
31 | import org.apache.beam.sdk.transforms.DoFn;
32 | import org.apache.beam.sdk.transforms.ParDo;
33 | import org.apache.beam.sdk.values.KV;
34 | import org.apache.beam.sdk.values.PCollection;
35 | import org.slf4j.Logger;
36 | import org.slf4j.LoggerFactory;
37 |
38 | /**
39 | * An example that verifies word counts in Shakespeare and includes Beam best practices.
40 | *
41 | * <p>This class, {@link DebuggingWordCount}, is the third in a series of four successively more
42 | * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount} and
43 | * {@link WordCount}. After you've looked at this example, then see the {@link WindowedWordCount}
44 | * pipeline, for introduction of additional concepts.
45 | *
46 | * <p>Basic concepts, also in the MinimalWordCount and WordCount examples: Reading text files;
47 | * counting a PCollection; executing a Pipeline both locally and using a selected runner; defining
48 | * DoFns.
49 | *
50 | * <p>New Concepts:
51 | *
52 | *
53 | * 1. Logging using SLF4J, even in a distributed environment
54 | * 2. Creating a custom metric (runners have varying levels of support)
55 | * 3. Testing your Pipeline via PAssert
56 | *
57 | *
58 | * To execute this pipeline locally, specify general pipeline configuration:
59 | *
60 | * <pre>{@code
61 | * --project=YOUR_PROJECT_ID
62 | * }</pre>
63 | *
64 | * To change the runner, specify:
65 | *
66 | * <pre>{@code
67 | * --runner=YOUR_SELECTED_RUNNER
68 | * }</pre>
69 | *
70 | * The input file defaults to a public data set containing the text of King Lear, by William
71 | * Shakespeare. You can override it and choose your own input with {@code --inputFile}.
72 | */
73 | public class DebuggingWordCount {
74 | /** A DoFn that filters for a specific key based upon a regular expression. */
75 | public static class FilterTextFn extends DoFn, KV> {
76 | /**
77 | * Concept #1: The logger below uses the fully qualified class name of FilterTextFn as the
78 | * logger. Depending on your SLF4J configuration, log statements will likely be qualified by
79 | * this name.
80 | *
81 | * Note that this is entirely standard SLF4J usage. Some runners may provide a default SLF4J
82 | * configuration that is most appropriate for their logging integration.
83 | */
84 | private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class);
85 |
86 | private final Pattern filter;
87 |
88 | public FilterTextFn(String pattern) {
89 | filter = Pattern.compile(pattern);
90 | }
91 |
92 | /**
93 | * Concept #2: A custom metric can track values in your pipeline as it runs. Each runner
94 | * provides varying levels of support for metrics, and may expose them in a dashboard, etc.
95 | */
96 | private final Counter matchedWords = Metrics.counter(FilterTextFn.class, "matchedWords");
97 |
98 | private final Counter unmatchedWords = Metrics.counter(FilterTextFn.class, "unmatchedWords");
99 |
100 | @ProcessElement
101 | public void processElement(ProcessContext c) {
102 | if (filter.matcher(c.element().getKey()).matches()) {
103 | // Log at the "DEBUG" level each element that we match. When executing this pipeline
104 | // these log lines will appear only if the log level is set to "DEBUG" or lower.
105 | LOG.debug("Matched: " + c.element().getKey());
106 | matchedWords.inc();
107 | c.output(c.element());
108 | } else {
109 | // Log at the "TRACE" level each element that is not matched. Different log levels
110 | // can be used to control the verbosity of logging providing an effective mechanism
111 | // to filter less important information.
112 | LOG.trace("Did not match: " + c.element().getKey());
113 | unmatchedWords.inc();
114 | }
115 | }
116 | }
117 |
118 | /**
119 | * Options supported by {@link DebuggingWordCount}.
120 | *
121 | *
Inherits standard configuration options and all options defined in {@link
122 | * WordCount.WordCountOptions}.
123 | */
124 | public interface WordCountOptions extends WordCount.WordCountOptions {
125 |
126 | @Description(
127 | "Regex filter pattern to use in DebuggingWordCount. "
128 | + "Only words matching this pattern will be counted.")
129 | @Default.String("Flourish|stomach")
130 | String getFilterPattern();
131 |
132 | void setFilterPattern(String value);
133 | }
134 |
135 | static void runDebuggingWordCount(WordCountOptions options) {
136 | Pipeline p = Pipeline.create(options);
137 |
138 | PCollection> filteredWords =
139 | p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
140 | .apply(new WordCount.CountWords())
141 | .apply(ParDo.of(new FilterTextFn(options.getFilterPattern())));
142 |
143 | /*
144 | * Concept #3: PAssert is a set of convenient PTransforms in the style of
145 | * Hamcrest's collection matchers that can be used when writing Pipeline level tests
146 | * to validate the contents of PCollections. PAssert is best used in unit tests
147 | * with small data sets but is demonstrated here as a teaching tool.
148 | *
149 | * Below we verify that the set of filtered words matches our expected counts. Note
150 | * that PAssert does not provide any output and that successful completion of the
151 | * Pipeline implies that the expectations were met. Learn more at
152 | * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test
153 | * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test.
154 | */
155 | List> expectedResults =
156 | Arrays.asList(KV.of("Flourish", 3L), KV.of("stomach", 1L));
157 | PAssert.that(filteredWords).containsInAnyOrder(expectedResults);
158 |
159 | p.run().waitUntilFinish();
160 | }
161 |
162 | public static void main(String[] args) {
163 | WordCountOptions options =
164 | PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
165 |
166 | runDebuggingWordCount(options);
167 | }
168 | }
169 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/LineCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import org.apache.beam.sdk.Pipeline;
21 | import org.apache.beam.sdk.io.TextIO;
22 | import org.apache.beam.sdk.options.Default;
23 | import org.apache.beam.sdk.options.Description;
24 | import org.apache.beam.sdk.options.PipelineOptions;
25 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
26 | import org.apache.beam.sdk.options.Validation.Required;
27 | import org.apache.beam.sdk.transforms.Count;
28 | import org.apache.beam.sdk.transforms.MapElements;
29 | import org.apache.beam.sdk.values.TypeDescriptors;
30 |
31 | public class LineCount {
32 |
33 | public interface LineCountOptions extends PipelineOptions {
34 |
35 | /**
36 | * By default, this example reads from a public dataset containing the text of
37 | * King Lear. Set this option to choose a different input file or glob.
38 | */
39 | @Description("Path of the file to read from")
40 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
41 | String getInputFile();
42 | void setInputFile(String value);
43 |
44 | /**
45 | * Set this required option to specify where to write the output.
46 | */
47 | @Description("Path of the file to write to")
48 | @Required
49 | String getOutput();
50 | void setOutput(String value);
51 | }
52 |
53 | public static void main(String[] args) {
54 | LineCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
55 | .as(LineCountOptions.class);
56 | Pipeline p = Pipeline.create(options);
57 |
58 | p.apply(TextIO.read().from(options.getInputFile()))
59 | .apply(Count.globally())
60 | .apply(MapElements.into(TypeDescriptors.strings())
61 | .via((Long count) -> Long.toString(count)))
62 | .apply(TextIO.write().to(options.getOutput()));
63 |
64 | p.run().waitUntilFinish();
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/MinimalLineCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import org.apache.beam.sdk.Pipeline;
21 | import org.apache.beam.sdk.io.TextIO;
22 | import org.apache.beam.sdk.options.PipelineOptions;
23 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
24 | import org.apache.beam.sdk.transforms.Count;
25 | import org.apache.beam.sdk.transforms.MapElements;
26 | import org.apache.beam.sdk.transforms.SimpleFunction;
27 |
28 | public class MinimalLineCount {
29 |
30 | public static void main(String[] args) {
31 | PipelineOptions options = PipelineOptionsFactory.create();
32 | Pipeline p = Pipeline.create(options);
33 |
34 | p.apply(TextIO.read().from("gs://dataflow-samples/shakespeare/kinglear.txt"))
35 | .apply(Count.globally())
36 | .apply(MapElements.via(new SimpleFunction() {
37 | public String apply(Long input) {
38 | return Long.toString(input);
39 | }
40 | }))
41 | .apply(TextIO.write().to("linecount"));
42 |
43 | p.run().waitUntilFinish();
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/MinimalLineCountArgs.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import org.apache.beam.sdk.Pipeline;
21 | import org.apache.beam.sdk.io.TextIO;
22 | import org.apache.beam.sdk.options.PipelineOptions;
23 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
24 | import org.apache.beam.sdk.transforms.Count;
25 | import org.apache.beam.sdk.transforms.MapElements;
26 | import org.apache.beam.sdk.values.TypeDescriptors;
27 |
28 | public class MinimalLineCountArgs {
29 |
30 | public static void main(String[] args) {
31 | PipelineOptions options = PipelineOptionsFactory.fromArgs(args).as(PipelineOptions.class);
32 | Pipeline p = Pipeline.create(options);
33 |
34 | p.apply(TextIO.read().from("gs://dataflow-samples/shakespeare/kinglear.txt"))
35 | .apply(Count.globally())
36 | .apply(MapElements.into(TypeDescriptors.strings())
37 | .via((Long count) -> Long.toString(count)))
38 | .apply(TextIO.write().to("linecount"));
39 |
40 | p.run().waitUntilFinish();
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/MinimalLineCountLambda.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import org.apache.beam.sdk.Pipeline;
21 | import org.apache.beam.sdk.io.TextIO;
22 | import org.apache.beam.sdk.options.PipelineOptions;
23 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
24 | import org.apache.beam.sdk.transforms.Count;
25 | import org.apache.beam.sdk.transforms.MapElements;
26 | import org.apache.beam.sdk.values.TypeDescriptors;
27 |
28 | public class MinimalLineCountLambda {
29 |
30 | public static void main(String[] args) {
31 | PipelineOptions options = PipelineOptionsFactory.create();
32 | Pipeline p = Pipeline.create(options);
33 |
34 | p.apply(TextIO.read().from("gs://dataflow-samples/shakespeare/kinglear.txt"))
35 | .apply(Count.globally())
36 | .apply(MapElements.into(TypeDescriptors.strings())
37 | .via((Long count) -> Long.toString(count)))
38 | .apply(TextIO.write().to("linecount"));
39 |
40 | p.run().waitUntilFinish();
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import org.apache.beam.examples.common.ExampleUtils;
21 | import org.apache.beam.sdk.Pipeline;
22 | import org.apache.beam.sdk.io.TextIO;
23 | import org.apache.beam.sdk.options.Default;
24 | import org.apache.beam.sdk.options.Description;
25 | import org.apache.beam.sdk.options.PipelineOptions;
26 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
27 | import org.apache.beam.sdk.options.Validation.Required;
28 | import org.apache.beam.sdk.transforms.Count;
29 | import org.apache.beam.sdk.transforms.DoFn;
30 | import org.apache.beam.sdk.transforms.MapElements;
31 | import org.apache.beam.sdk.transforms.ParDo;
32 | import org.apache.beam.sdk.transforms.SimpleFunction;
33 | import org.apache.beam.sdk.values.KV;
34 |
35 |
36 | /**
37 | * An example that counts words in Shakespeare.
38 | *
39 | * <p>This class, {@link MinimalWordCount}, is the first in a series of four successively more
40 | * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking,
41 | * and focus on construction of the pipeline, which chains together the application of core
42 | * transforms.
43 | *
44 | * <p>Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally the
45 | * {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional
46 | * concepts.
47 | *
48 | * <p>Concepts:
49 | *
50 | *
51 | * 1. Reading data from text files
52 | * 2. Specifying 'inline' transforms
53 | * 3. Counting items in a PCollection
54 | * 4. Writing data to text files
55 | *
56 | *
57 | */
58 | public class MinimalWordCount {
59 |
60 | public interface WordCountOptions extends PipelineOptions {
61 |
62 | /**
63 | * By default, this example reads from a public dataset containing the text of
64 | * King Lear. Set this option to choose a different input file or glob.
65 | */
66 | @Description("Path of the file to read from")
67 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
68 | String getInputFile();
69 | void setInputFile(String value);
70 |
71 | /**
72 | * Set this required option to specify where to write the output.
73 | */
74 | @Description("Path of the file to write to")
75 | @Required
76 | String getOutput();
77 | void setOutput(String value);
78 | }
79 |
80 | public static void main(String[] args) {
81 | // Create a PipelineOptions object. This object lets us set various execution
82 | // options for our pipeline, such as the runner you wish to use. This example
83 | // will run with the DirectRunner by default, based on the class path configured
84 | // in its dependencies.
85 | WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
86 | .as(WordCountOptions.class);
87 |
88 | // Create the Pipeline object with the options we defined above.
89 | Pipeline p = Pipeline.create(options);
90 |
91 | // Apply the pipeline's transforms.
92 |
93 | // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
94 | // of input text files. TextIO.Read returns a PCollection where each element is one line from
95 | // the input text.
96 |
97 | p.apply(TextIO.read().from(options.getInputFile()))
98 |
99 | // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a
100 | // DoFn (defined in-line) on each element that tokenizes the text line into individual words.
101 | // The ParDo returns a PCollection, where each element is an individual word in
102 | // the input text.
103 | .apply("ExtractWords", ParDo.of(new DoFn() {
104 | @ProcessElement
105 | public void processElement(ProcessContext c) {
106 | for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
107 | if (!word.isEmpty()) {
108 | c.output(word);
109 | }
110 | }
111 | }
112 | }))
113 |
114 | // Concept #3: Apply the Count transform to our PCollection of individual words. The Count
115 | // transform returns a new PCollection of key/value pairs, where each key represents a unique
116 | // word in the text. The associated value is the occurrence count for that word.
117 | .apply(Count.perElement())
118 |
119 | // Apply a MapElements transform that formats our PCollection of word counts into a printable
120 | // string, suitable for writing to an output file.
121 | .apply("FormatResults", MapElements.via(new SimpleFunction, String>() {
122 | @Override
123 | public String apply(KV input) {
124 | return input.getKey() + ": " + input.getValue();
125 | }
126 | }))
127 |
128 | // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline.
129 | // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of
130 | // formatted strings) to a series of text files.
131 | //
132 | // By default, it will write to a set of files with names like wordcount-00001-of-00005
133 | .apply(TextIO.write().to(options.getOutput()));
134 |
135 | // Run the pipeline.
136 | p.run().waitUntilFinish();
137 | }
138 | }
139 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/WindowedWordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import java.io.IOException;
21 | import java.util.concurrent.ThreadLocalRandom;
22 | import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
23 | import org.apache.beam.examples.common.ExampleOptions;
24 | import org.apache.beam.examples.common.WriteOneFilePerWindow;
25 | import org.apache.beam.sdk.Pipeline;
26 | import org.apache.beam.sdk.PipelineResult;
27 | import org.apache.beam.sdk.io.TextIO;
28 | import org.apache.beam.sdk.options.Default;
29 | import org.apache.beam.sdk.options.DefaultValueFactory;
30 | import org.apache.beam.sdk.options.Description;
31 | import org.apache.beam.sdk.options.PipelineOptions;
32 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
33 | import org.apache.beam.sdk.transforms.DoFn;
34 | import org.apache.beam.sdk.transforms.MapElements;
35 | import org.apache.beam.sdk.transforms.ParDo;
36 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
37 | import org.apache.beam.sdk.transforms.windowing.Window;
38 | import org.apache.beam.sdk.values.KV;
39 | import org.apache.beam.sdk.values.PCollection;
40 | import org.joda.time.Duration;
41 | import org.joda.time.Instant;
42 |
/**
 * An example that counts words in text, and can run over either unbounded or bounded input
 * collections.
 *
 * <p>This class, {@link WindowedWordCount}, is the last in a series of four successively more
 * detailed 'word count' examples. First take a look at {@link MinimalWordCount}, {@link WordCount},
 * and {@link DebuggingWordCount}.
 *
 * <p>Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples:
 * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally and
 * using a selected runner; defining DoFns; user-defined PTransforms; defining PipelineOptions.
 *
 * <p>New Concepts:
 *
 * <pre>
 *   1. Unbounded and bounded pipeline input modes
 *   2. Adding timestamps to data
 *   3. Windowing
 *   4. Re-using PTransforms over windowed PCollections
 *   5. Accessing the window of an element
 *   6. Writing data to per-window text files
 * </pre>
 *
 * <p>By default, the examples will run with the {@code DirectRunner}. To change the runner,
 * specify:
 *
 * <pre>{@code
 * --runner=YOUR_SELECTED_RUNNER
 * }</pre>
 *
 * <p>See examples/java/README.md for instructions about how to configure different runners.
 *
 * <p>To execute this pipeline locally, specify a local output file (if using the {@code
 * DirectRunner}) or output prefix on a supported distributed file system.
 *
 * <pre>{@code
 * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
 * }</pre>
 *
 * <p>The input file defaults to a public data set containing the text of King Lear, by William
 * Shakespeare. You can override it and choose your own input with {@code --inputFile}.
 *
 * <p>By default, the pipeline will do fixed windowing, on 10-minute windows. You can change this
 * interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=15} for
 * 15-minute windows.
 *
 * <p>The example will try to cancel the pipeline on the signal to terminate the process (CTRL-C).
 */
91 | public class WindowedWordCount {
92 | static final int WINDOW_SIZE = 10; // Default window duration in minutes
93 | /**
94 | * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for this
95 | * example, for the bounded data case.
96 | *
97 | *
Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate
98 | * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a
99 | * 2-hour period.
100 | */
101 | static class AddTimestampFn extends DoFn {
102 | private final Instant minTimestamp;
103 | private final Instant maxTimestamp;
104 |
105 | AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) {
106 | this.minTimestamp = minTimestamp;
107 | this.maxTimestamp = maxTimestamp;
108 | }
109 |
110 | @ProcessElement
111 | public void processElement(@Element String element, OutputReceiver receiver) {
112 | Instant randomTimestamp =
113 | new Instant(
114 | ThreadLocalRandom.current()
115 | .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis()));
116 |
117 | /*
118 | * Concept #2: Set the data element with that timestamp.
119 | */
120 | receiver.outputWithTimestamp(element, randomTimestamp);
121 | }
122 | }
123 |
124 | /** A {@link DefaultValueFactory} that returns the current system time. */
125 | public static class DefaultToCurrentSystemTime implements DefaultValueFactory {
126 | @Override
127 | public Long create(PipelineOptions options) {
128 | return System.currentTimeMillis();
129 | }
130 | }
131 |
132 | /** A {@link DefaultValueFactory} that returns the minimum timestamp plus one hour. */
133 | public static class DefaultToMinTimestampPlusOneHour implements DefaultValueFactory {
134 | @Override
135 | public Long create(PipelineOptions options) {
136 | return options.as(Options.class).getMinTimestampMillis()
137 | + Duration.standardHours(1).getMillis();
138 | }
139 | }
140 |
141 | /**
142 | * Options for {@link WindowedWordCount}.
143 | *
144 | * Inherits standard example configuration options, which allow specification of the runner, as
145 | * well as the {@link WordCount.WordCountOptions} support for specification of the input and
146 | * output files.
147 | */
148 | public interface Options
149 | extends WordCount.WordCountOptions, ExampleOptions, ExampleBigQueryTableOptions {
150 | @Description("Fixed window duration, in minutes")
151 | @Default.Integer(WINDOW_SIZE)
152 | Integer getWindowSize();
153 |
154 | void setWindowSize(Integer value);
155 |
156 | @Description("Minimum randomly assigned timestamp, in milliseconds-since-epoch")
157 | @Default.InstanceFactory(DefaultToCurrentSystemTime.class)
158 | Long getMinTimestampMillis();
159 |
160 | void setMinTimestampMillis(Long value);
161 |
162 | @Description("Maximum randomly assigned timestamp, in milliseconds-since-epoch")
163 | @Default.InstanceFactory(DefaultToMinTimestampPlusOneHour.class)
164 | Long getMaxTimestampMillis();
165 |
166 | void setMaxTimestampMillis(Long value);
167 |
168 | @Description("Fixed number of shards to produce per window")
169 | Integer getNumShards();
170 |
171 | void setNumShards(Integer numShards);
172 | }
173 |
174 | static void runWindowedWordCount(Options options) throws IOException {
175 | final String output = options.getOutput();
176 | final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
177 | final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());
178 |
179 | Pipeline pipeline = Pipeline.create(options);
180 |
181 | /*
182 | * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
183 | * unbounded input source.
184 | */
185 | PCollection input =
186 | pipeline
187 | /* Read from the GCS file. */
188 | .apply(TextIO.read().from(options.getInputFile()))
189 | // Concept #2: Add an element timestamp, using an artificial time just to show
190 | // windowing.
191 | // See AddTimestampFn for more detail on this.
192 | .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));
193 |
194 | /*
195 | * Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1
196 | * minute (you can change this with a command-line option). See the documentation for more
197 | * information on how fixed windows work, and for information on the other types of windowing
198 | * available (e.g., sliding windows).
199 | */
200 | PCollection windowedWords =
201 | input.apply(
202 | Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
203 |
204 | /*
205 | * Concept #4: Re-use our existing CountWords transform that does not have knowledge of
206 | * windows over a PCollection containing windowed values.
207 | */
208 | PCollection> wordCounts = windowedWords.apply(new WordCount.CountWords());
209 |
210 | /*
211 | * Concept #5: Format the results and write to a sharded file partitioned by window, using a
212 | * simple ParDo operation. Because there may be failures followed by retries, the
213 | * writes must be idempotent, but the details of writing to files is elided here.
214 | */
215 | wordCounts
216 | .apply(MapElements.via(new WordCount.FormatAsTextFn()))
217 | .apply(new WriteOneFilePerWindow(output, options.getNumShards()));
218 |
219 | PipelineResult result = pipeline.run();
220 | try {
221 | result.waitUntilFinish();
222 | } catch (Exception exc) {
223 | result.cancel();
224 | }
225 | }
226 |
227 | public static void main(String[] args) throws IOException {
228 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
229 |
230 | runWindowedWordCount(options);
231 | }
232 | }
233 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/WordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import org.apache.beam.examples.common.ExampleUtils;
21 | import org.apache.beam.sdk.Pipeline;
22 | import org.apache.beam.sdk.io.TextIO;
23 | import org.apache.beam.sdk.metrics.Counter;
24 | import org.apache.beam.sdk.metrics.Distribution;
25 | import org.apache.beam.sdk.metrics.Metrics;
26 | import org.apache.beam.sdk.options.Default;
27 | import org.apache.beam.sdk.options.Description;
28 | import org.apache.beam.sdk.options.PipelineOptions;
29 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
30 | import org.apache.beam.sdk.options.Validation.Required;
31 | import org.apache.beam.sdk.transforms.Count;
32 | import org.apache.beam.sdk.transforms.DoFn;
33 | import org.apache.beam.sdk.transforms.MapElements;
34 | import org.apache.beam.sdk.transforms.PTransform;
35 | import org.apache.beam.sdk.transforms.ParDo;
36 | import org.apache.beam.sdk.transforms.SimpleFunction;
37 | import org.apache.beam.sdk.values.KV;
38 | import org.apache.beam.sdk.values.PCollection;
39 |
/**
 * An example that counts words in Shakespeare and includes Beam best practices.
 *
 * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
 * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After
 * you've looked at this example, then see the {@link DebuggingWordCount} pipeline, for introduction
 * of additional concepts.
 *
 * <p>For a detailed walkthrough of this example, see <a
 * href="https://beam.apache.org/get-started/wordcount-example/">
 * https://beam.apache.org/get-started/wordcount-example/</a>
 *
 * <p>Basic concepts, also in the MinimalWordCount example: Reading text files; counting a
 * PCollection; writing to text files
 *
 * <p>New Concepts:
 *
 * <pre>
 *   1. Executing a Pipeline both locally and using the selected runner
 *   2. Using ParDo with static DoFns defined out-of-line
 *   3. Building a composite transform
 *   4. Defining your own pipeline options
 * </pre>
 *
 * <p>Concept #1: you can execute this pipeline either locally or by selecting another runner.
 * These are now command-line options and not hard-coded as they were in the MinimalWordCount
 * example.
 *
 * <p>To change the runner, specify:
 *
 * <pre>{@code
 * --runner=YOUR_SELECTED_RUNNER
 * }</pre>
 *
 * <p>To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or
 * output prefix on a supported distributed file system.
 *
 * <pre>{@code
 * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
 * }</pre>
 *
 * <p>The input file defaults to a public data set containing the text of King Lear, by William
 * Shakespeare. You can override it and choose your own input with {@code --inputFile}.
 */
84 | public class WordCount {
85 |
86 | /**
87 | * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns
88 | * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to
89 | * a ParDo in the pipeline.
90 | */
91 | static class ExtractWordsFn extends DoFn {
92 | private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");
93 | private final Distribution lineLenDist =
94 | Metrics.distribution(ExtractWordsFn.class, "lineLenDistro");
95 |
96 | @ProcessElement
97 | public void processElement(@Element String element, OutputReceiver receiver) {
98 | lineLenDist.update(element.length());
99 | if (element.trim().isEmpty()) {
100 | emptyLines.inc();
101 | }
102 |
103 | // Split the line into words.
104 | String[] words = element.split(ExampleUtils.TOKENIZER_PATTERN, -1);
105 |
106 | // Output each word encountered into the output PCollection.
107 | for (String word : words) {
108 | if (!word.isEmpty()) {
109 | receiver.output(word);
110 | }
111 | }
112 | }
113 | }
114 |
115 | /** A SimpleFunction that converts a Word and Count into a printable string. */
116 | public static class FormatAsTextFn extends SimpleFunction, String> {
117 | @Override
118 | public String apply(KV input) {
119 | return input.getKey() + ": " + input.getValue();
120 | }
121 | }
122 |
123 | /**
124 | * A PTransform that converts a PCollection containing lines of text into a PCollection of
125 | * formatted word counts.
126 | *
127 | * Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
128 | * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
129 | * modular testing, and an improved monitoring experience.
130 | */
131 | public static class CountWords
132 | extends PTransform, PCollection>> {
133 | @Override
134 | public PCollection> expand(PCollection lines) {
135 |
136 | // Convert lines of text into individual words.
137 | PCollection words = lines.apply(ParDo.of(new ExtractWordsFn()));
138 |
139 | // Count the number of times each word occurs.
140 | PCollection> wordCounts = words.apply(Count.perElement());
141 |
142 | return wordCounts;
143 | }
144 | }
145 |
146 | /**
147 | * Options supported by {@link WordCount}.
148 | *
149 | * Concept #4: Defining your own configuration options. Here, you can add your own arguments to
150 | * be processed by the command-line parser, and specify default values for them. You can then
151 | * access the options values in your pipeline code.
152 | *
153 | *
Inherits standard configuration options.
154 | */
155 | public interface WordCountOptions extends PipelineOptions {
156 |
157 | /**
158 | * By default, this example reads from a public dataset containing the text of King Lear. Set
159 | * this option to choose a different input file or glob.
160 | */
161 | @Description("Path of the file to read from")
162 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
163 | String getInputFile();
164 |
165 | void setInputFile(String value);
166 |
167 | /** Set this required option to specify where to write the output. */
168 | @Description("Path of the file to write to")
169 | @Required
170 | String getOutput();
171 |
172 | void setOutput(String value);
173 | }
174 |
175 | static void runWordCount(WordCountOptions options) {
176 | Pipeline p = Pipeline.create(options);
177 |
178 | // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
179 | // static FormatAsTextFn() to the ParDo transform.
180 | p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
181 | .apply(new CountWords())
182 | .apply(MapElements.via(new FormatAsTextFn()))
183 | .apply("WriteCounts", TextIO.write().to(options.getOutput()));
184 |
185 | p.run().waitUntilFinish();
186 | }
187 |
188 | public static void main(String[] args) {
189 | WordCountOptions options =
190 | PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
191 |
192 | runWordCount(options);
193 | }
194 | }
195 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/common/ExampleBigQueryTableOptions.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.common;
19 |
20 | import com.google.api.services.bigquery.model.TableSchema;
21 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
22 | import org.apache.beam.sdk.options.Default;
23 | import org.apache.beam.sdk.options.DefaultValueFactory;
24 | import org.apache.beam.sdk.options.Description;
25 | import org.apache.beam.sdk.options.PipelineOptions;
26 |
27 | /**
28 | * Options that can be used to configure BigQuery tables in Beam examples. The project defaults to
29 | * the project being used to run the example.
30 | */
31 | public interface ExampleBigQueryTableOptions extends GcpOptions {
32 | @Description("BigQuery dataset name")
33 | @Default.String("beam_examples")
34 | String getBigQueryDataset();
35 |
36 | void setBigQueryDataset(String dataset);
37 |
38 | @Description("BigQuery table name")
39 | @Default.InstanceFactory(BigQueryTableFactory.class)
40 | String getBigQueryTable();
41 |
42 | void setBigQueryTable(String table);
43 |
44 | @Description("BigQuery table schema")
45 | TableSchema getBigQuerySchema();
46 |
47 | void setBigQuerySchema(TableSchema schema);
48 |
49 | /** Returns the job name as the default BigQuery table name. */
50 | class BigQueryTableFactory implements DefaultValueFactory {
51 | @Override
52 | public String create(PipelineOptions options) {
53 | return options.getJobName().replace('-', '_');
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/common/ExampleOptions.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.common;
19 |
20 | import org.apache.beam.sdk.options.Default;
21 | import org.apache.beam.sdk.options.Description;
22 | import org.apache.beam.sdk.options.PipelineOptions;
23 |
24 | /** Options that can be used to configure the Beam examples. */
public interface ExampleOptions extends PipelineOptions {
  /** Whether to keep jobs running after local process exit; defaults to {@code false}. */
  @Description("Whether to keep jobs running after local process exit")
  @Default.Boolean(false)
  boolean getKeepJobsRunning();

  void setKeepJobsRunning(boolean keepJobsRunning);

  /** Number of workers to use when executing the injector pipeline; defaults to 1. */
  @Description("Number of workers to use when executing the injector pipeline")
  @Default.Integer(1)
  int getInjectorNumWorkers();

  void setInjectorNumWorkers(int numWorkers);
}
38 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.common;
19 |
20 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
21 | import org.apache.beam.sdk.options.Default;
22 | import org.apache.beam.sdk.options.DefaultValueFactory;
23 | import org.apache.beam.sdk.options.Description;
24 | import org.apache.beam.sdk.options.PipelineOptions;
25 |
26 | /** Options that can be used to configure Pub/Sub topic/subscription in Beam examples. */
27 | public interface ExamplePubsubTopicAndSubscriptionOptions extends ExamplePubsubTopicOptions {
28 | @Description("Pub/Sub subscription")
29 | @Default.InstanceFactory(PubsubSubscriptionFactory.class)
30 | String getPubsubSubscription();
31 |
32 | void setPubsubSubscription(String subscription);
33 |
34 | /** Returns a default Pub/Sub subscription based on the project and the job names. */
35 | class PubsubSubscriptionFactory implements DefaultValueFactory {
36 | @Override
37 | public String create(PipelineOptions options) {
38 | return "projects/"
39 | + options.as(GcpOptions.class).getProject()
40 | + "/subscriptions/"
41 | + options.getJobName();
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/common/ExamplePubsubTopicOptions.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.common;
19 |
20 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
21 | import org.apache.beam.sdk.options.Default;
22 | import org.apache.beam.sdk.options.DefaultValueFactory;
23 | import org.apache.beam.sdk.options.Description;
24 | import org.apache.beam.sdk.options.PipelineOptions;
25 |
26 | /** Options that can be used to configure Pub/Sub topic in Beam examples. */
27 | public interface ExamplePubsubTopicOptions extends GcpOptions {
28 | @Description("Pub/Sub topic")
29 | @Default.InstanceFactory(PubsubTopicFactory.class)
30 | String getPubsubTopic();
31 |
32 | void setPubsubTopic(String topic);
33 |
34 | /** Returns a default Pub/Sub topic based on the project and the job names. */
35 | class PubsubTopicFactory implements DefaultValueFactory {
36 | @Override
37 | public String create(PipelineOptions options) {
38 | return "projects/"
39 | + options.as(GcpOptions.class).getProject()
40 | + "/topics/"
41 | + options.getJobName();
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/common/WriteOneFilePerWindow.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.common;
19 |
20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects.firstNonNull;
21 |
22 | import javax.annotation.Nullable;
23 | import org.apache.beam.sdk.io.FileBasedSink;
24 | import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy;
25 | import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints;
26 | import org.apache.beam.sdk.io.TextIO;
27 | import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
28 | import org.apache.beam.sdk.io.fs.ResourceId;
29 | import org.apache.beam.sdk.transforms.DoFn;
30 | import org.apache.beam.sdk.transforms.PTransform;
31 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
32 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
33 | import org.apache.beam.sdk.transforms.windowing.PaneInfo;
34 | import org.apache.beam.sdk.values.PCollection;
35 | import org.apache.beam.sdk.values.PDone;
36 | import org.joda.time.format.DateTimeFormatter;
37 | import org.joda.time.format.ISODateTimeFormat;
38 |
/**
 * A {@link PTransform} that writes elements to files with names deterministically derived from the
 * lower and upper bounds of their key (an {@link IntervalWindow}).
 *
 * <p>This is test utility code, not for end-users, so examples can be focused on their primary
 * lessons.
 */
46 | public class WriteOneFilePerWindow extends PTransform, PDone> {
47 | private static final DateTimeFormatter FORMATTER = ISODateTimeFormat.hourMinute();
48 | private String filenamePrefix;
49 | @Nullable private Integer numShards;
50 |
51 | public WriteOneFilePerWindow(String filenamePrefix, Integer numShards) {
52 | this.filenamePrefix = filenamePrefix;
53 | this.numShards = numShards;
54 | }
55 |
56 | @Override
57 | public PDone expand(PCollection input) {
58 | ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
59 | TextIO.Write write =
60 | TextIO.write()
61 | .to(new PerWindowFiles(resource))
62 | .withTempDirectory(resource.getCurrentDirectory())
63 | .withWindowedWrites();
64 | if (numShards != null) {
65 | write = write.withNumShards(numShards);
66 | }
67 | return input.apply(write);
68 | }
69 |
70 | /**
71 | * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data
72 | * being written. This always includes the shard number and the total number of shards. For
73 | * windowed writes, it also includes the window and pane index (a sequence number assigned to each
74 | * trigger firing).
75 | */
76 |   public static class PerWindowFiles extends FilenamePolicy {
77 |
78 |     private final ResourceId baseFilename;
79 |
80 |     public PerWindowFiles(ResourceId baseFilename) {
81 |       this.baseFilename = baseFilename;
82 |     }
83 |
84 |     public String filenamePrefixForWindow(IntervalWindow window) {
85 |       String prefix =
86 |           baseFilename.isDirectory() ? "" : firstNonNull(baseFilename.getFilename(), "");
87 |       return String.format(
88 |           "%s-%s-%s", prefix, FORMATTER.print(window.start()), FORMATTER.print(window.end()));
89 |     }
90 |
91 |     @Override
92 |     public ResourceId windowedFilename(
93 |         int shardNumber,
94 |         int numShards,
95 |         BoundedWindow window,
96 |         PaneInfo paneInfo,
97 |         OutputFileHints outputFileHints) {
98 |       IntervalWindow intervalWindow = (IntervalWindow) window;
99 |       String filename =
100 |           String.format(
101 |               "%s-%s-of-%s%s",
102 |               filenamePrefixForWindow(intervalWindow),
103 |               shardNumber,
104 |               numShards,
105 |               outputFileHints.getSuggestedFilenameSuffix());
106 |       return baseFilename
107 |           .getCurrentDirectory()
108 |           .resolve(filename, StandardResolveOptions.RESOLVE_FILE);
109 |     }
110 |
111 |     @Override
112 |     public ResourceId unwindowedFilename(
113 |         int shardNumber, int numShards, OutputFileHints outputFileHints) {
114 |       // This policy is only used with withWindowedWrites(), so unwindowed output never occurs.
115 |       throw new UnsupportedOperationException("Unsupported.");
116 |     }
117 |   }
117 | }
118 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/HourlyTeamScore.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import java.util.HashMap;
21 | import java.util.Map;
22 | import java.util.TimeZone;
23 | import org.apache.beam.examples.complete.game.utils.GameConstants;
24 | import org.apache.beam.examples.complete.game.utils.WriteToText;
25 | import org.apache.beam.sdk.Pipeline;
26 | import org.apache.beam.sdk.io.TextIO;
27 | import org.apache.beam.sdk.options.Default;
28 | import org.apache.beam.sdk.options.Description;
29 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
30 | import org.apache.beam.sdk.transforms.Filter;
31 | import org.apache.beam.sdk.transforms.ParDo;
32 | import org.apache.beam.sdk.transforms.WithTimestamps;
33 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
34 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
35 | import org.apache.beam.sdk.transforms.windowing.Window;
36 | import org.apache.beam.sdk.values.KV;
37 | import org.joda.time.DateTimeZone;
38 | import org.joda.time.Duration;
39 | import org.joda.time.Instant;
40 | import org.joda.time.format.DateTimeFormat;
41 | import org.joda.time.format.DateTimeFormatter;
42 |
43 | /**
44 | * This class is the second in a series of four pipelines that tell a story in a 'gaming' domain,
45 | * following {@link UserScore}. In addition to the concepts introduced in {@link UserScore}, new
46 | * concepts include: windowing and element timestamps; use of {@code Filter.by()}.
47 | *
48 | * This pipeline processes data collected from gaming events in batch, building on {@link
49 | * UserScore} but using fixed windows. It calculates the sum of scores per team, for each window,
50 | * optionally allowing specification of two timestamps before and after which data is filtered out.
51 | * This allows a model where late data collected after the intended analysis window can be included,
52 | * and any late-arriving data prior to the beginning of the analysis window can be removed as well.
53 | * By using windowing and adding element timestamps, we can do finer-grained analysis than with the
54 | * {@link UserScore} pipeline. However, our batch processing is high-latency, in that we don't get
55 | * results from plays at the beginning of the batch's time period until the batch is processed.
56 | *
57 |  * <p>To execute this pipeline, specify the pipeline configuration like this:
58 | *
59 |  * <pre>{@code
60 | * --tempLocation=YOUR_TEMP_DIRECTORY
61 | * --runner=YOUR_RUNNER
62 | * --output=YOUR_OUTPUT_DIRECTORY
63 | * (possibly options specific to your runner or permissions for your temp/output locations)
64 |  * }</pre>
65 | *
66 | * Optionally include {@code --input} to specify the batch input file path. To indicate a time
67 | * after which the data should be filtered out, include the {@code --stopMin} arg. E.g., {@code
68 | * --stopMin=2015-10-18-23-59} indicates that any data timestamped after 23:59 PST on 2015-10-18
69 | * should not be included in the analysis. To indicate a time before which data should be filtered
70 | * out, include the {@code --startMin} arg. If you're using the default input specified in {@link
71 | * UserScore}, "gs://apache-beam-samples/game/gaming_data*.csv", then {@code
72 | * --startMin=2015-11-16-16-10 --stopMin=2015-11-17-16-10} are good values.
73 | */
74 | public class HourlyTeamScore extends UserScore {
75 |
76 |   private static final DateTimeFormatter minFmt =
77 |       DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm")
78 |           .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles")));
79 |
80 |   /** Options supported by {@link HourlyTeamScore}. */
81 |   public interface Options extends UserScore.Options {
82 |
83 |     @Description("Numeric value of fixed window duration, in minutes")
84 |     @Default.Integer(60)
85 |     Integer getWindowDuration();
86 |
87 |     void setWindowDuration(Integer value);
88 |
89 |     @Description(
90 |         "String representation of the first minute after which to generate results,"
91 |             + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST."
92 |             + "Any input data timestamped prior to that minute won't be included in the sums.")
93 |     @Default.String("1970-01-01-00-00")
94 |     String getStartMin();
95 |
96 |     void setStartMin(String value);
97 |
98 |     @Description(
99 |         "String representation of the first minute for which to not generate results,"
100 |             + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST."
101 |             + "Any input data timestamped after that minute won't be included in the sums.")
102 |     @Default.String("2100-01-01-00-00")
103 |     String getStopMin();
104 |
105 |     void setStopMin(String value);
106 |   }
107 |
108 |   /**
109 |    * Create a map of information that describes how to write pipeline output to text. This map is
110 |    * passed to the {@link WriteToText} constructor to write team score sums and includes information
111 |    * about window start time.
112 |    */
113 |   protected static Map<String, WriteToText.FieldFn<KV<String, Integer>>> configureOutput() {
114 |     Map<String, WriteToText.FieldFn<KV<String, Integer>>> config = new HashMap<>();
115 |     config.put("team", (c, w) -> c.element().getKey());
116 |     config.put("total_score", (c, w) -> c.element().getValue());
117 |     config.put(
118 |         "window_start",
119 |         (c, w) -> {
120 |           IntervalWindow window = (IntervalWindow) w;
121 |           return GameConstants.DATE_TIME_FORMATTER.print(window.start());
122 |         });
123 |     return config;
124 |   }
125 |
126 |   /** Run a batch pipeline to do windowed analysis of the data. */
127 |   // [START DocInclude_HTSMain]
128 |   public static void main(String[] args) throws Exception {
129 |     // Begin constructing a pipeline configured by commandline flags.
130 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
131 |     Pipeline pipeline = Pipeline.create(options);
132 |
133 |     final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin()));
134 |     final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin()));
135 |
136 |     // Read 'gaming' events from a text file.
137 |     pipeline
138 |         .apply(TextIO.read().from(options.getInput()))
139 |         // Parse the incoming data.
140 |         .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
141 |
142 |         // Filter out data before and after the given times so that it is not included
143 |         // in the calculations. As we collect data in batches (say, by day), the batch for the day
144 |         // that we want to analyze could potentially include some late-arriving data from the
145 |         // previous day.
146 |         // If so, we want to weed it out. Similarly, if we include data from the following day
147 |         // (to scoop up late-arriving events from the day we're analyzing), we need to weed out
148 |         // events that fall after the time period we want to analyze.
149 |         // [START DocInclude_HTSFilters]
150 |         .apply(
151 |             "FilterStartTime",
152 |             Filter.by(
153 |                 (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
154 |         .apply(
155 |             "FilterEndTime",
156 |             Filter.by(
157 |                 (GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp.getMillis()))
158 |         // [END DocInclude_HTSFilters]
159 |
160 |         // [START DocInclude_HTSAddTsAndWindow]
161 |         // Add an element timestamp based on the event log, and apply fixed windowing.
162 |         .apply(
163 |             "AddEventTimestamps",
164 |             WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp())))
165 |         .apply(
166 |             "FixedWindowsTeam",
167 |             Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowDuration()))))
168 |         // [END DocInclude_HTSAddTsAndWindow]
169 |
170 |         // Extract and sum teamname/score pairs from the event data.
171 |         .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
172 |         .apply(
173 |             "WriteTeamScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), true));
174 |
175 |     pipeline.run().waitUntilFinish();
176 |   }
177 |   // [END DocInclude_HTSMain]
178 |
179 | }
180 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/LeaderBoard.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import java.util.HashMap;
21 | import java.util.Map;
22 | import org.apache.beam.examples.common.ExampleOptions;
23 | import org.apache.beam.examples.common.ExampleUtils;
24 | import org.apache.beam.examples.complete.game.utils.GameConstants;
25 | import org.apache.beam.examples.complete.game.utils.WriteToBigQuery;
26 | import org.apache.beam.examples.complete.game.utils.WriteWindowedToBigQuery;
27 | import org.apache.beam.sdk.Pipeline;
28 | import org.apache.beam.sdk.PipelineResult;
29 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
30 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
31 | import org.apache.beam.sdk.options.Default;
32 | import org.apache.beam.sdk.options.Description;
33 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
34 | import org.apache.beam.sdk.options.StreamingOptions;
35 | import org.apache.beam.sdk.options.Validation;
36 | import org.apache.beam.sdk.transforms.PTransform;
37 | import org.apache.beam.sdk.transforms.ParDo;
38 | import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
39 | import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
40 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
41 | import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
42 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
43 | import org.apache.beam.sdk.transforms.windowing.Repeatedly;
44 | import org.apache.beam.sdk.transforms.windowing.Window;
45 | import org.apache.beam.sdk.values.KV;
46 | import org.apache.beam.sdk.values.PCollection;
47 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting;
48 | import org.joda.time.Duration;
49 | import org.joda.time.Instant;
50 |
51 | /**
52 | * This class is the third in a series of four pipelines that tell a story in a 'gaming' domain,
53 | * following {@link UserScore} and {@link HourlyTeamScore}. Concepts include: processing unbounded
54 | * data using fixed windows; use of custom timestamps and event-time processing; generation of
55 | * early/speculative results; using .accumulatingFiredPanes() to do cumulative processing of late-
56 | * arriving data.
57 | *
58 | * This pipeline processes an unbounded stream of 'game events'. The calculation of the team
59 | * scores uses fixed windowing based on event time (the time of the game play event), not processing
60 | * time (the time that an event is processed by the pipeline). The pipeline calculates the sum of
61 | * scores per team, for each window. By default, the team scores are calculated using one-hour
62 | * windows.
63 | *
64 |  * <p>In contrast-- to demo another windowing option-- the user scores are calculated using a global
65 | * window, which periodically (every ten minutes) emits cumulative user score sums.
66 | *
67 |  * <p>In contrast to the previous pipelines in the series, which used static, finite input data,
68 | * here we're using an unbounded data source, which lets us provide speculative results, and allows
69 | * handling of late data, at much lower latency. We can use the early/speculative results to keep a
70 | * 'leaderboard' updated in near-realtime. Our handling of late data lets us generate correct
71 | * results, e.g. for 'team prizes'. We're now outputting window results as they're calculated,
72 | * giving us much lower latency than with the previous batch examples.
73 | *
74 |  * <p>Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector
75 | * documentation provides more detail on how to do this.
76 | *
77 |  * <p>To execute this pipeline, specify the pipeline configuration like this:
78 | *
79 |  * <pre>{@code
80 | * --project=YOUR_PROJECT_ID
81 | * --tempLocation=gs://YOUR_TEMP_DIRECTORY
82 | * --runner=YOUR_RUNNER
83 | * --dataset=YOUR-DATASET
84 | * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
85 |  * }</pre>
86 | *
87 | * The BigQuery dataset you specify must already exist. The PubSub topic you specify should be
88 | * the same topic to which the Injector is publishing.
89 | */
90 | public class LeaderBoard extends HourlyTeamScore {
91 |
92 |   static final Duration FIVE_MINUTES = Duration.standardMinutes(5);
93 |   static final Duration TEN_MINUTES = Duration.standardMinutes(10);
94 |
95 |   /** Options supported by {@link LeaderBoard}. */
96 |   public interface Options extends ExampleOptions, StreamingOptions {
97 |
98 |     @Description("BigQuery Dataset to write tables to. Must already exist.")
99 |     @Validation.Required
100 |     String getDataset();
101 |
102 |     void setDataset(String value);
103 |
104 |     @Description("Pub/Sub topic to read from")
105 |     @Validation.Required
106 |     String getTopic();
107 |
108 |     void setTopic(String value);
109 |
110 |     @Description("Numeric value of fixed window duration for team analysis, in minutes")
111 |     @Default.Integer(60)
112 |     Integer getTeamWindowDuration();
113 |
114 |     void setTeamWindowDuration(Integer value);
115 |
116 |     @Description("Numeric value of allowed data lateness, in minutes")
117 |     @Default.Integer(120)
118 |     Integer getAllowedLateness();
119 |
120 |     void setAllowedLateness(Integer value);
121 |
122 |     @Description("Prefix used for the BigQuery table names")
123 |     @Default.String("leaderboard")
124 |     String getLeaderBoardTableName();
125 |
126 |     void setLeaderBoardTableName(String value);
127 |   }
128 |
129 |   /**
130 |    * Create a map of information that describes how to write pipeline output to BigQuery. This map
131 |    * is used to write team score sums and includes event timing information.
132 |    */
133 |   protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>
134 |       configureWindowedTableWrite() {
135 |
136 |     Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
137 |         new HashMap<>();
138 |     tableConfigure.put(
139 |         "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey()));
140 |     tableConfigure.put(
141 |         "total_score",
142 |         new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue()));
143 |     tableConfigure.put(
144 |         "window_start",
145 |         new WriteWindowedToBigQuery.FieldInfo<>(
146 |             "STRING",
147 |             (c, w) -> {
148 |               IntervalWindow window = (IntervalWindow) w;
149 |               return GameConstants.DATE_TIME_FORMATTER.print(window.start());
150 |             }));
151 |     tableConfigure.put(
152 |         "processing_time",
153 |         new WriteWindowedToBigQuery.FieldInfo<>(
154 |             "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now())));
155 |     tableConfigure.put(
156 |         "timing",
157 |         new WriteWindowedToBigQuery.FieldInfo<>(
158 |             "STRING", (c, w) -> c.pane().getTiming().toString()));
159 |     return tableConfigure;
160 |   }
161 |
162 |   /**
163 |    * Create a map of information that describes how to write pipeline output to BigQuery. This map
164 |    * is passed to the {@link WriteToBigQuery} constructor to write user score sums.
165 |    */
166 |   protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
167 |       configureBigQueryWrite() {
168 |     Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure = new HashMap<>();
169 |     tableConfigure.put(
170 |         "user", new WriteToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey()));
171 |     tableConfigure.put(
172 |         "total_score",
173 |         new WriteToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue()));
174 |     return tableConfigure;
175 |   }
176 |
177 |   /**
178 |    * Create a map of information that describes how to write pipeline output to BigQuery. This map
179 |    * is used to write user score sums.
180 |    */
181 |   protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
182 |       configureGlobalWindowBigQueryWrite() {
183 |
184 |     Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
185 |         configureBigQueryWrite();
186 |     tableConfigure.put(
187 |         "processing_time",
188 |         new WriteToBigQuery.FieldInfo<>(
189 |             "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now())));
190 |     return tableConfigure;
191 |   }
192 |
193 |   public static void main(String[] args) throws Exception {
194 |
195 |     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
196 |     // Enforce that this pipeline is always run in streaming mode.
197 |     options.setStreaming(true);
198 |     ExampleUtils exampleUtils = new ExampleUtils(options);
199 |     Pipeline pipeline = Pipeline.create(options);
200 |
201 |     // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub
202 |     // data elements, and parse the data.
203 |     PCollection<GameActionInfo> gameEvents =
204 |         pipeline
205 |             .apply(
206 |                 PubsubIO.readStrings()
207 |                     .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
208 |                     .fromTopic(options.getTopic()))
209 |             .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));
210 |
211 |     gameEvents
212 |         .apply(
213 |             "CalculateTeamScores",
214 |             new CalculateTeamScores(
215 |                 Duration.standardMinutes(options.getTeamWindowDuration()),
216 |                 Duration.standardMinutes(options.getAllowedLateness())))
217 |         // Write the results to BigQuery.
218 |         .apply(
219 |             "WriteTeamScoreSums",
220 |             new WriteWindowedToBigQuery<>(
221 |                 options.as(GcpOptions.class).getProject(),
222 |                 options.getDataset(),
223 |                 options.getLeaderBoardTableName() + "_team",
224 |                 configureWindowedTableWrite()));
225 |     gameEvents
226 |         .apply(
227 |             "CalculateUserScores",
228 |             new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
229 |         // Write the results to BigQuery.
230 |         .apply(
231 |             "WriteUserScoreSums",
232 |             new WriteToBigQuery<>(
233 |                 options.as(GcpOptions.class).getProject(),
234 |                 options.getDataset(),
235 |                 options.getLeaderBoardTableName() + "_user",
236 |                 configureGlobalWindowBigQueryWrite()));
237 |
238 |     // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
239 |     // command line.
240 |     PipelineResult result = pipeline.run();
241 |     exampleUtils.waitToFinish(result);
242 |   }
243 |
244 |   /** Calculates scores for each team within the configured window duration. */
245 |   // [START DocInclude_WindowAndTrigger]
246 |   // Extract team/score pairs from the event stream, using hour-long windows by default.
247 |   @VisibleForTesting
248 |   static class CalculateTeamScores
249 |       extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {
250 |     private final Duration teamWindowDuration;
251 |     private final Duration allowedLateness;
252 |
253 |     CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) {
254 |       this.teamWindowDuration = teamWindowDuration;
255 |       this.allowedLateness = allowedLateness;
256 |     }
257 |
258 |     @Override
259 |     public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> infos) {
260 |       return infos
261 |           .apply(
262 |               "LeaderboardTeamFixedWindows",
263 |               Window.into(FixedWindows.of(teamWindowDuration))
264 |                   // We will get early (speculative) results as well as cumulative
265 |                   // processing of late data.
266 |                   .triggering(
267 |                       AfterWatermark.pastEndOfWindow()
268 |                           .withEarlyFirings(
269 |                               AfterProcessingTime.pastFirstElementInPane()
270 |                                   .plusDelayOf(FIVE_MINUTES))
271 |                           .withLateFirings(
272 |                               AfterProcessingTime.pastFirstElementInPane()
273 |                                   .plusDelayOf(TEN_MINUTES)))
274 |                   .withAllowedLateness(allowedLateness)
275 |                   .accumulatingFiredPanes())
276 |           // Extract and sum teamname/score pairs from the event data.
277 |           .apply("ExtractTeamScore", new ExtractAndSumScore("team"));
278 |     }
279 |   }
280 |   // [END DocInclude_WindowAndTrigger]
281 |
282 |   // [START DocInclude_ProcTimeTrigger]
283 |   /**
284 |    * Extract user/score pairs from the event stream using processing time, via global windowing. Get
285 |    * periodic updates on all users' running scores.
286 |    */
287 |   @VisibleForTesting
288 |   static class CalculateUserScores
289 |       extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {
290 |     private final Duration allowedLateness;
291 |
292 |     CalculateUserScores(Duration allowedLateness) {
293 |       this.allowedLateness = allowedLateness;
294 |     }
295 |
296 |     @Override
297 |     public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> input) {
298 |       return input
299 |           .apply(
300 |               "LeaderboardUserGlobalWindow",
301 |               Window.into(new GlobalWindows())
302 |                   // Get periodic results every ten minutes.
303 |                   .triggering(
304 |                       Repeatedly.forever(
305 |                           AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_MINUTES)))
306 |                   .accumulatingFiredPanes()
307 |                   .withAllowedLateness(allowedLateness))
308 |           // Extract and sum username/score pairs from the event data.
309 |           .apply("ExtractUserScore", new ExtractAndSumScore("user"));
310 |     }
311 |   }
312 |   // [END DocInclude_ProcTimeTrigger]
313 | }
314 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/StatefulTeamScore.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects.firstNonNull;
21 |
22 | import java.util.HashMap;
23 | import java.util.Map;
24 | import org.apache.beam.examples.common.ExampleUtils;
25 | import org.apache.beam.examples.complete.game.utils.GameConstants;
26 | import org.apache.beam.examples.complete.game.utils.WriteToBigQuery.FieldInfo;
27 | import org.apache.beam.examples.complete.game.utils.WriteWindowedToBigQuery;
28 | import org.apache.beam.sdk.Pipeline;
29 | import org.apache.beam.sdk.PipelineResult;
30 | import org.apache.beam.sdk.coders.VarIntCoder;
31 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
32 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
33 | import org.apache.beam.sdk.options.Default;
34 | import org.apache.beam.sdk.options.Description;
35 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
36 | import org.apache.beam.sdk.state.StateSpec;
37 | import org.apache.beam.sdk.state.StateSpecs;
38 | import org.apache.beam.sdk.state.ValueState;
39 | import org.apache.beam.sdk.transforms.DoFn;
40 | import org.apache.beam.sdk.transforms.MapElements;
41 | import org.apache.beam.sdk.transforms.ParDo;
42 | import org.apache.beam.sdk.values.KV;
43 | import org.apache.beam.sdk.values.TypeDescriptor;
44 | import org.apache.beam.sdk.values.TypeDescriptors;
45 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting;
46 | import org.joda.time.Instant;
47 |
48 | /**
49 | * This class is part of a series of pipelines that tell a story in a gaming domain. Concepts
50 | * include: stateful processing.
51 | *
52 | * This pipeline processes an unbounded stream of 'game events'. It uses stateful processing to
53 | * aggregate team scores per team and outputs team name and it's total score every time the team
54 | * passes a new multiple of a threshold score. For example, multiples of the threshold could be the
55 | * corresponding scores required to pass each level of the game. By default, this threshold is set
56 | * to 5000.
57 | *
58 |  * <p>Stateful processing allows us to write pipelines that output based on a runtime state (when a
59 | * team reaches a certain score, in every 100 game events etc) without time triggers. See
60 | * https://beam.apache.org/blog/2017/02/13/stateful-processing.html for more information on using
61 | * stateful processing.
62 | *
63 |  * <p>Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector
64 | * documentation provides more detail on how to do this.
65 | *
66 |  * <p>To execute this pipeline, specify the pipeline configuration like this:
67 | *
68 |  * <pre>{@code
69 | * --project=YOUR_PROJECT_ID
70 | * --tempLocation=gs://YOUR_TEMP_DIRECTORY
71 | * --runner=YOUR_RUNNER
72 | * --dataset=YOUR-DATASET
73 | * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
74 | * }
75 | *
76 | * The BigQuery dataset you specify must already exist. The PubSub topic you specify should be
77 | * the same topic to which the Injector is publishing.
78 | */
79 | @SuppressWarnings({
80 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
81 | })
82 | public class StatefulTeamScore extends LeaderBoard {
83 |
84 | /** Options supported by {@link StatefulTeamScore}. */
85 | public interface Options extends LeaderBoard.Options {
86 |
87 | @Description("Numeric value, multiple of which is used as threshold for outputting team score.")
88 | @Default.Integer(5000)
89 | Integer getThresholdScore();
90 |
91 | void setThresholdScore(Integer value);
92 | }
93 |
94 | /**
95 | * Create a map of information that describes how to write pipeline output to BigQuery. This map
96 | * is used to write team score sums.
97 | */
98 | private static Map>> configureCompleteWindowedTableWrite() {
99 |
100 | Map>> tableConfigure =
101 | new HashMap<>();
102 | tableConfigure.put(
103 | "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey()));
104 | tableConfigure.put(
105 | "total_score",
106 | new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue()));
107 | tableConfigure.put(
108 | "processing_time",
109 | new WriteWindowedToBigQuery.FieldInfo<>(
110 | "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now())));
111 | return tableConfigure;
112 | }
113 |
114 | public static void main(String[] args) throws Exception {
115 |
116 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
117 | // Enforce that this pipeline is always run in streaming mode.
118 | options.setStreaming(true);
119 | ExampleUtils exampleUtils = new ExampleUtils(options);
120 | Pipeline pipeline = Pipeline.create(options);
121 |
122 | pipeline
123 | // Read game events from Pub/Sub using custom timestamps, which are extracted from the
124 | // pubsub data elements, and parse the data.
125 | .apply(
126 | PubsubIO.readStrings()
127 | .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
128 | .fromTopic(options.getTopic()))
129 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
130 | // Create mapping. UpdateTeamScore uses team name as key.
131 | .apply(
132 | "MapTeamAsKey",
133 | MapElements.into(
134 | TypeDescriptors.kvs(
135 | TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class)))
136 | .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo)))
137 | // Outputs a team's score every time it passes a new multiple of the threshold.
138 | .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore())))
139 | // Write the results to BigQuery.
140 | .apply(
141 | "WriteTeamLeaders",
142 | new WriteWindowedToBigQuery<>(
143 | options.as(GcpOptions.class).getProject(),
144 | options.getDataset(),
145 | options.getLeaderBoardTableName() + "_team_leader",
146 | configureCompleteWindowedTableWrite()));
147 |
148 | // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
149 | // command line.
150 | PipelineResult result = pipeline.run();
151 | exampleUtils.waitToFinish(result);
152 | }
153 |
154 | /**
155 | * Tracks each team's score separately in a single state cell and outputs the score every time it
156 | * passes a new multiple of a threshold.
157 | *
158 | * We use stateful {@link DoFn} because:
159 | *
160 | *
161 | * - State is key-partitioned. Therefore, the score is calculated per team.
162 | *
- Stateful {@link DoFn} can determine when to output based on the state. This only allows
163 | * outputting when a team's score passes a given threshold.
164 | *
165 | */
166 | @VisibleForTesting
167 | public static class UpdateTeamScoreFn
168 | extends DoFn, KV> {
169 |
170 | private static final String TOTAL_SCORE = "totalScore";
171 | private final int thresholdScore;
172 |
173 | public UpdateTeamScoreFn(int thresholdScore) {
174 | this.thresholdScore = thresholdScore;
175 | }
176 |
177 | /**
178 | * Describes the state for storing team score. Let's break down this statement.
179 | *
180 | * {@link StateSpec} configures the state cell, which is provided by a runner during pipeline
181 | * execution.
182 | *
183 | *
{@link org.apache.beam.sdk.transforms.DoFn.StateId} annotation assigns an identifier to
184 | * the state, which is used to refer the state in {@link
185 | * org.apache.beam.sdk.transforms.DoFn.ProcessElement}.
186 | *
187 | *
A {@link ValueState} stores single value per key and per window. Because our pipeline is
188 | * globally windowed in this example, this {@link ValueState} is just key partitioned, with one
189 | * score per team. Any other class that extends {@link org.apache.beam.sdk.state.State} can be
190 | * used.
191 | *
192 | *
In order to store the value, the state must be encoded. Therefore, we provide a coder, in
193 | * this case the {@link VarIntCoder}. If the coder is not provided as in {@code
194 | * StateSpecs.value()}, Beam's coder inference will try to provide a coder automatically.
195 | */
196 | @StateId(TOTAL_SCORE)
197 | private final StateSpec> totalScoreSpec =
198 | StateSpecs.value(VarIntCoder.of());
199 |
200 | /**
201 | * To use a state cell, annotate a parameter with {@link
202 | * org.apache.beam.sdk.transforms.DoFn.StateId} that matches the state declaration. The type of
203 | * the parameter should match the {@link StateSpec} type.
204 | */
205 | @ProcessElement
206 | public void processElement(
207 | ProcessContext c, @StateId(TOTAL_SCORE) ValueState totalScore) {
208 | String teamName = c.element().getKey();
209 | GameActionInfo gInfo = c.element().getValue();
210 |
211 | // ValueState cells do not contain a default value. If the state is possibly not written, make
212 | // sure to check for null on read.
213 | int oldTotalScore = firstNonNull(totalScore.read(), 0);
214 | totalScore.write(oldTotalScore + gInfo.score);
215 |
216 | // Since there are no negative scores, the easiest way to check whether a team just passed a
217 | // new multiple of the threshold score is to compare the quotients of dividing total scores by
218 | // threshold before and after this aggregation. For example, if the total score was 1999,
219 | // the new total is 2002, and the threshold is 1000, 1999 / 1000 = 1, 2002 / 1000 = 2.
220 | // Therefore, this team passed the threshold.
221 | if (oldTotalScore / this.thresholdScore < totalScore.read() / this.thresholdScore) {
222 | c.output(KV.of(teamName, totalScore.read()));
223 | }
224 | }
225 | }
226 | }
227 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/UserScore.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import java.util.HashMap;
21 | import java.util.Map;
22 | import java.util.Objects;
23 | import org.apache.avro.reflect.Nullable;
24 | import org.apache.beam.examples.complete.game.utils.WriteToText;
25 | import org.apache.beam.sdk.Pipeline;
26 | import org.apache.beam.sdk.coders.AvroCoder;
27 | import org.apache.beam.sdk.coders.DefaultCoder;
28 | import org.apache.beam.sdk.io.TextIO;
29 | import org.apache.beam.sdk.metrics.Counter;
30 | import org.apache.beam.sdk.metrics.Metrics;
31 | import org.apache.beam.sdk.options.Default;
32 | import org.apache.beam.sdk.options.Description;
33 | import org.apache.beam.sdk.options.PipelineOptions;
34 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
35 | import org.apache.beam.sdk.options.Validation;
36 | import org.apache.beam.sdk.transforms.DoFn;
37 | import org.apache.beam.sdk.transforms.MapElements;
38 | import org.apache.beam.sdk.transforms.PTransform;
39 | import org.apache.beam.sdk.transforms.ParDo;
40 | import org.apache.beam.sdk.transforms.Sum;
41 | import org.apache.beam.sdk.values.KV;
42 | import org.apache.beam.sdk.values.PCollection;
43 | import org.apache.beam.sdk.values.TypeDescriptors;
44 | import org.slf4j.Logger;
45 | import org.slf4j.LoggerFactory;
46 |
47 | /**
48 | * This class is the first in a series of four pipelines that tell a story in a 'gaming' domain.
49 | * Concepts: batch processing, reading input from text files, writing output to text files, using
50 | * standalone DoFns, use of the sum per key transform, and use of Java 8 lambda syntax.
51 | *
52 | * In this gaming scenario, many users play, as members of different teams, over the course of a
53 | * day, and their actions are logged for processing. Some of the logged game events may be late-
54 | * arriving, if users play on mobile devices and go transiently offline for a period.
55 | *
56 | *
This pipeline does batch processing of data collected from gaming events. It calculates the
57 | * sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The
58 | * batch processing will not include any late data that arrives after the day's cutoff point.
59 | *
60 | *
To execute this pipeline, specify the pipeline configuration like this:
61 | *
62 | *
{@code
63 | * --tempLocation=YOUR_TEMP_DIRECTORY
64 | * --runner=YOUR_RUNNER
65 | * --output=YOUR_OUTPUT_DIRECTORY
66 | * (possibly options specific to your runner or permissions for your temp/output locations)
67 | * }
68 | *
69 | * Optionally include the --input argument to specify a batch input file. See the --input default
70 | * value for example batch data file, or use {@code injector.Injector} to generate your own batch
71 | * data.
72 | */
73 | @SuppressWarnings({
74 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
75 | })
76 | public class UserScore {
77 |
78 | /** Class to hold info about a game event. */
79 | @DefaultCoder(AvroCoder.class)
80 | static class GameActionInfo {
81 | @Nullable String user;
82 | @Nullable String team;
83 | @Nullable Integer score;
84 | @Nullable Long timestamp;
85 |
86 | public GameActionInfo() {}
87 |
88 | public GameActionInfo(String user, String team, Integer score, Long timestamp) {
89 | this.user = user;
90 | this.team = team;
91 | this.score = score;
92 | this.timestamp = timestamp;
93 | }
94 |
95 | public String getUser() {
96 | return this.user;
97 | }
98 |
99 | public String getTeam() {
100 | return this.team;
101 | }
102 |
103 | public Integer getScore() {
104 | return this.score;
105 | }
106 |
107 | public Long getTimestamp() {
108 | return this.timestamp;
109 | }
110 |
111 | public String getKey(String keyname) {
112 | if ("team".equals(keyname)) {
113 | return this.team;
114 | } else { // return username as default
115 | return this.user;
116 | }
117 | }
118 |
119 | @Override
120 | public boolean equals(Object o) {
121 | if (this == o) {
122 | return true;
123 | }
124 | if (o == null || o.getClass() != this.getClass()) {
125 | return false;
126 | }
127 |
128 | GameActionInfo gameActionInfo = (GameActionInfo) o;
129 |
130 | if (!this.getUser().equals(gameActionInfo.getUser())) {
131 | return false;
132 | }
133 |
134 | if (!this.getTeam().equals(gameActionInfo.getTeam())) {
135 | return false;
136 | }
137 |
138 | if (!this.getScore().equals(gameActionInfo.getScore())) {
139 | return false;
140 | }
141 |
142 | return this.getTimestamp().equals(gameActionInfo.getTimestamp());
143 | }
144 |
145 | @Override
146 | public int hashCode() {
147 | return Objects.hash(user, team, score, timestamp);
148 | }
149 | }
150 |
151 | /**
152 | * Parses the raw game event info into GameActionInfo objects. Each event line has the following
153 | * format: username,teamname,score,timestamp_in_ms,readable_time e.g.:
154 | * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224 The human-readable
155 | * time string is not used here.
156 | */
157 | static class ParseEventFn extends DoFn {
158 |
159 | // Log and count parse errors.
160 | private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class);
161 | private final Counter numParseErrors = Metrics.counter("main", "ParseErrors");
162 |
163 | @ProcessElement
164 | public void processElement(ProcessContext c) {
165 | String[] components = c.element().split(",", -1);
166 | try {
167 | String user = components[0].trim();
168 | String team = components[1].trim();
169 | Integer score = Integer.parseInt(components[2].trim());
170 | Long timestamp = Long.parseLong(components[3].trim());
171 | GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp);
172 | c.output(gInfo);
173 | } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
174 | numParseErrors.inc();
175 | LOG.info("Parse error on " + c.element() + ", " + e.getMessage());
176 | }
177 | }
178 | }
179 |
180 | /**
181 | * A transform to extract key/score information from GameActionInfo, and sum the scores. The
182 | * constructor arg determines whether 'team' or 'user' info is extracted.
183 | */
184 | // [START DocInclude_USExtractXform]
185 | public static class ExtractAndSumScore
186 | extends PTransform, PCollection>> {
187 |
188 | private final String field;
189 |
190 | ExtractAndSumScore(String field) {
191 | this.field = field;
192 | }
193 |
194 | @Override
195 | public PCollection> expand(PCollection gameInfo) {
196 |
197 | return gameInfo
198 | .apply(
199 | MapElements.into(
200 | TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
201 | .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore())))
202 | .apply(Sum.integersPerKey());
203 | }
204 | }
205 | // [END DocInclude_USExtractXform]
206 |
207 | /** Options supported by {@link UserScore}. */
208 | public interface Options extends PipelineOptions {
209 |
210 | @Description("Path to the data file(s) containing game data.")
211 | /* The default maps to two large Google Cloud Storage files (each ~12GB) holding two subsequent
212 | day's worth (roughly) of data.
213 |
214 | Note: You may want to use a small sample dataset to test it locally/quickly : gs://apache-beam-samples/game/small/gaming_data.csv
215 | You can also download it via the command line gsutil cp gs://apache-beam-samples/game/small/gaming_data.csv ./destination_folder/gaming_data.csv */
216 | @Default.String("gs://apache-beam-samples/game/gaming_data*.csv")
217 | String getInput();
218 |
219 | void setInput(String value);
220 |
221 | // Set this required option to specify where to write the output.
222 | @Description("Path of the file to write to.")
223 | @Validation.Required
224 | String getOutput();
225 |
226 | void setOutput(String value);
227 | }
228 |
229 | /**
230 | * Create a map of information that describes how to write pipeline output to text. This map is
231 | * passed to the {@link WriteToText} constructor to write user score sums.
232 | */
233 | protected static Map>> configureOutput() {
234 | Map>> config = new HashMap<>();
235 | config.put("user", (c, w) -> c.element().getKey());
236 | config.put("total_score", (c, w) -> c.element().getValue());
237 | return config;
238 | }
239 |
240 | /** Run a batch pipeline. */
241 | // [START DocInclude_USMain]
242 | public static void main(String[] args) throws Exception {
243 | // Begin constructing a pipeline configured by commandline flags.
244 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
245 | Pipeline pipeline = Pipeline.create(options);
246 |
247 | // Read events from a text file and parse them.
248 | pipeline
249 | .apply(TextIO.read().from(options.getInput()))
250 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
251 | // Extract and sum username/score pairs from the event data.
252 | .apply("ExtractUserScore", new ExtractAndSumScore("user"))
253 | .apply(
254 | "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));
255 |
256 | // Run the batch pipeline.
257 | pipeline.run().waitUntilFinish();
258 | }
259 | // [END DocInclude_USMain]
260 | }
261 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/injector/InjectorUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game.injector;
19 |
20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull;
21 |
22 | import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
23 | import com.google.api.client.googleapis.json.GoogleJsonResponseException;
24 | import com.google.api.client.googleapis.util.Utils;
25 | import com.google.api.client.http.HttpRequestInitializer;
26 | import com.google.api.client.http.HttpStatusCodes;
27 | import com.google.api.client.http.HttpTransport;
28 | import com.google.api.client.json.JsonFactory;
29 | import com.google.api.services.pubsub.Pubsub;
30 | import com.google.api.services.pubsub.PubsubScopes;
31 | import com.google.api.services.pubsub.model.Topic;
32 | import java.io.IOException;
33 |
34 | class InjectorUtils {
35 |
36 | private static final String APP_NAME = "injector";
37 |
38 | /** Builds a new Pubsub client and returns it. */
39 | public static Pubsub getClient(final HttpTransport httpTransport, final JsonFactory jsonFactory)
40 | throws IOException {
41 | checkNotNull(httpTransport);
42 | checkNotNull(jsonFactory);
43 | GoogleCredential credential =
44 | GoogleCredential.getApplicationDefault(httpTransport, jsonFactory);
45 | if (credential.createScopedRequired()) {
46 | credential = credential.createScoped(PubsubScopes.all());
47 | }
48 | if (credential.getClientAuthentication() != null) {
49 | System.out.println(
50 | "\n***Warning! You are not using service account credentials to "
51 | + "authenticate.\nYou need to use service account credentials for this example,"
52 | + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run "
53 | + "out of PubSub quota very quickly.\nSee "
54 | + "https://developers.google.com/identity/protocols/application-default-credentials.");
55 | System.exit(1);
56 | }
57 | HttpRequestInitializer initializer = new RetryHttpInitializerWrapper(credential);
58 | return new Pubsub.Builder(httpTransport, jsonFactory, initializer)
59 | .setApplicationName(APP_NAME)
60 | .build();
61 | }
62 |
63 | /** Builds a new Pubsub client with default HttpTransport and JsonFactory and returns it. */
64 | public static Pubsub getClient() throws IOException {
65 | return getClient(Utils.getDefaultTransport(), Utils.getDefaultJsonFactory());
66 | }
67 |
68 | /** Returns the fully qualified topic name for Pub/Sub. */
69 | public static String getFullyQualifiedTopicName(final String project, final String topic) {
70 | return String.format("projects/%s/topics/%s", project, topic);
71 | }
72 |
73 | /** Create a topic if it doesn't exist. */
74 | public static void createTopic(Pubsub client, String fullTopicName) throws IOException {
75 | System.out.println("fullTopicName " + fullTopicName);
76 | try {
77 | client.projects().topics().get(fullTopicName).execute();
78 | } catch (GoogleJsonResponseException e) {
79 | if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) {
80 | Topic topic = client.projects().topics().create(fullTopicName, new Topic()).execute();
81 | System.out.printf("Topic %s was created.%n", topic.getName());
82 | }
83 | }
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/injector/RetryHttpInitializerWrapper.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game.injector;
19 |
20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull;
21 |
22 | import com.google.api.client.auth.oauth2.Credential;
23 | import com.google.api.client.http.HttpBackOffIOExceptionHandler;
24 | import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler;
25 | import com.google.api.client.http.HttpRequest;
26 | import com.google.api.client.http.HttpRequestInitializer;
27 | import com.google.api.client.http.HttpUnsuccessfulResponseHandler;
28 | import com.google.api.client.util.ExponentialBackOff;
29 | import com.google.api.client.util.Sleeper;
30 | import java.util.logging.Logger;
31 |
32 | /**
33 | * RetryHttpInitializerWrapper will automatically retry upon RPC failures, preserving the
34 | * auto-refresh behavior of the Google Credentials.
35 | */
36 | public class RetryHttpInitializerWrapper implements HttpRequestInitializer {
37 |
38 | /** A private logger. */
39 | private static final Logger LOG = Logger.getLogger(RetryHttpInitializerWrapper.class.getName());
40 |
41 | /** One minutes in miliseconds. */
42 | private static final int ONEMINITUES = 60000;
43 |
44 | /**
45 | * Intercepts the request for filling in the "Authorization" header field, as well as recovering
46 | * from certain unsuccessful error codes wherein the Credential must refresh its token for a
47 | * retry.
48 | */
49 | private final Credential wrappedCredential;
50 |
51 | /** A sleeper; you can replace it with a mock in your test. */
52 | private final Sleeper sleeper;
53 |
54 | /**
55 | * A constructor.
56 | *
57 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header.
58 | */
59 | public RetryHttpInitializerWrapper(final Credential wrappedCredential) {
60 | this(wrappedCredential, Sleeper.DEFAULT);
61 | }
62 |
63 | /**
64 | * A protected constructor only for testing.
65 | *
66 | * @param wrappedCredential Credential which will be wrapped and used for providing auth header.
67 | * @param sleeper Sleeper for easy testing.
68 | */
69 | RetryHttpInitializerWrapper(final Credential wrappedCredential, final Sleeper sleeper) {
70 | this.wrappedCredential = checkNotNull(wrappedCredential);
71 | this.sleeper = sleeper;
72 | }
73 |
74 | /** Initializes the given request. */
75 | @Override
76 | public final void initialize(final HttpRequest request) {
77 | request.setReadTimeout(2 * ONEMINITUES); // 2 minutes read timeout
78 | final HttpUnsuccessfulResponseHandler backoffHandler =
79 | new HttpBackOffUnsuccessfulResponseHandler(new ExponentialBackOff()).setSleeper(sleeper);
80 | request.setInterceptor(wrappedCredential);
81 | request.setUnsuccessfulResponseHandler(
82 | (request1, response, supportsRetry) -> {
83 | if (wrappedCredential.handleResponse(request1, response, supportsRetry)) {
84 | // If credential decides it can handle it, the return code or message indicated
85 | // something specific to authentication, and no backoff is desired.
86 | return true;
87 | } else if (backoffHandler.handleResponse(request1, response, supportsRetry)) {
88 | // Otherwise, we defer to the judgement of our internal backoff handler.
89 | LOG.info("Retrying " + request1.getUrl().toString());
90 | return true;
91 | } else {
92 | return false;
93 | }
94 | });
95 | request.setIOExceptionHandler(
96 | new HttpBackOffIOExceptionHandler(new ExponentialBackOff()).setSleeper(sleeper));
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/GameConstants.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game.utils;
19 |
20 | import java.util.TimeZone;
21 | import org.joda.time.DateTimeZone;
22 | import org.joda.time.format.DateTimeFormat;
23 | import org.joda.time.format.DateTimeFormatter;
24 |
25 | /** Shared constants between game series classes. */
26 | public class GameConstants {
27 |
28 | public static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
29 |
30 | public static final DateTimeFormatter DATE_TIME_FORMATTER =
31 | DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
32 | .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles")));
33 | }
34 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/WriteToBigQuery.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game.utils;
19 |
20 | import com.google.api.services.bigquery.model.TableFieldSchema;
21 | import com.google.api.services.bigquery.model.TableReference;
22 | import com.google.api.services.bigquery.model.TableRow;
23 | import com.google.api.services.bigquery.model.TableSchema;
24 | import java.io.Serializable;
25 | import java.util.ArrayList;
26 | import java.util.List;
27 | import java.util.Map;
28 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
29 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
30 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
31 | import org.apache.beam.sdk.transforms.DoFn;
32 | import org.apache.beam.sdk.transforms.PTransform;
33 | import org.apache.beam.sdk.transforms.ParDo;
34 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
35 | import org.apache.beam.sdk.values.PCollection;
36 | import org.apache.beam.sdk.values.PDone;
37 |
38 | /**
39 | * Generate, format, and write BigQuery table row information. Use provided information about the
40 | * field names and types, as well as lambda functions that describe how to generate their values.
41 | */
42 | @SuppressWarnings({
43 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
44 | })
45 | public class WriteToBigQuery extends PTransform, PDone> {
46 |
47 | protected String projectId;
48 | protected String datasetId;
49 | protected String tableName;
50 | protected Map> fieldInfo;
51 |
52 | public WriteToBigQuery() {}
53 |
54 | public WriteToBigQuery(
55 | String projectId,
56 | String datasetId,
57 | String tableName,
58 | Map> fieldInfo) {
59 | this.projectId = projectId;
60 | this.datasetId = datasetId;
61 | this.tableName = tableName;
62 | this.fieldInfo = fieldInfo;
63 | }
64 |
65 | /**
66 | * A {@link Serializable} function from a {@link DoFn.ProcessContext} and {@link BoundedWindow} to
67 | * the value for that field.
68 | */
69 | public interface FieldFn extends Serializable {
70 | Object apply(DoFn.ProcessContext context, BoundedWindow window);
71 | }
72 |
73 | /** Define a class to hold information about output table field definitions. */
74 | public static class FieldInfo implements Serializable {
75 | // The BigQuery 'type' of the field
76 | private String fieldType;
77 | // A lambda function to generate the field value
78 | private FieldFn fieldFn;
79 |
80 | public FieldInfo(String fieldType, FieldFn fieldFn) {
81 | this.fieldType = fieldType;
82 | this.fieldFn = fieldFn;
83 | }
84 |
85 | String getFieldType() {
86 | return this.fieldType;
87 | }
88 |
89 | FieldFn getFieldFn() {
90 | return this.fieldFn;
91 | }
92 | }
93 |
94 | /** Convert each key/score pair into a BigQuery TableRow as specified by fieldFn. */
95 | protected class BuildRowFn extends DoFn {
96 |
97 | @ProcessElement
98 | public void processElement(ProcessContext c, BoundedWindow window) {
99 |
100 | TableRow row = new TableRow();
101 | for (Map.Entry> entry : fieldInfo.entrySet()) {
102 | String key = entry.getKey();
103 | FieldInfo fcnInfo = entry.getValue();
104 | FieldFn fcn = fcnInfo.getFieldFn();
105 | row.set(key, fcn.apply(c, window));
106 | }
107 | c.output(row);
108 | }
109 | }
110 |
111 | /** Build the output table schema. */
112 | protected TableSchema getSchema() {
113 | List fields = new ArrayList<>();
114 | for (Map.Entry> entry : fieldInfo.entrySet()) {
115 | String key = entry.getKey();
116 | FieldInfo fcnInfo = entry.getValue();
117 | String bqType = fcnInfo.getFieldType();
118 | fields.add(new TableFieldSchema().setName(key).setType(bqType));
119 | }
120 | return new TableSchema().setFields(fields);
121 | }
122 |
123 | @Override
124 | public PDone expand(PCollection teamAndScore) {
125 | teamAndScore
126 | .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
127 | .apply(
128 | BigQueryIO.writeTableRows()
129 | .to(getTable(projectId, datasetId, tableName))
130 | .withSchema(getSchema())
131 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
132 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
133 | return PDone.in(teamAndScore.getPipeline());
134 | }
135 |
136 | /** Utility to construct an output table reference. */
137 | static TableReference getTable(String projectId, String datasetId, String tableName) {
138 | TableReference table = new TableReference();
139 | table.setDatasetId(datasetId);
140 | table.setProjectId(projectId);
141 | table.setTableId(tableName);
142 | return table;
143 | }
144 | }
145 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/WriteToText.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game.utils;
19 |
20 | import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
21 |
22 | import java.io.Serializable;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.Map;
26 | import java.util.TimeZone;
27 | import java.util.stream.Collectors;
28 | import org.apache.beam.sdk.io.FileBasedSink;
29 | import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy;
30 | import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints;
31 | import org.apache.beam.sdk.io.TextIO;
32 | import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
33 | import org.apache.beam.sdk.io.fs.ResourceId;
34 | import org.apache.beam.sdk.transforms.DoFn;
35 | import org.apache.beam.sdk.transforms.PTransform;
36 | import org.apache.beam.sdk.transforms.ParDo;
37 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
38 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
39 | import org.apache.beam.sdk.transforms.windowing.PaneInfo;
40 | import org.apache.beam.sdk.values.PCollection;
41 | import org.apache.beam.sdk.values.PDone;
42 | import org.joda.time.DateTimeZone;
43 | import org.joda.time.format.DateTimeFormat;
44 | import org.joda.time.format.DateTimeFormatter;
45 |
46 | /**
47 | * Generate, format, and write rows. Use provided information about the field names and types, as
48 | * well as lambda functions that describe how to generate their values.
49 | */
50 | @SuppressWarnings({
51 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
52 | })
53 | public class WriteToText extends PTransform, PDone> {
54 |
55 | private static final DateTimeFormatter formatter =
56 | DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
57 | .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles")));
58 |
59 | protected String filenamePrefix;
60 | protected Map> fieldFn;
61 | protected boolean windowed;
62 |
63 | public WriteToText() {}
64 |
65 | public WriteToText(
66 | String filenamePrefix, Map> fieldFn, boolean windowed) {
67 | this.filenamePrefix = filenamePrefix;
68 | this.fieldFn = fieldFn;
69 | this.windowed = windowed;
70 | }
71 |
72 | /**
73 | * A {@link Serializable} function from a {@link DoFn.ProcessContext} and {@link BoundedWindow} to
74 | * the value for that field.
75 | */
76 | public interface FieldFn extends Serializable {
77 | Object apply(DoFn.ProcessContext context, BoundedWindow window);
78 | }
79 |
80 | /** Convert each key/score pair into a row as specified by fieldFn. */
81 | protected class BuildRowFn extends DoFn {
82 |
83 | @ProcessElement
84 | public void processElement(ProcessContext c, BoundedWindow window) {
85 | List fields = new ArrayList<>();
86 | for (Map.Entry> entry : fieldFn.entrySet()) {
87 | String key = entry.getKey();
88 | FieldFn fcn = entry.getValue();
89 | fields.add(key + ": " + fcn.apply(c, window));
90 | }
91 | String result = fields.stream().collect(Collectors.joining(", "));
92 | c.output(result);
93 | }
94 | }
95 |
96 | /**
97 | * A {@link DoFn} that writes elements to files with names deterministically derived from the
98 | * lower and upper bounds of their key (an {@link IntervalWindow}).
99 | */
100 | protected static class WriteOneFilePerWindow extends PTransform, PDone> {
101 |
102 | private final String filenamePrefix;
103 |
104 | public WriteOneFilePerWindow(String filenamePrefix) {
105 | this.filenamePrefix = filenamePrefix;
106 | }
107 |
108 | @Override
109 | public PDone expand(PCollection input) {
110 | // Verify that the input has a compatible window type.
111 | checkArgument(
112 | input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder());
113 |
114 | ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
115 |
116 | return input.apply(
117 | TextIO.write()
118 | .to(new PerWindowFiles(resource))
119 | .withTempDirectory(resource.getCurrentDirectory())
120 | .withWindowedWrites()
121 | .withNumShards(3));
122 | }
123 | }
124 |
125 | /**
126 | * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data
127 | * being written. This always includes the shard number and the total number of shards. For
128 | * windowed writes, it also includes the window and pane index (a sequence number assigned to each
129 | * trigger firing).
130 | */
131 | protected static class PerWindowFiles extends FilenamePolicy {
132 |
133 | private final ResourceId prefix;
134 |
135 | public PerWindowFiles(ResourceId prefix) {
136 | this.prefix = prefix;
137 | }
138 |
139 | public String filenamePrefixForWindow(IntervalWindow window) {
140 | String filePrefix = prefix.isDirectory() ? "" : prefix.getFilename();
141 | return String.format(
142 | "%s-%s-%s", filePrefix, formatter.print(window.start()), formatter.print(window.end()));
143 | }
144 |
145 | @Override
146 | public ResourceId windowedFilename(
147 | int shardNumber,
148 | int numShards,
149 | BoundedWindow window,
150 | PaneInfo paneInfo,
151 | OutputFileHints outputFileHints) {
152 | IntervalWindow intervalWindow = (IntervalWindow) window;
153 | String filename =
154 | String.format(
155 | "%s-%s-of-%s%s",
156 | filenamePrefixForWindow(intervalWindow),
157 | shardNumber,
158 | numShards,
159 | outputFileHints.getSuggestedFilenameSuffix());
160 | return prefix.getCurrentDirectory().resolve(filename, StandardResolveOptions.RESOLVE_FILE);
161 | }
162 |
163 | @Override
164 | public ResourceId unwindowedFilename(
165 | int shardNumber, int numShards, OutputFileHints outputFileHints) {
166 | throw new UnsupportedOperationException("Unsupported.");
167 | }
168 | }
169 |
170 | @Override
171 | public PDone expand(PCollection teamAndScore) {
172 | if (windowed) {
173 | teamAndScore
174 | .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
175 | .apply(new WriteToText.WriteOneFilePerWindow(filenamePrefix));
176 | } else {
177 | teamAndScore
178 | .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
179 | .apply(TextIO.write().to(filenamePrefix));
180 | }
181 | return PDone.in(teamAndScore.getPipeline());
182 | }
183 | }
184 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/complete/game/utils/WriteWindowedToBigQuery.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game.utils;
19 |
20 | import com.google.api.services.bigquery.model.TableRow;
21 | import java.util.Map;
22 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
23 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
24 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
25 | import org.apache.beam.sdk.transforms.DoFn;
26 | import org.apache.beam.sdk.transforms.ParDo;
27 | import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
28 | import org.apache.beam.sdk.values.PCollection;
29 | import org.apache.beam.sdk.values.PDone;
30 |
31 | /**
32 | * Generate, format, and write BigQuery table row information. Subclasses {@link WriteToBigQuery} to
33 | * require windowing; so this subclass may be used for writes that require access to the context's
34 | * window information.
35 | */
36 | public class WriteWindowedToBigQuery extends WriteToBigQuery {
37 |
38 | public WriteWindowedToBigQuery(
39 | String projectId, String datasetId, String tableName, Map> fieldInfo) {
40 | super(projectId, datasetId, tableName, fieldInfo);
41 | }
42 |
43 | /** Convert each key/score pair into a BigQuery TableRow. */
44 | protected class BuildRowFn extends DoFn {
45 | @ProcessElement
46 | public void processElement(ProcessContext c, BoundedWindow window) {
47 |
48 | TableRow row = new TableRow();
49 | for (Map.Entry> entry : fieldInfo.entrySet()) {
50 | String key = entry.getKey();
51 | FieldInfo fcnInfo = entry.getValue();
52 | row.set(key, fcnInfo.getFieldFn().apply(c, window));
53 | }
54 | c.output(row);
55 | }
56 | }
57 |
58 | @Override
59 | public PDone expand(PCollection teamAndScore) {
60 | teamAndScore
61 | .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
62 | .apply(
63 | BigQueryIO.writeTableRows()
64 | .to(getTable(projectId, datasetId, tableName))
65 | .withSchema(getSchema())
66 | .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
67 | .withWriteDisposition(WriteDisposition.WRITE_APPEND));
68 | return PDone.in(teamAndScore.getPipeline());
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/ExampleEchoPipeline.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess;
19 |
20 | import java.util.ArrayList;
21 | import java.util.List;
22 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
23 | import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs;
24 | import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs.Command;
25 | import org.apache.beam.examples.subprocess.kernel.SubProcessKernel;
26 | import org.apache.beam.examples.subprocess.utils.CallingSubProcessUtils;
27 | import org.apache.beam.sdk.Pipeline;
28 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
29 | import org.apache.beam.sdk.transforms.Create;
30 | import org.apache.beam.sdk.transforms.DoFn;
31 | import org.apache.beam.sdk.transforms.ParDo;
32 | import org.apache.beam.sdk.values.KV;
33 | import org.slf4j.Logger;
34 | import org.slf4j.LoggerFactory;
35 |
36 | /**
37 | * In this example batch pipeline we will invoke a simple Echo C++ library within a DoFn The sample
38 | * makes use of a ExternalLibraryDoFn class which abstracts the setup and processing of the
39 | * executable, logs and results. For this example we are using commands passed to the library based
40 | * on ordinal position but for a production system you should use a mechanism like ProtoBuffers with
41 | * Base64 encoding to pass the parameters to the library To test this example you will need to build
42 | * the files Echo.cc and EchoAgain.cc in a linux env matching the runner that you are using (using
43 | * g++ with static option). Once built copy them to the SourcePath defined in {@link
44 | * SubProcessPipelineOptions}
45 | */
46 | public class ExampleEchoPipeline {
47 | private static final Logger LOG = LoggerFactory.getLogger(ExampleEchoPipeline.class);
48 |
49 | public static void main(String[] args) throws Exception {
50 |
51 | // Read in the options for the pipeline
52 | SubProcessPipelineOptions options =
53 | PipelineOptionsFactory.fromArgs(args).withValidation().as(SubProcessPipelineOptions.class);
54 |
55 | Pipeline p = Pipeline.create(options);
56 |
57 | // Setup the Configuration option used with all transforms
58 | SubProcessConfiguration configuration = options.getSubProcessConfiguration();
59 |
60 | // Create some sample data to be fed to our c++ Echo library
61 | List> sampleData = new ArrayList<>();
62 | for (int i = 0; i < 10000; i++) {
63 | String str = String.valueOf(i);
64 | sampleData.add(KV.of(str, str));
65 | }
66 |
67 | // Define the pipeline which is two transforms echoing the inputs out to Logs
68 | p.apply(Create.of(sampleData))
69 | .apply("Echo inputs round 1", ParDo.of(new EchoInputDoFn(configuration, "Echo")))
70 | .apply("Echo inputs round 2", ParDo.of(new EchoInputDoFn(configuration, "EchoAgain")));
71 |
72 | p.run();
73 | }
74 |
75 | /** Simple DoFn that echos the element, used as an example of running a C++ library. */
76 | @SuppressWarnings("serial")
77 | public static class EchoInputDoFn extends DoFn, KV> {
78 |
79 | private static final Logger LOG = LoggerFactory.getLogger(EchoInputDoFn.class);
80 |
81 | private SubProcessConfiguration configuration;
82 | private String binaryName;
83 |
84 | public EchoInputDoFn(SubProcessConfiguration configuration, String binary) {
85 | // Pass in configuration information the name of the filename of the sub-process and the level
86 | // of concurrency
87 | this.configuration = configuration;
88 | this.binaryName = binary;
89 | }
90 |
91 | @Setup
92 | public void setUp() throws Exception {
93 | CallingSubProcessUtils.setUp(configuration, binaryName);
94 | }
95 |
96 | @ProcessElement
97 | public void processElement(ProcessContext c) throws Exception {
98 | try {
99 | // Our Library takes a single command in position 0 which it will echo back in the result
100 | SubProcessCommandLineArgs commands = new SubProcessCommandLineArgs();
101 | Command command = new Command(0, String.valueOf(c.element().getValue()));
102 | commands.putCommand(command);
103 |
104 | // The ProcessingKernel deals with the execution of the process
105 | SubProcessKernel kernel = new SubProcessKernel(configuration, binaryName);
106 |
107 | // Run the command and work through the results
108 | List results = kernel.exec(commands);
109 | for (String s : results) {
110 | c.output(KV.of(c.element().getKey(), s));
111 | }
112 | } catch (Exception ex) {
113 | LOG.error("Error processing element ", ex);
114 | throw ex;
115 | }
116 | }
117 | }
118 |
119 | private static String getTestShellEcho() {
120 | return "#!/bin/sh\n" + "filename=$1;\n" + "echo $2 >> $filename;";
121 | }
122 |
123 | private static String getTestShellEchoAgain() {
124 | return "#!/bin/sh\n"
125 | + "filename=$1;\n"
126 | + "echo \"You again? Well ok, here is your word again.\" >> $2 >> $filename;";
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/SubProcessPipelineOptions.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess;
19 |
20 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
21 | import org.apache.beam.sdk.options.Default;
22 | import org.apache.beam.sdk.options.DefaultValueFactory;
23 | import org.apache.beam.sdk.options.Description;
24 | import org.apache.beam.sdk.options.PipelineOptions;
25 | import org.apache.beam.sdk.options.Validation.Required;
26 |
27 | /** Options for running a sub process within a DoFn. */
28 | public interface SubProcessPipelineOptions extends PipelineOptions {
29 |
30 | @Description("Source GCS directory where the C++ library is located gs://bucket/tests")
31 | @Required
32 | String getSourcePath();
33 |
34 | void setSourcePath(String sourcePath);
35 |
36 | @Description("Working directory for the process I/O")
37 | @Default.String("/tmp/grid_working_files")
38 | String getWorkerPath();
39 |
40 | void setWorkerPath(String workerPath);
41 |
42 | @Description("The maximum time to wait for the sub-process to complete")
43 | @Default.Integer(3600)
44 | Integer getWaitTime();
45 |
46 | void setWaitTime(Integer waitTime);
47 |
48 | @Description("As sub-processes can be heavy weight define the level of concurrency level")
49 | @Required
50 | Integer getConcurrency();
51 |
52 | void setConcurrency(Integer concurrency);
53 |
54 | @Description("Should log files only be uploaded if error.")
55 | @Default.Boolean(true)
56 | Boolean getOnlyUpLoadLogsOnError();
57 |
58 | void setOnlyUpLoadLogsOnError(Boolean onlyUpLoadLogsOnError);
59 |
60 | @Default.InstanceFactory(SubProcessConfigurationFactory.class)
61 | SubProcessConfiguration getSubProcessConfiguration();
62 |
63 | void setSubProcessConfiguration(SubProcessConfiguration configuration);
64 |
65 | /** Confirm Configuration and return a configuration object used in pipeline. */
66 | class SubProcessConfigurationFactory implements DefaultValueFactory {
67 | @Override
68 | public SubProcessConfiguration create(PipelineOptions options) {
69 |
70 | SubProcessPipelineOptions subProcessPipelineOptions = (SubProcessPipelineOptions) options;
71 |
72 | SubProcessConfiguration configuration = new SubProcessConfiguration();
73 |
74 | if (subProcessPipelineOptions.getSourcePath() == null) {
75 | throw new IllegalStateException("Source path must be set");
76 | }
77 | if (subProcessPipelineOptions.getConcurrency() == null
78 | || subProcessPipelineOptions.getConcurrency() == 0) {
79 | throw new IllegalStateException("Concurrency must be set and be > 0");
80 | }
81 | configuration.setSourcePath(subProcessPipelineOptions.getSourcePath());
82 | configuration.setWorkerPath(subProcessPipelineOptions.getWorkerPath());
83 | configuration.setWaitTime(subProcessPipelineOptions.getWaitTime());
84 | configuration.setOnlyUpLoadLogsOnError(subProcessPipelineOptions.getOnlyUpLoadLogsOnError());
85 | configuration.concurrency = subProcessPipelineOptions.getConcurrency();
86 |
87 | return configuration;
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/configuration/SubProcessConfiguration.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess.configuration;
19 |
20 | import java.io.Serializable;
21 |
/**
 * Serializable snapshot of the sub-process settings. Values are copied out of the pipeline
 * options to allow them to be {@link Serializable} and shipped to workers.
 */
@SuppressWarnings({"serial", "nullness"}) // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
public class SubProcessConfiguration implements Serializable {

  // Source GCS directory where the C++ library is located, e.g. gs://bucket/tests
  public String sourcePath;

  // Local working directory used for the process I/O files
  public String workerPath;

  // Maximum time (seconds) to wait for the sub-process to complete
  public Integer waitTime;

  // Sub-processes are heavyweight; match this to the core count of the worker machines
  public Integer concurrency;

  // When true, log files are uploaded only if the sub-process fails
  public Boolean onlyUpLoadLogsOnError;

  public String getSourcePath() {
    return this.sourcePath;
  }

  public void setSourcePath(String path) {
    this.sourcePath = path;
  }

  public String getWorkerPath() {
    return this.workerPath;
  }

  public void setWorkerPath(String path) {
    this.workerPath = path;
  }

  public Integer getWaitTime() {
    return this.waitTime;
  }

  public void setWaitTime(Integer seconds) {
    this.waitTime = seconds;
  }

  public Integer getConcurrency() {
    return this.concurrency;
  }

  public void setConcurrency(Integer level) {
    this.concurrency = level;
  }

  public Boolean getOnlyUpLoadLogsOnError() {
    return this.onlyUpLoadLogsOnError;
  }

  public void setOnlyUpLoadLogsOnError(Boolean onlyOnError) {
    this.onlyUpLoadLogsOnError = onlyOnError;
  }
}
84 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/kernel/SubProcessCommandLineArgs.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess.kernel;
19 |
20 | import java.util.ArrayList;
21 | import java.util.List;
22 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists;
22 |
/** Parameters to the sub-process: tuples of ordinal position and value. */
@SuppressWarnings({
  "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
})
public class SubProcessCommandLineArgs {

  // Parameters to pass to the sub-process, in insertion order.
  private final List<Command> parameters = new ArrayList<>();

  /** Adds a parameter with the given ordinal position and value. */
  public void addCommand(Integer position, String value) {
    parameters.add(new Command(position, value));
  }

  /** Adds an already-constructed {@link Command}. */
  public void putCommand(Command command) {
    parameters.add(command);
  }

  /** Returns the live list of parameters (not a defensive copy). */
  public List<Command> getParameters() {
    return parameters;
  }

  /** Class used to store the SubProcess parameters. */
  public static class Command {

    // The ordinal position of the command to pass to the sub-process
    int ordinalPosition;
    String value;

    @SuppressWarnings("unused")
    private Command() {}

    public Command(int ordinalPosition, String value) {
      this.ordinalPosition = ordinalPosition;
      this.value = value;
    }

    public int getKey() {
      return ordinalPosition;
    }

    public void setKey(int key) {
      this.ordinalPosition = key;
    }

    public String getValue() {
      return value;
    }

    public void setValue(String value) {
      this.value = value;
    }
  }
}
76 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/kernel/SubProcessIOFiles.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess.kernel;
19 |
20 | import java.io.Closeable;
21 | import java.io.IOException;
22 | import java.nio.file.Files;
23 | import java.nio.file.Path;
24 | import java.nio.file.Paths;
25 | import java.util.UUID;
26 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
27 | import org.apache.beam.examples.subprocess.utils.FileUtils;
28 | import org.slf4j.Logger;
29 | import org.slf4j.LoggerFactory;
30 |
31 | /**
32 | * All information generated from the process will be stored in output files. The local working
33 | * directory is used to generate three files with extension .err for standard error output .out for
34 | * standard out output .ret for storing the results from the called library. The files will have a
35 | * uuid created for them based on java.util.UUID
36 | */
37 | public class SubProcessIOFiles implements Closeable {
38 |
39 | private static final Logger LOG = LoggerFactory.getLogger(SubProcessIOFiles.class);
40 |
41 | Path errFile;
42 | Path outFile;
43 | Path resultFile;
44 | Path base;
45 |
46 | String errFileLocation = "";
47 | String outFileLocation = "";
48 | String uuid;
49 |
50 | public String getErrFileLocation() {
51 | return errFileLocation;
52 | }
53 |
54 | public String getOutFileLocation() {
55 | return outFileLocation;
56 | }
57 |
58 | /** @param workerWorkingDirectory */
59 | public SubProcessIOFiles(String workerWorkingDirectory) {
60 |
61 | this.uuid = UUID.randomUUID().toString();
62 | base = Paths.get(workerWorkingDirectory);
63 |
64 | // Setup all the redirect handles, including the return file type
65 | errFile = Paths.get(base.toString(), uuid + ".err");
66 | outFile = Paths.get(base.toString(), uuid + ".out");
67 | resultFile = Paths.get(base.toString(), uuid + ".res");
68 | }
69 |
70 | public Path getErrFile() {
71 | return errFile;
72 | }
73 |
74 | public Path getOutFile() {
75 | return outFile;
76 | }
77 |
78 | public Path getResultFile() {
79 | return resultFile;
80 | }
81 |
82 | /**
83 | * Clean up the files that have been created on the local worker file system. Without this expect
84 | * both performance issues and eventual failure
85 | */
86 | @Override
87 | public void close() throws IOException {
88 |
89 | if (Files.exists(outFile)) {
90 | Files.delete(outFile);
91 | }
92 |
93 | if (Files.exists(errFile)) {
94 | Files.delete(errFile);
95 | }
96 |
97 | if (Files.exists(resultFile)) {
98 | Files.delete(resultFile);
99 | }
100 | }
101 |
102 | /**
103 | * Will copy the output files to the GCS path setup via the configuration.
104 | *
105 | * @param configuration
106 | * @param params
107 | */
108 | public void copyOutPutFilesToBucket(SubProcessConfiguration configuration, String params) {
109 | if (Files.exists(outFile) || Files.exists(errFile)) {
110 | try {
111 | outFileLocation = FileUtils.copyFileFromWorkerToGCS(configuration, outFile);
112 | } catch (Exception ex) {
113 | LOG.error("Error uploading log file to storage ", ex);
114 | }
115 |
116 | try {
117 | errFileLocation = FileUtils.copyFileFromWorkerToGCS(configuration, errFile);
118 | } catch (Exception ex) {
119 | LOG.error("Error uploading log file to storage ", ex);
120 | }
121 |
122 | LOG.info(
123 | String.format(
124 | "Log Files for process: %s outFile was: %s errFile was: %s",
125 | params, outFileLocation, errFileLocation));
126 | } else {
127 | LOG.error(String.format("There was no output file or err file for process %s", params));
128 | }
129 | }
130 | }
131 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/kernel/SubProcessKernel.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess.kernel;
19 |
20 | import java.io.IOException;
21 | import java.lang.ProcessBuilder.Redirect;
22 | import java.nio.file.Files;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.concurrent.TimeUnit;
26 | import java.util.stream.Stream;
27 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
28 | import org.apache.beam.examples.subprocess.utils.CallingSubProcessUtils;
29 | import org.apache.beam.examples.subprocess.utils.FileUtils;
30 | import org.slf4j.Logger;
31 | import org.slf4j.LoggerFactory;
32 |
33 | /**
34 | * This is the process kernel which deals with exec of the subprocess. It also deals with all I/O.
35 | */
36 | @SuppressWarnings({
37 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
38 | })
39 | public class SubProcessKernel {
40 |
41 | private static final Logger LOG = LoggerFactory.getLogger(SubProcessKernel.class);
42 |
43 | private static final int MAX_SIZE_COMMAND_LINE_ARGS = 128 * 1024;
44 |
45 | SubProcessConfiguration configuration;
46 | ProcessBuilder processBuilder;
47 |
  // No-argument construction is disallowed: a configuration and binary name are required.
  private SubProcessKernel() {}

  /**
   * Creates the SubProcess Kernel ready for execution. Will deal with all input and outputs to the
   * SubProcess
   *
   * @param options serializable configuration (worker path, wait time, concurrency)
   * @param binaryName name of the binary to run; becomes element 0 of the command line
   */
  public SubProcessKernel(SubProcessConfiguration options, String binaryName) {
    this.configuration = options;
    this.processBuilder = new ProcessBuilder(binaryName);
  }
61 |
62 | public List exec(SubProcessCommandLineArgs commands) throws Exception {
63 | try (CallingSubProcessUtils.Permit permit =
64 | new CallingSubProcessUtils.Permit(processBuilder.command().get(0))) {
65 |
66 | List results = null;
67 |
68 | try (SubProcessIOFiles outputFiles = new SubProcessIOFiles(configuration.getWorkerPath())) {
69 |
70 | try {
71 | Process process = execBinary(processBuilder, commands, outputFiles);
72 | results = collectProcessResults(process, processBuilder, outputFiles);
73 | } catch (Exception ex) {
74 | LOG.error("Error running executable ", ex);
75 | throw ex;
76 | }
77 | } catch (IOException ex) {
78 | LOG.error(
79 | "Unable to delete the outputfiles. This can lead to performance issues and failure",
80 | ex);
81 | }
82 | return results;
83 | }
84 | }
85 |
  /**
   * Runs the binary and returns the raw bytes of the result, rather than parsed lines.
   *
   * @param commands the ordinal-position/value parameters to pass to the binary
   * @return the collected result bytes, or an empty array if output-file cleanup threw
   * @throws Exception if the process fails to start, exits abnormally, or times out
   */
  public byte[] execBinaryResult(SubProcessCommandLineArgs commands) throws Exception {
    // Acquire a per-binary permit; presumably limits concurrent launches — see
    // CallingSubProcessUtils.
    try (CallingSubProcessUtils.Permit permit =
        new CallingSubProcessUtils.Permit(processBuilder.command().get(0))) {

      try (SubProcessIOFiles outputFiles = new SubProcessIOFiles(configuration.getWorkerPath())) {

        try {
          Process process = execBinary(processBuilder, commands, outputFiles);
          return collectProcessResultsBytes(process, processBuilder, outputFiles);
        } catch (Exception ex) {
          LOG.error("Error running executable ", ex);
          throw ex;
        }
      } catch (IOException ex) {
        // Thrown by SubProcessIOFiles.close(). NOTE(review): if close() throws after a successful
        // run, the collected bytes are discarded and the empty array below is returned instead.
        LOG.error(
            "Unable to delete the outputfiles. This can lead to performance issues and failure",
            ex);
      }
      return new byte[0];
    }
  }
107 |
108 | private ProcessBuilder prepareBuilder(
109 | ProcessBuilder builder, SubProcessCommandLineArgs commands, SubProcessIOFiles outPutFiles)
110 | throws IllegalStateException {
111 |
112 | // Check we are not over the max size of command line parameters
113 | if (getTotalCommandBytes(commands) > MAX_SIZE_COMMAND_LINE_ARGS) {
114 | throw new IllegalStateException("Command is over 2MB in size");
115 | }
116 |
117 | appendExecutablePath(builder);
118 |
119 | // Add the result file path to the builder at position 1, 0 is reserved for the process itself
120 | builder.command().add(1, outPutFiles.resultFile.toString());
121 |
122 | // Shift commands by 2 ordinal positions and load into the builder
123 | for (SubProcessCommandLineArgs.Command s : commands.getParameters()) {
124 | builder.command().add(s.ordinalPosition + 2, s.value);
125 | }
126 |
127 | builder.redirectError(Redirect.appendTo(outPutFiles.errFile.toFile()));
128 | builder.redirectOutput(Redirect.appendTo(outPutFiles.outFile.toFile()));
129 |
130 | return builder;
131 | }
132 |
133 | /**
134 | * Add up the total bytes used by the process.
135 | *
136 | * @param commands
137 | * @return
138 | */
139 | private int getTotalCommandBytes(SubProcessCommandLineArgs commands) {
140 | int size = 0;
141 | for (SubProcessCommandLineArgs.Command c : commands.getParameters()) {
142 | size += c.value.length();
143 | }
144 | return size;
145 | }
146 |
147 | private Process execBinary(
148 | ProcessBuilder builder, SubProcessCommandLineArgs commands, SubProcessIOFiles outPutFiles)
149 | throws Exception {
150 | try {
151 |
152 | builder = prepareBuilder(builder, commands, outPutFiles);
153 | Process process = builder.start();
154 |
155 | boolean timeout = !process.waitFor(configuration.getWaitTime(), TimeUnit.SECONDS);
156 |
157 | if (timeout) {
158 | String log =
159 | String.format(
160 | "Timeout waiting to run process with parameters %s . "
161 | + "Check to see if your timeout is long enough. Currently set at %s.",
162 | createLogEntryFromInputs(builder.command()), configuration.getWaitTime());
163 | throw new Exception(log);
164 | }
165 | return process;
166 |
167 | } catch (Exception ex) {
168 |
169 | LOG.error(
170 | String.format(
171 | "Error running process with parameters %s error was %s ",
172 | createLogEntryFromInputs(builder.command()), ex.getMessage()));
173 | throw new Exception(ex);
174 | }
175 | }
176 |
177 | /**
178 | * TODO clean up duplicate with byte[] version collectBinaryProcessResults.
179 | *
180 | * @param process
181 | * @param builder
182 | * @param outPutFiles
183 | * @return List of results
184 | * @throws Exception if process has non 0 value or no logs found then throw exception
185 | */
186 | private List collectProcessResults(
187 | Process process, ProcessBuilder builder, SubProcessIOFiles outPutFiles) throws Exception {
188 |
189 | List results = new ArrayList<>();
190 |
191 | try {
192 |
193 | LOG.debug(String.format("Executing process %s", createLogEntryFromInputs(builder.command())));
194 |
195 | // If process exit value is not 0 then subprocess failed, record logs
196 | if (process.exitValue() != 0) {
197 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder));
198 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles);
199 | throw new Exception(log);
200 | }
201 |
202 | // If no return file then either something went wrong or the binary is setup incorrectly for
203 | // the ret file either way throw error
204 | if (!Files.exists(outPutFiles.resultFile)) {
205 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles);
206 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder));
207 | throw new Exception(log);
208 | }
209 |
210 | // Everything looks healthy return values
211 | try (Stream lines = Files.lines(outPutFiles.resultFile)) {
212 | for (String line : (Iterable) lines::iterator) {
213 | results.add(line);
214 | }
215 | }
216 | return results;
217 | } catch (Exception ex) {
218 | String log =
219 | String.format(
220 | "Unexpected error runnng process. %s error message was %s",
221 | createLogEntryFromInputs(builder.command()), ex.getMessage());
222 | throw new Exception(log);
223 | }
224 | }
225 |
226 | /**
227 | * Used when the reault file contains binary data.
228 | *
229 | * @param process
230 | * @param builder
231 | * @param outPutFiles
232 | * @return Binary results
233 | * @throws Exception if process has non 0 value or no logs found then throw exception
234 | */
235 | private byte[] collectProcessResultsBytes(
236 | Process process, ProcessBuilder builder, SubProcessIOFiles outPutFiles) throws Exception {
237 |
238 | Byte[] results;
239 |
240 | try {
241 |
242 | LOG.debug(String.format("Executing process %s", createLogEntryFromInputs(builder.command())));
243 |
244 | // If process exit value is not 0 then subprocess failed, record logs
245 | if (process.exitValue() != 0) {
246 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder));
247 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles);
248 | throw new Exception(log);
249 | }
250 |
251 | // If no return file then either something went wrong or the binary is setup incorrectly for
252 | // the ret file either way throw error
253 | if (!Files.exists(outPutFiles.resultFile)) {
254 | String log = createLogEntryForProcessFailure(process, builder.command(), outPutFiles);
255 | outPutFiles.copyOutPutFilesToBucket(configuration, FileUtils.toStringParams(builder));
256 | throw new Exception(log);
257 | }
258 |
259 | // Everything looks healthy return bytes
260 | return Files.readAllBytes(outPutFiles.resultFile);
261 |
262 | } catch (Exception ex) {
263 | String log =
264 | String.format(
265 | "Unexpected error runnng process. %s error message was %s",
266 | createLogEntryFromInputs(builder.command()), ex.getMessage());
267 | throw new Exception(log);
268 | }
269 | }
270 |
271 | private static String createLogEntryForProcessFailure(
272 | Process process, List commands, SubProcessIOFiles files) {
273 |
274 | StringBuilder stringBuilder = new StringBuilder();
275 |
276 | // Highlight when no result file is found vs standard process error
277 | if (process.exitValue() == 0) {
278 | stringBuilder.append(String.format("%nProcess succeded but no result file was found %n"));
279 | } else {
280 | stringBuilder.append(
281 | String.format("%nProcess error failed with exit value of %s %n", process.exitValue()));
282 | }
283 |
284 | stringBuilder.append(
285 | String.format("Command info was %s %n", createLogEntryFromInputs(commands)));
286 |
287 | stringBuilder.append(
288 | String.format(
289 | "First line of error file is %s %n", FileUtils.readLineOfLogFile(files.errFile)));
290 |
291 | stringBuilder.append(
292 | String.format(
293 | "First line of out file is %s %n", FileUtils.readLineOfLogFile(files.outFile)));
294 |
295 | stringBuilder.append(
296 | String.format(
297 | "First line of ret file is %s %n", FileUtils.readLineOfLogFile(files.resultFile)));
298 |
299 | return stringBuilder.toString();
300 | }
301 |
302 | private static String createLogEntryFromInputs(List commands) {
303 | String params;
304 | if (commands != null) {
305 | params = String.join(",", commands);
306 | } else {
307 | params = "No-Commands";
308 | }
309 | return params;
310 | }
311 |
312 | // Pass the Path of the binary to the SubProcess in Command position 0
313 | private ProcessBuilder appendExecutablePath(ProcessBuilder builder) {
314 | String executable = builder.command().get(0);
315 | if (executable == null) {
316 | throw new IllegalArgumentException(
317 | "No executable provided to the Process Builder... we will do... nothing... ");
318 | }
319 | builder
320 | .command()
321 | .set(0, FileUtils.getFileResourceId(configuration.getWorkerPath(), executable).toString());
322 | return builder;
323 | }
324 | }
325 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/utils/CallingSubProcessUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess.utils;
19 |
20 | import java.util.Map;
21 | import java.util.Set;
22 | import java.util.concurrent.ConcurrentHashMap;
23 | import java.util.concurrent.Semaphore;
24 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
25 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets;
26 | import org.slf4j.Logger;
27 | import org.slf4j.LoggerFactory;
28 |
29 | /** Utility class for dealing with concurrency and binary file copies to the worker. */
30 | public class CallingSubProcessUtils {
31 |
32 | // Prevent Instantiation
33 | private CallingSubProcessUtils() {}
34 |
35 | private static final Logger LOG = LoggerFactory.getLogger(CallingSubProcessUtils.class);
36 |
37 | static boolean initCompleted = false;
38 |
39 | // Allow multiple subclasses to create files, but only one thread per subclass can add the file to
40 | // the worker
41 | private static final Set downloadedFiles = Sets.newConcurrentHashSet();
42 |
43 | // Limit the number of threads able to do work
44 | private static Map semaphores = new ConcurrentHashMap<>();
45 |
46 | public static void setUp(SubProcessConfiguration configuration, String binaryName)
47 | throws Exception {
48 |
49 | if (!semaphores.containsKey(binaryName)) {
50 | initSemaphore(configuration.getConcurrency(), binaryName);
51 | }
52 |
53 | synchronized (downloadedFiles) {
54 | if (!downloadedFiles.contains(binaryName)) {
55 | // Create Directories if needed
56 | FileUtils.createDirectoriesOnWorker(configuration);
57 | LOG.info("Calling filesetup to move Executables to worker.");
58 | ExecutableFile executableFile = new ExecutableFile(configuration, binaryName);
59 | FileUtils.copyFileFromGCSToWorker(executableFile);
60 | downloadedFiles.add(binaryName);
61 | }
62 | }
63 | }
64 |
65 | public static synchronized void initSemaphore(Integer permits, String binaryName) {
66 | if (!semaphores.containsKey(binaryName)) {
67 | LOG.info(String.format(String.format("Initialized Semaphore for binary %s ", binaryName)));
68 | semaphores.put(binaryName, new Semaphore(permits));
69 | }
70 | }
71 |
72 | private static void aquireSemaphore(String binaryName) throws IllegalStateException {
73 | if (!semaphores.containsKey(binaryName)) {
74 | throw new IllegalStateException("Semaphore is NULL, check init logic in @Setup.");
75 | }
76 | try {
77 | semaphores.get(binaryName).acquire();
78 | } catch (InterruptedException ex) {
79 | LOG.error("Interupted during aquire", ex);
80 | }
81 | }
82 |
83 | private static void releaseSemaphore(String binaryName) throws IllegalStateException {
84 | if (!semaphores.containsKey(binaryName)) {
85 | throw new IllegalStateException("Semaphore is NULL, check init logic in @Setup.");
86 | }
87 | semaphores.get(binaryName).release();
88 | }
89 |
90 | /** Permit class for access to worker cpu resources. */
91 | public static class Permit implements AutoCloseable {
92 |
93 | private String binaryName;
94 |
95 | public Permit(String binaryName) {
96 | this.binaryName = binaryName;
97 | CallingSubProcessUtils.aquireSemaphore(binaryName);
98 | }
99 |
100 | @Override
101 | public void close() {
102 | CallingSubProcessUtils.releaseSemaphore(binaryName);
103 | }
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/utils/ExecutableFile.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess.utils;
19 |
20 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
21 | import org.apache.beam.sdk.coders.AvroCoder;
22 | import org.apache.beam.sdk.coders.DefaultCoder;
23 | import org.slf4j.Logger;
24 | import org.slf4j.LoggerFactory;
25 |
26 | /** Contains the configuration for the external library. */
27 | @DefaultCoder(AvroCoder.class)
28 | @SuppressWarnings({
29 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
30 | })
31 | public class ExecutableFile {
32 |
33 | String fileName;
34 |
35 | private String sourceGCSLocation;
36 | private String destinationLocation;
37 |
38 | private static final Logger LOG = LoggerFactory.getLogger(ExecutableFile.class);
39 |
40 | public String getSourceGCSLocation() {
41 | return sourceGCSLocation;
42 | }
43 |
44 | public void setSourceGCSLocation(String sourceGCSLocation) {
45 | this.sourceGCSLocation = sourceGCSLocation;
46 | }
47 |
48 | public String getDestinationLocation() {
49 | return destinationLocation;
50 | }
51 |
52 | public void setDestinationLocation(String destinationLocation) {
53 | this.destinationLocation = destinationLocation;
54 | }
55 |
56 | public ExecutableFile(SubProcessConfiguration configuration, String fileName)
57 | throws IllegalStateException {
58 | if (configuration == null) {
59 | throw new IllegalStateException("Configuration can not be NULL");
60 | }
61 | if (fileName == null) {
62 | throw new IllegalStateException("FileName can not be NULLt");
63 | }
64 | this.fileName = fileName;
65 | setDestinationLocation(configuration);
66 | setSourceLocation(configuration);
67 | }
68 |
69 | private void setDestinationLocation(SubProcessConfiguration configuration) {
70 | this.sourceGCSLocation =
71 | FileUtils.getFileResourceId(configuration.getSourcePath(), fileName).toString();
72 | }
73 |
74 | private void setSourceLocation(SubProcessConfiguration configuration) {
75 | this.destinationLocation =
76 | FileUtils.getFileResourceId(configuration.getWorkerPath(), fileName).toString();
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/examples/java8/src/main/java/org/apache/beam/examples/subprocess/utils/FileUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess.utils;
19 |
20 | import static java.nio.charset.StandardCharsets.UTF_8;
21 |
22 | import java.io.BufferedReader;
23 | import java.io.IOException;
24 | import java.nio.ByteBuffer;
25 | import java.nio.channels.ReadableByteChannel;
26 | import java.nio.channels.WritableByteChannel;
27 | import java.nio.file.FileAlreadyExistsException;
28 | import java.nio.file.Files;
29 | import java.nio.file.Path;
30 | import java.nio.file.Paths;
31 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
32 | import org.apache.beam.sdk.io.FileSystems;
33 | import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
34 | import org.apache.beam.sdk.io.fs.ResourceId;
35 | import org.slf4j.Logger;
36 | import org.slf4j.LoggerFactory;
37 |
38 | /** Utilities for dealing with movement of files from object stores and workers. */
39 | @SuppressWarnings({
40 | "nullness" // TODO(https://issues.apache.org/jira/browse/BEAM-10402)
41 | })
42 | public class FileUtils {
43 |
44 | private static final Logger LOG = LoggerFactory.getLogger(FileUtils.class);
45 |
46 | public static ResourceId getFileResourceId(String directory, String fileName) {
47 | ResourceId resourceID = FileSystems.matchNewResource(directory, true);
48 | return resourceID.getCurrentDirectory().resolve(fileName, StandardResolveOptions.RESOLVE_FILE);
49 | }
50 |
51 | public static String toStringParams(ProcessBuilder builder) {
52 | return String.join(",", builder.command());
53 | }
54 |
55 | public static String copyFileFromWorkerToGCS(
56 | SubProcessConfiguration configuration, Path fileToUpload) throws Exception {
57 |
58 | Path fileName;
59 |
60 | if ((fileName = fileToUpload.getFileName()) == null) {
61 | throw new IllegalArgumentException("FileName can not be null.");
62 | }
63 |
64 | ResourceId sourceFile = getFileResourceId(configuration.getWorkerPath(), fileName.toString());
65 |
66 | LOG.info("Copying file from worker " + sourceFile);
67 |
68 | ResourceId destinationFile =
69 | getFileResourceId(configuration.getSourcePath(), fileName.toString());
70 | // TODO currently not supported with different schemas for example GCS to local, else could use
71 | // FileSystems.copy(ImmutableList.of(sourceFile), ImmutableList.of(destinationFile));
72 | try {
73 | return copyFile(sourceFile, destinationFile);
74 | } catch (Exception ex) {
75 | LOG.error(
76 | String.format("Error copying file from %s to %s", sourceFile, destinationFile), ex);
77 | throw ex;
78 | }
79 | }
80 |
81 | public static String copyFileFromGCSToWorker(ExecutableFile execuableFile) throws Exception {
82 |
83 | ResourceId sourceFile =
84 | FileSystems.matchNewResource(execuableFile.getSourceGCSLocation(), false);
85 | ResourceId destinationFile =
86 | FileSystems.matchNewResource(execuableFile.getDestinationLocation(), false);
87 | try {
88 | LOG.info(
89 | String.format(
90 | "Moving File %s to %s ",
91 | execuableFile.getSourceGCSLocation(), execuableFile.getDestinationLocation()));
92 | Path path = Paths.get(execuableFile.getDestinationLocation());
93 |
94 | if (path.toFile().exists()) {
95 | LOG.warn(
96 | String.format(
97 | "Overwriting file %s, should only see this once per worker.",
98 | execuableFile.getDestinationLocation()));
99 | }
100 | copyFile(sourceFile, destinationFile);
101 | path.toFile().setExecutable(true);
102 | return path.toString();
103 |
104 | } catch (Exception ex) {
105 | LOG.error(String.format("Error moving file : %s ", execuableFile.fileName), ex);
106 | throw ex;
107 | }
108 | }
109 |
110 | public static String copyFile(ResourceId sourceFile, ResourceId destinationFile)
111 | throws IOException {
112 |
113 | try (WritableByteChannel writeChannel = FileSystems.create(destinationFile, "text/plain")) {
114 | try (ReadableByteChannel readChannel = FileSystems.open(sourceFile)) {
115 |
116 | final ByteBuffer buffer = ByteBuffer.allocateDirect(16 * 1024);
117 | while (readChannel.read(buffer) != -1) {
118 | buffer.flip();
119 | writeChannel.write(buffer);
120 | buffer.compact();
121 | }
122 | buffer.flip();
123 | while (buffer.hasRemaining()) {
124 | writeChannel.write(buffer);
125 | }
126 | }
127 | }
128 |
129 | return destinationFile.toString();
130 | }
131 |
132 | /**
133 | * Create directories needed based on configuration.
134 | *
135 | * @param configuration
136 | * @throws IOException
137 | */
138 | public static void createDirectoriesOnWorker(SubProcessConfiguration configuration)
139 | throws IOException {
140 |
141 | try {
142 |
143 | Path path = Paths.get(configuration.getWorkerPath());
144 |
145 | if (!path.toFile().exists()) {
146 | Files.createDirectories(path);
147 | LOG.info(String.format("Created Folder %s ", path.toFile()));
148 | }
149 | } catch (FileAlreadyExistsException ex) {
150 | LOG.warn(
151 | String.format(
152 | " Tried to create folder %s which already existsed, this should not happen!",
153 | configuration.getWorkerPath()),
154 | ex);
155 | }
156 | }
157 |
158 | public static String readLineOfLogFile(Path path) {
159 |
160 | try (BufferedReader br = Files.newBufferedReader(Paths.get(path.toString()), UTF_8)) {
161 | return br.readLine();
162 | } catch (IOException e) {
163 | LOG.error("Error reading the first line of file", e);
164 | }
165 |
166 | // `return empty string rather than NULL string as this data is often used in further logging
167 | return "";
168 | }
169 | }
170 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/DebuggingWordCountTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import java.io.File;
21 | import java.nio.charset.StandardCharsets;
22 | import org.apache.beam.examples.DebuggingWordCount.WordCountOptions;
23 | import org.apache.beam.sdk.testing.TestPipeline;
24 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.Files;
25 | import org.junit.Rule;
26 | import org.junit.Test;
27 | import org.junit.rules.TemporaryFolder;
28 | import org.junit.runner.RunWith;
29 | import org.junit.runners.JUnit4;
30 |
31 | /** Tests for {@link DebuggingWordCount}. */
32 | @RunWith(JUnit4.class)
33 | public class DebuggingWordCountTest {
34 | @Rule public TemporaryFolder tmpFolder = new TemporaryFolder();
35 |
36 | private String getFilePath(String filePath) {
37 | if (filePath.contains(":")) {
38 | return filePath.replace("\\", "/").split(":", -1)[1];
39 | }
40 | return filePath;
41 | }
42 |
43 | @Test
44 | public void testDebuggingWordCount() throws Exception {
45 | File inputFile = tmpFolder.newFile();
46 | File outputFile = tmpFolder.newFile();
47 | Files.write(
48 | "stomach secret Flourish message Flourish here Flourish",
49 | inputFile,
50 | StandardCharsets.UTF_8);
51 | WordCountOptions options = TestPipeline.testingPipelineOptions().as(WordCountOptions.class);
52 | options.setInputFile(getFilePath(inputFile.getAbsolutePath()));
53 | options.setOutput(getFilePath(outputFile.getAbsolutePath()));
54 | DebuggingWordCount.runDebuggingWordCount(options);
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/MinimalWordCountTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import java.io.IOException;
21 | import java.io.Serializable;
22 | import java.nio.channels.FileChannel;
23 | import java.nio.file.Files;
24 | import java.nio.file.StandardOpenOption;
25 | import java.util.Arrays;
26 | import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
27 | import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
28 | import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath;
29 | import org.apache.beam.sdk.io.TextIO;
30 | import org.apache.beam.sdk.testing.TestPipeline;
31 | import org.apache.beam.sdk.transforms.Count;
32 | import org.apache.beam.sdk.transforms.Filter;
33 | import org.apache.beam.sdk.transforms.FlatMapElements;
34 | import org.apache.beam.sdk.transforms.MapElements;
35 | import org.apache.beam.sdk.values.KV;
36 | import org.apache.beam.sdk.values.TypeDescriptors;
37 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
38 | import org.junit.Rule;
39 | import org.junit.Test;
40 | import org.junit.runner.RunWith;
41 | import org.junit.runners.JUnit4;
42 | import org.mockito.Mockito;
43 |
44 | /**
45 | * To keep {@link MinimalWordCount} simple, it is not factored or testable. This test file should be
46 | * maintained with a copy of its code for a basic smoke test.
47 | */
48 | @RunWith(JUnit4.class)
49 | public class MinimalWordCountTest implements Serializable {
50 |
51 | @Rule public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);
52 |
53 | /** A basic smoke test that ensures there is no crash at pipeline construction time. */
54 | @Test
55 | public void testMinimalWordCount() throws Exception {
56 | p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil());
57 |
58 | p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
59 | .apply(
60 | FlatMapElements.into(TypeDescriptors.strings())
61 | .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))))
62 | .apply(Filter.by((String word) -> !word.isEmpty()))
63 | .apply(Count.perElement())
64 | .apply(
65 | MapElements.into(TypeDescriptors.strings())
66 | .via(
67 | (KV wordCount) ->
68 | wordCount.getKey() + ": " + wordCount.getValue()))
69 | .apply(TextIO.write().to("gs://your-output-bucket/and-output-prefix"));
70 | }
71 |
72 | private GcsUtil buildMockGcsUtil() throws IOException {
73 | GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);
74 |
75 | // Any request to open gets a new bogus channel
76 | Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class)))
77 | .then(
78 | invocation ->
79 | FileChannel.open(
80 | Files.createTempFile("channel-", ".tmp"),
81 | StandardOpenOption.CREATE,
82 | StandardOpenOption.DELETE_ON_CLOSE));
83 |
84 | // Any request for expansion returns a list containing the original GcsPath
85 | // This is required to pass validation that occurs in TextIO during apply()
86 | Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class)))
87 | .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));
88 |
89 | return mockGcsUtil;
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/WordCountTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples;
19 |
20 | import java.util.Arrays;
21 | import java.util.List;
22 | import org.apache.beam.examples.WordCount.CountWords;
23 | import org.apache.beam.examples.WordCount.ExtractWordsFn;
24 | import org.apache.beam.examples.WordCount.FormatAsTextFn;
25 | import org.apache.beam.sdk.coders.StringUtf8Coder;
26 | import org.apache.beam.sdk.testing.PAssert;
27 | import org.apache.beam.sdk.testing.TestPipeline;
28 | import org.apache.beam.sdk.testing.ValidatesRunner;
29 | import org.apache.beam.sdk.transforms.Create;
30 | import org.apache.beam.sdk.transforms.DoFn;
31 | import org.apache.beam.sdk.transforms.MapElements;
32 | import org.apache.beam.sdk.transforms.ParDo;
33 | import org.apache.beam.sdk.values.PCollection;
34 | import org.junit.Rule;
35 | import org.junit.Test;
36 | import org.junit.experimental.categories.Category;
37 | import org.junit.runner.RunWith;
38 | import org.junit.runners.JUnit4;
39 |
40 | /** Tests of WordCount. */
41 | @RunWith(JUnit4.class)
42 | public class WordCountTest {
43 |
44 | /** Example test that tests a specific {@link DoFn}. */
45 | @Test
46 | public void testExtractWordsFn() throws Exception {
47 | List words = Arrays.asList(" some input words ", " ", " cool ", " foo", " bar");
48 | PCollection output =
49 | p.apply(Create.of(words).withCoder(StringUtf8Coder.of()))
50 | .apply(ParDo.of(new ExtractWordsFn()));
51 | PAssert.that(output).containsInAnyOrder("some", "input", "words", "cool", "foo", "bar");
52 | p.run().waitUntilFinish();
53 | }
54 |
55 | static final String[] WORDS_ARRAY =
56 | new String[] {
57 | "hi there", "hi", "hi sue bob",
58 | "hi sue", "", "bob hi"
59 | };
60 |
61 | static final List WORDS = Arrays.asList(WORDS_ARRAY);
62 |
63 | static final String[] COUNTS_ARRAY = new String[] {"hi: 5", "there: 1", "sue: 2", "bob: 2"};
64 |
65 | @Rule public TestPipeline p = TestPipeline.create();
66 |
67 | /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
68 | @Test
69 | @Category(ValidatesRunner.class)
70 | public void testCountWords() throws Exception {
71 | PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
72 |
73 | PCollection output =
74 | input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn()));
75 |
76 | PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
77 | p.run().waitUntilFinish();
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/complete/game/GameStatsTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import java.io.Serializable;
21 | import java.util.Arrays;
22 | import java.util.List;
23 | import org.apache.beam.examples.complete.game.GameStats.CalculateSpammyUsers;
24 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
25 | import org.apache.beam.sdk.testing.PAssert;
26 | import org.apache.beam.sdk.testing.TestPipeline;
27 | import org.apache.beam.sdk.testing.ValidatesRunner;
28 | import org.apache.beam.sdk.transforms.Create;
29 | import org.apache.beam.sdk.values.KV;
30 | import org.apache.beam.sdk.values.PCollection;
31 | import org.junit.Rule;
32 | import org.junit.Test;
33 | import org.junit.experimental.categories.Category;
34 | import org.junit.runner.RunWith;
35 | import org.junit.runners.JUnit4;
36 |
37 | /**
38 | * Tests of GameStats. Because the pipeline was designed for easy readability and explanations, it
39 | * lacks good modularity for testing. See our testing documentation for better ideas:
40 | * https://beam.apache.org/documentation/pipelines/test-your-pipeline/
41 | */
42 | @RunWith(JUnit4.class)
43 | public class GameStatsTest implements Serializable {
44 |
45 | // User scores
46 | static final List> USER_SCORES =
47 | Arrays.asList(
48 | KV.of("Robot-2", 66),
49 | KV.of("Robot-1", 116),
50 | KV.of("user7_AndroidGreenKookaburra", 23),
51 | KV.of("user7_AndroidGreenKookaburra", 1),
52 | KV.of("user19_BisqueBilby", 14),
53 | KV.of("user13_ApricotQuokka", 15),
54 | KV.of("user18_BananaEmu", 25),
55 | KV.of("user6_AmberEchidna", 8),
56 | KV.of("user2_AmberQuokka", 6),
57 | KV.of("user0_MagentaKangaroo", 4),
58 | KV.of("user0_MagentaKangaroo", 3),
59 | KV.of("user2_AmberCockatoo", 13),
60 | KV.of("user7_AlmondWallaby", 15),
61 | KV.of("user6_AmberNumbat", 11),
62 | KV.of("user6_AmberQuokka", 4));
63 |
64 | // The expected list of 'spammers'.
65 | static final List> SPAMMERS =
66 | Arrays.asList(KV.of("Robot-2", 66), KV.of("Robot-1", 116));
67 |
68 | @Rule public TestPipeline p = TestPipeline.create();
69 |
70 | /** Test the calculation of 'spammy users'. */
71 | @Test
72 | @Category(ValidatesRunner.class)
73 | public void testCalculateSpammyUsers() throws Exception {
74 | PCollection> input = p.apply(Create.of(USER_SCORES));
75 | PCollection> output = input.apply(new CalculateSpammyUsers());
76 |
77 | // Check the set of spammers.
78 | PAssert.that(output).containsInAnyOrder(SPAMMERS);
79 |
80 | p.run().waitUntilFinish();
81 | }
82 |
83 | @Test
84 | public void testGameStatsOptions() {
85 | PipelineOptionsFactory.as(GameStats.Options.class);
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/complete/game/HourlyTeamScoreTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import java.io.Serializable;
21 | import java.util.Arrays;
22 | import java.util.List;
23 | import org.apache.beam.examples.complete.game.UserScore.GameActionInfo;
24 | import org.apache.beam.examples.complete.game.UserScore.ParseEventFn;
25 | import org.apache.beam.sdk.coders.StringUtf8Coder;
26 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
27 | import org.apache.beam.sdk.testing.PAssert;
28 | import org.apache.beam.sdk.testing.TestPipeline;
29 | import org.apache.beam.sdk.testing.ValidatesRunner;
30 | import org.apache.beam.sdk.transforms.Create;
31 | import org.apache.beam.sdk.transforms.Filter;
32 | import org.apache.beam.sdk.transforms.MapElements;
33 | import org.apache.beam.sdk.transforms.ParDo;
34 | import org.apache.beam.sdk.values.KV;
35 | import org.apache.beam.sdk.values.PCollection;
36 | import org.apache.beam.sdk.values.TypeDescriptors;
37 | import org.joda.time.Instant;
38 | import org.junit.Rule;
39 | import org.junit.Test;
40 | import org.junit.experimental.categories.Category;
41 | import org.junit.runner.RunWith;
42 | import org.junit.runners.JUnit4;
43 |
44 | /**
45 | * Tests of HourlyTeamScore. Because the pipeline was designed for easy readability and
46 | * explanations, it lacks good modularity for testing. See our testing documentation for better
47 | * ideas: https://beam.apache.org/documentation/pipelines/test-your-pipeline/
48 | */
49 | @RunWith(JUnit4.class)
50 | @SuppressWarnings({
51 | "rawtypes" // TODO(https://issues.apache.org/jira/browse/BEAM-10556)
52 | })
53 | public class HourlyTeamScoreTest implements Serializable {
54 |
55 | static final String[] GAME_EVENTS_ARRAY =
56 | new String[] {
57 | "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
58 | "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
59 | "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
60 | "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
61 | "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
62 | "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
63 | "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
64 | "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444",
65 | // time gap...
66 | "user0_AndroidGreenEchidna,AndroidGreenEchidna,0,1447965690000,2015-11-19 12:41:31.053",
67 | "user0_MagentaKangaroo,MagentaKangaroo,4,1447965690000,2015-11-19 12:41:31.053",
68 | "user2_AmberCockatoo,AmberCockatoo,13,1447965690000,2015-11-19 12:41:31.053",
69 | "user18_BananaEmu,BananaEmu,7,1447965690000,2015-11-19 12:41:31.053",
70 | "user3_BananaEmu,BananaEmu,17,1447965690000,2015-11-19 12:41:31.053",
71 | "user18_BananaEmu,BananaEmu,1,1447965690000,2015-11-19 12:41:31.053",
72 | "user18_ApricotCaneToad,ApricotCaneToad,14,1447965690000,2015-11-19 12:41:31.053"
73 | };
74 |
75 | static final List GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);
76 |
77 | // Used to check the filtering.
78 | static final KV[] FILTERED_EVENTS =
79 | new KV[] {
80 | KV.of("user0_AndroidGreenEchidna", 0),
81 | KV.of("user0_MagentaKangaroo", 4),
82 | KV.of("user2_AmberCockatoo", 13),
83 | KV.of("user18_BananaEmu", 7),
84 | KV.of("user3_BananaEmu", 17),
85 | KV.of("user18_BananaEmu", 1),
86 | KV.of("user18_ApricotCaneToad", 14)
87 | };
88 |
89 | @Rule public TestPipeline p = TestPipeline.create();
90 |
91 | /** Test the filtering. */
92 | @Test
93 | @Category(ValidatesRunner.class)
94 | public void testUserScoresFilter() throws Exception {
95 |
96 | final Instant startMinTimestamp = new Instant(1447965680000L);
97 |
98 | PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));
99 |
100 | PCollection> output =
101 | input
102 | .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
103 | .apply(
104 | "FilterStartTime",
105 | Filter.by(
106 | (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
107 | // run a map to access the fields in the result.
108 | .apply(
109 | MapElements.into(
110 | TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
111 | .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));
112 |
113 | PAssert.that(output).containsInAnyOrder(FILTERED_EVENTS);
114 |
115 | p.run().waitUntilFinish();
116 | }
117 |
118 | @Test
119 | public void testUserScoreOptions() {
120 | PipelineOptionsFactory.as(HourlyTeamScore.Options.class);
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/complete/game/StatefulTeamScoreTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import org.apache.beam.examples.complete.game.StatefulTeamScore.UpdateTeamScoreFn;
21 | import org.apache.beam.examples.complete.game.UserScore.GameActionInfo;
22 | import org.apache.beam.sdk.coders.AvroCoder;
23 | import org.apache.beam.sdk.coders.KvCoder;
24 | import org.apache.beam.sdk.coders.StringUtf8Coder;
25 | import org.apache.beam.sdk.testing.PAssert;
26 | import org.apache.beam.sdk.testing.TestPipeline;
27 | import org.apache.beam.sdk.testing.TestStream;
28 | import org.apache.beam.sdk.transforms.ParDo;
29 | import org.apache.beam.sdk.transforms.windowing.FixedWindows;
30 | import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
31 | import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
32 | import org.apache.beam.sdk.transforms.windowing.Window;
33 | import org.apache.beam.sdk.values.KV;
34 | import org.apache.beam.sdk.values.PCollection;
35 | import org.apache.beam.sdk.values.TimestampedValue;
36 | import org.joda.time.Duration;
37 | import org.joda.time.Instant;
38 | import org.junit.Rule;
39 | import org.junit.Test;
40 | import org.junit.runner.RunWith;
41 | import org.junit.runners.JUnit4;
42 |
43 | /** Tests for {@link StatefulTeamScore}. */
44 | @RunWith(JUnit4.class)
45 | public class StatefulTeamScoreTest {
46 |
47 | private Instant baseTime = new Instant(0);
48 |
49 | @Rule public TestPipeline p = TestPipeline.create();
50 |
51 | /** Some example users, on two separate teams. */
52 | private enum TestUser {
53 | RED_ONE("scarlet", "red"),
54 | RED_TWO("burgundy", "red"),
55 | BLUE_ONE("navy", "blue"),
56 | BLUE_TWO("sky", "blue");
57 |
58 | private final String userName;
59 | private final String teamName;
60 |
61 | TestUser(String userName, String teamName) {
62 | this.userName = userName;
63 | this.teamName = teamName;
64 | }
65 |
66 | public String getUser() {
67 | return userName;
68 | }
69 |
70 | public String getTeam() {
71 | return teamName;
72 | }
73 | }
74 |
75 | /**
76 | * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs
77 | * correctly for one team.
78 | */
79 | @Test
80 | public void testScoreUpdatesOneTeam() {
81 |
82 | TestStream> createEvents =
83 | TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
84 | .advanceWatermarkTo(baseTime)
85 | .addElements(
86 | event(TestUser.RED_TWO, 99, Duration.standardSeconds(10)),
87 | event(TestUser.RED_ONE, 1, Duration.standardSeconds(20)),
88 | event(TestUser.RED_ONE, 0, Duration.standardSeconds(30)),
89 | event(TestUser.RED_TWO, 100, Duration.standardSeconds(40)),
90 | event(TestUser.RED_TWO, 201, Duration.standardSeconds(50)))
91 | .advanceWatermarkToInfinity();
92 |
93 | PCollection> teamScores =
94 | p.apply(createEvents).apply(ParDo.of(new UpdateTeamScoreFn(100)));
95 |
96 | String redTeam = TestUser.RED_ONE.getTeam();
97 |
98 | PAssert.that(teamScores)
99 | .inWindow(GlobalWindow.INSTANCE)
100 | .containsInAnyOrder(KV.of(redTeam, 100), KV.of(redTeam, 200), KV.of(redTeam, 401));
101 |
102 | p.run().waitUntilFinish();
103 | }
104 |
105 | /**
106 | * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs
107 | * correctly for multiple teams.
108 | */
109 | @Test
110 | public void testScoreUpdatesPerTeam() {
111 |
112 | TestStream> createEvents =
113 | TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
114 | .advanceWatermarkTo(baseTime)
115 | .addElements(
116 | event(TestUser.RED_ONE, 50, Duration.standardSeconds(10)),
117 | event(TestUser.RED_TWO, 50, Duration.standardSeconds(20)),
118 | event(TestUser.BLUE_ONE, 70, Duration.standardSeconds(30)),
119 | event(TestUser.BLUE_TWO, 80, Duration.standardSeconds(40)),
120 | event(TestUser.BLUE_TWO, 50, Duration.standardSeconds(50)))
121 | .advanceWatermarkToInfinity();
122 |
123 | PCollection> teamScores =
124 | p.apply(createEvents).apply(ParDo.of(new UpdateTeamScoreFn(100)));
125 |
126 | String redTeam = TestUser.RED_ONE.getTeam();
127 | String blueTeam = TestUser.BLUE_ONE.getTeam();
128 |
129 | PAssert.that(teamScores)
130 | .inWindow(GlobalWindow.INSTANCE)
131 | .containsInAnyOrder(KV.of(redTeam, 100), KV.of(blueTeam, 150), KV.of(blueTeam, 200));
132 |
133 | p.run().waitUntilFinish();
134 | }
135 |
136 | /**
137 | * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs
138 | * correctly per window and per key.
139 | */
140 | @Test
141 | public void testScoreUpdatesPerWindow() {
142 |
143 | TestStream> createEvents =
144 | TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
145 | .advanceWatermarkTo(baseTime)
146 | .addElements(
147 | event(TestUser.RED_ONE, 50, Duration.standardMinutes(1)),
148 | event(TestUser.RED_TWO, 50, Duration.standardMinutes(2)),
149 | event(TestUser.RED_ONE, 50, Duration.standardMinutes(3)),
150 | event(TestUser.RED_ONE, 60, Duration.standardMinutes(6)),
151 | event(TestUser.RED_TWO, 60, Duration.standardMinutes(7)))
152 | .advanceWatermarkToInfinity();
153 |
154 | Duration teamWindowDuration = Duration.standardMinutes(5);
155 |
156 | PCollection> teamScores =
157 | p.apply(createEvents)
158 | .apply(Window.>into(FixedWindows.of(teamWindowDuration)))
159 | .apply(ParDo.of(new UpdateTeamScoreFn(100)));
160 |
161 | String redTeam = TestUser.RED_ONE.getTeam();
162 |
163 | IntervalWindow window1 = new IntervalWindow(baseTime, teamWindowDuration);
164 | IntervalWindow window2 = new IntervalWindow(window1.end(), teamWindowDuration);
165 |
166 | PAssert.that(teamScores).inWindow(window1).containsInAnyOrder(KV.of(redTeam, 100));
167 |
168 | PAssert.that(teamScores).inWindow(window2).containsInAnyOrder(KV.of(redTeam, 120));
169 |
170 | p.run().waitUntilFinish();
171 | }
172 |
173 | private TimestampedValue> event(
174 | TestUser user, int score, Duration baseTimeOffset) {
175 | return TimestampedValue.of(
176 | KV.of(
177 | user.getTeam(),
178 | new GameActionInfo(
179 | user.getUser(), user.getTeam(), score, baseTime.plus(baseTimeOffset).getMillis())),
180 | baseTime.plus(baseTimeOffset));
181 | }
182 | }
183 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/complete/game/UserScoreTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.complete.game;
19 |
20 | import java.io.Serializable;
21 | import java.util.Arrays;
22 | import java.util.List;
23 | import org.apache.beam.examples.complete.game.UserScore.ExtractAndSumScore;
24 | import org.apache.beam.examples.complete.game.UserScore.GameActionInfo;
25 | import org.apache.beam.examples.complete.game.UserScore.ParseEventFn;
26 | import org.apache.beam.sdk.coders.StringUtf8Coder;
27 | import org.apache.beam.sdk.testing.PAssert;
28 | import org.apache.beam.sdk.testing.TestPipeline;
29 | import org.apache.beam.sdk.testing.ValidatesRunner;
30 | import org.apache.beam.sdk.transforms.Create;
31 | import org.apache.beam.sdk.transforms.MapElements;
32 | import org.apache.beam.sdk.transforms.ParDo;
33 | import org.apache.beam.sdk.values.KV;
34 | import org.apache.beam.sdk.values.PCollection;
35 | import org.apache.beam.sdk.values.TypeDescriptors;
36 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists;
37 | import org.junit.Rule;
38 | import org.junit.Test;
39 | import org.junit.experimental.categories.Category;
40 | import org.junit.runner.RunWith;
41 | import org.junit.runners.JUnit4;
42 |
43 | /** Tests of UserScore. */
44 | @RunWith(JUnit4.class)
45 | public class UserScoreTest implements Serializable {
46 |
47 | static final String[] GAME_EVENTS_ARRAY =
48 | new String[] {
49 | "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
50 | "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
51 | "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
52 | "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
53 | "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
54 | "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
55 | "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
56 | "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
57 | "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
58 | "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444"
59 | };
60 |
61 | static final String[] GAME_EVENTS_ARRAY2 =
62 | new String[] {
63 | "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
64 | "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
65 | "user13_BisqueBilby,BisqueBilby,xxx,1447955630000,2015-11-19 09:53:53.444"
66 | };
67 |
68 | static final List GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);
69 | static final List GAME_EVENTS2 = Arrays.asList(GAME_EVENTS_ARRAY2);
70 |
71 | static final List GAME_ACTION_INFO_LIST =
72 | Lists.newArrayList(
73 | new GameActionInfo("user0_MagentaKangaroo", "MagentaKangaroo", 3, 1447955630000L),
74 | new GameActionInfo("user13_ApricotQuokka", "ApricotQuokka", 15, 1447955630000L),
75 | new GameActionInfo("user6_AmberNumbat", "AmberNumbat", 11, 1447955630000L),
76 | new GameActionInfo("user7_AlmondWallaby", "AlmondWallaby", 15, 1447955630000L),
77 | new GameActionInfo(
78 | "user7_AndroidGreenKookaburra", "AndroidGreenKookaburra", 12, 1447955630000L),
79 | new GameActionInfo(
80 | "user7_AndroidGreenKookaburra", "AndroidGreenKookaburra", 11, 1447955630000L),
81 | new GameActionInfo("user19_BisqueBilby", "BisqueBilby", 6, 1447955630000L),
82 | new GameActionInfo("user19_BisqueBilby", "BisqueBilby", 8, 1447955630000L));
83 |
84 | static final List> USER_SUMS =
85 | Arrays.asList(
86 | KV.of("user0_MagentaKangaroo", 3),
87 | KV.of("user13_ApricotQuokka", 15),
88 | KV.of("user6_AmberNumbat", 11),
89 | KV.of("user7_AlmondWallaby", 15),
90 | KV.of("user7_AndroidGreenKookaburra", 23),
91 | KV.of("user19_BisqueBilby", 14));
92 |
93 | static final List> TEAM_SUMS =
94 | Arrays.asList(
95 | KV.of("MagentaKangaroo", 3),
96 | KV.of("ApricotQuokka", 15),
97 | KV.of("AmberNumbat", 11),
98 | KV.of("AlmondWallaby", 15),
99 | KV.of("AndroidGreenKookaburra", 23),
100 | KV.of("BisqueBilby", 14));
101 |
102 | @Rule public TestPipeline p = TestPipeline.create();
103 |
104 | /** Test the {@link ParseEventFn} {@link org.apache.beam.sdk.transforms.DoFn}. */
105 | @Test
106 | public void testParseEventFn() throws Exception {
107 | PCollection input = p.apply(Create.of(GAME_EVENTS));
108 | PCollection output = input.apply(ParDo.of(new ParseEventFn()));
109 |
110 | PAssert.that(output).containsInAnyOrder(GAME_ACTION_INFO_LIST);
111 |
112 | p.run().waitUntilFinish();
113 | }
114 |
115 | /** Tests ExtractAndSumScore("user"). */
116 | @Test
117 | @Category(ValidatesRunner.class)
118 | public void testUserScoreSums() throws Exception {
119 |
120 | PCollection input = p.apply(Create.of(GAME_EVENTS));
121 |
122 | PCollection> output =
123 | input
124 | .apply(ParDo.of(new ParseEventFn()))
125 | // Extract and sum username/score pairs from the event data.
126 | .apply("ExtractUserScore", new ExtractAndSumScore("user"));
127 |
128 | // Check the user score sums.
129 | PAssert.that(output).containsInAnyOrder(USER_SUMS);
130 |
131 | p.run().waitUntilFinish();
132 | }
133 |
134 | /** Tests ExtractAndSumScore("team"). */
135 | @Test
136 | @Category(ValidatesRunner.class)
137 | public void testTeamScoreSums() throws Exception {
138 |
139 | PCollection input = p.apply(Create.of(GAME_EVENTS));
140 |
141 | PCollection> output =
142 | input
143 | .apply(ParDo.of(new ParseEventFn()))
144 | // Extract and sum teamname/score pairs from the event data.
145 | .apply("ExtractTeamScore", new ExtractAndSumScore("team"));
146 |
147 | // Check the team score sums.
148 | PAssert.that(output).containsInAnyOrder(TEAM_SUMS);
149 |
150 | p.run().waitUntilFinish();
151 | }
152 |
153 | /** Test that bad input data is dropped appropriately. */
154 | @Test
155 | @Category(ValidatesRunner.class)
156 | public void testUserScoresBadInput() throws Exception {
157 |
158 | PCollection input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of()));
159 |
160 | PCollection> extract =
161 | input
162 | .apply(ParDo.of(new ParseEventFn()))
163 | .apply(
164 | MapElements.into(
165 | TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
166 | .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));
167 |
168 | PAssert.that(extract).empty();
169 |
170 | p.run().waitUntilFinish();
171 | }
172 | }
173 |
--------------------------------------------------------------------------------
/examples/java8/src/test/java/org/apache/beam/examples/subprocess/ExampleEchoPipelineTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.apache.beam.examples.subprocess;
19 |
20 | import static java.nio.charset.StandardCharsets.UTF_8;
21 |
22 | import java.io.IOException;
23 | import java.nio.ByteBuffer;
24 | import java.nio.channels.FileChannel;
25 | import java.nio.channels.SeekableByteChannel;
26 | import java.nio.file.Files;
27 | import java.nio.file.Path;
28 | import java.nio.file.StandardOpenOption;
29 | import java.util.ArrayList;
30 | import java.util.List;
31 | import org.apache.beam.examples.subprocess.configuration.SubProcessConfiguration;
32 | import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs;
33 | import org.apache.beam.examples.subprocess.kernel.SubProcessCommandLineArgs.Command;
34 | import org.apache.beam.examples.subprocess.kernel.SubProcessKernel;
35 | import org.apache.beam.examples.subprocess.utils.CallingSubProcessUtils;
36 | import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
37 | import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
38 | import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath;
39 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
40 | import org.apache.beam.sdk.testing.PAssert;
41 | import org.apache.beam.sdk.testing.TestPipeline;
42 | import org.apache.beam.sdk.transforms.Create;
43 | import org.apache.beam.sdk.transforms.DoFn;
44 | import org.apache.beam.sdk.transforms.ParDo;
45 | import org.apache.beam.sdk.values.KV;
46 | import org.apache.beam.sdk.values.PCollection;
47 | import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
48 | import org.junit.Rule;
49 | import org.junit.Test;
50 | import org.junit.runner.RunWith;
51 | import org.junit.runners.JUnit4;
52 | import org.mockito.Mockito;
53 | import org.mockito.invocation.InvocationOnMock;
54 | import org.mockito.stubbing.Answer;
55 | import org.slf4j.Logger;
56 | import org.slf4j.LoggerFactory;
57 |
58 | /**
59 | * To keep {@link org.apache.beam.examples.subprocess.ExampleEchoPipeline} simple, it is not
60 | * factored or testable. This test file should be maintained with a copy of its code for a basic
61 | * smoke test.
62 | */
63 | @RunWith(JUnit4.class)
64 | public class ExampleEchoPipelineTest {
65 |
66 | private static final Logger LOG = LoggerFactory.getLogger(ExampleEchoPipelineTest.class);
67 |
68 | @Rule public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);
69 |
70 | @Test
71 | public void testExampleEchoPipeline() throws Exception {
72 |
73 | // Create two Bash files as tests for the binary files
74 |
75 | Path fileA = Files.createTempFile("test-Echo", ".sh");
76 | Path fileB = Files.createTempFile("test-EchoAgain", ".sh");
77 |
78 | Path workerTempFiles = Files.createTempDirectory("test-Echoo");
79 |
80 | try (SeekableByteChannel channel =
81 | FileChannel.open(fileA, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
82 | channel.write(ByteBuffer.wrap(getTestShellEcho().getBytes(UTF_8)));
83 | }
84 |
85 | try (SeekableByteChannel channel =
86 | FileChannel.open(fileB, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
87 | channel.write(ByteBuffer.wrap(getTestShellEchoAgain().getBytes(UTF_8)));
88 | }
89 |
90 | // Read in the options for the pipeline
91 | SubProcessPipelineOptions options = PipelineOptionsFactory.as(SubProcessPipelineOptions.class);
92 |
93 | options.setConcurrency(2);
94 | options.setSourcePath(fileA.getParent().toString());
95 | options.setWorkerPath(workerTempFiles.toAbsolutePath().toString());
96 |
97 | p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil());
98 |
99 | // Setup the Configuration option used with all transforms
100 | SubProcessConfiguration configuration = options.getSubProcessConfiguration();
101 |
102 | // Create some sample data to be fed to our c++ Echo library
103 | List> sampleData = new ArrayList<>();
104 |
105 | for (int i = 0; i < 100; i++) {
106 | String str = String.valueOf(i);
107 | sampleData.add(KV.of(str, str));
108 | }
109 |
110 | // Define the pipeline which is two transforms echoing the inputs out to Logs
111 | // For this use case we will make use of two shell files instead of the binary to make
112 | // testing easier
113 | PCollection> output =
114 | p.apply(Create.of(sampleData))
115 | .apply(
116 | "Echo inputs round 1",
117 | ParDo.of(new EchoInputDoFn(configuration, fileA.getFileName().toString())))
118 | .apply(
119 | "Echo inputs round 2",
120 | ParDo.of(new EchoInputDoFn(configuration, fileB.getFileName().toString())));
121 |
122 | PAssert.that(output).containsInAnyOrder(sampleData);
123 |
124 | p.run();
125 | }
126 |
127 | /** Simple DoFn that echos the element, used as an example of running a C++ library. */
128 | @SuppressWarnings("serial")
129 | private static class EchoInputDoFn extends DoFn, KV> {
130 |
131 | private static final Logger LOG = LoggerFactory.getLogger(EchoInputDoFn.class);
132 |
133 | private SubProcessConfiguration configuration;
134 | private String binaryName;
135 |
136 | public EchoInputDoFn(SubProcessConfiguration configuration, String binary) {
137 | // Pass in configuration information the name of the filename of the sub-process and the level
138 | // of concurrency
139 | this.configuration = configuration;
140 | this.binaryName = binary;
141 | }
142 |
143 | @Setup
144 | public void setUp() throws Exception {
145 | CallingSubProcessUtils.setUp(configuration, binaryName);
146 | }
147 |
148 | @ProcessElement
149 | public void processElement(ProcessContext c) throws Exception {
150 | try {
151 | // Our Library takes a single command in position 0 which it will echo back in the result
152 | SubProcessCommandLineArgs commands = new SubProcessCommandLineArgs();
153 | Command command = new Command(0, String.valueOf(c.element().getValue()));
154 | commands.putCommand(command);
155 |
156 | // The ProcessingKernel deals with the execution of the process
157 | SubProcessKernel kernel = new SubProcessKernel(configuration, binaryName);
158 |
159 | // Run the command and work through the results
160 | List results = kernel.exec(commands);
161 | for (String s : results) {
162 | c.output(KV.of(c.element().getKey(), s));
163 | }
164 | } catch (Exception ex) {
165 | LOG.error("Error processing element ", ex);
166 | throw ex;
167 | }
168 | }
169 | }
170 |
171 | private static String getTestShellEcho() {
172 | return "#!/bin/sh\n" + "filename=$1;\n" + "echo $2 >> $filename;";
173 | }
174 |
175 | private static String getTestShellEchoAgain() {
176 | return "#!/bin/sh\n" + "filename=$1;\n" + "echo $2 >> $filename;";
177 | }
178 |
179 | private GcsUtil buildMockGcsUtil() throws IOException {
180 | GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);
181 |
182 | // Any request to open gets a new bogus channel
183 | Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class)))
184 | .then(
185 | new Answer() {
186 |
187 | @Override
188 | public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
189 | return FileChannel.open(
190 | Files.createTempFile("channel-", ".tmp"),
191 | StandardOpenOption.CREATE,
192 | StandardOpenOption.DELETE_ON_CLOSE);
193 | }
194 | });
195 |
196 | // Any request for expansion returns a list containing the original GcsPath
197 | // This is required to pass validation that occurs in TextIO during apply()
198 | Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class)))
199 | .then(
200 | new Answer>() {
201 |
202 | @Override
203 | public List answer(InvocationOnMock invocation) throws Throwable {
204 | return ImmutableList.of((GcsPath) invocation.getArguments()[0]);
205 | }
206 | });
207 |
208 | return mockGcsUtil;
209 | }
210 | }
211 |
--------------------------------------------------------------------------------