├── quickstart └── wordcount.tar.gz ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── src ├── main │ ├── java │ │ └── samza │ │ │ └── examples │ │ │ ├── sql │ │ │ ├── samza-sql-filter │ │ │ │ └── src │ │ │ │ │ └── main │ │ │ │ │ └── sql │ │ │ │ │ └── samza.sql │ │ │ ├── samza-sql-stream-table-join │ │ │ │ └── src │ │ │ │ │ └── main │ │ │ │ │ └── sql │ │ │ │ │ └── samza.sql │ │ │ ├── samza-sql-groupby │ │ │ │ └── src │ │ │ │ │ └── main │ │ │ │ │ ├── java │ │ │ │ │ └── samza │ │ │ │ │ │ └── sql │ │ │ │ │ │ └── PageViewGroupByOutput.json │ │ │ │ │ └── sql │ │ │ │ │ └── samza.sql │ │ │ └── samza-sql-casewhen │ │ │ │ └── src │ │ │ │ └── main │ │ │ │ └── sql │ │ │ │ └── samza.sql │ │ │ ├── cookbook │ │ │ ├── data │ │ │ │ ├── Profile.java │ │ │ │ ├── UserPageViews.java │ │ │ │ ├── PageView.java │ │ │ │ └── AdClick.java │ │ │ ├── FilterExample.java │ │ │ ├── SessionWindowExample.java │ │ │ ├── TumblingWindowExample.java │ │ │ ├── JoinExample.java │ │ │ ├── StreamTableJoinExample.java │ │ │ ├── RemoteTableJoinExample.java │ │ │ └── CouchbaseTableExample.java │ │ │ ├── azure │ │ │ ├── AzureZKLocalApplication.java │ │ │ ├── data │ │ │ │ └── PageViewAvroRecord.java │ │ │ ├── AzureApplication.java │ │ │ └── AzureBlobApplication.java │ │ │ ├── wikipedia │ │ │ ├── system │ │ │ │ ├── descriptors │ │ │ │ │ ├── WikipediaInputDescriptor.java │ │ │ │ │ └── WikipediaSystemDescriptor.java │ │ │ │ ├── WikipediaSystemFactory.java │ │ │ │ ├── WikipediaConsumer.java │ │ │ │ └── WikipediaFeed.java │ │ │ ├── task │ │ │ │ ├── WikipediaFeedStreamTask.java │ │ │ │ ├── WikipediaParserStreamTask.java │ │ │ │ ├── application │ │ │ │ │ ├── WikipediaStatsTaskApplication.java │ │ │ │ │ ├── WikipediaParserTaskApplication.java │ │ │ │ │ └── WikipediaFeedTaskApplication.java │ │ │ │ └── WikipediaStatsStreamTask.java │ │ │ ├── application │ │ │ │ └── WikipediaZkLocalApplication.java │ │ │ └── model │ │ │ │ └── WikipediaParser.java │ │ │ └── kinesis │ │ │ └── 
KinesisHelloSamza.java │ ├── config │ │ ├── filter-example.properties │ │ ├── join-example.properties │ │ ├── session-window-example.properties │ │ ├── tumbling-window-example.properties │ │ ├── wikipedia-parser.properties │ │ ├── couchbase-table-example.properties │ │ ├── remote-table-join-example.properties │ │ ├── stream-table-join-example.properties │ │ ├── azure-application-local-runner.properties │ │ ├── wikipedia-application-local-runner.properties │ │ ├── wikipedia-feed.properties │ │ ├── wikipedia-application.properties │ │ ├── azure-blob-application.properties │ │ ├── wikipedia-stats.properties │ │ └── kinesis-hello-samza.properties │ ├── resources │ │ └── log4j2.xml │ └── assembly │ │ └── src.xml └── test │ ├── resources │ ├── log4j2-test.xml │ └── WikitionaryEditEvents.txt │ └── java │ └── samza │ └── examples │ ├── wikipedia │ ├── task │ │ └── test │ │ │ └── TestWikipediaTask.java │ └── application │ │ └── test │ │ └── TestWikipediaApplication.java │ ├── test │ └── utils │ │ └── TestUtils.java │ └── cookbook │ └── test │ └── TestSamzaCookBookExamples.java ├── gradle.properties ├── .reviewboardrc ├── bin ├── deploy.sh ├── run-event-hubs-zk-application.sh ├── run-wikipedia-zk-application.sh ├── produce-wikipedia-raw-data.sh └── grid ├── .gitignore ├── README-gradle.md ├── conf └── yarn-site.xml ├── gradlew.bat ├── gradlew └── README.md /quickstart/wordcount.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/samza-hello-samza/HEAD/quickstart/wordcount.tar.gz -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/samza-hello-samza/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: 
-------------------------------------------------------------------------------- 1 | #Fri Mar 27 16:28:33 PDT 2020 2 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.3-all.zip 3 | distributionBase=GRADLE_USER_HOME 4 | distributionPath=wrapper/dists 5 | zipStorePath=wrapper/dists 6 | zipStoreBase=GRADLE_USER_HOME 7 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/sql/samza-sql-filter/src/main/sql/samza.sql: -------------------------------------------------------------------------------- 1 | -- Filter Profile change-capture stream by 'Product Manager' 2 | -- title and project basic profile data to a kafka topic. 3 | 4 | INSERT INTO kafka.ProductManagerProfiles 5 | SELECT memberId, firstName, lastName, company 6 | FROM kafka.ProfileChanges 7 | WHERE standardize(title) = 'Product Manager' 8 | 9 | -- you can add additional SQL statements here 10 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/sql/samza-sql-stream-table-join/src/main/sql/samza.sql: -------------------------------------------------------------------------------- 1 | -- NOTE: Join Operator is currently not fully stable, 2 | -- we are actively working on stabilizing it. 
3 | 4 | -- Enrich PageViewEvent with member profile data 5 | INSERT INTO kafka.tracking.EnrichedPageVIewEvent 6 | SELECT * 7 | FROM Kafka.PageViewEvent as pv 8 | JOIN Kafka.ProfileChanges.`$table` as p 9 | ON pv.memberid = p.memberid 10 | 11 | -- You can add additional SQL statements here 12 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/sql/samza-sql-groupby/src/main/java/samza/sql/PageViewGroupByOutput.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PageViewGroupByOutput", 3 | "version" : 1, 4 | "namespace": "org.apache.samza.sql.system.avro", 5 | "type": "record", 6 | "fields": [ 7 | { 8 | "name":"pageKey", 9 | "doc":"The page key of the page being viewed.", 10 | "type":["string","null"] 11 | }, 12 | { 13 | "name": "Views", 14 | "doc" : "Number of views in 5 minute window.", 15 | "type": ["long", "null"] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/sql/samza-sql-casewhen/src/main/sql/samza.sql: -------------------------------------------------------------------------------- 1 | -- For each profile in Kafka Profile change capture stream, identify whether the 2 | -- profile is a quality profile or not and insert the result into QualityProfile 3 | -- kafka topic. Please note the usage of GetSqlField UDF to extract the company 4 | -- name field from nested record. 
5 | 6 | INSERT INTO kafka.QualityProfile 7 | SELECT id, status, case when (profilePicture <> null and industryName <> null and 8 | GetSqlField(positions, 'Position.companyName') <> null) 9 | then 1 else 0 end as quality 10 | FROM kafka.ProfileChanges 11 | 12 | -- you can add additional SQL statements here 13 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/sql/samza-sql-groupby/src/main/sql/samza.sql: -------------------------------------------------------------------------------- 1 | -- NOTE: Groupby Operator is currently not fully stable, 2 | -- we are actively working on stabilizing it. 3 | 4 | -- Emit Page view counts collected grouped by page key in the last 5 | -- 5 minutes at 5 minute interval and send the result to a kafka topic. 6 | -- Using GetSqlField UDF to extract page key from the requestHeader. 7 | insert into kafka.groupbyTopic 8 | select GetSqlField(pv.requestHeader) as __key__, GetPageKey(pv.requestHeader) as pageKey, count(*) as Views 9 | from kafka.`PageViewEvent` as pv 10 | group by GetSqlField(pv.requestHeader) 11 | 12 | -- You can add additional SQL statements here 13 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | SAMZA_VERSION=1.6.0 21 | KAFKA_VERSION=0.11.0.2 22 | HADOOP_VERSION=2.7.1 23 | 24 | SLF4J_VERSION = 1.7.7 25 | 26 | -------------------------------------------------------------------------------- /.reviewboardrc: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | REVIEWBOARD_URL = 'https://reviews.apache.org' 19 | REPOSITORY = 'samza-hello-samza' 20 | GUESS_DESCRIPTION = True 21 | TARGET_GROUPS = 'samza' 22 | TRACKING_BRANCH = 'origin/master' 23 | -------------------------------------------------------------------------------- /bin/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | home_dir=`pwd` 20 | base_dir=$(dirname $0)/.. 21 | cd $base_dir 22 | base_dir=`pwd` 23 | 24 | mvn clean package 25 | mkdir -p $base_dir/deploy/samza 26 | tar -xvf $base_dir/target/hello-samza-1.6.0-dist.tar.gz -C $base_dir/deploy/samza 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | *.class 16 | *.war 17 | *.ear 18 | target/ 19 | .classpath 20 | .project 21 | .vagrant 22 | .settings/ 23 | .idea/ 24 | .idea_modules/ 25 | *.iml 26 | *.ipr 27 | *.iws 28 | */.cache 29 | deploy 30 | *.swp 31 | build/ 32 | .gradle/ 33 | state 34 | manifest.txt 35 | pathing.jar 36 | out/ 37 | -------------------------------------------------------------------------------- /README-gradle.md: -------------------------------------------------------------------------------- 1 | 2 | To use gradle to build/run the hello-samza project: 3 | 4 | 1) the project is configured to download and use gradle version 2.3 - on first task execution, it will download the required gradle jars. 
5 | 6 | 2) download/install yarn/kafka/zookeeper: 7 | 8 | $ ./gradlew installGrid 9 | 10 | 3) build hello-samza job package: 11 | 12 | $ ./gradlew distTar 13 | 14 | 4) deploy hello-samza project to grid: 15 | 16 | $ ./gradlew deployHelloSamza 17 | 18 | 5) start the grid (starts up yarn/kafka/zookeeper): 19 | 20 | $ ./gradlew startGrid 21 | 22 | 6) run the various Samza tasks that are part of hello-samza project: 23 | 24 | $ ./gradlew runWikiFeed 25 | $ ./gradlew runWikiParser 26 | $ ./gradlew runWikiStats 27 | 28 | 7) view all the current Kafka topics: 29 | 30 | $ ./gradlew listKafkaTopics 31 | 32 | 8) view the Kafka topics output by the various Samza tasks: 33 | 34 | $ ./gradlew dumpWikiRaw 35 | ( output of Kafka topic scrolls by) 36 | CTRL-c 37 | 38 | $ ./gradlew dumpWikiEdits 39 | ( output of Kafka topic scrolls by) 40 | CTRL-c 41 | 42 | $ ./gradlew dumpWikiStats 43 | ( output of Kafka topic scrolls by) 44 | CTRL-c 45 | 46 | 9) stop all the components: 47 | 48 | $ ./gradlew stopGrid 49 | 50 | Shortcut: using the 'runWiki*' tasks directly will do steps 3-6 automatically. 51 | 52 | -------------------------------------------------------------------------------- /src/main/config/filter-example.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Application/Job 19 | app.class=samza.examples.cookbook.FilterExample 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=pageview-filter 22 | job.container.count=2 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | 27 | # Config Loader 28 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 29 | job.config.loader.properties.path=./__package/config/filter-example.properties -------------------------------------------------------------------------------- /src/main/config/join-example.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Application / Job 19 | app.class=samza.examples.cookbook.JoinExample 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=pageview-adclick-joiner 22 | job.container.count=2 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | 27 | # Config Loader 28 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 29 | job.config.loader.properties.path=./__package/config/join-example.properties -------------------------------------------------------------------------------- /src/test/resources/log4j2-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/main/config/session-window-example.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Application / Job 19 | app.class=samza.examples.cookbook.SessionWindowExample 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=pageview-sessionizer 22 | job.container.count=2 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | 27 | # Config Loader 28 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 29 | job.config.loader.properties.path=./__package/config/session-window-example.properties -------------------------------------------------------------------------------- /src/main/config/tumbling-window-example.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Application / Job 19 | app.class=samza.examples.cookbook.TumblingWindowExample 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=tumbling-pageview-counter 22 | job.container.count=2 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | 27 | # Config Loader 28 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 29 | job.config.loader.properties.path=./__package/config/tumbling-window-example.properties -------------------------------------------------------------------------------- /src/main/config/wikipedia-parser.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Job 19 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 20 | job.name=wikipedia-parser 21 | 22 | # YARN 23 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 24 | 25 | # TaskApplication class 26 | app.class=samza.examples.wikipedia.task.application.WikipediaParserTaskApplication 27 | 28 | # Config Loader 29 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 30 | job.config.loader.properties.path=./__package/config/wikipedia-parser.properties -------------------------------------------------------------------------------- /src/main/config/couchbase-table-example.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Application / Job 19 | app.class=samza.examples.cookbook.CouchbaseTableExample 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=couchbase-table-example 22 | job.container.count=2 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | 27 | # Config Loader 28 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 29 | job.config.loader.properties.path=./__package/config/couchbase-table-example.properties 30 | -------------------------------------------------------------------------------- /src/main/config/remote-table-join-example.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Application / Job 19 | app.class=samza.examples.cookbook.RemoteTableJoinExample 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=stock-price-table-joiner 22 | job.container.count=2 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | 27 | # Config Loader 28 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 29 | job.config.loader.properties.path=./__package/config/remote-table-join-example.properties -------------------------------------------------------------------------------- /src/main/config/stream-table-join-example.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Application / Job 19 | app.class=samza.examples.cookbook.StreamTableJoinExample 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=pageview-profile-table-joiner 22 | job.container.count=2 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | 27 | # Config Loader 28 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 29 | job.config.loader.properties.path=./__package/config/stream-table-join-example.properties -------------------------------------------------------------------------------- /src/main/config/azure-application-local-runner.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Job 19 | job.name=azure-application-local-runner 20 | job.coordinator.factory=org.apache.samza.zk.ZkJobCoordinatorFactory 21 | job.default.system=eventhubs 22 | job.coordinator.zk.connect=localhost:2181 23 | 24 | # Define the key and name configurations with property names of your choice, starting with 'sensitive.' 
25 | sensitive.eventhubs.sas.key.name=my-sas-key-name 26 | sensitive.eventhubs.sas.token=my-sas-token 27 | 28 | # Task/Application 29 | task.name.grouper.factory=org.apache.samza.container.grouper.task.GroupByContainerIdsFactory 30 | -------------------------------------------------------------------------------- /conf/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 21 | 22 | yarn.resourcemanager.scheduler.class 23 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler 24 | 25 | 26 | yarn.nodemanager.vmem-pmem-ratio 27 | 10 28 | 29 | 30 | yarn.resourcemanager.hostname 31 | 127.0.0.1 32 | 33 | 34 | yarn.nodemanager.delete.debug-delay.sec 35 | 86400 36 | 37 | 38 | -------------------------------------------------------------------------------- /bin/run-event-hubs-zk-application.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | home_dir=`pwd` 20 | base_dir=$(dirname $0)/.. 
21 | cd $base_dir 22 | base_dir=`pwd` 23 | cd $home_dir 24 | 25 | export EXECUTION_PLAN_DIR="$base_dir/plan" 26 | mkdir -p $EXECUTION_PLAN_DIR 27 | 28 | [[ $JAVA_OPTS != *-Dlog4j.configuration* ]] && export JAVA_OPTS="$JAVA_OPTS -Dlog4j.configuration=file:$(dirname $0)/log4j-console.xml" 29 | 30 | exec $(dirname $0)/run-class.sh samza.examples.azure.AzureZKLocalApplication --config job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory --config job.config.loader.properties.path=$PWD/deploy/samza/config/azure-application-local-runner.properties 31 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/data/Profile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package samza.examples.cookbook.data; 20 | 21 | import org.codehaus.jackson.annotate.JsonProperty; 22 | 23 | 24 | public class Profile { 25 | 26 | public final String userId; 27 | public final String company; 28 | 29 | /** 30 | * Constructs a user profile. 
31 | * 32 | * @param userId the user Id 33 | * @param company company to which the user belong to 34 | */ 35 | public Profile( 36 | @JsonProperty("userId") String userId, 37 | @JsonProperty("company") String company) { 38 | this.userId = userId; 39 | this.company = company; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /bin/run-wikipedia-zk-application.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | home_dir=`pwd` 20 | base_dir=$(dirname $0)/.. 
21 | cd $base_dir 22 | base_dir=`pwd` 23 | cd $home_dir 24 | 25 | export EXECUTION_PLAN_DIR="$base_dir/plan" 26 | mkdir -p $EXECUTION_PLAN_DIR 27 | 28 | [[ $JAVA_OPTS != *-Dlog4j.configuration* ]] && export JAVA_OPTS="$JAVA_OPTS -Dlog4j.configuration=file:$(dirname $0)/log4j-console.xml" 29 | 30 | exec $(dirname $0)/run-class.sh samza.examples.wikipedia.application.WikipediaZkLocalApplication --config job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory --config job.config.loader.properties.path=$PWD/deploy/samza/config/wikipedia-application-local-runner.properties 31 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/azure/AzureZKLocalApplication.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package samza.examples.azure; 21 | 22 | import joptsimple.OptionSet; 23 | import org.apache.samza.config.Config; 24 | import org.apache.samza.runtime.LocalApplicationRunner; 25 | import org.apache.samza.util.CommandLine; 26 | 27 | 28 | public class AzureZKLocalApplication { 29 | 30 | public static void main(String[] args) { 31 | CommandLine cmdLine = new CommandLine(); 32 | OptionSet options = cmdLine.parser().parse(args); 33 | Config config = cmdLine.loadConfig(options); 34 | 35 | AzureApplication app = new AzureApplication(); 36 | LocalApplicationRunner runner = new LocalApplicationRunner(app, config); 37 | runner.run(); 38 | 39 | runner.waitForFinish(); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/config/wikipedia-application-local-runner.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Job 19 | job.name=wikipedia-application 20 | job.coordinator.factory=org.apache.samza.zk.ZkJobCoordinatorFactory 21 | job.default.system=kafka 22 | job.coordinator.zk.connect=localhost:2181 23 | task.name.grouper.factory=org.apache.samza.container.grouper.task.GroupByContainerIdsFactory 24 | 25 | # Serializers 26 | serializers.registry.string.class=org.apache.samza.serializers.StringSerdeFactory 27 | serializers.registry.integer.class=org.apache.samza.serializers.IntegerSerdeFactory 28 | 29 | # Key-value storage 30 | stores.wikipedia-stats.factory=org.apache.samza.storage.kv.RocksDbKeyValueStorageEngineFactory 31 | stores.wikipedia-stats.changelog=kafka.wikipedia-stats-changelog 32 | stores.wikipedia-stats.key.serde=string 33 | stores.wikipedia-stats.msg.serde=integer 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/data/UserPageViews.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package samza.examples.cookbook.data; 20 | 21 | 22 | import org.codehaus.jackson.annotate.JsonProperty; 23 | 24 | /** 25 | * User page view count. 26 | */ 27 | public class UserPageViews { 28 | private final String userId; 29 | private final int count; 30 | 31 | /** 32 | * Constructs a user page view count. 33 | * 34 | * @param userId the id of the user viewing the pages 35 | * @param count number of page views by the user 36 | */ 37 | public UserPageViews( 38 | @JsonProperty("userId") String userId, 39 | @JsonProperty("count") int count) { 40 | this.userId = userId; 41 | this.count = count; 42 | } 43 | 44 | public String getUserId() { 45 | return userId; 46 | } 47 | 48 | public int getCount() { 49 | return count; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/config/wikipedia-feed.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Job 19 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 20 | job.name=wikipedia-feed 21 | 22 | # YARN package path 23 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 24 | 25 | # TaskApplication class 26 | app.class=samza.examples.wikipedia.task.application.WikipediaFeedTaskApplication 27 | 28 | # Config Loader 29 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 30 | job.config.loader.properties.path=./__package/config/wikipedia-feed.properties 31 | 32 | # Add configuration to disable checkpointing for this job once it is available in the Coordinator Stream model 33 | # See https://issues.apache.org/jira/browse/SAMZA-465?focusedCommentId=14533346&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-14533346 for more details 34 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/data/PageView.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package samza.examples.cookbook.data; 20 | 21 | import org.codehaus.jackson.annotate.JsonProperty; 22 | 23 | /** 24 | * A page view event 25 | */ 26 | public class PageView { 27 | public final String userId; 28 | public final String country; 29 | public final String pageId; 30 | 31 | /** 32 | * Constructs a page view event. 33 | * 34 | * @param pageId the id for the page that was viewed 35 | * @param userId the user that viewed the page 36 | * @param country the country that the page was viewed from 37 | */ 38 | public PageView( 39 | @JsonProperty("pageId") String pageId, 40 | @JsonProperty("userId") String userId, 41 | @JsonProperty("countryId") String country) { 42 | this.userId = userId; 43 | this.country = country; 44 | this.pageId = pageId; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/system/descriptors/WikipediaInputDescriptor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package samza.examples.wikipedia.system.descriptors; 21 | 22 | import org.apache.samza.serializers.NoOpSerde; 23 | import org.apache.samza.serializers.Serde; 24 | import org.apache.samza.system.descriptors.InputDescriptor; 25 | import org.apache.samza.system.descriptors.SystemDescriptor; 26 | 27 | import samza.examples.wikipedia.system.WikipediaFeed; 28 | 29 | 30 | public class WikipediaInputDescriptor extends InputDescriptor { 31 | // Messages come from WikipediaConsumer so we know that they don't have a key and don't need to be deserialized. 32 | private static final Serde SERDE = new NoOpSerde(); 33 | 34 | WikipediaInputDescriptor(String streamId, SystemDescriptor systemDescriptor) { 35 | super(streamId, SERDE, systemDescriptor, null); 36 | } 37 | 38 | public WikipediaInputDescriptor withChannel(String channel) { 39 | withPhysicalName(channel); 40 | return this; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/kinesis/KinesisHelloSamza.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package samza.examples.kinesis; 21 | 22 | import org.apache.samza.system.IncomingMessageEnvelope; 23 | import org.apache.samza.system.kinesis.consumer.KinesisIncomingMessageEnvelope; 24 | import org.apache.samza.task.MessageCollector; 25 | import org.apache.samza.task.StreamTask; 26 | import org.apache.samza.task.TaskCoordinator; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | 31 | /** 32 | * A sample task which consumes messages from kinesis stream and logs the message content. 33 | */ 34 | public class KinesisHelloSamza implements StreamTask { 35 | private static final Logger LOG = LoggerFactory.getLogger(KinesisHelloSamza.class); 36 | 37 | public void process(IncomingMessageEnvelope envelope, MessageCollector collector, TaskCoordinator coordinator) { 38 | KinesisIncomingMessageEnvelope kEnvelope = (KinesisIncomingMessageEnvelope) envelope; 39 | long lagMs = System.currentTimeMillis() - kEnvelope.getApproximateArrivalTimestamp().getTime(); 40 | LOG.info(String.format("Kinesis message key: %s Lag: %d ms", envelope.getKey(), lagMs)); 41 | } 42 | } -------------------------------------------------------------------------------- /bin/produce-wikipedia-raw-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | # This script will generate wikipedia-raw data to Kafka 20 | 21 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 22 | BASE_DIR=$(dirname $DIR) 23 | ZOOKEEPER=localhost:2181 24 | KAFKA_BROKER=localhost:9092 25 | 26 | # overwritten options 27 | while getopts "z:b:" option 28 | do 29 | case ${option} in 30 | z) ZOOKEEPER="${OPTARG}";; 31 | b) KAFKA_BROKER="${OPTARG}";; 32 | esac 33 | done 34 | echo "Using ${ZOOKEEPER} as the zookeeper. You can overwrite it with '-z yourlocation'" 35 | echo "Using ${KAFKA_BROKER} as the kafka broker. You can overwrite it with '-b yourlocation'" 36 | 37 | # check if the topic exists. if not, create the topic 38 | EXIST=$($BASE_DIR/deploy/kafka/bin/kafka-topics.sh --describe --topic wikipedia-raw --zookeeper $ZOOKEEPER) 39 | if [ -z "$EXIST" ] 40 | then 41 | $BASE_DIR/deploy/kafka/bin/kafka-topics.sh --create --zookeeper $ZOOKEEPER --topic wikipedia-raw --partition 1 --replication-factor 1 42 | fi 43 | 44 | # produce raw data 45 | while sleep 1 46 | do 47 | $BASE_DIR/deploy/kafka/bin/kafka-console-producer.sh < $BASE_DIR/wikipedia-raw.json --topic wikipedia-raw --broker $KAFKA_BROKER 48 | done 49 | 50 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/data/AdClick.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package samza.examples.cookbook.data; 21 | 22 | import org.codehaus.jackson.annotate.JsonProperty; 23 | 24 | /** 25 | * An ad click event. 26 | */ 27 | public class AdClick { 28 | 29 | private String pageId; // the unique id of the page that the ad was clicked on 30 | private String adId; // an unique id for the ad 31 | private String userId; // the user that clicked the ad 32 | 33 | public AdClick( 34 | @JsonProperty("pageId") String pageId, 35 | @JsonProperty("adId") String adId, 36 | @JsonProperty("userId") String userId) { 37 | this.pageId = pageId; 38 | this.adId = adId; 39 | this.userId = userId; 40 | } 41 | 42 | public String getPageId() { 43 | return pageId; 44 | } 45 | 46 | public void setPageId(String pageId) { 47 | this.pageId = pageId; 48 | } 49 | 50 | public String getAdId() { 51 | return adId; 52 | } 53 | 54 | public void setAdId(String adId) { 55 | this.adId = adId; 56 | } 57 | 58 | public String getUserId() { 59 | return userId; 60 | } 61 | 62 | public void setUserId(String userId) { 63 | this.userId = userId; 64 | } 65 | } -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/task/WikipediaFeedStreamTask.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package samza.examples.wikipedia.task; 21 | 22 | import java.util.Map; 23 | import org.apache.samza.system.IncomingMessageEnvelope; 24 | import org.apache.samza.system.OutgoingMessageEnvelope; 25 | import org.apache.samza.system.SystemStream; 26 | import org.apache.samza.task.MessageCollector; 27 | import org.apache.samza.task.StreamTask; 28 | import org.apache.samza.task.TaskCoordinator; 29 | import samza.examples.wikipedia.system.WikipediaFeed.WikipediaFeedEvent; 30 | 31 | /** 32 | * This task is very simple. All it does is take messages that it receives, and 33 | * sends them to a Kafka topic called wikipedia-raw. 
34 | */ 35 | public class WikipediaFeedStreamTask implements StreamTask { 36 | private static final SystemStream OUTPUT_STREAM = new SystemStream("kafka", "wikipedia-raw"); 37 | 38 | @Override 39 | public void process(IncomingMessageEnvelope envelope, MessageCollector collector, TaskCoordinator coordinator) { 40 | Map outgoingMap = WikipediaFeedEvent.toMap((WikipediaFeedEvent) envelope.getMessage()); 41 | collector.send(new OutgoingMessageEnvelope(OUTPUT_STREAM, outgoingMap)); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/config/wikipedia-application.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # Application / Job 19 | app.class=samza.examples.wikipedia.application.WikipediaApplication 20 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 21 | job.name=wikipedia-application 22 | 23 | # YARN 24 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 25 | 26 | # Serializers 27 | serializers.registry.string.class=org.apache.samza.serializers.StringSerdeFactory 28 | serializers.registry.integer.class=org.apache.samza.serializers.IntegerSerdeFactory 29 | 30 | # Key-value storage 31 | stores.wikipedia-stats.factory=org.apache.samza.storage.kv.RocksDbKeyValueStorageEngineFactory 32 | stores.wikipedia-stats.changelog=kafka.wikipedia-stats-changelog 33 | stores.wikipedia-stats.key.serde=string 34 | stores.wikipedia-stats.msg.serde=integer 35 | 36 | # Metrics 37 | metrics.reporters=snapshot,jmx 38 | metrics.reporter.snapshot.class=org.apache.samza.metrics.reporter.MetricsSnapshotReporterFactory 39 | metrics.reporter.snapshot.stream=kafka.metrics 40 | metrics.reporter.jmx.class=org.apache.samza.metrics.reporter.JmxReporterFactory 41 | 42 | # Config Loader 43 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 44 | job.config.loader.properties.path=./__package/config/wikipedia-application.properties 45 | -------------------------------------------------------------------------------- /src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 22 | 23 | 24 | 25 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/main/config/azure-blob-application.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Instructions to run this samza job are given in {@link samza.examples.azure.AzureBlobApplication} 19 | 20 | # "azure-blob-container" creates an Azure Container (if it doesnt already exist) in Azure Storage Account. 21 | # For valid container names follow guidance in https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata#container-names 22 | 23 | # Job 24 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 25 | job.name=azure-blob 26 | 27 | # YARN package path 28 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 29 | 30 | # StreamApplication class 31 | app.class=samza.examples.azure.AzureBlobApplication 32 | 33 | # Azure blob essential configs 34 | systems.azure-blob-container.samza.factory=org.apache.samza.system.azureblob.AzureBlobSystemFactory 35 | sensitive.systems.azure-blob-container.azureblob.account.name=your-azure-storage-account-name 36 | sensitive.systems.azure-blob-container.azureblob.account.key=your-azure-storage-account-key 37 | 38 | # Config Loader 39 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 40 | 
job.config.loader.properties.path=./__package/config/azure-blob-application.properties 41 | 42 | #Azure blob config - to created a blob per 2 input kafka messages 43 | systems.azure-blob-container.azureblob.maxMessagesPerBlob=2 44 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/system/descriptors/WikipediaSystemDescriptor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package samza.examples.wikipedia.system.descriptors; 21 | 22 | import samza.examples.wikipedia.system.WikipediaSystemFactory; 23 | 24 | import java.util.Map; 25 | import org.apache.samza.system.descriptors.SystemDescriptor; 26 | 27 | public class WikipediaSystemDescriptor extends SystemDescriptor { 28 | private static final String SYSTEM_NAME = "wikipedia"; 29 | private static final String FACTORY_CLASS_NAME = WikipediaSystemFactory.class.getName(); 30 | private static final String HOST_KEY = "systems.%s.host"; 31 | private static final String PORT_KEY = "systems.%s.port"; 32 | 33 | private final String host; 34 | private final int port; 35 | 36 | public WikipediaSystemDescriptor(String host, int port) { 37 | super(SYSTEM_NAME, FACTORY_CLASS_NAME, null, null); 38 | this.host = host; 39 | this.port = port; 40 | } 41 | 42 | public WikipediaInputDescriptor getInputDescriptor(String streamId) { 43 | return new WikipediaInputDescriptor(streamId, this); 44 | } 45 | 46 | @Override 47 | public Map toConfig() { 48 | Map configs = super.toConfig(); 49 | configs.put(String.format(HOST_KEY, getSystemName()), host); 50 | configs.put(String.format(PORT_KEY, getSystemName()), Integer.toString(port)); 51 | return configs; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/system/WikipediaSystemFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package samza.examples.wikipedia.system; 21 | 22 | import org.apache.samza.SamzaException; 23 | import org.apache.samza.config.Config; 24 | import org.apache.samza.metrics.MetricsRegistry; 25 | import org.apache.samza.system.SystemAdmin; 26 | import org.apache.samza.system.SystemConsumer; 27 | import org.apache.samza.system.SystemFactory; 28 | import org.apache.samza.system.SystemProducer; 29 | import org.apache.samza.util.SinglePartitionWithoutOffsetsSystemAdmin; 30 | 31 | public class WikipediaSystemFactory implements SystemFactory { 32 | @Override 33 | public SystemAdmin getAdmin(String systemName, Config config) { 34 | return new SinglePartitionWithoutOffsetsSystemAdmin(); 35 | } 36 | 37 | @Override 38 | public SystemConsumer getConsumer(String systemName, Config config, MetricsRegistry registry) { 39 | String host = config.get("systems." + systemName + ".host"); 40 | int port = config.getInt("systems." + systemName + ".port"); 41 | WikipediaFeed feed = new WikipediaFeed(host, port); 42 | 43 | return new WikipediaConsumer(systemName, feed, registry); 44 | } 45 | 46 | @Override 47 | public SystemProducer getProducer(String systemName, Config config, MetricsRegistry registry) { 48 | throw new SamzaException("You can't produce to a Wikipedia feed! 
How about making some edits to a Wiki, instead?"); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/application/WikipediaZkLocalApplication.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package samza.examples.wikipedia.application; 21 | 22 | import joptsimple.OptionSet; 23 | import org.apache.samza.config.Config; 24 | import org.apache.samza.runtime.LocalApplicationRunner; 25 | import org.apache.samza.util.CommandLine; 26 | 27 | 28 | /** 29 | * An entry point for {@link WikipediaApplication} that runs in stand alone mode using zookeeper. 30 | * It waits for the job to finish; The job can also be ended by killing this process. 31 | */ 32 | public class WikipediaZkLocalApplication { 33 | 34 | /** 35 | * Executes the application using the local application runner. 
   * It takes two required command line arguments:
   *  --config job.config.loader.factory: the fully qualified class name of a config
   *    loader factory, e.g. {@link org.apache.samza.config.loaders.PropertiesConfigLoaderFactory}
   *  --config job.config.loader.properties.path: path to the application properties file
   *
   * @param args command line arguments
   */
  public static void main(String[] args) {
    // Parse --config key=value pairs from the command line into a Samza Config.
    CommandLine cmdLine = new CommandLine();
    OptionSet options = cmdLine.parser().parse(args);
    Config config = cmdLine.loadConfig(options);

    WikipediaApplication app = new WikipediaApplication();
    // Run locally (ZooKeeper-coordinated standalone mode) and block until the
    // job finishes; killing this process also ends the job.
    LocalApplicationRunner runner = new LocalApplicationRunner(app, config);
    runner.run();
    runner.waitForFinish();
  }
}
17 | 18 | # Job 19 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 20 | job.name=wikipedia-stats 21 | 22 | # YARN package path 23 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 24 | 25 | # TaskApplication class 26 | app.class=samza.examples.wikipedia.task.application.WikipediaStatsTaskApplication 27 | 28 | # Setting the window frequency in milliseconds 29 | task.window.ms=10000 30 | 31 | # Metrics 32 | metrics.reporters=snapshot,jmx 33 | metrics.reporter.snapshot.class=org.apache.samza.metrics.reporter.MetricsSnapshotReporterFactory 34 | metrics.reporter.snapshot.stream=kafka.metrics 35 | metrics.reporter.jmx.class=org.apache.samza.metrics.reporter.JmxReporterFactory 36 | 37 | # Serializers (used below in specifying the stores' serdes) 38 | serializers.registry.string.class=org.apache.samza.serializers.StringSerdeFactory 39 | serializers.registry.integer.class=org.apache.samza.serializers.IntegerSerdeFactory 40 | 41 | 42 | # Key-value storage 43 | stores.wikipedia-stats.factory=org.apache.samza.storage.kv.RocksDbKeyValueStorageEngineFactory 44 | stores.wikipedia-stats.changelog=kafka.wikipedia-stats-changelog 45 | stores.wikipedia-stats.key.serde=string 46 | stores.wikipedia-stats.msg.serde=integer 47 | 48 | # Normally, we'd leave this alone, but we have only one broker. 49 | stores.wikipedia-stats.changelog.replication.factor=1 50 | 51 | # Normally, we'd set this much higher, but we want things to look snappy in the demo. 
52 | stores.wikipedia-stats.write.batch.size=0 53 | stores.wikipedia-stats.object.cache.size=0 54 | 55 | # Config Loader 56 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 57 | job.config.loader.properties.path=./__package/config/wikipedia-stats.properties -------------------------------------------------------------------------------- /src/main/java/samza/examples/azure/data/PageViewAvroRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
package samza.examples.azure.data;

import java.io.Serializable;
import org.apache.avro.AvroRuntimeException;
import samza.examples.cookbook.data.PageView;

/**
 * Avro-serializable view of a {@link PageView}, used when page-view events
 * must be written as Avro records (e.g. to Azure Event Hubs).
 */
public class PageViewAvroRecord extends org.apache.avro.specific.SpecificRecordBase
    implements org.apache.avro.specific.SpecificRecord, Serializable {
  private static final long serialVersionUID = 1L;

  // Parsed once per class rather than once per instance; Schema.parse(String)
  // is deprecated in favor of Schema.Parser.
  public static final org.apache.avro.Schema SCHEMA = new org.apache.avro.Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"PageViewAvroRecord\",\"namespace\":\"org.apache.samza.examples.events\", \"fields\":[{\"name\": \"userId\", \"type\": \"string\"}, {\"name\": \"country\", \"type\": \"string\"}, {\"name\": \"pageId\", \"type\": \"string\"}]}");

  private String userId;
  private String country;
  private String pageId;

  /**
   * Copies the fields of a {@link PageView} into a new Avro record.
   *
   * @param pageView source event
   * @return a populated Avro record
   */
  public static PageViewAvroRecord buildPageViewRecord(PageView pageView) {
    PageViewAvroRecord record = new PageViewAvroRecord();
    record.userId = pageView.userId;
    record.country = pageView.country;
    record.pageId = pageView.pageId;
    return record;
  }

  @Override
  public org.apache.avro.Schema getSchema() {
    return SCHEMA;
  }

  /** Positional getter required by {@link org.apache.avro.specific.SpecificRecord}. */
  @Override
  public java.lang.Object get(int field) {
    switch (field) {
      case 0: return userId;
      case 1: return country;
      case 2: return pageId;
      default: throw new AvroRuntimeException("bad index");
    }
  }

  /** Positional setter required by {@link org.apache.avro.specific.SpecificRecord}. */
  @Override
  public void put(int field, Object value) {
    switch (field) {
      case 0:
        userId = (String) value; break;
      case 1:
        country = (String) value; break;
      case 2:
        pageId = (String) value; break;
      default:
        throw new AvroRuntimeException("bad index");
    }
  }
}
| @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windowz variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 
56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/task/WikipediaParserStreamTask.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. 
package samza.examples.wikipedia.task;

import java.util.Map;
import org.apache.samza.system.IncomingMessageEnvelope;
import org.apache.samza.system.OutgoingMessageEnvelope;
import org.apache.samza.system.SystemStream;
import org.apache.samza.task.MessageCollector;
import org.apache.samza.task.StreamTask;
import org.apache.samza.task.TaskCoordinator;
import samza.examples.wikipedia.model.WikipediaParser;
import samza.examples.wikipedia.system.WikipediaFeed.WikipediaFeedEvent;

/**
 * StreamTask that parses raw Wikipedia feed events into structured edit
 * events and forwards them to the kafka "wikipedia-edits" stream.
 */
public class WikipediaParserStreamTask implements StreamTask {
  private static final SystemStream OUTPUT_STREAM = new SystemStream("kafka", "wikipedia-edits");

  @SuppressWarnings("unchecked")
  @Override
  public void process(IncomingMessageEnvelope envelope, MessageCollector collector, TaskCoordinator coordinator) {
    // The incoming message is the JSON-deserialized raw feed event.
    Map<String, Object> jsonObject = (Map<String, Object>) envelope.getMessage();
    WikipediaFeedEvent event = new WikipediaFeedEvent(jsonObject);

    // parseEvent returns null when the raw line cannot be parsed; drop those.
    Map<String, Object> parsedJsonObject = WikipediaParser.parseEvent(event);

    if (parsedJsonObject != null) {
      collector.send(new OutgoingMessageEnvelope(OUTPUT_STREAM, parsedJsonObject));
    }
  }

  /** Ad-hoc manual check of the parser against two sample feed lines. */
  public static void main(String[] args) {
    String[] lines = new String[] { "[[Wikipedia talk:Articles for creation/Lords of War]] http://en.wikipedia.org/w/index.php?diff=562991653&oldid=562991567 * BBGLordsofWar * (+95) /* Lords of War: Elves versus Lizardmen */]", "[[David Shepard (surgeon)]] M http://en.wikipedia.org/w/index.php?diff=562993463&oldid=562989820 * Jacobsievers * (+115) /* American Revolution (1775�1783) */ Added to note regarding David Shepard's brothers" };

    for (String line : lines) {
      System.out.println(WikipediaParser.parseLine(line));
    }
  }
}
-------------------------------------------------------------------------------- /src/main/config/kinesis-hello-samza.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Job 19 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory 20 | job.name=kinesis-hello-samza 21 | 22 | job.systemstreampartition.grouper.factory=org.apache.samza.container.grouper.stream.AllSspToSingleTaskGrouperFactory 23 | 24 | # YARN 25 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz 26 | yarn.container.count=2 27 | 28 | # Config Loader 29 | job.config.loader.factory=org.apache.samza.config.loaders.PropertiesConfigLoaderFactory 30 | job.config.loader.properties.path=./__package/config/kinesis-hello-samza.properties 31 | 32 | # Task 33 | task.class=samza.examples.kinesis.KinesisHelloSamza 34 | # Please replace the below input stream with the stream you plan to consume from. 
35 | task.inputs=kinesis.kinesis-samza-sample-stream 36 | 37 | # Serializers 38 | serializers.registry.json.class=org.apache.samza.serializers.JsonSerdeFactory 39 | 40 | # Kinesis System 41 | systems.kinesis.samza.factory=org.apache.samza.system.kinesis.KinesisSystemFactory 42 | # Please replace the below with the region of your Kinesis data stream. 43 | systems.kinesis.streams.kinesis-samza-sample-stream.aws.region=us-west-1 44 | # Access key below is a dummy key for instructional purposes. Please replace with your own key. 45 | systems.kinesis.streams.kinesis-samza-sample-stream.aws.accessKey=AKIAIHSMRK3Q72O8TEXQ 46 | # Secret key below is a dummy key for instructional purposes. Please replace with your own key. 47 | sensitive.systems.kinesis.streams.kinesis-samza-sample-stream.aws.secretKey=9GuEqdY+gNXXGrOQyev8XKziY+sRB1ht91jloEyP 48 | systems.kinesis.streams.kinesis-samza-sample-stream.aws.kcl.TableName=kinesis-hello-samza 49 | 50 | # Kafka System 51 | systems.kafka.samza.factory=org.apache.samza.system.kafka.KafkaSystemFactory 52 | systems.kafka.samza.msg.serde=json 53 | systems.kafka.consumer.zookeeper.connect=localhost:2181/ 54 | systems.kafka.producer.bootstrap.servers=localhost:9092 55 | 56 | # Job Coordinator 57 | job.coordinator.system=kafka 58 | job.coordinator.replication.factor=1 59 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/system/WikipediaConsumer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
package samza.examples.wikipedia.system;

import java.util.ArrayList;
import java.util.List;
import org.apache.samza.Partition;
import org.apache.samza.metrics.MetricsRegistry;
import org.apache.samza.system.IncomingMessageEnvelope;
import org.apache.samza.system.SystemStreamPartition;
import org.apache.samza.util.BlockingEnvelopeMap;
import samza.examples.wikipedia.system.WikipediaFeed.WikipediaFeedEvent;
import samza.examples.wikipedia.system.WikipediaFeed.WikipediaFeedListener;

/**
 * SystemConsumer that bridges the {@link WikipediaFeed} listener callback API
 * to Samza's pull-based {@link BlockingEnvelopeMap}: registered channels are
 * subscribed on start(), and each feed event is enqueued for its stream.
 */
public class WikipediaConsumer extends BlockingEnvelopeMap implements WikipediaFeedListener {
  // Channel (stream) names registered before start(); each maps to partition 0.
  private final List<String> channels;
  private final String systemName;
  private final WikipediaFeed feed;

  public WikipediaConsumer(String systemName, WikipediaFeed feed, MetricsRegistry registry) {
    this.channels = new ArrayList<String>();
    this.systemName = systemName;
    this.feed = feed;
  }

  /** Called by the feed for every event; enqueues it on the matching SSP. */
  public void onEvent(final WikipediaFeedEvent event) {
    SystemStreamPartition systemStreamPartition = new SystemStreamPartition(systemName, event.getChannel(), new Partition(0));

    try {
      // No key or offset: the feed is an unpartitioned, offset-less stream.
      put(systemStreamPartition, new IncomingMessageEnvelope(systemStreamPartition, null, null, event));
    } catch (Exception e) {
      // Best-effort: dropping a feed event is acceptable in this sample app.
      System.err.println(e);
    }
  }

  @Override
  public void register(SystemStreamPartition systemStreamPartition, String startingOffset) {
    super.register(systemStreamPartition, startingOffset);

    // Remember the channel so start() can subscribe to it.
    channels.add(systemStreamPartition.getStream());
  }

  @Override
  public void start() {
    feed.start();

    for (String channel : channels) {
      feed.listen(channel, this);
    }
  }

  @Override
  public void stop() {
    for (String channel : channels) {
      feed.unlisten(channel, this);
    }

    feed.stop();
  }
}
title=redundancy, user=L-native, diff-bytes=-222} 5 | {summary=/* Related terms */ corrected typos, diff-url=https://en.wiktionary.org/w/index.php?diff=50611319&oldid=45143413&rcid=61308223, unparsed-flags=!, flags={is-minor=false, is-talk=false, is-bot-edit=false, is-new=false, is-unpatrolled=true, is-special=false}, channel=#en.wiktionary, source=rc-pmtpa, time=1541009726357, title=treze, user=.123.192.97, diff-bytes=1} 6 | {summary=, diff-url=https://en.wiktionary.org/w/index.php?diff=50611326&oldid=50599435&rcid=61308247, unparsed-flags=!, flags={is-minor=false, is-talk=false, is-bot-edit=false, is-new=false, is-unpatrolled=true, is-special=false}, channel=#en.wiktionary, source=rc-pmtpa, time=1541009812126, title=sol, user=HansRompel, diff-bytes=146} 7 | {summary=/* Translations */, diff-url=https://en.wiktionary.org/w/index.php?diff=50611320&oldid=50602198&rcid=61308224, unparsed-flags=!, flags={is-minor=false, is-talk=false, is-bot-edit=false, is-new=false, is-unpatrolled=true, is-special=false}, channel=#en.wiktionary, source=rc-pmtpa, time=1541009729196, title=redundancy, user=L-native, diff-bytes=-222} 8 | {summary=/* Related terms */ corrected typos, diff-url=https://en.wiktionary.org/w/index.php?diff=50611319&oldid=45143413&rcid=61308223, unparsed-flags=!, flags={is-minor=false, is-talk=false, is-bot-edit=false, is-new=false, is-unpatrolled=true, is-special=false}, channel=#en.wiktionary, source=rc-pmtpa, time=1541009726357, title=treze, user=.123.192.97, diff-bytes=1} 9 | {summary=, diff-url=https://en.wiktionary.org/w/index.php?diff=50611326&oldid=50599435&rcid=61308247, unparsed-flags=!, flags={is-minor=false, is-talk=false, is-bot-edit=false, is-new=false, is-unpatrolled=true, is-special=false}, channel=#en.wiktionary, source=rc-pmtpa, time=1541009812126, title=sol, user=HansRompel, diff-bytes=146} 10 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/model/WikipediaParser.java: 
package samza.examples.wikipedia.model;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import samza.examples.wikipedia.system.WikipediaFeed;


/**
 * Parses raw Wikipedia IRC feed lines of the form
 * {@code [[title]] flags diff-url * user * (+bytes) summary}
 * into structured maps.
 */
public class WikipediaParser {
  // Compiled once instead of on every parseLine() call.
  private static final Pattern LINE_PATTERN =
      Pattern.compile("\\[\\[(.*)\\]\\]\\s(.*)\\s(.*)\\s\\*\\s(.*)\\s\\*\\s\\(\\+?(.\\d*)\\)\\s(.*)");

  /**
   * Parses a feed event's raw line and annotates the result with the event's
   * channel, source, and time.
   *
   * @param wikipediaFeedEvent the event to parse
   * @return the parsed map, or null if the line could not be parsed
   */
  public static Map<String, Object> parseEvent(WikipediaFeed.WikipediaFeedEvent wikipediaFeedEvent) {
    Map<String, Object> parsedJsonObject = null;
    try {
      parsedJsonObject = WikipediaParser.parseLine(wikipediaFeedEvent.getRawEvent());

      parsedJsonObject.put("channel", wikipediaFeedEvent.getChannel());
      parsedJsonObject.put("source", wikipediaFeedEvent.getSource());
      parsedJsonObject.put("time", wikipediaFeedEvent.getTime());
    } catch (Exception e) {
      // Malformed lines are expected occasionally; log and return null.
      System.err.println("Unable to parse line: " + wikipediaFeedEvent);
    }

    return parsedJsonObject;
  }

  /**
   * Parses a single raw feed line.
   *
   * @param line the raw IRC feed line
   * @return map with title, user, flags, diff info, and summary
   * @throws IllegalArgumentException if the line does not match the expected format
   */
  public static Map<String, Object> parseLine(String line) {
    Matcher m = LINE_PATTERN.matcher(line);

    if (m.find() && m.groupCount() == 6) {
      String title = m.group(1);
      String flags = m.group(2);
      String diffUrl = m.group(3);
      String user = m.group(4);
      int byteDiff = Integer.parseInt(m.group(5));
      String summary = m.group(6);

      // Single-letter IRC flags; title prefixes mark special/talk pages.
      Map<String, Boolean> flagMap = new HashMap<String, Boolean>();

      flagMap.put("is-minor", flags.contains("M"));
      flagMap.put("is-new", flags.contains("N"));
      flagMap.put("is-unpatrolled", flags.contains("!"));
      flagMap.put("is-bot-edit", flags.contains("B"));
      flagMap.put("is-special", title.startsWith("Special:"));
      flagMap.put("is-talk", title.startsWith("Talk:"));

      Map<String, Object> root = new HashMap<String, Object>();

      root.put("title", title);
      root.put("user", user);
      root.put("unparsed-flags", flags);
      root.put("diff-bytes", byteDiff);
      root.put("diff-url", diffUrl);
      root.put("summary", summary);
      root.put("flags", flagMap);

      return root;
    } else {
      // Include the offending line so failures are diagnosable.
      throw new IllegalArgumentException("Unable to parse line: " + line);
    }
  }
}
package samza.examples.wikipedia.task.test;

import java.time.Duration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.samza.serializers.NoOpSerde;
import org.apache.samza.test.framework.TestRunner;
import org.apache.samza.test.framework.system.descriptors.InMemoryInputDescriptor;
import org.apache.samza.test.framework.system.descriptors.InMemoryOutputDescriptor;
import org.apache.samza.test.framework.system.descriptors.InMemorySystemDescriptor;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.Assert;
import org.junit.Test;
import samza.examples.wikipedia.system.WikipediaFeed.WikipediaFeedEvent;
import samza.examples.wikipedia.task.application.WikipediaParserTaskApplication;

/**
 * Verifies that {@link WikipediaParserTaskApplication} parses one raw feed
 * sample into one output edit event, using Samza's in-memory test framework.
 */
public class TestWikipediaTask {

  @Test
  public void testWikipediaFeedTask() throws Exception {
    String[] wikipediaFeedSamples = new String[] { "{\"channel\":\"#en.wikipedia\",\"raw\":\"[[Fear Is the Key (song)]] https://en.wikipedia.org/w/index.php?diff=865574761&oldid=861177329 * Sam Sailor * (+46) Redirecting to [[Fear of the Dark (Iron Maiden album)]] ([[User:Sam Sailor/Scripts/Sagittarius+|♐]])\",\"time\":1540408899419,\"source\":\"rc-pmtpa\"}" };

    InMemorySystemDescriptor isd = new InMemorySystemDescriptor("kafka");

    InMemoryInputDescriptor<Map<String, Object>> rawWikiEvents = isd
        .getInputDescriptor("wikipedia-raw", new NoOpSerde<>());

    InMemoryOutputDescriptor<Map<String, Object>> outputStreamDesc = isd
        .getOutputDescriptor("wikipedia-edits", new NoOpSerde<>());

    // Feed the sample in, run the app briefly, then count output messages.
    TestRunner
        .of(new WikipediaParserTaskApplication())
        .addInputStream(rawWikiEvents, parseJSONToMap(wikipediaFeedSamples))
        .addOutputStream(outputStreamDesc, 1)
        .run(Duration.ofSeconds(2));

    Assert.assertEquals(1,
        TestRunner.consumeStream(outputStreamDesc, Duration.ofSeconds(1)).get(0).size());
  }

  /**
   * Deserializes each JSON sample line into a map via Jackson.
   *
   * @param lines raw JSON strings
   * @return one map per input line
   */
  public static List<Map<String, Object>> parseJSONToMap(String[] lines) throws Exception {
    List<Map<String, Object>> wikiRawEvents = new ArrayList<>();
    ObjectMapper mapper = new ObjectMapper();
    for (String line : lines) {
      wikiRawEvents.add(mapper.readValue(line, HashMap.class));
    }
    return wikiRawEvents;
  }
}
package samza.examples.wikipedia.task.application;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.List;
import java.util.Map;
import org.apache.samza.application.TaskApplication;
import org.apache.samza.application.descriptors.TaskApplicationDescriptor;
import org.apache.samza.serializers.JsonSerde;
import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor;
import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor;
import org.apache.samza.task.StreamTaskFactory;
import samza.examples.wikipedia.task.WikipediaStatsStreamTask;


/**
 * TaskApplication that wires {@link WikipediaStatsStreamTask} to read parsed
 * edits from the kafka "wikipedia-edits" topic and write aggregated stats to
 * "wikipedia-stats".
 */
public class WikipediaStatsTaskApplication implements TaskApplication {

  private static final List<String> KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181");
  private static final List<String> KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092");
  // Single-broker local demo setup, hence replication factor 1.
  private static final Map<String, String> KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1");

  @Override
  public void describe(TaskApplicationDescriptor taskApplicationDescriptor) {

    // Define a system descriptor for Kafka
    KafkaSystemDescriptor kafkaSystemDescriptor = new KafkaSystemDescriptor("kafka")
        .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT)
        .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS)
        .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS);

    // Input descriptor for the wikipedia-edits topic
    KafkaInputDescriptor<Map<String, Object>> kafkaInputDescriptor =
        kafkaSystemDescriptor.getInputDescriptor("wikipedia-edits", new JsonSerde<>());

    // Set the default system descriptor to Kafka, so that it is used for all
    // internal resources, e.g., kafka topic for checkpointing, coordinator stream.
    taskApplicationDescriptor.withDefaultSystem(kafkaSystemDescriptor);

    // Set the input
    taskApplicationDescriptor.withInputStream(kafkaInputDescriptor);

    // Set the output
    taskApplicationDescriptor.withOutputStream(
        kafkaSystemDescriptor.getOutputDescriptor("wikipedia-stats", new JsonSerde<>()));

    // Set the task factory; a constructor reference replaces the explicit lambda.
    taskApplicationDescriptor.withTaskFactory((StreamTaskFactory) WikipediaStatsStreamTask::new);
  }
}
package samza.examples.wikipedia.task.application;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.List;
import java.util.Map;
import org.apache.samza.application.TaskApplication;
import org.apache.samza.application.descriptors.TaskApplicationDescriptor;
import org.apache.samza.serializers.JsonSerde;
import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor;
import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor;
import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor;
import org.apache.samza.task.StreamTaskFactory;
import samza.examples.wikipedia.task.WikipediaParserStreamTask;


/**
 * Low-level Task API application that reads raw wikipedia events from the
 * "wikipedia-raw" Kafka topic, parses them with {@link WikipediaParserStreamTask},
 * and writes the structured result to the "wikipedia-edits" topic.
 */
public class WikipediaParserTaskApplication implements TaskApplication {

  private static final List<String> KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181");
  private static final List<String> KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092");
  private static final Map<String, String> KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1");

  @Override
  public void describe(TaskApplicationDescriptor taskApplicationDescriptor) {
    // Kafka is both the input and the output system; it is also the default
    // system for internal resources (checkpoint topic, coordinator stream).
    KafkaSystemDescriptor kafka = new KafkaSystemDescriptor("kafka")
        .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT)
        .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS)
        .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS);
    taskApplicationDescriptor.withDefaultSystem(kafka);

    // wikipedia-raw in, wikipedia-edits out, both JSON-serialized.
    KafkaInputDescriptor rawEvents = kafka.getInputDescriptor("wikipedia-raw", new JsonSerde<>());
    KafkaOutputDescriptor parsedEdits = kafka.getOutputDescriptor("wikipedia-edits", new JsonSerde<>());

    taskApplicationDescriptor.withInputStream(rawEvents);
    taskApplicationDescriptor.withOutputStream(parsedEdits);

    // Each task instance runs a WikipediaParserStreamTask.
    taskApplicationDescriptor.withTaskFactory((StreamTaskFactory) WikipediaParserStreamTask::new);
  }
}
package samza.examples.test.utils;

import com.google.common.io.Resources;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.stream.Collectors;
import org.codehaus.jackson.map.ObjectMapper;
import samza.examples.cookbook.data.PageView;
import samza.examples.wikipedia.application.WikipediaApplication;

import static samza.examples.wikipedia.system.WikipediaFeed.WikipediaFeedEvent;


/**
 * Shared fixtures for the example tests: canned wikipedia feed events read from
 * test resources, and a small set of sample PageView records.
 */
public class TestUtils {

  /**
   * Loads the canned edit events for the given wikipedia IRC channel and parses
   * each JSON line into a {@link WikipediaFeedEvent}. Unparseable lines are
   * logged and skipped.
   *
   * @param channel one of the WikipediaApplication.*_CHANNEL constants
   * @return parsed feed events, in file order
   * @throws IllegalArgumentException if the channel is not one of the known constants
   *         (previously an unknown channel left the list null and caused an NPE below)
   */
  public static List<WikipediaFeedEvent> genWikipediaFeedEvents(String channel) {
    List<String> wikiEvents;
    switch (channel) {
      case WikipediaApplication.WIKIPEDIA_CHANNEL:
        wikiEvents = readFile("WikipediaEditEvents.txt");
        break;

      case WikipediaApplication.WIKINEWS_CHANNEL:
        wikiEvents = readFile("WikinewsEditEvents.txt");
        break;

      case WikipediaApplication.WIKTIONARY_CHANNEL:
        // NOTE: the resource file really is named "WikitionaryEditEvents.txt" (sic);
        // do not "fix" the spelling without renaming the resource.
        wikiEvents = readFile("WikitionaryEditEvents.txt");
        break;

      default:
        throw new IllegalArgumentException("Unknown wikipedia channel: " + channel);
    }
    ObjectMapper mapper = new ObjectMapper();
    return wikiEvents.stream().map(event -> {
      try {
        return new WikipediaFeedEvent(mapper.readValue(event, HashMap.class));
      } catch (Exception e) {
        // Best-effort: skip malformed lines rather than failing the whole test.
        e.printStackTrace();
      }
      return null;
    }).filter(x -> x != null).collect(Collectors.toList());
  }

  /**
   * Builds a small, fixed set of PageView events used by the cookbook tests.
   */
  public static List<PageView> genSamplePageViewData() {
    List<PageView> pageViewEvents = new ArrayList<>();
    pageViewEvents.add(new PageView("google.com/home", "user1", "india"));
    pageViewEvents.add(new PageView("google.com/search", "user1", "india"));
    pageViewEvents.add(new PageView("yahoo.com/home", "user2", "china"));
    pageViewEvents.add(new PageView("yahoo.com/search", "user2", "china"));
    pageViewEvents.add(new PageView("google.com/news", "user1", "india"));
    pageViewEvents.add(new PageView("yahoo.com/fashion", "user2", "china"));
    return pageViewEvents;
  }

  /**
   * Reads a classpath resource line by line.
   *
   * <p>Fix: the reader is now closed via try-with-resources (it previously leaked
   * on exception), and an IOException yields the lines read so far instead of
   * null, which made callers NPE.
   */
  private static List<String> readFile(String path) {
    List<String> lines = new ArrayList<>();
    try (InputStream in = Resources.getResource(path).openStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
      String line;
      while ((line = reader.readLine()) != null) {
        lines.add(line);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return lines;
  }
}
package samza.examples.wikipedia.application.test;

import java.time.Duration;
import java.util.HashMap;
import java.util.Map;
import org.apache.samza.serializers.NoOpSerde;
import org.apache.samza.test.framework.TestRunner;
import org.apache.samza.test.framework.system.descriptors.InMemoryInputDescriptor;
import org.apache.samza.test.framework.system.descriptors.InMemoryOutputDescriptor;
import org.apache.samza.test.framework.system.descriptors.InMemorySystemDescriptor;
import org.junit.Assert;
import org.junit.Test;
import samza.examples.wikipedia.application.WikipediaApplication;
import samza.examples.test.utils.TestUtils;


/**
 * End-to-end test of {@link WikipediaApplication} using Samza's in-memory test
 * framework: canned feed events go in on three channels, and the test asserts
 * that at least one stats record comes out.
 */
public class TestWikipediaApplication {

  @Test
  public void testWikipediaApplication() throws Exception {

    InMemorySystemDescriptor wikipediaSystem = new InMemorySystemDescriptor("wikipedia");

    // These configs must be removed once the examples are refactored to use the Table API.
    Map<String, String> conf = new HashMap<>();
    conf.put("stores.wikipedia-stats.factory", "org.apache.samza.storage.kv.RocksDbKeyValueStorageEngineFactory");
    conf.put("stores.wikipedia-stats.key.serde", "string");
    conf.put("stores.wikipedia-stats.msg.serde", "integer");
    conf.put("serializers.registry.string.class", "org.apache.samza.serializers.StringSerdeFactory");
    conf.put("serializers.registry.integer.class", "org.apache.samza.serializers.IntegerSerdeFactory");

    // One in-memory input per IRC channel, mapped to the channel's physical name.
    InMemoryInputDescriptor wikipediaInput =
        wikipediaSystem.getInputDescriptor("en-wikipedia", new NoOpSerde<>())
            .withPhysicalName(WikipediaApplication.WIKIPEDIA_CHANNEL);
    InMemoryInputDescriptor wiktionaryInput =
        wikipediaSystem.getInputDescriptor("en-wiktionary", new NoOpSerde<>())
            .withPhysicalName(WikipediaApplication.WIKTIONARY_CHANNEL);
    InMemoryInputDescriptor wikiNewsInput =
        wikipediaSystem.getInputDescriptor("en-wikinews", new NoOpSerde<>())
            .withPhysicalName(WikipediaApplication.WIKINEWS_CHANNEL);

    // The application writes its stats to the (in-memory) "kafka" system.
    InMemorySystemDescriptor kafkaSystem = new InMemorySystemDescriptor("kafka");
    InMemoryOutputDescriptor statsOutput = kafkaSystem.getOutputDescriptor("wikipedia-stats", new NoOpSerde<>());

    TestRunner.of(new WikipediaApplication())
        .addInputStream(wikipediaInput, TestUtils.genWikipediaFeedEvents(WikipediaApplication.WIKIPEDIA_CHANNEL))
        .addInputStream(wiktionaryInput, TestUtils.genWikipediaFeedEvents(WikipediaApplication.WIKTIONARY_CHANNEL))
        .addInputStream(wikiNewsInput, TestUtils.genWikipediaFeedEvents(WikipediaApplication.WIKINEWS_CHANNEL))
        .addOutputStream(statsOutput, 1)
        .addConfig(conf)
        .addConfig("deploy.test", "true")
        .run(Duration.ofMinutes(1));

    // At least one aggregated stats message must have been produced.
    Assert.assertTrue(TestRunner.consumeStream(statsOutput, Duration.ofMillis(100)).get(0).size() > 0);
  }

}
package samza.examples.wikipedia.task;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.samza.context.Context;
import org.apache.samza.context.TaskContext;
import org.apache.samza.metrics.Counter;
import org.apache.samza.storage.kv.KeyValueStore;
import org.apache.samza.system.IncomingMessageEnvelope;
import org.apache.samza.system.OutgoingMessageEnvelope;
import org.apache.samza.system.SystemStream;
import org.apache.samza.task.InitableTask;
import org.apache.samza.task.MessageCollector;
import org.apache.samza.task.StreamTask;
import org.apache.samza.task.TaskCoordinator;
import org.apache.samza.task.WindowableTask;

/**
 * Aggregates wikipedia edit statistics per window (edit count, bytes added,
 * unique titles, per-flag counts) and emits them to the "wikipedia-stats"
 * Kafka topic. An all-time edit count is kept in a durable key-value store.
 */
public class WikipediaStatsStreamTask implements StreamTask, InitableTask, WindowableTask {
  private static final SystemStream OUTPUT_STREAM = new SystemStream("kafka", "wikipedia-stats");

  // Per-window aggregates; reset at the end of each window().
  private int edits = 0;
  private int byteDiff = 0;
  private Set<String> titles = new HashSet<String>();
  private Map<String, Integer> counts = new HashMap<String, Integer>();
  // Durable all-time edit count, persisted across windows and restarts.
  private KeyValueStore<String, Integer> store;

  // Example metric. Running counter of the number of repeat edits of the same title within a single window.
  private Counter repeatEdits;

  @Override
  public void init(Context context) {
    TaskContext taskContext = context.getTaskContext();
    this.store = (KeyValueStore<String, Integer>) taskContext.getStore("wikipedia-stats");
    this.repeatEdits = taskContext.getTaskMetricsRegistry().newCounter("edit-counters", "repeat-edits");
  }

  @SuppressWarnings("unchecked")
  @Override
  public void process(IncomingMessageEnvelope envelope, MessageCollector collector, TaskCoordinator coordinator) {
    Map<String, Object> edit = (Map<String, Object>) envelope.getMessage();
    Map<String, Boolean> flags = (Map<String, Boolean>) edit.get("flags");

    // Bump the durable all-time edit counter.
    Integer editsAllTime = store.get("count-edits-all-time");
    if (editsAllTime == null) {
      editsAllTime = 0;
    }
    store.put("count-edits-all-time", editsAllTime + 1);

    edits += 1;
    byteDiff += (Integer) edit.get("diff-bytes");
    boolean newTitle = titles.add((String) edit.get("title"));

    // Count how often each flag is set within the window.
    // Fix: the previous compute() mapped a flag's FIRST occurrence to 0
    // (v == null ? 0 : v + 1), undercounting every flag by one; merge()
    // correctly starts the count at 1.
    for (Map.Entry<String, Boolean> flag : flags.entrySet()) {
      if (Boolean.TRUE.equals(flag.getValue())) {
        counts.merge(flag.getKey(), 1, Integer::sum);
      }
    }

    if (!newTitle) {
      repeatEdits.inc();
    }
  }

  @Override
  public void window(MessageCollector collector, TaskCoordinator coordinator) {
    counts.put("edits", edits);
    counts.put("bytes-added", byteDiff);
    counts.put("unique-titles", titles.size());
    counts.put("edits-all-time", store.get("count-edits-all-time"));

    collector.send(new OutgoingMessageEnvelope(OUTPUT_STREAM, counts));

    // Reset counts after windowing.
    edits = 0;
    byteDiff = 0;
    titles = new HashSet<String>();
    counts = new HashMap<String, Integer>();
  }
}
package samza.examples.wikipedia.task.application;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.List;
import java.util.Map;
import org.apache.samza.application.TaskApplication;
import org.apache.samza.application.descriptors.TaskApplicationDescriptor;
import org.apache.samza.serializers.JsonSerde;
import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor;
import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor;
import org.apache.samza.task.StreamTaskFactory;
import samza.examples.wikipedia.system.descriptors.WikipediaInputDescriptor;
import samza.examples.wikipedia.system.descriptors.WikipediaSystemDescriptor;
import samza.examples.wikipedia.task.WikipediaFeedStreamTask;


/**
 * This TaskApplication consumes events from the wikipedia, wiktionary, and wikinews
 * IRC feeds and merges them into a single Kafka topic called "wikipedia-raw".
 */
public class WikipediaFeedTaskApplication implements TaskApplication {

  private static final List<String> KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181");
  private static final List<String> KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092");
  private static final Map<String, String> KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1");

  @Override
  public void describe(TaskApplicationDescriptor taskApplicationDescriptor) {

    // The Wikimedia IRC server is the source system.
    WikipediaSystemDescriptor wikipediaSystem = new WikipediaSystemDescriptor("irc.wikimedia.org", 6667);

    // One input stream per IRC channel.
    WikipediaInputDescriptor wikipediaInput =
        wikipediaSystem.getInputDescriptor("en-wikipedia").withChannel("#en.wikipedia");
    WikipediaInputDescriptor wiktionaryInput =
        wikipediaSystem.getInputDescriptor("en-wiktionary").withChannel("#en.wiktionary");
    WikipediaInputDescriptor wikiNewsInput =
        wikipediaSystem.getInputDescriptor("en-wikinews").withChannel("#en.wikinews");

    // Kafka is the output system and, as the default system, also hosts internal
    // resources (e.g. checkpoint topic, coordinator stream).
    KafkaSystemDescriptor kafka = new KafkaSystemDescriptor("kafka")
        .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT)
        .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS)
        .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS);
    taskApplicationDescriptor.withDefaultSystem(kafka);

    // Merged output: all three channels land in wikipedia-raw as JSON.
    KafkaOutputDescriptor rawOutput = kafka.getOutputDescriptor("wikipedia-raw", new JsonSerde<>());

    taskApplicationDescriptor.withInputStream(wikipediaInput);
    taskApplicationDescriptor.withInputStream(wiktionaryInput);
    taskApplicationDescriptor.withInputStream(wikiNewsInput);
    taskApplicationDescriptor.withOutputStream(rawOutput);

    // Each task instance runs a WikipediaFeedStreamTask.
    taskApplicationDescriptor.withTaskFactory((StreamTaskFactory) WikipediaFeedStreamTask::new);
  }
}
18 | */ 19 | 20 | package samza.examples.azure; 21 | 22 | import org.apache.samza.application.StreamApplication; 23 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 24 | import org.apache.samza.operators.KV; 25 | import org.apache.samza.operators.MessageStream; 26 | import org.apache.samza.operators.OutputStream; 27 | import org.apache.samza.serializers.StringSerde; 28 | import org.apache.samza.system.SystemStreamMetadata; 29 | import org.apache.samza.system.eventhub.descriptors.EventHubsInputDescriptor; 30 | import org.apache.samza.system.eventhub.descriptors.EventHubsOutputDescriptor; 31 | import org.apache.samza.system.eventhub.descriptors.EventHubsSystemDescriptor; 32 | 33 | 34 | public class AzureApplication implements StreamApplication { 35 | // Stream names 36 | private static final String INPUT_STREAM_ID = "input-stream"; 37 | private static final String OUTPUT_STREAM_ID = "output-stream"; 38 | 39 | // These properties could be configured here or in azure-application-local-runner.properties 40 | // Keep in mind that the .properties file will be overwrite properties defined here with Descriptors 41 | private static final String EVENTHUBS_NAMESPACE = "my-eventhubs-namespace"; 42 | 43 | // Upstream and downstream Event Hubs entity names 44 | private static final String EVENTHUBS_INPUT_ENTITY = "my-input-entity"; 45 | private static final String EVENTHUBS_OUTPUT_ENTITY = "my-output-entity"; 46 | 47 | // You may define your own config properties in azure-application-local-runner.properties and retrieve them 48 | // in the StreamApplicationDescriptor. Prefix them with 'sensitive.' to avoid logging them. 
49 | private static final String EVENTHUBS_SAS_KEY_NAME_CONFIG = "sensitive.eventhubs.sas.key.name"; 50 | private static final String EVENTHUBS_SAS_KEY_TOKEN_CONFIG = "sensitive.eventhubs.sas.token"; 51 | 52 | @Override 53 | public void describe(StreamApplicationDescriptor appDescriptor) { 54 | // Define your system here 55 | EventHubsSystemDescriptor systemDescriptor = new EventHubsSystemDescriptor("eventhubs"); 56 | 57 | // Choose your serializer/deserializer for the EventData payload 58 | StringSerde serde = new StringSerde(); 59 | 60 | // Define the input and output descriptors with respective configs 61 | EventHubsInputDescriptor> inputDescriptor = 62 | systemDescriptor.getInputDescriptor(INPUT_STREAM_ID, EVENTHUBS_NAMESPACE, EVENTHUBS_INPUT_ENTITY, serde) 63 | .withSasKeyName(appDescriptor.getConfig().get(EVENTHUBS_SAS_KEY_NAME_CONFIG)) 64 | .withSasKey(appDescriptor.getConfig().get(EVENTHUBS_SAS_KEY_TOKEN_CONFIG)); 65 | 66 | EventHubsOutputDescriptor> outputDescriptor = 67 | systemDescriptor.getOutputDescriptor(OUTPUT_STREAM_ID, EVENTHUBS_NAMESPACE, EVENTHUBS_OUTPUT_ENTITY, serde) 68 | .withSasKeyName(appDescriptor.getConfig().get(EVENTHUBS_SAS_KEY_NAME_CONFIG)) 69 | .withSasKey(appDescriptor.getConfig().get(EVENTHUBS_SAS_KEY_TOKEN_CONFIG)); 70 | 71 | // Define the input and output streams with descriptors 72 | MessageStream> eventhubInput = appDescriptor.getInputStream(inputDescriptor); 73 | OutputStream> eventhubOutput = appDescriptor.getOutputStream(outputDescriptor); 74 | 75 | // Define the execution flow with the high-level API 76 | eventhubInput 77 | .map((message) -> { 78 | System.out.println("Sending: "); 79 | System.out.println("Received Key: " + message.getKey()); 80 | System.out.println("Received Message: " + message.getValue()); 81 | return message; 82 | }) 83 | .sendTo(eventhubOutput); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/FilterExample.java: 
package samza.examples.cookbook;

import org.apache.samza.application.StreamApplication;
import org.apache.samza.application.descriptors.StreamApplicationDescriptor;
import org.apache.samza.operators.KV;
import org.apache.samza.operators.MessageStream;
import org.apache.samza.operators.OutputStream;
import org.apache.samza.serializers.JsonSerdeV2;
import org.apache.samza.serializers.KVSerde;
import org.apache.samza.serializers.StringSerde;
import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor;
import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor;
import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import samza.examples.cookbook.data.PageView;

import java.util.List;
import java.util.Map;

/**
 * In this example, we demonstrate filtering out some bad events in the stream.
 *
 * <p>Concepts covered: Using stateless operators on a stream.
 *
 * <p>To run the below example:
 * <ol>
 *   <li>Ensure that the topic "pageview-filter-input" is created:<br>
 *     ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic pageview-filter-input --partitions 2 --replication-factor 1</li>
 *   <li>Run the application using the run-app.sh script:<br>
 *     ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/filter-example.properties</li>
 *   <li>Produce some messages to the "pageview-filter-input" topic:<br>
 *     ./deploy/kafka/bin/kafka-console-producer.sh --topic pageview-filter-input --broker-list localhost:9092<br>
 *     {"userId": "user1", "country": "india", "pageId":"google.com"}<br>
 *     {"userId": "invalidUserId", "country": "france", "pageId":"facebook.com"}<br>
 *     {"userId": "user2", "country": "china", "pageId":"yahoo.com"}</li>
 *   <li>Consume messages from the "pageview-filter-output" topic:<br>
 *     ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic pageview-filter-output --property print.key=true</li>
 * </ol>
 */
public class FilterExample implements StreamApplication {
  private static final String KAFKA_SYSTEM_NAME = "kafka";
  private static final List<String> KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181");
  private static final List<String> KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092");
  private static final Map<String, String> KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1");

  private static final String INPUT_STREAM_ID = "pageview-filter-input";
  private static final String OUTPUT_STREAM_ID = "pageview-filter-output";
  private static final String INVALID_USER_ID = "invalidUserId";

  @Override
  public void describe(StreamApplicationDescriptor appDescriptor) {
    KafkaSystemDescriptor kafka = new KafkaSystemDescriptor(KAFKA_SYSTEM_NAME)
        .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT)
        .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS)
        .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS);
    appDescriptor.withDefaultSystem(kafka);

    // Key: userId (String); value: the PageView event as JSON.
    KVSerde pageViewSerde = KVSerde.of(new StringSerde(), new JsonSerdeV2<>(PageView.class));

    KafkaInputDescriptor inputDescriptor = kafka.getInputDescriptor(INPUT_STREAM_ID, pageViewSerde);
    KafkaOutputDescriptor outputDescriptor = kafka.getOutputDescriptor(OUTPUT_STREAM_ID, pageViewSerde);

    MessageStream<KV<String, PageView>> pageViews = appDescriptor.getInputStream(inputDescriptor);
    OutputStream<KV<String, PageView>> filteredPageViews = appDescriptor.getOutputStream(outputDescriptor);

    // Drop events from the known-bad user id; forward everything else unchanged.
    pageViews
        .filter(kv -> !INVALID_USER_ID.equals(kv.value.userId))
        .sendTo(filteredPageViews);
  }
}
##############################################################################
##
##  Gradle start up script for UN*X
##
##############################################################################

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""

APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"

warn ( ) {
    echo "$*"
}

die ( ) {
    echo
    echo "$*"
    echo
    exit 1
}

# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
case "`uname`" in
  CYGWIN* )
    cygwin=true
    ;;
  Darwin* )
    darwin=true
    ;;
  MINGW* )
    msys=true
    ;;
esac

# For Cygwin, ensure paths are in UNIX format before anything is touched.
if $cygwin ; then
    [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
fi

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
    ls=`ls -ld "$PRG"`
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        PRG="$link"
    else
        PRG=`dirname "$PRG"`"/$link"
    fi
done
SAVED="`pwd`"
# Fix: redirect to /dev/null instead of ">&-"; closing stdout breaks some
# shells (e.g. ksh/posh) — same change later made in the upstream wrapper.
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null

CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar

# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD="$JAVA_HOME/jre/sh/java"
    else
        JAVACMD="$JAVA_HOME/bin/java"
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    JAVACMD="java"
    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi

# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
    MAX_FD_LIMIT=`ulimit -H -n`
    if [ $? -eq 0 ] ; then
        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
            MAX_FD="$MAX_FD_LIMIT"
        fi
        ulimit -n $MAX_FD
        if [ $? -ne 0 ] ; then
            warn "Could not set maximum file descriptor limit: $MAX_FD"
        fi
    else
        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
    fi
fi

# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi

# For Cygwin, switch paths to Windows format before running java
if $cygwin ; then
    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`

    # We build the pattern for arguments to be converted via cygpath
    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
    SEP=""
    for dir in $ROOTDIRSRAW ; do
        ROOTDIRS="$ROOTDIRS$SEP$dir"
        SEP="|"
    done
    OURCYGPATTERN="(^($ROOTDIRS))"
    # Add a user-defined pattern to the cygpath arguments
    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
    fi
    # Now convert the arguments - kludge to limit ourselves to /bin/sh
    i=0
    for arg in "$@" ; do
        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
        CHECK2=`echo "$arg"|egrep -c "^-"`    ### Determine if an option

        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then    ### Added a condition
            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
        else
            eval `echo args$i`="\"$arg\""
        fi
        i=$((i+1))
    done
    case $i in
        (0) set -- ;;
        (1) set -- "$args0" ;;
        (2) set -- "$args0" "$args1" ;;
        (3) set -- "$args0" "$args1" "$args2" ;;
        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
    esac
fi

# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
function splitJvmOpts() {
    JVM_OPTS=("$@")
}
eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"

exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
Questions about [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) are welcome on the [dev list](http://samza.apache.org/community/mailing-lists.html) and the [Samza JIRA](https://issues.apache.org/jira/browse/SAMZA) has a hello-samza component for filing tickets. 9 | 10 | ### Instructions 11 | 12 | The **Hello Samza** project contains example Samza applications of high-level API as well as low-level API. The following are the instructions to install the binaries and run the applications in a local Yarn cluster. See also [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) and [Hello Samza High Level API](http://samza.apache.org/learn/tutorials/latest/hello-samza-high-level-yarn.html) for more information. 13 | 14 | #### 1. Get the Code 15 | 16 | Check out the hello-samza project: 17 | 18 | ``` 19 | git clone https://gitbox.apache.org/repos/asf/samza-hello-samza.git hello-samza 20 | cd hello-samza 21 | ``` 22 | 23 | To build hello-samza with the latest Samza master, you can switch to the _latest_ branch: 24 | 25 | ``` 26 | git checkout latest 27 | ``` 28 | 29 | This project contains everything you'll need to run your first Samza application. 30 | 31 | #### 2. Start a Grid 32 | 33 | A Samza grid usually comprises three different systems: [YARN](http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html), [Kafka](http://kafka.apache.org/), and [ZooKeeper](http://zookeeper.apache.org/). The hello-samza project comes with a script called "grid" to help you setup these systems. Start by running: 34 | 35 | ``` 36 | ./bin/grid bootstrap 37 | ``` 38 | 39 | This command will download, install, and start ZooKeeper, Kafka, and YARN. It will also check out the latest version of Samza and build it. All package files will be put in a sub-directory called "deploy" inside hello-samza's root folder. 
40 | 41 | If you get a complaint that _JAVA_HOME_ is not set, then you'll need to set it to the path where Java is installed on your system. 42 | 43 | Once the grid command completes, you can verify that YARN is up and running by going to [http://localhost:8088](http://localhost:8088). This is the YARN UI. 44 | 45 | #### 3. Build a Samza Application Package 46 | 47 | Before you can run a Samza application, you need to build a package for it. This package is what YARN uses to deploy your apps on the grid. Use the following command in hello-samza project to build and deploy the example applications: 48 | 49 | ``` 50 | ./bin/deploy.sh 51 | ``` 52 | 53 | #### 4. Run a Samza Application 54 | 55 | After you've built your Samza package, you can start the example applications on the grid. 56 | 57 | ##### - High-level API Examples 58 | 59 | Package [samza.examples.cookbook](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/cookbook) contains various examples of high-level API operator usage, such as map, partitionBy, window and join. Each example is a runnable Samza application with the steps in the class javadocs, e.g [PageViewAdClickJoiner](https://github.com/apache/samza-hello-samza/blob/master/src/main/java/samza/examples/cookbook/JoinExample.java). 60 | 61 | Package [samza.examples.wikipedia.application](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/wikipedia/application) contains a small Samza application which consumes the real-time feeds from Wikipedia, extracts the metadata of the events, and calculates statistics of all edits in a 10-second window. 
You can start the app on the grid using the run-app.sh script: 62 | 63 | ``` 64 | ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/wikipedia-application.properties 65 | ``` 66 | 67 | Once the job is started, we can tail the kafka topic by: 68 | 69 | ``` 70 | ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic wikipedia-stats 71 | ``` 72 | 73 | A code walkthrough of this application can be found [here](http://samza.apache.org/learn/tutorials/latest/hello-samza-high-level-code.html). 74 | 75 | ##### - Low-level API Examples 76 | 77 | Package [samza.examples.wikipedia.task](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/wikipedia/task) contains the low-level API Samza code for the Wikipedia example. To run it, use the following scripts: 78 | 79 | ``` 80 | deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/wikipedia-feed.properties 81 | deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/wikipedia-parser.properties 82 | deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/wikipedia-stats.properties 83 | ``` 84 | 85 | Once the jobs are started, you can use the same _kafka-console-consumer.sh_ command as in the high-level API Wikipedia example to check out the output of the statistics. 86 | 87 | #### 5. Run all the examples as Integration Test 88 | 89 | Every example above are ran with a few messages as Integration test using TestRunner API. You can find all the testing samples in [src/test/java](https://github.com/apache/samza-hello-samza/tree/master/src/test/java). 
To run them, use: 90 | 91 | ``` 92 | mvn clean package 93 | ``` 94 | 95 | To run a single example as a test, use: 96 | 97 | ``` 98 | mvn test -Dtest=<TestClassName> 99 | ``` 100 | 101 | ### Contribution 102 | 103 | To start contributing on [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) first read [Rules](http://samza.apache.org/contribute/rules.html) and [Contributor Corner](https://cwiki.apache.org/confluence/display/SAMZA/Contributor%27s+Corner). Notice that the [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) git repository does not support git pull requests. 104 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/SessionWindowExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License.
18 | */ 19 | package samza.examples.cookbook; 20 | 21 | import java.io.Serializable; 22 | import org.apache.samza.application.StreamApplication; 23 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 24 | import org.apache.samza.operators.KV; 25 | import org.apache.samza.operators.MessageStream; 26 | import org.apache.samza.operators.OutputStream; 27 | import org.apache.samza.operators.windows.Windows; 28 | import org.apache.samza.serializers.JsonSerdeV2; 29 | import org.apache.samza.serializers.KVSerde; 30 | import org.apache.samza.serializers.Serde; 31 | import org.apache.samza.serializers.StringSerde; 32 | import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor; 33 | import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor; 34 | import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor; 35 | 36 | import com.google.common.collect.ImmutableList; 37 | import com.google.common.collect.ImmutableMap; 38 | import samza.examples.cookbook.data.PageView; 39 | import samza.examples.cookbook.data.UserPageViews; 40 | 41 | import java.time.Duration; 42 | import java.util.List; 43 | import java.util.Map; 44 | 45 | /** 46 | * In this example, we group page views by userId into sessions, and compute the number of page views for each user 47 | * session. A session is considered closed when there is no user activity for a 10 second duration. 48 | * 49 | *

Concepts covered: Using session windows to group data in a stream, Re-partitioning a stream. 50 | * 51 | * To run the below example: 52 | * 53 | *

    54 | *
  1. 55 | * Ensure that the topic "pageview-session-input" is created
    56 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic pageview-session-input --partitions 2 --replication-factor 1 57 | *
  2. 58 | *
  3. 59 | * Run the application using the run-app.sh script
    60 | * ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/session-window-example.properties 61 | *
  4. 62 | *
  5. 63 | * Produce some messages to the "pageview-session-input" topic
    64 | * ./deploy/kafka/bin/kafka-console-producer.sh --topic pageview-session-input --broker-list localhost:9092
    65 | * {"userId": "user1", "country": "india", "pageId":"google.com/home"}
    66 | * {"userId": "user1", "country": "india", "pageId":"google.com/search"}
    67 | * {"userId": "user2", "country": "china", "pageId":"yahoo.com/home"}
    68 | * {"userId": "user2", "country": "china", "pageId":"yahoo.com/sports"}
    69 | * {"userId": "user1", "country": "india", "pageId":"google.com/news"}
    70 | * {"userId": "user2", "country": "china", "pageId":"yahoo.com/fashion"} 71 | *
  6. 72 | *
  7. 73 | * Consume messages from the "pageview-session-output" topic (e.g. bin/kafka-console-consumer.sh) 74 | * ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic pageview-session-output --property print.key=true 75 | *
  8. 76 | *
77 | * 78 | */ 79 | public class SessionWindowExample implements StreamApplication, Serializable { 80 | private static final String KAFKA_SYSTEM_NAME = "kafka"; 81 | private static final List KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181"); 82 | private static final List KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092"); 83 | private static final Map KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1"); 84 | 85 | private static final String INPUT_STREAM_ID = "pageview-session-input"; 86 | private static final String OUTPUT_STREAM_ID = "pageview-session-output"; 87 | 88 | @Override 89 | public void describe(StreamApplicationDescriptor appDescriptor) { 90 | Serde stringSerde = new StringSerde(); 91 | KVSerde pageViewKVSerde = KVSerde.of(stringSerde, new JsonSerdeV2<>(PageView.class)); 92 | KVSerde userPageViewSerde = KVSerde.of(stringSerde, new JsonSerdeV2<>(UserPageViews.class)); 93 | 94 | KafkaSystemDescriptor kafkaSystemDescriptor = new KafkaSystemDescriptor(KAFKA_SYSTEM_NAME) 95 | .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT) 96 | .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS) 97 | .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS); 98 | 99 | KafkaInputDescriptor> pageViewInputDescriptor = 100 | kafkaSystemDescriptor.getInputDescriptor(INPUT_STREAM_ID, pageViewKVSerde); 101 | KafkaOutputDescriptor> userPageViewsOutputDescriptor = 102 | kafkaSystemDescriptor.getOutputDescriptor(OUTPUT_STREAM_ID, userPageViewSerde); 103 | 104 | appDescriptor.withDefaultSystem(kafkaSystemDescriptor); 105 | 106 | MessageStream> pageViews = appDescriptor.getInputStream(pageViewInputDescriptor); 107 | OutputStream> userPageViews = appDescriptor.getOutputStream(userPageViewsOutputDescriptor); 108 | 109 | pageViews 110 | .partitionBy(kv -> kv.value.userId, kv -> kv.value, pageViewKVSerde, "pageview") 111 | .window(Windows.keyedSessionWindow(kv -> kv.value.userId, 112 | Duration.ofSeconds(10), stringSerde, 
pageViewKVSerde), "usersession") 113 | .map(windowPane -> { 114 | String userId = windowPane.getKey().getKey(); 115 | int views = windowPane.getMessage().size(); 116 | return KV.of(userId, new UserPageViews(userId, views)); 117 | }) 118 | .sendTo(userPageViews); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/TumblingWindowExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package samza.examples.cookbook; 20 | 21 | import java.io.Serializable; 22 | import org.apache.samza.application.StreamApplication; 23 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 24 | import org.apache.samza.operators.KV; 25 | import org.apache.samza.operators.MessageStream; 26 | import org.apache.samza.operators.OutputStream; 27 | import org.apache.samza.operators.windows.Windows; 28 | import org.apache.samza.serializers.IntegerSerde; 29 | import org.apache.samza.serializers.JsonSerdeV2; 30 | import org.apache.samza.serializers.KVSerde; 31 | import org.apache.samza.serializers.StringSerde; 32 | import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor; 33 | import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor; 34 | import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor; 35 | 36 | import com.google.common.collect.ImmutableList; 37 | import com.google.common.collect.ImmutableMap; 38 | import samza.examples.cookbook.data.PageView; 39 | import samza.examples.cookbook.data.UserPageViews; 40 | 41 | import java.time.Duration; 42 | import java.util.List; 43 | import java.util.Map; 44 | 45 | /** 46 | * In this example, we group a stream of page views by country, and compute the number of page views over a tumbling time 47 | * window. 48 | * 49 | *

Concepts covered: Performing Group-By style aggregations on tumbling time windows. 50 | * 51 | *

Tumbling windows divide a stream into a set of contiguous, fixed-sized, non-overlapping time intervals. 52 | * 53 | * To run the below example: 54 | * 55 | *

    56 | *
  1. 57 | * Ensure that the topic "pageview-tumbling-input" is created
    58 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic pageview-tumbling-input --partitions 2 --replication-factor 1 59 | *
  2. 60 | *
  3. 61 | * Run the application using the run-app.sh script
    62 | * ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/tumbling-window-example.properties 63 | *
  4. 64 | *
  5. 65 | * Produce some messages to the "pageview-tumbling-input" topic, waiting for some time between messages
    66 | ./deploy/kafka/bin/kafka-console-producer.sh --topic pageview-tumbling-input --broker-list localhost:9092
    67 | * {"userId": "user1", "country": "india", "pageId":"google.com/home"}
    68 | * {"userId": "user1", "country": "india", "pageId":"google.com/search"}
    69 | * {"userId": "user2", "country": "china", "pageId":"yahoo.com/home"}
    70 | * {"userId": "user2", "country": "china", "pageId":"yahoo.com/sports"}
    71 | * {"userId": "user1", "country": "india", "pageId":"google.com/news"}
    72 | * {"userId": "user2", "country": "china", "pageId":"yahoo.com/fashion"} 73 | *
  6. 74 | *
  7. 75 | * Consume messages from the "pageview-tumbling-output" topic (e.g. bin/kafka-console-consumer.sh) 76 | * ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic pageview-tumbling-output --property print.key=true
    77 | *
  8. 78 | *
79 | * 80 | */ 81 | public class TumblingWindowExample implements StreamApplication, Serializable { 82 | private static final String KAFKA_SYSTEM_NAME = "kafka"; 83 | private static final List KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181"); 84 | private static final List KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092"); 85 | private static final Map KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1"); 86 | 87 | private static final String INPUT_STREAM_ID = "pageview-tumbling-input"; 88 | private static final String OUTPUT_STREAM_ID = "pageview-tumbling-output"; 89 | 90 | @Override 91 | public void describe(StreamApplicationDescriptor appDescriptor) { 92 | KafkaSystemDescriptor kafkaSystemDescriptor = new KafkaSystemDescriptor(KAFKA_SYSTEM_NAME) 93 | .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT) 94 | .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS) 95 | .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS); 96 | 97 | KVSerde pageViewSerde = KVSerde.of(new StringSerde(), new JsonSerdeV2<>(PageView.class)); 98 | KVSerde userPageViewSerde = KVSerde.of(new StringSerde(), new JsonSerdeV2<>(UserPageViews.class)); 99 | 100 | KafkaInputDescriptor> pageViewInputDescriptor = 101 | kafkaSystemDescriptor.getInputDescriptor(INPUT_STREAM_ID, pageViewSerde); 102 | KafkaOutputDescriptor> userPageViewOutputDescriptor = 103 | kafkaSystemDescriptor.getOutputDescriptor(OUTPUT_STREAM_ID, userPageViewSerde); 104 | 105 | appDescriptor.withDefaultSystem(kafkaSystemDescriptor); 106 | MessageStream> pageViews = appDescriptor.getInputStream(pageViewInputDescriptor); 107 | OutputStream> outputStream = appDescriptor.getOutputStream(userPageViewOutputDescriptor); 108 | 109 | pageViews 110 | .partitionBy(kv -> kv.value.userId, kv -> kv.value, pageViewSerde, "userId") 111 | .window(Windows.keyedTumblingWindow( 112 | kv -> kv.key, Duration.ofSeconds(5), () -> 0, (m, prevCount) -> prevCount + 1, 113 | new StringSerde(), 
new IntegerSerde()), "count") 114 | .map(windowPane -> { 115 | String userId = windowPane.getKey().getKey(); 116 | int views = windowPane.getMessage(); 117 | return KV.of(userId, new UserPageViews(userId, views)); 118 | }) 119 | .sendTo(outputStream); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/azure/AzureBlobApplication.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package samza.examples.azure; 21 | 22 | import com.google.common.collect.ImmutableList; 23 | import com.google.common.collect.ImmutableMap; 24 | import java.util.List; 25 | import java.util.Map; 26 | import org.apache.samza.application.StreamApplication; 27 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 28 | import org.apache.samza.operators.MessageStream; 29 | import org.apache.samza.operators.OutputStream; 30 | import org.apache.samza.serializers.JsonSerdeV2; 31 | import org.apache.samza.serializers.NoOpSerde; 32 | import org.apache.samza.system.descriptors.GenericOutputDescriptor; 33 | import org.apache.samza.system.descriptors.GenericSystemDescriptor; 34 | import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor; 35 | import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor; 36 | import org.slf4j.Logger; 37 | import org.slf4j.LoggerFactory; 38 | import samza.examples.azure.data.PageViewAvroRecord; 39 | import samza.examples.cookbook.data.PageView; 40 | import samza.examples.wikipedia.application.WikipediaApplication; 41 | 42 | 43 | /** 44 | * In this example, we demonstrate sending blobs to Azure Blob Storage. 45 | * This Samza job reads from Kafka topic "page-view-azure-blob-input" and produces blobs to Azure-Container "azure-blob-container" in your Azure Storage account. 46 | * 47 | * Currently, Samza supports sending Avro files are blobs. 48 | * Hence the incoming messages into the Samza job have to be converted to an Avro record. 49 | * For this job, we use input message as {@link samza.examples.cookbook.data.PageView} and 50 | * covert it to an Avro record defined as {@link samza.examples.azure.data.PageViewAvroRecord}. 51 | * 52 | * To run the below example: 53 | * 54 | *
    55 | *
  1. 56 | * Replace your-azure-storage-account-name and your-azure-storage-account-key with details of your Azure Storage Account. 57 | *
  2. 58 | *
  3. 59 | * Ensure that the topic "page-view-azure-blob-input" is created
    60 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic page-view-azure-blob-input --partitions 1 --replication-factor 1 61 | *
  4. 62 | *
  5. 63 | * Run the application using the run-app.sh script
    64 | * ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/azure-blob-application.properties 65 | *
  6. 66 | *
  7. 67 | * Produce some messages to the "page-view-azure-blob-input" topic
    68 | * ./deploy/kafka/bin/kafka-console-producer.sh --topic page-view-azure-blob-input --broker-list localhost:9092
    69 | * {"userId": "user1", "country": "india", "pageId":"google.com"}
    70 | * {"userId": "user2", "country": "france", "pageId":"facebook.com"}
    71 | * {"userId": "user3", "country": "china", "pageId":"yahoo.com"}
    72 | * {"userId": "user4", "country": "italy", "pageId":"linkedin.com"}
    73 | * {"userId": "user5", "country": "germany", "pageId":"amazon.com"}
    74 | * {"userId": "user6", "country": "denmark", "pageId":"apple.com"}
    75 | *
  8. 76 | *
  9. 77 | * Seeing Output: 78 | *
      79 | *
    1. 80 | * See blobs in your Azure portal at https://.blob.core.windows.net/azure-blob-container/PageViewEventStream/.avro 81 | *
    2. 82 | *
    3. 83 | * system-name "azure-blob-container" in configs and code below maps to Azure-Container in Azure Storage account. 84 | *
    4. 85 | *
    5. 86 | * is of the format yyyy/MM/dd/HH/mm-ss-randomString.avro. Hence navigate through the virtual folders on the portal to see your blobs. 87 | *
    6. 88 | *
    7. 89 | * Due to network calls, allow a few minutes for blobs to appear on the portal. 90 | *
    8. 91 | *
    9. 92 | * Config "maxMessagesPerBlob=2" ensures that a blob is created per 2 input messages. Adjust input or config accordingly. 93 | *
    10. 94 | *
    95 | *
  10. 96 | *
97 | */ 98 | public class AzureBlobApplication implements StreamApplication { 99 | private static final Logger LOG = LoggerFactory.getLogger(AzureBlobApplication.class); 100 | 101 | private static final List KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181"); 102 | private static final List KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092"); 103 | private static final Map KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1"); 104 | private static final String INPUT_PAGEVIEW_STREAM_ID = "page-view-azure-blob-input"; 105 | private static final String OUTPUT_SYSTEM = "azure-blob-container"; 106 | private static final String OUTPUT_STREAM = "PageViewEventStream"; 107 | 108 | @Override 109 | public void describe(StreamApplicationDescriptor appDescriptor) { 110 | // Define a system descriptor for Kafka 111 | KafkaSystemDescriptor kafkaSystemDescriptor = 112 | new KafkaSystemDescriptor("kafka").withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT) 113 | .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS) 114 | .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS); 115 | 116 | KafkaInputDescriptor pageViewInputDescriptor = 117 | kafkaSystemDescriptor.getInputDescriptor(INPUT_PAGEVIEW_STREAM_ID, new JsonSerdeV2<>(PageView.class)); 118 | 119 | // Define a system descriptor for Azure Blob Storage 120 | GenericSystemDescriptor azureBlobSystemDescriptor = 121 | new GenericSystemDescriptor(OUTPUT_SYSTEM, "org.apache.samza.system.azureblob.AzureBlobSystemFactory"); 122 | 123 | GenericOutputDescriptor azureBlobOuputDescriptor = 124 | azureBlobSystemDescriptor.getOutputDescriptor(OUTPUT_STREAM, new NoOpSerde<>()); 125 | 126 | // Set Kafka as the default system for the job 127 | appDescriptor.withDefaultSystem(kafkaSystemDescriptor); 128 | 129 | // Define the input and output streams with descriptors 130 | MessageStream pageViewInput = appDescriptor.getInputStream(pageViewInputDescriptor); 131 | OutputStream 
pageViewAvroRecordOutputStream = appDescriptor.getOutputStream(azureBlobOuputDescriptor); 132 | 133 | // Define the execution flow with the high-level API 134 | pageViewInput 135 | .map((message) -> { 136 | LOG.info("Sending: Received PageViewEvent with pageId: " + message.pageId); 137 | return PageViewAvroRecord.buildPageViewRecord(message); 138 | }) 139 | .sendTo(pageViewAvroRecordOutputStream); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/JoinExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package samza.examples.cookbook; 20 | 21 | import java.io.Serializable; 22 | import org.apache.samza.application.StreamApplication; 23 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 24 | import org.apache.samza.operators.KV; 25 | import org.apache.samza.operators.MessageStream; 26 | import org.apache.samza.operators.OutputStream; 27 | import org.apache.samza.operators.functions.JoinFunction; 28 | import org.apache.samza.serializers.JsonSerdeV2; 29 | import org.apache.samza.serializers.KVSerde; 30 | import org.apache.samza.serializers.StringSerde; 31 | import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor; 32 | import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor; 33 | import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor; 34 | 35 | import com.google.common.collect.ImmutableList; 36 | import com.google.common.collect.ImmutableMap; 37 | import samza.examples.cookbook.data.AdClick; 38 | import samza.examples.cookbook.data.PageView; 39 | 40 | import java.time.Duration; 41 | import java.util.List; 42 | import java.util.Map; 43 | 44 | /** 45 | * In this example, we join a stream of Page views with a stream of Ad clicks. For instance, this is helpful for 46 | * analysis on what pages served an Ad that was clicked. 47 | * 48 | *

Concepts covered: Performing stream to stream Joins. 49 | * 50 | * To run the below example: 51 | * 52 | *

 *   <ol>
 *     <li>
 *       Ensure that the topics "pageview-join-input", "adclick-join-input" are created  <br/>
 *       ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic pageview-join-input --partitions 2 --replication-factor 1  <br/>
 *       ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic adclick-join-input --partitions 2 --replication-factor 1
 *     </li>
 *     <li>
 *       Run the application using the run-app.sh script  <br/>
 *       ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/join-example.properties
 *     </li>
 *     <li>
 *       Produce some messages to the "pageview-join-input" topic  <br/>
 *       ./deploy/kafka/bin/kafka-console-producer.sh --topic pageview-join-input --broker-list localhost:9092  <br/>
 *       {"userId": "user1", "country": "india", "pageId":"google.com"}  <br/>
 *       {"userId": "user2", "country": "china", "pageId":"yahoo.com"}
 *     </li>
 *     <li>
 *       Produce some messages to the "adclick-join-input" topic with the same pageKey  <br/>
 *       ./deploy/kafka/bin/kafka-console-producer.sh --topic adclick-join-input --broker-list localhost:9092  <br/>
 *       {"userId": "user1", "adId": "adClickId1", "pageId":"google.com"}  <br/>
 *       {"userId": "user1", "adId": "adClickId2", "pageId":"yahoo.com"}
 *     </li>
 *     <li>
 *       Consume messages from the "pageview-adclick-join-output" topic  <br/>
 *       ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic pageview-adclick-join-output --property print.key=true
 *     </li>
 *   </ol>
79 | * 80 | */ 81 | public class JoinExample implements StreamApplication, Serializable { 82 | private static final String KAFKA_SYSTEM_NAME = "kafka"; 83 | private static final List KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181"); 84 | private static final List KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092"); 85 | private static final Map KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1"); 86 | 87 | private static final String PAGEVIEW_STREAM_ID = "pageview-join-input"; 88 | private static final String ADCLICK_STREAM_ID = "adclick-join-input"; 89 | private static final String OUTPUT_STREAM_ID = "pageview-adclick-join-output"; 90 | 91 | @Override 92 | public void describe(StreamApplicationDescriptor appDescriptor) { 93 | StringSerde stringSerde = new StringSerde(); 94 | JsonSerdeV2 pageViewSerde = new JsonSerdeV2<>(PageView.class); 95 | JsonSerdeV2 adClickSerde = new JsonSerdeV2<>(AdClick.class); 96 | JsonSerdeV2 joinResultSerde = new JsonSerdeV2<>(JoinResult.class); 97 | 98 | KafkaSystemDescriptor kafkaSystemDescriptor = new KafkaSystemDescriptor(KAFKA_SYSTEM_NAME) 99 | .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT) 100 | .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS) 101 | .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS); 102 | 103 | KafkaInputDescriptor pageViewInputDescriptor = 104 | kafkaSystemDescriptor.getInputDescriptor(PAGEVIEW_STREAM_ID, pageViewSerde); 105 | KafkaInputDescriptor adClickInputDescriptor = 106 | kafkaSystemDescriptor.getInputDescriptor(ADCLICK_STREAM_ID, adClickSerde); 107 | KafkaOutputDescriptor joinResultOutputDescriptor = 108 | kafkaSystemDescriptor.getOutputDescriptor(OUTPUT_STREAM_ID, joinResultSerde); 109 | 110 | appDescriptor.withDefaultSystem(kafkaSystemDescriptor); 111 | 112 | MessageStream pageViews = appDescriptor.getInputStream(pageViewInputDescriptor); 113 | MessageStream adClicks = appDescriptor.getInputStream(adClickInputDescriptor); 114 | 
OutputStream joinResults = appDescriptor.getOutputStream(joinResultOutputDescriptor); 115 | 116 | JoinFunction pageViewAdClickJoinFunction = 117 | new JoinFunction() { 118 | @Override 119 | public JoinResult apply(PageView pageView, AdClick adClick) { 120 | return new JoinResult(pageView.pageId, pageView.userId, pageView.country, adClick.getAdId()); 121 | } 122 | 123 | @Override 124 | public String getFirstKey(PageView pageView) { 125 | return pageView.pageId; 126 | } 127 | 128 | @Override 129 | public String getSecondKey(AdClick adClick) { 130 | return adClick.getPageId(); 131 | } 132 | }; 133 | 134 | MessageStream repartitionedPageViews = 135 | pageViews 136 | .partitionBy(pv -> pv.pageId, pv -> pv, KVSerde.of(stringSerde, pageViewSerde), "pageview") 137 | .map(KV::getValue); 138 | 139 | MessageStream repartitionedAdClicks = 140 | adClicks 141 | .partitionBy(AdClick::getPageId, ac -> ac, KVSerde.of(stringSerde, adClickSerde), "adclick") 142 | .map(KV::getValue); 143 | 144 | repartitionedPageViews 145 | .join(repartitionedAdClicks, pageViewAdClickJoinFunction, 146 | stringSerde, pageViewSerde, adClickSerde, Duration.ofMinutes(3), "join") 147 | .sendTo(joinResults); 148 | } 149 | 150 | static class JoinResult { 151 | public String pageId; 152 | public String userId; 153 | public String country; 154 | public String adId; 155 | 156 | public JoinResult(String pageId, String userId, String country, String adId) { 157 | this.pageId = pageId; 158 | this.userId = userId; 159 | this.country = country; 160 | this.adId = adId; 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/StreamTableJoinExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package samza.examples.cookbook; 20 | 21 | import java.util.Objects; 22 | import org.apache.samza.application.StreamApplication; 23 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 24 | import org.apache.samza.operators.KV; 25 | import org.apache.samza.operators.MessageStream; 26 | import org.apache.samza.operators.OutputStream; 27 | import org.apache.samza.operators.functions.StreamTableJoinFunction; 28 | import org.apache.samza.serializers.JsonSerdeV2; 29 | import org.apache.samza.serializers.KVSerde; 30 | import org.apache.samza.serializers.Serde; 31 | import org.apache.samza.serializers.StringSerde; 32 | import org.apache.samza.storage.kv.descriptors.RocksDbTableDescriptor; 33 | import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor; 34 | import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor; 35 | import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor; 36 | import org.apache.samza.table.Table; 37 | 38 | import com.google.common.collect.ImmutableList; 39 | import com.google.common.collect.ImmutableMap; 40 | import samza.examples.cookbook.data.PageView; 41 | import samza.examples.cookbook.data.Profile; 42 | 43 | import java.util.List; 44 | import 
java.util.Map; 45 | 46 | /** 47 | * In this example, we join a stream of Page views with a table of user profiles, which is populated from an 48 | * user profile stream. For instance, this is helpful for analysis that required additional information from 49 | * user's profile. 50 | * 51 | *

Concepts covered: Performing stream-to-table joins. 52 | * 53 | * To run the below example: 54 | * 55 | *

    56 | *
  1. 57 | * Ensure that the topics "pageview-join-input", "profile-table-input" are created
    58 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic pageview-join-input --partitions 2 --replication-factor 1 59 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic profile-table-input --partitions 2 --replication-factor 1 60 | *
  2. 61 | *
  3. 62 | * Run the application using the run-app.sh script
    63 | * ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/stream-table-join-example.properties 64 | *
  4. 65 | *
  5. 66 | * Consume messages from the "enriched-pageview-join-output" topic
    67 | * ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic enriched-pageview-join-output 68 | *
  6. 69 | *
  7. 70 | * Produce some messages to the "profile-table-input" topic with the same userId
    71 | * ./deploy/kafka/bin/kafka-console-producer.sh --topic profile-table-input --broker-list localhost:9092
    72 | * {"userId": "user1", "company": "LNKD"}
    73 | * {"userId": "user2", "company": "MSFT"} 74 | *
  8. 75 | *
  9. 76 | * Produce some messages to the "pageview-join-input" topic
    77 | * ./deploy/kafka/bin/kafka-console-producer.sh --topic pageview-join-input --broker-list localhost:9092
    78 | * {"userId": "user1", "country": "india", "pageId":"google.com"}
    79 | * {"userId": "user2", "country": "china", "pageId":"yahoo.com"} 80 | *
  10. 81 | *
82 | * 83 | */ 84 | public class StreamTableJoinExample implements StreamApplication { 85 | private static final String KAFKA_SYSTEM_NAME = "kafka"; 86 | private static final List KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181"); 87 | private static final List KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092"); 88 | private static final Map KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1"); 89 | 90 | private static final String PROFILE_STREAM_ID = "profile-table-input"; 91 | private static final String PAGEVIEW_STREAM_ID = "pageview-join-input"; 92 | private static final String OUTPUT_TOPIC = "enriched-pageview-join-output"; 93 | 94 | @Override 95 | public void describe(StreamApplicationDescriptor appDescriptor) { 96 | Serde profileSerde = new JsonSerdeV2<>(Profile.class); 97 | Serde pageViewSerde = new JsonSerdeV2<>(PageView.class); 98 | Serde joinResultSerde = new JsonSerdeV2<>(EnrichedPageView.class); 99 | 100 | KafkaSystemDescriptor kafkaSystemDescriptor = new KafkaSystemDescriptor(KAFKA_SYSTEM_NAME) 101 | .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT) 102 | .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS) 103 | .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS); 104 | 105 | KafkaInputDescriptor profileInputDescriptor = 106 | kafkaSystemDescriptor.getInputDescriptor(PROFILE_STREAM_ID, profileSerde); 107 | KafkaInputDescriptor pageViewInputDescriptor = 108 | kafkaSystemDescriptor.getInputDescriptor(PAGEVIEW_STREAM_ID, pageViewSerde); 109 | KafkaOutputDescriptor joinResultOutputDescriptor = 110 | kafkaSystemDescriptor.getOutputDescriptor(OUTPUT_TOPIC, joinResultSerde); 111 | 112 | RocksDbTableDescriptor profileTableDescriptor = 113 | new RocksDbTableDescriptor("profile-table", KVSerde.of(new StringSerde(), profileSerde)); 114 | 115 | appDescriptor.withDefaultSystem(kafkaSystemDescriptor); 116 | 117 | MessageStream profileStream = appDescriptor.getInputStream(profileInputDescriptor); 
118 | MessageStream pageViewStream = appDescriptor.getInputStream(pageViewInputDescriptor); 119 | OutputStream joinResultStream = appDescriptor.getOutputStream(joinResultOutputDescriptor); 120 | Table> profileTable = appDescriptor.getTable(profileTableDescriptor); 121 | 122 | profileStream 123 | .map(profile -> KV.of(profile.userId, profile)) 124 | .sendTo(profileTable); 125 | 126 | pageViewStream 127 | .partitionBy(pv -> pv.userId, pv -> pv, KVSerde.of(new StringSerde(), pageViewSerde), "join") 128 | .join(profileTable, new JoinFn()) 129 | .sendTo(joinResultStream); 130 | } 131 | 132 | private static class JoinFn implements StreamTableJoinFunction, KV, EnrichedPageView> { 133 | @Override 134 | public EnrichedPageView apply(KV message, KV record) { 135 | return record == null ? null : 136 | new EnrichedPageView(message.getKey(), record.getValue().company, message.getValue().pageId); 137 | } 138 | @Override 139 | public String getMessageKey(KV message) { 140 | return message.getKey(); 141 | } 142 | @Override 143 | public String getRecordKey(KV record) { 144 | return record.getKey(); 145 | } 146 | } 147 | 148 | static public class EnrichedPageView { 149 | 150 | public final String userId; 151 | public final String company; 152 | public final String pageId; 153 | 154 | public EnrichedPageView(String userId, String company, String pageId) { 155 | this.userId = userId; 156 | this.company = company; 157 | this.pageId = pageId; 158 | } 159 | 160 | @Override 161 | public boolean equals(Object o) { 162 | if (this == o) { 163 | return true; 164 | } 165 | if (o == null || getClass() != o.getClass()) { 166 | return false; 167 | } 168 | EnrichedPageView that = (EnrichedPageView) o; 169 | return Objects.equals(userId, that.userId) && Objects.equals(company, that.company) && Objects.equals(pageId, 170 | that.pageId); 171 | } 172 | } 173 | 174 | } 175 | -------------------------------------------------------------------------------- 
/src/test/java/samza/examples/cookbook/test/TestSamzaCookBookExamples.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package samza.examples.cookbook.test; 20 | 21 | import java.time.Duration; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | import org.apache.samza.operators.KV; 25 | import org.apache.samza.serializers.NoOpSerde; 26 | import org.apache.samza.test.framework.StreamAssert; 27 | import org.apache.samza.test.framework.TestRunner; 28 | import org.apache.samza.test.framework.system.descriptors.InMemoryInputDescriptor; 29 | import org.apache.samza.test.framework.system.descriptors.InMemoryOutputDescriptor; 30 | import org.apache.samza.test.framework.system.descriptors.InMemorySystemDescriptor; 31 | import org.junit.Assert; 32 | import org.junit.Test; 33 | import samza.examples.cookbook.FilterExample; 34 | import samza.examples.cookbook.JoinExample; 35 | import samza.examples.cookbook.SessionWindowExample; 36 | import samza.examples.cookbook.StreamTableJoinExample; 37 | import samza.examples.cookbook.TumblingWindowExample; 38 | import samza.examples.cookbook.data.AdClick; 39 | import samza.examples.cookbook.data.PageView; 40 | import samza.examples.cookbook.data.Profile; 41 | import samza.examples.cookbook.data.UserPageViews; 42 | import samza.examples.test.utils.TestUtils; 43 | 44 | import static samza.examples.cookbook.StreamTableJoinExample.EnrichedPageView; 45 | 46 | 47 | public class TestSamzaCookBookExamples { 48 | @Test 49 | public void testFilterExample() { 50 | List rawPageViewEvents = new ArrayList<>(); 51 | rawPageViewEvents.add(new PageView("google.com", "user1", "india")); 52 | rawPageViewEvents.add(new PageView("facebook.com", "invalidUserId", "france")); 53 | rawPageViewEvents.add(new PageView("yahoo.com", "user2", "china")); 54 | 55 | InMemorySystemDescriptor inMemorySystem = new InMemorySystemDescriptor("kafka"); 56 | 57 | InMemoryInputDescriptor badPageViewEvents = 58 | inMemorySystem.getInputDescriptor("pageview-filter-input", new NoOpSerde()); 59 | 60 | InMemoryOutputDescriptor goodPageViewEvents = 61 | 
inMemorySystem.getOutputDescriptor("pageview-filter-output", new NoOpSerde()); 62 | 63 | TestRunner 64 | .of(new FilterExample()) 65 | .addInputStream(badPageViewEvents, rawPageViewEvents) 66 | .addOutputStream(goodPageViewEvents, 1) 67 | .run(Duration.ofMillis(1500)); 68 | 69 | Assert.assertEquals(TestRunner.consumeStream(goodPageViewEvents, Duration.ofMillis(1000)).get(0).size(), 2); 70 | } 71 | 72 | @Test 73 | public void testJoinExample() { 74 | List pageViewEvents = new ArrayList<>(); 75 | pageViewEvents.add(new PageView("google.com", "user1", "india")); 76 | pageViewEvents.add(new PageView("yahoo.com", "user2", "china")); 77 | List adClickEvents = new ArrayList<>(); 78 | adClickEvents.add(new AdClick("google.com", "adClickId1", "user1")); 79 | adClickEvents.add(new AdClick("yahoo.com", "adClickId2", "user1")); 80 | 81 | InMemorySystemDescriptor inMemorySystem = new InMemorySystemDescriptor("kafka"); 82 | 83 | InMemoryInputDescriptor pageViews = 84 | inMemorySystem.getInputDescriptor("pageview-join-input", new NoOpSerde()); 85 | 86 | InMemoryInputDescriptor adClicks = 87 | inMemorySystem.getInputDescriptor("adclick-join-input", new NoOpSerde()); 88 | 89 | InMemoryOutputDescriptor pageViewAdClickJoin = 90 | inMemorySystem.getOutputDescriptor("pageview-adclick-join-output", new NoOpSerde<>()); 91 | 92 | TestRunner 93 | .of(new JoinExample()) 94 | .addInputStream(pageViews, pageViewEvents) 95 | .addInputStream(adClicks, adClickEvents) 96 | .addOutputStream(pageViewAdClickJoin, 1) 97 | .run(Duration.ofMillis(1500)); 98 | 99 | Assert.assertEquals(TestRunner.consumeStream(pageViewAdClickJoin, Duration.ofMillis(1000)).get(0).size(), 2); 100 | } 101 | 102 | @Test 103 | public void testTumblingWindowExample() { 104 | List pageViewEvents = TestUtils.genSamplePageViewData(); 105 | 106 | InMemorySystemDescriptor inMemorySystem = new InMemorySystemDescriptor("kafka"); 107 | 108 | InMemoryInputDescriptor> pageViewInputDescriptor = 109 | 
inMemorySystem.getInputDescriptor("pageview-tumbling-input", new NoOpSerde>()); 110 | 111 | InMemoryOutputDescriptor> userPageViewOutputDescriptor = 112 | inMemorySystem.getOutputDescriptor("pageview-tumbling-output", new NoOpSerde>()); 113 | 114 | TestRunner 115 | .of(new TumblingWindowExample()) 116 | .addInputStream(pageViewInputDescriptor, pageViewEvents) 117 | .addOutputStream(userPageViewOutputDescriptor, 1) 118 | .run(Duration.ofMinutes(1)); 119 | 120 | Assert.assertTrue(TestRunner.consumeStream(userPageViewOutputDescriptor, Duration.ofMillis(1000)).get(0).size() > 1); 121 | } 122 | 123 | @Test 124 | public void testSessionWindowExample() { 125 | List pageViewEvents = TestUtils.genSamplePageViewData(); 126 | 127 | InMemorySystemDescriptor inMemorySystem = new InMemorySystemDescriptor("kafka"); 128 | 129 | InMemoryInputDescriptor> pageViewInputDescriptor = 130 | inMemorySystem.getInputDescriptor("pageview-session-input", new NoOpSerde>()); 131 | 132 | InMemoryOutputDescriptor> userPageViewOutputDescriptor = 133 | inMemorySystem.getOutputDescriptor("pageview-session-output", new NoOpSerde>()); 134 | 135 | TestRunner 136 | .of(new SessionWindowExample()) 137 | .addInputStream(pageViewInputDescriptor, pageViewEvents) 138 | .addOutputStream(userPageViewOutputDescriptor, 1) 139 | .run(Duration.ofMinutes(1)); 140 | 141 | Assert.assertEquals(2, TestRunner.consumeStream(userPageViewOutputDescriptor, Duration.ofMillis(1000)).get(0).size()); 142 | } 143 | 144 | @Test 145 | public void testStreamTableJoinExample() throws InterruptedException{ 146 | List pageViewEvents = new ArrayList<>(); 147 | pageViewEvents.add(new PageView("google.com", "user1", "india")); 148 | pageViewEvents.add(new PageView("yahoo.com", "user2", "china")); 149 | List profiles = new ArrayList<>(); 150 | profiles.add(new Profile("user1", "LNKD")); 151 | profiles.add(new Profile("user2", "MSFT")); 152 | 153 | InMemorySystemDescriptor inMemorySystem = new InMemorySystemDescriptor("kafka"); 154 | 155 | 
InMemoryInputDescriptor pageViews = 156 | inMemorySystem.getInputDescriptor("pageview-join-input", new NoOpSerde()); 157 | 158 | InMemoryInputDescriptor profileViews = 159 | inMemorySystem.getInputDescriptor("profile-table-input", new NoOpSerde()); 160 | 161 | InMemoryOutputDescriptor joinResultOutputDescriptor = 162 | inMemorySystem.getOutputDescriptor("enriched-pageview-join-output", new NoOpSerde()); 163 | 164 | TestRunner 165 | .of(new StreamTableJoinExample()) 166 | .addInputStream(pageViews, pageViewEvents) 167 | .addInputStream(profileViews, profiles) 168 | .addOutputStream(joinResultOutputDescriptor, 1) 169 | .run(Duration.ofMillis(1500)); 170 | 171 | List expectedOutput = new ArrayList<>(); 172 | expectedOutput.add(new EnrichedPageView("user1", "LNKD", "google.com")); 173 | expectedOutput.add(new EnrichedPageView("user2", "MSFT", "yahoo.com")); 174 | 175 | StreamAssert.containsInAnyOrder(expectedOutput, joinResultOutputDescriptor, Duration.ofMillis(200)); 176 | 177 | } 178 | 179 | } 180 | -------------------------------------------------------------------------------- /bin/grid: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. 
See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | # This script will download, setup, start, and stop servers for Kafka, YARN, and ZooKeeper, 20 | # as well as downloading, building and locally publishing Samza 21 | 22 | if [ -z "$JAVA_HOME" ]; then 23 | if [ -x /usr/libexec/java_home ]; then 24 | export JAVA_HOME="$(/usr/libexec/java_home)" 25 | else 26 | echo "JAVA_HOME not set. Exiting." 27 | exit 1 28 | fi 29 | fi 30 | 31 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 32 | BASE_DIR=$(dirname $DIR) 33 | DEPLOY_ROOT_DIR=$BASE_DIR/deploy 34 | DOWNLOAD_CACHE_DIR=$HOME/.samza/download 35 | COMMAND=$1 36 | SYSTEM=$2 37 | 38 | DOWNLOAD_KAFKA=https://archive.apache.org/dist/kafka/2.1.1/kafka_2.11-2.1.1.tgz 39 | DOWNLOAD_YARN=https://archive.apache.org/dist/hadoop/common/hadoop-2.9.2/hadoop-2.9.2.tar.gz 40 | DOWNLOAD_ZOOKEEPER=https://archive.apache.org/dist/zookeeper/zookeeper-3.4.14/zookeeper-3.4.14.tar.gz 41 | 42 | SERVICE_WAIT_TIMEOUT_SEC=20 43 | ZOOKEEPER_PORT=2181 44 | RESOURCEMANAGER_PORT=8032 45 | NODEMANAGER_PORT=8042 46 | KAFKA_PORT=9092 47 | 48 | bootstrap() { 49 | echo "Bootstrapping the system..." 50 | stop_all 51 | rm -rf "$DEPLOY_ROOT_DIR" 52 | mkdir "$DEPLOY_ROOT_DIR" 53 | install_all 54 | start_all 55 | exit 0 56 | } 57 | 58 | standalone() { 59 | echo "Setting up the system..." 60 | stop_all 61 | rm -rf "$DEPLOY_ROOT_DIR" 62 | mkdir "$DEPLOY_ROOT_DIR" 63 | install_all_without_yarn 64 | start_all_without_yarn 65 | exit 0 66 | } 67 | 68 | install_all() { 69 | $DIR/grid install samza 70 | $DIR/grid install zookeeper 71 | $DIR/grid install yarn 72 | $DIR/grid install kafka 73 | } 74 | 75 | install_all_without_yarn() { 76 | $DIR/grid install samza 77 | $DIR/grid install zookeeper 78 | $DIR/grid install kafka 79 | } 80 | 81 | install_samza() { 82 | echo "Building samza from master..." 
83 | mkdir -p "$DEPLOY_ROOT_DIR" 84 | if [ -d "$DOWNLOAD_CACHE_DIR/samza/.git" ]; then 85 | pushd "$DOWNLOAD_CACHE_DIR/samza" 86 | git fetch origin 87 | git reset --hard origin/master 88 | else 89 | mkdir -p $DOWNLOAD_CACHE_DIR 90 | pushd $DOWNLOAD_CACHE_DIR 91 | git clone https://gitbox.apache.org/repos/asf/samza.git 92 | cd samza 93 | fi 94 | ./gradlew -PscalaSuffix=2.11 clean publishToMavenLocal 95 | popd 96 | } 97 | 98 | install_zookeeper() { 99 | mkdir -p "$DEPLOY_ROOT_DIR" 100 | install zookeeper $DOWNLOAD_ZOOKEEPER zookeeper-3.4.14 101 | cp "$DEPLOY_ROOT_DIR/zookeeper/conf/zoo_sample.cfg" "$DEPLOY_ROOT_DIR/zookeeper/conf/zoo.cfg" 102 | } 103 | 104 | install_yarn() { 105 | mkdir -p "$DEPLOY_ROOT_DIR" 106 | install yarn $DOWNLOAD_YARN hadoop-2.9.2 107 | cp "$BASE_DIR/conf/yarn-site.xml" "$DEPLOY_ROOT_DIR/yarn/etc/hadoop/yarn-site.xml" 108 | if [ ! -f "$HOME/.samza/conf/yarn-site.xml" ]; then 109 | mkdir -p "$HOME/.samza/conf" 110 | cp "$BASE_DIR/conf/yarn-site.xml" "$HOME/.samza/conf/yarn-site.xml" 111 | fi 112 | } 113 | 114 | install_kafka() { 115 | mkdir -p "$DEPLOY_ROOT_DIR" 116 | install kafka $DOWNLOAD_KAFKA kafka_2.11-2.1.1 117 | # have to use SIGTERM since nohup on appears to ignore SIGINT 118 | # and Kafka switched to SIGINT in KAFKA-1031. 119 | sed -i.bak 's/SIGINT/SIGTERM/g' $DEPLOY_ROOT_DIR/kafka/bin/kafka-server-stop.sh 120 | # in order to simplify the wikipedia-stats example job, set topic to have just 1 partition by default 121 | sed -i.bak 's/^num\.partitions *=.*/num.partitions=1/' $DEPLOY_ROOT_DIR/kafka/config/server.properties 122 | } 123 | 124 | install() { 125 | DESTINATION_DIR="$DEPLOY_ROOT_DIR/$1" 126 | DOWNLOAD_URL=$2 127 | PACKAGE_DIR="$DOWNLOAD_CACHE_DIR/$3" 128 | PACKAGE_FILE="$DOWNLOAD_CACHE_DIR/$(basename $DOWNLOAD_URL)" 129 | if [ -f "$PACKAGE_FILE" ]; then 130 | echo "Using previously downloaded file $PACKAGE_FILE" 131 | else 132 | echo "Downloading $(basename $DOWNLOAD_URL)..." 
133 | mkdir -p $DOWNLOAD_CACHE_DIR 134 | curl "$DOWNLOAD_URL" > "${PACKAGE_FILE}.tmp" 135 | mv "${PACKAGE_FILE}.tmp" "$PACKAGE_FILE" 136 | fi 137 | rm -rf "$DESTINATION_DIR" "$PACKAGE_DIR" 138 | tar -xf "$PACKAGE_FILE" -C $DOWNLOAD_CACHE_DIR 139 | mv "$PACKAGE_DIR" "$DESTINATION_DIR" 140 | } 141 | 142 | start_all() { 143 | $DIR/grid start zookeeper 144 | $DIR/grid start yarn 145 | $DIR/grid start kafka 146 | } 147 | 148 | start_all_without_yarn() { 149 | $DIR/grid start zookeeper 150 | $DIR/grid start kafka 151 | } 152 | 153 | start_zookeeper() { 154 | if [ -f $DEPLOY_ROOT_DIR/$SYSTEM/bin/zkServer.sh ]; then 155 | cd $DEPLOY_ROOT_DIR/$SYSTEM 156 | bin/zkServer.sh start 157 | wait_for_service "zookeeper" $ZOOKEEPER_PORT 158 | cd - > /dev/null 159 | else 160 | echo 'Zookeeper is not installed. Run: bin/grid install zookeeper' 161 | fi 162 | } 163 | 164 | start_yarn() { 165 | if [ -f $DEPLOY_ROOT_DIR/$SYSTEM/sbin/yarn-daemon.sh ]; then 166 | $DEPLOY_ROOT_DIR/$SYSTEM/sbin/yarn-daemon.sh start resourcemanager 167 | wait_for_service "resourcemanager" $RESOURCEMANAGER_PORT 168 | $DEPLOY_ROOT_DIR/$SYSTEM/sbin/yarn-daemon.sh start nodemanager 169 | wait_for_service "nodemanager" $NODEMANAGER_PORT 170 | else 171 | echo 'YARN is not installed. Run: bin/grid install yarn' 172 | fi 173 | } 174 | 175 | start_kafka() { 176 | if [ -f $DEPLOY_ROOT_DIR/$SYSTEM/bin/kafka-server-start.sh ]; then 177 | mkdir -p $DEPLOY_ROOT_DIR/$SYSTEM/logs 178 | cd $DEPLOY_ROOT_DIR/$SYSTEM 179 | nohup bin/kafka-server-start.sh config/server.properties > logs/kafka.log 2>&1 & 180 | cd - > /dev/null 181 | wait_for_service "kafka" $KAFKA_PORT 182 | else 183 | echo 'Kafka is not installed. Run: bin/grid install kafka' 184 | fi 185 | } 186 | 187 | wait_for_service() { 188 | local SERVICE_NAME=$1 189 | local PORT=$2 190 | echo "Waiting for $SERVICE_NAME to start..." 191 | local CURRENT_WAIT_TIME=0 192 | 193 | while [[ $(echo | nc -w1 localhost $PORT >/dev/null 2>&1 ;echo $?) -ne 0 ]]; do 194 | printf '.' 
195 | sleep 1 196 | if [ $((++CURRENT_WAIT_TIME)) -eq $SERVICE_WAIT_TIMEOUT_SEC ]; then 197 | printf "\nError: timed out while waiting for $SERVICE_NAME to start.\n" 198 | exit 1 199 | fi 200 | done 201 | printf '\n' 202 | echo "$SERVICE_NAME has started"; 203 | } 204 | 205 | stop_all() { 206 | $DIR/grid stop kafka 207 | $DIR/grid stop yarn 208 | $DIR/grid stop zookeeper 209 | } 210 | 211 | stop_zookeeper() { 212 | if [ -f $DEPLOY_ROOT_DIR/$SYSTEM/bin/zkServer.sh ]; then 213 | cd $DEPLOY_ROOT_DIR/$SYSTEM 214 | bin/zkServer.sh stop 215 | cd - > /dev/null 216 | else 217 | echo 'Zookeeper is not installed. Run: bin/grid install zookeeper' 218 | fi 219 | } 220 | 221 | stop_yarn() { 222 | if [ -f $DEPLOY_ROOT_DIR/$SYSTEM/sbin/yarn-daemon.sh ]; then 223 | $DEPLOY_ROOT_DIR/$SYSTEM/sbin/yarn-daemon.sh stop resourcemanager 224 | $DEPLOY_ROOT_DIR/$SYSTEM/sbin/yarn-daemon.sh stop nodemanager 225 | else 226 | echo 'YARN is not installed. Run: bin/grid install yarn' 227 | fi 228 | } 229 | 230 | stop_kafka() { 231 | if [ -f $DEPLOY_ROOT_DIR/$SYSTEM/bin/kafka-server-stop.sh ]; then 232 | cd $DEPLOY_ROOT_DIR/$SYSTEM 233 | bin/kafka-server-stop.sh || true # tolerate nonzero exit status if Kafka isn't running 234 | cd - > /dev/null 235 | else 236 | echo 'Kafka is not installed. Run: bin/grid install kafka' 237 | fi 238 | } 239 | 240 | # Check arguments 241 | if [ "$COMMAND" == "bootstrap" ] && test -z "$SYSTEM"; then 242 | bootstrap 243 | exit 0 244 | elif [ "$COMMAND" == "standalone" ] && test -z "$SYSTEM"; then 245 | standalone 246 | exit 0 247 | elif (test -z "$COMMAND" && test -z "$SYSTEM") \ 248 | || ( [ "$COMMAND" == "help" ] || test -z "$COMMAND" || test -z "$SYSTEM"); then 249 | echo 250 | echo " Usage.." 
251 | echo 252 | echo " $ grid" 253 | echo " $ grid bootstrap" 254 | echo " $ grid standalone" 255 | echo " $ grid install [yarn|kafka|zookeeper|samza|all]" 256 | echo " $ grid start [yarn|kafka|zookeeper|all]" 257 | echo " $ grid stop [yarn|kafka|zookeeper|all]" 258 | echo 259 | exit 1 260 | else 261 | echo "EXECUTING: $COMMAND $SYSTEM" 262 | 263 | "$COMMAND"_"$SYSTEM" 264 | fi 265 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/RemoteTableJoinExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package samza.examples.cookbook; 20 | 21 | import com.google.common.collect.ImmutableList; 22 | import com.google.common.collect.ImmutableMap; 23 | 24 | import java.io.Serializable; 25 | import java.net.URL; 26 | import java.time.Duration; 27 | import java.util.List; 28 | import java.util.Map; 29 | import java.util.concurrent.CompletableFuture; 30 | import org.apache.samza.SamzaException; 31 | import org.apache.samza.application.StreamApplication; 32 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 33 | import org.apache.samza.operators.KV; 34 | import org.apache.samza.operators.MessageStream; 35 | import org.apache.samza.operators.OutputStream; 36 | import org.apache.samza.operators.functions.StreamTableJoinFunction; 37 | import org.apache.samza.serializers.JsonSerdeV2; 38 | import org.apache.samza.serializers.StringSerde; 39 | import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor; 40 | import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor; 41 | import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor; 42 | import org.apache.samza.table.Table; 43 | import org.apache.samza.table.descriptors.CachingTableDescriptor; 44 | import org.apache.samza.table.remote.BaseTableFunction; 45 | import org.apache.samza.table.remote.TableReadFunction; 46 | import org.apache.samza.table.descriptors.RemoteTableDescriptor; 47 | import org.apache.samza.util.ExponentialSleepStrategy; 48 | import org.apache.samza.util.HttpUtil; 49 | import org.codehaus.jackson.JsonFactory; 50 | import org.codehaus.jackson.JsonParser; 51 | import org.codehaus.jackson.JsonToken; 52 | import org.codehaus.jackson.annotate.JsonProperty; 53 | 54 | /** 55 | * In this example, we join a stream of stock symbols with a remote table backed by a RESTful service, 56 | * which delivers latest stock quotes. The join results contain stock symbol and latest price, and are 57 | * delivered to an output stream. 
58 | * 59 | * A rate limit of 10 requests/second is set for the entire job; internally, Samza uses an embedded 60 | * rate limiter, which evenly distributes the total rate limit among tasks. 61 | * 62 | * A caching table is used over the remote table with a read TTL of 5 seconds, therefore one would 63 | * receive the same quote within this time span. 64 | * 65 | *

Concepts covered: remote table, rate limiter, caching table, stream to table joins. 66 | * 67 | * To run the below example: 68 | * 69 | *

    70 | *
  1. 71 | * Ensure Kafka topics "stock-symbol-input" and "stock-price-output" are created
    72 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic stock-symbol-input --partitions 2 --replication-factor 1 73 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic stock-price-output --partitions 2 --replication-factor 1 74 | *
  2. 75 | *
  3. 76 | * Run the application using the run-app.sh script
    77 | * ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/remote-table-join-example.properties 78 | *
  4. 79 | *
  5. 80 | * Consume messages from the output topic
    81 | * ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic stock-price-output 82 | *
  6. 83 | *
  7. 84 | * Produce some messages to the input topic
    85 | * ./deploy/kafka/bin/kafka-console-producer.sh --topic stock-symbol-input --broker-list localhost:9092 86 | * 87 | * After the console producer is started, type 88 | * MSFT 89 | * 90 | * You should see messages like below from the console consumer window 91 | * {"symbol":"MSFT","close":107.64} 92 | * 93 | * Note: you will need a free API key for symbols other than MSFT, see below for more information. 94 | *
  8. 95 | *
96 | * 97 | */ 98 | public class RemoteTableJoinExample implements StreamApplication { 99 | private static final String KAFKA_SYSTEM_NAME = "kafka"; 100 | private static final List KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181"); 101 | private static final List KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092"); 102 | private static final Map KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1"); 103 | 104 | /** 105 | * Default API key "demo" only works for symbol "MSFT"; however you can get an 106 | * API key for free at https://www.alphavantage.co/, which will work for other symbols. 107 | */ 108 | private static final String API_KEY = "demo"; 109 | 110 | private static final String URL_TEMPLATE = 111 | "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=%s&apikey=" + API_KEY; 112 | 113 | private static final String INPUT_STREAM_ID = "stock-symbol-input"; 114 | private static final String OUTPUT_STREAM_ID = "stock-price-output"; 115 | 116 | @Override 117 | public void describe(StreamApplicationDescriptor appDescriptor) { 118 | KafkaSystemDescriptor kafkaSystemDescriptor = new KafkaSystemDescriptor(KAFKA_SYSTEM_NAME) 119 | .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT) 120 | .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS) 121 | .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS); 122 | 123 | KafkaInputDescriptor stockSymbolInputDescriptor = 124 | kafkaSystemDescriptor.getInputDescriptor(INPUT_STREAM_ID, new StringSerde()); 125 | KafkaOutputDescriptor stockPriceOutputDescriptor = 126 | kafkaSystemDescriptor.getOutputDescriptor(OUTPUT_STREAM_ID, new JsonSerdeV2<>(StockPrice.class)); 127 | appDescriptor.withDefaultSystem(kafkaSystemDescriptor); 128 | MessageStream stockSymbolStream = appDescriptor.getInputStream(stockSymbolInputDescriptor); 129 | OutputStream stockPriceStream = appDescriptor.getOutputStream(stockPriceOutputDescriptor); 130 | 131 | RemoteTableDescriptor 
remoteTableDescriptor = 132 | new RemoteTableDescriptor("remote-table") 133 | .withReadRateLimit(10) 134 | .withReadFunction(new StockPriceReadFunction()); 135 | CachingTableDescriptor cachedRemoteTableDescriptor = 136 | new CachingTableDescriptor<>("cached-remote-table", remoteTableDescriptor) 137 | .withReadTtl(Duration.ofSeconds(5)); 138 | Table> cachedRemoteTable = appDescriptor.getTable(cachedRemoteTableDescriptor); 139 | 140 | stockSymbolStream 141 | .map(symbol -> new KV(symbol, null)) 142 | .join(cachedRemoteTable, new JoinFn()) 143 | .sendTo(stockPriceStream); 144 | 145 | } 146 | 147 | static class JoinFn implements StreamTableJoinFunction, KV, StockPrice> { 148 | @Override 149 | public StockPrice apply(KV message, KV record) { 150 | return record == null ? null : new StockPrice(message.getKey(), record.getValue()); 151 | } 152 | @Override 153 | public String getMessageKey(KV message) { 154 | return message.getKey(); 155 | } 156 | @Override 157 | public String getRecordKey(KV record) { 158 | return record.getKey(); 159 | } 160 | } 161 | 162 | static class StockPriceReadFunction extends BaseTableFunction 163 | implements TableReadFunction { 164 | @Override 165 | public CompletableFuture getAsync(String symbol) { 166 | return CompletableFuture.supplyAsync(() -> { 167 | try { 168 | URL url = new URL(String.format(URL_TEMPLATE, symbol)); 169 | String response = HttpUtil.read(url, 5000, new ExponentialSleepStrategy()); 170 | JsonParser parser = new JsonFactory().createJsonParser(response); 171 | while (!parser.isClosed()) { 172 | if (JsonToken.FIELD_NAME.equals(parser.nextToken()) && "4. 
close".equalsIgnoreCase(parser.getCurrentName())) { 173 | return Double.valueOf(parser.nextTextValue()); 174 | } 175 | } 176 | return -1d; 177 | } catch (Exception ex) { 178 | throw new SamzaException(ex); 179 | } 180 | }); 181 | } 182 | 183 | @Override 184 | public boolean isRetriable(Throwable throwable) { 185 | return false; 186 | } 187 | } 188 | 189 | public static class StockPrice implements Serializable { 190 | 191 | public final String symbol; 192 | public final Double close; 193 | 194 | public StockPrice( 195 | @JsonProperty("symbol") String symbol, 196 | @JsonProperty("close") Double close) { 197 | this.symbol = symbol; 198 | this.close = close; 199 | } 200 | } 201 | 202 | } 203 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/wikipedia/system/WikipediaFeed.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package samza.examples.wikipedia.system; 21 | 22 | import java.io.IOException; 23 | import java.util.HashMap; 24 | import java.util.HashSet; 25 | import java.util.Map; 26 | import java.util.Random; 27 | import java.util.Set; 28 | import org.apache.samza.SamzaException; 29 | import org.codehaus.jackson.map.ObjectMapper; 30 | import org.schwering.irc.lib.IRCConnection; 31 | import org.schwering.irc.lib.IRCEventListener; 32 | import org.schwering.irc.lib.IRCModeParser; 33 | import org.schwering.irc.lib.IRCUser; 34 | import org.slf4j.Logger; 35 | import org.slf4j.LoggerFactory; 36 | 37 | public class WikipediaFeed { 38 | private static final Logger log = LoggerFactory.getLogger(WikipediaFeed.class); 39 | private static final Random random = new Random(); 40 | private static final ObjectMapper jsonMapper = new ObjectMapper(); 41 | 42 | private final Map> channelListeners; 43 | private final String host; 44 | private final int port; 45 | private final IRCConnection conn; 46 | private final String nick; 47 | 48 | public WikipediaFeed(String host, int port) { 49 | this.channelListeners = new HashMap>(); 50 | this.host = host; 51 | this.port = port; 52 | this.nick = "samza-bot-" + Math.abs(random.nextInt()); 53 | this.conn = new IRCConnection(host, new int[] { port }, "", nick, nick, nick); 54 | this.conn.addIRCEventListener(new WikipediaFeedIrcListener()); 55 | this.conn.setEncoding("UTF-8"); 56 | this.conn.setPong(true); 57 | this.conn.setColors(false); 58 | } 59 | 60 | public void start() { 61 | try { 62 | this.conn.connect(); 63 | } catch (IOException e) { 64 | throw new RuntimeException("Unable to connect to " + host + ":" + port + ".", e); 65 | } 66 | } 67 | 68 | public void stop() { 69 | this.conn.interrupt(); 70 | 71 | try { 72 | this.conn.join(); 73 | } catch (InterruptedException e) { 74 | throw new RuntimeException("Interrupted while trying to shutdown IRC connection for " + host + ":" + port, e); 75 | } 76 | 77 | if (this.conn.isAlive()) { 78 | 
throw new RuntimeException("Unable to shutdown IRC connection for " + host + ":" + port); 79 | } 80 | } 81 | 82 | public void listen(String channel, WikipediaFeedListener listener) { 83 | Set listeners = channelListeners.get(channel); 84 | 85 | if (listeners == null) { 86 | listeners = new HashSet(); 87 | channelListeners.put(channel, listeners); 88 | join(channel); 89 | } 90 | 91 | listeners.add(listener); 92 | } 93 | 94 | public void unlisten(String channel, WikipediaFeedListener listener) { 95 | Set listeners = channelListeners.get(channel); 96 | 97 | if (listeners == null) { 98 | throw new RuntimeException("Trying to unlisten to a channel that has no listeners in it."); 99 | } else if (!listeners.contains(listener)) { 100 | throw new RuntimeException("Trying to unlisten to a channel that listener is not listening to."); 101 | } 102 | 103 | listeners.remove(listener); 104 | 105 | if (listeners.size() == 0) { 106 | leave(channel); 107 | } 108 | } 109 | 110 | public void join(String channel) { 111 | conn.send("JOIN " + channel); 112 | } 113 | 114 | public void leave(String channel) { 115 | conn.send("PART " + channel); 116 | } 117 | 118 | public class WikipediaFeedIrcListener implements IRCEventListener { 119 | public void onRegistered() { 120 | log.info("Connected"); 121 | } 122 | 123 | public void onDisconnected() { 124 | log.info("Disconnected"); 125 | } 126 | 127 | public void onError(String msg) { 128 | log.info("Error: " + msg); 129 | } 130 | 131 | public void onError(int num, String msg) { 132 | log.info("Error #" + num + ": " + msg); 133 | } 134 | 135 | public void onInvite(String chan, IRCUser u, String nickPass) { 136 | log.info(chan + "> " + u.getNick() + " invites " + nickPass); 137 | } 138 | 139 | public void onJoin(String chan, IRCUser u) { 140 | log.info(chan + "> " + u.getNick() + " joins"); 141 | } 142 | 143 | public void onKick(String chan, IRCUser u, String nickPass, String msg) { 144 | log.info(chan + "> " + u.getNick() + " kicks " + nickPass); 
145 | } 146 | 147 | public void onMode(IRCUser u, String nickPass, String mode) { 148 | log.info("Mode: " + u.getNick() + " sets modes " + mode + " " + nickPass); 149 | } 150 | 151 | public void onMode(String chan, IRCUser u, IRCModeParser mp) { 152 | log.info(chan + "> " + u.getNick() + " sets mode: " + mp.getLine()); 153 | } 154 | 155 | public void onNick(IRCUser u, String nickNew) { 156 | log.info("Nick: " + u.getNick() + " is now known as " + nickNew); 157 | } 158 | 159 | public void onNotice(String target, IRCUser u, String msg) { 160 | log.info(target + "> " + u.getNick() + " (notice): " + msg); 161 | } 162 | 163 | public void onPart(String chan, IRCUser u, String msg) { 164 | log.info(chan + "> " + u.getNick() + " parts"); 165 | } 166 | 167 | public void onPrivmsg(String chan, IRCUser u, String msg) { 168 | Set listeners = channelListeners.get(chan); 169 | 170 | if (listeners != null) { 171 | WikipediaFeedEvent event = new WikipediaFeedEvent(System.currentTimeMillis(), chan, u.getNick(), msg); 172 | 173 | for (WikipediaFeedListener listener : listeners) { 174 | listener.onEvent(event); 175 | } 176 | } 177 | 178 | log.debug(chan + "> " + u.getNick() + ": " + msg); 179 | } 180 | 181 | public void onQuit(IRCUser u, String msg) { 182 | log.info("Quit: " + u.getNick()); 183 | } 184 | 185 | public void onReply(int num, String value, String msg) { 186 | log.info("Reply #" + num + ": " + value + " " + msg); 187 | } 188 | 189 | public void onTopic(String chan, IRCUser u, String topic) { 190 | log.info(chan + "> " + u.getNick() + " changes topic into: " + topic); 191 | } 192 | 193 | public void onPing(String p) { 194 | } 195 | 196 | public void unknown(String a, String b, String c, String d) { 197 | log.warn("UNKNOWN: " + a + " " + b + " " + c + " " + d); 198 | } 199 | } 200 | 201 | public static interface WikipediaFeedListener { 202 | void onEvent(WikipediaFeedEvent event); 203 | } 204 | 205 | public static final class WikipediaFeedEvent { 206 | private final long 
time; 207 | private final String channel; 208 | private final String source; 209 | private final String rawEvent; 210 | 211 | public WikipediaFeedEvent(long time, String channel, String source, String rawEvent) { 212 | this.time = time; 213 | this.channel = channel; 214 | this.source = source; 215 | this.rawEvent = rawEvent; 216 | } 217 | 218 | public WikipediaFeedEvent(Map jsonObject) { 219 | this((Long) jsonObject.get("time"), (String) jsonObject.get("channel"), (String) jsonObject.get("source"), (String) jsonObject.get("raw")); 220 | } 221 | 222 | public long getTime() { 223 | return time; 224 | } 225 | 226 | public String getChannel() { 227 | return channel; 228 | } 229 | 230 | public String getSource() { 231 | return source; 232 | } 233 | 234 | public String getRawEvent() { 235 | return rawEvent; 236 | } 237 | 238 | @Override 239 | public int hashCode() { 240 | final int prime = 31; 241 | int result = 1; 242 | result = prime * result + ((channel == null) ? 0 : channel.hashCode()); 243 | result = prime * result + ((rawEvent == null) ? 0 : rawEvent.hashCode()); 244 | result = prime * result + ((source == null) ? 
0 : source.hashCode()); 245 | result = prime * result + (int) (time ^ (time >>> 32)); 246 | return result; 247 | } 248 | 249 | @Override 250 | public boolean equals(Object obj) { 251 | if (this == obj) 252 | return true; 253 | if (obj == null) 254 | return false; 255 | if (getClass() != obj.getClass()) 256 | return false; 257 | WikipediaFeedEvent other = (WikipediaFeedEvent) obj; 258 | if (channel == null) { 259 | if (other.channel != null) 260 | return false; 261 | } else if (!channel.equals(other.channel)) 262 | return false; 263 | if (rawEvent == null) { 264 | if (other.rawEvent != null) 265 | return false; 266 | } else if (!rawEvent.equals(other.rawEvent)) 267 | return false; 268 | if (source == null) { 269 | if (other.source != null) 270 | return false; 271 | } else if (!source.equals(other.source)) 272 | return false; 273 | if (time != other.time) 274 | return false; 275 | return true; 276 | } 277 | 278 | @Override 279 | public String toString() { 280 | return "WikipediaFeedEvent [time=" + time + ", channel=" + channel + ", source=" + source + ", rawEvent=" + rawEvent + "]"; 281 | } 282 | 283 | public String toJson() { 284 | return toJson(this); 285 | } 286 | 287 | public static Map toMap(WikipediaFeedEvent event) { 288 | Map jsonObject = new HashMap(); 289 | 290 | jsonObject.put("time", event.getTime()); 291 | jsonObject.put("channel", event.getChannel()); 292 | jsonObject.put("source", event.getSource()); 293 | jsonObject.put("raw", event.getRawEvent()); 294 | 295 | return jsonObject; 296 | } 297 | 298 | public static String toJson(WikipediaFeedEvent event) { 299 | Map jsonObject = toMap(event); 300 | 301 | try { 302 | return jsonMapper.writeValueAsString(jsonObject); 303 | } catch (Exception e) { 304 | throw new SamzaException(e); 305 | } 306 | } 307 | 308 | @SuppressWarnings("unchecked") 309 | public static WikipediaFeedEvent fromJson(String json) { 310 | try { 311 | return new WikipediaFeedEvent((Map) jsonMapper.readValue(json, Map.class)); 312 | } catch 
(Exception e) { 313 | throw new SamzaException(e); 314 | } 315 | } 316 | } 317 | 318 | public static void main(String[] args) throws InterruptedException { 319 | WikipediaFeed feed = new WikipediaFeed("irc.wikimedia.org", 6667); 320 | feed.start(); 321 | 322 | feed.listen("#en.wikipedia", new WikipediaFeedListener() { 323 | @Override 324 | public void onEvent(WikipediaFeedEvent event) { 325 | System.out.println(event); 326 | } 327 | }); 328 | 329 | Thread.sleep(20000); 330 | feed.stop(); 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /src/main/java/samza/examples/cookbook/CouchbaseTableExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package samza.examples.cookbook; 20 | 21 | import com.couchbase.client.java.document.json.JsonObject; 22 | import com.google.common.base.Preconditions; 23 | import com.google.common.collect.ImmutableList; 24 | import com.google.common.collect.ImmutableMap; 25 | import java.text.SimpleDateFormat; 26 | import java.time.Duration; 27 | import java.util.Arrays; 28 | import java.util.Date; 29 | import java.util.List; 30 | import java.util.Map; 31 | import java.util.concurrent.CompletableFuture; 32 | import java.util.concurrent.TimeUnit; 33 | import org.apache.samza.SamzaException; 34 | import org.apache.samza.application.StreamApplication; 35 | import org.apache.samza.application.descriptors.StreamApplicationDescriptor; 36 | import org.apache.samza.context.Context; 37 | import org.apache.samza.operators.MessageStream; 38 | import org.apache.samza.operators.OutputStream; 39 | import org.apache.samza.operators.functions.MapFunction; 40 | import org.apache.samza.serializers.StringSerde; 41 | import org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor; 42 | import org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor; 43 | import org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor; 44 | import org.apache.samza.table.descriptors.RemoteTableDescriptor; 45 | import org.apache.samza.table.remote.NoOpTableReadFunction; 46 | import org.apache.samza.table.remote.RemoteTable; 47 | import org.apache.samza.table.remote.couchbase.CouchbaseTableWriteFunction; 48 | import org.apache.samza.table.retry.TableRetryPolicy; 49 | 50 | 51 | /** 52 | * This is a simple word count example using a remote store. 53 | * 54 | * In this example, we use Couchbase to demonstrate how to invoke API's on a remote store other than get, put or delete 55 | * as defined in {@link org.apache.samza.table.remote.AsyncRemoteTable}. Input messages are collected from user through 56 | * a Kafka console producer, and tokenized using space. 
For each word, we increment a counter for this word 57 | * as well as a counter for all words on Couchbase. We also output the current value of both counters to Kafka console 58 | * consumer. 59 | * 60 | * A rate limit of 4 requests/second to Couchbase is set for the entire job; internally, Samza uses an embedded 61 | * rate limiter, which evenly distributes the total rate limit among tasks. As we invoke 2 calls on Couchbase 62 | * for each word, you should see roughly 2 messages per second in the Kafka console consumer 63 | * window. 64 | * 65 | * A retry policy with 1 second fixed backoff time and max 3 retries is attached to the remote table. 66 | * 67 | *

Concepts covered: remote table, rate limiter, retry, arbitrary operation on remote store. 68 | * 69 | * To run the below example: 70 | * 71 | *

    72 | *
  1. 73 | * Create a Couchbase instance using docker; Log into the admin UI at http://localhost:8091 (Administrator/password)
    74 | * create a bucket called "my-bucket"
    75 | * Under the Security tab, create a user with the same name, set 123456 as the password, and give it "Data Reader" 76 | * and "Data Writer" privileges for this bucket.
    77 | * More information can be found at https://docs.couchbase.com/server/current/getting-started/do-a-quick-install.html 78 | *
  2. 79 | *
  3. 80 | * Create Kafka topics "word-input" and "count-output"
    81 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic word-input --partitions 2 --replication-factor 1 82 | * ./deploy/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic count-output --partitions 2 --replication-factor 1 83 | *
  4. 84 | *
  5. 85 | * Run the application using the run-app.sh script
    86 | * ./deploy/samza/bin/run-app.sh --config-path=$PWD/deploy/samza/config/couchbase-table-example.properties 87 | *
  6. 88 | *
  7. 89 | * Consume messages from the output topic
    90 | * ./deploy/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic count-output 91 | *
  8. 92 | *
  9. 93 | * Produce some messages to the input topic
    94 | * ./deploy/kafka/bin/kafka-console-producer.sh --topic word-input --broker-list localhost:9092 95 | * 96 | * After the console producer is started, type 97 | * 1 98 | * 2 99 | * 3 100 | * 4 101 | * 5 102 | * 4 103 | * 3 104 | * 2 105 | * 1 106 | * 107 | * You should see messages like below from the console consumer window 108 | * 109 | * 2019-05-23 21:18:07 2019-05-23 21:18:07 word=2, count=1, total-count=1 110 | * 2019-05-23 21:18:07 2019-05-23 21:18:07 word=1, count=1, total-count=2 111 | * 2019-05-23 21:18:07 2019-05-23 21:18:07 word=4, count=1, total-count=3 112 | * 2019-05-23 21:18:07 2019-05-23 21:18:07 word=3, count=1, total-count=4 113 | * 2019-05-23 21:18:08 2019-05-23 21:18:08 word=4, count=2, total-count=5 114 | * 2019-05-23 21:18:08 2019-05-23 21:18:08 word=5, count=1, total-count=6 115 | * 2019-05-23 21:18:09 2019-05-23 21:18:09 word=2, count=2, total-count=7 116 | * 2019-05-23 21:18:09 2019-05-23 21:18:09 word=3, count=2, total-count=8 117 | * 2019-05-23 21:18:10 2019-05-23 21:18:10 word=1, count=2, total-count=9 118 | * 119 | * You can examine the result on Couchbase Admin GUI as well. 120 | * 121 | * Note: 122 | * - If you enter "1 2 3 4 5 4 3 2 1", you should see roughly 1 QPS as 123 | * the input is processed by only one task 124 | * 125 | * 126 | *
  10. 127 | *
128 | * 129 | */ 130 | public class CouchbaseTableExample implements StreamApplication { 131 | 132 | private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 133 | 134 | private static final String KAFKA_SYSTEM_NAME = "kafka"; 135 | private static final List KAFKA_CONSUMER_ZK_CONNECT = ImmutableList.of("localhost:2181"); 136 | private static final List KAFKA_PRODUCER_BOOTSTRAP_SERVERS = ImmutableList.of("localhost:9092"); 137 | private static final Map KAFKA_DEFAULT_STREAM_CONFIGS = ImmutableMap.of("replication.factor", "1"); 138 | 139 | private static final String INPUT_STREAM_ID = "word-input"; 140 | private static final String OUTPUT_STREAM_ID = "count-output"; 141 | 142 | private static final String CLUSTER_NODES = "couchbase://127.0.0.1"; 143 | private static final int COUCHBASE_PORT = 11210; 144 | private static final String BUCKET_NAME = "my-bucket"; 145 | private static final String BUCKET_PASSWORD = "123456"; 146 | private static final String TOTAL_COUNT_ID = "total-count"; 147 | 148 | @Override 149 | public void describe(StreamApplicationDescriptor app) { 150 | 151 | KafkaSystemDescriptor kafkaSystemDescriptor = new KafkaSystemDescriptor(KAFKA_SYSTEM_NAME) 152 | .withConsumerZkConnect(KAFKA_CONSUMER_ZK_CONNECT) 153 | .withProducerBootstrapServers(KAFKA_PRODUCER_BOOTSTRAP_SERVERS) 154 | .withDefaultStreamConfigs(KAFKA_DEFAULT_STREAM_CONFIGS); 155 | 156 | KafkaInputDescriptor wordInputDescriptor = 157 | kafkaSystemDescriptor.getInputDescriptor(INPUT_STREAM_ID, new StringSerde()); 158 | 159 | KafkaOutputDescriptor countOutputDescriptor = 160 | kafkaSystemDescriptor.getOutputDescriptor(OUTPUT_STREAM_ID, new StringSerde()); 161 | 162 | MyCouchbaseTableWriteFunction writeFn = new MyCouchbaseTableWriteFunction(BUCKET_NAME, CLUSTER_NODES) 163 | .withBootstrapCarrierDirectPort(COUCHBASE_PORT) 164 | .withUsernameAndPassword(BUCKET_NAME, BUCKET_PASSWORD) 165 | .withTimeout(Duration.ofSeconds(5)); 166 | 167 | TableRetryPolicy 
retryPolicy = new TableRetryPolicy() 168 | .withFixedBackoff(Duration.ofSeconds(1)) 169 | .withStopAfterAttempts(3); 170 | 171 | RemoteTableDescriptor couchbaseTableDescriptor = new RemoteTableDescriptor("couchbase-table") 172 | .withReadFunction(new NoOpTableReadFunction()) 173 | .withReadRateLimiterDisabled() 174 | .withWriteFunction(writeFn) 175 | .withWriteRetryPolicy(retryPolicy) 176 | .withWriteRateLimit(4); 177 | 178 | app.withDefaultSystem(kafkaSystemDescriptor); 179 | MessageStream wordStream = app.getInputStream(wordInputDescriptor); 180 | OutputStream countStream = app.getOutputStream(countOutputDescriptor); 181 | app.getTable(couchbaseTableDescriptor); 182 | 183 | wordStream 184 | .flatMap(m -> Arrays.asList(m.split(" "))) 185 | .filter(word -> word != null && word.length() > 0) 186 | .map(new MyCountFunction()) 187 | .map(countString -> currentTime() + " " + countString) 188 | .sendTo(countStream); 189 | } 190 | 191 | static class MyCountFunction implements MapFunction { 192 | 193 | private MyCouchbaseTableWriteFunction writeFn; 194 | 195 | @Override 196 | public void init(Context context) { 197 | RemoteTable table = (RemoteTable) context.getTaskContext().getTable("couchbase-table"); 198 | writeFn = (MyCouchbaseTableWriteFunction) table.getWriteFunction(); 199 | } 200 | 201 | @Override 202 | public String apply(String word) { 203 | CompletableFuture countFuture = writeFn.incCounter(word); 204 | CompletableFuture totalCountFuture = writeFn.incCounter(TOTAL_COUNT_ID); 205 | return String.format("%s word=%s, count=%d, total-count=%d", 206 | currentTime(), word, countFuture.join(), totalCountFuture.join()); 207 | } 208 | } 209 | 210 | static class MyCouchbaseTableWriteFunction extends CouchbaseTableWriteFunction { 211 | 212 | private final static int OP_COUNTER = 1; 213 | 214 | public MyCouchbaseTableWriteFunction(String bucketName, String... 
clusterNodes) { 215 | super(bucketName, JsonObject.class, clusterNodes); 216 | } 217 | 218 | @Override 219 | public CompletableFuture writeAsync(int opId, Object... args) { 220 | switch (opId) { 221 | case OP_COUNTER: 222 | Preconditions.checkArgument(2 == args.length, 223 | String.format("Two arguments (String and int) are expected for counter operation (opId=%d)", opId)); 224 | String id = (String) args[0]; 225 | int delta = (int) args[1]; 226 | return asyncWriteHelper( 227 | bucket.async().counter(id, delta, 1, timeout.toMillis(), TimeUnit.MILLISECONDS), 228 | String.format("Failed to invoke counter with Id %s from bucket %s.", id, bucketName), 229 | false); 230 | default: 231 | throw new SamzaException("Unknown opId: " + opId); 232 | } 233 | } 234 | 235 | public CompletableFuture incCounter(String id) { 236 | return table.writeAsync(OP_COUNTER, id, 1); 237 | } 238 | 239 | } 240 | 241 | private static String currentTime() { 242 | return DATE_FORMAT.format(new Date()); 243 | } 244 | 245 | } 246 | --------------------------------------------------------------------------------