("ufo_sightings", key, msg);
45 | producer.send(data);
46 | long wait = Math.round(Math.random() * 25);
47 | Thread.sleep(wait);
48 | }
49 | } catch (Exception e) {
50 | e.printStackTrace();
51 | } finally {
52 | producer.close();
53 | }
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/SimpleFlafka/README.md:
--------------------------------------------------------------------------------
1 | # Simple Flafka
2 |
3 | This directory contains the sample code for "Scalability of Kafka Messaging using Consumer Groups".
4 |
5 | ## Instructions
6 |
7 | First, run the Flume agent:
8 |
9 |
10 | $ sudo -u hdfs flume-ng agent -n TwitterAgent -f etc/twitter.conf --conf etc/flume-ng/conf/
11 |
12 |
13 | Then, you can use the kafka-console-consumer command to begin consuming the topic.
14 |
15 |
16 | $ cp /etc/conf/tools-log4j.properties consumer.properties
17 | $ kafka-console-consumer --zookeeper :2181 --topic "tweets_partitioned" --consumer.config consumer.properties
18 |
19 |
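To see the consumer-group behaviour the example is about, give the consumers a shared `group.id` and start more than one of them; the topic partitions are then split between the members of the group. A sketch (the group name is arbitrary and assumes the topic has more than one partition):

$ echo "group.id=flafka_demo" >> consumer.properties
$ kafka-console-consumer --zookeeper :2181 --topic "tweets_partitioned" --consumer.config consumer.properties &
$ kafka-console-consumer --zookeeper :2181 --topic "tweets_partitioned" --consumer.config consumer.properties &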
--------------------------------------------------------------------------------
/SimpleFlafka/etc/twitter.conf:
--------------------------------------------------------------------------------
1 | #Naming the components of the current agent:
2 |
3 | TwitterAgent.sources= Twitter
4 | TwitterAgent.channels= MemChannel
5 | TwitterAgent.sinks = Kafkanic
6 |
7 | # Describing/Configuring the source
8 | TwitterAgent.sources.Twitter.type = org.apache.flume.source.twitter.TwitterSource
9 | TwitterAgent.sources.Twitter.consumerKey =
10 | TwitterAgent.sources.Twitter.consumerSecret =
11 | TwitterAgent.sources.Twitter.accessToken =
12 | TwitterAgent.sources.Twitter.accessTokenSecret =
13 | TwitterAgent.sources.Twitter.keywords = iphonex, teampixel, samsungs8, note8
14 | TwitterAgent.sources.Twitter.language= en, en-gb
15 |
16 | # Describing/Configuring the sink
17 |
18 | TwitterAgent.sinks.Kafkanic.type = org.apache.flume.sink.kafka.KafkaSink
19 | TwitterAgent.sinks.Kafkanic.channel = MemChannel
20 | TwitterAgent.sinks.Kafkanic.brokerList = sgostest-1.gce.cloudera.com:9092,sgostest-2.gce.cloudera.com:9092
21 | TwitterAgent.sinks.Kafkanic.batchSize =100
22 | TwitterAgent.sinks.Kafkanic.topic = tweets_partitioned
23 |
24 | # Describing/Configuring the channel
25 | TwitterAgent.channels.MemChannel.capacity = 10000
26 | TwitterAgent.channels.MemChannel.transactionCapacity = 100
27 | TwitterAgent.channels.MemChannel.type = memory
28 |
29 | # Binding the source and sink to the channel
30 | TwitterAgent.sources.Twitter.channels = MemChannel
--------------------------------------------------------------------------------
/SimpleFlafka/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 | <modelVersion>4.0.0</modelVersion>
4 | <groupId>simpleFlafka</groupId>
5 | <artifactId>simpleFlafka</artifactId>
6 | <version>1.0-SNAPSHOT</version>
7 | <repositories>
8 | <repository>
9 | <id>cloudera</id>
10 | <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
11 | </repository>
12 | </repositories>
13 | <dependencies>
14 | <dependency>
15 | <groupId>org.apache.kafka</groupId>
16 | <artifactId>kafka-clients</artifactId>
17 | <version>1.0.1-kafka-3.1.0</version>
18 | <scope>compile</scope>
19 | </dependency>
20 | </dependencies>
21 | <build>
22 | <plugins>
23 | <plugin>
24 | <groupId>org.apache.maven.plugins</groupId>
25 | <artifactId>maven-compiler-plugin</artifactId>
26 | <version>3.7.0</version>
27 | <configuration>
28 | <source>1.8</source>
29 | <target>1.8</target>
30 | </configuration>
31 | </plugin>
32 | </plugins>
33 | </build>
34 | </project>
35 |
--------------------------------------------------------------------------------
/SimpleFlafka/src/main/java/FlafkaConsumer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2018
3 | */
4 | import java.util.Arrays;
5 | import java.util.Properties;
6 |
7 | import org.apache.kafka.clients.consumer.ConsumerRecord;
8 | import org.apache.kafka.clients.consumer.ConsumerRecords;
9 | import org.apache.kafka.clients.consumer.KafkaConsumer;
10 |
11 | public class FlafkaConsumer {
12 |
13 | public static void main(String[] args){
14 |
15 | Properties props = new Properties();
16 | props.put("bootstrap.servers","broker-1.gce.cloudera.com:9092,broker-2.gce.cloudera.com:9092");
17 | props.put("group.id",args[0]);
18 | props.put("enable.auto.commit","true");
19 | props.put("auto.commit.interval.ms","1000");
20 | props.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
21 | props.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
22 |
23 | KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
24 | String topic = args[1];
25 |
26 | consumer.subscribe(Arrays.asList(topic));
27 | while(true){
28 | ConsumerRecords<String, String> records = consumer.poll(100);
29 | for(ConsumerRecord<String, String> record : records) {
30 | System.out.printf("offset = %d, key = %s \n",record.offset(),record.value().split(",")[4]);
31 | // System.out.printf("offset = %d, key = %s, value = %s \n",record.offset(),record.key(),record.value());
32 |
33 | }
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/SimpleFlafka/src/main/java/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Main-Class: FlafkaConsumer
3 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/README.md:
--------------------------------------------------------------------------------
1 | _Copyright © Cloudera, Inc. 2018_
2 | # Spark Structured Streaming reference application for CDH
3 |
4 | ## Introduction
5 |
6 | This project includes sample applications that demonstrate an Apache Kafka -> Apache Spark Structured Streaming -> Apache Kudu pipeline for ingestion.
7 |
8 | Please check out the [documentation](docs/doc.md) to get an overview of building Spark structured streaming applications on the CDH platform,
9 | a description of the use case the application solves, the components and the integration techniques used to realize a simple
10 | streaming system using the Cloudera stack.
11 |
12 | There are two sample applications, each implementing the streaming pipeline in a different way.
13 |
14 | * The simpleApp focuses on the integration aspect:
15 | it demonstrates the simplest way to connect Spark with Kafka and Kudu.
16 | * The advancedApp also shows a way to abstract out the business logic from the application.
17 | It enables easy switching between various sources and sinks and eases testing on different levels.
18 |
19 | The applications demonstrate some basic Structured Streaming techniques, including stream-static table joins to enrich the data in the incoming stream,
20 | and windowing.
21 |
22 | For the preparation and execution instructions please see the README files of the individual projects.
23 |
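For a first orientation, the two techniques look roughly like this in Spark (a condensed sketch, not code taken from either application; broker, topic and column names are illustrative):

```
import java.sql.Timestamp

import org.apache.spark.sql.{Encoders, SparkSession}
import org.apache.spark.sql.functions.{col, from_json, window}

case class Event(customer_id: Int, event_timestamp: Timestamp)

object StreamStaticSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("stream-static-sketch").getOrCreate()
    import spark.implicits._

    // streaming side: JSON events arriving on a Kafka topic
    val events = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker-1:9092")
      .option("subscribe", "transaction")
      .load()
      .select(from_json(col("value").cast("string"), Encoders.product[Event].schema) as "e")
      .select("e.*")

    // static side: reference data, e.g. loaded from Kudu or local files
    val customers = Seq((1, "John"), (2, "Jane")).toDF("customer_id", "first_name")

    // stream-static join: enrich each incoming event with customer attributes
    val enriched = events.join(customers, Seq("customer_id"), "left_outer")

    // windowing: per-minute event counts, bounded by a watermark
    val counts = events
      .withWatermark("event_timestamp", "10 seconds")
      .groupBy(window(col("event_timestamp"), "1 minute"))
      .count()

    enriched.writeStream.format("console").start()
    counts.writeStream.outputMode("update").format("console").start()
    spark.streams.awaitAnyTermination()
  }
}
```

The sample applications wrap the same ideas into reusable sources, sinks and flows; see the advancedApp's DataFlows.scala for the production version of the join and the windowed aggregation.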
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/README.md:
--------------------------------------------------------------------------------
1 | _Copyright © Cloudera, Inc. 2019_
2 | # Executing the application
3 | ## Locally
4 |
5 | Run `com.cloudera.streaming.refapp.StructuredStreams inputDir outputDir kudu-master`.
6 |
7 | It will start an embedded Kafka and Spark instance.
8 | It's intended to be used during development and testing.
9 |
10 | This version can read local json files or generated input for streams and local files
11 | or Kudu tables for the static datasets.
12 | It produces CSV output or writes to Kudu. You can easily change any sink or source in the code of StructuredStreams.
13 |
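For example, a sink writing CSV files can be defined with the same `Sink` trait the application's queries use and passed in the `Sinks(...)` argument instead of a Kudu sink (a sketch; the object name and the output and checkpoint paths are illustrative):

```
import com.cloudera.streaming.refapp.Sink
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.streaming.DataStreamWriter

object LocalSinks {

  def csvSink(sinkName: String): Sink = new Sink {
    override val name: String = sinkName
    // file sinks use the default append output mode
    override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] =
      df.writeStream
        .format("csv")
        .option("path", s"/tmp/streaming-ref-output/$name")
        .option("checkpointLocation", s"/tmp/streaming-ref-checkpoints/$name")
  }
}
```

KuduSink.writeTable in Kudu.scala builds the cluster sinks in the same way.
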
14 | For input you can use samples in the src/test/resources/samples directory of this project,
15 | or your own files organized in the same structure.
16 |
17 | As an alternative you can use a data generator that will keep producing randomized vendor, customer and
18 | transaction records until it is stopped.
19 | To use the data generator, add or uncomment the following section in the constructor call of the
20 | Application in the StructuredStreams object:
21 | ```
22 | initSources = {
23 | CustomerGenerator(kafkaConfig, "customer").start()
24 | VendorGenerator(kafkaConfig, "vendor").start()
25 | TransactionGenerator(kafkaConfig, "transaction").start()
26 | },
27 | ```
28 |
29 |
30 | ## Submitting to Spark service running on a cluster
31 |
32 | ### Cluster requirements
33 | The application can be deployed on a cluster that already has all the required services:
34 | * Kafka
35 | * Spark
36 | * Kudu
37 |
38 | and their dependencies:
39 | * Zookeeper
40 | * HDFS
41 |
42 | #### Secured cluster
43 | * Kerberos (the application is tested with MIT Kerberos, AD should also be ok)
44 | * Sentry
45 |
46 | All the other services should be configured to use SSL/TLS, with Kerberos for authentication and Sentry for authorization.
47 |
48 | The application itself does not require Impala, but Impala is used by the init scripts to create the Kudu tables and insert initial sample data.
49 |
50 | ### Preparations
51 |
52 | 1. Execute
53 |
54 | `mvn clean package`
55 |
56 | 2. Copy the target/streaming-ref-app-advanced-0.1-SNAPSHOT-jar-with-dependencies.jar, db/init_kudu_db.sql and all files
57 | from the ../scripts directory to a host on the cluster.
58 | 3. SSH to that host
59 | 4. Edit config.sh. It contains reasonable defaults, but make sure that each value fits your environment.
60 | 5. Execute all the other .sh files. They will generate various config files used by the application:
61 | 1. `kudu.sh` - creates the streaming_ref database and tables in Kudu, and the Sentry privileges required to access them
62 | 2. `producer.sh` - creates Sentry privileges and configuration files for the application that generates
63 | input records and sends them to Kafka
64 | 3. `spark-kafka.sh` - creates Kafka-related Sentry privileges and configuration files for the Spark application
65 | 4. `topics.sh` - creates the Kafka topics
66 |
67 |
68 | ### Using the application
69 | The DeployedStructuredStreams application will read records from 3 Kafka topics (customer, vendor, and transaction) and it will
70 | write customer, vendor and transaction data to Kudu tables (customers, vendors, states, valid_transactions, invalid_transactions,
71 | customer_orphans, vendor_orphans, transactions_operational_metadata).
72 |
73 | When the application is started / submitted to the Spark service it will start the streaming pipeline,
74 | but it will not produce any output until it gets data from the Kafka topics.
75 |
76 | You can send data on your own (e.g. using a ConsoleProducer to produce records with the same JSON format as the sample files in
77 | src/test/resources/samples/kafka) or you can use the DeployedDataGenerator application.
78 |
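For reference, a transaction record matching the `Transaction` schema and the date format used by `TransactionSerializer` looks roughly like this (the values are made up):

```
{"transaction_id": "t01", "customer_id": 1, "vendor_id": 2, "event_state": "SWIPED", "event_timestamp": "2018-11-13 10:15:00", "price": "42", "card_type": "Credit"}
```
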
79 | You can check the output e.g. by using Impala.
80 | * Authenticate with `kinit` using a user that has access to the streaming_ref database and all the tables you want to check
81 | * `impala-shell -i -k --ssl`
82 | * in the shell execute
83 | ```
84 | use streaming_ref;
85 |
86 | select 'valid_transactions' as table_name, count(*) from valid_transactions
87 | union
88 | select 'invalid_transactions', count(*) from invalid_transactions
89 | union
90 | select 'customer_orphans', count(*) from customer_orphans
91 | union
92 | select 'vendor_orphans', count(*) from vendor_orphans
93 | union
94 | select 'transactions_operational_metadata', count(*) from transactions_operational_metadata;
95 | ```
96 | to quickly check if the application is producing output, or you can execute any other queries against the output tables.
97 |
98 |
99 | #### Submitting the application without security
100 |
101 | Execute
102 |
103 | ```
104 | spark-submit --files consumer.properties \
105 | --class com.cloudera.streaming.refapp.DeployedStructuredStreams --deploy-mode cluster \
106 | --master yarn streaming-ref-app-advanced-0.1-SNAPSHOT-jar-with-dependencies.jar \
107 | consumer.properties
108 | ```
109 |
110 | #### Submitting the application on a secured cluster
111 |
112 | Execute
113 |
114 | ```
115 | kinit -kt
116 |
117 | spark-submit --files consumer.properties,kafka_client_jaas.conf, --driver-java-options \
118 | "-Djava.security.auth.login.config=./kafka_client_jaas.conf" --class com.cloudera.streaming.refapp.DeployedStructuredStreams \
119 | --conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=./kafka_client_jaas.conf" \
120 | --deploy-mode cluster --master yarn streaming-ref-app-advanced-0.1-SNAPSHOT-jar-with-dependencies.jar \
121 | consumer.properties
122 | ```
123 |
124 | By default the application will keep running until you kill it in YARN. You can pass an additional timeToLive parameter when the
125 | application is submitted; in this case the application will stop after the given time (in seconds). E.g.
126 | ```
127 | spark-submit...streaming-ref-app-advanced-0.1-SNAPSHOT-jar-with-dependencies.jar consumer.properties 600
128 | ```
129 | will stop after 10 minutes.
130 |
131 | ### Running the data generator
132 | Start `java -cp streaming-ref-app-advanced-0.1-SNAPSHOT-jar-with-dependencies.jar \
133 | com.cloudera.streaming.refapp.DeployedDataGenerator producer.properties transactions.per.sec`, where transactions.per.sec is the number of records produced to the "transaction" topic per second.
134 |
135 | # Testing
136 | TransactionsFlowUnitTest and LocalIntegrationTest demonstrate how to write unit tests and integration tests
137 | for Spark Structured Streaming applications.
138 |
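The general shape of such a test, feeding the TransactionsFlow from a MemoryStream and inspecting the result through a memory sink, is roughly the following (a condensed sketch, not the actual test code; the sample rows are made up):

```
import java.sql.Timestamp

import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream

import com.cloudera.streaming.refapp.{Schemas, Transaction, TransactionsFlow}

object TransactionsFlowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("transactions-flow-sketch").getOrCreate()
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    // static reference data, normally read from Kudu
    val states = Seq((1, "Alabama", "AL")).toDF("state_id", "state_name", "state_abbreviation")
    val customers = Seq((1, 1, "John", "Doe", new Timestamp(0L)))
      .toDF("customer_id", "state_id", "first_name", "last_name", "update_timestamp")
    val vendors = Seq((1, "Apple", "123456", new Timestamp(0L)))
      .toDF("vendor_id", "vendor_name", "phone_number", "update_timestamp")

    // streaming input fed directly from the test instead of Kafka
    val transactions = MemoryStream[Transaction]

    val flow = new TransactionsFlow(spark, states, customers, vendors,
      transactions.toDF.withColumn(Schemas.timestampColumnName, $"event_timestamp"))

    // write one output of the flow to an in-memory table so it can be queried
    val query = flow.validTransactions.writeStream
      .format("memory").queryName("valid_transactions").outputMode("append").start()

    transactions.addData(
      Transaction("t01", Some(1), Some(1), Some("AUTHORIZED"),
        new Timestamp(System.currentTimeMillis()), Some("42"), Some("Credit")))
    query.processAllAvailable()

    assert(spark.table("valid_transactions").count() == 1)
  }
}
```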
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/db/init_kudu_db.sql:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 |
5 | CREATE DATABASE IF NOT EXISTS streaming_ref;
6 | USE streaming_ref;
7 |
8 | DROP TABLE IF EXISTS customers;
9 | CREATE TABLE customers (
10 | customer_id INT PRIMARY KEY,
11 | first_name STRING,
12 | last_name STRING,
13 | state_name STRING,
14 | state_abbreviation STRING,
15 | update_timestamp TIMESTAMP)
16 | PARTITION BY HASH (customer_id) PARTITIONS 10
17 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
18 |
19 | DROP TABLE IF EXISTS vendors;
20 | CREATE TABLE vendors (
21 | vendor_id INT PRIMARY KEY,
22 | vendor_name STRING,
23 | phone_number STRING,
24 | update_timestamp TIMESTAMP)
25 | PARTITION BY HASH (vendor_id) PARTITIONS 10
26 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
27 |
28 | DROP TABLE IF EXISTS states;
29 | CREATE TABLE states (
30 | state_id INT PRIMARY KEY,
31 | state_name STRING,
32 | state_abbreviation STRING)
33 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
34 |
35 | DROP TABLE IF EXISTS valid_transactions;
36 | CREATE TABLE valid_transactions (
37 | event_timestamp TIMESTAMP,
38 | transaction_id STRING,
39 | customer_id INT,
40 | vendor_id INT,
41 | event_state STRING,
42 | price STRING,
43 | card_type STRING,
44 | PRIMARY KEY (event_timestamp, transaction_id)
45 | )
46 | PARTITION BY
47 | HASH (transaction_id) PARTITIONS 15,
48 | RANGE (event_timestamp)
49 | (PARTITION '2018-11-01' <= VALUES < '2018-12-01')
50 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
51 | ALTER TABLE valid_transactions ADD RANGE PARTITION '2018-12-01' <= VALUES < '2019-01-01';
52 | ALTER TABLE valid_transactions ADD RANGE PARTITION '2019-01-01' <= VALUES < '2019-02-01';
53 | ALTER TABLE valid_transactions ADD RANGE PARTITION '2019-02-01' <= VALUES < '2019-03-01';
54 | -- ...
55 |
56 | DROP TABLE IF EXISTS invalid_transactions;
57 | CREATE TABLE invalid_transactions (
58 | transaction_id STRING PRIMARY KEY,
59 | customer_id INT,
60 | vendor_id INT,
61 | event_state STRING,
62 | event_timestamp TIMESTAMP,
63 | price STRING,
64 | card_type STRING)
65 | PARTITION BY
66 | HASH (transaction_id) PARTITIONS 15
67 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
68 |
69 | DROP TABLE IF EXISTS customer_orphans;
70 | CREATE TABLE customer_orphans (
71 | customer_id INT PRIMARY KEY,
72 | first_name STRING,
73 | last_name STRING,
74 | state_id INT,
75 | update_timestamp TIMESTAMP)
76 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
77 |
78 | DROP TABLE IF EXISTS vendor_orphans;
79 | CREATE TABLE vendor_orphans (
80 | vendor_id INT PRIMARY KEY,
81 | vendor_name STRING,
82 | phone_number STRING,
83 | update_timestamp TIMESTAMP)
84 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
85 |
86 | DROP TABLE IF EXISTS transactions_operational_metadata;
87 | CREATE TABLE transactions_operational_metadata(
88 | start_ts TIMESTAMP PRIMARY KEY,
89 | end_ts TIMESTAMP,
90 | num_transactions BIGINT)
91 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
92 |
93 | insert into customers values (1, 'John', 'Doe', 'Alabama', 'AL', '2018-01-01');
94 | insert into customers values (2, 'Jane', 'Miller', 'Alaska', 'AK', '2018-01-01');
95 |
96 | insert into vendors values (1, 'Apple', '123456', '2018-11-13');
97 | insert into vendors values (2, 'Dell', '345678', '2018-11-13');
98 |
99 | INSERT INTO states values (-1, 'Unknown', '??');
100 | INSERT INTO states values (1, 'Alabama', 'AL');
101 | INSERT INTO states values (2, 'Alaska', 'AK');
102 | INSERT INTO states values (3, 'Arizona', 'AZ');
103 | INSERT INTO states values (4, 'Arkansas', 'AR');
104 | INSERT INTO states values (5, 'California', 'CA');
105 | INSERT INTO states values (6, 'Colorado', 'CO');
106 | INSERT INTO states values (7, 'Connecticut', 'CT');
107 | INSERT INTO states values (8, 'Delaware', 'DE');
108 | INSERT INTO states values (9, 'District of Columbia', 'DC');
109 | INSERT INTO states values (10, 'Florida', 'FL');
110 | INSERT INTO states values (11, 'Georgia', 'GA');
111 | INSERT INTO states values (12, 'Hawaii', 'HI');
112 | INSERT INTO states values (13, 'Idaho', 'ID');
113 | INSERT INTO states values (14, 'Illinois', 'IL');
114 | INSERT INTO states values (15, 'Indiana', 'IN');
115 | INSERT INTO states values (16, 'Iowa', 'IA');
116 | INSERT INTO states values (17, 'Kansas', 'KS');
117 | INSERT INTO states values (18, 'Kentucky', 'KY');
118 | INSERT INTO states values (19, 'Louisiana', 'LA');
119 | INSERT INTO states values (20, 'Maine', 'ME');
120 | INSERT INTO states values (21, 'Maryland', 'MD');
121 | INSERT INTO states values (22, 'Massachusetts', 'MA');
122 | INSERT INTO states values (23, 'Michigan', 'MI');
123 | INSERT INTO states values (24, 'Minnesota', 'MN');
124 | INSERT INTO states values (25, 'Mississippi', 'MS');
125 | INSERT INTO states values (26, 'Missouri', 'MO');
126 | INSERT INTO states values (27, 'Montana', 'MT');
127 | INSERT INTO states values (28, 'Nebraska', 'NE');
128 | INSERT INTO states values (29, 'Nevada', 'NV');
129 | INSERT INTO states values (30, 'New Hampshire', 'NH');
130 | INSERT INTO states values (31, 'New Jersey', 'NJ');
131 | INSERT INTO states values (32, 'New Mexico', 'NM');
132 | INSERT INTO states values (33, 'New York', 'NY');
133 | INSERT INTO states values (34, 'North Carolina', 'NC');
134 | INSERT INTO states values (35, 'North Dakota', 'ND');
135 | INSERT INTO states values (36, 'Ohio', 'OH');
136 | INSERT INTO states values (37, 'Oklahoma', 'OK');
137 | INSERT INTO states values (38, 'Oregon', 'OR');
138 | INSERT INTO states values (39, 'Pennsylvania', 'PA');
139 | INSERT INTO states values (40, 'Rhode Island', 'RI');
140 | INSERT INTO states values (41, 'South Carolina', 'SC');
141 | INSERT INTO states values (42, 'South Dakota', 'SD');
142 | INSERT INTO states values (43, 'Tennessee', 'TN');
143 | INSERT INTO states values (44, 'Texas', 'TX');
144 | INSERT INTO states values (45, 'Utah', 'UT');
145 | INSERT INTO states values (46, 'Vermont', 'VT');
146 | INSERT INTO states values (47, 'Virginia', 'VA');
147 | INSERT INTO states values (48, 'Washington', 'WA');
148 | INSERT INTO states values (49, 'West Virginia', 'WV');
149 | INSERT INTO states values (50, 'Wisconsin', 'WI');
150 | INSERT INTO states values (51, 'Wyoming', 'WY');
151 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 | <modelVersion>4.0.0</modelVersion>
6 |
7 | <groupId>com.cloudera.streaming.examples</groupId>
8 | <artifactId>streaming-ref-app-advanced</artifactId>
9 | <version>0.1-SNAPSHOT</version>
10 |
11 | <repositories>
12 | <repository>
13 | <id>cloudera</id>
14 | <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
15 | </repository>
16 | </repositories>
17 |
18 | <properties>
19 | <scala.version>2.11</scala.version>
20 | <java.version>1.8</java.version>
21 | <scala.maven.plugin.version>3.3.2</scala.maven.plugin.version>
22 | <maven.compiler.plugin.version>3.7.0</maven.compiler.plugin.version>
23 | <spark.version>2.4.0-cdh6.1.0</spark.version>
24 | <kafka.version>2.0.0-cdh6.1.0</kafka.version>
25 | <kudu.version>1.8.0-cdh6.1.0</kudu.version>
26 | <scalatest.version>3.0.5</scalatest.version>
27 | <scalatest-embedded-kafka.version>2.0.0</scalatest-embedded-kafka.version>
28 | <gson.version>2.8.5</gson.version>
29 | <mockito-core.version>2.15.0</mockito-core.version>
30 | </properties>
31 |
32 | <dependencies>
33 | <dependency>
34 | <groupId>org.apache.spark</groupId>
35 | <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
36 | <version>${spark.version}</version>
37 | <scope>provided</scope>
38 | </dependency>
39 | <dependency>
40 | <groupId>org.apache.spark</groupId>
41 | <artifactId>spark-sql_${scala.version}</artifactId>
42 | <version>${spark.version}</version>
43 | <scope>provided</scope>
44 | </dependency>
45 | <dependency>
46 | <groupId>com.google.code.gson</groupId>
47 | <artifactId>gson</artifactId>
48 | <version>${gson.version}</version>
49 | </dependency>
50 | <dependency>
51 | <groupId>org.scalatest</groupId>
52 | <artifactId>scalatest_${scala.version}</artifactId>
53 | <version>${scalatest.version}</version>
54 | <scope>test</scope>
55 | </dependency>
56 | <dependency>
57 | <groupId>org.apache.spark</groupId>
58 | <artifactId>spark-core_${scala.version}</artifactId>
59 | <version>${spark.version}</version>
60 | <type>test-jar</type>
61 | <scope>test</scope>
62 | </dependency>
63 | <dependency>
64 | <groupId>org.mockito</groupId>
65 | <artifactId>mockito-core</artifactId>
66 | <version>${mockito-core.version}</version>
67 | <scope>test</scope>
68 | </dependency>
69 | <dependency>
70 | <groupId>org.apache.spark</groupId>
71 | <artifactId>spark-catalyst_${scala.version}</artifactId>
72 | <version>${spark.version}</version>
73 | <classifier>tests</classifier>
74 | <scope>test</scope>
75 | </dependency>
76 | <dependency>
77 | <groupId>org.apache.spark</groupId>
78 | <artifactId>spark-sql_${scala.version}</artifactId>
79 | <version>${spark.version}</version>
80 | <classifier>tests</classifier>
81 | <scope>test</scope>
82 | </dependency>
83 | <dependency>
84 | <groupId>net.manub</groupId>
85 | <artifactId>scalatest-embedded-kafka_${scala.version}</artifactId>
86 | <version>${scalatest-embedded-kafka.version}</version>
87 | <scope>test</scope>
88 | </dependency>
89 | <dependency>
90 | <groupId>org.apache.kudu</groupId>
91 | <artifactId>kudu-spark2_${scala.version}</artifactId>
92 | <version>${kudu.version}</version>
93 | </dependency>
94 | </dependencies>
95 |
96 | <build>
97 | <plugins>
98 | <plugin>
99 | <groupId>net.alchim31.maven</groupId>
100 | <artifactId>scala-maven-plugin</artifactId>
101 | <version>${scala.maven.plugin.version}</version>
102 | <executions>
103 | <execution>
104 | <goals>
105 | <goal>compile</goal>
106 | <goal>testCompile</goal>
107 | </goals>
108 | </execution>
109 | </executions>
110 | </plugin>
111 | <plugin>
112 | <artifactId>maven-compiler-plugin</artifactId>
113 | <version>${maven.compiler.plugin.version}</version>
114 | <configuration>
115 | <source>${java.version}</source>
116 | <target>${java.version}</target>
117 | </configuration>
118 | </plugin>
119 | <plugin>
120 | <artifactId>maven-assembly-plugin</artifactId>
121 | <configuration>
122 | <descriptorRefs>
123 | <descriptorRef>jar-with-dependencies</descriptorRef>
124 | </descriptorRefs>
125 | </configuration>
126 | <executions>
127 | <execution>
128 | <id>make-assembly</id>
129 | <phase>package</phase>
130 | <goals>
131 | <goal>single</goal>
132 | </goals>
133 | </execution>
134 | </executions>
135 | </plugin>
136 | </plugins>
137 | </build>
138 |
139 | </project>
140 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.cloudera.streaming.refapp.kudu.KuduSinkProvider
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/Application.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.time.Duration
7 | import java.util.concurrent.{ScheduledFuture, ScheduledThreadPoolExecutor, TimeUnit}
8 |
9 | import org.apache.spark.sql.streaming.StreamingQuery
10 | import org.apache.spark.sql.{DataFrame, SparkSession}
11 | import org.slf4j.LoggerFactory
12 |
13 | case class Sources(statesFromCluster: Source,
14 | customersFromCluster: Source,
15 | vendorsFromCluster: Source,
16 | customersFromStream: Source,
17 | vendorsFromStream: Source,
18 | transactionsFromStream: Source)
19 |
20 | case class Sinks(invalidTransactions: Sink,
21 | validTransactions: Sink,
22 | customerOrphans: Sink,
23 | vendorOrphans: Sink,
24 | customers: Sink,
25 | vendors: Sink,
26 | transactionsOperationalMetadata: Sink)
27 |
28 | class StreamingQueries(val invalidTransactions: Query,
29 | val validTransactions: Query,
30 | val customerOrphans: Query,
31 | val vendorOrphans: Query,
32 | val customers: Query,
33 | val vendors: Query,
34 | val transactionsOperationalMetadata: Query) {
35 |
36 | val allQueries = List(
37 | invalidTransactions,
38 | validTransactions,
39 | customerOrphans,
40 | vendorOrphans,
41 | customers,
42 | vendors,
43 | transactionsOperationalMetadata)
44 |
45 | def start(): Unit = {
46 | invalidTransactions.start()
47 | validTransactions.start()
48 | customerOrphans.start()
49 | vendorOrphans.start()
50 | customers.start()
51 | vendors.start()
52 | transactionsOperationalMetadata.start()
53 | }
54 | }
55 |
56 | class Application(spark: SparkSession,
57 | sources: Sources,
58 | sinks: Sinks,
59 | queryRestartDurations: Map[String, Duration] = Map.empty,
60 | cleanOutput: => Unit = noop,
61 | clusterStartup: => Unit = noop,
62 | initSources: => Unit = noop) {
63 | private val logger = LoggerFactory.getLogger(getClass)
64 |
65 | val transactionsFlow = new TransactionsFlow(
66 | spark,
67 | sources.statesFromCluster(Schemas.state),
68 | sources.customersFromCluster(Schemas.customer),
69 | sources.vendorsFromCluster(Schemas.vendor),
70 | sources.transactionsFromStream(Schemas.transaction)
71 | )
72 |
73 | val customersFlow = new CustomersFlow(spark, sources.customersFromStream(Schemas.customer))
74 | val vendorsFlow = new VendorsFlow(spark, sources.vendorsFromStream(Schemas.vendor))
75 |
76 | var streamingQueries = new StreamingQueries(
77 | // transactionsFlow.validTransactions and invalidTransactions contain columns used for internal calculations,
78 | // these do not fit to our output schemas
79 | createQuery(
80 | transactionsFlow.invalidTransactions.select("transaction_id", "customer_id", "vendor_id", "event_state", "event_timestamp", "price", "card_type"),
81 | sinks.invalidTransactions),
82 | createQuery(
83 | transactionsFlow.validTransactions.select("transaction_id", "customer_id", "vendor_id", "event_state", "event_timestamp", "price", "card_type"),
84 | sinks.validTransactions),
85 | createQuery(transactionsFlow.customerOrphans, sinks.customerOrphans),
86 | createQuery(transactionsFlow.vendorOrphans, sinks.vendorOrphans),
87 | createQuery(customersFlow.customers, sinks.customers),
88 | createQuery(vendorsFlow.vendors, sinks.vendors),
89 | createQuery(transactionsFlow.transactionsOperationalMetadata, sinks.transactionsOperationalMetadata)
90 | )
91 |
92 | def scheduleQueryRestarters(): Unit = {
93 |
94 | def restartQuery(query: Query): Unit = {
95 | println(s"Restarting query ${query.name}")
96 | try {
97 | query.restart()
98 | } catch {
99 | case e: Exception =>
100 | // log warn
101 | println(s"Could not restart query ${query.name}")
102 | e.printStackTrace()
103 | }
104 | }
105 |
106 | var schedules = List[ScheduledFuture[_]]()
107 | val executor = new ScheduledThreadPoolExecutor(1)
108 |
109 | def scheduleQueryRestarter(query:Query, period: Duration) = {
110 | println(s"Scheduling query restart of ${query.name} to $period")
111 | val task = new Runnable {
112 | def run(): Unit = restartQuery(query)
113 | }
114 | val schedule = executor.scheduleAtFixedRate(task, period.getSeconds, period.getSeconds, TimeUnit.SECONDS)
115 | schedules = schedules :+ schedule
116 | }
117 |
118 | streamingQueries.allQueries.foreach{ query =>
119 | val queryRestartPeriod = queryRestartDurations.get(query.name)
120 | queryRestartPeriod match {
121 | case Some(period) => scheduleQueryRestarter(query, period)
122 | case None => // nothing to do
123 | }
124 | }
125 |
126 | if (schedules.nonEmpty) {
127 | Runtime.getRuntime.addShutdownHook(new Thread() {
128 | override def run() {
129 | schedules.foreach { sched =>
130 | sched.cancel(true)
131 | }
132 | }
133 | })
134 | }
135 | }
136 |
137 | def start() {
138 | logger.info("Application starting")
139 | clusterStartup
140 | cleanOutput
141 | initSources
142 | streamingQueries.start()
143 | scheduleQueryRestarters()
144 | logger.info("Application started")
145 | }
146 |
147 | def createQuery(dataFrame: DataFrame, sink: Sink) = new Query {
148 | val writer = sink.createDataStreamWriter(dataFrame)
149 |
150 | var streamingQuery: Option[StreamingQuery] = None
151 |
152 | def start(): Unit =
153 | streamingQuery = Some(writer.start())
154 |
155 | def stop(): Unit =
156 | streamingQuery.foreach{q => q.stop()}
157 |
158 | def restart(): Unit = {
159 | stop()
160 | start()
161 | }
162 |
163 | def processAllAvailable(): Unit =
164 | streamingQuery.foreach{q => q.processAllAvailable() }
165 |
166 | val name = sink.name
167 | }
168 | }
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/DataFlows.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import org.apache.spark.sql.{DataFrame, SparkSession, functions}
7 |
8 | /**
9 | * Enriches incoming customer flow with state name and abbreviation.
10 | */
11 | class CustomersFlow(spark: SparkSession, customersFromStream: DataFrame) {
12 | private val customersWithWatermark = customersFromStream.withWatermark(Schemas.timestampColumnName, "10 seconds")
13 | customersWithWatermark.createOrReplaceTempView("customers_from_kafka")
14 |
15 | val customers = spark.sql(
16 | """SELECT customer_id, first_name, last_name, state_name,
17 | state_abbreviation, CAST(update_timestamp AS TIMESTAMP) update_timestamp
18 | FROM customers_from_kafka c
19 | LEFT OUTER JOIN states_from_cluster s
20 | ON c.state_id = s.state_id""")
21 | }
22 |
23 | /**
24 | * No transformation
25 | */
26 | class VendorsFlow(spark: SparkSession, vendorsFromStream: DataFrame) {
27 | private val vendorsWithWatermark = vendorsFromStream.withWatermark(Schemas.timestampColumnName, "10 seconds")
28 | vendorsWithWatermark.createOrReplaceTempView("vendors_from_kafka")
29 |
30 | val vendors = spark.sql(
31 | """SELECT vendor_id, vendor_name, phone_number,
32 | CAST(update_timestamp AS TIMESTAMP) update_timestamp
33 | FROM vendors_from_kafka""")
34 | }
35 |
36 | /**
37 | * Processes incoming transactions: validates, finds customer and vendor orphans and produces
38 | * operational metadata.
39 | */
40 | class TransactionsFlow(spark: SparkSession,
41 | statesFromCluster: DataFrame, // Schemas.states
42 | customersFromCluster: DataFrame, // Schemas.customer
43 | vendorsFromCluster: DataFrame, // Schemas.vendor
44 | transactionsFromStream: DataFrame) {
45 |
46 | import spark.implicits._
47 |
48 | private val transactionsWithWatermark = transactionsFromStream.withWatermark(Schemas.timestampColumnName, "10 seconds")
49 |
50 | statesFromCluster.createOrReplaceTempView("states_from_cluster")
51 | customersFromCluster.createOrReplaceTempView("customers_from_cluster")
52 | vendorsFromCluster.createOrReplaceTempView("vendors_from_cluster")
53 | transactionsWithWatermark.createOrReplaceTempView("transactions")
54 |
55 |
56 | // TODO consider eliminating unnecessary columns or rewrite to use pure SQL
57 | // timestamp check is not needed if the event comes from kafka
58 | private val validatedTransactions = transactionsWithWatermark.withColumn("mandatory_fields_exist", !'customer_id.isNull && !$"vendor_id".isNull &&
59 | !$"event_timestamp".isNull)
60 | .withColumn("valid_card_type", 'card_type.isin("Credit", "Debit"))
61 | .withColumn("valid_event_state", 'event_state.isin("CREATED", "SWIPED", "CANCELLED", "SIG_REQD", "AUTHORIZED", "DECLINED"))
62 | .withColumn("parsed_event_timestamp", functions.to_timestamp('event_timestamp, "yyyy-MM-dd"))
63 | .withColumn("correct_timestamp_format", !'parsed_event_timestamp.isNull)
64 | .withColumn("valid_record", 'mandatory_fields_exist && 'valid_card_type && 'valid_event_state && 'correct_timestamp_format)
65 |
66 | /**
67 | * Transaction records with missing / incorrect data
68 | */
69 | val invalidTransactions = validatedTransactions.filter(!'valid_record)
70 |
71 | /**
72 | * Transactions with complete and correct data
73 | */
74 | val validTransactions = validatedTransactions.filter('valid_record)
75 | validTransactions.createOrReplaceTempView("card_transactions_good_records")
76 |
77 | /**
78 | * Customers that did not exist in our database but were referenced in transactions
79 | */
80 | val customerOrphans = spark.sql(
81 | """SELECT customer_id, 'Unknown' first_name, 'Unknown' last_name,
82 | -1 state_id, CURRENT_TIMESTAMP() update_timestamp
83 | FROM
84 | (SELECT customer_id FROM card_transactions_good_records) sc
85 | LEFT ANTI JOIN
86 | (SELECT customer_id FROM customers_from_cluster) cc
87 | USING (customer_id)""").dropDuplicates("customer_id")
88 |
89 | customerOrphans.createOrReplaceTempView("customer_orphans")
90 |
91 | /**
92 | * Vendors that did not exist in our database but were referenced in transactions
93 | */
94 | val vendorOrphans = spark.sql(
95 | """SELECT vendor_id, 'Unknown' vendor_name, 'Unknown' phone_number,
96 | CURRENT_TIMESTAMP() update_timestamp
97 | FROM
98 | (SELECT vendor_id FROM card_transactions_good_records) sv
99 | LEFT ANTI JOIN
100 | (SELECT vendor_id FROM vendors_from_cluster) cv
101 | USING (vendor_id)""").dropDuplicates("vendor_id")
102 |
103 | vendorOrphans.createOrReplaceTempView("vendor_orphans")
104 |
105 | // TODO add more operational metadata for invalid records, customer and vendor orphans
106 | /**
107 | * Operational metadata for monitoring.
108 | */
109 | val transactionsOperationalMetadata = transactionsWithWatermark
110 | .groupBy(functions.window(functions.col(Schemas.timestampColumnName), "1 minutes"))
111 | .count().as("c")
112 | .selectExpr("c.window.start as start_ts", "c.window.end as end_ts", "c.count as num_transactions")
113 |
114 | }
115 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/DataGenerator.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.sql.Timestamp
7 | import java.util.concurrent.{ScheduledThreadPoolExecutor, TimeUnit}
8 |
9 | import scala.collection.JavaConverters._
10 | import scala.util.Random
11 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
12 | import org.apache.kafka.common.serialization.{IntegerSerializer, Serializer}
13 | import org.slf4j.{Logger, LoggerFactory}
14 |
15 | object CustomerGenerator {
16 | def apply(kafkaConfig: KafkaConfig, topic: String): DataGenerator[Integer, Customer] = new CustomerGenerator(kafkaConfig, topic).generator
17 | }
18 |
19 | object VendorGenerator {
20 | def apply(kafkaConfig: KafkaConfig, topic: String): DataGenerator[Integer, Vendor] = new VendorGenerator(kafkaConfig, topic).generator
21 | }
22 |
23 | object TransactionGenerator {
24 | def apply(kafkaConfig: KafkaConfig, topic: String, recordsPerSec: Int = 1): DataGenerator[Integer, Transaction] = new TransactionGenerator(kafkaConfig, topic, recordsPerSec).generator
25 | }
26 |
27 | class CustomerGenerator(kafkaConfig: KafkaConfig, topic: String) {
28 | val generator = new DataGenerator[Integer, Customer](
29 | kafkaConfig,
30 | topic = topic,
31 | createValue = Customer(
32 | customer_id = Random.nextInt(10) + 1,
33 | state_id = if (Random.nextInt(100) < 5) -1 else Random.nextInt(51) + 1,
34 | first_name = randomFirstName,
35 | last_name = randomLastName,
36 | update_timestamp = new Timestamp(System.currentTimeMillis() - Random.nextInt(48 * 60 * 60 * 1000))),
37 | getKey = _.customer_id,
38 | keySerializer = new IntegerSerializer(),
39 | valueSerializer = new CustomerSerializer())
40 |
41 | private val firstNames = Seq("John", "Jane", "Alex", "Jessica")
42 |
43 | private def randomFirstName = firstNames(Random.nextInt(firstNames.length))
44 |
45 | private val lastNames = Seq("Doe", "Smith", "Tailor", "Hamilton")
46 |
47 | private def randomLastName = lastNames(Random.nextInt(lastNames.length))
48 | }
49 |
50 | class VendorGenerator(kafkaConfig: KafkaConfig, topic: String) {
51 | val generator = new DataGenerator[Integer, Vendor](
52 | kafkaConfig,
53 | topic = topic,
54 | createValue = Vendor(
55 | vendor_id = Random.nextInt(10) + 1,
56 | vendor_name = randomVendorName,
57 | phone_number = randomPhoneNumber,
58 | update_timestamp = new Timestamp(System.currentTimeMillis() - Random.nextInt(48 * 60 * 60 * 1000))),
59 | getKey = _ => null,
60 | keySerializer = new IntegerSerializer(),
61 | valueSerializer = new VendorSerializer())
62 |
63 | private val vendorNames = Seq("Acme Corp.", "Cyberdyne Systems", "Hooli", "Initech", "Stark Industries", "Wayne Enterprises")
64 |
65 | private def randomVendorName = vendorNames(Random.nextInt(vendorNames.length))
66 |
67 | private def randomPhoneNumber = f"+1-${Random.nextInt(1000)}%03d-555-${Random.nextInt(10000)}%04d"
68 | }
69 |
70 | class TransactionGenerator(kafkaConfig: KafkaConfig, topic: String, recordsPerSecond: Int) {
71 | val generator = new DataGenerator[Integer, Transaction](
72 | kafkaConfig,
73 | topic = topic,
74 | createValue = Transaction(
75 | transaction_id = Random.alphanumeric.take(3).mkString,
76 | customer_id = if (Random.nextInt(100) < 20) None else Some(Random.nextInt(10) + 1),
77 | vendor_id = if (Random.nextInt(100) < 20) None else Some(Random.nextInt(10) + 1),
78 | event_state = randomEventState,
79 | event_timestamp = new Timestamp(System.currentTimeMillis() - Random.nextInt(60 * 1000)),
80 | price = if (Random.nextInt(100) < 20) None else Some(Random.nextInt(100000).toString),
81 | card_type = randomCardType),
82 | getKey = transaction => {
83 | val id: Integer = if (transaction.customer_id.isDefined) transaction.customer_id.get else null
84 | id
85 | },
86 | new IntegerSerializer(),
87 | new TransactionSerializer(),
88 | recordsPerSecond
89 | )
90 |
91 | private val states = Seq("CREATED", "SWIPED", "AUTHORIZED", "INVALID")
92 |
93 | private def randomEventState = if (Random.nextInt(100) < 20) None else Some(states(Random.nextInt(states.length)))
94 |
95 | private val cardTypes = Seq("Credit", "Debit", "Whatever")
96 |
97 | private def randomCardType = if (Random.nextInt(100) < 20) None else Some(cardTypes(Random.nextInt(cardTypes.length)))
98 | }
99 |
100 | class DataGenerator[K, V](kafkaConfig: KafkaConfig,
101 | topic: String,
102 | createValue: => V,
103 | getKey: V => K,
104 | keySerializer: Serializer[K],
105 | valueSerializer: Serializer[V],
106 | recordsPerSecond: Int = 1) {
107 |
108 | val logger : Logger = LoggerFactory.getLogger(getClass)
109 |
110 | def start(): Unit = {
111 | logger.info("Data generator starting")
112 | val config: Map[String, Object] = kafkaConfig.kafkaParams
113 | val producer = new KafkaProducer(config.asJava, keySerializer, valueSerializer)
114 |
115 | def generate(recordCount : Int) = try {
116 | for (_ <- 1 to recordCount) {
117 | val value: V = createValue
118 | val key: K = getKey(value)
119 | logger.debug(s"Producing to $topic: $value")
120 | producer.send(new ProducerRecord(topic, key, value))
121 | }
122 | } catch {
123 | case e: Exception =>
124 | logger.error("Exception while producing", e)
125 | System.exit(1)
126 | }
127 |
128 | val ex = new ScheduledThreadPoolExecutor(1)
129 | val task = new Runnable {
130 | def run(): Unit = generate(recordsPerSecond)
131 | }
132 | val sched = ex.scheduleAtFixedRate(task, 1, 1, TimeUnit.SECONDS)
133 |
134 | Runtime.getRuntime.addShutdownHook(new Thread() {
135 | override def run() {
136 | logger.info("Data generator stopping")
137 | sched.cancel(false)
138 | producer.close()
139 | logger.info("Data generator stopped")
140 | }
141 | })
142 | logger.info("Data generator started")
143 | }
144 | }
145 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/DeployedDataGenerator.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | /**
7 | * Created by andrasbeni on 12/1/18.
8 | */
9 | object DeployedDataGenerator {
10 |
11 | def main(args: Array[String]) {
12 |
13 | if (args.length != 2) {
14 | sys.error(
15 | """Usage:
16 | |com.cloudera.streaming.refapp.DeployedDataGenerator producer.config transactions.per.sec
17 | |producer.config path to kafka client properties
18 | |transactions.per.sec number of records produced to "transaction" topic per second
19 | """.stripMargin)
20 | }
21 | val Array(producerConfig, recordsPerSec) = args
22 | val kafkaConfig: KafkaConfig = KafkaConfig.fromPropertiesFile(producerConfig)
23 |
24 | CustomerGenerator(kafkaConfig, "customer").start()
25 | VendorGenerator(kafkaConfig, "vendor").start()
26 | TransactionGenerator(kafkaConfig, "transaction", recordsPerSec.toInt).start()
27 |
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/DeployedStructuredStreams.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import org.apache.spark.sql.SparkSession
7 | import org.slf4j.LoggerFactory
8 |
9 | /**
10 | * A long running streaming application that can be submitted to a running Spark service.
11 | * It reads static data from Kudu, streaming data from Kafka and writes output to Kudu
12 | */
13 | object DeployedStructuredStreams {
14 |
15 | def main(args: Array[String]) {
16 |
17 | val logger = LoggerFactory.getLogger(getClass)
18 |
19 | if (args.length < 2) {
20 | sys.error(
21 | """Usage:
22 | |com.cloudera.streaming.refapp.DeployedStructuredStreams consumer.config kudu-master timeToLive
23 | |consumer.config path to kafka client properties
24 | |kudu-master host:port pair pointing to a kudu master instance
25 | |timeToLive optional, if specified the application will be stopped after timeToLive seconds, useful for testing
26 | """.stripMargin)
27 | }
28 | // extract first two arguments
29 | val Array(consumerConfig, kuduMaster, _*) = args
30 | // read optional argument
31 | val timeToLive = if (args.length > 2) Some(args(2).toInt) else None
32 |
33 | val spark = SparkSession.builder().appName("streaming-ref").getOrCreate()
34 |
35 | val kafkaConfig: KafkaConfig = KafkaConfig.fromPropertiesFile(consumerConfig)
36 | val kafkaSource = new KafkaSource(spark, kafkaConfig)
37 |
38 | val kuduDatabase = "streaming_ref"
39 | val kuduSource = new KuduSource(spark, kuduMaster, kuduDatabase)
40 | val kuduSink = new KuduSink(kuduMaster, kuduDatabase, defaultCheckpointLocation)
41 |
42 | val application = new Application(
43 | spark,
44 | Sources(
45 | statesFromCluster = kuduSource.loadTable("states"),
46 | customersFromCluster = kuduSource.loadTable("customers"),
47 | vendorsFromCluster = kuduSource.loadTable("vendors"),
48 | customersFromStream = kafkaSource.jsonStreamWithKafkaTimestamp("customer"),
49 | vendorsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("vendor", "update_timestamp"),
50 | transactionsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("transaction", "event_timestamp")
51 | ),
52 | Sinks(
53 | validTransactions = kuduSink.writeTable("valid_transactions"),
54 | invalidTransactions = kuduSink.writeTable("invalid_transactions"),
55 | customerOrphans = kuduSink.writeTable("customer_orphans"),
56 | vendorOrphans = kuduSink.writeTable("vendor_orphans"),
57 | customers = kuduSink.writeTable("customers"),
58 | vendors = kuduSink.writeTable("vendors"),
59 | transactionsOperationalMetadata = kuduSink.writeTable("transactions_operational_metadata")
60 | ))
61 |
62 | application.start()
63 |
64 | timeToLive match {
65 | case Some(tl) =>
66 | logger.info(s"Running application for $tl seconds")
67 | Thread.sleep(tl * 1000)
68 | logger.info("Stopping application")
69 | case None => spark.streams.awaitAnyTermination()
70 | }
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/Kafka.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.io.{FileInputStream, InputStream}
7 | import java.util.Properties
8 |
9 | import scala.collection.JavaConverters._
10 |
11 | import org.apache.spark.sql.types.StructType
12 | import org.apache.spark.sql.{DataFrame, SparkSession, functions}
13 |
14 | /**
15 | * Reads/builds configuration for kafka clients that are either standalone or live in Spark.
16 | */
17 | object KafkaConfig {
18 |
19 | /**
20 | * Reads configuration from property file
21 | */
22 | def fromPropertiesFile(configFile : String) = {
23 | var inputStream : Option[InputStream] = None
24 | val kafkaParams = try {
25 | inputStream = Some(new FileInputStream(configFile))
26 | val params = new Properties()
27 | params.load(inputStream.get)
28 | inputStream.get.close()
29 | params.asScala.toMap
30 | } finally {
31 | inputStream.foreach(_.close())
32 | }
33 | new KafkaConfig(kafkaParams)
34 | }
35 |
36 | /**
37 | * Builds configuration for brokers using PLAINTEXT protocol.
38 | */
39 | def fromBrokerList(bootstrapServers: String) = {
40 |
41 | new KafkaConfig(Map[String, String](
42 | "bootstrap.servers" -> bootstrapServers,
43 | "security.protocol" -> "PLAINTEXT"))
44 | }
45 | }
46 |
47 | class KafkaConfig(val kafkaParams : Map[String, String]) {
48 |
49 | /**
50 | * Converts plain kafka configuration for usage in Spark.
51 | */
52 | val kafkaParamsForSpark: Map[String, String] = kafkaParams.map {
53 | case (key, value) => "kafka." + key -> value
54 | }
55 | }
56 | /**
57 | * Creates streaming Sources reading Kafka topics.
58 | */
59 | class KafkaSource(spark: SparkSession, kafkaConfig: KafkaConfig) {
60 |
61 | private def loadStream(topic:String, startingOffset: String) = {
62 |
63 | val params = kafkaConfig.kafkaParamsForSpark +
64 | ("subscribe" -> topic) +
65 | ("startingoffsets" -> startingOffset)
66 | spark.readStream.format("kafka").options(params).load()
67 | }
68 |
69 | /**
70 | * Creates a dataframe from a kafka topic containing Strings.
71 | * Useful for testing and debugging.
72 | */
73 | def stringStream(topic: String): DataFrame = loadStream(topic, "earliest").selectExpr("CAST(value AS STRING)")
74 |
75 | /**
76 | * Creates a streaming source that reads JSON records.
77 | * The DataFrame will include the timestamp that kafka added to the message, called Schemas.timestampColumnName.
78 | */
79 | def jsonStreamWithKafkaTimestamp(topic: String)(schema: StructType): DataFrame = {
80 |
81 | import spark.implicits._
82 |
83 | loadStream(topic, "latest")
84 | .withColumn(Schemas.timestampColumnName, functions.col("timestamp"))
85 | .selectExpr("CAST(value AS STRING)", Schemas.timestampColumnName)
86 | .select(functions.from_json('value, schema) as "entity", functions.col(Schemas.timestampColumnName))
87 | .select("entity.*", Schemas.timestampColumnName)
88 | }
89 |
90 | /**
91 | * Creates a streaming source that reads JSON records.
92 | * The DataFrame will include the timestamp from the original message, called Schemas.timestampColumnName.
93 | */
94 | def jsonStreamWithTimestampFromMessage(topic: String, timestampColumnName: String)(schema: StructType): DataFrame = {
95 |
96 | import spark.implicits._
97 |
98 | // TODO simplify selects
99 | loadStream(topic, "latest")
100 | .selectExpr("CAST(value AS STRING)")
101 | .select(functions.from_json('value, schema) as "entity")
102 | .selectExpr("entity.*", s"entity.$timestampColumnName as ${Schemas.timestampColumnName}")
103 | }
104 | }
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/Kudu.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import org.apache.kudu.spark.kudu._
7 | import org.apache.spark.sql.streaming.{DataStreamWriter, Trigger}
8 | import org.apache.spark.sql.types.StructType
9 | import org.apache.spark.sql.{DataFrame, Row, SparkSession}
10 |
11 | /**
12 | * Creates static Sources reading Kudu tables.
13 | */
14 | class KuduSource(spark: SparkSession, master: String, database: String) {
15 |
16 | /**
17 | * Creates a Source that reads content from a table.
18 | * It will register a TempView with the same name in Spark session, so SQL queries can use it.
19 | */
20 | def loadTable(name: String)(ignored: StructType) = {
21 | val fullTableName = s"impala::$database.$name"
22 | val df = spark
23 | .read
24 | .options(
25 | Map(
26 | "kudu.master" -> master,
27 | "kudu.table" -> fullTableName)).kudu
28 | df.createOrReplaceTempView(name)
29 | df
30 | }
31 | }
32 |
33 | /**
34 | * Creates Sinks that produce streaming output to Kudu tables.
35 | *
36 | * @param checkpointLocation provides the path where the checkpoints are stored, given the name of the Sink
37 | */
38 | class KuduSink(master: String, database: String, checkpointLocation: String => String) {
39 |
40 | def writeTable(sinkName: String, triggerSeconds: Int = 10) =
41 | new Sink {
42 | override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = {
43 | val fullTableName = s"impala::$database.$name"
44 | df
45 | .writeStream
46 | .format("kudu")
47 | .option("kudu.master", master)
48 | .option("kudu.table", fullTableName)
49 | .option("checkpointLocation", checkpointLocation(name))
50 | .option("retries", "3")
51 | .trigger(Trigger.ProcessingTime(s"$triggerSeconds seconds")).outputMode("update")
52 | }
53 |
54 | override val name: String = sinkName
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/Schemas.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.lang.reflect.Type
7 | import java.util
8 |
9 | import com.google.gson._
10 | import org.apache.kafka.common.serialization.Serializer
11 |
12 | import org.apache.spark.sql.Encoders
13 | import org.apache.spark.sql.types.{StringType, StructType}
14 |
15 | /**
16 | * Contains schemas and serializers for various domain objects.
17 | */
18 | object Schemas {
19 | val timestampColumnName = "timestamp"
20 | val transaction: StructType = Encoders.product[Transaction].schema
21 | val vendor: StructType = Encoders.product[Vendor].schema
22 | val customer: StructType = Encoders.product[Customer].schema
23 |
24 | val state: StructType = Encoders.product[State].schema
25 | val plainText: StructType = new StructType().
26 | add("value", StringType)
27 | }
28 |
29 | class OptionSerializer extends JsonSerializer[Option[Any]] {
30 | override def serialize(src: Option[Any], typeOfSrc: Type, context: JsonSerializationContext): JsonElement = {
31 | src match {
32 | case None => JsonNull.INSTANCE
33 | case Some(v) => context.serialize(v)
34 | }
35 | }
36 | }
37 |
38 | class TransactionSerializer extends Serializer[Transaction] {
39 |
40 | private val gson = new GsonBuilder().setDateFormat("yyyy-MM-dd HH:mm:ss").registerTypeHierarchyAdapter(classOf[Option[Any]], new OptionSerializer).create()
41 |
42 | override def configure(map: util.Map[String, _], b: Boolean): Unit = {}
43 |
44 | override def serialize(topic: String, transaction: Transaction): Array[Byte] = {
45 | gson.toJson(transaction).getBytes
46 | }
47 |
48 | override def close(): Unit = {}
49 | }
50 |
51 | class CustomerSerializer extends Serializer[Customer] {
52 |
53 | private val gson = new GsonBuilder().setDateFormat("yyyy-MM-dd").create()
54 |
55 | override def configure(map: util.Map[String, _], b: Boolean): Unit = {}
56 |
57 | override def serialize(topic: String, customer: Customer): Array[Byte] = {
58 | gson.toJson(customer).getBytes
59 | }
60 |
61 | override def close(): Unit = {}
62 | }
63 |
64 | class VendorSerializer extends Serializer[Vendor] {
65 |
66 | private val gson = new GsonBuilder().setDateFormat("yyyy-MM-dd").create()
67 |
68 | override def configure(map: util.Map[String, _], b: Boolean): Unit = {}
69 |
70 | override def serialize(topic: String, vendor: Vendor): Array[Byte] = {
71 | gson.toJson(vendor).getBytes
72 | }
73 |
74 | override def close(): Unit = {}
75 | }
76 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/kudu/KuduSink.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Modifications copyright (C) 2019 Cloudera Inc
3 | */
4 | package com.cloudera.streaming.refapp.kudu
5 |
6 | import org.apache.kudu.spark.kudu.KuduContext
7 | import org.apache.spark.sql.execution.streaming.Sink
8 | import org.apache.spark.sql.{DataFrame, SQLContext}
9 | import org.slf4j.LoggerFactory
10 |
11 | import scala.util.control.NonFatal
12 |
13 | object KuduSink {
14 | def withDefaultContext(sqlContext: SQLContext, parameters: Map[String, String]) =
15 | new KuduSink(new KuduContext(parameters("kudu.master"), sqlContext.sparkContext), parameters)
16 | }
17 |
18 | /**
19 | * A simple Structured Streaming sink which writes the data frame to Kudu.
20 | * It preserves exactly once semantics, as it's idempotent in the face of
21 | * multiple attempts to add the same batch.
22 | *
23 | * It uses the following parameters:
24 | * kudu.master - host:port pair of a kudu master node
25 | * kudu.table - full table name
26 | * checkpointLocation - where the checkpoint will be stored
27 | */
28 | class KuduSink(initKuduContext: => KuduContext, parameters: Map[String, String]) extends Sink {
29 |
30 | private val logger = LoggerFactory.getLogger(getClass)
31 |
32 | private var kuduContext = initKuduContext
33 |
34 | private val tablename = parameters("kudu.table")
35 |
36 | private val retries = parameters.getOrElse("retries", "1").toInt
37 | require(retries >= 0, "retries must be non-negative")
38 |
39 | logger.info(s"Created Kudu sink writing to table $tablename")
40 |
41 | override def addBatch(batchId: Long, data: DataFrame): Unit = {
42 | for (attempt <- 0 to retries) {
43 | try {
44 | kuduContext.upsertRows(data, tablename)
45 | return
46 | } catch {
47 | case NonFatal(e) =>
48 | if (attempt < retries) {
49 | logger.warn("Kudu upsert error, retrying...", e)
50 | kuduContext = initKuduContext
51 | }
52 | else {
53 | logger.error("Kudu upsert error, exhausted", e)
54 | throw e
55 | }
56 | }
57 | }
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/kudu/KuduSinkProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Modifications copyright (C) 2019 Cloudera Inc
3 | */
4 | package com.cloudera.streaming.refapp.kudu
5 |
6 | import org.apache.spark.sql.SQLContext
7 | import org.apache.spark.sql.execution.streaming.Sink
8 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
9 | import org.apache.spark.sql.streaming.OutputMode
10 |
11 | /**
12 | * Registers KuduSink to Spark streaming, so it can be used with format "kudu".
13 | *
14 | * Note: to make it effective you need META-INF/services/org.apache.spark.sql.sources.DataSourceRegister to
15 | * refer to this class.
16 | */
17 | class KuduSinkProvider extends StreamSinkProvider with DataSourceRegister {
18 |
19 | override def createSink(sqlContext: SQLContext,
20 | parameters: Map[String, String],
21 | partitionColumns: Seq[String],
22 | outputMode: OutputMode): Sink = {
23 | require(outputMode == OutputMode.Update, "only 'update' OutputMode is supported")
24 | KuduSink.withDefaultContext(sqlContext, parameters)
25 | }
26 |
27 | override def shortName(): String = "kudu"
28 | }
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/main/scala/com/cloudera/streaming/refapp/package.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming
5 |
6 | import java.sql.Timestamp
7 | import java.util.UUID
8 |
9 | import org.apache.spark.sql.streaming.DataStreamWriter
10 | import org.apache.spark.sql.types.StructType
11 | import org.apache.spark.sql.{DataFrame, Row}
12 | import org.slf4j.LoggerFactory
13 |
14 | package object refapp {
15 |
16 | private val logger = LoggerFactory.getLogger(getClass)
17 |
18 | val noop = {}
19 |
20 | /**
21 | * A Source creates a static or streaming DataFrame. Incoming data is parsed using the given schema.
22 | */
23 | type Source = StructType => DataFrame
24 |
25 | /**
26 | * Represents a streaming query that is executing continuously in the background as new data arrives
27 | */
28 | trait Query {
29 | def start(): Unit
30 | def stop(): Unit
31 | def restart(): Unit
32 | def processAllAvailable(): Unit
33 | val name: String
34 | }
35 |
36 | /**
37 | * Connects the output of a streaming query to a storage or messaging system.
38 | */
39 | trait Sink {
40 | val name: String
41 | def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row]
42 | }
43 |
44 | // domain objects
45 | case class Transaction(transaction_id: String,
46 | customer_id: Option[Int],
47 | vendor_id: Option[Int],
48 | event_state: Option[String],
49 | event_timestamp: Timestamp,
50 | price: Option[String],
51 | card_type: Option[String])
52 |
53 | case class Vendor(vendor_id: Int,
54 | vendor_name: String,
55 | phone_number: String,
56 | update_timestamp: Timestamp)
57 |
58 | case class Customer(customer_id: Int,
59 | state_id: Int,
60 | first_name: String,
61 | last_name: String,
62 | update_timestamp: Timestamp)
63 |
64 | case class State(state_id: Int,
65 | state_name: String,
66 | state_abbreviation: String)
67 |
68 |
69 |
70 | private val baseCheckpointLocation = "/tmp/temporary-" + UUID.randomUUID.toString
71 | logger.info(s"Storing Spark checkpoints in $baseCheckpointLocation")
72 |
73 | def defaultCheckpointLocation(streamName: String) = s"$baseCheckpointLocation/$streamName"
74 |
75 | }
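
Because Source is just a function from a schema to a DataFrame and Sink only has to build a DataStreamWriter, plugging in new inputs and outputs stays small. A hypothetical sketch (the rate source and console sink are illustrations, not components of the app):

```scala
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode}
import org.apache.spark.sql.types.StructType
import com.cloudera.streaming.refapp.{Sink, Source}

// Hypothetical Source: ignores the schema and produces Spark's built-in rate stream.
def rateSource(spark: SparkSession): Source =
  (_: StructType) => spark.readStream.format("rate").option("rowsPerSecond", "1").load()

// Hypothetical Sink: prints every micro-batch to the console.
def consoleSink(sinkName: String): Sink = new Sink {
  override val name: String = sinkName
  override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] =
    df.writeStream
      .outputMode(OutputMode.Append)
      .format("console")
      .queryName(name)
}
```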
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/cluster/customers/customers.json:
--------------------------------------------------------------------------------
1 | {"customer_id": 1, "first_name": "John", "last_name": "Doe", "state_abbreviation": "AL", "state_name": "Alabama", "update_timestamp": "2018-01-01"},
2 | {"customer_id": 2, "first_name": "Jane", "last_name": "Miller", "state_abbreviation": "AK", "state_name": "Alaska", "update_timestamp": "2018-02-02"}
3 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/cluster/states/states.json:
--------------------------------------------------------------------------------
1 | {"state_id": -1, "state_name": "Unknown", "state_abbreviation": "??"},
2 | {"state_id": 1, "state_name": "Alabama", "state_abbreviation": "AL"},
3 | {"state_id": 2, "state_name": "Alaska", "state_abbreviation": "AK"},
4 | {"state_id": 3, "state_name": "Arizona", "state_abbreviation": "AZ"},
5 | {"state_id": 4, "state_name": "Arkansas", "state_abbreviation": "AR"},
6 | {"state_id": 5, "state_name": "California", "state_abbreviation": "CA"},
7 | {"state_id": 6, "state_name": "Colorado", "state_abbreviation": "CO"},
8 | {"state_id": 7, "state_name": "Connecticut", "state_abbreviation": "CT"},
9 | {"state_id": 8, "state_name": "Delaware", "state_abbreviation": "DE"},
10 | {"state_id": 9, "state_name": "District of Columbia", "state_abbreviation": "DC"},
11 | {"state_id": 10, "state_name": "Florida", "state_abbreviation": "FL"},
12 | {"state_id": 11, "state_name": "Georgia", "state_abbreviation": "GA"},
13 | {"state_id": 12, "state_name": "Hawaii", "state_abbreviation": "HI"},
14 | {"state_id": 13, "state_name": "Idaho", "state_abbreviation": "ID"},
15 | {"state_id": 14, "state_name": "Illinois", "state_abbreviation": "IL"},
16 | {"state_id": 15, "state_name": "Indiana", "state_abbreviation": "IN"},
17 | {"state_id": 16, "state_name": "Iowa", "state_abbreviation": "IA"},
18 | {"state_id": 17, "state_name": "Kansas", "state_abbreviation": "KS"},
19 | {"state_id": 18, "state_name": "Kentucky", "state_abbreviation": "KY"},
20 | {"state_id": 19, "state_name": "Louisiana", "state_abbreviation": "LA"},
21 | {"state_id": 20, "state_name": "Maine", "state_abbreviation": "ME"},
22 | {"state_id": 21, "state_name": "Maryland", "state_abbreviation": "MD"},
23 | {"state_id": 22, "state_name": "Massachusetts", "state_abbreviation": "MA"},
24 | {"state_id": 23, "state_name": "Michigan", "state_abbreviation": "MI"},
25 | {"state_id": 24, "state_name": "Minnesota", "state_abbreviation": "MN"},
26 | {"state_id": 25, "state_name": "Mississippi", "state_abbreviation": "MS"},
27 | {"state_id": 26, "state_name": "Missouri", "state_abbreviation": "MO"},
28 | {"state_id": 27, "state_name": "Montana", "state_abbreviation": "MT"},
29 | {"state_id": 28, "state_name": "Nebraska", "state_abbreviation": "NE"},
30 | {"state_id": 29, "state_name": "Nevada", "state_abbreviation": "NV"},
31 | {"state_id": 30, "state_name": "New Hampshire", "state_abbreviation": "NH"},
32 | {"state_id": 31, "state_name": "New Jersey", "state_abbreviation": "NJ"},
33 | {"state_id": 32, "state_name": "New Mexico", "state_abbreviation": "NM"},
34 | {"state_id": 33, "state_name": "New York", "state_abbreviation": "NY"},
35 | {"state_id": 34, "state_name": "North Carolina", "state_abbreviation": "NC"},
36 | {"state_id": 35, "state_name": "North Dakota", "state_abbreviation": "ND"},
37 | {"state_id": 36, "state_name": "Ohio", "state_abbreviation": "OH"},
38 | {"state_id": 37, "state_name": "Oklahoma", "state_abbreviation": "OK"},
39 | {"state_id": 38, "state_name": "Oregon", "state_abbreviation": "OR"},
40 | {"state_id": 39, "state_name": "Pennsylvania", "state_abbreviation": "PA"},
41 | {"state_id": 40, "state_name": "Rhode Island", "state_abbreviation": "RI"},
42 | {"state_id": 41, "state_name": "South Carolina", "state_abbreviation": "SC"},
43 | {"state_id": 42, "state_name": "South Dakota", "state_abbreviation": "SD"},
44 | {"state_id": 43, "state_name": "Tennessee", "state_abbreviation": "TN"},
45 | {"state_id": 44, "state_name": "Texas", "state_abbreviation": "TX"},
46 | {"state_id": 45, "state_name": "Utah", "state_abbreviation": "UT"},
47 | {"state_id": 46, "state_name": "Vermont", "state_abbreviation": "VT"},
48 | {"state_id": 47, "state_name": "Virginia", "state_abbreviation": "VA"},
49 | {"state_id": 48, "state_name": "Washington", "state_abbreviation": "WA"},
50 | {"state_id": 49, "state_name": "West Virginia", "state_abbreviation": "WV"},
51 | {"state_id": 50, "state_name": "Wisconsin", "state_abbreviation": "WI"},
52 | {"state_id": 51, "state_name": "Wyoming", "state_abbreviation": "WY"}
53 |
54 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/cluster/vendors/vendors.json:
--------------------------------------------------------------------------------
1 | {"vendor_id": 1, "vendor_name": "Apple", "phone_number": "123456", "update_timestamp": "2018-11-13"}
2 | {"vendor_id": 2, "vendor_name": "Dell", "phone_number": "345678", "update_timestamp": "2018-11-13"}
3 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/kafka/customers/customers.json:
--------------------------------------------------------------------------------
1 | {"customer_id": 1, "first_name": "John", "last_name": "Doe", "state_id": 3, "update_timestamp": "2018-11-13"},
2 | {"customer_id": 2, "first_name": "Jane", "last_name": "Miller", "state_id": 5, "update_timestamp": "2018-11-13"}
3 | {"customer_id": 3, "first_name": "Joe", "last_name": "Smith", "state_id": 2, "update_timestamp": "2018-11-13"}
4 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/kafka/transactions/transactions-1.json:
--------------------------------------------------------------------------------
1 | {"transaction_id": "11", "customer_id": 1, "vendor_id": 1, "event_state": "CREATED", "event_timestamp": "2018-11-12 09:42:00", "price": "100", "card_type": "Credit"}
2 | {"transaction_id": "12", "customer_id": 2, "vendor_id": 2, "event_state": "SWIPED", "event_timestamp": "2018-11-13 09:43:00", "price": "100", "card_type": "Debit"}
3 | {"transaction_id": "13", "customer_id": 1, "vendor_id": 1, "event_state": "SWIPED", "event_timestamp": "2018-11-13 09:43:01", "price": "100", "card_type": "Debit"}
4 | {"transaction_id": "13", "customer_id": 2, "vendor_id": 2, "event_state": "AUTHORIZED", "event_timestamp": "2018-11-13 09:44:01", "price": "100", "card_type": "Debit"}
5 | {"transaction_id": "13", "customer_id": 1, "vendor_id": 1, "event_state": "AUTHORIZED", "event_timestamp": "2018-11-13 09:45:05", "price": "100", "card_type": "Debit"}
6 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/kafka/transactions/transactions-2.json:
--------------------------------------------------------------------------------
1 | {"transaction_id": "21", "customer_id": 100, "vendor_id": 2, "event_state": "SWIPED", "event_timestamp": "2018-11-13 09:45:01", "price": "100", "card_type": "Debit"}
2 | {"transaction_id": "22", "customer_id": 1, "vendor_id": 200, "event_state": "SWIPED", "event_timestamp": "2018-11-13 09:45:02", "price": "100", "card_type": "Debit"}
3 | {"transaction_id": "23", "customer_id": 1, "vendor_id": 1, "event_state": "INVALID", "event_timestamp": "2018-11-13 09:45:05", "price": "100", "card_type": "Credit"}
4 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/kafka/transactions/transactions-3.json:
--------------------------------------------------------------------------------
1 | {"transaction_id": "31", "customer_id": 1, "vendor_id": 1, "event_state": "CREATED", "event_timestamp": "2018-11-13 09:45:00", "price": "100", "card_type": "Whatever"}
2 | {"transaction_id": "32", "customer_id": 1, "vendor_id": 1, "event_state": "CREATED", "event_timestamp": "2018-11-14 09:47:00", "price": "100", "card_type": "Credit"}
3 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/kafka/transactions/transactions-4.json:
--------------------------------------------------------------------------------
1 | {"transaction_id": "41", "customer_id": 2, "vendor_id": 1, "event_state": "CREATED", "event_timestamp": "2018-11-14 10:47:00", "price": "100", "card_type": "Credit"}
2 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/resources/samples/kafka/vendors/vendors.json:
--------------------------------------------------------------------------------
1 | {"vendor_id": 1, "vendor_name": "Apple", "phone_number": "111111", "update_timestamp": "2018-11-13"}
2 | {"vendor_id": 2, "vendor_name": "Dell", "phone_number": "222222", "update_timestamp": "2018-11-13"}
3 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/EmbeddedKafka.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import net.manub.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}
7 |
8 | object EmbeddedKafkaBroker {
9 |
10 | def embeddedKafkaConfig = EmbeddedKafkaConfig.defaultConfig
11 |
12 | def defaultKafkaConfig = KafkaConfig.fromBrokerList(
13 | s"localhost:${embeddedKafkaConfig.kafkaPort}")
14 |
15 | def start() {
16 | Runtime.getRuntime.addShutdownHook(new Thread() {
17 | override def run() {
18 | EmbeddedKafkaBroker.stop()
19 | }
20 | })
21 |
22 | EmbeddedKafka.start()
23 | }
24 |
25 | def stop() { EmbeddedKafka.stop()}
26 |
27 | def publishStringMessageToKafka(topic: String, message: String) { EmbeddedKafka.publishStringMessageToKafka(topic, message) }
28 | }
29 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/EmbeddedSpark.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.util.UUID
7 |
8 | import org.apache.spark.SparkConf
9 | import org.apache.spark.sql.SparkSession
10 |
11 | object EmbeddedSpark {
12 |
13 |   val sparkSession: SparkSession = SparkSession
14 |     .builder()
15 | .config(
16 | new SparkConf()
17 | .setMaster("local[*]")
18 | .setAppName("test")
19 | .set("spark.ui.enabled", "false")
20 | .set("spark.sql.shuffle.partitions", "1")
21 | .set("spark.app.id", UUID.randomUUID.toString))
22 | .getOrCreate()
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/Files.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.io.File
7 |
8 | import org.apache.commons.io.FileUtils
9 |
10 | import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode, Trigger}
11 | import org.apache.spark.sql.types.StructType
12 | import org.apache.spark.sql.{DataFrame, Row, SparkSession, functions}
13 | import org.slf4j.LoggerFactory
14 |
15 |
16 | /**
17 | * Creates static and streaming Sources reading local or HDFS files.
18 | * Used for testing.
19 | */
20 | class FileSources(spark: SparkSession, inputDir: String) {
21 |
22 | /**
23 | * Creates a streaming source that reads JSON files.
24 | * It assumes that the files are located in the inputDir/kafka/fileName directory
25 | */
26 | def jsonStream(fileName: String, timestampColumnName: String)(schema: StructType): DataFrame =
27 | spark.readStream
28 | .format("json")
29 |       .option("maxFilesPerTrigger", "1") // ensures multiple micro-batches: when there are several input files, each micro-batch reads only one of them
30 | .schema(schema)
31 | .load(s"$inputDir/kafka/${fileName}")
32 | .repartition(1)
33 | .withColumn(Schemas.timestampColumnName, functions.col(timestampColumnName))
34 |
35 | import spark.implicits._
36 |
37 | /**
38 | * Creates a static source that reads JSON files.
39 | * It assumes that the files are located in the inputDir/cluster/fileName directory
40 | */
41 | def jsonFile(fileName: String)(schema: StructType): DataFrame =
42 | spark.sparkContext.textFile(s"$inputDir/cluster/${fileName}", 1).toDF.repartition(1)
43 | .select(functions.from_json('value, schema) as 'entity).select("entity.*")
44 | }
45 |
46 | /**
47 |  * Creates Sinks that produce streaming output to CSV files.
48 |  *
49 |  * @param outputDir local or HDFS directory where the CSV files are written
50 |  * @param checkpointLocation provides the path where the checkpoints are stored,
51 |  *                           given the name of the Sink
52 | */
53 | class FileSinks(outputDir: String, checkpointLocation: String => String) {
54 |
55 | private val logger = LoggerFactory.getLogger(getClass)
56 |
57 | def csv(sinkName: String, triggerSeconds: Int = 10) =
58 | new Sink {
59 | override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = {
60 | df
61 | .writeStream
62 | .outputMode(OutputMode.Append)
63 | .format("csv")
64 | .trigger(Trigger.ProcessingTime(s"$triggerSeconds seconds"))
65 | .option("checkpointLocation", checkpointLocation(name))
66 | .option("path", s"$outputDir/$name.csv")
67 | .option("header", "true")
68 | }
69 |
70 | override val name: String = sinkName
71 | }
72 |
73 | /**
74 | * Purges the output directory.
75 | */
76 | def cleanOutputs(): Unit = {
77 | val file = new File(outputDir)
78 | if (file.exists())
79 | file.listFiles().foreach {
80 | FileUtils.deleteDirectory
81 | }
82 | logger.info(s"Cleaned output directory $outputDir")
83 | }
84 | }
85 |
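
A hypothetical end-to-end wiring of these helpers, to make the directory conventions concrete (the schema, directories and the driver object itself are illustrative, not part of the app):

```scala
package com.cloudera.streaming.refapp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructType, TimestampType}

// Hypothetical driver: reads the vendors sample stream from <inputDir>/kafka/vendors
// and writes it as CSV under the output directory.
object FileSourcesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("file-sources-sketch").getOrCreate()

    // Assumed schema matching the vendors sample JSON files.
    val vendorSchema = new StructType()
      .add("vendor_id", IntegerType)
      .add("vendor_name", StringType)
      .add("phone_number", StringType)
      .add("update_timestamp", TimestampType)

    val sources = new FileSources(spark, "src/test/resources/samples")            // placeholder input dir
    val sinks   = new FileSinks("/tmp/refapp-output", defaultCheckpointLocation)  // placeholder output dir

    val vendors = sources.jsonStream("vendors", "update_timestamp")(vendorSchema)
    sinks.csv("vendors").createDataStreamWriter(vendors).start()

    spark.streams.awaitAnyTermination()
  }
}
```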
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/IntegrationTestBase.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.util.UUID
7 |
8 | import org.scalatest.AppendedClues._
9 | import org.scalatest.Matchers._
10 | import org.scalatest.concurrent.Eventually.{eventually, interval, timeout}
11 | import org.scalatest.time.{Millis, Seconds, Span}
12 | import org.scalatest.{BeforeAndAfter, FunSuite}
13 | import org.apache.spark.sql.streaming.StreamingQuery
14 | import org.slf4j.LoggerFactory
15 |
16 |
17 | abstract class IntegrationTestBase extends FunSuite with BeforeAndAfter {
18 |
19 | private val logger = LoggerFactory.getLogger(getClass)
20 |
21 | private var diagnosticQuery: Option[StreamingQuery] = None
22 |
23 | before {
24 | waitForOneMessage()
25 | }
26 |
27 | after {
28 | diagnosticQuery.foreach(query => query.stop())
29 | }
30 |
31 | private def waitForOneMessage() = {
32 | EmbeddedKafkaBroker.start()
33 |
34 | val spark = EmbeddedSpark.sparkSession
35 |
36 | val topicName = UUID.randomUUID().toString.replaceAll("-", "_")
37 |
38 | val source = new KafkaSource(spark, EmbeddedKafkaBroker.defaultKafkaConfig)
39 | .stringStream(topicName)
40 |
41 | val query = Memory.memorySink(topicName).createDataStreamWriter(source).start()
42 | diagnosticQuery = Some(query)
43 |
44 | EmbeddedKafkaBroker.publishStringMessageToKafka(topicName, "test")
45 |
46 | eventually(timeout(Span(5, Seconds)), interval(Span(5, Millis))) {
47 | query.processAllAvailable()
48 | val currentContent = spark.table(topicName).collect().map(row => row.getAs[String]("value"))
49 |
50 | currentContent.shouldBe(Array("test")).
51 | withClue("Spark did not get diagnostic message from Kafka. Either one of them failed to start or they can't communicate.")
52 | }
53 | logger.info("Kafka and Spark are running, they are able to communicate")
54 | }
55 | }
56 |
57 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/LocalIntegrationTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.sql.Timestamp
7 |
8 | import org.scalatest.Matchers._
9 | import org.scalatest.concurrent.Eventually._
10 | import org.scalatest.time.{Seconds, Span}
11 |
12 | import org.apache.spark.sql.Encoders
13 |
14 | class LocalIntegrationTest extends IntegrationTestBase {
15 |
16 | test("Integration test with one kafka and one spark instance embedded in the same JVM") {
17 |
18 | val inputDir = "src/test/resources/samples"
19 |
20 | val spark = EmbeddedSpark.sparkSession
21 |
22 | val fileSource = new FileSources(spark, inputDir)
23 | val kafkaConfig = EmbeddedKafkaBroker.defaultKafkaConfig
24 | val kafkaSource = new KafkaSource(spark, kafkaConfig)
25 |
26 | val application = new Application(
27 | spark,
28 | Sources(
29 | statesFromCluster = fileSource.jsonFile("states"),
30 | customersFromCluster = fileSource.jsonFile("customers"),
31 | vendorsFromCluster = fileSource.jsonFile("vendors"),
32 | customersFromStream = kafkaSource.jsonStreamWithKafkaTimestamp("customer"),
33 | vendorsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("vendor", "update_timestamp"),
34 | transactionsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("transaction", "event_timestamp")
35 | ),
36 | Sinks(
37 | invalidTransactions = Memory.memorySink("invalidTransactions"),
38 | validTransactions = Memory.memorySink("validTransactions"),
39 | customerOrphans = Memory.memorySink("customerOrphans"),
40 | vendorOrphans = Memory.memorySink("vendorOrphans"),
41 | customers = Memory.memorySink("customers"),
42 | vendors = Memory.memorySink("vendors"),
43 | transactionsOperationalMetadata = Memory.memorySink("transactionsOperationalMetadata")
44 | ))
45 |
46 | application.start()
47 |
48 | eventually(timeout(Span(20, Seconds)), interval(Span(5, Seconds))) {
49 | EmbeddedKafkaBroker.publishStringMessageToKafka(
50 | "transaction",
51 | """{
52 | "transaction_id": "1",
53 | "customer_id": 1,
54 | "vendor_id": 1,
55 | "event_state": "CREATED",
56 | "event_timestamp": "2018-11-12 09:42:00",
57 | "price": "100",
58 | "card_type": "Credit"}""")
59 | EmbeddedKafkaBroker.publishStringMessageToKafka(
60 | "transaction",
61 | """{
62 | "transaction_id": "21",
63 | "customer_id": 100,
64 | "vendor_id": 2,
65 | "event_state": "SWIPED",
66 | "event_timestamp": "2018-11-13 09:45:01",
67 | "price": "100",
68 | "card_type": "Debit"}""")
69 |
70 | val validTransactionsQuery = application.streamingQueries.validTransactions
71 | validTransactionsQuery.processAllAvailable()
72 | val currentContent = spark.table("validTransactions").as[Transaction](Encoders.product).collect()
73 |
74 | currentContent.shouldBe(
75 | Array(
76 | Transaction(
77 | transaction_id = "1",
78 | customer_id = Some(1),
79 | vendor_id = Some(1),
80 | event_state = Some("CREATED"),
81 | event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
82 | price = Some("100"),
83 | card_type = Some("Credit")),
84 | Transaction(
85 | transaction_id = "21",
86 | customer_id = Some(100),
87 | vendor_id = Some(2),
88 | event_state = Some("SWIPED"),
89 | event_timestamp = Timestamp.valueOf("2018-11-13 09:45:01"),
90 | price = Some("100"),
91 | card_type = Some("Debit"))
92 | ))
93 | }
94 | }
95 | }
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/Memory.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode}
7 | import org.apache.spark.sql.{DataFrame, Row}
8 |
9 | object Memory {
10 |
11 | def memorySink(sinkName: String) = new Sink {
12 | override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = df
13 | .writeStream
14 | .outputMode(OutputMode.Append)
15 | .queryName(name)
16 | .format("memory")
17 |
18 | override val name: String = sinkName
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/StructuredStreams.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.time.Duration
7 |
8 | object StructuredStreams {
9 |
10 | def main(args: Array[String]) {
11 |
12 | if (args.length != 3) {
13 | sys.error(
14 | """Usage:
15 | |com.cloudera.streaming.refapp.StructuredStreams inputDir outputDir kudu-master
16 | |inputDir should have the same structure as the src/main/resources/samples directory of this project
17 |         |outputDir is created if it does not exist and purged if it does
18 | |kudu-master host:port pair pointing to a kudu master instance""".stripMargin)
19 | }
20 | val Array(inputDir, outputDir, kuduMaster) = args
21 |
22 | val spark = EmbeddedSpark.sparkSession
23 |
24 | val fileSource = new FileSources(spark, inputDir)
25 | val fileSink = new FileSinks(outputDir, defaultCheckpointLocation)
26 | val kafkaConfig = EmbeddedKafkaBroker.defaultKafkaConfig
27 | val kafkaSource = new KafkaSource(spark, kafkaConfig)
28 |
29 | val kuduDatabase = "streaming_ref"
30 | val kuduSource = new KuduSource(spark, kuduMaster, kuduDatabase)
31 | val kuduSink = new KuduSink(kuduMaster, kuduDatabase, defaultCheckpointLocation)
32 |
33 | val application = new Application(
34 | spark,
35 | Sources(
36 | statesFromCluster = kuduSource.loadTable("states"),
37 | customersFromCluster = fileSource.jsonFile("customers"),
38 | vendorsFromCluster = kuduSource.loadTable("vendors"),
39 | // customersFromStream = fileSource.jsonStream("customers", "update_timestamp"),
40 | // vendorsFromStream = fileSource.jsonStream("vendors", "update_timestamp"),
41 | // transactionsFromStream = fileSource.jsonStream("transactions", "event_timestamp")
42 | customersFromStream = kafkaSource.jsonStreamWithKafkaTimestamp("customer"),
43 | vendorsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("vendor", "update_timestamp"),
44 | transactionsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("transaction", "event_timestamp")
45 | ),
46 | Sinks(
47 | // invalidTransactions = fileSink.csv("invalidTransactions"),
48 | // validTransactions = fileSink.csv("validTransactions"),
49 | // customerOrphans = fileSink.csv("customerOrphans"),
50 | // vendorOrphans = fileSink.csv("vendorOrphans"),
51 | // customers = fileSink.csv("customers"),
52 | // vendors = fileSink.csv("vendors"),
53 | // transactionsOperationalMetadata = fileSink.csv("transactionsOperationalMetadata")
54 | validTransactions = kuduSink.writeTable("valid_transactions"),
55 | invalidTransactions = kuduSink.writeTable("invalid_transactions"),
56 | customerOrphans = kuduSink.writeTable("customer_orphans"),
57 | vendorOrphans = kuduSink.writeTable("vendor_orphans"),
58 | customers = kuduSink.writeTable("customers"),
59 | vendors = kuduSink.writeTable("vendors"),
60 | transactionsOperationalMetadata = kuduSink.writeTable("transactions_operational_metadata")
61 | ),
62 | clusterStartup = EmbeddedKafkaBroker.start(),
63 | initSources = {
64 | CustomerGenerator(kafkaConfig, "customer").start()
65 | VendorGenerator(kafkaConfig, "vendor").start()
66 | TransactionGenerator(kafkaConfig, "transaction").start()
67 | },
68 | cleanOutput = fileSink.cleanOutputs,
69 | queryRestartDurations = Map("valid_transactions" -> Duration.ofMinutes(1))
70 | )
71 |
72 | application.start()
73 | spark.streams.awaitAnyTermination()
74 | }
75 | }
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/TransactionsFlowUnitTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.sql.Timestamp
7 |
8 | import org.scalatest.BeforeAndAfter
9 |
10 | import org.apache.spark.sql.execution.streaming.MemoryStream
11 |
12 | class TransactionsFlowUnitTest extends UnitTestBase with BeforeAndAfter {
13 | import testImplicits._
14 |
15 | var transactionsFromStream: MemoryStream[Transaction] = _
16 |   var transactionsFlow: TransactionsFlow = _
17 |
18 | before {
19 | transactionsFromStream = MemoryStream[Transaction]
20 |     transactionsFlow = new TransactionsFlow(
21 | spark,
22 | statesFromCluster,
23 | customersFromCluster,
24 | vendorsFromCluster,
25 | transactionsFromStream = transactionsFromStream
26 | .toDF.withColumn("timestamp", $"event_timestamp".cast("timestamp")))
27 | }
28 |
29 | test("Valid records are written to the validTransactions output") {
30 |
31 | val validTransaction = Transaction(
32 | transaction_id = "1",
33 | customer_id = Some(1),
34 | vendor_id = Some(1),
35 | event_state = Some("CREATED"),
36 | event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
37 | price = Some("100"),
38 | card_type = Some("Credit"))
39 |
40 |     testStream(transactionsFlow.validTransactions.select('transaction_id, 'customer_id, 'vendor_id, 'event_state, 'event_timestamp, 'price, 'card_type)) (
41 | AddData(transactionsFromStream, validTransaction),
42 | CheckAnswer(validTransaction)
43 | )
44 | }
45 |
46 | test("Invalid records are written to the invalidTransactions output") {
47 | // Note: transactionsFlow.validTransactions and invalidTransactions contain the fields that we used for internal calculations, e.g. for validation
48 | // It enables us to check the internal calculations
49 |     testStream(transactionsFlow.invalidTransactions.select('transaction_id, 'valid_card_type)) (
50 | AddData(transactionsFromStream,
51 | Transaction(
52 | transaction_id = "2",
53 | customer_id = Some(1),
54 | vendor_id = Some(1),
55 | event_state = Some("CREATED"),
56 | event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
57 | price = Some("100"),
58 | card_type = Some("Invalid"))),
59 | CheckAnswer(("2", false))
60 | )
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/UnitTestBase.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.sql.Timestamp
7 |
8 | import org.scalatest.BeforeAndAfter
9 |
10 | import org.apache.spark.sql.streaming.StreamTest
11 | import org.apache.spark.sql.test.SharedSQLContext
12 |
13 | abstract class UnitTestBase extends StreamTest with BeforeAndAfter with SharedSQLContext {
14 | import testImplicits._
15 |
16 | lazy val statesFromCluster = List(
17 | State(state_id = 1, state_name = "Alabama", state_abbreviation = "AL"),
18 | State(state_id = 2, state_name = "Alaska", state_abbreviation = "AK")).toDF
19 |
20 | lazy val customersFromCluster = List(
21 | Customer(customer_id = 1, first_name = "John", last_name = "Doe", state_id = 1, update_timestamp = Timestamp.valueOf("2018-01-01 01:02:03")),
22 | Customer(customer_id = 2, first_name = "Jane", last_name = "Miller", state_id = 2, update_timestamp = Timestamp.valueOf("2018-01-02 01:02:03"))).toDF
23 |
24 | lazy val vendorsFromCluster = List(
25 | Vendor(vendor_id = 1, vendor_name = "Apple", phone_number = "123456", update_timestamp = Timestamp.valueOf("2018-11-13 01:02:03")),
26 | Vendor(vendor_id = 2, vendor_name = "Dell", phone_number = "345678", update_timestamp = Timestamp.valueOf("2018-11-13 01:02:03"))
27 | ).toDF
28 |
29 | after {
30 | sqlContext.streams.active.foreach(_.stop())
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/advancedApp/src/test/scala/com/cloudera/streaming/refapp/kudu/KuduSinkUnitTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp.kudu
5 |
6 | import org.apache.kudu.spark.kudu.KuduContext
7 | import org.apache.spark.sql.DataFrame
8 | import org.mockito.Mockito._
9 | import org.scalatest._
10 | import org.scalatest.mockito.MockitoSugar
11 |
12 | class KuduSinkUnitTest extends FunSuite with MockitoSugar {
13 |
14 | private val frame = mock[DataFrame]
15 |
16 | private def setupKuduContextMock(kuduContext: KuduContext, failTimes: Int): KuduContext = {
17 | if (failTimes > 0) {
18 | val stubber = doThrow(new RuntimeException)
19 | for (_ <- 2 to failTimes) {
20 | stubber.doThrow(new RuntimeException)
21 | }
22 | stubber.doCallRealMethod()
23 | .when(kuduContext).upsertRows(frame, "table")
24 | }
25 | kuduContext
26 | }
27 |
28 | test("kudu upsert fails, retries once") {
29 | val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 1), 1)
30 |
31 | helper.sink.addBatch(0, frame)
32 | assert(helper.initialized == 1, "context should be initialized once")
33 | }
34 |
35 | test("kudu upsert fails twice, retries once, fails") {
36 | val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 2), 1)
37 |
38 | intercept[RuntimeException] {
39 | helper.sink.addBatch(0, frame)
40 | }
41 | assert(helper.initialized == 1, "context should be initialized once")
42 | }
43 |
44 | test("kudu upsert fails 3 times, retries 3 times") {
45 | val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 3), 3)
46 | helper.sink.addBatch(0, frame)
47 | assert(helper.initialized == 3, "context should be initialized three times")
48 | }
49 |
50 | test("kudu upsert fails 3 times, retries 4 times") {
51 | val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 3), 4)
52 | helper.sink.addBatch(0, frame)
53 | assert(helper.initialized == 3, "context should be initialized only three times")
54 | }
55 |
56 | }
57 |
58 | class KuduSinkWithMockedContext(kuduContext: KuduContext, retries: Int) {
59 |
60 |   // the KuduSink constructor initializes the context once, so start from -1 to count only re-initializations
61 | var initialized = -1
62 |
63 |   private def initKuduContext: KuduContext = {
64 | initialized += 1
65 | kuduContext
66 | }
67 |
68 |   val sink = new KuduSink(initKuduContext, Map(
69 | "kudu.table" -> "table",
70 | "kudu.master" -> "master",
71 | "retries" -> retries.toString))
72 | }
73 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/docs/images/dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudera/kafka-examples/4408ba67c21a0646ed76b8fd066eca0ef0a311a2/StructuredStreamingRefApp/docs/images/dag.png
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/docs/images/flows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudera/kafka-examples/4408ba67c21a0646ed76b8fd066eca0ef0a311a2/StructuredStreamingRefApp/docs/images/flows.png
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/docs/images/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudera/kafka-examples/4408ba67c21a0646ed76b8fd066eca0ef0a311a2/StructuredStreamingRefApp/docs/images/pipeline.png
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/docs/images/streaming-systems.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudera/kafka-examples/4408ba67c21a0646ed76b8fd066eca0ef0a311a2/StructuredStreamingRefApp/docs/images/streaming-systems.png
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/scripts/config.sh:
--------------------------------------------------------------------------------
1 | # Copyright (C) Cloudera, Inc. 2019
2 |
3 | # Configuration for the setup scripts; it is meant to be sourced by them.
4 | # Each ": ..." line below documents a setting and assigns a default that can be overridden from the environment.
5 |
6 | : "Kerberos principal used for setup: ${SETUP_PRINCIPAL:=kafka}"
7 |
8 | : "Keytab used to login as setup principal: ${SETUP_KEYTAB:=/cdep/keytabs/$SETUP_PRINCIPAL.keytab}"
9 |
10 | : "Input topics to set up: ${TOPICS:=customer vendor transaction}"
11 |
12 | : "Zookeeper quorum: ${ZOOKEEPER_QUORUM:=$(hostname):2181}"
13 |
14 | : "Kafka broker list: ${BROKER_LIST:=$(hostname):9093}"
15 |
16 | # Producer
17 | : "Hosts producer connects from: ${PRODUCER_HOSTS:=*}"
18 |
19 | : "Producer user/principal: ${PRODUCER_USER:=flume}"
20 |
21 | : "Keytab used to login as producer: ${PRODUCER_KEYTAB:=/cdep/keytabs/$PRODUCER_USER.keytab}"
22 |
23 | : "Kerberos realm: ${PRODUCER_REALM:=`klist -kt $PRODUCER_KEYTAB | grep '@' | head -n 1 | sed s/.*@// | sed s/[[:space:]]//`}"
24 |
25 | : "Primary group of producer user: ${PRODUCER_GROUP:=`id -gn $PRODUCER_USER`}"
26 |
27 | : "Producer Sentry role: ${PRODUCER_ROLE:=$PRODUCER_GROUP}"
28 |
29 | : "Producer truststore location: ${PRODUCER_TRUSTSTORE_LOCATION:=/etc/cdep-ssl-conf/CA_STANDARD/truststore.jks}"
30 |
31 | : "Directory to store producer's files: ${PRODUCER_FILES_DIR:=`pwd`}"
32 |
33 |
34 | # Spark application as Kafka consumer
35 | : "Hosts Spark application consumes from: ${SPARK_HOSTS:=*}"
36 |
37 | : "Producer user/principal: ${SPARK_USER:=systest}"
38 |
39 | : "Keytab used by Spark application: ${SPARK_KEYTAB:=/cdep/keytabs/$SPARK_USER.keytab}"
40 |
41 | : "Kerberos realm: ${SPARK_REALM:=`klist -kt $SPARK_KEYTAB | grep '@' | head -n 1 | sed s/.*@// | sed s/[[:space:]]//`}"
42 |
43 | : "Primary group of consumer user: ${SPARK_GROUP:=`id -gn $SPARK_USER`}"
44 |
45 | : "Consumer Sentry role: ${SPARK_ROLE:=$SPARK_GROUP}"
46 |
47 | : "Consumer truststore location: ${SPARK_TRUSTSTORE_LOCATION:=/etc/cdep-ssl-conf/CA_STANDARD/truststore.jks}"
48 |
49 | : "Directory to store consumer's files: ${CONSUMER_FILES_DIR:=`pwd`}"
50 |
51 | # Database
52 | : "Database admin user: ${DB_ADMIN_USER:=impala}"
53 |
54 | : "Keytab used to login as db admin user: ${DB_ADMIN_KEYTAB:=/cdep/keytabs/$DB_ADMIN_USER.keytab}"
55 |
56 | : "Impala daemon to connect: ${IMPALA_DAEMON}"
57 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/scripts/kudu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (C) Cloudera, Inc. 2019
3 |
4 | set -fex
5 |
6 | # Creates the streaming_ref database and its tables in Kudu, and the Sentry privileges required to access them
7 |
8 | . $(dirname $0)/config.sh
9 |
10 | SQL_FILE=$(dirname $0)/init_kudu_db.sql
11 |
12 | if [ -z "${IMPALA_DAEMON}" ]
13 | then
14 |   : "\${IMPALA_DAEMON} must be set to the host[:port] value impala-shell can connect to"
15 | exit 1
16 | fi
17 |
18 | # Create sentry role for Spark if it does not exist
19 | kinit -kt $SETUP_KEYTAB $SETUP_PRINCIPAL
20 | if kafka-sentry -lr | grep -q $SPARK_ROLE ; then
21 | echo $SPARK_ROLE already exists
22 | else
23 | : "Create sentry role $SPARK_ROLE"
24 | kafka-sentry -cr -r $SPARK_ROLE
25 | fi
26 |
27 | kinit -kt $DB_ADMIN_KEYTAB $DB_ADMIN_USER
28 |
29 | impala-shell -i ${IMPALA_DAEMON} -f ${SQL_FILE} -k --ssl
30 |
31 | impala-shell -i ${IMPALA_DAEMON} -k --ssl -q "GRANT ALL ON DATABASE streaming_ref to ${SPARK_ROLE}"
32 |
33 | kdestroy
34 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/scripts/producer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (C) Cloudera, Inc. 2019
3 |
4 | set -fex
5 |
6 | # Creates the Sentry privileges and configuration files for the producer application
7 |
8 | . $(dirname $0)/config.sh
9 |
10 | kinit -kt $SETUP_KEYTAB $SETUP_PRINCIPAL
11 | : "Create sentry role $PRODUCER_ROLE"
12 | kafka-sentry -cr -r $PRODUCER_ROLE
13 |
14 |
15 | : "Add role $PRODUCER_ROLE to group $PRODUCER_GROUP"
16 | kafka-sentry -arg -r $PRODUCER_ROLE -g $PRODUCER_GROUP
17 |
18 | for TOPIC in $TOPICS
19 | do
20 | for PRODUCER_HOST in $PRODUCER_HOSTS
21 | do
22 | : "Grant privileges to role $PRODUCER_ROLE on topic $TOPIC from host $PRODUCER_HOST"
23 | kafka-sentry -gpr -r $PRODUCER_ROLE -p "Host=$PRODUCER_HOST->Topic=$TOPIC->action=describe"
24 | kafka-sentry -gpr -r $PRODUCER_ROLE -p "Host=$PRODUCER_HOST->Topic=$TOPIC->action=write"
25 | done
26 | done
27 |
28 | : "Create client.properties"
29 | cat >$PRODUCER_FILES_DIR/producer.properties<Topic=$TOPIC->action=describe"
28 | kafka-sentry -gpr -r $SPARK_ROLE -p "Host=$SPARK_HOST->Topic=$TOPIC->action=read"
29 | : "Allow role $SPARK_ROLE to join any consumer group from host $SPARK_HOST"
30 | kafka-sentry -gpr -r $SPARK_ROLE -p "Host=$SPARK_HOST->Consumergroup=*->action=describe"
31 | kafka-sentry -gpr -r $SPARK_ROLE -p "Host=$SPARK_HOST->Consumergroup=*->action=read"
32 | done
33 | done
34 |
35 | : "Create jaas.config"
36 | cat > kafka_client_jaas.conf<$CONSUMER_FILES_DIR/consumer.properties< -k --ssl`
54 | * in the shell execute
55 | ```
56 | use streaming_ref;
57 |
58 | select count(*) from transactions;
59 | ```
60 | to quickly check whether the application is producing output, or run any other query against the output tables.
61 |
62 |
63 | #### Submitting the application without security
64 |
65 | Execute
66 |
67 | ```
68 | spark-submit --files consumer.properties \
69 | --class com.cloudera.streaming.refapp.StructuredStreamingApp --deploy-mode cluster \
70 | --master yarn streaming-ref-app-simple-0.1-SNAPSHOT-jar-with-dependencies.jar \
71 | consumer.properties
72 | ```
73 |
74 | #### Submitting the application on a secured cluster
75 |
76 | Execute
77 |
78 | ```
79 | kinit -kt <keytab> <principal>
80 |
81 | spark-submit --files consumer.properties,kafka_client_jaas.conf,<keytab> --driver-java-options \
82 | "-Djava.security.auth.login.config=./kafka_client_jaas.conf" --class com.cloudera.streaming.refapp.StructuredStreamingApp \
83 | --conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=./kafka_client_jaas.conf" \
84 | --deploy-mode cluster --master yarn streaming-ref-app-simple-0.1-SNAPSHOT-jar-with-dependencies.jar \
85 | consumer.properties
86 | ```
87 |
88 | The application keeps running until you kill it in YARN.
89 |
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/simpleApp/db/init_kudu_db.sql:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 |
5 | CREATE DATABASE IF NOT EXISTS streaming_ref;
6 | USE streaming_ref;
7 |
8 | DROP TABLE IF EXISTS customers;
9 | CREATE TABLE customers (
10 | customer_id INT PRIMARY KEY,
11 | first_name STRING,
12 | last_name STRING,
13 | state_name STRING,
14 | state_abbreviation STRING,
15 | update_timestamp TIMESTAMP)
16 | PARTITION BY HASH (customer_id) PARTITIONS 10
17 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
18 |
19 | DROP TABLE IF EXISTS transactions;
20 | CREATE TABLE transactions (
21 | event_timestamp TIMESTAMP,
22 | transaction_id STRING,
23 | vendor_id INT,
24 | event_state STRING,
25 | price STRING,
26 | card_type STRING,
27 | customer_id INT,
28 | customer_first_name STRING,
29 | customer_last_name STRING,
30 | is_valid BOOLEAN,
31 | PRIMARY KEY (event_timestamp, transaction_id)
32 | )
33 | PARTITION BY
34 | HASH (transaction_id) PARTITIONS 15,
35 | RANGE (event_timestamp)
36 | (PARTITION '2018-11-01' <= VALUES < '2018-12-01')
37 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '1');
38 | ALTER TABLE transactions ADD RANGE PARTITION '2018-12-01' <= VALUES < '2019-01-01';
39 | ALTER TABLE transactions ADD RANGE PARTITION '2019-01-01' <= VALUES < '2019-02-01';
40 | ALTER TABLE transactions ADD RANGE PARTITION '2019-02-01' <= VALUES < '2019-03-01';
41 | -- ...
42 |
43 | DROP TABLE IF EXISTS operational_metadata;
44 | CREATE TABLE operational_metadata(
45 | start_ts TIMESTAMP PRIMARY KEY,
46 | end_ts TIMESTAMP,
47 | num_transactions BIGINT)
48 | STORED AS KUDU TBLPROPERTIES ('kudu.num_tablet_replicas' = '3');
49 |
50 | insert into customers values (1, 'John', 'Doe', 'Alabama', 'AL', '2018-01-01');
51 | insert into customers values (2, 'Jane', 'Miller', 'Alaska', 'AK', '2018-01-01');
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/simpleApp/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.cloudera.streaming.examples</groupId>
8 |     <artifactId>streaming-ref-app-simple</artifactId>
9 |     <version>0.1-SNAPSHOT</version>
10 |
11 |     <repositories>
12 |         <repository>
13 |             <id>cloudera</id>
14 |             <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
15 |         </repository>
16 |     </repositories>
17 |
18 |     <properties>
19 |         <scala.version>2.11</scala.version>
20 |         <java.version>1.8</java.version>
21 |         <scala.maven.plugin.version>3.3.2</scala.maven.plugin.version>
22 |         <maven.compiler.plugin.version>3.7.0</maven.compiler.plugin.version>
23 |         <spark.version>2.4.0-cdh6.1.0</spark.version>
24 |         <kafka.version>2.0.0-cdh6.1.0</kafka.version>
25 |         <kudu.version>1.8.0-cdh6.1.0</kudu.version>
26 |     </properties>
27 |
28 |     <dependencies>
29 |         <dependency>
30 |             <groupId>org.apache.spark</groupId>
31 |             <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
32 |             <version>${spark.version}</version>
33 |             <scope>provided</scope>
34 |         </dependency>
35 |         <dependency>
36 |             <groupId>org.apache.spark</groupId>
37 |             <artifactId>spark-sql_${scala.version}</artifactId>
38 |             <version>${spark.version}</version>
39 |             <scope>provided</scope>
40 |         </dependency>
41 |         <dependency>
42 |             <groupId>org.apache.kudu</groupId>
43 |             <artifactId>kudu-spark2_${scala.version}</artifactId>
44 |             <version>${kudu.version}</version>
45 |         </dependency>
46 |     </dependencies>
47 |
48 |     <build>
49 |         <plugins>
50 |             <plugin>
51 |                 <groupId>net.alchim31.maven</groupId>
52 |                 <artifactId>scala-maven-plugin</artifactId>
53 |                 <version>${scala.maven.plugin.version}</version>
54 |                 <executions>
55 |                     <execution>
56 |                         <goals>
57 |                             <goal>compile</goal>
58 |                             <goal>testCompile</goal>
59 |                         </goals>
60 |                     </execution>
61 |                 </executions>
62 |             </plugin>
63 |             <plugin>
64 |                 <artifactId>maven-compiler-plugin</artifactId>
65 |                 <version>${maven.compiler.plugin.version}</version>
66 |                 <configuration>
67 |                     <source>${java.version}</source>
68 |                     <target>${java.version}</target>
69 |                 </configuration>
70 |             </plugin>
71 |             <plugin>
72 |                 <artifactId>maven-assembly-plugin</artifactId>
73 |                 <configuration>
74 |                     <descriptorRefs>
75 |                         <descriptorRef>jar-with-dependencies</descriptorRef>
76 |                     </descriptorRefs>
77 |                 </configuration>
78 |                 <executions>
79 |                     <execution>
80 |                         <id>make-assembly</id>
81 |                         <phase>package</phase>
82 |                         <goals>
83 |                             <goal>single</goal>
84 |                         </goals>
85 |                     </execution>
86 |                 </executions>
87 |             </plugin>
88 |         </plugins>
89 |     </build>
90 | </project>
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/simpleApp/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.cloudera.streaming.refapp.kudu.KuduSinkProvider
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/simpleApp/src/main/scala/com/cloudera/streaming/refapp/StructuredStreamingApp.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) Cloudera, Inc. 2019
3 | */
4 | package com.cloudera.streaming.refapp
5 |
6 | import java.io.{FileInputStream, InputStream}
7 | import java.sql.Timestamp
8 | import java.util.{Properties, UUID}
9 |
10 | import scala.collection.JavaConverters._
11 |
12 | import org.apache.kudu.spark.kudu._
13 |
14 | import org.apache.spark.sql.types.StructType
15 | import org.apache.spark.sql.{DataFrame, Encoders, SparkSession, functions}
16 |
17 | /**
18 | * A long running streaming application that can be submitted to a running Spark service.
19 | * It reads static data from Kudu, streaming data from Kafka and writes output to Kudu.
20 | * It demonstrates
21 | * - joining the two datasets
22 | * - performing simple validation
23 | * - working with time windows to create aggregate statistics
24 | * It focuses on the kafka-spark-kudu integration and keeps all other aspects as simple as possible.
25 | */
26 | object StructuredStreamingApp {
27 |
28 | case class Transaction(transaction_id: String,
29 | customer_id: Option[Int],
30 | vendor_id: Option[Int],
31 | event_state: Option[String],
32 | event_timestamp: Timestamp,
33 | price: Option[String],
34 | card_type: Option[String])
35 |
36 | def main(args: Array[String]) {
37 |
38 | if (args.length < 2) {
39 | sys.error(
40 | """Usage:
41 | |com.cloudera.streaming.refapp.StructuredStreamingApp consumer.config kudu-master
42 | |consumer.config path to kafka client properties
43 | |kudu-master host:port pair pointing to a kudu master instance
44 | """.stripMargin)
45 | }
46 |
47 | // extract first two arguments
48 | val Array(consumerConfig, kuduMaster) = args
49 | val kafkaParamsForSpark = kafkaConfigFromPropertiesFile(consumerConfig)
50 |
51 | val spark = SparkSession.builder().appName("streaming-ref").getOrCreate()
52 |
53 | import spark.implicits._
54 |
55 | def readKafkaStream(topic: String, schema: StructType) = {
56 | val kafkaOptions = kafkaParamsForSpark ++ Map("subscribe" -> topic, "startingoffsets" -> "latest")
57 |
58 | val df = spark.readStream.format("kafka").options(kafkaOptions).load()
59 | .selectExpr("CAST(value AS STRING)")
60 | .select(functions.from_json('value, schema) as "parsedValue")
61 | .selectExpr("parsedValue.*")
62 | df.createOrReplaceTempView(topic)
63 | df
64 | }
65 |
66 | val kuduDatabase = "streaming_ref"
67 |
68 | def readKuduTable(name: String) = {
69 | val fullTableName = s"impala::$kuduDatabase.$name"
70 | val df = spark
71 | .read
72 | .options(Map(
73 | "kudu.master" -> kuduMaster,
74 | "kudu.table" -> fullTableName))
75 | .kudu
76 | df.createOrReplaceTempView(name)
77 | df
78 | }
79 |
80 | // Checkpointing is needed for failure handling: streaming queries that use checkpointing
81 |     // can be continued after a failure from where the failed one left off, ensuring data consistency guarantees.
82 | // Each query needs a unique checkpoint location, that's why a random UUID is used.
83 | // In production you may want to set it to a stable, but unique, reliable location (e.g. on HDFS).
84 | val baseCheckpointLocation = "/tmp/temporary-" + UUID.randomUUID.toString
85 |
86 | def writeKuduTable(df: DataFrame, name: String) = {
87 | val fullTableName = s"impala::$kuduDatabase.$name"
88 | df
89 | .writeStream
90 | .format("kudu")
91 | .option("kudu.master", kuduMaster)
92 | .option("kudu.table", fullTableName)
93 | .option("checkpointLocation", s"$baseCheckpointLocation/$name")
94 | .option("retries", "3")
95 | .outputMode("update")
96 | .start()
97 | }
98 |
99 | val customers = readKuduTable("customers")
100 |
101 | val transactions = readKafkaStream("transaction", Encoders.product[Transaction].schema)
102 | .withWatermark("event_timestamp", "1 minute")
103 |
104 | val enrichedTransactions = spark.sql(
105 | """
106 | SELECT
107 | t.*,
108 | c.first_name as customer_first_name, c.last_name as customer_last_name,
109 | card_type in ('Visa', 'MasterCard') and event_state in ('created') as is_valid
110 | FROM transaction t
111 | LEFT OUTER JOIN customers c ON t.customer_id = c.customer_id
112 | """.stripMargin)
113 |
114 | writeKuduTable(enrichedTransactions, "transactions")
115 |
116 | val operationalMetadata = transactions
117 | .groupBy(functions.window(functions.col("event_timestamp"), "1 minutes"))
118 | .count().as("c")
119 | .selectExpr("c.window.start as start_ts", "c.window.end as end_ts", "c.count as num_transactions")
120 |
121 | writeKuduTable(operationalMetadata, "operational_metadata")
122 |
123 | spark.streams.awaitAnyTermination()
124 |
125 | }
126 |
127 | def kafkaConfigFromPropertiesFile(configFile: String) = {
128 | var inputStream: Option[InputStream] = None
129 | val kafkaParams = try {
130 | inputStream = Some(new FileInputStream(configFile))
131 | val params = new Properties()
132 | params.load(inputStream.get)
133 | inputStream.get.close()
134 | params.asScala.toMap
135 | } finally {
136 | inputStream.foreach(_.close())
137 | }
138 | kafkaParams.map {
139 | case (key, value) => "kafka." + key -> value
140 | }
141 | }
142 |
143 | }
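
The checkpointing comment above suggests using a stable location in production instead of the random /tmp directory; a minimal sketch of that variant, assuming an HDFS path of your choosing (the path below is a placeholder):

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery

// Hypothetical production variant of writeKuduTable: the checkpoint base path is stable across
// restarts, while each query still gets its own sub-directory keyed by table name.
def writeKuduTableWithStableCheckpoints(df: DataFrame, name: String,
                                        kuduMaster: String, kuduDatabase: String): StreamingQuery =
  df.writeStream
    .format("kudu")
    .option("kudu.master", kuduMaster)
    .option("kudu.table", s"impala::$kuduDatabase.$name")
    .option("checkpointLocation", s"hdfs:///user/streaming_ref/checkpoints/$name") // placeholder path
    .option("retries", "3")
    .outputMode("update")
    .start()
```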
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/simpleApp/src/main/scala/com/cloudera/streaming/refapp/kudu/KuduSink.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Modifications copyright (C) 2019 Cloudera Inc
3 | */
4 | package com.cloudera.streaming.refapp.kudu
5 |
6 | import org.apache.kudu.spark.kudu.KuduContext
7 | import org.apache.spark.sql.execution.streaming.Sink
8 | import org.apache.spark.sql.{DataFrame, SQLContext}
9 | import org.slf4j.LoggerFactory
10 |
11 | import scala.util.control.NonFatal
12 |
13 | object KuduSink {
14 | def withDefaultContext(sqlContext: SQLContext, parameters: Map[String, String]) =
15 | new KuduSink(new KuduContext(parameters("kudu.master"), sqlContext.sparkContext), parameters)
16 | }
17 |
18 | /**
19 | * A simple Structured Streaming sink which writes the data frame to Kudu.
20 | * It preserves exactly once semantics, as it's idempotent in the face of
21 | * multiple attempts to add the same batch.
22 | *
23 | * It uses the following parameters:
24 | * kudu.master - host:port pair of a kudu master node
25 | * kudu.table - full table name
26 | * checkpointLocation - where the checkpoint will be stored
27 | */
28 | class KuduSink(initKuduContext: => KuduContext, parameters: Map[String, String]) extends Sink {
29 |
30 | private val logger = LoggerFactory.getLogger(getClass)
31 |
32 | private var kuduContext = initKuduContext
33 |
34 | private val tablename = parameters("kudu.table")
35 |
36 | private val retries = parameters.getOrElse("retries", "1").toInt
37 | require(retries >= 0, "retries must be non-negative")
38 |
39 | logger.info(s"Created Kudu sink writing to table $tablename")
40 |
41 | override def addBatch(batchId: Long, data: DataFrame): Unit = {
42 | for (attempt <- 0 to retries) {
43 | try {
44 | kuduContext.upsertRows(data, tablename)
45 | return
46 | } catch {
47 | case NonFatal(e) =>
48 | if (attempt < retries) {
49 | logger.warn("Kudu upsert error, retrying...", e)
50 | kuduContext = initKuduContext
51 | }
52 | else {
53 | logger.error("Kudu upsert error, exhausted", e)
54 | throw e
55 | }
56 | }
57 | }
58 | }
59 | }
60 |
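
The context parameter is passed by name so that a retry re-evaluates it and gets a fresh KuduContext. A hypothetical direct construction showing this together with the retries parameter (master address and table name are placeholders; the app normally goes through KuduSink.withDefaultContext):

```scala
import com.cloudera.streaming.refapp.kudu.KuduSink
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.SparkSession

// Hypothetical wiring: the first argument is by-name, so each retry rebuilds a fresh KuduContext.
// With retries = "2" the upsert is attempted at most three times before the error is rethrown.
def buildKuduSink(spark: SparkSession): KuduSink =
  new KuduSink(
    new KuduContext("kudu-master.example.com:7051", spark.sparkContext), // placeholder master
    Map(
      "kudu.table" -> "impala::streaming_ref.transactions",              // placeholder table
      "retries"    -> "2"))
```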
--------------------------------------------------------------------------------
/StructuredStreamingRefApp/simpleApp/src/main/scala/com/cloudera/streaming/refapp/kudu/KuduSinkProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Modifications copyright (C) 2019 Cloudera Inc
3 | */
4 | package com.cloudera.streaming.refapp.kudu
5 |
6 | import org.apache.spark.sql.SQLContext
7 | import org.apache.spark.sql.execution.streaming.Sink
8 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
9 | import org.apache.spark.sql.streaming.OutputMode
10 |
11 | /**
12 |  * Registers KuduSink with Spark Structured Streaming so that it can be used with format "kudu".
13 |  *
14 |  * Note: for the registration to take effect, META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
15 |  * must refer to this class.
16 | */
17 | class KuduSinkProvider extends StreamSinkProvider with DataSourceRegister {
18 |
19 | override def createSink(sqlContext: SQLContext,
20 | parameters: Map[String, String],
21 | partitionColumns: Seq[String],
22 | outputMode: OutputMode): Sink = {
23 | require(outputMode == OutputMode.Update, "only 'update' OutputMode is supported")
24 | KuduSink.withDefaultContext(sqlContext, parameters)
25 | }
26 |
27 | override def shortName(): String = "kudu"
28 | }
--------------------------------------------------------------------------------