├── .gitattributes ├── .github └── workflows │ ├── distribution-ci.yml │ └── maven-ci.yml ├── .gitignore ├── LICENSE ├── NOTICE ├── README.md ├── bin └── run-example ├── common ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── utils │ │ ├── FileHelper.scala │ │ ├── Logging.scala │ │ └── Retry.scala │ └── test │ └── java │ └── org │ └── apache │ └── spark │ ├── ConditionalSparkFunSuite.scala │ └── streaming │ └── LocalJavaStreamingContext.java ├── dev ├── change-scala-version.sh ├── checkstyle-license-header.txt ├── checkstyle-suppressions.xml ├── checkstyle.xml └── release-build.sh ├── distribution ├── pom.xml └── src │ └── main │ └── assembly │ └── src.xml ├── pom.xml ├── scalastyle-config.xml ├── sql-cloudant ├── README.md ├── examples │ ├── python │ │ ├── CloudantApp.py │ │ ├── CloudantDF.py │ │ ├── CloudantDFOption.py │ │ ├── CloudantQuery.py │ │ └── CloudantQueryDF.py │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── examples │ │ └── sql │ │ └── cloudant │ │ ├── CloudantApp.scala │ │ ├── CloudantDF.scala │ │ ├── CloudantDFOption.scala │ │ ├── CloudantStreaming.scala │ │ └── CloudantStreamingSelector.scala ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── bahir │ │ │ └── cloudant │ │ │ └── common │ │ │ ├── ChangesRow.java │ │ │ └── ChangesRowScanner.java │ ├── resources │ │ ├── application.conf │ │ └── reference.conf │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── cloudant │ │ ├── CloudantChangesConfig.scala │ │ ├── CloudantConfig.scala │ │ ├── CloudantReceiver.scala │ │ ├── DefaultSource.scala │ │ ├── common │ │ ├── CloudantException.scala │ │ ├── FilterUtil.scala │ │ ├── JsonStoreConfigManager.scala │ │ ├── JsonStoreDataAccess.scala │ │ ├── JsonStoreRDD.scala │ │ └── JsonUtil.scala │ │ └── internal │ │ └── ChangesReceiver.scala │ └── test │ ├── resources │ ├── json-files │ │ ├── n_airportcodemapping.json │ │ ├── n_booking.json │ │ ├── n_customer.json │ │ ├── n_customersession.json │ │ ├── n_flight.json │ │ └── n_flightsegment.json │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── bahir │ └── cloudant │ ├── ClientSparkFunSuite.scala │ ├── CloudantAllDocsDFSuite.scala │ ├── CloudantChangesDFSuite.scala │ ├── CloudantOptionSuite.scala │ ├── CloudantSparkSQLSuite.scala │ └── TestUtils.scala ├── sql-streaming-akka ├── README.md ├── examples │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── bahir │ │ │ └── examples │ │ │ └── sql │ │ │ └── streaming │ │ │ └── akka │ │ │ └── JavaAkkaStreamWordCount.java │ │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── examples │ │ └── sql │ │ └── streaming │ │ └── akka │ │ └── AkkaStreamWordCount.scala ├── pom.xml └── src │ ├── main │ ├── assembly │ │ └── assembly.xml │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── sql │ │ └── streaming │ │ └── akka │ │ ├── AkkaStreamSource.scala │ │ ├── LongOffset.scala │ │ └── MessageStore.scala │ └── test │ ├── resources │ ├── feeder_actor.conf │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── bahir │ └── sql │ └── streaming │ └── akka │ ├── AkkaStreamSourceSuite.scala │ └── AkkaTestUtils.scala ├── sql-streaming-jdbc ├── README.md ├── examples │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── bahir │ │ │ └── examples │ │ │ └── sql │ │ │ └── streaming │ │ │ └── jdbc │ │ │ └── JavaJdbcSinkDemo.java │ │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── examples │ │ └── sql │ │ └── streaming │ 
│ └── jdbc │ │ └── JdbcSinkDemo.scala ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── sql │ │ └── streaming │ │ └── jdbc │ │ ├── JdbcSourceProvider.scala │ │ ├── JdbcStreamWriter.scala │ │ └── JdbcUtil.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── jdbc │ └── JdbcStreamWriterSuite.scala ├── sql-streaming-mqtt ├── README.md ├── examples │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── bahir │ │ │ └── examples │ │ │ └── sql │ │ │ └── streaming │ │ │ └── mqtt │ │ │ ├── JavaMQTTSinkWordCount.java │ │ │ └── JavaMQTTStreamWordCount.java │ │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── examples │ │ └── sql │ │ └── streaming │ │ └── mqtt │ │ ├── MQTTSinkWordCount.scala │ │ └── MQTTStreamWordCount.scala ├── pom.xml └── src │ ├── main │ ├── assembly │ │ └── assembly.xml │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ └── org │ │ └── apache │ │ ├── bahir │ │ └── sql │ │ │ └── streaming │ │ │ └── mqtt │ │ │ ├── CachedMQTTClient.scala │ │ │ ├── LongOffset.scala │ │ │ ├── MQTTStreamSink.scala │ │ │ ├── MQTTStreamSource.scala │ │ │ ├── MQTTUtils.scala │ │ │ └── MessageStore.scala │ │ └── spark │ │ └── sql │ │ └── mqtt │ │ ├── HDFSMQTTSourceProvider.scala │ │ └── HdfsBasedMQTTStreamSource.scala │ └── test │ ├── bin │ └── test-BAHIR-83.sh │ ├── resources │ ├── keystore.jks │ ├── log4j.properties │ ├── logging.properties │ └── truststore.jks │ └── scala │ └── org │ └── apache │ └── bahir │ └── sql │ └── streaming │ └── mqtt │ ├── HDFSBasedMQTTStreamSourceSuite.scala │ ├── LocalMessageStoreSuite.scala │ ├── MQTTStreamSinkSuite.scala │ ├── MQTTStreamSourceSuite.scala │ └── MQTTTestUtils.scala ├── sql-streaming-sqs ├── README.md ├── examples │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apache │ │ └── bahir │ │ └── examples │ │ └── sql │ │ └── streaming │ │ └── sqs │ │ └── SqsSourceExample.scala ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── spark │ │ │ └── sql │ │ │ └── streaming │ │ │ └── sqs │ │ │ ├── BasicAWSCredentialsProvider.java │ │ │ └── InstanceProfileCredentialsProviderWithRetries.java │ ├── resources │ │ ├── META-INF │ │ │ └── services │ │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ │ └── log4j.properties │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── streaming │ │ └── sqs │ │ ├── SqsClient.scala │ │ ├── SqsFileCache.scala │ │ ├── SqsSource.scala │ │ ├── SqsSourceOptions.scala │ │ └── SqsSourceProvider.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── streaming │ └── sqs │ └── SqsSourceOptionsSuite.scala ├── streaming-akka ├── README.md ├── examples │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── spark │ │ │ └── examples │ │ │ └── streaming │ │ │ └── akka │ │ │ └── JavaActorWordCount.java │ │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── examples │ │ └── streaming │ │ └── akka │ │ └── ActorWordCount.scala ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── akka │ │ ├── ActorReceiver.scala │ │ └── AkkaUtils.scala │ └── test │ ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── akka │ │ └── 
JavaAkkaUtilsSuite.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── spark │ └── streaming │ └── akka │ ├── AkkaStreamSuite.scala │ └── AkkaUtilsSuite.scala ├── streaming-mqtt ├── README.md ├── examples │ └── src │ │ └── main │ │ ├── python │ │ └── streaming │ │ │ └── mqtt_wordcount.py │ │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── examples │ │ └── streaming │ │ └── mqtt │ │ └── MQTTWordCount.scala ├── pom.xml ├── python-tests │ ├── run-python-tests.sh │ └── tests.py ├── python │ └── mqtt.py └── src │ ├── main │ ├── assembly │ │ └── assembly.xml │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── mqtt │ │ ├── MQTTInputDStream.scala │ │ ├── MQTTPairedByteArrayInputDStream.scala │ │ ├── MQTTPairedInputDStream.scala │ │ └── MQTTUtils.scala │ └── test │ ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── mqtt │ │ └── JavaMQTTStreamSuite.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── spark │ └── streaming │ └── mqtt │ ├── MQTTStreamSuite.scala │ └── MQTTTestUtils.scala ├── streaming-pubnub ├── README.md ├── examples │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── examples │ │ └── streaming │ │ └── pubnub │ │ └── PubNubWordCount.scala ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── pubnub │ │ ├── PubNubInputDStream.scala │ │ └── PubNubUtils.scala │ └── test │ ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── pubnub │ │ └── JavaPubNubStreamSuite.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── spark │ └── streaming │ └── pubnub │ ├── MessageSerializationSuite.scala │ └── PubNubStreamSuite.scala ├── streaming-pubsub ├── README.md ├── examples │ └── src │ │ └── main │ │ └── scala │ │ └── org.apache.spark.examples.streaming.pubsub │ │ └── PubsubWordCount.scala ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── pubsub │ │ ├── PubsubInputDStream.scala │ │ ├── PubsubUtils.scala │ │ └── SparkGCPCredentials.scala │ └── test │ ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── pubsub │ │ └── JavaPubsubStreamSuite.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── spark │ └── streaming │ └── pubsub │ ├── PubsubStreamSuite.scala │ ├── PubsubTestUtils.scala │ └── SparkGCPCredentialsBuilderSuite.scala ├── streaming-twitter ├── README.md ├── examples │ ├── data │ │ └── AFINN-111.txt │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── spark │ │ │ └── examples │ │ │ └── streaming │ │ │ └── twitter │ │ │ └── JavaTwitterHashTagJoinSentiments.java │ │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── examples │ │ └── streaming │ │ └── twitter │ │ ├── TwitterAlgebirdCMS.scala │ │ ├── TwitterAlgebirdHLL.scala │ │ ├── TwitterHashTagJoinSentiments.scala │ │ ├── TwitterLocations.scala │ │ └── TwitterPopularTags.scala ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── twitter │ │ ├── TwitterInputDStream.scala │ │ └── TwitterUtils.scala │ └── test │ ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── streaming │ │ └── twitter │ │ └── JavaTwitterStreamSuite.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apache │ └── spark │ └── streaming │ └── twitter │ └── TwitterStreamSuite.scala └── 
streaming-zeromq ├── README.md ├── examples └── src │ └── main │ └── scala │ └── org │ └── apache │ └── spark │ └── examples │ └── streaming │ └── zeromq │ └── ZeroMQWordCount.scala ├── pom.xml └── src ├── main └── scala │ └── org │ └── apache │ └── spark │ └── streaming │ └── zeromq │ ├── ZeroMQInputDStream.scala │ └── ZeroMQUtils.scala └── test ├── java └── org │ └── apache │ └── spark │ └── streaming │ └── zeromq │ └── JavaZeroMQStreamSuite.java ├── resources └── log4j.properties └── scala └── org └── apache └── spark └── streaming └── zeromq └── ZeroMQStreamSuite.scala /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior to have all files normalized to Unix-style 2 | # line endings upon check-in. 3 | * text=auto 4 | 5 | # Declare files that will always have CRLF line endings on checkout. 6 | *.bat text eol=crlf 7 | 8 | # Denote all files that are truly binary and should not be modified. 9 | *.dll binary 10 | *.exp binary 11 | *.lib binary 12 | *.pdb binary 13 | *.exe binary 14 | -------------------------------------------------------------------------------- /.github/workflows/distribution-ci.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | name: Distribution 19 | 20 | on: 21 | push: 22 | branches: [ master ] 23 | 24 | jobs: 25 | build: 26 | 27 | runs-on: ubuntu-latest 28 | strategy: 29 | matrix: 30 | java: ['8'] 31 | 32 | steps: 33 | - uses: actions/checkout@v2 34 | - name: Set up JDK ${{ matrix.java }} 35 | uses: actions/setup-java@v2 36 | with: 37 | java-version: ${{ matrix.java }} 38 | distribution: 'zulu' 39 | cache: maven 40 | 41 | - name: Build 42 | run: mvn -Pdistribution clean install 43 | -------------------------------------------------------------------------------- /.github/workflows/maven-ci.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | name: Java CI with Maven 19 | 20 | on: 21 | push: 22 | branches: [ master ] 23 | pull_request: 24 | branches: [ master ] 25 | 26 | jobs: 27 | build: 28 | 29 | runs-on: ubuntu-latest 30 | strategy: 31 | matrix: 32 | java: ['8'] 33 | spark-version: ['2.4.8'] 34 | scala-version: ['2.12'] 35 | 36 | steps: 37 | - uses: actions/checkout@v2 38 | - name: Set up JDK ${{ matrix.java }} 39 | uses: actions/setup-java@v2 40 | with: 41 | java-version: ${{ matrix.java }} 42 | distribution: 'zulu' 43 | cache: maven 44 | - name: Change scala version to ${{ matrix.scala-version }} 45 | run: ./dev/change-scala-version.sh ${{ matrix.scala-version }} 46 | shell: bash 47 | - name: Build with spark ${{ matrix.spark-version }} 48 | run: mvn -B clean verify -Dscala-${{ matrix.scala-version }} -Dspark.version=${{ matrix.spark-version }} 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac 2 | .DS_Store 3 | 4 | # Eclipse 5 | .classpath 6 | .project 7 | .settings/ 8 | target/ 9 | 10 | # Intellij 11 | .idea/ 12 | .idea_modules/ 13 | *.iml 14 | *.iws 15 | *.class 16 | *.log 17 | 18 | # Python 19 | *.pyc 20 | 21 | # Others 22 | .checkstyle 23 | .fbExcludeFilterFile 24 | dependency-reduced-pom.xml 25 | checkpoint 26 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Apache Bahir 2 | Copyright (c) 2016-2017 The Apache Software Foundation. 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 19 | # Apache Bahir 20 | 21 | Apache Bahir provides extensions to distributed analytics platforms such as Apache Spark & Apache Flink. 22 | 23 | 24 | 25 | ## Apache Bahir origins 26 | 27 | The Initial Bahir source code (see issue [BAHIR-1](https://issues.apache.org/jira/browse/BAHIR-1)) containing the source for the Apache Spark streaming connectors for akka, mqtt, twitter, zeromq 28 | extracted from [Apache Spark revision 8301fad](https://github.com/apache/spark/tree/8301fadd8d269da11e72870b7a889596e3337839) 29 | (before the [deletion of the streaming connectors akka, mqtt, twitter, zeromq](https://issues.apache.org/jira/browse/SPARK-13843)). 30 | 31 | ## Source code structure 32 | 33 | Source code folder structure: 34 | ``` 35 | - streaming-akka 36 | - examples/src/main/... 37 | - src/main/... 38 | - streaming-mqtt 39 | - examples 40 | - src 41 | - python 42 | - ... 43 | ``` 44 | 45 | ## Building Bahir 46 | 47 | Bahir is built using [Apache Maven](http://maven.apache.org/). 48 | To build Bahir and its example programs, run: 49 | 50 | mvn -DskipTests clean install 51 | 52 | ## Running tests 53 | 54 | Testing first requires [building Bahir](#building-bahir). Once Bahir is built, tests 55 | can be run using: 56 | 57 | mvn test 58 | 59 | ## Example programs 60 | 61 | Each extension currently available in Apache Bahir has an example application located under the "examples" folder. 62 | 63 | 64 | ## Documentation 65 | 66 | Currently, each submodule has its own README.md, with information on example usages and API. 
67 | 68 | * [SQL Cloudant](https://github.com/apache/bahir/blob/master/sql-cloudant/README.md) 69 | * [SQL Streaming Akka](https://github.com/apache/bahir/blob/master/sql-streaming-akka/README.md) 70 | * [SQL Streaming JDBC](https://github.com/apache/bahir/blob/master/sql-streaming-jdbc/README.md) 71 | * [SQL Streaming MQTT](https://github.com/apache/bahir/blob/master/sql-streaming-mqtt/README.md) 72 | * [SQL Streaming SQS](https://github.com/apache/bahir/blob/master/sql-streaming-sqs/README.md) 73 | * [Streaming Akka](https://github.com/apache/bahir/blob/master/streaming-akka/README.md) 74 | * [Streaming MQTT](https://github.com/apache/bahir/blob/master/streaming-mqtt/README.md) 75 | * [Streaming PubNub](https://github.com/apache/bahir/blob/master/streaming-pubnub/README.md) 76 | * [Streaming Google Pub/Sub](https://github.com/apache/bahir/blob/master/streaming-pubsub/README.md) 77 | * [Streaming Twitter](https://github.com/apache/bahir/blob/master/streaming-twitter/README.md) 78 | * [Streaming ZeroMQ](https://github.com/apache/bahir/blob/master/streaming-zeromq/README.md) 79 | 80 | Furthermore, to generate scaladocs for each module: 81 | 82 | `$ mvn package` 83 | 84 | Scaladocs is generated in, `MODULE_NAME/target/site/scaladocs/index.html`. __ Where `MODULE_NAME` is one of, `sql-streaming-mqtt`, `streaming-akka`, `streaming-mqtt`, `streaming-zeromq`, `streaming-twitter`. __ 85 | 86 | ## A note about Apache Spark integration 87 | 88 | Currently, each module in Bahir is available through spark packages. Please follow linking sub section in module specific [README.md](#documentation) for more details. 89 | -------------------------------------------------------------------------------- /common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 4.0.0 21 | 22 | org.apache.bahir 23 | bahir-parent_2.12 24 | 3.0.0-SNAPSHOT 25 | ../pom.xml 26 | 27 | 28 | org.apache.bahir 29 | bahir-common_2.12 30 | 31 | bahir-common 32 | 33 | jar 34 | Apache Bahir - Common 35 | http://bahir.apache.org/ 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-tags_${scala.binary.version} 41 | 42 | 43 | org.apache.spark 44 | spark-streaming_${scala.binary.version} 45 | ${spark.version} 46 | compile 47 | true 48 | 49 | 50 | org.apache.spark 51 | spark-core_${scala.binary.version} 52 | ${spark.version} 53 | compile 54 | 55 | 56 | org.apache.spark 57 | spark-core_${scala.binary.version} 58 | ${spark.version} 59 | test-jar 60 | test 61 | 62 | 63 | org.scalacheck 64 | scalacheck_${scala.binary.version} 65 | test 66 | 67 | 68 | 69 | target/scala-${scala.binary.version}/classes 70 | target/scala-${scala.binary.version}/test-classes 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-source-plugin 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /common/src/main/scala/org/apache/bahir/utils/FileHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.utils 19 | 20 | import java.io.{File, IOException} 21 | import java.nio.file.{Files, FileVisitResult, Path, SimpleFileVisitor} 22 | import java.nio.file.attribute.BasicFileAttributes 23 | 24 | object FileHelper extends Logging { 25 | def deleteFileQuietly(file: File): Path = { 26 | Files.walkFileTree(file.toPath, new SimpleFileVisitor[Path]() { 27 | override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = { 28 | try { 29 | Files.delete(file) 30 | } catch { 31 | case t: Throwable => log.warn("Failed to delete", t) 32 | } 33 | FileVisitResult.CONTINUE 34 | } 35 | 36 | override def postVisitDirectory(dir: Path, exc: IOException): FileVisitResult = { 37 | try { 38 | Files.delete(dir) 39 | } catch { 40 | case t: Throwable => log.warn("Failed to delete", t) 41 | } 42 | FileVisitResult.CONTINUE 43 | } 44 | }) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /common/src/main/scala/org/apache/bahir/utils/Logging.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.utils 19 | 20 | import org.slf4j.LoggerFactory 21 | 22 | trait Logging { 23 | final val log = LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$")) 24 | } 25 | -------------------------------------------------------------------------------- /common/src/main/scala/org/apache/bahir/utils/Retry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.utils 19 | 20 | object Retry { 21 | /** 22 | * Retry invocation of given code. 23 | * @param attempts Number of attempts to try executing given code. -1 represents infinity. 24 | * @param pauseMs Number of backoff milliseconds. 25 | * @param retryExceptions Types of exceptions to retry. 26 | * @param code Function to execute. 27 | * @tparam A Type parameter. 28 | * @return Returns result of function execution or exception in case of failure. 29 | */ 30 | def apply[A](attempts: Int, pauseMs: Long, retryExceptions: Class[_]*)(code: => A): A = { 31 | var result: Option[A] = None 32 | var success = false 33 | var remaining = attempts 34 | while (!success) { 35 | try { 36 | remaining -= 1 37 | result = Some(code) 38 | success = true 39 | } 40 | catch { 41 | case e: Exception => 42 | if (retryExceptions.contains(e.getClass) && (attempts == -1 || remaining > 0)) { 43 | Thread.sleep(pauseMs) 44 | } else { 45 | throw e 46 | } 47 | } 48 | } 49 | result.get 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /common/src/test/java/org/apache/spark/ConditionalSparkFunSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark 19 | 20 | trait ConditionalSparkFunSuite extends SparkFunSuite { 21 | /** 22 | * Run test if given predicate is satisfied. 23 | * @param testName Test name 24 | * @param condition If satisfied, test will be executed 25 | * @param testBody Test body 26 | */ 27 | def testIf(testName: String, condition: () => Boolean)(testBody: => Unit) { 28 | if (condition()) { 29 | test(testName)(testBody) 30 | } else { 31 | ignore(testName)(testBody) 32 | } 33 | } 34 | 35 | /** 36 | * Run given code only if predicate has been satisfied. 
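   * For example, a suite could guard environment-dependent setup with something like
   * `runIf(() => sys.env.contains("CLOUDANT_PASSWORD")) { createTestDatabases() }`,
   * where both the predicate and `createTestDatabases()` are illustrative placeholders.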
37 | * @param condition If satisfied, run code block 38 | * @param body Code block 39 | */ 40 | def runIf(condition: () => Boolean)(body: => Unit): Unit = { 41 | if (condition()) { 42 | body 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /common/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.streaming; 19 | 20 | import org.apache.spark.SparkConf; 21 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 22 | import org.junit.After; 23 | import org.junit.Before; 24 | 25 | public abstract class LocalJavaStreamingContext { 26 | protected transient JavaStreamingContext ssc; 27 | 28 | @Before 29 | public void setUp() { 30 | final SparkConf conf = new SparkConf() 31 | .setMaster("local[2]") 32 | .setAppName("test") 33 | .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); 34 | ssc = new JavaStreamingContext(conf, new Duration(1000)); 35 | ssc.checkpoint("checkpoint"); 36 | } 37 | 38 | @After 39 | public void tearDown() { 40 | ssc.stop(); 41 | ssc = null; 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /dev/change-scala-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | set -e 21 | 22 | VALID_VERSIONS=( 2.11 2.12 ) 23 | 24 | usage() { 25 | echo "Usage: $(basename $0) [-h|--help] 26 | where : 27 | -h| --help Display this help text 28 | valid version values : ${VALID_VERSIONS[*]} 29 | " 1>&2 30 | exit 1 31 | } 32 | 33 | if [[ ($# -ne 1) || ( $1 == "--help") || $1 == "-h" ]]; then 34 | usage 35 | fi 36 | 37 | TO_VERSION=$1 38 | 39 | check_scala_version() { 40 | for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done 41 | echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2 42 | exit 1 43 | } 44 | 45 | check_scala_version "$TO_VERSION" 46 | 47 | if [ $TO_VERSION = "2.12" ]; then 48 | FROM_VERSION="2.11" 49 | else 50 | FROM_VERSION="2.12" 51 | fi 52 | 53 | sed_i() { 54 | sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" 55 | } 56 | 57 | export -f sed_i 58 | 59 | BASEDIR=$(dirname $0)/.. 60 | find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ 61 | -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \; 62 | 63 | # also update in parent POM 64 | # match any scala binary version to ensure idempotency 65 | sed_i '1,/[0-9]*\.[0-9]*[0-9]*\.[0-9]*'$TO_VERSION' 17 | 18 | 21 | 22 | 29 | 30 | 31 | 33 | 34 | -------------------------------------------------------------------------------- /distribution/src/main/assembly/src.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | src 19 | 20 | 21 | tar.gz 22 | zip 23 | 24 | 25 | true 26 | apache-bahir-${version}-src 27 | 28 | 29 | 30 | .. 31 | 32 | 33 | **/.* 34 | **/.*/** 35 | **/*.log 36 | **/*.iml 37 | **/conf/*.properties 38 | **/conf/*.xml 39 | **/dependency-reduced-pom.xml 40 | **/scratch_space 41 | **/scratch_space/**/* 42 | **/target 43 | **/target/**/* 44 | **/temp 45 | **/temp/**/* 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /sql-cloudant/examples/python/CloudantApp.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from pyspark.sql import SparkSession 18 | 19 | spark = SparkSession\ 20 | .builder\ 21 | .appName("Cloudant Spark SQL Example in Python using temp tables")\ 22 | .config("cloudant.host","ACCOUNT.cloudant.com")\ 23 | .config("cloudant.username", "USERNAME")\ 24 | .config("cloudant.password","PASSWORD")\ 25 | .getOrCreate() 26 | 27 | 28 | # ***1. 
Loading temp table from Cloudant db 29 | spark.sql(" CREATE TEMPORARY TABLE airportTable USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')") 30 | airportData = spark.sql("SELECT _id, airportName FROM airportTable WHERE _id >= 'CAA' AND _id <= 'GAA' ORDER BY _id") 31 | airportData.printSchema() 32 | print ('Total # of rows in airportData: ' + str(airportData.count())) 33 | for code in airportData.collect(): 34 | print (code._id) 35 | 36 | 37 | # ***2. Loading temp table from Cloudant search index 38 | print ('About to test org.apache.bahir.cloudant for flight with index') 39 | spark.sql(" CREATE TEMPORARY TABLE flightTable1 USING org.apache.bahir.cloudant OPTIONS ( database 'n_flight', index '_design/view/_search/n_flights')") 40 | flightData = spark.sql("SELECT flightSegmentId, scheduledDepartureTime FROM flightTable1 WHERE flightSegmentId >'AA9' AND flightSegmentId<'AA95'") 41 | flightData.printSchema() 42 | for code in flightData.collect(): 43 | print ('Flight {0} on {1}'.format(code.flightSegmentId, code.scheduledDepartureTime)) 44 | 45 | -------------------------------------------------------------------------------- /sql-cloudant/examples/python/CloudantDF.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from pyspark.sql import SparkSession 18 | 19 | # define cloudant related configuration 20 | # set protocol to http if needed, default value=https 21 | # config("cloudant.protocol","http") 22 | spark = SparkSession\ 23 | .builder\ 24 | .appName("Cloudant Spark SQL Example in Python using dataframes")\ 25 | .config("cloudant.host","ACCOUNT.cloudant.com")\ 26 | .config("cloudant.username", "USERNAME")\ 27 | .config("cloudant.password","PASSWORD")\ 28 | .config("jsonstore.rdd.partitions", 8)\ 29 | .getOrCreate() 30 | 31 | 32 | # ***1. Loading dataframe from Cloudant db 33 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant") 34 | # In case of doing multiple operations on a dataframe (select, filter etc.) 35 | # you should persist the dataframe. 36 | # Othewise, every operation on the dataframe will load the same data from Cloudant again. 37 | # Persisting will also speed up computation. 38 | df.cache() # persisting in memory 39 | # alternatively for large dbs to persist in memory & disk: 40 | # from pyspark import StorageLevel 41 | # df.persist(storageLevel = StorageLevel(True, True, False, True, 1)) 42 | df.printSchema() 43 | df.filter(df.airportName >= 'Moscow').select("_id",'airportName').show() 44 | df.filter(df._id >= 'CAA').select("_id",'airportName').show() 45 | 46 | 47 | # ***2. 
Saving a datafram to Cloudant db 48 | df = spark.read.load(format="org.apache.bahir.cloudant", database="n_flight") 49 | df.printSchema() 50 | df2 = df.filter(df.flightSegmentId=='AA106')\ 51 | .select("flightSegmentId", "economyClassBaseCost") 52 | df2.write.save("n_flight2", "org.apache.bahir.cloudant", 53 | bulkSize = "100", createDBOnSave="true") 54 | total = df.filter(df.flightSegmentId >'AA9').select("flightSegmentId", 55 | "scheduledDepartureTime").orderBy(df.flightSegmentId).count() 56 | print ("Total", total, "flights from table") 57 | 58 | 59 | # ***3. Loading dataframe from a Cloudant search index 60 | df = spark.read.load(format="org.apache.bahir.cloudant", database="n_flight", 61 | index="_design/view/_search/n_flights") 62 | df.printSchema() 63 | total = df.filter(df.flightSegmentId >'AA9').select("flightSegmentId", 64 | "scheduledDepartureTime").orderBy(df.flightSegmentId).count() 65 | print ("Total", total, "flights from index") 66 | 67 | 68 | # ***4. Loading dataframe from a Cloudant view 69 | df = spark.read.load(format="org.apache.bahir.cloudant", path="n_flight", 70 | view="_design/view/_view/AA0", schemaSampleSize="20") 71 | # schema for view will always be: _id, key, value 72 | # where value can be a complex field 73 | df.printSchema() 74 | df.show() 75 | -------------------------------------------------------------------------------- /sql-cloudant/examples/python/CloudantDFOption.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from pyspark.sql import SparkSession 18 | 19 | spark = SparkSession\ 20 | .builder\ 21 | .appName("Cloudant Spark SQL Example in Python using dataframes with options")\ 22 | .getOrCreate() 23 | 24 | cloudant_host = "ACCOUNT.cloudant.com" 25 | cloudant_username = "USERNAME" 26 | cloudant_password = "PASSWORD" 27 | 28 | # ***1. 
Loading dataframe from Cloudant db 29 | df = spark.read.format("org.apache.bahir.cloudant") \ 30 | .option("cloudant.host", cloudant_host) \ 31 | .option("cloudant.username", cloudant_username) \ 32 | .option("cloudant.password", cloudant_password) \ 33 | .load("n_airportcodemapping") 34 | df.cache() # persisting in memory 35 | df.printSchema() 36 | df.filter(df._id >= 'CAA').select("_id",'airportName').show() 37 | 38 | 39 | # ***2.Saving dataframe to Cloudant db 40 | df.filter(df._id >= 'CAA').select("_id",'airportName') \ 41 | .write.format("org.apache.bahir.cloudant") \ 42 | .option("cloudant.host", cloudant_host) \ 43 | .option("cloudant.username", cloudant_username) \ 44 | .option("cloudant.password",cloudant_password) \ 45 | .option("bulkSize","100") \ 46 | .option("createDBOnSave", "true") \ 47 | .save("airportcodemapping_df") 48 | df = spark.read.format("org.apache.bahir.cloudant") \ 49 | .option("cloudant.host", cloudant_host) \ 50 | .option("cloudant.username", cloudant_username) \ 51 | .option("cloudant.password", cloudant_password) \ 52 | .load("n_flight") 53 | df.printSchema() 54 | total = df.filter(df.flightSegmentId >'AA9') \ 55 | .select("flightSegmentId", "scheduledDepartureTime") \ 56 | .orderBy(df.flightSegmentId).count() 57 | print ("Total", total, "flights from table") 58 | 59 | 60 | # ***3. Loading dataframe from Cloudant search index 61 | df = spark.read.format("org.apache.bahir.cloudant") \ 62 | .option("cloudant.host",cloudant_host) \ 63 | .option("cloudant.username",cloudant_username) \ 64 | .option("cloudant.password",cloudant_password) \ 65 | .option("index","_design/view/_search/n_flights").load("n_flight") 66 | df.printSchema() 67 | 68 | total = df.filter(df.flightSegmentId >'AA9') \ 69 | .select("flightSegmentId", "scheduledDepartureTime") \ 70 | .orderBy(df.flightSegmentId).count() 71 | print ("Total", total, "flights from index") 72 | -------------------------------------------------------------------------------- /sql-cloudant/examples/python/CloudantQuery.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
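
# Example: Spark SQL over Cloudant databases with "cloudant.useQuery" enabled.
# The intent is that supported predicates in the WHERE clauses below can be pushed
# down to Cloudant as queries instead of being evaluated only after loading into Spark.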
16 | 17 | import pprint 18 | from pyspark.sql import SparkSession 19 | 20 | # define cloudant related configuration 21 | # set protocol to http if needed, default value=https 22 | # config("cloudant.protocol","http") 23 | spark = SparkSession\ 24 | .builder\ 25 | .appName("Cloudant Spark SQL Example in Python using query")\ 26 | .config("cloudant.host","ACCOUNT.cloudant.com")\ 27 | .config("cloudant.username", "USERNAME")\ 28 | .config("cloudant.password","PASSWORD")\ 29 | .config("jsonstore.rdd.partitions", 8)\ 30 | .config("cloudant.useQuery", "true")\ 31 | .config("schemaSampleSize",1)\ 32 | .getOrCreate() 33 | 34 | 35 | spark.sql(" CREATE TEMPORARY VIEW airportTable1 USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')") 36 | airportData = spark.sql("SELECT _id, airportName FROM airportTable1 WHERE airportName == 'Moscow' ") 37 | airportData.printSchema() 38 | print 'Total # of rows in airportData: ' + str(airportData.count()) 39 | airportData.show() 40 | 41 | spark.sql(" CREATE TEMPORARY VIEW airportTable2 USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')") 42 | airportData = spark.sql("SELECT _id, airportName FROM airportTable2 WHERE airportName > 'Moscow' ORDER BY _id") 43 | airportData.printSchema() 44 | print 'Total # of rows in airportData: ' + str(airportData.count()) 45 | airportData.show() 46 | 47 | spark.sql(" CREATE TEMPORARY VIEW airportTable3 USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')") 48 | airportData = spark.sql("SELECT _id, airportName FROM airportTable3 WHERE airportName > 'Moscow' AND airportName < 'Sydney' ORDER BY _id") 49 | airportData.printSchema() 50 | print 'Total # of rows in airportData: ' + str(airportData.count()) 51 | airportData.show() 52 | 53 | spark.sql(" CREATE TEMPORARY VIEW flight1 USING org.apache.bahir.cloudant OPTIONS ( database 'n_flight')") 54 | flightData = spark.sql("SELECT flightSegmentId, economyClassBaseCost, numFirstClassSeats FROM flight1 WHERE economyClassBaseCost >=200 AND numFirstClassSeats<=10") 55 | flightData.printSchema() 56 | print 'Total # of rows in airportData: ' + str(flightData.count()) 57 | flightData.show() 58 | 59 | spark.sql(" CREATE TEMPORARY VIEW flight2 USING org.apache.bahir.cloudant OPTIONS ( database 'n_flight')") 60 | flightData = spark.sql("SELECT flightSegmentId, scheduledDepartureTime, scheduledArrivalTime FROM flight2 WHERE scheduledDepartureTime >='2014-12-15T05:00:00.000Z' AND scheduledArrivalTime <='2014-12-15T11:04:00.000Z'") 61 | flightData.printSchema() 62 | print 'Total # of rows in airportData: ' + str(flightData.count()) 63 | flightData.show() 64 | 65 | 66 | -------------------------------------------------------------------------------- /sql-cloudant/examples/python/CloudantQueryDF.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import pprint 18 | from pyspark.sql import SparkSession 19 | 20 | # define cloudant related configuration 21 | # set protocol to http if needed, default value=https 22 | # config("cloudant.protocol","http") 23 | spark = SparkSession\ 24 | .builder\ 25 | .appName("Cloudant Spark SQL Example in Python using query")\ 26 | .config("cloudant.host","ACCOUNT.cloudant.com")\ 27 | .config("cloudant.username", "USERNAME")\ 28 | .config("cloudant.password","PASSWORD")\ 29 | .config("jsonstore.rdd.partitions", 8)\ 30 | .config("cloudant.useQuery", "true")\ 31 | .config("schemaSampleSize",1)\ 32 | .getOrCreate() 33 | 34 | 35 | # ***0. Loading dataframe from Cloudant db with one String field condition 36 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant") 37 | df.printSchema() 38 | df.filter(df.airportName == 'Moscow').select("_id",'airportName').show() 39 | 40 | 41 | # ***1. Loading dataframe from Cloudant db with one String field condition 42 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant") 43 | df.printSchema() 44 | df.filter(df.airportName > 'Moscow').select("_id",'airportName').show() 45 | 46 | # ***2. Loading dataframe from Cloudant db with two String field condition 47 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant") 48 | df.printSchema() 49 | df.filter(df.airportName > 'Moscow').filter(df.airportName < 'Sydney').select("_id",'airportName').show() 50 | 51 | # ***3. Loading dataframe from Cloudant db with two int field condition 52 | df = spark.read.load("n_flight", "org.apache.bahir.cloudant") 53 | df.printSchema() 54 | df.filter(df.economyClassBaseCost >= 200).filter(df.numFirstClassSeats <=10).select('flightSegmentId','scheduledDepartureTime', 'scheduledArrivalTime').show() 55 | 56 | # ***4. Loading dataframe from Cloudant db with two timestamp field condition 57 | df = spark.read.load("n_flight", "org.apache.bahir.cloudant") 58 | df.printSchema() 59 | df.filter(df.scheduledDepartureTime >= "2014-12-15T05:00:00.000Z").filter(df.scheduledArrivalTime <="2014-12-15T11:04:00.000Z").select('flightSegmentId','scheduledDepartureTime', 'scheduledArrivalTime').show() 60 | 61 | 62 | -------------------------------------------------------------------------------- /sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantApp.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.sql.cloudant 19 | 20 | import org.apache.spark.sql.SparkSession 21 | 22 | object CloudantApp { 23 | def main(args: Array[String]) { 24 | val spark = SparkSession 25 | .builder() 26 | .appName("Cloudant Spark SQL Example") 27 | .config("cloudant.host", "ACCOUNT.cloudant.com") 28 | .config("cloudant.username", "USERNAME") 29 | .config("cloudant.password", "PASSWORD") 30 | .getOrCreate() 31 | 32 | // For implicit conversions of Dataframe to RDDs 33 | import spark.implicits._ 34 | 35 | // create a temp table from Cloudant db and query it using sql syntax 36 | spark.sql( 37 | s""" 38 | |CREATE TEMPORARY VIEW airportTable 39 | |USING org.apache.bahir.cloudant 40 | |OPTIONS ( database 'n_airportcodemapping') 41 | """.stripMargin) 42 | // create a dataframe 43 | val airportData = spark.sql( 44 | s""" 45 | |SELECT _id, airportName 46 | |FROM airportTable 47 | |WHERE _id >= 'CAA' AND _id <= 'GAA' ORDER BY _id 48 | """.stripMargin) 49 | airportData.printSchema() 50 | println(s"Total # of rows in airportData: " + airportData.count()) // scalastyle:ignore 51 | // convert dataframe to array of Rows, and process each row 52 | airportData.map(t => "code: " + t(0) + ",name:" + t(1)).collect().foreach(println) // scalastyle:ignore 53 | 54 | // create a temp table from Cloudant index and query it using sql syntax 55 | spark.sql( 56 | s""" 57 | |CREATE TEMPORARY VIEW flightTable 58 | |USING org.apache.bahir.cloudant 59 | |OPTIONS (database 'n_flight', index '_design/view/_search/n_flights') 60 | """.stripMargin) 61 | val flightData = spark.sql( 62 | s""" 63 | |SELECT flightSegmentId, scheduledDepartureTime 64 | |FROM flightTable 65 | |WHERE flightSegmentId >'AA9' AND flightSegmentId<'AA95' 66 | """.stripMargin) 67 | flightData.printSchema() 68 | flightData.map(t => "flightSegmentId: " + t(0) + ", scheduledDepartureTime: " + t(1)) 69 | .collect().foreach(println) // scalastyle:ignore 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples.sql.cloudant 19 | 20 | import org.apache.spark.sql.SparkSession 21 | 22 | object CloudantDF{ 23 | def main(args: Array[String]) { 24 | val spark = SparkSession 25 | .builder() 26 | .appName("Cloudant Spark SQL Example with Dataframe") 27 | .config("cloudant.host", "ACCOUNT.cloudant.com") 28 | .config("cloudant.username", "USERNAME") 29 | .config("cloudant.password", "PASSWORD") 30 | .config("createDBOnSave", "true") // to create a db on save 31 | .config("jsonstore.rdd.partitions", "20") // using 20 partitions 32 | .getOrCreate() 33 | 34 | // 1. Loading data from Cloudant db 35 | val df = spark.read.format("org.apache.bahir.cloudant").load("n_flight") 36 | // Caching df in memory to speed computations 37 | // and not to retrieve data from cloudant again 38 | df.cache() 39 | df.printSchema() 40 | 41 | // 2. Saving dataframe to Cloudant db 42 | val df2 = df.filter(df("flightSegmentId") === "AA106") 43 | .select("flightSegmentId", "economyClassBaseCost") 44 | df2.show() 45 | df2.write.format("org.apache.bahir.cloudant").save("n_flight2") 46 | 47 | // 3. Loading data from Cloudant search index 48 | val df3 = spark.read.format("org.apache.bahir.cloudant") 49 | .option("index", "_design/view/_search/n_flights").load("n_flight") 50 | val total = df3.filter(df3("flightSegmentId") >"AA9") 51 | .select("flightSegmentId", "scheduledDepartureTime") 52 | .orderBy(df3("flightSegmentId")).count() 53 | println(s"Total $total flights from index") // scalastyle:ignore 54 | 55 | // 4. Loading data from view 56 | val df4 = spark.read.format("org.apache.bahir.cloudant") 57 | .option("view", "_design/view/_view/AA0").load("n_flight") 58 | df4.printSchema() 59 | df4.show() 60 | 61 | // 5. Loading data from a view with map and reduce 62 | // Loading data from Cloudant db 63 | val df5 = spark.read.format("org.apache.bahir.cloudant") 64 | .option("view", "_design/view/_view/AAreduce?reduce=true") 65 | .load("n_flight") 66 | df5.printSchema() 67 | df5.show() 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantDFOption.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples.sql.cloudant 19 | 20 | import org.apache.spark.sql.SparkSession 21 | 22 | object CloudantDFOption{ 23 | def main(args: Array[String]) { 24 | val spark = SparkSession 25 | .builder() 26 | .appName("Cloudant Spark SQL Example with Dataframe using Option") 27 | .getOrCreate() 28 | 29 | val cloudantHost = "ACCOUNT.cloudant.com" 30 | val cloudantUser = "USERNAME" 31 | val cloudantPassword = "PASSWORD" 32 | 33 | // 1. Loading data from Cloudant db 34 | val df = spark.read.format("org.apache.bahir.cloudant") 35 | .option("cloudant.host", cloudantHost) 36 | .option("cloudant.username", cloudantUser) 37 | .option("cloudant.password", cloudantPassword) 38 | .load("n_airportcodemapping") 39 | 40 | df.cache() 41 | df.printSchema() 42 | df.filter(df("_id") >= "CAA").select("_id", "airportName").show() 43 | 44 | // 2. Saving dataframe to Cloudant db 45 | // To create a Cloudant db during save set the option createDBOnSave=true 46 | df.filter(df("_id") >= "CAA") 47 | .select("_id", "airportName") 48 | .write.format("org.apache.bahir.cloudant") 49 | .option("cloudant.host", cloudantHost) 50 | .option("cloudant.username", cloudantUser) 51 | .option("cloudant.password", cloudantPassword) 52 | .option("createDBOnSave", "true") 53 | .save("airportcodemapping_df") 54 | 55 | // 3. Loading data from Cloudant search index 56 | val df2 = spark.read.format("org.apache.bahir.cloudant") 57 | .option("index", "_design/view/_search/n_flights") 58 | .option("cloudant.host", cloudantHost) 59 | .option("cloudant.username", cloudantUser) 60 | .option("cloudant.password", cloudantPassword) 61 | .load("n_flight") 62 | val total2 = df2.filter(df2("flightSegmentId") >"AA9") 63 | .select("flightSegmentId", "scheduledDepartureTime") 64 | .orderBy(df2("flightSegmentId")) 65 | .count() 66 | println(s"Total $total2 flights from index")// scalastyle:ignore 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.examples.sql.cloudant 18 | 19 | import org.apache.spark.rdd.RDD 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.streaming.{Seconds, StreamingContext, Time} 22 | 23 | import org.apache.bahir.cloudant.CloudantReceiver 24 | 25 | object CloudantStreaming { 26 | def main(args: Array[String]) { 27 | val spark = SparkSession.builder() 28 | .appName("Cloudant Spark SQL External Datasource in Scala") 29 | .master("local[*]") 30 | .getOrCreate() 31 | 32 | // Create the context with a 10 seconds batch size 33 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10)) 34 | import spark.implicits._ 35 | 36 | val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map( 37 | "cloudant.host" -> "examples.cloudant.com", 38 | "database" -> "sales"))) 39 | 40 | changes.foreachRDD((rdd: RDD[String], time: Time) => { 41 | // Get the singleton instance of SparkSession 42 | 43 | 44 | println(s"========= $time =========")// scalastyle:ignore 45 | // Convert RDD[String] to Dataset[String] 46 | val changesDataFrame = spark.read.json(rdd.toDS()) 47 | if (changesDataFrame.schema.nonEmpty) { 48 | changesDataFrame.printSchema() 49 | 50 | var hasDelRecord = false 51 | var hasMonth = false 52 | for (field <- changesDataFrame.schema.fieldNames) { 53 | if ("_deleted".equals(field)) { 54 | hasDelRecord = true 55 | } 56 | if ("month".equals(field)) { 57 | hasMonth = true 58 | } 59 | } 60 | if (hasDelRecord) { 61 | changesDataFrame.filter(changesDataFrame("_deleted")).select("*").show() 62 | } 63 | 64 | if (hasMonth) { 65 | changesDataFrame.filter(changesDataFrame("month") === "May").select("*").show(5) 66 | changesDataFrame.createOrReplaceTempView("sales") 67 | val salesInMayCountsDataFrame = 68 | spark.sql( 69 | s""" 70 | |select rep, amount 71 | |from sales 72 | |where month = "May" 73 | """.stripMargin) 74 | salesInMayCountsDataFrame.show(5) 75 | } 76 | } 77 | 78 | }) 79 | ssc.start() 80 | // run streaming for 60 secs 81 | Thread.sleep(60000L) 82 | ssc.stop(true) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantStreamingSelector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples.sql.cloudant 19 | 20 | import java.util.concurrent.atomic.AtomicLong 21 | 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.streaming.{ Seconds, StreamingContext, Time } 25 | 26 | import org.apache.bahir.cloudant.CloudantReceiver 27 | 28 | object CloudantStreamingSelector { 29 | def main(args: Array[String]) { 30 | val spark = SparkSession.builder() 31 | .appName("Cloudant Spark SQL External Datasource in Scala") 32 | .master("local[*]") 33 | .getOrCreate() 34 | 35 | import spark.implicits._ 36 | 37 | // Create the context with a 10 seconds batch size 38 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10)) 39 | val curTotalAmount = new AtomicLong(0) 40 | val curSalesCount = new AtomicLong(0) 41 | var batchAmount = 0L 42 | 43 | val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map( 44 | "cloudant.host" -> "examples.cloudant.com", 45 | "database" -> "sales", 46 | "selector" -> "{\"month\":\"May\", \"rep\":\"John\"}"))) 47 | 48 | changes.foreachRDD((rdd: RDD[String], time: Time) => { 49 | // Get the singleton instance of SQLContext 50 | 51 | println(s"========= $time =========") // scalastyle:ignore 52 | val changesDataFrame = spark.read.json(rdd.toDS()) 53 | if (changesDataFrame.schema.nonEmpty) { 54 | changesDataFrame.select("*").show() 55 | batchAmount = changesDataFrame.groupBy().sum("amount").collect()(0).getLong(0) 56 | curSalesCount.getAndAdd(changesDataFrame.count()) 57 | curTotalAmount.getAndAdd(batchAmount) 58 | println("Current sales count:" + curSalesCount)// scalastyle:ignore 59 | println("Current total amount:" + curTotalAmount)// scalastyle:ignore 60 | } else { 61 | ssc.stop() 62 | } 63 | }) 64 | 65 | ssc.start() 66 | ssc.awaitTermination() 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/java/org/apache/bahir/cloudant/common/ChangesRow.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.bahir.cloudant.common; 18 | 19 | import com.google.gson.JsonElement; 20 | import com.google.gson.JsonObject; 21 | 22 | import java.util.List; 23 | 24 | /** 25 | * Class representing a single row in a changes feed. Structure: 26 | * 27 | * { 28 | * last_seq": 5 29 | * "results": [ 30 | * ---*** This next items is the ChangesRow ***--- 31 | * { 32 | * "changes": [ {"rev": "2-eec205a9d413992850a6e32678485900"}, ... ], 33 | * "deleted": true, 34 | * "id": "deleted", 35 | * "seq": 5, 36 | * "doc": ... structure ... 
37 | * } 38 | * ] 39 | * } 40 | */ 41 | public class ChangesRow { 42 | 43 | public class Rev { 44 | private String rev; 45 | 46 | public String getRev() { 47 | return rev; 48 | } 49 | } 50 | 51 | private List changes; 52 | public Boolean deleted; 53 | private String id; 54 | private JsonElement seq; 55 | private JsonObject doc; 56 | 57 | public List getChanges() { 58 | return changes; 59 | } 60 | 61 | public String getSeq() { 62 | if (seq.isJsonNull()) { 63 | return null; 64 | } else { 65 | return seq.toString(); 66 | } 67 | } 68 | 69 | public String getId() { 70 | return id; 71 | } 72 | 73 | public JsonObject getDoc() { 74 | return doc; 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/java/org/apache/bahir/cloudant/common/ChangesRowScanner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.bahir.cloudant.common; 18 | 19 | import com.google.gson.Gson; 20 | import java.io.BufferedReader; 21 | import java.io.IOException; 22 | 23 | /** 24 | * This scanner will read through a _changes stream until it finds the 25 | * next meaningful row, either a change entry or the closing line with 26 | * the lastSeq and, perhaps, pending changes (for normal/longpoll feeds). 27 | */ 28 | public class ChangesRowScanner { 29 | 30 | private static final Gson gson = new Gson(); 31 | 32 | /** 33 | * Read up to the next meaningful line from the changes feed, and calls 34 | * the passed delegate depending on what it finds. Works for all styles of 35 | * changes feed (normal, longpoll, continuous). 
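As a rough sketch (not test code from the repository) of the Gson mapping that ChangesRow above is written for, the lines below deserialize one hypothetical result entry shaped like the structure documented in the class comment:

    import com.google.gson.Gson
    import org.apache.bahir.cloudant.common.ChangesRow

    // Hypothetical _changes entry, matching the structure shown in the ChangesRow comment
    val entry =
      """{"seq": 5, "id": "deleted", "deleted": true,
        | "changes": [{"rev": "2-eec205a9d413992850a6e32678485900"}]}""".stripMargin
    val row = new Gson().fromJson(entry, classOf[ChangesRow])
    println(row.getId)    // deleted
    println(row.getSeq)   // 5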
36 | * 37 | * @return True if should continue 38 | * 39 | * @throws IOException if there's a problem reading the stream 40 | */ 41 | public static ChangesRow readRowFromReader(BufferedReader changesReader) 42 | throws IOException { 43 | 44 | String line; 45 | 46 | // Read the next line (empty = heartbeat, ignore; null = end of stream) 47 | while ((line = changesReader.readLine()) != null) { 48 | if (line.isEmpty()) { 49 | continue; 50 | } 51 | if (line.startsWith("{\"results\":")) { 52 | // ignore, just the START of the result set in normal/longpoll mode 53 | continue; 54 | } else if (line.startsWith("],")) { 55 | // ignore, just the END of the result set in normal/longpoll mode 56 | continue; 57 | } 58 | break; 59 | } 60 | 61 | if(line != null) { 62 | if (line.startsWith("\"last_seq\":")) { 63 | return null; // End of feed 64 | } else if (line.startsWith("{\"last_seq\":")) { 65 | return null; // End of feed 66 | } else { 67 | if (line.endsWith(",")) { 68 | line = line.substring(0, line.length() - 1); 69 | } 70 | ChangesRow r = gson.fromJson(line, ChangesRow.class); 71 | return r; // not end of feed 72 | } 73 | } else { 74 | return null; 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | spark-sql { 2 | bulkSize = 200 3 | schemaSampleSize = -1 4 | createDBOnSave = false 5 | jsonstore.rdd = { 6 | partitions = 10 7 | maxInPartition = -1 8 | minInPartition = 10 9 | requestTimeout = 900000 10 | } 11 | cloudant = { 12 | batchInterval = 8 13 | endpoint = "_all_docs" 14 | numberOfRetries = 3 15 | protocol = https 16 | useQuery = false 17 | queryLimit = 25 18 | storageLevel = "MEMORY_ONLY" 19 | timeout = 60000 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/resources/reference.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/bahir/8b647c8ef80455ba900ef91e08eaf5eafa35c133/sql-cloudant/src/main/resources/reference.conf -------------------------------------------------------------------------------- /sql-cloudant/src/main/scala/org/apache/bahir/cloudant/CloudantChangesConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.bahir.cloudant 18 | 19 | import org.apache.spark.storage.StorageLevel 20 | 21 | import org.apache.bahir.cloudant.common.JsonStoreConfigManager 22 | 23 | class CloudantChangesConfig(protocol: String, host: String, dbName: String, 24 | indexName: String = null, viewName: String = null) 25 | (username: String, password: String, partitions: Int, 26 | maxInPartition: Int, minInPartition: Int, requestTimeout: Long, 27 | bulkSize: Int, schemaSampleSize: Int, 28 | createDBOnSave: Boolean, endpoint: String, selector: String, 29 | timeout: Int, storageLevel: StorageLevel, useQuery: Boolean, 30 | queryLimit: Int, batchInterval: Int, numberOfRetries: Int) 31 | extends CloudantConfig(protocol, host, dbName, indexName, viewName)(username, password, 32 | partitions, maxInPartition, minInPartition, requestTimeout, bulkSize, schemaSampleSize, 33 | createDBOnSave, endpoint, useQuery, queryLimit, numberOfRetries) { 34 | 35 | override val defaultIndex: String = endpoint 36 | 37 | def getBatchInterval : Int = { 38 | batchInterval 39 | } 40 | 41 | def getSelector : String = { 42 | if (selector != null && !selector.isEmpty) { 43 | selector 44 | } else { 45 | val version = getClient.serverVersion 46 | if (version.matches("1.*")) { 47 | null 48 | } else { 49 | // Exclude design docs and deleted=true docs 50 | "{ \"_id\": { \"$regex\": \"^(?!_design/)\" }, " + 51 | "\"_deleted\": { \"$exists\": false } }" 52 | } 53 | } 54 | } 55 | 56 | /* 57 | * Storage level when persisting RDDs during streaming. 58 | * See https://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence for 59 | * more details. 60 | * See [[org.apache.spark.storage.StorageLevel]] for all defined storage level options. 61 | */ 62 | def getStorageLevelForStreaming : StorageLevel = { 63 | if (storageLevel == null) { 64 | StorageLevel.MEMORY_ONLY 65 | } else { 66 | storageLevel 67 | } 68 | } 69 | 70 | def getContinuousChangesUrl: String = { 71 | var url = dbUrl + "/" + defaultIndex + "?include_docs=true&feed=continuous&heartbeat=3000" 72 | if (getSelector != null) { 73 | url = url + "&filter=_selector" 74 | } 75 | url 76 | } 77 | 78 | def getChangesReceiverUrl: String = { 79 | var url = dbUrl + "/" + defaultIndex + "?include_docs=true&feed=normal" + 80 | "&seq_interval=" + bulkSize + "&timeout=" + timeout 81 | if (getSelector != null) { 82 | url = url + "&filter=_selector" 83 | } 84 | url 85 | } 86 | 87 | // Use _all_docs endpoint for getting the total number of docs 88 | def getTotalUrl: String = { 89 | dbUrl + "/" + JsonStoreConfigManager.ALL_DOCS_INDEX 90 | } 91 | } 92 | 93 | object CloudantChangesConfig { 94 | // Error message from internal _changes receiver 95 | var receiverErrorMsg: String = "" 96 | } 97 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/scala/org/apache/bahir/cloudant/CloudantReceiver.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
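For orientation, a sketch of the _changes URLs the two builder methods above assemble; the host and database name are made up, bulkSize = 200 and timeout = 60000 mirror the application.conf defaults shown earlier, dbUrl comes from the parent CloudantConfig, and the endpoint is assumed to resolve to _changes:

    // Illustrative values only (non-null selector, so the _selector filter is appended):
    //   dbUrl                   = "https://ACCOUNT.cloudant.com/sales"
    //   getChangesReceiverUrl   = ".../sales/_changes?include_docs=true&feed=normal" +
    //                             "&seq_interval=200&timeout=60000&filter=_selector"
    //   getContinuousChangesUrl = ".../sales/_changes?include_docs=true&feed=continuous" +
    //                             "&heartbeat=3000&filter=_selector"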
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.bahir.cloudant 18 | 19 | import java.io.{BufferedReader, InputStreamReader} 20 | import java.util.concurrent.TimeUnit 21 | 22 | import okhttp3._ 23 | 24 | import org.apache.spark.SparkConf 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.streaming.receiver.Receiver 27 | 28 | import org.apache.bahir.cloudant.common._ 29 | 30 | class CloudantReceiver(sparkConf: SparkConf, cloudantParams: Map[String, String]) 31 | extends Receiver[String](StorageLevel.MEMORY_AND_DISK) { 32 | // CloudantChangesConfig requires `_changes` endpoint option 33 | lazy val config: CloudantChangesConfig = { 34 | JsonStoreConfigManager.getConfig(sparkConf, cloudantParams 35 | + ("cloudant.endpoint" -> JsonStoreConfigManager.CHANGES_INDEX) 36 | ).asInstanceOf[CloudantChangesConfig] 37 | } 38 | 39 | def onStart() { 40 | // Start the thread that receives data over a connection 41 | new Thread("Cloudant Receiver") { 42 | override def run() { receive() } 43 | }.start() 44 | } 45 | 46 | private def receive(): Unit = { 47 | val okHttpClient: OkHttpClient = new OkHttpClient.Builder() 48 | .connectTimeout(5, TimeUnit.SECONDS) 49 | .readTimeout(60, TimeUnit.SECONDS) 50 | .build 51 | val url = config.getChangesReceiverUrl.toString 52 | 53 | val builder = new Request.Builder().url(url) 54 | if (config.username != null) { 55 | val credential = Credentials.basic(config.username, config.password) 56 | builder.header("Authorization", credential) 57 | } 58 | if(config.getSelector != null) { 59 | val jsonType = MediaType.parse("application/json; charset=utf-8") 60 | val selector = "{\"selector\":" + config.getSelector + "}" 61 | val selectorBody = RequestBody.create(jsonType, selector) 62 | builder.post(selectorBody) 63 | } 64 | 65 | val request = builder.build 66 | val response = okHttpClient.newCall(request).execute 67 | val status_code = response.code 68 | 69 | if (status_code == 200) { 70 | val changesInputStream = response.body.byteStream 71 | var json = new ChangesRow() 72 | if (changesInputStream != null) { 73 | val bufferedReader = new BufferedReader(new InputStreamReader(changesInputStream)) 74 | while ((json = ChangesRowScanner.readRowFromReader(bufferedReader)) != null) { 75 | if (!isStopped() && json != null && !json.getDoc.has("_deleted")) { 76 | store(json.getDoc.toString) 77 | } 78 | } 79 | } 80 | } else { 81 | val errorMsg = "Error retrieving _changes feed " + config.getDbname + ": " + status_code 82 | reportError(errorMsg, new CloudantException(errorMsg)) 83 | } 84 | } 85 | 86 | def onStop(): Unit = { 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/scala/org/apache/bahir/cloudant/common/CloudantException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.bahir.cloudant.common 18 | 19 | class CloudantException(msg: String) extends RuntimeException(msg) { 20 | def this(msg: String, cause: Throwable) { 21 | this(msg) 22 | initCause(cause) 23 | } 24 | 25 | def this(cause: Throwable) = { 26 | this(Option(cause).map(_.toString).orNull) 27 | initCause(cause) 28 | } 29 | 30 | def this() = { 31 | this(null: String) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/scala/org/apache/bahir/cloudant/common/JsonUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.bahir.cloudant.common 18 | 19 | import scala.util.control.Breaks._ 20 | 21 | import com.google.gson.{JsonElement, JsonParser} 22 | 23 | object JsonUtil { 24 | def getField(row: JsonElement, field: String) : Option[JsonElement] = { 25 | var path = field.split('.') 26 | var currentValue = row 27 | var finalValue: Option[JsonElement] = None 28 | breakable { 29 | for (i <- path.indices) { 30 | if (currentValue != null && currentValue.isJsonObject) { 31 | val f: Option[JsonElement] = 32 | Option(currentValue.getAsJsonObject.get(path(i))) 33 | f match { 34 | case Some(f2) => currentValue = f2 35 | case None => break 36 | } 37 | if (i == path.length - 1) { 38 | // The leaf node 39 | finalValue = Some(currentValue) 40 | } 41 | } 42 | } 43 | } 44 | finalValue 45 | } 46 | 47 | object JsonConverter { 48 | val parser = new JsonParser 49 | def toJson(value: Any): JsonElement = { 50 | parser.parse(value.toString) 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /sql-cloudant/src/main/scala/org/apache/bahir/cloudant/internal/ChangesReceiver.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
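A small sketch (not from the repository) of how JsonUtil.getField above resolves a dot-separated path against a Gson tree; the document content is made up:

    import com.google.gson.JsonParser
    import org.apache.bahir.cloudant.common.JsonUtil

    val doc = new JsonParser().parse("""{"flight": {"segment": {"id": "AA106"}}}""")
    JsonUtil.getField(doc, "flight.segment.id")   // Some(JsonElement) wrapping "AA106"
    JsonUtil.getField(doc, "flight.missing.id")   // None -- lookup stops at the absent key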
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.bahir.cloudant.internal 18 | 19 | import java.io.{BufferedReader, InputStreamReader} 20 | import java.util.concurrent.TimeUnit 21 | 22 | import com.google.gson.JsonParser 23 | import okhttp3._ 24 | 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.streaming.receiver.Receiver 27 | 28 | import org.apache.bahir.cloudant.CloudantChangesConfig 29 | import org.apache.bahir.cloudant.common._ 30 | 31 | class ChangesReceiver(config: CloudantChangesConfig) 32 | extends Receiver[String](StorageLevel.MEMORY_AND_DISK) { 33 | 34 | def onStart() { 35 | // Start the thread that receives data over a connection 36 | new Thread("Cloudant Receiver") { 37 | override def run() { receive() } 38 | }.start() 39 | } 40 | 41 | private def receive(): Unit = { 42 | val okHttpClient: OkHttpClient = new OkHttpClient.Builder() 43 | .connectTimeout(5, TimeUnit.SECONDS) 44 | .readTimeout(60, TimeUnit.SECONDS) 45 | .build 46 | val url = config.getChangesReceiverUrl.toString 47 | 48 | val builder = new Request.Builder().url(url) 49 | if (config.username != null) { 50 | val credential = Credentials.basic(config.username, config.password) 51 | builder.header("Authorization", credential) 52 | } 53 | if(config.getSelector != null) { 54 | val jsonType = MediaType.parse("application/json; charset=utf-8") 55 | val selector = "{\"selector\":" + config.getSelector + "}" 56 | val selectorBody = RequestBody.create(jsonType, selector) 57 | builder.post(selectorBody) 58 | } 59 | 60 | val request = builder.build 61 | val response = okHttpClient.newCall(request).execute 62 | val status_code = response.code 63 | 64 | if (status_code == 200) { 65 | val changesInputStream = response.body.byteStream 66 | var json = new ChangesRow() 67 | if (changesInputStream != null) { 68 | val bufferedReader = new BufferedReader(new InputStreamReader(changesInputStream)) 69 | while ((json = ChangesRowScanner.readRowFromReader(bufferedReader)) != null) { 70 | if (!isStopped() && json != null && !json.getDoc.has("_deleted")) { 71 | store(json.getDoc.toString) 72 | } 73 | } 74 | } 75 | } else { 76 | val responseAsJson = new JsonParser().parse(response.body.string) 77 | val errorMsg = "Error retrieving _changes feed data from database " + "'" + 78 | config.getDbname + "' with response code " + status_code + ": " + responseAsJson.toString 79 | reportError(errorMsg, new CloudantException(errorMsg)) 80 | CloudantChangesConfig.receiverErrorMsg = errorMsg 81 | } 82 | } 83 | 84 | override def onStop(): Unit = { 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /sql-cloudant/src/test/resources/json-files/n_airportcodemapping.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_id": "BOM", 4 | 
"airportName": "Mumbai" 5 | }, 6 | { 7 | "_id": "CDG", 8 | "airportName": "Paris" 9 | }, 10 | { 11 | "_id": "DEL", 12 | "airportName": "Delhi" 13 | }, 14 | { 15 | "_id": "FCO", 16 | "airportName": "Rome" 17 | }, 18 | { 19 | "_id": "FRA", 20 | "airportName": "Frankfurt" 21 | }, 22 | { 23 | "_id": "HKG", 24 | "airportName": "Hong Kong" 25 | }, 26 | { 27 | "_id": "IKA", 28 | "airportName": "Tehran" 29 | }, 30 | { 31 | "_id": "JFK", 32 | "airportName": "New York" 33 | }, 34 | { 35 | "_id": "LHR", 36 | "airportName": "London" 37 | }, 38 | { 39 | "_id": "NRT", 40 | "airportName": "Tokyo" 41 | }, 42 | { 43 | "_id": "SIN", 44 | "airportName": "Singapore" 45 | }, 46 | { 47 | "_id": "SVO", 48 | "airportName": "Moscow" 49 | }, 50 | { 51 | "_id": "SYD", 52 | "airportName": "Sydney" 53 | }, 54 | { 55 | "_id": "YUL", 56 | "airportName": "Montreal" 57 | } 58 | ] -------------------------------------------------------------------------------- /sql-cloudant/src/test/resources/json-files/n_booking.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "flightId": "AA93", 4 | "_id": "696869c2-1035-4d4c-8142-86985f5f199e", 5 | "customerId": "uid0@email.com", 6 | "dateOfBooking": "2017-04-12T22:19:45.910Z" 7 | }, 8 | { 9 | "language": "javascript", 10 | "views": { 11 | 12 | }, 13 | "_id": "_design/view", 14 | "indexes": { 15 | "n_bookings": { 16 | "index": "function(doc){\n index(\"default\", doc._id);\n \t if(doc.customerId){\n \tindex(\"customerId\", doc.customerId, {\"store\": \"yes\"});\n \t }\n}", 17 | "analyzer": "standard" 18 | } 19 | } 20 | }, 21 | { 22 | "flightId": "AA330", 23 | "_id": "ccb8fc78-1b29-42ef-bff2-a4a81ae1f807", 24 | "customerId": "uid0@email.com", 25 | "dateOfBooking": "2017-04-12T22:19:46.140Z" 26 | } 27 | ] -------------------------------------------------------------------------------- /sql-cloudant/src/test/resources/json-files/n_customersession.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "language": "javascript", 4 | "indexes": { 5 | "n_customersessions": { 6 | "index": "function(doc){\n\t index(\"default\", doc._id);\n \t if(doc.customerid){\n \tindex(\"customerid\", doc.customerid, {\"store\": \"yes\"});\n \t }\n}", 7 | "analyzer": "standard" 8 | } 9 | }, 10 | "_id": "_design/view", 11 | "views": { 12 | 13 | } 14 | }, 15 | { 16 | "customerid": "uid0@email.com", 17 | "lastAccessedTime": "2017-04-12T22:19:45.449Z", 18 | "_id": "a1346fce-2b45-422c-a5d0-1554a47b31e6", 19 | "timeoutTime": "2017-04-13T22:19:45.449Z" 20 | } 21 | ] -------------------------------------------------------------------------------- /sql-cloudant/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark_project.jetty=WARN 28 | -------------------------------------------------------------------------------- /sql-cloudant/src/test/scala/org/apache/bahir/cloudant/TestUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.cloudant 19 | 20 | object TestUtils { 21 | // Set CouchDB/Cloudant host, username and password for local testing 22 | private val host = System.getenv("CLOUDANT_HOST") 23 | private val username = System.getenv("CLOUDANT_USER") 24 | private val password = System.getenv("CLOUDANT_PASSWORD") 25 | private val protocol = System.getenv("CLOUDANT_PROTOCOL") 26 | 27 | // List of test databases to create from JSON flat files 28 | val testDatabasesList: List[String] = List( 29 | "n_airportcodemapping", 30 | "n_booking", 31 | "n_customer", 32 | "n_customersession", 33 | "n_flight", 34 | "n_flight2", 35 | "n_flightsegment" 36 | ) 37 | 38 | // default value is https for cloudant.com accounts 39 | def getProtocol: String = { 40 | if (protocol != null && !protocol.isEmpty) { 41 | protocol 42 | } else { 43 | "https" 44 | } 45 | } 46 | 47 | def getHost: String = { 48 | if (host != null && !host.isEmpty) { 49 | host 50 | } else { 51 | getUsername + ".cloudant.com" 52 | } 53 | } 54 | 55 | def getUsername: String = { 56 | username 57 | } 58 | 59 | def getPassword: String = { 60 | password 61 | } 62 | 63 | def shouldRunTest(): Boolean = { 64 | val isEnvSet = (username != null && !username.isEmpty) && 65 | (password != null && !password.isEmpty) 66 | if (isEnvSet) { 67 | // scalastyle:off println 68 | println( 69 | s""" 70 | |Sql-cloudant tests that require Cloudant databases have been enabled by 71 | |the environment variables CLOUDANT_USER and CLOUDANT_PASSWORD. 
72 | """.stripMargin) 73 | // scalastyle:on println 74 | } 75 | isEnvSet 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /sql-streaming-akka/examples/src/main/java/org/apache/bahir/examples/sql/streaming/akka/JavaAkkaStreamWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.examples.sql.streaming.akka; 19 | 20 | import org.apache.log4j.Level; 21 | import org.apache.log4j.Logger; 22 | import org.apache.spark.SparkConf; 23 | import org.apache.spark.api.java.function.FlatMapFunction; 24 | import org.apache.spark.sql.Dataset; 25 | import org.apache.spark.sql.Encoders; 26 | import org.apache.spark.sql.Row; 27 | import org.apache.spark.sql.SparkSession; 28 | import org.apache.spark.sql.streaming.StreamingQuery; 29 | 30 | import java.util.Arrays; 31 | import java.util.Iterator; 32 | 33 | /** 34 | * Counts words in UTF8 encoded, '\n' delimited text received from Akka Feeder Actor system. 35 | * 36 | * Usage: AkkaStreamWordCount 37 | * provides the uri of the publisher or feeder actor that Structured Streaming 38 | * would connect to receive data. 39 | * 40 | * To run this on your local machine, a Feeder Actor System should be up and running. 
41 | * 42 | */ 43 | public final class JavaAkkaStreamWordCount { 44 | 45 | public static void main(String[] args) throws Exception { 46 | if (args.length < 1) { 47 | System.err.println("Usage: JavaAkkaStreamWordCount "); 48 | System.exit(1); 49 | } 50 | 51 | if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) { 52 | Logger.getRootLogger().setLevel(Level.WARN); 53 | } 54 | 55 | String urlOfPublisher = args[0]; 56 | 57 | SparkConf sparkConf = new SparkConf().setAppName("JavaAkkaStreamWordCount"); 58 | 59 | // check Spark configuration for master URL, set it to local if not configured 60 | if (!sparkConf.contains("spark.master")) { 61 | sparkConf.setMaster("local[4]"); 62 | } 63 | 64 | SparkSession spark = SparkSession.builder() 65 | .config(sparkConf) 66 | .getOrCreate(); 67 | 68 | // Create DataFrame representing the stream of input lines from connection 69 | // to publisher or feeder actor 70 | Dataset lines = spark 71 | .readStream() 72 | .format("org.apache.bahir.sql.streaming.akka.AkkaStreamSourceProvider") 73 | .option("urlOfPublisher", urlOfPublisher) 74 | .load().select("value").as(Encoders.STRING()); 75 | 76 | // Split the lines into words 77 | Dataset words = lines.flatMap(new FlatMapFunction() { 78 | @Override 79 | public Iterator call(String s) throws Exception { 80 | return Arrays.asList(s.split(" ")).iterator(); 81 | } 82 | }, Encoders.STRING()); 83 | 84 | // Generate running word count 85 | Dataset wordCounts = words.groupBy("value").count(); 86 | 87 | // Start running the query that prints the running counts to the console 88 | StreamingQuery query = wordCounts.writeStream() 89 | .outputMode("complete") 90 | .format("console") 91 | .start(); 92 | 93 | query.awaitTermination(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /sql-streaming-akka/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/akka/AkkaStreamWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.examples.sql.streaming.akka 19 | 20 | import java.sql.Timestamp 21 | 22 | import org.apache.spark.sql.SparkSession 23 | 24 | /** 25 | * Counts words in UTF8 encoded, '\n' delimited text received from Akka Feeder Actor system. 26 | * 27 | * Usage: AkkaStreamWordCount 28 | * provides the uri of the publisher or feeder actor that Structured Streaming 29 | * would connect to receive data. 30 | * 31 | * To run this on your local machine, a Feeder Actor System should be up and running. 
32 | * 33 | */ 34 | object AkkaStreamWordCount { 35 | def main(args: Array[String]): Unit = { 36 | if (args.length < 1) { 37 | System.err.println("Usage: AkkaStreamWordCount ") // scalastyle:off println 38 | System.exit(1) 39 | } 40 | 41 | val urlOfPublisher = args(0) 42 | 43 | val spark = SparkSession 44 | .builder() 45 | .appName("AkkaStreamWordCount") 46 | .master("local[4]") 47 | .getOrCreate() 48 | 49 | import spark.implicits._ 50 | 51 | // Create DataFrame representing the stream of input lines from connection 52 | // to publisher or feeder actor 53 | val lines = spark.readStream 54 | .format("org.apache.bahir.sql.streaming.akka.AkkaStreamSourceProvider") 55 | .option("urlOfPublisher", urlOfPublisher) 56 | .load().as[(String, Timestamp)] 57 | 58 | // Split the lines into words 59 | val words = lines.map(_._1).flatMap(_.split(" ")) 60 | 61 | // Generate running word count 62 | val wordCounts = words.groupBy("value").count() 63 | 64 | // Start running the query that prints the running counts to the console 65 | val query = wordCounts.writeStream 66 | .outputMode("complete") 67 | .format("console") 68 | .start() 69 | 70 | query.awaitTermination() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /sql-streaming-akka/src/main/assembly/assembly.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | test-jar-with-dependencies 19 | 20 | jar 21 | 22 | false 23 | 24 | 25 | 26 | ${project.build.directory}/scala-${scala.binary.version}/test-classes 27 | 28 | 29 | 30 | 31 | 32 | 33 | true 34 | test 35 | true 36 | 37 | org.apache.hadoop:*:jar 38 | org.apache.zookeeper:*:jar 39 | org.apache.avro:*:jar 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /sql-streaming-akka/src/main/scala/org/apache/bahir/sql/streaming/akka/LongOffset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.sql.streaming.akka 19 | 20 | import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} 21 | import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2} 22 | 23 | /** 24 | * @note As of 2.3.0, [[org.apache.spark.sql.execution.streaming.LongOffset]] 25 | * hasn't extended v2 Offset yet. Fix version is 3.0.0. Until then 26 | * this is a required class. 
27 | * @see SPARK-23092 28 | */ 29 | case class LongOffset(offset: Long) extends OffsetV2 { 30 | 31 | override val json = offset.toString 32 | 33 | def +(increment: Long): LongOffset = new LongOffset(offset + increment) 34 | def -(decrement: Long): LongOffset = new LongOffset(offset - decrement) 35 | } 36 | 37 | object LongOffset { 38 | 39 | /** 40 | * LongOffset factory from serialized offset. 41 | * @return new LongOffset 42 | */ 43 | def apply(offset: SerializedOffset) : LongOffset = new LongOffset(offset.json.toLong) 44 | 45 | /** 46 | * Convert generic Offset to LongOffset if possible. 47 | * @return converted LongOffset 48 | */ 49 | def convert(offset: Offset): Option[LongOffset] = offset match { 50 | case lo: LongOffset => Some(lo) 51 | case so: SerializedOffset => Some(LongOffset(so)) 52 | case _ => None 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /sql-streaming-akka/src/main/scala/org/apache/bahir/sql/streaming/akka/MessageStore.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
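A brief sketch of how this offset type behaves, with illustrative values (SerializedOffset is the Spark class imported above):

    val start = LongOffset(5L)
    val next  = start + 1                      // LongOffset(6)
    next.json                                  // "6" -- serialized form kept in offset logs
    LongOffset.convert(next)                   // Some(LongOffset(6))
    LongOffset.convert(SerializedOffset("7"))  // Some(LongOffset(7))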
16 | */ 17 | 18 | package org.apache.bahir.sql.streaming.akka 19 | 20 | import java.nio.ByteBuffer 21 | 22 | import scala.reflect.ClassTag 23 | 24 | import org.rocksdb.RocksDB 25 | 26 | import org.apache.spark.serializer.{JavaSerializer, Serializer, SerializerInstance} 27 | import org.apache.spark.SparkConf 28 | 29 | import org.apache.bahir.utils.Logging 30 | 31 | 32 | trait MessageStore { 33 | 34 | def store[T: ClassTag](id: Long, message: T): Boolean 35 | 36 | def retrieve[T: ClassTag](start: Long, end: Long): Seq[Option[T]] 37 | 38 | def retrieve[T: ClassTag](id: Long): Option[T] 39 | 40 | def maxProcessedOffset: Long 41 | } 42 | 43 | private[akka] class LocalMessageStore(val persistentStore: RocksDB, 44 | val serializer: Serializer) 45 | extends MessageStore with Logging { 46 | 47 | val classLoader = Thread.currentThread().getContextClassLoader 48 | 49 | def this(persistentStore: RocksDB, conf: SparkConf) = 50 | this(persistentStore, new JavaSerializer(conf)) 51 | 52 | val serializerInstance: SerializerInstance = serializer.newInstance() 53 | 54 | private def get(id: Long) = persistentStore.get(id.toString.getBytes) 55 | 56 | override def maxProcessedOffset: Long = persistentStore.getLatestSequenceNumber 57 | 58 | override def store[T: ClassTag](id: Long, message: T): Boolean = { 59 | val bytes: Array[Byte] = serializerInstance.serialize(message).array() 60 | try { 61 | persistentStore.put(id.toString.getBytes(), bytes) 62 | true 63 | } catch { 64 | case e: Exception => log.warn(s"Failed to store message Id: $id", e) 65 | false 66 | } 67 | } 68 | 69 | override def retrieve[T: ClassTag](start: Long, end: Long): Seq[Option[T]] = { 70 | (start until end).map(x => retrieve(x)) 71 | } 72 | 73 | override def retrieve[T: ClassTag](id: Long): Option[T] = { 74 | val bytes = persistentStore.get(id.toString.getBytes) 75 | 76 | if (bytes != null) { 77 | Some(serializerInstance.deserialize( 78 | ByteBuffer.wrap(bytes), classLoader)) 79 | } else { 80 | None 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /sql-streaming-akka/src/test/resources/feeder_actor.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
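A hedged usage sketch of the RocksDB-backed store above; the database path is illustrative, and because LocalMessageStore is private[akka], code like this would have to sit in the same package:

    import org.rocksdb.RocksDB
    import org.apache.spark.SparkConf

    RocksDB.loadLibrary()
    val db    = RocksDB.open("/tmp/akka-message-store")   // illustrative path
    val store = new LocalMessageStore(db, new SparkConf())

    store.store(1L, "hello")          // persist message #1
    store.retrieve[String](1L)        // Some("hello")
    store.maxProcessedOffset          // latest RocksDB sequence number
    db.close()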
16 | # 17 | 18 | akka { 19 | loglevel = "INFO" 20 | actor { 21 | provider = "akka.remote.RemoteActorRefProvider" 22 | } 23 | remote { 24 | enabled-transports = ["akka.remote.netty.tcp"] 25 | netty.tcp { 26 | hostname = "127.0.0.1" 27 | port = 0 28 | } 29 | log-sent-messages = on 30 | log-received-messages = on 31 | } 32 | loggers.0 = "akka.event.slf4j.Slf4jLogger" 33 | log-dead-letters-during-shutdown = "off" 34 | } 35 | -------------------------------------------------------------------------------- /sql-streaming-akka/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark_project.jetty=WARN 28 | -------------------------------------------------------------------------------- /sql-streaming-akka/src/test/scala/org/apache/bahir/sql/streaming/akka/AkkaTestUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | // scalastyle:off println 19 | package org.apache.bahir.sql.streaming.akka 20 | 21 | import java.io.File 22 | 23 | import scala.collection.mutable 24 | import scala.concurrent.Await 25 | import scala.concurrent.duration._ 26 | import scala.util.Random 27 | 28 | import akka.actor.{Actor, ActorRef, ActorSystem, ExtendedActorSystem, Props} 29 | import com.typesafe.config.{Config, ConfigFactory} 30 | 31 | import org.apache.bahir.utils.Logging 32 | 33 | class AkkaTestUtils extends Logging { 34 | private val actorSystemName = "feeder-actor-system" 35 | private var actorSystem: ActorSystem = _ 36 | 37 | private val feederActorName = "feederActor" 38 | 39 | private var message: String = _ 40 | private var count = 1 41 | 42 | def getFeederActorConfig(): Config = { 43 | val configFile = getClass.getClassLoader 44 | .getResource("feeder_actor.conf").getFile 45 | ConfigFactory.parseFile(new File(configFile)) 46 | } 47 | 48 | def getFeederActorUri(): String = 49 | s"${actorSystem.asInstanceOf[ExtendedActorSystem].provider.getDefaultAddress}" + 50 | s"/user/$feederActorName" 51 | 52 | class FeederActor extends Actor { 53 | 54 | val rand = new Random() 55 | val receivers = new mutable.LinkedHashSet[ActorRef]() 56 | 57 | val sendMessageThread = 58 | new Thread() { 59 | override def run(): Unit = { 60 | var counter = 0 61 | while (counter < count) { 62 | // Thread.sleep(500) 63 | receivers.foreach(_ ! message) 64 | counter += 1 65 | } 66 | } 67 | } 68 | 69 | override def receive: Receive = { 70 | case SubscribeReceiver(receiverActor: ActorRef) => 71 | log.debug(s"received subscribe from ${receiverActor.toString}") 72 | receivers += receiverActor 73 | sendMessageThread.run() 74 | 75 | case UnsubscribeReceiver(receiverActor: ActorRef) => 76 | log.debug(s"received unsubscribe from ${receiverActor.toString}") 77 | receivers -= receiverActor 78 | } 79 | } 80 | 81 | def setup(): Unit = { 82 | val feederConf = getFeederActorConfig() 83 | 84 | actorSystem = ActorSystem(actorSystemName, feederConf) 85 | actorSystem.actorOf(Props(new FeederActor), feederActorName) 86 | } 87 | 88 | def shutdown(): Unit = { 89 | Await.ready(actorSystem.terminate(), 5.seconds) 90 | } 91 | 92 | def setMessage(message: String): Unit = this.message = message 93 | def setCountOfMessages(messageCount: Int): Unit = count = messageCount 94 | } 95 | -------------------------------------------------------------------------------- /sql-streaming-jdbc/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/jdbc/JdbcSinkDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
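A short sketch (illustrative, not taken from the test suite) of how the helper above can drive a feeder actor during a test:

    val akkaTestUtils = new AkkaTestUtils
    akkaTestUtils.setMessage("hello bahir")   // payload each subscribed receiver gets
    akkaTestUtils.setCountOfMessages(100)     // how many times it is sent
    akkaTestUtils.setup()                     // start the feeder actor system

    // getFeederActorUri() is what a test would pass as the source's urlOfPublisher option
    val feederUri = akkaTestUtils.getFeederActorUri()

    // ... run a streaming query against the Akka source using feederUri ...

    akkaTestUtils.shutdown()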
16 | */ 17 | 18 | package org.apache.bahir.examples.sql.streaming.jdbc 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions 22 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 23 | 24 | /** 25 | * Mock using rate source, change the log to a simple Person 26 | * object with name and age property, and write to jdbc. 27 | * 28 | * Usage: JdbcSinkDemo 29 | */ 30 | object JdbcSinkDemo { 31 | 32 | private case class Person(name: String, age: Int) 33 | 34 | def main(args: Array[String]): Unit = { 35 | if (args.length < 4) { 36 | // scalastyle:off println 37 | System.err.println("Usage: JdbcSinkDemo ") 38 | // scalastyle:on 39 | System.exit(1) 40 | } 41 | 42 | val jdbcUrl = args(0) 43 | val tableName = args(1) 44 | val username = args(2) 45 | val password = args(3) 46 | 47 | val spark = SparkSession 48 | .builder() 49 | .appName("JdbcSinkDemo") 50 | .getOrCreate() 51 | 52 | // load data source 53 | val df = spark.readStream 54 | .format("rate") 55 | .option("numPartitions", "5") 56 | .option("rowsPerSecond", "100") 57 | .load() 58 | 59 | // change input value to a person object. 60 | import spark.implicits._ 61 | val lines = df.select("value").as[Long].map{ value => 62 | Person(s"name_${value}", value.toInt % 30) 63 | } 64 | 65 | lines.printSchema() 66 | 67 | // write result 68 | val query = lines.writeStream 69 | .outputMode("append") 70 | .format("streaming-jdbc") 71 | .outputMode(OutputMode.Append) 72 | .option(JDBCOptions.JDBC_URL, jdbcUrl) 73 | .option(JDBCOptions.JDBC_TABLE_NAME, tableName) 74 | .option(JDBCOptions.JDBC_DRIVER_CLASS, "com.mysql.jdbc.Driver") 75 | .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, "5") 76 | .option("user", username) 77 | .option("password", password) 78 | .trigger(Trigger.ProcessingTime("10 seconds")) 79 | .start() 80 | 81 | query.awaitTermination() 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /sql-streaming-jdbc/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 4.0.0 21 | 22 | org.apache.bahir 23 | bahir-parent_2.12 24 | 3.0.0-SNAPSHOT 25 | ../pom.xml 26 | 27 | 28 | org.apache.bahir 29 | spark-sql-streaming-jdbc_2.12 30 | 31 | sql-streaming-jdbc 32 | 33 | jar 34 | Apache Bahir - Spark SQL Streaming JDBC 35 | http://bahir.apache.org/ 36 | 37 | 38 | 39 | org.apache.bahir 40 | bahir-common_${scala.binary.version} 41 | ${project.version} 42 | 43 | 44 | org.apache.spark 45 | spark-tags_${scala.binary.version} 46 | 47 | 48 | org.apache.spark 49 | spark-sql_${scala.binary.version} 50 | ${spark.version} 51 | 52 | 53 | org.apache.spark 54 | spark-sql_${scala.binary.version} 55 | ${spark.version} 56 | test-jar 57 | test 58 | 59 | 60 | org.apache.spark 61 | spark-catalyst_${scala.binary.version} 62 | ${spark.version} 63 | test-jar 64 | test 65 | 66 | 67 | org.apache.spark 68 | spark-core_${scala.binary.version} 69 | ${spark.version} 70 | test-jar 71 | test 72 | 73 | 74 | org.scalacheck 75 | scalacheck_${scala.binary.version} 76 | test 77 | 78 | 79 | com.h2database 80 | h2 81 | 1.4.195 82 | test 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /sql-streaming-jdbc/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor 
license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | org.apache.bahir.sql.streaming.jdbc.JdbcSourceProvider -------------------------------------------------------------------------------- /sql-streaming-jdbc/src/main/scala/org/apache/bahir/sql/streaming/jdbc/JdbcSourceProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.sql.streaming.jdbc 19 | 20 | import scala.collection.JavaConverters._ 21 | 22 | import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions 23 | import org.apache.spark.sql.sources.DataSourceRegister 24 | import org.apache.spark.sql.sources.v2.{DataSourceOptions, StreamWriteSupport} 25 | import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter 26 | import org.apache.spark.sql.streaming.OutputMode 27 | import org.apache.spark.sql.types.StructType 28 | 29 | class JdbcSourceProvider extends StreamWriteSupport with DataSourceRegister{ 30 | override def createStreamWriter(queryId: String, schema: StructType, 31 | mode: OutputMode, options: DataSourceOptions): StreamWriter = { 32 | val optionMap = options.asMap().asScala.toMap 33 | // add this for parameter check. 34 | new JDBCOptions(optionMap) 35 | new JdbcStreamWriter(schema, optionMap) 36 | } 37 | 38 | // short name 'jdbc' is used for batch, chose a different name for streaming. 39 | override def shortName(): String = "streaming-jdbc" 40 | } 41 | -------------------------------------------------------------------------------- /sql-streaming-jdbc/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
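JdbcSourceProvider above registers the sink under the short name streaming-jdbc, which is what the JdbcSinkDemo example's format("streaming-jdbc") resolves to through the META-INF service file. Once the demo query is running, the rows it inserts can be inspected with Spark's ordinary batch JDBC reader; a hedged sketch, reusing the demo's command-line values (URL, table, and credentials are placeholders):

    // Read back what the streaming-jdbc sink has written so far, via the batch JDBC source.
    val written = spark.read
      .format("jdbc")
      .option("url", jdbcUrl)                  // same JDBC URL passed to JdbcSinkDemo
      .option("dbtable", tableName)            // same target table
      .option("user", username)
      .option("password", password)
      .option("driver", "com.mysql.jdbc.Driver")
      .load()

    written.groupBy("age").count().show()      // e.g. distribution of the generated ages
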
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark_project.jetty=WARN 28 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/examples/src/main/java/org/apache/bahir/examples/sql/streaming/mqtt/JavaMQTTStreamWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.examples.sql.streaming.mqtt; 19 | 20 | import org.apache.log4j.Logger; 21 | import org.apache.log4j.Level; 22 | import org.apache.spark.SparkConf; 23 | import org.apache.spark.api.java.function.FlatMapFunction; 24 | import org.apache.spark.sql.Dataset; 25 | import org.apache.spark.sql.Encoders; 26 | import org.apache.spark.sql.Row; 27 | import org.apache.spark.sql.SparkSession; 28 | import org.apache.spark.sql.streaming.StreamingQuery; 29 | 30 | import java.util.Arrays; 31 | import java.util.Iterator; 32 | 33 | /** 34 | * Counts words in UTF8 encoded, '\n' delimited text received from MQTT Server. 35 | * 36 | * Usage: JavaMQTTStreamWordCount 37 | * and describe the MQTT server that Structured Streaming 38 | * would connect to receive data. 39 | * 40 | * To run this on your local machine, a MQTT Server should be up and running. 
41 | * 42 | */ 43 | public final class JavaMQTTStreamWordCount { 44 | 45 | public static void main(String[] args) throws Exception { 46 | if (args.length < 2) { 47 | System.err.println("Usage: JavaMQTTStreamWordCount "); 48 | System.exit(1); 49 | } 50 | 51 | if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) { 52 | Logger.getRootLogger().setLevel(Level.WARN); 53 | } 54 | 55 | String brokerUrl = args[0]; 56 | String topic = args[1]; 57 | 58 | SparkConf sparkConf = new SparkConf().setAppName("JavaMQTTStreamWordCount"); 59 | 60 | // check Spark configuration for master URL, set it to local if not configured 61 | if (!sparkConf.contains("spark.master")) { 62 | sparkConf.setMaster("local[4]"); 63 | } 64 | 65 | SparkSession spark = SparkSession.builder() 66 | .config(sparkConf) 67 | .getOrCreate(); 68 | 69 | // Create DataFrame representing the stream of input lines from connection to mqtt server 70 | Dataset lines = spark 71 | .readStream() 72 | .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider") 73 | .option("topic", topic) 74 | .load(brokerUrl).selectExpr("CAST(payload AS STRING)").as(Encoders.STRING()); 75 | 76 | // Split the lines into words 77 | Dataset words = lines.flatMap(new FlatMapFunction() { 78 | @Override 79 | public Iterator call(String x) { 80 | return Arrays.asList(x.split(" ")).iterator(); 81 | } 82 | }, Encoders.STRING()); 83 | 84 | // Generate running word count 85 | Dataset wordCounts = words.groupBy("value").count(); 86 | 87 | // Start running the query that prints the running counts to the console 88 | StreamingQuery query = wordCounts.writeStream() 89 | .outputMode("complete") 90 | .format("console") 91 | .start(); 92 | 93 | query.awaitTermination(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/mqtt/MQTTSinkWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.examples.sql.streaming.mqtt 19 | 20 | import java.io.File 21 | 22 | import org.apache.commons.io.FileUtils 23 | 24 | import org.apache.spark.sql.SparkSession 25 | 26 | /** 27 | * Counts words in UTF-8 encoded, '\n' delimited text received from local socket 28 | * and publishes results on MQTT topic. 29 | * 30 | * Usage: MQTTSinkWordCount 31 | * represents local network port on which program is listening for input. 32 | * and describe the MQTT server that structured streaming 33 | * would connect and send data. 34 | * 35 | * To run example on your local machine, a MQTT Server should be up and running. 
36 | * Linux users may leverage 'nc -lk ' to listen on local port and wait 37 | * for Spark socket connection. 38 | */ 39 | object MQTTSinkWordCount { 40 | def main(args: Array[String]) { 41 | if (args.length < 2) { 42 | // scalastyle:off 43 | System.err.println("Usage: MQTTSinkWordCount ") 44 | // scalastyle:on 45 | System.exit(1) 46 | } 47 | 48 | val checkpointDir = System.getProperty("java.io.tmpdir") + "/mqtt-example/" 49 | // Remove checkpoint directory. 50 | FileUtils.deleteDirectory(new File(checkpointDir)) 51 | 52 | val port = args(0) 53 | val brokerUrl = args(1) 54 | val topic = args(2) 55 | 56 | val spark = SparkSession.builder 57 | .appName("MQTTSinkWordCount").master("local[4]") 58 | .getOrCreate() 59 | 60 | import spark.implicits._ 61 | 62 | // Create DataFrame representing the stream of input lines from local network socket. 63 | val lines = spark.readStream 64 | .format("socket") 65 | .option("host", "localhost").option("port", port) 66 | .load().select("value").as[String] 67 | 68 | // Split the lines into words. 69 | val words = lines.flatMap(_.split(" ")) 70 | 71 | // Generate running word count. 72 | val wordCounts = words.groupBy("value").count() 73 | 74 | // Start publishing the counts to MQTT server. 75 | val query = wordCounts.writeStream 76 | .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider") 77 | .option("checkpointLocation", checkpointDir) 78 | .outputMode("complete") 79 | .option("topic", topic) 80 | .option("localStorage", checkpointDir) 81 | .start(brokerUrl) 82 | 83 | query.awaitTermination() 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/mqtt/MQTTStreamWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.examples.sql.streaming.mqtt 19 | 20 | import java.sql.Timestamp 21 | 22 | import org.apache.spark.sql.SparkSession 23 | 24 | /** 25 | * Counts words in UTF8 encoded, '\n' delimited text received from MQTT Server. 26 | * 27 | * Usage: MQTTStreamWordCount 28 | * and describe the MQTT server that Structured Streaming 29 | * would connect to receive data. 30 | * 31 | * To run this on your local machine, a MQTT Server should be up and running. 
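The MQTT word-count examples above assume messages are already flowing on the configured topic. A minimal publisher sketch using the Eclipse Paho client that the MQTT modules already depend on (broker URL, topic, and payload are placeholders; any MQTT 3.x publisher would do):

    import org.eclipse.paho.client.mqttv3.{MqttClient, MqttMessage}
    import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence

    // Publish a few space-separated lines so the word-count examples have input.
    val client = new MqttClient("tcp://localhost:1883",
      MqttClient.generateClientId(), new MemoryPersistence())
    client.connect()
    for (i <- 1 to 10) {
      client.publish("test-topic", new MqttMessage(s"hello bahir mqtt $i".getBytes("UTF-8")))
    }
    client.disconnect()
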
32 | * 33 | */ 34 | object MQTTStreamWordCount { 35 | def main(args: Array[String]) { 36 | if (args.length < 2) { 37 | System.err.println("Usage: MQTTStreamWordCount ") // scalastyle:off println 38 | System.exit(1) 39 | } 40 | 41 | val brokerUrl = args(0) 42 | val topic = args(1) 43 | 44 | val spark = SparkSession 45 | .builder 46 | .appName("MQTTStreamWordCount") 47 | .master("local[4]") 48 | .getOrCreate() 49 | 50 | import spark.implicits._ 51 | 52 | // Create DataFrame representing the stream of input lines from connection to mqtt server 53 | val lines = spark.readStream 54 | .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider") 55 | .option("topic", topic).option("persistence", "memory") 56 | .load(brokerUrl).selectExpr("CAST(payload AS STRING)").as[String] 57 | 58 | // Split the lines into words 59 | val words = lines.flatMap(_.split(" ")) 60 | 61 | // Generate running word count 62 | val wordCounts = words.groupBy("value").count() 63 | 64 | // Start running the query that prints the running counts to the console 65 | val query = wordCounts.writeStream 66 | .outputMode("complete") 67 | .format("console") 68 | .start() 69 | 70 | query.awaitTermination() 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/main/assembly/assembly.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | test-jar-with-dependencies 19 | 20 | jar 21 | 22 | false 23 | 24 | 25 | 26 | ${project.build.directory}/scala-${scala.binary.version}/test-classes 27 | 28 | 29 | 30 | 31 | 32 | 33 | true 34 | test 35 | true 36 | 37 | org.apache.hadoop:*:jar 38 | org.apache.zookeeper:*:jar 39 | org.apache.avro:*:jar 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider 19 | org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider 20 | org.apache.spark.sql.mqtt.HDFSMQTTSourceProvider -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/main/scala/org/apache/bahir/sql/streaming/mqtt/LongOffset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.bahir.sql.streaming.mqtt 19 | 20 | import org.apache.spark.sql.execution.streaming.Offset 21 | import org.apache.spark.sql.execution.streaming.SerializedOffset 22 | import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2} 23 | 24 | /** 25 | * A simple offset for sources that produce a single linear stream of data. 26 | */ 27 | case class LongOffset(offset: Long) extends OffsetV2 { 28 | 29 | override val json = offset.toString 30 | 31 | def +(increment: Long): LongOffset = new LongOffset(offset + increment) 32 | def -(decrement: Long): LongOffset = new LongOffset(offset - decrement) 33 | } 34 | 35 | object LongOffset { 36 | 37 | /** 38 | * LongOffset factory from serialized offset. 39 | * 40 | * @return new LongOffset 41 | */ 42 | def apply(offset: SerializedOffset) : LongOffset = new LongOffset(offset.json.toLong) 43 | 44 | /** 45 | * Convert generic Offset to LongOffset if possible. 46 | * 47 | * @return converted LongOffset 48 | */ 49 | def convert(offset: Offset): Option[LongOffset] = offset match { 50 | case lo: LongOffset => Some(lo) 51 | case so: SerializedOffset => Some(LongOffset(so)) 52 | case _ => None 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/main/scala/org/apache/spark/sql/mqtt/HDFSMQTTSourceProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.mqtt 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SQLContext 22 | import org.apache.spark.sql.execution.streaming.Source 23 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | import org.apache.bahir.sql.streaming.mqtt.{MQTTStreamConstants, MQTTUtils} 27 | 28 | /** 29 | * The provider class for creating MQTT source. 
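LongOffset above is the entire offset bookkeeping for the MQTT source: an offset is serialized as its decimal string and recovered through convert. A small round-trip sketch using only the members defined above (the values are arbitrary):

    import org.apache.spark.sql.execution.streaming.SerializedOffset

    val committed = LongOffset(41) + 1          // arithmetic helper defined above
    assert(committed.json == "42")              // JSON form is just the decimal string

    // What comes back from the offset log is a SerializedOffset wrapping that string.
    val restored = LongOffset.convert(SerializedOffset("42"))
    assert(restored.contains(LongOffset(42)))
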
30 | * This provider throw IllegalArgumentException if 'brokerUrl' or 'topic' parameter 31 | * is not set in options. 32 | */ 33 | class HDFSMQTTSourceProvider extends StreamSourceProvider with DataSourceRegister with Logging { 34 | 35 | override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType], 36 | providerName: String, parameters: Map[String, String]): (String, StructType) = { 37 | ("hdfs-mqtt", MQTTStreamConstants.SCHEMA_DEFAULT) 38 | } 39 | 40 | override def createSource(sqlContext: SQLContext, metadataPath: String, 41 | schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { 42 | 43 | val parsedResult = MQTTUtils.parseConfigParams(parameters) 44 | 45 | new HdfsBasedMQTTStreamSource( 46 | sqlContext, 47 | metadataPath, 48 | parsedResult._1, // brokerUrl 49 | parsedResult._2, // clientId 50 | parsedResult._3, // topic 51 | parsedResult._5, // mqttConnectionOptions 52 | parsedResult._6, // qos 53 | parsedResult._7, // maxBatchMessageNum 54 | parsedResult._8, // maxBatchMessageSize 55 | parsedResult._9 // maxRetryNum 56 | ) 57 | } 58 | 59 | override def shortName(): String = "hdfs-mqtt" 60 | } 61 | 62 | object HDFSMQTTSourceProvider { 63 | val SEP = "##" 64 | } 65 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/test/bin/test-BAHIR-83.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | set -o pipefail 20 | 21 | for i in `seq 100` ; do 22 | mvn scalatest:test -pl sql-streaming-mqtt -q -Dsuites='*.BasicMQTTSourceSuite' | \ 23 | grep -q "TEST FAILED" && echo "$i: failed" 24 | done 25 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/test/resources/keystore.jks: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/bahir/8b647c8ef80455ba900ef91e08eaf5eafa35c133/sql-streaming-mqtt/src/test/resources/keystore.jks -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
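HDFSMQTTSourceProvider above registers under the short name hdfs-mqtt and, per its scaladoc, requires the brokerUrl and topic parameters to be present in the options; the remaining settings are parsed by MQTTUtils.parseConfigParams. A hedged usage sketch; beyond those two options nothing here is confirmed by the provider itself, and the broker address and topic are placeholders:

    // Assumed wiring for the hdfs-mqtt source; only 'brokerUrl' and 'topic' are
    // documented above as required, everything else falls back to defaults.
    val messages = spark.readStream
      .format("hdfs-mqtt")
      .option("brokerUrl", "tcp://localhost:1883")
      .option("topic", "sensors")
      .load()

    messages.printSchema()   // schema comes from MQTTStreamConstants.SCHEMA_DEFAULT
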
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark_project.jetty=WARN 28 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/test/resources/logging.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | ############################################################ 19 | # Global properties 20 | ############################################################ 21 | 22 | # "handlers" specifies a comma separated list of log Handler 23 | # classes. These handlers will be installed during VM startup. 24 | # Note that these classes must be on the system classpath. 25 | # By default we only configure a ConsoleHandler, which will only 26 | # show messages at the INFO and above levels. 27 | handlers = java.util.logging.ConsoleHandler 28 | 29 | # To also add the FileHandler, use the following line instead. 30 | #handlers = java.util.logging.FileHandler, java.util.logging.ConsoleHandler 31 | 32 | # Default global logging level. 33 | # This specifies which kinds of events are logged across 34 | # all loggers. For any given facility this global level 35 | # can be overriden by a facility specific level 36 | # Note that the ConsoleHandler also has a separate level 37 | # setting to limit messages printed to the console. 38 | .level = INFO 39 | 40 | ############################################################ 41 | # Handler specific properties. 42 | # Describes specific configuration info for Handlers. 43 | ############################################################ 44 | 45 | # Log file output is in target directory. 
46 | java.util.logging.FileHandler.pattern = target/unit-tests-java-%u.log 47 | java.util.logging.FileHandler.limit = 50000 48 | java.util.logging.FileHandler.count = 1 49 | java.util.logging.FileHandler.formatter = java.util.logging.XMLFormatter 50 | 51 | # Limit the message that are printed on the console to WARNING and above. 52 | java.util.logging.ConsoleHandler.level = WARNING 53 | java.util.logging.ConsoleHandler.formatter = java.util.logging.SimpleFormatter 54 | 55 | # Example to customize the SimpleFormatter output format 56 | # to print one-line log message like this: 57 | # : [] 58 | # 59 | # java.util.logging.SimpleFormatter.format=%4$s: %5$s [%1$tc]%n 60 | 61 | ############################################################ 62 | # Facility specific properties. 63 | # Provides extra control for each logger. 64 | ############################################################ 65 | 66 | # [BAHIR-] don't flood build logs with superfluous Parquet INFO messages 67 | # they should be written to a file via FileHandler but they end up in the 68 | # build log anyhow irrespective of the ConsoleHandler log level 69 | # also see https://github.com/Parquet/parquet-mr/issues/425 70 | org.apache.parquet.hadoop.level=SEVERE 71 | -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/test/resources/truststore.jks: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/bahir/8b647c8ef80455ba900ef91e08eaf5eafa35c133/sql-streaming-mqtt/src/test/resources/truststore.jks -------------------------------------------------------------------------------- /sql-streaming-mqtt/src/test/scala/org/apache/bahir/sql/streaming/mqtt/LocalMessageStoreSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.bahir.sql.streaming.mqtt 19 | 20 | import java.io.File 21 | 22 | import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence 23 | import org.scalatest.BeforeAndAfter 24 | 25 | import org.apache.spark.SparkFunSuite 26 | 27 | import org.apache.bahir.utils.FileHelper 28 | 29 | 30 | class LocalMessageStoreSuite extends SparkFunSuite with BeforeAndAfter { 31 | 32 | private val testData = Seq(1, 2, 3, 4, 5, 6) 33 | private val javaSerializer: JavaSerializer = new JavaSerializer() 34 | 35 | private val serializerInstance = javaSerializer 36 | private val tempDir: File = new File(System.getProperty("java.io.tmpdir") + "/mqtt-test2/") 37 | private val persistence: MqttDefaultFilePersistence = 38 | new MqttDefaultFilePersistence(tempDir.getAbsolutePath) 39 | 40 | private val store = new LocalMessageStore(persistence, javaSerializer) 41 | 42 | before { 43 | tempDir.mkdirs() 44 | tempDir.deleteOnExit() 45 | persistence.open("temp", "tcp://dummy-url:0000") 46 | } 47 | 48 | after { 49 | persistence.clear() 50 | persistence.close() 51 | FileHelper.deleteFileQuietly(tempDir) 52 | } 53 | 54 | test("serialize and deserialize") { 55 | val serialized = serializerInstance.serialize(testData) 56 | val deserialized: Seq[Int] = serializerInstance 57 | .deserialize(serialized).asInstanceOf[Seq[Int]] 58 | assert(testData === deserialized) 59 | } 60 | 61 | test("Store and retrieve") { 62 | store.store(1, testData) 63 | val result: Seq[Int] = store.retrieve(1) 64 | assert(testData === result) 65 | } 66 | 67 | test("Max offset stored") { 68 | store.store(1, testData) 69 | store.store(10, testData) 70 | val offset = store.maxProcessedOffset 71 | assert(offset == 10) 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /sql-streaming-sqs/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/sqs/SqsSourceExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.bahir.examples.sql.streaming.sqs 19 | 20 | import scala.util.Random 21 | 22 | import org.apache.spark.sql.SparkSession 23 | 24 | /** 25 | * Example to read files from S3 using SQS Source and write results to Memory Sink 26 | * 27 | * Usage: SqsSourceExample 28 | */ 29 | 30 | object SqsSourceExample { 31 | 32 | def main(args: Array[String]) { 33 | 34 | val randomName = Random.alphanumeric.take(6).mkString("") 35 | val pathName = "path_" + randomName 36 | val queryName = "query_" + randomName 37 | val checkpointDir = s"/checkpoints/$pathName" 38 | val schemaPathString = args(0) 39 | 40 | val spark = SparkSession.builder().appName("SqsExample").getOrCreate() 41 | 42 | val schema = spark.read.json(schemaPathString).schema 43 | 44 | val queueUrl = args(1) 45 | 46 | val fileFormat = args(2) 47 | 48 | val inputDf = spark 49 | .readStream 50 | .format("s3-sqs") 51 | .schema(schema) 52 | .option("sqsUrl", queueUrl) 53 | .option("fileFormat", fileFormat) 54 | .option("sqsFetchIntervalSeconds", "2") 55 | .option("sqsLongPollingWaitTimeSeconds", "5") 56 | .option("maxFilesPerTrigger", "50") 57 | .option("ignoreFileDeletion", "true") 58 | .load() 59 | 60 | val query = inputDf 61 | .writeStream 62 | .queryName(queryName) 63 | .format("memory") 64 | .option("checkpointLocation", checkpointDir) 65 | .start() 66 | 67 | query.awaitTermination() 68 | } 69 | } 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /sql-streaming-sqs/src/main/java/org/apache/spark/sql/streaming/sqs/BasicAWSCredentialsProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
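SqsSourceExample above lands the streamed files in the memory sink, which is only useful for interactive inspection. A hedged variation that keeps the same s3-sqs source options but writes to a durable file sink (the output path is a placeholder; inputDf and checkpointDir are the values built in the example):

    // Same s3-sqs source as above; only the sink differs.
    val query = inputDf
      .writeStream
      .format("parquet")
      .option("path", "s3a://my-bucket/sqs-output/")   // placeholder output location
      .option("checkpointLocation", checkpointDir)
      .start()

    query.awaitTermination()
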
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs; 19 | 20 | import com.amazonaws.AmazonClientException; 21 | import com.amazonaws.auth.AWSCredentialsProvider; 22 | import com.amazonaws.auth.BasicAWSCredentials; 23 | import com.amazonaws.auth.AWSCredentials; 24 | import org.apache.commons.lang.StringUtils; 25 | 26 | public class BasicAWSCredentialsProvider implements AWSCredentialsProvider { 27 | private final String accessKey; 28 | private final String secretKey; 29 | 30 | public BasicAWSCredentialsProvider(String accessKey, String secretKey) { 31 | this.accessKey = accessKey; 32 | this.secretKey = secretKey; 33 | } 34 | 35 | public AWSCredentials getCredentials() { 36 | if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey)) { 37 | return new BasicAWSCredentials(accessKey, secretKey); 38 | } 39 | throw new AmazonClientException( 40 | "Access key or secret key is null"); 41 | } 42 | 43 | public void refresh() {} 44 | 45 | @Override 46 | public String toString() { 47 | return getClass().getSimpleName(); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /sql-streaming-sqs/src/main/java/org/apache/spark/sql/streaming/sqs/InstanceProfileCredentialsProviderWithRetries.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs; 19 | 20 | 21 | import com.amazonaws.AmazonClientException; 22 | import com.amazonaws.auth.AWSCredentials; 23 | import com.amazonaws.auth.InstanceProfileCredentialsProvider; 24 | import org.apache.commons.logging.Log; 25 | import org.apache.commons.logging.LogFactory; 26 | 27 | public class InstanceProfileCredentialsProviderWithRetries 28 | extends InstanceProfileCredentialsProvider { 29 | 30 | private static final Log LOG = LogFactory.getLog( 31 | InstanceProfileCredentialsProviderWithRetries.class); 32 | 33 | public AWSCredentials getCredentials() { 34 | int retries = 10; 35 | int sleep = 500; 36 | while(retries > 0) { 37 | try { 38 | return super.getCredentials(); 39 | } 40 | catch (RuntimeException re) { 41 | LOG.error("Got an exception while fetching credentials " + re); 42 | --retries; 43 | try { 44 | Thread.sleep(sleep); 45 | } catch (InterruptedException ie) { 46 | // Do nothing 47 | } 48 | if (sleep < 10000) { 49 | sleep *= 2; 50 | } 51 | } 52 | catch (Error error) { 53 | LOG.error("Got an exception while fetching credentials " + error); 54 | --retries; 55 | try { 56 | Thread.sleep(sleep); 57 | } catch (InterruptedException ie) { 58 | // Do nothing 59 | } 60 | if (sleep < 10000) { 61 | sleep *= 2; 62 | } 63 | } 64 | } 65 | throw new AmazonClientException("Unable to load credentials."); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /sql-streaming-sqs/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | org.apache.spark.sql.streaming.sqs.SqsSourceProvider -------------------------------------------------------------------------------- /sql-streaming-sqs/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | log4j.rootCategory=WARN, console 19 | 20 | # File appender 21 | log4j.appender.file=org.apache.log4j.FileAppender 22 | log4j.appender.file.append=false 23 | log4j.appender.file.file=target/unit-tests.log 24 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 25 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n 26 | 27 | # Console appender 28 | log4j.appender.console=org.apache.log4j.ConsoleAppender 29 | log4j.appender.console.target=System.out 30 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 31 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 32 | 33 | # Settings to quiet third party logs that are too verbose 34 | log4j.logger.org.sparkproject.jetty=WARN 35 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 36 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 37 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 38 | -------------------------------------------------------------------------------- /sql-streaming-sqs/src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SQLContext 22 | import org.apache.spark.sql.execution.streaming.Source 23 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | class SqsSourceProvider extends DataSourceRegister 27 | with StreamSourceProvider 28 | with Logging { 29 | 30 | override def shortName(): String = "s3-sqs" 31 | 32 | override def sourceSchema(sqlContext: SQLContext, 33 | schema: Option[StructType], 34 | providerName: String, 35 | parameters: Map[String, String]): (String, StructType) = { 36 | 37 | require(schema.isDefined, "Sqs source doesn't support empty schema") 38 | (shortName(), schema.get) 39 | } 40 | 41 | override def createSource(sqlContext: SQLContext, 42 | metadataPath: String, 43 | schema: Option[StructType], 44 | providerName: String, 45 | parameters: Map[String, String]): Source = { 46 | 47 | new SqsSource( 48 | sqlContext.sparkSession, 49 | metadataPath, 50 | parameters, 51 | schema.get) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /sql-streaming-sqs/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.sparkproject.jetty=WARN 28 | -------------------------------------------------------------------------------- /streaming-akka/README.md: -------------------------------------------------------------------------------- 1 | 19 | # Spark Streaming Akka Connector 20 | 21 | A library for reading data from Akka Actors using Spark Streaming. 22 | 23 | ## Linking 24 | 25 | Using SBT: 26 | 27 | libraryDependencies += "org.apache.bahir" %% "spark-streaming-akka" % "{{site.SPARK_VERSION}}" 28 | 29 | Using Maven: 30 | 31 | 32 | org.apache.bahir 33 | spark-streaming-akka_{{site.SCALA_BINARY_VERSION}} 34 | {{site.SPARK_VERSION}} 35 | 36 | 37 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option. 
38 | For example, to include it when starting the spark shell: 39 | 40 | $ bin/spark-shell --packages org.apache.bahir:spark-streaming-akka_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION}} 41 | 42 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath. 43 | The `--packages` argument can also be used with `bin/spark-submit`. 44 | 45 | This library is cross-published for Scala 2.11 and Scala 2.12, so users should replace the proper Scala version in the commands listed above. 46 | 47 | ## Examples 48 | 49 | DStreams can be created with data streams received through Akka actors by using `AkkaUtils.createStream(ssc, actorProps, actor-name)`. 50 | 51 | ### Scala API 52 | 53 | You need to extend `ActorReceiver` so as to store received data into Spark using `store(...)` methods. The supervisor strategy of 54 | this actor can be configured to handle failures, etc. 55 | 56 | class CustomActor extends ActorReceiver { 57 | def receive = { 58 | case data: String => store(data) 59 | } 60 | } 61 | 62 | // A new input stream can be created with this custom actor as 63 | val ssc: StreamingContext = ... 64 | val lines = AkkaUtils.createStream[String](ssc, Props[CustomActor](), "CustomReceiver") 65 | 66 | ### Java API 67 | 68 | You need to extend `JavaActorReceiver` so as to store received data into Spark using `store(...)` methods. The supervisor strategy of 69 | this actor can be configured to handle failures, etc. 70 | 71 | class CustomActor extends JavaActorReceiver { 72 | @Override 73 | public void onReceive(Object msg) throws Exception { 74 | store((String) msg); 75 | } 76 | } 77 | 78 | // A new input stream can be created with this custom actor as 79 | JavaStreamingContext jssc = ...; 80 | JavaDStream lines = AkkaUtils.createStream(jssc, Props.create(CustomActor.class), "CustomReceiver"); 81 | 82 | See end-to-end examples at [Akka Examples](https://github.com/apache/bahir/tree/master/streaming-akka/examples) 83 | -------------------------------------------------------------------------------- /streaming-akka/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 4.0.0 21 | 22 | org.apache.bahir 23 | bahir-parent_2.12 24 | 3.0.0-SNAPSHOT 25 | ../pom.xml 26 | 27 | 28 | org.apache.bahir 29 | spark-streaming-akka_2.12 30 | 31 | streaming-akka 32 | 33 | jar 34 | Apache Bahir - Spark Streaming Akka 35 | http://bahir.apache.org/ 36 | 37 | 38 | 39 | org.apache.bahir 40 | bahir-common_${scala.binary.version} 41 | ${project.version} 42 | test-jar 43 | test 44 | 45 | 46 | org.apache.spark 47 | spark-tags_${scala.binary.version} 48 | 49 | 50 | org.apache.spark 51 | spark-streaming_${scala.binary.version} 52 | ${spark.version} 53 | provided 54 | 55 | 56 | org.apache.spark 57 | spark-core_${scala.binary.version} 58 | ${spark.version} 59 | test-jar 60 | test 61 | 62 | 63 | ${akka.group} 64 | akka-actor_${scala.binary.version} 65 | ${akka.version} 66 | 67 | 68 | ${akka.group} 69 | akka-remote_${scala.binary.version} 70 | ${akka.version} 71 | 72 | 73 | 74 | target/scala-${scala.binary.version}/classes 75 | target/scala-${scala.binary.version}/test-classes 76 | 77 | 78 | org.apache.maven.plugins 79 | maven-source-plugin 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /streaming-akka/src/test/java/org/apache/spark/streaming/akka/JavaAkkaUtilsSuite.java: -------------------------------------------------------------------------------- 1 | 
/* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.streaming.akka; 19 | 20 | import akka.actor.ActorSystem; 21 | import akka.actor.Props; 22 | import akka.actor.SupervisorStrategy; 23 | import akka.util.Timeout; 24 | import org.junit.Test; 25 | 26 | import org.apache.spark.api.java.function.Function0; 27 | import org.apache.spark.storage.StorageLevel; 28 | import org.apache.spark.streaming.LocalJavaStreamingContext; 29 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 30 | 31 | import java.util.concurrent.TimeUnit; 32 | 33 | public class JavaAkkaUtilsSuite extends LocalJavaStreamingContext { 34 | @Test 35 | public void testAkkaUtils() { 36 | // tests the API, does not actually test data receiving 37 | JavaReceiverInputDStream test1 = AkkaUtils.createStream( 38 | ssc, Props.create(JavaTestActor.class), "test" 39 | ); 40 | JavaReceiverInputDStream test2 = AkkaUtils.createStream( 41 | ssc, Props.create(JavaTestActor.class), "test", 42 | StorageLevel.MEMORY_AND_DISK_SER_2() 43 | ); 44 | JavaReceiverInputDStream test3 = AkkaUtils.createStream( 45 | ssc, Props.create(JavaTestActor.class), "test", 46 | StorageLevel.MEMORY_AND_DISK_SER_2(), new ActorSystemCreatorForTest(), 47 | SupervisorStrategy.defaultStrategy() 48 | ); 49 | } 50 | } 51 | 52 | class ActorSystemCreatorForTest implements Function0 { 53 | @Override 54 | public ActorSystem call() { 55 | return null; 56 | } 57 | } 58 | 59 | class JavaTestActor extends JavaActorReceiver { 60 | @Override 61 | public void onReceive(Object message) throws Exception { 62 | store((String) message); 63 | store((String) message, new Timeout(1000, TimeUnit.MILLISECONDS)); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /streaming-akka/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
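The README and the JavaAkkaUtilsSuite above only exercise the AkkaUtils.createStream signatures in isolation. A short end-to-end sketch that combines the README's CustomActor with a word count; the batch interval, actor name, and app name are arbitrary:

    import akka.actor.Props
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.akka.{ActorReceiver, AkkaUtils}

    class CustomActor extends ActorReceiver {
      def receive: Receive = {
        case data: String => store(data)   // hand every received message to Spark
      }
    }

    object AkkaWordCount {
      def main(args: Array[String]): Unit = {
        val ssc = new StreamingContext("local[2]", "AkkaWordCount", Seconds(2))
        val lines = AkkaUtils.createStream[String](ssc, Props[CustomActor](), "CustomReceiver")
        lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
        ssc.start()
        ssc.awaitTermination()
      }
    }
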
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark-project.jetty=WARN 28 | 29 | -------------------------------------------------------------------------------- /streaming-akka/src/test/scala/org/apache/spark/streaming/akka/AkkaUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.streaming.akka 19 | 20 | import scala.concurrent.duration._ 21 | 22 | import akka.actor.{Props, SupervisorStrategy} 23 | 24 | import org.apache.spark.SparkFunSuite 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.streaming.{Seconds, StreamingContext} 27 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 28 | 29 | class AkkaUtilsSuite extends SparkFunSuite { 30 | 31 | test("createStream") { 32 | val ssc: StreamingContext = new StreamingContext("local[2]", "test", Seconds(1000)) 33 | try { 34 | // tests the API, does not actually test data receiving 35 | val test1: ReceiverInputDStream[String] = AkkaUtils.createStream( 36 | ssc, Props[TestActor](), "test") 37 | val test2: ReceiverInputDStream[String] = AkkaUtils.createStream( 38 | ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2) 39 | val test3: ReceiverInputDStream[String] = AkkaUtils.createStream( 40 | ssc, 41 | Props[TestActor](), 42 | "test", 43 | StorageLevel.MEMORY_AND_DISK_SER_2, 44 | supervisorStrategy = SupervisorStrategy.defaultStrategy) 45 | val test4: ReceiverInputDStream[String] = AkkaUtils.createStream( 46 | ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null) 47 | val test5: ReceiverInputDStream[String] = AkkaUtils.createStream( 48 | ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null) 49 | val test6: ReceiverInputDStream[String] = AkkaUtils.createStream( 50 | ssc, 51 | Props[TestActor](), 52 | "test", 53 | StorageLevel.MEMORY_AND_DISK_SER_2, 54 | () => null, 55 | SupervisorStrategy.defaultStrategy) 56 | } finally { 57 | ssc.stop() 58 | } 59 | } 60 | } 61 | 62 | class TestActor extends ActorReceiver { 63 | override def receive: Receive = 
{ 64 | case m: String => store(m) 65 | case m => store(m, 10.seconds) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /streaming-mqtt/examples/src/main/python/streaming/mqtt_wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | A sample word count over an MQTT stream. 20 | Usage: mqtt_wordcount.py <broker url> <topic> 21 | 22 | To run this on your local machine, you need to set up an MQTT broker and publisher first, 23 | for example Mosquitto (http://mosquitto.org/), an easy-to-install open source MQTT broker. 24 | On Mac OS, Mosquitto can be installed with Homebrew: `$ brew install mosquitto`. 25 | On Ubuntu, Mosquitto can be installed with the command `$ sudo apt-get install mosquitto`. 26 | 27 | Alternatively, the Eclipse Paho project provides a number of clients and utilities for 28 | working with MQTT; see http://www.eclipse.org/paho/#getting-started 29 | 30 | How to run this example locally: 31 | 32 | (1) Start an MQTT message broker/server, e.g.
Mosquitto: 33 | 34 | `$ mosquitto -p 1883` 35 | 36 | (2) Run the publisher: 37 | 38 | `$ bin/run-example \ 39 | org.apache.spark.examples.streaming.mqtt.MQTTPublisher tcp://localhost:1883 foo` 40 | 41 | (3) Run the example: 42 | 43 | `$ bin/run-example \ 44 | streaming-mqtt/examples/src/main/python/streaming/mqtt_wordcount.py tcp://localhost:1883 foo` 45 | """ 46 | 47 | import sys 48 | 49 | from pyspark import SparkContext 50 | from pyspark.streaming import StreamingContext 51 | from mqtt import MQTTUtils 52 | 53 | if __name__ == "__main__": 54 | if len(sys.argv) != 3: 55 | print("Usage: mqtt_wordcount.py <broker url> <topic>", file=sys.stderr) 56 | sys.exit(-1) 57 | 58 | sc = SparkContext(appName="PythonStreamingMQTTWordCount") 59 | ssc = StreamingContext(sc, 1) 60 | 61 | brokerUrl = sys.argv[1] 62 | topic = sys.argv[2] 63 | 64 | lines = MQTTUtils.createStream(ssc, brokerUrl, topic) 65 | counts = lines.flatMap(lambda line: line.split(" ")) \ 66 | .map(lambda word: (word, 1)) \ 67 | .reduceByKey(lambda a, b: a+b) 68 | counts.pprint() 69 | 70 | ssc.start() 71 | ssc.awaitTermination() 72 | -------------------------------------------------------------------------------- /streaming-mqtt/src/main/assembly/assembly.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | test-jar-with-dependencies 19 | 20 | jar 21 | 22 | false 23 | 24 | 25 | 26 | ${project.build.directory}/scala-${scala.binary.version}/test-classes 27 | 28 | 29 | 30 | 31 | 32 | 33 | true 34 | test 35 | true 36 | 37 | org.apache.hadoop:*:jar 38 | org.apache.zookeeper:*:jar 39 | org.apache.avro:*:jar 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /streaming-mqtt/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark-project.jetty=WARN 28 | 29 | -------------------------------------------------------------------------------- /streaming-pubnub/examples/src/main/scala/org/apache/spark/examples/streaming/pubnub/PubNubWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming.pubnub 19 | 20 | import com.google.gson.JsonParser 21 | import com.pubnub.api.PNConfiguration 22 | import com.pubnub.api.enums.PNReconnectionPolicy 23 | 24 | import org.apache.spark.SparkConf 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.streaming.Milliseconds 27 | import org.apache.spark.streaming.StreamingContext 28 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 29 | import org.apache.spark.streaming.pubnub.{PubNubUtils, SparkPubNubMessage} 30 | 31 | /** 32 | * Consumes messages from a PubNub channel and calculates word count. 33 | * For demo purpose, login to PubNub account and produce messages using Debug Console. 
34 | * Expected message format: {"text": "Hello, World!"} 35 | * 36 | * Usage: PubNubWordCount 37 | * subscribe key 38 | * channel 39 | * aggregation period in milliseconds 40 | * 41 | * Example: 42 | * $ bin/run-example \ 43 | * org.apache.spark.examples.streaming.pubnub.PubNubWordCount \ 44 | * sub-c-2d245192-ee8d-11e8-b4c3-46cd67be4fbd my-channel 60000 45 | */ 46 | object PubNubWordCount { 47 | def main(args: Array[String]): Unit = { 48 | if (args.length != 3) { 49 | // scalastyle:off println 50 | System.err.println( 51 | """ 52 | |Usage: PubNubWordCount 53 | | 54 | | subscribe key 55 | | channel 56 | | aggregation period in milliseconds 57 | | 58 | """.stripMargin 59 | ) 60 | // scalastyle:on 61 | System.exit(1) 62 | } 63 | 64 | val Seq(subscribeKey, channel, aggregationPeriod) = args.toSeq 65 | 66 | val sparkConf = new SparkConf().setAppName("PubNubWordCount").setMaster("local[2]") 67 | val ssc = new StreamingContext(sparkConf, Milliseconds(aggregationPeriod.toLong)) 68 | 69 | val config = new PNConfiguration 70 | config.setSubscribeKey(subscribeKey) 71 | config.setSecure(true) 72 | config.setReconnectionPolicy(PNReconnectionPolicy.LINEAR) 73 | 74 | val pubNubStream: ReceiverInputDStream[SparkPubNubMessage] = PubNubUtils.createStream( 75 | ssc, config, Seq(channel), Seq(), None, StorageLevel.MEMORY_AND_DISK_SER_2) 76 | 77 | val wordCounts = pubNubStream 78 | .flatMap( 79 | message => new JsonParser().parse(message.getPayload) 80 | .getAsJsonObject.get("text").getAsString.split("\\s") 81 | ) 82 | .map(word => (word, 1)) 83 | .reduceByKey(_ + _) 84 | 85 | wordCounts.print() 86 | 87 | ssc.start() 88 | ssc.awaitTermination() 89 | } 90 | } 91 | 92 | -------------------------------------------------------------------------------- /streaming-pubnub/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 4.0.0 21 | 22 | bahir-parent_2.12 23 | org.apache.bahir 24 | 3.0.0-SNAPSHOT 25 | ../pom.xml 26 | 27 | 28 | spark-streaming-pubnub_2.12 29 | 30 | streaming-pubnub 31 | 32 | jar 33 | Apache Bahir - Spark Streaming PubNub 34 | http://bahir.apache.org/ 35 | 36 | 37 | 38 | org.apache.bahir 39 | bahir-common_${scala.binary.version} 40 | ${project.version} 41 | test-jar 42 | test 43 | 44 | 45 | org.apache.spark 46 | spark-tags_${scala.binary.version} 47 | 48 | 49 | org.apache.spark 50 | spark-streaming_${scala.binary.version} 51 | ${spark.version} 52 | provided 53 | 54 | 55 | com.pubnub 56 | pubnub-gson 57 | 4.21.0 58 | 59 | 60 | org.apache.spark 61 | spark-core_${scala.binary.version} 62 | ${spark.version} 63 | test-jar 64 | test 65 | 66 | 67 | org.scalacheck 68 | scalacheck_${scala.binary.version} 69 | test 70 | 71 | 72 | 73 | 74 | target/scala-${scala.binary.version}/classes 75 | target/scala-${scala.binary.version}/test-classes 76 | 77 | 78 | org.apache.maven.plugins 79 | maven-source-plugin 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /streaming-pubnub/src/main/scala/org/apache/spark/streaming/pubnub/PubNubUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.streaming.pubnub 19 | 20 | import java.util.{Set => JSet} 21 | 22 | import collection.JavaConverters._ 23 | import com.pubnub.api.PNConfiguration 24 | 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.streaming.StreamingContext 27 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream 28 | import org.apache.spark.streaming.api.java.JavaStreamingContext 29 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 30 | 31 | object PubNubUtils { 32 | /** 33 | * Create an input stream that returns messages received from PubNub infrastructure. 34 | * @param ssc Streaming context 35 | * @param configuration PubNub client configuration 36 | * @param channels Sequence of channels to subscribe 37 | * @param channelGroups Sequence of channel groups to subscribe 38 | * @param timeToken Optional point in time to start receiving messages from. 39 | * Leave undefined to get only latest messages. 40 | * @param storageLevel Storage level to use for storing the received objects 41 | * @return Input stream 42 | */ 43 | def createStream( 44 | ssc: StreamingContext, 45 | configuration: PNConfiguration, 46 | channels: Seq[String], 47 | channelGroups: Seq[String], 48 | timeToken: Option[Long] = None, 49 | storageLevel: StorageLevel): ReceiverInputDStream[SparkPubNubMessage] = { 50 | ssc.withNamedScope("PubNub Stream") { 51 | new PubNubInputDStream( 52 | ssc, configuration, channels, channelGroups, timeToken, storageLevel 53 | ) 54 | } 55 | } 56 | 57 | /** 58 | * Create an input stream that returns messages received from PubNub infrastructure. 59 | * @param jssc Java streaming context 60 | * @param configuration PubNub client configuration 61 | * @param channels Set of channels to subscribe 62 | * @param channelGroups Set of channel groups to subscribe 63 | * @param timeToken Optional point in time to start receiving messages from. 64 | * Specify null to get only latest messages. 65 | * @param storageLevel Storage level to use for storing the received objects 66 | * @return Input stream 67 | */ 68 | def createStream( 69 | jssc: JavaStreamingContext, 70 | configuration: PNConfiguration, 71 | channels: JSet[String], 72 | channelGroups: JSet[String], 73 | timeToken: Option[Long], 74 | storageLevel: StorageLevel): JavaReceiverInputDStream[SparkPubNubMessage] = { 75 | createStream( 76 | jssc.ssc, configuration, Seq.empty ++ channels.asScala, 77 | Seq.empty ++ channelGroups.asScala, timeToken, storageLevel 78 | ) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /streaming-pubnub/src/test/java/org/apache/spark/streaming/pubnub/JavaPubNubStreamSuite.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.streaming.pubnub; 19 | 20 | import com.pubnub.api.PNConfiguration; 21 | import org.apache.spark.storage.StorageLevel; 22 | import org.apache.spark.streaming.LocalJavaStreamingContext; 23 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 24 | import org.junit.Test; 25 | 26 | import java.util.HashSet; 27 | 28 | public class JavaPubNubStreamSuite extends LocalJavaStreamingContext { 29 | @Test 30 | public void testPubNubStream() { 31 | // Tests the API compatibility, but does not actually receive any data. 32 | JavaReceiverInputDStream<SparkPubNubMessage> stream = PubNubUtils.createStream( 33 | ssc, new PNConfiguration(), new HashSet<>(), new HashSet<>(), null, 34 | StorageLevel.MEMORY_AND_DISK_SER_2() 35 | ); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /streaming-pubnub/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | 18 | log4j.rootCategory=INFO, console, file 19 | 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.out 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.conversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 24 | 25 | log4j.appender.file=org.apache.log4j.FileAppender 26 | log4j.appender.file.append=true 27 | log4j.appender.file.file=target/unit-tests.log 28 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 29 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 30 | 31 | # Ignore messages below warning level from Jetty, because it's a bit verbose 32 | log4j.logger.org.spark-project.jetty=WARN 33 | 34 | -------------------------------------------------------------------------------- /streaming-pubnub/src/test/scala/org/apache/spark/streaming/pubnub/MessageSerializationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.streaming.pubnub 19 | 20 | import java.io.ByteArrayInputStream 21 | import java.io.ByteArrayOutputStream 22 | import java.io.ObjectInputStream 23 | import java.io.ObjectOutputStream 24 | 25 | import com.google.gson.JsonParser 26 | import com.pubnub.api.models.consumer.pubsub.PNMessageResult 27 | 28 | import org.apache.spark.SparkFunSuite 29 | 30 | class MessageSerializationSuite extends SparkFunSuite { 31 | test("Full example") { 32 | checkMessageSerialization( 33 | "{\"message\":\"Hello, World!\"}", "channel1", 34 | "publisher1", "subscription1", System.currentTimeMillis * 10000 35 | ) 36 | } 37 | 38 | test("Message from channel") { 39 | checkMessageSerialization("{\"message\":\"Hello, World!\"}", "c", "p", null, 13534398158620385L) 40 | } 41 | 42 | test("Message from subscription") { 43 | checkMessageSerialization("{\"message\":\"Hello, World!\"}", null, "p", "s", 13534397812467596L) 44 | } 45 | 46 | def checkMessageSerialization(payload: String, channel: String, 47 | publisher: String, subscription: String, timestamp: Long): Unit = { 48 | val builder = PNMessageResult.builder 49 | .message(if (payload != null) new JsonParser().parse(payload) else null) 50 | .channel(channel) 51 | .publisher(publisher) 52 | .subscription(subscription) 53 | .timetoken(timestamp) 54 | val pubNubMessage = builder.build() 55 | val sparkMessage = new SparkPubNubMessage 56 | sparkMessage.message = pubNubMessage 57 | 58 | // serializer 59 | val byteOutStream = new ByteArrayOutputStream 60 | val outputStream = new ObjectOutputStream(byteOutStream) 61 | outputStream.writeObject(sparkMessage) 62 | outputStream.flush() 63 | outputStream.close() 64 | byteOutStream.close() 65 | val serializedBytes = byteOutStream.toByteArray 66 | 67 | // deserialize 68 | val byteInStream = new ByteArrayInputStream(serializedBytes) 69 | val inputStream = new ObjectInputStream(byteInStream) 70 | val deserializedMessage = inputStream.readObject().asInstanceOf[SparkPubNubMessage] 71 | inputStream.close() 72 | byteInStream.close() 73 | 74 | assert(payload.equals(deserializedMessage.getPayload)) 75 | if (channel != null) { 76 | assert(channel.equals(deserializedMessage.getChannel)) 77 | } else { 78 | assert(deserializedMessage.getChannel == null) 79 | } 80 | if (subscription != null) { 81 | assert(subscription.equals(deserializedMessage.getSubscription)) 82 | } else { 83 | assert(deserializedMessage.getSubscription == null) 84 | } 85 | assert(publisher.equals(deserializedMessage.getPublisher)) 86 | val unixTimestamp = Math.ceil(timestamp / 10000).longValue() 87 | assert(unixTimestamp.equals(deserializedMessage.getTimestamp)) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /streaming-pubsub/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 4.0.0 21 | 22 | bahir-parent_2.12 23 | org.apache.bahir 24 | 3.0.0-SNAPSHOT 25 | ../pom.xml 26 | 27 | 28 | org.apache.bahir 29 | spark-streaming-pubsub_2.12 30 | 31 | streaming-pubsub 32 | 33 | jar 34 | Apache Bahir - Spark Streaming Google PubSub 35 | http://bahir.apache.org/ 36 | 37 | 38 | 39 | org.apache.bahir 40 | bahir-common_${scala.binary.version} 41 | ${project.version} 42 | test-jar 43 | test 44 | 45 | 46 | org.apache.spark 47 | spark-tags_${scala.binary.version} 48 | 49 | 50 | org.apache.spark 51 | spark-streaming_${scala.binary.version} 52 | ${spark.version} 53 | provided 54 | 55 | 56 | com.google.apis 57 | google-api-services-pubsub 58 | 
v1-rev355-1.22.0 59 | 60 | 61 | com.google.cloud.bigdataoss 62 | util 63 | 1.6.0 64 | 65 | 66 | org.apache.spark 67 | spark-core_${scala.binary.version} 68 | ${spark.version} 69 | test-jar 70 | test 71 | 72 | 73 | org.scalacheck 74 | scalacheck_${scala.binary.version} 75 | test 76 | 77 | 78 | com.google.http-client 79 | google-http-client-jackson 80 | 1.22.0 81 | 82 | 83 | 84 | target/scala-${scala.binary.version}/classes 85 | target/scala-${scala.binary.version}/test-classes 86 | 87 | 88 | org.apache.maven.plugins 89 | maven-source-plugin 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /streaming-pubsub/src/test/java/org/apache/spark/streaming/pubsub/JavaPubsubStreamSuite.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.streaming.pubsub; 19 | 20 | import org.apache.spark.storage.StorageLevel; 21 | import org.apache.spark.streaming.LocalJavaStreamingContext; 22 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 23 | import org.junit.Test; 24 | 25 | public class JavaPubsubStreamSuite extends LocalJavaStreamingContext { 26 | @Test 27 | public void testPubsubStream() { 28 | // tests the API, does not actually test data receiving 29 | JavaReceiverInputDStream<SparkPubsubMessage> stream1 = PubsubUtils.createStream( 30 | ssc, "project", "subscription", 31 | new SparkGCPCredentials.Builder().build(), StorageLevel.MEMORY_AND_DISK_SER_2()); 32 | 33 | JavaReceiverInputDStream<SparkPubsubMessage> stream2 = PubsubUtils.createStream( 34 | ssc, "project", "topic", "subscription", 35 | new SparkGCPCredentials.Builder().build(), StorageLevel.MEMORY_AND_DISK_SER_2()); 36 | 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /streaming-pubsub/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark-project.jetty=WARN 28 | 29 | -------------------------------------------------------------------------------- /streaming-twitter/README.md: -------------------------------------------------------------------------------- 1 | 19 | # Spark Streaming Twitter Connector 20 | 21 | A library for reading social data from [twitter](http://twitter.com/) using Spark Streaming. 22 | 23 | ## Linking 24 | 25 | Using SBT: 26 | 27 | libraryDependencies += "org.apache.bahir" %% "spark-streaming-twitter" % "{{site.SPARK_VERSION}}" 28 | 29 | Using Maven: 30 | 31 | <dependency> 32 | <groupId>org.apache.bahir</groupId> 33 | <artifactId>spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}}</artifactId> 34 | <version>{{site.SPARK_VERSION}}</version> 35 | </dependency> 36 | 37 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option. 38 | For example, to include it when starting the Spark shell: 39 | 40 | $ bin/spark-shell --packages org.apache.bahir:spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION}} 41 | 42 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath. 43 | The `--packages` argument can also be used with `bin/spark-submit`. 44 | 45 | This library is cross-published for Scala 2.11 and Scala 2.12, so users should substitute the appropriate Scala version in the commands listed above. 46 | 47 | 48 | ## Examples 49 | 50 | `TwitterUtils` uses Twitter4j to get the public stream of tweets using [Twitter's Streaming API](https://dev.twitter.com/docs/streaming-apis). Authentication information 51 | can be provided by any of the [methods](http://twitter4j.org/en/configuration.html) supported by the Twitter4J library. You can import the `TwitterUtils` class and create a DStream with `TwitterUtils.createStream` as shown below. 52 | 53 | ### Scala API 54 | 55 | import org.apache.spark.streaming.twitter._ 56 | 57 | TwitterUtils.createStream(ssc, None) 58 | 59 | ### Java API 60 | 61 | import org.apache.spark.streaming.twitter.*; 62 | 63 | TwitterUtils.createStream(jssc); 64 | 65 | 66 | You can either get the full public stream or a stream filtered by keywords. 67 | See end-to-end examples at [Twitter Examples](https://github.com/apache/bahir/tree/master/streaming-twitter/examples). 68 | 69 | ## Unit Test 70 | 71 | Executing integration tests requires users to register a custom application at the 72 | [Twitter Developer Portal](https://developer.twitter.com) and obtain private OAuth credentials. 73 | The listing below shows how to run the complete test suite on a local workstation.
74 | 75 | cd streaming-twitter 76 | env ENABLE_TWITTER_TESTS=1 \ 77 | twitter4j.oauth.consumerKey=${customer key} \ 78 | twitter4j.oauth.consumerSecret=${customer secret} \ 79 | twitter4j.oauth.accessToken=${access token} \ 80 | twitter4j.oauth.accessTokenSecret=${access token secret} \ 81 | mvn clean test 82 | -------------------------------------------------------------------------------- /streaming-twitter/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 4.0.0 21 | 22 | org.apache.bahir 23 | bahir-parent_2.12 24 | 3.0.0-SNAPSHOT 25 | ../pom.xml 26 | 27 | 28 | org.apache.bahir 29 | spark-streaming-twitter_2.12 30 | 31 | streaming-twitter 32 | 33 | jar 34 | Apache Bahir - Spark Streaming Twitter 35 | http://bahir.apache.org/ 36 | 37 | 38 | 39 | org.apache.bahir 40 | bahir-common_${scala.binary.version} 41 | ${project.version} 42 | test-jar 43 | test 44 | 45 | 46 | org.apache.spark 47 | spark-tags_${scala.binary.version} 48 | 49 | 50 | org.apache.spark 51 | spark-streaming_${scala.binary.version} 52 | ${spark.version} 53 | provided 54 | 55 | 56 | org.apache.spark 57 | spark-core_${scala.binary.version} 58 | ${spark.version} 59 | test-jar 60 | test 61 | 62 | 63 | org.twitter4j 64 | twitter4j-stream 65 | 4.0.6 66 | 67 | 68 | org.scalacheck 69 | scalacheck_${scala.binary.version} 70 | test 71 | 72 | 73 | com.twitter 74 | algebird-core_${scala.binary.version} 75 | 0.12.4 76 | test 77 | 78 | 79 | 80 | target/scala-${scala.binary.version}/classes 81 | target/scala-${scala.binary.version}/test-classes 82 | 83 | 84 | org.apache.maven.plugins 85 | maven-source-plugin 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /streaming-twitter/src/test/java/org/apache/spark/streaming/twitter/JavaTwitterStreamSuite.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.streaming.twitter; 19 | 20 | import org.junit.Test; 21 | import twitter4j.FilterQuery; 22 | import twitter4j.Status; 23 | import twitter4j.auth.Authorization; 24 | import twitter4j.auth.NullAuthorization; 25 | import org.apache.spark.storage.StorageLevel; 26 | import org.apache.spark.streaming.LocalJavaStreamingContext; 27 | import org.apache.spark.streaming.api.java.JavaDStream; 28 | 29 | public class JavaTwitterStreamSuite extends LocalJavaStreamingContext { 30 | @Test 31 | public void testTwitterStream() { 32 | String[] filters = { "filter1", "filter2" }; 33 | Authorization auth = NullAuthorization.getInstance(); 34 | FilterQuery query = new FilterQuery().language("en,es"); 35 | 36 | // tests the API, does not actually test data receiving 37 | JavaDStream<Status> test1 = TwitterUtils.createStream(ssc); 38 | JavaDStream<Status> test2 = TwitterUtils.createStream(ssc, filters); 39 | JavaDStream<Status> test3 = TwitterUtils.createStream( 40 | ssc, filters, StorageLevel.MEMORY_AND_DISK_SER_2()); 41 | JavaDStream<Status> test4 = TwitterUtils.createStream(ssc, auth); 42 | JavaDStream<Status> test5 = TwitterUtils.createStream(ssc, auth, filters); 43 | JavaDStream<Status> test6 = TwitterUtils.createStream(ssc, 44 | auth, filters, StorageLevel.MEMORY_AND_DISK_SER_2()); 45 | JavaDStream<Status> test7 = TwitterUtils.createFilteredStream(ssc, 46 | auth, query, StorageLevel.MEMORY_AND_DISK_SER_2()); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /streaming-twitter/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark-project.jetty=WARN 28 | 29 | -------------------------------------------------------------------------------- /streaming-zeromq/README.md: -------------------------------------------------------------------------------- 1 | 19 | # Spark Streaming ZeroMQ Connector 20 | 21 | A library for reading data from [ZeroMQ](http://zeromq.org/) using Spark Streaming.
22 | 23 | ## Linking 24 | 25 | Using SBT: 26 | 27 | libraryDependencies += "org.apache.bahir" %% "spark-streaming-zeromq" % "{{site.SPARK_VERSION}}" 28 | 29 | Using Maven: 30 | 31 | <dependency> 32 | <groupId>org.apache.bahir</groupId> 33 | <artifactId>spark-streaming-zeromq_{{site.SCALA_BINARY_VERSION}}</artifactId> 34 | <version>{{site.SPARK_VERSION}}</version> 35 | </dependency> 36 | 37 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option. 38 | For example, to include it when starting the Spark shell: 39 | 40 | $ bin/spark-shell --packages org.apache.bahir:spark-streaming-zeromq_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION}} 41 | 42 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath. 43 | The `--packages` argument can also be used with `bin/spark-submit`. 44 | 45 | This library is cross-published for Scala 2.11 and Scala 2.12, so users should substitute the appropriate Scala version in the commands listed above. 46 | 47 | ## Examples 48 | 49 | Review end-to-end examples at [ZeroMQ Examples](https://github.com/apache/bahir/tree/master/streaming-zeromq/examples). 50 | 51 | ### Scala API 52 | 53 | import org.apache.spark.streaming.zeromq.ZeroMQUtils 54 | 55 | val lines = ZeroMQUtils.createTextStream( 56 | ssc, "tcp://server:5555", true, Seq("my-topic".getBytes) 57 | ) 58 | 59 | ### Java API 60 | 61 | import org.apache.spark.storage.StorageLevel; 62 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 63 | import org.apache.spark.streaming.zeromq.ZeroMQUtils; 64 | 65 | JavaReceiverInputDStream<String> test1 = ZeroMQUtils.createTextJavaStream( 66 | ssc, "tcp://server:5555", true, Arrays.asList("my-topic".getBytes()), 67 | StorageLevel.MEMORY_AND_DISK_SER_2() 68 | ); 69 | -------------------------------------------------------------------------------- /streaming-zeromq/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 4.0.0 21 | 22 | org.apache.bahir 23 | bahir-parent_2.12 24 | 3.0.0-SNAPSHOT 25 | ../pom.xml 26 | 27 | 28 | org.apache.bahir 29 | spark-streaming-zeromq_2.12 30 | 31 | streaming-zeromq 32 | 33 | jar 34 | Apache Bahir - Spark Streaming ZeroMQ 35 | http://bahir.apache.org/ 36 | 37 | 38 | 39 | org.apache.bahir 40 | bahir-common_${scala.binary.version} 41 | ${project.version} 42 | test-jar 43 | test 44 | 45 | 46 | org.apache.spark 47 | spark-tags_${scala.binary.version} 48 | 49 | 50 | org.apache.spark 51 | spark-streaming_${scala.binary.version} 52 | ${spark.version} 53 | provided 54 | 55 | 56 | org.zeromq 57 | jeromq 58 | 0.4.3 59 | 60 | 61 | org.apache.spark 62 | spark-core_${scala.binary.version} 63 | ${spark.version} 64 | test-jar 65 | test 66 | 67 | 68 | org.scalacheck 69 | scalacheck_${scala.binary.version} 70 | test 71 | 72 | 73 | 74 | target/scala-${scala.binary.version}/classes 75 | target/scala-${scala.binary.version}/test-classes 76 | 77 | 78 | org.apache.maven.plugins 79 | maven-source-plugin 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /streaming-zeromq/src/test/java/org/apache/spark/streaming/zeromq/JavaZeroMQStreamSuite.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.streaming.zeromq; 19 | 20 | import org.junit.Test; 21 | 22 | import org.apache.spark.api.java.function.Function; 23 | import org.apache.spark.storage.StorageLevel; 24 | import org.apache.spark.streaming.LocalJavaStreamingContext; 25 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 26 | import zmq.ZMQ; 27 | 28 | import java.util.Arrays; 29 | 30 | public class JavaZeroMQStreamSuite extends LocalJavaStreamingContext { 31 | @Test 32 | public void testZeroMQAPICompatibility() { 33 | // Test the API, but do not exchange any messages. 34 | final String publishUrl = "tcp://localhost:5555"; 35 | final String topic = "topic1"; 36 | final Function<byte[][], Iterable<String>> messageConverter = 37 | new Function<byte[][], Iterable<String>>() { 38 | @Override 39 | public Iterable<String> call(byte[][] bytes) throws Exception { 40 | // Skip topic name and assume that each message contains only one frame. 41 | return Arrays.asList(new String(bytes[1], ZMQ.CHARSET)); 42 | } 43 | }; 44 | 45 | JavaReceiverInputDStream<String> test1 = ZeroMQUtils.createJavaStream( 46 | ssc, publishUrl, true, Arrays.asList(topic.getBytes()), messageConverter, 47 | StorageLevel.MEMORY_AND_DISK_SER_2() 48 | ); 49 | JavaReceiverInputDStream<String> test2 = ZeroMQUtils.createTextJavaStream( 50 | ssc, publishUrl, true, Arrays.asList(topic.getBytes()), 51 | StorageLevel.MEMORY_AND_DISK_SER_2() 52 | ); 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /streaming-zeromq/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | 18 | log4j.rootCategory=INFO, console, file 19 | 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.out 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.conversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 24 | 25 | log4j.appender.file=org.apache.log4j.FileAppender 26 | log4j.appender.file.append=true 27 | log4j.appender.file.file=target/unit-tests.log 28 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 29 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 30 | 31 | # Ignore messages below warning level from Jetty, because it's a bit verbose 32 | log4j.logger.org.spark-project.jetty=WARN 33 | 34 | --------------------------------------------------------------------------------