├── .gitattributes
├── .github
└── workflows
│ ├── distribution-ci.yml
│ └── maven-ci.yml
├── .gitignore
├── LICENSE
├── NOTICE
├── README.md
├── bin
└── run-example
├── common
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── utils
│ │ ├── FileHelper.scala
│ │ ├── Logging.scala
│ │ └── Retry.scala
│ └── test
│ └── java
│ └── org
│ └── apache
│ └── spark
│ ├── ConditionalSparkFunSuite.scala
│ └── streaming
│ └── LocalJavaStreamingContext.java
├── dev
├── change-scala-version.sh
├── checkstyle-license-header.txt
├── checkstyle-suppressions.xml
├── checkstyle.xml
└── release-build.sh
├── distribution
├── pom.xml
└── src
│ └── main
│ └── assembly
│ └── src.xml
├── pom.xml
├── scalastyle-config.xml
├── sql-cloudant
├── README.md
├── examples
│ ├── python
│ │ ├── CloudantApp.py
│ │ ├── CloudantDF.py
│ │ ├── CloudantDFOption.py
│ │ ├── CloudantQuery.py
│ │ └── CloudantQueryDF.py
│ └── src
│ │ └── main
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── examples
│ │ └── sql
│ │ └── cloudant
│ │ ├── CloudantApp.scala
│ │ ├── CloudantDF.scala
│ │ ├── CloudantDFOption.scala
│ │ ├── CloudantStreaming.scala
│ │ └── CloudantStreamingSelector.scala
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── org
│ │ │ └── apache
│ │ │ └── bahir
│ │ │ └── cloudant
│ │ │ └── common
│ │ │ ├── ChangesRow.java
│ │ │ └── ChangesRowScanner.java
│ ├── resources
│ │ ├── application.conf
│ │ └── reference.conf
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── cloudant
│ │ ├── CloudantChangesConfig.scala
│ │ ├── CloudantConfig.scala
│ │ ├── CloudantReceiver.scala
│ │ ├── DefaultSource.scala
│ │ ├── common
│ │ ├── CloudantException.scala
│ │ ├── FilterUtil.scala
│ │ ├── JsonStoreConfigManager.scala
│ │ ├── JsonStoreDataAccess.scala
│ │ ├── JsonStoreRDD.scala
│ │ └── JsonUtil.scala
│ │ └── internal
│ │ └── ChangesReceiver.scala
│ └── test
│ ├── resources
│ ├── json-files
│ │ ├── n_airportcodemapping.json
│ │ ├── n_booking.json
│ │ ├── n_customer.json
│ │ ├── n_customersession.json
│ │ ├── n_flight.json
│ │ └── n_flightsegment.json
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── bahir
│ └── cloudant
│ ├── ClientSparkFunSuite.scala
│ ├── CloudantAllDocsDFSuite.scala
│ ├── CloudantChangesDFSuite.scala
│ ├── CloudantOptionSuite.scala
│ ├── CloudantSparkSQLSuite.scala
│ └── TestUtils.scala
├── sql-streaming-akka
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── apache
│ │ │ └── bahir
│ │ │ └── examples
│ │ │ └── sql
│ │ │ └── streaming
│ │ │ └── akka
│ │ │ └── JavaAkkaStreamWordCount.java
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── examples
│ │ └── sql
│ │ └── streaming
│ │ └── akka
│ │ └── AkkaStreamWordCount.scala
├── pom.xml
└── src
│ ├── main
│ ├── assembly
│ │ └── assembly.xml
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── sql
│ │ └── streaming
│ │ └── akka
│ │ ├── AkkaStreamSource.scala
│ │ ├── LongOffset.scala
│ │ └── MessageStore.scala
│ └── test
│ ├── resources
│ ├── feeder_actor.conf
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── bahir
│ └── sql
│ └── streaming
│ └── akka
│ ├── AkkaStreamSourceSuite.scala
│ └── AkkaTestUtils.scala
├── sql-streaming-jdbc
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── apache
│ │ │ └── bahir
│ │ │ └── examples
│ │ │ └── sql
│ │ │ └── streaming
│ │ │ └── jdbc
│ │ │ └── JavaJdbcSinkDemo.java
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── examples
│ │ └── sql
│ │ └── streaming
│ │ └── jdbc
│ │ └── JdbcSinkDemo.scala
├── pom.xml
└── src
│ ├── main
│ ├── resources
│ │ └── META-INF
│ │ │ └── services
│ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── sql
│ │ └── streaming
│ │ └── jdbc
│ │ ├── JdbcSourceProvider.scala
│ │ ├── JdbcStreamWriter.scala
│ │ └── JdbcUtil.scala
│ └── test
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── sql
│ └── jdbc
│ └── JdbcStreamWriterSuite.scala
├── sql-streaming-mqtt
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── apache
│ │ │ └── bahir
│ │ │ └── examples
│ │ │ └── sql
│ │ │ └── streaming
│ │ │ └── mqtt
│ │ │ ├── JavaMQTTSinkWordCount.java
│ │ │ └── JavaMQTTStreamWordCount.java
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── examples
│ │ └── sql
│ │ └── streaming
│ │ └── mqtt
│ │ ├── MQTTSinkWordCount.scala
│ │ └── MQTTStreamWordCount.scala
├── pom.xml
└── src
│ ├── main
│ ├── assembly
│ │ └── assembly.xml
│ ├── resources
│ │ └── META-INF
│ │ │ └── services
│ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ └── scala
│ │ └── org
│ │ └── apache
│ │ ├── bahir
│ │ └── sql
│ │ │ └── streaming
│ │ │ └── mqtt
│ │ │ ├── CachedMQTTClient.scala
│ │ │ ├── LongOffset.scala
│ │ │ ├── MQTTStreamSink.scala
│ │ │ ├── MQTTStreamSource.scala
│ │ │ ├── MQTTUtils.scala
│ │ │ └── MessageStore.scala
│ │ └── spark
│ │ └── sql
│ │ └── mqtt
│ │ ├── HDFSMQTTSourceProvider.scala
│ │ └── HdfsBasedMQTTStreamSource.scala
│ └── test
│ ├── bin
│ └── test-BAHIR-83.sh
│ ├── resources
│ ├── keystore.jks
│ ├── log4j.properties
│ ├── logging.properties
│ └── truststore.jks
│ └── scala
│ └── org
│ └── apache
│ └── bahir
│ └── sql
│ └── streaming
│ └── mqtt
│ ├── HDFSBasedMQTTStreamSourceSuite.scala
│ ├── LocalMessageStoreSuite.scala
│ ├── MQTTStreamSinkSuite.scala
│ ├── MQTTStreamSourceSuite.scala
│ └── MQTTTestUtils.scala
├── sql-streaming-sqs
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── bahir
│ │ └── examples
│ │ └── sql
│ │ └── streaming
│ │ └── sqs
│ │ └── SqsSourceExample.scala
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── org
│ │ │ └── apache
│ │ │ └── spark
│ │ │ └── sql
│ │ │ └── streaming
│ │ │ └── sqs
│ │ │ ├── BasicAWSCredentialsProvider.java
│ │ │ └── InstanceProfileCredentialsProviderWithRetries.java
│ ├── resources
│ │ ├── META-INF
│ │ │ └── services
│ │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ │ └── log4j.properties
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── sql
│ │ └── streaming
│ │ └── sqs
│ │ ├── SqsClient.scala
│ │ ├── SqsFileCache.scala
│ │ ├── SqsSource.scala
│ │ ├── SqsSourceOptions.scala
│ │ └── SqsSourceProvider.scala
│ └── test
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── sql
│ └── streaming
│ └── sqs
│ └── SqsSourceOptionsSuite.scala
├── streaming-akka
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── apache
│ │ │ └── spark
│ │ │ └── examples
│ │ │ └── streaming
│ │ │ └── akka
│ │ │ └── JavaActorWordCount.java
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── examples
│ │ └── streaming
│ │ └── akka
│ │ └── ActorWordCount.scala
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── akka
│ │ ├── ActorReceiver.scala
│ │ └── AkkaUtils.scala
│ └── test
│ ├── java
│ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── akka
│ │ └── JavaAkkaUtilsSuite.java
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── streaming
│ └── akka
│ ├── AkkaStreamSuite.scala
│ └── AkkaUtilsSuite.scala
├── streaming-mqtt
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ ├── python
│ │ └── streaming
│ │ │ └── mqtt_wordcount.py
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── examples
│ │ └── streaming
│ │ └── mqtt
│ │ └── MQTTWordCount.scala
├── pom.xml
├── python-tests
│ ├── run-python-tests.sh
│ └── tests.py
├── python
│ └── mqtt.py
└── src
│ ├── main
│ ├── assembly
│ │ └── assembly.xml
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── mqtt
│ │ ├── MQTTInputDStream.scala
│ │ ├── MQTTPairedByteArrayInputDStream.scala
│ │ ├── MQTTPairedInputDStream.scala
│ │ └── MQTTUtils.scala
│ └── test
│ ├── java
│ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── mqtt
│ │ └── JavaMQTTStreamSuite.java
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── streaming
│ └── mqtt
│ ├── MQTTStreamSuite.scala
│ └── MQTTTestUtils.scala
├── streaming-pubnub
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── examples
│ │ └── streaming
│ │ └── pubnub
│ │ └── PubNubWordCount.scala
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── pubnub
│ │ ├── PubNubInputDStream.scala
│ │ └── PubNubUtils.scala
│ └── test
│ ├── java
│ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── pubnub
│ │ └── JavaPubNubStreamSuite.java
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── streaming
│ └── pubnub
│ ├── MessageSerializationSuite.scala
│ └── PubNubStreamSuite.scala
├── streaming-pubsub
├── README.md
├── examples
│ └── src
│ │ └── main
│ │ └── scala
│ │ └── org.apache.spark.examples.streaming.pubsub
│ │ └── PubsubWordCount.scala
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── pubsub
│ │ ├── PubsubInputDStream.scala
│ │ ├── PubsubUtils.scala
│ │ └── SparkGCPCredentials.scala
│ └── test
│ ├── java
│ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── pubsub
│ │ └── JavaPubsubStreamSuite.java
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── streaming
│ └── pubsub
│ ├── PubsubStreamSuite.scala
│ ├── PubsubTestUtils.scala
│ └── SparkGCPCredentialsBuilderSuite.scala
├── streaming-twitter
├── README.md
├── examples
│ ├── data
│ │ └── AFINN-111.txt
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── apache
│ │ │ └── spark
│ │ │ └── examples
│ │ │ └── streaming
│ │ │ └── twitter
│ │ │ └── JavaTwitterHashTagJoinSentiments.java
│ │ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── examples
│ │ └── streaming
│ │ └── twitter
│ │ ├── TwitterAlgebirdCMS.scala
│ │ ├── TwitterAlgebirdHLL.scala
│ │ ├── TwitterHashTagJoinSentiments.scala
│ │ ├── TwitterLocations.scala
│ │ └── TwitterPopularTags.scala
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── twitter
│ │ ├── TwitterInputDStream.scala
│ │ └── TwitterUtils.scala
│ └── test
│ ├── java
│ └── org
│ │ └── apache
│ │ └── spark
│ │ └── streaming
│ │ └── twitter
│ │ └── JavaTwitterStreamSuite.java
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── streaming
│ └── twitter
│ └── TwitterStreamSuite.scala
└── streaming-zeromq
├── README.md
├── examples
└── src
│ └── main
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── examples
│ └── streaming
│ └── zeromq
│ └── ZeroMQWordCount.scala
├── pom.xml
└── src
├── main
└── scala
│ └── org
│ └── apache
│ └── spark
│ └── streaming
│ └── zeromq
│ ├── ZeroMQInputDStream.scala
│ └── ZeroMQUtils.scala
└── test
├── java
└── org
│ └── apache
│ └── spark
│ └── streaming
│ └── zeromq
│ └── JavaZeroMQStreamSuite.java
├── resources
└── log4j.properties
└── scala
└── org
└── apache
└── spark
└── streaming
└── zeromq
└── ZeroMQStreamSuite.scala
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set the default behavior to have all files normalized to Unix-style
2 | # line endings upon check-in.
3 | * text=auto
4 |
5 | # Declare files that will always have CRLF line endings on checkout.
6 | *.bat text eol=crlf
7 |
8 | # Denote all files that are truly binary and should not be modified.
9 | *.dll binary
10 | *.exp binary
11 | *.lib binary
12 | *.pdb binary
13 | *.exe binary
14 |
--------------------------------------------------------------------------------
/.github/workflows/distribution-ci.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | name: Distribution
19 |
20 | on:
21 | push:
22 | branches: [ master ]
23 |
24 | jobs:
25 | build:
26 |
27 | runs-on: ubuntu-latest
28 | strategy:
29 | matrix:
30 | java: ['8']
31 |
32 | steps:
33 | - uses: actions/checkout@v2
34 | - name: Set up JDK ${{ matrix.java }}
35 | uses: actions/setup-java@v2
36 | with:
37 | java-version: ${{ matrix.java }}
38 | distribution: 'zulu'
39 | cache: maven
40 |
41 | - name: Build
42 | run: mvn -Pdistribution clean install
43 |
--------------------------------------------------------------------------------
/.github/workflows/maven-ci.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | name: Java CI with Maven
19 |
20 | on:
21 | push:
22 | branches: [ master ]
23 | pull_request:
24 | branches: [ master ]
25 |
26 | jobs:
27 | build:
28 |
29 | runs-on: ubuntu-latest
30 | strategy:
31 | matrix:
32 | java: ['8']
33 | spark-version: ['2.4.8']
34 | scala-version: ['2.12']
35 |
36 | steps:
37 | - uses: actions/checkout@v2
38 | - name: Set up JDK ${{ matrix.java }}
39 | uses: actions/setup-java@v2
40 | with:
41 | java-version: ${{ matrix.java }}
42 | distribution: 'zulu'
43 | cache: maven
44 | - name: Change scala version to ${{ matrix.scala-version }}
45 | run: ./dev/change-scala-version.sh ${{ matrix.scala-version }}
46 | shell: bash
47 | - name: Build with spark ${{ matrix.spark-version }}
48 | run: mvn -B clean verify -Dscala-${{ matrix.scala-version }} -Dspark.version=${{ matrix.spark-version }}
49 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Mac
2 | .DS_Store
3 |
4 | # Eclipse
5 | .classpath
6 | .project
7 | .settings/
8 | target/
9 |
10 | # Intellij
11 | .idea/
12 | .idea_modules/
13 | *.iml
14 | *.iws
15 | *.class
16 | *.log
17 |
18 | # Python
19 | *.pyc
20 |
21 | # Others
22 | .checkstyle
23 | .fbExcludeFilterFile
24 | dependency-reduced-pom.xml
25 | checkpoint
26 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Apache Bahir
2 | Copyright (c) 2016-2017 The Apache Software Foundation.
3 |
4 | This product includes software developed at
5 | The Apache Software Foundation (http://www.apache.org/).
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
19 | # Apache Bahir
20 |
21 | Apache Bahir provides extensions to distributed analytics platforms such as Apache Spark & Apache Flink.
22 |
23 |
24 |
25 | ## Apache Bahir origins
26 |
27 | The initial Bahir source code (see issue [BAHIR-1](https://issues.apache.org/jira/browse/BAHIR-1)) contains the Apache Spark streaming connectors for Akka, MQTT, Twitter and ZeroMQ,
28 | extracted from [Apache Spark revision 8301fad](https://github.com/apache/spark/tree/8301fadd8d269da11e72870b7a889596e3337839)
29 | (before the [deletion of the streaming connectors akka, mqtt, twitter, zeromq](https://issues.apache.org/jira/browse/SPARK-13843)).
30 |
31 | ## Source code structure
32 |
33 | Source code folder structure:
34 | ```
35 | - streaming-akka
36 | - examples/src/main/...
37 | - src/main/...
38 | - streaming-mqtt
39 | - examples
40 | - src
41 | - python
42 | - ...
43 | ```
44 |
45 | ## Building Bahir
46 |
47 | Bahir is built using [Apache Maven](http://maven.apache.org/).
48 | To build Bahir and its example programs, run:
49 |
50 | mvn -DskipTests clean install
51 |
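As a sketch of a narrower build (the module name below is only an illustration; `-pl` and `-am` are standard Maven reactor flags), a single connector and the modules it depends on can be built with:

    mvn -pl streaming-mqtt -am -DskipTests clean install
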
52 | ## Running tests
53 |
54 | Testing first requires [building Bahir](#building-bahir). Once Bahir is built, tests
55 | can be run using:
56 |
57 | mvn test
58 |
59 | ## Example programs
60 |
61 | Each extension currently available in Apache Bahir has an example application located under the "examples" folder.
62 |
63 |
64 | ## Documentation
65 |
66 | Currently, each submodule has its own README.md, with information on example usages and API.
67 |
68 | * [SQL Cloudant](https://github.com/apache/bahir/blob/master/sql-cloudant/README.md)
69 | * [SQL Streaming Akka](https://github.com/apache/bahir/blob/master/sql-streaming-akka/README.md)
70 | * [SQL Streaming JDBC](https://github.com/apache/bahir/blob/master/sql-streaming-jdbc/README.md)
71 | * [SQL Streaming MQTT](https://github.com/apache/bahir/blob/master/sql-streaming-mqtt/README.md)
72 | * [SQL Streaming SQS](https://github.com/apache/bahir/blob/master/sql-streaming-sqs/README.md)
73 | * [Streaming Akka](https://github.com/apache/bahir/blob/master/streaming-akka/README.md)
74 | * [Streaming MQTT](https://github.com/apache/bahir/blob/master/streaming-mqtt/README.md)
75 | * [Streaming PubNub](https://github.com/apache/bahir/blob/master/streaming-pubnub/README.md)
76 | * [Streaming Google Pub/Sub](https://github.com/apache/bahir/blob/master/streaming-pubsub/README.md)
77 | * [Streaming Twitter](https://github.com/apache/bahir/blob/master/streaming-twitter/README.md)
78 | * [Streaming ZeroMQ](https://github.com/apache/bahir/blob/master/streaming-zeromq/README.md)
79 |
80 | Furthermore, to generate scaladocs for each module:
81 |
82 | `$ mvn package`
83 |
84 | Scaladocs are generated in `MODULE_NAME/target/site/scaladocs/index.html`, where `MODULE_NAME` is one of `sql-streaming-mqtt`, `streaming-akka`, `streaming-mqtt`, `streaming-zeromq`, or `streaming-twitter`.
85 |
86 | ## A note about Apache Spark integration
87 |
88 | Currently, each module in Bahir is available through Spark Packages. Please follow the linking subsection in the module-specific [README.md](#documentation) for more details.
89 |
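As an illustration only (the exact artifact coordinates and released versions are documented in each module's README), a published connector can typically be pulled into a Spark session via the `--packages` option:

    spark-shell --packages org.apache.bahir:spark-streaming-mqtt_2.12:<bahir-version>
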
--------------------------------------------------------------------------------
/common/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
20 | 4.0.0
21 |
22 | org.apache.bahir
23 | bahir-parent_2.12
24 | 3.0.0-SNAPSHOT
25 | ../pom.xml
26 |
27 |
28 | org.apache.bahir
29 | bahir-common_2.12
30 |
31 | bahir-common
32 |
33 | jar
34 | Apache Bahir - Common
35 | http://bahir.apache.org/
36 |
37 |
38 |
39 | org.apache.spark
40 | spark-tags_${scala.binary.version}
41 |
42 |
43 | org.apache.spark
44 | spark-streaming_${scala.binary.version}
45 | ${spark.version}
46 | compile
47 | true
48 |
49 |
50 | org.apache.spark
51 | spark-core_${scala.binary.version}
52 | ${spark.version}
53 | compile
54 |
55 |
56 | org.apache.spark
57 | spark-core_${scala.binary.version}
58 | ${spark.version}
59 | test-jar
60 | test
61 |
62 |
63 | org.scalacheck
64 | scalacheck_${scala.binary.version}
65 | test
66 |
67 |
68 |
69 | target/scala-${scala.binary.version}/classes
70 | target/scala-${scala.binary.version}/test-classes
71 |
72 |
73 | org.apache.maven.plugins
74 | maven-source-plugin
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/common/src/main/scala/org/apache/bahir/utils/FileHelper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.utils
19 |
20 | import java.io.{File, IOException}
21 | import java.nio.file.{Files, FileVisitResult, Path, SimpleFileVisitor}
22 | import java.nio.file.attribute.BasicFileAttributes
23 |
24 | object FileHelper extends Logging {
25 | def deleteFileQuietly(file: File): Path = {
26 | Files.walkFileTree(file.toPath, new SimpleFileVisitor[Path]() {
27 | override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
28 | try {
29 | Files.delete(file)
30 | } catch {
31 | case t: Throwable => log.warn("Failed to delete", t)
32 | }
33 | FileVisitResult.CONTINUE
34 | }
35 |
36 | override def postVisitDirectory(dir: Path, exc: IOException): FileVisitResult = {
37 | try {
38 | Files.delete(dir)
39 | } catch {
40 | case t: Throwable => log.warn("Failed to delete", t)
41 | }
42 | FileVisitResult.CONTINUE
43 | }
44 | })
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/common/src/main/scala/org/apache/bahir/utils/Logging.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.utils
19 |
20 | import org.slf4j.LoggerFactory
21 |
22 | trait Logging {
23 | final val log = LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$"))
24 | }
25 |
--------------------------------------------------------------------------------
/common/src/main/scala/org/apache/bahir/utils/Retry.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.utils
19 |
20 | object Retry {
21 | /**
22 | * Retry invocation of given code.
23 | * @param attempts Number of attempts to try executing given code. -1 represents infinity.
24 | * @param pauseMs Number of backoff milliseconds.
25 | * @param retryExceptions Types of exceptions to retry.
26 | * @param code Function to execute.
27 | * @tparam A Type parameter.
28 | * @return Returns result of function execution or exception in case of failure.
29 | */
30 | def apply[A](attempts: Int, pauseMs: Long, retryExceptions: Class[_]*)(code: => A): A = {
31 | var result: Option[A] = None
32 | var success = false
33 | var remaining = attempts
34 | while (!success) {
35 | try {
36 | remaining -= 1
37 | result = Some(code)
38 | success = true
39 | }
40 | catch {
41 | case e: Exception =>
42 | if (retryExceptions.contains(e.getClass) && (attempts == -1 || remaining > 0)) {
43 | Thread.sleep(pauseMs)
44 | } else {
45 | throw e
46 | }
47 | }
48 | }
49 | result.get
50 | }
51 | }
52 |
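// A minimal usage sketch (not part of the original file): retry a flaky call up to
// 3 times, pausing 100 ms between attempts, and only when an IOException is thrown.
// The connect() call below is a hypothetical placeholder for any expression that
// produces the desired value.
//
//   val result = Retry(3, 100, classOf[java.io.IOException]) {
//     connect()
//   }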
--------------------------------------------------------------------------------
/common/src/test/java/org/apache/spark/ConditionalSparkFunSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark
19 |
20 | trait ConditionalSparkFunSuite extends SparkFunSuite {
21 | /**
22 | * Run test if given predicate is satisfied.
23 | * @param testName Test name
24 | * @param condition If satisfied, test will be executed
25 | * @param testBody Test body
26 | */
27 | def testIf(testName: String, condition: () => Boolean)(testBody: => Unit) {
28 | if (condition()) {
29 | test(testName)(testBody)
30 | } else {
31 | ignore(testName)(testBody)
32 | }
33 | }
34 |
35 | /**
36 | * Run given code only if predicate has been satisfied.
37 | * @param condition If satisfied, run code block
38 | * @param body Code block
39 | */
40 | def runIf(condition: () => Boolean)(body: => Unit): Unit = {
41 | if (condition()) {
42 | body
43 | }
44 | }
45 | }
46 |
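// A minimal usage sketch (not part of the original file): guard a test on an
// environment variable so it is reported as ignored when credentials are absent.
// The suite name and variable name are purely illustrative.
//
//   class MyServiceSuite extends ConditionalSparkFunSuite {
//     def credentialsPresent(): Boolean = sys.env.contains("MY_SERVICE_API_KEY")
//
//     testIf("round-trips a record", credentialsPresent _) {
//       // test body runs only when the predicate holds
//     }
//   }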
--------------------------------------------------------------------------------
/common/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming;
19 |
20 | import org.apache.spark.SparkConf;
21 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
22 | import org.junit.After;
23 | import org.junit.Before;
24 |
25 | public abstract class LocalJavaStreamingContext {
26 | protected transient JavaStreamingContext ssc;
27 |
28 | @Before
29 | public void setUp() {
30 | final SparkConf conf = new SparkConf()
31 | .setMaster("local[2]")
32 | .setAppName("test")
33 | .set("spark.streaming.clock", "org.apache.spark.util.ManualClock");
34 | ssc = new JavaStreamingContext(conf, new Duration(1000));
35 | ssc.checkpoint("checkpoint");
36 | }
37 |
38 | @After
39 | public void tearDown() {
40 | ssc.stop();
41 | ssc = null;
42 | }
43 | }
44 |
45 |
--------------------------------------------------------------------------------
/dev/change-scala-version.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | set -e
21 |
22 | VALID_VERSIONS=( 2.11 2.12 )
23 |
24 | usage() {
25 | echo "Usage: $(basename $0) [-h|--help]
26 | where :
27 | -h| --help Display this help text
28 | valid version values : ${VALID_VERSIONS[*]}
29 | " 1>&2
30 | exit 1
31 | }
32 |
33 | if [[ ($# -ne 1) || ( $1 == "--help") || $1 == "-h" ]]; then
34 | usage
35 | fi
36 |
37 | TO_VERSION=$1
38 |
39 | check_scala_version() {
40 | for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done
41 | echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2
42 | exit 1
43 | }
44 |
45 | check_scala_version "$TO_VERSION"
46 |
47 | if [ $TO_VERSION = "2.12" ]; then
48 | FROM_VERSION="2.11"
49 | else
50 | FROM_VERSION="2.12"
51 | fi
52 |
53 | sed_i() {
54 | sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2"
55 | }
56 |
57 | export -f sed_i
58 |
59 | BASEDIR=$(dirname $0)/..
60 | find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \
61 | -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \;
62 |
63 | # Also update <scala.binary.version> in parent POM
64 | # Match any scala binary version to ensure idempotency
65 | sed_i '1,/<scala\.binary\.version>[0-9]*\.[0-9]*</s/<scala\.binary\.version>[0-9]*\.[0-9]*</<scala.binary.version>'$TO_VERSION'</' \
66 |   "$BASEDIR/pom.xml"
67 |
--------------------------------------------------------------------------------
/dev/checkstyle-license-header.txt:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
--------------------------------------------------------------------------------
/dev/checkstyle-suppressions.xml:
--------------------------------------------------------------------------------
1 |
17 |
18 |
21 |
22 |
29 |
30 |
31 |
33 |
34 |
--------------------------------------------------------------------------------
/distribution/src/main/assembly/src.xml:
--------------------------------------------------------------------------------
1 |
17 |
18 | src
19 |
20 |
21 | tar.gz
22 | zip
23 |
24 |
25 | true
26 | apache-bahir-${version}-src
27 |
28 |
29 |
30 | ..
31 |
32 |
33 | **/.*
34 | **/.*/**
35 | **/*.log
36 | **/*.iml
37 | **/conf/*.properties
38 | **/conf/*.xml
39 | **/dependency-reduced-pom.xml
40 | **/scratch_space
41 | **/scratch_space/**/*
42 | **/target
43 | **/target/**/*
44 | **/temp
45 | **/temp/**/*
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/python/CloudantApp.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from pyspark.sql import SparkSession
18 |
19 | spark = SparkSession\
20 | .builder\
21 | .appName("Cloudant Spark SQL Example in Python using temp tables")\
22 | .config("cloudant.host","ACCOUNT.cloudant.com")\
23 | .config("cloudant.username", "USERNAME")\
24 | .config("cloudant.password","PASSWORD")\
25 | .getOrCreate()
26 |
27 |
28 | # ***1. Loading temp table from Cloudant db
29 | spark.sql(" CREATE TEMPORARY TABLE airportTable USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')")
30 | airportData = spark.sql("SELECT _id, airportName FROM airportTable WHERE _id >= 'CAA' AND _id <= 'GAA' ORDER BY _id")
31 | airportData.printSchema()
32 | print ('Total # of rows in airportData: ' + str(airportData.count()))
33 | for code in airportData.collect():
34 | print (code._id)
35 |
36 |
37 | # ***2. Loading temp table from Cloudant search index
38 | print ('About to test org.apache.bahir.cloudant for flight with index')
39 | spark.sql(" CREATE TEMPORARY TABLE flightTable1 USING org.apache.bahir.cloudant OPTIONS ( database 'n_flight', index '_design/view/_search/n_flights')")
40 | flightData = spark.sql("SELECT flightSegmentId, scheduledDepartureTime FROM flightTable1 WHERE flightSegmentId >'AA9' AND flightSegmentId<'AA95'")
41 | flightData.printSchema()
42 | for code in flightData.collect():
43 | print ('Flight {0} on {1}'.format(code.flightSegmentId, code.scheduledDepartureTime))
44 |
45 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/python/CloudantDF.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from pyspark.sql import SparkSession
18 |
19 | # define cloudant related configuration
20 | # set protocol to http if needed, default value=https
21 | # config("cloudant.protocol","http")
22 | spark = SparkSession\
23 | .builder\
24 | .appName("Cloudant Spark SQL Example in Python using dataframes")\
25 | .config("cloudant.host","ACCOUNT.cloudant.com")\
26 | .config("cloudant.username", "USERNAME")\
27 | .config("cloudant.password","PASSWORD")\
28 | .config("jsonstore.rdd.partitions", 8)\
29 | .getOrCreate()
30 |
31 |
32 | # ***1. Loading dataframe from Cloudant db
33 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant")
34 | # In case of doing multiple operations on a dataframe (select, filter etc.)
35 | # you should persist the dataframe.
36 | # Otherwise, every operation on the dataframe will load the same data from Cloudant again.
37 | # Persisting will also speed up computation.
38 | df.cache() # persisting in memory
39 | # alternatively for large dbs to persist in memory & disk:
40 | # from pyspark import StorageLevel
41 | # df.persist(storageLevel = StorageLevel(True, True, False, True, 1))
42 | df.printSchema()
43 | df.filter(df.airportName >= 'Moscow').select("_id",'airportName').show()
44 | df.filter(df._id >= 'CAA').select("_id",'airportName').show()
45 |
46 |
47 | # ***2. Saving a dataframe to Cloudant db
48 | df = spark.read.load(format="org.apache.bahir.cloudant", database="n_flight")
49 | df.printSchema()
50 | df2 = df.filter(df.flightSegmentId=='AA106')\
51 | .select("flightSegmentId", "economyClassBaseCost")
52 | df2.write.save("n_flight2", "org.apache.bahir.cloudant",
53 | bulkSize = "100", createDBOnSave="true")
54 | total = df.filter(df.flightSegmentId >'AA9').select("flightSegmentId",
55 | "scheduledDepartureTime").orderBy(df.flightSegmentId).count()
56 | print ("Total", total, "flights from table")
57 |
58 |
59 | # ***3. Loading dataframe from a Cloudant search index
60 | df = spark.read.load(format="org.apache.bahir.cloudant", database="n_flight",
61 | index="_design/view/_search/n_flights")
62 | df.printSchema()
63 | total = df.filter(df.flightSegmentId >'AA9').select("flightSegmentId",
64 | "scheduledDepartureTime").orderBy(df.flightSegmentId).count()
65 | print ("Total", total, "flights from index")
66 |
67 |
68 | # ***4. Loading dataframe from a Cloudant view
69 | df = spark.read.load(format="org.apache.bahir.cloudant", path="n_flight",
70 | view="_design/view/_view/AA0", schemaSampleSize="20")
71 | # schema for view will always be: _id, key, value
72 | # where value can be a complex field
73 | df.printSchema()
74 | df.show()
75 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/python/CloudantDFOption.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from pyspark.sql import SparkSession
18 |
19 | spark = SparkSession\
20 | .builder\
21 | .appName("Cloudant Spark SQL Example in Python using dataframes with options")\
22 | .getOrCreate()
23 |
24 | cloudant_host = "ACCOUNT.cloudant.com"
25 | cloudant_username = "USERNAME"
26 | cloudant_password = "PASSWORD"
27 |
28 | # ***1. Loading dataframe from Cloudant db
29 | df = spark.read.format("org.apache.bahir.cloudant") \
30 | .option("cloudant.host", cloudant_host) \
31 | .option("cloudant.username", cloudant_username) \
32 | .option("cloudant.password", cloudant_password) \
33 | .load("n_airportcodemapping")
34 | df.cache() # persisting in memory
35 | df.printSchema()
36 | df.filter(df._id >= 'CAA').select("_id",'airportName').show()
37 |
38 |
39 | # ***2.Saving dataframe to Cloudant db
40 | df.filter(df._id >= 'CAA').select("_id",'airportName') \
41 | .write.format("org.apache.bahir.cloudant") \
42 | .option("cloudant.host", cloudant_host) \
43 | .option("cloudant.username", cloudant_username) \
44 | .option("cloudant.password",cloudant_password) \
45 | .option("bulkSize","100") \
46 | .option("createDBOnSave", "true") \
47 | .save("airportcodemapping_df")
48 | df = spark.read.format("org.apache.bahir.cloudant") \
49 | .option("cloudant.host", cloudant_host) \
50 | .option("cloudant.username", cloudant_username) \
51 | .option("cloudant.password", cloudant_password) \
52 | .load("n_flight")
53 | df.printSchema()
54 | total = df.filter(df.flightSegmentId >'AA9') \
55 | .select("flightSegmentId", "scheduledDepartureTime") \
56 | .orderBy(df.flightSegmentId).count()
57 | print ("Total", total, "flights from table")
58 |
59 |
60 | # ***3. Loading dataframe from Cloudant search index
61 | df = spark.read.format("org.apache.bahir.cloudant") \
62 | .option("cloudant.host",cloudant_host) \
63 | .option("cloudant.username",cloudant_username) \
64 | .option("cloudant.password",cloudant_password) \
65 | .option("index","_design/view/_search/n_flights").load("n_flight")
66 | df.printSchema()
67 |
68 | total = df.filter(df.flightSegmentId >'AA9') \
69 | .select("flightSegmentId", "scheduledDepartureTime") \
70 | .orderBy(df.flightSegmentId).count()
71 | print ("Total", total, "flights from index")
72 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/python/CloudantQuery.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import pprint
18 | from pyspark.sql import SparkSession
19 |
20 | # define cloudant related configuration
21 | # set protocol to http if needed, default value=https
22 | # config("cloudant.protocol","http")
23 | spark = SparkSession\
24 | .builder\
25 | .appName("Cloudant Spark SQL Example in Python using query")\
26 | .config("cloudant.host","ACCOUNT.cloudant.com")\
27 | .config("cloudant.username", "USERNAME")\
28 | .config("cloudant.password","PASSWORD")\
29 | .config("jsonstore.rdd.partitions", 8)\
30 | .config("cloudant.useQuery", "true")\
31 | .config("schemaSampleSize",1)\
32 | .getOrCreate()
33 |
34 |
35 | spark.sql(" CREATE TEMPORARY VIEW airportTable1 USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')")
36 | airportData = spark.sql("SELECT _id, airportName FROM airportTable1 WHERE airportName == 'Moscow' ")
37 | airportData.printSchema()
38 | print('Total # of rows in airportData: ' + str(airportData.count()))
39 | airportData.show()
40 |
41 | spark.sql(" CREATE TEMPORARY VIEW airportTable2 USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')")
42 | airportData = spark.sql("SELECT _id, airportName FROM airportTable2 WHERE airportName > 'Moscow' ORDER BY _id")
43 | airportData.printSchema()
44 | print('Total # of rows in airportData: ' + str(airportData.count()))
45 | airportData.show()
46 |
47 | spark.sql(" CREATE TEMPORARY VIEW airportTable3 USING org.apache.bahir.cloudant OPTIONS ( database 'n_airportcodemapping')")
48 | airportData = spark.sql("SELECT _id, airportName FROM airportTable3 WHERE airportName > 'Moscow' AND airportName < 'Sydney' ORDER BY _id")
49 | airportData.printSchema()
50 | print('Total # of rows in airportData: ' + str(airportData.count()))
51 | airportData.show()
52 |
53 | spark.sql(" CREATE TEMPORARY VIEW flight1 USING org.apache.bahir.cloudant OPTIONS ( database 'n_flight')")
54 | flightData = spark.sql("SELECT flightSegmentId, economyClassBaseCost, numFirstClassSeats FROM flight1 WHERE economyClassBaseCost >=200 AND numFirstClassSeats<=10")
55 | flightData.printSchema()
56 | print('Total # of rows in flightData: ' + str(flightData.count()))
57 | flightData.show()
58 |
59 | spark.sql(" CREATE TEMPORARY VIEW flight2 USING org.apache.bahir.cloudant OPTIONS ( database 'n_flight')")
60 | flightData = spark.sql("SELECT flightSegmentId, scheduledDepartureTime, scheduledArrivalTime FROM flight2 WHERE scheduledDepartureTime >='2014-12-15T05:00:00.000Z' AND scheduledArrivalTime <='2014-12-15T11:04:00.000Z'")
61 | flightData.printSchema()
62 | print('Total # of rows in flightData: ' + str(flightData.count()))
63 | flightData.show()
64 |
65 |
66 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/python/CloudantQueryDF.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import pprint
18 | from pyspark.sql import SparkSession
19 |
20 | # define cloudant related configuration
21 | # set protocol to http if needed, default value=https
22 | # config("cloudant.protocol","http")
23 | spark = SparkSession\
24 | .builder\
25 | .appName("Cloudant Spark SQL Example in Python using query")\
26 | .config("cloudant.host","ACCOUNT.cloudant.com")\
27 | .config("cloudant.username", "USERNAME")\
28 | .config("cloudant.password","PASSWORD")\
29 | .config("jsonstore.rdd.partitions", 8)\
30 | .config("cloudant.useQuery", "true")\
31 | .config("schemaSampleSize",1)\
32 | .getOrCreate()
33 |
34 |
35 | # ***0. Loading dataframe from Cloudant db with one String field condition
36 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant")
37 | df.printSchema()
38 | df.filter(df.airportName == 'Moscow').select("_id",'airportName').show()
39 |
40 |
41 | # ***1. Loading dataframe from Cloudant db with one String field condition
42 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant")
43 | df.printSchema()
44 | df.filter(df.airportName > 'Moscow').select("_id",'airportName').show()
45 |
46 | # ***2. Loading dataframe from Cloudant db with two String field condition
47 | df = spark.read.load("n_airportcodemapping", "org.apache.bahir.cloudant")
48 | df.printSchema()
49 | df.filter(df.airportName > 'Moscow').filter(df.airportName < 'Sydney').select("_id",'airportName').show()
50 |
51 | # ***3. Loading dataframe from Cloudant db with two int field condition
52 | df = spark.read.load("n_flight", "org.apache.bahir.cloudant")
53 | df.printSchema()
54 | df.filter(df.economyClassBaseCost >= 200).filter(df.numFirstClassSeats <=10).select('flightSegmentId','scheduledDepartureTime', 'scheduledArrivalTime').show()
55 |
56 | # ***4. Loading dataframe from Cloudant db with two timestamp field condition
57 | df = spark.read.load("n_flight", "org.apache.bahir.cloudant")
58 | df.printSchema()
59 | df.filter(df.scheduledDepartureTime >= "2014-12-15T05:00:00.000Z").filter(df.scheduledArrivalTime <="2014-12-15T11:04:00.000Z").select('flightSegmentId','scheduledDepartureTime', 'scheduledArrivalTime').show()
60 |
61 |
62 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantApp.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.sql.cloudant
19 |
20 | import org.apache.spark.sql.SparkSession
21 |
22 | object CloudantApp {
23 | def main(args: Array[String]) {
24 | val spark = SparkSession
25 | .builder()
26 | .appName("Cloudant Spark SQL Example")
27 | .config("cloudant.host", "ACCOUNT.cloudant.com")
28 | .config("cloudant.username", "USERNAME")
29 | .config("cloudant.password", "PASSWORD")
30 | .getOrCreate()
31 |
32 | // For implicit conversions of Dataframe to RDDs
33 | import spark.implicits._
34 |
35 | // create a temp table from Cloudant db and query it using sql syntax
36 | spark.sql(
37 | s"""
38 | |CREATE TEMPORARY VIEW airportTable
39 | |USING org.apache.bahir.cloudant
40 | |OPTIONS ( database 'n_airportcodemapping')
41 | """.stripMargin)
42 | // create a dataframe
43 | val airportData = spark.sql(
44 | s"""
45 | |SELECT _id, airportName
46 | |FROM airportTable
47 | |WHERE _id >= 'CAA' AND _id <= 'GAA' ORDER BY _id
48 | """.stripMargin)
49 | airportData.printSchema()
50 | println(s"Total # of rows in airportData: " + airportData.count()) // scalastyle:ignore
51 | // convert dataframe to array of Rows, and process each row
52 | airportData.map(t => "code: " + t(0) + ",name:" + t(1)).collect().foreach(println) // scalastyle:ignore
53 |
54 | // create a temp table from Cloudant index and query it using sql syntax
55 | spark.sql(
56 | s"""
57 | |CREATE TEMPORARY VIEW flightTable
58 | |USING org.apache.bahir.cloudant
59 | |OPTIONS (database 'n_flight', index '_design/view/_search/n_flights')
60 | """.stripMargin)
61 | val flightData = spark.sql(
62 | s"""
63 | |SELECT flightSegmentId, scheduledDepartureTime
64 | |FROM flightTable
65 | |WHERE flightSegmentId >'AA9' AND flightSegmentId<'AA95'
66 | """.stripMargin)
67 | flightData.printSchema()
68 | flightData.map(t => "flightSegmentId: " + t(0) + ", scheduledDepartureTime: " + t(1))
69 | .collect().foreach(println) // scalastyle:ignore
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantDF.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.sql.cloudant
19 |
20 | import org.apache.spark.sql.SparkSession
21 |
22 | object CloudantDF{
23 | def main(args: Array[String]) {
24 | val spark = SparkSession
25 | .builder()
26 | .appName("Cloudant Spark SQL Example with Dataframe")
27 | .config("cloudant.host", "ACCOUNT.cloudant.com")
28 | .config("cloudant.username", "USERNAME")
29 | .config("cloudant.password", "PASSWORD")
30 | .config("createDBOnSave", "true") // to create a db on save
31 | .config("jsonstore.rdd.partitions", "20") // using 20 partitions
32 | .getOrCreate()
33 |
34 | // 1. Loading data from Cloudant db
35 | val df = spark.read.format("org.apache.bahir.cloudant").load("n_flight")
36 | // Caching df in memory to speed computations
37 | // and not to retrieve data from cloudant again
38 | df.cache()
39 | df.printSchema()
40 |
41 | // 2. Saving dataframe to Cloudant db
42 | val df2 = df.filter(df("flightSegmentId") === "AA106")
43 | .select("flightSegmentId", "economyClassBaseCost")
44 | df2.show()
45 | df2.write.format("org.apache.bahir.cloudant").save("n_flight2")
46 |
47 | // 3. Loading data from Cloudant search index
48 | val df3 = spark.read.format("org.apache.bahir.cloudant")
49 | .option("index", "_design/view/_search/n_flights").load("n_flight")
50 | val total = df3.filter(df3("flightSegmentId") >"AA9")
51 | .select("flightSegmentId", "scheduledDepartureTime")
52 | .orderBy(df3("flightSegmentId")).count()
53 | println(s"Total $total flights from index") // scalastyle:ignore
54 |
55 | // 4. Loading data from view
56 | val df4 = spark.read.format("org.apache.bahir.cloudant")
57 | .option("view", "_design/view/_view/AA0").load("n_flight")
58 | df4.printSchema()
59 | df4.show()
60 |
61 | // 5. Loading data from a view with map and reduce
62 | // Loading data from Cloudant db
63 | val df5 = spark.read.format("org.apache.bahir.cloudant")
64 | .option("view", "_design/view/_view/AAreduce?reduce=true")
65 | .load("n_flight")
66 | df5.printSchema()
67 | df5.show()
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantDFOption.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.sql.cloudant
19 |
20 | import org.apache.spark.sql.SparkSession
21 |
22 | object CloudantDFOption{
23 | def main(args: Array[String]) {
24 | val spark = SparkSession
25 | .builder()
26 | .appName("Cloudant Spark SQL Example with Dataframe using Option")
27 | .getOrCreate()
28 |
29 | val cloudantHost = "ACCOUNT.cloudant.com"
30 | val cloudantUser = "USERNAME"
31 | val cloudantPassword = "PASSWORD"
32 |
33 | // 1. Loading data from Cloudant db
34 | val df = spark.read.format("org.apache.bahir.cloudant")
35 | .option("cloudant.host", cloudantHost)
36 | .option("cloudant.username", cloudantUser)
37 | .option("cloudant.password", cloudantPassword)
38 | .load("n_airportcodemapping")
39 |
40 | df.cache()
41 | df.printSchema()
42 | df.filter(df("_id") >= "CAA").select("_id", "airportName").show()
43 |
44 | // 2. Saving dataframe to Cloudant db
45 | // To create a Cloudant db during save set the option createDBOnSave=true
46 | df.filter(df("_id") >= "CAA")
47 | .select("_id", "airportName")
48 | .write.format("org.apache.bahir.cloudant")
49 | .option("cloudant.host", cloudantHost)
50 | .option("cloudant.username", cloudantUser)
51 | .option("cloudant.password", cloudantPassword)
52 | .option("createDBOnSave", "true")
53 | .save("airportcodemapping_df")
54 |
55 | // 3. Loading data from Cloudant search index
56 | val df2 = spark.read.format("org.apache.bahir.cloudant")
57 | .option("index", "_design/view/_search/n_flights")
58 | .option("cloudant.host", cloudantHost)
59 | .option("cloudant.username", cloudantUser)
60 | .option("cloudant.password", cloudantPassword)
61 | .load("n_flight")
62 | val total2 = df2.filter(df2("flightSegmentId") >"AA9")
63 | .select("flightSegmentId", "scheduledDepartureTime")
64 | .orderBy(df2("flightSegmentId"))
65 | .count()
66 | println(s"Total $total2 flights from index")// scalastyle:ignore
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantStreaming.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.spark.examples.sql.cloudant
18 |
19 | import org.apache.spark.rdd.RDD
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
22 |
23 | import org.apache.bahir.cloudant.CloudantReceiver
24 |
25 | object CloudantStreaming {
26 | def main(args: Array[String]) {
27 | val spark = SparkSession.builder()
28 | .appName("Cloudant Spark SQL External Datasource in Scala")
29 | .master("local[*]")
30 | .getOrCreate()
31 |
32 | // Create the context with a 10 seconds batch size
33 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
34 | import spark.implicits._
35 |
36 | val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map(
37 | "cloudant.host" -> "examples.cloudant.com",
38 | "database" -> "sales")))
39 |
40 | changes.foreachRDD((rdd: RDD[String], time: Time) => {
41 |       // Reuse the SparkSession created above to turn the RDD of changes into a DataFrame
42 |
43 |
44 | println(s"========= $time =========")// scalastyle:ignore
45 | // Convert RDD[String] to Dataset[String]
46 | val changesDataFrame = spark.read.json(rdd.toDS())
47 | if (changesDataFrame.schema.nonEmpty) {
48 | changesDataFrame.printSchema()
49 |
50 | var hasDelRecord = false
51 | var hasMonth = false
52 | for (field <- changesDataFrame.schema.fieldNames) {
53 | if ("_deleted".equals(field)) {
54 | hasDelRecord = true
55 | }
56 | if ("month".equals(field)) {
57 | hasMonth = true
58 | }
59 | }
60 | if (hasDelRecord) {
61 | changesDataFrame.filter(changesDataFrame("_deleted")).select("*").show()
62 | }
63 |
64 | if (hasMonth) {
65 | changesDataFrame.filter(changesDataFrame("month") === "May").select("*").show(5)
66 | changesDataFrame.createOrReplaceTempView("sales")
67 | val salesInMayCountsDataFrame =
68 | spark.sql(
69 | s"""
70 | |select rep, amount
71 | |from sales
72 | |where month = "May"
73 | """.stripMargin)
74 | salesInMayCountsDataFrame.show(5)
75 | }
76 | }
77 |
78 | })
79 | ssc.start()
80 | // run streaming for 60 secs
81 | Thread.sleep(60000L)
82 | ssc.stop(true)
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/sql-cloudant/examples/src/main/scala/org/apache/spark/examples/sql/cloudant/CloudantStreamingSelector.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.sql.cloudant
19 |
20 | import java.util.concurrent.atomic.AtomicLong
21 |
22 | import org.apache.spark.rdd.RDD
23 | import org.apache.spark.sql.SparkSession
24 | import org.apache.spark.streaming.{ Seconds, StreamingContext, Time }
25 |
26 | import org.apache.bahir.cloudant.CloudantReceiver
27 |
28 | object CloudantStreamingSelector {
29 | def main(args: Array[String]) {
30 | val spark = SparkSession.builder()
31 | .appName("Cloudant Spark SQL External Datasource in Scala")
32 | .master("local[*]")
33 | .getOrCreate()
34 |
35 | import spark.implicits._
36 |
37 | // Create the context with a 10 seconds batch size
38 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
39 | val curTotalAmount = new AtomicLong(0)
40 | val curSalesCount = new AtomicLong(0)
41 | var batchAmount = 0L
42 |
43 | val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map(
44 | "cloudant.host" -> "examples.cloudant.com",
45 | "database" -> "sales",
46 | "selector" -> "{\"month\":\"May\", \"rep\":\"John\"}")))
47 |
48 | changes.foreachRDD((rdd: RDD[String], time: Time) => {
49 |       // Reuse the SparkSession created above
50 |
51 | println(s"========= $time =========") // scalastyle:ignore
52 | val changesDataFrame = spark.read.json(rdd.toDS())
53 | if (changesDataFrame.schema.nonEmpty) {
54 | changesDataFrame.select("*").show()
55 | batchAmount = changesDataFrame.groupBy().sum("amount").collect()(0).getLong(0)
56 | curSalesCount.getAndAdd(changesDataFrame.count())
57 | curTotalAmount.getAndAdd(batchAmount)
58 | println("Current sales count:" + curSalesCount)// scalastyle:ignore
59 | println("Current total amount:" + curTotalAmount)// scalastyle:ignore
60 | } else {
61 | ssc.stop()
62 | }
63 | })
64 |
65 | ssc.start()
66 | ssc.awaitTermination()
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/sql-cloudant/src/main/java/org/apache/bahir/cloudant/common/ChangesRow.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.bahir.cloudant.common;
18 |
19 | import com.google.gson.JsonElement;
20 | import com.google.gson.JsonObject;
21 |
22 | import java.util.List;
23 |
24 | /**
25 | * Class representing a single row in a changes feed. Structure:
26 | *
27 | * {
28 |  *     "last_seq": 5,
29 |  *     "results": [
30 |  *     ---*** The next item is the ChangesRow ***---
31 | * {
32 | * "changes": [ {"rev": "2-eec205a9d413992850a6e32678485900"}, ... ],
33 | * "deleted": true,
34 | * "id": "deleted",
35 | * "seq": 5,
36 | * "doc": ... structure ...
37 | * }
38 | * ]
39 | * }
40 | */
41 | public class ChangesRow {
42 |
43 | public class Rev {
44 | private String rev;
45 |
46 | public String getRev() {
47 | return rev;
48 | }
49 | }
50 |
51 |     private List<Rev> changes;
52 | public Boolean deleted;
53 | private String id;
54 | private JsonElement seq;
55 | private JsonObject doc;
56 |
57 |     public List<Rev> getChanges() {
58 | return changes;
59 | }
60 |
61 | public String getSeq() {
62 | if (seq.isJsonNull()) {
63 | return null;
64 | } else {
65 | return seq.toString();
66 | }
67 | }
68 |
69 | public String getId() {
70 | return id;
71 | }
72 |
73 | public JsonObject getDoc() {
74 | return doc;
75 | }
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/sql-cloudant/src/main/java/org/apache/bahir/cloudant/common/ChangesRowScanner.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.bahir.cloudant.common;
18 |
19 | import com.google.gson.Gson;
20 | import java.io.BufferedReader;
21 | import java.io.IOException;
22 |
23 | /**
24 | * This scanner will read through a _changes stream until it finds the
25 | * next meaningful row, either a change entry or the closing line with
26 | * the lastSeq and, perhaps, pending changes (for normal/longpoll feeds).
27 | */
28 | public class ChangesRowScanner {
29 |
30 | private static final Gson gson = new Gson();
31 |
32 |     /**
33 |      * Reads up to the next meaningful line from the changes feed and parses it.
34 |      * Works for all styles of changes feed (normal, longpoll, continuous).
35 |      *
36 |      * @return the next ChangesRow, or null once the closing last_seq line
37 |      *         (or the end of the stream) has been reached
38 |      *
39 |      * @throws IOException if there's a problem reading the stream
40 |      */
41 | public static ChangesRow readRowFromReader(BufferedReader changesReader)
42 | throws IOException {
43 |
44 | String line;
45 |
46 | // Read the next line (empty = heartbeat, ignore; null = end of stream)
47 | while ((line = changesReader.readLine()) != null) {
48 | if (line.isEmpty()) {
49 | continue;
50 | }
51 | if (line.startsWith("{\"results\":")) {
52 | // ignore, just the START of the result set in normal/longpoll mode
53 | continue;
54 | } else if (line.startsWith("],")) {
55 | // ignore, just the END of the result set in normal/longpoll mode
56 | continue;
57 | }
58 | break;
59 | }
60 |
61 | if(line != null) {
62 | if (line.startsWith("\"last_seq\":")) {
63 | return null; // End of feed
64 | } else if (line.startsWith("{\"last_seq\":")) {
65 | return null; // End of feed
66 | } else {
67 | if (line.endsWith(",")) {
68 | line = line.substring(0, line.length() - 1);
69 | }
70 | ChangesRow r = gson.fromJson(line, ChangesRow.class);
71 | return r; // not end of feed
72 | }
73 | } else {
74 | return null;
75 | }
76 | }
77 | }
78 |
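A small sketch of driving the scanner over an in-memory normal-mode feed; the sample JSON and document values are hypothetical but follow the structure documented in ChangesRow above.

    import java.io.{BufferedReader, StringReader}

    import org.apache.bahir.cloudant.common.ChangesRowScanner

    // One change row wrapped in the normal/longpoll envelope.
    val feed =
      """|{"results":[
         |{"seq":1,"id":"doc1","changes":[{"rev":"1-abc"}],"doc":{"_id":"doc1","amount":7}}
         |],
         |"last_seq":1}""".stripMargin

    val reader = new BufferedReader(new StringReader(feed))
    var row = ChangesRowScanner.readRowFromReader(reader)   // parses the doc1 row
    while (row != null) {
      println(s"${row.getId} -> ${row.getDoc}")             // doc1 -> {"_id":"doc1","amount":7}
      row = ChangesRowScanner.readRowFromReader(reader)     // returns null at "last_seq"
    }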
--------------------------------------------------------------------------------
/sql-cloudant/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | spark-sql {
2 | bulkSize = 200
3 | schemaSampleSize = -1
4 | createDBOnSave = false
5 | jsonstore.rdd = {
6 | partitions = 10
7 | maxInPartition = -1
8 | minInPartition = 10
9 | requestTimeout = 900000
10 | }
11 | cloudant = {
12 | batchInterval = 8
13 | endpoint = "_all_docs"
14 | numberOfRetries = 3
15 | protocol = https
16 | useQuery = false
17 | queryLimit = 25
18 | storageLevel = "MEMORY_ONLY"
19 | timeout = 60000
20 | }
21 | }
22 |
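The spark-sql block above supplies the connector's defaults; they can be overridden at runtime. A minimal sketch, using a placeholder account and only option keys that appear in the bundled examples (cloudant.host, cloudant.username, cloudant.password, jsonstore.rdd.partitions):

    import org.apache.spark.sql.SparkSession

    // Session-level overrides of the defaults shipped in application.conf / reference.conf.
    val spark = SparkSession.builder()
      .appName("Cloudant config override sketch")
      .config("cloudant.host", "ACCOUNT.cloudant.com")   // placeholder account
      .config("cloudant.username", "USERNAME")
      .config("cloudant.password", "PASSWORD")
      .config("jsonstore.rdd.partitions", "20")          // overrides the default of 10
      .getOrCreate()

    // The same keys can also be passed per read/write via .option(...),
    // as the CloudantDFOption example above does.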
--------------------------------------------------------------------------------
/sql-cloudant/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/bahir/8b647c8ef80455ba900ef91e08eaf5eafa35c133/sql-cloudant/src/main/resources/reference.conf
--------------------------------------------------------------------------------
/sql-cloudant/src/main/scala/org/apache/bahir/cloudant/CloudantChangesConfig.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.bahir.cloudant
18 |
19 | import org.apache.spark.storage.StorageLevel
20 |
21 | import org.apache.bahir.cloudant.common.JsonStoreConfigManager
22 |
23 | class CloudantChangesConfig(protocol: String, host: String, dbName: String,
24 | indexName: String = null, viewName: String = null)
25 | (username: String, password: String, partitions: Int,
26 | maxInPartition: Int, minInPartition: Int, requestTimeout: Long,
27 | bulkSize: Int, schemaSampleSize: Int,
28 | createDBOnSave: Boolean, endpoint: String, selector: String,
29 | timeout: Int, storageLevel: StorageLevel, useQuery: Boolean,
30 | queryLimit: Int, batchInterval: Int, numberOfRetries: Int)
31 | extends CloudantConfig(protocol, host, dbName, indexName, viewName)(username, password,
32 | partitions, maxInPartition, minInPartition, requestTimeout, bulkSize, schemaSampleSize,
33 | createDBOnSave, endpoint, useQuery, queryLimit, numberOfRetries) {
34 |
35 | override val defaultIndex: String = endpoint
36 |
37 | def getBatchInterval : Int = {
38 | batchInterval
39 | }
40 |
41 | def getSelector : String = {
42 | if (selector != null && !selector.isEmpty) {
43 | selector
44 | } else {
45 | val version = getClient.serverVersion
46 | if (version.matches("1.*")) {
47 | null
48 | } else {
49 | // Exclude design docs and deleted=true docs
50 | "{ \"_id\": { \"$regex\": \"^(?!_design/)\" }, " +
51 | "\"_deleted\": { \"$exists\": false } }"
52 | }
53 | }
54 | }
55 |
56 | /*
57 | * Storage level when persisting RDDs during streaming.
58 | * See https://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence for
59 | * more details.
60 | * See [[org.apache.spark.storage.StorageLevel]] for all defined storage level options.
61 | */
62 | def getStorageLevelForStreaming : StorageLevel = {
63 | if (storageLevel == null) {
64 | StorageLevel.MEMORY_ONLY
65 | } else {
66 | storageLevel
67 | }
68 | }
69 |
70 | def getContinuousChangesUrl: String = {
71 | var url = dbUrl + "/" + defaultIndex + "?include_docs=true&feed=continuous&heartbeat=3000"
72 | if (getSelector != null) {
73 | url = url + "&filter=_selector"
74 | }
75 | url
76 | }
77 |
78 | def getChangesReceiverUrl: String = {
79 | var url = dbUrl + "/" + defaultIndex + "?include_docs=true&feed=normal" +
80 | "&seq_interval=" + bulkSize + "&timeout=" + timeout
81 | if (getSelector != null) {
82 | url = url + "&filter=_selector"
83 | }
84 | url
85 | }
86 |
87 | // Use _all_docs endpoint for getting the total number of docs
88 | def getTotalUrl: String = {
89 | dbUrl + "/" + JsonStoreConfigManager.ALL_DOCS_INDEX
90 | }
91 | }
92 |
93 | object CloudantChangesConfig {
94 | // Error message from internal _changes receiver
95 | var receiverErrorMsg: String = ""
96 | }
97 |
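To make the two URL builders above concrete, a sketch of the strings they produce. The database name, bulk size and timeout are illustrative assumptions, and defaultIndex is taken to be the _changes endpoint that the receivers configure.

    // Assuming dbUrl = "https://ACCOUNT.cloudant.com/sales", bulkSize = 200, timeout = 60000:
    val receiverUrl =
      "https://ACCOUNT.cloudant.com/sales/_changes" +
        "?include_docs=true&feed=normal&seq_interval=200&timeout=60000"

    val continuousUrl =
      "https://ACCOUNT.cloudant.com/sales/_changes" +
        "?include_docs=true&feed=continuous&heartbeat=3000"

    // When getSelector returns a non-null selector, both builders append "&filter=_selector";
    // the selector itself is sent as the POST body (see the receivers below).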
--------------------------------------------------------------------------------
/sql-cloudant/src/main/scala/org/apache/bahir/cloudant/CloudantReceiver.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.bahir.cloudant
18 |
19 | import java.io.{BufferedReader, InputStreamReader}
20 | import java.util.concurrent.TimeUnit
21 |
22 | import okhttp3._
23 |
24 | import org.apache.spark.SparkConf
25 | import org.apache.spark.storage.StorageLevel
26 | import org.apache.spark.streaming.receiver.Receiver
27 |
28 | import org.apache.bahir.cloudant.common._
29 |
30 | class CloudantReceiver(sparkConf: SparkConf, cloudantParams: Map[String, String])
31 | extends Receiver[String](StorageLevel.MEMORY_AND_DISK) {
32 | // CloudantChangesConfig requires `_changes` endpoint option
33 | lazy val config: CloudantChangesConfig = {
34 | JsonStoreConfigManager.getConfig(sparkConf, cloudantParams
35 | + ("cloudant.endpoint" -> JsonStoreConfigManager.CHANGES_INDEX)
36 | ).asInstanceOf[CloudantChangesConfig]
37 | }
38 |
39 | def onStart() {
40 | // Start the thread that receives data over a connection
41 | new Thread("Cloudant Receiver") {
42 | override def run() { receive() }
43 | }.start()
44 | }
45 |
46 | private def receive(): Unit = {
47 | val okHttpClient: OkHttpClient = new OkHttpClient.Builder()
48 | .connectTimeout(5, TimeUnit.SECONDS)
49 | .readTimeout(60, TimeUnit.SECONDS)
50 | .build
51 | val url = config.getChangesReceiverUrl.toString
52 |
53 | val builder = new Request.Builder().url(url)
54 | if (config.username != null) {
55 | val credential = Credentials.basic(config.username, config.password)
56 | builder.header("Authorization", credential)
57 | }
58 | if(config.getSelector != null) {
59 | val jsonType = MediaType.parse("application/json; charset=utf-8")
60 | val selector = "{\"selector\":" + config.getSelector + "}"
61 | val selectorBody = RequestBody.create(jsonType, selector)
62 | builder.post(selectorBody)
63 | }
64 |
65 | val request = builder.build
66 | val response = okHttpClient.newCall(request).execute
67 | val status_code = response.code
68 |
69 | if (status_code == 200) {
70 | val changesInputStream = response.body.byteStream
71 |       if (changesInputStream != null) {
72 |         val bufferedReader = new BufferedReader(new InputStreamReader(changesInputStream))
73 |         // Read rows explicitly: a Scala assignment evaluates to Unit and cannot be used as the loop condition.
74 |         var json = ChangesRowScanner.readRowFromReader(bufferedReader)
75 |         while (!isStopped() && json != null) {
76 |           if (!json.getDoc.has("_deleted")) store(json.getDoc.toString)
77 |           json = ChangesRowScanner.readRowFromReader(bufferedReader)
78 |         }
79 |       }
80 | } else {
81 | val errorMsg = "Error retrieving _changes feed " + config.getDbname + ": " + status_code
82 | reportError(errorMsg, new CloudantException(errorMsg))
83 | }
84 | }
85 |
86 | def onStop(): Unit = {
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/sql-cloudant/src/main/scala/org/apache/bahir/cloudant/common/CloudantException.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.bahir.cloudant.common
18 |
19 | class CloudantException(msg: String) extends RuntimeException(msg) {
20 | def this(msg: String, cause: Throwable) {
21 | this(msg)
22 | initCause(cause)
23 | }
24 |
25 | def this(cause: Throwable) = {
26 | this(Option(cause).map(_.toString).orNull)
27 | initCause(cause)
28 | }
29 |
30 | def this() = {
31 | this(null: String)
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/sql-cloudant/src/main/scala/org/apache/bahir/cloudant/common/JsonUtil.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.bahir.cloudant.common
18 |
19 | import scala.util.control.Breaks._
20 |
21 | import com.google.gson.{JsonElement, JsonParser}
22 |
23 | object JsonUtil {
24 | def getField(row: JsonElement, field: String) : Option[JsonElement] = {
25 | var path = field.split('.')
26 | var currentValue = row
27 | var finalValue: Option[JsonElement] = None
28 | breakable {
29 | for (i <- path.indices) {
30 | if (currentValue != null && currentValue.isJsonObject) {
31 | val f: Option[JsonElement] =
32 | Option(currentValue.getAsJsonObject.get(path(i)))
33 | f match {
34 | case Some(f2) => currentValue = f2
35 | case None => break
36 | }
37 | if (i == path.length - 1) {
38 | // The leaf node
39 | finalValue = Some(currentValue)
40 | }
41 | }
42 | }
43 | }
44 | finalValue
45 | }
46 |
47 | object JsonConverter {
48 | val parser = new JsonParser
49 | def toJson(value: Any): JsonElement = {
50 | parser.parse(value.toString)
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/sql-cloudant/src/main/scala/org/apache/bahir/cloudant/internal/ChangesReceiver.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.bahir.cloudant.internal
18 |
19 | import java.io.{BufferedReader, InputStreamReader}
20 | import java.util.concurrent.TimeUnit
21 |
22 | import com.google.gson.JsonParser
23 | import okhttp3._
24 |
25 | import org.apache.spark.storage.StorageLevel
26 | import org.apache.spark.streaming.receiver.Receiver
27 |
28 | import org.apache.bahir.cloudant.CloudantChangesConfig
29 | import org.apache.bahir.cloudant.common._
30 |
31 | class ChangesReceiver(config: CloudantChangesConfig)
32 | extends Receiver[String](StorageLevel.MEMORY_AND_DISK) {
33 |
34 | def onStart() {
35 | // Start the thread that receives data over a connection
36 | new Thread("Cloudant Receiver") {
37 | override def run() { receive() }
38 | }.start()
39 | }
40 |
41 | private def receive(): Unit = {
42 | val okHttpClient: OkHttpClient = new OkHttpClient.Builder()
43 | .connectTimeout(5, TimeUnit.SECONDS)
44 | .readTimeout(60, TimeUnit.SECONDS)
45 | .build
46 | val url = config.getChangesReceiverUrl.toString
47 |
48 | val builder = new Request.Builder().url(url)
49 | if (config.username != null) {
50 | val credential = Credentials.basic(config.username, config.password)
51 | builder.header("Authorization", credential)
52 | }
53 | if(config.getSelector != null) {
54 | val jsonType = MediaType.parse("application/json; charset=utf-8")
55 | val selector = "{\"selector\":" + config.getSelector + "}"
56 | val selectorBody = RequestBody.create(jsonType, selector)
57 | builder.post(selectorBody)
58 | }
59 |
60 | val request = builder.build
61 | val response = okHttpClient.newCall(request).execute
62 | val status_code = response.code
63 |
64 | if (status_code == 200) {
65 | val changesInputStream = response.body.byteStream
66 |       if (changesInputStream != null) {
67 |         val bufferedReader = new BufferedReader(new InputStreamReader(changesInputStream))
68 |         // Read rows explicitly: a Scala assignment evaluates to Unit and cannot be used as the loop condition.
69 |         var json = ChangesRowScanner.readRowFromReader(bufferedReader)
70 |         while (!isStopped() && json != null) {
71 |           if (!json.getDoc.has("_deleted")) store(json.getDoc.toString)
72 |           json = ChangesRowScanner.readRowFromReader(bufferedReader)
73 |         }
74 |       }
75 | } else {
76 | val responseAsJson = new JsonParser().parse(response.body.string)
77 | val errorMsg = "Error retrieving _changes feed data from database " + "'" +
78 | config.getDbname + "' with response code " + status_code + ": " + responseAsJson.toString
79 | reportError(errorMsg, new CloudantException(errorMsg))
80 | CloudantChangesConfig.receiverErrorMsg = errorMsg
81 | }
82 | }
83 |
84 | override def onStop(): Unit = {
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/sql-cloudant/src/test/resources/json-files/n_airportcodemapping.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "_id": "BOM",
4 | "airportName": "Mumbai"
5 | },
6 | {
7 | "_id": "CDG",
8 | "airportName": "Paris"
9 | },
10 | {
11 | "_id": "DEL",
12 | "airportName": "Delhi"
13 | },
14 | {
15 | "_id": "FCO",
16 | "airportName": "Rome"
17 | },
18 | {
19 | "_id": "FRA",
20 | "airportName": "Frankfurt"
21 | },
22 | {
23 | "_id": "HKG",
24 | "airportName": "Hong Kong"
25 | },
26 | {
27 | "_id": "IKA",
28 | "airportName": "Tehran"
29 | },
30 | {
31 | "_id": "JFK",
32 | "airportName": "New York"
33 | },
34 | {
35 | "_id": "LHR",
36 | "airportName": "London"
37 | },
38 | {
39 | "_id": "NRT",
40 | "airportName": "Tokyo"
41 | },
42 | {
43 | "_id": "SIN",
44 | "airportName": "Singapore"
45 | },
46 | {
47 | "_id": "SVO",
48 | "airportName": "Moscow"
49 | },
50 | {
51 | "_id": "SYD",
52 | "airportName": "Sydney"
53 | },
54 | {
55 | "_id": "YUL",
56 | "airportName": "Montreal"
57 | }
58 | ]
--------------------------------------------------------------------------------
/sql-cloudant/src/test/resources/json-files/n_booking.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "flightId": "AA93",
4 | "_id": "696869c2-1035-4d4c-8142-86985f5f199e",
5 | "customerId": "uid0@email.com",
6 | "dateOfBooking": "2017-04-12T22:19:45.910Z"
7 | },
8 | {
9 | "language": "javascript",
10 | "views": {
11 |
12 | },
13 | "_id": "_design/view",
14 | "indexes": {
15 | "n_bookings": {
16 | "index": "function(doc){\n index(\"default\", doc._id);\n \t if(doc.customerId){\n \tindex(\"customerId\", doc.customerId, {\"store\": \"yes\"});\n \t }\n}",
17 | "analyzer": "standard"
18 | }
19 | }
20 | },
21 | {
22 | "flightId": "AA330",
23 | "_id": "ccb8fc78-1b29-42ef-bff2-a4a81ae1f807",
24 | "customerId": "uid0@email.com",
25 | "dateOfBooking": "2017-04-12T22:19:46.140Z"
26 | }
27 | ]
--------------------------------------------------------------------------------
/sql-cloudant/src/test/resources/json-files/n_customersession.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "language": "javascript",
4 | "indexes": {
5 | "n_customersessions": {
6 | "index": "function(doc){\n\t index(\"default\", doc._id);\n \t if(doc.customerid){\n \tindex(\"customerid\", doc.customerid, {\"store\": \"yes\"});\n \t }\n}",
7 | "analyzer": "standard"
8 | }
9 | },
10 | "_id": "_design/view",
11 | "views": {
12 |
13 | }
14 | },
15 | {
16 | "customerid": "uid0@email.com",
17 | "lastAccessedTime": "2017-04-12T22:19:45.449Z",
18 | "_id": "a1346fce-2b45-422c-a5d0-1554a47b31e6",
19 | "timeoutTime": "2017-04-13T22:19:45.449Z"
20 | }
21 | ]
--------------------------------------------------------------------------------
/sql-cloudant/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark_project.jetty=WARN
28 |
--------------------------------------------------------------------------------
/sql-cloudant/src/test/scala/org/apache/bahir/cloudant/TestUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.cloudant
19 |
20 | object TestUtils {
21 | // Set CouchDB/Cloudant host, username and password for local testing
22 | private val host = System.getenv("CLOUDANT_HOST")
23 | private val username = System.getenv("CLOUDANT_USER")
24 | private val password = System.getenv("CLOUDANT_PASSWORD")
25 | private val protocol = System.getenv("CLOUDANT_PROTOCOL")
26 |
27 | // List of test databases to create from JSON flat files
28 | val testDatabasesList: List[String] = List(
29 | "n_airportcodemapping",
30 | "n_booking",
31 | "n_customer",
32 | "n_customersession",
33 | "n_flight",
34 | "n_flight2",
35 | "n_flightsegment"
36 | )
37 |
38 | // default value is https for cloudant.com accounts
39 | def getProtocol: String = {
40 | if (protocol != null && !protocol.isEmpty) {
41 | protocol
42 | } else {
43 | "https"
44 | }
45 | }
46 |
47 | def getHost: String = {
48 | if (host != null && !host.isEmpty) {
49 | host
50 | } else {
51 | getUsername + ".cloudant.com"
52 | }
53 | }
54 |
55 | def getUsername: String = {
56 | username
57 | }
58 |
59 | def getPassword: String = {
60 | password
61 | }
62 |
63 | def shouldRunTest(): Boolean = {
64 | val isEnvSet = (username != null && !username.isEmpty) &&
65 | (password != null && !password.isEmpty)
66 | if (isEnvSet) {
67 | // scalastyle:off println
68 | println(
69 | s"""
70 | |Sql-cloudant tests that require Cloudant databases have been enabled by
71 | |the environment variables CLOUDANT_USER and CLOUDANT_PASSWORD.
72 | """.stripMargin)
73 | // scalastyle:on println
74 | }
75 | isEnvSet
76 | }
77 | }
78 |
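A sketch of how these helpers would typically feed the connector options in a locally-run test, assuming the CLOUDANT_* environment variables are exported and a SparkSession named spark is already in scope (both are assumptions of this sketch).

    if (TestUtils.shouldRunTest()) {
      // Only the option keys used elsewhere in this module are assumed here.
      val df = spark.read.format("org.apache.bahir.cloudant")
        .option("cloudant.host", TestUtils.getHost)
        .option("cloudant.username", TestUtils.getUsername)
        .option("cloudant.password", TestUtils.getPassword)
        .load("n_flight")
      df.printSchema()
    }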
--------------------------------------------------------------------------------
/sql-streaming-akka/examples/src/main/java/org/apache/bahir/examples/sql/streaming/akka/JavaAkkaStreamWordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.examples.sql.streaming.akka;
19 |
20 | import org.apache.log4j.Level;
21 | import org.apache.log4j.Logger;
22 | import org.apache.spark.SparkConf;
23 | import org.apache.spark.api.java.function.FlatMapFunction;
24 | import org.apache.spark.sql.Dataset;
25 | import org.apache.spark.sql.Encoders;
26 | import org.apache.spark.sql.Row;
27 | import org.apache.spark.sql.SparkSession;
28 | import org.apache.spark.sql.streaming.StreamingQuery;
29 |
30 | import java.util.Arrays;
31 | import java.util.Iterator;
32 |
33 | /**
34 | * Counts words in UTF8 encoded, '\n' delimited text received from Akka Feeder Actor system.
35 | *
36 |  * Usage: JavaAkkaStreamWordCount <urlOfPublisher>
37 |  *   <urlOfPublisher> provides the uri of the publisher or feeder actor that Structured Streaming
38 |  *   would connect to receive data.
39 | *
40 | * To run this on your local machine, a Feeder Actor System should be up and running.
41 | *
42 | */
43 | public final class JavaAkkaStreamWordCount {
44 |
45 | public static void main(String[] args) throws Exception {
46 | if (args.length < 1) {
47 |       System.err.println("Usage: JavaAkkaStreamWordCount <urlOfPublisher>");
48 | System.exit(1);
49 | }
50 |
51 | if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
52 | Logger.getRootLogger().setLevel(Level.WARN);
53 | }
54 |
55 | String urlOfPublisher = args[0];
56 |
57 | SparkConf sparkConf = new SparkConf().setAppName("JavaAkkaStreamWordCount");
58 |
59 | // check Spark configuration for master URL, set it to local if not configured
60 | if (!sparkConf.contains("spark.master")) {
61 | sparkConf.setMaster("local[4]");
62 | }
63 |
64 | SparkSession spark = SparkSession.builder()
65 | .config(sparkConf)
66 | .getOrCreate();
67 |
68 | // Create DataFrame representing the stream of input lines from connection
69 | // to publisher or feeder actor
70 |     Dataset<String> lines = spark
71 | .readStream()
72 | .format("org.apache.bahir.sql.streaming.akka.AkkaStreamSourceProvider")
73 | .option("urlOfPublisher", urlOfPublisher)
74 | .load().select("value").as(Encoders.STRING());
75 |
76 | // Split the lines into words
77 |     Dataset<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
78 |       @Override
79 |       public Iterator<String> call(String s) throws Exception {
80 | return Arrays.asList(s.split(" ")).iterator();
81 | }
82 | }, Encoders.STRING());
83 |
84 | // Generate running word count
85 |     Dataset<Row> wordCounts = words.groupBy("value").count();
86 |
87 | // Start running the query that prints the running counts to the console
88 | StreamingQuery query = wordCounts.writeStream()
89 | .outputMode("complete")
90 | .format("console")
91 | .start();
92 |
93 | query.awaitTermination();
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/sql-streaming-akka/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/akka/AkkaStreamWordCount.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.examples.sql.streaming.akka
19 |
20 | import java.sql.Timestamp
21 |
22 | import org.apache.spark.sql.SparkSession
23 |
24 | /**
25 | * Counts words in UTF8 encoded, '\n' delimited text received from Akka Feeder Actor system.
26 | *
27 |  * Usage: AkkaStreamWordCount <urlOfPublisher>
28 |  *   <urlOfPublisher> provides the uri of the publisher or feeder actor that Structured Streaming
29 |  *   would connect to receive data.
30 | *
31 | * To run this on your local machine, a Feeder Actor System should be up and running.
32 | *
33 | */
34 | object AkkaStreamWordCount {
35 | def main(args: Array[String]): Unit = {
36 | if (args.length < 1) {
37 |       System.err.println("Usage: AkkaStreamWordCount <urlOfPublisher>") // scalastyle:off println
38 | System.exit(1)
39 | }
40 |
41 | val urlOfPublisher = args(0)
42 |
43 | val spark = SparkSession
44 | .builder()
45 | .appName("AkkaStreamWordCount")
46 | .master("local[4]")
47 | .getOrCreate()
48 |
49 | import spark.implicits._
50 |
51 | // Create DataFrame representing the stream of input lines from connection
52 | // to publisher or feeder actor
53 | val lines = spark.readStream
54 | .format("org.apache.bahir.sql.streaming.akka.AkkaStreamSourceProvider")
55 | .option("urlOfPublisher", urlOfPublisher)
56 | .load().as[(String, Timestamp)]
57 |
58 | // Split the lines into words
59 | val words = lines.map(_._1).flatMap(_.split(" "))
60 |
61 | // Generate running word count
62 | val wordCounts = words.groupBy("value").count()
63 |
64 | // Start running the query that prints the running counts to the console
65 | val query = wordCounts.writeStream
66 | .outputMode("complete")
67 | .format("console")
68 | .start()
69 |
70 | query.awaitTermination()
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/sql-streaming-akka/src/main/assembly/assembly.xml:
--------------------------------------------------------------------------------
1 |
17 |
18 | test-jar-with-dependencies
19 |
20 | jar
21 |
22 | false
23 |
24 |
25 |
26 | ${project.build.directory}/scala-${scala.binary.version}/test-classes
27 |
28 |
29 |
30 |
31 |
32 |
33 | true
34 | test
35 | true
36 |
37 | org.apache.hadoop:*:jar
38 | org.apache.zookeeper:*:jar
39 | org.apache.avro:*:jar
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/sql-streaming-akka/src/main/scala/org/apache/bahir/sql/streaming/akka/LongOffset.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.sql.streaming.akka
19 |
20 | import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
21 | import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2}
22 |
23 | /**
24 | * @note As of 2.3.0, [[org.apache.spark.sql.execution.streaming.LongOffset]]
25 | * hasn't extended v2 Offset yet. Fix version is 3.0.0. Until then
26 | * this is a required class.
27 | * @see SPARK-23092
28 | */
29 | case class LongOffset(offset: Long) extends OffsetV2 {
30 |
31 | override val json = offset.toString
32 |
33 | def +(increment: Long): LongOffset = new LongOffset(offset + increment)
34 | def -(decrement: Long): LongOffset = new LongOffset(offset - decrement)
35 | }
36 |
37 | object LongOffset {
38 |
39 | /**
40 | * LongOffset factory from serialized offset.
41 | * @return new LongOffset
42 | */
43 | def apply(offset: SerializedOffset) : LongOffset = new LongOffset(offset.json.toLong)
44 |
45 | /**
46 | * Convert generic Offset to LongOffset if possible.
47 | * @return converted LongOffset
48 | */
49 | def convert(offset: Offset): Option[LongOffset] = offset match {
50 | case lo: LongOffset => Some(lo)
51 | case so: SerializedOffset => Some(LongOffset(so))
52 | case _ => None
53 | }
54 | }
55 |
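A minimal sketch of how this offset round-trips through its JSON form, e.g. when Spark hands back an offset restored from the checkpoint log as a SerializedOffset.

    import org.apache.spark.sql.execution.streaming.SerializedOffset

    val start = LongOffset(5L)
    val next = start + 1                                             // LongOffset(6)

    // An offset read back from the offset log arrives as a SerializedOffset
    // wrapping the JSON string; convert turns it back into a LongOffset.
    val restored = LongOffset.convert(SerializedOffset(next.json))   // Some(LongOffset(6))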
--------------------------------------------------------------------------------
/sql-streaming-akka/src/main/scala/org/apache/bahir/sql/streaming/akka/MessageStore.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.sql.streaming.akka
19 |
20 | import java.nio.ByteBuffer
21 |
22 | import scala.reflect.ClassTag
23 |
24 | import org.rocksdb.RocksDB
25 |
26 | import org.apache.spark.serializer.{JavaSerializer, Serializer, SerializerInstance}
27 | import org.apache.spark.SparkConf
28 |
29 | import org.apache.bahir.utils.Logging
30 |
31 |
32 | trait MessageStore {
33 |
34 | def store[T: ClassTag](id: Long, message: T): Boolean
35 |
36 | def retrieve[T: ClassTag](start: Long, end: Long): Seq[Option[T]]
37 |
38 | def retrieve[T: ClassTag](id: Long): Option[T]
39 |
40 | def maxProcessedOffset: Long
41 | }
42 |
43 | private[akka] class LocalMessageStore(val persistentStore: RocksDB,
44 | val serializer: Serializer)
45 | extends MessageStore with Logging {
46 |
47 | val classLoader = Thread.currentThread().getContextClassLoader
48 |
49 | def this(persistentStore: RocksDB, conf: SparkConf) =
50 | this(persistentStore, new JavaSerializer(conf))
51 |
52 | val serializerInstance: SerializerInstance = serializer.newInstance()
53 |
54 | private def get(id: Long) = persistentStore.get(id.toString.getBytes)
55 |
56 | override def maxProcessedOffset: Long = persistentStore.getLatestSequenceNumber
57 |
58 | override def store[T: ClassTag](id: Long, message: T): Boolean = {
59 | val bytes: Array[Byte] = serializerInstance.serialize(message).array()
60 | try {
61 | persistentStore.put(id.toString.getBytes(), bytes)
62 | true
63 | } catch {
64 | case e: Exception => log.warn(s"Failed to store message Id: $id", e)
65 | false
66 | }
67 | }
68 |
69 | override def retrieve[T: ClassTag](start: Long, end: Long): Seq[Option[T]] = {
70 | (start until end).map(x => retrieve(x))
71 | }
72 |
73 | override def retrieve[T: ClassTag](id: Long): Option[T] = {
74 | val bytes = persistentStore.get(id.toString.getBytes)
75 |
76 | if (bytes != null) {
77 | Some(serializerInstance.deserialize(
78 | ByteBuffer.wrap(bytes), classLoader))
79 | } else {
80 | None
81 | }
82 | }
83 | }
84 |
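A hedged sketch of exercising LocalMessageStore directly; since the class is private[akka] this only compiles from within the same package, and the RocksDB path below is an arbitrary assumption.

    import org.rocksdb.{Options, RocksDB}

    import org.apache.spark.SparkConf

    RocksDB.loadLibrary()
    val db = RocksDB.open(new Options().setCreateIfMissing(true), "/tmp/akka-message-store")
    val store = new LocalMessageStore(db, new SparkConf())

    store.store(1L, "hello")                              // serialized via JavaSerializer, put into RocksDB
    val msg: Option[String] = store.retrieve[String](1L)  // Some("hello")
    db.close()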
--------------------------------------------------------------------------------
/sql-streaming-akka/src/test/resources/feeder_actor.conf:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | akka {
19 | loglevel = "INFO"
20 | actor {
21 | provider = "akka.remote.RemoteActorRefProvider"
22 | }
23 | remote {
24 | enabled-transports = ["akka.remote.netty.tcp"]
25 | netty.tcp {
26 | hostname = "127.0.0.1"
27 | port = 0
28 | }
29 | log-sent-messages = on
30 | log-received-messages = on
31 | }
32 | loggers.0 = "akka.event.slf4j.Slf4jLogger"
33 | log-dead-letters-during-shutdown = "off"
34 | }
35 |
--------------------------------------------------------------------------------
/sql-streaming-akka/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark_project.jetty=WARN
28 |
--------------------------------------------------------------------------------
/sql-streaming-akka/src/test/scala/org/apache/bahir/sql/streaming/akka/AkkaTestUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | // scalastyle:off println
19 | package org.apache.bahir.sql.streaming.akka
20 |
21 | import java.io.File
22 |
23 | import scala.collection.mutable
24 | import scala.concurrent.Await
25 | import scala.concurrent.duration._
26 | import scala.util.Random
27 |
28 | import akka.actor.{Actor, ActorRef, ActorSystem, ExtendedActorSystem, Props}
29 | import com.typesafe.config.{Config, ConfigFactory}
30 |
31 | import org.apache.bahir.utils.Logging
32 |
33 | class AkkaTestUtils extends Logging {
34 | private val actorSystemName = "feeder-actor-system"
35 | private var actorSystem: ActorSystem = _
36 |
37 | private val feederActorName = "feederActor"
38 |
39 | private var message: String = _
40 | private var count = 1
41 |
42 | def getFeederActorConfig(): Config = {
43 | val configFile = getClass.getClassLoader
44 | .getResource("feeder_actor.conf").getFile
45 | ConfigFactory.parseFile(new File(configFile))
46 | }
47 |
48 | def getFeederActorUri(): String =
49 | s"${actorSystem.asInstanceOf[ExtendedActorSystem].provider.getDefaultAddress}" +
50 | s"/user/$feederActorName"
51 |
52 | class FeederActor extends Actor {
53 |
54 | val rand = new Random()
55 | val receivers = new mutable.LinkedHashSet[ActorRef]()
56 |
57 | val sendMessageThread =
58 | new Thread() {
59 | override def run(): Unit = {
60 | var counter = 0
61 | while (counter < count) {
62 | // Thread.sleep(500)
63 | receivers.foreach(_ ! message)
64 | counter += 1
65 | }
66 | }
67 | }
68 |
69 | override def receive: Receive = {
70 | case SubscribeReceiver(receiverActor: ActorRef) =>
71 | log.debug(s"received subscribe from ${receiverActor.toString}")
72 | receivers += receiverActor
73 | sendMessageThread.run()
74 |
75 | case UnsubscribeReceiver(receiverActor: ActorRef) =>
76 | log.debug(s"received unsubscribe from ${receiverActor.toString}")
77 | receivers -= receiverActor
78 | }
79 | }
80 |
81 | def setup(): Unit = {
82 | val feederConf = getFeederActorConfig()
83 |
84 | actorSystem = ActorSystem(actorSystemName, feederConf)
85 | actorSystem.actorOf(Props(new FeederActor), feederActorName)
86 | }
87 |
88 | def shutdown(): Unit = {
89 | Await.ready(actorSystem.terminate(), 5.seconds)
90 | }
91 |
92 | def setMessage(message: String): Unit = this.message = message
93 | def setCountOfMessages(messageCount: Int): Unit = count = messageCount
94 | }
95 |
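For orientation, a sketch of how a suite might drive this helper; the method names are the ones defined above, and the streaming query wiring is elided.

    val akkaTestUtils = new AkkaTestUtils

    akkaTestUtils.setup()                      // start the feeder actor system
    akkaTestUtils.setMessage("hello akka")     // payload every subscribed receiver gets
    akkaTestUtils.setCountOfMessages(100)      // how many copies the feeder sends

    // Pass this as the "urlOfPublisher" option of the Akka streaming source.
    val publisherUrl = akkaTestUtils.getFeederActorUri()

    // ... run the structured streaming query against publisherUrl ...

    akkaTestUtils.shutdown()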
--------------------------------------------------------------------------------
/sql-streaming-jdbc/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/jdbc/JdbcSinkDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.examples.sql.streaming.jdbc
19 |
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
22 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
23 |
24 | /**
25 | * Mocks input with the rate source, maps each generated value to a simple Person
26 | * object with name and age properties, and writes the result to JDBC.
27 | *
28 | * Usage: JdbcSinkDemo <jdbcUrl> <tableName> <username> <password>
29 | */
30 | object JdbcSinkDemo {
31 |
32 | private case class Person(name: String, age: Int)
33 |
34 | def main(args: Array[String]): Unit = {
35 | if (args.length < 4) {
36 | // scalastyle:off println
37 | System.err.println("Usage: JdbcSinkDemo <jdbcUrl> <tableName> <username> <password>")
38 | // scalastyle:on
39 | System.exit(1)
40 | }
41 |
42 | val jdbcUrl = args(0)
43 | val tableName = args(1)
44 | val username = args(2)
45 | val password = args(3)
46 |
47 | val spark = SparkSession
48 | .builder()
49 | .appName("JdbcSinkDemo")
50 | .getOrCreate()
51 |
52 | // load data source
53 | val df = spark.readStream
54 | .format("rate")
55 | .option("numPartitions", "5")
56 | .option("rowsPerSecond", "100")
57 | .load()
58 |
59 | // change input value to a person object.
60 | import spark.implicits._
61 | val lines = df.select("value").as[Long].map{ value =>
62 | Person(s"name_${value}", value.toInt % 30)
63 | }
64 |
65 | lines.printSchema()
66 |
67 | // write result
68 | val query = lines.writeStream
70 | .format("streaming-jdbc")
71 | .outputMode(OutputMode.Append)
72 | .option(JDBCOptions.JDBC_URL, jdbcUrl)
73 | .option(JDBCOptions.JDBC_TABLE_NAME, tableName)
74 | .option(JDBCOptions.JDBC_DRIVER_CLASS, "com.mysql.jdbc.Driver")
75 | .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, "5")
76 | .option("user", username)
77 | .option("password", password)
78 | .trigger(Trigger.ProcessingTime("10 seconds"))
79 | .start()
80 |
81 | query.awaitTermination()
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/sql-streaming-jdbc/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!--
3 |   ~ Licensed to the Apache Software Foundation (ASF) under one or more
4 |   ~ contributor license agreements.  See the NOTICE file distributed with
5 |   ~ this work for additional information regarding copyright ownership.
6 |   ~ The ASF licenses this file to You under the Apache License, Version 2.0
7 |   ~ (the "License"); you may not use this file except in compliance with
8 |   ~ the License.  You may obtain a copy of the License at
9 |   ~
10 |   ~    http://www.apache.org/licenses/LICENSE-2.0
11 |   ~
12 |   ~ Unless required by applicable law or agreed to in writing, software
13 |   ~ distributed under the License is distributed on an "AS IS" BASIS,
14 |   ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |   ~ See the License for the specific language governing permissions and
16 |   ~ limitations under the License.
17 |   -->
18 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
19 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
20 |   <modelVersion>4.0.0</modelVersion>
21 |   <parent>
22 |     <groupId>org.apache.bahir</groupId>
23 |     <artifactId>bahir-parent_2.12</artifactId>
24 |     <version>3.0.0-SNAPSHOT</version>
25 |     <relativePath>../pom.xml</relativePath>
26 |   </parent>
27 |
28 |   <groupId>org.apache.bahir</groupId>
29 |   <artifactId>spark-sql-streaming-jdbc_2.12</artifactId>
30 |   <properties>
31 |     <sbt.project.name>sql-streaming-jdbc</sbt.project.name>
32 |   </properties>
33 |   <packaging>jar</packaging>
34 |   <name>Apache Bahir - Spark SQL Streaming JDBC</name>
35 |   <url>http://bahir.apache.org/</url>
36 |
37 |   <dependencies>
38 |     <dependency>
39 |       <groupId>org.apache.bahir</groupId>
40 |       <artifactId>bahir-common_${scala.binary.version}</artifactId>
41 |       <version>${project.version}</version>
42 |     </dependency>
43 |     <dependency>
44 |       <groupId>org.apache.spark</groupId>
45 |       <artifactId>spark-tags_${scala.binary.version}</artifactId>
46 |     </dependency>
47 |     <dependency>
48 |       <groupId>org.apache.spark</groupId>
49 |       <artifactId>spark-sql_${scala.binary.version}</artifactId>
50 |       <version>${spark.version}</version>
51 |     </dependency>
52 |     <dependency>
53 |       <groupId>org.apache.spark</groupId>
54 |       <artifactId>spark-sql_${scala.binary.version}</artifactId>
55 |       <version>${spark.version}</version>
56 |       <type>test-jar</type>
57 |       <scope>test</scope>
58 |     </dependency>
59 |     <dependency>
60 |       <groupId>org.apache.spark</groupId>
61 |       <artifactId>spark-catalyst_${scala.binary.version}</artifactId>
62 |       <version>${spark.version}</version>
63 |       <type>test-jar</type>
64 |       <scope>test</scope>
65 |     </dependency>
66 |     <dependency>
67 |       <groupId>org.apache.spark</groupId>
68 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
69 |       <version>${spark.version}</version>
70 |       <type>test-jar</type>
71 |       <scope>test</scope>
72 |     </dependency>
73 |     <dependency>
74 |       <groupId>org.scalacheck</groupId>
75 |       <artifactId>scalacheck_${scala.binary.version}</artifactId>
76 |       <scope>test</scope>
77 |     </dependency>
78 |     <dependency>
79 |       <groupId>com.h2database</groupId>
80 |       <artifactId>h2</artifactId>
81 |       <version>1.4.195</version>
82 |       <scope>test</scope>
83 |     </dependency>
84 |   </dependencies>
85 |
86 | </project>
--------------------------------------------------------------------------------
/sql-streaming-jdbc/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | org.apache.bahir.sql.streaming.jdbc.JdbcSourceProvider
--------------------------------------------------------------------------------
/sql-streaming-jdbc/src/main/scala/org/apache/bahir/sql/streaming/jdbc/JdbcSourceProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.sql.streaming.jdbc
19 |
20 | import scala.collection.JavaConverters._
21 |
22 | import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
23 | import org.apache.spark.sql.sources.DataSourceRegister
24 | import org.apache.spark.sql.sources.v2.{DataSourceOptions, StreamWriteSupport}
25 | import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
26 | import org.apache.spark.sql.streaming.OutputMode
27 | import org.apache.spark.sql.types.StructType
28 |
29 | class JdbcSourceProvider extends StreamWriteSupport with DataSourceRegister {
30 | override def createStreamWriter(queryId: String, schema: StructType,
31 | mode: OutputMode, options: DataSourceOptions): StreamWriter = {
32 | val optionMap = options.asMap().asScala.toMap
33 | // Constructing JDBCOptions here validates the supplied parameters.
34 | new JDBCOptions(optionMap)
35 | new JdbcStreamWriter(schema, optionMap)
36 | }
37 |
38 | // The short name 'jdbc' is already used by the batch source, so choose a different name for streaming.
39 | override def shortName(): String = "streaming-jdbc"
40 | }
41 |
--------------------------------------------------------------------------------
/sql-streaming-jdbc/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark_project.jetty=WARN
28 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/examples/src/main/java/org/apache/bahir/examples/sql/streaming/mqtt/JavaMQTTStreamWordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.examples.sql.streaming.mqtt;
19 |
20 | import org.apache.log4j.Logger;
21 | import org.apache.log4j.Level;
22 | import org.apache.spark.SparkConf;
23 | import org.apache.spark.api.java.function.FlatMapFunction;
24 | import org.apache.spark.sql.Dataset;
25 | import org.apache.spark.sql.Encoders;
26 | import org.apache.spark.sql.Row;
27 | import org.apache.spark.sql.SparkSession;
28 | import org.apache.spark.sql.streaming.StreamingQuery;
29 |
30 | import java.util.Arrays;
31 | import java.util.Iterator;
32 |
33 | /**
34 | * Counts words in UTF8 encoded, '\n' delimited text received from MQTT Server.
35 | *
36 | * Usage: JavaMQTTStreamWordCount <brokerUrl> <topic>
37 | * <brokerUrl> and <topic> describe the MQTT server that Structured Streaming
38 | * would connect to receive data.
39 | *
40 | * To run this on your local machine, a MQTT Server should be up and running.
41 | *
42 | */
43 | public final class JavaMQTTStreamWordCount {
44 |
45 | public static void main(String[] args) throws Exception {
46 | if (args.length < 2) {
47 | System.err.println("Usage: JavaMQTTStreamWordCount <brokerUrl> <topic>");
48 | System.exit(1);
49 | }
50 |
51 | if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
52 | Logger.getRootLogger().setLevel(Level.WARN);
53 | }
54 |
55 | String brokerUrl = args[0];
56 | String topic = args[1];
57 |
58 | SparkConf sparkConf = new SparkConf().setAppName("JavaMQTTStreamWordCount");
59 |
60 | // check Spark configuration for master URL, set it to local if not configured
61 | if (!sparkConf.contains("spark.master")) {
62 | sparkConf.setMaster("local[4]");
63 | }
64 |
65 | SparkSession spark = SparkSession.builder()
66 | .config(sparkConf)
67 | .getOrCreate();
68 |
69 | // Create DataFrame representing the stream of input lines from connection to mqtt server
70 | Dataset<String> lines = spark
71 | .readStream()
72 | .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")
73 | .option("topic", topic)
74 | .load(brokerUrl).selectExpr("CAST(payload AS STRING)").as(Encoders.STRING());
75 |
76 | // Split the lines into words
77 | Dataset<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
78 | @Override
79 | public Iterator<String> call(String x) {
80 | return Arrays.asList(x.split(" ")).iterator();
81 | }
82 | }, Encoders.STRING());
83 |
84 | // Generate running word count
85 | Dataset<Row> wordCounts = words.groupBy("value").count();
86 |
87 | // Start running the query that prints the running counts to the console
88 | StreamingQuery query = wordCounts.writeStream()
89 | .outputMode("complete")
90 | .format("console")
91 | .start();
92 |
93 | query.awaitTermination();
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/mqtt/MQTTSinkWordCount.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.examples.sql.streaming.mqtt
19 |
20 | import java.io.File
21 |
22 | import org.apache.commons.io.FileUtils
23 |
24 | import org.apache.spark.sql.SparkSession
25 |
26 | /**
27 | * Counts words in UTF-8 encoded, '\n' delimited text received from local socket
28 | * and publishes results on MQTT topic.
29 | *
30 | * Usage: MQTTSinkWordCount <port> <brokerUrl> <topic>
31 | * <port> represents the local network port on which the program listens for input.
32 | * <brokerUrl> and <topic> describe the MQTT server that structured streaming
33 | * would connect and send data.
34 | *
35 | * To run example on your local machine, a MQTT Server should be up and running.
36 | * Linux users may leverage 'nc -lk <port>' to listen on a local port and wait
37 | * for Spark socket connection.
38 | */
39 | object MQTTSinkWordCount {
40 | def main(args: Array[String]) {
41 | if (args.length < 3) {
42 | // scalastyle:off
43 | System.err.println("Usage: MQTTSinkWordCount <port> <brokerUrl> <topic>")
44 | // scalastyle:on
45 | System.exit(1)
46 | }
47 |
48 | val checkpointDir = System.getProperty("java.io.tmpdir") + "/mqtt-example/"
49 | // Remove checkpoint directory.
50 | FileUtils.deleteDirectory(new File(checkpointDir))
51 |
52 | val port = args(0)
53 | val brokerUrl = args(1)
54 | val topic = args(2)
55 |
56 | val spark = SparkSession.builder
57 | .appName("MQTTSinkWordCount").master("local[4]")
58 | .getOrCreate()
59 |
60 | import spark.implicits._
61 |
62 | // Create DataFrame representing the stream of input lines from local network socket.
63 | val lines = spark.readStream
64 | .format("socket")
65 | .option("host", "localhost").option("port", port)
66 | .load().select("value").as[String]
67 |
68 | // Split the lines into words.
69 | val words = lines.flatMap(_.split(" "))
70 |
71 | // Generate running word count.
72 | val wordCounts = words.groupBy("value").count()
73 |
74 | // Start publishing the counts to MQTT server.
75 | val query = wordCounts.writeStream
76 | .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider")
77 | .option("checkpointLocation", checkpointDir)
78 | .outputMode("complete")
79 | .option("topic", topic)
80 | .option("localStorage", checkpointDir)
81 | .start(brokerUrl)
82 |
83 | query.awaitTermination()
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/mqtt/MQTTStreamWordCount.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.examples.sql.streaming.mqtt
19 |
20 | import java.sql.Timestamp
21 |
22 | import org.apache.spark.sql.SparkSession
23 |
24 | /**
25 | * Counts words in UTF8 encoded, '\n' delimited text received from MQTT Server.
26 | *
27 | * Usage: MQTTStreamWordCount <brokerUrl> <topic>
28 | * <brokerUrl> and <topic> describe the MQTT server that Structured Streaming
29 | * would connect to receive data.
30 | *
31 | * To run this on your local machine, a MQTT Server should be up and running.
32 | *
33 | */
34 | object MQTTStreamWordCount {
35 | def main(args: Array[String]) {
36 | if (args.length < 2) {
37 | System.err.println("Usage: MQTTStreamWordCount <brokerUrl> <topic>") // scalastyle:off println
38 | System.exit(1)
39 | }
40 |
41 | val brokerUrl = args(0)
42 | val topic = args(1)
43 |
44 | val spark = SparkSession
45 | .builder
46 | .appName("MQTTStreamWordCount")
47 | .master("local[4]")
48 | .getOrCreate()
49 |
50 | import spark.implicits._
51 |
52 | // Create DataFrame representing the stream of input lines from connection to mqtt server
53 | val lines = spark.readStream
54 | .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")
55 | .option("topic", topic).option("persistence", "memory")
56 | .load(brokerUrl).selectExpr("CAST(payload AS STRING)").as[String]
57 |
58 | // Split the lines into words
59 | val words = lines.flatMap(_.split(" "))
60 |
61 | // Generate running word count
62 | val wordCounts = words.groupBy("value").count()
63 |
64 | // Start running the query that prints the running counts to the console
65 | val query = wordCounts.writeStream
66 | .outputMode("complete")
67 | .format("console")
68 | .start()
69 |
70 | query.awaitTermination()
71 | }
72 | }
73 |
74 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/main/assembly/assembly.xml:
--------------------------------------------------------------------------------
1 | <!--
2 |   ~ Licensed to the Apache Software Foundation (ASF) under one or more
3 |   ~ contributor license agreements.  See the NOTICE file distributed with
4 |   ~ this work for additional information regarding copyright ownership.
5 |   ~ The ASF licenses this file to You under the Apache License, Version 2.0
6 |   ~ (the "License"); you may not use this file except in compliance with
7 |   ~ the License.  You may obtain a copy of the License at
8 |   ~
9 |   ~    http://www.apache.org/licenses/LICENSE-2.0
10 |   ~
11 |   ~ Unless required by applicable law or agreed to in writing, software
12 |   ~ distributed under the License is distributed on an "AS IS" BASIS,
13 |   ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |   ~ See the License for the specific language governing permissions and
15 |   ~ limitations under the License.
16 |   -->
17 | <assembly>
18 |   <id>test-jar-with-dependencies</id>
19 |   <formats>
20 |     <format>jar</format>
21 |   </formats>
22 |   <includeBaseDirectory>false</includeBaseDirectory>
23 |
24 |   <fileSets>
25 |     <fileSet>
26 |       <directory>${project.build.directory}/scala-${scala.binary.version}/test-classes</directory>
27 |       <outputDirectory></outputDirectory>
28 |     </fileSet>
29 |   </fileSets>
30 |
31 |   <dependencySets>
32 |     <dependencySet>
33 |       <useTransitiveDependenciesWithProvidedScope>true</useTransitiveDependenciesWithProvidedScope>
34 |       <scope>test</scope>
35 |       <unpack>true</unpack>
36 |       <excludes>
37 |         <exclude>org.apache.hadoop:*:jar</exclude>
38 |         <exclude>org.apache.zookeeper:*:jar</exclude>
39 |         <exclude>org.apache.avro:*:jar</exclude>
40 |       </excludes>
41 |     </dependencySet>
42 |   </dependencySets>
43 |
44 | </assembly>
45 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider
19 | org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider
20 | org.apache.spark.sql.mqtt.HDFSMQTTSourceProvider
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/main/scala/org/apache/bahir/sql/streaming/mqtt/LongOffset.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.sql.streaming.mqtt
19 |
20 | import org.apache.spark.sql.execution.streaming.Offset
21 | import org.apache.spark.sql.execution.streaming.SerializedOffset
22 | import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2}
23 |
24 | /**
25 | * A simple offset for sources that produce a single linear stream of data.
26 | */
27 | case class LongOffset(offset: Long) extends OffsetV2 {
28 |
29 | override val json = offset.toString
30 |
31 | def +(increment: Long): LongOffset = new LongOffset(offset + increment)
32 | def -(decrement: Long): LongOffset = new LongOffset(offset - decrement)
33 | }
34 |
35 | object LongOffset {
36 |
37 | /**
38 | * LongOffset factory from serialized offset.
39 | *
40 | * @return new LongOffset
41 | */
42 | def apply(offset: SerializedOffset) : LongOffset = new LongOffset(offset.json.toLong)
43 |
44 | /**
45 | * Convert generic Offset to LongOffset if possible.
46 | *
47 | * @return converted LongOffset
48 | */
49 | def convert(offset: Offset): Option[LongOffset] = offset match {
50 | case lo: LongOffset => Some(lo)
51 | case so: SerializedOffset => Some(LongOffset(so))
52 | case _ => None
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/main/scala/org/apache/spark/sql/mqtt/HDFSMQTTSourceProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.mqtt
19 |
20 | import org.apache.spark.internal.Logging
21 | import org.apache.spark.sql.SQLContext
22 | import org.apache.spark.sql.execution.streaming.Source
23 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
24 | import org.apache.spark.sql.types.StructType
25 |
26 | import org.apache.bahir.sql.streaming.mqtt.{MQTTStreamConstants, MQTTUtils}
27 |
28 | /**
29 | * The provider class for creating MQTT source.
30 | * This provider throws an IllegalArgumentException if the 'brokerUrl' or 'topic' parameter
31 | * is not set in the options.
32 | */
33 | class HDFSMQTTSourceProvider extends StreamSourceProvider with DataSourceRegister with Logging {
34 |
35 | override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
36 | providerName: String, parameters: Map[String, String]): (String, StructType) = {
37 | ("hdfs-mqtt", MQTTStreamConstants.SCHEMA_DEFAULT)
38 | }
39 |
40 | override def createSource(sqlContext: SQLContext, metadataPath: String,
41 | schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = {
42 |
43 | val parsedResult = MQTTUtils.parseConfigParams(parameters)
44 |
45 | new HdfsBasedMQTTStreamSource(
46 | sqlContext,
47 | metadataPath,
48 | parsedResult._1, // brokerUrl
49 | parsedResult._2, // clientId
50 | parsedResult._3, // topic
51 | parsedResult._5, // mqttConnectionOptions
52 | parsedResult._6, // qos
53 | parsedResult._7, // maxBatchMessageNum
54 | parsedResult._8, // maxBatchMessageSize
55 | parsedResult._9 // maxRetryNum
56 | )
57 | }
58 |
59 | override def shortName(): String = "hdfs-mqtt"
60 | }
61 |
62 | object HDFSMQTTSourceProvider {
63 | val SEP = "##"
64 | }
65 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/test/bin/test-BAHIR-83.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | set -o pipefail
20 |
21 | for i in `seq 100` ; do
22 | mvn scalatest:test -pl sql-streaming-mqtt -q -Dsuites='*.BasicMQTTSourceSuite' | \
23 | grep -q "TEST FAILED" && echo "$i: failed"
24 | done
25 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/test/resources/keystore.jks:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/bahir/8b647c8ef80455ba900ef91e08eaf5eafa35c133/sql-streaming-mqtt/src/test/resources/keystore.jks
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark_project.jetty=WARN
28 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/test/resources/logging.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | ############################################################
19 | # Global properties
20 | ############################################################
21 |
22 | # "handlers" specifies a comma separated list of log Handler
23 | # classes. These handlers will be installed during VM startup.
24 | # Note that these classes must be on the system classpath.
25 | # By default we only configure a ConsoleHandler, which will only
26 | # show messages at the INFO and above levels.
27 | handlers = java.util.logging.ConsoleHandler
28 |
29 | # To also add the FileHandler, use the following line instead.
30 | #handlers = java.util.logging.FileHandler, java.util.logging.ConsoleHandler
31 |
32 | # Default global logging level.
33 | # This specifies which kinds of events are logged across
34 | # all loggers. For any given facility this global level
35 | # can be overridden by a facility-specific level.
36 | # Note that the ConsoleHandler also has a separate level
37 | # setting to limit messages printed to the console.
38 | .level = INFO
39 |
40 | ############################################################
41 | # Handler specific properties.
42 | # Describes specific configuration info for Handlers.
43 | ############################################################
44 |
45 | # Log file output is in target directory.
46 | java.util.logging.FileHandler.pattern = target/unit-tests-java-%u.log
47 | java.util.logging.FileHandler.limit = 50000
48 | java.util.logging.FileHandler.count = 1
49 | java.util.logging.FileHandler.formatter = java.util.logging.XMLFormatter
50 |
51 | # Limit the messages that are printed on the console to WARNING and above.
52 | java.util.logging.ConsoleHandler.level = WARNING
53 | java.util.logging.ConsoleHandler.formatter = java.util.logging.SimpleFormatter
54 |
55 | # Example to customize the SimpleFormatter output format
56 | # to print one-line log message like this:
57 | # <level>: <log message> [<date/time>]
58 | #
59 | # java.util.logging.SimpleFormatter.format=%4$s: %5$s [%1$tc]%n
60 |
61 | ############################################################
62 | # Facility specific properties.
63 | # Provides extra control for each logger.
64 | ############################################################
65 |
66 | # [BAHIR-] don't flood build logs with superfluous Parquet INFO messages
67 | # they should be written to a file via FileHandler but they end up in the
68 | # build log anyhow irrespective of the ConsoleHandler log level
69 | # also see https://github.com/Parquet/parquet-mr/issues/425
70 | org.apache.parquet.hadoop.level=SEVERE
71 |
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/test/resources/truststore.jks:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/bahir/8b647c8ef80455ba900ef91e08eaf5eafa35c133/sql-streaming-mqtt/src/test/resources/truststore.jks
--------------------------------------------------------------------------------
/sql-streaming-mqtt/src/test/scala/org/apache/bahir/sql/streaming/mqtt/LocalMessageStoreSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.sql.streaming.mqtt
19 |
20 | import java.io.File
21 |
22 | import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence
23 | import org.scalatest.BeforeAndAfter
24 |
25 | import org.apache.spark.SparkFunSuite
26 |
27 | import org.apache.bahir.utils.FileHelper
28 |
29 |
30 | class LocalMessageStoreSuite extends SparkFunSuite with BeforeAndAfter {
31 |
32 | private val testData = Seq(1, 2, 3, 4, 5, 6)
33 | private val javaSerializer: JavaSerializer = new JavaSerializer()
34 |
35 | private val serializerInstance = javaSerializer
36 | private val tempDir: File = new File(System.getProperty("java.io.tmpdir") + "/mqtt-test2/")
37 | private val persistence: MqttDefaultFilePersistence =
38 | new MqttDefaultFilePersistence(tempDir.getAbsolutePath)
39 |
40 | private val store = new LocalMessageStore(persistence, javaSerializer)
41 |
42 | before {
43 | tempDir.mkdirs()
44 | tempDir.deleteOnExit()
45 | persistence.open("temp", "tcp://dummy-url:0000")
46 | }
47 |
48 | after {
49 | persistence.clear()
50 | persistence.close()
51 | FileHelper.deleteFileQuietly(tempDir)
52 | }
53 |
54 | test("serialize and deserialize") {
55 | val serialized = serializerInstance.serialize(testData)
56 | val deserialized: Seq[Int] = serializerInstance
57 | .deserialize(serialized).asInstanceOf[Seq[Int]]
58 | assert(testData === deserialized)
59 | }
60 |
61 | test("Store and retrieve") {
62 | store.store(1, testData)
63 | val result: Seq[Int] = store.retrieve(1)
64 | assert(testData === result)
65 | }
66 |
67 | test("Max offset stored") {
68 | store.store(1, testData)
69 | store.store(10, testData)
70 | val offset = store.maxProcessedOffset
71 | assert(offset == 10)
72 | }
73 |
74 | }
75 |
--------------------------------------------------------------------------------
/sql-streaming-sqs/examples/src/main/scala/org/apache/bahir/examples/sql/streaming/sqs/SqsSourceExample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.bahir.examples.sql.streaming.sqs
19 |
20 | import scala.util.Random
21 |
22 | import org.apache.spark.sql.SparkSession
23 |
24 | /**
25 | * Example to read files from S3 using SQS Source and write results to Memory Sink
26 | *
27 | * Usage: SqsSourceExample <schemaPath> <queueUrl> <fileFormat>
28 | */
29 |
30 | object SqsSourceExample {
31 |
32 | def main(args: Array[String]) {
33 |
34 | val randomName = Random.alphanumeric.take(6).mkString("")
35 | val pathName = "path_" + randomName
36 | val queryName = "query_" + randomName
37 | val checkpointDir = s"/checkpoints/$pathName"
38 | val schemaPathString = args(0)
39 |
40 | val spark = SparkSession.builder().appName("SqsExample").getOrCreate()
41 |
42 | val schema = spark.read.json(schemaPathString).schema
43 |
44 | val queueUrl = args(1)
45 |
46 | val fileFormat = args(2)
47 |
48 | val inputDf = spark
49 | .readStream
50 | .format("s3-sqs")
51 | .schema(schema)
52 | .option("sqsUrl", queueUrl)
53 | .option("fileFormat", fileFormat)
54 | .option("sqsFetchIntervalSeconds", "2")
55 | .option("sqsLongPollingWaitTimeSeconds", "5")
56 | .option("maxFilesPerTrigger", "50")
57 | .option("ignoreFileDeletion", "true")
58 | .load()
59 |
60 | val query = inputDf
61 | .writeStream
62 | .queryName(queryName)
63 | .format("memory")
64 | .option("checkpointLocation", checkpointDir)
65 | .start()
66 |
67 | query.awaitTermination()
68 | }
69 | }
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/sql-streaming-sqs/src/main/java/org/apache/spark/sql/streaming/sqs/BasicAWSCredentialsProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs;
19 |
20 | import com.amazonaws.AmazonClientException;
21 | import com.amazonaws.auth.AWSCredentialsProvider;
22 | import com.amazonaws.auth.BasicAWSCredentials;
23 | import com.amazonaws.auth.AWSCredentials;
24 | import org.apache.commons.lang.StringUtils;
25 |
26 | public class BasicAWSCredentialsProvider implements AWSCredentialsProvider {
27 | private final String accessKey;
28 | private final String secretKey;
29 |
30 | public BasicAWSCredentialsProvider(String accessKey, String secretKey) {
31 | this.accessKey = accessKey;
32 | this.secretKey = secretKey;
33 | }
34 |
35 | public AWSCredentials getCredentials() {
36 | if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey)) {
37 | return new BasicAWSCredentials(accessKey, secretKey);
38 | }
39 | throw new AmazonClientException(
40 | "Access key or secret key is null");
41 | }
42 |
43 | public void refresh() {}
44 |
45 | @Override
46 | public String toString() {
47 | return getClass().getSimpleName();
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/sql-streaming-sqs/src/main/java/org/apache/spark/sql/streaming/sqs/InstanceProfileCredentialsProviderWithRetries.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs;
19 |
20 |
21 | import com.amazonaws.AmazonClientException;
22 | import com.amazonaws.auth.AWSCredentials;
23 | import com.amazonaws.auth.InstanceProfileCredentialsProvider;
24 | import org.apache.commons.logging.Log;
25 | import org.apache.commons.logging.LogFactory;
26 |
27 | public class InstanceProfileCredentialsProviderWithRetries
28 | extends InstanceProfileCredentialsProvider {
29 |
30 | private static final Log LOG = LogFactory.getLog(
31 | InstanceProfileCredentialsProviderWithRetries.class);
32 |
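    // Retries the instance-profile credential lookup up to 10 times, roughly
    // doubling the sleep between attempts (starting at 500 ms), before giving
    // up with an AmazonClientException.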
33 | public AWSCredentials getCredentials() {
34 | int retries = 10;
35 | int sleep = 500;
36 | while(retries > 0) {
37 | try {
38 | return super.getCredentials();
39 | }
40 | catch (RuntimeException re) {
41 | LOG.error("Got an exception while fetching credentials " + re);
42 | --retries;
43 | try {
44 | Thread.sleep(sleep);
45 | } catch (InterruptedException ie) {
46 | // Do nothing
47 | }
48 | if (sleep < 10000) {
49 | sleep *= 2;
50 | }
51 | }
52 | catch (Error error) {
53 | LOG.error("Got an exception while fetching credentials " + error);
54 | --retries;
55 | try {
56 | Thread.sleep(sleep);
57 | } catch (InterruptedException ie) {
58 | // Do nothing
59 | }
60 | if (sleep < 10000) {
61 | sleep *= 2;
62 | }
63 | }
64 | }
65 | throw new AmazonClientException("Unable to load credentials.");
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/sql-streaming-sqs/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | org.apache.spark.sql.streaming.sqs.SqsSourceProvider
--------------------------------------------------------------------------------
/sql-streaming-sqs/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | log4j.rootCategory=WARN, console
19 |
20 | # File appender
21 | log4j.appender.file=org.apache.log4j.FileAppender
22 | log4j.appender.file.append=false
23 | log4j.appender.file.file=target/unit-tests.log
24 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
25 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
26 |
27 | # Console appender
28 | log4j.appender.console=org.apache.log4j.ConsoleAppender
29 | log4j.appender.console.target=System.out
30 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
31 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
32 |
33 | # Settings to quiet third party logs that are too verbose
34 | log4j.logger.org.sparkproject.jetty=WARN
35 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
36 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
37 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
38 |
--------------------------------------------------------------------------------
/sql-streaming-sqs/src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs
19 |
20 | import org.apache.spark.internal.Logging
21 | import org.apache.spark.sql.SQLContext
22 | import org.apache.spark.sql.execution.streaming.Source
23 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
24 | import org.apache.spark.sql.types.StructType
25 |
26 | class SqsSourceProvider extends DataSourceRegister
27 | with StreamSourceProvider
28 | with Logging {
29 |
30 | override def shortName(): String = "s3-sqs"
31 |
32 | override def sourceSchema(sqlContext: SQLContext,
33 | schema: Option[StructType],
34 | providerName: String,
35 | parameters: Map[String, String]): (String, StructType) = {
36 |
37 | require(schema.isDefined, "Sqs source doesn't support empty schema")
38 | (shortName(), schema.get)
39 | }
40 |
41 | override def createSource(sqlContext: SQLContext,
42 | metadataPath: String,
43 | schema: Option[StructType],
44 | providerName: String,
45 | parameters: Map[String, String]): Source = {
46 |
47 | new SqsSource(
48 | sqlContext.sparkSession,
49 | metadataPath,
50 | parameters,
51 | schema.get)
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/sql-streaming-sqs/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.sparkproject.jetty=WARN
28 |
--------------------------------------------------------------------------------
/streaming-akka/README.md:
--------------------------------------------------------------------------------
1 |
19 | # Spark Streaming Akka Connector
20 |
21 | A library for reading data from Akka Actors using Spark Streaming.
22 |
23 | ## Linking
24 |
25 | Using SBT:
26 |
27 | libraryDependencies += "org.apache.bahir" %% "spark-streaming-akka" % "{{site.SPARK_VERSION}}"
28 |
29 | Using Maven:
30 |
31 | <dependency>
32 |     <groupId>org.apache.bahir</groupId>
33 |     <artifactId>spark-streaming-akka_{{site.SCALA_BINARY_VERSION}}</artifactId>
34 |     <version>{{site.SPARK_VERSION}}</version>
35 | </dependency>
36 |
37 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option.
38 | For example, to include it when starting the spark shell:
39 |
40 | $ bin/spark-shell --packages org.apache.bahir:spark-streaming-akka_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION}}
41 |
42 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath.
43 | The `--packages` argument can also be used with `bin/spark-submit`.
44 |
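For example, a `spark-submit` invocation could look like the following (the main class and application jar are placeholders for your own job):

    $ bin/spark-submit --packages org.apache.bahir:spark-streaming-akka_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION}} --class com.example.MyStreamingApp my-streaming-app.jar
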
45 | This library is cross-published for Scala 2.11 and Scala 2.12, so users should substitute the appropriate Scala version in the commands listed above.
46 |
47 | ## Examples
48 |
49 | DStreams can be created with data streams received through Akka actors by using `AkkaUtils.createStream(ssc, actorProps, actor-name)`.
50 |
51 | ### Scala API
52 |
53 | You need to extend `ActorReceiver` so as to store received data into Spark using `store(...)` methods. The supervisor strategy of
54 | this actor can be configured to handle failures, etc.
55 |
56 | class CustomActor extends ActorReceiver {
57 | def receive = {
58 | case data: String => store(data)
59 | }
60 | }
61 |
62 | // A new input stream can be created with this custom actor as
63 | val ssc: StreamingContext = ...
64 | val lines = AkkaUtils.createStream[String](ssc, Props[CustomActor](), "CustomReceiver")
65 |
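The storage level and supervisor strategy can also be passed explicitly when creating the stream; a minimal sketch (the values shown are only illustrative, matching the library's defaults):

    import akka.actor.{Props, SupervisorStrategy}
    import org.apache.spark.storage.StorageLevel

    val lines = AkkaUtils.createStream[String](
      ssc, Props[CustomActor](), "CustomReceiver",
      StorageLevel.MEMORY_AND_DISK_SER_2,
      supervisorStrategy = SupervisorStrategy.defaultStrategy)
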
66 | ### Java API
67 |
68 | You need to extend `JavaActorReceiver` so as to store received data into Spark using `store(...)` methods. The supervisor strategy of
69 | this actor can be configured to handle failures, etc.
70 |
71 | class CustomActor extends JavaActorReceiver {
72 | @Override
73 | public void onReceive(Object msg) throws Exception {
74 | store((String) msg);
75 | }
76 | }
77 |
78 | // A new input stream can be created with this custom actor as
79 | JavaStreamingContext jssc = ...;
80 | JavaDStream<String> lines = AkkaUtils.createStream(jssc, Props.create(CustomActor.class), "CustomReceiver");
81 |
82 | See end-to-end examples at [Akka Examples](https://github.com/apache/bahir/tree/master/streaming-akka/examples)
83 |
--------------------------------------------------------------------------------
/streaming-akka/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!--
3 |   ~ Licensed to the Apache Software Foundation (ASF) under one or more
4 |   ~ contributor license agreements.  See the NOTICE file distributed with
5 |   ~ this work for additional information regarding copyright ownership.
6 |   ~ The ASF licenses this file to You under the Apache License, Version 2.0
7 |   ~ (the "License"); you may not use this file except in compliance with
8 |   ~ the License.  You may obtain a copy of the License at
9 |   ~
10 |   ~    http://www.apache.org/licenses/LICENSE-2.0
11 |   ~
12 |   ~ Unless required by applicable law or agreed to in writing, software
13 |   ~ distributed under the License is distributed on an "AS IS" BASIS,
14 |   ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |   ~ See the License for the specific language governing permissions and
16 |   ~ limitations under the License.
17 |   -->
18 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
19 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
20 |   <modelVersion>4.0.0</modelVersion>
21 |   <parent>
22 |     <groupId>org.apache.bahir</groupId>
23 |     <artifactId>bahir-parent_2.12</artifactId>
24 |     <version>3.0.0-SNAPSHOT</version>
25 |     <relativePath>../pom.xml</relativePath>
26 |   </parent>
27 |
28 |   <groupId>org.apache.bahir</groupId>
29 |   <artifactId>spark-streaming-akka_2.12</artifactId>
30 |   <properties>
31 |     <sbt.project.name>streaming-akka</sbt.project.name>
32 |   </properties>
33 |   <packaging>jar</packaging>
34 |   <name>Apache Bahir - Spark Streaming Akka</name>
35 |   <url>http://bahir.apache.org/</url>
36 |
37 |   <dependencies>
38 |     <dependency>
39 |       <groupId>org.apache.bahir</groupId>
40 |       <artifactId>bahir-common_${scala.binary.version}</artifactId>
41 |       <version>${project.version}</version>
42 |       <type>test-jar</type>
43 |       <scope>test</scope>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>org.apache.spark</groupId>
47 |       <artifactId>spark-tags_${scala.binary.version}</artifactId>
48 |     </dependency>
49 |     <dependency>
50 |       <groupId>org.apache.spark</groupId>
51 |       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
52 |       <version>${spark.version}</version>
53 |       <scope>provided</scope>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>org.apache.spark</groupId>
57 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
58 |       <version>${spark.version}</version>
59 |       <type>test-jar</type>
60 |       <scope>test</scope>
61 |     </dependency>
62 |     <dependency>
63 |       <groupId>${akka.group}</groupId>
64 |       <artifactId>akka-actor_${scala.binary.version}</artifactId>
65 |       <version>${akka.version}</version>
66 |     </dependency>
67 |     <dependency>
68 |       <groupId>${akka.group}</groupId>
69 |       <artifactId>akka-remote_${scala.binary.version}</artifactId>
70 |       <version>${akka.version}</version>
71 |     </dependency>
72 |   </dependencies>
73 |   <build>
74 |     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
75 |     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
76 |     <plugins>
77 |       <plugin>
78 |         <groupId>org.apache.maven.plugins</groupId>
79 |         <artifactId>maven-source-plugin</artifactId>
80 |       </plugin>
81 |     </plugins>
82 |   </build>
83 | </project>
84 |
--------------------------------------------------------------------------------
/streaming-akka/src/test/java/org/apache/spark/streaming/akka/JavaAkkaUtilsSuite.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.akka;
19 |
20 | import akka.actor.ActorSystem;
21 | import akka.actor.Props;
22 | import akka.actor.SupervisorStrategy;
23 | import akka.util.Timeout;
24 | import org.junit.Test;
25 |
26 | import org.apache.spark.api.java.function.Function0;
27 | import org.apache.spark.storage.StorageLevel;
28 | import org.apache.spark.streaming.LocalJavaStreamingContext;
29 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
30 |
31 | import java.util.concurrent.TimeUnit;
32 |
33 | public class JavaAkkaUtilsSuite extends LocalJavaStreamingContext {
34 | @Test
35 | public void testAkkaUtils() {
36 | // tests the API, does not actually test data receiving
37 | JavaReceiverInputDStream<String> test1 = AkkaUtils.createStream(
38 | ssc, Props.create(JavaTestActor.class), "test"
39 | );
40 | JavaReceiverInputDStream<String> test2 = AkkaUtils.createStream(
41 | ssc, Props.create(JavaTestActor.class), "test",
42 | StorageLevel.MEMORY_AND_DISK_SER_2()
43 | );
44 | JavaReceiverInputDStream<String> test3 = AkkaUtils.createStream(
45 | ssc, Props.create(JavaTestActor.class), "test",
46 | StorageLevel.MEMORY_AND_DISK_SER_2(), new ActorSystemCreatorForTest(),
47 | SupervisorStrategy.defaultStrategy()
48 | );
49 | }
50 | }
51 |
52 | class ActorSystemCreatorForTest implements Function0<ActorSystem> {
53 | @Override
54 | public ActorSystem call() {
55 | return null;
56 | }
57 | }
58 |
59 | class JavaTestActor extends JavaActorReceiver {
60 | @Override
61 | public void onReceive(Object message) throws Exception {
62 | store((String) message);
63 | store((String) message, new Timeout(1000, TimeUnit.MILLISECONDS));
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/streaming-akka/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark-project.jetty=WARN
28 |
29 |
--------------------------------------------------------------------------------
/streaming-akka/src/test/scala/org/apache/spark/streaming/akka/AkkaUtilsSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.akka
19 |
20 | import scala.concurrent.duration._
21 |
22 | import akka.actor.{Props, SupervisorStrategy}
23 |
24 | import org.apache.spark.SparkFunSuite
25 | import org.apache.spark.storage.StorageLevel
26 | import org.apache.spark.streaming.{Seconds, StreamingContext}
27 | import org.apache.spark.streaming.dstream.ReceiverInputDStream
28 |
29 | class AkkaUtilsSuite extends SparkFunSuite {
30 |
31 | test("createStream") {
32 | val ssc: StreamingContext = new StreamingContext("local[2]", "test", Seconds(1000))
33 | try {
34 | // tests the API, does not actually test data receiving
35 | val test1: ReceiverInputDStream[String] = AkkaUtils.createStream(
36 | ssc, Props[TestActor](), "test")
37 | val test2: ReceiverInputDStream[String] = AkkaUtils.createStream(
38 | ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2)
39 | val test3: ReceiverInputDStream[String] = AkkaUtils.createStream(
40 | ssc,
41 | Props[TestActor](),
42 | "test",
43 | StorageLevel.MEMORY_AND_DISK_SER_2,
44 | supervisorStrategy = SupervisorStrategy.defaultStrategy)
45 | val test4: ReceiverInputDStream[String] = AkkaUtils.createStream(
46 | ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null)
47 | val test5: ReceiverInputDStream[String] = AkkaUtils.createStream(
48 | ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null)
49 | val test6: ReceiverInputDStream[String] = AkkaUtils.createStream(
50 | ssc,
51 | Props[TestActor](),
52 | "test",
53 | StorageLevel.MEMORY_AND_DISK_SER_2,
54 | () => null,
55 | SupervisorStrategy.defaultStrategy)
56 | } finally {
57 | ssc.stop()
58 | }
59 | }
60 | }
61 |
62 | class TestActor extends ActorReceiver {
63 | override def receive: Receive = {
64 | case m: String => store(m)
65 | case m => store(m, 10.seconds)
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/streaming-mqtt/examples/src/main/python/streaming/mqtt_wordcount.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | """
19 | A sample wordcount with MqttStream stream
20 | Usage: mqtt_wordcount.py <broker url> <topic>
21 |
22 | To run this on your local machine, you need to set up an MQTT broker and publisher first,
23 | such as Mosquitto (http://mosquitto.org/), an easy-to-use and easy-to-install open source MQTT broker.
24 | On Mac OS, Mosquitto can be installed with Homebrew: `$ brew install mosquitto`.
25 | On Ubuntu, Mosquitto can be installed with `$ sudo apt-get install mosquitto`.
26 |
27 | Alternatively, the Eclipse paho project provides a number of clients and utilities for
28 | working with MQTT, see http://www.eclipse.org/paho/#getting-started
29 |
30 | How to run this example locally:
31 |
32 | (1) Start Mqtt message broker/server, i.e. Mosquitto:
33 |
34 | `$ mosquitto -p 1883`
35 |
36 | (2) Run the publisher:
37 |
38 | `$ bin/run-example \
39 | org.apache.spark.examples.streaming.mqtt.MQTTPublisher tcp://localhost:1883 foo`
40 |
41 | (3) Run the example:
42 |
43 | `$ bin/run-example \
44 | streaming-mqtt/examples/src/main/python/streaming/mqtt_wordcount.py tcp://localhost:1883 foo`
45 | """
46 |
47 | import sys
48 |
49 | from pyspark import SparkContext
50 | from pyspark.streaming import StreamingContext
51 | from mqtt import MQTTUtils
52 |
53 | if __name__ == "__main__":
54 | if len(sys.argv) != 3:
55 | print >> sys.stderr, "Usage: mqtt_wordcount.py <broker url> <topic>"
56 | exit(-1)
57 |
58 | sc = SparkContext(appName="PythonStreamingMQTTWordCount")
59 | ssc = StreamingContext(sc, 1)
60 |
61 | brokerUrl = sys.argv[1]
62 | topic = sys.argv[2]
63 |
64 | lines = MQTTUtils.createStream(ssc, brokerUrl, topic)
65 | counts = lines.flatMap(lambda line: line.split(" ")) \
66 | .map(lambda word: (word, 1)) \
67 | .reduceByKey(lambda a, b: a+b)
68 | counts.pprint()
69 |
70 | ssc.start()
71 | ssc.awaitTermination()
72 |
--------------------------------------------------------------------------------
/streaming-mqtt/src/main/assembly/assembly.xml:
--------------------------------------------------------------------------------
1 |
17 |
18 | test-jar-with-dependencies
19 |
20 | jar
21 |
22 | false
23 |
24 |
25 |
26 | ${project.build.directory}/scala-${scala.binary.version}/test-classes
27 |
28 |
29 |
30 |
31 |
32 |
33 | true
34 | test
35 | true
36 |
37 | org.apache.hadoop:*:jar
38 | org.apache.zookeeper:*:jar
39 | org.apache.avro:*:jar
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/streaming-mqtt/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark-project.jetty=WARN
28 |
29 |
--------------------------------------------------------------------------------
/streaming-pubnub/examples/src/main/scala/org/apache/spark/examples/streaming/pubnub/PubNubWordCount.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.streaming.pubnub
19 |
20 | import com.google.gson.JsonParser
21 | import com.pubnub.api.PNConfiguration
22 | import com.pubnub.api.enums.PNReconnectionPolicy
23 |
24 | import org.apache.spark.SparkConf
25 | import org.apache.spark.storage.StorageLevel
26 | import org.apache.spark.streaming.Milliseconds
27 | import org.apache.spark.streaming.StreamingContext
28 | import org.apache.spark.streaming.dstream.ReceiverInputDStream
29 | import org.apache.spark.streaming.pubnub.{PubNubUtils, SparkPubNubMessage}
30 |
31 | /**
32 | * Consumes messages from a PubNub channel and calculates word count.
33 | * For demo purpose, login to PubNub account and produce messages using Debug Console.
34 | * Expected message format: {"text": "Hello, World!"}
35 | *
36 | * Usage: PubNubWordCount
37 | * subscribe key
38 | * channel
39 | * aggregation period in milliseconds
40 | *
41 | * Example:
42 | * $ bin/run-example \
43 | * org.apache.spark.examples.streaming.pubnub.PubNubWordCount \
44 | * sub-c-2d245192-ee8d-11e8-b4c3-46cd67be4fbd my-channel 60000
45 | */
46 | object PubNubWordCount {
47 | def main(args: Array[String]): Unit = {
48 | if (args.length != 3) {
49 | // scalastyle:off println
50 | System.err.println(
51 | """
52 | |Usage: PubNubWordCount
53 | |
54 | | subscribe key
55 | | channel
56 | | aggregation period in milliseconds
57 | |
58 | """.stripMargin
59 | )
60 | // scalastyle:on
61 | System.exit(1)
62 | }
63 |
64 | val Seq(subscribeKey, channel, aggregationPeriod) = args.toSeq
65 |
66 | val sparkConf = new SparkConf().setAppName("PubNubWordCount").setMaster("local[2]")
67 | val ssc = new StreamingContext(sparkConf, Milliseconds(aggregationPeriod.toLong))
68 |
69 | val config = new PNConfiguration
70 | config.setSubscribeKey(subscribeKey)
71 | config.setSecure(true)
72 | config.setReconnectionPolicy(PNReconnectionPolicy.LINEAR)
73 |
74 | val pubNubStream: ReceiverInputDStream[SparkPubNubMessage] = PubNubUtils.createStream(
75 | ssc, config, Seq(channel), Seq(), None, StorageLevel.MEMORY_AND_DISK_SER_2)
76 |
77 | val wordCounts = pubNubStream
78 | .flatMap(
79 | message => new JsonParser().parse(message.getPayload)
80 | .getAsJsonObject.get("text").getAsString.split("\\s")
81 | )
82 | .map(word => (word, 1))
83 | .reduceByKey(_ + _)
84 |
85 | wordCounts.print()
86 |
87 | ssc.start()
88 | ssc.awaitTermination()
89 | }
90 | }
91 |
92 |
--------------------------------------------------------------------------------
/streaming-pubnub/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
20 |   <modelVersion>4.0.0</modelVersion>
21 |   <parent>
22 |     <artifactId>bahir-parent_2.12</artifactId>
23 |     <groupId>org.apache.bahir</groupId>
24 |     <version>3.0.0-SNAPSHOT</version>
25 |     <relativePath>../pom.xml</relativePath>
26 |   </parent>
27 |
28 |   <artifactId>spark-streaming-pubnub_2.12</artifactId>
29 |   <properties>
30 |     <bahir.module.name>streaming-pubnub</bahir.module.name>
31 |   </properties>
32 |   <packaging>jar</packaging>
33 |   <name>Apache Bahir - Spark Streaming PubNub</name>
34 |   <url>http://bahir.apache.org/</url>
35 |
36 |   <dependencies>
37 |     <dependency>
38 |       <groupId>org.apache.bahir</groupId>
39 |       <artifactId>bahir-common_${scala.binary.version}</artifactId>
40 |       <version>${project.version}</version>
41 |       <type>test-jar</type>
42 |       <scope>test</scope>
43 |     </dependency>
44 |     <dependency>
45 |       <groupId>org.apache.spark</groupId>
46 |       <artifactId>spark-tags_${scala.binary.version}</artifactId>
47 |     </dependency>
48 |     <dependency>
49 |       <groupId>org.apache.spark</groupId>
50 |       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
51 |       <version>${spark.version}</version>
52 |       <scope>provided</scope>
53 |     </dependency>
54 |     <dependency>
55 |       <groupId>com.pubnub</groupId>
56 |       <artifactId>pubnub-gson</artifactId>
57 |       <version>4.21.0</version>
58 |     </dependency>
59 |     <dependency>
60 |       <groupId>org.apache.spark</groupId>
61 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
62 |       <version>${spark.version}</version>
63 |       <type>test-jar</type>
64 |       <scope>test</scope>
65 |     </dependency>
66 |     <dependency>
67 |       <groupId>org.scalacheck</groupId>
68 |       <artifactId>scalacheck_${scala.binary.version}</artifactId>
69 |       <scope>test</scope>
70 |     </dependency>
71 |   </dependencies>
72 |
73 |   <build>
74 |     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
75 |     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
76 |     <plugins>
77 |       <plugin>
78 |         <groupId>org.apache.maven.plugins</groupId>
79 |         <artifactId>maven-source-plugin</artifactId>
80 |       </plugin>
81 |     </plugins>
82 |   </build>
83 | </project>
84 |
--------------------------------------------------------------------------------
/streaming-pubnub/src/main/scala/org/apache/spark/streaming/pubnub/PubNubUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.pubnub
19 |
20 | import java.util.{Set => JSet}
21 |
22 | import collection.JavaConverters._
23 | import com.pubnub.api.PNConfiguration
24 |
25 | import org.apache.spark.storage.StorageLevel
26 | import org.apache.spark.streaming.StreamingContext
27 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream
28 | import org.apache.spark.streaming.api.java.JavaStreamingContext
29 | import org.apache.spark.streaming.dstream.ReceiverInputDStream
30 |
31 | object PubNubUtils {
32 | /**
33 | * Create an input stream that returns messages received from PubNub infrastructure.
34 | * @param ssc Streaming context
35 | * @param configuration PubNub client configuration
36 | * @param channels Sequence of channels to subscribe
37 | * @param channelGroups Sequence of channel groups to subscribe
38 | * @param timeToken Optional point in time to start receiving messages from.
39 | * Leave undefined to get only latest messages.
40 | * @param storageLevel Storage level to use for storing the received objects
41 | * @return Input stream
42 | */
43 | def createStream(
44 | ssc: StreamingContext,
45 | configuration: PNConfiguration,
46 | channels: Seq[String],
47 | channelGroups: Seq[String],
48 | timeToken: Option[Long] = None,
49 | storageLevel: StorageLevel): ReceiverInputDStream[SparkPubNubMessage] = {
50 | ssc.withNamedScope("PubNub Stream") {
51 | new PubNubInputDStream(
52 | ssc, configuration, channels, channelGroups, timeToken, storageLevel
53 | )
54 | }
55 | }
56 |
57 | /**
58 | * Create an input stream that returns messages received from PubNub infrastructure.
59 | * @param jssc Java streaming context
60 | * @param configuration PubNub client configuration
61 | * @param channels Set of channels to subscribe
62 | * @param channelGroups Set of channel groups to subscribe
63 | * @param timeToken Optional point in time to start receiving messages from.
64 | * Specify null to get only latest messages.
65 | * @param storageLevel Storage level to use for storing the received objects
66 | * @return Input stream
67 | */
68 | def createStream(
69 | jssc: JavaStreamingContext,
70 | configuration: PNConfiguration,
71 | channels: JSet[String],
72 | channelGroups: JSet[String],
73 | timeToken: Option[Long],
74 | storageLevel: StorageLevel): JavaReceiverInputDStream[SparkPubNubMessage] = {
75 | createStream(
76 | jssc.ssc, configuration, Seq.empty ++ channels.asScala,
77 | Seq.empty ++ channelGroups.asScala, timeToken, storageLevel
78 | )
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/streaming-pubnub/src/test/java/org/apache/spark/streaming/pubnub/JavaPubNubStreamSuite.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.pubnub;
19 |
20 | import com.pubnub.api.PNConfiguration;
21 | import org.apache.spark.storage.StorageLevel;
22 | import org.apache.spark.streaming.LocalJavaStreamingContext;
23 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
24 | import org.junit.Test;
25 |
26 | import java.util.HashSet;
27 |
28 | public class JavaPubNubStreamSuite extends LocalJavaStreamingContext {
29 | @Test
30 | public void testPubNubStream() {
31 | // Tests the API compatibility, but do not actually receive any data.
32 | JavaReceiverInputDStream<SparkPubNubMessage> stream = PubNubUtils.createStream(
33 | ssc, new PNConfiguration(), new HashSet<>(), new HashSet<>(), null,
34 | StorageLevel.MEMORY_AND_DISK_SER_2()
35 | );
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/streaming-pubnub/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | log4j.rootCategory=INFO, console, file
19 |
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.out
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.conversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
24 |
25 | log4j.appender.file=org.apache.log4j.FileAppender
26 | log4j.appender.file.append=true
27 | log4j.appender.file.file=target/unit-tests.log
28 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
29 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
30 |
31 | # Ignore messages below warning level from Jetty, because it's a bit verbose
32 | log4j.logger.org.spark-project.jetty=WARN
33 |
34 |
--------------------------------------------------------------------------------
/streaming-pubnub/src/test/scala/org/apache/spark/streaming/pubnub/MessageSerializationSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.pubnub
19 |
20 | import java.io.ByteArrayInputStream
21 | import java.io.ByteArrayOutputStream
22 | import java.io.ObjectInputStream
23 | import java.io.ObjectOutputStream
24 |
25 | import com.google.gson.JsonParser
26 | import com.pubnub.api.models.consumer.pubsub.PNMessageResult
27 |
28 | import org.apache.spark.SparkFunSuite
29 |
30 | class MessageSerializationSuite extends SparkFunSuite {
31 | test("Full example") {
32 | checkMessageSerialization(
33 | "{\"message\":\"Hello, World!\"}", "channel1",
34 | "publisher1", "subscription1", System.currentTimeMillis * 10000
35 | )
36 | }
37 |
38 | test("Message from channel") {
39 | checkMessageSerialization("{\"message\":\"Hello, World!\"}", "c", "p", null, 13534398158620385L)
40 | }
41 |
42 | test("Message from subscription") {
43 | checkMessageSerialization("{\"message\":\"Hello, World!\"}", null, "p", "s", 13534397812467596L)
44 | }
45 |
46 | def checkMessageSerialization(payload: String, channel: String,
47 | publisher: String, subscription: String, timestamp: Long): Unit = {
48 | val builder = PNMessageResult.builder
49 | .message(if (payload != null) new JsonParser().parse(payload) else null)
50 | .channel(channel)
51 | .publisher(publisher)
52 | .subscription(subscription)
53 | .timetoken(timestamp)
54 | val pubNubMessage = builder.build()
55 | val sparkMessage = new SparkPubNubMessage
56 | sparkMessage.message = pubNubMessage
57 |
58 | // serializer
59 | val byteOutStream = new ByteArrayOutputStream
60 | val outputStream = new ObjectOutputStream(byteOutStream)
61 | outputStream.writeObject(sparkMessage)
62 | outputStream.flush()
63 | outputStream.close()
64 | byteOutStream.close()
65 | val serializedBytes = byteOutStream.toByteArray
66 |
67 | // deserialize
68 | val byteInStream = new ByteArrayInputStream(serializedBytes)
69 | val inputStream = new ObjectInputStream(byteInStream)
70 | val deserializedMessage = inputStream.readObject().asInstanceOf[SparkPubNubMessage]
71 | inputStream.close()
72 | byteInStream.close()
73 |
74 | assert(payload.equals(deserializedMessage.getPayload))
75 | if (channel != null) {
76 | assert(channel.equals(deserializedMessage.getChannel))
77 | } else {
78 | assert(deserializedMessage.getChannel == null)
79 | }
80 | if (subscription != null) {
81 | assert(subscription.equals(deserializedMessage.getSubscription))
82 | } else {
83 | assert(deserializedMessage.getSubscription == null)
84 | }
85 | assert(publisher.equals(deserializedMessage.getPublisher))
86 | val unixTimestamp = Math.ceil(timestamp / 10000).longValue()
87 | assert(unixTimestamp.equals(deserializedMessage.getTimestamp))
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/streaming-pubsub/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
20 |   <modelVersion>4.0.0</modelVersion>
21 |   <parent>
22 |     <artifactId>bahir-parent_2.12</artifactId>
23 |     <groupId>org.apache.bahir</groupId>
24 |     <version>3.0.0-SNAPSHOT</version>
25 |     <relativePath>../pom.xml</relativePath>
26 |   </parent>
27 |
28 |   <groupId>org.apache.bahir</groupId>
29 |   <artifactId>spark-streaming-pubsub_2.12</artifactId>
30 |   <properties>
31 |     <bahir.module.name>streaming-pubsub</bahir.module.name>
32 |   </properties>
33 |   <packaging>jar</packaging>
34 |   <name>Apache Bahir - Spark Streaming Google PubSub</name>
35 |   <url>http://bahir.apache.org/</url>
36 |
37 |   <dependencies>
38 |     <dependency>
39 |       <groupId>org.apache.bahir</groupId>
40 |       <artifactId>bahir-common_${scala.binary.version}</artifactId>
41 |       <version>${project.version}</version>
42 |       <type>test-jar</type>
43 |       <scope>test</scope>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>org.apache.spark</groupId>
47 |       <artifactId>spark-tags_${scala.binary.version}</artifactId>
48 |     </dependency>
49 |     <dependency>
50 |       <groupId>org.apache.spark</groupId>
51 |       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
52 |       <version>${spark.version}</version>
53 |       <scope>provided</scope>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>com.google.apis</groupId>
57 |       <artifactId>google-api-services-pubsub</artifactId>
58 |       <version>v1-rev355-1.22.0</version>
59 |     </dependency>
60 |     <dependency>
61 |       <groupId>com.google.cloud.bigdataoss</groupId>
62 |       <artifactId>util</artifactId>
63 |       <version>1.6.0</version>
64 |     </dependency>
65 |     <dependency>
66 |       <groupId>org.apache.spark</groupId>
67 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
68 |       <version>${spark.version}</version>
69 |       <type>test-jar</type>
70 |       <scope>test</scope>
71 |     </dependency>
72 |     <dependency>
73 |       <groupId>org.scalacheck</groupId>
74 |       <artifactId>scalacheck_${scala.binary.version}</artifactId>
75 |       <scope>test</scope>
76 |     </dependency>
77 |     <dependency>
78 |       <groupId>com.google.http-client</groupId>
79 |       <artifactId>google-http-client-jackson</artifactId>
80 |       <version>1.22.0</version>
81 |     </dependency>
82 |   </dependencies>
83 |   <build>
84 |     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
85 |     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
86 |     <plugins>
87 |       <plugin>
88 |         <groupId>org.apache.maven.plugins</groupId>
89 |         <artifactId>maven-source-plugin</artifactId>
90 |       </plugin>
91 |     </plugins>
92 |   </build>
93 | </project>
94 |
--------------------------------------------------------------------------------
/streaming-pubsub/src/test/java/org/apache/spark/streaming/pubsub/JavaPubsubStreamSuite.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.pubsub;
19 |
20 | import org.apache.spark.storage.StorageLevel;
21 | import org.apache.spark.streaming.LocalJavaStreamingContext;
22 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
23 | import org.junit.Test;
24 |
25 | public class JavaPubsubStreamSuite extends LocalJavaStreamingContext {
26 | @Test
27 | public void testPubsubStream() {
28 | // tests the API, does not actually test data receiving
29 | JavaReceiverInputDStream<SparkPubsubMessage> stream1 = PubsubUtils.createStream(
30 | ssc, "project", "subscription",
31 | new SparkGCPCredentials.Builder().build(), StorageLevel.MEMORY_AND_DISK_SER_2());
32 |
33 | JavaReceiverInputDStream<SparkPubsubMessage> stream2 = PubsubUtils.createStream(
34 | ssc, "project", "topic", "subscription",
35 | new SparkGCPCredentials.Builder().build(), StorageLevel.MEMORY_AND_DISK_SER_2());
36 |
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/streaming-pubsub/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark-project.jetty=WARN
28 |
29 |
--------------------------------------------------------------------------------
/streaming-twitter/README.md:
--------------------------------------------------------------------------------
1 |
19 | # Spark Streaming Twitter Connector
20 |
21 | A library for reading social data from [twitter](http://twitter.com/) using Spark Streaming.
22 |
23 | ## Linking
24 |
25 | Using SBT:
26 |
27 | libraryDependencies += "org.apache.bahir" %% "spark-streaming-twitter" % "{{site.SPARK_VERSION}}"
28 |
29 | Using Maven:
30 |
31 |     <dependency>
32 |         <groupId>org.apache.bahir</groupId>
33 |         <artifactId>spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}}</artifactId>
34 |         <version>{{site.SPARK_VERSION}}</version>
35 |     </dependency>
36 |
37 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option.
38 | For example, to include it when starting the spark shell:
39 |
40 | $ bin/spark-shell --packages org.apache.bahir:spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION}}
41 |
42 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath.
43 | The `--packages` argument can also be used with `bin/spark-submit`.
44 |
45 | This library is cross-published for Scala 2.11 and Scala 2.12, so users should select the proper Scala version in the commands listed above.
46 |
47 |
48 | ## Examples
49 |
50 | `TwitterUtils` uses Twitter4j to get the public stream of tweets using [Twitter's Streaming API](https://dev.twitter.com/docs/streaming-apis). Authentication information
51 | can be provided by any of the [methods](http://twitter4j.org/en/configuration.html) supported by the Twitter4J library. You can import the `TwitterUtils` class and create a DStream with `TwitterUtils.createStream` as shown below.
52 |
53 | ### Scala API
54 |
55 | import org.apache.spark.streaming.twitter._
56 |
57 | TwitterUtils.createStream(ssc, None)
58 |
59 | ### Java API
60 |
61 | import org.apache.spark.streaming.twitter.*;
62 |
63 | TwitterUtils.createStream(jssc);
64 |
65 |
66 | You can either get the public stream or a stream filtered by keywords, as sketched below.
67 | See end-to-end examples at [Twitter Examples](https://github.com/apache/bahir/tree/master/streaming-twitter/examples).
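
For instance, a keyword-filtered stream of tweet texts can be printed with a few lines of Scala. This is a minimal sketch; it assumes an existing `StreamingContext` named `ssc` and Twitter4J OAuth credentials supplied through system properties:

    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.twitter._

    // Only tweets matching at least one of these keywords are delivered.
    val filters = Seq("spark", "bahir")
    val tweets = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    tweets.map(status => status.getText).print()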
68 |
69 | ## Unit Test
70 |
71 | Executing the integration tests requires users to register a custom application at the
72 | [Twitter Developer Portal](https://developer.twitter.com) and obtain private OAuth credentials.
73 | The listing below shows how to run the complete test suite on a local workstation.
74 |
75 | cd streaming-twitter
76 | env ENABLE_TWITTER_TESTS=1 \
77 | twitter4j.oauth.consumerKey=${consumer key} \
78 | twitter4j.oauth.consumerSecret=${consumer secret} \
79 | twitter4j.oauth.accessToken=${access token} \
80 | twitter4j.oauth.accessTokenSecret=${access token secret} \
81 | mvn clean test
82 |
--------------------------------------------------------------------------------
/streaming-twitter/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
20 |   <modelVersion>4.0.0</modelVersion>
21 |   <parent>
22 |     <groupId>org.apache.bahir</groupId>
23 |     <artifactId>bahir-parent_2.12</artifactId>
24 |     <version>3.0.0-SNAPSHOT</version>
25 |     <relativePath>../pom.xml</relativePath>
26 |   </parent>
27 |
28 |   <groupId>org.apache.bahir</groupId>
29 |   <artifactId>spark-streaming-twitter_2.12</artifactId>
30 |   <properties>
31 |     <bahir.module.name>streaming-twitter</bahir.module.name>
32 |   </properties>
33 |   <packaging>jar</packaging>
34 |   <name>Apache Bahir - Spark Streaming Twitter</name>
35 |   <url>http://bahir.apache.org/</url>
36 |
37 |   <dependencies>
38 |     <dependency>
39 |       <groupId>org.apache.bahir</groupId>
40 |       <artifactId>bahir-common_${scala.binary.version}</artifactId>
41 |       <version>${project.version}</version>
42 |       <type>test-jar</type>
43 |       <scope>test</scope>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>org.apache.spark</groupId>
47 |       <artifactId>spark-tags_${scala.binary.version}</artifactId>
48 |     </dependency>
49 |     <dependency>
50 |       <groupId>org.apache.spark</groupId>
51 |       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
52 |       <version>${spark.version}</version>
53 |       <scope>provided</scope>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>org.apache.spark</groupId>
57 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
58 |       <version>${spark.version}</version>
59 |       <type>test-jar</type>
60 |       <scope>test</scope>
61 |     </dependency>
62 |     <dependency>
63 |       <groupId>org.twitter4j</groupId>
64 |       <artifactId>twitter4j-stream</artifactId>
65 |       <version>4.0.6</version>
66 |     </dependency>
67 |     <dependency>
68 |       <groupId>org.scalacheck</groupId>
69 |       <artifactId>scalacheck_${scala.binary.version}</artifactId>
70 |       <scope>test</scope>
71 |     </dependency>
72 |     <dependency>
73 |       <groupId>com.twitter</groupId>
74 |       <artifactId>algebird-core_${scala.binary.version}</artifactId>
75 |       <version>0.12.4</version>
76 |       <scope>test</scope>
77 |     </dependency>
78 |   </dependencies>
79 |   <build>
80 |     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
81 |     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
82 |     <plugins>
83 |       <plugin>
84 |         <groupId>org.apache.maven.plugins</groupId>
85 |         <artifactId>maven-source-plugin</artifactId>
86 |       </plugin>
87 |     </plugins>
88 |   </build>
89 | </project>
90 |
--------------------------------------------------------------------------------
/streaming-twitter/src/test/java/org/apache/spark/streaming/twitter/JavaTwitterStreamSuite.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.twitter;
19 |
20 | import org.junit.Test;
21 | import twitter4j.FilterQuery;
22 | import twitter4j.Status;
23 | import twitter4j.auth.Authorization;
24 | import twitter4j.auth.NullAuthorization;
25 | import org.apache.spark.storage.StorageLevel;
26 | import org.apache.spark.streaming.LocalJavaStreamingContext;
27 | import org.apache.spark.streaming.api.java.JavaDStream;
28 |
29 | public class JavaTwitterStreamSuite extends LocalJavaStreamingContext {
30 | @Test
31 | public void testTwitterStream() {
32 | String[] filters = { "filter1", "filter2" };
33 | Authorization auth = NullAuthorization.getInstance();
34 | FilterQuery query = new FilterQuery().language("en,es");
35 |
36 | // tests the API, does not actually test data receiving
37 | JavaDStream<Status> test1 = TwitterUtils.createStream(ssc);
38 | JavaDStream<Status> test2 = TwitterUtils.createStream(ssc, filters);
39 | JavaDStream<Status> test3 = TwitterUtils.createStream(
40 | ssc, filters, StorageLevel.MEMORY_AND_DISK_SER_2());
41 | JavaDStream<Status> test4 = TwitterUtils.createStream(ssc, auth);
42 | JavaDStream<Status> test5 = TwitterUtils.createStream(ssc, auth, filters);
43 | JavaDStream<Status> test6 = TwitterUtils.createStream(ssc,
44 | auth, filters, StorageLevel.MEMORY_AND_DISK_SER_2());
45 | JavaDStream<Status> test7 = TwitterUtils.createFilteredStream(ssc,
46 | auth, query, StorageLevel.MEMORY_AND_DISK_SER_2());
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/streaming-twitter/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.spark-project.jetty=WARN
28 |
29 |
--------------------------------------------------------------------------------
/streaming-zeromq/README.md:
--------------------------------------------------------------------------------
1 |
19 | # Spark Streaming ZeroMQ Connector
20 |
21 | A library for reading data from [ZeroMQ](http://zeromq.org/) using Spark Streaming.
22 |
23 | ## Linking
24 |
25 | Using SBT:
26 |
27 | libraryDependencies += "org.apache.bahir" %% "spark-streaming-zeromq" % "{{site.SPARK_VERSION}}"
28 |
29 | Using Maven:
30 |
31 |     <dependency>
32 |         <groupId>org.apache.bahir</groupId>
33 |         <artifactId>spark-streaming-zeromq_{{site.SCALA_BINARY_VERSION}}</artifactId>
34 |         <version>{{site.SPARK_VERSION}}</version>
35 |     </dependency>
36 |
37 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option.
38 | For example, to include it when starting the spark shell:
39 |
40 | $ bin/spark-shell --packages org.apache.bahir:spark-streaming-zeromq_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION}}
41 |
42 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath.
43 | The `--packages` argument can also be used with `bin/spark-submit`.
44 |
45 | This library is cross-published for Scala 2.11 and Scala 2.12, so users should select the proper Scala version in the commands listed above.
46 |
47 | ## Examples
48 |
49 | Review end-to-end examples at [ZeroMQ Examples](https://github.com/apache/bahir/tree/master/streaming-zeromq/examples).
50 |
51 | ### Scala API
52 |
53 | import org.apache.spark.streaming.zeromq.ZeroMQUtils
54 |
55 | val lines = ZeroMQUtils.createTextStream(
56 | ssc, "tcp://server:5555", true, Seq("my-topic".getBytes)
57 | )
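
The resulting `lines` stream is a plain `DStream[String]`, so the usual transformations apply. For example, a minimal word-count sketch (assuming `ssc` is the surrounding `StreamingContext`, which is started afterwards):

    // Split each received message into words and count occurrences per batch.
    val counts = lines.flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.print()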
58 |
59 | ### Java API
60 |
61 | import org.apache.spark.storage.StorageLevel;
62 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
63 | import org.apache.spark.streaming.zeromq.ZeroMQUtils;
64 |
65 | JavaReceiverInputDStream<String> test1 = ZeroMQUtils.createTextJavaStream(
66 | ssc, "tcp://server:5555", true, Arrays.asList("my-topic".getBytes()),
67 | StorageLevel.MEMORY_AND_DISK_SER_2()
68 | );
69 |
--------------------------------------------------------------------------------
/streaming-zeromq/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
20 |   <modelVersion>4.0.0</modelVersion>
21 |   <parent>
22 |     <groupId>org.apache.bahir</groupId>
23 |     <artifactId>bahir-parent_2.12</artifactId>
24 |     <version>3.0.0-SNAPSHOT</version>
25 |     <relativePath>../pom.xml</relativePath>
26 |   </parent>
27 |
28 |   <groupId>org.apache.bahir</groupId>
29 |   <artifactId>spark-streaming-zeromq_2.12</artifactId>
30 |   <properties>
31 |     <bahir.module.name>streaming-zeromq</bahir.module.name>
32 |   </properties>
33 |   <packaging>jar</packaging>
34 |   <name>Apache Bahir - Spark Streaming ZeroMQ</name>
35 |   <url>http://bahir.apache.org/</url>
36 |
37 |   <dependencies>
38 |     <dependency>
39 |       <groupId>org.apache.bahir</groupId>
40 |       <artifactId>bahir-common_${scala.binary.version}</artifactId>
41 |       <version>${project.version}</version>
42 |       <type>test-jar</type>
43 |       <scope>test</scope>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>org.apache.spark</groupId>
47 |       <artifactId>spark-tags_${scala.binary.version}</artifactId>
48 |     </dependency>
49 |     <dependency>
50 |       <groupId>org.apache.spark</groupId>
51 |       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
52 |       <version>${spark.version}</version>
53 |       <scope>provided</scope>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>org.zeromq</groupId>
57 |       <artifactId>jeromq</artifactId>
58 |       <version>0.4.3</version>
59 |     </dependency>
60 |     <dependency>
61 |       <groupId>org.apache.spark</groupId>
62 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
63 |       <version>${spark.version}</version>
64 |       <type>test-jar</type>
65 |       <scope>test</scope>
66 |     </dependency>
67 |     <dependency>
68 |       <groupId>org.scalacheck</groupId>
69 |       <artifactId>scalacheck_${scala.binary.version}</artifactId>
70 |       <scope>test</scope>
71 |     </dependency>
72 |   </dependencies>
73 |   <build>
74 |     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
75 |     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
76 |     <plugins>
77 |       <plugin>
78 |         <groupId>org.apache.maven.plugins</groupId>
79 |         <artifactId>maven-source-plugin</artifactId>
80 |       </plugin>
81 |     </plugins>
82 |   </build>
83 | </project>
84 |
--------------------------------------------------------------------------------
/streaming-zeromq/src/test/java/org/apache/spark/streaming/zeromq/JavaZeroMQStreamSuite.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.streaming.zeromq;
19 |
20 | import org.junit.Test;
21 |
22 | import org.apache.spark.api.java.function.Function;
23 | import org.apache.spark.storage.StorageLevel;
24 | import org.apache.spark.streaming.LocalJavaStreamingContext;
25 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
26 | import zmq.ZMQ;
27 |
28 | import java.util.Arrays;
29 |
30 | public class JavaZeroMQStreamSuite extends LocalJavaStreamingContext {
31 | @Test
32 | public void testZeroMQAPICompatibility() {
33 | // Test the API, but do not exchange any messages.
34 | final String publishUrl = "tcp://localhost:5555";
35 | final String topic = "topic1";
36 | final Function<byte[][], Iterable<String>> messageConverter =
37 | new Function<byte[][], Iterable<String>>() {
38 | @Override
39 | public Iterable<String> call(byte[][] bytes) throws Exception {
40 | // Skip topic name and assume that each message contains only one frame.
41 | return Arrays.asList(new String(bytes[1], ZMQ.CHARSET));
42 | }
43 | };
44 |
45 | JavaReceiverInputDStream<String> test1 = ZeroMQUtils.createJavaStream(
46 | ssc, publishUrl, true, Arrays.asList(topic.getBytes()), messageConverter,
47 | StorageLevel.MEMORY_AND_DISK_SER_2()
48 | );
49 | JavaReceiverInputDStream<String> test2 = ZeroMQUtils.createTextJavaStream(
50 | ssc, publishUrl, true, Arrays.asList(topic.getBytes()),
51 | StorageLevel.MEMORY_AND_DISK_SER_2()
52 | );
53 | }
54 | }
55 |
56 |
--------------------------------------------------------------------------------
/streaming-zeromq/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | log4j.rootCategory=INFO, console, file
19 |
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.out
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.conversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
24 |
25 | log4j.appender.file=org.apache.log4j.FileAppender
26 | log4j.appender.file.append=true
27 | log4j.appender.file.file=target/unit-tests.log
28 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
29 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
30 |
31 | # Ignore messages below warning level from Jetty, because it's a bit verbose
32 | log4j.logger.org.spark-project.jetty=WARN
33 |
34 |
--------------------------------------------------------------------------------