├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── dev
└── checkstyle.xml
├── examples
└── src
│ └── main
│ └── scala
│ └── org
│ └── apache
│ └── spark
│ └── examples
│ └── sql
│ └── streaming
│ └── sqs
│ └── SqsSourceExample.scala
├── pom.xml
├── scalastyle-config.xml
└── src
├── main
├── java
│ └── org
│ │ └── apache
│ │ └── spark
│ │ └── sql
│ │ └── streaming
│ │ └── sqs
│ │ ├── BasicAWSCredentialsProvider.java
│ │ └── InstanceProfileCredentialsProviderWithRetries.java
├── resources
│ ├── META-INF
│ │ └── services
│ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ └── log4j.properties
└── scala
│ └── org
│ └── apache
│ └── spark
│ └── sql
│ └── streaming
│ └── sqs
│ ├── SqsClient.scala
│ ├── SqsFileCache.scala
│ ├── SqsSource.scala
│ ├── SqsSourceOptions.scala
│ └── SqsSourceProvider.scala
└── test
├── resources
└── log4j.properties
└── scala
└── org
└── apache
└── spark
└── sql
└── streaming
└── sqs
└── SqsSourceOptionsSuite.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | *#*#
2 | *.#*
3 | *.swm
4 | *.swn
5 | *.swk
6 | *.swl
7 | *.swo
8 | *.swp
9 | *.ipr
10 | *.iml
11 | *.ipr
12 | *.iws
13 | *.pyc
14 | *.pyo
15 | *.swp
16 | *~
17 | .DS_Store
18 | .cache
19 | .classpath
20 | .ensime
21 | .ensime_cache/
22 | .ensime_lucene
23 | .generated-mima*
24 | .idea/
25 | .idea_modules/
26 | .project
27 | .pydevproject
28 | .scala_dependencies
29 | .settings
30 | /lib/
31 | R-unit-tests.log
32 | R/unit-tests.out
33 | R/cran-check.out
34 | R/pkg/vignettes/sparkr-vignettes.html
35 | build/*.jar
36 | build/apache-maven*
37 | build/scala*
38 | build/zinc*
39 | cache
40 | checkpoint
41 | conf/*.cmd
42 | conf/*.conf
43 | conf/*.properties
44 | conf/*.sh
45 | conf/*.xml
46 | conf/java-opts
47 | conf/slaves
48 | dependency-reduced-pom.xml
49 | derby.log
50 | dev/create-release/*final
51 | dev/create-release/*txt
52 | dev/pr-deps/
53 | dist/
54 | docs/_site
55 | docs/api
56 | lib_managed/
57 | lint-r-report.log
58 | log/
59 | logs/
60 | out/
61 | project/boot/
62 | project/build/target/
63 | project/plugins/lib_managed/
64 | project/plugins/project/build.properties
65 | project/plugins/src_managed/
66 | project/plugins/target/
67 | python/lib/pyspark.zip
68 | python/deps
69 | python/pyspark/python
70 | reports/
71 | scalastyle-on-compile.generated.xml
72 | scalastyle-output.xml
73 | scalastyle.txt
74 | spark-*-bin-*.tgz
75 | spark-tests.log
76 | src_managed/
77 | streaming-tests.log
78 | target/
79 | unit-tests.log
80 | work/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | language: java
19 |
20 | jdk:
21 | - openjdk8
22 | - oraclejdk8
23 |
24 | dist: trusty
25 |
26 | script:
27 | - mvn -DskipTests clean install
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # S3-SQS Connector
2 |
3 | [](https://travis-ci.org/qubole/s3-sqs-connector)
4 |
5 | A library for reading data from Amazon S3 with optimised file listing via Amazon SQS, using Spark SQL Streaming (Structured Streaming).
6 |
7 | ## Linking
8 |
9 | Using SBT:
10 |
11 | libraryDependencies += "com.qubole" % "spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}" % "{{site.PROJECT_VERSION}}"
12 |
13 | Using Maven:
14 |
15 |     <dependency>
16 |         <groupId>com.qubole</groupId>
17 |         <artifactId>spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}</artifactId>
18 |         <version>{{site.PROJECT_VERSION}}</version>
19 |     </dependency>
20 |
21 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option.
22 | For example, to include it when starting the spark shell:
23 |
24 | $ bin/spark-shell --packages com.qubole:spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}:{{site.PROJECT_VERSION}}
25 |
26 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath.
27 | The `--packages` argument can also be used with `bin/spark-submit`.
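
For example, a `spark-submit` invocation might look like the following (the application jar and main class are illustrative placeholders for your own job):

    $ bin/spark-submit --packages com.qubole:spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}:{{site.PROJECT_VERSION}} \
        --class com.example.MyStreamingApp my-streaming-app.jar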
28 |
29 | This library is compiled for Scala 2.11 only, and is intended to support Spark 2.4.0 onwards.
30 |
31 | ## Building S3-SQS Connector
32 |
33 | S3-SQS Connector is built using [Apache Maven](http://maven.apache.org/).
34 |
35 | To build S3-SQS connector, clone this repository and run:
36 | ```
37 | mvn -DskipTests clean package
38 | ```
39 |
40 | This will create the `target/spark-sql-streaming-sqs_2.11-0.5.1.jar` file, which contains the S3-SQS connector code and its dependencies. Make sure the Scala and Java versions match those required by your Spark cluster. We have tested it with Java 7/8, Scala 2.11 and Spark 2.4.0.
41 |
42 |
43 | ## Configuration options
44 | The configuration is supplied through source options, i.e. the parameters passed to `option(...)` when creating the stream reader.
45 |
46 | Name |Default | Meaning
47 | --- |:---:| ---
48 | sqsUrl|required, no default value|SQS queue URL, e.g. 'https://sqs.us-east-1.amazonaws.com/330183209093/TestQueue'
49 | region|required, no default value|AWS region where the queue is created
50 | fileFormat|required, no default value|File format of the files stored on Amazon S3
51 | schema|required, no default value|Schema of the data being read
52 | sqsFetchIntervalSeconds|10|Time interval (in seconds) after which to fetch messages from the Amazon SQS queue
53 | sqsLongPollingWaitTimeSeconds|20|Wait time (in seconds) for long polling on the Amazon SQS queue
54 | sqsMaxConnections|1|Number of parallel threads used to connect to the Amazon SQS queue
55 | sqsMaxRetries|10|Maximum number of consecutive retries on connection failure to SQS before giving up
56 | ignoreFileDeletion|false|Whether to ignore file-deletion messages in the SQS queue
57 | fileNameOnly|false|Whether to identify new files based on the filename only instead of the full path
58 | shouldSortFiles|true|Whether to sort files by timestamp while listing them from SQS
59 | useInstanceProfileCredentials|false|Whether to use EC2 instance profile credentials for connecting to Amazon SQS
60 | maxFilesPerTrigger|no default value|Maximum number of files to process in a micro-batch
61 | maxFileAge|7d|Maximum age of a tracked file; older entries are purged from the file cache
62 |
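When `useInstanceProfileCredentials` is false, the SQS client falls back to AWS keys read from the Hadoop configuration (see `SqsClient.createSqsClient`). A minimal sketch of providing them programmatically; the key values below are placeholders:

    // Placeholders: substitute your own AWS credentials, or preferably run on EC2
    // with instance profile credentials instead of hard-coding keys.
    spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "<your-access-key>")
    spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "<your-secret-key>")
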
63 | ## Example
64 |
65 | An example of creating a streaming DataFrame that uses Amazon SQS to list new files on S3:
66 |
67 | val inputDf = sparkSession
68 | .readStream
69 | .format("s3-sqs")
70 | .schema(schema)
71 | .option("sqsUrl", queueUrl)
72 | .option("region", awsRegion)
73 | .option("fileFormat", "json")
74 | .option("sqsFetchIntervalSeconds", "2")
75 | .option("useInstanceProfileCredentials", "true")
76 | .option("sqsLongPollingWaitTimeSeconds", "5")
77 | .load()
78 |
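The DataFrame returned by `load()` can be written out with any standard Structured Streaming sink. A minimal sketch using the file (Parquet) sink; the output path and checkpoint location are placeholders:

    val query = inputDf
      .writeStream
      .format("parquet")
      .option("path", "s3://your-output-bucket/data/")                     // placeholder output location
      .option("checkpointLocation", "s3://your-output-bucket/checkpoint/") // placeholder checkpoint dir
      .start()

    query.awaitTermination()
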
--------------------------------------------------------------------------------
/dev/checkstyle.xml:
--------------------------------------------------------------------------------
1 |
17 |
18 |
21 |
22 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
123 |
124 |
125 |
126 |
128 |
129 |
130 |
131 |
133 |
134 |
135 |
137 |
139 |
141 |
143 |
144 |
145 |
155 |
156 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
--------------------------------------------------------------------------------
/examples/src/main/scala/org/apache/spark/examples/sql/streaming/sqs/SqsSourceExample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.sql.streaming.sqs
19 |
20 | import scala.util.Random
21 |
22 | import org.apache.spark.sql.SparkSession
23 |
24 | /**
25 | * Example to read files from S3 using SQS Source and write results to Memory Sink
26 | *
27 | * Usage: SqsSourceExample <schema-sample-json-path> <sqs-queue-url> <file-format>
28 | */
29 |
30 | object SqsSourceExample {
31 |
32 | def main(args: Array[String]) {
33 |
34 | val randomName = Random.alphanumeric.take(6).mkString("")
35 | val pathName = "path_" + randomName
36 | val queryName = "query_" + randomName
37 | val checkpointDir = s"/checkpoints/$pathName"
38 | val schemaPathString = args(0)
39 |
40 | val spark = SparkSession.builder().appName("SqsExample").getOrCreate()
41 |
42 | val schema = spark.read.json(schemaPathString).schema
43 |
44 | val queueUrl = args(1)
45 |
46 | val fileFormat = args(2)
47 |
48 | val inputDf = spark
49 | .readStream
50 | .format("s3-sqs")
51 | .schema(schema)
52 | .option("sqsUrl", queueUrl)
53 | .option("fileFormat", fileFormat)
54 | .option("sqsFetchIntervalSeconds", "2")
55 | .option("sqsLongPollingWaitTimeSeconds", "5")
56 | .option("maxFilesPerTrigger", "50")
57 | .option("ignoreFileDeletion", "true")
58 | .load()
59 |
60 | val query = inputDf
61 | .writeStream
62 | .queryName(queryName)
63 | .format("memory")
64 | .option("checkpointLocation", checkpointDir)
65 | .start()
66 |
67 | query.awaitTermination()
68 | }
69 | }
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 | 4.0.0
20 |
21 | com.qubole
22 | spark-sql-streaming-sqs_2.11
23 | 0.5.2-SNAPSHOT
24 | jar
25 | Spark SQL Streaming SQS
26 | Connector for faster reads from S3 using SQS
27 | http://github.com/qubole/s3-sqs-connector
28 |
29 |
30 |
31 | qubole
32 | Qubole Inc.
33 | http://www.qubole.com
34 |
35 | developer
36 |
37 |
38 |
39 |
40 |
41 |
42 | Apache License, Version 2.0
43 | https://github.com/qubole/s3-sqs-connector/blob/master/LICENSE.txt
44 | repo
45 |
46 |
47 |
48 |
49 | scm:git:git://github.com/qubole/s3-sqs-connector.git
50 | https://github.com/qubole/s3-sqs-connector
51 | scm:git:git@github.com:qubole/s3-sqs-connector.git
52 | spark-sql-streaming-sqs_2.11-0.5.1
53 |
54 |
55 | 2020
56 |
57 | Qubole
58 | http://www.qubole.com/
59 |
60 |
61 |
62 | spark-sql-streaming-sqs
63 | 2.4.0
64 | 2.11
65 | UTF-8
66 |
67 |
68 |
69 |
70 | org.apache.spark
71 | spark-sql_${scala.binary.version}
72 | ${spark.version}
73 | provided
74 |
75 |
76 | org.apache.spark
77 | spark-core_${scala.binary.version}
78 | ${spark.version}
79 | test-jar
80 | test
81 |
82 |
83 | org.apache.spark
84 | spark-sql_${scala.binary.version}
85 | ${spark.version}
86 | test-jar
87 | test
88 |
89 |
90 | org.apache.spark
91 | spark-catalyst_${scala.binary.version}
92 | ${spark.version}
93 | test-jar
94 | test
95 |
96 |
97 | com.amazonaws
98 | aws-java-sdk-sqs
99 | 1.11.271
100 |
101 |
102 | org.apache.spark
103 | spark-tags_${scala.binary.version}
104 | ${spark.version}
105 |
106 |
107 |
108 |
109 |
110 |
111 | net.alchim31.maven
112 | scala-maven-plugin
113 | 4.0.2
114 |
115 |
116 | compile
117 |
118 | compile
119 | add-source
120 | doc-jar
121 |
122 | compile
123 |
124 |
125 |
126 |
127 | org.apache.maven.plugins
128 | maven-shade-plugin
129 | 3.1.0
130 |
131 |
132 | package
133 |
134 | shade
135 |
136 |
137 |
138 |
139 | com.amazonaws:aws-java-sdk-sqs:*
140 | com.amazonaws:aws-java-sdk-core:*
141 |
142 |
143 |
144 |
145 | *:*
146 |
147 | META-INF/maven/**
148 | META-INF/MANIFEST.MF
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 | net.alchim31.maven
161 | scala-maven-plugin
162 | 4.0.2
163 |
164 |
165 | org.apache.maven.plugins
166 | maven-shade-plugin
167 | 3.1.0
168 |
169 |
170 | target/scala-${scala.binary.version}/classes
171 | target/scala-${scala.binary.version}/test-classes
172 |
173 |
174 |
175 |
176 | release
177 |
178 |
179 |
180 | org.apache.maven.plugins
181 | maven-source-plugin
182 | 2.4
183 |
184 |
185 | create-sources-jar
186 |
187 | jar-no-fork
188 |
189 |
190 |
191 |
192 |
193 | org.apache.maven.plugins
194 | maven-gpg-plugin
195 | 1.5
196 |
197 |
198 | sign-artifacts
199 | verify
200 |
201 | sign
202 |
203 |
204 |
205 |
206 |
207 | org.apache.maven.plugins
208 | maven-javadoc-plugin
209 | 2.10.1
210 |
211 |
212 | create-javadoc-jar
213 |
214 | jar
215 |
216 |
217 |
218 |
219 |
220 | org.apache.maven.plugins
221 | maven-release-plugin
222 | 2.5.1
223 |
224 | true
225 |
226 |
227 |
228 | org.sonatype.plugins
229 | nexus-staging-maven-plugin
230 | 1.6.3
231 | true
232 |
233 | ossrh
234 | https://oss.sonatype.org/
235 | true
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 | ossrh
246 | https://oss.sonatype.org/content/repositories/snapshots
247 |
248 |
249 | ossrh
250 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
251 |
252 |
253 |
254 |
255 |
--------------------------------------------------------------------------------
/scalastyle-config.xml:
--------------------------------------------------------------------------------
1 |
17 |
39 |
40 |
41 | Scalastyle standard configuration
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 | true
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 | ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW
126 |
127 |
128 |
129 |
130 |
131 | ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 | ^FunSuite[A-Za-z]*$
141 | Tests must extend org.apache.spark.SparkFunSuite instead.
142 |
143 |
144 |
145 |
146 | ^println$
147 |
151 |
152 |
153 |
154 | @VisibleForTesting
155 |
158 |
159 |
160 |
161 | Runtime\.getRuntime\.addShutdownHook
162 |
170 |
171 |
172 |
173 | mutable\.SynchronizedBuffer
174 |
182 |
183 |
184 |
185 | Class\.forName
186 |
193 |
194 |
195 |
196 | Await\.result
197 |
204 |
205 |
206 |
207 | Await\.ready
208 |
215 |
216 |
217 |
218 |
219 | JavaConversions
220 | Instead of importing implicits in scala.collection.JavaConversions._, import
221 | scala.collection.JavaConverters._ and use .asScala / .asJava methods
222 |
223 |
224 |
225 | org\.apache\.commons\.lang\.
226 | Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
227 | of Commons Lang 2 (package org.apache.commons.lang.*)
228 |
229 |
230 |
231 | extractOpt
232 | Use Utils.jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter
233 | is slower.
234 |
235 |
236 |
237 |
238 | java,scala,3rdParty,spark
239 | javax?\..*
240 | scala\..*
241 | (?!org\.apache\.spark\.).*
242 | org\.apache\.spark\..*
243 |
244 |
245 |
246 |
247 |
248 | COMMA
249 |
250 |
251 |
252 |
253 |
254 | \)\{
255 |
258 |
259 |
260 |
261 | (?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]
262 | Use Javadoc style indentation for multiline comments
263 |
264 |
265 |
266 | case[^\n>]*=>\s*\{
267 | Omit braces in case clauses.
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 | 800>
321 |
322 |
323 |
324 |
325 | 30
326 |
327 |
328 |
329 |
330 | 10
331 |
332 |
333 |
334 |
335 | 50
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 | -1,0,1,2,3
347 |
348 |
349 |
350 |
--------------------------------------------------------------------------------
/src/main/java/org/apache/spark/sql/streaming/sqs/BasicAWSCredentialsProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs;
19 |
20 | import com.amazonaws.AmazonClientException;
21 | import com.amazonaws.auth.AWSCredentialsProvider;
22 | import com.amazonaws.auth.BasicAWSCredentials;
23 | import com.amazonaws.auth.AWSCredentials;
24 | import org.apache.commons.lang.StringUtils;
25 |
26 | public class BasicAWSCredentialsProvider implements AWSCredentialsProvider {
27 | private final String accessKey;
28 | private final String secretKey;
29 |
30 | public BasicAWSCredentialsProvider(String accessKey, String secretKey) {
31 | this.accessKey = accessKey;
32 | this.secretKey = secretKey;
33 | }
34 |
35 | public AWSCredentials getCredentials() {
36 | if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey)) {
37 | return new BasicAWSCredentials(accessKey, secretKey);
38 | }
39 | throw new AmazonClientException(
40 | "Access key or secret key is null");
41 | }
42 |
43 | public void refresh() {}
44 |
45 | @Override
46 | public String toString() {
47 | return getClass().getSimpleName();
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/java/org/apache/spark/sql/streaming/sqs/InstanceProfileCredentialsProviderWithRetries.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs;
19 |
20 |
21 | import com.amazonaws.AmazonClientException;
22 | import com.amazonaws.auth.AWSCredentials;
23 | import com.amazonaws.auth.InstanceProfileCredentialsProvider;
24 | import org.apache.commons.logging.Log;
25 | import org.apache.commons.logging.LogFactory;
26 |
27 | public class InstanceProfileCredentialsProviderWithRetries
28 | extends InstanceProfileCredentialsProvider {
29 |
30 | private static final Log LOG = LogFactory.getLog(
31 | InstanceProfileCredentialsProviderWithRetries.class);
32 |
33 | public AWSCredentials getCredentials() {
34 | int retries = 10;
35 | int sleep = 500;
36 | while(retries > 0) {
37 | try {
38 | return super.getCredentials();
39 | }
40 | catch (RuntimeException re) {
41 | LOG.error("Got an exception while fetching credentials " + re);
42 | --retries;
43 | try {
44 | Thread.sleep(sleep);
45 | } catch (InterruptedException ie) {
46 | // Do nothing
47 | }
48 | if (sleep < 10000) {
49 | sleep *= 2;
50 | }
51 | }
52 | catch (Error error) {
53 | LOG.error("Got an exception while fetching credentials " + error);
54 | --retries;
55 | try {
56 | Thread.sleep(sleep);
57 | } catch (InterruptedException ie) {
58 | // Do nothing
59 | }
60 | if (sleep < 10000) {
61 | sleep *= 2;
62 | }
63 | }
64 | }
65 | throw new AmazonClientException("Unable to load credentials.");
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | org.apache.spark.sql.streaming.sqs.SqsSourceProvider
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | log4j.rootCategory=WARN, console
19 |
20 | # File appender
21 | log4j.appender.file=org.apache.log4j.FileAppender
22 | log4j.appender.file.append=false
23 | log4j.appender.file.file=target/unit-tests.log
24 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
25 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
26 |
27 | # Console appender
28 | log4j.appender.console=org.apache.log4j.ConsoleAppender
29 | log4j.appender.console.target=System.out
30 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
31 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
32 |
33 | # Settings to quiet third party logs that are too verbose
34 | log4j.logger.org.sparkproject.jetty=WARN
35 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
36 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
37 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
38 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/streaming/sqs/SqsClient.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs
19 |
20 | import java.text.SimpleDateFormat
21 | import java.util.TimeZone
22 | import java.util.concurrent.TimeUnit
23 |
24 | import scala.collection.JavaConverters._
25 |
26 | import com.amazonaws.{AmazonClientException, AmazonServiceException, ClientConfiguration}
27 | import com.amazonaws.services.sqs.{AmazonSQS, AmazonSQSClientBuilder}
28 | import com.amazonaws.services.sqs.model.{DeleteMessageBatchRequestEntry, Message, ReceiveMessageRequest}
29 | import org.apache.hadoop.conf.Configuration
30 | import org.json4s.{DefaultFormats, MappingException}
31 | import org.json4s.JsonAST.JValue
32 | import org.json4s.jackson.JsonMethods.parse
33 |
34 | import org.apache.spark.SparkException
35 | import org.apache.spark.internal.Logging
36 | import org.apache.spark.util.ThreadUtils
37 |
38 | class SqsClient(sourceOptions: SqsSourceOptions,
39 | hadoopConf: Configuration) extends Logging {
40 |
41 | private val sqsFetchIntervalSeconds = sourceOptions.fetchIntervalSeconds
42 | private val sqsLongPollWaitTimeSeconds = sourceOptions.longPollWaitTimeSeconds
43 | private val sqsMaxRetries = sourceOptions.maxRetries
44 | private val maxConnections = sourceOptions.maxConnections
45 | private val ignoreFileDeletion = sourceOptions.ignoreFileDeletion
46 | private val region = sourceOptions.region
47 | val sqsUrl = sourceOptions.sqsUrl
48 |
49 | @volatile var exception: Option[Exception] = None
50 |
51 | private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601
52 | timestampFormat.setTimeZone(TimeZone.getTimeZone("UTC"))
53 | private var retriesOnFailure = 0
54 | private val sqsClient = createSqsClient()
55 |
56 | val sqsScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("sqs-scheduler")
57 |
58 | val sqsFileCache = new SqsFileCache(sourceOptions.maxFileAgeMs, sourceOptions.fileNameOnly)
59 |
60 | val deleteMessageQueue = new java.util.concurrent.ConcurrentLinkedQueue[String]()
61 |
62 | private val sqsFetchMessagesThread = new Runnable {
63 | override def run(): Unit = {
64 | try {
65 | // Fetching messages from Amazon SQS
66 | val newMessages = sqsFetchMessages()
67 |
68 | // Filter out messages for files that have already been seen
69 | if (newMessages.nonEmpty) {
70 | newMessages.filter(message => sqsFileCache.isNewFile(message._1, message._2))
71 | .foreach(message =>
72 | sqsFileCache.add(message._1, MessageDescription(message._2, false, message._3)))
73 | }
74 | } catch {
75 | case e: Exception =>
76 | exception = Some(e)
77 | }
78 | }
79 | }
80 |
81 | sqsScheduler.scheduleWithFixedDelay(
82 | sqsFetchMessagesThread,
83 | 0,
84 | sqsFetchIntervalSeconds,
85 | TimeUnit.SECONDS)
86 |
87 | private def sqsFetchMessages(): Seq[(String, Long, String)] = {
88 | val messageList = try {
89 | val receiveMessageRequest = new ReceiveMessageRequest()
90 | .withQueueUrl(sqsUrl)
91 | .withWaitTimeSeconds(sqsLongPollWaitTimeSeconds)
92 | val messages = sqsClient.receiveMessage(receiveMessageRequest).getMessages.asScala
93 | retriesOnFailure = 0
94 | logDebug(s"successfully received ${messages.size} messages")
95 | messages
96 | } catch {
97 | case ase: AmazonServiceException =>
98 | val message =
99 | """
100 | |Caught an AmazonServiceException, which means your request made it to Amazon SQS,
101 | | but was rejected with an error response.
102 | """.stripMargin
103 | logWarning(message)
104 | logWarning(s"Error Message: ${ase.getMessage}")
105 | logWarning(s"HTTP Status Code: ${ase.getStatusCode}, AWS Error Code: ${ase.getErrorCode}")
106 | logWarning(s"Error Type: ${ase.getErrorType}, Request ID: ${ase.getRequestId}")
107 | evaluateRetries()
108 | List.empty
109 | case ace: AmazonClientException =>
110 | val message =
111 | """
112 | |Caught an AmazonClientException, which means the client encountered a serious
113 | | internal problem while trying to communicate with Amazon SQS, such as not
114 | | being able to access the network.
115 | """.stripMargin
116 | logWarning(message)
117 | logWarning(s"Error Message: ${ace.getMessage()}")
118 | evaluateRetries()
119 | List.empty
120 | case e: Exception =>
121 | val message = "Received unexpected error from SQS"
122 | logWarning(message)
123 | logWarning(s"Error Message: ${e.getMessage()}")
124 | evaluateRetries()
125 | List.empty
126 | }
127 | if (messageList.nonEmpty) {
128 | parseSqsMessages(messageList)
129 | } else {
130 | Seq.empty
131 | }
132 | }
133 |
134 | private def parseSqsMessages(messageList: Seq[Message]): Seq[(String, Long, String)] = {
135 | val errorMessages = scala.collection.mutable.ListBuffer[String]()
136 | val parsedMessages = messageList.foldLeft(Seq[(String, Long, String)]()) { (list, message) =>
137 | implicit val formats = DefaultFormats
138 | try {
139 | val messageReceiptHandle = message.getReceiptHandle
140 | val messageJson = parse(message.getBody).extract[JValue]
141 | val bucketName = (
142 | messageJson \ "Records" \ "s3" \ "bucket" \ "name").extract[Array[String]].head
143 | val eventName = (messageJson \ "Records" \ "eventName").extract[Array[String]].head
144 | if (eventName.contains("ObjectCreated")) {
145 | val timestamp = (messageJson \ "Records" \ "eventTime").extract[Array[String]].head
146 | val timestampMills = convertTimestampToMills(timestamp)
147 | val path = "s3://" +
148 | bucketName + "/" +
149 | (messageJson \ "Records" \ "s3" \ "object" \ "key").extract[Array[String]].head
150 | logDebug("Successfully parsed sqs message")
151 | list :+ ((path, timestampMills, messageReceiptHandle))
152 | } else {
153 | if (eventName.contains("ObjectRemoved")) {
154 | if (!ignoreFileDeletion) {
155 | exception = Some(new SparkException("ObjectDelete message detected in SQS"))
156 | } else {
157 | logInfo("Ignoring file deletion message since ignoreFileDeletion is true")
158 | }
159 | } else {
160 | logWarning("Ignoring unexpected message detected in SQS")
161 | }
162 | errorMessages.append(messageReceiptHandle)
163 | list
164 | }
165 | } catch {
166 | case me: MappingException =>
167 | errorMessages.append(message.getReceiptHandle)
168 | logWarning(s"Error in parsing SQS message ${me.getMessage}")
169 | list
170 | case e: Exception =>
171 | errorMessages.append(message.getReceiptHandle)
172 | logWarning(s"Unexpected error while parsing SQS message ${e.getMessage}")
173 | list
174 | }
175 | }
176 | if (errorMessages.nonEmpty) {
177 | addToDeleteMessageQueue(errorMessages.toList)
178 | }
179 | parsedMessages
180 | }
181 |
182 | private def convertTimestampToMills(timestamp: String): Long = {
183 | val timeInMillis = timestampFormat.parse(timestamp).getTime()
184 | timeInMillis
185 | }
186 |
187 | private def evaluateRetries(): Unit = {
188 | retriesOnFailure += 1
189 | if (retriesOnFailure >= sqsMaxRetries) {
190 | logError("Max retries reached")
191 | exception = Some(new SparkException("Unable to receive messages from SQS after " +
192 | s"${sqsMaxRetries} attempts. Giving up. Check logs for details."))
193 | } else {
194 | logWarning(s"Attempt ${retriesOnFailure}." +
195 | s"Will reattempt after ${sqsFetchIntervalSeconds} seconds")
196 | }
197 | }
198 |
199 | private def createSqsClient(): AmazonSQS = {
200 | try {
201 | val isClusterOnEc2Role = hadoopConf.getBoolean(
202 | "fs.s3.isClusterOnEc2Role", false) || hadoopConf.getBoolean(
203 | "fs.s3n.isClusterOnEc2Role", false) || sourceOptions.useInstanceProfileCredentials
204 | if (!isClusterOnEc2Role) {
205 | val accessKey = hadoopConf.getTrimmed("fs.s3n.awsAccessKeyId")
206 | val secretAccessKey = new String(hadoopConf.getPassword("fs.s3n.awsSecretAccessKey")).trim
207 | logInfo("Using credentials from keys provided")
208 | val basicAwsCredentialsProvider = new BasicAWSCredentialsProvider(
209 | accessKey, secretAccessKey)
210 | AmazonSQSClientBuilder
211 | .standard()
212 | .withClientConfiguration(new ClientConfiguration().withMaxConnections(maxConnections))
213 | .withCredentials(basicAwsCredentialsProvider)
214 | .withRegion(region)
215 | .build()
216 | } else {
217 | logInfo("Using the credentials attached to the instance")
218 | val instanceProfileCredentialsProvider = new InstanceProfileCredentialsProviderWithRetries()
219 | AmazonSQSClientBuilder
220 | .standard()
221 | .withClientConfiguration(new ClientConfiguration().withMaxConnections(maxConnections))
222 | .withCredentials(instanceProfileCredentialsProvider)
223 | .build()
224 | }
225 | } catch {
226 | case e: Exception =>
227 | throw new SparkException(s"Error occurred while creating Amazon SQS Client", e)
228 | }
229 | }
230 |
231 | def addToDeleteMessageQueue(messageReceiptHandles: List[String]): Unit = {
232 | deleteMessageQueue.addAll(messageReceiptHandles.asJava)
233 | }
234 |
235 | def deleteMessagesFromQueue(): Unit = {
236 | try {
237 | var count = -1
238 | val messageReceiptHandles = deleteMessageQueue.asScala.toList
239 | val messageGroups = messageReceiptHandles.sliding(10, 10).toList
240 | messageGroups.foreach { messageGroup =>
241 | val requestEntries = messageGroup.foldLeft(List[DeleteMessageBatchRequestEntry]()) {
242 | (list, messageReceiptHandle) =>
243 | count = count + 1
244 | list :+ new DeleteMessageBatchRequestEntry(count.toString, messageReceiptHandle)
245 | }.asJava
246 | val batchResult = sqsClient.deleteMessageBatch(sqsUrl, requestEntries)
247 | if (!batchResult.getFailed.isEmpty) {
248 | batchResult.getFailed.asScala.foreach { entry =>
249 | sqsClient.deleteMessage(
250 | sqsUrl, requestEntries.get(entry.getId.toInt).getReceiptHandle)
251 | }
252 | }
253 | }
254 | } catch {
255 | case e: Exception =>
256 | logWarning(s"Unable to delete message from SQS ${e.getMessage}")
257 | }
258 | deleteMessageQueue.clear()
259 | }
260 |
261 | def assertSqsIsWorking(): Unit = {
262 | if (exception.isDefined) {
263 | throw exception.get
264 | }
265 | }
266 |
267 | }
268 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/streaming/sqs/SqsFileCache.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs
19 |
20 | import java.net.URI
21 | import java.util.concurrent.ConcurrentHashMap
22 |
23 | import scala.collection.JavaConverters._
24 | import scala.collection.mutable.ListBuffer
25 |
26 | import org.apache.hadoop.fs.Path
27 |
28 | import org.apache.spark.internal.Logging
29 |
30 | /**
31 | * A custom hash map used to track the list of files seen. This map is thread-safe.
32 | * To prevent the hash map from growing indefinitely, a purge function is available to
33 | * remove entries that are more than "maxAgeMs" older than the latest file.
34 | */
35 |
36 | class SqsFileCache(maxAgeMs: Long, fileNameOnly: Boolean) extends Logging {
37 | require(maxAgeMs >= 0)
38 | if (fileNameOnly) {
39 | logWarning("'fileNameOnly' is enabled. Make sure your file names are unique (e.g. using " +
40 | "UUID), otherwise, files with the same name but under different paths will be considered " +
41 | "the same and causes data lost.")
42 | }
43 |
44 | /** Mapping from file path to its message description. */
45 | private val sqsMap = new ConcurrentHashMap[String, MessageDescription]
46 |
47 | /** Timestamp for the last purge operation. */
48 | private var lastPurgeTimestamp: Long = 0L
49 |
50 | /** Timestamp of the latest file. */
51 | private var latestTimestamp: Long = 0L
52 |
53 | @inline private def stripPathIfNecessary(path: String) = {
54 | if (fileNameOnly) new Path(new URI(path)).getName else path
55 | }
56 |
57 | /**
58 | * Returns true if we should consider this file a new file. The file is only considered "new"
59 | * if it is new enough that we are still tracking, and we have not seen it before.
60 | */
61 | def isNewFile(path: String, timestamp: Long): Boolean = {
62 | timestamp >= lastPurgeTimestamp && !sqsMap.containsKey(stripPathIfNecessary(path))
63 | }
64 |
65 | /** Add a new file to the map. */
66 | def add(path: String, fileStatus: MessageDescription): Unit = {
67 | sqsMap.put(stripPathIfNecessary(path), fileStatus)
68 | if (fileStatus.timestamp > latestTimestamp) {
69 | latestTimestamp = fileStatus.timestamp
70 | }
71 | }
72 |
73 | /**
74 | * Returns all the new files found - ignores aged files and files that we have already seen.
75 | * Sorts the files by timestamp when shouldSortFiles is enabled.
76 | */
77 | def getUncommittedFiles(maxFilesPerTrigger: Option[Int],
78 | shouldSortFiles: Boolean): Seq[(String, Long, String)] = {
79 | if (shouldSortFiles) {
80 | val uncommittedFiles = filterAllUncommittedFiles()
81 | val sortedFiles = reportTimeTaken("Sorting Files") {
82 | uncommittedFiles.sortWith(_._2 < _._2)
83 | }
84 | if (maxFilesPerTrigger.nonEmpty) sortedFiles.take(maxFilesPerTrigger.get) else sortedFiles
85 | } else {
86 | if (maxFilesPerTrigger.isEmpty) {
87 | filterAllUncommittedFiles()
88 | } else {
89 | filterTopUncommittedFiles(maxFilesPerTrigger.get)
90 | }
91 | }
92 | }
93 | private def filterTopUncommittedFiles(maxFilesPerTrigger: Int): List[(String, Long, String)] = {
94 | val iterator = sqsMap.asScala.iterator
95 | val uncommittedFiles = ListBuffer[(String, Long, String)]()
96 | while (uncommittedFiles.length < maxFilesPerTrigger && iterator.hasNext) {
97 | val file = iterator.next()
98 | if (!file._2.isCommitted && file._2.timestamp >= lastPurgeTimestamp) {
99 | uncommittedFiles += ((file._1, file._2.timestamp, file._2.messageReceiptHandle))
100 | }
101 | }
102 | uncommittedFiles.toList
103 | }
104 |
105 | private def reportTimeTaken[T](operation: String)(body: => T): T = {
106 | val startTime = System.currentTimeMillis()
107 | val result = body
108 | val endTime = System.currentTimeMillis()
109 | val timeTaken = math.max(endTime - startTime, 0)
110 |
111 | logDebug(s"$operation took $timeTaken ms")
112 | result
113 | }
114 |
115 | private def filterAllUncommittedFiles(): List[(String, Long, String)] = {
116 | sqsMap.asScala.foldLeft(List[(String, Long, String)]()) {
117 | (list, file) =>
118 | if (!file._2.isCommitted && file._2.timestamp >= lastPurgeTimestamp) {
119 | list :+ ((file._1, file._2.timestamp, file._2.messageReceiptHandle))
120 | } else {
121 | list
122 | }
123 | }
124 | }
125 |
126 | /** Removes aged entries and returns the number of files removed. */
127 | def purge(): Int = {
128 | lastPurgeTimestamp = latestTimestamp - maxAgeMs
129 | var count = 0
130 | sqsMap.asScala.foreach { fileEntry =>
131 | if (fileEntry._2.timestamp < lastPurgeTimestamp) {
132 | sqsMap.remove(fileEntry._1)
133 | count += 1
134 | }
135 | }
136 | count
137 | }
138 |
139 | /** Mark file entry as committed or already processed */
140 | def markCommitted(path: String): Unit = {
141 | sqsMap.replace(path, MessageDescription(
142 | sqsMap.get(path).timestamp, true, sqsMap.get(path).messageReceiptHandle))
143 | }
144 |
145 | def size: Int = sqsMap.size()
146 |
147 | }
148 |
149 | /**
150 | * A case class to store file metadata. Metadata includes file timestamp, file status -
151 | * committed or not committed, and the message receipt handle used for deleting the message
152 | * from Amazon SQS.
153 | */
154 | case class MessageDescription(timestamp: Long,
155 | isCommitted: Boolean = false,
156 | messageReceiptHandle: String)
157 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs
19 |
20 | import java.net.URI
21 |
22 | import org.apache.hadoop.fs.Path
23 |
24 | import org.apache.spark.internal.Logging
25 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
26 | import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation}
27 | import org.apache.spark.sql.execution.streaming._
28 | import org.apache.spark.sql.execution.streaming.FileStreamSource._
29 | import org.apache.spark.sql.types.StructType
30 |
31 |
32 | class SqsSource(sparkSession: SparkSession,
33 | metadataPath: String,
34 | options: Map[String, String],
35 | override val schema: StructType) extends Source with Logging {
36 |
37 | private val sourceOptions = new SqsSourceOptions(options)
38 |
39 | private val hadoopConf = sparkSession.sessionState.newHadoopConf()
40 |
41 | private val metadataLog =
42 | new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath)
43 | private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L)
44 |
45 | private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger
46 |
47 | private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs
48 |
49 | private val fileFormatClassName = sourceOptions.fileFormatClassName
50 |
51 | private val shouldSortFiles = sourceOptions.shouldSortFiles
52 |
53 | private val sqsClient = new SqsClient(sourceOptions, hadoopConf)
54 |
55 | metadataLog.allFiles().foreach { entry =>
56 | sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, ""))
57 | }
58 | sqsClient.sqsFileCache.purge()
59 |
60 | logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs")
61 |
62 | /**
63 | * Returns the data that is between the offsets (`start`, `end`].
64 | */
65 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
66 | val startOffset = start.map(FileStreamSourceOffset(_).logOffset).getOrElse(-1L)
67 | val endOffset = FileStreamSourceOffset(end).logOffset
68 |
69 | assert(startOffset <= endOffset)
70 | val files = metadataLog.get(Some(startOffset + 1), Some(endOffset)).flatMap(_._2)
71 | logInfo(s"Processing ${files.length} files from ${startOffset + 1}:$endOffset")
72 | logTrace(s"Files are:\n\t" + files.mkString("\n\t"))
73 | val newDataSource =
74 | DataSource(
75 | sparkSession,
76 | paths = files.map(f => new Path(new URI(f.path)).toString),
77 | userSpecifiedSchema = Some(schema),
78 | className = fileFormatClassName,
79 | options = options)
80 | Dataset.ofRows(sparkSession, LogicalRelation(newDataSource.resolveRelation(
81 | checkFilesExist = false), isStreaming = true))
82 | }
83 |
84 | private def fetchMaxOffset(): FileStreamSourceOffset = synchronized {
85 |
86 | sqsClient.assertSqsIsWorking()
87 |     /**
88 |      * Collect all new files, ignoring aged files and files that we have already seen.
89 |      * Obey the user's setting that limits the number of files in this batch trigger.
90 |      */
91 | val batchFiles = sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles)
92 |
93 | if (batchFiles.nonEmpty) {
94 | metadataLogCurrentOffset += 1
95 | metadataLog.add(metadataLogCurrentOffset, batchFiles.map {
96 | case (path, timestamp, receiptHandle) =>
97 | FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset)
98 | }.toArray)
99 | logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files")
100 | val messageReceiptHandles = batchFiles.map {
101 | case (path, timestamp, receiptHandle) =>
102 | sqsClient.sqsFileCache.markCommitted(path)
103 | logDebug(s"New file: $path")
104 | receiptHandle
105 | }.toList
106 | sqsClient.addToDeleteMessageQueue(messageReceiptHandles)
107 | }
108 |
109 | val numPurged = sqsClient.sqsFileCache.purge()
110 |
111 | if (!sqsClient.deleteMessageQueue.isEmpty) {
112 | sqsClient.deleteMessagesFromQueue()
113 | }
114 |
115 | logTrace(
116 | s"""
117 | |Number of files selected for batch = ${batchFiles.size}
118 | |Number of files purged from tracking map = $numPurged
119 | """.stripMargin)
120 |
121 | FileStreamSourceOffset(metadataLogCurrentOffset)
122 | }
123 |
124 | override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1)
125 |
126 | override def commit(end: Offset): Unit = {
127 | // No-op for now; SqsSource currently garbage-collects files based on timestamp
128 | // and the value of the maxFileAge parameter.
129 | }
130 |
131 | override def stop(): Unit = {
132 | if (!sqsClient.sqsScheduler.isTerminated) {
133 | sqsClient.sqsScheduler.shutdownNow()
134 | }
135 | }
136 |
137 | override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]"
138 |
139 | }
140 |
141 |
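
To make the control flow above easier to follow, here is a conceptual sketch of how Structured Streaming's micro-batch engine drives a Source such as this one. The names `runOneTrigger` and `lastOffset` are hypothetical; the real engine also handles checkpointing, failure recovery and trigger scheduling.

    import org.apache.spark.sql.execution.streaming.{Offset, Source}

    // One trigger of the micro-batch loop (sketch only).
    def runOneTrigger(source: Source, lastOffset: Option[Offset]): Option[Offset] = {
      source.getOffset match {            // SqsSource: fetchMaxOffset() polls SQS, updates the metadata log
        case Some(newOffset) if !lastOffset.contains(newOffset) =>
          val batch = source.getBatch(lastOffset, newOffset)  // files logged in (lastOffset, newOffset]
          // ... write `batch` to the sink and checkpoint `newOffset` here ...
          source.commit(newOffset)        // SqsSource: currently a no-op
          Some(newOffset)
        case _ =>
          lastOffset                      // nothing new in this trigger
      }
    }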
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs
19 |
20 | import scala.util.Try
21 |
22 | import org.apache.spark.internal.Logging
23 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
24 | import org.apache.spark.util.Utils
25 |
26 | /**
27 |  * User-specified options for the SQS source.
28 | */
29 | class SqsSourceOptions(parameters: CaseInsensitiveMap[String]) extends Logging {
30 |
31 | def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
32 |
33 | val maxFilesPerTrigger: Option[Int] = parameters.get("maxFilesPerTrigger").map { str =>
34 | Try(str.toInt).toOption.filter(_ > 0).getOrElse {
35 | throw new IllegalArgumentException(
36 | s"Invalid value '$str' for option 'maxFilesPerTrigger', must be a positive integer")
37 | }
38 | }
39 |
40 | /**
41 |    * Maximum age of a file before it is ignored. For the first batch all files will be
42 |    * considered valid.
43 | *
44 |    * The max age is specified with respect to the timestamp of the latest file, not the
45 |    * current system timestamp. This means that if the last file has timestamp 1000, the
46 |    * current system time is 2000, and the max age is 200, the system will purge files older
47 |    * than 800 (rather than 1800) from the internal state.
48 | *
49 |    * Defaults to one week.
50 | */
51 | val maxFileAgeMs: Long =
52 | Utils.timeStringAsMs(parameters.getOrElse("maxFileAge", "7d"))
53 |
54 | val fetchIntervalSeconds: Int = parameters.get("sqsFetchIntervalSeconds").map { str =>
55 | Try(str.toInt).toOption.filter(_ > 0).getOrElse {
56 | throw new IllegalArgumentException(
57 | s"Invalid value '$str' for option 'sqsFetchIntervalSeconds', must be a positive integer")
58 | }
59 | }.getOrElse(10)
60 |
61 | val longPollWaitTimeSeconds: Int = parameters.get("sqsLongPollingWaitTimeSeconds").map { str =>
62 | Try(str.toInt).toOption.filter(x => x >= 0 && x <= 20).getOrElse {
63 | throw new IllegalArgumentException(
64 | s"Invalid value '$str' for option 'sqsLongPollingWaitTimeSeconds'," +
65 |           " must be an integer between 0 and 20")
66 | }
67 | }.getOrElse(20)
68 |
69 | val maxRetries: Int = parameters.get("sqsMaxRetries").map { str =>
70 | Try(str.toInt).toOption.filter(_ > 0).getOrElse {
71 | throw new IllegalArgumentException(
72 | s"Invalid value '$str' for option 'sqsMaxRetries', must be a positive integer")
73 | }
74 | }.getOrElse(10)
75 |
76 | val maxConnections: Int = parameters.get("sqsMaxConnections").map { str =>
77 | Try(str.toInt).toOption.filter(_ > 0).getOrElse {
78 | throw new IllegalArgumentException(
79 | s"Invalid value '$str' for option 'sqsMaxConnections', must be a positive integer")
80 | }
81 | }.getOrElse(1)
82 |
83 |   val sqsUrl: String = parameters.get("sqsUrl").getOrElse {
84 | throw new IllegalArgumentException("SQS Url is not specified")
85 | }
86 |
87 | val region: String = parameters.get("region").getOrElse {
88 | throw new IllegalArgumentException("Region is not specified")
89 | }
90 |
91 | val fileFormatClassName: String = parameters.get("fileFormat").getOrElse {
92 | throw new IllegalArgumentException("Specifying file format is mandatory with sqs source")
93 | }
94 |
95 | val ignoreFileDeletion: Boolean = withBooleanParameter("ignoreFileDeletion", false)
96 |
97 | /**
98 |    * Whether to check new files based only on the filename instead of on the full path.
99 | *
100 |    * With this set to `true`, the following files would be considered the same file, because
101 | * their filenames, "dataset.txt", are the same:
102 | * - "file:///dataset.txt"
103 | * - "s3://a/dataset.txt"
104 | * - "s3n://a/b/dataset.txt"
105 | * - "s3a://a/b/c/dataset.txt"
106 | */
107 | val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false)
108 |
109 | val shouldSortFiles: Boolean = withBooleanParameter("shouldSortFiles", true)
110 |
111 | val useInstanceProfileCredentials: Boolean = withBooleanParameter(
112 | "useInstanceProfileCredentials", false)
113 |
114 | private def withBooleanParameter(name: String, default: Boolean) = {
115 | parameters.get(name).map { str =>
116 | try {
117 | str.toBoolean
118 | } catch {
119 | case _: IllegalArgumentException =>
120 | throw new IllegalArgumentException(
121 | s"Invalid value '$str' for option '$name', must be true or false")
122 | }
123 | }.getOrElse(default)
124 | }
125 |
126 | }
127 |
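
As a worked example of the maxFileAge semantics documented above (the numbers are taken from the doc comment, not from the defaults): the purge threshold is computed relative to the newest file's timestamp, not the wall clock.

    val latestFileTimestamp = 1000L  // timestamp of the newest file seen so far
    val maxFileAgeMs = 200L          // user-supplied maxFileAge converted to milliseconds
    val purgeThreshold = latestFileTimestamp - maxFileAgeMs
    // purgeThreshold == 800, so entries with timestamp < 800 are dropped from the
    // internal state, even though "now" (2000) minus maxFileAge would give 1800.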
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.streaming.sqs
19 |
20 | import org.apache.spark.internal.Logging
21 | import org.apache.spark.sql.SQLContext
22 | import org.apache.spark.sql.execution.streaming.Source
23 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
24 | import org.apache.spark.sql.types.StructType
25 |
26 | class SqsSourceProvider extends DataSourceRegister
27 | with StreamSourceProvider
28 | with Logging {
29 |
30 | override def shortName(): String = "s3-sqs"
31 |
32 | override def sourceSchema(sqlContext: SQLContext,
33 | schema: Option[StructType],
34 | providerName: String,
35 | parameters: Map[String, String]): (String, StructType) = {
36 |
37 | require(schema.isDefined, "Sqs source doesn't support empty schema")
38 | (shortName(), schema.get)
39 | }
40 |
41 | override def createSource(sqlContext: SQLContext,
42 | metadataPath: String,
43 | schema: Option[StructType],
44 | providerName: String,
45 | parameters: Map[String, String]): Source = {
46 |
47 | new SqsSource(
48 | sqlContext.sparkSession,
49 | metadataPath,
50 | parameters,
51 | schema.get)
52 | }
53 | }
54 |
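
Putting the provider and the options together, here is a minimal usage sketch. The queue URL, region, schema and paths are placeholders; only sqsUrl, region, fileFormat and a user-specified schema are mandatory.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types.{StringType, StructType}

    val spark = SparkSession.builder().appName("sqs-source-sketch").getOrCreate()

    // Schema of the files referenced by the S3 event notifications (placeholder)
    val schema = new StructType().add("value", StringType)

    val input = spark.readStream
      .format("s3-sqs")                  // short name registered by SqsSourceProvider
      .schema(schema)                    // mandatory: the source rejects an empty schema
      .option("sqsUrl", "https://sqs.us-east-1.amazonaws.com/123456789012/example-queue")
      .option("region", "us-east-1")
      .option("fileFormat", "json")      // mandatory: format used to read the referenced files
      .option("maxFilesPerTrigger", "100")
      .load()

    val query = input.writeStream
      .format("parquet")
      .option("checkpointLocation", "/tmp/sqs-example/checkpoint")  // placeholder path
      .start("/tmp/sqs-example/output")                             // placeholder path

    query.awaitTermination()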
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=true
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
25 |
26 | # Ignore messages below warning level from Jetty, because it's a bit verbose
27 | log4j.logger.org.sparkproject.jetty=WARN
28 |
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/sql/streaming/sqs/SqsSourceOptionsSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.spark.sql.streaming.sqs
18 |
19 | import java.util.Locale
20 |
21 | import org.apache.spark.sql.streaming.{StreamingQuery, StreamingQueryException, StreamTest}
22 | import org.apache.spark.sql.types.StructType
23 |
24 | class SqsSourceOptionsSuite extends StreamTest {
25 |
26 | test("bad source options") {
27 | def testBadOptions(option: (String, String))(expectedMsg: String): Unit = {
28 |
29 |       var query: StreamingQuery = null
30 |
31 | try {
32 | val errorMessage = intercept[StreamingQueryException] {
33 | val dummySchema = new StructType
34 | val reader = spark
35 | .readStream
36 | .format("s3-sqs")
37 | .option("fileFormat", "json")
38 | .schema(dummySchema)
39 | .option("sqsUrl", "https://DUMMY_URL")
40 | .option("region", "us-east-1")
41 | .option(option._1, option._2)
42 | .load()
43 |
44 | query = reader.writeStream
45 | .format("memory")
46 | .queryName("badOptionsTest")
47 | .start()
48 |
49 | query.processAllAvailable()
50 | }.getMessage
51 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
52 | } finally {
53 | if (query != null) {
54 |           // Terminate the streaming query if necessary
55 | query.stop()
56 | }
57 |
58 | }
59 | }
60 |
61 | testBadOptions("sqsFetchIntervalSeconds" -> "-2")("Invalid value '-2' " +
62 | "for option 'sqsFetchIntervalSeconds', must be a positive integer")
63 | testBadOptions("sqsLongPollingWaitTimeSeconds" -> "-5")("Invalid value '-5' " +
64 |       "for option 'sqsLongPollingWaitTimeSeconds', must be an integer between 0 and 20")
65 | testBadOptions("sqsMaxConnections" -> "-2")("Invalid value '-2' " +
66 | "for option 'sqsMaxConnections', must be a positive integer")
67 | testBadOptions("maxFilesPerTrigger" -> "-50")("Invalid value '-50' " +
68 | "for option 'maxFilesPerTrigger', must be a positive integer")
69 | testBadOptions("ignoreFileDeletion" -> "x")("Invalid value 'x' " +
70 | "for option 'ignoreFileDeletion', must be true or false")
71 | testBadOptions("fileNameOnly" -> "x")("Invalid value 'x' " +
72 | "for option 'fileNameOnly', must be true or false")
73 | testBadOptions("shouldSortFiles" -> "x")("Invalid value 'x' " +
74 | "for option 'shouldSortFiles', must be true or false")
75 | testBadOptions("useInstanceProfileCredentials" -> "x")("Invalid value 'x' " +
76 | "for option 'useInstanceProfileCredentials', must be true or false")
77 |
78 | }
79 |
80 | test("missing mandatory options") {
81 |
82 | def testMissingMandatoryOptions(options: List[(String, String)])(expectedMsg: String): Unit = {
83 |
84 | var query: StreamingQuery = null
85 |
86 | try {
87 | val errorMessage = intercept[StreamingQueryException] {
88 | val dummySchema = new StructType
89 | val reader = spark
90 | .readStream
91 | .format("s3-sqs")
92 | .schema(dummySchema)
93 |
94 |         val readerWithOptions = options.foldLeft(reader) {
95 |           case (r, (key, value)) => r.option(key, value)
96 |         }.load()
97 |
98 | query = readerWithOptions.writeStream
99 | .format("memory")
100 | .queryName("missingMandatoryOptions")
101 | .start()
102 |
103 | query.processAllAvailable()
104 | }.getMessage
105 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
106 | } finally {
107 | if (query != null) {
108 |           // Terminate the streaming query if necessary
109 | query.stop()
110 | }
111 | }
112 | }
113 |
114 | // No fileFormat specified
115 | testMissingMandatoryOptions(List("sqsUrl" -> "https://DUMMY_URL", "region" -> "us-east-1"))(
116 | "Specifying file format is mandatory with sqs source")
117 |
118 | // Sqs URL not specified
119 | testMissingMandatoryOptions(List("fileFormat" -> "json", "region" -> "us-east-1"))(
120 | "SQS Url is not specified")
121 | }
122 |
123 | test("schema not specified") {
124 |
125 | var query: StreamingQuery = null
126 |
127 | val expectedMsg = "Sqs source doesn't support empty schema"
128 |
129 | try {
130 | val errorMessage = intercept[IllegalArgumentException] {
131 | val reader = spark
132 | .readStream
133 | .format("s3-sqs")
134 | .option("sqsUrl", "https://DUMMY_URL")
135 | .option("fileFormat", "json")
136 | .option("region", "us-east-1")
137 | .load()
138 |
139 | query = reader.writeStream
140 | .format("memory")
141 | .queryName("missingSchema")
142 | .start()
143 |
144 | query.processAllAvailable()
145 | }.getMessage
146 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
147 | } finally {
148 | if (query != null) {
149 |         // Terminate the streaming query if necessary
150 | query.stop()
151 | }
152 | }
153 |
154 | }
155 |
156 | }
157 |
158 |
--------------------------------------------------------------------------------