├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── dev └── checkstyle.xml ├── examples └── src │ └── main │ └── scala │ └── org │ └── apache │ └── spark │ └── examples │ └── sql │ └── streaming │ └── sqs │ └── SqsSourceExample.scala ├── pom.xml ├── scalastyle-config.xml └── src ├── main ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── streaming │ │ └── sqs │ │ ├── BasicAWSCredentialsProvider.java │ │ └── InstanceProfileCredentialsProviderWithRetries.java ├── resources │ ├── META-INF │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── log4j.properties └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── streaming │ └── sqs │ ├── SqsClient.scala │ ├── SqsFileCache.scala │ ├── SqsSource.scala │ ├── SqsSourceOptions.scala │ └── SqsSourceProvider.scala └── test ├── resources └── log4j.properties └── scala └── org └── apache └── spark └── sql └── streaming └── sqs └── SqsSourceOptionsSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.swm 4 | *.swn 5 | *.swk 6 | *.swl 7 | *.swo 8 | *.swp 9 | *.ipr 10 | *.iml 11 | *.ipr 12 | *.iws 13 | *.pyc 14 | *.pyo 15 | *.swp 16 | *~ 17 | .DS_Store 18 | .cache 19 | .classpath 20 | .ensime 21 | .ensime_cache/ 22 | .ensime_lucene 23 | .generated-mima* 24 | .idea/ 25 | .idea_modules/ 26 | .project 27 | .pydevproject 28 | .scala_dependencies 29 | .settings 30 | /lib/ 31 | R-unit-tests.log 32 | R/unit-tests.out 33 | R/cran-check.out 34 | R/pkg/vignettes/sparkr-vignettes.html 35 | build/*.jar 36 | build/apache-maven* 37 | build/scala* 38 | build/zinc* 39 | cache 40 | checkpoint 41 | conf/*.cmd 42 | conf/*.conf 43 | conf/*.properties 44 | conf/*.sh 45 | conf/*.xml 46 | conf/java-opts 47 | conf/slaves 48 | dependency-reduced-pom.xml 49 | derby.log 50 | dev/create-release/*final 51 | dev/create-release/*txt 52 | dev/pr-deps/ 53 | dist/ 54 | docs/_site 55 | docs/api 56 | lib_managed/ 57 | lint-r-report.log 58 | log/ 59 | logs/ 60 | out/ 61 | project/boot/ 62 | project/build/target/ 63 | project/plugins/lib_managed/ 64 | project/plugins/project/build.properties 65 | project/plugins/src_managed/ 66 | project/plugins/target/ 67 | python/lib/pyspark.zip 68 | python/deps 69 | python/pyspark/python 70 | reports/ 71 | scalastyle-on-compile.generated.xml 72 | scalastyle-output.xml 73 | scalastyle.txt 74 | spark-*-bin-*.tgz 75 | spark-tests.log 76 | src_managed/ 77 | streaming-tests.log 78 | target/ 79 | unit-tests.log 80 | work/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | language: java 19 | 20 | jdk: 21 | - openjdk8 22 | - oraclejdk8 23 | 24 | dist: trusty 25 | 26 | script: 27 | - mvn -DskipTests clean install -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3-SQS Connector 2 | 3 | [![Build Status](https://travis-ci.org/qubole/s3-sqs-connector.svg?branch=master)](https://travis-ci.org/qubole/s3-sqs-connector) 4 | 5 | A library for reading data from Amazon S3 with optimised file listing via Amazon SQS, using Spark SQL Streaming (Structured Streaming).
6 | 7 | ## Linking 8 | 9 | Using SBT: 10 | 11 | libraryDependencies += "com.qubole" % "spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}" % "{{site.PROJECT_VERSION}}" 12 | 13 | Using Maven: 14 | 15 | <dependency> 16 | <groupId>com.qubole</groupId> 17 | <artifactId>spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}</artifactId> 18 | <version>{{site.PROJECT_VERSION}}</version> 19 | </dependency> 20 | 21 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option. 22 | For example, to include it when starting the spark shell: 23 | 24 | $ bin/spark-shell --packages com.qubole:spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}:{{site.PROJECT_VERSION}} 25 | 26 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath. 27 | The `--packages` argument can also be used with `bin/spark-submit`. 28 | 29 | This library is compiled for Scala 2.11 only, and is intended to support Spark 2.4.0 onwards. 30 | 31 | ## Building S3-SQS Connector 32 | 33 | The S3-SQS connector is built using [Apache Maven](http://maven.apache.org/). 34 | 35 | To build the S3-SQS connector, clone this repository and run: 36 | ``` 37 | mvn -DskipTests clean package 38 | ``` 39 | 40 | This will create the `target/spark-sql-streaming-sqs_2.11-0.5.1.jar` file, which contains the s3-sqs connector code and its associated dependencies. Make sure the Scala and Java versions correspond to those required by your Spark cluster. We have tested it with Java 7/8, Scala 2.11 and Spark version 2.4.0. 41 | 42 | 43 | ## Configuration options 44 | The configuration is passed as options to the source. 45 | 46 | Name |Default | Meaning 47 | --- |:---:| --- 48 | sqsUrl|required, no default value|SQS queue URL, like 'https://sqs.us-east-1.amazonaws.com/330183209093/TestQueue' 49 | region|required, no default value|AWS region where the queue is created 50 | fileFormat|required, no default value|file format of the files stored on Amazon S3 51 | schema|required, no default value|schema of the data being read 52 | sqsFetchIntervalSeconds|10|time interval (in seconds) after which to fetch messages from the Amazon SQS queue 53 | sqsLongPollingWaitTimeSeconds|20|wait time (in seconds) for long polling on the Amazon SQS queue 54 | sqsMaxConnections|1|number of parallel threads to connect to the Amazon SQS queue 55 | sqsMaxRetries|10|maximum number of consecutive retries in case of a connection failure to SQS before giving up 56 | ignoreFileDeletion|false|whether to ignore any file-deleted message in the SQS queue 57 | fileNameOnly|false|whether to identify new files based on the filename only instead of the full path 58 | shouldSortFiles|true|whether to sort files based on timestamp while listing them from SQS 59 | useInstanceProfileCredentials|false|whether to use EC2 instance profile credentials for connecting to Amazon SQS 60 | maxFilesPerTrigger|no default value|maximum number of files to process in a microbatch 61 | maxFileAge|7d|maximum age of a file before it is ignored, relative to the newest file tracked 62 | 63 | ## Example 64 | 65 | An example of creating a SQL stream which uses Amazon SQS to list files on S3: 66 | 67 | val inputDf = sparkSession 68 | .readStream 69 | .format("s3-sqs") 70 | .schema(schema) 71 | .option("sqsUrl", queueUrl) 72 | .option("region", awsRegion) 73 | .option("fileFormat", "json") 74 | .option("sqsFetchIntervalSeconds", "2") 75 | .option("useInstanceProfileCredentials", "true") 76 | .option("sqsLongPollingWaitTimeSeconds", "5") 77 | .load() 78 | --------------------------------------------------------------------------------
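The README example above only builds the streaming DataFrame; a complete job also needs a sink and a checkpoint location. Below is a minimal end-to-end sketch in the same spirit. The queue URL, region, sample-data path and output locations are placeholders rather than values from this repository; the source options mirror the configuration table in the README, and the `s3-sqs` format name is the one registered by `SqsSourceProvider`.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.streaming.Trigger

    object S3SqsToParquetExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("S3SqsToParquetExample").getOrCreate()

        // Placeholders -- substitute your own queue, region, and S3 locations.
        val queueUrl = "https://sqs.us-east-1.amazonaws.com/123456789012/my-events-queue"
        val awsRegion = "us-east-1"

        // The s3-sqs source requires an explicit schema; here it is inferred once
        // from a representative sample file.
        val schema = spark.read.json("s3://my-bucket/sample/sample.json").schema

        val inputDf = spark
          .readStream
          .format("s3-sqs")
          .schema(schema)
          .option("sqsUrl", queueUrl)
          .option("region", awsRegion)
          .option("fileFormat", "json")
          .option("sqsFetchIntervalSeconds", "2")
          .option("sqsLongPollingWaitTimeSeconds", "5")
          .option("maxFilesPerTrigger", "100")
          .load()

        // Drain the stream into Parquet, tracking progress in a checkpoint directory.
        val query = inputDf
          .writeStream
          .format("parquet")
          .option("path", "s3://my-bucket/output/")
          .option("checkpointLocation", "s3://my-bucket/checkpoints/s3-sqs-example/")
          .trigger(Trigger.ProcessingTime("30 seconds"))
          .start()

        query.awaitTermination()
      }
    }

As in the bundled `SqsSourceExample`, any Structured Streaming sink can be used here; the memory sink in that example is convenient for testing, while a file sink such as Parquet is closer to a production setup.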
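It is also worth seeing how credentials reach the SQS client. `SqsClient.createSqsClient` either uses EC2 instance profile credentials (when `useInstanceProfileCredentials` is true or `fs.s3.isClusterOnEc2Role`/`fs.s3n.isClusterOnEc2Role` is set in the Hadoop configuration) or builds a `BasicAWSCredentialsProvider` from the Hadoop properties `fs.s3n.awsAccessKeyId` and `fs.s3n.awsSecretAccessKey`. The sketch below shows one way to supply those properties through the SparkSession's Hadoop configuration; the key values, queue URL and schema are placeholders, and the same properties can usually be passed with `--conf spark.hadoop.fs.s3n.awsAccessKeyId=...` on `spark-submit`.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    object SqsCredentialsExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("SqsCredentialsExample").getOrCreate()

        // Placeholder keys, read by SqsClient via BasicAWSCredentialsProvider when the
        // job is not relying on an EC2 instance profile.
        spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "<access-key>")
        spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "<secret-key>")

        // Placeholder queue and a deliberately simple one-column schema.
        val queueUrl = "https://sqs.us-east-1.amazonaws.com/123456789012/my-events-queue"
        val schema = StructType(Seq(StructField("value", StringType)))

        val inputDf = spark
          .readStream
          .format("s3-sqs")
          .schema(schema)
          .option("sqsUrl", queueUrl)
          .option("region", "us-east-1")
          .option("fileFormat", "json")
          // On an EC2 cluster with an attached IAM role, drop the key properties above
          // and use the instance profile instead:
          // .option("useInstanceProfileCredentials", "true")
          .load()

        inputDf.writeStream
          .format("console")
          .option("checkpointLocation", "/tmp/sqs-credentials-example-checkpoint")
          .start()
          .awaitTermination()
      }
    }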
/dev/checkstyle.xml: -------------------------------------------------------------------------------- (Checkstyle rule definitions: the XML markup was not preserved in this extraction.) -------------------------------------------------------------------------------- /examples/src/main/scala/org/apache/spark/examples/sql/streaming/sqs/SqsSourceExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package org.apache.spark.examples.sql.streaming.sqs 19 | 20 | import scala.util.Random 21 | 22 | import org.apache.spark.sql.SparkSession 23 | 24 | /** 25 | * Example to read files from S3 using SQS Source and write results to Memory Sink 26 | * 27 | * Usage: SqsSourceExample 28 | */ 29 | 30 | object SqsSourceExample { 31 | 32 | def main(args: Array[String]) { 33 | 34 | val randomName = Random.alphanumeric.take(6).mkString("") 35 | val pathName = "path_" + randomName 36 | val queryName = "query_" + randomName 37 | val checkpointDir = s"/checkpoints/$pathName" 38 | val schemaPathString = args(0) 39 | 40 | val spark = SparkSession.builder().appName("SqsExample").getOrCreate() 41 | 42 | val schema = spark.read.json(schemaPathString).schema 43 | 44 | val queueUrl = args(1) 45 | 46 | val fileFormat = args(2) 47 | 48 | val inputDf = spark 49 | .readStream 50 | .format("s3-sqs") 51 | .schema(schema) 52 | .option("sqsUrl", queueUrl) 53 | .option("fileFormat", fileFormat) 54 | .option("sqsFetchIntervalSeconds", "2") 55 | .option("sqsLongPollingWaitTimeSeconds", "5") 56 | .option("maxFilesPerTrigger", "50") 57 | .option("ignoreFileDeletion", "true") 58 | .load() 59 | 60 | val query = inputDf 61 | .writeStream 62 | .queryName(queryName) 63 | .format("memory") 64 | .option("checkpointLocation", checkpointDir) 65 | .start() 66 | 67 | query.awaitTermination() 68 | } 69 | } 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | com.qubole 22 | spark-sql-streaming-sqs_2.11 23 | 0.5.2-SNAPSHOT 24 | jar 25 | Spark SQL Streaming SQS 26 | Connector for faster reads from S3 using SQS 27 | http://github.com/qubole/s3-sqs-connector 28 | 29 | 30 | 31 | qubole 32 | Qubole Inc. 
33 | http://www.qubole.com 34 | 35 | developer 36 | 37 | 38 | 39 | 40 | 41 | 42 | Apache License, Version 2.0 43 | https://github.com/qubole/s3-sqs-connector/blob/master/LICENSE.txt 44 | repo 45 | 46 | 47 | 48 | 49 | scm:git:git://github.com/qubole/s3-sqs-connector.git 50 | https://github.com/qubole/s3-sqs-connector 51 | scm:git:git@github.com:qubole/s3-sqs-connector.git 52 | spark-sql-streaming-sqs_2.11-0.5.1 53 | 54 | 55 | 2020 56 | 57 | Qubole 58 | http://www.qubole.com/ 59 | 60 | 61 | 62 | spark-sql-streaming-sqs 63 | 2.4.0 64 | 2.11 65 | UTF-8 66 | 67 | 68 | 69 | 70 | org.apache.spark 71 | spark-sql_${scala.binary.version} 72 | ${spark.version} 73 | provided 74 | 75 | 76 | org.apache.spark 77 | spark-core_${scala.binary.version} 78 | ${spark.version} 79 | test-jar 80 | test 81 | 82 | 83 | org.apache.spark 84 | spark-sql_${scala.binary.version} 85 | ${spark.version} 86 | test-jar 87 | test 88 | 89 | 90 | org.apache.spark 91 | spark-catalyst_${scala.binary.version} 92 | ${spark.version} 93 | test-jar 94 | test 95 | 96 | 97 | com.amazonaws 98 | aws-java-sdk-sqs 99 | 1.11.271 100 | 101 | 102 | org.apache.spark 103 | spark-tags_${scala.binary.version} 104 | ${spark.version} 105 | 106 | 107 | 108 | 109 | 110 | 111 | net.alchim31.maven 112 | scala-maven-plugin 113 | 4.0.2 114 | 115 | 116 | compile 117 | 118 | compile 119 | add-source 120 | doc-jar 121 | 122 | compile 123 | 124 | 125 | 126 | 127 | org.apache.maven.plugins 128 | maven-shade-plugin 129 | 3.1.0 130 | 131 | 132 | package 133 | 134 | shade 135 | 136 | 137 | 138 | 139 | com.amazonaws:aws-java-sdk-sqs:* 140 | com.amazonaws:aws-java-sdk-core:* 141 | 142 | 143 | 144 | 145 | *:* 146 | 147 | META-INF/maven/** 148 | META-INF/MANIFEST.MF 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | net.alchim31.maven 161 | scala-maven-plugin 162 | 4.0.2 163 | 164 | 165 | org.apache.maven.plugins 166 | maven-shade-plugin 167 | 3.1.0 168 | 169 | 170 | target/scala-${scala.binary.version}/classes 171 | target/scala-${scala.binary.version}/test-classes 172 | 173 | 174 | 175 | 176 | release 177 | 178 | 179 | 180 | org.apache.maven.plugins 181 | maven-source-plugin 182 | 2.4 183 | 184 | 185 | create-sources-jar 186 | 187 | jar-no-fork 188 | 189 | 190 | 191 | 192 | 193 | org.apache.maven.plugins 194 | maven-gpg-plugin 195 | 1.5 196 | 197 | 198 | sign-artifacts 199 | verify 200 | 201 | sign 202 | 203 | 204 | 205 | 206 | 207 | org.apache.maven.plugins 208 | maven-javadoc-plugin 209 | 2.10.1 210 | 211 | 212 | create-javadoc-jar 213 | 214 | jar 215 | 216 | 217 | 218 | 219 | 220 | org.apache.maven.plugins 221 | maven-release-plugin 222 | 2.5.1 223 | 224 | true 225 | 226 | 227 | 228 | org.sonatype.plugins 229 | nexus-staging-maven-plugin 230 | 1.6.3 231 | true 232 | 233 | ossrh 234 | https://oss.sonatype.org/ 235 | true 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | ossrh 246 | https://oss.sonatype.org/content/repositories/snapshots 247 | 248 | 249 | ossrh 250 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 251 | 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- 1 | 17 | 39 | 40 | 41 | Scalastyle standard configuration 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | true 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 
| 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW 126 | 127 | 128 | 129 | 130 | 131 | ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | ^FunSuite[A-Za-z]*$ 141 | Tests must extend org.apache.spark.SparkFunSuite instead. 142 | 143 | 144 | 145 | 146 | ^println$ 147 | 151 | 152 | 153 | 154 | @VisibleForTesting 155 | 158 | 159 | 160 | 161 | Runtime\.getRuntime\.addShutdownHook 162 | 170 | 171 | 172 | 173 | mutable\.SynchronizedBuffer 174 | 182 | 183 | 184 | 185 | Class\.forName 186 | 193 | 194 | 195 | 196 | Await\.result 197 | 204 | 205 | 206 | 207 | Await\.ready 208 | 215 | 216 | 217 | 218 | 219 | JavaConversions 220 | Instead of importing implicits in scala.collection.JavaConversions._, import 221 | scala.collection.JavaConverters._ and use .asScala / .asJava methods 222 | 223 | 224 | 225 | org\.apache\.commons\.lang\. 226 | Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead 227 | of Commons Lang 2 (package org.apache.commons.lang.*) 228 | 229 | 230 | 231 | extractOpt 232 | Use Utils.jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter 233 | is slower. 234 | 235 | 236 | 237 | 238 | java,scala,3rdParty,spark 239 | javax?\..* 240 | scala\..* 241 | (?!org\.apache\.spark\.).* 242 | org\.apache\.spark\..* 243 | 244 | 245 | 246 | 247 | 248 | COMMA 249 | 250 | 251 | 252 | 253 | 254 | \)\{ 255 | 258 | 259 | 260 | 261 | (?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*] 262 | Use Javadoc style indentation for multiline comments 263 | 264 | 265 | 266 | case[^\n>]*=>\s*\{ 267 | Omit braces in case clauses. 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 800> 321 | 322 | 323 | 324 | 325 | 30 326 | 327 | 328 | 329 | 330 | 10 331 | 332 | 333 | 334 | 335 | 50 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | -1,0,1,2,3 347 | 348 | 349 | 350 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/sql/streaming/sqs/BasicAWSCredentialsProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs; 19 | 20 | import com.amazonaws.AmazonClientException; 21 | import com.amazonaws.auth.AWSCredentialsProvider; 22 | import com.amazonaws.auth.BasicAWSCredentials; 23 | import com.amazonaws.auth.AWSCredentials; 24 | import org.apache.commons.lang.StringUtils; 25 | 26 | public class BasicAWSCredentialsProvider implements AWSCredentialsProvider { 27 | private final String accessKey; 28 | private final String secretKey; 29 | 30 | public BasicAWSCredentialsProvider(String accessKey, String secretKey) { 31 | this.accessKey = accessKey; 32 | this.secretKey = secretKey; 33 | } 34 | 35 | public AWSCredentials getCredentials() { 36 | if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey)) { 37 | return new BasicAWSCredentials(accessKey, secretKey); 38 | } 39 | throw new AmazonClientException( 40 | "Access key or secret key is null"); 41 | } 42 | 43 | public void refresh() {} 44 | 45 | @Override 46 | public String toString() { 47 | return getClass().getSimpleName(); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/sql/streaming/sqs/InstanceProfileCredentialsProviderWithRetries.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs; 19 | 20 | 21 | import com.amazonaws.AmazonClientException; 22 | import com.amazonaws.auth.AWSCredentials; 23 | import com.amazonaws.auth.InstanceProfileCredentialsProvider; 24 | import org.apache.commons.logging.Log; 25 | import org.apache.commons.logging.LogFactory; 26 | 27 | public class InstanceProfileCredentialsProviderWithRetries 28 | extends InstanceProfileCredentialsProvider { 29 | 30 | private static final Log LOG = LogFactory.getLog( 31 | InstanceProfileCredentialsProviderWithRetries.class); 32 | 33 | public AWSCredentials getCredentials() { 34 | int retries = 10; 35 | int sleep = 500; 36 | while(retries > 0) { 37 | try { 38 | return super.getCredentials(); 39 | } 40 | catch (RuntimeException re) { 41 | LOG.error("Got an exception while fetching credentials " + re); 42 | --retries; 43 | try { 44 | Thread.sleep(sleep); 45 | } catch (InterruptedException ie) { 46 | // Do nothing 47 | } 48 | if (sleep < 10000) { 49 | sleep *= 2; 50 | } 51 | } 52 | catch (Error error) { 53 | LOG.error("Got an exception while fetching credentials " + error); 54 | --retries; 55 | try { 56 | Thread.sleep(sleep); 57 | } catch (InterruptedException ie) { 58 | // Do nothing 59 | } 60 | if (sleep < 10000) { 61 | sleep *= 2; 62 | } 63 | } 64 | } 65 | throw new AmazonClientException("Unable to load credentials."); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | org.apache.spark.sql.streaming.sqs.SqsSourceProvider -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | log4j.rootCategory=WARN, console 19 | 20 | # File appender 21 | log4j.appender.file=org.apache.log4j.FileAppender 22 | log4j.appender.file.append=false 23 | log4j.appender.file.file=target/unit-tests.log 24 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 25 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n 26 | 27 | # Console appender 28 | log4j.appender.console=org.apache.log4j.ConsoleAppender 29 | log4j.appender.console.target=System.out 30 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 31 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 32 | 33 | # Settings to quiet third party logs that are too verbose 34 | log4j.logger.org.sparkproject.jetty=WARN 35 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 36 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 37 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 38 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import java.text.SimpleDateFormat 21 | import java.util.TimeZone 22 | import java.util.concurrent.TimeUnit 23 | 24 | import scala.collection.JavaConverters._ 25 | 26 | import com.amazonaws.{AmazonClientException, AmazonServiceException, ClientConfiguration} 27 | import com.amazonaws.services.sqs.{AmazonSQS, AmazonSQSClientBuilder} 28 | import com.amazonaws.services.sqs.model.{DeleteMessageBatchRequestEntry, Message, ReceiveMessageRequest} 29 | import org.apache.hadoop.conf.Configuration 30 | import org.json4s.{DefaultFormats, MappingException} 31 | import org.json4s.JsonAST.JValue 32 | import org.json4s.jackson.JsonMethods.parse 33 | 34 | import org.apache.spark.SparkException 35 | import org.apache.spark.internal.Logging 36 | import org.apache.spark.util.ThreadUtils 37 | 38 | class SqsClient(sourceOptions: SqsSourceOptions, 39 | hadoopConf: Configuration) extends Logging { 40 | 41 | private val sqsFetchIntervalSeconds = sourceOptions.fetchIntervalSeconds 42 | private val sqsLongPollWaitTimeSeconds = sourceOptions.longPollWaitTimeSeconds 43 | private val sqsMaxRetries = sourceOptions.maxRetries 44 | private val maxConnections = sourceOptions.maxConnections 45 | private val ignoreFileDeletion = sourceOptions.ignoreFileDeletion 46 | private val region = sourceOptions.region 47 | val sqsUrl = sourceOptions.sqsUrl 48 | 49 | @volatile var exception: Option[Exception] = None 50 | 51 | private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 52 | timestampFormat.setTimeZone(TimeZone.getTimeZone("UTC")) 53 | private var retriesOnFailure = 0 54 | private val sqsClient = createSqsClient() 55 | 56 | val sqsScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("sqs-scheduler") 57 | 58 | val sqsFileCache = new SqsFileCache(sourceOptions.maxFileAgeMs, sourceOptions.fileNameOnly) 59 | 60 | val deleteMessageQueue = new java.util.concurrent.ConcurrentLinkedQueue[String]() 61 | 62 | private val sqsFetchMessagesThread = new Runnable { 63 | override def run(): Unit = { 64 | try { 65 | // Fetching messages from Amazon SQS 66 | val newMessages = sqsFetchMessages() 67 | 68 | // Filtering the new messages which are already not seen 69 | if (newMessages.nonEmpty) { 70 | newMessages.filter(message => sqsFileCache.isNewFile(message._1, message._2)) 71 | .foreach(message => 72 | sqsFileCache.add(message._1, MessageDescription(message._2, false, message._3))) 73 | } 74 | } catch { 75 | case e: Exception => 76 | exception = Some(e) 77 | } 78 | } 79 | } 80 | 81 | sqsScheduler.scheduleWithFixedDelay( 82 | sqsFetchMessagesThread, 83 | 0, 84 | sqsFetchIntervalSeconds, 85 | TimeUnit.SECONDS) 86 | 87 | private def sqsFetchMessages(): Seq[(String, Long, String)] = { 88 | val messageList = try { 89 | val receiveMessageRequest = new ReceiveMessageRequest() 90 | .withQueueUrl(sqsUrl) 91 | .withWaitTimeSeconds(sqsLongPollWaitTimeSeconds) 92 | val messages = sqsClient.receiveMessage(receiveMessageRequest).getMessages.asScala 93 | retriesOnFailure = 0 94 | logDebug(s"successfully received ${messages.size} messages") 95 | messages 96 | } catch { 97 | case ase: AmazonServiceException => 98 | val message = 99 | """ 100 | |Caught an AmazonServiceException, which means your request made it to Amazon SQS, 101 | | rejected with an error response for some reason. 
102 | """.stripMargin 103 | logWarning(message) 104 | logWarning(s"Error Message: ${ase.getMessage}") 105 | logWarning(s"HTTP Status Code: ${ase.getStatusCode}, AWS Error Code: ${ase.getErrorCode}") 106 | logWarning(s"Error Type: ${ase.getErrorType}, Request ID: ${ase.getRequestId}") 107 | evaluateRetries() 108 | List.empty 109 | case ace: AmazonClientException => 110 | val message = 111 | """ 112 | |Caught an AmazonClientException, which means, the client encountered a serious 113 | | internal problem while trying to communicate with Amazon SQS, such as not 114 | | being able to access the network. 115 | """.stripMargin 116 | logWarning(message) 117 | logWarning(s"Error Message: ${ace.getMessage()}") 118 | evaluateRetries() 119 | List.empty 120 | case e: Exception => 121 | val message = "Received unexpected error from SQS" 122 | logWarning(message) 123 | logWarning(s"Error Message: ${e.getMessage()}") 124 | evaluateRetries() 125 | List.empty 126 | } 127 | if (messageList.nonEmpty) { 128 | parseSqsMessages(messageList) 129 | } else { 130 | Seq.empty 131 | } 132 | } 133 | 134 | private def parseSqsMessages(messageList: Seq[Message]): Seq[(String, Long, String)] = { 135 | val errorMessages = scala.collection.mutable.ListBuffer[String]() 136 | val parsedMessages = messageList.foldLeft(Seq[(String, Long, String)]()) { (list, message) => 137 | implicit val formats = DefaultFormats 138 | try { 139 | val messageReceiptHandle = message.getReceiptHandle 140 | val messageJson = parse(message.getBody).extract[JValue] 141 | val bucketName = ( 142 | messageJson \ "Records" \ "s3" \ "bucket" \ "name").extract[Array[String]].head 143 | val eventName = (messageJson \ "Records" \ "eventName").extract[Array[String]].head 144 | if (eventName.contains("ObjectCreated")) { 145 | val timestamp = (messageJson \ "Records" \ "eventTime").extract[Array[String]].head 146 | val timestampMills = convertTimestampToMills(timestamp) 147 | val path = "s3://" + 148 | bucketName + "/" + 149 | (messageJson \ "Records" \ "s3" \ "object" \ "key").extract[Array[String]].head 150 | logDebug("Successfully parsed sqs message") 151 | list :+ ((path, timestampMills, messageReceiptHandle)) 152 | } else { 153 | if (eventName.contains("ObjectRemoved")) { 154 | if (!ignoreFileDeletion) { 155 | exception = Some(new SparkException("ObjectDelete message detected in SQS")) 156 | } else { 157 | logInfo("Ignoring file deletion message since ignoreFileDeletion is true") 158 | } 159 | } else { 160 | logWarning("Ignoring unexpected message detected in SQS") 161 | } 162 | errorMessages.append(messageReceiptHandle) 163 | list 164 | } 165 | } catch { 166 | case me: MappingException => 167 | errorMessages.append(message.getReceiptHandle) 168 | logWarning(s"Error in parsing SQS message ${me.getMessage}") 169 | list 170 | case e: Exception => 171 | errorMessages.append(message.getReceiptHandle) 172 | logWarning(s"Unexpected error while parsing SQS message ${e.getMessage}") 173 | list 174 | } 175 | } 176 | if (errorMessages.nonEmpty) { 177 | addToDeleteMessageQueue(errorMessages.toList) 178 | } 179 | parsedMessages 180 | } 181 | 182 | private def convertTimestampToMills(timestamp: String): Long = { 183 | val timeInMillis = timestampFormat.parse(timestamp).getTime() 184 | timeInMillis 185 | } 186 | 187 | private def evaluateRetries(): Unit = { 188 | retriesOnFailure += 1 189 | if (retriesOnFailure >= sqsMaxRetries) { 190 | logError("Max retries reached") 191 | exception = Some(new SparkException("Unable to receive Messages from SQS for " + 192 | 
s"${sqsMaxRetries} times Giving up. Check logs for details.")) 193 | } else { 194 | logWarning(s"Attempt ${retriesOnFailure}." + 195 | s"Will reattempt after ${sqsFetchIntervalSeconds} seconds") 196 | } 197 | } 198 | 199 | private def createSqsClient(): AmazonSQS = { 200 | try { 201 | val isClusterOnEc2Role = hadoopConf.getBoolean( 202 | "fs.s3.isClusterOnEc2Role", false) || hadoopConf.getBoolean( 203 | "fs.s3n.isClusterOnEc2Role", false) || sourceOptions.useInstanceProfileCredentials 204 | if (!isClusterOnEc2Role) { 205 | val accessKey = hadoopConf.getTrimmed("fs.s3n.awsAccessKeyId") 206 | val secretAccessKey = new String(hadoopConf.getPassword("fs.s3n.awsSecretAccessKey")).trim 207 | logInfo("Using credentials from keys provided") 208 | val basicAwsCredentialsProvider = new BasicAWSCredentialsProvider( 209 | accessKey, secretAccessKey) 210 | AmazonSQSClientBuilder 211 | .standard() 212 | .withClientConfiguration(new ClientConfiguration().withMaxConnections(maxConnections)) 213 | .withCredentials(basicAwsCredentialsProvider) 214 | .withRegion(region) 215 | .build() 216 | } else { 217 | logInfo("Using the credentials attached to the instance") 218 | val instanceProfileCredentialsProvider = new InstanceProfileCredentialsProviderWithRetries() 219 | AmazonSQSClientBuilder 220 | .standard() 221 | .withClientConfiguration(new ClientConfiguration().withMaxConnections(maxConnections)) 222 | .withCredentials(instanceProfileCredentialsProvider) 223 | .build() 224 | } 225 | } catch { 226 | case e: Exception => 227 | throw new SparkException(s"Error occured while creating Amazon SQS Client", e) 228 | } 229 | } 230 | 231 | def addToDeleteMessageQueue(messageReceiptHandles: List[String]): Unit = { 232 | deleteMessageQueue.addAll(messageReceiptHandles.asJava) 233 | } 234 | 235 | def deleteMessagesFromQueue(): Unit = { 236 | try { 237 | var count = -1 238 | val messageReceiptHandles = deleteMessageQueue.asScala.toList 239 | val messageGroups = messageReceiptHandles.sliding(10, 10).toList 240 | messageGroups.foreach { messageGroup => 241 | val requestEntries = messageGroup.foldLeft(List[DeleteMessageBatchRequestEntry]()) { 242 | (list, messageReceiptHandle) => 243 | count = count + 1 244 | list :+ new DeleteMessageBatchRequestEntry(count.toString, messageReceiptHandle) 245 | }.asJava 246 | val batchResult = sqsClient.deleteMessageBatch(sqsUrl, requestEntries) 247 | if (!batchResult.getFailed.isEmpty) { 248 | batchResult.getFailed.asScala.foreach { entry => 249 | sqsClient.deleteMessage( 250 | sqsUrl, requestEntries.get(entry.getId.toInt).getReceiptHandle) 251 | } 252 | } 253 | } 254 | } catch { 255 | case e: Exception => 256 | logWarning(s"Unable to delete message from SQS ${e.getMessage}") 257 | } 258 | deleteMessageQueue.clear() 259 | } 260 | 261 | def assertSqsIsWorking(): Unit = { 262 | if (exception.isDefined) { 263 | throw exception.get 264 | } 265 | } 266 | 267 | } 268 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsFileCache.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import java.net.URI 21 | import java.util.concurrent.ConcurrentHashMap 22 | 23 | import scala.collection.JavaConverters._ 24 | import scala.collection.mutable.ListBuffer 25 | 26 | import org.apache.hadoop.fs.Path 27 | 28 | import org.apache.spark.internal.Logging 29 | 30 | /** 31 | * A custom hash map used to track the list of files seen. This map is thread-safe. 32 | * To prevent the hash map from growing indefinitely, a purge function is available to 33 | * remove files "maxAgeMs" older than the latest file. 34 | */ 35 | 36 | class SqsFileCache(maxAgeMs: Long, fileNameOnly: Boolean) extends Logging { 37 | require(maxAgeMs >= 0) 38 | if (fileNameOnly) { 39 | logWarning("'fileNameOnly' is enabled. Make sure your file names are unique (e.g. using " + 40 | "UUID), otherwise, files with the same name but under different paths will be considered " + 41 | "the same and causes data lost.") 42 | } 43 | 44 | /** Mapping from file path to its message description. */ 45 | private val sqsMap = new ConcurrentHashMap[String, MessageDescription] 46 | 47 | /** Timestamp for the last purge operation. */ 48 | private var lastPurgeTimestamp: Long = 0L 49 | 50 | /** Timestamp of the latest file. */ 51 | private var latestTimestamp: Long = 0L 52 | 53 | @inline private def stripPathIfNecessary(path: String) = { 54 | if (fileNameOnly) new Path(new URI(path)).getName else path 55 | } 56 | 57 | /** 58 | * Returns true if we should consider this file a new file. The file is only considered "new" 59 | * if it is new enough that we are still tracking, and we have not seen it before. 60 | */ 61 | def isNewFile(path: String, timestamp: Long): Boolean = { 62 | timestamp >= lastPurgeTimestamp && !sqsMap.containsKey(stripPathIfNecessary(path)) 63 | } 64 | 65 | /** Add a new file to the map. */ 66 | def add(path: String, fileStatus: MessageDescription): Unit = { 67 | sqsMap.put(stripPathIfNecessary(path), fileStatus) 68 | if (fileStatus.timestamp > latestTimestamp) { 69 | latestTimestamp = fileStatus.timestamp 70 | } 71 | } 72 | 73 | /** 74 | * Returns all the new files found - ignore aged files and files that we have already seen. 75 | * Sorts the files by timestamp. 
76 | */ 77 | def getUncommittedFiles(maxFilesPerTrigger: Option[Int], 78 | shouldSortFiles: Boolean): Seq[(String, Long, String)] = { 79 | if (shouldSortFiles) { 80 | val uncommittedFiles = filterAllUncommittedFiles() 81 | val sortedFiles = reportTimeTaken("Sorting Files") { 82 | uncommittedFiles.sortWith(_._2 < _._2) 83 | } 84 | if (maxFilesPerTrigger.nonEmpty) sortedFiles.take(maxFilesPerTrigger.get) else sortedFiles 85 | } else { 86 | if (maxFilesPerTrigger.isEmpty) { 87 | filterAllUncommittedFiles() 88 | } else { 89 | filterTopUncommittedFiles(maxFilesPerTrigger.get) 90 | } 91 | } 92 | } 93 | private def filterTopUncommittedFiles(maxFilesPerTrigger: Int): List[(String, Long, String)] = { 94 | val iterator = sqsMap.asScala.iterator 95 | val uncommittedFiles = ListBuffer[(String, Long, String)]() 96 | while (uncommittedFiles.length < maxFilesPerTrigger && iterator.hasNext) { 97 | val file = iterator.next() 98 | if (file._2.isCommitted && file._2.timestamp >= lastPurgeTimestamp) { 99 | uncommittedFiles += ((file._1, file._2.timestamp, file._2.messageReceiptHandle)) 100 | } 101 | } 102 | uncommittedFiles.toList 103 | } 104 | 105 | private def reportTimeTaken[T](operation: String)(body: => T): T = { 106 | val startTime = System.currentTimeMillis() 107 | val result = body 108 | val endTime = System.currentTimeMillis() 109 | val timeTaken = math.max(endTime - startTime, 0) 110 | 111 | logDebug(s"$operation took $timeTaken ms") 112 | result 113 | } 114 | 115 | private def filterAllUncommittedFiles(): List[(String, Long, String)] = { 116 | sqsMap.asScala.foldLeft(List[(String, Long, String)]()) { 117 | (list, file) => 118 | if (!file._2.isCommitted && file._2.timestamp >= lastPurgeTimestamp) { 119 | list :+ ((file._1, file._2.timestamp, file._2.messageReceiptHandle)) 120 | } else { 121 | list 122 | } 123 | } 124 | } 125 | 126 | /** Removes aged entries and returns the number of files removed. */ 127 | def purge(): Int = { 128 | lastPurgeTimestamp = latestTimestamp - maxAgeMs 129 | var count = 0 130 | sqsMap.asScala.foreach { fileEntry => 131 | if (fileEntry._2.timestamp < lastPurgeTimestamp) { 132 | sqsMap.remove(fileEntry._1) 133 | count += 1 134 | } 135 | } 136 | count 137 | } 138 | 139 | /** Mark file entry as committed or already processed */ 140 | def markCommitted(path: String): Unit = { 141 | sqsMap.replace(path, MessageDescription( 142 | sqsMap.get(path).timestamp, true, sqsMap.get(path).messageReceiptHandle)) 143 | } 144 | 145 | def size: Int = sqsMap.size() 146 | 147 | } 148 | 149 | /** 150 | * A case class to store file metadata. Metadata includes file timestamp, file status - 151 | * committed or not committed and message reciept handle used for deleting message from 152 | * Amazon SQS 153 | */ 154 | case class MessageDescription(timestamp: Long, 155 | isCommitted: Boolean = false, 156 | messageReceiptHandle: String) 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import java.net.URI 21 | 22 | import org.apache.hadoop.fs.Path 23 | 24 | import org.apache.spark.internal.Logging 25 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 26 | import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation} 27 | import org.apache.spark.sql.execution.streaming._ 28 | import org.apache.spark.sql.execution.streaming.FileStreamSource._ 29 | import org.apache.spark.sql.types.StructType 30 | 31 | 32 | class SqsSource(sparkSession: SparkSession, 33 | metadataPath: String, 34 | options: Map[String, String], 35 | override val schema: StructType) extends Source with Logging { 36 | 37 | private val sourceOptions = new SqsSourceOptions(options) 38 | 39 | private val hadoopConf = sparkSession.sessionState.newHadoopConf() 40 | 41 | private val metadataLog = 42 | new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath) 43 | private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L) 44 | 45 | private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger 46 | 47 | private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs 48 | 49 | private val fileFormatClassName = sourceOptions.fileFormatClassName 50 | 51 | private val shouldSortFiles = sourceOptions.shouldSortFiles 52 | 53 | private val sqsClient = new SqsClient(sourceOptions, hadoopConf) 54 | 55 | metadataLog.allFiles().foreach { entry => 56 | sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, "")) 57 | } 58 | sqsClient.sqsFileCache.purge() 59 | 60 | logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs") 61 | 62 | /** 63 | * Returns the data that is between the offsets (`start`, `end`]. 64 | */ 65 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = { 66 | val startOffset = start.map(FileStreamSourceOffset(_).logOffset).getOrElse(-1L) 67 | val endOffset = FileStreamSourceOffset(end).logOffset 68 | 69 | assert(startOffset <= endOffset) 70 | val files = metadataLog.get(Some(startOffset + 1), Some(endOffset)).flatMap(_._2) 71 | logInfo(s"Processing ${files.length} files from ${startOffset + 1}:$endOffset") 72 | logTrace(s"Files are:\n\t" + files.mkString("\n\t")) 73 | val newDataSource = 74 | DataSource( 75 | sparkSession, 76 | paths = files.map(f => new Path(new URI(f.path)).toString), 77 | userSpecifiedSchema = Some(schema), 78 | className = fileFormatClassName, 79 | options = options) 80 | Dataset.ofRows(sparkSession, LogicalRelation(newDataSource.resolveRelation( 81 | checkFilesExist = false), isStreaming = true)) 82 | } 83 | 84 | private def fetchMaxOffset(): FileStreamSourceOffset = synchronized { 85 | 86 | sqsClient.assertSqsIsWorking() 87 | /** 88 | * All the new files found - ignore aged files and files that we have seen. 89 | * Obey user's setting to limit the number of files in this batch trigger. 
90 | */ 91 | val batchFiles = sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles) 92 | 93 | if (batchFiles.nonEmpty) { 94 | metadataLogCurrentOffset += 1 95 | metadataLog.add(metadataLogCurrentOffset, batchFiles.map { 96 | case (path, timestamp, receiptHandle) => 97 | FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset) 98 | }.toArray) 99 | logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files") 100 | val messageReceiptHandles = batchFiles.map { 101 | case (path, timestamp, receiptHandle) => 102 | sqsClient.sqsFileCache.markCommitted(path) 103 | logDebug(s"New file: $path") 104 | receiptHandle 105 | }.toList 106 | sqsClient.addToDeleteMessageQueue(messageReceiptHandles) 107 | } 108 | 109 | val numPurged = sqsClient.sqsFileCache.purge() 110 | 111 | if (!sqsClient.deleteMessageQueue.isEmpty) { 112 | sqsClient.deleteMessagesFromQueue() 113 | } 114 | 115 | logTrace( 116 | s""" 117 | |Number of files selected for batch = ${batchFiles.size} 118 | |Number of files purged from tracking map = $numPurged 119 | """.stripMargin) 120 | 121 | FileStreamSourceOffset(metadataLogCurrentOffset) 122 | } 123 | 124 | override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1) 125 | 126 | override def commit(end: Offset): Unit = { 127 | // No-op for now; SqsSource currently garbage-collects files based on timestamp 128 | // and the value of the maxFileAge parameter. 129 | } 130 | 131 | override def stop(): Unit = { 132 | if (!sqsClient.sqsScheduler.isTerminated) { 133 | sqsClient.sqsScheduler.shutdownNow() 134 | } 135 | } 136 | 137 | override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]" 138 | 139 | } 140 | 141 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import scala.util.Try 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 24 | import org.apache.spark.util.Utils 25 | 26 | /** 27 | * User specified options for sqs source. 
28 |  */
29 | class SqsSourceOptions(parameters: CaseInsensitiveMap[String]) extends Logging {
30 | 
31 |   def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
32 | 
33 |   val maxFilesPerTrigger: Option[Int] = parameters.get("maxFilesPerTrigger").map { str =>
34 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
35 |       throw new IllegalArgumentException(
36 |         s"Invalid value '$str' for option 'maxFilesPerTrigger', must be a positive integer")
37 |     }
38 |   }
39 | 
40 |   /**
41 |    * Maximum age of a file that can be picked up by this source, before it is ignored. For the
42 |    * first batch all files will be considered valid.
43 |    *
44 |    * The max age is specified with respect to the timestamp of the latest file, and not the
45 |    * timestamp of the current system. This means that if the last file has timestamp 1000, the
46 |    * current system time is 2000, and the max age is 200, the system will purge files older than
47 |    * 800 (rather than 1800) from the internal state.
48 |    *
49 |    * Defaults to a week.
50 |    */
51 |   val maxFileAgeMs: Long =
52 |     Utils.timeStringAsMs(parameters.getOrElse("maxFileAge", "7d"))
53 | 
54 |   val fetchIntervalSeconds: Int = parameters.get("sqsFetchIntervalSeconds").map { str =>
55 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
56 |       throw new IllegalArgumentException(
57 |         s"Invalid value '$str' for option 'sqsFetchIntervalSeconds', must be a positive integer")
58 |     }
59 |   }.getOrElse(10)
60 | 
61 |   val longPollWaitTimeSeconds: Int = parameters.get("sqsLongPollingWaitTimeSeconds").map { str =>
62 |     Try(str.toInt).toOption.filter(x => x >= 0 && x <= 20).getOrElse {
63 |       throw new IllegalArgumentException(
64 |         s"Invalid value '$str' for option 'sqsLongPollingWaitTimeSeconds'," +
65 |           "must be an integer between 0 and 20")
66 |     }
67 |   }.getOrElse(20)
68 | 
69 |   val maxRetries: Int = parameters.get("sqsMaxRetries").map { str =>
70 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
71 |       throw new IllegalArgumentException(
72 |         s"Invalid value '$str' for option 'sqsMaxRetries', must be a positive integer")
73 |     }
74 |   }.getOrElse(10)
75 | 
76 |   val maxConnections: Int = parameters.get("sqsMaxConnections").map { str =>
77 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
78 |       throw new IllegalArgumentException(
79 |         s"Invalid value '$str' for option 'sqsMaxConnections', must be a positive integer")
80 |     }
81 |   }.getOrElse(1)
82 | 
83 |   val sqsUrl: String = parameters.get("sqsUrl").getOrElse {
84 |     throw new IllegalArgumentException("SQS Url is not specified")
85 |   }
86 | 
87 |   val region: String = parameters.get("region").getOrElse {
88 |     throw new IllegalArgumentException("Region is not specified")
89 |   }
90 | 
91 |   val fileFormatClassName: String = parameters.get("fileFormat").getOrElse {
92 |     throw new IllegalArgumentException("Specifying file format is mandatory with sqs source")
93 |   }
94 | 
95 |   val ignoreFileDeletion: Boolean = withBooleanParameter("ignoreFileDeletion", false)
96 | 
97 |   /**
98 |    * Whether to check new files based on only the filename instead of on the full path. 
99 | * 100 | * With this set to `true`, the following files would be considered as the same file, because 101 | * their filenames, "dataset.txt", are the same: 102 | * - "file:///dataset.txt" 103 | * - "s3://a/dataset.txt" 104 | * - "s3n://a/b/dataset.txt" 105 | * - "s3a://a/b/c/dataset.txt" 106 | */ 107 | val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false) 108 | 109 | val shouldSortFiles: Boolean = withBooleanParameter("shouldSortFiles", true) 110 | 111 | val useInstanceProfileCredentials: Boolean = withBooleanParameter( 112 | "useInstanceProfileCredentials", false) 113 | 114 | private def withBooleanParameter(name: String, default: Boolean) = { 115 | parameters.get(name).map { str => 116 | try { 117 | str.toBoolean 118 | } catch { 119 | case _: IllegalArgumentException => 120 | throw new IllegalArgumentException( 121 | s"Invalid value '$str' for option '$name', must be true or false") 122 | } 123 | }.getOrElse(default) 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SQLContext 22 | import org.apache.spark.sql.execution.streaming.Source 23 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | class SqsSourceProvider extends DataSourceRegister 27 | with StreamSourceProvider 28 | with Logging { 29 | 30 | override def shortName(): String = "s3-sqs" 31 | 32 | override def sourceSchema(sqlContext: SQLContext, 33 | schema: Option[StructType], 34 | providerName: String, 35 | parameters: Map[String, String]): (String, StructType) = { 36 | 37 | require(schema.isDefined, "Sqs source doesn't support empty schema") 38 | (shortName(), schema.get) 39 | } 40 | 41 | override def createSource(sqlContext: SQLContext, 42 | metadataPath: String, 43 | schema: Option[StructType], 44 | providerName: String, 45 | parameters: Map[String, String]): Source = { 46 | 47 | new SqsSource( 48 | sqlContext.sparkSession, 49 | metadataPath, 50 | parameters, 51 | schema.get) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.sparkproject.jetty=WARN 28 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/streaming/sqs/SqsSourceOptionsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.streaming.sqs 18 | 19 | import java.util.Locale 20 | 21 | import org.apache.spark.sql.streaming.{StreamingQuery, StreamingQueryException, StreamTest} 22 | import org.apache.spark.sql.types.StructType 23 | 24 | class SqsSourceOptionsSuite extends StreamTest { 25 | 26 | test("bad source options") { 27 | def testBadOptions(option: (String, String))(expectedMsg: String): Unit = { 28 | 29 | var query : StreamingQuery = null 30 | 31 | try { 32 | val errorMessage = intercept[StreamingQueryException] { 33 | val dummySchema = new StructType 34 | val reader = spark 35 | .readStream 36 | .format("s3-sqs") 37 | .option("fileFormat", "json") 38 | .schema(dummySchema) 39 | .option("sqsUrl", "https://DUMMY_URL") 40 | .option("region", "us-east-1") 41 | .option(option._1, option._2) 42 | .load() 43 | 44 | query = reader.writeStream 45 | .format("memory") 46 | .queryName("badOptionsTest") 47 | .start() 48 | 49 | query.processAllAvailable() 50 | }.getMessage 51 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 52 | } finally { 53 | if (query != null) { 54 | // terminating streaming query if necessary 55 | query.stop() 56 | } 57 | 58 | } 59 | } 60 | 61 | testBadOptions("sqsFetchIntervalSeconds" -> "-2")("Invalid value '-2' " + 62 | "for option 'sqsFetchIntervalSeconds', must be a positive integer") 63 | testBadOptions("sqsLongPollingWaitTimeSeconds" -> "-5")("Invalid value '-5' " + 64 | "for option 'sqsLongPollingWaitTimeSeconds',must be an integer between 0 and 20") 65 | testBadOptions("sqsMaxConnections" -> "-2")("Invalid value '-2' " + 66 | "for option 'sqsMaxConnections', must be a positive integer") 67 | testBadOptions("maxFilesPerTrigger" -> "-50")("Invalid value '-50' " + 68 | "for option 'maxFilesPerTrigger', must be a positive integer") 69 | testBadOptions("ignoreFileDeletion" -> "x")("Invalid value 'x' " + 70 | "for option 'ignoreFileDeletion', must be true or false") 71 | testBadOptions("fileNameOnly" -> "x")("Invalid value 'x' " + 72 | "for option 'fileNameOnly', must be true or false") 73 | testBadOptions("shouldSortFiles" -> "x")("Invalid value 'x' " + 74 | "for option 'shouldSortFiles', must be true or false") 75 | testBadOptions("useInstanceProfileCredentials" -> "x")("Invalid value 'x' " + 76 | "for option 'useInstanceProfileCredentials', must be true or false") 77 | 78 | } 79 | 80 | test("missing mandatory options") { 81 | 82 | def testMissingMandatoryOptions(options: List[(String, String)])(expectedMsg: String): Unit = { 83 | 84 | var query: StreamingQuery = null 85 | 86 | try { 87 | val errorMessage = intercept[StreamingQueryException] { 88 | val dummySchema = new StructType 89 | val reader = spark 90 | .readStream 91 | .format("s3-sqs") 92 | .schema(dummySchema) 93 | 94 | val readerWithOptions = options.map { option => 95 | reader.option(option._1, option._2) 96 | }.last.load() 97 | 98 | query = readerWithOptions.writeStream 99 | .format("memory") 100 | .queryName("missingMandatoryOptions") 101 | .start() 102 | 103 | query.processAllAvailable() 104 | }.getMessage 105 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 106 | } finally { 107 | if (query != null) { 108 | // terminating streaming query if necessary 109 | query.stop() 110 | } 111 | } 112 | } 113 | 114 | // No fileFormat specified 115 | testMissingMandatoryOptions(List("sqsUrl" -> "https://DUMMY_URL", "region" -> "us-east-1"))( 116 | "Specifying file format is mandatory with sqs source") 117 | 118 | 
// Sqs URL not specified 119 | testMissingMandatoryOptions(List("fileFormat" -> "json", "region" -> "us-east-1"))( 120 | "SQS Url is not specified") 121 | } 122 | 123 | test("schema not specified") { 124 | 125 | var query: StreamingQuery = null 126 | 127 | val expectedMsg = "Sqs source doesn't support empty schema" 128 | 129 | try { 130 | val errorMessage = intercept[IllegalArgumentException] { 131 | val reader = spark 132 | .readStream 133 | .format("s3-sqs") 134 | .option("sqsUrl", "https://DUMMY_URL") 135 | .option("fileFormat", "json") 136 | .option("region", "us-east-1") 137 | .load() 138 | 139 | query = reader.writeStream 140 | .format("memory") 141 | .queryName("missingSchema") 142 | .start() 143 | 144 | query.processAllAvailable() 145 | }.getMessage 146 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 147 | } finally { 148 | if (query != null) { 149 | // terminating streaming query if necessary 150 | query.stop() 151 | } 152 | } 153 | 154 | } 155 | 156 | } 157 | 158 | --------------------------------------------------------------------------------
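
For orientation, the following is a minimal sketch of how the "s3-sqs" source registered by SqsSourceProvider might be wired into a streaming query. Only the short name and the option names come from the sources above; the schema, queue URL, and paths are hypothetical placeholders, and the sink choice is illustrative rather than prescribed by this repository.

// Minimal usage sketch. Assumptions: the connector jar is on the classpath; the
// queue URL, schema, and filesystem paths below are placeholders.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object SqsSourceUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SqsSourceUsageSketch").getOrCreate()

    // Schema of the JSON files announced on the queue (hypothetical).
    val schema = StructType(Seq(
      StructField("id", StringType),
      StructField("payload", StringType)))

    val input = spark
      .readStream
      .format("s3-sqs")                          // short name registered by SqsSourceProvider
      .schema(schema)                            // mandatory: the source rejects an empty schema
      .option("fileFormat", "json")              // mandatory
      .option("sqsUrl", "https://sqs.us-east-1.amazonaws.com/123456789012/queue") // placeholder
      .option("region", "us-east-1")             // mandatory
      .option("maxFilesPerTrigger", "100")       // optional: cap files per micro-batch
      .option("sqsFetchIntervalSeconds", "10")   // optional: SQS poll interval
      .option("maxFileAge", "7d")                // optional: purge horizon for tracked files
      .load()

    val query = input.writeStream
      .format("parquet")
      .option("checkpointLocation", "/tmp/sqs-example/checkpoint") // placeholder
      .start("/tmp/sqs-example/output")                            // placeholder

    query.awaitTermination()
  }
}

Invalid option values (for example a negative sqsFetchIntervalSeconds) surface as a StreamingQueryException once the query starts, which is the behaviour exercised by SqsSourceOptionsSuite above.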
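The option defaults can also be confirmed programmatically. A small sketch, assuming SqsSourceOptions is constructed directly with an option map that supplies only the three mandatory options (the queue URL below is a placeholder); the asserted values mirror the defaults defined in SqsSourceOptions:

// Sketch: checking SqsSourceOptions defaults (assumes the connector classes are on the classpath).
import org.apache.spark.sql.streaming.sqs.SqsSourceOptions

object SqsSourceOptionsDefaults {
  def main(args: Array[String]): Unit = {
    val opts = new SqsSourceOptions(Map(
      "sqsUrl" -> "https://sqs.us-east-1.amazonaws.com/123456789012/queue", // placeholder
      "region" -> "us-east-1",
      "fileFormat" -> "json"))

    assert(opts.fetchIntervalSeconds == 10)     // sqsFetchIntervalSeconds default
    assert(opts.longPollWaitTimeSeconds == 20)  // sqsLongPollingWaitTimeSeconds default
    assert(opts.maxRetries == 10)               // sqsMaxRetries default
    assert(opts.maxConnections == 1)            // sqsMaxConnections default
    assert(opts.maxFileAgeMs == 7L * 24 * 60 * 60 * 1000) // maxFileAge default of "7d"
    assert(!opts.ignoreFileDeletion && !opts.fileNameOnly && opts.shouldSortFiles)
  }
}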