├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── dev └── checkstyle.xml ├── examples └── src │ └── main │ └── scala │ └── org │ └── apache │ └── spark │ └── examples │ └── sql │ └── streaming │ └── sqs │ └── SqsSourceExample.scala ├── pom.xml ├── scalastyle-config.xml └── src ├── main ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── streaming │ │ └── sqs │ │ ├── BasicAWSCredentialsProvider.java │ │ └── InstanceProfileCredentialsProviderWithRetries.java ├── resources │ ├── META-INF │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── log4j.properties └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── streaming │ └── sqs │ ├── SqsClient.scala │ ├── SqsFileCache.scala │ ├── SqsSource.scala │ ├── SqsSourceOptions.scala │ └── SqsSourceProvider.scala └── test ├── resources └── log4j.properties └── scala └── org └── apache └── spark └── sql └── streaming └── sqs └── SqsSourceOptionsSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.swm 4 | *.swn 5 | *.swk 6 | *.swl 7 | *.swo 8 | *.swp 9 | *.ipr 10 | *.iml 11 | *.ipr 12 | *.iws 13 | *.pyc 14 | *.pyo 15 | *.swp 16 | *~ 17 | .DS_Store 18 | .cache 19 | .classpath 20 | .ensime 21 | .ensime_cache/ 22 | .ensime_lucene 23 | .generated-mima* 24 | .idea/ 25 | .idea_modules/ 26 | .project 27 | .pydevproject 28 | .scala_dependencies 29 | .settings 30 | /lib/ 31 | R-unit-tests.log 32 | R/unit-tests.out 33 | R/cran-check.out 34 | R/pkg/vignettes/sparkr-vignettes.html 35 | build/*.jar 36 | build/apache-maven* 37 | build/scala* 38 | build/zinc* 39 | cache 40 | checkpoint 41 | conf/*.cmd 42 | conf/*.conf 43 | conf/*.properties 44 | conf/*.sh 45 | conf/*.xml 46 | conf/java-opts 47 | conf/slaves 48 | dependency-reduced-pom.xml 49 | derby.log 50 | dev/create-release/*final 51 | dev/create-release/*txt 52 | dev/pr-deps/ 53 | dist/ 54 | docs/_site 55 | docs/api 56 | lib_managed/ 57 | lint-r-report.log 58 | log/ 59 | logs/ 60 | out/ 61 | project/boot/ 62 | project/build/target/ 63 | project/plugins/lib_managed/ 64 | project/plugins/project/build.properties 65 | project/plugins/src_managed/ 66 | project/plugins/target/ 67 | python/lib/pyspark.zip 68 | python/deps 69 | python/pyspark/python 70 | reports/ 71 | scalastyle-on-compile.generated.xml 72 | scalastyle-output.xml 73 | scalastyle.txt 74 | spark-*-bin-*.tgz 75 | spark-tests.log 76 | src_managed/ 77 | streaming-tests.log 78 | target/ 79 | unit-tests.log 80 | work/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | language: java 19 | 20 | jdk: 21 | - openjdk8 22 | - oraclejdk8 23 | 24 | dist: trusty 25 | 26 | script: 27 | - mvn -DskipTests clean install -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3-SQS Connector 2 | 3 | [![Build Status](https://travis-ci.org/qubole/s3-sqs-connector.svg?branch=master)](https://travis-ci.org/qubole/s3-sqs-connector) 4 | 5 | A library for reading data from Amazon S3 with optimised file listing via Amazon SQS, using Spark SQL Streaming (Structured Streaming).
6 | 7 | ## Linking 8 | 9 | Using SBT: 10 | 11 | libraryDependencies += "com.qubole" % "spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}" % "{{site.PROJECT_VERSION}}" 12 | 13 | Using Maven: 14 | 15 | <dependency> 16 | <groupId>com.qubole</groupId> 17 | <artifactId>spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}</artifactId> 18 | <version>{{site.PROJECT_VERSION}}</version> 19 | </dependency> 20 | 21 | This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option. 22 | For example, to include it when starting the spark shell: 23 | 24 | $ bin/spark-shell --packages com.qubole:spark-sql-streaming-sqs_{{site.SCALA_BINARY_VERSION}}:{{site.PROJECT_VERSION}} 25 | 26 | Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath. 27 | The `--packages` argument can also be used with `bin/spark-submit`. 28 | 29 | This library is compiled for Scala 2.11 only, and is intended to support Spark 2.4.0 onwards. 30 | 31 | ## Building S3-SQS Connector 32 | 33 | The S3-SQS connector is built using [Apache Maven](http://maven.apache.org/). 34 | 35 | To build the S3-SQS connector, clone this repository and run: 36 | ``` 37 | mvn -DskipTests clean package 38 | ``` 39 | 40 | This will create the `target/spark-sql-streaming-sqs_2.11-0.5.1.jar` file, which contains the s3-sqs connector code and its associated dependencies. Make sure the Scala and Java versions correspond to those required by your Spark cluster. We have tested it with Java 7/8, Scala 2.11 and Spark version 2.4.0. 41 | 42 | 43 | ## Configuration options 44 | The configuration is passed as options to the source. 45 | 46 | Name |Default | Meaning 47 | --- |:---:| --- 48 | sqsUrl|required, no default value|SQS queue URL, like 'https://sqs.us-east-1.amazonaws.com/330183209093/TestQueue' 49 | region|required, no default value|AWS region where the queue is created 50 | fileFormat|required, no default value|file format of the files stored on Amazon S3 51 | schema|required, no default value|schema of the data being read 52 | sqsFetchIntervalSeconds|10|time interval (in seconds) after which to fetch messages from the Amazon SQS queue 53 | sqsLongPollingWaitTimeSeconds|20|wait time (in seconds) for long polling on the Amazon SQS queue 54 | sqsMaxConnections|1|number of parallel threads to connect to the Amazon SQS queue 55 | sqsMaxRetries|10|maximum number of consecutive retries in case of a connection failure to SQS before giving up 56 | ignoreFileDeletion|false|whether to ignore any file-deleted message in the SQS queue 57 | fileNameOnly|false|whether to identify new files based on the filename only instead of the full path 58 | shouldSortFiles|true|whether to sort files based on timestamp while listing them from SQS 59 | useInstanceProfileCredentials|false|whether to use EC2 instance profile credentials for connecting to Amazon SQS 60 | maxFilesPerTrigger|no default value|maximum number of files to process in a microbatch 61 | maxFileAge|7d|maximum age of a file before it is ignored, relative to the newest file tracked 62 | 63 | ## Example 64 | 65 | An example of creating a SQL stream which uses Amazon SQS to list files on S3: 66 | 67 | val inputDf = sparkSession 68 | .readStream 69 | .format("s3-sqs") 70 | .schema(schema) 71 | .option("sqsUrl", queueUrl) 72 | .option("region", awsRegion) 73 | .option("fileFormat", "json") 74 | .option("sqsFetchIntervalSeconds", "2") 75 | .option("useInstanceProfileCredentials", "true") 76 | .option("sqsLongPollingWaitTimeSeconds", "5") 77 | .load() 78 | --------------------------------------------------------------------------------
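The README example above only builds the streaming DataFrame; a complete job also needs a sink and a checkpoint location. Below is a minimal end-to-end sketch in the same spirit. The queue URL, region, sample-data path and output locations are placeholders rather than values from this repository; the source options mirror the configuration table in the README, and the `s3-sqs` format name is the one registered by `SqsSourceProvider`.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.streaming.Trigger

    object S3SqsToParquetExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("S3SqsToParquetExample").getOrCreate()

        // Placeholders -- substitute your own queue, region, and S3 locations.
        val queueUrl = "https://sqs.us-east-1.amazonaws.com/123456789012/my-events-queue"
        val awsRegion = "us-east-1"

        // The s3-sqs source requires an explicit schema; here it is inferred once
        // from a representative sample file.
        val schema = spark.read.json("s3://my-bucket/sample/sample.json").schema

        val inputDf = spark
          .readStream
          .format("s3-sqs")
          .schema(schema)
          .option("sqsUrl", queueUrl)
          .option("region", awsRegion)
          .option("fileFormat", "json")
          .option("sqsFetchIntervalSeconds", "2")
          .option("sqsLongPollingWaitTimeSeconds", "5")
          .option("maxFilesPerTrigger", "100")
          .load()

        // Drain the stream into Parquet, tracking progress in a checkpoint directory.
        val query = inputDf
          .writeStream
          .format("parquet")
          .option("path", "s3://my-bucket/output/")
          .option("checkpointLocation", "s3://my-bucket/checkpoints/s3-sqs-example/")
          .trigger(Trigger.ProcessingTime("30 seconds"))
          .start()

        query.awaitTermination()
      }
    }

As in the bundled `SqsSourceExample`, any Structured Streaming sink can be used here; the memory sink in that example is convenient for testing, while a file sink such as Parquet is closer to a production setup.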
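It is also worth seeing how credentials reach the SQS client. `SqsClient.createSqsClient` either uses EC2 instance profile credentials (when `useInstanceProfileCredentials` is true or `fs.s3.isClusterOnEc2Role`/`fs.s3n.isClusterOnEc2Role` is set in the Hadoop configuration) or builds a `BasicAWSCredentialsProvider` from the Hadoop properties `fs.s3n.awsAccessKeyId` and `fs.s3n.awsSecretAccessKey`. The sketch below shows one way to supply those properties through the SparkSession's Hadoop configuration; the key values, queue URL and schema are placeholders, and the same properties can usually be passed with `--conf spark.hadoop.fs.s3n.awsAccessKeyId=...` on `spark-submit`.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    object SqsCredentialsExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("SqsCredentialsExample").getOrCreate()

        // Placeholder keys, read by SqsClient via BasicAWSCredentialsProvider when the
        // job is not relying on an EC2 instance profile.
        spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "<access-key>")
        spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "<secret-key>")

        // Placeholder queue and a deliberately simple one-column schema.
        val queueUrl = "https://sqs.us-east-1.amazonaws.com/123456789012/my-events-queue"
        val schema = StructType(Seq(StructField("value", StringType)))

        val inputDf = spark
          .readStream
          .format("s3-sqs")
          .schema(schema)
          .option("sqsUrl", queueUrl)
          .option("region", "us-east-1")
          .option("fileFormat", "json")
          // On an EC2 cluster with an attached IAM role, drop the key properties above
          // and use the instance profile instead:
          // .option("useInstanceProfileCredentials", "true")
          .load()

        inputDf.writeStream
          .format("console")
          .option("checkpointLocation", "/tmp/sqs-credentials-example-checkpoint")
          .start()
          .awaitTermination()
      }
    }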
/dev/checkstyle.xml: -------------------------------------------------------------------------------- (Checkstyle rule definitions: the XML markup was not preserved in this extraction.) -------------------------------------------------------------------------------- /examples/src/main/scala/org/apache/spark/examples/sql/streaming/sqs/SqsSourceExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package org.apache.spark.examples.sql.streaming.sqs 19 | 20 | import scala.util.Random 21 | 22 | import org.apache.spark.sql.SparkSession 23 | 24 | /** 25 | * Example to read files from S3 using SQS Source and write results to Memory Sink 26 | * 27 | * Usage: SqsSourceExample 28 | */ 29 | 30 | object SqsSourceExample { 31 | 32 | def main(args: Array[String]) { 33 | 34 | val randomName = Random.alphanumeric.take(6).mkString("") 35 | val pathName = "path_" + randomName 36 | val queryName = "query_" + randomName 37 | val checkpointDir = s"/checkpoints/$pathName" 38 | val schemaPathString = args(0) 39 | 40 | val spark = SparkSession.builder().appName("SqsExample").getOrCreate() 41 | 42 | val schema = spark.read.json(schemaPathString).schema 43 | 44 | val queueUrl = args(1) 45 | 46 | val fileFormat = args(2) 47 | 48 | val inputDf = spark 49 | .readStream 50 | .format("s3-sqs") 51 | .schema(schema) 52 | .option("sqsUrl", queueUrl) 53 | .option("fileFormat", fileFormat) 54 | .option("sqsFetchIntervalSeconds", "2") 55 | .option("sqsLongPollingWaitTimeSeconds", "5") 56 | .option("maxFilesPerTrigger", "50") 57 | .option("ignoreFileDeletion", "true") 58 | .load() 59 | 60 | val query = inputDf 61 | .writeStream 62 | .queryName(queryName) 63 | .format("memory") 64 | .option("checkpointLocation", checkpointDir) 65 | .start() 66 | 67 | query.awaitTermination() 68 | } 69 | } 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | com.qubole 22 | spark-sql-streaming-sqs_2.11 23 | 0.5.2-SNAPSHOT 24 | jar 25 | Spark SQL Streaming SQS 26 | Connector for faster reads from S3 using SQS 27 | http://github.com/qubole/s3-sqs-connector 28 | 29 | 30 | 31 | qubole 32 | Qubole Inc. 
33 | http://www.qubole.com 34 | 35 | developer 36 | 37 | 38 | 39 | 40 | 41 | 42 | Apache License, Version 2.0 43 | https://github.com/qubole/s3-sqs-connector/blob/master/LICENSE.txt 44 | repo 45 | 46 | 47 | 48 | 49 | scm:git:git://github.com/qubole/s3-sqs-connector.git 50 | https://github.com/qubole/s3-sqs-connector 51 | scm:git:git@github.com:qubole/s3-sqs-connector.git 52 | spark-sql-streaming-sqs_2.11-0.5.1 53 | 54 | 55 | 2020 56 | 57 | Qubole 58 | http://www.qubole.com/ 59 | 60 | 61 | 62 | spark-sql-streaming-sqs 63 | 2.4.0 64 | 2.11 65 | UTF-8 66 | 67 | 68 | 69 | 70 | org.apache.spark 71 | spark-sql_${scala.binary.version} 72 | ${spark.version} 73 | provided 74 | 75 | 76 | org.apache.spark 77 | spark-core_${scala.binary.version} 78 | ${spark.version} 79 | test-jar 80 | test 81 | 82 | 83 | org.apache.spark 84 | spark-sql_${scala.binary.version} 85 | ${spark.version} 86 | test-jar 87 | test 88 | 89 | 90 | org.apache.spark 91 | spark-catalyst_${scala.binary.version} 92 | ${spark.version} 93 | test-jar 94 | test 95 | 96 | 97 | com.amazonaws 98 | aws-java-sdk-sqs 99 | 1.11.271 100 | 101 | 102 | org.apache.spark 103 | spark-tags_${scala.binary.version} 104 | ${spark.version} 105 | 106 | 107 | 108 | 109 | 110 | 111 | net.alchim31.maven 112 | scala-maven-plugin 113 | 4.0.2 114 | 115 | 116 | compile 117 | 118 | compile 119 | add-source 120 | doc-jar 121 | 122 | compile 123 | 124 | 125 | 126 | 127 | org.apache.maven.plugins 128 | maven-shade-plugin 129 | 3.1.0 130 | 131 | 132 | package 133 | 134 | shade 135 | 136 | 137 | 138 | 139 | com.amazonaws:aws-java-sdk-sqs:* 140 | com.amazonaws:aws-java-sdk-core:* 141 | 142 | 143 | 144 | 145 | *:* 146 | 147 | META-INF/maven/** 148 | META-INF/MANIFEST.MF 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | net.alchim31.maven 161 | scala-maven-plugin 162 | 4.0.2 163 | 164 | 165 | org.apache.maven.plugins 166 | maven-shade-plugin 167 | 3.1.0 168 | 169 | 170 | target/scala-${scala.binary.version}/classes 171 | target/scala-${scala.binary.version}/test-classes 172 | 173 | 174 | 175 | 176 | release 177 | 178 | 179 | 180 | org.apache.maven.plugins 181 | maven-source-plugin 182 | 2.4 183 | 184 | 185 | create-sources-jar 186 | 187 | jar-no-fork 188 | 189 | 190 | 191 | 192 | 193 | org.apache.maven.plugins 194 | maven-gpg-plugin 195 | 1.5 196 | 197 | 198 | sign-artifacts 199 | verify 200 | 201 | sign 202 | 203 | 204 | 205 | 206 | 207 | org.apache.maven.plugins 208 | maven-javadoc-plugin 209 | 2.10.1 210 | 211 | 212 | create-javadoc-jar 213 | 214 | jar 215 | 216 | 217 | 218 | 219 | 220 | org.apache.maven.plugins 221 | maven-release-plugin 222 | 2.5.1 223 | 224 | true 225 | 226 | 227 | 228 | org.sonatype.plugins 229 | nexus-staging-maven-plugin 230 | 1.6.3 231 | true 232 | 233 | ossrh 234 | https://oss.sonatype.org/ 235 | true 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | ossrh 246 | https://oss.sonatype.org/content/repositories/snapshots 247 | 248 | 249 | ossrh 250 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 251 | 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- 1 | 17 | 39 | 40 | 41 | Scalastyle standard configuration 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | true 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 
| 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW 126 | 127 | 128 | 129 | 130 | 131 | ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | ^FunSuite[A-Za-z]*$ 141 | Tests must extend org.apache.spark.SparkFunSuite instead. 142 | 143 | 144 | 145 | 146 | ^println$ 147 | 151 | 152 | 153 | 154 | @VisibleForTesting 155 | 158 | 159 | 160 | 161 | Runtime\.getRuntime\.addShutdownHook 162 | 170 | 171 | 172 | 173 | mutable\.SynchronizedBuffer 174 | 182 | 183 | 184 | 185 | Class\.forName 186 | 193 | 194 | 195 | 196 | Await\.result 197 | 204 | 205 | 206 | 207 | Await\.ready 208 | 215 | 216 | 217 | 218 | 219 | JavaConversions 220 | Instead of importing implicits in scala.collection.JavaConversions._, import 221 | scala.collection.JavaConverters._ and use .asScala / .asJava methods 222 | 223 | 224 | 225 | org\.apache\.commons\.lang\. 226 | Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead 227 | of Commons Lang 2 (package org.apache.commons.lang.*) 228 | 229 | 230 | 231 | extractOpt 232 | Use Utils.jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter 233 | is slower. 234 | 235 | 236 | 237 | 238 | java,scala,3rdParty,spark 239 | javax?\..* 240 | scala\..* 241 | (?!org\.apache\.spark\.).* 242 | org\.apache\.spark\..* 243 | 244 | 245 | 246 | 247 | 248 | COMMA 249 | 250 | 251 | 252 | 253 | 254 | \)\{ 255 | 258 | 259 | 260 | 261 | (?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*] 262 | Use Javadoc style indentation for multiline comments 263 | 264 | 265 | 266 | case[^\n>]*=>\s*\{ 267 | Omit braces in case clauses. 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 800> 321 | 322 | 323 | 324 | 325 | 30 326 | 327 | 328 | 329 | 330 | 10 331 | 332 | 333 | 334 | 335 | 50 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | -1,0,1,2,3 347 | 348 | 349 | 350 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/sql/streaming/sqs/BasicAWSCredentialsProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs; 19 | 20 | import com.amazonaws.AmazonClientException; 21 | import com.amazonaws.auth.AWSCredentialsProvider; 22 | import com.amazonaws.auth.BasicAWSCredentials; 23 | import com.amazonaws.auth.AWSCredentials; 24 | import org.apache.commons.lang.StringUtils; 25 | 26 | public class BasicAWSCredentialsProvider implements AWSCredentialsProvider { 27 | private final String accessKey; 28 | private final String secretKey; 29 | 30 | public BasicAWSCredentialsProvider(String accessKey, String secretKey) { 31 | this.accessKey = accessKey; 32 | this.secretKey = secretKey; 33 | } 34 | 35 | public AWSCredentials getCredentials() { 36 | if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey)) { 37 | return new BasicAWSCredentials(accessKey, secretKey); 38 | } 39 | throw new AmazonClientException( 40 | "Access key or secret key is null"); 41 | } 42 | 43 | public void refresh() {} 44 | 45 | @Override 46 | public String toString() { 47 | return getClass().getSimpleName(); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/sql/streaming/sqs/InstanceProfileCredentialsProviderWithRetries.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs; 19 | 20 | 21 | import com.amazonaws.AmazonClientException; 22 | import com.amazonaws.auth.AWSCredentials; 23 | import com.amazonaws.auth.InstanceProfileCredentialsProvider; 24 | import org.apache.commons.logging.Log; 25 | import org.apache.commons.logging.LogFactory; 26 | 27 | public class InstanceProfileCredentialsProviderWithRetries 28 | extends InstanceProfileCredentialsProvider { 29 | 30 | private static final Log LOG = LogFactory.getLog( 31 | InstanceProfileCredentialsProviderWithRetries.class); 32 | 33 | public AWSCredentials getCredentials() { 34 | int retries = 10; 35 | int sleep = 500; 36 | while(retries > 0) { 37 | try { 38 | return super.getCredentials(); 39 | } 40 | catch (RuntimeException re) { 41 | LOG.error("Got an exception while fetching credentials " + re); 42 | --retries; 43 | try { 44 | Thread.sleep(sleep); 45 | } catch (InterruptedException ie) { 46 | // Do nothing 47 | } 48 | if (sleep < 10000) { 49 | sleep *= 2; 50 | } 51 | } 52 | catch (Error error) { 53 | LOG.error("Got an exception while fetching credentials " + error); 54 | --retries; 55 | try { 56 | Thread.sleep(sleep); 57 | } catch (InterruptedException ie) { 58 | // Do nothing 59 | } 60 | if (sleep < 10000) { 61 | sleep *= 2; 62 | } 63 | } 64 | } 65 | throw new AmazonClientException("Unable to load credentials."); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | org.apache.spark.sql.streaming.sqs.SqsSourceProvider -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | log4j.rootCategory=WARN, console 19 | 20 | # File appender 21 | log4j.appender.file=org.apache.log4j.FileAppender 22 | log4j.appender.file.append=false 23 | log4j.appender.file.file=target/unit-tests.log 24 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 25 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n 26 | 27 | # Console appender 28 | log4j.appender.console=org.apache.log4j.ConsoleAppender 29 | log4j.appender.console.target=System.out 30 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 31 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 32 | 33 | # Settings to quiet third party logs that are too verbose 34 | log4j.logger.org.sparkproject.jetty=WARN 35 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 36 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 37 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 38 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import java.text.SimpleDateFormat 21 | import java.util.TimeZone 22 | import java.util.concurrent.TimeUnit 23 | 24 | import scala.collection.JavaConverters._ 25 | 26 | import com.amazonaws.{AmazonClientException, AmazonServiceException, ClientConfiguration} 27 | import com.amazonaws.services.sqs.{AmazonSQS, AmazonSQSClientBuilder} 28 | import com.amazonaws.services.sqs.model.{DeleteMessageBatchRequestEntry, Message, ReceiveMessageRequest} 29 | import org.apache.hadoop.conf.Configuration 30 | import org.json4s.{DefaultFormats, MappingException} 31 | import org.json4s.JsonAST.JValue 32 | import org.json4s.jackson.JsonMethods.parse 33 | 34 | import org.apache.spark.SparkException 35 | import org.apache.spark.internal.Logging 36 | import org.apache.spark.util.ThreadUtils 37 | 38 | class SqsClient(sourceOptions: SqsSourceOptions, 39 | hadoopConf: Configuration) extends Logging { 40 | 41 | private val sqsFetchIntervalSeconds = sourceOptions.fetchIntervalSeconds 42 | private val sqsLongPollWaitTimeSeconds = sourceOptions.longPollWaitTimeSeconds 43 | private val sqsMaxRetries = sourceOptions.maxRetries 44 | private val maxConnections = sourceOptions.maxConnections 45 | private val ignoreFileDeletion = sourceOptions.ignoreFileDeletion 46 | private val region = sourceOptions.region 47 | val sqsUrl = sourceOptions.sqsUrl 48 | 49 | @volatile var exception: Option[Exception] = None 50 | 51 | private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 52 | timestampFormat.setTimeZone(TimeZone.getTimeZone("UTC")) 53 | private var retriesOnFailure = 0 54 | private val sqsClient = createSqsClient() 55 | 56 | val sqsScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("sqs-scheduler") 57 | 58 | val sqsFileCache = new SqsFileCache(sourceOptions.maxFileAgeMs, sourceOptions.fileNameOnly) 59 | 60 | val deleteMessageQueue = new java.util.concurrent.ConcurrentLinkedQueue[String]() 61 | 62 | private val sqsFetchMessagesThread = new Runnable { 63 | override def run(): Unit = { 64 | try { 65 | // Fetching messages from Amazon SQS 66 | val newMessages = sqsFetchMessages() 67 | 68 | // Filtering the new messages which are already not seen 69 | if (newMessages.nonEmpty) { 70 | newMessages.filter(message => sqsFileCache.isNewFile(message._1, message._2)) 71 | .foreach(message => 72 | sqsFileCache.add(message._1, MessageDescription(message._2, false, message._3))) 73 | } 74 | } catch { 75 | case e: Exception => 76 | exception = Some(e) 77 | } 78 | } 79 | } 80 | 81 | sqsScheduler.scheduleWithFixedDelay( 82 | sqsFetchMessagesThread, 83 | 0, 84 | sqsFetchIntervalSeconds, 85 | TimeUnit.SECONDS) 86 | 87 | private def sqsFetchMessages(): Seq[(String, Long, String)] = { 88 | val messageList = try { 89 | val receiveMessageRequest = new ReceiveMessageRequest() 90 | .withQueueUrl(sqsUrl) 91 | .withWaitTimeSeconds(sqsLongPollWaitTimeSeconds) 92 | val messages = sqsClient.receiveMessage(receiveMessageRequest).getMessages.asScala 93 | retriesOnFailure = 0 94 | logDebug(s"successfully received ${messages.size} messages") 95 | messages 96 | } catch { 97 | case ase: AmazonServiceException => 98 | val message = 99 | """ 100 | |Caught an AmazonServiceException, which means your request made it to Amazon SQS, 101 | | rejected with an error response for some reason. 
102 | """.stripMargin 103 | logWarning(message) 104 | logWarning(s"Error Message: ${ase.getMessage}") 105 | logWarning(s"HTTP Status Code: ${ase.getStatusCode}, AWS Error Code: ${ase.getErrorCode}") 106 | logWarning(s"Error Type: ${ase.getErrorType}, Request ID: ${ase.getRequestId}") 107 | evaluateRetries() 108 | List.empty 109 | case ace: AmazonClientException => 110 | val message = 111 | """ 112 | |Caught an AmazonClientException, which means, the client encountered a serious 113 | | internal problem while trying to communicate with Amazon SQS, such as not 114 | | being able to access the network. 115 | """.stripMargin 116 | logWarning(message) 117 | logWarning(s"Error Message: ${ace.getMessage()}") 118 | evaluateRetries() 119 | List.empty 120 | case e: Exception => 121 | val message = "Received unexpected error from SQS" 122 | logWarning(message) 123 | logWarning(s"Error Message: ${e.getMessage()}") 124 | evaluateRetries() 125 | List.empty 126 | } 127 | if (messageList.nonEmpty) { 128 | parseSqsMessages(messageList) 129 | } else { 130 | Seq.empty 131 | } 132 | } 133 | 134 | private def parseSqsMessages(messageList: Seq[Message]): Seq[(String, Long, String)] = { 135 | val errorMessages = scala.collection.mutable.ListBuffer[String]() 136 | val parsedMessages = messageList.foldLeft(Seq[(String, Long, String)]()) { (list, message) => 137 | implicit val formats = DefaultFormats 138 | try { 139 | val messageReceiptHandle = message.getReceiptHandle 140 | val messageJson = parse(message.getBody).extract[JValue] 141 | val bucketName = ( 142 | messageJson \ "Records" \ "s3" \ "bucket" \ "name").extract[Array[String]].head 143 | val eventName = (messageJson \ "Records" \ "eventName").extract[Array[String]].head 144 | if (eventName.contains("ObjectCreated")) { 145 | val timestamp = (messageJson \ "Records" \ "eventTime").extract[Array[String]].head 146 | val timestampMills = convertTimestampToMills(timestamp) 147 | val path = "s3://" + 148 | bucketName + "/" + 149 | (messageJson \ "Records" \ "s3" \ "object" \ "key").extract[Array[String]].head 150 | logDebug("Successfully parsed sqs message") 151 | list :+ ((path, timestampMills, messageReceiptHandle)) 152 | } else { 153 | if (eventName.contains("ObjectRemoved")) { 154 | if (!ignoreFileDeletion) { 155 | exception = Some(new SparkException("ObjectDelete message detected in SQS")) 156 | } else { 157 | logInfo("Ignoring file deletion message since ignoreFileDeletion is true") 158 | } 159 | } else { 160 | logWarning("Ignoring unexpected message detected in SQS") 161 | } 162 | errorMessages.append(messageReceiptHandle) 163 | list 164 | } 165 | } catch { 166 | case me: MappingException => 167 | errorMessages.append(message.getReceiptHandle) 168 | logWarning(s"Error in parsing SQS message ${me.getMessage}") 169 | list 170 | case e: Exception => 171 | errorMessages.append(message.getReceiptHandle) 172 | logWarning(s"Unexpected error while parsing SQS message ${e.getMessage}") 173 | list 174 | } 175 | } 176 | if (errorMessages.nonEmpty) { 177 | addToDeleteMessageQueue(errorMessages.toList) 178 | } 179 | parsedMessages 180 | } 181 | 182 | private def convertTimestampToMills(timestamp: String): Long = { 183 | val timeInMillis = timestampFormat.parse(timestamp).getTime() 184 | timeInMillis 185 | } 186 | 187 | private def evaluateRetries(): Unit = { 188 | retriesOnFailure += 1 189 | if (retriesOnFailure >= sqsMaxRetries) { 190 | logError("Max retries reached") 191 | exception = Some(new SparkException("Unable to receive Messages from SQS for " + 192 | 
s"${sqsMaxRetries} times Giving up. Check logs for details.")) 193 | } else { 194 | logWarning(s"Attempt ${retriesOnFailure}." + 195 | s"Will reattempt after ${sqsFetchIntervalSeconds} seconds") 196 | } 197 | } 198 | 199 | private def createSqsClient(): AmazonSQS = { 200 | try { 201 | val isClusterOnEc2Role = hadoopConf.getBoolean( 202 | "fs.s3.isClusterOnEc2Role", false) || hadoopConf.getBoolean( 203 | "fs.s3n.isClusterOnEc2Role", false) || sourceOptions.useInstanceProfileCredentials 204 | if (!isClusterOnEc2Role) { 205 | val accessKey = hadoopConf.getTrimmed("fs.s3n.awsAccessKeyId") 206 | val secretAccessKey = new String(hadoopConf.getPassword("fs.s3n.awsSecretAccessKey")).trim 207 | logInfo("Using credentials from keys provided") 208 | val basicAwsCredentialsProvider = new BasicAWSCredentialsProvider( 209 | accessKey, secretAccessKey) 210 | AmazonSQSClientBuilder 211 | .standard() 212 | .withClientConfiguration(new ClientConfiguration().withMaxConnections(maxConnections)) 213 | .withCredentials(basicAwsCredentialsProvider) 214 | .withRegion(region) 215 | .build() 216 | } else { 217 | logInfo("Using the credentials attached to the instance") 218 | val instanceProfileCredentialsProvider = new InstanceProfileCredentialsProviderWithRetries() 219 | AmazonSQSClientBuilder 220 | .standard() 221 | .withClientConfiguration(new ClientConfiguration().withMaxConnections(maxConnections)) 222 | .withCredentials(instanceProfileCredentialsProvider) 223 | .build() 224 | } 225 | } catch { 226 | case e: Exception => 227 | throw new SparkException(s"Error occured while creating Amazon SQS Client", e) 228 | } 229 | } 230 | 231 | def addToDeleteMessageQueue(messageReceiptHandles: List[String]): Unit = { 232 | deleteMessageQueue.addAll(messageReceiptHandles.asJava) 233 | } 234 | 235 | def deleteMessagesFromQueue(): Unit = { 236 | try { 237 | var count = -1 238 | val messageReceiptHandles = deleteMessageQueue.asScala.toList 239 | val messageGroups = messageReceiptHandles.sliding(10, 10).toList 240 | messageGroups.foreach { messageGroup => 241 | val requestEntries = messageGroup.foldLeft(List[DeleteMessageBatchRequestEntry]()) { 242 | (list, messageReceiptHandle) => 243 | count = count + 1 244 | list :+ new DeleteMessageBatchRequestEntry(count.toString, messageReceiptHandle) 245 | }.asJava 246 | val batchResult = sqsClient.deleteMessageBatch(sqsUrl, requestEntries) 247 | if (!batchResult.getFailed.isEmpty) { 248 | batchResult.getFailed.asScala.foreach { entry => 249 | sqsClient.deleteMessage( 250 | sqsUrl, requestEntries.get(entry.getId.toInt).getReceiptHandle) 251 | } 252 | } 253 | } 254 | } catch { 255 | case e: Exception => 256 | logWarning(s"Unable to delete message from SQS ${e.getMessage}") 257 | } 258 | deleteMessageQueue.clear() 259 | } 260 | 261 | def assertSqsIsWorking(): Unit = { 262 | if (exception.isDefined) { 263 | throw exception.get 264 | } 265 | } 266 | 267 | } 268 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsFileCache.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import java.net.URI 21 | import java.util.concurrent.ConcurrentHashMap 22 | 23 | import scala.collection.JavaConverters._ 24 | import scala.collection.mutable.ListBuffer 25 | 26 | import org.apache.hadoop.fs.Path 27 | 28 | import org.apache.spark.internal.Logging 29 | 30 | /** 31 | * A custom hash map used to track the list of files seen. This map is thread-safe. 32 | * To prevent the hash map from growing indefinitely, a purge function is available to 33 | * remove files "maxAgeMs" older than the latest file. 34 | */ 35 | 36 | class SqsFileCache(maxAgeMs: Long, fileNameOnly: Boolean) extends Logging { 37 | require(maxAgeMs >= 0) 38 | if (fileNameOnly) { 39 | logWarning("'fileNameOnly' is enabled. Make sure your file names are unique (e.g. using " + 40 | "UUID), otherwise, files with the same name but under different paths will be considered " + 41 | "the same and causes data lost.") 42 | } 43 | 44 | /** Mapping from file path to its message description. */ 45 | private val sqsMap = new ConcurrentHashMap[String, MessageDescription] 46 | 47 | /** Timestamp for the last purge operation. */ 48 | private var lastPurgeTimestamp: Long = 0L 49 | 50 | /** Timestamp of the latest file. */ 51 | private var latestTimestamp: Long = 0L 52 | 53 | @inline private def stripPathIfNecessary(path: String) = { 54 | if (fileNameOnly) new Path(new URI(path)).getName else path 55 | } 56 | 57 | /** 58 | * Returns true if we should consider this file a new file. The file is only considered "new" 59 | * if it is new enough that we are still tracking, and we have not seen it before. 60 | */ 61 | def isNewFile(path: String, timestamp: Long): Boolean = { 62 | timestamp >= lastPurgeTimestamp && !sqsMap.containsKey(stripPathIfNecessary(path)) 63 | } 64 | 65 | /** Add a new file to the map. */ 66 | def add(path: String, fileStatus: MessageDescription): Unit = { 67 | sqsMap.put(stripPathIfNecessary(path), fileStatus) 68 | if (fileStatus.timestamp > latestTimestamp) { 69 | latestTimestamp = fileStatus.timestamp 70 | } 71 | } 72 | 73 | /** 74 | * Returns all the new files found - ignore aged files and files that we have already seen. 75 | * Sorts the files by timestamp. 
76 | */ 77 | def getUncommittedFiles(maxFilesPerTrigger: Option[Int], 78 | shouldSortFiles: Boolean): Seq[(String, Long, String)] = { 79 | if (shouldSortFiles) { 80 | val uncommittedFiles = filterAllUncommittedFiles() 81 | val sortedFiles = reportTimeTaken("Sorting Files") { 82 | uncommittedFiles.sortWith(_._2 < _._2) 83 | } 84 | if (maxFilesPerTrigger.nonEmpty) sortedFiles.take(maxFilesPerTrigger.get) else sortedFiles 85 | } else { 86 | if (maxFilesPerTrigger.isEmpty) { 87 | filterAllUncommittedFiles() 88 | } else { 89 | filterTopUncommittedFiles(maxFilesPerTrigger.get) 90 | } 91 | } 92 | } 93 | private def filterTopUncommittedFiles(maxFilesPerTrigger: Int): List[(String, Long, String)] = { 94 | val iterator = sqsMap.asScala.iterator 95 | val uncommittedFiles = ListBuffer[(String, Long, String)]() 96 | while (uncommittedFiles.length < maxFilesPerTrigger && iterator.hasNext) { 97 | val file = iterator.next() 98 | if (file._2.isCommitted && file._2.timestamp >= lastPurgeTimestamp) { 99 | uncommittedFiles += ((file._1, file._2.timestamp, file._2.messageReceiptHandle)) 100 | } 101 | } 102 | uncommittedFiles.toList 103 | } 104 | 105 | private def reportTimeTaken[T](operation: String)(body: => T): T = { 106 | val startTime = System.currentTimeMillis() 107 | val result = body 108 | val endTime = System.currentTimeMillis() 109 | val timeTaken = math.max(endTime - startTime, 0) 110 | 111 | logDebug(s"$operation took $timeTaken ms") 112 | result 113 | } 114 | 115 | private def filterAllUncommittedFiles(): List[(String, Long, String)] = { 116 | sqsMap.asScala.foldLeft(List[(String, Long, String)]()) { 117 | (list, file) => 118 | if (!file._2.isCommitted && file._2.timestamp >= lastPurgeTimestamp) { 119 | list :+ ((file._1, file._2.timestamp, file._2.messageReceiptHandle)) 120 | } else { 121 | list 122 | } 123 | } 124 | } 125 | 126 | /** Removes aged entries and returns the number of files removed. */ 127 | def purge(): Int = { 128 | lastPurgeTimestamp = latestTimestamp - maxAgeMs 129 | var count = 0 130 | sqsMap.asScala.foreach { fileEntry => 131 | if (fileEntry._2.timestamp < lastPurgeTimestamp) { 132 | sqsMap.remove(fileEntry._1) 133 | count += 1 134 | } 135 | } 136 | count 137 | } 138 | 139 | /** Mark file entry as committed or already processed */ 140 | def markCommitted(path: String): Unit = { 141 | sqsMap.replace(path, MessageDescription( 142 | sqsMap.get(path).timestamp, true, sqsMap.get(path).messageReceiptHandle)) 143 | } 144 | 145 | def size: Int = sqsMap.size() 146 | 147 | } 148 | 149 | /** 150 | * A case class to store file metadata. Metadata includes file timestamp, file status - 151 | * committed or not committed and message reciept handle used for deleting message from 152 | * Amazon SQS 153 | */ 154 | case class MessageDescription(timestamp: Long, 155 | isCommitted: Boolean = false, 156 | messageReceiptHandle: String) 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import java.net.URI 21 | 22 | import org.apache.hadoop.fs.Path 23 | 24 | import org.apache.spark.internal.Logging 25 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 26 | import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation} 27 | import org.apache.spark.sql.execution.streaming._ 28 | import org.apache.spark.sql.execution.streaming.FileStreamSource._ 29 | import org.apache.spark.sql.types.StructType 30 | 31 | 32 | class SqsSource(sparkSession: SparkSession, 33 | metadataPath: String, 34 | options: Map[String, String], 35 | override val schema: StructType) extends Source with Logging { 36 | 37 | private val sourceOptions = new SqsSourceOptions(options) 38 | 39 | private val hadoopConf = sparkSession.sessionState.newHadoopConf() 40 | 41 | private val metadataLog = 42 | new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath) 43 | private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L) 44 | 45 | private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger 46 | 47 | private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs 48 | 49 | private val fileFormatClassName = sourceOptions.fileFormatClassName 50 | 51 | private val shouldSortFiles = sourceOptions.shouldSortFiles 52 | 53 | private val sqsClient = new SqsClient(sourceOptions, hadoopConf) 54 | 55 | metadataLog.allFiles().foreach { entry => 56 | sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, "")) 57 | } 58 | sqsClient.sqsFileCache.purge() 59 | 60 | logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs") 61 | 62 | /** 63 | * Returns the data that is between the offsets (`start`, `end`]. 64 | */ 65 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = { 66 | val startOffset = start.map(FileStreamSourceOffset(_).logOffset).getOrElse(-1L) 67 | val endOffset = FileStreamSourceOffset(end).logOffset 68 | 69 | assert(startOffset <= endOffset) 70 | val files = metadataLog.get(Some(startOffset + 1), Some(endOffset)).flatMap(_._2) 71 | logInfo(s"Processing ${files.length} files from ${startOffset + 1}:$endOffset") 72 | logTrace(s"Files are:\n\t" + files.mkString("\n\t")) 73 | val newDataSource = 74 | DataSource( 75 | sparkSession, 76 | paths = files.map(f => new Path(new URI(f.path)).toString), 77 | userSpecifiedSchema = Some(schema), 78 | className = fileFormatClassName, 79 | options = options) 80 | Dataset.ofRows(sparkSession, LogicalRelation(newDataSource.resolveRelation( 81 | checkFilesExist = false), isStreaming = true)) 82 | } 83 | 84 | private def fetchMaxOffset(): FileStreamSourceOffset = synchronized { 85 | 86 | sqsClient.assertSqsIsWorking() 87 | /** 88 | * All the new files found - ignore aged files and files that we have seen. 89 | * Obey user's setting to limit the number of files in this batch trigger. 
90 | */ 91 | val batchFiles = sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles) 92 | 93 | if (batchFiles.nonEmpty) { 94 | metadataLogCurrentOffset += 1 95 | metadataLog.add(metadataLogCurrentOffset, batchFiles.map { 96 | case (path, timestamp, receiptHandle) => 97 | FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset) 98 | }.toArray) 99 | logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files") 100 | val messageReceiptHandles = batchFiles.map { 101 | case (path, timestamp, receiptHandle) => 102 | sqsClient.sqsFileCache.markCommitted(path) 103 | logDebug(s"New file: $path") 104 | receiptHandle 105 | }.toList 106 | sqsClient.addToDeleteMessageQueue(messageReceiptHandles) 107 | } 108 | 109 | val numPurged = sqsClient.sqsFileCache.purge() 110 | 111 | if (!sqsClient.deleteMessageQueue.isEmpty) { 112 | sqsClient.deleteMessagesFromQueue() 113 | } 114 | 115 | logTrace( 116 | s""" 117 | |Number of files selected for batch = ${batchFiles.size} 118 | |Number of files purged from tracking map = $numPurged 119 | """.stripMargin) 120 | 121 | FileStreamSourceOffset(metadataLogCurrentOffset) 122 | } 123 | 124 | override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1) 125 | 126 | override def commit(end: Offset): Unit = { 127 | // No-op for now; SqsSource currently garbage-collects files based on timestamp 128 | // and the value of the maxFileAge parameter. 129 | } 130 | 131 | override def stop(): Unit = { 132 | if (!sqsClient.sqsScheduler.isTerminated) { 133 | sqsClient.sqsScheduler.shutdownNow() 134 | } 135 | } 136 | 137 | override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]" 138 | 139 | } 140 | 141 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import scala.util.Try 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 24 | import org.apache.spark.util.Utils 25 | 26 | /** 27 | * User specified options for sqs source. 
28 |  */
29 | class SqsSourceOptions(parameters: CaseInsensitiveMap[String]) extends Logging {
30 | 
31 |   def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
32 | 
33 |   val maxFilesPerTrigger: Option[Int] = parameters.get("maxFilesPerTrigger").map { str =>
34 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
35 |       throw new IllegalArgumentException(
36 |         s"Invalid value '$str' for option 'maxFilesPerTrigger', must be a positive integer")
37 |     }
38 |   }
39 | 
40 |   /**
41 |    * Maximum age of a file that can be picked up by this source, before it is ignored. For the
42 |    * first batch all files will be considered valid.
43 |    *
44 |    * The max age is specified with respect to the timestamp of the latest file, and not the
45 |    * timestamp of the current system. This means that if the last file has timestamp 1000, the
46 |    * current system time is 2000, and the max age is 200, the system will purge files older than
47 |    * 800 (rather than 1800) from the internal state.
48 |    *
49 |    * Defaults to a week.
50 |    */
51 |   val maxFileAgeMs: Long =
52 |     Utils.timeStringAsMs(parameters.getOrElse("maxFileAge", "7d"))
53 | 
54 |   val fetchIntervalSeconds: Int = parameters.get("sqsFetchIntervalSeconds").map { str =>
55 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
56 |       throw new IllegalArgumentException(
57 |         s"Invalid value '$str' for option 'sqsFetchIntervalSeconds', must be a positive integer")
58 |     }
59 |   }.getOrElse(10)
60 | 
61 |   val longPollWaitTimeSeconds: Int = parameters.get("sqsLongPollingWaitTimeSeconds").map { str =>
62 |     Try(str.toInt).toOption.filter(x => x >= 0 && x <= 20).getOrElse {
63 |       throw new IllegalArgumentException(
64 |         s"Invalid value '$str' for option 'sqsLongPollingWaitTimeSeconds'," +
65 |           "must be an integer between 0 and 20")
66 |     }
67 |   }.getOrElse(20)
68 | 
69 |   val maxRetries: Int = parameters.get("sqsMaxRetries").map { str =>
70 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
71 |       throw new IllegalArgumentException(
72 |         s"Invalid value '$str' for option 'sqsMaxRetries', must be a positive integer")
73 |     }
74 |   }.getOrElse(10)
75 | 
76 |   val maxConnections: Int = parameters.get("sqsMaxConnections").map { str =>
77 |     Try(str.toInt).toOption.filter(_ > 0).getOrElse {
78 |       throw new IllegalArgumentException(
79 |         s"Invalid value '$str' for option 'sqsMaxConnections', must be a positive integer")
80 |     }
81 |   }.getOrElse(1)
82 | 
83 |   val sqsUrl: String = parameters.get("sqsUrl").getOrElse {
84 |     throw new IllegalArgumentException("SQS Url is not specified")
85 |   }
86 | 
87 |   val region: String = parameters.get("region").getOrElse {
88 |     throw new IllegalArgumentException("Region is not specified")
89 |   }
90 | 
91 |   val fileFormatClassName: String = parameters.get("fileFormat").getOrElse {
92 |     throw new IllegalArgumentException("Specifying file format is mandatory with sqs source")
93 |   }
94 | 
95 |   val ignoreFileDeletion: Boolean = withBooleanParameter("ignoreFileDeletion", false)
96 | 
97 |   /**
98 |    * Whether to check new files based on only the filename instead of on the full path. 
99 | * 100 | * With this set to `true`, the following files would be considered as the same file, because 101 | * their filenames, "dataset.txt", are the same: 102 | * - "file:///dataset.txt" 103 | * - "s3://a/dataset.txt" 104 | * - "s3n://a/b/dataset.txt" 105 | * - "s3a://a/b/c/dataset.txt" 106 | */ 107 | val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false) 108 | 109 | val shouldSortFiles: Boolean = withBooleanParameter("shouldSortFiles", true) 110 | 111 | val useInstanceProfileCredentials: Boolean = withBooleanParameter( 112 | "useInstanceProfileCredentials", false) 113 | 114 | private def withBooleanParameter(name: String, default: Boolean) = { 115 | parameters.get(name).map { str => 116 | try { 117 | str.toBoolean 118 | } catch { 119 | case _: IllegalArgumentException => 120 | throw new IllegalArgumentException( 121 | s"Invalid value '$str' for option '$name', must be true or false") 122 | } 123 | }.getOrElse(default) 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/streaming/sqs/SqsSourceProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.streaming.sqs 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SQLContext 22 | import org.apache.spark.sql.execution.streaming.Source 23 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | class SqsSourceProvider extends DataSourceRegister 27 | with StreamSourceProvider 28 | with Logging { 29 | 30 | override def shortName(): String = "s3-sqs" 31 | 32 | override def sourceSchema(sqlContext: SQLContext, 33 | schema: Option[StructType], 34 | providerName: String, 35 | parameters: Map[String, String]): (String, StructType) = { 36 | 37 | require(schema.isDefined, "Sqs source doesn't support empty schema") 38 | (shortName(), schema.get) 39 | } 40 | 41 | override def createSource(sqlContext: SQLContext, 42 | metadataPath: String, 43 | schema: Option[StructType], 44 | providerName: String, 45 | parameters: Map[String, String]): Source = { 46 | 47 | new SqsSource( 48 | sqlContext.sparkSession, 49 | metadataPath, 50 | parameters, 51 | schema.get) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.sparkproject.jetty=WARN 28 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/streaming/sqs/SqsSourceOptionsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.streaming.sqs 18 | 19 | import java.util.Locale 20 | 21 | import org.apache.spark.sql.streaming.{StreamingQuery, StreamingQueryException, StreamTest} 22 | import org.apache.spark.sql.types.StructType 23 | 24 | class SqsSourceOptionsSuite extends StreamTest { 25 | 26 | test("bad source options") { 27 | def testBadOptions(option: (String, String))(expectedMsg: String): Unit = { 28 | 29 | var query : StreamingQuery = null 30 | 31 | try { 32 | val errorMessage = intercept[StreamingQueryException] { 33 | val dummySchema = new StructType 34 | val reader = spark 35 | .readStream 36 | .format("s3-sqs") 37 | .option("fileFormat", "json") 38 | .schema(dummySchema) 39 | .option("sqsUrl", "https://DUMMY_URL") 40 | .option("region", "us-east-1") 41 | .option(option._1, option._2) 42 | .load() 43 | 44 | query = reader.writeStream 45 | .format("memory") 46 | .queryName("badOptionsTest") 47 | .start() 48 | 49 | query.processAllAvailable() 50 | }.getMessage 51 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 52 | } finally { 53 | if (query != null) { 54 | // terminating streaming query if necessary 55 | query.stop() 56 | } 57 | 58 | } 59 | } 60 | 61 | testBadOptions("sqsFetchIntervalSeconds" -> "-2")("Invalid value '-2' " + 62 | "for option 'sqsFetchIntervalSeconds', must be a positive integer") 63 | testBadOptions("sqsLongPollingWaitTimeSeconds" -> "-5")("Invalid value '-5' " + 64 | "for option 'sqsLongPollingWaitTimeSeconds',must be an integer between 0 and 20") 65 | testBadOptions("sqsMaxConnections" -> "-2")("Invalid value '-2' " + 66 | "for option 'sqsMaxConnections', must be a positive integer") 67 | testBadOptions("maxFilesPerTrigger" -> "-50")("Invalid value '-50' " + 68 | "for option 'maxFilesPerTrigger', must be a positive integer") 69 | testBadOptions("ignoreFileDeletion" -> "x")("Invalid value 'x' " + 70 | "for option 'ignoreFileDeletion', must be true or false") 71 | testBadOptions("fileNameOnly" -> "x")("Invalid value 'x' " + 72 | "for option 'fileNameOnly', must be true or false") 73 | testBadOptions("shouldSortFiles" -> "x")("Invalid value 'x' " + 74 | "for option 'shouldSortFiles', must be true or false") 75 | testBadOptions("useInstanceProfileCredentials" -> "x")("Invalid value 'x' " + 76 | "for option 'useInstanceProfileCredentials', must be true or false") 77 | 78 | } 79 | 80 | test("missing mandatory options") { 81 | 82 | def testMissingMandatoryOptions(options: List[(String, String)])(expectedMsg: String): Unit = { 83 | 84 | var query: StreamingQuery = null 85 | 86 | try { 87 | val errorMessage = intercept[StreamingQueryException] { 88 | val dummySchema = new StructType 89 | val reader = spark 90 | .readStream 91 | .format("s3-sqs") 92 | .schema(dummySchema) 93 | 94 | val readerWithOptions = options.map { option => 95 | reader.option(option._1, option._2) 96 | }.last.load() 97 | 98 | query = readerWithOptions.writeStream 99 | .format("memory") 100 | .queryName("missingMandatoryOptions") 101 | .start() 102 | 103 | query.processAllAvailable() 104 | }.getMessage 105 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 106 | } finally { 107 | if (query != null) { 108 | // terminating streaming query if necessary 109 | query.stop() 110 | } 111 | } 112 | } 113 | 114 | // No fileFormat specified 115 | testMissingMandatoryOptions(List("sqsUrl" -> "https://DUMMY_URL", "region" -> "us-east-1"))( 116 | "Specifying file format is mandatory with sqs source") 117 | 118 | 
// Sqs URL not specified 119 | testMissingMandatoryOptions(List("fileFormat" -> "json", "region" -> "us-east-1"))( 120 | "SQS Url is not specified") 121 | } 122 | 123 | test("schema not specified") { 124 | 125 | var query: StreamingQuery = null 126 | 127 | val expectedMsg = "Sqs source doesn't support empty schema" 128 | 129 | try { 130 | val errorMessage = intercept[IllegalArgumentException] { 131 | val reader = spark 132 | .readStream 133 | .format("s3-sqs") 134 | .option("sqsUrl", "https://DUMMY_URL") 135 | .option("fileFormat", "json") 136 | .option("region", "us-east-1") 137 | .load() 138 | 139 | query = reader.writeStream 140 | .format("memory") 141 | .queryName("missingSchema") 142 | .start() 143 | 144 | query.processAllAvailable() 145 | }.getMessage 146 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 147 | } finally { 148 | if (query != null) { 149 | // terminating streaming query if necessary 150 | query.stop() 151 | } 152 | } 153 | 154 | } 155 | 156 | } 157 | 158 | --------------------------------------------------------------------------------
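
For orientation, the following is a minimal sketch of how the "s3-sqs" source registered by SqsSourceProvider might be wired into a streaming query. Only the short name and the option names come from the sources above; the schema, queue URL, and paths are hypothetical placeholders, and the sink choice is illustrative rather than prescribed by this repository.

// Minimal usage sketch. Assumptions: the connector jar is on the classpath; the
// queue URL, schema, and filesystem paths below are placeholders.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object SqsSourceUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SqsSourceUsageSketch").getOrCreate()

    // Schema of the JSON files announced on the queue (hypothetical).
    val schema = StructType(Seq(
      StructField("id", StringType),
      StructField("payload", StringType)))

    val input = spark
      .readStream
      .format("s3-sqs")                          // short name registered by SqsSourceProvider
      .schema(schema)                            // mandatory: the source rejects an empty schema
      .option("fileFormat", "json")              // mandatory
      .option("sqsUrl", "https://sqs.us-east-1.amazonaws.com/123456789012/queue") // placeholder
      .option("region", "us-east-1")             // mandatory
      .option("maxFilesPerTrigger", "100")       // optional: cap files per micro-batch
      .option("sqsFetchIntervalSeconds", "10")   // optional: SQS poll interval
      .option("maxFileAge", "7d")                // optional: purge horizon for tracked files
      .load()

    val query = input.writeStream
      .format("parquet")
      .option("checkpointLocation", "/tmp/sqs-example/checkpoint") // placeholder
      .start("/tmp/sqs-example/output")                            // placeholder

    query.awaitTermination()
  }
}

Invalid option values (for example a negative sqsFetchIntervalSeconds) surface as a StreamingQueryException once the query starts, which is the behaviour exercised by SqsSourceOptionsSuite above.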
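The option defaults can also be confirmed programmatically. A small sketch, assuming SqsSourceOptions is constructed directly with an option map that supplies only the three mandatory options (the queue URL below is a placeholder); the asserted values mirror the defaults defined in SqsSourceOptions:

// Sketch: checking SqsSourceOptions defaults (assumes the connector classes are on the classpath).
import org.apache.spark.sql.streaming.sqs.SqsSourceOptions

object SqsSourceOptionsDefaults {
  def main(args: Array[String]): Unit = {
    val opts = new SqsSourceOptions(Map(
      "sqsUrl" -> "https://sqs.us-east-1.amazonaws.com/123456789012/queue", // placeholder
      "region" -> "us-east-1",
      "fileFormat" -> "json"))

    assert(opts.fetchIntervalSeconds == 10)     // sqsFetchIntervalSeconds default
    assert(opts.longPollWaitTimeSeconds == 20)  // sqsLongPollingWaitTimeSeconds default
    assert(opts.maxRetries == 10)               // sqsMaxRetries default
    assert(opts.maxConnections == 1)            // sqsMaxConnections default
    assert(opts.maxFileAgeMs == 7L * 24 * 60 * 60 * 1000) // maxFileAge default of "7d"
    assert(!opts.ignoreFileDeletion && !opts.fileNameOnly && opts.shouldSortFiles)
  }
}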