├── .gitchangelog.rc ├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE ├── README.md ├── dev ├── checkstyle-suppressions.xml └── checkstyle.xml ├── pom.xml ├── scalastyle-config.xml └── src ├── main ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── kinesis │ │ └── AWSInstanceProfileCredentialsProviderWithRetries.java ├── resources │ └── META-INF │ │ └── services │ │ └── org.apache.spark.sql.sources.DataSourceRegister └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── kinesis │ ├── CachedKinesisProducer.scala │ ├── HDFSMetadataCommitter.scala │ ├── KinesisPosition.scala │ ├── KinesisReader.scala │ ├── KinesisSink.scala │ ├── KinesisSource.scala │ ├── KinesisSourceOffset.scala │ ├── KinesisSourceProvider.scala │ ├── KinesisSourceRDD.scala │ ├── KinesisWriteTask.scala │ ├── KinesisWriter.scala │ ├── MetadataCommitter.scala │ ├── ShardSyncer.scala │ ├── SparkAWSCredentials.scala │ └── package-info.java └── test ├── resources └── log4j.properties └── scala └── org └── apache └── spark └── sql └── kinesis ├── HDFSMetaDataCommiterSuite.scala ├── KinesisPositionSuite.scala ├── KinesisReaderSuite.scala ├── KinesisSinkSuite.scala ├── KinesisSourceOffsetSuite.scala ├── KinesisSourceSuite.scala ├── KinesisTestUtils.scala └── ShardSyncerSuite.scala /.gitchangelog.rc: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8; mode: python -*- 2 | ## 3 | ## Format 4 | ## 5 | ## ACTION: [AUDIENCE:] COMMIT_MSG [!TAG ...] 6 | ## 7 | ## Description 8 | ## 9 | ## ACTION is one of 'chg', 'fix', 'new' 10 | ## 11 | ## Is WHAT the change is about. 12 | ## 13 | ## 'chg' is for refactor, small improvement, cosmetic changes... 14 | ## 'fix' is for bug fixes 15 | ## 'new' is for new features, big improvement 16 | ## 17 | ## AUDIENCE is optional and one of 'dev', 'usr', 'pkg', 'test', 'doc' 18 | ## 19 | ## Is WHO is concerned by the change. 20 | ## 21 | ## 'dev' is for developpers (API changes, refactors...) 22 | ## 'usr' is for final users (UI changes) 23 | ## 'pkg' is for packagers (packaging changes) 24 | ## 'test' is for testers (test only related changes) 25 | ## 'doc' is for doc guys (doc only changes) 26 | ## 27 | ## COMMIT_MSG is ... well ... the commit message itself. 28 | ## 29 | ## TAGs are additionnal adjective as 'refactor' 'minor' 'cosmetic' 30 | ## 31 | ## They are preceded with a '!' or a '@' (prefer the former, as the 32 | ## latter is wrongly interpreted in github.) Commonly used tags are: 33 | ## 34 | ## 'refactor' is obviously for refactoring code only 35 | ## 'minor' is for a very meaningless change (a typo, adding a comment) 36 | ## 'cosmetic' is for cosmetic driven change (re-indentation, 80-col...) 37 | ## 'wip' is for partial functionality but complete subfunctionality. 38 | ## 39 | ## Example: 40 | ## 41 | ## new: usr: support of bazaar implemented 42 | ## chg: re-indentend some lines !cosmetic 43 | ## new: dev: updated code to be compatible with last version of killer lib. 44 | ## fix: pkg: updated year of licence coverage. 45 | ## new: test: added a bunch of test around user usability of feature X. 46 | ## fix: typo in spelling my name in comment. !minor 47 | ## 48 | ## Please note that multi-line commit message are supported, and only the 49 | ## first line will be considered as the "summary" of the commit message. So 50 | ## tags, and other rules only applies to the summary. The body of the commit 51 | ## message will be displayed in the changelog without reformatting. 
52 | 53 | 54 | ## 55 | ## ``ignore_regexps`` is a line of regexps 56 | ## 57 | ## Any commit having its full commit message matching any regexp listed here 58 | ## will be ignored and won't be reported in the changelog. 59 | ## 60 | ignore_regexps = [ 61 | r'@minor', r'!minor', 62 | r'@cosmetic', r'!cosmetic', 63 | r'@refactor', r'!refactor', 64 | r'@wip', r'!wip', 65 | r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[p|P]kg:', 66 | r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[d|D]ev:', 67 | r'^(.{3,3}\s*:)?\s*[fF]irst commit.?\s*$', 68 | r'^$', ## ignore commits with empty messages 69 | ] 70 | 71 | 72 | ## ``section_regexps`` is a list of 2-tuples associating a string label and a 73 | ## list of regexp 74 | ## 75 | ## Commit messages will be classified in sections thanks to this. Section 76 | ## titles are the label, and a commit is classified under this section if any 77 | ## of the regexps associated is matching. 78 | ## 79 | ## Please note that ``section_regexps`` will only classify commits and won't 80 | ## make any changes to the contents. So you'll probably want to go check 81 | ## ``subject_process`` (or ``body_process``) to do some changes to the subject, 82 | ## whenever you are tweaking this variable. 83 | ## 84 | section_regexps = [ 85 | ('New', [ 86 | r'^[nN]ew\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', 87 | ]), 88 | ('Changes', [ 89 | r'^[cC]hg\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', 90 | ]), 91 | ('Fix', [ 92 | r'^[fF]ix\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', 93 | ]), 94 | 95 | ('Other', None ## Match all lines 96 | ), 97 | 98 | ] 99 | 100 | 101 | ## ``body_process`` is a callable 102 | ## 103 | ## This callable will be given the original body and result will 104 | ## be used in the changelog. 105 | ## 106 | ## Available constructs are: 107 | ## 108 | ## - any python callable that take one txt argument and return txt argument. 109 | ## 110 | ## - ReSub(pattern, replacement): will apply regexp substitution. 111 | ## 112 | ## - Indent(chars=" "): will indent the text with the prefix 113 | ## Please remember that template engines gets also to modify the text and 114 | ## will usually indent themselves the text if needed. 115 | ## 116 | ## - Wrap(regexp=r"\n\n"): re-wrap text in separate paragraph to fill 80-Columns 117 | ## 118 | ## - noop: do nothing 119 | ## 120 | ## - ucfirst: ensure the first letter is uppercase. 121 | ## (usually used in the ``subject_process`` pipeline) 122 | ## 123 | ## - final_dot: ensure text finishes with a dot 124 | ## (usually used in the ``subject_process`` pipeline) 125 | ## 126 | ## - strip: remove any spaces before or after the content of the string 127 | ## 128 | ## - SetIfEmpty(msg="No commit message."): will set the text to 129 | ## whatever given ``msg`` if the current text is empty. 130 | ## 131 | ## Additionally, you can `pipe` the provided filters, for instance: 132 | #body_process = Wrap(regexp=r'\n(?=\w+\s*:)') | Indent(chars=" ") 133 | #body_process = Wrap(regexp=r'\n(?=\w+\s*:)') 134 | #body_process = noop 135 | body_process = ReSub(r'((^|\n)[A-Z]\w+(-\w+)*: .*(\n\s+.*)*)+$', r'') | strip 136 | 137 | 138 | ## ``subject_process`` is a callable 139 | ## 140 | ## This callable will be given the original subject and result will 141 | ## be used in the changelog. 142 | ## 143 | ## Available constructs are those listed in ``body_process`` doc. 
144 | subject_process = (strip | 145 | ReSub(r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n@]*)(@[a-z]+\s+)*$', r'\4') | 146 | SetIfEmpty("No commit message.") | ucfirst | final_dot) 147 | 148 | 149 | ## ``tag_filter_regexp`` is a regexp 150 | ## 151 | ## Tags that will be used for the changelog must match this regexp. 152 | ## 153 | tag_filter_regexp = r'^rubix-root-[0-9]+\.[0-9]+(\.[0-9]+)?$' 154 | 155 | 156 | ## ``unreleased_version_label`` is a string or a callable that outputs a string 157 | ## 158 | ## This label will be used as the changelog Title of the last set of changes 159 | ## between last valid tag and HEAD if any. 160 | unreleased_version_label = "(unreleased)" 161 | 162 | 163 | ## ``output_engine`` is a callable 164 | ## 165 | ## This will change the output format of the generated changelog file 166 | ## 167 | ## Available choices are: 168 | ## 169 | ## - rest_py 170 | ## 171 | ## Legacy pure python engine, outputs ReSTructured text. 172 | ## This is the default. 173 | ## 174 | ## - mustache() 175 | ## 176 | ## Template name could be any of the available templates in 177 | ## ``templates/mustache/*.tpl``. 178 | ## Requires python package ``pystache``. 179 | ## Examples: 180 | ## - mustache("markdown") 181 | ## - mustache("restructuredtext") 182 | ## 183 | ## - makotemplate() 184 | ## 185 | ## Template name could be any of the available templates in 186 | ## ``templates/mako/*.tpl``. 187 | ## Requires python package ``mako``. 188 | ## Examples: 189 | ## - makotemplate("restructuredtext") 190 | ## 191 | output_engine = rest_py 192 | #output_engine = mustache("restructuredtext") 193 | #output_engine = mustache("markdown") 194 | #output_engine = makotemplate("restructuredtext") 195 | 196 | 197 | ## ``include_merge`` is a boolean 198 | ## 199 | ## This option tells git-log whether to include merge commits in the log. 200 | ## The default is to include them. 201 | include_merge = True 202 | 203 | 204 | ## ``log_encoding`` is a string identifier 205 | ## 206 | ## This option tells gitchangelog what encoding is outputed by ``git log``. 207 | ## The default is to be clever about it: it checks ``git config`` for 208 | ## ``i18n.logOutputEncoding``, and if not found will default to git's own 209 | ## default: ``utf-8``. 210 | #log_encoding = 'utf-8' 211 | 212 | 213 | ## ``publish`` is a callable 214 | ## 215 | ## Sets what ``gitchangelog`` should do with the output generated by 216 | ## the output engine. ``publish`` is a callable taking one argument 217 | ## that is an interator on lines from the output engine. 218 | ## 219 | ## Some helper callable are provided: 220 | ## 221 | ## Available choices are: 222 | ## 223 | ## - stdout 224 | ## 225 | ## Outputs directly to standard output 226 | ## (This is the default) 227 | ## 228 | ## - FileInsertAtFirstRegexMatch(file, pattern, idx=lamda m: m.start()) 229 | ## 230 | ## Creates a callable that will parse given file for the given 231 | ## regex pattern and will insert the output in the file. 232 | ## ``idx`` is a callable that receive the matching object and 233 | ## must return a integer index point where to insert the 234 | ## the output in the file. Default is to return the position of 235 | ## the start of the matched string. 236 | ## 237 | ## - FileRegexSubst(file, pattern, replace, flags) 238 | ## 239 | ## Apply a replace inplace in the given file. Your regex pattern must 240 | ## take care of everything and might be more complex. Check the README 241 | ## for a complete copy-pastable example. 
242 | ## 243 | # publish = FileInsertIntoFirstRegexMatch( 244 | # "CHANGELOG.rst", 245 | # r'/(?P[0-9]+\.[0-9]+(\.[0-9]+)?)\s+\([0-9]+-[0-9]{2}-[0-9]{2}\)\n--+\n/', 246 | # idx=lambda m: m.start(1) 247 | # ) 248 | #publish = stdout 249 | 250 | 251 | ## ``revs`` is a list of callable or a list of string 252 | ## 253 | ## callable will be called to resolve as strings and allow dynamical 254 | ## computation of these. The result will be used as revisions for 255 | ## gitchangelog (as if directly stated on the command line). This allows 256 | ## to filter exaclty which commits will be read by gitchangelog. 257 | ## 258 | ## To get a full documentation on the format of these strings, please 259 | ## refer to the ``git rev-list`` arguments. There are many examples. 260 | ## 261 | ## Using callables is especially useful, for instance, if you 262 | ## are using gitchangelog to generate incrementally your changelog. 263 | ## 264 | ## Some helpers are provided, you can use them:: 265 | ## 266 | ## - FileFirstRegexMatch(file, pattern): will return a callable that will 267 | ## return the first string match for the given pattern in the given file. 268 | ## If you use named sub-patterns in your regex pattern, it'll output only 269 | ## the string matching the regex pattern named "rev". 270 | ## 271 | ## - Caret(rev): will return the rev prefixed by a "^", which is a 272 | ## way to remove the given revision and all its ancestor. 273 | ## 274 | ## Please note that if you provide a rev-list on the command line, it'll 275 | ## replace this value (which will then be ignored). 276 | ## 277 | ## If empty, then ``gitchangelog`` will act as it had to generate a full 278 | ## changelog. 279 | ## 280 | ## The default is to use all commits to make the changelog. 281 | #revs = ["^1.0.3", ] 282 | #revs = [ 283 | # Caret( 284 | # FileFirstRegexMatch( 285 | # "CHANGELOG.rst", 286 | # r"(?P[0-9]+\.[0-9]+(\.[0-9]+)?)\s+\([0-9]+-[0-9]{2}-[0-9]{2}\)\n--+\n")), 287 | # "HEAD" 288 | #] 289 | revs = [] 290 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.swm 4 | *.swn 5 | *.swk 6 | *.swl 7 | *.swo 8 | *.swp 9 | *.ipr 10 | *.iml 11 | *.ipr 12 | *.iws 13 | *.pyc 14 | *.pyo 15 | *.swp 16 | *~ 17 | .DS_Store 18 | .cache 19 | .classpath 20 | .ensime 21 | .ensime_cache/ 22 | .ensime_lucene 23 | .generated-mima* 24 | .idea/ 25 | .idea_modules/ 26 | .project 27 | .pydevproject 28 | .scala_dependencies 29 | .settings 30 | /lib/ 31 | R-unit-tests.log 32 | R/unit-tests.out 33 | R/cran-check.out 34 | R/pkg/vignettes/sparkr-vignettes.html 35 | build/*.jar 36 | build/apache-maven* 37 | build/scala* 38 | build/zinc* 39 | cache 40 | checkpoint 41 | conf/*.cmd 42 | conf/*.conf 43 | conf/*.properties 44 | conf/*.sh 45 | conf/*.xml 46 | conf/java-opts 47 | conf/slaves 48 | dependency-reduced-pom.xml 49 | derby.log 50 | dev/create-release/*final 51 | dev/create-release/*txt 52 | dev/pr-deps/ 53 | dist/ 54 | docs/_site 55 | docs/api 56 | lib_managed/ 57 | lint-r-report.log 58 | log/ 59 | logs/ 60 | out/ 61 | project/boot/ 62 | project/build/target/ 63 | project/plugins/lib_managed/ 64 | project/plugins/project/build.properties 65 | project/plugins/src_managed/ 66 | project/plugins/target/ 67 | python/lib/pyspark.zip 68 | python/deps 69 | python/pyspark/python 70 | reports/ 71 | scalastyle-on-compile.generated.xml 72 | scalastyle-output.xml 73 | scalastyle.txt 74 | spark-*-bin-*.tgz 75 | 
spark-tests.log 76 | src_managed/ 77 | streaming-tests.log 78 | target/ 79 | unit-tests.log 80 | work/ 81 | 82 | # For Hive 83 | TempStatsStore/ 84 | metastore/ 85 | metastore_db/ 86 | sql/hive-thriftserver/test_warehouses 87 | warehouse/ 88 | spark-warehouse/ 89 | 90 | # For R session data 91 | .RData 92 | .RHistory 93 | .Rhistory 94 | *.Rproj 95 | *.Rproj.* 96 | 97 | .Rproj.user 98 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | language: java 19 | 20 | jdk: 21 | - openjdk8 22 | - oraclejdk8 23 | 24 | dist: trusty 25 | 26 | script: 27 | - mvn -DskipTests clean install 28 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | spark-sql-kinesis_2.13-1.2.1_spark-3.2 5 | ---------------------------- 6 | - Update README.md to indicate that this repo is no longer maintained, and include a link to the new repo. 7 | 8 | kinesis_2.12-1.2.0_spark-3.0 9 | ---------------------------- 10 | - Support for Spark 3.0 and scala 2.12 (#92) [Vikram Agrawal, Vikram 11 | Agrawal] 12 | - Switch to use Kinesis list-shards API (#89) [Chad Lagore] 13 | - Fix issue with slow kinesis sink. [Vikram Agrawal] 14 | - Remove lastReadSequenceNumber.isEmpty condition (#90) [Chad Lagore] 15 | - Fixing latency in kinesis sink (#81) [abhishekd0907] 16 | 17 | kinesis_2.11-1.1.4-spark_2.4 18 | ---------------------------- 19 | - Fix retries while making Kinesis API calls (#79) [Vikram Agrawal] 20 | - Adding ability to set exact Kinesis position to start reading from 21 | (#78) [Stanislav Norochevskyi] 22 | 23 | * Adding ability read initial Kinesis position from checkpoint JSON representation. 24 | 25 | * Adding a unit test for InitialKinesisPosition parser 26 | - Apply retries to kinesis exceptions with status code >= 500 (#76) 27 | [Arne Huang] 28 | - Make DefaultAWSCredentialsProviderChain available to choose (#71) 29 | [Hyeonseop Lee] 30 | 31 | * Add option to use instance profile credentials 32 | 33 | * Update docs 34 | 35 | * Add same option to producer 36 | - Added session based authentication (#72) [guoming-xu] 37 | - Add source and target java version to overwrite defaults; add nobootcp 38 | option to scala compiler (#68) [Julian Keppel] 39 | - Update the artifact of the new release. [Vikram Agrawal] 40 | - [maven-release-plugin] prepare for next development iteration. [Vikram 41 | Agrawal] 42 | - Merge branch 'master' of github.com:qubole/kinesis-sql. 
[Vikram 43 | Agrawal] 44 | - Update README.md. [Vikram Agrawal] 45 | - Update README.md. [Vikram Agrawal] 46 | - Update README.md. [Vikram Agrawal] 47 | 48 | kinesis_2.11-1.1.3-spark_2.4 49 | -------------------------------- 50 | - [maven-release-plugin] prepare release spark-sql- 51 | kinesis_2.11-1.1.3-spark_2.4. [Vikram Agrawal] 52 | - GitChangelog RC file and changelog. [Vikram Agrawal] 53 | - Add travis build config. [Vikram Agrawal] 54 | - Create fat jar with dependencies shaded in the jar. [Vikram Agrawal] 55 | - Handle deleted shards (#59) [Vikram Agrawal] 56 | - [maven-release-plugin] prepare for next development iteration. [Vikram 57 | Agrawal] 58 | 59 | kinesis_2.11-1.1.2-spark_2.4 60 | ----------------------------- 61 | - [maven-release-plugin] prepare release spark-sql- 62 | kinesis_2.11-1.1.2-spark_2.4. [Vikram Agrawal] 63 | - [maven-release-plugin] prepare for next development iteration. [Vikram 64 | Agrawal] 65 | - POM changes. [Vikram Agrawal] 66 | - Update pom.xml. [Vikram Agrawal] 67 | - [maven-release-plugin] prepare for next development iteration. [Vikram 68 | Agrawal] 69 | - [maven-release-plugin] prepare release spark-sql- 70 | kinesis_2.11-1.1.0-spark_2.4. [Vikram Agrawal] 71 | - Update Pom file for mvn release. [Vikram Agrawal] 72 | - Fix Deserialisation/Serialisation of KinesisOffsets (#62) [Vikram 73 | Agrawal] 74 | 75 | * Fix incorrect metadata for batchId in the kinesis offsets 76 | 77 | * Fix serialization and deserialization fo the kinesis offsets 78 | - Fix issue with Reprocessing oldshards issue (#63) [Vikram Agrawal] 79 | - 2.4.0 (#56) [Vikram Agrawal] 80 | 81 | * Fix broken test suite and scalastyle error 82 | 83 | * Fix issues with over-ridden Kinesis Source Options (#36) 84 | 85 | * Fix issues with over-ridden Kinesis Source Options 86 | 87 | * Update ReadMe 88 | 89 | * Fix Review Comments 90 | 91 | * Fix ambiguity with scala 2.12 (#37) 92 | 93 | * Fix stylecheck errors 94 | 95 | * Add retries to Handle EC issues in HDFS metadata committer (#41) 96 | 97 | * Handle EC issues in HDFS MetaCommitter 98 | 99 | * Avoid timestamp as Offset and avoid empty batches when there is no new data (#49) 100 | - Changing protobuf version and fixing styling errors (#53) 101 | [abhishekd0907] 102 | - 2.4.0 (#45) [nhampiholi] 103 | 104 | * Fix broken test suite and scalastyle error 105 | 106 | * Fix issues with over-ridden Kinesis Source Options (#36) 107 | 108 | * Fix issues with over-ridden Kinesis Source Options 109 | 110 | * Update ReadMe 111 | 112 | * Fix Review Comments 113 | 114 | * Fix ambiguity with scala 2.12 (#37) 115 | 116 | * Fix stylecheck errors 117 | 118 | * Add retries to Handle EC issues in HDFS metadata committer (#41) 119 | 120 | * Handle EC issues in HDFS MetaCommitter 121 | 122 | * Avoid timestamp as Offset and avoid empty batches when there is no new data (#49) 123 | - Avoid timestamp as Offset and avoid empty batches when there is no new 124 | data (#49) [Vikram Agrawal] 125 | - Add retries to Handle EC issues in HDFS metadata committer (#41) 126 | [Vikram Agrawal] 127 | 128 | * Handle EC issues in HDFS MetaCommitter 129 | - Fix stylecheck errors. [Vikram Agrawal] 130 | - Fix ambiguity with scala 2.12 (#37) [Marcin Szymański] 131 | - Fix issues with over-ridden Kinesis Source Options (#36) [Vikram 132 | Agrawal] 133 | 134 | * Fix issues with over-ridden Kinesis Source Options 135 | 136 | * Update ReadMe 137 | 138 | * Fix Review Comments 139 | - Fix broken test suite and scalastyle error. 
[anthony.may] 140 | - Use latest DSv2 APIs for continuous processing. [Vikram Agrawal] 141 | - Handle greater than 100 shard streams. [Vikram Agrawal] 142 | - Fix for handling greater than 100 shard streams. [Siddhartha Jain] 143 | - Update README.md. [Vikram Agrawal] 144 | - Add AWSInstanceProfileCredentialsProviderWithRetries to handle issues 145 | with instance profile provider. [Vikram Agrawal] 146 | - Bump up spark version. [Vikram Agrawal] 147 | - Support of Kinesis Connector for Continuous streaming (#15) [Vikram 148 | Agrawal] 149 | - Fixed aws sdk and kpl versions (#18) [Georgios] 150 | - Bump up Kinesis Client version. [Vikram Agrawal] 151 | - Support of AWS roles / instance profile for Authentication (#13) 152 | [Vikram Agrawal] 153 | 154 | * Support of AWS roles / instance profile for Authentication 155 | 156 | * InstanceProfileCredentials should be case object instead of case class 157 | 158 | * Fix unit tests 159 | 160 | * clean up 161 | - Update README. [Vikram Agrawal] 162 | - Merge branch '2.3.0' [Vikram Agrawal] 163 | - Fixes to support build against SPARK 2.3.0. [Vikram Agrawal] 164 | - Merge pull request #8 from ggeorgiadis/master. [Vikram Agrawal] 165 | 166 | Fixed issue with closed child process caused by the cache timeout 167 | - Cleanup. [Georgios Georgiadis] 168 | - Fixed bug in parameters validation and cache timeout. [Georgios 169 | Georgiadis] 170 | - Merge pull request #5 from ggeorgiadis/master. [Vikram Agrawal] 171 | 172 | Added Kinesis Sink support 173 | - Removed duplicate awsSecretKey option. [Georgios] 174 | - Remove kinesis sink from roadmap. [Georgios Georgiadis] 175 | - Enable record aggregation by default. [Georgios Georgiadis] 176 | - Added kinesis.executor.aggregationEnabled and 177 | kinesis.producer.cache.timeout parameters Also we flush the producer 178 | before we destroy. [Georgios Georgiadis] 179 | - Updated configuration naming. [Georgios Georgiadis] 180 | - Updated Readme. [Georgios Georgiadis] 181 | - Improved validation and removed region param. [Georgios Georgiadis] 182 | - Added tests and cleaned up. [Georgios Georgiadis] 183 | - Clean up and use specific aws creds for sink. [Georgios Georgiadis] 184 | - Added Kinesis Sink support. [Georgios Georgiadis] 185 | - Merge branch 'master' of github.com:qubole/kinesis-sql. [Vikram 186 | Agrawal] 187 | - Update README.md. [mayankahuja] 188 | - Update README.md. [Vikram Agrawal] 189 | - Shade Jackson Dataformat Jar. [Vikram Agrawal] 190 | - Update README file and remove filter for ShardEnd in getOffset. 191 | [Vikram Agrawal] 192 | - Update Gitignore file. [Vikram Agrawal] 193 | - Support for Kinesis Source in Spark Structured Streaming. [Vikram 194 | Agrawal] 195 | - Initial commit. [Rajat Venkatesh] 196 | 197 | 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/qubole/kinesis-sql.svg?branch=master)](https://travis-ci.org/qubole/kinesis-sql) 2 | 3 | ## NOTE: This project is NO LONGER MAINTAINED. 4 | 5 | [Ron Cremer](https://github.com/roncemer) has volunteered to maintain this project. Beginning with Spark 3.2, the new project is located here: https://github.com/roncemer/spark-sql-kinesis 6 | 7 | 8 | # Kinesis Connector for Structured Streaming 9 | 10 | Implementation of a Kinesis Source Provider in Spark Structured Streaming. [SPARK-18165](https://issues.apache.org/jira/browse/SPARK-18165) describes the need for such an implementation. More details on the implementation can be found in this [blog post](https://www.qubole.com/blog/kinesis-connector-for-structured-streaming/). 11 | 12 | ## Downloading and Using the Connector 13 | 14 | The connector is available from the Maven Central repository. Add it with the --packages option or the spark.jars.packages configuration property, using one of the following connector artifacts: 15 | 16 | Spark 3.0: com.qubole.spark/spark-sql-kinesis_2.12/1.2.0-spark_3.0 17 | Spark 2.4: com.qubole.spark/spark-sql-kinesis_2.11/1.2.0-spark_2.4 18 | 19 | ## Developer Setup 20 | Check out the kinesis-sql branch that matches your Spark version. Use the master branch for the latest Spark version. 21 | 22 | ###### Spark version 3.0.x 23 | git clone git@github.com:qubole/kinesis-sql.git 24 | git checkout master 25 | cd kinesis-sql 26 | mvn install -DskipTests 27 | 28 | This creates the *target/spark-sql-kinesis_2.12-\*.jar* file, which contains the connector code and its dependency jars. 29 | 30 | 31 | ## How to use it 32 | 33 | #### Setup Kinesis 34 | Refer to the [Amazon Docs](https://docs.aws.amazon.com/cli/latest/reference/kinesis/create-stream.html) for more options. 35 | 36 | ###### Create Kinesis Stream 37 | 38 | $ aws kinesis create-stream --stream-name test --shard-count 2 39 | 40 | ###### Add Records to the stream 41 | 42 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Kinesis' 43 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Connector' 44 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'for' 45 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Apache' 46 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Spark' 47 | 48 | #### Example Streaming Job 49 | 50 | In the examples below, $SPARK_HOME refers to the Spark installation directory.
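If you would rather not build the jar yourself, the connector can also be pulled from Maven Central at launch time with --packages, using the colon-separated form of the coordinates listed above (shown here for the Spark 3.0 artifact):

    $SPARK_HOME/bin/spark-shell --packages com.qubole.spark:spark-sql-kinesis_2.12:1.2.0-spark_3.0

A spark-shell session started either way can run the snippets below unchanged.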
51 | 52 | ###### Open Spark-Shell 53 | 54 | $SPARK_HOME/bin/spark-shell --jars target/spark-sql-kinesis_2.12-*.jar 55 | 56 | ###### Subscribe to Kinesis Source 57 | // Subscribe to the "test" stream 58 | scala> :paste 59 | val kinesis = spark 60 | .readStream 61 | .format("kinesis") 62 | .option("streamName", "test") 63 | .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com") 64 | .option("awsAccessKeyId", [ACCESS_KEY]) 65 | .option("awsSecretKey", [SECRET_KEY]) 66 | .option("startingPosition", "TRIM_HORIZON") 67 | .load 68 | 69 | ###### Check Schema 70 | scala> kinesis.printSchema 71 | root 72 | |-- data: binary (nullable = true) 73 | |-- streamName: string (nullable = true) 74 | |-- partitionKey: string (nullable = true) 75 | |-- sequenceNumber: string (nullable = true) 76 | |-- approximateArrivalTimestamp: timestamp (nullable = true) 77 | 78 | ###### Word Count 79 | // Cast data into string and group by data column 80 | scala> :paste 81 | 82 | kinesis 83 | .selectExpr("CAST(data AS STRING)").as[(String)] 84 | .groupBy("data").count() 85 | .writeStream 86 | .format("console") 87 | .outputMode("complete") 88 | .start() 89 | .awaitTermination() 90 | 91 | ###### Output in Console 92 | 93 | 94 | +------------+-----+ 95 | | data|count| 96 | +------------+-----+ 97 | | for| 1| 98 | | Apache| 1| 99 | | Spark| 1| 100 | | Kinesis| 1| 101 | | Connector| 1| 102 | +------------+-----+ 103 | 104 | ###### Using the Kinesis Sink 105 | // Generate a random partition key, cast data into string, and group by data column 106 | scala> :paste 107 | kinesis 108 | .selectExpr("CAST(rand() AS STRING) as partitionKey","CAST(data AS STRING)").as[(String,String)] 109 | .groupBy("data").count() 110 | .writeStream 111 | .format("kinesis") 112 | .outputMode("update") 113 | .option("streamName", "spark-sink-example") 114 | .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com") 115 | .option("awsAccessKeyId", [ACCESS_KEY]) 116 | .option("awsSecretKey", [SECRET_KEY]) 117 | .start() 118 | .awaitTermination() 119 | 120 | ## Kinesis Source Configuration 121 | 122 | | Option-Name | Default-Value | Description | 123 | | ------------- |:-------------:| -----:| 124 | | streamName | - | Name of the stream in Kinesis to read from | 125 | | endpointUrl | https://kinesis.us-east-1.amazonaws.com | Endpoint URL of the Kinesis stream | 126 | | awsAccessKeyId | - | AWS Credentials for Kinesis describe, read record operations | 127 | | awsSecretKey | - | AWS Credentials for Kinesis describe, read record operations | 128 | | awsSTSRoleARN | - | AWS STS Role ARN for Kinesis describe, read record operations | 129 | | awsSTSSessionName | - | AWS STS Session name for Kinesis describe, read record operations | 130 | | awsUseInstanceProfile | true | Use instance profile credentials if no other credentials are provided | 131 | | startingPosition | LATEST | Starting position in Kinesis to fetch data from.
Possible values are "latest", "trim_horizon", "earliest" (alias for trim_horizon), or a JSON-serialized map of shardId->KinesisPosition | 132 | | failondataloss | true | Fail the streaming job if any active shard is missing or has expired | 133 | | kinesis.executor.maxFetchTimeInMs | 1000 | Maximum time spent in the executor fetching records from Kinesis, per shard | 134 | | kinesis.executor.maxFetchRecordsPerShard | 100000 | Maximum number of records to fetch per shard | 135 | | kinesis.executor.maxRecordPerRead | 10000 | Maximum number of records to fetch per getRecords API call | 136 | | kinesis.executor.addIdleTimeBetweenReads | false | Add a delay between two consecutive getRecords API calls | 137 | | kinesis.executor.idleTimeBetweenReadsInMs | 1000 | Minimum delay between two consecutive getRecords calls | 138 | | kinesis.client.describeShardInterval | 1s (1 second) | Minimum interval between two ListShards API calls to consider resharding | 139 | | kinesis.client.numRetries | 3 | Maximum number of retries for Kinesis API requests | 140 | | kinesis.client.retryIntervalMs | 1000 | Cool-off period before retrying a Kinesis API call | 141 | | kinesis.client.maxRetryIntervalMs | 10000 | Maximum cool-off period between two retries | 142 | | kinesis.client.avoidEmptyBatches | false | Avoid creating an empty microbatch by checking upfront whether there is any unread data in the stream before the batch is started | 143 | 144 | ## Kinesis Sink Configuration 145 | | Option-Name | Default-Value | Description | 146 | | ------------- |:-------------:| -----:| 147 | | streamName | - | Name of the stream in Kinesis to write to | 148 | | endpointUrl | https://kinesis.us-east-1.amazonaws.com | The AWS endpoint URL of the Kinesis stream | 149 | | awsAccessKeyId | - | AWS Credentials for Kinesis describe, put record operations | 150 | | awsSecretKey | - | AWS Credentials for Kinesis describe, put record operations | 151 | | awsSTSRoleARN | - | AWS STS Role ARN for Kinesis describe, put record operations | 152 | | awsSTSSessionName | - | AWS STS Session name for Kinesis describe, put record operations | 153 | | awsUseInstanceProfile | true | Use instance profile credentials if no other credentials are provided | 154 | | kinesis.executor.recordMaxBufferedTime | 1000 (millis) | Maximum buffered time of a record | 155 | | kinesis.executor.maxConnections | 1 | Maximum number of connections to Kinesis | 156 | | kinesis.executor.aggregationEnabled | true | Whether records should be aggregated before sending them to Kinesis | 157 | | kniesis.executor.flushwaittimemillis | 100 | Wait time while flushing records to Kinesis on task end | 158 | 159 | ## Roadmap 160 | * Migrate to the DataSource V2 APIs for MicroBatchExecution. 161 | * Maintain per-micro-batch shard commit state in DynamoDB 162 | 163 | ## Acknowledgement 164 | 165 | This connector would not have been possible without the reference implementation of the [Kafka connector](https://github.com/apache/spark/tree/branch-2.2/external/kafka-0-10-sql) for Structured Streaming, the [Kinesis Connector](https://github.com/apache/spark/tree/branch-2.2/external/kinesis-asl) for legacy streaming, and the [Kinesis Client Library](https://github.com/awslabs/amazon-kinesis-client). The structure of some parts of the code is influenced by the excellent work done by various Apache Spark contributors.
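As a complement to the spark-shell snippets and the option tables above, the following is a minimal sketch of a standalone job built on the connector. It is not taken from this repository: the stream name, STS role ARN, session name, and checkpoint path are placeholders to replace with your own values, and it assumes the connector jar or package is on the application classpath as described in the Downloading section.

    import org.apache.spark.sql.SparkSession

    object KinesisToConsole {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("kinesis-to-console")
          .getOrCreate()

        // Read from a hypothetical "my-stream" stream using an assumed STS role.
        // Option names come from the Kinesis Source Configuration table above.
        val records = spark.readStream
          .format("kinesis")
          .option("streamName", "my-stream")
          .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com")
          .option("awsSTSRoleARN", "arn:aws:iam::123456789012:role/kinesis-reader")
          .option("awsSTSSessionName", "spark-kinesis-session")
          .option("startingPosition", "TRIM_HORIZON")
          .option("kinesis.executor.maxFetchRecordsPerShard", "50000")
          .load()

        // Decode the binary payload and print it to the console.
        val query = records
          .selectExpr("CAST(data AS STRING) AS payload", "approximateArrivalTimestamp")
          .writeStream
          .format("console")
          .option("checkpointLocation", "/tmp/kinesis-to-console-checkpoint")
          .outputMode("append")
          .start()

        query.awaitTermination()
      }
    }

Packaged as an application jar, this could be launched with something like spark-submit --packages com.qubole.spark:spark-sql-kinesis_2.12:1.2.0-spark_3.0 --class KinesisToConsole your-app.jar, where the class and jar names are again placeholders.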
166 | -------------------------------------------------------------------------------- /dev/checkstyle-suppressions.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 21 | 22 | 29 | 30 | 31 | 33 | 35 | 37 | 39 | 41 | 43 | 45 | 47 | 49 | 50 | -------------------------------------------------------------------------------- /dev/checkstyle.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 21 | 22 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 123 | 124 | 125 | 126 | 128 | 129 | 130 | 131 | 133 | 134 | 135 | 137 | 139 | 141 | 143 | 144 | 145 | 155 | 156 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | com.qubole.spark 22 | spark-sql-kinesis_2.12 23 | 1.2.1_spark-3.0-SNAPSHOT 24 | jar 25 | Kinesis Integration for Structured Streaming 26 | Connector to read from and write into Kinesis from Structured Streaming Applications 27 | http://github.com/qubole/kinesis-sql 28 | 29 | 30 | 31 | 32 | qubole 33 | Qubole Inc. 34 | http://www.qubole.com 35 | 36 | developer 37 | 38 | 39 | 40 | 41 | 42 | 43 | Apache License, Version 2.0 44 | https://github.com/qubole/kinesis-sql/blob/master/LICENSE.txt 45 | repo 46 | 47 | 48 | 49 | 50 | scm:git:git://github.com/qubole/kinesis-sql.git 51 | http://github.com/qubole/kinesis-sql 52 | scm:git:git@github.com:qubole/kinesis-sql.git 53 | spark-sql-kinesis_2.12-1.2.0-spark_3.0 54 | 55 | 56 | 2018 57 | 58 | Qubole 59 | http://www.qubole.com/ 60 | 61 | 62 | 63 | sql-kinesis 64 | 3.0.1 65 | 2.12 66 | 2.10.0 67 | UTF-8 68 | UTF-8 69 | 70 | 71 | 72 | 73 | org.apache.spark 74 | spark-sql_${scala.binary.version} 75 | ${spark.version} 76 | provided 77 | 78 | 79 | org.apache.spark 80 | spark-core_${scala.binary.version} 81 | ${spark.version} 82 | test-jar 83 | test 84 | 85 | 86 | org.apache.spark 87 | spark-catalyst_${scala.binary.version} 88 | ${spark.version} 89 | test-jar 90 | test 91 | 92 | 93 | org.apache.spark 94 | spark-sql_${scala.binary.version} 95 | ${spark.version} 96 | test-jar 97 | test 98 | 99 | 100 | com.amazonaws 101 | amazon-kinesis-client 102 | 1.9.0 103 | 104 | 105 | com.amazonaws 106 | aws-java-sdk-core 107 | 1.11.655 108 | 109 | 110 | com.amazonaws 111 | aws-java-sdk-sts 112 | 1.11.271 113 | 114 | 115 | com.amazonaws 116 | amazon-kinesis-producer 117 | 0.12.8 118 | 119 | 120 | com.google.protobuf 121 | protobuf-java 122 | 3.16.1 123 | 128 | 129 | 130 | com.fasterxml.jackson.dataformat 131 | jackson-dataformat-cbor 132 | ${fasterxml.jackson.version} 133 | 134 | 135 | org.mockito 136 | mockito-core 137 | 3.1.0 138 | test 139 | 140 | 141 | org.scalacheck 142 | scalacheck_${scala.binary.version} 143 | 1.14.2 144 | test 145 | 146 | 147 | org.scalatest 148 | scalatest_${scala.binary.version} 149 | 3.0.8 150 | test 151 | 152 | 153 | org.apache.spark 154 | spark-tags_${scala.binary.version} 155 | ${spark.version} 156 | 157 | 158 | 162 | 163 | 
org.apache.spark 164 | spark-tags_${scala.binary.version} 165 | test-jar 166 | test 167 | ${spark.version} 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | org.scalatest 176 | scalatest-maven-plugin 177 | 2.0.0 178 | 179 | 180 | test 181 | 182 | test 183 | 184 | 185 | 186 | 187 | 188 | net.alchim31.maven 189 | scala-maven-plugin 190 | 4.3.0 191 | 192 | 193 | compile 194 | 195 | compile 196 | add-source 197 | doc-jar 198 | 199 | compile 200 | 201 | 202 | test-compile 203 | 204 | testCompile 205 | 206 | test-compile 207 | 208 | 209 | process-resources 210 | 211 | compile 212 | 213 | 214 | 215 | 216 | 217 | -nobootcp 218 | 219 | 220 | 221 | 222 | org.apache.maven.plugins 223 | maven-compiler-plugin 224 | 3.8.1 225 | 226 | 227 | compile 228 | 229 | compile 230 | 231 | 232 | 233 | 234 | 1.8 235 | 1.8 236 | 237 | 238 | 239 | org.apache.maven.plugins 240 | maven-shade-plugin 241 | 3.2.1 242 | 243 | 244 | package 245 | 246 | shade 247 | 248 | 249 | 250 | 251 | com.amazonaws:amazon-kinesis-client:* 252 | com.amazonaws:amazon-kinesis-producer:* 253 | com.amazonaws:aws-java-sdk-kinesis:* 254 | com.amazonaws:aws-java-sdk-dynamodb:* 255 | com.amazonaws:aws-java-sdk-core:* 256 | com.amazonaws:aws-java-sdk-sts:* 257 | com.fasterxml.jackson.dataformat:*:* 258 | com.google.protobuf:*:* 259 | 260 | 261 | 262 | 263 | com.fasterxml.jackson.dataformat 264 | org.apache.spark.sql.kinesis.shaded.fasterxml.jackson.dataformat 265 | 266 | com.fasterxml.jackson.dataformat.** 267 | 268 | 269 | 270 | com.amazonaws 271 | org.apache.spark.sql.kinesis.shaded.amazonaws 272 | 273 | com.amazonaws.** 274 | 275 | 276 | 277 | com.google.protobuf 278 | org.apache.spark.sql.kinesis.shaded.google.protobuf 279 | 280 | com.google.protobuf.** 281 | 282 | 283 | 284 | 285 | 286 | *:* 287 | 288 | META-INF/LICENSE* 289 | META-INF/NOTICE* 290 | META-INF/DEPENDENCIES 291 | META-INF/maven/** 292 | META-INF/*.xml 293 | META-INF/*.SF 294 | META-INF/*.DSA 295 | META-INF/*.RSA 296 | models/* 297 | .gitkeep 298 | 299 | 300 | 301 | 302 | 303 | 304 | log4j.properties 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | net.alchim31.maven 319 | scala-maven-plugin 320 | 4.3.0 321 | 322 | 323 | org.apache.maven.plugins 324 | maven-shade-plugin 325 | 3.2.1 326 | 327 | 328 | org.scalatest 329 | scalatest-maven-plugin 330 | 2.0.0 331 | 332 | 333 | 334 | target/scala-${scala.binary.version}/classes 335 | target/scala-${scala.binary.version}/test-classes 336 | 337 | 338 | 339 | 340 | release 341 | 342 | 343 | 344 | org.apache.maven.plugins 345 | maven-source-plugin 346 | 3.1.0 347 | 348 | 349 | create-sources-jar 350 | 351 | jar-no-fork 352 | 353 | 354 | 355 | 356 | 357 | org.apache.maven.plugins 358 | maven-gpg-plugin 359 | 1.5 360 | 361 | 362 | sign-artifacts 363 | verify 364 | 365 | sign 366 | 367 | 368 | 369 | 370 | 371 | org.apache.maven.plugins 372 | maven-javadoc-plugin 373 | 2.10.1 374 | 375 | 376 | create-javadoc-jar 377 | 378 | jar 379 | 380 | 381 | 382 | 383 | 384 | org.apache.maven.plugins 385 | maven-release-plugin 386 | 2.5.1 387 | 388 | true 389 | 390 | 391 | 392 | 393 | org.sonatype.plugins 394 | nexus-staging-maven-plugin 395 | 1.6.3 396 | true 397 | 398 | ossrh 399 | https://oss.sonatype.org/ 400 | true 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | ossrh 411 | https://oss.sonatype.org/content/repositories/snapshots 412 | 413 | 414 | ossrh 415 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 416 | 417 | 418 | 419 | 420 | 421 | 422 | org.codehaus.mojo 423 | cobertura-maven-plugin 
424 | 2.7 425 | 426 | 427 | 428 | 429 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/sql/kinesis/AWSInstanceProfileCredentialsProviderWithRetries.java: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.kinesis; 2 | 3 | import com.amazonaws.AmazonClientException; 4 | import com.amazonaws.auth.AWSCredentials; 5 | import com.amazonaws.auth.InstanceProfileCredentialsProvider; 6 | import org.apache.commons.logging.Log; 7 | import org.apache.commons.logging.LogFactory; 8 | 9 | public class AWSInstanceProfileCredentialsProviderWithRetries 10 | extends InstanceProfileCredentialsProvider { 11 | 12 | private static final Log LOG = 13 | LogFactory.getLog(AWSInstanceProfileCredentialsProviderWithRetries.class); 14 | 15 | public AWSCredentials getCredentials() { 16 | int retries = 10; 17 | int sleep = 500; 18 | while(retries > 0) { 19 | try { 20 | return super.getCredentials(); 21 | } 22 | catch (RuntimeException re) { 23 | LOG.error("Got an exception while fetching credentials " + re); 24 | --retries; 25 | try { 26 | Thread.sleep(sleep); 27 | } catch (InterruptedException ie) { 28 | // Do Nothing here 29 | } 30 | if (sleep < 10000) { 31 | sleep *= 2; 32 | } 33 | } 34 | catch (Error error) { 35 | LOG.error("Got an exception while fetching credentials " + error); 36 | --retries; 37 | try { 38 | Thread.sleep(sleep); 39 | } catch (InterruptedException ie) { 40 | // Do Nothing here 41 | } 42 | if (sleep < 10000) { 43 | sleep *= 2; 44 | } 45 | } 46 | } 47 | throw new AmazonClientException("Unable to load credentials."); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | org.apache.spark.sql.kinesis.KinesisSourceProvider -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/CachedKinesisProducer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import java.util.Locale 20 | import java.util.concurrent.{ExecutionException, TimeUnit} 21 | 22 | import scala.collection.JavaConverters._ 23 | import scala.util.control.NonFatal 24 | 25 | import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials} 26 | import com.amazonaws.regions.RegionUtils 27 | import com.amazonaws.services.kinesis.AmazonKinesis 28 | import com.amazonaws.services.kinesis.producer.{KinesisProducer, KinesisProducerConfiguration} 29 | import com.google.common.cache._ 30 | import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} 31 | 32 | import org.apache.spark.SparkEnv 33 | import org.apache.spark.internal.Logging 34 | 35 | private[kinesis] object CachedKinesisProducer extends Logging { 36 | 37 | private type Producer = KinesisProducer 38 | 39 | private lazy val cacheExpireTimeout: Long = 40 | SparkEnv.get.conf.getTimeAsMs("spark.kinesis.producer.cache.timeout", "10m") 41 | 42 | private val cacheLoader = new CacheLoader[Seq[(String, Object)], Producer] { 43 | override def load(config: Seq[(String, Object)]): Producer = { 44 | val configMap = config.map(x => x._1 -> x._2.toString).toMap 45 | createKinesisProducer(configMap) 46 | } 47 | } 48 | 49 | private val removalListener = new RemovalListener[Seq[(String, Object)], Producer]() { 50 | override def onRemoval(notification: 51 | RemovalNotification[Seq[(String, Object)], Producer]): Unit = { 52 | val paramsSeq: Seq[(String, Object)] = notification.getKey 53 | val producer: Producer = notification.getValue 54 | logDebug( 55 | s"Evicting kinesis producer $producer params: $paramsSeq," + 56 | s" due to ${notification.getCause}") 57 | close(paramsSeq, producer) 58 | } 59 | } 60 | 61 | private lazy val guavaCache: LoadingCache[Seq[(String, Object)], Producer] = 62 | CacheBuilder.newBuilder().expireAfterAccess(cacheExpireTimeout, TimeUnit.MILLISECONDS) 63 | .removalListener(removalListener) 64 | .build[Seq[(String, Object)], Producer](cacheLoader) 65 | 66 | private def createKinesisProducer(producerConfiguration: Map[String, String]): Producer = { 67 | val kinesisParams = producerConfiguration.keySet 68 | .filter(_.toLowerCase(Locale.ROOT).startsWith("kinesis.")) 69 | .map { k => k.drop(8).toString -> producerConfiguration(k) } 70 | .toMap 71 | 72 | val recordMaxBufferedTime = kinesisParams.getOrElse( 73 | KinesisSourceProvider.SINK_RECORD_MAX_BUFFERED_TIME, 74 | KinesisSourceProvider.DEFAULT_SINK_RECORD_MAX_BUFFERED_TIME) 75 | .toLong 76 | 77 | val maxConnections = kinesisParams.getOrElse( 78 | KinesisSourceProvider.SINK_MAX_CONNECTIONS, 79 | KinesisSourceProvider.DEFAULT_SINK_MAX_CONNECTIONS) 80 | .toInt 81 | 82 | val awsAccessKeyId = producerConfiguration.getOrElse( 83 | KinesisSourceProvider.AWS_ACCESS_KEY_ID, "").toString 84 | 85 | val awsSecretKey = producerConfiguration.getOrElse( 86 | KinesisSourceProvider.AWS_SECRET_KEY, "").toString 87 | 88 | var sessionToken = producerConfiguration.getOrElse( 89 | KinesisSourceProvider.AWS_SESSION_TOKEN, "").toString 90 | 91 | val awsStsRoleArn = producerConfiguration.getOrElse( 92 | KinesisSourceProvider.AWS_STS_ROLE_ARN, "").toString 93 | 94 | val awsStsSessionName = producerConfiguration.getOrElse( 95 | KinesisSourceProvider.AWS_STS_SESSION_NAME, "").toString 96 | 97 | val awsUseInstanceProfile = producerConfiguration.getOrElse( 98 | KinesisSourceProvider.AWS_USE_INSTANCE_PROFILE, "true").toBoolean 99 | 100 | val endpoint = producerConfiguration.getOrElse( 101 | 
KinesisSourceProvider.SINK_ENDPOINT_URL, KinesisSourceProvider.DEFAULT_KINESIS_ENDPOINT_URL) 102 | .toString 103 | 104 | val aggregation = producerConfiguration.getOrElse( 105 | KinesisSourceProvider.SINK_AGGREGATION_ENABLED, 106 | KinesisSourceProvider.DEFAULT_SINK_AGGREGATION) 107 | .toBoolean 108 | 109 | val region = getRegionNameByEndpoint(endpoint) 110 | 111 | val kinesisCredsProvider = if (awsAccessKeyId.length > 0) { 112 | if(sessionToken.length > 0) { 113 | BasicAWSSessionCredentials(awsAccessKeyId, awsSecretKey, sessionToken) 114 | } else { 115 | BasicCredentials(awsAccessKeyId, awsSecretKey) 116 | } 117 | } else if (awsStsRoleArn.length > 0) { 118 | STSCredentials(awsStsRoleArn, awsStsSessionName) 119 | } else if (awsUseInstanceProfile) { 120 | InstanceProfileCredentials 121 | } else { 122 | DefaultCredentials 123 | } 124 | 125 | val kinesisProducer = new Producer(new KinesisProducerConfiguration() 126 | .setRecordMaxBufferedTime(recordMaxBufferedTime) 127 | .setMaxConnections(maxConnections) 128 | .setAggregationEnabled(aggregation) 129 | .setCredentialsProvider( 130 | kinesisCredsProvider.provider 131 | ) 132 | .setRegion(region) 133 | ) 134 | logDebug(s"Created a new instance of KinesisProducer for $producerConfiguration.") 135 | kinesisProducer 136 | } 137 | 138 | private[kinesis] def getOrCreate(kinesisParams: Map[String, String]): Producer = { 139 | val paramsSeq: Seq[(String, Object)] = paramsToSeq(kinesisParams) 140 | try { 141 | guavaCache.get(paramsSeq) 142 | } catch { 143 | case e@(_: ExecutionException | _: UncheckedExecutionException | _: ExecutionError) 144 | if e.getCause != null => 145 | throw e.getCause 146 | } 147 | } 148 | 149 | private def paramsToSeq(kinesisParams: Map[String, String]): Seq[(String, Object)] = { 150 | val paramsSeq: Seq[(String, Object)] = kinesisParams.toSeq.sortBy(x => x._1) 151 | paramsSeq 152 | } 153 | 154 | /** For explicitly closing kinesis producer */ 155 | private[kinesis] def close(kinesisParams: Map[String, String]): Unit = { 156 | val paramsSeq = paramsToSeq(kinesisParams) 157 | guavaCache.invalidate(paramsSeq) 158 | } 159 | 160 | /** Auto close on cache evict */ 161 | private def close(paramsSeq: Seq[(String, Object)], producer: Producer): Unit = { 162 | try { 163 | logInfo(s"Closing the KinesisProducer with params: ${paramsSeq.mkString("\n")}.") 164 | producer.flushSync() 165 | producer.destroy() 166 | } catch { 167 | case NonFatal(e) => logWarning("Error while closing kinesis producer.", e) 168 | } 169 | } 170 | 171 | private def clear(): Unit = { 172 | logInfo("Cleaning up guava cache.") 173 | guavaCache.invalidateAll() 174 | } 175 | 176 | def getRegionNameByEndpoint(endpoint: String): String = { 177 | val uri = new java.net.URI(endpoint) 178 | RegionUtils.getRegionsForService(AmazonKinesis.ENDPOINT_PREFIX) 179 | .asScala 180 | .find(_.getAvailableEndpoints.asScala.toSeq.contains(uri.getHost)) 181 | .map(_.getName) 182 | .getOrElse( 183 | throw new IllegalArgumentException(s"Could not resolve region for endpoint: $endpoint")) 184 | } 185 | 186 | } 187 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/HDFSMetadataCommitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
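CachedKinesisProducer above keys its Guava cache on the sink options sorted by name, lazily builds a KinesisProducer on first use, and flushes and destroys it when an entry is evicted or explicitly invalidated. A rough, package-internal sketch of that lifecycle (the option values are placeholders; the keys are the constants read by createKinesisProducer):

// compiles only inside package org.apache.spark.sql.kinesis: getOrCreate/close are private[kinesis]
val sinkOptions = Map(
  "endpointurl"    -> "https://kinesis.us-east-1.amazonaws.com", // used to resolve the producer region
  "awsaccesskeyid" -> "AKIA...",                                 // empty => STS role or instance profile
  "awssecretkey"   -> "...")

val producer = CachedKinesisProducer.getOrCreate(sinkOptions) // identical sorted options reuse the cached producer
// ... addUserRecord(...) calls issued by the writer task ...
CachedKinesisProducer.close(sinkOptions) // invalidates the entry; flushSync() and destroy() run in the removal listener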
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io.{FileNotFoundException, InputStream, InputStreamReader, OutputStream} 21 | import java.nio.charset.StandardCharsets 22 | import java.util.{EnumSet, Locale} 23 | 24 | import org.apache.commons.io.IOUtils 25 | import org.apache.hadoop.conf.Configuration 26 | import org.apache.hadoop.fs._ 27 | import org.apache.hadoop.fs.permission.FsPermission 28 | import org.json4s.NoTypeHints 29 | import org.json4s.jackson.Serialization 30 | import scala.reflect.ClassTag 31 | import scala.util.control.NonFatal 32 | 33 | import org.apache.spark.internal.Logging 34 | import org.apache.spark.util.SerializableConfiguration 35 | 36 | 37 | /* 38 | [[HDFSFileCommitter]] is used by executors to commit metadata to a HDFS location 39 | It is similar to [[HDFSMetadataLog]]. Difference is that it does not use 40 | [[SparkSession]] while creating fileContext. Hence it can used by executors. 41 | We could have modified [[HDFSMetadataLog]] but then changes for kinesis support 42 | would not have been contained within an external jar 43 | */ 44 | 45 | class HDFSMetadataCommitter[T <: AnyRef : ClassTag](path: String, 46 | hadoopConf: SerializableConfiguration, 47 | options: Map[String, String] = Map.empty[String, String]) 48 | extends MetadataCommitter[T] with Logging with Serializable{ 49 | 50 | 51 | private implicit val formats = Serialization.formats(NoTypeHints) 52 | 53 | /** Needed to serialize type T into JSON when using Jackson */ 54 | private implicit val manifest = Manifest.classType[T](implicitly[ClassTag[T]].runtimeClass) 55 | 56 | val metadataPath = new Path(path, "shard-commit") 57 | 58 | protected val fileContext = FileContext.getFileContext( 59 | metadataPath.toUri, hadoopConf.value.asInstanceOf[ Configuration ]) 60 | 61 | if ( !fileContext.util().exists(metadataPath) ) { 62 | fileContext.mkdir(metadataPath, FsPermission.getDirDefault, true) 63 | } 64 | 65 | private val numRetries: Int = { 66 | options.getOrElse("executor.metadata.hdfs.numretries", "3").toInt 67 | } 68 | 69 | private val retryIntervalMs: Long = { 70 | options.getOrElse("executor.metadata.hdfs.retryIntervalMs".toLowerCase(Locale.ROOT), 71 | "1000").toLong 72 | } 73 | 74 | private val maxRetryIntervalMs: Long = { 75 | options.getOrElse("executor.metadata.hdfs.maxRetryIntervalMs".toLowerCase(Locale.ROOT), 76 | "10000").toLong 77 | } 78 | 79 | 80 | /* 81 | * A `PathFilter` to filter only batch files 82 | */ 83 | 84 | protected val batchFilesFilter = new PathFilter { 85 | override def accept(path: Path): Boolean = isBatchFile(path) 86 | } 87 | 88 | protected def batchIdToPath(batchId: Long): Path = { 89 | new Path(metadataPath, batchId.toString) 90 | } 91 | 92 | protected def pathToBatchId(path: Path) = { 93 | path.getName.toLong 94 | } 95 | 96 | protected def isBatchFile(path: Path) = { 97 | try { 98 | path.getName.toLong 99 | true 100 | } catch { 101 | case _: 
NumberFormatException => false 102 | } 103 | } 104 | 105 | protected def serialize(metadata: T, out: OutputStream): Unit = { 106 | // called inside a try-finally where the underlying stream is closed in the caller 107 | Serialization.write(metadata, out) 108 | } 109 | 110 | protected def deserialize(in: InputStream): T = { 111 | // called inside a try-finally where the underlying stream is closed in the caller 112 | val reader = new InputStreamReader(in, StandardCharsets.UTF_8) 113 | Serialization.read[T](reader) 114 | } 115 | 116 | def create(batchId: Long): Unit = { 117 | val newPath = batchIdToPath(batchId) 118 | if ( !fileContext.util().exists(newPath) ) { 119 | fileContext.mkdir(newPath, FsPermission.getDirDefault, true) 120 | } 121 | } 122 | 123 | override def add(batchId: Long, shardId: String, metadata: T): Boolean = { 124 | require(metadata != null, "'null' metadata cannot written to a shard commit log") 125 | create(batchId) 126 | val shardCommitPath = new Path(batchIdToPath(batchId), shardId) 127 | import CreateFlag._ 128 | import Options._ 129 | 130 | val output = fileContext.create(shardCommitPath, 131 | EnumSet.of(CREATE, OVERWRITE), CreateOpts.checksumParam(ChecksumOpt.createDisabled())) 132 | try { 133 | serialize(metadata, output) 134 | output.close() 135 | } catch { 136 | case e: Throwable => 137 | // close the open stream and delete the new file added 138 | output.close() 139 | withRetry[Boolean]("deleting cancelled metadataFile") { 140 | fileContext.delete(shardCommitPath, false) 141 | } 142 | // throw the exception again so that the caller knows that add operation was not successful 143 | throw e 144 | } 145 | true 146 | } 147 | 148 | override def get(batchId: Long): Seq[T] = { 149 | val batchMetadataDir = batchIdToPath(batchId) 150 | withRetry[ Seq[ T ] ]("fetching MetaData") { 151 | if ( fileContext.util().exists(batchMetadataDir) ) { 152 | fileContext.util().listStatus(batchMetadataDir).map { f => 153 | getData(f.getPath) match { 154 | case Some(data) => data 155 | case None => 156 | // return if there is any one filepath from which we could not read any data 157 | logDebug(s"Unable to get data for ${f.getPath}") 158 | throw new IllegalStateException(s"Failed to get metadata for ${f.getPath}") 159 | } 160 | }.toSeq 161 | } else { 162 | logDebug(s"Unable to find batch $batchMetadataDir") 163 | throw new IllegalStateException(s"$batchMetadataDir does not exist") 164 | } 165 | } 166 | } 167 | 168 | def getData(path: Path): Option[ T ] = { 169 | if ( fileContext.util().exists(path) ) { 170 | val input = fileContext.open(path) 171 | try { 172 | Some(deserialize(input)) 173 | } catch { 174 | case ise: IllegalStateException => // re-throw the exception with the log file path added 175 | throw new IllegalStateException(s"Failed to read log file ${path}. " + 176 | s"${ise.getMessage}", ise) 177 | } finally { 178 | IOUtils.closeQuietly(input) 179 | } 180 | } else { 181 | logDebug(s"Unable to find file $path") 182 | None 183 | } 184 | } 185 | 186 | def delete(batchId: Long): Unit = { 187 | val batchMetadataDir = batchIdToPath(batchId) 188 | delete(batchMetadataDir) 189 | } 190 | 191 | def delete(path: Path): Unit = { 192 | try { 193 | fileContext.delete(path, true) 194 | } catch { 195 | case e: FileNotFoundException => 196 | // ignore if file has already been deleted 197 | } 198 | } 199 | 200 | /* 201 | * Removes all the log entry earlier than thresholdBatchId (exclusive). 
202 | */ 203 | override def purge(thresholdBatchId: Long): Unit = { 204 | val batchIds = fileContext.util().listStatus(metadataPath, batchFilesFilter) 205 | .map(f => pathToBatchId(f.getPath)) 206 | 207 | for (batchId <- batchIds if batchId < thresholdBatchId) { 208 | val path = batchIdToPath(batchId) 209 | delete(path) 210 | logTrace(s"Removed metadata log file: $path") 211 | } 212 | } 213 | 214 | /** Helper method to retry with exponential backoff */ 215 | def withRetry[ T ](message: String, ignoreException: Boolean = true)(body: => T): T = { 216 | var retryCount = 0 217 | var result: Option[ T ] = None 218 | var waitTimeInterval = retryIntervalMs 219 | var lastError: Throwable = null 220 | 221 | def isMaxRetryDone = retryCount >= numRetries 222 | 223 | while (result.isEmpty && !isMaxRetryDone) { 224 | if ( retryCount > 0 ) { // wait only if this is a retry 225 | Thread.sleep(waitTimeInterval) 226 | waitTimeInterval = scala.math.min(waitTimeInterval * 2, maxRetryIntervalMs) 227 | } 228 | try { 229 | result = Some(body) 230 | } catch { 231 | case NonFatal(t) => lastError = t 232 | if ( ignoreException ) { 233 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", t) 234 | } else { 235 | throw new IllegalStateException(s"Error while $message", t) 236 | } 237 | } 238 | retryCount += 1 239 | } 240 | result.getOrElse { 241 | throw new IllegalStateException(s"Gave up after $retryCount retries while $message," + 242 | s" last exception: ", lastError) 243 | } 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisPosition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
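The committer above writes one JSON file per shard under <path>/shard-commit/<batchId>/<shardId> and wraps reads and deletes in withRetry's exponential backoff. A minimal, package-internal sketch of the calls it exposes (checkpoint path, batch id and sequence number are illustrative):

import org.apache.hadoop.conf.Configuration
import org.apache.spark.util.SerializableConfiguration

val committer = new HDFSMetadataCommitter[ShardInfo](
  "/checkpoints/kinesis-query/sources/0",
  new SerializableConfiguration(new Configuration()))

val info = ShardInfo("shardId-000000000000", "AFTER_SEQUENCE_NUMBER", "49605240428...")
committer.add(7L, info.shardId, info)            // one file per shard for batch 7
val restored: Seq[ShardInfo] = committer.get(7L) // throws IllegalStateException if the batch directory is missing
committer.purge(5L)                              // removes batch directories with id < 5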
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.json4s.NoTypeHints 21 | import org.json4s.jackson.Serialization 22 | 23 | trait KinesisPosition extends Serializable { 24 | val iteratorType: String 25 | val iteratorPosition: String 26 | 27 | override def toString: String = s"KinesisPosition($iteratorType, $iteratorPosition)" 28 | } 29 | 30 | class TrimHorizon() extends KinesisPosition { 31 | override val iteratorType = "TRIM_HORIZON" 32 | override val iteratorPosition = "" 33 | } 34 | 35 | class Latest() extends KinesisPosition { 36 | override val iteratorType = "LATEST" 37 | override val iteratorPosition = "" 38 | } 39 | 40 | class AtTimeStamp(timestamp: String) extends KinesisPosition { 41 | def this(timestamp: Long) { 42 | this(timestamp.toString) 43 | } 44 | override val iteratorType = "AT_TIMESTAMP" 45 | override val iteratorPosition = timestamp.toString 46 | } 47 | 48 | class AfterSequenceNumber(seqNumber: String) extends KinesisPosition { 49 | override val iteratorType = "AFTER_SEQUENCE_NUMBER" 50 | override val iteratorPosition = seqNumber 51 | } 52 | 53 | class AtSequenceNumber(seqNumber: String) extends KinesisPosition { 54 | override val iteratorType = "AT_SEQUENCE_NUMBER" 55 | override val iteratorPosition = seqNumber 56 | } 57 | 58 | class ShardEnd() extends KinesisPosition { 59 | override val iteratorType = "SHARD_END" 60 | override val iteratorPosition = "" 61 | } 62 | 63 | private[kinesis] object KinesisPosition { 64 | def make(iteratorType: String, iteratorPosition: String): KinesisPosition = iteratorType match { 65 | case iterType if "TRIM_HORIZON".equalsIgnoreCase(iterType) => new TrimHorizon() 66 | case iterType if "LATEST".equalsIgnoreCase(iterType) => new Latest() 67 | case iterType if "AT_TIMESTAMP".equalsIgnoreCase(iterType) => new AtTimeStamp(iteratorPosition) 68 | case iterType if "AT_SEQUENCE_NUMBER".equalsIgnoreCase(iterType) => 69 | new AtSequenceNumber(iteratorPosition) 70 | case iterType if "AFTER_SEQUENCE_NUMBER".equalsIgnoreCase(iterType) => 71 | new AfterSequenceNumber(iteratorPosition) 72 | case iterType if "SHARD_END".equalsIgnoreCase(iterType) => new ShardEnd() 73 | } 74 | } 75 | 76 | /** 77 | * Specifies initial position in Kenesis to start read from on the application startup. 78 | * @param shardPositions map of shardId->KinesisPosition 79 | * @param defaultPosition position that is used for shard that is requested but not present in map 80 | */ 81 | private[kinesis] class InitialKinesisPosition(shardPositions: Map[String, KinesisPosition], 82 | defaultPosition: KinesisPosition) 83 | extends Serializable { 84 | 85 | def shardPosition(shardId: String): KinesisPosition = 86 | shardPositions.getOrElse(shardId, defaultPosition) 87 | 88 | override def toString: String = s"InitialKinesisPosition($shardPositions)" 89 | } 90 | 91 | private[kinesis] object InitialKinesisPosition { 92 | implicit val format = Serialization.formats(NoTypeHints) 93 | 94 | def fromPredefPosition(pos: KinesisPosition): InitialKinesisPosition = 95 | new InitialKinesisPosition(Map(), pos) 96 | 97 | /** 98 | * Parses json representation on Kinesis position. 99 | * It is useful if Kinesis position is persisted explicitly (e.g. at the end of the batch) 100 | * and used to continue reading records from the same position on Spark application redeploy. 
101 | * Kinesis position JSON representation example: 102 | * {{{ 103 | * { 104 | * "shardId-000000000001":{ 105 | * "iteratorType":"AFTER_SEQUENCE_NUMBER", 106 | * "iteratorPosition":"49605240428222307037115827613554798409561082419642105874" 107 | * }, 108 | * "metadata":{ 109 | * "streamName":"my.cool.stream2", 110 | * "batchId":"7" 111 | * }, 112 | * "shardId-000000000000":{ 113 | * "iteratorType":"AFTER_SEQUENCE_NUMBER", 114 | * "iteratorPosition":"49605240428200006291917297020490128157480794051565322242" 115 | * } 116 | * } 117 | * }}} 118 | * @param text JSON representation of Kinesis position. 119 | * @return 120 | */ 121 | def fromCheckpointJson(text: String, defaultPosition: KinesisPosition): InitialKinesisPosition = { 122 | val kso = KinesisSourceOffset(text) 123 | val shardOffsets = kso.shardsToOffsets 124 | 125 | new InitialKinesisPosition( 126 | shardOffsets.shardInfoMap 127 | .map(si => si._1 -> KinesisPosition.make(si._2.iteratorType, si._2.iteratorPosition)), 128 | defaultPosition 129 | ) 130 | } 131 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
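InitialKinesisPosition above either applies a single predefined position to every shard or, when checkpoint JSON is supplied, resumes each known shard from its saved iterator and falls back to the default for shards it has not seen. A small sketch (stream name, shard ids and sequence number are illustrative):

// start every shard from the oldest available record
val fromEarliest = InitialKinesisPosition.fromPredefPosition(new TrimHorizon())

// resume from an explicitly persisted offset; unknown shards default to "now"
val checkpointJson =
  """{"metadata":{"streamName":"my.cool.stream2","batchId":"7"},
    |"shardId-000000000000":{"iteratorType":"AFTER_SEQUENCE_NUMBER","iteratorPosition":"49605240428..."}}"""
    .stripMargin
val resumed = InitialKinesisPosition.fromCheckpointJson(
  checkpointJson, new AtTimeStamp(System.currentTimeMillis))

resumed.shardPosition("shardId-000000000000") // AFTER_SEQUENCE_NUMBER at the saved position
resumed.shardPosition("shardId-000000000042") // falls back to the AT_TIMESTAMP default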
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.math.BigInteger 21 | import java.util 22 | import java.util.{ArrayList, Locale} 23 | import java.util.concurrent.{Executors, ThreadFactory} 24 | 25 | import com.amazonaws.AbortedException 26 | import com.amazonaws.services.kinesis.AmazonKinesisClient 27 | import com.amazonaws.services.kinesis.clientlibrary.types.UserRecord 28 | import com.amazonaws.services.kinesis.model.{GetRecordsRequest, ListShardsRequest, Shard, _} 29 | import scala.collection.JavaConverters._ 30 | import scala.concurrent.{ExecutionContext, Future} 31 | import scala.concurrent.duration.Duration 32 | import scala.util.control.NonFatal 33 | 34 | import org.apache.spark.internal.Logging 35 | import org.apache.spark.sql.types._ 36 | import org.apache.spark.util.{ThreadUtils, UninterruptibleThread} 37 | 38 | 39 | // This class uses Kinesis API to read data offsets from Kinesis 40 | 41 | private[kinesis] case class KinesisReader( 42 | readerOptions: Map[String, String], 43 | streamName: String, 44 | kinesisCredsProvider: SparkAWSCredentials, 45 | endpointUrl: String 46 | ) extends Serializable with Logging { 47 | 48 | /* 49 | * Used to ensure execute fetch operations execute in an UninterruptibleThread 50 | */ 51 | val kinesisReaderThread = Executors.newSingleThreadExecutor(new ThreadFactory { 52 | override def newThread(r: Runnable): Thread = { 53 | val t = new UninterruptibleThread("Kinesis Reader") { 54 | override def run(): Unit = { 55 | r.run() 56 | } 57 | } 58 | t.setDaemon(true) 59 | t 60 | } 61 | }) 62 | 63 | val execContext = ExecutionContext.fromExecutorService(kinesisReaderThread) 64 | 65 | private val maxOffsetFetchAttempts = 66 | readerOptions.getOrElse("client.numRetries".toLowerCase(Locale.ROOT), "3").toInt 67 | 68 | private val offsetFetchAttemptIntervalMs = 69 | readerOptions.getOrElse("client.retryIntervalMs".toLowerCase(Locale.ROOT), "1000").toLong 70 | 71 | private val maxRetryIntervalMs: Long = { 72 | readerOptions.getOrElse("client.maxRetryIntervalMs".toLowerCase(Locale.ROOT), "10000").toLong 73 | } 74 | 75 | private val maxSupportedShardsPerStream = 10000; 76 | 77 | private var _amazonClient: AmazonKinesisClient = null 78 | 79 | private def getAmazonClient(): AmazonKinesisClient = { 80 | if (_amazonClient == null) { 81 | _amazonClient = new AmazonKinesisClient(kinesisCredsProvider.provider) 82 | _amazonClient.setEndpoint(endpointUrl) 83 | } 84 | _amazonClient 85 | } 86 | 87 | def getShards(): Seq[Shard] = { 88 | val shards = listShards 89 | logInfo(s"List shards in Kinesis Stream: ${shards}") 90 | shards 91 | } 92 | 93 | def close(): Unit = { 94 | runUninterruptibly { 95 | if (_amazonClient != null) { 96 | _amazonClient.shutdown() 97 | _amazonClient = null 98 | } 99 | } 100 | kinesisReaderThread.shutdown() 101 | } 102 | 103 | def getShardIterator(shardId: String, 104 | iteratorType: String, 105 | iteratorPosition: String, 106 | failOnDataLoss: Boolean = true): String = { 107 | 108 | val getShardIteratorRequest = new GetShardIteratorRequest 109 | getShardIteratorRequest.setShardId(shardId) 110 | getShardIteratorRequest.setStreamName(streamName) 111 | getShardIteratorRequest.setShardIteratorType(iteratorType) 112 | 113 | if (iteratorType == "AFTER_SEQUENCE_NUMBER" || iteratorType == "AT_SEQUENCE_NUMBER") { 114 | getShardIteratorRequest.setStartingSequenceNumber(iteratorPosition) 115 | } 116 | 117 | if (iteratorType == "AT_TIMESTAMP") { 118 | logDebug(s"TimeStamp while getting shard iterator ${ 119 | (new 
java.util.Date(iteratorPosition.toLong)).toString}") 120 | getShardIteratorRequest.setTimestamp(new java.util.Date(iteratorPosition.toLong)) 121 | } 122 | 123 | runUninterruptibly { 124 | retryOrTimeout[GetShardIteratorResult]( 125 | s"Fetching Shard Iterator") { 126 | try { 127 | getAmazonClient.getShardIterator(getShardIteratorRequest) 128 | } catch { 129 | case r: ResourceNotFoundException => 130 | if (!failOnDataLoss) { 131 | new GetShardIteratorResult() 132 | } 133 | else { 134 | throw r 135 | } 136 | } 137 | } 138 | }.getShardIterator 139 | } 140 | 141 | 142 | def getKinesisRecords(shardIterator: String, limit: Int): GetRecordsResult = { 143 | val getRecordsRequest = new GetRecordsRequest 144 | getRecordsRequest.setShardIterator(shardIterator) 145 | getRecordsRequest.setLimit(limit) 146 | val getRecordsResult: GetRecordsResult = runUninterruptibly { 147 | retryOrTimeout[ GetRecordsResult ](s"get Records for a shard ") { 148 | getAmazonClient.getRecords(getRecordsRequest) 149 | } 150 | } 151 | getRecordsResult 152 | } 153 | 154 | 155 | def deaggregateRecords(records: util.List[ Record ], shard: Shard): util.List[ Record] = { 156 | // We deaggregate if and only if we got actual Kinesis records, i.e. 157 | // not instances of some subclass thereof. 158 | if ( !records.isEmpty && records.get(0).getClass.equals(classOf[ Record ]) ) { 159 | if ( shard != null ) { 160 | return UserRecord.deaggregate( 161 | records, 162 | new BigInteger(shard.getHashKeyRange.getStartingHashKey), 163 | new BigInteger(shard.getHashKeyRange.getEndingHashKey)) 164 | .asInstanceOf[ util.List[ _ ] ].asInstanceOf[ util.List[ Record ] ] 165 | } else { 166 | return UserRecord.deaggregate(records) 167 | .asInstanceOf[ util.List[ _ ] ].asInstanceOf[ util.List[ Record ] ] 168 | } 169 | } 170 | records 171 | } 172 | 173 | private def listShards(): Seq[Shard] = { 174 | var nextToken = "" 175 | var returnedToken = "" 176 | val shards = new ArrayList[Shard]() 177 | val listShardsRequest = new ListShardsRequest 178 | listShardsRequest.setStreamName(streamName) 179 | listShardsRequest.setMaxResults(maxSupportedShardsPerStream) 180 | 181 | do { 182 | val listShardsResult: ListShardsResult = runUninterruptibly { 183 | retryOrTimeout[ListShardsResult]( s"List shards") { 184 | getAmazonClient.listShards(listShardsRequest) 185 | } 186 | } 187 | shards.addAll(listShardsResult.getShards) 188 | returnedToken = listShardsResult.getNextToken() 189 | if (returnedToken != null) { 190 | nextToken = returnedToken 191 | listShardsRequest.setNextToken(nextToken) 192 | } 193 | } while (!nextToken.isEmpty) 194 | 195 | shards.asScala.toSeq 196 | } 197 | 198 | /* 199 | * This method ensures that the closure is called in an [[UninterruptibleThread]]. 200 | * This is required when communicating with the AWS. 
In the case 201 | */ 202 | private def runUninterruptibly[T](body: => T): T = { 203 | if (!Thread.currentThread.isInstanceOf[UninterruptibleThread]) { 204 | val future = Future { 205 | body 206 | }(execContext) 207 | ThreadUtils.awaitResult(future, Duration.Inf) 208 | } else { 209 | body 210 | } 211 | } 212 | 213 | /** Helper method to retry Kinesis API request with exponential backoff and timeouts */ 214 | private def retryOrTimeout[T](message: String)(body: => T): T = { 215 | assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) 216 | 217 | val startTimeMs = System.currentTimeMillis() 218 | var retryCount = 0 219 | var result: Option[T] = None 220 | var lastError: Throwable = null 221 | var waitTimeInterval = offsetFetchAttemptIntervalMs 222 | 223 | def isMaxRetryDone = retryCount >= maxOffsetFetchAttempts 224 | 225 | while (result.isEmpty && !isMaxRetryDone) { 226 | if ( retryCount > 0 ) { // wait only if this is a retry 227 | Thread.sleep(waitTimeInterval) 228 | waitTimeInterval = scala.math.min(waitTimeInterval * 2, maxRetryIntervalMs) 229 | } 230 | try { 231 | result = Some(body) 232 | } catch { 233 | case NonFatal(t) => 234 | lastError = t 235 | t match { 236 | case ptee: ProvisionedThroughputExceededException => 237 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ptee) 238 | case lee: LimitExceededException => 239 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", lee) 240 | case ae: AbortedException => 241 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ae) 242 | case ake: AmazonKinesisException => 243 | if (ake.getStatusCode() >= 500) { 244 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ake) 245 | } else { 246 | throw new IllegalStateException(s"Error while $message", ake) 247 | } 248 | case e: Throwable => 249 | throw new IllegalStateException(s"Error while $message", e) 250 | } 251 | } 252 | retryCount += 1 253 | } 254 | result.getOrElse { 255 | throw new IllegalStateException( 256 | s"Gave up after $retryCount retries while $message, last exception: ", lastError) 257 | } 258 | } 259 | 260 | } 261 | 262 | 263 | private [kinesis] object KinesisReader { 264 | 265 | val kinesisSchema: StructType = 266 | StructType(Seq( 267 | StructField("data", BinaryType), 268 | StructField("streamName", StringType), 269 | StructField("partitionKey", StringType), 270 | StructField("sequenceNumber", StringType), 271 | StructField("approximateArrivalTimestamp", TimestampType)) 272 | ) 273 | } 274 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
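KinesisReader above funnels every AWS call through an UninterruptibleThread and retries throttling and 5xx failures with exponential backoff, so callers only see the final result or an IllegalStateException. A rough, package-internal sketch of the driver-side calls (stream name and endpoint are placeholders):

val reader = KinesisReader(
  readerOptions = Map("client.numretries" -> "3"),
  streamName = "my.cool.stream2",
  kinesisCredsProvider = InstanceProfileCredentials, // any SparkAWSCredentials implementation
  endpointUrl = "https://kinesis.us-east-1.amazonaws.com")

val shards = reader.getShards()
val shardIterator = reader.getShardIterator(shards.head.getShardId, "TRIM_HORIZON", "")
val batch = reader.getKinesisRecords(shardIterator, limit = 100)
val records = reader.deaggregateRecords(batch.getRecords, shards.head) // expands KPL-aggregated records
reader.close()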
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.{DataFrame, SQLContext} 22 | import org.apache.spark.sql.execution.streaming.Sink 23 | import org.apache.spark.sql.streaming.OutputMode 24 | 25 | private[kinesis] class KinesisSink(sqlContext: SQLContext, 26 | sinkOptions: Map[String, String], 27 | outputMode: OutputMode) 28 | extends Sink with Logging { 29 | 30 | @volatile private var latestBatchId = -1L 31 | 32 | override def toString: String = "KinesisSink" 33 | 34 | override def addBatch(batchId: Long, data: DataFrame): Unit = { 35 | if (batchId <= latestBatchId) { 36 | logInfo(s"Skipping already committed batch $batchId") 37 | } else { 38 | KinesisWriter.write(sqlContext.sparkSession, data.queryExecution, sinkOptions) 39 | latestBatchId = batchId 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io._ 21 | import java.util.Locale 22 | import java.util.concurrent.atomic.AtomicBoolean 23 | 24 | import com.amazonaws.services.kinesis.model.Record 25 | import org.apache.hadoop.conf.Configuration 26 | import scala.collection.parallel.ForkJoinTaskSupport 27 | 28 | import org.apache.spark.SparkContext 29 | import org.apache.spark.internal.Logging 30 | import org.apache.spark.sql._ 31 | import org.apache.spark.sql.catalyst.InternalRow 32 | import org.apache.spark.sql.catalyst.util.DateTimeUtils 33 | import org.apache.spark.sql.execution.streaming.{Offset, Source, _} 34 | import org.apache.spark.sql.types._ 35 | import org.apache.spark.unsafe.types.UTF8String 36 | import org.apache.spark.util.{SerializableConfiguration, ThreadUtils, Utils} 37 | 38 | /* 39 | * A [[Source]] that reads data from Kinesis using the following design. 40 | * 41 | * - The [[KinesisSourceOffset]] is the custom [[Offset]] defined for this source 42 | * 43 | * - The [[KinesisSource]] written to do the following. 44 | * 45 | * - `getOffset()` uses the [[KinesisSourceOffset]] to query the latest 46 | * available offsets, which are returned as a [[KinesisSourceOffset]]. 
47 | * 48 | * - `getBatch()` returns a DF 49 | * - The DF returned is based on [[KinesisSourceRDD]] 50 | */ 51 | 52 | private[kinesis] class KinesisSource( 53 | sqlContext: SQLContext, 54 | sourceOptions: Map[String, String], 55 | metadataPath: String, 56 | streamName: String, 57 | initialPosition: InitialKinesisPosition, 58 | endPointURL: String, 59 | kinesisCredsProvider: SparkAWSCredentials, 60 | failOnDataLoss: Boolean = true 61 | ) 62 | extends Source with Serializable with Logging { 63 | 64 | import KinesisSource._ 65 | 66 | private def sc: SparkContext = { 67 | sqlContext.sparkContext 68 | } 69 | 70 | private def kinesisReader: KinesisReader = { 71 | new KinesisReader(sourceOptions, streamName, kinesisCredsProvider, endPointURL) 72 | } 73 | 74 | private var currentShardOffsets: Option[ShardOffsets] = None 75 | 76 | private val minBatchesToRetain = sqlContext.sparkSession.sessionState.conf.minBatchesToRetain 77 | require(minBatchesToRetain > 0, "minBatchesToRetain has to be positive") 78 | 79 | private val describeShardInterval: Long = { 80 | Utils.timeStringAsMs(sourceOptions.getOrElse(KinesisSourceProvider.DESCRIBE_SHARD_INTERVAL, 81 | "1s")) 82 | } 83 | 84 | require(describeShardInterval >= 0, "describeShardInterval cannot be less than 0 sec") 85 | 86 | private var latestDescribeShardTimestamp: Long = -1L 87 | 88 | private def metadataCommitter: MetadataCommitter[ShardInfo] = { 89 | metaDataCommitterType.toLowerCase(Locale.ROOT) match { 90 | case "hdfs" => 91 | new HDFSMetadataCommitter[ ShardInfo ](metaDataCommitterPath, 92 | hadoopConf(sqlContext), sourceOptions) 93 | case _ => throw new IllegalArgumentException("only HDFS is supported") 94 | } 95 | } 96 | 97 | private def metaDataCommitterType: String = { 98 | sourceOptions.getOrElse("executor.metadata.committer", "hdfs").toString 99 | } 100 | 101 | private def metaDataCommitterPath: String = { 102 | sourceOptions.getOrElse("executor.metadata.path", metadataPath).toString 103 | } 104 | 105 | private val avoidEmptyBatches = 106 | sourceOptions.getOrElse("client.avoidEmptyBatches". 107 | toLowerCase(Locale.ROOT), "false").toBoolean 108 | 109 | private val maxParallelThreads = 110 | sourceOptions.getOrElse("client.maxParallelThreads". 111 | toLowerCase(Locale.ROOT), "8").toInt 112 | 113 | def options: Map[String, String] = { 114 | // This function is used for testing 115 | sourceOptions 116 | } 117 | 118 | def getFailOnDataLoss(): Boolean = { 119 | // This function is used for testing 120 | failOnDataLoss 121 | } 122 | 123 | /** Makes an API call to get one record for a shard. Return true if the call is successful */ 124 | def hasNewData(shardInfo: ShardInfo): Boolean = { 125 | val shardIterator = kinesisReader.getShardIterator( 126 | shardInfo.shardId, 127 | shardInfo.iteratorType, 128 | shardInfo.iteratorPosition) 129 | val records = kinesisReader.getKinesisRecords(shardIterator, 1) 130 | // Return true if we can get back a record. 
Or if we have not reached the end of the stream 131 | (records.getRecords.size() > 0 || records.getMillisBehindLatest.longValue() > 0) 132 | } 133 | 134 | def canCreateNewBatch(shardsInfo: Array[ShardInfo]): Boolean = { 135 | var shardsInfoToCheck = shardsInfo.par 136 | val threadPoolSize = Math.min(maxParallelThreads, shardsInfoToCheck.size) 137 | val evalPool = ThreadUtils.newForkJoinPool("checkCreateNewBatch", threadPoolSize) 138 | shardsInfoToCheck.tasksupport = new ForkJoinTaskSupport(evalPool) 139 | val hasRecords = new AtomicBoolean(false) 140 | try { 141 | shardsInfoToCheck.foreach { s => 142 | if (!hasRecords.get() && hasNewData(s)) { 143 | hasRecords.set(true) 144 | } 145 | } 146 | } finally { 147 | evalPool.shutdown() 148 | } 149 | logDebug(s"Can create new batch = ${hasRecords.get()}") 150 | hasRecords.get() 151 | } 152 | 153 | def hasShardEndAsOffset(shardInfo: Seq[ShardInfo]): Boolean = { 154 | shardInfo.exists { 155 | s: (ShardInfo) => (s.iteratorType.contains(new ShardEnd().iteratorType)) 156 | } 157 | } 158 | 159 | /** Returns the shards position to start reading data from */ 160 | override def getOffset: Option[Offset] = synchronized { 161 | val defaultOffset = new ShardOffsets(-1L, streamName) 162 | val prevBatchId = currentShardOffsets.getOrElse(defaultOffset).batchId 163 | val prevShardsInfo = prevBatchShardInfo(prevBatchId) 164 | 165 | val latestShardInfo: Array[ShardInfo] = { 166 | if (prevBatchId < 0 167 | || latestDescribeShardTimestamp == -1 168 | || ((latestDescribeShardTimestamp + describeShardInterval) < System.currentTimeMillis())) { 169 | val latestShards = kinesisReader.getShards() 170 | latestDescribeShardTimestamp = System.currentTimeMillis() 171 | ShardSyncer.getLatestShardInfo(latestShards, prevShardsInfo, 172 | initialPosition, failOnDataLoss) 173 | } else { 174 | prevShardsInfo 175 | } 176 | }.toArray 177 | 178 | if (!avoidEmptyBatches 179 | || prevBatchId < 0 180 | || hasShardEndAsOffset(latestShardInfo) 181 | || ShardSyncer.hasNewShards(prevShardsInfo, latestShardInfo) 182 | || ShardSyncer.hasDeletedShards(prevShardsInfo, latestShardInfo) 183 | || canCreateNewBatch(latestShardInfo)) { 184 | currentShardOffsets = Some(new ShardOffsets(prevBatchId + 1, streamName, latestShardInfo)) 185 | } else { 186 | log.info("Offsets are unchanged since `kinesis.client.avoidEmptyBatches` is enabled") 187 | } 188 | 189 | currentShardOffsets match { 190 | case None => None 191 | case Some(cso) => Some(KinesisSourceOffset(cso)) 192 | } 193 | } 194 | 195 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = { 196 | logInfo(s"End Offset is ${end.toString}") 197 | val currBatchShardOffset = KinesisSourceOffset.getShardOffsets(end) 198 | val currBatchId = currBatchShardOffset.batchId 199 | var prevBatchId: Long = start match { 200 | case Some(prevBatchStartOffset) => 201 | KinesisSourceOffset.getShardOffsets(prevBatchStartOffset).batchId 202 | case None => -1.toLong 203 | } 204 | assert(prevBatchId <= currBatchId) 205 | 206 | val shardInfos = { 207 | // filter out those shardInfos for which ShardIterator is shard_end 208 | currBatchShardOffset.shardInfoMap.values.toSeq.filter { 209 | s: (ShardInfo) => !(s.iteratorType.contains(new ShardEnd().iteratorType)) 210 | }.sortBy(_.shardId.toString) 211 | } 212 | logInfo(s"Processing ${shardInfos.length} shards from ${shardInfos}") 213 | 214 | // Create an RDD that reads from Kinesis 215 | val kinesisSourceRDD = new KinesisSourceRDD( 216 | sc, 217 | sourceOptions, 218 | streamName, 219 | currBatchId, 220 | 
shardInfos, 221 | kinesisCredsProvider, 222 | endPointURL, 223 | hadoopConf(sqlContext), 224 | metadataPath, 225 | failOnDataLoss) 226 | 227 | val rdd = kinesisSourceRDD.map { r: Record => 228 | InternalRow( 229 | r.getData.array(), 230 | UTF8String.fromString(streamName), 231 | UTF8String.fromString(r.getPartitionKey), 232 | UTF8String.fromString(r.getSequenceNumber), 233 | DateTimeUtils.fromJavaTimestamp( 234 | new java.sql.Timestamp(r.getApproximateArrivalTimestamp.getTime)) 235 | ) 236 | } 237 | 238 | // On recovery, getBatch will get called before getOffset 239 | if (currentShardOffsets.isEmpty) { 240 | currentShardOffsets = Some(currBatchShardOffset) 241 | } 242 | 243 | logInfo("GetBatch generating RDD of offset range: " + 244 | shardInfos.mkString(", ")) 245 | 246 | sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) 247 | 248 | } 249 | 250 | override def schema: StructType = KinesisReader.kinesisSchema 251 | 252 | /** Stop this source and free any resources it has allocated. */ 253 | override def stop(): Unit = synchronized { 254 | kinesisReader.close() 255 | } 256 | 257 | override def commit(end: Offset): Unit = { 258 | val defaultOffset = new ShardOffsets(-1L, streamName) 259 | val currBatchId = currentShardOffsets.getOrElse(defaultOffset).batchId 260 | val thresholdBatchId = currBatchId - minBatchesToRetain 261 | if (thresholdBatchId >= 0) { 262 | logInfo(s"Purging Committed Entries. ThresholdBatchId = ${thresholdBatchId}") 263 | metadataCommitter.purge(thresholdBatchId) 264 | } 265 | } 266 | 267 | override def toString(): String = s"KinesisSource[$streamName]" 268 | 269 | private def prevBatchShardInfo(batchId: Long): Seq[ShardInfo] = { 270 | val shardInfo = if (batchId < 0) { 271 | logInfo(s"This is the first batch. Returning Empty sequence") 272 | Seq.empty[ShardInfo] 273 | } else { 274 | logDebug(s"BatchId of previously executed batch is $batchId") 275 | val prevShardinfo = metadataCommitter.get(batchId) 276 | if (prevShardinfo.isEmpty) { 277 | throw new IllegalStateException(s"Unable to fetch " + 278 | s"committed metadata from previous batch. Some data may have been missed") 279 | } 280 | prevShardinfo 281 | } 282 | logDebug(s"Shard Info is ${shardInfo.mkString(", ")}") 283 | shardInfo 284 | } 285 | 286 | } 287 | 288 | object KinesisSource { 289 | 290 | val VERSION = 1 291 | 292 | private var _hadoopConf: SerializableConfiguration = null 293 | 294 | def hadoopConf(sqlContext: SQLContext): SerializableConfiguration = { 295 | if (_hadoopConf == null) { 296 | val conf: Configuration = sqlContext.sparkSession.sessionState.newHadoopConf() 297 | _hadoopConf = new SerializableConfiguration(conf) 298 | } 299 | _hadoopConf 300 | } 301 | 302 | } 303 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSourceOffset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
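At each trigger KinesisSource hands Spark a KinesisSourceOffset, and the class that follows serializes it as a single JSON object holding a "metadata" entry plus one entry per shard. A small sketch of that round trip (shard id and sequence number are illustrative):

val offset = KinesisSourceOffset(new ShardOffsets(7L, "my.cool.stream2",
  Array(ShardInfo("shardId-000000000000", "AFTER_SEQUENCE_NUMBER", "49605240428..."))))

val json = offset.json
// e.g. {"metadata":{"batchId":"7","streamName":"my.cool.stream2"},
//       "shardId-000000000000":{"iteratorType":"AFTER_SEQUENCE_NUMBER","iteratorPosition":"49605240428..."}}

val restored = KinesisSourceOffset(json)
KinesisSourceOffset.getShardOffsets(restored)
  .shardInfoMap("shardId-000000000000").iteratorType // "AFTER_SEQUENCE_NUMBER"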
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.json4s.NoTypeHints 21 | import org.json4s.jackson.Serialization 22 | import scala.collection.mutable.HashMap 23 | import scala.util.control.NonFatal 24 | 25 | import org.apache.spark.sql.execution.streaming.Offset 26 | import org.apache.spark.sql.execution.streaming.SerializedOffset 27 | 28 | /* 29 | * @param shardsToOffsets 30 | */ 31 | 32 | case class KinesisSourceOffset(shardsToOffsets: ShardOffsets) extends Offset { 33 | override def json: String = { 34 | val metadata = HashMap[String, String]( 35 | "batchId" -> shardsToOffsets.batchId.toString, 36 | "streamName" -> shardsToOffsets.streamName) 37 | val result = HashMap[String, HashMap[String, String]]("metadata" -> metadata) 38 | 39 | val shardInfos = shardsToOffsets.shardInfoMap.keySet.toSeq.sorted // sort for more determinism 40 | 41 | shardInfos.foreach { 42 | shardId: String => 43 | val shardInfo: ShardInfo = shardsToOffsets.shardInfoMap.get(shardId).get 44 | val part = result.getOrElse(shardInfo.shardId, new HashMap[String, String]) 45 | part += "iteratorType" -> shardInfo.iteratorType 46 | part += "iteratorPosition" -> shardInfo.iteratorPosition 47 | result += shardId -> part 48 | } 49 | Serialization.write(result)(KinesisSourceOffset.format) 50 | } 51 | } 52 | 53 | object KinesisSourceOffset { 54 | implicit val format = Serialization.formats(NoTypeHints) 55 | 56 | def getShardOffsets(offset: Offset): ShardOffsets = { 57 | offset match { 58 | case kso: KinesisSourceOffset => kso.shardsToOffsets 59 | case so: SerializedOffset => KinesisSourceOffset(so).shardsToOffsets 60 | case _ => throw 61 | new IllegalArgumentException(s"Invalid conversion " + 62 | s"from offset of ${offset.getClass} to KinesisSourceOffset") 63 | } 64 | } 65 | 66 | /* 67 | * Returns [[KinesisSourceOffset]] from a JSON [[SerializedOffset]] 68 | */ 69 | def apply(so: SerializedOffset): KinesisSourceOffset = { 70 | apply(so.json) 71 | } 72 | 73 | /* 74 | * Returns [[KinesisSourceOffset]] from a JSON 75 | */ 76 | def apply(json: String): KinesisSourceOffset = { 77 | try { 78 | val readObj = Serialization.read[ Map[ String, Map[ String, String ] ] ](json) 79 | val metadata = readObj.get("metadata") 80 | val shardInfoMap: Map[String, ShardInfo ] = readObj.filter(_._1 != "metadata").map { 81 | case (shardId, value) => shardId.toString -> new ShardInfo(shardId.toString, 82 | value.get("iteratorType").get, 83 | value.get("iteratorPosition").get) 84 | }.toMap 85 | KinesisSourceOffset( 86 | new ShardOffsets( 87 | metadata.get("batchId").toLong, 88 | metadata.get("streamName"), 89 | shardInfoMap)) 90 | } catch { 91 | case NonFatal(x) => throw new IllegalArgumentException(x) 92 | } 93 | } 94 | 95 | def getMap(shardInfos: Array[ShardInfo]): Map[String, ShardInfo] = { 96 | shardInfos.map { 97 | s: ShardInfo => (s.shardId -> s) 98 | }.toMap 99 | } 100 | 101 | } 102 | 103 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSourceProvider.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.SQLContext 24 | import org.apache.spark.sql.execution.streaming.{Sink, Source} 25 | import org.apache.spark.sql.sources._ 26 | import org.apache.spark.sql.streaming.OutputMode 27 | import org.apache.spark.sql.types.StructType 28 | 29 | /* 30 | * The provider class for the [[KinesisSource]]. This provider is designed such that it throws 31 | * IllegalArgumentException when the Kinesis Dataset is created, so that it can catch 32 | * missing options even before the query is started. 33 | */ 34 | 35 | private[kinesis] class KinesisSourceProvider extends DataSourceRegister 36 | with StreamSourceProvider 37 | with StreamSinkProvider 38 | with Logging { 39 | 40 | import KinesisSourceProvider._ 41 | 42 | override def shortName(): String = "kinesis" 43 | 44 | /* 45 | * Returns the name and schema of the source. In addition, it also verifies whether the options 46 | * are correct and sufficient to create the [[KinesisSource]] when the query is started. 
47 | */ 48 | 49 | override def sourceSchema( 50 | sqlContext: SQLContext, 51 | schema: Option[StructType], 52 | providerName: String, 53 | parameters: Map[String, String]): (String, StructType) = { 54 | val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } 55 | validateStreamOptions(caseInsensitiveParams) 56 | require(schema.isEmpty, "Kinesis source has a fixed schema and cannot be set with a custom one") 57 | (shortName(), KinesisReader.kinesisSchema) 58 | } 59 | 60 | override def createSource( 61 | sqlContext: SQLContext, 62 | metadataPath: String, 63 | schema: Option[StructType], 64 | providerName: String, 65 | parameters: Map[String, String]): Source = { 66 | 67 | val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } 68 | 69 | validateStreamOptions(caseInsensitiveParams) 70 | 71 | val specifiedKinesisParams = 72 | parameters 73 | .keySet 74 | .filter(_.toLowerCase(Locale.ROOT).startsWith("kinesis.")) 75 | .map { k => k.drop(8).toString -> parameters(k) } 76 | .toMap 77 | 78 | val streamName = caseInsensitiveParams.get(STREAM_NAME_KEY).get 79 | 80 | val awsAccessKeyId = caseInsensitiveParams.get(AWS_ACCESS_KEY_ID).getOrElse("") 81 | val awsSecretKey = caseInsensitiveParams.get(AWS_SECRET_KEY).getOrElse("") 82 | val sessionToken = caseInsensitiveParams.get(AWS_SESSION_TOKEN).getOrElse("") 83 | val awsStsRoleArn = caseInsensitiveParams.get(AWS_STS_ROLE_ARN).getOrElse("") 84 | val awsStsSessionName = caseInsensitiveParams.get(AWS_STS_SESSION_NAME).getOrElse("") 85 | val awsUseInstanceProfile = caseInsensitiveParams.getOrElse(AWS_USE_INSTANCE_PROFILE, "true") 86 | .toBoolean 87 | 88 | val regionName = caseInsensitiveParams.get(REGION_NAME_KEY) 89 | .getOrElse(DEFAULT_KINESIS_REGION_NAME) 90 | val endPointURL = caseInsensitiveParams.get(END_POINT_URL) 91 | .getOrElse(DEFAULT_KINESIS_ENDPOINT_URL) 92 | 93 | val failOnDataLoss = caseInsensitiveParams.get(FAILONDATALOSS) 94 | .getOrElse("true").toBoolean 95 | 96 | val initialPosition: InitialKinesisPosition = getKinesisPosition(caseInsensitiveParams) 97 | 98 | val kinesisCredsProvider = if (awsAccessKeyId.length > 0) { 99 | if(sessionToken.length > 0) { 100 | BasicAWSSessionCredentials(awsAccessKeyId, awsSecretKey, sessionToken) 101 | } else { 102 | BasicCredentials(awsAccessKeyId, awsSecretKey) 103 | } 104 | } else if (awsStsRoleArn.length > 0) { 105 | STSCredentials(awsStsRoleArn, awsStsSessionName) 106 | } else if (awsUseInstanceProfile) { 107 | InstanceProfileCredentials 108 | } else { 109 | DefaultCredentials 110 | } 111 | 112 | new KinesisSource( 113 | sqlContext, specifiedKinesisParams, metadataPath, 114 | streamName, initialPosition, endPointURL, kinesisCredsProvider, failOnDataLoss) 115 | } 116 | 117 | private def validateStreamOptions(caseInsensitiveParams: Map[String, String]) = { 118 | if (!caseInsensitiveParams.contains(STREAM_NAME_KEY) || 119 | caseInsensitiveParams.get(STREAM_NAME_KEY).get.isEmpty) { 120 | throw new IllegalArgumentException( 121 | "Stream name is a required field") 122 | } 123 | } 124 | 125 | private def validateSinkOptions(caseInsensitiveParams: Map[String, String]): Unit = { 126 | if (!caseInsensitiveParams.contains(SINK_STREAM_NAME_KEY) || 127 | caseInsensitiveParams(SINK_STREAM_NAME_KEY).isEmpty) { 128 | throw new IllegalArgumentException( 129 | "Stream name is a required field") 130 | } 131 | if (!caseInsensitiveParams.contains(SINK_ENDPOINT_URL) || 132 | caseInsensitiveParams(SINK_ENDPOINT_URL).isEmpty) { 133 | throw new 
IllegalArgumentException( 134 | "Sink endpoint url is a required field") 135 | } 136 | if (caseInsensitiveParams.contains(SINK_AGGREGATION_ENABLED) && ( 137 | caseInsensitiveParams(SINK_AGGREGATION_ENABLED).trim != "true" && 138 | caseInsensitiveParams(SINK_AGGREGATION_ENABLED).trim != "false" 139 | )) { 140 | throw new IllegalArgumentException( 141 | "Sink aggregation value must be either true or false") 142 | } 143 | } 144 | 145 | override def createSink( 146 | sqlContext: SQLContext, 147 | parameters: Map[String, String], 148 | partitionColumns: Seq[String], 149 | outputMode: OutputMode): Sink = { 150 | val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } 151 | validateSinkOptions(caseInsensitiveParams) 152 | new KinesisSink(sqlContext, caseInsensitiveParams, outputMode) 153 | } 154 | 155 | } 156 | 157 | private[kinesis] object KinesisSourceProvider extends Logging { 158 | 159 | private[kinesis] val STREAM_NAME_KEY = "streamname" 160 | private[kinesis] val END_POINT_URL = "endpointurl" 161 | private[kinesis] val REGION_NAME_KEY = "regionname" 162 | private[kinesis] val AWS_ACCESS_KEY_ID = "awsaccesskeyid" 163 | private[kinesis] val AWS_SECRET_KEY = "awssecretkey" 164 | private[kinesis] val AWS_SESSION_TOKEN = "sessiontoken" 165 | private[kinesis] val AWS_STS_ROLE_ARN = "awsstsrolearn" 166 | private[kinesis] val AWS_STS_SESSION_NAME = "awsstssessionname" 167 | private[kinesis] val AWS_USE_INSTANCE_PROFILE = "awsuseinstanceprofile" 168 | private[kinesis] val STARTING_POSITION_KEY = "startingposition" 169 | private[kinesis] val FAILONDATALOSS = "failondataloss" 170 | 171 | private[kinesis] val DESCRIBE_SHARD_INTERVAL = "client.describeshardinterval" 172 | 173 | // Sink Options 174 | private[kinesis] val SINK_STREAM_NAME_KEY = "streamname" 175 | private[kinesis] val SINK_ENDPOINT_URL = "endpointurl" 176 | private[kinesis] val SINK_RECORD_MAX_BUFFERED_TIME = "kinesis.executor.recordmaxbufferedtime" 177 | private[kinesis] val SINK_MAX_CONNECTIONS = "kinesis.executor.maxconnections" 178 | private[kinesis] val SINK_AGGREGATION_ENABLED = "kinesis.executor.aggregationenabled" 179 | private[kinesis] val SINK_FLUSH_WAIT_TIME_MILLIS = "kniesis.executor.flushwaittimemillis" 180 | 181 | 182 | private[kinesis] def getKinesisPosition( 183 | params: Map[String, String]): InitialKinesisPosition = { 184 | val CURRENT_TIMESTAMP = System.currentTimeMillis 185 | params.get(STARTING_POSITION_KEY).map(_.trim) match { 186 | case Some(position) if position.toLowerCase(Locale.ROOT) == "latest" => 187 | InitialKinesisPosition.fromPredefPosition(new AtTimeStamp(CURRENT_TIMESTAMP)) 188 | case Some(position) if position.toLowerCase(Locale.ROOT) == "trim_horizon" => 189 | InitialKinesisPosition.fromPredefPosition(new TrimHorizon) 190 | case Some(position) if position.toLowerCase(Locale.ROOT) == "earliest" => 191 | InitialKinesisPosition.fromPredefPosition(new TrimHorizon) 192 | case Some(json) => 193 | InitialKinesisPosition.fromCheckpointJson(json, new AtTimeStamp(CURRENT_TIMESTAMP)) 194 | case None => InitialKinesisPosition.fromPredefPosition(new AtTimeStamp(CURRENT_TIMESTAMP)) 195 | } 196 | } 197 | 198 | private[kinesis] val DEFAULT_KINESIS_ENDPOINT_URL: String = 199 | "https://kinesis.us-east-1.amazonaws.com" 200 | 201 | private[kinesis] val DEFAULT_KINESIS_REGION_NAME: String = "us-east-1" 202 | 203 | private[kinesis] val DEFAULT_SINK_RECORD_MAX_BUFFERED_TIME: String = "1000" 204 | 205 | private[kinesis] val DEFAULT_SINK_MAX_CONNECTIONS: String = "1" 206 | 207 | 
private[kinesis] val DEFAULT_SINK_AGGREGATION: String = "true" 208 | 209 | private[kinesis] val DEFAULT_FLUSH_WAIT_TIME_MILLIS: String = "100" 210 | } 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSourceRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import com.amazonaws.services.kinesis.model.{GetRecordsResult, Record} 20 | import java.io.Serializable 21 | import java.util.Locale 22 | import scala.collection.JavaConverters._ 23 | 24 | import org.apache.spark.{Partition, SparkContext, TaskContext} 25 | import org.apache.spark.rdd.RDD 26 | import org.apache.spark.storage.StorageLevel 27 | import org.apache.spark.util.NextIterator 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | 31 | /** Offset range that one partition of the KinesiSourceRDD has to read */ 32 | private[kinesis] case class ShardInfo( 33 | shardId: String, 34 | iteratorType: String, 35 | iteratorPosition: String) extends Serializable { 36 | 37 | def this(shardId: String, kinesisPosition: KinesisPosition) { 38 | this(shardId, kinesisPosition.iteratorType, kinesisPosition.iteratorPosition) 39 | } 40 | } 41 | 42 | private[kinesis] case class ShardOffsets( 43 | batchId: Long, 44 | streamName: String, 45 | shardInfoMap: Map[String, ShardInfo] 46 | ) extends Serializable { 47 | 48 | def this(batchId: Long, streamName: String) { 49 | this(batchId, streamName, Map.empty[String, ShardInfo]) 50 | } 51 | 52 | def this(shardInfoMap: Map[String, ShardInfo]) { 53 | this(-1, "", shardInfoMap) 54 | } 55 | 56 | def this(batchId: Long, streamName: String, shardInfos: Array[ShardInfo]) { 57 | this(batchId, streamName, KinesisSourceOffset.getMap(shardInfos)) 58 | } 59 | 60 | def this(shardInfos: Array[ShardInfo]) { 61 | this(-1, "", KinesisSourceOffset.getMap(shardInfos)) 62 | } 63 | 64 | } 65 | 66 | 67 | /** Partition of the KinesiSourceRDD */ 68 | private[kinesis] case class KinesisSourceRDDPartition( 69 | index: Int, 70 | shardInfo: ShardInfo) extends Partition 71 | 72 | /* 73 | * An RDD that reads data from Kinesis based on offset ranges across multiple shards. 
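Putting the provider options together, a query can be wired end to end through the "kinesis" short name registered in META-INF/services. A hedged sketch (stream name, endpoint and checkpoint location are placeholders; option keys are matched case-insensitively, and with no credential options the source falls back to the instance profile):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("kinesis-sketch").getOrCreate()

val input = spark.readStream
  .format("kinesis")
  .option("streamName", "my.cool.stream2")
  .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com")
  .option("startingPosition", "TRIM_HORIZON") // or LATEST / EARLIEST / a checkpoint JSON
  .load()

// the fixed schema is data, streamName, partitionKey, sequenceNumber, approximateArrivalTimestamp
val query = input.selectExpr("CAST(data AS STRING) AS value")
  .writeStream
  .format("console")
  .option("checkpointLocation", "/tmp/kinesis-sketch-checkpoint")
  .start()

Writing back to a stream works the same way with .writeStream.format("kinesis") plus the sink options (streamName and endpointUrl) validated by createSink above.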
74 | */ 75 | 76 | private[kinesis] class KinesisSourceRDD( 77 | sparkContext: SparkContext, 78 | sourceOptions: Map[String, String], 79 | streamName: String, 80 | batchId: Long, 81 | shardInfos: Seq[ShardInfo], 82 | kinesisCredsProvider: SparkAWSCredentials, 83 | endpointUrl: String, 84 | conf: SerializableConfiguration, 85 | metadataPath: String, 86 | failOnDataLoss: Boolean = true 87 | ) 88 | extends RDD[Record](sparkContext, Nil) { 89 | 90 | override def persist(newLevel: StorageLevel): this.type = { 91 | logError("Kinesis Record is not serializable. " + 92 | "Use .map to extract fields before calling .persist or .window") 93 | super.persist(newLevel) 94 | } 95 | 96 | override def getPartitions: Array[Partition] = { 97 | shardInfos.zipWithIndex.map { case (o, i) => new KinesisSourceRDDPartition(i, o) }.toArray 98 | } 99 | 100 | override def compute( 101 | thePart: Partition, 102 | context: TaskContext): Iterator[Record] = { 103 | val sourcePartition = thePart.asInstanceOf[KinesisSourceRDDPartition] 104 | 105 | val kinesisShardId = sourcePartition.shardInfo.shardId 106 | 107 | val kinesisReader = new KinesisReader( 108 | sourceOptions, 109 | streamName, 110 | kinesisCredsProvider, 111 | endpointUrl 112 | ) 113 | 114 | val maxFetchTimeInMs = 115 | sourceOptions.getOrElse("executor.maxFetchTimeInMs".toLowerCase(Locale.ROOT), "1000").toLong 116 | 117 | val maxRecordsPerShard = 118 | sourceOptions.getOrElse("executor.maxFetchRecordsPerShard".toLowerCase(Locale.ROOT), 119 | "100000").toLong 120 | 121 | val recordPerRequest = 122 | sourceOptions.getOrElse("executor.maxRecordPerRead".toLowerCase(Locale.ROOT), "10000").toInt 123 | 124 | val enableIdleTimeBetweenReads: Boolean = 125 | sourceOptions.getOrElse("executor.addIdleTimeBetweenReads".toLowerCase(Locale.ROOT), 126 | "false").toBoolean 127 | 128 | val idleTimeBetweenReads = 129 | sourceOptions.getOrElse("executor.idleTimeBetweenReadsInMs".toLowerCase(Locale.ROOT), 130 | "1000").toLong 131 | 132 | val startTimestamp: Long = System.currentTimeMillis 133 | var lastReadTimeMs: Long = 0 134 | var lastReadSequenceNumber: String = "" 135 | var numRecordRead: Long = 0 136 | var hasShardClosed = false 137 | 138 | val underlying = new NextIterator[Record]() { 139 | var _shardIterator: String = null 140 | var fetchedRecords: Array[Record] = Array.empty 141 | var currentIndex = 0 142 | var fetchNext = true 143 | 144 | def getShardIterator(): String = { 145 | if (_shardIterator == null) { 146 | _shardIterator = kinesisReader.getShardIterator( 147 | sourcePartition.shardInfo.shardId, 148 | sourcePartition.shardInfo.iteratorType, 149 | sourcePartition.shardInfo.iteratorPosition, 150 | failOnDataLoss) 151 | if (!failOnDataLoss && _shardIterator == null) { 152 | logWarning( 153 | s""" 154 | | Some data may have been lost because ${sourcePartition.shardInfo.shardId} 155 | | is not available in Kinesis any more. The shard has 156 | | we have processed all records in it. We would ignore th 157 | | processing. 
If you want your streaming query to 158 | | set the source option "failOnDataLoss" to "true" 159 | """.stripMargin) 160 | return _shardIterator 161 | } 162 | } 163 | assert(_shardIterator != null) 164 | _shardIterator 165 | } 166 | 167 | def canFetchMoreRecords(currentTimestamp: Long): Boolean = { 168 | currentTimestamp - startTimestamp < maxFetchTimeInMs 169 | } 170 | 171 | def addDelayInFetchingRecords(currentTimestamp: Long): Unit = { 172 | if ( enableIdleTimeBetweenReads && lastReadTimeMs > 0 ) { 173 | val delayMs: Long = idleTimeBetweenReads - (currentTimestamp - lastReadTimeMs) 174 | if (delayMs > 0) { 175 | logInfo(s"Sleeping for ${delayMs}ms") 176 | Thread.sleep(delayMs) 177 | } 178 | } 179 | } 180 | 181 | override def getNext(): Record = { 182 | if (fetchedRecords.length == 0 || currentIndex >= fetchedRecords.length) { 183 | fetchedRecords = Array.empty 184 | currentIndex = 0 185 | while (fetchedRecords.length == 0 && fetchNext == true) { 186 | val currentTimestamp: Long = System.currentTimeMillis 187 | if (canFetchMoreRecords(currentTimestamp) && getShardIterator() != null) { 188 | // getShardIterator() should raise exception if its null if failOnDataLoss is true 189 | // if failOnDataLoss is false, getShardIterator() will be null and we should stop 190 | // fetching more records 191 | addDelayInFetchingRecords(currentTimestamp) 192 | val records: GetRecordsResult = kinesisReader.getKinesisRecords( 193 | _shardIterator, recordPerRequest) 194 | // de-aggregate records 195 | val deaggregateRecords = kinesisReader.deaggregateRecords(records.getRecords, null) 196 | fetchedRecords = deaggregateRecords.asScala.toArray 197 | _shardIterator = records.getNextShardIterator 198 | lastReadTimeMs = System.currentTimeMillis() 199 | logDebug(s"Milli secs behind is ${records.getMillisBehindLatest.longValue()}") 200 | if ( _shardIterator == null ) { 201 | hasShardClosed = true 202 | fetchNext = false 203 | } 204 | if ( records.getMillisBehindLatest.longValue() == 0 ) { 205 | fetchNext = false 206 | } 207 | } 208 | else { 209 | // either we cannot fetch more records or ShardIterator was null 210 | fetchNext = false 211 | } 212 | } 213 | } 214 | 215 | if (fetchedRecords.length == 0) { 216 | finished = true 217 | null 218 | } 219 | else { 220 | val record: Record = fetchedRecords(currentIndex) 221 | currentIndex += 1 222 | numRecordRead +=1 223 | if (numRecordRead > maxRecordsPerShard) { 224 | fetchNext = false 225 | } 226 | lastReadSequenceNumber = record.getSequenceNumber 227 | record 228 | } 229 | } 230 | override protected def close(): Unit = synchronized { 231 | kinesisReader.close() 232 | } 233 | } 234 | 235 | lazy val metadataCommitter: MetadataCommitter[ShardInfo] = { 236 | metaDataCommitterType.toLowerCase(Locale.ROOT) match { 237 | case "hdfs" => new HDFSMetadataCommitter[ ShardInfo ]( 238 | metaDataCommitterPath, conf, sourceOptions) 239 | case _ => throw new IllegalArgumentException("only HDFS is supported") 240 | } 241 | } 242 | 243 | def metaDataCommitterType: String = { 244 | sourceOptions.getOrElse("executor.metadata.committer", "hdfs").toString 245 | } 246 | 247 | def metaDataCommitterPath: String = { 248 | sourceOptions.getOrElse("executor.metadata.path", metadataPath).toString 249 | } 250 | 251 | 252 | def updateMetadata(taskContext: TaskContext): Unit = { 253 | 254 | // if lastReadSequenceNumber exists, use AfterSequenceNumber for next Iterator 255 | // else use the same iterator information which was given to the RDD 256 | 257 | val shardInfo: ShardInfo = 258 | if 
(hasShardClosed) { 259 | new ShardInfo(sourcePartition.shardInfo.shardId, 260 | new ShardEnd()) 261 | } 262 | else if (!lastReadSequenceNumber.isEmpty) { 263 | new ShardInfo( 264 | sourcePartition.shardInfo.shardId, 265 | new AfterSequenceNumber(lastReadSequenceNumber)) 266 | } 267 | else { 268 | logInfo("No Records were processed in this batch") 269 | sourcePartition.shardInfo 270 | } 271 | logInfo(s"Batch $batchId : Committing End Shard position for $kinesisShardId") 272 | metadataCommitter.add(batchId, kinesisShardId, shardInfo) 273 | } 274 | 275 | // Release reader, either by removing it or indicating we're no longer using it 276 | context.addTaskCompletionListener [Unit]{ taskContext: TaskContext => 277 | logInfo("Task Completed") 278 | updateMetadata(taskContext) 279 | } 280 | 281 | underlying 282 | } 283 | 284 | } 285 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisWriteTask.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import java.nio.ByteBuffer 20 | 21 | import scala.util.Try 22 | 23 | import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult} 24 | import com.google.common.util.concurrent.{FutureCallback, Futures} 25 | 26 | import org.apache.spark.internal.Logging 27 | import org.apache.spark.sql.catalyst.InternalRow 28 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection} 29 | import org.apache.spark.sql.types.{BinaryType, StringType} 30 | 31 | private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String], 32 | inputSchema: Seq[Attribute]) extends Logging { 33 | 34 | private var producer: KinesisProducer = _ 35 | private val projection = createProjection 36 | private val streamName = producerConfiguration.getOrElse( 37 | KinesisSourceProvider.SINK_STREAM_NAME_KEY, "") 38 | 39 | private val flushWaitTimeMills = Try(producerConfiguration.getOrElse( 40 | KinesisSourceProvider.SINK_FLUSH_WAIT_TIME_MILLIS, 41 | KinesisSourceProvider.DEFAULT_FLUSH_WAIT_TIME_MILLIS).toLong).getOrElse { 42 | throw new IllegalArgumentException( 43 | s"${KinesisSourceProvider.SINK_FLUSH_WAIT_TIME_MILLIS} has to be a positive integer") 44 | } 45 | 46 | private var failedWrite: Throwable = _ 47 | 48 | def execute(iterator: Iterator[InternalRow]): Unit = { 49 | producer = CachedKinesisProducer.getOrCreate(producerConfiguration) 50 | while (iterator.hasNext && failedWrite == null) { 51 | val currentRow = iterator.next() 52 | val projectedRow = projection(currentRow) 53 | val partitionKey = projectedRow.getString(0) 54 | val data = projectedRow.getBinary(1) 55 | 56 | sendData(partitionKey, data) 57 | } 58 | } 59 | 60 | def sendData(partitionKey: String, data: Array[Byte]): String = { 61 | var sentSeqNumbers = new String 62 | 63 | val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) 64 | 65 | val kinesisCallBack = new FutureCallback[UserRecordResult]() { 66 | 67 | override def onFailure(t: Throwable): Unit = { 68 | if (failedWrite == null && t!= null) { 69 | failedWrite = t 70 | logError(s"Writing to $streamName failed due to ${t.getCause}") 71 | } 72 | } 73 | 74 | override def onSuccess(result: UserRecordResult): Unit = { 75 | val shardId = result.getShardId 76 | sentSeqNumbers = result.getSequenceNumber 77 | } 78 | } 79 | Futures.addCallback(future, kinesisCallBack) 80 | 81 | sentSeqNumbers 82 | } 83 | 84 | private def flushRecordsIfNecessary(): Unit = { 85 | if (producer != null) { 86 | while (producer.getOutstandingRecordsCount > 0) { 87 | try { 88 | producer.flush() 89 | Thread.sleep(flushWaitTimeMills) 90 | } catch { 91 | case e: InterruptedException => 92 | // Do Nothing 93 | } finally { 94 | checkForErrors() 95 | } 96 | } 97 | } 98 | } 99 | 100 | def checkForErrors(): Unit = { 101 | if (failedWrite != null) { 102 | throw failedWrite 103 | } 104 | } 105 | 106 | def close(): Unit = { 107 | checkForErrors() 108 | flushRecordsIfNecessary() 109 | checkForErrors() 110 | producer = null 111 | } 112 | 113 | private def createProjection: UnsafeProjection = { 114 | 115 | val partitionKeyExpression = inputSchema 116 | .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse( 117 | throw new IllegalStateException("Required attribute " + 118 | s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found")) 119 | 120 | partitionKeyExpression.dataType match { 121 | case StringType | BinaryType => // ok 122 | case t => 123 | throw new 
IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " + 124 | "attribute type must be a String or BinaryType") 125 | } 126 | 127 | val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse( 128 | throw new IllegalStateException("Required attribute " + 129 | s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found") 130 | ) 131 | 132 | dataExpression.dataType match { 133 | case StringType | BinaryType => // ok 134 | case t => 135 | throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " + 136 | "attribute type must be a String or BinaryType") 137 | } 138 | 139 | UnsafeProjection.create( 140 | Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema) 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} 23 | import org.apache.spark.util.Utils 24 | 25 | private[kinesis] object KinesisWriter extends Logging { 26 | 27 | val DATA_ATTRIBUTE_NAME: String = "data" 28 | val PARTITION_KEY_ATTRIBUTE_NAME: String = "partitionKey" 29 | 30 | override def toString: String = "KinesisWriter" 31 | 32 | def write(sparkSession: SparkSession, 33 | queryExecution: QueryExecution, 34 | kinesisParameters: Map[String, String]): Unit = { 35 | val schema = queryExecution.analyzed.output 36 | 37 | SQLExecution.withNewExecutionId(queryExecution) { 38 | queryExecution.toRdd.foreachPartition { iter => 39 | val writeTask = new KinesisWriteTask(kinesisParameters, schema) 40 | Utils.tryWithSafeFinally(block = writeTask.execute(iter))( 41 | finallyBlock = writeTask.close()) 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/MetadataCommitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | trait MetadataCommitter[T <: AnyRef] { 21 | // Functions that each committer needs to implement 22 | // This committer will be used by executors to push metadata related to Kinesis shards 23 | // Possible implementations are HDFS, DynamoDB, MySQL, etc. 24 | def add(batchId: Long, shardId: String, metadata: T): Boolean 25 | def get(batchId: Long): Seq[T] 26 | def purge(thresholdBatchId: Long): Unit 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/ShardSyncer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import com.amazonaws.services.kinesis.model.Shard 21 | import scala.collection.mutable 22 | 23 | import org.apache.spark.internal.Logging 24 | 25 | /* 26 | * Helper object to sync a batch with the shards of the Kinesis stream. 27 | * It will create new activities when it discovers new Kinesis shards (bootstrap/resharding).
28 | * It works in similar way as 29 | * com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShardSyncer in KCL 30 | */ 31 | 32 | private[kinesis] object ShardSyncer extends Logging { 33 | 34 | private def getShardIdToChildShardsMap(latestShards: Seq[Shard]): 35 | mutable.Map[String, List[String ]] = { 36 | val shardIdToChildShardsMap = mutable.Map.empty[String, List[String]] 37 | 38 | val shardIdToShardMap = 39 | latestShards.map { 40 | s => (s.getShardId -> s) 41 | }.toMap 42 | 43 | for ((shardId, shard) <- shardIdToShardMap) { 44 | val parentShardId: String = shard.getParentShardId 45 | if ( parentShardId != null && shardIdToShardMap.contains(parentShardId) ) { 46 | shardIdToChildShardsMap += ( 47 | parentShardId -> 48 | (shardId :: shardIdToChildShardsMap.get(parentShardId).getOrElse(Nil)) 49 | ) 50 | } 51 | 52 | val adjacentParentShardId: String = shard.getAdjacentParentShardId 53 | if ( adjacentParentShardId != null && shardIdToShardMap.contains(adjacentParentShardId) ) { 54 | shardIdToChildShardsMap += ( 55 | adjacentParentShardId -> 56 | (shardId :: shardIdToChildShardsMap.get(adjacentParentShardId).getOrElse(Nil)) 57 | ) 58 | } 59 | } 60 | // Assert that Parent Shards are closed 61 | shardIdToChildShardsMap.keySet.foreach { 62 | parentShardId => 63 | shardIdToShardMap.get(parentShardId) match { 64 | case None => 65 | throw new IllegalStateException(s"ShardId $parentShardId is not closed. " + 66 | s"This can happen due to a race condition between listShards and a" + 67 | s" reshard operation") 68 | case Some(parentShard: Shard) => 69 | if (parentShard.getSequenceNumberRange().getEndingSequenceNumber == null) { 70 | throw new IllegalStateException(s"ShardId $parentShardId is not closed. " + 71 | s"This can happen due to a race condition between listShards and a " + 72 | s"reshard operation") 73 | } 74 | } 75 | } 76 | shardIdToChildShardsMap 77 | } 78 | 79 | private[kinesis] def AddShardInfoForAncestors( 80 | shardId: String, 81 | latestShards: Seq[Shard], 82 | initialPosition: InitialKinesisPosition, 83 | prevShardsList: mutable.Set[ String ], 84 | newShardsInfoMap: mutable.HashMap[ String, ShardInfo ], 85 | memoizationContext: mutable.Map[String, Boolean ]): Unit = { 86 | 87 | val shardIdToShardMap = 88 | latestShards.map { 89 | s => (s.getShardId -> s) 90 | }.toMap 91 | 92 | if (!memoizationContext.contains(shardId) && 93 | shardId != null && shardIdToShardMap.contains(shardId) ) { 94 | if (prevShardsList.contains(shardId) ) { 95 | // we already have processed this shard in previous batch and added its ancestors 96 | memoizationContext.put(shardId, true) 97 | return 98 | } 99 | var shard = shardIdToShardMap.get(shardId).get 100 | // get parent of shards if exist 101 | var parentShardIds: mutable.HashSet[String] = getParentShardIds(shard, latestShards) 102 | for (parentShardId <- parentShardIds) { 103 | // Add ShardInfo of Parent's ancestors. 
104 | AddShardInfoForAncestors( parentShardId, 105 | latestShards, initialPosition, prevShardsList, 106 | newShardsInfoMap, memoizationContext) 107 | } 108 | // create shardInfo for its parent shards (if they don't exist) 109 | for (parentShardId <- parentShardIds) { 110 | if (!prevShardsList.contains(parentShardId) ) { 111 | logDebug("Need to create a shardInfo for shardId " + parentShardId) 112 | if (newShardsInfoMap.get(parentShardId).isEmpty) { 113 | newShardsInfoMap.put(parentShardId, 114 | new ShardInfo(parentShardId, initialPosition.shardPosition(parentShardId))) 115 | } 116 | } 117 | } 118 | memoizationContext.put(shardId, true) 119 | } 120 | } 121 | 122 | private[kinesis] def getParentShardIds( 123 | shard: Shard, 124 | shards: Seq[Shard]): mutable.HashSet[String] = { 125 | val parentShardIds = new mutable.HashSet[ String ] 126 | val parentShardId = shard.getParentShardId 127 | val shardIdToShardMap = 128 | shards.map { 129 | s => (s.getShardId -> s) 130 | }.toMap 131 | 132 | if ((parentShardId != null) && shardIdToShardMap.contains(parentShardId)) { 133 | parentShardIds.add(parentShardId) 134 | } 135 | val adjacentParentShardId = shard.getAdjacentParentShardId 136 | if ( (adjacentParentShardId != null) && shardIdToShardMap.contains(adjacentParentShardId)) { 137 | parentShardIds.add(adjacentParentShardId) 138 | } 139 | return parentShardIds 140 | } 141 | 142 | /* 143 | * Takes a sequence of Shard as input params 144 | * It iterate though each shards 145 | * and return a sequence of shard-ids of open Shards 146 | */ 147 | def openShards(shards: Seq[Shard]): Seq[String] = { 148 | // List of open Shards 149 | shards.collect { 150 | case s: Shard if (s.getSequenceNumberRange.getEndingSequenceNumber == null) => s.getShardId 151 | } 152 | } 153 | 154 | /* 155 | * Takes a sequence of Shard as input params 156 | * It iterate though each shards 157 | * and return a sequence of shard-ids of closed Shards 158 | */ 159 | 160 | def closedShards(shards: Seq[Shard]): Seq[String] = { 161 | // List of closed Shards 162 | shards.collect { 163 | case s: Shard if (s.getSequenceNumberRange.getEndingSequenceNumber != null) => s.getShardId 164 | } 165 | } 166 | 167 | def hasNewShards(latestShardsInfo: Seq[ShardInfo], 168 | prevShardsInfo: Seq[ShardInfo]): Boolean = { 169 | latestShardsInfo.foldLeft(false) { 170 | (hasNewShard, shardInfo) => 171 | if (!hasNewShard) { 172 | // Check only if hasNewShard is false 173 | prevShardsInfo.contains(shardInfo.shardId) 174 | } else { 175 | hasNewShard 176 | } 177 | } 178 | } 179 | 180 | def hasDeletedShards(latestShardsInfo: Seq[ShardInfo], 181 | prevShardsInfo: Seq[ShardInfo]): Boolean = { 182 | prevShardsInfo.foldLeft(false) { 183 | (hasDeletedShard, shardInfo) => 184 | if (!hasDeletedShard) { 185 | // Check only if hasDeletedShard is false 186 | latestShardsInfo.contains(shardInfo.shardId) 187 | } else { 188 | hasDeletedShard 189 | } 190 | } 191 | } 192 | 193 | def getLatestShardInfo( 194 | latestShards: Seq[Shard], 195 | prevShardsInfo: Seq[ShardInfo], 196 | initialPosition: InitialKinesisPosition, 197 | failOnDataLoss: Boolean = true): Seq[ShardInfo] = { 198 | 199 | if (latestShards.isEmpty) { 200 | return prevShardsInfo 201 | } 202 | var prevShardsList = new mutable.HashSet[String] 203 | var latestShardsList = new mutable.HashSet[String] 204 | prevShardsInfo.foreach { 205 | s: ShardInfo => prevShardsList.add(s.shardId) 206 | } 207 | latestShards.foreach { 208 | s: Shard => latestShardsList.add(s.getShardId) 209 | } 210 | // check for deleted shards 211 | val 
deletedShardsList = prevShardsList.diff(latestShardsList) 212 | val newShardsInfoMap = new mutable.HashMap[String, ShardInfo] 213 | val memoizationContext = new mutable.HashMap[ String, Boolean] 214 | 215 | // check for deleted Shards and update newShardInfo if failOnDataLoss is false 216 | if (deletedShardsList.nonEmpty) { 217 | if (failOnDataLoss) { 218 | throw new IllegalStateException( 219 | s""" 220 | | Some data may have been lost because ${deletedShardsList.toString()} 221 | | are not available in Kinesis any more. The shard has been deleted before 222 | | we have processed all records in it. If you do not want your streaming query 223 | | to fail on such cases, set the source option "failOnDataLoss" to "false" 224 | """.stripMargin 225 | ) 226 | } else { 227 | log.warn( 228 | s""" 229 | | Some data may have been lost because $deletedShardsList are not available in Kinesis 230 | | any more. The shard has been deleted before we have processed all records in it. 231 | | If you want your streaming query to fail on such cases, set the source option 232 | | "failOnDataLoss" to "true" 233 | """.stripMargin 234 | ) 235 | } 236 | } 237 | 238 | // filter the deleted shards 239 | var filteredPrevShardsInfo = prevShardsInfo.filter { 240 | s: ShardInfo => !deletedShardsList.contains(s.shardId) 241 | } 242 | 243 | // check for new shards and fetch ShardInfo for them 244 | openShards(latestShards).map { 245 | shardId: String => 246 | if (prevShardsList.contains(shardId)) { 247 | logDebug("Info for shardId " + shardId + " already exists") 248 | } 249 | else { 250 | AddShardInfoForAncestors(shardId, 251 | latestShards, initialPosition, prevShardsList, newShardsInfoMap, memoizationContext) 252 | newShardsInfoMap.put(shardId, 253 | new ShardInfo(shardId, initialPosition.shardPosition(shardId))) 254 | } 255 | } 256 | filteredPrevShardsInfo ++ newShardsInfoMap.values.toSeq 257 | } 258 | 259 | } 260 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/SparkAWSCredentials.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import com.amazonaws.auth._ 20 | 21 | import org.apache.spark.annotation.Evolving 22 | import org.apache.spark.internal.Logging 23 | 24 | /** 25 | * Serializable interface providing a method executors can call to obtain an 26 | * AWSCredentialsProvider instance for authenticating to AWS services. 
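 *
 * A minimal construction sketch (key values are placeholders, not real credentials):
 * {{{
 *   val creds: SparkAWSCredentials = SparkAWSCredentials.builder
 *     .basicCredentials("ACCESS-KEY-ID", "SECRET-KEY")
 *     .build()
 *   val provider = creds.provider  // AWSCredentialsProvider handed to the Kinesis clients
 * }}}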
27 | */ 28 | private[kinesis] sealed trait SparkAWSCredentials extends Serializable { 29 | /** 30 | * Return an AWSCredentialProvider instance that can be used by the Kinesis Client 31 | * Library to authenticate to AWS services (Kinesis, CloudWatch and DynamoDB). 32 | */ 33 | def provider: AWSCredentialsProvider 34 | } 35 | 36 | /** Returns DefaultAWSCredentialsProviderChain for authentication. */ 37 | private[kinesis] final case object DefaultCredentials extends SparkAWSCredentials { 38 | 39 | def provider: AWSCredentialsProvider = new DefaultAWSCredentialsProviderChain 40 | } 41 | 42 | /* 43 | * Returns AWSInstanceProfileCredentialsProviderWithRetries. 44 | */ 45 | 46 | private[kinesis] final case object InstanceProfileCredentials 47 | extends SparkAWSCredentials { 48 | def provider: AWSCredentialsProvider = new AWSInstanceProfileCredentialsProviderWithRetries 49 | } 50 | 51 | 52 | /** 53 | * Returns AWSStaticCredentialsProvider constructed using basic AWS keypair. Falls back to using 54 | * DefaultCredentialsProviderChain if unable to construct a AWSCredentialsProviderChain 55 | * instance with the provided arguments (e.g. if they are null). 56 | */ 57 | private[kinesis] final case class BasicCredentials( 58 | awsAccessKeyId: String, 59 | awsSecretKey: String) extends SparkAWSCredentials with Logging { 60 | 61 | def provider: AWSCredentialsProvider = try { 62 | new AWSStaticCredentialsProvider(new BasicAWSCredentials(awsAccessKeyId, awsSecretKey)) 63 | } catch { 64 | case e: IllegalArgumentException => 65 | logWarning("Unable to construct AWSStaticCredentialsProvider with provided keypair; " + 66 | "falling back to DefaultCredentialsProviderChain.", e) 67 | new DefaultAWSCredentialsProviderChain 68 | } 69 | } 70 | 71 | private[kinesis] final case class BasicAWSSessionCredentials( 72 | awsAccessKeyId: String, 73 | awsSecretKey: String, 74 | sessionToken: String) extends SparkAWSCredentials with Logging { 75 | 76 | def provider: AWSCredentialsProvider = try { 77 | new AWSStaticCredentialsProvider(new BasicSessionCredentials(awsAccessKeyId, awsSecretKey, sessionToken)) 78 | } catch { 79 | case e: IllegalArgumentException => 80 | logWarning("Unable to construct AWSStaticCredentialsProvider with provided keyparir; " + 81 | "falling back to DefaultCredentialsProviderChain.", e) 82 | new DefaultAWSCredentialsProviderChain 83 | } 84 | } 85 | 86 | /** 87 | * Returns an STSAssumeRoleSessionCredentialsProvider instance which assumes an IAM 88 | * role in order to authenticate against resources in an external account. 
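 *
 * Sketch of direct construction (the role ARN and session name are placeholders):
 * {{{
 *   val stsCreds = STSCredentials(
 *     stsRoleArn = "arn:aws:iam::123456789012:role/example-role",
 *     stsSessionName = "spark-kinesis-session",
 *     longLivedCreds = BasicCredentials("ACCESS-KEY-ID", "SECRET-KEY"))
 *   val provider = stsCreds.provider
 * }}}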
89 | */ 90 | private[kinesis] final case class STSCredentials( 91 | stsRoleArn: String, 92 | stsSessionName: String, 93 | stsExternalId: Option[String] = None, 94 | longLivedCreds: SparkAWSCredentials = DefaultCredentials) 95 | extends SparkAWSCredentials { 96 | 97 | def provider: AWSCredentialsProvider = { 98 | val builder = new STSAssumeRoleSessionCredentialsProvider.Builder(stsRoleArn, stsSessionName) 99 | .withLongLivedCredentialsProvider(longLivedCreds.provider) 100 | stsExternalId match { 101 | case Some(stsExternalId) => 102 | builder.withExternalId(stsExternalId) 103 | .build() 104 | case None => 105 | builder.build() 106 | } 107 | } 108 | } 109 | 110 | @Evolving 111 | object SparkAWSCredentials { 112 | 113 | @Evolving 114 | class Builder { 115 | private var basicCreds: Option[BasicCredentials] = None 116 | private var stsCreds: Option[STSCredentials] = None 117 | private var basicSessionCreds: Option[BasicSessionCredentials] = None 118 | 119 | // scalastyle:off 120 | /** 121 | * Use a basic AWS keypair for long-lived authorization. 122 | * 123 | * @note The given AWS keypair will be saved in DStream checkpoints if checkpointing is 124 | * enabled. Make sure that your checkpoint directory is secure. Prefer using the 125 | * [[http://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default default provider chain]] 126 | * instead if possible. 127 | * 128 | * @param accessKeyId AWS access key ID 129 | * @param secretKey AWS secret key 130 | * @return Reference to this [[SparkAWSCredentials.Builder]] 131 | */ 132 | // scalastyle:on 133 | def basicCredentials(accessKeyId: String, secretKey: String): Builder = { 134 | basicCreds = Option(BasicCredentials( 135 | awsAccessKeyId = accessKeyId, 136 | awsSecretKey = secretKey)) 137 | this 138 | } 139 | 140 | 141 | // scalastyle:off 142 | /** 143 | * Use a shortlived aws key pair plus security token for short-term authentication 144 | * 145 | * 146 | * @param accessKeyId AWS access key ID 147 | * @param secretKey AWS secret key 148 | * @param securityToken AWS Security Token 149 | * @return Reference to this [[SparkAWSCredentials.Builder]] 150 | */ 151 | // scalastyle:on 152 | def basicSessionCredentials(accessKeyId: String, secretKey: String, securityToken: String): Builder = { 153 | basicSessionCreds = Option(new BasicSessionCredentials( 154 | accessKeyId, 155 | secretKey, 156 | securityToken)) 157 | this 158 | } 159 | 160 | /** 161 | * Use STS to assume an IAM role for temporary session-based authentication. Will use configured 162 | * long-lived credentials for authorizing to STS itself (either the default provider chain 163 | * or a configured keypair). 164 | * 165 | * @param roleArn ARN of IAM role to assume via STS 166 | * @param sessionName Name to use for the STS session 167 | * @return Reference to this [[SparkAWSCredentials.Builder]] 168 | */ 169 | def stsCredentials(roleArn: String, sessionName: String): Builder = { 170 | stsCreds = Option(STSCredentials(stsRoleArn = roleArn, stsSessionName = sessionName)) 171 | this 172 | } 173 | 174 | /** 175 | * Use STS to assume an IAM role for temporary session-based authentication. Will use configured 176 | * long-lived credentials for authorizing to STS itself (either the default provider chain 177 | * or a configured keypair). STS will validate the provided external ID with the one defined 178 | * in the trust policy of the IAM role to be assumed (if one is present). 
179 | * 180 | * @param roleArn ARN of IAM role to assume via STS 181 | * @param sessionName Name to use for the STS session 182 | * @param externalId External ID to validate against assumed IAM role's trust policy 183 | * @return Reference to this [[SparkAWSCredentials.Builder]] 184 | */ 185 | def stsCredentials(roleArn: String, sessionName: String, externalId: String): Builder = { 186 | stsCreds = Option(STSCredentials( 187 | stsRoleArn = roleArn, 188 | stsSessionName = sessionName, 189 | stsExternalId = Option(externalId))) 190 | this 191 | } 192 | 193 | 194 | def build(): SparkAWSCredentials = 195 | stsCreds.map(_.copy(longLivedCreds = longLivedCreds)).getOrElse(longLivedCreds) 196 | 197 | private def longLivedCreds: SparkAWSCredentials = basicCreds.getOrElse(DefaultCredentials) 198 | } 199 | 200 | 201 | def builder: Builder = new Builder 202 | } 203 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | /** 19 | * Structured Streaming Data Source for Kinesis 20 | */ 21 | 22 | package org.apache.spark.sql.kinesis; 23 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark_project.jetty=WARN 28 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/HDFSMetaDataCommiterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io.File 21 | 22 | import scala.language.implicitConversions 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | 26 | import org.apache.spark.SparkFunSuite 27 | import org.apache.spark.sql.test.SharedSparkSession 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | 31 | class HDFSMetaDataCommiterSuite extends SparkFunSuite with SharedSparkSession { 32 | 33 | val testConf: Configuration = new Configuration() 34 | val serializedConf = new SerializableConfiguration(testConf) 35 | 36 | test("Add and Get operation") { 37 | withTempDir { temp => 38 | val dir = new File(temp, "commit") 39 | val metadataCommitter = new HDFSMetadataCommitter[String](dir.getAbsolutePath, serializedConf) 40 | assert(metadataCommitter.add(0, "Shard-000001", "foo")) 41 | assert(metadataCommitter.get(0) === Seq("foo")) 42 | 43 | assert(metadataCommitter.add(1, "Shard-000001", "one")) 44 | assert(metadataCommitter.add(1, "Shard-000002", "two")) 45 | assert(metadataCommitter.get(1).toSet === Set("one", "two")) 46 | 47 | // Adding the same batch over-writes the previous entry 48 | // This is required since re-attempt of a failed task will 49 | // update in same location 50 | assert(metadataCommitter.add(1, "Shard-000001", "updated-one")) 51 | assert(metadataCommitter.get(1).toSet === Set("updated-one", "two")) 52 | } 53 | } 54 | 55 | test("Purge operation") { 56 | withTempDir { temp => 57 | val metadataCommitter = new HDFSMetadataCommitter[String]( 58 | temp.getAbsolutePath, serializedConf) 59 | 60 | assert(metadataCommitter.add(0, "Shard-000001", "one")) 61 | assert(metadataCommitter.add(1, "Shard-000001", "two")) 62 | assert(metadataCommitter.add(2, "Shard-000001", "three")) 63 | 64 | assert(metadataCommitter.get(0).nonEmpty) 65 | assert(metadataCommitter.get(1).nonEmpty) 66 | 
assert(metadataCommitter.get(2).nonEmpty) 67 | 68 | metadataCommitter.purge(2) 69 | assertThrows[IllegalStateException](metadataCommitter.get(0)) 70 | assertThrows[IllegalStateException](metadataCommitter.get(1)) 71 | assert(metadataCommitter.get(2).nonEmpty) 72 | 73 | // There should be exactly one file, called "2", in the metadata directory. 74 | val allFiles = new File(metadataCommitter.metadataPath.toString).listFiles().toSeq 75 | assert(allFiles.size == 1) 76 | assert(allFiles.head.getName == "2") 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/KinesisPositionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.apache.spark.SparkFunSuite 21 | 22 | class KinesisPositionSuite extends SparkFunSuite { 23 | 24 | test("Fail on invalid kinesis source offset JSON") { 25 | assertThrows[IllegalArgumentException] { 26 | InitialKinesisPosition.fromCheckpointJson("""{"a":5}""", new TrimHorizon()) 27 | } 28 | } 29 | 30 | test("Construct initial position from KinesisSourceOffset JSON") { 31 | // Given 32 | val shard00 = new AfterSequenceNumber("111") 33 | val shard01 = new AfterSequenceNumber("222") 34 | val offset = KinesisSourceOffset( 35 | ShardOffsets( 36 | batchId = 5L, 37 | streamName = "my.stream", 38 | shardInfoMap = Map( 39 | "shardId-00" -> ShardInfo("shardId-00", shard00.iteratorType, shard00.iteratorPosition), 40 | "shardId-01" -> ShardInfo("shardId-01", shard01.iteratorType, shard01.iteratorPosition) 41 | ) 42 | ) 43 | ) 44 | val offsetJson = offset.json 45 | 46 | // When 47 | val initPos = InitialKinesisPosition.fromCheckpointJson(offsetJson, new TrimHorizon()) 48 | 49 | // Expected 50 | val shard00Result = initPos.shardPosition("shardId-00") 51 | assertResult(shard00Result.iteratorType)(shard00.iteratorType) 52 | assertResult(shard00Result.iteratorPosition)(shard00.iteratorPosition) 53 | 54 | val shard01Result = initPos.shardPosition("shardId-01") 55 | assertResult(shard01Result.iteratorType)(shard01.iteratorType) 56 | assertResult(shard01Result.iteratorPosition)(shard01.iteratorPosition) 57 | 58 | // Should give default position for a newly discovered shard 59 | val shard02Result = initPos.shardPosition("shardId-02") 60 | assertResult(shard02Result.iteratorType)(new TrimHorizon().iteratorType) 61 | assertResult(shard02Result.iteratorPosition)(new TrimHorizon().iteratorPosition) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- 
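The suite above exercises the same path that KinesisSourceProvider.getKinesisPosition takes when the startingPosition option is a checkpoint-style JSON rather than latest, trim_horizon or earliest. Below is a minimal sketch of that round trip; the stream and shard names are hypothetical, and the object must live in package org.apache.spark.sql.kinesis (as the test suites do) because these helpers are private[kinesis].

package org.apache.spark.sql.kinesis

object StartingPositionJsonSketch {
  def main(args: Array[String]): Unit = {
    // An offset meaning "resume shard 0 after sequence number 1234".
    val offset = KinesisSourceOffset(
      new ShardOffsets(1L, "my-stream",
        Array(ShardInfo("shardId-000000000000", "AFTER_SEQUENCE_NUMBER", "1234"))))

    // Its JSON form is what a caller could pass as the "startingPosition" option;
    // shards not present in the JSON fall back to the default position (TRIM_HORIZON here).
    val initPos = InitialKinesisPosition.fromCheckpointJson(offset.json, new TrimHorizon())
    println(initPos.shardPosition("shardId-000000000000").iteratorType)  // AFTER_SEQUENCE_NUMBER
    println(initPos.shardPosition("shardId-000000000999").iteratorType)  // TRIM_HORIZON
  }
}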
/src/test/scala/org/apache/spark/sql/kinesis/KinesisReaderSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | 19 | package org.apache.spark.sql.kinesis 20 | 21 | import scala.util.Try 22 | 23 | import org.scalatest.PrivateMethodTester 24 | 25 | import org.apache.spark.SparkException 26 | import org.apache.spark.sql.kinesis.KinesisTestUtils.{envVarNameForEnablingTests, shouldRunTests} 27 | import org.apache.spark.sql.test.SharedSparkSession 28 | 29 | class KinesisReaderSuite extends SharedSparkSession with PrivateMethodTester { 30 | 31 | protected var testUtils: KinesisTestUtils = _ 32 | 33 | /** Run the test if environment variable is set or ignore the test */ 34 | def testIfEnabled(testName: String)(testBody: => Unit) { 35 | if (shouldRunTests) { 36 | test(testName)(testBody) 37 | } else { 38 | ignore(s"$testName [enable by setting env var $envVarNameForEnablingTests=1]")(testBody) 39 | } 40 | } 41 | 42 | test("Should throw exception when there is no InstanceProfile") { 43 | val ex = intercept[ SparkException ] { 44 | val kinesisReader = 45 | new KinesisReader( 46 | Map.empty[String, String], 47 | "Test", 48 | InstanceProfileCredentials, 49 | KinesisTestUtils.endpointUrl 50 | ) 51 | kinesisReader.getShards() 52 | } 53 | } 54 | 55 | test("Should throw exception when STSCredentials are incorrect") { 56 | val ex = intercept[ SparkException ] { 57 | val kinesisReader = new KinesisReader( 58 | Map.empty[ String, String], 59 | "Test", 60 | STSCredentials("role-arn", "session-name"), 61 | KinesisTestUtils.endpointUrl) 62 | kinesisReader.getShards() 63 | } 64 | } 65 | 66 | test("Should throw exception when BasicCredentials are incorrect") { 67 | val ex = intercept[ SparkException ] { 68 | val kinesisReader = 69 | new KinesisReader( 70 | Map.empty[String, String], 71 | "Test", 72 | BasicCredentials("access-key", "secret-key"), 73 | KinesisTestUtils.endpointUrl 74 | ) 75 | kinesisReader.getShards() 76 | } 77 | } 78 | 79 | testIfEnabled("Should succeed for valid Credentials") { 80 | Try { 81 | val kinesisReader = 82 | new KinesisReader( 83 | Map.empty[String, String], 84 | "Test", 85 | BasicCredentials( 86 | KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 87 | KinesisTestUtils.getAWSCredentials().getAWSSecretKey 88 | ), 89 | KinesisTestUtils.endpointUrl 90 | ) 91 | kinesisReader.getShards() 92 | }.isSuccess 93 | } 94 | 95 | testIfEnabled("getShardIterator should return null when shard-id is incorrect" + 96 | " and failOnDataLoss is false") { 97 | val kinesisReader = 98 | new KinesisReader( 99 | Map.empty[String, String], 100 | "Test", 101 | BasicCredentials( 102 | 
KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 103 | KinesisTestUtils.getAWSCredentials().getAWSSecretKey 104 | ), 105 | KinesisTestUtils.endpointUrl 106 | ) 107 | val shardIterator = kinesisReader.getShardIterator("BAD-SHARD-ID", "LATEST", 108 | "", false) 109 | assert(shardIterator === null) 110 | } 111 | 112 | testIfEnabled("getShardIterator should throw exception when shard-id is incorrect" + 113 | " and failOnDataLoss is true") { 114 | val ex = intercept[ SparkException ] { 115 | val kinesisReader = 116 | new KinesisReader( 117 | Map.empty[String, String], 118 | "Test", 119 | BasicCredentials( 120 | KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 121 | KinesisTestUtils.getAWSCredentials().getAWSSecretKey 122 | ), 123 | KinesisTestUtils.endpointUrl 124 | ) 125 | val shardIterator = kinesisReader.getShardIterator("BAD-SHARD-ID", "LATEST", 126 | "", true) 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/KinesisSinkSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import java.util.Locale 20 | 21 | import org.scalatest.concurrent.PatienceConfiguration.Timeout 22 | 23 | import org.apache.spark.sql.{DataFrame, Row} 24 | import org.apache.spark.sql.execution.streaming.MemoryStream 25 | import org.apache.spark.sql.kinesis.KinesisTestUtils.{envVarNameForEnablingTests, shouldRunTests} 26 | import org.apache.spark.sql.streaming._ 27 | import org.apache.spark.sql.streaming.util.StreamManualClock 28 | import org.apache.spark.sql.test.SharedSparkSession 29 | 30 | abstract class KinesisSinkTest extends StreamTest with SharedSparkSession { 31 | 32 | protected var testUtils: KinesisTestUtils = _ 33 | 34 | override def beforeAll(): Unit = { 35 | super.beforeAll() 36 | 37 | testUtils = new KPLBasedKinesisTestUtils(1) 38 | testUtils.createStream() 39 | } 40 | 41 | override def afterAll(): Unit = { 42 | if (testUtils != null) { 43 | testUtils.deleteStream() 44 | testUtils = null 45 | super.afterAll() 46 | } 47 | } 48 | 49 | /** Run the test if environment variable is set or ignore the test */ 50 | def testIfEnabled(testName: String)(testBody: => Unit) { 51 | if (shouldRunTests) { 52 | test(testName)(testBody) 53 | } else { 54 | ignore(s"$testName [enable by setting env var $envVarNameForEnablingTests=1]")(testBody) 55 | } 56 | } 57 | 58 | /** Run the give body of code only if Kinesis tests are enabled */ 59 | def runIfTestsEnabled(message: String)(body: => Unit): Unit = { 60 | if (shouldRunTests) { 61 | body 62 | } else { 63 | ignore(s"$message [enable by setting env var $envVarNameForEnablingTests=1]")(()) 64 | } 65 | } 66 | 67 | } 68 | 69 | class KinesisSinkOptionsSuite extends StreamTest with SharedSparkSession { 70 | 71 | test("bad source options") { 72 | def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { 73 | val ex = intercept[IllegalArgumentException] { 74 | val reader = spark.readStream.format("kinesis") 75 | options.foreach { case (k, v) => reader.option(k, v) } 76 | reader.load() 77 | } 78 | expectedMsgs.foreach { m => 79 | assert(ex.getMessage.toLowerCase(Locale.ROOT).contains(m.toLowerCase(Locale.ROOT))) 80 | } 81 | } 82 | 83 | testBadOptions()("Stream name is a required field") 84 | testBadOptions("streamname" -> "")("Stream name is a required field") 85 | } 86 | } 87 | 88 | class KinesisSinkSuite extends KinesisSinkTest { 89 | 90 | import testImplicits._ 91 | 92 | testIfEnabled("Test write data with bad schema") { 93 | val input = MemoryStream[String] 94 | var writer: StreamingQuery = null 95 | var ex: Exception = null 96 | 97 | val options = Map[String, String]( 98 | "streamName" -> testUtils.streamName, 99 | "endpointUrl" -> testUtils.endpointUrl, 100 | "AWSAccessKeyId" -> KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 101 | "AWSSecretKey" -> KinesisTestUtils.getAWSCredentials().getAWSSecretKey 102 | ) 103 | 104 | try { 105 | ex = intercept[StreamingQueryException] { 106 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 107 | withSelectExpr = "value as partitionKey", "value" 108 | ) 109 | input.addData("1", "2", "3", "4", "5") 110 | writer.processAllAvailable() 111 | } 112 | } finally { 113 | if (writer != null) { 114 | writer.stop() 115 | } 116 | } 117 | assert(ex.getMessage 118 | .toLowerCase(Locale.ROOT) 119 | .contains("required attribute 'data' not found")) 120 | 121 | try { 122 | ex = intercept[StreamingQueryException] { 123 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 124 | withSelectExpr = "value as 
data", "value" 125 | ) 126 | input.addData("1", "2", "3", "4", "5") 127 | writer.processAllAvailable() 128 | } 129 | } finally { 130 | if (writer != null) { 131 | writer.stop() 132 | } 133 | } 134 | assert(ex.getMessage 135 | .toLowerCase(Locale.ROOT) 136 | .contains("required attribute 'partitionkey' not found")) 137 | } 138 | 139 | testIfEnabled("Test write data with valid schema but wrong types") { 140 | val options = Map[String, String]( 141 | "streamName" -> testUtils.streamName, 142 | "endpointUrl" -> testUtils.endpointUrl, 143 | "AWSAccessKeyId" -> KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 144 | "AWSSecretKey" -> KinesisTestUtils.getAWSCredentials().getAWSSecretKey 145 | ) 146 | 147 | val input = MemoryStream[String] 148 | var writer: StreamingQuery = null 149 | var ex: Exception = null 150 | try { 151 | /* partitionKey field wrong type */ 152 | ex = intercept[StreamingQueryException] { 153 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 154 | withSelectExpr = s"CAST('1' as INT) as partitionKey", "value as data" 155 | ) 156 | input.addData("1", "2", "3", "4", "5") 157 | writer.processAllAvailable() 158 | } 159 | } finally { 160 | if (writer != null) { 161 | writer.stop() 162 | } 163 | } 164 | assert(ex.getMessage.toLowerCase(Locale.ROOT) 165 | .contains("partitionkey attribute type must be a string or binarytype")) 166 | 167 | try { 168 | /* data field wrong type */ 169 | ex = intercept[StreamingQueryException] { 170 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 171 | withSelectExpr = "value as partitionKey", "CAST(value as INT) as data" 172 | ) 173 | input.addData("1", "2", "3", "4", "5") 174 | writer.processAllAvailable() 175 | } 176 | } finally { 177 | if (writer != null) { 178 | writer.stop() 179 | } 180 | } 181 | assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( 182 | "data attribute type must be a string or binarytype")) 183 | } 184 | 185 | testIfEnabled("Test write data to Kinesis") { 186 | val clock = new StreamManualClock 187 | 188 | val waitUntilBatchProcessed = AssertOnQuery { q => 189 | eventually(Timeout(streamingTimeout)) { 190 | if (!q.exception.isDefined) { 191 | assert(clock.isStreamWaitingAt(clock.getTimeMillis())) 192 | } 193 | } 194 | if (q.exception.isDefined) { 195 | throw q.exception.get 196 | } 197 | true 198 | } 199 | var writer: StreamingQuery = null 200 | 201 | val input = MemoryStream[String] 202 | val writerOptions = Map[String, String]( 203 | "streamName" -> testUtils.streamName, 204 | "endpointUrl" -> testUtils.endpointUrl, 205 | "AWSAccessKeyId" -> KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 206 | "AWSSecretKey" -> KinesisTestUtils.getAWSCredentials().getAWSSecretKey 207 | ) 208 | 209 | val reader = createKinesisReader() 210 | .selectExpr("CAST(data AS STRING)") 211 | .as[String].map(_.toInt) 212 | 213 | try { 214 | writer = createKinesisWriter(input.toDF(), withOptions = writerOptions)( 215 | withSelectExpr = s"CAST('1' as STRING) as partitionKey", "value as data") 216 | input.addData("1", "2", "3", "4", "5") 217 | 218 | testStream(reader)( 219 | StartStream(Trigger.ProcessingTime(100), clock), 220 | waitUntilBatchProcessed, 221 | AssertOnQuery { query => 222 | logInfo("Pushing Data ") 223 | writer.processAllAvailable() 224 | true 225 | }, 226 | AdvanceManualClock(100), 227 | waitUntilBatchProcessed, 228 | CheckAnswer(1, 2, 3, 4, 5) 229 | ) 230 | 231 | } finally { 232 | if (writer != null) { 233 | writer.stop() 234 | } 235 | } 236 | } 237 | 238 | private def 
createKinesisReader(): DataFrame = { 239 | spark.readStream 240 | .format("kinesis") 241 | .option("streamName", testUtils.streamName) 242 | .option("endpointUrl", testUtils.endpointUrl) 243 | .option("AWSAccessKeyId", KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId) 244 | .option("AWSSecretKey", KinesisTestUtils.getAWSCredentials().getAWSSecretKey) 245 | .load 246 | } 247 | 248 | private def createKinesisWriter(input: DataFrame, 249 | withOutputMode: Option[OutputMode] = None, 250 | withOptions: Map[String, String] = Map[String, String]()) 251 | (withSelectExpr: String*): StreamingQuery = { 252 | var stream: DataStreamWriter[Row] = null 253 | withTempDir { checkpointDir => 254 | var df = input.toDF() 255 | if (withSelectExpr.nonEmpty) { 256 | df = df.selectExpr(withSelectExpr: _*) 257 | } 258 | stream = df.writeStream 259 | .format("kinesis") 260 | .option("checkpointLocation", checkpointDir.getCanonicalPath) 261 | .queryName("kinesisStream") 262 | withOutputMode.foreach(stream.outputMode(_)) 263 | withOptions.foreach(opt => stream.option(opt._1, opt._2)) 264 | } 265 | stream.start() 266 | } 267 | } -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/KinesisSourceOffsetSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io.File 21 | 22 | import org.apache.spark.sql.execution.streaming._ 23 | import org.apache.spark.sql.streaming.OffsetSuite 24 | import org.apache.spark.sql.test.SharedSparkSession 25 | 26 | 27 | class KinesisSourceOffsetSuite extends OffsetSuite with SharedSparkSession { 28 | 29 | 30 | compare( 31 | one = KinesisSourceOffset(new ShardOffsets(-1L, "dummy", Array.empty[ShardInfo])), 32 | two = KinesisSourceOffset(new ShardOffsets(1L, "dummy", Array.empty[ShardInfo]))) 33 | 34 | compare( 35 | one = KinesisSourceOffset(new ShardOffsets(1L, "foo", Array.empty[ShardInfo])), 36 | two = KinesisSourceOffset(new ShardOffsets(1L, "bar", Array.empty[ShardInfo])) 37 | ) 38 | 39 | compare( 40 | one = KinesisSourceOffset(new ShardOffsets(1L, "foo", Array( 41 | new ShardInfo("shard-001", new TrimHorizon())))), 42 | two = KinesisSourceOffset(new ShardOffsets(1L, "foo", 43 | Array(new ShardInfo("shard-001", new TrimHorizon()), 44 | new ShardInfo("shard-002", new TrimHorizon()) ))) 45 | ) 46 | var shardInfo1 = Array.empty[ShardInfo] 47 | shardInfo1 = shardInfo1 ++ Array(ShardInfo("shard-001", "AFTER_SEQUENCE_NUMBER", "1234")) 48 | 49 | val kso1 = KinesisSourceOffset( 50 | new ShardOffsets(1L, "foo", shardInfo1)) 51 | 52 | val shardInfo2 = shardInfo1 ++ Array(ShardInfo("shard-002", "TRIM_HORIZON", "")) 53 | val kso2 = KinesisSourceOffset( 54 | new ShardOffsets(1L, "bar", shardInfo2)) 55 | 56 | val shardInfo3 = shardInfo2 ++ Array(ShardInfo("shard-003", "AFTER_SEQUENCE_NUMBER", "2342")) 57 | val kso3 = KinesisSourceOffset( 58 | new ShardOffsets(1L, "bar", shardInfo3) 59 | ) 60 | 61 | compare(KinesisSourceOffset(SerializedOffset(kso1.json)), kso2) 62 | 63 | test("basic serialization - deserialization") { 64 | assert(KinesisSourceOffset.getShardOffsets(kso1) == 65 | KinesisSourceOffset.getShardOffsets(SerializedOffset(kso1.json))) 66 | } 67 | 68 | test("OffsetSeqLog serialization - deserialization") { 69 | withTempDir { temp => 70 | // use non-existent directory to test whether log make the dir 71 | val dir = new File(temp, "dir") 72 | val metadataLog = new OffsetSeqLog(spark, dir.getAbsolutePath) 73 | val batch0 = OffsetSeq.fill(kso1) 74 | val batch1 = OffsetSeq.fill(kso2, kso3) 75 | 76 | val batch0Serialized = OffsetSeq.fill(batch0.offsets.flatMap(_.map(o => 77 | SerializedOffset(o.json))): _*) 78 | 79 | val batch1Serialized = OffsetSeq.fill(batch1.offsets.flatMap(_.map(o => 80 | SerializedOffset(o.json))): _*) 81 | 82 | assert(metadataLog.add(0, batch0)) 83 | assert(metadataLog.getLatest() === Some(0 -> batch0Serialized)) 84 | assert(metadataLog.get(0) === Some(batch0Serialized)) 85 | 86 | assert(metadataLog.add(1, batch1)) 87 | assert(metadataLog.get(0) === Some(batch0Serialized)) 88 | assert(metadataLog.get(1) === Some(batch1Serialized)) 89 | assert(metadataLog.getLatest() === Some(1 -> batch1Serialized)) 90 | assert(metadataLog.get(None, Some(1)) === 91 | Array(0 -> batch0Serialized, 1 -> batch1Serialized)) 92 | 93 | // Adding the same batch does nothing 94 | metadataLog.add(1, OffsetSeq.fill(LongOffset(3))) 95 | assert(metadataLog.get(0) === Some(batch0Serialized)) 96 | assert(metadataLog.get(1) === Some(batch1Serialized)) 97 | assert(metadataLog.getLatest() === Some(1 -> batch1Serialized)) 98 | assert(metadataLog.get(None, Some(1)) === 99 | Array(0 -> batch0Serialized, 1 -> batch1Serialized)) 100 | } 101 | } 102 | 103 | 104 | } 105 | -------------------------------------------------------------------------------- 
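For readers following the offset-log tests in KinesisSourceOffsetSuite above, here is a minimal, self-contained sketch of the same JSON round trip outside the test harness. It is not part of the repository: the object name OffsetRoundTripSketch and the shard ids are invented, and the object is placed in the org.apache.spark.sql.kinesis package on the assumption that the offset classes exercised by the suite are package-private; it only reuses the constructors and methods the suite itself calls.

package org.apache.spark.sql.kinesis

import org.apache.spark.sql.execution.streaming.SerializedOffset

// Hypothetical sketch: build a KinesisSourceOffset, serialize it to the JSON
// form Spark writes into its offset log, and parse it back, mirroring the
// "basic serialization - deserialization" test above.
object OffsetRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val shards = Array(
      ShardInfo("shard-001", "AFTER_SEQUENCE_NUMBER", "1234"),
      ShardInfo("shard-002", "TRIM_HORIZON", ""))
    val offset = KinesisSourceOffset(new ShardOffsets(1L, "foo", shards))

    // The JSON string is what ends up in the checkpoint's offset log.
    val json = offset.json

    // Reading it back through SerializedOffset should recover the same
    // per-shard positions, exactly as the suite asserts.
    val restored = KinesisSourceOffset.getShardOffsets(SerializedOffset(json))
    assert(restored == KinesisSourceOffset.getShardOffsets(offset))
    println(json)
  }
}

This round trip is what lets a restarted query resume from the shard positions recorded in its checkpoint rather than reprocessing the stream from scratch.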
/src/test/scala/org/apache/spark/sql/kinesis/KinesisTestUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.nio.ByteBuffer 21 | import java.nio.charset.StandardCharsets 22 | import java.util.concurrent.TimeUnit 23 | 24 | import scala.collection.JavaConverters._ 25 | import scala.collection.mutable 26 | import scala.collection.mutable.ArrayBuffer 27 | import scala.util.{Failure, Random, Success, Try} 28 | 29 | import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} 30 | import com.amazonaws.regions.RegionUtils 31 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient 32 | import com.amazonaws.services.dynamodbv2.document.DynamoDB 33 | import com.amazonaws.services.kinesis.{AmazonKinesis, AmazonKinesisClient} 34 | import com.amazonaws.services.kinesis.model._ 35 | import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult} 36 | import com.google.common.util.concurrent.{FutureCallback, Futures} 37 | 38 | import org.apache.spark.internal.Logging 39 | 40 | private[kinesis] class KinesisTestUtils(streamShardCount: Int = 2) extends Logging { 41 | 42 | val endpointUrl = KinesisTestUtils.endpointUrl 43 | val regionName = KinesisTestUtils.getRegionNameByEndpoint(endpointUrl) 44 | 45 | private val createStreamTimeoutSeconds = 300 46 | private val describeStreamPollTimeSeconds = 1 47 | 48 | @volatile 49 | private var streamCreated = false 50 | 51 | @volatile 52 | private var _streamName: String = _ 53 | 54 | protected lazy val kinesisClient = { 55 | val client = new AmazonKinesisClient(KinesisTestUtils.getAWSCredentials()) 56 | client.setEndpoint(endpointUrl) 57 | client 58 | } 59 | 60 | /* 61 | private lazy val dynamoDB = { 62 | val dynamoDBClient = new AmazonDynamoDBClient(new DefaultAWSCredentialsProviderChain()) 63 | dynamoDBClient.setRegion(RegionUtils.getRegion(regionName)) 64 | new DynamoDB(dynamoDBClient) 65 | } 66 | */ 67 | 68 | protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { 69 | if (!aggregate) { 70 | new SimpleDataGenerator(kinesisClient) 71 | } else { 72 | throw new UnsupportedOperationException("Aggregation is not supported through this code path") 73 | } 74 | } 75 | 76 | 77 | def streamName: String = { 78 | require(streamCreated, "Stream not yet created, call createStream() to create one") 79 | _streamName 80 | } 81 | 82 | def createStream(): Unit = { 83 | require(!streamCreated, "Stream already created") 84 | _streamName = findNonExistentStreamName() 85 | 86 | // Create a stream. 
The number of shards determines the provisioned throughput. 87 | logInfo(s"Creating stream ${_streamName}") 88 | val createStreamRequest = new CreateStreamRequest() 89 | createStreamRequest.setStreamName(_streamName) 90 | createStreamRequest.setShardCount(streamShardCount) 91 | kinesisClient.createStream(createStreamRequest) 92 | 93 | // The stream is now being created. Wait for it to become active. 94 | waitForStreamToBeActive(_streamName) 95 | streamCreated = true 96 | logInfo(s"Created stream ${_streamName}") 97 | } 98 | 99 | def getShards(): Seq[Shard] = { 100 | kinesisClient.describeStream(_streamName).getStreamDescription.getShards.asScala 101 | } 102 | 103 | def splitShard(shardId: String): Unit = { 104 | val splitShardRequest = new SplitShardRequest() 105 | splitShardRequest.withStreamName(_streamName) 106 | splitShardRequest.withShardToSplit(shardId) 107 | // Set a half of the max hash value 108 | splitShardRequest.withNewStartingHashKey("170141183460469231731687303715884105728") 109 | kinesisClient.splitShard(splitShardRequest) 110 | // Wait for the shards to become active 111 | waitForStreamToBeActive(_streamName) 112 | } 113 | 114 | def splitShard : (Integer, Integer) = { 115 | val shardToSplit = getShards().head 116 | splitShard(shardToSplit.getShardId) 117 | val (splitOpenShards, splitCloseShards) = getShards().partition { 118 | shard => shard.getSequenceNumberRange.getEndingSequenceNumber == null 119 | } 120 | (splitOpenShards.size, splitCloseShards.size) 121 | } 122 | 123 | def mergeShard(shardToMerge: String, adjacentShardToMerge: String): Unit = { 124 | val mergeShardRequest = new MergeShardsRequest 125 | mergeShardRequest.withStreamName(_streamName) 126 | mergeShardRequest.withShardToMerge(shardToMerge) 127 | mergeShardRequest.withAdjacentShardToMerge(adjacentShardToMerge) 128 | kinesisClient.mergeShards(mergeShardRequest) 129 | // Wait for the shards to become active 130 | waitForStreamToBeActive(_streamName) 131 | } 132 | 133 | 134 | def mergeShard: (Integer, Integer) = { 135 | val (openShard, closeShard) = getShards().partition { 136 | shard => shard.getSequenceNumberRange.getEndingSequenceNumber == null 137 | } 138 | val Seq(shardToMerge, adjShard) = openShard 139 | mergeShard(shardToMerge.getShardId, adjShard.getShardId) 140 | val shardToSplit = getShards().head 141 | val (mergedOpenShards, mergedCloseShards) = 142 | getShards().partition { 143 | shard => shard.getSequenceNumberRange.getEndingSequenceNumber == null 144 | } 145 | (mergedOpenShards.size, mergedCloseShards.size) 146 | } 147 | 148 | /** 149 | * Push data to Kinesis stream and return a map of 150 | * shardId -> seq of (data, seq number) pushed to corresponding shard 151 | */ 152 | def pushData(testData: Array[String], aggregate: Boolean): Map[String, Seq[(String, String)]] = { 153 | require(streamCreated, "Stream not yet created, call createStream() to create one") 154 | val producer = getProducer(aggregate) 155 | val shardIdToSeqNumbers = producer.sendData(streamName, testData) 156 | logInfo(s"Pushed $testData:\n\t ${shardIdToSeqNumbers.mkString("\n\t")}") 157 | shardIdToSeqNumbers 158 | } 159 | 160 | /** 161 | * Expose a Python friendly API. 
162 | */ 163 | def pushData(testData: java.util.List[String]): Unit = { 164 | pushData(testData.asScala.toArray, aggregate = false) 165 | } 166 | 167 | def deleteStream(): Unit = { 168 | try { 169 | if (streamCreated) { 170 | kinesisClient.deleteStream(streamName) 171 | } 172 | } catch { 173 | case e: Exception => 174 | logWarning(s"Could not delete stream $streamName") 175 | } 176 | } 177 | 178 | /* 179 | def deleteDynamoDBTable(tableName: String): Unit = { 180 | try { 181 | val table = dynamoDB.getTable(tableName) 182 | table.delete() 183 | table.waitForDelete() 184 | } catch { 185 | case e: Exception => 186 | logWarning(s"Could not delete DynamoDB table $tableName") 187 | } 188 | } 189 | */ 190 | 191 | private def describeStream(streamNameToDescribe: String): Option[StreamDescription] = { 192 | try { 193 | val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe) 194 | val desc = kinesisClient.describeStream(describeStreamRequest).getStreamDescription() 195 | Some(desc) 196 | } catch { 197 | case rnfe: ResourceNotFoundException => 198 | None 199 | } 200 | } 201 | 202 | private def findNonExistentStreamName(): String = { 203 | var testStreamName: String = null 204 | do { 205 | Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds)) 206 | testStreamName = s"KinesisTestUtils-${math.abs(Random.nextLong())}" 207 | } while (describeStream(testStreamName).nonEmpty) 208 | testStreamName 209 | } 210 | 211 | private def waitForStreamToBeActive(streamNameToWaitFor: String): Unit = { 212 | val startTime = System.currentTimeMillis() 213 | val endTime = startTime + TimeUnit.SECONDS.toMillis(createStreamTimeoutSeconds) 214 | while (System.currentTimeMillis() < endTime) { 215 | Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds)) 216 | describeStream(streamNameToWaitFor).foreach { description => 217 | val streamStatus = description.getStreamStatus() 218 | logDebug(s"\t- current state: $streamStatus\n") 219 | if ("ACTIVE".equals(streamStatus)) { 220 | return 221 | } 222 | } 223 | } 224 | require(false, s"Stream $streamName never became active") 225 | } 226 | } 227 | 228 | private[kinesis] object KinesisTestUtils { 229 | 230 | val envVarNameForEnablingTests = "ENABLE_KINESIS_SQL_TESTS" 231 | val endVarNameForEndpoint = "KINESIS_TEST_ENDPOINT_URL" 232 | val defaultEndpointUrl = "https://kinesis.us-east-1.amazonaws.com" 233 | val regionName: String = getRegionNameByEndpoint(endpointUrl) 234 | 235 | def getRegionNameByEndpoint(endpoint: String): String = { 236 | val uri = new java.net.URI(endpoint) 237 | RegionUtils.getRegionsForService(AmazonKinesis.ENDPOINT_PREFIX) 238 | .asScala 239 | .find(_.getAvailableEndpoints.asScala.toSeq.contains(uri.getHost)) 240 | .map(_.getName) 241 | .getOrElse( 242 | throw new IllegalArgumentException(s"Could not resolve region for endpoint: $endpoint")) 243 | } 244 | 245 | lazy val shouldRunTests = { 246 | val isEnvSet = sys.env.get(envVarNameForEnablingTests) == Some("1") 247 | if (isEnvSet) { 248 | // scalastyle:off println 249 | // Print this so that they are easily visible on the console and not hidden in the log4j logs. 250 | println( 251 | s""" 252 | |Kinesis tests that actually send data has been enabled by setting the environment 253 | |variable $envVarNameForEnablingTests to 1. This will create Kinesis Streams 254 | |in AWS. Please be aware that this may incur some AWS costs. 255 | |By default, the tests use the endpoint URL $defaultEndpointUrl to create Kinesis streams. 
256 | |To change this endpoint URL to a different region, you can set the environment variable 257 | |$endVarNameForEndpoint to the desired endpoint URL 258 | |(e.g. $endVarNameForEndpoint="https://kinesis.us-west-2.amazonaws.com"). 259 | """.stripMargin) 260 | // scalastyle:on println 261 | } 262 | isEnvSet 263 | } 264 | 265 | lazy val endpointUrl = { 266 | val url = sys.env.getOrElse(endVarNameForEndpoint, defaultEndpointUrl) 267 | // scalastyle:off println 268 | // Print this so that they are easily visible on the console and not hidden in the log4j logs. 269 | println(s"Using endpoint URL $url for creating Kinesis streams for tests.") 270 | // scalastyle:on println 271 | url 272 | } 273 | 274 | def isAWSCredentialsPresent: Boolean = { 275 | Try { new DefaultAWSCredentialsProviderChain().getCredentials() }.isSuccess 276 | } 277 | 278 | def getAWSCredentials(): AWSCredentials = { 279 | assert(shouldRunTests, 280 | "Kinesis test not enabled, should not attempt to get AWS credentials") 281 | Try { new DefaultAWSCredentialsProviderChain().getCredentials() } match { 282 | case Success(cred) => 283 | cred 284 | case Failure(e) => 285 | throw new Exception( 286 | s""" 287 | |Kinesis tests enabled using environment variable $envVarNameForEnablingTests 288 | |but could not find AWS credentials. Please follow instructions in AWS documentation 289 | |to set the credentials in your system such that the DefaultAWSCredentialsProviderChain 290 | |can find the credentials. 291 | """.stripMargin) 292 | } 293 | } 294 | } 295 | 296 | /** A wrapper interface that will allow us to consolidate the code for synthetic data generation. */ 297 | private[kinesis] trait KinesisDataGenerator { 298 | /** Sends the data to Kinesis and returns the metadata for everything that has been sent. */ 299 | def sendData(streamName: String, data: Array[String]): Map[String, Seq[(String, String)]] 300 | } 301 | 302 | private[kinesis] class SimpleDataGenerator( 303 | client: AmazonKinesisClient) extends KinesisDataGenerator { 304 | override def sendData(streamName: String, data: Array[String]): 305 | Map[String, Seq[(String, String)]] = { 306 | val shardIdToSeqNumbers = 307 | new mutable.HashMap[String, ArrayBuffer[(String, String)]]() 308 | data.foreach { num => 309 | val str = num.toString 310 | val data = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8)) 311 | val putRecordRequest = new PutRecordRequest().withStreamName(streamName) 312 | .withData(data) 313 | .withPartitionKey(str) 314 | 315 | val putRecordResult = client.putRecord(putRecordRequest) 316 | val shardId = putRecordResult.getShardId 317 | val seqNumber = putRecordResult.getSequenceNumber() 318 | val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, 319 | new ArrayBuffer[(String, String)]()) 320 | sentSeqNumbers += ((num, seqNumber)) 321 | } 322 | 323 | shardIdToSeqNumbers.toMap 324 | } 325 | } 326 | 327 | private[kinesis] class KPLBasedKinesisTestUtils(streamShardCount: Int = 2) 328 | extends KinesisTestUtils(streamShardCount) { 329 | override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { 330 | if (!aggregate) { 331 | new SimpleDataGenerator(kinesisClient) 332 | } else { 333 | new KPLDataGenerator(regionName) 334 | } 335 | } 336 | } 337 | 338 | /** A wrapper for the KinesisProducer provided in the KPL. 
*/ 339 | private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator { 340 | 341 | private lazy val producer: KPLProducer = { 342 | val conf = new KinesisProducerConfiguration() 343 | .setRecordMaxBufferedTime(1000) 344 | .setMaxConnections(1) 345 | .setRegion(regionName) 346 | .setMetricsLevel("none") 347 | 348 | new KPLProducer(conf) 349 | } 350 | 351 | override def sendData(streamName: String, 352 | data: Array[String]): Map[String, Seq[(String, String)]] = { 353 | val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(String, String)]]() 354 | data.foreach { num => 355 | val str = num.toString 356 | val data = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8)) 357 | val future = producer.addUserRecord(streamName, str, data) 358 | val kinesisCallBack = new FutureCallback[UserRecordResult]() { 359 | override def onFailure(t: Throwable): Unit = {} // do nothing 360 | 361 | override def onSuccess(result: UserRecordResult): Unit = { 362 | val shardId = result.getShardId 363 | val seqNumber = result.getSequenceNumber() 364 | val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, 365 | new ArrayBuffer[(String, String)]()) 366 | sentSeqNumbers += ((num, seqNumber)) 367 | } 368 | } 369 | Futures.addCallback(future, kinesisCallBack) 370 | } 371 | producer.flushSync() 372 | shardIdToSeqNumbers.toMap 373 | } 374 | } 375 | 376 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/ShardSyncerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import com.amazonaws.services.kinesis.model.{SequenceNumberRange, Shard} 21 | 22 | import org.apache.spark.SparkFunSuite 23 | import org.apache.spark.sql.test.SharedSparkSession 24 | 25 | class ShardSyncerSuite extends SparkFunSuite with SharedSparkSession { 26 | 27 | val latestShards = Seq(createShard("shard1", "1")) 28 | val prevShardInfo = Seq(new ShardInfo("shard0", new AfterSequenceNumber("0"))) 29 | 30 | test("Should error out when failondataloss is true and a shard is deleted") { 31 | intercept[IllegalStateException] { 32 | ShardSyncer.getLatestShardInfo(latestShards, prevShardInfo, 33 | InitialKinesisPosition.fromPredefPosition(new TrimHorizon), true) 34 | } 35 | } 36 | 37 | test("Should pick up new shards when failondataloss is false and a shard is deleted") { 38 | val expectedShardInfo = Seq(new ShardInfo("shard1", new TrimHorizon)) 39 | val latest: Seq[ShardInfo] = ShardSyncer.getLatestShardInfo( 40 | latestShards, prevShardInfo, InitialKinesisPosition.fromPredefPosition(new TrimHorizon), 41 | false) 42 | assert(latest.nonEmpty) 43 | assert(latest(0).shardId === expectedShardInfo.head.shardId) 44 | assert(latest(0).iteratorType === new TrimHorizon().iteratorType) 45 | } 46 | 47 | private def createShard(shardId: String, seqNum: String): Shard = { 48 | new Shard() 49 | .withShardId(shardId) 50 | .withSequenceNumberRange( 51 | new SequenceNumberRange().withStartingSequenceNumber(seqNum) 52 | ) 53 | } 54 | 55 | } 56 | --------------------------------------------------------------------------------
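To make the behaviour exercised by ShardSyncerSuite easier to see outside of ScalaTest, here is a small hedged sketch. The object name ShardSyncerSketch, the helper method, and the shard ids are invented; like the suite, it assumes ShardSyncer, ShardInfo, InitialKinesisPosition and the position classes are reachable from the org.apache.spark.sql.kinesis package, and it only calls the APIs the suite itself uses.

package org.apache.spark.sql.kinesis

import com.amazonaws.services.kinesis.model.{SequenceNumberRange, Shard}

// Hypothetical sketch: the same "a previously tracked shard disappeared"
// scenario as the suite above, showing both failOnDataLoss behaviours.
object ShardSyncerSketch {
  private def shard(id: String, startSeq: String): Shard =
    new Shard()
      .withShardId(id)
      .withSequenceNumberRange(
        new SequenceNumberRange().withStartingSequenceNumber(startSeq))

  def main(args: Array[String]): Unit = {
    // Kinesis now reports only a brand-new shard; the shard we had
    // checkpointed (shard-000) is no longer listed.
    val latestShards = Seq(shard("shard-001", "100"))
    val prevShardInfo = Seq(new ShardInfo("shard-000", new AfterSequenceNumber("42")))
    val initialPosition = InitialKinesisPosition.fromPredefPosition(new TrimHorizon)

    // With failOnDataLoss = true the syncer refuses to continue, as the first
    // test above asserts.
    try {
      ShardSyncer.getLatestShardInfo(latestShards, prevShardInfo, initialPosition, true)
    } catch {
      case e: IllegalStateException => println(s"failOnDataLoss=true rejected: $e")
    }

    // With failOnDataLoss = false the new shard is returned and will be read
    // from the supplied initial position (TRIM_HORIZON here), as the second
    // test above asserts.
    val latest = ShardSyncer.getLatestShardInfo(latestShards, prevShardInfo, initialPosition, false)
    latest.foreach(s => println(s"${s.shardId} starts at ${s.iteratorType}"))
  }
}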