├── .gitchangelog.rc ├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE ├── README.md ├── dev ├── checkstyle-suppressions.xml └── checkstyle.xml ├── pom.xml ├── scalastyle-config.xml └── src ├── main ├── java │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── kinesis │ │ └── AWSInstanceProfileCredentialsProviderWithRetries.java ├── resources │ └── META-INF │ │ └── services │ │ └── org.apache.spark.sql.sources.DataSourceRegister └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── kinesis │ ├── CachedKinesisProducer.scala │ ├── HDFSMetadataCommitter.scala │ ├── KinesisPosition.scala │ ├── KinesisReader.scala │ ├── KinesisSink.scala │ ├── KinesisSource.scala │ ├── KinesisSourceOffset.scala │ ├── KinesisSourceProvider.scala │ ├── KinesisSourceRDD.scala │ ├── KinesisWriteTask.scala │ ├── KinesisWriter.scala │ ├── MetadataCommitter.scala │ ├── ShardSyncer.scala │ ├── SparkAWSCredentials.scala │ └── package-info.java └── test ├── resources └── log4j.properties └── scala └── org └── apache └── spark └── sql └── kinesis ├── HDFSMetaDataCommiterSuite.scala ├── KinesisPositionSuite.scala ├── KinesisReaderSuite.scala ├── KinesisSinkSuite.scala ├── KinesisSourceOffsetSuite.scala ├── KinesisSourceSuite.scala ├── KinesisTestUtils.scala └── ShardSyncerSuite.scala /.gitchangelog.rc: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8; mode: python -*- 2 | ## 3 | ## Format 4 | ## 5 | ## ACTION: [AUDIENCE:] COMMIT_MSG [!TAG ...] 6 | ## 7 | ## Description 8 | ## 9 | ## ACTION is one of 'chg', 'fix', 'new' 10 | ## 11 | ## Is WHAT the change is about. 12 | ## 13 | ## 'chg' is for refactor, small improvement, cosmetic changes... 14 | ## 'fix' is for bug fixes 15 | ## 'new' is for new features, big improvement 16 | ## 17 | ## AUDIENCE is optional and one of 'dev', 'usr', 'pkg', 'test', 'doc' 18 | ## 19 | ## Is WHO is concerned by the change. 20 | ## 21 | ## 'dev' is for developpers (API changes, refactors...) 22 | ## 'usr' is for final users (UI changes) 23 | ## 'pkg' is for packagers (packaging changes) 24 | ## 'test' is for testers (test only related changes) 25 | ## 'doc' is for doc guys (doc only changes) 26 | ## 27 | ## COMMIT_MSG is ... well ... the commit message itself. 28 | ## 29 | ## TAGs are additionnal adjective as 'refactor' 'minor' 'cosmetic' 30 | ## 31 | ## They are preceded with a '!' or a '@' (prefer the former, as the 32 | ## latter is wrongly interpreted in github.) Commonly used tags are: 33 | ## 34 | ## 'refactor' is obviously for refactoring code only 35 | ## 'minor' is for a very meaningless change (a typo, adding a comment) 36 | ## 'cosmetic' is for cosmetic driven change (re-indentation, 80-col...) 37 | ## 'wip' is for partial functionality but complete subfunctionality. 38 | ## 39 | ## Example: 40 | ## 41 | ## new: usr: support of bazaar implemented 42 | ## chg: re-indentend some lines !cosmetic 43 | ## new: dev: updated code to be compatible with last version of killer lib. 44 | ## fix: pkg: updated year of licence coverage. 45 | ## new: test: added a bunch of test around user usability of feature X. 46 | ## fix: typo in spelling my name in comment. !minor 47 | ## 48 | ## Please note that multi-line commit message are supported, and only the 49 | ## first line will be considered as the "summary" of the commit message. So 50 | ## tags, and other rules only applies to the summary. The body of the commit 51 | ## message will be displayed in the changelog without reformatting. 
52 | 53 | 54 | ## 55 | ## ``ignore_regexps`` is a line of regexps 56 | ## 57 | ## Any commit having its full commit message matching any regexp listed here 58 | ## will be ignored and won't be reported in the changelog. 59 | ## 60 | ignore_regexps = [ 61 | r'@minor', r'!minor', 62 | r'@cosmetic', r'!cosmetic', 63 | r'@refactor', r'!refactor', 64 | r'@wip', r'!wip', 65 | r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[p|P]kg:', 66 | r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[d|D]ev:', 67 | r'^(.{3,3}\s*:)?\s*[fF]irst commit.?\s*$', 68 | r'^$', ## ignore commits with empty messages 69 | ] 70 | 71 | 72 | ## ``section_regexps`` is a list of 2-tuples associating a string label and a 73 | ## list of regexp 74 | ## 75 | ## Commit messages will be classified in sections thanks to this. Section 76 | ## titles are the label, and a commit is classified under this section if any 77 | ## of the regexps associated is matching. 78 | ## 79 | ## Please note that ``section_regexps`` will only classify commits and won't 80 | ## make any changes to the contents. So you'll probably want to go check 81 | ## ``subject_process`` (or ``body_process``) to do some changes to the subject, 82 | ## whenever you are tweaking this variable. 83 | ## 84 | section_regexps = [ 85 | ('New', [ 86 | r'^[nN]ew\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', 87 | ]), 88 | ('Changes', [ 89 | r'^[cC]hg\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', 90 | ]), 91 | ('Fix', [ 92 | r'^[fF]ix\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', 93 | ]), 94 | 95 | ('Other', None ## Match all lines 96 | ), 97 | 98 | ] 99 | 100 | 101 | ## ``body_process`` is a callable 102 | ## 103 | ## This callable will be given the original body and result will 104 | ## be used in the changelog. 105 | ## 106 | ## Available constructs are: 107 | ## 108 | ## - any python callable that take one txt argument and return txt argument. 109 | ## 110 | ## - ReSub(pattern, replacement): will apply regexp substitution. 111 | ## 112 | ## - Indent(chars=" "): will indent the text with the prefix 113 | ## Please remember that template engines gets also to modify the text and 114 | ## will usually indent themselves the text if needed. 115 | ## 116 | ## - Wrap(regexp=r"\n\n"): re-wrap text in separate paragraph to fill 80-Columns 117 | ## 118 | ## - noop: do nothing 119 | ## 120 | ## - ucfirst: ensure the first letter is uppercase. 121 | ## (usually used in the ``subject_process`` pipeline) 122 | ## 123 | ## - final_dot: ensure text finishes with a dot 124 | ## (usually used in the ``subject_process`` pipeline) 125 | ## 126 | ## - strip: remove any spaces before or after the content of the string 127 | ## 128 | ## - SetIfEmpty(msg="No commit message."): will set the text to 129 | ## whatever given ``msg`` if the current text is empty. 130 | ## 131 | ## Additionally, you can `pipe` the provided filters, for instance: 132 | #body_process = Wrap(regexp=r'\n(?=\w+\s*:)') | Indent(chars=" ") 133 | #body_process = Wrap(regexp=r'\n(?=\w+\s*:)') 134 | #body_process = noop 135 | body_process = ReSub(r'((^|\n)[A-Z]\w+(-\w+)*: .*(\n\s+.*)*)+$', r'') | strip 136 | 137 | 138 | ## ``subject_process`` is a callable 139 | ## 140 | ## This callable will be given the original subject and result will 141 | ## be used in the changelog. 142 | ## 143 | ## Available constructs are those listed in ``body_process`` doc. 
144 | subject_process = (strip | 145 | ReSub(r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n@]*)(@[a-z]+\s+)*$', r'\4') | 146 | SetIfEmpty("No commit message.") | ucfirst | final_dot) 147 | 148 | 149 | ## ``tag_filter_regexp`` is a regexp 150 | ## 151 | ## Tags that will be used for the changelog must match this regexp. 152 | ## 153 | tag_filter_regexp = r'^rubix-root-[0-9]+\.[0-9]+(\.[0-9]+)?$' 154 | 155 | 156 | ## ``unreleased_version_label`` is a string or a callable that outputs a string 157 | ## 158 | ## This label will be used as the changelog Title of the last set of changes 159 | ## between last valid tag and HEAD if any. 160 | unreleased_version_label = "(unreleased)" 161 | 162 | 163 | ## ``output_engine`` is a callable 164 | ## 165 | ## This will change the output format of the generated changelog file 166 | ## 167 | ## Available choices are: 168 | ## 169 | ## - rest_py 170 | ## 171 | ## Legacy pure python engine, outputs ReSTructured text. 172 | ## This is the default. 173 | ## 174 | ## - mustache() 175 | ## 176 | ## Template name could be any of the available templates in 177 | ## ``templates/mustache/*.tpl``. 178 | ## Requires python package ``pystache``. 179 | ## Examples: 180 | ## - mustache("markdown") 181 | ## - mustache("restructuredtext") 182 | ## 183 | ## - makotemplate() 184 | ## 185 | ## Template name could be any of the available templates in 186 | ## ``templates/mako/*.tpl``. 187 | ## Requires python package ``mako``. 188 | ## Examples: 189 | ## - makotemplate("restructuredtext") 190 | ## 191 | output_engine = rest_py 192 | #output_engine = mustache("restructuredtext") 193 | #output_engine = mustache("markdown") 194 | #output_engine = makotemplate("restructuredtext") 195 | 196 | 197 | ## ``include_merge`` is a boolean 198 | ## 199 | ## This option tells git-log whether to include merge commits in the log. 200 | ## The default is to include them. 201 | include_merge = True 202 | 203 | 204 | ## ``log_encoding`` is a string identifier 205 | ## 206 | ## This option tells gitchangelog what encoding is outputed by ``git log``. 207 | ## The default is to be clever about it: it checks ``git config`` for 208 | ## ``i18n.logOutputEncoding``, and if not found will default to git's own 209 | ## default: ``utf-8``. 210 | #log_encoding = 'utf-8' 211 | 212 | 213 | ## ``publish`` is a callable 214 | ## 215 | ## Sets what ``gitchangelog`` should do with the output generated by 216 | ## the output engine. ``publish`` is a callable taking one argument 217 | ## that is an interator on lines from the output engine. 218 | ## 219 | ## Some helper callable are provided: 220 | ## 221 | ## Available choices are: 222 | ## 223 | ## - stdout 224 | ## 225 | ## Outputs directly to standard output 226 | ## (This is the default) 227 | ## 228 | ## - FileInsertAtFirstRegexMatch(file, pattern, idx=lamda m: m.start()) 229 | ## 230 | ## Creates a callable that will parse given file for the given 231 | ## regex pattern and will insert the output in the file. 232 | ## ``idx`` is a callable that receive the matching object and 233 | ## must return a integer index point where to insert the 234 | ## the output in the file. Default is to return the position of 235 | ## the start of the matched string. 236 | ## 237 | ## - FileRegexSubst(file, pattern, replace, flags) 238 | ## 239 | ## Apply a replace inplace in the given file. Your regex pattern must 240 | ## take care of everything and might be more complex. Check the README 241 | ## for a complete copy-pastable example. 
242 | ## 243 | # publish = FileInsertIntoFirstRegexMatch( 244 | # "CHANGELOG.rst", 245 | # r'/(?P[0-9]+\.[0-9]+(\.[0-9]+)?)\s+\([0-9]+-[0-9]{2}-[0-9]{2}\)\n--+\n/', 246 | # idx=lambda m: m.start(1) 247 | # ) 248 | #publish = stdout 249 | 250 | 251 | ## ``revs`` is a list of callable or a list of string 252 | ## 253 | ## callable will be called to resolve as strings and allow dynamical 254 | ## computation of these. The result will be used as revisions for 255 | ## gitchangelog (as if directly stated on the command line). This allows 256 | ## to filter exaclty which commits will be read by gitchangelog. 257 | ## 258 | ## To get a full documentation on the format of these strings, please 259 | ## refer to the ``git rev-list`` arguments. There are many examples. 260 | ## 261 | ## Using callables is especially useful, for instance, if you 262 | ## are using gitchangelog to generate incrementally your changelog. 263 | ## 264 | ## Some helpers are provided, you can use them:: 265 | ## 266 | ## - FileFirstRegexMatch(file, pattern): will return a callable that will 267 | ## return the first string match for the given pattern in the given file. 268 | ## If you use named sub-patterns in your regex pattern, it'll output only 269 | ## the string matching the regex pattern named "rev". 270 | ## 271 | ## - Caret(rev): will return the rev prefixed by a "^", which is a 272 | ## way to remove the given revision and all its ancestor. 273 | ## 274 | ## Please note that if you provide a rev-list on the command line, it'll 275 | ## replace this value (which will then be ignored). 276 | ## 277 | ## If empty, then ``gitchangelog`` will act as it had to generate a full 278 | ## changelog. 279 | ## 280 | ## The default is to use all commits to make the changelog. 281 | #revs = ["^1.0.3", ] 282 | #revs = [ 283 | # Caret( 284 | # FileFirstRegexMatch( 285 | # "CHANGELOG.rst", 286 | # r"(?P[0-9]+\.[0-9]+(\.[0-9]+)?)\s+\([0-9]+-[0-9]{2}-[0-9]{2}\)\n--+\n")), 287 | # "HEAD" 288 | #] 289 | revs = [] 290 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.swm 4 | *.swn 5 | *.swk 6 | *.swl 7 | *.swo 8 | *.swp 9 | *.ipr 10 | *.iml 11 | *.ipr 12 | *.iws 13 | *.pyc 14 | *.pyo 15 | *.swp 16 | *~ 17 | .DS_Store 18 | .cache 19 | .classpath 20 | .ensime 21 | .ensime_cache/ 22 | .ensime_lucene 23 | .generated-mima* 24 | .idea/ 25 | .idea_modules/ 26 | .project 27 | .pydevproject 28 | .scala_dependencies 29 | .settings 30 | /lib/ 31 | R-unit-tests.log 32 | R/unit-tests.out 33 | R/cran-check.out 34 | R/pkg/vignettes/sparkr-vignettes.html 35 | build/*.jar 36 | build/apache-maven* 37 | build/scala* 38 | build/zinc* 39 | cache 40 | checkpoint 41 | conf/*.cmd 42 | conf/*.conf 43 | conf/*.properties 44 | conf/*.sh 45 | conf/*.xml 46 | conf/java-opts 47 | conf/slaves 48 | dependency-reduced-pom.xml 49 | derby.log 50 | dev/create-release/*final 51 | dev/create-release/*txt 52 | dev/pr-deps/ 53 | dist/ 54 | docs/_site 55 | docs/api 56 | lib_managed/ 57 | lint-r-report.log 58 | log/ 59 | logs/ 60 | out/ 61 | project/boot/ 62 | project/build/target/ 63 | project/plugins/lib_managed/ 64 | project/plugins/project/build.properties 65 | project/plugins/src_managed/ 66 | project/plugins/target/ 67 | python/lib/pyspark.zip 68 | python/deps 69 | python/pyspark/python 70 | reports/ 71 | scalastyle-on-compile.generated.xml 72 | scalastyle-output.xml 73 | scalastyle.txt 74 | spark-*-bin-*.tgz 75 | 
spark-tests.log 76 | src_managed/ 77 | streaming-tests.log 78 | target/ 79 | unit-tests.log 80 | work/ 81 | 82 | # For Hive 83 | TempStatsStore/ 84 | metastore/ 85 | metastore_db/ 86 | sql/hive-thriftserver/test_warehouses 87 | warehouse/ 88 | spark-warehouse/ 89 | 90 | # For R session data 91 | .RData 92 | .RHistory 93 | .Rhistory 94 | *.Rproj 95 | *.Rproj.* 96 | 97 | .Rproj.user 98 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | language: java 19 | 20 | jdk: 21 | - openjdk8 22 | - oraclejdk8 23 | 24 | dist: trusty 25 | 26 | script: 27 | - mvn -DskipTests clean install 28 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | spark-sql-kinesis_2.13-1.2.1_spark-3.2 5 | ---------------------------- 6 | - Update README.md to indicate that this repo is no longer maintained, and include a link to the new repo. 7 | 8 | kinesis_2.12-1.2.0_spark-3.0 9 | ---------------------------- 10 | - Support for Spark 3.0 and scala 2.12 (#92) [Vikram Agrawal, Vikram 11 | Agrawal] 12 | - Switch to use Kinesis list-shards API (#89) [Chad Lagore] 13 | - Fix issue with slow kinesis sink. [Vikram Agrawal] 14 | - Remove lastReadSequenceNumber.isEmpty condition (#90) [Chad Lagore] 15 | - Fixing latency in kinesis sink (#81) [abhishekd0907] 16 | 17 | kinesis_2.11-1.1.4-spark_2.4 18 | ---------------------------- 19 | - Fix retries while making Kinesis API calls (#79) [Vikram Agrawal] 20 | - Adding ability to set exact Kinesis position to start reading from 21 | (#78) [Stanislav Norochevskyi] 22 | 23 | * Adding ability read initial Kinesis position from checkpoint JSON representation. 24 | 25 | * Adding a unit test for InitialKinesisPosition parser 26 | - Apply retries to kinesis exceptions with status code >= 500 (#76) 27 | [Arne Huang] 28 | - Make DefaultAWSCredentialsProviderChain available to choose (#71) 29 | [Hyeonseop Lee] 30 | 31 | * Add option to use instance profile credentials 32 | 33 | * Update docs 34 | 35 | * Add same option to producer 36 | - Added session based authentication (#72) [guoming-xu] 37 | - Add source and target java version to overwrite defaults; add nobootcp 38 | option to scala compiler (#68) [Julian Keppel] 39 | - Update the artifact of the new release. [Vikram Agrawal] 40 | - [maven-release-plugin] prepare for next development iteration. [Vikram 41 | Agrawal] 42 | - Merge branch 'master' of github.com:qubole/kinesis-sql. 
[Vikram 43 | Agrawal] 44 | - Update README.md. [Vikram Agrawal] 45 | - Update README.md. [Vikram Agrawal] 46 | - Update README.md. [Vikram Agrawal] 47 | 48 | kinesis_2.11-1.1.3-spark_2.4 49 | -------------------------------- 50 | - [maven-release-plugin] prepare release spark-sql- 51 | kinesis_2.11-1.1.3-spark_2.4. [Vikram Agrawal] 52 | - GitChangelog RC file and changelog. [Vikram Agrawal] 53 | - Add travis build config. [Vikram Agrawal] 54 | - Create fat jar with dependencies shaded in the jar. [Vikram Agrawal] 55 | - Handle deleted shards (#59) [Vikram Agrawal] 56 | - [maven-release-plugin] prepare for next development iteration. [Vikram 57 | Agrawal] 58 | 59 | kinesis_2.11-1.1.2-spark_2.4 60 | ----------------------------- 61 | - [maven-release-plugin] prepare release spark-sql- 62 | kinesis_2.11-1.1.2-spark_2.4. [Vikram Agrawal] 63 | - [maven-release-plugin] prepare for next development iteration. [Vikram 64 | Agrawal] 65 | - POM changes. [Vikram Agrawal] 66 | - Update pom.xml. [Vikram Agrawal] 67 | - [maven-release-plugin] prepare for next development iteration. [Vikram 68 | Agrawal] 69 | - [maven-release-plugin] prepare release spark-sql- 70 | kinesis_2.11-1.1.0-spark_2.4. [Vikram Agrawal] 71 | - Update Pom file for mvn release. [Vikram Agrawal] 72 | - Fix Deserialisation/Serialisation of KinesisOffsets (#62) [Vikram 73 | Agrawal] 74 | 75 | * Fix incorrect metadata for batchId in the kinesis offsets 76 | 77 | * Fix serialization and deserialization fo the kinesis offsets 78 | - Fix issue with Reprocessing oldshards issue (#63) [Vikram Agrawal] 79 | - 2.4.0 (#56) [Vikram Agrawal] 80 | 81 | * Fix broken test suite and scalastyle error 82 | 83 | * Fix issues with over-ridden Kinesis Source Options (#36) 84 | 85 | * Fix issues with over-ridden Kinesis Source Options 86 | 87 | * Update ReadMe 88 | 89 | * Fix Review Comments 90 | 91 | * Fix ambiguity with scala 2.12 (#37) 92 | 93 | * Fix stylecheck errors 94 | 95 | * Add retries to Handle EC issues in HDFS metadata committer (#41) 96 | 97 | * Handle EC issues in HDFS MetaCommitter 98 | 99 | * Avoid timestamp as Offset and avoid empty batches when there is no new data (#49) 100 | - Changing protobuf version and fixing styling errors (#53) 101 | [abhishekd0907] 102 | - 2.4.0 (#45) [nhampiholi] 103 | 104 | * Fix broken test suite and scalastyle error 105 | 106 | * Fix issues with over-ridden Kinesis Source Options (#36) 107 | 108 | * Fix issues with over-ridden Kinesis Source Options 109 | 110 | * Update ReadMe 111 | 112 | * Fix Review Comments 113 | 114 | * Fix ambiguity with scala 2.12 (#37) 115 | 116 | * Fix stylecheck errors 117 | 118 | * Add retries to Handle EC issues in HDFS metadata committer (#41) 119 | 120 | * Handle EC issues in HDFS MetaCommitter 121 | 122 | * Avoid timestamp as Offset and avoid empty batches when there is no new data (#49) 123 | - Avoid timestamp as Offset and avoid empty batches when there is no new 124 | data (#49) [Vikram Agrawal] 125 | - Add retries to Handle EC issues in HDFS metadata committer (#41) 126 | [Vikram Agrawal] 127 | 128 | * Handle EC issues in HDFS MetaCommitter 129 | - Fix stylecheck errors. [Vikram Agrawal] 130 | - Fix ambiguity with scala 2.12 (#37) [Marcin Szymański] 131 | - Fix issues with over-ridden Kinesis Source Options (#36) [Vikram 132 | Agrawal] 133 | 134 | * Fix issues with over-ridden Kinesis Source Options 135 | 136 | * Update ReadMe 137 | 138 | * Fix Review Comments 139 | - Fix broken test suite and scalastyle error. 
[anthony.may] 140 | - Use latest DSv2 APIs for continuous processing. [Vikram Agrawal] 141 | - Handle greater than 100 shard streams. [Vikram Agrawal] 142 | - Fix for handling greater than 100 shard streams. [Siddhartha Jain] 143 | - Update README.md. [Vikram Agrawal] 144 | - Add AWSInstanceProfileCredentialsProviderWithRetries to handle issues 145 | with instance profile provider. [Vikram Agrawal] 146 | - Bump up spark version. [Vikram Agrawal] 147 | - Support of Kinesis Connector for Continuous streaming (#15) [Vikram 148 | Agrawal] 149 | - Fixed aws sdk and kpl versions (#18) [Georgios] 150 | - Bump up Kinesis Client version. [Vikram Agrawal] 151 | - Support of AWS roles / instance profile for Authentication (#13) 152 | [Vikram Agrawal] 153 | 154 | * Support of AWS roles / instance profile for Authentication 155 | 156 | * InstanceProfileCredentials should be case object instead of case class 157 | 158 | * Fix unit tests 159 | 160 | * clean up 161 | - Update README. [Vikram Agrawal] 162 | - Merge branch '2.3.0' [Vikram Agrawal] 163 | - Fixes to support build against SPARK 2.3.0. [Vikram Agrawal] 164 | - Merge pull request #8 from ggeorgiadis/master. [Vikram Agrawal] 165 | 166 | Fixed issue with closed child process caused by the cache timeout 167 | - Cleanup. [Georgios Georgiadis] 168 | - Fixed bug in parameters validation and cache timeout. [Georgios 169 | Georgiadis] 170 | - Merge pull request #5 from ggeorgiadis/master. [Vikram Agrawal] 171 | 172 | Added Kinesis Sink support 173 | - Removed duplicate awsSecretKey option. [Georgios] 174 | - Remove kinesis sink from roadmap. [Georgios Georgiadis] 175 | - Enable record aggregation by default. [Georgios Georgiadis] 176 | - Added kinesis.executor.aggregationEnabled and 177 | kinesis.producer.cache.timeout parameters Also we flush the producer 178 | before we destroy. [Georgios Georgiadis] 179 | - Updated configuration naming. [Georgios Georgiadis] 180 | - Updated Readme. [Georgios Georgiadis] 181 | - Improved validation and removed region param. [Georgios Georgiadis] 182 | - Added tests and cleaned up. [Georgios Georgiadis] 183 | - Clean up and use specific aws creds for sink. [Georgios Georgiadis] 184 | - Added Kinesis Sink support. [Georgios Georgiadis] 185 | - Merge branch 'master' of github.com:qubole/kinesis-sql. [Vikram 186 | Agrawal] 187 | - Update README.md. [mayankahuja] 188 | - Update README.md. [Vikram Agrawal] 189 | - Shade Jackson Dataformat Jar. [Vikram Agrawal] 190 | - Update README file and remove filter for ShardEnd in getOffset. 191 | [Vikram Agrawal] 192 | - Update Gitignore file. [Vikram Agrawal] 193 | - Support for Kinesis Source in Spark Structured Streaming. [Vikram 194 | Agrawal] 195 | - Initial commit. [Rajat Venkatesh] 196 | 197 | 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/qubole/kinesis-sql.svg?branch=master)](https://travis-ci.org/qubole/kinesis-sql) 2 | 3 | ## NOTE: This project is NO LONGER MAINTAINED. 4 | 5 | [Ron Cremer](https://github.com/roncemer) has volunteered to maintain this project. Beginning with Spark 3.2, the new project is located here: https://github.com/roncemer/spark-sql-kinesis 6 | 7 | 8 | # Kinesis Connector for Structured Streaming 9 | 10 | Implementation of a Kinesis Source Provider in Spark Structured Streaming. [SPARK-18165](https://issues.apache.org/jira/browse/SPARK-18165) describes the need for such an implementation. More details on the implementation can be found in this [blog post](https://www.qubole.com/blog/kinesis-connector-for-structured-streaming/). 11 | 12 | ## Downloading and Using the Connector 13 | 14 | The connector is available from the Maven Central repository. Add it with the --packages option or the spark.jars.packages configuration property, using one of the following connector artifacts: 15 | 16 | Spark 3.0: com.qubole.spark/spark-sql-kinesis_2.12/1.2.0-spark_3.0 17 | Spark 2.4: com.qubole.spark/spark-sql-kinesis_2.11/1.2.0-spark_2.4 18 | 19 | ## Developer Setup 20 | Check out the kinesis-sql branch that matches your Spark version. Use the master branch for the latest Spark version. 21 | 22 | ###### Spark version 3.0.x 23 | git clone git@github.com:qubole/kinesis-sql.git 24 | git checkout master 25 | cd kinesis-sql 26 | mvn install -DskipTests 27 | 28 | This creates the *target/spark-sql-kinesis_2.12-\*.jar* file, which contains the connector code and its dependency jars. 29 | 30 | 31 | ## How to use it 32 | 33 | #### Setup Kinesis 34 | Refer to the [Amazon Docs](https://docs.aws.amazon.com/cli/latest/reference/kinesis/create-stream.html) for more options. 35 | 36 | ###### Create Kinesis Stream 37 | 38 | $ aws kinesis create-stream --stream-name test --shard-count 2 39 | 40 | ###### Add Records to the stream 41 | 42 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Kinesis' 43 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Connector' 44 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'for' 45 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Apache' 46 | $ aws kinesis put-record --stream-name test --partition-key 1 --data 'Spark' 47 | 48 | #### Example Streaming Job 49 | 50 | In the examples below, $SPARK_HOME refers to the Spark installation directory.
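If you would rather not build the jar yourself, the connector can also be pulled from Maven Central at launch time with --packages, using the colon-separated form of the coordinates listed above (shown here for the Spark 3.0 artifact):

    $SPARK_HOME/bin/spark-shell --packages com.qubole.spark:spark-sql-kinesis_2.12:1.2.0-spark_3.0

A spark-shell session started either way can run the snippets below unchanged.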
51 | 52 | ###### Open Spark-Shell 53 | 54 | $SPARK_HOME/bin/spark-shell --jars target/spark-sql-kinesis_2.12-*.jar 55 | 56 | ###### Subscribe to Kinesis Source 57 | // Subscribe to the "test" stream 58 | scala> :paste 59 | val kinesis = spark 60 | .readStream 61 | .format("kinesis") 62 | .option("streamName", "test") 63 | .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com") 64 | .option("awsAccessKeyId", [ACCESS_KEY]) 65 | .option("awsSecretKey", [SECRET_KEY]) 66 | .option("startingPosition", "TRIM_HORIZON") 67 | .load 68 | 69 | ###### Check Schema 70 | scala> kinesis.printSchema 71 | root 72 | |-- data: binary (nullable = true) 73 | |-- streamName: string (nullable = true) 74 | |-- partitionKey: string (nullable = true) 75 | |-- sequenceNumber: string (nullable = true) 76 | |-- approximateArrivalTimestamp: timestamp (nullable = true) 77 | 78 | ###### Word Count 79 | // Cast data into string and group by data column 80 | scala> :paste 81 | 82 | kinesis 83 | .selectExpr("CAST(data AS STRING)").as[(String)] 84 | .groupBy("data").count() 85 | .writeStream 86 | .format("console") 87 | .outputMode("complete") 88 | .start() 89 | .awaitTermination() 90 | 91 | ###### Output in Console 92 | 93 | 94 | +------------+-----+ 95 | | data|count| 96 | +------------+-----+ 97 | | for| 1| 98 | | Apache| 1| 99 | | Spark| 1| 100 | | Kinesis| 1| 101 | | Connector| 1| 102 | +------------+-----+ 103 | 104 | ###### Using the Kinesis Sink 105 | // Generate a random partition key, cast data into string, and group by data column 106 | scala> :paste 107 | kinesis 108 | .selectExpr("CAST(rand() AS STRING) as partitionKey","CAST(data AS STRING)").as[(String,String)] 109 | .groupBy("data").count() 110 | .writeStream 111 | .format("kinesis") 112 | .outputMode("update") 113 | .option("streamName", "spark-sink-example") 114 | .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com") 115 | .option("awsAccessKeyId", [ACCESS_KEY]) 116 | .option("awsSecretKey", [SECRET_KEY]) 117 | .start() 118 | .awaitTermination() 119 | 120 | ## Kinesis Source Configuration 121 | 122 | | Option-Name | Default-Value | Description | 123 | | ------------- |:-------------:| -----:| 124 | | streamName | - | Name of the stream in Kinesis to read from | 125 | | endpointUrl | https://kinesis.us-east-1.amazonaws.com | Endpoint URL of the Kinesis stream | 126 | | awsAccessKeyId | - | AWS Credentials for Kinesis describe, read record operations | 127 | | awsSecretKey | - | AWS Credentials for Kinesis describe, read record operations | 128 | | awsSTSRoleARN | - | AWS STS Role ARN for Kinesis describe, read record operations | 129 | | awsSTSSessionName | - | AWS STS Session name for Kinesis describe, read record operations | 130 | | awsUseInstanceProfile | true | Use instance profile credentials if no other credentials are provided | 131 | | startingPosition | LATEST | Starting position in Kinesis to fetch data from.
Possible values are "latest", "trim_horizon", "earliest" (alias for trim_horizon), or a JSON-serialized map of shardId->KinesisPosition | 132 | | failondataloss | true | Fail the streaming job if any active shard is missing or has expired | 133 | | kinesis.executor.maxFetchTimeInMs | 1000 | Maximum time spent in the executor fetching records from Kinesis, per shard | 134 | | kinesis.executor.maxFetchRecordsPerShard | 100000 | Maximum number of records to fetch per shard | 135 | | kinesis.executor.maxRecordPerRead | 10000 | Maximum number of records to fetch per getRecords API call | 136 | | kinesis.executor.addIdleTimeBetweenReads | false | Add a delay between two consecutive getRecords API calls | 137 | | kinesis.executor.idleTimeBetweenReadsInMs | 1000 | Minimum delay between two consecutive getRecords calls | 138 | | kinesis.client.describeShardInterval | 1s (1 second) | Minimum interval between two ListShards API calls to consider resharding | 139 | | kinesis.client.numRetries | 3 | Maximum number of retries for Kinesis API requests | 140 | | kinesis.client.retryIntervalMs | 1000 | Cool-off period before retrying a Kinesis API call | 141 | | kinesis.client.maxRetryIntervalMs | 10000 | Maximum cool-off period between two retries | 142 | | kinesis.client.avoidEmptyBatches | false | Avoid creating an empty microbatch by checking upfront whether there is any unread data in the stream before the batch is started | 143 | 144 | ## Kinesis Sink Configuration 145 | | Option-Name | Default-Value | Description | 146 | | ------------- |:-------------:| -----:| 147 | | streamName | - | Name of the stream in Kinesis to write to | 148 | | endpointUrl | https://kinesis.us-east-1.amazonaws.com | The AWS endpoint URL of the Kinesis stream | 149 | | awsAccessKeyId | - | AWS Credentials for Kinesis describe, put record operations | 150 | | awsSecretKey | - | AWS Credentials for Kinesis describe, put record operations | 151 | | awsSTSRoleARN | - | AWS STS Role ARN for Kinesis describe, put record operations | 152 | | awsSTSSessionName | - | AWS STS Session name for Kinesis describe, put record operations | 153 | | awsUseInstanceProfile | true | Use instance profile credentials if no other credentials are provided | 154 | | kinesis.executor.recordMaxBufferedTime | 1000 (millis) | Maximum buffered time of a record | 155 | | kinesis.executor.maxConnections | 1 | Maximum number of connections to Kinesis | 156 | | kinesis.executor.aggregationEnabled | true | Whether records should be aggregated before sending them to Kinesis | 157 | | kniesis.executor.flushwaittimemillis | 100 | Wait time while flushing records to Kinesis on task end | 158 | 159 | ## Roadmap 160 | * Migrate to the DataSource V2 APIs for MicroBatchExecution. 161 | * Maintain per-micro-batch shard commit state in DynamoDB 162 | 163 | ## Acknowledgement 164 | 165 | This connector would not have been possible without the reference implementation of the [Kafka connector](https://github.com/apache/spark/tree/branch-2.2/external/kafka-0-10-sql) for Structured Streaming, the [Kinesis Connector](https://github.com/apache/spark/tree/branch-2.2/external/kinesis-asl) for legacy streaming, and the [Kinesis Client Library](https://github.com/awslabs/amazon-kinesis-client). The structure of some parts of the code is influenced by the excellent work done by various Apache Spark contributors.
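As a complement to the spark-shell snippets and the option tables above, the following is a minimal sketch of a standalone job built on the connector. It is not taken from this repository: the stream name, STS role ARN, session name, and checkpoint path are placeholders to replace with your own values, and it assumes the connector jar or package is on the application classpath as described in the Downloading section.

    import org.apache.spark.sql.SparkSession

    object KinesisToConsole {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("kinesis-to-console")
          .getOrCreate()

        // Read from a hypothetical "my-stream" stream using an assumed STS role.
        // Option names come from the Kinesis Source Configuration table above.
        val records = spark.readStream
          .format("kinesis")
          .option("streamName", "my-stream")
          .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com")
          .option("awsSTSRoleARN", "arn:aws:iam::123456789012:role/kinesis-reader")
          .option("awsSTSSessionName", "spark-kinesis-session")
          .option("startingPosition", "TRIM_HORIZON")
          .option("kinesis.executor.maxFetchRecordsPerShard", "50000")
          .load()

        // Decode the binary payload and print it to the console.
        val query = records
          .selectExpr("CAST(data AS STRING) AS payload", "approximateArrivalTimestamp")
          .writeStream
          .format("console")
          .option("checkpointLocation", "/tmp/kinesis-to-console-checkpoint")
          .outputMode("append")
          .start()

        query.awaitTermination()
      }
    }

Packaged as an application jar, this could be launched with something like spark-submit --packages com.qubole.spark:spark-sql-kinesis_2.12:1.2.0-spark_3.0 --class KinesisToConsole your-app.jar, where the class and jar names are again placeholders.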
166 | -------------------------------------------------------------------------------- /dev/checkstyle-suppressions.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 21 | 22 | 29 | 30 | 31 | 33 | 35 | 37 | 39 | 41 | 43 | 45 | 47 | 49 | 50 | -------------------------------------------------------------------------------- /dev/checkstyle.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 21 | 22 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 123 | 124 | 125 | 126 | 128 | 129 | 130 | 131 | 133 | 134 | 135 | 137 | 139 | 141 | 143 | 144 | 145 | 155 | 156 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | com.qubole.spark 22 | spark-sql-kinesis_2.12 23 | 1.2.1_spark-3.0-SNAPSHOT 24 | jar 25 | Kinesis Integration for Structured Streaming 26 | Connector to read from and write into Kinesis from Structured Streaming Applications 27 | http://github.com/qubole/kinesis-sql 28 | 29 | 30 | 31 | 32 | qubole 33 | Qubole Inc. 34 | http://www.qubole.com 35 | 36 | developer 37 | 38 | 39 | 40 | 41 | 42 | 43 | Apache License, Version 2.0 44 | https://github.com/qubole/kinesis-sql/blob/master/LICENSE.txt 45 | repo 46 | 47 | 48 | 49 | 50 | scm:git:git://github.com/qubole/kinesis-sql.git 51 | http://github.com/qubole/kinesis-sql 52 | scm:git:git@github.com:qubole/kinesis-sql.git 53 | spark-sql-kinesis_2.12-1.2.0-spark_3.0 54 | 55 | 56 | 2018 57 | 58 | Qubole 59 | http://www.qubole.com/ 60 | 61 | 62 | 63 | sql-kinesis 64 | 3.0.1 65 | 2.12 66 | 2.10.0 67 | UTF-8 68 | UTF-8 69 | 70 | 71 | 72 | 73 | org.apache.spark 74 | spark-sql_${scala.binary.version} 75 | ${spark.version} 76 | provided 77 | 78 | 79 | org.apache.spark 80 | spark-core_${scala.binary.version} 81 | ${spark.version} 82 | test-jar 83 | test 84 | 85 | 86 | org.apache.spark 87 | spark-catalyst_${scala.binary.version} 88 | ${spark.version} 89 | test-jar 90 | test 91 | 92 | 93 | org.apache.spark 94 | spark-sql_${scala.binary.version} 95 | ${spark.version} 96 | test-jar 97 | test 98 | 99 | 100 | com.amazonaws 101 | amazon-kinesis-client 102 | 1.9.0 103 | 104 | 105 | com.amazonaws 106 | aws-java-sdk-core 107 | 1.11.655 108 | 109 | 110 | com.amazonaws 111 | aws-java-sdk-sts 112 | 1.11.271 113 | 114 | 115 | com.amazonaws 116 | amazon-kinesis-producer 117 | 0.12.8 118 | 119 | 120 | com.google.protobuf 121 | protobuf-java 122 | 3.16.1 123 | 128 | 129 | 130 | com.fasterxml.jackson.dataformat 131 | jackson-dataformat-cbor 132 | ${fasterxml.jackson.version} 133 | 134 | 135 | org.mockito 136 | mockito-core 137 | 3.1.0 138 | test 139 | 140 | 141 | org.scalacheck 142 | scalacheck_${scala.binary.version} 143 | 1.14.2 144 | test 145 | 146 | 147 | org.scalatest 148 | scalatest_${scala.binary.version} 149 | 3.0.8 150 | test 151 | 152 | 153 | org.apache.spark 154 | spark-tags_${scala.binary.version} 155 | ${spark.version} 156 | 157 | 158 | 162 | 163 | 
org.apache.spark 164 | spark-tags_${scala.binary.version} 165 | test-jar 166 | test 167 | ${spark.version} 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | org.scalatest 176 | scalatest-maven-plugin 177 | 2.0.0 178 | 179 | 180 | test 181 | 182 | test 183 | 184 | 185 | 186 | 187 | 188 | net.alchim31.maven 189 | scala-maven-plugin 190 | 4.3.0 191 | 192 | 193 | compile 194 | 195 | compile 196 | add-source 197 | doc-jar 198 | 199 | compile 200 | 201 | 202 | test-compile 203 | 204 | testCompile 205 | 206 | test-compile 207 | 208 | 209 | process-resources 210 | 211 | compile 212 | 213 | 214 | 215 | 216 | 217 | -nobootcp 218 | 219 | 220 | 221 | 222 | org.apache.maven.plugins 223 | maven-compiler-plugin 224 | 3.8.1 225 | 226 | 227 | compile 228 | 229 | compile 230 | 231 | 232 | 233 | 234 | 1.8 235 | 1.8 236 | 237 | 238 | 239 | org.apache.maven.plugins 240 | maven-shade-plugin 241 | 3.2.1 242 | 243 | 244 | package 245 | 246 | shade 247 | 248 | 249 | 250 | 251 | com.amazonaws:amazon-kinesis-client:* 252 | com.amazonaws:amazon-kinesis-producer:* 253 | com.amazonaws:aws-java-sdk-kinesis:* 254 | com.amazonaws:aws-java-sdk-dynamodb:* 255 | com.amazonaws:aws-java-sdk-core:* 256 | com.amazonaws:aws-java-sdk-sts:* 257 | com.fasterxml.jackson.dataformat:*:* 258 | com.google.protobuf:*:* 259 | 260 | 261 | 262 | 263 | com.fasterxml.jackson.dataformat 264 | org.apache.spark.sql.kinesis.shaded.fasterxml.jackson.dataformat 265 | 266 | com.fasterxml.jackson.dataformat.** 267 | 268 | 269 | 270 | com.amazonaws 271 | org.apache.spark.sql.kinesis.shaded.amazonaws 272 | 273 | com.amazonaws.** 274 | 275 | 276 | 277 | com.google.protobuf 278 | org.apache.spark.sql.kinesis.shaded.google.protobuf 279 | 280 | com.google.protobuf.** 281 | 282 | 283 | 284 | 285 | 286 | *:* 287 | 288 | META-INF/LICENSE* 289 | META-INF/NOTICE* 290 | META-INF/DEPENDENCIES 291 | META-INF/maven/** 292 | META-INF/*.xml 293 | META-INF/*.SF 294 | META-INF/*.DSA 295 | META-INF/*.RSA 296 | models/* 297 | .gitkeep 298 | 299 | 300 | 301 | 302 | 303 | 304 | log4j.properties 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | net.alchim31.maven 319 | scala-maven-plugin 320 | 4.3.0 321 | 322 | 323 | org.apache.maven.plugins 324 | maven-shade-plugin 325 | 3.2.1 326 | 327 | 328 | org.scalatest 329 | scalatest-maven-plugin 330 | 2.0.0 331 | 332 | 333 | 334 | target/scala-${scala.binary.version}/classes 335 | target/scala-${scala.binary.version}/test-classes 336 | 337 | 338 | 339 | 340 | release 341 | 342 | 343 | 344 | org.apache.maven.plugins 345 | maven-source-plugin 346 | 3.1.0 347 | 348 | 349 | create-sources-jar 350 | 351 | jar-no-fork 352 | 353 | 354 | 355 | 356 | 357 | org.apache.maven.plugins 358 | maven-gpg-plugin 359 | 1.5 360 | 361 | 362 | sign-artifacts 363 | verify 364 | 365 | sign 366 | 367 | 368 | 369 | 370 | 371 | org.apache.maven.plugins 372 | maven-javadoc-plugin 373 | 2.10.1 374 | 375 | 376 | create-javadoc-jar 377 | 378 | jar 379 | 380 | 381 | 382 | 383 | 384 | org.apache.maven.plugins 385 | maven-release-plugin 386 | 2.5.1 387 | 388 | true 389 | 390 | 391 | 392 | 393 | org.sonatype.plugins 394 | nexus-staging-maven-plugin 395 | 1.6.3 396 | true 397 | 398 | ossrh 399 | https://oss.sonatype.org/ 400 | true 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | ossrh 411 | https://oss.sonatype.org/content/repositories/snapshots 412 | 413 | 414 | ossrh 415 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 416 | 417 | 418 | 419 | 420 | 421 | 422 | org.codehaus.mojo 423 | cobertura-maven-plugin 
424 | 2.7 425 | 426 | 427 | 428 | 429 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/sql/kinesis/AWSInstanceProfileCredentialsProviderWithRetries.java: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.kinesis; 2 | 3 | import com.amazonaws.AmazonClientException; 4 | import com.amazonaws.auth.AWSCredentials; 5 | import com.amazonaws.auth.InstanceProfileCredentialsProvider; 6 | import org.apache.commons.logging.Log; 7 | import org.apache.commons.logging.LogFactory; 8 | 9 | public class AWSInstanceProfileCredentialsProviderWithRetries 10 | extends InstanceProfileCredentialsProvider { 11 | 12 | private static final Log LOG = 13 | LogFactory.getLog(AWSInstanceProfileCredentialsProviderWithRetries.class); 14 | 15 | public AWSCredentials getCredentials() { 16 | int retries = 10; 17 | int sleep = 500; 18 | while(retries > 0) { 19 | try { 20 | return super.getCredentials(); 21 | } 22 | catch (RuntimeException re) { 23 | LOG.error("Got an exception while fetching credentials " + re); 24 | --retries; 25 | try { 26 | Thread.sleep(sleep); 27 | } catch (InterruptedException ie) { 28 | // Do Nothing here 29 | } 30 | if (sleep < 10000) { 31 | sleep *= 2; 32 | } 33 | } 34 | catch (Error error) { 35 | LOG.error("Got an exception while fetching credentials " + error); 36 | --retries; 37 | try { 38 | Thread.sleep(sleep); 39 | } catch (InterruptedException ie) { 40 | // Do Nothing here 41 | } 42 | if (sleep < 10000) { 43 | sleep *= 2; 44 | } 45 | } 46 | } 47 | throw new AmazonClientException("Unable to load credentials."); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | org.apache.spark.sql.kinesis.KinesisSourceProvider -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/CachedKinesisProducer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import java.util.Locale 20 | import java.util.concurrent.{ExecutionException, TimeUnit} 21 | 22 | import scala.collection.JavaConverters._ 23 | import scala.util.control.NonFatal 24 | 25 | import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials} 26 | import com.amazonaws.regions.RegionUtils 27 | import com.amazonaws.services.kinesis.AmazonKinesis 28 | import com.amazonaws.services.kinesis.producer.{KinesisProducer, KinesisProducerConfiguration} 29 | import com.google.common.cache._ 30 | import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} 31 | 32 | import org.apache.spark.SparkEnv 33 | import org.apache.spark.internal.Logging 34 | 35 | private[kinesis] object CachedKinesisProducer extends Logging { 36 | 37 | private type Producer = KinesisProducer 38 | 39 | private lazy val cacheExpireTimeout: Long = 40 | SparkEnv.get.conf.getTimeAsMs("spark.kinesis.producer.cache.timeout", "10m") 41 | 42 | private val cacheLoader = new CacheLoader[Seq[(String, Object)], Producer] { 43 | override def load(config: Seq[(String, Object)]): Producer = { 44 | val configMap = config.map(x => x._1 -> x._2.toString).toMap 45 | createKinesisProducer(configMap) 46 | } 47 | } 48 | 49 | private val removalListener = new RemovalListener[Seq[(String, Object)], Producer]() { 50 | override def onRemoval(notification: 51 | RemovalNotification[Seq[(String, Object)], Producer]): Unit = { 52 | val paramsSeq: Seq[(String, Object)] = notification.getKey 53 | val producer: Producer = notification.getValue 54 | logDebug( 55 | s"Evicting kinesis producer $producer params: $paramsSeq," + 56 | s" due to ${notification.getCause}") 57 | close(paramsSeq, producer) 58 | } 59 | } 60 | 61 | private lazy val guavaCache: LoadingCache[Seq[(String, Object)], Producer] = 62 | CacheBuilder.newBuilder().expireAfterAccess(cacheExpireTimeout, TimeUnit.MILLISECONDS) 63 | .removalListener(removalListener) 64 | .build[Seq[(String, Object)], Producer](cacheLoader) 65 | 66 | private def createKinesisProducer(producerConfiguration: Map[String, String]): Producer = { 67 | val kinesisParams = producerConfiguration.keySet 68 | .filter(_.toLowerCase(Locale.ROOT).startsWith("kinesis.")) 69 | .map { k => k.drop(8).toString -> producerConfiguration(k) } 70 | .toMap 71 | 72 | val recordMaxBufferedTime = kinesisParams.getOrElse( 73 | KinesisSourceProvider.SINK_RECORD_MAX_BUFFERED_TIME, 74 | KinesisSourceProvider.DEFAULT_SINK_RECORD_MAX_BUFFERED_TIME) 75 | .toLong 76 | 77 | val maxConnections = kinesisParams.getOrElse( 78 | KinesisSourceProvider.SINK_MAX_CONNECTIONS, 79 | KinesisSourceProvider.DEFAULT_SINK_MAX_CONNECTIONS) 80 | .toInt 81 | 82 | val awsAccessKeyId = producerConfiguration.getOrElse( 83 | KinesisSourceProvider.AWS_ACCESS_KEY_ID, "").toString 84 | 85 | val awsSecretKey = producerConfiguration.getOrElse( 86 | KinesisSourceProvider.AWS_SECRET_KEY, "").toString 87 | 88 | var sessionToken = producerConfiguration.getOrElse( 89 | KinesisSourceProvider.AWS_SESSION_TOKEN, "").toString 90 | 91 | val awsStsRoleArn = producerConfiguration.getOrElse( 92 | KinesisSourceProvider.AWS_STS_ROLE_ARN, "").toString 93 | 94 | val awsStsSessionName = producerConfiguration.getOrElse( 95 | KinesisSourceProvider.AWS_STS_SESSION_NAME, "").toString 96 | 97 | val awsUseInstanceProfile = producerConfiguration.getOrElse( 98 | KinesisSourceProvider.AWS_USE_INSTANCE_PROFILE, "true").toBoolean 99 | 100 | val endpoint = producerConfiguration.getOrElse( 101 | 
KinesisSourceProvider.SINK_ENDPOINT_URL, KinesisSourceProvider.DEFAULT_KINESIS_ENDPOINT_URL) 102 | .toString 103 | 104 | val aggregation = producerConfiguration.getOrElse( 105 | KinesisSourceProvider.SINK_AGGREGATION_ENABLED, 106 | KinesisSourceProvider.DEFAULT_SINK_AGGREGATION) 107 | .toBoolean 108 | 109 | val region = getRegionNameByEndpoint(endpoint) 110 | 111 | val kinesisCredsProvider = if (awsAccessKeyId.length > 0) { 112 | if(sessionToken.length > 0) { 113 | BasicAWSSessionCredentials(awsAccessKeyId, awsSecretKey, sessionToken) 114 | } else { 115 | BasicCredentials(awsAccessKeyId, awsSecretKey) 116 | } 117 | } else if (awsStsRoleArn.length > 0) { 118 | STSCredentials(awsStsRoleArn, awsStsSessionName) 119 | } else if (awsUseInstanceProfile) { 120 | InstanceProfileCredentials 121 | } else { 122 | DefaultCredentials 123 | } 124 | 125 | val kinesisProducer = new Producer(new KinesisProducerConfiguration() 126 | .setRecordMaxBufferedTime(recordMaxBufferedTime) 127 | .setMaxConnections(maxConnections) 128 | .setAggregationEnabled(aggregation) 129 | .setCredentialsProvider( 130 | kinesisCredsProvider.provider 131 | ) 132 | .setRegion(region) 133 | ) 134 | logDebug(s"Created a new instance of KinesisProducer for $producerConfiguration.") 135 | kinesisProducer 136 | } 137 | 138 | private[kinesis] def getOrCreate(kinesisParams: Map[String, String]): Producer = { 139 | val paramsSeq: Seq[(String, Object)] = paramsToSeq(kinesisParams) 140 | try { 141 | guavaCache.get(paramsSeq) 142 | } catch { 143 | case e@(_: ExecutionException | _: UncheckedExecutionException | _: ExecutionError) 144 | if e.getCause != null => 145 | throw e.getCause 146 | } 147 | } 148 | 149 | private def paramsToSeq(kinesisParams: Map[String, String]): Seq[(String, Object)] = { 150 | val paramsSeq: Seq[(String, Object)] = kinesisParams.toSeq.sortBy(x => x._1) 151 | paramsSeq 152 | } 153 | 154 | /** For explicitly closing kinesis producer */ 155 | private[kinesis] def close(kinesisParams: Map[String, String]): Unit = { 156 | val paramsSeq = paramsToSeq(kinesisParams) 157 | guavaCache.invalidate(paramsSeq) 158 | } 159 | 160 | /** Auto close on cache evict */ 161 | private def close(paramsSeq: Seq[(String, Object)], producer: Producer): Unit = { 162 | try { 163 | logInfo(s"Closing the KinesisProducer with params: ${paramsSeq.mkString("\n")}.") 164 | producer.flushSync() 165 | producer.destroy() 166 | } catch { 167 | case NonFatal(e) => logWarning("Error while closing kinesis producer.", e) 168 | } 169 | } 170 | 171 | private def clear(): Unit = { 172 | logInfo("Cleaning up guava cache.") 173 | guavaCache.invalidateAll() 174 | } 175 | 176 | def getRegionNameByEndpoint(endpoint: String): String = { 177 | val uri = new java.net.URI(endpoint) 178 | RegionUtils.getRegionsForService(AmazonKinesis.ENDPOINT_PREFIX) 179 | .asScala 180 | .find(_.getAvailableEndpoints.asScala.toSeq.contains(uri.getHost)) 181 | .map(_.getName) 182 | .getOrElse( 183 | throw new IllegalArgumentException(s"Could not resolve region for endpoint: $endpoint")) 184 | } 185 | 186 | } 187 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/HDFSMetadataCommitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
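CachedKinesisProducer above keys its Guava cache on the sink options sorted by name, lazily builds a KinesisProducer on first use, and flushes and destroys it when an entry is evicted or explicitly invalidated. A rough, package-internal sketch of that lifecycle (the option values are placeholders; the keys are the constants read by createKinesisProducer):

// compiles only inside package org.apache.spark.sql.kinesis: getOrCreate/close are private[kinesis]
val sinkOptions = Map(
  "endpointurl"    -> "https://kinesis.us-east-1.amazonaws.com", // used to resolve the producer region
  "awsaccesskeyid" -> "AKIA...",                                 // empty => STS role or instance profile
  "awssecretkey"   -> "...")

val producer = CachedKinesisProducer.getOrCreate(sinkOptions) // identical sorted options reuse the cached producer
// ... addUserRecord(...) calls issued by the writer task ...
CachedKinesisProducer.close(sinkOptions) // invalidates the entry; flushSync() and destroy() run in the removal listener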
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io.{FileNotFoundException, InputStream, InputStreamReader, OutputStream} 21 | import java.nio.charset.StandardCharsets 22 | import java.util.{EnumSet, Locale} 23 | 24 | import org.apache.commons.io.IOUtils 25 | import org.apache.hadoop.conf.Configuration 26 | import org.apache.hadoop.fs._ 27 | import org.apache.hadoop.fs.permission.FsPermission 28 | import org.json4s.NoTypeHints 29 | import org.json4s.jackson.Serialization 30 | import scala.reflect.ClassTag 31 | import scala.util.control.NonFatal 32 | 33 | import org.apache.spark.internal.Logging 34 | import org.apache.spark.util.SerializableConfiguration 35 | 36 | 37 | /* 38 | [[HDFSFileCommitter]] is used by executors to commit metadata to a HDFS location 39 | It is similar to [[HDFSMetadataLog]]. Difference is that it does not use 40 | [[SparkSession]] while creating fileContext. Hence it can used by executors. 41 | We could have modified [[HDFSMetadataLog]] but then changes for kinesis support 42 | would not have been contained within an external jar 43 | */ 44 | 45 | class HDFSMetadataCommitter[T <: AnyRef : ClassTag](path: String, 46 | hadoopConf: SerializableConfiguration, 47 | options: Map[String, String] = Map.empty[String, String]) 48 | extends MetadataCommitter[T] with Logging with Serializable{ 49 | 50 | 51 | private implicit val formats = Serialization.formats(NoTypeHints) 52 | 53 | /** Needed to serialize type T into JSON when using Jackson */ 54 | private implicit val manifest = Manifest.classType[T](implicitly[ClassTag[T]].runtimeClass) 55 | 56 | val metadataPath = new Path(path, "shard-commit") 57 | 58 | protected val fileContext = FileContext.getFileContext( 59 | metadataPath.toUri, hadoopConf.value.asInstanceOf[ Configuration ]) 60 | 61 | if ( !fileContext.util().exists(metadataPath) ) { 62 | fileContext.mkdir(metadataPath, FsPermission.getDirDefault, true) 63 | } 64 | 65 | private val numRetries: Int = { 66 | options.getOrElse("executor.metadata.hdfs.numretries", "3").toInt 67 | } 68 | 69 | private val retryIntervalMs: Long = { 70 | options.getOrElse("executor.metadata.hdfs.retryIntervalMs".toLowerCase(Locale.ROOT), 71 | "1000").toLong 72 | } 73 | 74 | private val maxRetryIntervalMs: Long = { 75 | options.getOrElse("executor.metadata.hdfs.maxRetryIntervalMs".toLowerCase(Locale.ROOT), 76 | "10000").toLong 77 | } 78 | 79 | 80 | /* 81 | * A `PathFilter` to filter only batch files 82 | */ 83 | 84 | protected val batchFilesFilter = new PathFilter { 85 | override def accept(path: Path): Boolean = isBatchFile(path) 86 | } 87 | 88 | protected def batchIdToPath(batchId: Long): Path = { 89 | new Path(metadataPath, batchId.toString) 90 | } 91 | 92 | protected def pathToBatchId(path: Path) = { 93 | path.getName.toLong 94 | } 95 | 96 | protected def isBatchFile(path: Path) = { 97 | try { 98 | path.getName.toLong 99 | true 100 | } catch { 101 | case _: 
NumberFormatException => false 102 | } 103 | } 104 | 105 | protected def serialize(metadata: T, out: OutputStream): Unit = { 106 | // called inside a try-finally where the underlying stream is closed in the caller 107 | Serialization.write(metadata, out) 108 | } 109 | 110 | protected def deserialize(in: InputStream): T = { 111 | // called inside a try-finally where the underlying stream is closed in the caller 112 | val reader = new InputStreamReader(in, StandardCharsets.UTF_8) 113 | Serialization.read[T](reader) 114 | } 115 | 116 | def create(batchId: Long): Unit = { 117 | val newPath = batchIdToPath(batchId) 118 | if ( !fileContext.util().exists(newPath) ) { 119 | fileContext.mkdir(newPath, FsPermission.getDirDefault, true) 120 | } 121 | } 122 | 123 | override def add(batchId: Long, shardId: String, metadata: T): Boolean = { 124 | require(metadata != null, "'null' metadata cannot written to a shard commit log") 125 | create(batchId) 126 | val shardCommitPath = new Path(batchIdToPath(batchId), shardId) 127 | import CreateFlag._ 128 | import Options._ 129 | 130 | val output = fileContext.create(shardCommitPath, 131 | EnumSet.of(CREATE, OVERWRITE), CreateOpts.checksumParam(ChecksumOpt.createDisabled())) 132 | try { 133 | serialize(metadata, output) 134 | output.close() 135 | } catch { 136 | case e: Throwable => 137 | // close the open stream and delete the new file added 138 | output.close() 139 | withRetry[Boolean]("deleting cancelled metadataFile") { 140 | fileContext.delete(shardCommitPath, false) 141 | } 142 | // throw the exception again so that the caller knows that add operation was not successful 143 | throw e 144 | } 145 | true 146 | } 147 | 148 | override def get(batchId: Long): Seq[T] = { 149 | val batchMetadataDir = batchIdToPath(batchId) 150 | withRetry[ Seq[ T ] ]("fetching MetaData") { 151 | if ( fileContext.util().exists(batchMetadataDir) ) { 152 | fileContext.util().listStatus(batchMetadataDir).map { f => 153 | getData(f.getPath) match { 154 | case Some(data) => data 155 | case None => 156 | // return if there is any one filepath from which we could not read any data 157 | logDebug(s"Unable to get data for ${f.getPath}") 158 | throw new IllegalStateException(s"Failed to get metadata for ${f.getPath}") 159 | } 160 | }.toSeq 161 | } else { 162 | logDebug(s"Unable to find batch $batchMetadataDir") 163 | throw new IllegalStateException(s"$batchMetadataDir does not exist") 164 | } 165 | } 166 | } 167 | 168 | def getData(path: Path): Option[ T ] = { 169 | if ( fileContext.util().exists(path) ) { 170 | val input = fileContext.open(path) 171 | try { 172 | Some(deserialize(input)) 173 | } catch { 174 | case ise: IllegalStateException => // re-throw the exception with the log file path added 175 | throw new IllegalStateException(s"Failed to read log file ${path}. " + 176 | s"${ise.getMessage}", ise) 177 | } finally { 178 | IOUtils.closeQuietly(input) 179 | } 180 | } else { 181 | logDebug(s"Unable to find file $path") 182 | None 183 | } 184 | } 185 | 186 | def delete(batchId: Long): Unit = { 187 | val batchMetadataDir = batchIdToPath(batchId) 188 | delete(batchMetadataDir) 189 | } 190 | 191 | def delete(path: Path): Unit = { 192 | try { 193 | fileContext.delete(path, true) 194 | } catch { 195 | case e: FileNotFoundException => 196 | // ignore if file has already been deleted 197 | } 198 | } 199 | 200 | /* 201 | * Removes all the log entry earlier than thresholdBatchId (exclusive). 
202 | */ 203 | override def purge(thresholdBatchId: Long): Unit = { 204 | val batchIds = fileContext.util().listStatus(metadataPath, batchFilesFilter) 205 | .map(f => pathToBatchId(f.getPath)) 206 | 207 | for (batchId <- batchIds if batchId < thresholdBatchId) { 208 | val path = batchIdToPath(batchId) 209 | delete(path) 210 | logTrace(s"Removed metadata log file: $path") 211 | } 212 | } 213 | 214 | /** Helper method to retry with exponential backoff */ 215 | def withRetry[ T ](message: String, ignoreException: Boolean = true)(body: => T): T = { 216 | var retryCount = 0 217 | var result: Option[ T ] = None 218 | var waitTimeInterval = retryIntervalMs 219 | var lastError: Throwable = null 220 | 221 | def isMaxRetryDone = retryCount >= numRetries 222 | 223 | while (result.isEmpty && !isMaxRetryDone) { 224 | if ( retryCount > 0 ) { // wait only if this is a retry 225 | Thread.sleep(waitTimeInterval) 226 | waitTimeInterval = scala.math.min(waitTimeInterval * 2, maxRetryIntervalMs) 227 | } 228 | try { 229 | result = Some(body) 230 | } catch { 231 | case NonFatal(t) => lastError = t 232 | if ( ignoreException ) { 233 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", t) 234 | } else { 235 | throw new IllegalStateException(s"Error while $message", t) 236 | } 237 | } 238 | retryCount += 1 239 | } 240 | result.getOrElse { 241 | throw new IllegalStateException(s"Gave up after $retryCount retries while $message," + 242 | s" last exception: ", lastError) 243 | } 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisPosition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
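The committer above writes one JSON file per shard under <path>/shard-commit/<batchId>/<shardId> and wraps reads and deletes in withRetry's exponential backoff. A minimal, package-internal sketch of the calls it exposes (checkpoint path, batch id and sequence number are illustrative):

import org.apache.hadoop.conf.Configuration
import org.apache.spark.util.SerializableConfiguration

val committer = new HDFSMetadataCommitter[ShardInfo](
  "/checkpoints/kinesis-query/sources/0",
  new SerializableConfiguration(new Configuration()))

val info = ShardInfo("shardId-000000000000", "AFTER_SEQUENCE_NUMBER", "49605240428...")
committer.add(7L, info.shardId, info)            // one file per shard for batch 7
val restored: Seq[ShardInfo] = committer.get(7L) // throws IllegalStateException if the batch directory is missing
committer.purge(5L)                              // removes batch directories with id < 5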
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.json4s.NoTypeHints 21 | import org.json4s.jackson.Serialization 22 | 23 | trait KinesisPosition extends Serializable { 24 | val iteratorType: String 25 | val iteratorPosition: String 26 | 27 | override def toString: String = s"KinesisPosition($iteratorType, $iteratorPosition)" 28 | } 29 | 30 | class TrimHorizon() extends KinesisPosition { 31 | override val iteratorType = "TRIM_HORIZON" 32 | override val iteratorPosition = "" 33 | } 34 | 35 | class Latest() extends KinesisPosition { 36 | override val iteratorType = "LATEST" 37 | override val iteratorPosition = "" 38 | } 39 | 40 | class AtTimeStamp(timestamp: String) extends KinesisPosition { 41 | def this(timestamp: Long) { 42 | this(timestamp.toString) 43 | } 44 | override val iteratorType = "AT_TIMESTAMP" 45 | override val iteratorPosition = timestamp.toString 46 | } 47 | 48 | class AfterSequenceNumber(seqNumber: String) extends KinesisPosition { 49 | override val iteratorType = "AFTER_SEQUENCE_NUMBER" 50 | override val iteratorPosition = seqNumber 51 | } 52 | 53 | class AtSequenceNumber(seqNumber: String) extends KinesisPosition { 54 | override val iteratorType = "AT_SEQUENCE_NUMBER" 55 | override val iteratorPosition = seqNumber 56 | } 57 | 58 | class ShardEnd() extends KinesisPosition { 59 | override val iteratorType = "SHARD_END" 60 | override val iteratorPosition = "" 61 | } 62 | 63 | private[kinesis] object KinesisPosition { 64 | def make(iteratorType: String, iteratorPosition: String): KinesisPosition = iteratorType match { 65 | case iterType if "TRIM_HORIZON".equalsIgnoreCase(iterType) => new TrimHorizon() 66 | case iterType if "LATEST".equalsIgnoreCase(iterType) => new Latest() 67 | case iterType if "AT_TIMESTAMP".equalsIgnoreCase(iterType) => new AtTimeStamp(iteratorPosition) 68 | case iterType if "AT_SEQUENCE_NUMBER".equalsIgnoreCase(iterType) => 69 | new AtSequenceNumber(iteratorPosition) 70 | case iterType if "AFTER_SEQUENCE_NUMBER".equalsIgnoreCase(iterType) => 71 | new AfterSequenceNumber(iteratorPosition) 72 | case iterType if "SHARD_END".equalsIgnoreCase(iterType) => new ShardEnd() 73 | } 74 | } 75 | 76 | /** 77 | * Specifies initial position in Kenesis to start read from on the application startup. 78 | * @param shardPositions map of shardId->KinesisPosition 79 | * @param defaultPosition position that is used for shard that is requested but not present in map 80 | */ 81 | private[kinesis] class InitialKinesisPosition(shardPositions: Map[String, KinesisPosition], 82 | defaultPosition: KinesisPosition) 83 | extends Serializable { 84 | 85 | def shardPosition(shardId: String): KinesisPosition = 86 | shardPositions.getOrElse(shardId, defaultPosition) 87 | 88 | override def toString: String = s"InitialKinesisPosition($shardPositions)" 89 | } 90 | 91 | private[kinesis] object InitialKinesisPosition { 92 | implicit val format = Serialization.formats(NoTypeHints) 93 | 94 | def fromPredefPosition(pos: KinesisPosition): InitialKinesisPosition = 95 | new InitialKinesisPosition(Map(), pos) 96 | 97 | /** 98 | * Parses json representation on Kinesis position. 99 | * It is useful if Kinesis position is persisted explicitly (e.g. at the end of the batch) 100 | * and used to continue reading records from the same position on Spark application redeploy. 
101 | * Kinesis position JSON representation example: 102 | * {{{ 103 | * { 104 | * "shardId-000000000001":{ 105 | * "iteratorType":"AFTER_SEQUENCE_NUMBER", 106 | * "iteratorPosition":"49605240428222307037115827613554798409561082419642105874" 107 | * }, 108 | * "metadata":{ 109 | * "streamName":"my.cool.stream2", 110 | * "batchId":"7" 111 | * }, 112 | * "shardId-000000000000":{ 113 | * "iteratorType":"AFTER_SEQUENCE_NUMBER", 114 | * "iteratorPosition":"49605240428200006291917297020490128157480794051565322242" 115 | * } 116 | * } 117 | * }}} 118 | * @param text JSON representation of Kinesis position. 119 | * @return 120 | */ 121 | def fromCheckpointJson(text: String, defaultPosition: KinesisPosition): InitialKinesisPosition = { 122 | val kso = KinesisSourceOffset(text) 123 | val shardOffsets = kso.shardsToOffsets 124 | 125 | new InitialKinesisPosition( 126 | shardOffsets.shardInfoMap 127 | .map(si => si._1 -> KinesisPosition.make(si._2.iteratorType, si._2.iteratorPosition)), 128 | defaultPosition 129 | ) 130 | } 131 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
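InitialKinesisPosition above either applies a single predefined position to every shard or, when checkpoint JSON is supplied, resumes each known shard from its saved iterator and falls back to the default for shards it has not seen. A small sketch (stream name, shard ids and sequence number are illustrative):

// start every shard from the oldest available record
val fromEarliest = InitialKinesisPosition.fromPredefPosition(new TrimHorizon())

// resume from an explicitly persisted offset; unknown shards default to "now"
val checkpointJson =
  """{"metadata":{"streamName":"my.cool.stream2","batchId":"7"},
    |"shardId-000000000000":{"iteratorType":"AFTER_SEQUENCE_NUMBER","iteratorPosition":"49605240428..."}}"""
    .stripMargin
val resumed = InitialKinesisPosition.fromCheckpointJson(
  checkpointJson, new AtTimeStamp(System.currentTimeMillis))

resumed.shardPosition("shardId-000000000000") // AFTER_SEQUENCE_NUMBER at the saved position
resumed.shardPosition("shardId-000000000042") // falls back to the AT_TIMESTAMP default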
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.math.BigInteger 21 | import java.util 22 | import java.util.{ArrayList, Locale} 23 | import java.util.concurrent.{Executors, ThreadFactory} 24 | 25 | import com.amazonaws.AbortedException 26 | import com.amazonaws.services.kinesis.AmazonKinesisClient 27 | import com.amazonaws.services.kinesis.clientlibrary.types.UserRecord 28 | import com.amazonaws.services.kinesis.model.{GetRecordsRequest, ListShardsRequest, Shard, _} 29 | import scala.collection.JavaConverters._ 30 | import scala.concurrent.{ExecutionContext, Future} 31 | import scala.concurrent.duration.Duration 32 | import scala.util.control.NonFatal 33 | 34 | import org.apache.spark.internal.Logging 35 | import org.apache.spark.sql.types._ 36 | import org.apache.spark.util.{ThreadUtils, UninterruptibleThread} 37 | 38 | 39 | // This class uses Kinesis API to read data offsets from Kinesis 40 | 41 | private[kinesis] case class KinesisReader( 42 | readerOptions: Map[String, String], 43 | streamName: String, 44 | kinesisCredsProvider: SparkAWSCredentials, 45 | endpointUrl: String 46 | ) extends Serializable with Logging { 47 | 48 | /* 49 | * Used to ensure execute fetch operations execute in an UninterruptibleThread 50 | */ 51 | val kinesisReaderThread = Executors.newSingleThreadExecutor(new ThreadFactory { 52 | override def newThread(r: Runnable): Thread = { 53 | val t = new UninterruptibleThread("Kinesis Reader") { 54 | override def run(): Unit = { 55 | r.run() 56 | } 57 | } 58 | t.setDaemon(true) 59 | t 60 | } 61 | }) 62 | 63 | val execContext = ExecutionContext.fromExecutorService(kinesisReaderThread) 64 | 65 | private val maxOffsetFetchAttempts = 66 | readerOptions.getOrElse("client.numRetries".toLowerCase(Locale.ROOT), "3").toInt 67 | 68 | private val offsetFetchAttemptIntervalMs = 69 | readerOptions.getOrElse("client.retryIntervalMs".toLowerCase(Locale.ROOT), "1000").toLong 70 | 71 | private val maxRetryIntervalMs: Long = { 72 | readerOptions.getOrElse("client.maxRetryIntervalMs".toLowerCase(Locale.ROOT), "10000").toLong 73 | } 74 | 75 | private val maxSupportedShardsPerStream = 10000; 76 | 77 | private var _amazonClient: AmazonKinesisClient = null 78 | 79 | private def getAmazonClient(): AmazonKinesisClient = { 80 | if (_amazonClient == null) { 81 | _amazonClient = new AmazonKinesisClient(kinesisCredsProvider.provider) 82 | _amazonClient.setEndpoint(endpointUrl) 83 | } 84 | _amazonClient 85 | } 86 | 87 | def getShards(): Seq[Shard] = { 88 | val shards = listShards 89 | logInfo(s"List shards in Kinesis Stream: ${shards}") 90 | shards 91 | } 92 | 93 | def close(): Unit = { 94 | runUninterruptibly { 95 | if (_amazonClient != null) { 96 | _amazonClient.shutdown() 97 | _amazonClient = null 98 | } 99 | } 100 | kinesisReaderThread.shutdown() 101 | } 102 | 103 | def getShardIterator(shardId: String, 104 | iteratorType: String, 105 | iteratorPosition: String, 106 | failOnDataLoss: Boolean = true): String = { 107 | 108 | val getShardIteratorRequest = new GetShardIteratorRequest 109 | getShardIteratorRequest.setShardId(shardId) 110 | getShardIteratorRequest.setStreamName(streamName) 111 | getShardIteratorRequest.setShardIteratorType(iteratorType) 112 | 113 | if (iteratorType == "AFTER_SEQUENCE_NUMBER" || iteratorType == "AT_SEQUENCE_NUMBER") { 114 | getShardIteratorRequest.setStartingSequenceNumber(iteratorPosition) 115 | } 116 | 117 | if (iteratorType == "AT_TIMESTAMP") { 118 | logDebug(s"TimeStamp while getting shard iterator ${ 119 | (new 
java.util.Date(iteratorPosition.toLong)).toString}") 120 | getShardIteratorRequest.setTimestamp(new java.util.Date(iteratorPosition.toLong)) 121 | } 122 | 123 | runUninterruptibly { 124 | retryOrTimeout[GetShardIteratorResult]( 125 | s"Fetching Shard Iterator") { 126 | try { 127 | getAmazonClient.getShardIterator(getShardIteratorRequest) 128 | } catch { 129 | case r: ResourceNotFoundException => 130 | if (!failOnDataLoss) { 131 | new GetShardIteratorResult() 132 | } 133 | else { 134 | throw r 135 | } 136 | } 137 | } 138 | }.getShardIterator 139 | } 140 | 141 | 142 | def getKinesisRecords(shardIterator: String, limit: Int): GetRecordsResult = { 143 | val getRecordsRequest = new GetRecordsRequest 144 | getRecordsRequest.setShardIterator(shardIterator) 145 | getRecordsRequest.setLimit(limit) 146 | val getRecordsResult: GetRecordsResult = runUninterruptibly { 147 | retryOrTimeout[ GetRecordsResult ](s"get Records for a shard ") { 148 | getAmazonClient.getRecords(getRecordsRequest) 149 | } 150 | } 151 | getRecordsResult 152 | } 153 | 154 | 155 | def deaggregateRecords(records: util.List[ Record ], shard: Shard): util.List[ Record] = { 156 | // We deaggregate if and only if we got actual Kinesis records, i.e. 157 | // not instances of some subclass thereof. 158 | if ( !records.isEmpty && records.get(0).getClass.equals(classOf[ Record ]) ) { 159 | if ( shard != null ) { 160 | return UserRecord.deaggregate( 161 | records, 162 | new BigInteger(shard.getHashKeyRange.getStartingHashKey), 163 | new BigInteger(shard.getHashKeyRange.getEndingHashKey)) 164 | .asInstanceOf[ util.List[ _ ] ].asInstanceOf[ util.List[ Record ] ] 165 | } else { 166 | return UserRecord.deaggregate(records) 167 | .asInstanceOf[ util.List[ _ ] ].asInstanceOf[ util.List[ Record ] ] 168 | } 169 | } 170 | records 171 | } 172 | 173 | private def listShards(): Seq[Shard] = { 174 | var nextToken = "" 175 | var returnedToken = "" 176 | val shards = new ArrayList[Shard]() 177 | val listShardsRequest = new ListShardsRequest 178 | listShardsRequest.setStreamName(streamName) 179 | listShardsRequest.setMaxResults(maxSupportedShardsPerStream) 180 | 181 | do { 182 | val listShardsResult: ListShardsResult = runUninterruptibly { 183 | retryOrTimeout[ListShardsResult]( s"List shards") { 184 | getAmazonClient.listShards(listShardsRequest) 185 | } 186 | } 187 | shards.addAll(listShardsResult.getShards) 188 | returnedToken = listShardsResult.getNextToken() 189 | if (returnedToken != null) { 190 | nextToken = returnedToken 191 | listShardsRequest.setNextToken(nextToken) 192 | } 193 | } while (!nextToken.isEmpty) 194 | 195 | shards.asScala.toSeq 196 | } 197 | 198 | /* 199 | * This method ensures that the closure is called in an [[UninterruptibleThread]]. 200 | * This is required when communicating with the AWS. 
In the case 201 | */ 202 | private def runUninterruptibly[T](body: => T): T = { 203 | if (!Thread.currentThread.isInstanceOf[UninterruptibleThread]) { 204 | val future = Future { 205 | body 206 | }(execContext) 207 | ThreadUtils.awaitResult(future, Duration.Inf) 208 | } else { 209 | body 210 | } 211 | } 212 | 213 | /** Helper method to retry Kinesis API request with exponential backoff and timeouts */ 214 | private def retryOrTimeout[T](message: String)(body: => T): T = { 215 | assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) 216 | 217 | val startTimeMs = System.currentTimeMillis() 218 | var retryCount = 0 219 | var result: Option[T] = None 220 | var lastError: Throwable = null 221 | var waitTimeInterval = offsetFetchAttemptIntervalMs 222 | 223 | def isMaxRetryDone = retryCount >= maxOffsetFetchAttempts 224 | 225 | while (result.isEmpty && !isMaxRetryDone) { 226 | if ( retryCount > 0 ) { // wait only if this is a retry 227 | Thread.sleep(waitTimeInterval) 228 | waitTimeInterval = scala.math.min(waitTimeInterval * 2, maxRetryIntervalMs) 229 | } 230 | try { 231 | result = Some(body) 232 | } catch { 233 | case NonFatal(t) => 234 | lastError = t 235 | t match { 236 | case ptee: ProvisionedThroughputExceededException => 237 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ptee) 238 | case lee: LimitExceededException => 239 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", lee) 240 | case ae: AbortedException => 241 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ae) 242 | case ake: AmazonKinesisException => 243 | if (ake.getStatusCode() >= 500) { 244 | logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ake) 245 | } else { 246 | throw new IllegalStateException(s"Error while $message", ake) 247 | } 248 | case e: Throwable => 249 | throw new IllegalStateException(s"Error while $message", e) 250 | } 251 | } 252 | retryCount += 1 253 | } 254 | result.getOrElse { 255 | throw new IllegalStateException( 256 | s"Gave up after $retryCount retries while $message, last exception: ", lastError) 257 | } 258 | } 259 | 260 | } 261 | 262 | 263 | private [kinesis] object KinesisReader { 264 | 265 | val kinesisSchema: StructType = 266 | StructType(Seq( 267 | StructField("data", BinaryType), 268 | StructField("streamName", StringType), 269 | StructField("partitionKey", StringType), 270 | StructField("sequenceNumber", StringType), 271 | StructField("approximateArrivalTimestamp", TimestampType)) 272 | ) 273 | } 274 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
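KinesisReader above funnels every AWS call through an UninterruptibleThread and retries throttling and 5xx failures with exponential backoff, so callers only see the final result or an IllegalStateException. A rough, package-internal sketch of the driver-side calls (stream name and endpoint are placeholders):

val reader = KinesisReader(
  readerOptions = Map("client.numretries" -> "3"),
  streamName = "my.cool.stream2",
  kinesisCredsProvider = InstanceProfileCredentials, // any SparkAWSCredentials implementation
  endpointUrl = "https://kinesis.us-east-1.amazonaws.com")

val shards = reader.getShards()
val shardIterator = reader.getShardIterator(shards.head.getShardId, "TRIM_HORIZON", "")
val batch = reader.getKinesisRecords(shardIterator, limit = 100)
val records = reader.deaggregateRecords(batch.getRecords, shards.head) // expands KPL-aggregated records
reader.close()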
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.{DataFrame, SQLContext} 22 | import org.apache.spark.sql.execution.streaming.Sink 23 | import org.apache.spark.sql.streaming.OutputMode 24 | 25 | private[kinesis] class KinesisSink(sqlContext: SQLContext, 26 | sinkOptions: Map[String, String], 27 | outputMode: OutputMode) 28 | extends Sink with Logging { 29 | 30 | @volatile private var latestBatchId = -1L 31 | 32 | override def toString: String = "KinesisSink" 33 | 34 | override def addBatch(batchId: Long, data: DataFrame): Unit = { 35 | if (batchId <= latestBatchId) { 36 | logInfo(s"Skipping already committed batch $batchId") 37 | } else { 38 | KinesisWriter.write(sqlContext.sparkSession, data.queryExecution, sinkOptions) 39 | latestBatchId = batchId 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io._ 21 | import java.util.Locale 22 | import java.util.concurrent.atomic.AtomicBoolean 23 | 24 | import com.amazonaws.services.kinesis.model.Record 25 | import org.apache.hadoop.conf.Configuration 26 | import scala.collection.parallel.ForkJoinTaskSupport 27 | 28 | import org.apache.spark.SparkContext 29 | import org.apache.spark.internal.Logging 30 | import org.apache.spark.sql._ 31 | import org.apache.spark.sql.catalyst.InternalRow 32 | import org.apache.spark.sql.catalyst.util.DateTimeUtils 33 | import org.apache.spark.sql.execution.streaming.{Offset, Source, _} 34 | import org.apache.spark.sql.types._ 35 | import org.apache.spark.unsafe.types.UTF8String 36 | import org.apache.spark.util.{SerializableConfiguration, ThreadUtils, Utils} 37 | 38 | /* 39 | * A [[Source]] that reads data from Kinesis using the following design. 40 | * 41 | * - The [[KinesisSourceOffset]] is the custom [[Offset]] defined for this source 42 | * 43 | * - The [[KinesisSource]] written to do the following. 44 | * 45 | * - `getOffset()` uses the [[KinesisSourceOffset]] to query the latest 46 | * available offsets, which are returned as a [[KinesisSourceOffset]]. 
47 | * 48 | * - `getBatch()` returns a DF 49 | * - The DF returned is based on [[KinesisSourceRDD]] 50 | */ 51 | 52 | private[kinesis] class KinesisSource( 53 | sqlContext: SQLContext, 54 | sourceOptions: Map[String, String], 55 | metadataPath: String, 56 | streamName: String, 57 | initialPosition: InitialKinesisPosition, 58 | endPointURL: String, 59 | kinesisCredsProvider: SparkAWSCredentials, 60 | failOnDataLoss: Boolean = true 61 | ) 62 | extends Source with Serializable with Logging { 63 | 64 | import KinesisSource._ 65 | 66 | private def sc: SparkContext = { 67 | sqlContext.sparkContext 68 | } 69 | 70 | private def kinesisReader: KinesisReader = { 71 | new KinesisReader(sourceOptions, streamName, kinesisCredsProvider, endPointURL) 72 | } 73 | 74 | private var currentShardOffsets: Option[ShardOffsets] = None 75 | 76 | private val minBatchesToRetain = sqlContext.sparkSession.sessionState.conf.minBatchesToRetain 77 | require(minBatchesToRetain > 0, "minBatchesToRetain has to be positive") 78 | 79 | private val describeShardInterval: Long = { 80 | Utils.timeStringAsMs(sourceOptions.getOrElse(KinesisSourceProvider.DESCRIBE_SHARD_INTERVAL, 81 | "1s")) 82 | } 83 | 84 | require(describeShardInterval >= 0, "describeShardInterval cannot be less than 0 sec") 85 | 86 | private var latestDescribeShardTimestamp: Long = -1L 87 | 88 | private def metadataCommitter: MetadataCommitter[ShardInfo] = { 89 | metaDataCommitterType.toLowerCase(Locale.ROOT) match { 90 | case "hdfs" => 91 | new HDFSMetadataCommitter[ ShardInfo ](metaDataCommitterPath, 92 | hadoopConf(sqlContext), sourceOptions) 93 | case _ => throw new IllegalArgumentException("only HDFS is supported") 94 | } 95 | } 96 | 97 | private def metaDataCommitterType: String = { 98 | sourceOptions.getOrElse("executor.metadata.committer", "hdfs").toString 99 | } 100 | 101 | private def metaDataCommitterPath: String = { 102 | sourceOptions.getOrElse("executor.metadata.path", metadataPath).toString 103 | } 104 | 105 | private val avoidEmptyBatches = 106 | sourceOptions.getOrElse("client.avoidEmptyBatches". 107 | toLowerCase(Locale.ROOT), "false").toBoolean 108 | 109 | private val maxParallelThreads = 110 | sourceOptions.getOrElse("client.maxParallelThreads". 111 | toLowerCase(Locale.ROOT), "8").toInt 112 | 113 | def options: Map[String, String] = { 114 | // This function is used for testing 115 | sourceOptions 116 | } 117 | 118 | def getFailOnDataLoss(): Boolean = { 119 | // This function is used for testing 120 | failOnDataLoss 121 | } 122 | 123 | /** Makes an API call to get one record for a shard. Return true if the call is successful */ 124 | def hasNewData(shardInfo: ShardInfo): Boolean = { 125 | val shardIterator = kinesisReader.getShardIterator( 126 | shardInfo.shardId, 127 | shardInfo.iteratorType, 128 | shardInfo.iteratorPosition) 129 | val records = kinesisReader.getKinesisRecords(shardIterator, 1) 130 | // Return true if we can get back a record. 
Or if we have not reached the end of the stream 131 | (records.getRecords.size() > 0 || records.getMillisBehindLatest.longValue() > 0) 132 | } 133 | 134 | def canCreateNewBatch(shardsInfo: Array[ShardInfo]): Boolean = { 135 | var shardsInfoToCheck = shardsInfo.par 136 | val threadPoolSize = Math.min(maxParallelThreads, shardsInfoToCheck.size) 137 | val evalPool = ThreadUtils.newForkJoinPool("checkCreateNewBatch", threadPoolSize) 138 | shardsInfoToCheck.tasksupport = new ForkJoinTaskSupport(evalPool) 139 | val hasRecords = new AtomicBoolean(false) 140 | try { 141 | shardsInfoToCheck.foreach { s => 142 | if (!hasRecords.get() && hasNewData(s)) { 143 | hasRecords.set(true) 144 | } 145 | } 146 | } finally { 147 | evalPool.shutdown() 148 | } 149 | logDebug(s"Can create new batch = ${hasRecords.get()}") 150 | hasRecords.get() 151 | } 152 | 153 | def hasShardEndAsOffset(shardInfo: Seq[ShardInfo]): Boolean = { 154 | shardInfo.exists { 155 | s: (ShardInfo) => (s.iteratorType.contains(new ShardEnd().iteratorType)) 156 | } 157 | } 158 | 159 | /** Returns the shards position to start reading data from */ 160 | override def getOffset: Option[Offset] = synchronized { 161 | val defaultOffset = new ShardOffsets(-1L, streamName) 162 | val prevBatchId = currentShardOffsets.getOrElse(defaultOffset).batchId 163 | val prevShardsInfo = prevBatchShardInfo(prevBatchId) 164 | 165 | val latestShardInfo: Array[ShardInfo] = { 166 | if (prevBatchId < 0 167 | || latestDescribeShardTimestamp == -1 168 | || ((latestDescribeShardTimestamp + describeShardInterval) < System.currentTimeMillis())) { 169 | val latestShards = kinesisReader.getShards() 170 | latestDescribeShardTimestamp = System.currentTimeMillis() 171 | ShardSyncer.getLatestShardInfo(latestShards, prevShardsInfo, 172 | initialPosition, failOnDataLoss) 173 | } else { 174 | prevShardsInfo 175 | } 176 | }.toArray 177 | 178 | if (!avoidEmptyBatches 179 | || prevBatchId < 0 180 | || hasShardEndAsOffset(latestShardInfo) 181 | || ShardSyncer.hasNewShards(prevShardsInfo, latestShardInfo) 182 | || ShardSyncer.hasDeletedShards(prevShardsInfo, latestShardInfo) 183 | || canCreateNewBatch(latestShardInfo)) { 184 | currentShardOffsets = Some(new ShardOffsets(prevBatchId + 1, streamName, latestShardInfo)) 185 | } else { 186 | log.info("Offsets are unchanged since `kinesis.client.avoidEmptyBatches` is enabled") 187 | } 188 | 189 | currentShardOffsets match { 190 | case None => None 191 | case Some(cso) => Some(KinesisSourceOffset(cso)) 192 | } 193 | } 194 | 195 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = { 196 | logInfo(s"End Offset is ${end.toString}") 197 | val currBatchShardOffset = KinesisSourceOffset.getShardOffsets(end) 198 | val currBatchId = currBatchShardOffset.batchId 199 | var prevBatchId: Long = start match { 200 | case Some(prevBatchStartOffset) => 201 | KinesisSourceOffset.getShardOffsets(prevBatchStartOffset).batchId 202 | case None => -1.toLong 203 | } 204 | assert(prevBatchId <= currBatchId) 205 | 206 | val shardInfos = { 207 | // filter out those shardInfos for which ShardIterator is shard_end 208 | currBatchShardOffset.shardInfoMap.values.toSeq.filter { 209 | s: (ShardInfo) => !(s.iteratorType.contains(new ShardEnd().iteratorType)) 210 | }.sortBy(_.shardId.toString) 211 | } 212 | logInfo(s"Processing ${shardInfos.length} shards from ${shardInfos}") 213 | 214 | // Create an RDD that reads from Kinesis 215 | val kinesisSourceRDD = new KinesisSourceRDD( 216 | sc, 217 | sourceOptions, 218 | streamName, 219 | currBatchId, 220 | 
shardInfos, 221 | kinesisCredsProvider, 222 | endPointURL, 223 | hadoopConf(sqlContext), 224 | metadataPath, 225 | failOnDataLoss) 226 | 227 | val rdd = kinesisSourceRDD.map { r: Record => 228 | InternalRow( 229 | r.getData.array(), 230 | UTF8String.fromString(streamName), 231 | UTF8String.fromString(r.getPartitionKey), 232 | UTF8String.fromString(r.getSequenceNumber), 233 | DateTimeUtils.fromJavaTimestamp( 234 | new java.sql.Timestamp(r.getApproximateArrivalTimestamp.getTime)) 235 | ) 236 | } 237 | 238 | // On recovery, getBatch will get called before getOffset 239 | if (currentShardOffsets.isEmpty) { 240 | currentShardOffsets = Some(currBatchShardOffset) 241 | } 242 | 243 | logInfo("GetBatch generating RDD of offset range: " + 244 | shardInfos.mkString(", ")) 245 | 246 | sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) 247 | 248 | } 249 | 250 | override def schema: StructType = KinesisReader.kinesisSchema 251 | 252 | /** Stop this source and free any resources it has allocated. */ 253 | override def stop(): Unit = synchronized { 254 | kinesisReader.close() 255 | } 256 | 257 | override def commit(end: Offset): Unit = { 258 | val defaultOffset = new ShardOffsets(-1L, streamName) 259 | val currBatchId = currentShardOffsets.getOrElse(defaultOffset).batchId 260 | val thresholdBatchId = currBatchId - minBatchesToRetain 261 | if (thresholdBatchId >= 0) { 262 | logInfo(s"Purging Committed Entries. ThresholdBatchId = ${thresholdBatchId}") 263 | metadataCommitter.purge(thresholdBatchId) 264 | } 265 | } 266 | 267 | override def toString(): String = s"KinesisSource[$streamName]" 268 | 269 | private def prevBatchShardInfo(batchId: Long): Seq[ShardInfo] = { 270 | val shardInfo = if (batchId < 0) { 271 | logInfo(s"This is the first batch. Returning Empty sequence") 272 | Seq.empty[ShardInfo] 273 | } else { 274 | logDebug(s"BatchId of previously executed batch is $batchId") 275 | val prevShardinfo = metadataCommitter.get(batchId) 276 | if (prevShardinfo.isEmpty) { 277 | throw new IllegalStateException(s"Unable to fetch " + 278 | s"committed metadata from previous batch. Some data may have been missed") 279 | } 280 | prevShardinfo 281 | } 282 | logDebug(s"Shard Info is ${shardInfo.mkString(", ")}") 283 | shardInfo 284 | } 285 | 286 | } 287 | 288 | object KinesisSource { 289 | 290 | val VERSION = 1 291 | 292 | private var _hadoopConf: SerializableConfiguration = null 293 | 294 | def hadoopConf(sqlContext: SQLContext): SerializableConfiguration = { 295 | if (_hadoopConf == null) { 296 | val conf: Configuration = sqlContext.sparkSession.sessionState.newHadoopConf() 297 | _hadoopConf = new SerializableConfiguration(conf) 298 | } 299 | _hadoopConf 300 | } 301 | 302 | } 303 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSourceOffset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
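At each trigger KinesisSource hands Spark a KinesisSourceOffset, and the class that follows serializes it as a single JSON object holding a "metadata" entry plus one entry per shard. A small sketch of that round trip (shard id and sequence number are illustrative):

val offset = KinesisSourceOffset(new ShardOffsets(7L, "my.cool.stream2",
  Array(ShardInfo("shardId-000000000000", "AFTER_SEQUENCE_NUMBER", "49605240428..."))))

val json = offset.json
// e.g. {"metadata":{"batchId":"7","streamName":"my.cool.stream2"},
//       "shardId-000000000000":{"iteratorType":"AFTER_SEQUENCE_NUMBER","iteratorPosition":"49605240428..."}}

val restored = KinesisSourceOffset(json)
KinesisSourceOffset.getShardOffsets(restored)
  .shardInfoMap("shardId-000000000000").iteratorType // "AFTER_SEQUENCE_NUMBER"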
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.json4s.NoTypeHints 21 | import org.json4s.jackson.Serialization 22 | import scala.collection.mutable.HashMap 23 | import scala.util.control.NonFatal 24 | 25 | import org.apache.spark.sql.execution.streaming.Offset 26 | import org.apache.spark.sql.execution.streaming.SerializedOffset 27 | 28 | /* 29 | * @param shardsToOffsets 30 | */ 31 | 32 | case class KinesisSourceOffset(shardsToOffsets: ShardOffsets) extends Offset { 33 | override def json: String = { 34 | val metadata = HashMap[String, String]( 35 | "batchId" -> shardsToOffsets.batchId.toString, 36 | "streamName" -> shardsToOffsets.streamName) 37 | val result = HashMap[String, HashMap[String, String]]("metadata" -> metadata) 38 | 39 | val shardInfos = shardsToOffsets.shardInfoMap.keySet.toSeq.sorted // sort for more determinism 40 | 41 | shardInfos.foreach { 42 | shardId: String => 43 | val shardInfo: ShardInfo = shardsToOffsets.shardInfoMap.get(shardId).get 44 | val part = result.getOrElse(shardInfo.shardId, new HashMap[String, String]) 45 | part += "iteratorType" -> shardInfo.iteratorType 46 | part += "iteratorPosition" -> shardInfo.iteratorPosition 47 | result += shardId -> part 48 | } 49 | Serialization.write(result)(KinesisSourceOffset.format) 50 | } 51 | } 52 | 53 | object KinesisSourceOffset { 54 | implicit val format = Serialization.formats(NoTypeHints) 55 | 56 | def getShardOffsets(offset: Offset): ShardOffsets = { 57 | offset match { 58 | case kso: KinesisSourceOffset => kso.shardsToOffsets 59 | case so: SerializedOffset => KinesisSourceOffset(so).shardsToOffsets 60 | case _ => throw 61 | new IllegalArgumentException(s"Invalid conversion " + 62 | s"from offset of ${offset.getClass} to KinesisSourceOffset") 63 | } 64 | } 65 | 66 | /* 67 | * Returns [[KinesisSourceOffset]] from a JSON [[SerializedOffset]] 68 | */ 69 | def apply(so: SerializedOffset): KinesisSourceOffset = { 70 | apply(so.json) 71 | } 72 | 73 | /* 74 | * Returns [[KinesisSourceOffset]] from a JSON 75 | */ 76 | def apply(json: String): KinesisSourceOffset = { 77 | try { 78 | val readObj = Serialization.read[ Map[ String, Map[ String, String ] ] ](json) 79 | val metadata = readObj.get("metadata") 80 | val shardInfoMap: Map[String, ShardInfo ] = readObj.filter(_._1 != "metadata").map { 81 | case (shardId, value) => shardId.toString -> new ShardInfo(shardId.toString, 82 | value.get("iteratorType").get, 83 | value.get("iteratorPosition").get) 84 | }.toMap 85 | KinesisSourceOffset( 86 | new ShardOffsets( 87 | metadata.get("batchId").toLong, 88 | metadata.get("streamName"), 89 | shardInfoMap)) 90 | } catch { 91 | case NonFatal(x) => throw new IllegalArgumentException(x) 92 | } 93 | } 94 | 95 | def getMap(shardInfos: Array[ShardInfo]): Map[String, ShardInfo] = { 96 | shardInfos.map { 97 | s: ShardInfo => (s.shardId -> s) 98 | }.toMap 99 | } 100 | 101 | } 102 | 103 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSourceProvider.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.SQLContext 24 | import org.apache.spark.sql.execution.streaming.{Sink, Source} 25 | import org.apache.spark.sql.sources._ 26 | import org.apache.spark.sql.streaming.OutputMode 27 | import org.apache.spark.sql.types.StructType 28 | 29 | /* 30 | * The provider class for the [[KinesisSource]]. This provider is designed such that it throws 31 | * IllegalArgumentException when the Kinesis Dataset is created, so that it can catch 32 | * missing options even before the query is started. 33 | */ 34 | 35 | private[kinesis] class KinesisSourceProvider extends DataSourceRegister 36 | with StreamSourceProvider 37 | with StreamSinkProvider 38 | with Logging { 39 | 40 | import KinesisSourceProvider._ 41 | 42 | override def shortName(): String = "kinesis" 43 | 44 | /* 45 | * Returns the name and schema of the source. In addition, it also verifies whether the options 46 | * are correct and sufficient to create the [[KinesisSource]] when the query is started. 
47 | */ 48 | 49 | override def sourceSchema( 50 | sqlContext: SQLContext, 51 | schema: Option[StructType], 52 | providerName: String, 53 | parameters: Map[String, String]): (String, StructType) = { 54 | val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } 55 | validateStreamOptions(caseInsensitiveParams) 56 | require(schema.isEmpty, "Kinesis source has a fixed schema and cannot be set with a custom one") 57 | (shortName(), KinesisReader.kinesisSchema) 58 | } 59 | 60 | override def createSource( 61 | sqlContext: SQLContext, 62 | metadataPath: String, 63 | schema: Option[StructType], 64 | providerName: String, 65 | parameters: Map[String, String]): Source = { 66 | 67 | val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } 68 | 69 | validateStreamOptions(caseInsensitiveParams) 70 | 71 | val specifiedKinesisParams = 72 | parameters 73 | .keySet 74 | .filter(_.toLowerCase(Locale.ROOT).startsWith("kinesis.")) 75 | .map { k => k.drop(8).toString -> parameters(k) } 76 | .toMap 77 | 78 | val streamName = caseInsensitiveParams.get(STREAM_NAME_KEY).get 79 | 80 | val awsAccessKeyId = caseInsensitiveParams.get(AWS_ACCESS_KEY_ID).getOrElse("") 81 | val awsSecretKey = caseInsensitiveParams.get(AWS_SECRET_KEY).getOrElse("") 82 | val sessionToken = caseInsensitiveParams.get(AWS_SESSION_TOKEN).getOrElse("") 83 | val awsStsRoleArn = caseInsensitiveParams.get(AWS_STS_ROLE_ARN).getOrElse("") 84 | val awsStsSessionName = caseInsensitiveParams.get(AWS_STS_SESSION_NAME).getOrElse("") 85 | val awsUseInstanceProfile = caseInsensitiveParams.getOrElse(AWS_USE_INSTANCE_PROFILE, "true") 86 | .toBoolean 87 | 88 | val regionName = caseInsensitiveParams.get(REGION_NAME_KEY) 89 | .getOrElse(DEFAULT_KINESIS_REGION_NAME) 90 | val endPointURL = caseInsensitiveParams.get(END_POINT_URL) 91 | .getOrElse(DEFAULT_KINESIS_ENDPOINT_URL) 92 | 93 | val failOnDataLoss = caseInsensitiveParams.get(FAILONDATALOSS) 94 | .getOrElse("true").toBoolean 95 | 96 | val initialPosition: InitialKinesisPosition = getKinesisPosition(caseInsensitiveParams) 97 | 98 | val kinesisCredsProvider = if (awsAccessKeyId.length > 0) { 99 | if(sessionToken.length > 0) { 100 | BasicAWSSessionCredentials(awsAccessKeyId, awsSecretKey, sessionToken) 101 | } else { 102 | BasicCredentials(awsAccessKeyId, awsSecretKey) 103 | } 104 | } else if (awsStsRoleArn.length > 0) { 105 | STSCredentials(awsStsRoleArn, awsStsSessionName) 106 | } else if (awsUseInstanceProfile) { 107 | InstanceProfileCredentials 108 | } else { 109 | DefaultCredentials 110 | } 111 | 112 | new KinesisSource( 113 | sqlContext, specifiedKinesisParams, metadataPath, 114 | streamName, initialPosition, endPointURL, kinesisCredsProvider, failOnDataLoss) 115 | } 116 | 117 | private def validateStreamOptions(caseInsensitiveParams: Map[String, String]) = { 118 | if (!caseInsensitiveParams.contains(STREAM_NAME_KEY) || 119 | caseInsensitiveParams.get(STREAM_NAME_KEY).get.isEmpty) { 120 | throw new IllegalArgumentException( 121 | "Stream name is a required field") 122 | } 123 | } 124 | 125 | private def validateSinkOptions(caseInsensitiveParams: Map[String, String]): Unit = { 126 | if (!caseInsensitiveParams.contains(SINK_STREAM_NAME_KEY) || 127 | caseInsensitiveParams(SINK_STREAM_NAME_KEY).isEmpty) { 128 | throw new IllegalArgumentException( 129 | "Stream name is a required field") 130 | } 131 | if (!caseInsensitiveParams.contains(SINK_ENDPOINT_URL) || 132 | caseInsensitiveParams(SINK_ENDPOINT_URL).isEmpty) { 133 | throw new 
IllegalArgumentException( 134 | "Sink endpoint url is a required field") 135 | } 136 | if (caseInsensitiveParams.contains(SINK_AGGREGATION_ENABLED) && ( 137 | caseInsensitiveParams(SINK_AGGREGATION_ENABLED).trim != "true" && 138 | caseInsensitiveParams(SINK_AGGREGATION_ENABLED).trim != "false" 139 | )) { 140 | throw new IllegalArgumentException( 141 | "Sink aggregation value must be either true or false") 142 | } 143 | } 144 | 145 | override def createSink( 146 | sqlContext: SQLContext, 147 | parameters: Map[String, String], 148 | partitionColumns: Seq[String], 149 | outputMode: OutputMode): Sink = { 150 | val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } 151 | validateSinkOptions(caseInsensitiveParams) 152 | new KinesisSink(sqlContext, caseInsensitiveParams, outputMode) 153 | } 154 | 155 | } 156 | 157 | private[kinesis] object KinesisSourceProvider extends Logging { 158 | 159 | private[kinesis] val STREAM_NAME_KEY = "streamname" 160 | private[kinesis] val END_POINT_URL = "endpointurl" 161 | private[kinesis] val REGION_NAME_KEY = "regionname" 162 | private[kinesis] val AWS_ACCESS_KEY_ID = "awsaccesskeyid" 163 | private[kinesis] val AWS_SECRET_KEY = "awssecretkey" 164 | private[kinesis] val AWS_SESSION_TOKEN = "sessiontoken" 165 | private[kinesis] val AWS_STS_ROLE_ARN = "awsstsrolearn" 166 | private[kinesis] val AWS_STS_SESSION_NAME = "awsstssessionname" 167 | private[kinesis] val AWS_USE_INSTANCE_PROFILE = "awsuseinstanceprofile" 168 | private[kinesis] val STARTING_POSITION_KEY = "startingposition" 169 | private[kinesis] val FAILONDATALOSS = "failondataloss" 170 | 171 | private[kinesis] val DESCRIBE_SHARD_INTERVAL = "client.describeshardinterval" 172 | 173 | // Sink Options 174 | private[kinesis] val SINK_STREAM_NAME_KEY = "streamname" 175 | private[kinesis] val SINK_ENDPOINT_URL = "endpointurl" 176 | private[kinesis] val SINK_RECORD_MAX_BUFFERED_TIME = "kinesis.executor.recordmaxbufferedtime" 177 | private[kinesis] val SINK_MAX_CONNECTIONS = "kinesis.executor.maxconnections" 178 | private[kinesis] val SINK_AGGREGATION_ENABLED = "kinesis.executor.aggregationenabled" 179 | private[kinesis] val SINK_FLUSH_WAIT_TIME_MILLIS = "kniesis.executor.flushwaittimemillis" 180 | 181 | 182 | private[kinesis] def getKinesisPosition( 183 | params: Map[String, String]): InitialKinesisPosition = { 184 | val CURRENT_TIMESTAMP = System.currentTimeMillis 185 | params.get(STARTING_POSITION_KEY).map(_.trim) match { 186 | case Some(position) if position.toLowerCase(Locale.ROOT) == "latest" => 187 | InitialKinesisPosition.fromPredefPosition(new AtTimeStamp(CURRENT_TIMESTAMP)) 188 | case Some(position) if position.toLowerCase(Locale.ROOT) == "trim_horizon" => 189 | InitialKinesisPosition.fromPredefPosition(new TrimHorizon) 190 | case Some(position) if position.toLowerCase(Locale.ROOT) == "earliest" => 191 | InitialKinesisPosition.fromPredefPosition(new TrimHorizon) 192 | case Some(json) => 193 | InitialKinesisPosition.fromCheckpointJson(json, new AtTimeStamp(CURRENT_TIMESTAMP)) 194 | case None => InitialKinesisPosition.fromPredefPosition(new AtTimeStamp(CURRENT_TIMESTAMP)) 195 | } 196 | } 197 | 198 | private[kinesis] val DEFAULT_KINESIS_ENDPOINT_URL: String = 199 | "https://kinesis.us-east-1.amazonaws.com" 200 | 201 | private[kinesis] val DEFAULT_KINESIS_REGION_NAME: String = "us-east-1" 202 | 203 | private[kinesis] val DEFAULT_SINK_RECORD_MAX_BUFFERED_TIME: String = "1000" 204 | 205 | private[kinesis] val DEFAULT_SINK_MAX_CONNECTIONS: String = "1" 206 | 207 | 
private[kinesis] val DEFAULT_SINK_AGGREGATION: String = "true" 208 | 209 | private[kinesis] val DEFAULT_FLUSH_WAIT_TIME_MILLIS: String = "100" 210 | } 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisSourceRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import com.amazonaws.services.kinesis.model.{GetRecordsResult, Record} 20 | import java.io.Serializable 21 | import java.util.Locale 22 | import scala.collection.JavaConverters._ 23 | 24 | import org.apache.spark.{Partition, SparkContext, TaskContext} 25 | import org.apache.spark.rdd.RDD 26 | import org.apache.spark.storage.StorageLevel 27 | import org.apache.spark.util.NextIterator 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | 31 | /** Offset range that one partition of the KinesiSourceRDD has to read */ 32 | private[kinesis] case class ShardInfo( 33 | shardId: String, 34 | iteratorType: String, 35 | iteratorPosition: String) extends Serializable { 36 | 37 | def this(shardId: String, kinesisPosition: KinesisPosition) { 38 | this(shardId, kinesisPosition.iteratorType, kinesisPosition.iteratorPosition) 39 | } 40 | } 41 | 42 | private[kinesis] case class ShardOffsets( 43 | batchId: Long, 44 | streamName: String, 45 | shardInfoMap: Map[String, ShardInfo] 46 | ) extends Serializable { 47 | 48 | def this(batchId: Long, streamName: String) { 49 | this(batchId, streamName, Map.empty[String, ShardInfo]) 50 | } 51 | 52 | def this(shardInfoMap: Map[String, ShardInfo]) { 53 | this(-1, "", shardInfoMap) 54 | } 55 | 56 | def this(batchId: Long, streamName: String, shardInfos: Array[ShardInfo]) { 57 | this(batchId, streamName, KinesisSourceOffset.getMap(shardInfos)) 58 | } 59 | 60 | def this(shardInfos: Array[ShardInfo]) { 61 | this(-1, "", KinesisSourceOffset.getMap(shardInfos)) 62 | } 63 | 64 | } 65 | 66 | 67 | /** Partition of the KinesiSourceRDD */ 68 | private[kinesis] case class KinesisSourceRDDPartition( 69 | index: Int, 70 | shardInfo: ShardInfo) extends Partition 71 | 72 | /* 73 | * An RDD that reads data from Kinesis based on offset ranges across multiple shards. 
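Putting the provider options together, a query can be wired end to end through the "kinesis" short name registered in META-INF/services. A hedged sketch (stream name, endpoint and checkpoint location are placeholders; option keys are matched case-insensitively, and with no credential options the source falls back to the instance profile):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("kinesis-sketch").getOrCreate()

val input = spark.readStream
  .format("kinesis")
  .option("streamName", "my.cool.stream2")
  .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com")
  .option("startingPosition", "TRIM_HORIZON") // or LATEST / EARLIEST / a checkpoint JSON
  .load()

// the fixed schema is data, streamName, partitionKey, sequenceNumber, approximateArrivalTimestamp
val query = input.selectExpr("CAST(data AS STRING) AS value")
  .writeStream
  .format("console")
  .option("checkpointLocation", "/tmp/kinesis-sketch-checkpoint")
  .start()

Writing back to a stream works the same way with .writeStream.format("kinesis") plus the sink options (streamName and endpointUrl) validated by createSink above.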
74 | */ 75 | 76 | private[kinesis] class KinesisSourceRDD( 77 | sparkContext: SparkContext, 78 | sourceOptions: Map[String, String], 79 | streamName: String, 80 | batchId: Long, 81 | shardInfos: Seq[ShardInfo], 82 | kinesisCredsProvider: SparkAWSCredentials, 83 | endpointUrl: String, 84 | conf: SerializableConfiguration, 85 | metadataPath: String, 86 | failOnDataLoss: Boolean = true 87 | ) 88 | extends RDD[Record](sparkContext, Nil) { 89 | 90 | override def persist(newLevel: StorageLevel): this.type = { 91 | logError("Kinesis Record is not serializable. " + 92 | "Use .map to extract fields before calling .persist or .window") 93 | super.persist(newLevel) 94 | } 95 | 96 | override def getPartitions: Array[Partition] = { 97 | shardInfos.zipWithIndex.map { case (o, i) => new KinesisSourceRDDPartition(i, o) }.toArray 98 | } 99 | 100 | override def compute( 101 | thePart: Partition, 102 | context: TaskContext): Iterator[Record] = { 103 | val sourcePartition = thePart.asInstanceOf[KinesisSourceRDDPartition] 104 | 105 | val kinesisShardId = sourcePartition.shardInfo.shardId 106 | 107 | val kinesisReader = new KinesisReader( 108 | sourceOptions, 109 | streamName, 110 | kinesisCredsProvider, 111 | endpointUrl 112 | ) 113 | 114 | val maxFetchTimeInMs = 115 | sourceOptions.getOrElse("executor.maxFetchTimeInMs".toLowerCase(Locale.ROOT), "1000").toLong 116 | 117 | val maxRecordsPerShard = 118 | sourceOptions.getOrElse("executor.maxFetchRecordsPerShard".toLowerCase(Locale.ROOT), 119 | "100000").toLong 120 | 121 | val recordPerRequest = 122 | sourceOptions.getOrElse("executor.maxRecordPerRead".toLowerCase(Locale.ROOT), "10000").toInt 123 | 124 | val enableIdleTimeBetweenReads: Boolean = 125 | sourceOptions.getOrElse("executor.addIdleTimeBetweenReads".toLowerCase(Locale.ROOT), 126 | "false").toBoolean 127 | 128 | val idleTimeBetweenReads = 129 | sourceOptions.getOrElse("executor.idleTimeBetweenReadsInMs".toLowerCase(Locale.ROOT), 130 | "1000").toLong 131 | 132 | val startTimestamp: Long = System.currentTimeMillis 133 | var lastReadTimeMs: Long = 0 134 | var lastReadSequenceNumber: String = "" 135 | var numRecordRead: Long = 0 136 | var hasShardClosed = false 137 | 138 | val underlying = new NextIterator[Record]() { 139 | var _shardIterator: String = null 140 | var fetchedRecords: Array[Record] = Array.empty 141 | var currentIndex = 0 142 | var fetchNext = true 143 | 144 | def getShardIterator(): String = { 145 | if (_shardIterator == null) { 146 | _shardIterator = kinesisReader.getShardIterator( 147 | sourcePartition.shardInfo.shardId, 148 | sourcePartition.shardInfo.iteratorType, 149 | sourcePartition.shardInfo.iteratorPosition, 150 | failOnDataLoss) 151 | if (!failOnDataLoss && _shardIterator == null) { 152 | logWarning( 153 | s""" 154 | | Some data may have been lost because ${sourcePartition.shardInfo.shardId} 155 | | is not available in Kinesis any more. The shard has 156 | | we have processed all records in it. We would ignore th 157 | | processing. 
If you want your streaming query to 158 | | set the source option "failOnDataLoss" to "true" 159 | """.stripMargin) 160 | return _shardIterator 161 | } 162 | } 163 | assert(_shardIterator != null) 164 | _shardIterator 165 | } 166 | 167 | def canFetchMoreRecords(currentTimestamp: Long): Boolean = { 168 | currentTimestamp - startTimestamp < maxFetchTimeInMs 169 | } 170 | 171 | def addDelayInFetchingRecords(currentTimestamp: Long): Unit = { 172 | if ( enableIdleTimeBetweenReads && lastReadTimeMs > 0 ) { 173 | val delayMs: Long = idleTimeBetweenReads - (currentTimestamp - lastReadTimeMs) 174 | if (delayMs > 0) { 175 | logInfo(s"Sleeping for ${delayMs}ms") 176 | Thread.sleep(delayMs) 177 | } 178 | } 179 | } 180 | 181 | override def getNext(): Record = { 182 | if (fetchedRecords.length == 0 || currentIndex >= fetchedRecords.length) { 183 | fetchedRecords = Array.empty 184 | currentIndex = 0 185 | while (fetchedRecords.length == 0 && fetchNext == true) { 186 | val currentTimestamp: Long = System.currentTimeMillis 187 | if (canFetchMoreRecords(currentTimestamp) && getShardIterator() != null) { 188 | // getShardIterator() should raise exception if its null if failOnDataLoss is true 189 | // if failOnDataLoss is false, getShardIterator() will be null and we should stop 190 | // fetching more records 191 | addDelayInFetchingRecords(currentTimestamp) 192 | val records: GetRecordsResult = kinesisReader.getKinesisRecords( 193 | _shardIterator, recordPerRequest) 194 | // de-aggregate records 195 | val deaggregateRecords = kinesisReader.deaggregateRecords(records.getRecords, null) 196 | fetchedRecords = deaggregateRecords.asScala.toArray 197 | _shardIterator = records.getNextShardIterator 198 | lastReadTimeMs = System.currentTimeMillis() 199 | logDebug(s"Milli secs behind is ${records.getMillisBehindLatest.longValue()}") 200 | if ( _shardIterator == null ) { 201 | hasShardClosed = true 202 | fetchNext = false 203 | } 204 | if ( records.getMillisBehindLatest.longValue() == 0 ) { 205 | fetchNext = false 206 | } 207 | } 208 | else { 209 | // either we cannot fetch more records or ShardIterator was null 210 | fetchNext = false 211 | } 212 | } 213 | } 214 | 215 | if (fetchedRecords.length == 0) { 216 | finished = true 217 | null 218 | } 219 | else { 220 | val record: Record = fetchedRecords(currentIndex) 221 | currentIndex += 1 222 | numRecordRead +=1 223 | if (numRecordRead > maxRecordsPerShard) { 224 | fetchNext = false 225 | } 226 | lastReadSequenceNumber = record.getSequenceNumber 227 | record 228 | } 229 | } 230 | override protected def close(): Unit = synchronized { 231 | kinesisReader.close() 232 | } 233 | } 234 | 235 | lazy val metadataCommitter: MetadataCommitter[ShardInfo] = { 236 | metaDataCommitterType.toLowerCase(Locale.ROOT) match { 237 | case "hdfs" => new HDFSMetadataCommitter[ ShardInfo ]( 238 | metaDataCommitterPath, conf, sourceOptions) 239 | case _ => throw new IllegalArgumentException("only HDFS is supported") 240 | } 241 | } 242 | 243 | def metaDataCommitterType: String = { 244 | sourceOptions.getOrElse("executor.metadata.committer", "hdfs").toString 245 | } 246 | 247 | def metaDataCommitterPath: String = { 248 | sourceOptions.getOrElse("executor.metadata.path", metadataPath).toString 249 | } 250 | 251 | 252 | def updateMetadata(taskContext: TaskContext): Unit = { 253 | 254 | // if lastReadSequenceNumber exists, use AfterSequenceNumber for next Iterator 255 | // else use the same iterator information which was given to the RDD 256 | 257 | val shardInfo: ShardInfo = 258 | if 
(hasShardClosed) { 259 | new ShardInfo(sourcePartition.shardInfo.shardId, 260 | new ShardEnd()) 261 | } 262 | else if (!lastReadSequenceNumber.isEmpty) { 263 | new ShardInfo( 264 | sourcePartition.shardInfo.shardId, 265 | new AfterSequenceNumber(lastReadSequenceNumber)) 266 | } 267 | else { 268 | logInfo("No Records were processed in this batch") 269 | sourcePartition.shardInfo 270 | } 271 | logInfo(s"Batch $batchId : Committing End Shard position for $kinesisShardId") 272 | metadataCommitter.add(batchId, kinesisShardId, shardInfo) 273 | } 274 | 275 | // Release reader, either by removing it or indicating we're no longer using it 276 | context.addTaskCompletionListener [Unit]{ taskContext: TaskContext => 277 | logInfo("Task Completed") 278 | updateMetadata(taskContext) 279 | } 280 | 281 | underlying 282 | } 283 | 284 | } 285 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisWriteTask.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import java.nio.ByteBuffer 20 | 21 | import scala.util.Try 22 | 23 | import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult} 24 | import com.google.common.util.concurrent.{FutureCallback, Futures} 25 | 26 | import org.apache.spark.internal.Logging 27 | import org.apache.spark.sql.catalyst.InternalRow 28 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection} 29 | import org.apache.spark.sql.types.{BinaryType, StringType} 30 | 31 | private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String], 32 | inputSchema: Seq[Attribute]) extends Logging { 33 | 34 | private var producer: KinesisProducer = _ 35 | private val projection = createProjection 36 | private val streamName = producerConfiguration.getOrElse( 37 | KinesisSourceProvider.SINK_STREAM_NAME_KEY, "") 38 | 39 | private val flushWaitTimeMills = Try(producerConfiguration.getOrElse( 40 | KinesisSourceProvider.SINK_FLUSH_WAIT_TIME_MILLIS, 41 | KinesisSourceProvider.DEFAULT_FLUSH_WAIT_TIME_MILLIS).toLong).getOrElse { 42 | throw new IllegalArgumentException( 43 | s"${KinesisSourceProvider.SINK_FLUSH_WAIT_TIME_MILLIS} has to be a positive integer") 44 | } 45 | 46 | private var failedWrite: Throwable = _ 47 | 48 | def execute(iterator: Iterator[InternalRow]): Unit = { 49 | producer = CachedKinesisProducer.getOrCreate(producerConfiguration) 50 | while (iterator.hasNext && failedWrite == null) { 51 | val currentRow = iterator.next() 52 | val projectedRow = projection(currentRow) 53 | val partitionKey = projectedRow.getString(0) 54 | val data = projectedRow.getBinary(1) 55 | 56 | sendData(partitionKey, data) 57 | } 58 | } 59 | 60 | def sendData(partitionKey: String, data: Array[Byte]): String = { 61 | var sentSeqNumbers = new String 62 | 63 | val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) 64 | 65 | val kinesisCallBack = new FutureCallback[UserRecordResult]() { 66 | 67 | override def onFailure(t: Throwable): Unit = { 68 | if (failedWrite == null && t!= null) { 69 | failedWrite = t 70 | logError(s"Writing to $streamName failed due to ${t.getCause}") 71 | } 72 | } 73 | 74 | override def onSuccess(result: UserRecordResult): Unit = { 75 | val shardId = result.getShardId 76 | sentSeqNumbers = result.getSequenceNumber 77 | } 78 | } 79 | Futures.addCallback(future, kinesisCallBack) 80 | 81 | sentSeqNumbers 82 | } 83 | 84 | private def flushRecordsIfNecessary(): Unit = { 85 | if (producer != null) { 86 | while (producer.getOutstandingRecordsCount > 0) { 87 | try { 88 | producer.flush() 89 | Thread.sleep(flushWaitTimeMills) 90 | } catch { 91 | case e: InterruptedException => 92 | // Do Nothing 93 | } finally { 94 | checkForErrors() 95 | } 96 | } 97 | } 98 | } 99 | 100 | def checkForErrors(): Unit = { 101 | if (failedWrite != null) { 102 | throw failedWrite 103 | } 104 | } 105 | 106 | def close(): Unit = { 107 | checkForErrors() 108 | flushRecordsIfNecessary() 109 | checkForErrors() 110 | producer = null 111 | } 112 | 113 | private def createProjection: UnsafeProjection = { 114 | 115 | val partitionKeyExpression = inputSchema 116 | .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse( 117 | throw new IllegalStateException("Required attribute " + 118 | s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found")) 119 | 120 | partitionKeyExpression.dataType match { 121 | case StringType | BinaryType => // ok 122 | case t => 123 | throw new 
IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " + 124 | "attribute type must be a String or BinaryType") 125 | } 126 | 127 | val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse( 128 | throw new IllegalStateException("Required attribute " + 129 | s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found") 130 | ) 131 | 132 | dataExpression.dataType match { 133 | case StringType | BinaryType => // ok 134 | case t => 135 | throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " + 136 | "attribute type must be a String or BinaryType") 137 | } 138 | 139 | UnsafeProjection.create( 140 | Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema) 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/KinesisWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} 23 | import org.apache.spark.util.Utils 24 | 25 | private[kinesis] object KinesisWriter extends Logging { 26 | 27 | val DATA_ATTRIBUTE_NAME: String = "data" 28 | val PARTITION_KEY_ATTRIBUTE_NAME: String = "partitionKey" 29 | 30 | override def toString: String = "KinesisWriter" 31 | 32 | def write(sparkSession: SparkSession, 33 | queryExecution: QueryExecution, 34 | kinesisParameters: Map[String, String]): Unit = { 35 | val schema = queryExecution.analyzed.output 36 | 37 | SQLExecution.withNewExecutionId(queryExecution) { 38 | queryExecution.toRdd.foreachPartition { iter => 39 | val writeTask = new KinesisWriteTask(kinesisParameters, schema) 40 | Utils.tryWithSafeFinally(block = writeTask.execute(iter))( 41 | finallyBlock = writeTask.close()) 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/MetadataCommitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | trait MetadataCommitter[T <: AnyRef] { 21 | // Functions that each committer needs to implement 22 | // This committer will be used by executors to push metadata related to Kinesis shards 23 | // Possible implementations are HDFS, DynamoDB, MySQL, etc. 24 | def add(batchId: Long, shardId: String, metadata: T): Boolean 25 | def get(batchId: Long): Seq[T] 26 | def purge(thresholdBatchId: Long): Unit 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/ShardSyncer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import com.amazonaws.services.kinesis.model.Shard 21 | import scala.collection.mutable 22 | 23 | import org.apache.spark.internal.Logging 24 | 25 | /* 26 | * Helper object to sync a batch with the shards of the Kinesis stream. 27 | * It will create new activities when it discovers new Kinesis shards (bootstrap/resharding).
28 | * It works in similar way as 29 | * com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShardSyncer in KCL 30 | */ 31 | 32 | private[kinesis] object ShardSyncer extends Logging { 33 | 34 | private def getShardIdToChildShardsMap(latestShards: Seq[Shard]): 35 | mutable.Map[String, List[String ]] = { 36 | val shardIdToChildShardsMap = mutable.Map.empty[String, List[String]] 37 | 38 | val shardIdToShardMap = 39 | latestShards.map { 40 | s => (s.getShardId -> s) 41 | }.toMap 42 | 43 | for ((shardId, shard) <- shardIdToShardMap) { 44 | val parentShardId: String = shard.getParentShardId 45 | if ( parentShardId != null && shardIdToShardMap.contains(parentShardId) ) { 46 | shardIdToChildShardsMap += ( 47 | parentShardId -> 48 | (shardId :: shardIdToChildShardsMap.get(parentShardId).getOrElse(Nil)) 49 | ) 50 | } 51 | 52 | val adjacentParentShardId: String = shard.getAdjacentParentShardId 53 | if ( adjacentParentShardId != null && shardIdToShardMap.contains(adjacentParentShardId) ) { 54 | shardIdToChildShardsMap += ( 55 | adjacentParentShardId -> 56 | (shardId :: shardIdToChildShardsMap.get(adjacentParentShardId).getOrElse(Nil)) 57 | ) 58 | } 59 | } 60 | // Assert that Parent Shards are closed 61 | shardIdToChildShardsMap.keySet.foreach { 62 | parentShardId => 63 | shardIdToShardMap.get(parentShardId) match { 64 | case None => 65 | throw new IllegalStateException(s"ShardId $parentShardId is not closed. " + 66 | s"This can happen due to a race condition between listShards and a" + 67 | s" reshard operation") 68 | case Some(parentShard: Shard) => 69 | if (parentShard.getSequenceNumberRange().getEndingSequenceNumber == null) { 70 | throw new IllegalStateException(s"ShardId $parentShardId is not closed. " + 71 | s"This can happen due to a race condition between listShards and a " + 72 | s"reshard operation") 73 | } 74 | } 75 | } 76 | shardIdToChildShardsMap 77 | } 78 | 79 | private[kinesis] def AddShardInfoForAncestors( 80 | shardId: String, 81 | latestShards: Seq[Shard], 82 | initialPosition: InitialKinesisPosition, 83 | prevShardsList: mutable.Set[ String ], 84 | newShardsInfoMap: mutable.HashMap[ String, ShardInfo ], 85 | memoizationContext: mutable.Map[String, Boolean ]): Unit = { 86 | 87 | val shardIdToShardMap = 88 | latestShards.map { 89 | s => (s.getShardId -> s) 90 | }.toMap 91 | 92 | if (!memoizationContext.contains(shardId) && 93 | shardId != null && shardIdToShardMap.contains(shardId) ) { 94 | if (prevShardsList.contains(shardId) ) { 95 | // we already have processed this shard in previous batch and added its ancestors 96 | memoizationContext.put(shardId, true) 97 | return 98 | } 99 | var shard = shardIdToShardMap.get(shardId).get 100 | // get parent of shards if exist 101 | var parentShardIds: mutable.HashSet[String] = getParentShardIds(shard, latestShards) 102 | for (parentShardId <- parentShardIds) { 103 | // Add ShardInfo of Parent's ancestors. 
104 | AddShardInfoForAncestors( parentShardId, 105 | latestShards, initialPosition, prevShardsList, 106 | newShardsInfoMap, memoizationContext) 107 | } 108 | // create shardInfo for its parent shards (if they don't exist) 109 | for (parentShardId <- parentShardIds) { 110 | if (!prevShardsList.contains(parentShardId) ) { 111 | logDebug("Need to create a shardInfo for shardId " + parentShardId) 112 | if (newShardsInfoMap.get(parentShardId).isEmpty) { 113 | newShardsInfoMap.put(parentShardId, 114 | new ShardInfo(parentShardId, initialPosition.shardPosition(parentShardId))) 115 | } 116 | } 117 | } 118 | memoizationContext.put(shardId, true) 119 | } 120 | } 121 | 122 | private[kinesis] def getParentShardIds( 123 | shard: Shard, 124 | shards: Seq[Shard]): mutable.HashSet[String] = { 125 | val parentShardIds = new mutable.HashSet[ String ] 126 | val parentShardId = shard.getParentShardId 127 | val shardIdToShardMap = 128 | shards.map { 129 | s => (s.getShardId -> s) 130 | }.toMap 131 | 132 | if ((parentShardId != null) && shardIdToShardMap.contains(parentShardId)) { 133 | parentShardIds.add(parentShardId) 134 | } 135 | val adjacentParentShardId = shard.getAdjacentParentShardId 136 | if ( (adjacentParentShardId != null) && shardIdToShardMap.contains(adjacentParentShardId)) { 137 | parentShardIds.add(adjacentParentShardId) 138 | } 139 | return parentShardIds 140 | } 141 | 142 | /* 143 | * Takes a sequence of Shard as input params 144 | * It iterate though each shards 145 | * and return a sequence of shard-ids of open Shards 146 | */ 147 | def openShards(shards: Seq[Shard]): Seq[String] = { 148 | // List of open Shards 149 | shards.collect { 150 | case s: Shard if (s.getSequenceNumberRange.getEndingSequenceNumber == null) => s.getShardId 151 | } 152 | } 153 | 154 | /* 155 | * Takes a sequence of Shard as input params 156 | * It iterate though each shards 157 | * and return a sequence of shard-ids of closed Shards 158 | */ 159 | 160 | def closedShards(shards: Seq[Shard]): Seq[String] = { 161 | // List of closed Shards 162 | shards.collect { 163 | case s: Shard if (s.getSequenceNumberRange.getEndingSequenceNumber != null) => s.getShardId 164 | } 165 | } 166 | 167 | def hasNewShards(latestShardsInfo: Seq[ShardInfo], 168 | prevShardsInfo: Seq[ShardInfo]): Boolean = { 169 | latestShardsInfo.foldLeft(false) { 170 | (hasNewShard, shardInfo) => 171 | if (!hasNewShard) { 172 | // Check only if hasNewShard is false 173 | prevShardsInfo.contains(shardInfo.shardId) 174 | } else { 175 | hasNewShard 176 | } 177 | } 178 | } 179 | 180 | def hasDeletedShards(latestShardsInfo: Seq[ShardInfo], 181 | prevShardsInfo: Seq[ShardInfo]): Boolean = { 182 | prevShardsInfo.foldLeft(false) { 183 | (hasDeletedShard, shardInfo) => 184 | if (!hasDeletedShard) { 185 | // Check only if hasDeletedShard is false 186 | latestShardsInfo.contains(shardInfo.shardId) 187 | } else { 188 | hasDeletedShard 189 | } 190 | } 191 | } 192 | 193 | def getLatestShardInfo( 194 | latestShards: Seq[Shard], 195 | prevShardsInfo: Seq[ShardInfo], 196 | initialPosition: InitialKinesisPosition, 197 | failOnDataLoss: Boolean = true): Seq[ShardInfo] = { 198 | 199 | if (latestShards.isEmpty) { 200 | return prevShardsInfo 201 | } 202 | var prevShardsList = new mutable.HashSet[String] 203 | var latestShardsList = new mutable.HashSet[String] 204 | prevShardsInfo.foreach { 205 | s: ShardInfo => prevShardsList.add(s.shardId) 206 | } 207 | latestShards.foreach { 208 | s: Shard => latestShardsList.add(s.getShardId) 209 | } 210 | // check for deleted shards 211 | val 
deletedShardsList = prevShardsList.diff(latestShardsList) 212 | val newShardsInfoMap = new mutable.HashMap[String, ShardInfo] 213 | val memoizationContext = new mutable.HashMap[ String, Boolean] 214 | 215 | // check for deleted Shards and update newShardInfo if failOnDataLoss is false 216 | if (deletedShardsList.nonEmpty) { 217 | if (failOnDataLoss) { 218 | throw new IllegalStateException( 219 | s""" 220 | | Some data may have been lost because ${deletedShardsList.toString()} 221 | | are not available in Kinesis any more. The shard has been deleted before 222 | | we have processed all records in it. If you do not want your streaming query 223 | | to fail on such cases, set the source option "failOnDataLoss" to "false" 224 | """.stripMargin 225 | ) 226 | } else { 227 | log.warn( 228 | s""" 229 | | Some data may have been lost because $deletedShardsList are not available in Kinesis 230 | | any more. The shard has been deleted before we have processed all records in it. 231 | | If you want your streaming query to fail on such cases, set the source option 232 | | "failOnDataLoss" to "true" 233 | """.stripMargin 234 | ) 235 | } 236 | } 237 | 238 | // filter the deleted shards 239 | var filteredPrevShardsInfo = prevShardsInfo.filter { 240 | s: ShardInfo => !deletedShardsList.contains(s.shardId) 241 | } 242 | 243 | // check for new shards and fetch ShardInfo for them 244 | openShards(latestShards).map { 245 | shardId: String => 246 | if (prevShardsList.contains(shardId)) { 247 | logDebug("Info for shardId " + shardId + " already exists") 248 | } 249 | else { 250 | AddShardInfoForAncestors(shardId, 251 | latestShards, initialPosition, prevShardsList, newShardsInfoMap, memoizationContext) 252 | newShardsInfoMap.put(shardId, 253 | new ShardInfo(shardId, initialPosition.shardPosition(shardId))) 254 | } 255 | } 256 | filteredPrevShardsInfo ++ newShardsInfoMap.values.toSeq 257 | } 258 | 259 | } 260 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/SparkAWSCredentials.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import com.amazonaws.auth._ 20 | 21 | import org.apache.spark.annotation.Evolving 22 | import org.apache.spark.internal.Logging 23 | 24 | /** 25 | * Serializable interface providing a method executors can call to obtain an 26 | * AWSCredentialsProvider instance for authenticating to AWS services. 
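 *
 * A minimal construction sketch (key values are placeholders, not real credentials):
 * {{{
 *   val creds: SparkAWSCredentials = SparkAWSCredentials.builder
 *     .basicCredentials("ACCESS-KEY-ID", "SECRET-KEY")
 *     .build()
 *   val provider = creds.provider  // AWSCredentialsProvider handed to the Kinesis clients
 * }}}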
27 | */ 28 | private[kinesis] sealed trait SparkAWSCredentials extends Serializable { 29 | /** 30 | * Return an AWSCredentialProvider instance that can be used by the Kinesis Client 31 | * Library to authenticate to AWS services (Kinesis, CloudWatch and DynamoDB). 32 | */ 33 | def provider: AWSCredentialsProvider 34 | } 35 | 36 | /** Returns DefaultAWSCredentialsProviderChain for authentication. */ 37 | private[kinesis] final case object DefaultCredentials extends SparkAWSCredentials { 38 | 39 | def provider: AWSCredentialsProvider = new DefaultAWSCredentialsProviderChain 40 | } 41 | 42 | /* 43 | * Returns AWSInstanceProfileCredentialsProviderWithRetries. 44 | */ 45 | 46 | private[kinesis] final case object InstanceProfileCredentials 47 | extends SparkAWSCredentials { 48 | def provider: AWSCredentialsProvider = new AWSInstanceProfileCredentialsProviderWithRetries 49 | } 50 | 51 | 52 | /** 53 | * Returns AWSStaticCredentialsProvider constructed using basic AWS keypair. Falls back to using 54 | * DefaultCredentialsProviderChain if unable to construct a AWSCredentialsProviderChain 55 | * instance with the provided arguments (e.g. if they are null). 56 | */ 57 | private[kinesis] final case class BasicCredentials( 58 | awsAccessKeyId: String, 59 | awsSecretKey: String) extends SparkAWSCredentials with Logging { 60 | 61 | def provider: AWSCredentialsProvider = try { 62 | new AWSStaticCredentialsProvider(new BasicAWSCredentials(awsAccessKeyId, awsSecretKey)) 63 | } catch { 64 | case e: IllegalArgumentException => 65 | logWarning("Unable to construct AWSStaticCredentialsProvider with provided keypair; " + 66 | "falling back to DefaultCredentialsProviderChain.", e) 67 | new DefaultAWSCredentialsProviderChain 68 | } 69 | } 70 | 71 | private[kinesis] final case class BasicAWSSessionCredentials( 72 | awsAccessKeyId: String, 73 | awsSecretKey: String, 74 | sessionToken: String) extends SparkAWSCredentials with Logging { 75 | 76 | def provider: AWSCredentialsProvider = try { 77 | new AWSStaticCredentialsProvider(new BasicSessionCredentials(awsAccessKeyId, awsSecretKey, sessionToken)) 78 | } catch { 79 | case e: IllegalArgumentException => 80 | logWarning("Unable to construct AWSStaticCredentialsProvider with provided keyparir; " + 81 | "falling back to DefaultCredentialsProviderChain.", e) 82 | new DefaultAWSCredentialsProviderChain 83 | } 84 | } 85 | 86 | /** 87 | * Returns an STSAssumeRoleSessionCredentialsProvider instance which assumes an IAM 88 | * role in order to authenticate against resources in an external account. 
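 *
 * Sketch of direct construction (the role ARN and session name are placeholders):
 * {{{
 *   val stsCreds = STSCredentials(
 *     stsRoleArn = "arn:aws:iam::123456789012:role/example-role",
 *     stsSessionName = "spark-kinesis-session",
 *     longLivedCreds = BasicCredentials("ACCESS-KEY-ID", "SECRET-KEY"))
 *   val provider = stsCreds.provider
 * }}}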
89 | */ 90 | private[kinesis] final case class STSCredentials( 91 | stsRoleArn: String, 92 | stsSessionName: String, 93 | stsExternalId: Option[String] = None, 94 | longLivedCreds: SparkAWSCredentials = DefaultCredentials) 95 | extends SparkAWSCredentials { 96 | 97 | def provider: AWSCredentialsProvider = { 98 | val builder = new STSAssumeRoleSessionCredentialsProvider.Builder(stsRoleArn, stsSessionName) 99 | .withLongLivedCredentialsProvider(longLivedCreds.provider) 100 | stsExternalId match { 101 | case Some(stsExternalId) => 102 | builder.withExternalId(stsExternalId) 103 | .build() 104 | case None => 105 | builder.build() 106 | } 107 | } 108 | } 109 | 110 | @Evolving 111 | object SparkAWSCredentials { 112 | 113 | @Evolving 114 | class Builder { 115 | private var basicCreds: Option[BasicCredentials] = None 116 | private var stsCreds: Option[STSCredentials] = None 117 | private var basicSessionCreds: Option[BasicSessionCredentials] = None 118 | 119 | // scalastyle:off 120 | /** 121 | * Use a basic AWS keypair for long-lived authorization. 122 | * 123 | * @note The given AWS keypair will be saved in DStream checkpoints if checkpointing is 124 | * enabled. Make sure that your checkpoint directory is secure. Prefer using the 125 | * [[http://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default default provider chain]] 126 | * instead if possible. 127 | * 128 | * @param accessKeyId AWS access key ID 129 | * @param secretKey AWS secret key 130 | * @return Reference to this [[SparkAWSCredentials.Builder]] 131 | */ 132 | // scalastyle:on 133 | def basicCredentials(accessKeyId: String, secretKey: String): Builder = { 134 | basicCreds = Option(BasicCredentials( 135 | awsAccessKeyId = accessKeyId, 136 | awsSecretKey = secretKey)) 137 | this 138 | } 139 | 140 | 141 | // scalastyle:off 142 | /** 143 | * Use a shortlived aws key pair plus security token for short-term authentication 144 | * 145 | * 146 | * @param accessKeyId AWS access key ID 147 | * @param secretKey AWS secret key 148 | * @param securityToken AWS Security Token 149 | * @return Reference to this [[SparkAWSCredentials.Builder]] 150 | */ 151 | // scalastyle:on 152 | def basicSessionCredentials(accessKeyId: String, secretKey: String, securityToken: String): Builder = { 153 | basicSessionCreds = Option(new BasicSessionCredentials( 154 | accessKeyId, 155 | secretKey, 156 | securityToken)) 157 | this 158 | } 159 | 160 | /** 161 | * Use STS to assume an IAM role for temporary session-based authentication. Will use configured 162 | * long-lived credentials for authorizing to STS itself (either the default provider chain 163 | * or a configured keypair). 164 | * 165 | * @param roleArn ARN of IAM role to assume via STS 166 | * @param sessionName Name to use for the STS session 167 | * @return Reference to this [[SparkAWSCredentials.Builder]] 168 | */ 169 | def stsCredentials(roleArn: String, sessionName: String): Builder = { 170 | stsCreds = Option(STSCredentials(stsRoleArn = roleArn, stsSessionName = sessionName)) 171 | this 172 | } 173 | 174 | /** 175 | * Use STS to assume an IAM role for temporary session-based authentication. Will use configured 176 | * long-lived credentials for authorizing to STS itself (either the default provider chain 177 | * or a configured keypair). STS will validate the provided external ID with the one defined 178 | * in the trust policy of the IAM role to be assumed (if one is present). 
179 | * 180 | * @param roleArn ARN of IAM role to assume via STS 181 | * @param sessionName Name to use for the STS session 182 | * @param externalId External ID to validate against assumed IAM role's trust policy 183 | * @return Reference to this [[SparkAWSCredentials.Builder]] 184 | */ 185 | def stsCredentials(roleArn: String, sessionName: String, externalId: String): Builder = { 186 | stsCreds = Option(STSCredentials( 187 | stsRoleArn = roleArn, 188 | stsSessionName = sessionName, 189 | stsExternalId = Option(externalId))) 190 | this 191 | } 192 | 193 | 194 | def build(): SparkAWSCredentials = 195 | stsCreds.map(_.copy(longLivedCreds = longLivedCreds)).getOrElse(longLivedCreds) 196 | 197 | private def longLivedCreds: SparkAWSCredentials = basicCreds.getOrElse(DefaultCredentials) 198 | } 199 | 200 | 201 | def builder: Builder = new Builder 202 | } 203 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/kinesis/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | /** 19 | * Structured Streaming Data Source for Kinesis 20 | */ 21 | 22 | package org.apache.spark.sql.kinesis; 23 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=true 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 25 | 26 | # Ignore messages below warning level from Jetty, because it's a bit verbose 27 | log4j.logger.org.spark_project.jetty=WARN 28 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/HDFSMetaDataCommiterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io.File 21 | 22 | import scala.language.implicitConversions 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | 26 | import org.apache.spark.SparkFunSuite 27 | import org.apache.spark.sql.test.SharedSparkSession 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | 31 | class HDFSMetaDataCommiterSuite extends SparkFunSuite with SharedSparkSession { 32 | 33 | val testConf: Configuration = new Configuration() 34 | val serializedConf = new SerializableConfiguration(testConf) 35 | 36 | test("Add and Get operation") { 37 | withTempDir { temp => 38 | val dir = new File(temp, "commit") 39 | val metadataCommitter = new HDFSMetadataCommitter[String](dir.getAbsolutePath, serializedConf) 40 | assert(metadataCommitter.add(0, "Shard-000001", "foo")) 41 | assert(metadataCommitter.get(0) === Seq("foo")) 42 | 43 | assert(metadataCommitter.add(1, "Shard-000001", "one")) 44 | assert(metadataCommitter.add(1, "Shard-000002", "two")) 45 | assert(metadataCommitter.get(1).toSet === Set("one", "two")) 46 | 47 | // Adding the same batch over-writes the previous entry 48 | // This is required since re-attempt of a failed task will 49 | // update in same location 50 | assert(metadataCommitter.add(1, "Shard-000001", "updated-one")) 51 | assert(metadataCommitter.get(1).toSet === Set("updated-one", "two")) 52 | } 53 | } 54 | 55 | test("Purge operation") { 56 | withTempDir { temp => 57 | val metadataCommitter = new HDFSMetadataCommitter[String]( 58 | temp.getAbsolutePath, serializedConf) 59 | 60 | assert(metadataCommitter.add(0, "Shard-000001", "one")) 61 | assert(metadataCommitter.add(1, "Shard-000001", "two")) 62 | assert(metadataCommitter.add(2, "Shard-000001", "three")) 63 | 64 | assert(metadataCommitter.get(0).nonEmpty) 65 | assert(metadataCommitter.get(1).nonEmpty) 66 | 
assert(metadataCommitter.get(2).nonEmpty) 67 | 68 | metadataCommitter.purge(2) 69 | assertThrows[IllegalStateException](metadataCommitter.get(0)) 70 | assertThrows[IllegalStateException](metadataCommitter.get(1)) 71 | assert(metadataCommitter.get(2).nonEmpty) 72 | 73 | // There should be exactly one file, called "2", in the metadata directory. 74 | val allFiles = new File(metadataCommitter.metadataPath.toString).listFiles().toSeq 75 | assert(allFiles.size == 1) 76 | assert(allFiles.head.getName == "2") 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/KinesisPositionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import org.apache.spark.SparkFunSuite 21 | 22 | class KinesisPositionSuite extends SparkFunSuite { 23 | 24 | test("Fail on invalid kinesis source offset JSON") { 25 | assertThrows[IllegalArgumentException] { 26 | InitialKinesisPosition.fromCheckpointJson("""{"a":5}""", new TrimHorizon()) 27 | } 28 | } 29 | 30 | test("Construct initial position from KinesisSourceOffset JSON") { 31 | // Given 32 | val shard00 = new AfterSequenceNumber("111") 33 | val shard01 = new AfterSequenceNumber("222") 34 | val offset = KinesisSourceOffset( 35 | ShardOffsets( 36 | batchId = 5L, 37 | streamName = "my.stream", 38 | shardInfoMap = Map( 39 | "shardId-00" -> ShardInfo("shardId-00", shard00.iteratorType, shard00.iteratorPosition), 40 | "shardId-01" -> ShardInfo("shardId-01", shard01.iteratorType, shard01.iteratorPosition) 41 | ) 42 | ) 43 | ) 44 | val offsetJson = offset.json 45 | 46 | // When 47 | val initPos = InitialKinesisPosition.fromCheckpointJson(offsetJson, new TrimHorizon()) 48 | 49 | // Expected 50 | val shard00Result = initPos.shardPosition("shardId-00") 51 | assertResult(shard00Result.iteratorType)(shard00.iteratorType) 52 | assertResult(shard00Result.iteratorPosition)(shard00.iteratorPosition) 53 | 54 | val shard01Result = initPos.shardPosition("shardId-01") 55 | assertResult(shard01Result.iteratorType)(shard01.iteratorType) 56 | assertResult(shard01Result.iteratorPosition)(shard01.iteratorPosition) 57 | 58 | // Should give default position for a newly discovered shard 59 | val shard02Result = initPos.shardPosition("shardId-02") 60 | assertResult(shard02Result.iteratorType)(new TrimHorizon().iteratorType) 61 | assertResult(shard02Result.iteratorPosition)(new TrimHorizon().iteratorPosition) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- 
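The suite above exercises the same path that KinesisSourceProvider.getKinesisPosition takes when the startingPosition option is a checkpoint-style JSON rather than latest, trim_horizon or earliest. Below is a minimal sketch of that round trip; the stream and shard names are hypothetical, and the object must live in package org.apache.spark.sql.kinesis (as the test suites do) because these helpers are private[kinesis].

package org.apache.spark.sql.kinesis

object StartingPositionJsonSketch {
  def main(args: Array[String]): Unit = {
    // An offset meaning "resume shard 0 after sequence number 1234".
    val offset = KinesisSourceOffset(
      new ShardOffsets(1L, "my-stream",
        Array(ShardInfo("shardId-000000000000", "AFTER_SEQUENCE_NUMBER", "1234"))))

    // Its JSON form is what a caller could pass as the "startingPosition" option;
    // shards not present in the JSON fall back to the default position (TRIM_HORIZON here).
    val initPos = InitialKinesisPosition.fromCheckpointJson(offset.json, new TrimHorizon())
    println(initPos.shardPosition("shardId-000000000000").iteratorType)  // AFTER_SEQUENCE_NUMBER
    println(initPos.shardPosition("shardId-000000000999").iteratorType)  // TRIM_HORIZON
  }
}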
/src/test/scala/org/apache/spark/sql/kinesis/KinesisReaderSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | 19 | package org.apache.spark.sql.kinesis 20 | 21 | import scala.util.Try 22 | 23 | import org.scalatest.PrivateMethodTester 24 | 25 | import org.apache.spark.SparkException 26 | import org.apache.spark.sql.kinesis.KinesisTestUtils.{envVarNameForEnablingTests, shouldRunTests} 27 | import org.apache.spark.sql.test.SharedSparkSession 28 | 29 | class KinesisReaderSuite extends SharedSparkSession with PrivateMethodTester { 30 | 31 | protected var testUtils: KinesisTestUtils = _ 32 | 33 | /** Run the test if environment variable is set or ignore the test */ 34 | def testIfEnabled(testName: String)(testBody: => Unit) { 35 | if (shouldRunTests) { 36 | test(testName)(testBody) 37 | } else { 38 | ignore(s"$testName [enable by setting env var $envVarNameForEnablingTests=1]")(testBody) 39 | } 40 | } 41 | 42 | test("Should throw exception when there is no InstanceProfile") { 43 | val ex = intercept[ SparkException ] { 44 | val kinesisReader = 45 | new KinesisReader( 46 | Map.empty[String, String], 47 | "Test", 48 | InstanceProfileCredentials, 49 | KinesisTestUtils.endpointUrl 50 | ) 51 | kinesisReader.getShards() 52 | } 53 | } 54 | 55 | test("Should throw exception when STSCredentials are incorrect") { 56 | val ex = intercept[ SparkException ] { 57 | val kinesisReader = new KinesisReader( 58 | Map.empty[ String, String], 59 | "Test", 60 | STSCredentials("role-arn", "session-name"), 61 | KinesisTestUtils.endpointUrl) 62 | kinesisReader.getShards() 63 | } 64 | } 65 | 66 | test("Should throw exception when BasicCredentials are incorrect") { 67 | val ex = intercept[ SparkException ] { 68 | val kinesisReader = 69 | new KinesisReader( 70 | Map.empty[String, String], 71 | "Test", 72 | BasicCredentials("access-key", "secret-key"), 73 | KinesisTestUtils.endpointUrl 74 | ) 75 | kinesisReader.getShards() 76 | } 77 | } 78 | 79 | testIfEnabled("Should succeed for valid Credentials") { 80 | Try { 81 | val kinesisReader = 82 | new KinesisReader( 83 | Map.empty[String, String], 84 | "Test", 85 | BasicCredentials( 86 | KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 87 | KinesisTestUtils.getAWSCredentials().getAWSSecretKey 88 | ), 89 | KinesisTestUtils.endpointUrl 90 | ) 91 | kinesisReader.getShards() 92 | }.isSuccess 93 | } 94 | 95 | testIfEnabled("getShardIterator should return null when shard-id is incorrect" + 96 | " and failOnDataLoss is false") { 97 | val kinesisReader = 98 | new KinesisReader( 99 | Map.empty[String, String], 100 | "Test", 101 | BasicCredentials( 102 | 
KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 103 | KinesisTestUtils.getAWSCredentials().getAWSSecretKey 104 | ), 105 | KinesisTestUtils.endpointUrl 106 | ) 107 | val shardIterator = kinesisReader.getShardIterator("BAD-SHARD-ID", "LATEST", 108 | "", false) 109 | assert(shardIterator === null) 110 | } 111 | 112 | testIfEnabled("getShardIterator should throw exception when shard-id is incorrect" + 113 | " and failOnDataLoss is true") { 114 | val ex = intercept[ SparkException ] { 115 | val kinesisReader = 116 | new KinesisReader( 117 | Map.empty[String, String], 118 | "Test", 119 | BasicCredentials( 120 | KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 121 | KinesisTestUtils.getAWSCredentials().getAWSSecretKey 122 | ), 123 | KinesisTestUtils.endpointUrl 124 | ) 125 | val shardIterator = kinesisReader.getShardIterator("BAD-SHARD-ID", "LATEST", 126 | "", true) 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/KinesisSinkSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.kinesis 18 | 19 | import java.util.Locale 20 | 21 | import org.scalatest.concurrent.PatienceConfiguration.Timeout 22 | 23 | import org.apache.spark.sql.{DataFrame, Row} 24 | import org.apache.spark.sql.execution.streaming.MemoryStream 25 | import org.apache.spark.sql.kinesis.KinesisTestUtils.{envVarNameForEnablingTests, shouldRunTests} 26 | import org.apache.spark.sql.streaming._ 27 | import org.apache.spark.sql.streaming.util.StreamManualClock 28 | import org.apache.spark.sql.test.SharedSparkSession 29 | 30 | abstract class KinesisSinkTest extends StreamTest with SharedSparkSession { 31 | 32 | protected var testUtils: KinesisTestUtils = _ 33 | 34 | override def beforeAll(): Unit = { 35 | super.beforeAll() 36 | 37 | testUtils = new KPLBasedKinesisTestUtils(1) 38 | testUtils.createStream() 39 | } 40 | 41 | override def afterAll(): Unit = { 42 | if (testUtils != null) { 43 | testUtils.deleteStream() 44 | testUtils = null 45 | super.afterAll() 46 | } 47 | } 48 | 49 | /** Run the test if environment variable is set or ignore the test */ 50 | def testIfEnabled(testName: String)(testBody: => Unit) { 51 | if (shouldRunTests) { 52 | test(testName)(testBody) 53 | } else { 54 | ignore(s"$testName [enable by setting env var $envVarNameForEnablingTests=1]")(testBody) 55 | } 56 | } 57 | 58 | /** Run the give body of code only if Kinesis tests are enabled */ 59 | def runIfTestsEnabled(message: String)(body: => Unit): Unit = { 60 | if (shouldRunTests) { 61 | body 62 | } else { 63 | ignore(s"$message [enable by setting env var $envVarNameForEnablingTests=1]")(()) 64 | } 65 | } 66 | 67 | } 68 | 69 | class KinesisSinkOptionsSuite extends StreamTest with SharedSparkSession { 70 | 71 | test("bad source options") { 72 | def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { 73 | val ex = intercept[IllegalArgumentException] { 74 | val reader = spark.readStream.format("kinesis") 75 | options.foreach { case (k, v) => reader.option(k, v) } 76 | reader.load() 77 | } 78 | expectedMsgs.foreach { m => 79 | assert(ex.getMessage.toLowerCase(Locale.ROOT).contains(m.toLowerCase(Locale.ROOT))) 80 | } 81 | } 82 | 83 | testBadOptions()("Stream name is a required field") 84 | testBadOptions("streamname" -> "")("Stream name is a required field") 85 | } 86 | } 87 | 88 | class KinesisSinkSuite extends KinesisSinkTest { 89 | 90 | import testImplicits._ 91 | 92 | testIfEnabled("Test write data with bad schema") { 93 | val input = MemoryStream[String] 94 | var writer: StreamingQuery = null 95 | var ex: Exception = null 96 | 97 | val options = Map[String, String]( 98 | "streamName" -> testUtils.streamName, 99 | "endpointUrl" -> testUtils.endpointUrl, 100 | "AWSAccessKeyId" -> KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 101 | "AWSSecretKey" -> KinesisTestUtils.getAWSCredentials().getAWSSecretKey 102 | ) 103 | 104 | try { 105 | ex = intercept[StreamingQueryException] { 106 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 107 | withSelectExpr = "value as partitionKey", "value" 108 | ) 109 | input.addData("1", "2", "3", "4", "5") 110 | writer.processAllAvailable() 111 | } 112 | } finally { 113 | if (writer != null) { 114 | writer.stop() 115 | } 116 | } 117 | assert(ex.getMessage 118 | .toLowerCase(Locale.ROOT) 119 | .contains("required attribute 'data' not found")) 120 | 121 | try { 122 | ex = intercept[StreamingQueryException] { 123 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 124 | withSelectExpr = "value as 
data", "value" 125 | ) 126 | input.addData("1", "2", "3", "4", "5") 127 | writer.processAllAvailable() 128 | } 129 | } finally { 130 | if (writer != null) { 131 | writer.stop() 132 | } 133 | } 134 | assert(ex.getMessage 135 | .toLowerCase(Locale.ROOT) 136 | .contains("required attribute 'partitionkey' not found")) 137 | } 138 | 139 | testIfEnabled("Test write data with valid schema but wrong types") { 140 | val options = Map[String, String]( 141 | "streamName" -> testUtils.streamName, 142 | "endpointUrl" -> testUtils.endpointUrl, 143 | "AWSAccessKeyId" -> KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 144 | "AWSSecretKey" -> KinesisTestUtils.getAWSCredentials().getAWSSecretKey 145 | ) 146 | 147 | val input = MemoryStream[String] 148 | var writer: StreamingQuery = null 149 | var ex: Exception = null 150 | try { 151 | /* partitionKey field wrong type */ 152 | ex = intercept[StreamingQueryException] { 153 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 154 | withSelectExpr = s"CAST('1' as INT) as partitionKey", "value as data" 155 | ) 156 | input.addData("1", "2", "3", "4", "5") 157 | writer.processAllAvailable() 158 | } 159 | } finally { 160 | if (writer != null) { 161 | writer.stop() 162 | } 163 | } 164 | assert(ex.getMessage.toLowerCase(Locale.ROOT) 165 | .contains("partitionkey attribute type must be a string or binarytype")) 166 | 167 | try { 168 | /* data field wrong type */ 169 | ex = intercept[StreamingQueryException] { 170 | writer = createKinesisWriter(input.toDF(), withOptions = options)( 171 | withSelectExpr = "value as partitionKey", "CAST(value as INT) as data" 172 | ) 173 | input.addData("1", "2", "3", "4", "5") 174 | writer.processAllAvailable() 175 | } 176 | } finally { 177 | if (writer != null) { 178 | writer.stop() 179 | } 180 | } 181 | assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( 182 | "data attribute type must be a string or binarytype")) 183 | } 184 | 185 | testIfEnabled("Test write data to Kinesis") { 186 | val clock = new StreamManualClock 187 | 188 | val waitUntilBatchProcessed = AssertOnQuery { q => 189 | eventually(Timeout(streamingTimeout)) { 190 | if (!q.exception.isDefined) { 191 | assert(clock.isStreamWaitingAt(clock.getTimeMillis())) 192 | } 193 | } 194 | if (q.exception.isDefined) { 195 | throw q.exception.get 196 | } 197 | true 198 | } 199 | var writer: StreamingQuery = null 200 | 201 | val input = MemoryStream[String] 202 | val writerOptions = Map[String, String]( 203 | "streamName" -> testUtils.streamName, 204 | "endpointUrl" -> testUtils.endpointUrl, 205 | "AWSAccessKeyId" -> KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId, 206 | "AWSSecretKey" -> KinesisTestUtils.getAWSCredentials().getAWSSecretKey 207 | ) 208 | 209 | val reader = createKinesisReader() 210 | .selectExpr("CAST(data AS STRING)") 211 | .as[String].map(_.toInt) 212 | 213 | try { 214 | writer = createKinesisWriter(input.toDF(), withOptions = writerOptions)( 215 | withSelectExpr = s"CAST('1' as STRING) as partitionKey", "value as data") 216 | input.addData("1", "2", "3", "4", "5") 217 | 218 | testStream(reader)( 219 | StartStream(Trigger.ProcessingTime(100), clock), 220 | waitUntilBatchProcessed, 221 | AssertOnQuery { query => 222 | logInfo("Pushing Data ") 223 | writer.processAllAvailable() 224 | true 225 | }, 226 | AdvanceManualClock(100), 227 | waitUntilBatchProcessed, 228 | CheckAnswer(1, 2, 3, 4, 5) 229 | ) 230 | 231 | } finally { 232 | if (writer != null) { 233 | writer.stop() 234 | } 235 | } 236 | } 237 | 238 | private def 
createKinesisReader(): DataFrame = { 239 | spark.readStream 240 | .format("kinesis") 241 | .option("streamName", testUtils.streamName) 242 | .option("endpointUrl", testUtils.endpointUrl) 243 | .option("AWSAccessKeyId", KinesisTestUtils.getAWSCredentials().getAWSAccessKeyId) 244 | .option("AWSSecretKey", KinesisTestUtils.getAWSCredentials().getAWSSecretKey) 245 | .load 246 | } 247 | 248 | private def createKinesisWriter(input: DataFrame, 249 | withOutputMode: Option[OutputMode] = None, 250 | withOptions: Map[String, String] = Map[String, String]()) 251 | (withSelectExpr: String*): StreamingQuery = { 252 | var stream: DataStreamWriter[Row] = null 253 | withTempDir { checkpointDir => 254 | var df = input.toDF() 255 | if (withSelectExpr.nonEmpty) { 256 | df = df.selectExpr(withSelectExpr: _*) 257 | } 258 | stream = df.writeStream 259 | .format("kinesis") 260 | .option("checkpointLocation", checkpointDir.getCanonicalPath) 261 | .queryName("kinesisStream") 262 | withOutputMode.foreach(stream.outputMode(_)) 263 | withOptions.foreach(opt => stream.option(opt._1, opt._2)) 264 | } 265 | stream.start() 266 | } 267 | } -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/KinesisSourceOffsetSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.io.File 21 | 22 | import org.apache.spark.sql.execution.streaming._ 23 | import org.apache.spark.sql.streaming.OffsetSuite 24 | import org.apache.spark.sql.test.SharedSparkSession 25 | 26 | 27 | class KinesisSourceOffsetSuite extends OffsetSuite with SharedSparkSession { 28 | 29 | 30 | compare( 31 | one = KinesisSourceOffset(new ShardOffsets(-1L, "dummy", Array.empty[ShardInfo])), 32 | two = KinesisSourceOffset(new ShardOffsets(1L, "dummy", Array.empty[ShardInfo]))) 33 | 34 | compare( 35 | one = KinesisSourceOffset(new ShardOffsets(1L, "foo", Array.empty[ShardInfo])), 36 | two = KinesisSourceOffset(new ShardOffsets(1L, "bar", Array.empty[ShardInfo])) 37 | ) 38 | 39 | compare( 40 | one = KinesisSourceOffset(new ShardOffsets(1L, "foo", Array( 41 | new ShardInfo("shard-001", new TrimHorizon())))), 42 | two = KinesisSourceOffset(new ShardOffsets(1L, "foo", 43 | Array(new ShardInfo("shard-001", new TrimHorizon()), 44 | new ShardInfo("shard-002", new TrimHorizon()) ))) 45 | ) 46 | var shardInfo1 = Array.empty[ShardInfo] 47 | shardInfo1 = shardInfo1 ++ Array(ShardInfo("shard-001", "AFTER_SEQUENCE_NUMBER", "1234")) 48 | 49 | val kso1 = KinesisSourceOffset( 50 | new ShardOffsets(1L, "foo", shardInfo1)) 51 | 52 | val shardInfo2 = shardInfo1 ++ Array(ShardInfo("shard-002", "TRIM_HORIZON", "")) 53 | val kso2 = KinesisSourceOffset( 54 | new ShardOffsets(1L, "bar", shardInfo2)) 55 | 56 | val shardInfo3 = shardInfo2 ++ Array(ShardInfo("shard-003", "AFTER_SEQUENCE_NUMBER", "2342")) 57 | val kso3 = KinesisSourceOffset( 58 | new ShardOffsets(1L, "bar", shardInfo3) 59 | ) 60 | 61 | compare(KinesisSourceOffset(SerializedOffset(kso1.json)), kso2) 62 | 63 | test("basic serialization - deserialization") { 64 | assert(KinesisSourceOffset.getShardOffsets(kso1) == 65 | KinesisSourceOffset.getShardOffsets(SerializedOffset(kso1.json))) 66 | } 67 | 68 | test("OffsetSeqLog serialization - deserialization") { 69 | withTempDir { temp => 70 | // use non-existent directory to test whether log make the dir 71 | val dir = new File(temp, "dir") 72 | val metadataLog = new OffsetSeqLog(spark, dir.getAbsolutePath) 73 | val batch0 = OffsetSeq.fill(kso1) 74 | val batch1 = OffsetSeq.fill(kso2, kso3) 75 | 76 | val batch0Serialized = OffsetSeq.fill(batch0.offsets.flatMap(_.map(o => 77 | SerializedOffset(o.json))): _*) 78 | 79 | val batch1Serialized = OffsetSeq.fill(batch1.offsets.flatMap(_.map(o => 80 | SerializedOffset(o.json))): _*) 81 | 82 | assert(metadataLog.add(0, batch0)) 83 | assert(metadataLog.getLatest() === Some(0 -> batch0Serialized)) 84 | assert(metadataLog.get(0) === Some(batch0Serialized)) 85 | 86 | assert(metadataLog.add(1, batch1)) 87 | assert(metadataLog.get(0) === Some(batch0Serialized)) 88 | assert(metadataLog.get(1) === Some(batch1Serialized)) 89 | assert(metadataLog.getLatest() === Some(1 -> batch1Serialized)) 90 | assert(metadataLog.get(None, Some(1)) === 91 | Array(0 -> batch0Serialized, 1 -> batch1Serialized)) 92 | 93 | // Adding the same batch does nothing 94 | metadataLog.add(1, OffsetSeq.fill(LongOffset(3))) 95 | assert(metadataLog.get(0) === Some(batch0Serialized)) 96 | assert(metadataLog.get(1) === Some(batch1Serialized)) 97 | assert(metadataLog.getLatest() === Some(1 -> batch1Serialized)) 98 | assert(metadataLog.get(None, Some(1)) === 99 | Array(0 -> batch0Serialized, 1 -> batch1Serialized)) 100 | } 101 | } 102 | 103 | 104 | } 105 | -------------------------------------------------------------------------------- 
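For readers following the offset-log tests in KinesisSourceOffsetSuite above, here is a minimal, self-contained sketch of the same JSON round trip outside the test harness. It is not part of the repository: the object name OffsetRoundTripSketch and the shard ids are invented, and the object is placed in the org.apache.spark.sql.kinesis package on the assumption that the offset classes exercised by the suite are package-private; it only reuses the constructors and methods the suite itself calls.

package org.apache.spark.sql.kinesis

import org.apache.spark.sql.execution.streaming.SerializedOffset

// Hypothetical sketch: build a KinesisSourceOffset, serialize it to the JSON
// form Spark writes into its offset log, and parse it back, mirroring the
// "basic serialization - deserialization" test above.
object OffsetRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val shards = Array(
      ShardInfo("shard-001", "AFTER_SEQUENCE_NUMBER", "1234"),
      ShardInfo("shard-002", "TRIM_HORIZON", ""))
    val offset = KinesisSourceOffset(new ShardOffsets(1L, "foo", shards))

    // The JSON string is what ends up in the checkpoint's offset log.
    val json = offset.json

    // Reading it back through SerializedOffset should recover the same
    // per-shard positions, exactly as the suite asserts.
    val restored = KinesisSourceOffset.getShardOffsets(SerializedOffset(json))
    assert(restored == KinesisSourceOffset.getShardOffsets(offset))
    println(json)
  }
}

This round trip is what lets a restarted query resume from the shard positions recorded in its checkpoint rather than reprocessing the stream from scratch.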
/src/test/scala/org/apache/spark/sql/kinesis/KinesisTestUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import java.nio.ByteBuffer 21 | import java.nio.charset.StandardCharsets 22 | import java.util.concurrent.TimeUnit 23 | 24 | import scala.collection.JavaConverters._ 25 | import scala.collection.mutable 26 | import scala.collection.mutable.ArrayBuffer 27 | import scala.util.{Failure, Random, Success, Try} 28 | 29 | import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} 30 | import com.amazonaws.regions.RegionUtils 31 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient 32 | import com.amazonaws.services.dynamodbv2.document.DynamoDB 33 | import com.amazonaws.services.kinesis.{AmazonKinesis, AmazonKinesisClient} 34 | import com.amazonaws.services.kinesis.model._ 35 | import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult} 36 | import com.google.common.util.concurrent.{FutureCallback, Futures} 37 | 38 | import org.apache.spark.internal.Logging 39 | 40 | private[kinesis] class KinesisTestUtils(streamShardCount: Int = 2) extends Logging { 41 | 42 | val endpointUrl = KinesisTestUtils.endpointUrl 43 | val regionName = KinesisTestUtils.getRegionNameByEndpoint(endpointUrl) 44 | 45 | private val createStreamTimeoutSeconds = 300 46 | private val describeStreamPollTimeSeconds = 1 47 | 48 | @volatile 49 | private var streamCreated = false 50 | 51 | @volatile 52 | private var _streamName: String = _ 53 | 54 | protected lazy val kinesisClient = { 55 | val client = new AmazonKinesisClient(KinesisTestUtils.getAWSCredentials()) 56 | client.setEndpoint(endpointUrl) 57 | client 58 | } 59 | 60 | /* 61 | private lazy val dynamoDB = { 62 | val dynamoDBClient = new AmazonDynamoDBClient(new DefaultAWSCredentialsProviderChain()) 63 | dynamoDBClient.setRegion(RegionUtils.getRegion(regionName)) 64 | new DynamoDB(dynamoDBClient) 65 | } 66 | */ 67 | 68 | protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { 69 | if (!aggregate) { 70 | new SimpleDataGenerator(kinesisClient) 71 | } else { 72 | throw new UnsupportedOperationException("Aggregation is not supported through this code path") 73 | } 74 | } 75 | 76 | 77 | def streamName: String = { 78 | require(streamCreated, "Stream not yet created, call createStream() to create one") 79 | _streamName 80 | } 81 | 82 | def createStream(): Unit = { 83 | require(!streamCreated, "Stream already created") 84 | _streamName = findNonExistentStreamName() 85 | 86 | // Create a stream. 
The number of shards determines the provisioned throughput. 87 | logInfo(s"Creating stream ${_streamName}") 88 | val createStreamRequest = new CreateStreamRequest() 89 | createStreamRequest.setStreamName(_streamName) 90 | createStreamRequest.setShardCount(streamShardCount) 91 | kinesisClient.createStream(createStreamRequest) 92 | 93 | // The stream is now being created. Wait for it to become active. 94 | waitForStreamToBeActive(_streamName) 95 | streamCreated = true 96 | logInfo(s"Created stream ${_streamName}") 97 | } 98 | 99 | def getShards(): Seq[Shard] = { 100 | kinesisClient.describeStream(_streamName).getStreamDescription.getShards.asScala 101 | } 102 | 103 | def splitShard(shardId: String): Unit = { 104 | val splitShardRequest = new SplitShardRequest() 105 | splitShardRequest.withStreamName(_streamName) 106 | splitShardRequest.withShardToSplit(shardId) 107 | // Set a half of the max hash value 108 | splitShardRequest.withNewStartingHashKey("170141183460469231731687303715884105728") 109 | kinesisClient.splitShard(splitShardRequest) 110 | // Wait for the shards to become active 111 | waitForStreamToBeActive(_streamName) 112 | } 113 | 114 | def splitShard : (Integer, Integer) = { 115 | val shardToSplit = getShards().head 116 | splitShard(shardToSplit.getShardId) 117 | val (splitOpenShards, splitCloseShards) = getShards().partition { 118 | shard => shard.getSequenceNumberRange.getEndingSequenceNumber == null 119 | } 120 | (splitOpenShards.size, splitCloseShards.size) 121 | } 122 | 123 | def mergeShard(shardToMerge: String, adjacentShardToMerge: String): Unit = { 124 | val mergeShardRequest = new MergeShardsRequest 125 | mergeShardRequest.withStreamName(_streamName) 126 | mergeShardRequest.withShardToMerge(shardToMerge) 127 | mergeShardRequest.withAdjacentShardToMerge(adjacentShardToMerge) 128 | kinesisClient.mergeShards(mergeShardRequest) 129 | // Wait for the shards to become active 130 | waitForStreamToBeActive(_streamName) 131 | } 132 | 133 | 134 | def mergeShard: (Integer, Integer) = { 135 | val (openShard, closeShard) = getShards().partition { 136 | shard => shard.getSequenceNumberRange.getEndingSequenceNumber == null 137 | } 138 | val Seq(shardToMerge, adjShard) = openShard 139 | mergeShard(shardToMerge.getShardId, adjShard.getShardId) 140 | val shardToSplit = getShards().head 141 | val (mergedOpenShards, mergedCloseShards) = 142 | getShards().partition { 143 | shard => shard.getSequenceNumberRange.getEndingSequenceNumber == null 144 | } 145 | (mergedOpenShards.size, mergedCloseShards.size) 146 | } 147 | 148 | /** 149 | * Push data to Kinesis stream and return a map of 150 | * shardId -> seq of (data, seq number) pushed to corresponding shard 151 | */ 152 | def pushData(testData: Array[String], aggregate: Boolean): Map[String, Seq[(String, String)]] = { 153 | require(streamCreated, "Stream not yet created, call createStream() to create one") 154 | val producer = getProducer(aggregate) 155 | val shardIdToSeqNumbers = producer.sendData(streamName, testData) 156 | logInfo(s"Pushed $testData:\n\t ${shardIdToSeqNumbers.mkString("\n\t")}") 157 | shardIdToSeqNumbers 158 | } 159 | 160 | /** 161 | * Expose a Python friendly API. 
162 | */ 163 | def pushData(testData: java.util.List[String]): Unit = { 164 | pushData(testData.asScala.toArray, aggregate = false) 165 | } 166 | 167 | def deleteStream(): Unit = { 168 | try { 169 | if (streamCreated) { 170 | kinesisClient.deleteStream(streamName) 171 | } 172 | } catch { 173 | case e: Exception => 174 | logWarning(s"Could not delete stream $streamName") 175 | } 176 | } 177 | 178 | /* 179 | def deleteDynamoDBTable(tableName: String): Unit = { 180 | try { 181 | val table = dynamoDB.getTable(tableName) 182 | table.delete() 183 | table.waitForDelete() 184 | } catch { 185 | case e: Exception => 186 | logWarning(s"Could not delete DynamoDB table $tableName") 187 | } 188 | } 189 | */ 190 | 191 | private def describeStream(streamNameToDescribe: String): Option[StreamDescription] = { 192 | try { 193 | val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe) 194 | val desc = kinesisClient.describeStream(describeStreamRequest).getStreamDescription() 195 | Some(desc) 196 | } catch { 197 | case rnfe: ResourceNotFoundException => 198 | None 199 | } 200 | } 201 | 202 | private def findNonExistentStreamName(): String = { 203 | var testStreamName: String = null 204 | do { 205 | Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds)) 206 | testStreamName = s"KinesisTestUtils-${math.abs(Random.nextLong())}" 207 | } while (describeStream(testStreamName).nonEmpty) 208 | testStreamName 209 | } 210 | 211 | private def waitForStreamToBeActive(streamNameToWaitFor: String): Unit = { 212 | val startTime = System.currentTimeMillis() 213 | val endTime = startTime + TimeUnit.SECONDS.toMillis(createStreamTimeoutSeconds) 214 | while (System.currentTimeMillis() < endTime) { 215 | Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds)) 216 | describeStream(streamNameToWaitFor).foreach { description => 217 | val streamStatus = description.getStreamStatus() 218 | logDebug(s"\t- current state: $streamStatus\n") 219 | if ("ACTIVE".equals(streamStatus)) { 220 | return 221 | } 222 | } 223 | } 224 | require(false, s"Stream $streamName never became active") 225 | } 226 | } 227 | 228 | private[kinesis] object KinesisTestUtils { 229 | 230 | val envVarNameForEnablingTests = "ENABLE_KINESIS_SQL_TESTS" 231 | val endVarNameForEndpoint = "KINESIS_TEST_ENDPOINT_URL" 232 | val defaultEndpointUrl = "https://kinesis.us-east-1.amazonaws.com" 233 | val regionName: String = getRegionNameByEndpoint(endpointUrl) 234 | 235 | def getRegionNameByEndpoint(endpoint: String): String = { 236 | val uri = new java.net.URI(endpoint) 237 | RegionUtils.getRegionsForService(AmazonKinesis.ENDPOINT_PREFIX) 238 | .asScala 239 | .find(_.getAvailableEndpoints.asScala.toSeq.contains(uri.getHost)) 240 | .map(_.getName) 241 | .getOrElse( 242 | throw new IllegalArgumentException(s"Could not resolve region for endpoint: $endpoint")) 243 | } 244 | 245 | lazy val shouldRunTests = { 246 | val isEnvSet = sys.env.get(envVarNameForEnablingTests) == Some("1") 247 | if (isEnvSet) { 248 | // scalastyle:off println 249 | // Print this so that they are easily visible on the console and not hidden in the log4j logs. 250 | println( 251 | s""" 252 | |Kinesis tests that actually send data has been enabled by setting the environment 253 | |variable $envVarNameForEnablingTests to 1. This will create Kinesis Streams 254 | |in AWS. Please be aware that this may incur some AWS costs. 255 | |By default, the tests use the endpoint URL $defaultEndpointUrl to create Kinesis streams. 
256 | |To change this endpoint URL to a different region, you can set the environment variable 257 | |$endVarNameForEndpoint to the desired endpoint URL 258 | |(e.g. $endVarNameForEndpoint="https://kinesis.us-west-2.amazonaws.com"). 259 | """.stripMargin) 260 | // scalastyle:on println 261 | } 262 | isEnvSet 263 | } 264 | 265 | lazy val endpointUrl = { 266 | val url = sys.env.getOrElse(endVarNameForEndpoint, defaultEndpointUrl) 267 | // scalastyle:off println 268 | // Print this so that they are easily visible on the console and not hidden in the log4j logs. 269 | println(s"Using endpoint URL $url for creating Kinesis streams for tests.") 270 | // scalastyle:on println 271 | url 272 | } 273 | 274 | def isAWSCredentialsPresent: Boolean = { 275 | Try { new DefaultAWSCredentialsProviderChain().getCredentials() }.isSuccess 276 | } 277 | 278 | def getAWSCredentials(): AWSCredentials = { 279 | assert(shouldRunTests, 280 | "Kinesis test not enabled, should not attempt to get AWS credentials") 281 | Try { new DefaultAWSCredentialsProviderChain().getCredentials() } match { 282 | case Success(cred) => 283 | cred 284 | case Failure(e) => 285 | throw new Exception( 286 | s""" 287 | |Kinesis tests enabled using environment variable $envVarNameForEnablingTests 288 | |but could not find AWS credentials. Please follow instructions in AWS documentation 289 | |to set the credentials in your system such that the DefaultAWSCredentialsProviderChain 290 | |can find the credentials. 291 | """.stripMargin) 292 | } 293 | } 294 | } 295 | 296 | /** A wrapper interface that will allow us to consolidate the code for synthetic data generation. */ 297 | private[kinesis] trait KinesisDataGenerator { 298 | /** Sends the data to Kinesis and returns the metadata for everything that has been sent. */ 299 | def sendData(streamName: String, data: Array[String]): Map[String, Seq[(String, String)]] 300 | } 301 | 302 | private[kinesis] class SimpleDataGenerator( 303 | client: AmazonKinesisClient) extends KinesisDataGenerator { 304 | override def sendData(streamName: String, data: Array[String]): 305 | Map[String, Seq[(String, String)]] = { 306 | val shardIdToSeqNumbers = 307 | new mutable.HashMap[String, ArrayBuffer[(String, String)]]() 308 | data.foreach { num => 309 | val str = num.toString 310 | val data = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8)) 311 | val putRecordRequest = new PutRecordRequest().withStreamName(streamName) 312 | .withData(data) 313 | .withPartitionKey(str) 314 | 315 | val putRecordResult = client.putRecord(putRecordRequest) 316 | val shardId = putRecordResult.getShardId 317 | val seqNumber = putRecordResult.getSequenceNumber() 318 | val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, 319 | new ArrayBuffer[(String, String)]()) 320 | sentSeqNumbers += ((num, seqNumber)) 321 | } 322 | 323 | shardIdToSeqNumbers.toMap 324 | } 325 | } 326 | 327 | private[kinesis] class KPLBasedKinesisTestUtils(streamShardCount: Int = 2) 328 | extends KinesisTestUtils(streamShardCount) { 329 | override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { 330 | if (!aggregate) { 331 | new SimpleDataGenerator(kinesisClient) 332 | } else { 333 | new KPLDataGenerator(regionName) 334 | } 335 | } 336 | } 337 | 338 | /** A wrapper for the KinesisProducer provided in the KPL. 
*/ 339 | private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator { 340 | 341 | private lazy val producer: KPLProducer = { 342 | val conf = new KinesisProducerConfiguration() 343 | .setRecordMaxBufferedTime(1000) 344 | .setMaxConnections(1) 345 | .setRegion(regionName) 346 | .setMetricsLevel("none") 347 | 348 | new KPLProducer(conf) 349 | } 350 | 351 | override def sendData(streamName: String, 352 | data: Array[String]): Map[String, Seq[(String, String)]] = { 353 | val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(String, String)]]() 354 | data.foreach { num => 355 | val str = num.toString 356 | val data = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8)) 357 | val future = producer.addUserRecord(streamName, str, data) 358 | val kinesisCallBack = new FutureCallback[UserRecordResult]() { 359 | override def onFailure(t: Throwable): Unit = {} // do nothing 360 | 361 | override def onSuccess(result: UserRecordResult): Unit = { 362 | val shardId = result.getShardId 363 | val seqNumber = result.getSequenceNumber() 364 | val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, 365 | new ArrayBuffer[(String, String)]()) 366 | sentSeqNumbers += ((num, seqNumber)) 367 | } 368 | } 369 | Futures.addCallback(future, kinesisCallBack) 370 | } 371 | producer.flushSync() 372 | shardIdToSeqNumbers.toMap 373 | } 374 | } 375 | 376 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/kinesis/ShardSyncerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.kinesis 19 | 20 | import com.amazonaws.services.kinesis.model.{SequenceNumberRange, Shard} 21 | 22 | import org.apache.spark.SparkFunSuite 23 | import org.apache.spark.sql.test.SharedSparkSession 24 | 25 | class ShardSyncerSuite extends SparkFunSuite with SharedSparkSession { 26 | 27 | val latestShards = Seq(createShard("shard1", "1")) 28 | val prevShardInfo = Seq(new ShardInfo("shard0", new AfterSequenceNumber("0"))) 29 | 30 | test("Should error out when failondataloss is true and a shard is deleted") { 31 | intercept[IllegalStateException] { 32 | ShardSyncer.getLatestShardInfo(latestShards, prevShardInfo, 33 | InitialKinesisPosition.fromPredefPosition(new TrimHorizon), true) 34 | } 35 | } 36 | 37 | test("Should pick up new shards when failondataloss is false and a shard is deleted") { 38 | val expectedShardInfo = Seq(new ShardInfo("shard1", new TrimHorizon)) 39 | val latest: Seq[ShardInfo] = ShardSyncer.getLatestShardInfo( 40 | latestShards, prevShardInfo, InitialKinesisPosition.fromPredefPosition(new TrimHorizon), 41 | false) 42 | assert(latest.nonEmpty) 43 | assert(latest(0).shardId === expectedShardInfo.head.shardId) 44 | assert(latest(0).iteratorType === new TrimHorizon().iteratorType) 45 | } 46 | 47 | private def createShard(shardId: String, seqNum: String): Shard = { 48 | new Shard() 49 | .withShardId(shardId) 50 | .withSequenceNumberRange( 51 | new SequenceNumberRange().withStartingSequenceNumber(seqNum) 52 | ) 53 | } 54 | 55 | } 56 | --------------------------------------------------------------------------------
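To make the behaviour exercised by ShardSyncerSuite easier to see outside of ScalaTest, here is a small hedged sketch. The object name ShardSyncerSketch, the helper method, and the shard ids are invented; like the suite, it assumes ShardSyncer, ShardInfo, InitialKinesisPosition and the position classes are reachable from the org.apache.spark.sql.kinesis package, and it only calls the APIs the suite itself uses.

package org.apache.spark.sql.kinesis

import com.amazonaws.services.kinesis.model.{SequenceNumberRange, Shard}

// Hypothetical sketch: the same "a previously tracked shard disappeared"
// scenario as the suite above, showing both failOnDataLoss behaviours.
object ShardSyncerSketch {
  private def shard(id: String, startSeq: String): Shard =
    new Shard()
      .withShardId(id)
      .withSequenceNumberRange(
        new SequenceNumberRange().withStartingSequenceNumber(startSeq))

  def main(args: Array[String]): Unit = {
    // Kinesis now reports only a brand-new shard; the shard we had
    // checkpointed (shard-000) is no longer listed.
    val latestShards = Seq(shard("shard-001", "100"))
    val prevShardInfo = Seq(new ShardInfo("shard-000", new AfterSequenceNumber("42")))
    val initialPosition = InitialKinesisPosition.fromPredefPosition(new TrimHorizon)

    // With failOnDataLoss = true the syncer refuses to continue, as the first
    // test above asserts.
    try {
      ShardSyncer.getLatestShardInfo(latestShards, prevShardInfo, initialPosition, true)
    } catch {
      case e: IllegalStateException => println(s"failOnDataLoss=true rejected: $e")
    }

    // With failOnDataLoss = false the new shard is returned and will be read
    // from the supplied initial position (TRIM_HORIZON here), as the second
    // test above asserts.
    val latest = ShardSyncer.getLatestShardInfo(latestShards, prevShardInfo, initialPosition, false)
    latest.foreach(s => println(s"${s.shardId} starts at ${s.iteratorType}"))
  }
}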