├── .circleci └── config.yml ├── .github └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── produce-records.rb ├── setup.cfg ├── setup.py ├── target_config.json ├── target_stitch └── __init__.py └── tests ├── __init__.py ├── activate_version_tests.py ├── doesnt_validate.json ├── empty_key_properties.json ├── gate_mocks.py ├── integration_tests.py ├── record_missing_key_property.json ├── test_target_stitch.py └── versioned_stream.json /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | workflows: 4 | build: 5 | jobs: 6 | - build: 7 | context: 8 | - circleci-user 9 | 10 | jobs: 11 | build: 12 | docker: 13 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/sources-python:0.7.0 14 | steps: 15 | - checkout 16 | - run: 17 | name: 'Setup virtualenv' 18 | command: | 19 | pyenv global 3.9.6 20 | mkdir -p ~/.virtualenvs 21 | python3 -m venv ~/.virtualenvs/target-stitch 22 | source ~/.virtualenvs/target-stitch/bin/activate 23 | pip install -U pip setuptools 24 | pip install -e .[dev] 25 | pip install -U pylint 26 | - run: 27 | name: 'Run tests' 28 | command: | 29 | # Need to re-activate the virtualenv 30 | source ~/.virtualenvs/target-stitch/bin/activate 31 | nosetests -v tests/activate_version_tests.py 32 | nosetests -v --ignore-files=activate_version_tests.py 33 | #nosetests 34 | pylint target_stitch "--extension-pkg-whitelist=ciso8601" --max-positional-arguments=8 -d 'global-variable-not-assigned, consider-using-generator, broad-exception-raised, unused-argument' 35 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description of change 2 | (write a short description here or paste a link to JIRA) 3 | 4 | # QA steps 5 | - [ ] automated tests passing 6 | - [ ] manual qa steps passing (list below) 7 | 8 | # Risks 9 | 10 | # Rollback steps 11 | - revert this branch 12 | 13 | #### AI generated code 14 | https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code 15 | - [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | data-*.txt
92 | *~
93 | \#*
94 | .\#*
95 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | ## 4.0.1
4 | * Bump aiohttp from 3.8.5 to 3.11.9
5 | * Bump requests from 2.31.0 to 2.32.3 [#112](https://github.com/singer-io/target-stitch/pull/112)
6 | 
7 | ## 4.0.0
8 | * Bump singer-python to version `6.0.0`, which adds support for python `3.10+` but is no longer compatible with python `3.5`
9 | * Bumps requests and aiohttp libraries to more secure versions [#108](https://github.com/singer-io/target-stitch/pull/108)
10 | 
11 | ## 3.2.2
12 | * Remove unused dependencies [#107](https://github.com/singer-io/target-stitch/pull/107)
13 | 
14 | ## 3.2.1
15 | * Updated dependencies to support Python 3.9.6, deprecated support for Python 3.5.X. [#104](https://github.com/singer-io/target-stitch/pull/104)
16 | 
17 | ## 3.2.0
18 | * Log how many records appear in a batch and note the number of bytes [#98](https://github.com/singer-io/target-stitch/pull/98)
19 | 
20 | ## 3.1.1
21 | * Fix a bug related to buffering records per stream that would cause state to not be emitted during certain edge conditions [#96](https://github.com/singer-io/target-stitch/pull/96)
22 | 
23 | ## 3.1.0
24 | * Buffer records per stream so that changing streams does not flush records [#94](https://github.com/singer-io/target-stitch/pull/94)
25 | 
26 | ## 3.0.3
27 | * Generates sequence numbers based on nanosecond time to avoid collisions with small, async batches [#90](https://github.com/singer-io/target-stitch/pull/90)
28 | 
29 | ## 3.0.1
30 | * Removes requirement for `connection_ns` property.
31 | 
32 | ## 3.0.0
33 | * Adds new configuration properties - `small_batch_url`, `big_batch_url` and `batch_size_preferences` - for internal Stitch use.
34 | 
35 | ## 2.0.7
36 | * Any exception in the flush_state callback will set SEND_EXCEPTION, resulting in the termination of the main thread and process.
37 | 
38 | ## 2.0.5
39 | * Emits final state after all records have been pushed to Stitch, before exit [#71](https://github.com/singer-io/target-stitch/pull/71)
40 | 
41 | ## 1.8.1
42 | * Updates `requests` to version `2.20.0` in response to CVE-2018-18074
43 | 
44 | ## 1.7.6
45 | * Flush buffer if enough time has passed when state message is received [#57](https://github.com/singer-io/target-stitch/pull/57)
46 | 
47 | ## 1.7.5
48 | * Throw an error in the ValidationHandler if schema validation fails.
49 | 50 | ## 1.7.4 51 | * Generate unique sequence numbers based on the current time millis with an appended zero-padded message number 52 | 53 | ## 1.7.3 54 | * Update to singer-python==5.0.15 to use the change to `RecordMessage.asdict` for serialization of `time_extracted` 55 | 56 | ## 1.7.2 57 | * Updates serialize to format `time_extracted` in a cross platform way, using `singer.utils.strftime` 58 | 59 | ## 1.7.1 60 | * Allows the push to the Stitch API to bypass SSL verification if an env variable is set [#45](https://github.com/singer-io/target-stitch/pull/45) 61 | * Updates error message to clarify when a message is too large for the Stitch API [#47](https://github.com/singer-io/target-stitch/pull/47) 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 
49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. 
A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 
174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 
234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. 
A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 
348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 
464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. 
You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. 
If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # target-stitch 2 | 3 | Reads [Singer](https://singer.io) formatted data from stdin and persists it to the Stitch Import API. 4 | 5 | ## Install 6 | 7 | Requires Python 3.5.6 8 | 9 | ```bash 10 | › pip install target-stitch 11 | ``` 12 | 13 | ## Use 14 | 15 | target-stitch takes two types of input: 16 | 17 | 1. A config file containing your Stitch client id and access token 18 | 2. 
A stream of Singer-formatted data on stdin 19 | 20 | Create config file to contain your Stitch client id and token: 21 | 22 | ```json 23 | { 24 | "client_id" : 1234, 25 | "token" : "asdkjqbawsdciobasdpkjnqweobdclakjsdbcakbdsac", 26 | "small_batch_url": "https://api.stitchdata.com/v2/import/batch", 27 | "big_batch_url": "https://api.stitchdata.com/v2/import/batch", 28 | "batch_size_preferences": {} 29 | } 30 | ``` 31 | ```bash 32 | › tap-some-api | target-stitch --config config.json 33 | ``` 34 | 35 | where `tap-some-api` is [Singer Tap](https://singer.io). 36 | 37 | --- 38 | 39 | Copyright © 2017 Stitch 40 | -------------------------------------------------------------------------------- /produce-records.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | 3 | $TABLE_NAME='postgres_full_table_replication_test' 4 | records = File.open('records.json', 'w') 5 | 6 | schema = {"stream" => $TABLE_NAME, 7 | "bookmark_properties" => [], 8 | "key_properties" => ["id"], 9 | "schema" => { "selected" => true, 10 | "type" => "object", 11 | "properties" => { 12 | "our_real" => {"selected" => true, "inclusion" => "available", "type" => ["null", "number"]}, 13 | "our_smallint" => {"selected" => true, "minimum" => -32768, "maximum" => 32767, "inclusion" => "available", "type" => ["null", "integer"]}, 14 | "OUR DATE" => {"selected" => true, "type" => ["null", "string"], "inclusion" => "available", "format" => "date-time"}, 15 | "id" => {"selected" => true, "minimum" => -2147483648, "maximum" => 2147483647, "inclusion" => "automatic", "type" => ["integer"]}, 16 | "our_bigint" => {"selected" => true, "minimum" => -9223372036854775808, "maximum" => 9223372036854775807, "inclusion" => "available", "type" => ["null", "integer"]}, 17 | "our_integer" => {"selected" => true, "minimum" => -2147483648, "maximum" => 2147483647, "inclusion" => "available", "type" => ["null", "integer"]}, 18 | "our_boolean" => {"selected" => true, "inclusion" => "available", "type" => ["null", "boolean"]}, 19 | "our_double" => {"selected" => true, "inclusion" => "available", "type" => ["null", "number"]}, 20 | "our_json" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]}, 21 | "our_store" => {"selected" => true, "type" => ["null", "object"], "inclusion" => "available", "properties" => {}}, 22 | "our_decimal" => {"exclusiveMinimum" => true, "minimum" => -10000000000, "exclusiveMaximum" => true, "inclusion" => "available", "selected" => true, 23 | "multipleOf" => 0.01, "maximum" => 10000000000, "type" => ["null", "number"]}, 24 | "our_text" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]}, 25 | 26 | 27 | "our_real2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "number"]}, 28 | "our_smallint2" => {"selected" => true, "minimum" => -32768, "maximum" => 32767, "inclusion" => "available", "type" => ["null", "integer"]}, 29 | "OUR DATE2" => {"selected" => true, "type" => ["null", "string"], "inclusion" => "available", "format" => "date-time"}, 30 | "our_bigint2" => {"selected" => true, "minimum" => -9223372036854775808, "maximum" => 9223372036854775807, "inclusion" => "available", "type" => ["null", "integer"]}, 31 | "our_integer2" => {"selected" => true, "minimum" => -2147483648, "maximum" => 2147483647, "inclusion" => "available", "type" => ["null", "integer"]}, 32 | "our_boolean2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "boolean"]}, 33 | "our_double2" => {"selected" => 
true, "inclusion" => "available", "type" => ["null", "number"]}, 34 | "our_json2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]}, 35 | "our_store2" => {"selected" => true, "type" => ["null", "object"], "inclusion" => "available", "properties" => {}}, 36 | "our_decimal2" => {"exclusiveMinimum" => true, "minimum" => -10000000000, "exclusiveMaximum" => true, "inclusion" => "available", "selected" => true, 37 | "multipleOf" => 0.01, "maximum" => 10000000000, "type" => ["null", "number"]}, 38 | "our_text2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]} 39 | }}, 40 | "type" => "SCHEMA"} 41 | 42 | records.puts( schema.to_json() ) 43 | 44 | 1000000.times do |i| 45 | records.puts( { "stream" => $TABLE_NAME, 46 | "record" => { 47 | "our_real" => 1.2, 48 | "our_smallint" => 100, 49 | "OUR DATE" => "1998-03-04T00:00:00+00:00", 50 | "id" => i, 51 | "our_bigint" => 1000000, 52 | "our_integer" => 44100, 53 | "our_boolean" => true, 54 | "our_double" => 1.1, 55 | "our_json" => "{\"secret\" => 55}", 56 | "our_store" => {"name" => "betty", "dances" => "floor"}, 57 | "our_decimal" => 0.01, 58 | "our_text" => "some text", 59 | 60 | "our_real2" => 1.2, 61 | "our_smallint2" => 100, 62 | "OUR DATE2" => "1998-03-04T00:00:00+00:00", 63 | "our_bigint2" => 1000000, 64 | "our_integer2" => 44100, 65 | "our_boolean2" => true, 66 | "our_double2" => 1.1, 67 | "our_json2" => "{\"secret\" => 55}", 68 | "our_store2" => {"name" => "betty", "dances" => "floor"}, 69 | "our_decimal2" => 0.01, 70 | "our_text2" => " I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I watched c-beams glitter in the dark near the Tannhauser Gate. All those moments will be lost in time, like tears in rain. Time to die." 
}, 71 | "time_extracted" => "2019-06-18T17:10:05.878611Z", 72 | "version" => 1560877805878, "type" => "RECORD"}.to_json() ) 73 | if i % 10 == 0 74 | records.puts({"value" => {"bookmarks" => {"dev-public-postgres_full_table_replication_test" => 75 | {"last_replication_method" => "FULL_TABLE", "version" => 1561124881384, "xmin" => i}}, 76 | "currently_syncing" => "dev-public-postgres_full_table_replication_test"}, "type" => "STATE"}.to_json()) 77 | end 78 | end 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='target-stitch', 6 | version='4.0.1', 7 | description='Singer.io target for the Stitch API', 8 | author='Stitch', 9 | url='https://singer.io', 10 | classifiers=['Programming Language :: Python :: 3 :: Only'], 11 | py_modules=['target_stitch'], 12 | install_requires=[ 13 | 'jsonschema==2.6.0', 14 | 'mock==2.0.0', 15 | 'requests==2.32.3', 16 | 'singer-python==6.0.0', 17 | 'psutil==5.6.6', 18 | 'simplejson==3.11.1', 19 | 'aiohttp==3.11.9', 20 | 'ciso8601', 21 | ], 22 | extras_require={ 23 | 'dev': [ 24 | 'nose==1.3.7', 25 | 'astroid==2.1.0', 26 | 'pylint==2.1.1' 27 | ] 28 | }, 29 | entry_points=''' 30 | [console_scripts] 31 | target-stitch=target_stitch:main 32 | ''', 33 | packages=['target_stitch'], 34 | ) 35 | -------------------------------------------------------------------------------- /target_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "client_id": 3, 3 | "token": "some-token", 4 | "small_batch_url": "https://api.stitchdata.com/v2/import/batch", 5 | "big_batch_url": "https://api.stitchdata.com/v2/import/batch", 6 | "batch_size_preferences": {} 7 | } 8 | -------------------------------------------------------------------------------- /target_stitch/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # pylint: disable=too-many-arguments,invalid-name,too-many-nested-blocks,line-too-long,missing-docstring,global-statement, broad-except 3 | 4 | ''' 5 | Target for Stitch API. 
6 | ''' 7 | 8 | import argparse 9 | import copy 10 | import gzip 11 | import http.client 12 | import io 13 | import json 14 | import os 15 | import re 16 | import sys 17 | import time 18 | import urllib 19 | import functools 20 | 21 | from threading import Thread 22 | from contextlib import contextmanager 23 | from collections import namedtuple 24 | from datetime import datetime, timezone 25 | from decimal import Decimal, getcontext 26 | import asyncio 27 | import concurrent 28 | from pprint import pformat 29 | import simplejson 30 | import psutil 31 | 32 | import aiohttp 33 | from aiohttp.client_exceptions import ClientConnectorError, ClientResponseError 34 | 35 | from jsonschema import ValidationError, Draft4Validator, FormatChecker 36 | import pkg_resources 37 | import backoff 38 | 39 | import singer 40 | from singer import metrics 41 | import ciso8601 42 | 43 | LOGGER = singer.get_logger().getChild('target_stitch') 44 | 45 | # We use this to store schema and key properties from SCHEMA messages 46 | StreamMeta = namedtuple('StreamMeta', ['schema', 'key_properties', 'bookmark_properties']) 47 | 48 | BIGBATCH_MAX_BATCH_BYTES = 20000000 49 | DEFAULT_MAX_BATCH_BYTES = 4000000 50 | DEFAULT_MAX_BATCH_RECORDS = 20000 51 | MILLISECOND_SEQUENCE_MULTIPLIER = 1000 52 | NANOSECOND_SEQUENCE_MULTIPLIER = 1000000 53 | 54 | # This is our singleton aiohttp session 55 | OUR_SESSION = None 56 | 57 | # This datastructure contains our pending aiohttp requests. 58 | # The main thread will read from it. 59 | # The event loop thread will write to it by appending new requests to it and removing completed requests. 60 | PENDING_REQUESTS = [] 61 | 62 | # This variable holds any exceptions we have encountered sending data to the gate. 63 | # The main thread will read from it and terminate the target if an exception is present. 
64 | # The event loop thread will write to it after each aiohttp request completes 65 | SEND_EXCEPTION = None 66 | 67 | CONFIG = {} 68 | 69 | def start_loop(loop): 70 | asyncio.set_event_loop(loop) 71 | global OUR_SESSION 72 | timeout = aiohttp.ClientTimeout(sock_connect=60, sock_read=60) 73 | OUR_SESSION = aiohttp.ClientSession(connector=aiohttp.TCPConnector(loop=loop), timeout=timeout) 74 | loop.run_forever() 75 | 76 | new_loop = asyncio.new_event_loop() 77 | # new_loop.set_debug(True) 78 | t = Thread(target=start_loop, args=(new_loop,)) 79 | 80 | #The event loop thread should not keep the process alive after the main thread terminates 81 | t.daemon = True 82 | 83 | t.start() 84 | 85 | 86 | class TargetStitchException(Exception): 87 | '''A known exception for which we don't need to print a stack trace''' 88 | 89 | class StitchClientResponseError(Exception): 90 | def __init__(self, status, response_body): 91 | self.response_body = response_body 92 | self.status = status 93 | super().__init__() 94 | 95 | class MemoryReporter(Thread): 96 | '''Logs memory usage every 30 seconds''' 97 | 98 | def __init__(self): 99 | self.process = psutil.Process() 100 | super().__init__(name='memory_reporter', daemon=True) 101 | 102 | def run(self): 103 | while True: 104 | LOGGER.debug('Virtual memory usage: %.2f%% of total: %s', 105 | self.process.memory_percent(), 106 | self.process.memory_info()) 107 | time.sleep(30.0) 108 | 109 | 110 | class Timings: 111 | '''Gathers timing information for the three main steps of the Tap.''' 112 | def __init__(self): 113 | self.last_time = time.time() 114 | self.timings = { 115 | 'serializing': 0.0, 116 | 'posting': 0.0, 117 | None: 0.0 118 | } 119 | 120 | @contextmanager 121 | def mode(self, mode): 122 | '''We wrap the big steps of the Tap in this context manager to accumulate 123 | timing info.''' 124 | 125 | start = time.time() 126 | yield 127 | end = time.time() 128 | self.timings[None] += start - self.last_time 129 | self.timings[mode] += end - start 130 | self.last_time = end 131 | 132 | 133 | def log_timings(self): 134 | '''We call this with every flush to print out the accumulated timings''' 135 | LOGGER.debug('Timings: unspecified: %.3f; serializing: %.3f; posting: %.3f;', 136 | self.timings[None], 137 | self.timings['serializing'], 138 | self.timings['posting']) 139 | 140 | TIMINGS = Timings() 141 | 142 | 143 | class BatchTooLargeException(TargetStitchException): 144 | '''Exception for when the records and schema are so large that we can't 145 | create a batch with even one record.''' 146 | 147 | def _log_backoff(details): 148 | (_, exc, _) = sys.exc_info() 149 | LOGGER.info( 150 | 'Error sending data to Stitch. 
Sleeping %d seconds before trying again: %s',
151 |         details['wait'], exc)
152 | 
153 | def parse_config(config_location):
154 |     global CONFIG
155 |     CONFIG = json.load(config_location)
156 |     if not CONFIG.get('token'):
157 |         raise Exception('Configuration is missing required "token" field')
158 | 
159 |     if not CONFIG.get('client_id'):
160 |         raise Exception('Configuration is missing required "client_id"')
161 | 
162 |     if not isinstance(CONFIG.get('batch_size_preferences'), dict):
163 |         raise Exception('Configuration requires a "batch_size_preferences" dictionary')
164 | 
165 |     if not CONFIG['batch_size_preferences'].get('full_table_streams'):
166 |         CONFIG['batch_size_preferences']['full_table_streams'] = []
167 |     LOGGER.info('Using batch_size_preferences of %s', CONFIG['batch_size_preferences'])
168 | 
169 |     if not CONFIG.get('turbo_boost_factor'):
170 |         CONFIG['turbo_boost_factor'] = 1
171 | 
172 |     if CONFIG['turbo_boost_factor'] != 5:
173 |         LOGGER.info('Using turbo_boost_factor of %s', CONFIG['turbo_boost_factor'])
174 | 
175 |     if not CONFIG.get('small_batch_url'):
176 |         raise Exception('Configuration is missing required "small_batch_url"')
177 | 
178 |     if not CONFIG.get('big_batch_url'):
179 |         raise Exception('Configuration is missing required "big_batch_url"')
180 | 
181 | def determine_stitch_url(stream_name):
182 |     batch_size_prefs = CONFIG.get('batch_size_preferences')
183 |     if stream_name in batch_size_prefs.get('full_table_streams'):
184 |         return CONFIG.get('big_batch_url')
185 | 
186 |     # e.g. platform.heap requires S3 because it is full-table data
187 |     if batch_size_prefs.get('batch_size_preference') == 'bigbatch':
188 |         return CONFIG.get('big_batch_url')
189 | 
190 |     if batch_size_prefs.get('batch_size_preference') == 'smallbatch':
191 |         return CONFIG.get('small_batch_url')
192 | 
193 |     #NB> not implemented yet
194 |     if batch_size_prefs.get('user_batch_size_preference') == 'bigbatch':
195 |         return CONFIG.get('big_batch_url')
196 | 
197 |     #NB> not implemented yet
198 |     if batch_size_prefs.get('user_batch_size_preference') == 'smallbatch':
199 |         return CONFIG.get('small_batch_url')
200 | 
201 |     return CONFIG.get('small_batch_url')
202 | 
203 | 
204 | 
205 | class StitchHandler: # pylint: disable=too-few-public-methods
206 |     '''Sends messages to Stitch.'''
207 | 
208 |     def __init__(self, max_batch_bytes, max_batch_records):
209 |         self.token = CONFIG.get('token')
210 |         self.max_batch_bytes = max_batch_bytes
211 |         self.max_batch_records = max_batch_records
212 | 
213 |     @staticmethod
214 |     #this happens in the event loop
215 |     def flush_states(state_writer, future):
216 | 
217 |         global PENDING_REQUESTS
218 |         global SEND_EXCEPTION
219 | 
220 |         completed_count = 0
221 | 
222 |         #NB> if/when the first coroutine errors out, we will record it for examination by the main thread.
223 |         #if/when this happens, no further flushing of state should ever occur. the main thread, in fact,
224 |         #should shut down quickly after it spots the exception
225 |         if SEND_EXCEPTION is None:
226 |             SEND_EXCEPTION = future.exception()
227 | 
228 |         if SEND_EXCEPTION is not None:
229 |             LOGGER.info('FLUSH early exit because of SEND_EXCEPTION: %s', pformat(SEND_EXCEPTION))
230 |             return
231 | 
232 |         try:
233 |             for f, s in PENDING_REQUESTS:
234 |                 if f.done():
235 |                     completed_count = completed_count + 1
236 |                     #NB> this is a very important line.
237 |                     #NEVER blindly emit state just because a coroutine has completed.
238 | #if this were None, we would have just nuked the client's state 239 | if s: 240 | line = simplejson.dumps(s) 241 | state_writer.write(f"{line}\n") 242 | state_writer.flush() 243 | else: 244 | break 245 | 246 | PENDING_REQUESTS = PENDING_REQUESTS[completed_count:] 247 | 248 | except BaseException as err: 249 | SEND_EXCEPTION = err 250 | 251 | 252 | def headers(self): 253 | '''Return the headers based on the token''' 254 | return { 255 | 'Authorization': f'Bearer {self.token}', 256 | 'Content-Type': 'application/json' 257 | } 258 | 259 | def send(self, data, contains_activate_version, state_writer, state, stitch_url): 260 | '''Send the given data to Stitch, retrying on exceptions''' 261 | global PENDING_REQUESTS 262 | global SEND_EXCEPTION 263 | 264 | check_send_exception() 265 | 266 | headers = self.headers() 267 | verify_ssl = True 268 | if os.environ.get("TARGET_STITCH_SSL_VERIFY") == 'false': 269 | verify_ssl = False 270 | 271 | LOGGER.info("Sending batch of %d bytes to %s", len(data), stitch_url) 272 | 273 | #NB> before we send any activate_versions we must ensure that all PENDING_REQUESTS complete. 274 | #this is to ensure ordering in the case of Full Table replication where the Activate Version 275 | #must arrive AFTER all of the relevant data. 276 | if len(PENDING_REQUESTS) > 0 and contains_activate_version: 277 | LOGGER.info('Sending batch with ActivateVersion. Flushing PENDING_REQUESTS first') 278 | finish_requests() 279 | 280 | if len(PENDING_REQUESTS) >= CONFIG.get('turbo_boost_factor'): 281 | 282 | #wait for the first future to finish before resuming the main thread 283 | finish_requests(CONFIG.get('turbo_boost_factor') - 1) 284 | 285 | #NB> this schedules the task on the event loop thread. 286 | # it will be executed at some point in the future 287 | future = asyncio.run_coroutine_threadsafe(post_coroutine(stitch_url, headers, data, verify_ssl), new_loop) 288 | next_pending_request = (future, state) 289 | PENDING_REQUESTS.append(next_pending_request) 290 | future.add_done_callback(functools.partial(self.flush_states, state_writer)) 291 | 292 | 293 | def handle_state_only(self, state_writer=None, state=None): 294 | async def fake_future_fn(): 295 | pass 296 | 297 | global PENDING_REQUESTS 298 | #NB> no point in sending out this state if a previous request has failed 299 | check_send_exception() 300 | future = asyncio.run_coroutine_threadsafe(fake_future_fn(), new_loop) 301 | next_pending_request = (future, state) 302 | PENDING_REQUESTS.append(next_pending_request) 303 | 304 | future.add_done_callback(functools.partial(self.flush_states, state_writer)) 305 | 306 | 307 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names=None, state_writer=None, state=None): 308 | '''Handle messages by sending them to Stitch. 309 | 310 | If the serialized form of the messages is too large to fit into a 311 | single request this will break them up into multiple smaller 312 | requests. 
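Any request body that still exceeds DEFAULT_MAX_BATCH_BYTES after splitting (for example, a single very large record) is routed to the big_batch_url instead of the small batch endpoint.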
313 | 314 | ''' 315 | 316 | stitch_url = determine_stitch_url(messages[0].stream) 317 | LOGGER.info("Serializing batch with %d messages for table %s", len(messages), messages[0].stream) 318 | with TIMINGS.mode('serializing'): 319 | bodies = serialize(messages, 320 | schema, 321 | key_names, 322 | bookmark_names, 323 | self.max_batch_bytes, 324 | self.max_batch_records) 325 | 326 | LOGGER.debug('Split batch into %d requests', len(bodies)) 327 | for i, body in enumerate(bodies): 328 | with TIMINGS.mode('posting'): 329 | LOGGER.debug('Request %d of %d is %d bytes', i + 1, len(bodies), len(body)) 330 | if len(body) > DEFAULT_MAX_BATCH_BYTES: 331 | stitch_url = CONFIG.get('big_batch_url') 332 | 333 | flushable_state = None 334 | if i + 1 == len(bodies): 335 | flushable_state = state 336 | 337 | self.send(body, contains_activate_version, state_writer, flushable_state, stitch_url) 338 | 339 | # Write a singer.metrics.Counter and set the value to the count of records being sent 340 | with metrics.Counter(metrics.Metric.record_count, tags={"endpoint": messages[0].stream, 341 | "num_bytes": sum([len(body) for body in bodies])}) as c: 342 | c.value = len([m for m in messages if isinstance(m, singer.RecordMessage)]) 343 | 344 | 345 | 346 | class LoggingHandler: # pylint: disable=too-few-public-methods 347 | '''Logs records to a local output file.''' 348 | def __init__(self, output_file, max_batch_bytes, max_batch_records): 349 | self.output_file = output_file 350 | self.max_batch_bytes = max_batch_bytes 351 | self.max_batch_records = max_batch_records 352 | 353 | def handle_state_only(self, state_writer=None, state=None): 354 | LOGGER.info("LoggingHandler handle_state_only: %s", state) 355 | if state: 356 | line = simplejson.dumps(state) 357 | state_writer.write(f"{line}\n") 358 | state_writer.flush() 359 | 360 | 361 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names=None, state_writer=None, state=None): #pylint: disable=unused-argument 362 | '''Handles a batch of messages by saving them to a local output file. 363 | 364 | Serializes records in the same way StitchHandler does, so the 365 | output file should contain the exact request bodies that we would 366 | send to Stitch. 
367 | 368 | ''' 369 | LOGGER.info("LoggingHandler handle_batch") 370 | LOGGER.info("Saving batch with %d messages for table %s to %s", 371 | len(messages), messages[0].stream, self.output_file.name) 372 | for i, body in enumerate(serialize(messages, 373 | schema, 374 | key_names, 375 | bookmark_names, 376 | self.max_batch_bytes, 377 | self.max_batch_records)): 378 | LOGGER.debug("Request body %d is %d bytes", i, len(body)) 379 | self.output_file.write(body) 380 | self.output_file.write('\n') 381 | 382 | if state: 383 | line = simplejson.dumps(state) 384 | state_writer.write(f"{line}\n") 385 | state_writer.flush() 386 | 387 | 388 | 389 | class ValidatingHandler: # pylint: disable=too-few-public-methods 390 | '''Validates input messages against their schema.''' 391 | 392 | def __init__(self): 393 | getcontext().prec = 76 394 | 395 | def handle_state_only(self, state_writer=None, state=None): 396 | LOGGER.info("ValidatingHandler handle_state_only: %s", state) 397 | if state: 398 | line = simplejson.dumps(state) 399 | state_writer.write(f"{line}\n") 400 | state_writer.flush() 401 | 402 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names=None, state_writer=None, state=None): 403 | '''Handles messages by validating them against schema.''' 404 | LOGGER.info("ValidatingHandler handle_batch") 405 | validator = Draft4Validator(schema, format_checker=FormatChecker()) 406 | for i, message in enumerate(messages): 407 | if isinstance(message, singer.RecordMessage): 408 | try: 409 | validator.validate(message.record) 410 | if key_names: 411 | for k in key_names: 412 | if k not in message.record: 413 | raise TargetStitchException( 414 | f'Message {i} is missing key property {k}' 415 | ) 416 | except Exception as e: 417 | raise TargetStitchException( 418 | f'Record does not pass schema validation: {e}') from e 419 | 420 | # pylint: disable=undefined-loop-variable 421 | # NB: This seems incorrect as there's a chance message is not defined 422 | LOGGER.info('%s (%s): Batch is valid', 423 | messages[0].stream, 424 | len(messages)) 425 | if state: 426 | line = simplejson.dumps(state) 427 | state_writer.write(f"{line}\n") 428 | state_writer.flush() 429 | 430 | def generate_sequence(message_num, max_records): 431 | ''' 432 | Generates a unique sequence number based on the current time in nanoseconds 433 | with a zero-padded message number based on the index of the record within the 434 | magnitude of max_records. 435 | 436 | COMPATIBILITY: 437 | Maintains a historical width of 19 characters (with default `max_records`), in order 438 | to not overflow downstream processes that depend on the width of this number. 439 | 440 | Because of this requirement, `message_num` is modulo the difference between nanos 441 | and millis to maintain 19 characters. 
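For illustration, assuming NANOSECOND_SEQUENCE_MULTIPLIER is 1e9 and MILLISECOND_SEQUENCE_MULTIPLIER is 1e3: the base is the current time in nanoseconds and `message_num` is taken modulo 1,000,000, zero-padded, and appended to that base.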
442 | ''' 443 | nanosecond_sequence_base = str(int(time.time() * NANOSECOND_SEQUENCE_MULTIPLIER)) 444 | modulo = NANOSECOND_SEQUENCE_MULTIPLIER / MILLISECOND_SEQUENCE_MULTIPLIER 445 | zfill_width_mod = len(str(NANOSECOND_SEQUENCE_MULTIPLIER)) - len(str(MILLISECOND_SEQUENCE_MULTIPLIER)) 446 | 447 | # add an extra order of magnitude to account for the fact that we can 448 | # actually accept more than the max record count 449 | fill = len(str(10 * max_records)) - zfill_width_mod 450 | sequence_suffix = str(int(message_num % modulo)).zfill(fill) 451 | 452 | return int(nanosecond_sequence_base + sequence_suffix) 453 | 454 | def serialize(messages, schema, key_names, bookmark_names, max_bytes, max_records): 455 | '''Produces request bodies for Stitch. 456 | 457 | Builds a request body consisting of all the messages. Serializes it as 458 | JSON. If the result exceeds the request size limit, splits the batch 459 | in half and recurs. 460 | 461 | ''' 462 | serialized_messages = [] 463 | for idx, message in enumerate(messages): 464 | if isinstance(message, singer.RecordMessage): 465 | record_message = { 466 | 'action': 'upsert', 467 | 'data': message.record, 468 | 'sequence': generate_sequence(idx, max_records) 469 | } 470 | 471 | if message.time_extracted: 472 | #"%04Y-%m-%dT%H:%M:%S.%fZ" 473 | record_message['time_extracted'] = singer.utils.strftime(message.time_extracted) 474 | 475 | serialized_messages.append(record_message) 476 | elif isinstance(message, singer.ActivateVersionMessage): 477 | serialized_messages.append({ 478 | 'action': 'activate_version', 479 | 'sequence': generate_sequence(idx, max_records) 480 | }) 481 | 482 | body = { 483 | 'table_name': messages[0].stream, 484 | 'schema': schema, 485 | 'key_names': key_names, 486 | 'messages': serialized_messages 487 | } 488 | if messages[0].version is not None: 489 | body['table_version'] = messages[0].version 490 | 491 | if bookmark_names: 492 | body['bookmark_names'] = bookmark_names 493 | 494 | 495 | # We are not using Decimals for parsing here. We recognize that 496 | # exposes data to potential rounding errors. However, the Stitch API 497 | # as it is implemented currently is also subject to rounding errors. 498 | # This will affect very few data points and we have chosen to leave 499 | # conversion as is for now. 500 | 501 | serialized = simplejson.dumps(body) 502 | LOGGER.debug('Serialized %d messages into %d bytes', len(messages), len(serialized)) 503 | 504 | if len(serialized) < max_bytes: 505 | return [serialized] 506 | 507 | if len(messages) <= 1: 508 | if len(serialized) < BIGBATCH_MAX_BATCH_BYTES: 509 | return [serialized] 510 | raise BatchTooLargeException( 511 | f"A single record is larger than the Stitch API limit of {BIGBATCH_MAX_BATCH_BYTES // 1000000} Mb" 512 | ) 513 | 514 | 515 | pivot = len(messages) // 2 516 | l_half = serialize(messages[:pivot], schema, key_names, bookmark_names, max_bytes, max_records) 517 | r_half = serialize(messages[pivot:], schema, key_names, bookmark_names, max_bytes, max_records) 518 | return l_half + r_half 519 | 520 | 521 | class TargetStitch: 522 | '''Encapsulates most of the logic of target-stitch. 523 | 524 | Useful for unit testing. 
525 | 526 | ''' 527 | 528 | # pylint: disable=too-many-instance-attributes 529 | def __init__(self, # pylint: disable=too-many-arguments 530 | handlers, 531 | state_writer, 532 | max_batch_bytes, 533 | max_batch_records, 534 | batch_delay_seconds): 535 | self.messages = {} 536 | self.contains_activate_version = {} 537 | self.buffer_size_bytes = {} 538 | self.state = None 539 | 540 | # Mapping from stream name to {'schema': ..., 'key_names': ..., 'bookmark_names': ... } 541 | self.stream_meta = {} 542 | 543 | # Instance of StitchHandler 544 | self.handlers = handlers 545 | 546 | # Writer that we write state records to 547 | self.state_writer = state_writer 548 | 549 | # Batch size limits. Stored as properties here so we can easily 550 | # change for testing. 551 | self.max_batch_bytes = max_batch_bytes 552 | self.max_batch_records = max_batch_records 553 | 554 | # Minimum frequency to send a batch, used with self.time_last_batch_sent 555 | self.batch_delay_seconds = batch_delay_seconds 556 | 557 | # Time that the last batch was sent 558 | self.time_last_batch_sent = time.time() 559 | 560 | 561 | 562 | def flush_stream(self, stream, is_final_stream): 563 | '''Send all the buffered messages to Stitch.''' 564 | 565 | messages = self.messages[stream] 566 | stream_meta = self.stream_meta[stream] 567 | 568 | # NB: We only want to include the state on the final stream we are 569 | # batching because this will prevent the state from flushing until 570 | # all of the streams are flushed because the state is global for 571 | # all streams so if one of the streams fails to batch we cannot 572 | # flush the state 573 | if is_final_stream: 574 | state = self.state 575 | else: 576 | state = None 577 | 578 | for handler in self.handlers: 579 | handler.handle_batch(messages, 580 | self.contains_activate_version.get(stream, False), 581 | stream_meta.schema, 582 | stream_meta.key_properties, 583 | stream_meta.bookmark_properties, 584 | self.state_writer, 585 | state) 586 | 587 | self.time_last_batch_sent = time.time() 588 | self.contains_activate_version[stream] = False 589 | self.buffer_size_bytes[stream] = 0 590 | self.messages[stream] = [] 591 | # NB: We can only clear the state if this is the final stream 592 | # flush. Otherwise we risk clearing out the state before we can 593 | # even send it. 594 | if is_final_stream: 595 | self.state = None 596 | 597 | 598 | def flush(self): 599 | # Have to keep track of how many streams we have looked at so we 600 | # know when we are flushing the final stream 601 | messages_to_flush = { stream: messages for stream, messages in self.messages.items() if len(messages) > 0 } 602 | num_flushed = 0 603 | num_streams = len(messages_to_flush) 604 | for stream, messages in messages_to_flush.items(): 605 | num_flushed += 1 606 | is_final_stream = num_flushed == num_streams 607 | self.flush_stream(stream, is_final_stream) 608 | # NB> State is usually handled above but in the case there are no messages 609 | # we still want to ensure state is emitted. 610 | if num_flushed == 0 and self.state: 611 | for handler in self.handlers: 612 | handler.handle_state_only(self.state_writer, self.state) 613 | self.state = None 614 | TIMINGS.log_timings() 615 | 616 | 617 | 618 | def handle_line(self, line): 619 | 620 | '''Takes a raw line from stdin and handles it, updating state and possibly 621 | flushing the batch to the Gate and the state to the output 622 | stream. 
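SCHEMA messages trigger a flush and register stream metadata; RECORD and ACTIVATE_VERSION messages are buffered per stream; STATE messages update the pending state; any other message type is ignored.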
623 | 624 | ''' 625 | 626 | message = overloaded_parse_message(line) 627 | 628 | # If we got a Schema, set the schema and key properties for this 629 | # stream. Flush the batch, if there is one, in case the schema is 630 | # different. 631 | if isinstance(message, singer.SchemaMessage): 632 | self.flush() 633 | 634 | if message.stream not in self.messages: 635 | self.messages[message.stream] = [] 636 | self.stream_meta[message.stream] = StreamMeta( 637 | message.schema, 638 | message.key_properties, 639 | message.bookmark_properties) 640 | 641 | elif isinstance(message, (singer.RecordMessage, singer.ActivateVersionMessage)): 642 | current_stream = message.stream 643 | # NB> This previously would flush on a stream change. Because 644 | # we are now buffering records across streams we do not need 645 | # to flush on stream change 646 | if self.messages[current_stream] and (message.version != self.messages[current_stream][0].version): 647 | self.flush() 648 | 649 | self.messages[current_stream].append(message) 650 | self.buffer_size_bytes[current_stream] = self.buffer_size_bytes.get(current_stream, 0) + len(line) 651 | if isinstance(message, singer.ActivateVersionMessage): 652 | self.contains_activate_version[current_stream] = True 653 | 654 | num_bytes = sum(self.buffer_size_bytes.values()) 655 | num_messages = sum((len(messages) for messages in self.messages.values())) 656 | num_seconds = time.time() - self.time_last_batch_sent 657 | 658 | enough_bytes = num_bytes >= self.max_batch_bytes 659 | enough_messages = num_messages >= self.max_batch_records 660 | enough_time = num_seconds >= self.batch_delay_seconds 661 | if enough_bytes or enough_messages or enough_time: 662 | LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds', 663 | num_bytes, num_messages, num_seconds) 664 | self.flush() 665 | 666 | elif isinstance(message, singer.StateMessage): 667 | self.state = message.value 668 | 669 | # only check time since state message does not increase num_messages or 670 | # num_bytes for the batch 671 | num_seconds = time.time() - self.time_last_batch_sent 672 | 673 | if num_seconds >= self.batch_delay_seconds: 674 | LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds', 675 | sum(self.buffer_size_bytes.values()), 676 | sum(len(messages) for messages in self.messages.values()), num_seconds) 677 | self.flush() 678 | self.time_last_batch_sent = time.time() 679 | 680 | 681 | 682 | def consume(self, reader): 683 | '''Consume all the lines from the queue, flushing when done.''' 684 | for line in reader: 685 | self.handle_line(line) 686 | self.flush() 687 | 688 | 689 | def collect(): 690 | '''Send usage info to Stitch.''' 691 | 692 | try: 693 | version = pkg_resources.get_distribution('target-stitch').version 694 | conn = http.client.HTTPSConnection('collector.stitchdata.com', timeout=10) 695 | conn.connect() 696 | params = { 697 | 'e': 'se', 698 | 'aid': 'singer', 699 | 'se_ca': 'target-stitch', 700 | 'se_ac': 'open', 701 | 'se_la': version, 702 | } 703 | conn.request('GET', '/i?' 
+ urllib.parse.urlencode(params)) 704 | conn.getresponse() 705 | conn.close() 706 | except: # pylint: disable=bare-except 707 | LOGGER.debug('Collection request failed') 708 | 709 | 710 | def main_impl(): 711 | '''We wrap this function in main() to add exception handling''' 712 | parser = argparse.ArgumentParser() 713 | 714 | parser.add_argument( 715 | '-c', '--config', 716 | help='Config file', 717 | type=argparse.FileType('r')) 718 | parser.add_argument( 719 | '-n', '--dry-run', 720 | help='Dry run - Do not push data to Stitch', 721 | action='store_true') 722 | parser.add_argument( 723 | '-o', '--output-file', 724 | help='Save requests to this output file', 725 | type=argparse.FileType('w')) 726 | parser.add_argument( 727 | '-v', '--verbose', 728 | help='Produce debug-level logging', 729 | action='store_true') 730 | parser.add_argument( 731 | '-q', '--quiet', 732 | help='Suppress info-level logging', 733 | action='store_true') 734 | parser.add_argument('--max-batch-records', type=int, default=DEFAULT_MAX_BATCH_RECORDS) 735 | parser.add_argument('--max-batch-bytes', type=int, default=DEFAULT_MAX_BATCH_BYTES) 736 | parser.add_argument('--batch-delay-seconds', type=float, default=300.0) 737 | args = parser.parse_args() 738 | 739 | if args.verbose: 740 | LOGGER.setLevel('DEBUG') 741 | elif args.quiet: 742 | LOGGER.setLevel('WARNING') 743 | 744 | handlers = [] 745 | if args.output_file: 746 | handlers.append(LoggingHandler(args.output_file, 747 | args.max_batch_bytes, 748 | args.max_batch_records)) 749 | if args.dry_run: 750 | handlers.append(ValidatingHandler()) 751 | elif not args.config: 752 | parser.error("config file required if not in dry run mode") 753 | else: 754 | parse_config(args.config) 755 | 756 | if not CONFIG.get('disable_collection'): 757 | LOGGER.info('Sending version information to stitchdata.com. ' + 758 | 'To disable sending anonymous usage data, set ' + 759 | 'the config parameter "disable_collection" to true') 760 | Thread(target=collect).start() 761 | handlers.append(StitchHandler(args.max_batch_bytes, 762 | args.max_batch_records)) 763 | 764 | # queue = Queue(args.max_batch_records) 765 | reader = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 766 | target_stitch = TargetStitch(handlers, 767 | sys.stdout, 768 | args.max_batch_bytes, 769 | args.max_batch_records, 770 | args.batch_delay_seconds) 771 | target_stitch.consume(reader) 772 | 773 | #NB> we need to wait for this to be empty indicating that all of the 774 | #requests have been finished and their states flushed 775 | finish_requests() 776 | LOGGER.info("Requests complete, stopping loop") 777 | new_loop.call_soon_threadsafe(new_loop.stop) 778 | 779 | 780 | def finish_requests(max_count=0): 781 | global PENDING_REQUESTS 782 | while True: 783 | # LOGGER.info("Finishing %s requests:", len(PENDING_REQUESTS)) 784 | check_send_exception() 785 | if len(PENDING_REQUESTS) <= max_count: #pylint: disable=len-as-condition 786 | break 787 | time.sleep(1 / 1000.0) 788 | 789 | 790 | 791 | def check_send_exception(): 792 | try: 793 | global SEND_EXCEPTION 794 | if SEND_EXCEPTION: 795 | raise SEND_EXCEPTION 796 | 797 | # An StitchClientResponseError means we received > 2xx response 798 | # Try to parse the "message" from the 799 | # json body of the response, since Stitch should include 800 | # the human-oriented message in that field. If there are 801 | # any errors parsing the message, just include the 802 | # stringified response. 
803 | except StitchClientResponseError as exc: 804 | try: 805 | msg = f"{str(exc.status)}: {exc.response_body}" 806 | except Exception: # pylint: disable=bare-except 807 | LOGGER.exception('Exception while processing error response') 808 | msg = f'{exc}' 809 | raise TargetStitchException('Error persisting data to Stitch: ' + 810 | msg) from exc 811 | 812 | # A ClientConnectorError means we 813 | # couldn't even connect to Stitch. The exception is likely 814 | # to be very long and gross. Log the full details but just 815 | # include the summary in the critical error message. 816 | except ClientConnectorError as exc: 817 | LOGGER.exception(exc) 818 | raise TargetStitchException('Error connecting to Stitch') from exc 819 | 820 | except concurrent.futures._base.TimeoutError as exc: #pylint: disable=protected-access 821 | raise TargetStitchException("Timeout sending to Stitch") from exc 822 | 823 | 824 | def exception_is_4xx(ex): 825 | return 400 <= ex.status < 500 826 | 827 | @backoff.on_exception(backoff.expo, 828 | StitchClientResponseError, 829 | max_tries=5, 830 | giveup=exception_is_4xx, 831 | on_backoff=_log_backoff) 832 | async def post_coroutine(url, headers, data, verify_ssl): 833 | # LOGGER.info("POST starting: %s ssl(%s)", url, verify_ssl) 834 | global OUR_SESSION 835 | async with OUR_SESSION.post(url, headers=headers, data=data, raise_for_status=False, verify_ssl=verify_ssl) as response: 836 | result_body = None 837 | try: 838 | result_body = await response.json() 839 | except BaseException as ex: #pylint: disable=unused-variable 840 | raise StitchClientResponseError(response.status, "unable to parse response body as json") from ex 841 | 842 | if response.status // 100 != 2: 843 | raise StitchClientResponseError(response.status, result_body) 844 | 845 | return result_body 846 | 847 | def _required_key(msg, k): 848 | if k not in msg: 849 | raise Exception(f"Message is missing required key '{k}': {msg}") 850 | 851 | return msg[k] 852 | 853 | def overloaded_parse_message(msg): 854 | """Parse a message string into a Message object.""" 855 | 856 | # We are not using Decimals for parsing here. 857 | # We recognize that this exposes data to potentially 858 | # lossy conversions. However, this will affect 859 | # very few data points and we have chosen to 860 | # leave conversion as is for now. 
861 | obj = simplejson.loads(msg, use_decimal=True) 862 | msg_type = _required_key(obj, 'type') 863 | 864 | if msg_type == 'RECORD': 865 | time_extracted = obj.get('time_extracted') 866 | if time_extracted: 867 | try: 868 | time_extracted = ciso8601.parse_datetime(time_extracted) 869 | except Exception: 870 | time_extracted = None 871 | return singer.RecordMessage(stream=_required_key(obj, 'stream'), 872 | record=_required_key(obj, 'record'), 873 | version=obj.get('version'), 874 | time_extracted=time_extracted) 875 | 876 | if msg_type == 'SCHEMA': 877 | return singer.SchemaMessage(stream=_required_key(obj, 'stream'), 878 | schema=_required_key(obj, 'schema'), 879 | key_properties=_required_key(obj, 'key_properties'), 880 | bookmark_properties=obj.get('bookmark_properties')) 881 | 882 | if msg_type == 'STATE': 883 | return singer.StateMessage(value=_required_key(obj, 'value')) 884 | 885 | if msg_type == 'ACTIVATE_VERSION': 886 | return singer.ActivateVersionMessage(stream=_required_key(obj, 'stream'), 887 | version=_required_key(obj, 'version')) 888 | return None 889 | 890 | 891 | def main(): 892 | '''Main entry point''' 893 | try: 894 | MemoryReporter().start() 895 | main_impl() 896 | 897 | # If we catch an exception at the top level we want to log a CRITICAL 898 | # line to indicate the reason why we're terminating. Sometimes the 899 | # extended stack traces can be confusing and this provides a clear way 900 | # to call out the root cause. If it's a known TargetStitchException we 901 | # can suppress the stack trace, otherwise we should include the stack 902 | # trace for debugging purposes, so re-raise the exception. 903 | except TargetStitchException as exc: 904 | for line in str(exc).splitlines(): 905 | LOGGER.critical(line) 906 | sys.exit(1) 907 | except Exception as exc: 908 | LOGGER.critical(exc) 909 | raise exc 910 | 911 | if __name__ == '__main__': 912 | main() 913 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singer-io/target-stitch/a2a084ccdff70aeb0c313066e5ae523f183b67f1/tests/__init__.py -------------------------------------------------------------------------------- /tests/activate_version_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import target_stitch 3 | from target_stitch import StitchHandler, TargetStitchException, finish_requests 4 | import io 5 | import json 6 | import simplejson 7 | import asyncio 8 | 9 | try: 10 | from tests.gate_mocks import mock_out_of_order_all_200 11 | except ImportError: 12 | from gate_mocks import mock_out_of_order_all_200 13 | 14 | 15 | class FakePost: 16 | def __init__(self, requests_sent, makeFakeResponse): 17 | self.requests_sent = requests_sent 18 | self.makeFakeResponse = makeFakeResponse 19 | 20 | async def __aenter__(self): 21 | return self.makeFakeResponse(self.requests_sent) 22 | 23 | async def __aexit__(self, exc_type, exc, tb): 24 | await asyncio.sleep(1) 25 | 26 | class FakeSession: 27 | def __init__(self, makeFakeResponse): 28 | self.requests_sent = 0 29 | self.bodies_sent = [] 30 | self.makeFakeResponse = makeFakeResponse 31 | 32 | def post(self, url, *, data, **kwargs): 33 | self.requests_sent = self.requests_sent + 1 34 | self.bodies_sent.append(data) 35 | return FakePost(self.requests_sent, self.makeFakeResponse) 36 | 37 | class ActivateVersion(unittest.TestCase): 38 | def 
fake_flush_states(self, state_writer, future): 39 | self.flushed_state_count = self.flushed_state_count + 1 40 | 41 | if self.flushed_state_count == 1: 42 | #2nd request has not begun because it contains an ActivateVersion and must wait for 1 to complete 43 | if len(target_stitch.PENDING_REQUESTS) != 1: 44 | self.first_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong pending request count for first flush" 45 | 46 | if future != target_stitch.PENDING_REQUESTS[0][0]: 47 | self.first_flush_error = "ActivateVersion request should not have been issues until 1st request completed: received wrong future for first flush" 48 | 49 | if target_stitch.PENDING_REQUESTS[0][1] != {'bookmarks': {'chicken_stream': {'id': 1}}}: 50 | self.first_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong state for first flush" 51 | 52 | elif self.flushed_state_count == 2: 53 | if len(target_stitch.PENDING_REQUESTS) != 1: 54 | self.second_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong pending request count for second flush" 55 | 56 | if future != target_stitch.PENDING_REQUESTS[0][0]: 57 | self.second_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong future for second flush" 58 | 59 | if target_stitch.PENDING_REQUESTS[0][1] is not None: 60 | self.second_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong state for second flush" 61 | 62 | else: 63 | raise Exception('flushed state should only have been called twice') 64 | 65 | self.og_flush_states(state_writer, future) 66 | 67 | 68 | def setUp(self): 69 | token = None 70 | self.first_flush_error = None 71 | self.second_flush_error = None 72 | 73 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 74 | 75 | self.out = io.StringIO() 76 | self.target_stitch = target_stitch.TargetStitch( 77 | [handler], self.out, 4000000, 2, 100000) 78 | self.queue = [simplejson.dumps({"type": "SCHEMA", "stream": "chicken_stream", 79 | "key_properties": ["my_float"], 80 | "schema": {"type": "object", 81 | "properties": {"my_float": {"type": "number"}}}})] 82 | target_stitch.SEND_EXCEPTION = None 83 | target_stitch.PENDING_REQUESTS = [] 84 | self.og_flush_states = StitchHandler.flush_states 85 | self.flushed_state_count = 0 86 | StitchHandler.flush_states = self.fake_flush_states 87 | 88 | target_stitch.CONFIG = { 89 | 'token': "some-token", 90 | 'client_id': "some-client", 91 | 'disable_collection': True, 92 | 'connection_ns': "some-ns", 93 | 'batch_size_preferences' : { 94 | 'full_table_streams' : [], 95 | 'batch_size_preference': None, 96 | 'user_batch_size_preference': None, 97 | }, 98 | 'turbo_boost_factor' : 10, 99 | 'small_batch_url' : "http://small-batch", 100 | 'big_batch_url' : "http://big-batch", 101 | } 102 | 103 | def test_activate_version_finishes_pending_requests(self): 104 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_all_200) 105 | #request 2 would ordinarily complete first because the mock_out_of_order_all_200, but because 106 | #request 2 contains an ACTIVATE_VERSION, it will not even be sent until request 1 completes 107 | 108 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "version":1, "record": {"id": 1, "name": "Mike"}})) 109 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 110 | #will flush here after 
2 records 111 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", 'version':1, "record": {"id": 2, "name": "Paul"}})) 112 | self.queue.append(json.dumps({"type":"ACTIVATE_VERSION", 'stream': 'chicken_stream', 'version': 1 })) 113 | #will flush here after 2 records 114 | 115 | 116 | self.target_stitch.consume(self.queue) 117 | finish_requests() 118 | self.assertEqual(self.first_flush_error, None, self.first_flush_error) 119 | self.assertEqual(self.second_flush_error, None, self.second_flush_error) 120 | 121 | 122 | if __name__== "__main__": 123 | test1 = ActivateVersion() 124 | test1.setUp() 125 | test1.test_activate_version_finishes_pending_requests() 126 | #test1.test_unparseable_json_response() 127 | -------------------------------------------------------------------------------- /tests/doesnt_validate.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "users", "schema": { "type": "object", "properties": { "name": { "type": "string" } }}, "key_properties": ["name"] } 2 | {"type": "RECORD", "stream": "users", "record": {"name": 1}} 3 | -------------------------------------------------------------------------------- /tests/empty_key_properties.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "test_empty_key_properties", "key_properties": [], "schema": {"type": "object", "properties": {"name": {"type": "string"}}}} 2 | {"type": "RECORD", "stream": "test_empty_key_properties", "record": {"name": "Mike"}} 3 | -------------------------------------------------------------------------------- /tests/gate_mocks.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | def mock_unparsable_response_body_200(requests_sent): 4 | class FakeResponse: 5 | def __init__(self, requests_sent): 6 | self.requests_sent = requests_sent 7 | 8 | async def json(self): 9 | self.status = 200 10 | raise Exception("bad json response") 11 | 12 | return FakeResponse(requests_sent) 13 | 14 | def mock_in_order_all_200(requests_sent): 15 | class FakeResponse: 16 | def __init__(self, requests_sent): 17 | self.requests_sent = requests_sent 18 | 19 | async def json(self): 20 | self.status = 200 21 | await asyncio.sleep(0) 22 | return {"status" : "finished request {}".format(requests_sent)} 23 | 24 | return FakeResponse(requests_sent) 25 | 26 | def mock_out_of_order_all_200(requests_sent): 27 | class FakeResponse: 28 | def __init__(self, requests_sent): 29 | self.requests_sent = requests_sent 30 | 31 | async def json(self): 32 | self.status = 200 33 | if self.requests_sent == 1: 34 | await asyncio.sleep(3) 35 | return {"status" : "finished request {}".format(requests_sent)} 36 | 37 | return FakeResponse(requests_sent) 38 | 39 | def mock_in_order_first_errors(requests_sent): 40 | class FakeResponse: 41 | def __init__(self, requests_sent): 42 | self.requests_sent = requests_sent 43 | 44 | async def json(self): 45 | if (self.requests_sent == 1): 46 | self.status = 400 47 | return {"status" : "finished request {}".format(requests_sent)} 48 | 49 | self.status = 200 50 | return {"status" : "finished request {}".format(requests_sent)} 51 | 52 | return FakeResponse(requests_sent) 53 | 54 | def mock_in_order_second_errors(requests_sent): 55 | class FakeResponse: 56 | def __init__(self, requests_sent): 57 | self.requests_sent = requests_sent 58 | 59 | async def json(self): 60 | if (self.requests_sent == 2): 61 | 
self.status = 400 62 | return {"status" : "finished request {}".format(requests_sent)} 63 | 64 | self.status = 200 65 | return {"status" : "finished request {}".format(requests_sent)} 66 | 67 | return FakeResponse(requests_sent) 68 | 69 | def mock_out_of_order_first_errors(requests_sent): 70 | class FakeResponse: 71 | def __init__(self, requests_sent): 72 | self.requests_sent = requests_sent 73 | 74 | async def json(self): 75 | if (self.requests_sent == 1): 76 | self.status = 400 77 | await asyncio.sleep(3) 78 | return {"status" : "finished request {}".format(requests_sent)} 79 | 80 | self.status = 200 81 | return {"status" : "finished request {}".format(requests_sent)} 82 | 83 | return FakeResponse(requests_sent) 84 | 85 | def mock_out_of_order_second_errors(requests_sent): 86 | class FakeResponse: 87 | def __init__(self, requests_sent): 88 | self.requests_sent = requests_sent 89 | 90 | async def json(self): 91 | if (self.requests_sent == 1): 92 | self.status = 200 93 | await asyncio.sleep(3) 94 | return {"status" : "finished request {}".format(requests_sent)} 95 | 96 | self.status = 400 97 | return {"status" : "finished request {}".format(requests_sent)} 98 | 99 | return FakeResponse(requests_sent) 100 | 101 | def mock_out_of_order_both_error(requests_sent): 102 | class FakeResponse: 103 | def __init__(self, requests_sent): 104 | self.requests_sent = requests_sent 105 | 106 | async def json(self): 107 | self.status = 400 108 | if (self.requests_sent == 1): 109 | await asyncio.sleep(10) 110 | return {"status" : "finished request {}".format(requests_sent)} 111 | 112 | return {"status" : "finished request {}".format(requests_sent)} 113 | 114 | return FakeResponse(requests_sent) 115 | 116 | 117 | def mock_in_order_both_error(requests_sent): 118 | class FakeResponse: 119 | def __init__(self, requests_sent): 120 | self.requests_sent = requests_sent 121 | 122 | async def json(self): 123 | self.status = 400 124 | return {"status" : "finished request {}".format(requests_sent)} 125 | 126 | return FakeResponse(requests_sent) 127 | -------------------------------------------------------------------------------- /tests/integration_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import singer 3 | import target_stitch 4 | from target_stitch import StitchHandler, TargetStitchException, finish_requests 5 | import io 6 | import os 7 | import json 8 | import asyncio 9 | import simplejson 10 | import collections 11 | import time 12 | from decimal import Decimal 13 | try: 14 | from tests.gate_mocks import ( 15 | mock_in_order_all_200, 16 | mock_out_of_order_all_200, 17 | mock_in_order_first_errors, 18 | mock_in_order_second_errors, 19 | mock_out_of_order_first_errors, 20 | mock_out_of_order_second_errors, 21 | mock_out_of_order_both_error, 22 | mock_in_order_both_error, 23 | mock_unparsable_response_body_200, 24 | ) 25 | except ImportError: 26 | from gate_mocks import ( 27 | mock_in_order_all_200, 28 | mock_out_of_order_all_200, 29 | mock_in_order_first_errors, 30 | mock_in_order_second_errors, 31 | mock_out_of_order_first_errors, 32 | mock_out_of_order_second_errors, 33 | mock_out_of_order_both_error, 34 | mock_in_order_both_error, 35 | mock_unparsable_response_body_200, 36 | ) 37 | 38 | from nose.tools import nottest 39 | 40 | LOGGER = singer.get_logger().getChild('target_stitch') 41 | 42 | def fake_check_send_exception(): 43 | return None 44 | 45 | def load_sample_lines(filename): 46 | with open('tests/' + filename) as fp: 47 | return 
[line for line in fp] 48 | 49 | class FakePost: 50 | def __init__(self, requests_sent, makeFakeResponse): 51 | self.requests_sent = requests_sent 52 | self.makeFakeResponse = makeFakeResponse 53 | 54 | async def __aenter__(self): 55 | return self.makeFakeResponse(self.requests_sent) 56 | 57 | async def __aexit__(self, exc_type, exc, tb): 58 | await asyncio.sleep(1) 59 | 60 | class FakeSession: 61 | def __init__(self, makeFakeResponse): 62 | self.requests_sent = 0 63 | self.urls = [] 64 | self.messages_sent = [] 65 | self.bodies_sent = [] 66 | self.makeFakeResponse = makeFakeResponse 67 | 68 | def post(self, url, *, data, **kwargs): 69 | data_json = simplejson.loads(data) 70 | self.messages_sent.append(data_json["messages"]) 71 | self.requests_sent = self.requests_sent + 1 72 | self.bodies_sent.append(data) 73 | self.urls.append(url) 74 | return FakePost(self.requests_sent, self.makeFakeResponse) 75 | 76 | 77 | class AsyncSerializeFloats(unittest.TestCase): 78 | def setUp(self): 79 | token = None 80 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 81 | 82 | self.out = io.StringIO() 83 | self.target_stitch = target_stitch.TargetStitch( 84 | [handler], self.out, 4000000, 2, 100000) 85 | self.queue = [simplejson.dumps({"type": "SCHEMA", "stream": "chicken_stream", 86 | "key_properties": ["my_float"], 87 | "schema": {"type": "object", 88 | "properties": {"my_float": {"type": "number"}}}})] 89 | target_stitch.SEND_EXCEPTION = None 90 | target_stitch.PENDING_REQUESTS = [] 91 | 92 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 93 | target_stitch.SEND_EXCEPTION, 94 | target_stitch.PENDING_REQUESTS) 95 | 96 | target_stitch.CONFIG = { 97 | 'token': "some-token", 98 | 'client_id': "some-client", 99 | 'disable_collection': True, 100 | 'connection_ns': "some-ns", 101 | 'batch_size_preferences' : { 102 | 'full_table_streams' : [], 103 | 'batch_size_preference': None, 104 | 'user_batch_size_preference': None, 105 | }, 106 | 'turbo_boost_factor' : 10, 107 | 'small_batch_url' : "http://small-batch", 108 | 'big_batch_url' : "http://big-batch", 109 | } 110 | 111 | 112 | def test_serialize_floats(self): 113 | floats = [ 114 | '-9999999999999999.9999999999999999999999', 115 | '-7187498962233394.3739812942138415666763', 116 | '9273972760690975.2044306442955715221042', 117 | '29515565286974.1188802122612813004366', 118 | '9176089101347578.2596296292040288441238', 119 | '-8416853039392703.306423225471199148379', 120 | '1285266411314091.3002668125515694162268', 121 | '6051872750342125.3812886238958681227336', 122 | '-1132031605459408.5571559429308939781468', 123 | '-6387836755056303.0038029604189860431045', 124 | '4526059300505414' 125 | ] 126 | 127 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 128 | for float_val in floats: 129 | self.queue.append(simplejson.dumps({"type": "RECORD", 130 | "stream": "chicken_stream", 131 | "record": {"my_float": Decimal(float_val)}})) 132 | 133 | 134 | self.queue.append(simplejson.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"my_float": Decimal(float_val) }}}})) 135 | 136 | self.target_stitch.consume(self.queue) 137 | finish_requests() 138 | 139 | output_record_floats = [] 140 | for batch in target_stitch.OUR_SESSION.bodies_sent: 141 | output_record_floats.extend([str(x['data']['my_float']) for x in simplejson.loads(batch, use_decimal=True)['messages']]) 142 | 143 | self.assertEqual(floats, output_record_floats) 144 | 145 | emitted_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), 
self.out.getvalue().strip().split('\n'))) 146 | self.assertEqual(len(emitted_state), 6) 147 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[0])}}}) 148 | self.assertEqual( emitted_state[1], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[2])}}}) 149 | self.assertEqual( emitted_state[2], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[4])}}}) 150 | self.assertEqual( emitted_state[3], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[6])}}}) 151 | self.assertEqual( emitted_state[4], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[8])}}}) 152 | self.assertEqual( emitted_state[5], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[10])}}}) 153 | 154 | 155 | class AsyncPushToGate(unittest.TestCase): 156 | def setUp(self): 157 | token = None 158 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 159 | 160 | self.og_check_send_exception = target_stitch.check_send_exception 161 | self.out = io.StringIO() 162 | self.target_stitch = target_stitch.TargetStitch( 163 | [handler], self.out, 4000000, 2, 100000) 164 | self.queue = [json.dumps({"type": "SCHEMA", "stream": "chicken_stream", 165 | "key_properties": ["id"], 166 | "schema": {"type": "object", 167 | "properties": {"id": {"type": "integer"}, 168 | "name": {"type": "string"}}}})] 169 | 170 | target_stitch.SEND_EXCEPTION = None 171 | for f,s in target_stitch.PENDING_REQUESTS: 172 | try: 173 | f.cancel() 174 | except: 175 | pass 176 | 177 | target_stitch.PENDING_REQUESTS = [] 178 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 179 | target_stitch.SEND_EXCEPTION, 180 | target_stitch.PENDING_REQUESTS) 181 | 182 | target_stitch.CONFIG ={ 183 | 'token': "some-token", 184 | 'client_id': "some-client", 185 | 'disable_collection': True, 186 | 'connection_ns': "some-ns", 187 | 'batch_size_preferences' : { 188 | 'full_table_streams' : [], 189 | 'batch_size_preference': None, 190 | 'user_batch_size_preference': None, 191 | }, 192 | 'turbo_boost_factor' : 10, 193 | 'small_batch_url' : "http://small-batch", 194 | 'big_batch_url' : "http://big-batch", 195 | } 196 | 197 | # 2 requests 198 | # both with state 199 | # in order responses 200 | def test_requests_in_order(self): 201 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 202 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 203 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 204 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 205 | #will flush here after 2 records 206 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 207 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 208 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 209 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 210 | #will flush here after 2 records 211 | 212 | self.target_stitch.consume(self.queue) 213 | finish_requests() 214 | 215 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 216 | self.assertEqual(len(emitted_state), 2) 217 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 1}}}) 218 | 
self.assertEqual( emitted_state[1], {'bookmarks': {'chicken_stream': {'id': 3}}}) 219 | 220 | def test_request_to_big_batch_for_large_record(self): 221 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 222 | self.target_stitch.max_batch_records = 4 223 | self.target_stitch.handlers[0].max_batch_records = 4 224 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "M" * 5000000}})) 225 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 226 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 227 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 228 | #will flush here after 4 records 229 | 230 | self.target_stitch.consume(self.queue) 231 | finish_requests() 232 | self.assertEqual(target_stitch.OUR_SESSION.urls, [target_stitch.CONFIG["big_batch_url"], 233 | target_stitch.CONFIG["small_batch_url"]]) 234 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent[0]), 1) 235 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent[1]), 3) 236 | 237 | # 2 requests 238 | # last SENT request has state 239 | # in order 240 | def test_requests_in_order_first_has_no_state(self): 241 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 242 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 243 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 244 | #will flush here after 2 records 245 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 246 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 247 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 248 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 249 | #will flush here after 2 records 250 | 251 | self.target_stitch.consume(self.queue) 252 | finish_requests() 253 | 254 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 255 | self.assertEqual(len(emitted_state), 1) 256 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 3}}}) 257 | 258 | 259 | # 2 requests. 260 | # both with state. 
261 | # in order 262 | # first sent request errors 263 | def test_requests_in_order_first_errors(self): 264 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_first_errors) 265 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 266 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 267 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 268 | #will flush here after 2 records 269 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 270 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 271 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 272 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 273 | #will flush here after 2 records 274 | 275 | #consume() can encounter an exception via check_send_exception in send() 276 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 277 | target_stitch.check_send_exception = fake_check_send_exception 278 | self.target_stitch.consume(self.queue) 279 | target_stitch.check_send_exception = self.og_check_send_exception 280 | our_exception = None 281 | try: 282 | finish_requests() 283 | except Exception as ex: 284 | our_exception = ex 285 | 286 | self.assertIsNotNone(our_exception) 287 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 288 | 289 | #no state is emitted 290 | emitted_state = self.assertEqual(self.out.getvalue(), '') 291 | 292 | # 2 requests. 293 | # both with state. 294 | # in order 295 | # second SENT request errors 296 | def test_requests_in_order_second_errors(self): 297 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_second_errors) 298 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 299 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 300 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 301 | #will flush here after 2 records 302 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 303 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 304 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 305 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 306 | #will flush here after 2 records 307 | 308 | #consume() can encounter an exception via check_send_exception in send() 309 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 
310 | target_stitch.check_send_exception = fake_check_send_exception 311 | self.target_stitch.consume(self.queue) 312 | target_stitch.check_send_exception = self.og_check_send_exception 313 | 314 | our_exception = None 315 | try: 316 | finish_requests() 317 | except Exception as ex: 318 | our_exception = ex 319 | 320 | self.assertIsNotNone(our_exception) 321 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 322 | 323 | emitted_state = self.out.getvalue().strip().split('\n') 324 | self.assertEqual(1, len(emitted_state)) 325 | self.assertEqual({'bookmarks': {'chicken_stream': {'id': 1}}}, json.loads(emitted_state[0])) 326 | 327 | # 2 requests. 328 | # both with state. 329 | # in order 330 | # both requests errors 331 | def test_requests_in_order_both_errors(self): 332 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_both_error) 333 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 334 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 335 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 336 | #will flush here after 2 records 337 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 338 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 339 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 340 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 341 | #will flush here after 2 records 342 | 343 | #consume() can encounter an exception via check_send_exception in send() 344 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 345 | target_stitch.check_send_exception = fake_check_send_exception 346 | self.target_stitch.consume(self.queue) 347 | target_stitch.check_send_exception = self.og_check_send_exception 348 | our_exception = None 349 | try: 350 | finish_requests() 351 | except Exception as ex: 352 | our_exception = ex 353 | 354 | self.assertIsNotNone(our_exception) 355 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 356 | 357 | #no state is emitted 358 | self.assertEqual(self.out.getvalue(), '') 359 | 360 | 361 | 362 | 363 | # 2 requests 364 | # both with state. 
365 | # out of order responses 366 | def test_requests_out_of_order(self): 367 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_all_200) 368 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 369 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 370 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 371 | #will flush here after 2 records 372 | 373 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 374 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 375 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 376 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 377 | #will flush here after 2 records 378 | 379 | self.target_stitch.consume(self.queue) 380 | finish_requests() 381 | 382 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 383 | self.assertEqual(len(emitted_state), 2) 384 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 1}}}) 385 | self.assertEqual( emitted_state[1], {'bookmarks': {'chicken_stream': {'id': 3}}}) 386 | 387 | # 2 requests. 388 | # both with state. 389 | # out of order 390 | # first SENT request errors 391 | def test_requests_out_of_order_first_errors(self): 392 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_first_errors) 393 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 394 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 395 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 396 | #will flush here after 2 records 397 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 398 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 399 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 400 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 401 | #will flush here after 2 records 402 | 403 | #consume() can encounter an exception via check_send_exception in send() 404 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 405 | target_stitch.check_send_exception = fake_check_send_exception 406 | self.target_stitch.consume(self.queue) 407 | target_stitch.check_send_exception = self.og_check_send_exception 408 | our_exception = None 409 | try: 410 | finish_requests() 411 | except Exception as ex: 412 | our_exception = ex 413 | 414 | self.assertIsNotNone(our_exception) 415 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 416 | 417 | #no state is emitted 418 | self.assertEqual(self.out.getvalue(), '') 419 | 420 | # 2 requests. 421 | # both with state. 
422 | # out of order 423 | # second SENT request errors 424 | def out_of_order_second_errors(self, requests_sent): 425 | class FakeResponse: 426 | def __init__(self, requests_sent): 427 | self.requests_sent = requests_sent 428 | 429 | async def json(self): 430 | if (self.requests_sent == 1): 431 | self.status = 200 432 | await asyncio.sleep(3) 433 | return {"status" : "finished request {}".format(requests_sent)} 434 | 435 | self.status = 400 436 | return {"status" : "finished request {}".format(requests_sent)} 437 | 438 | return FakeResponse(requests_sent) 439 | 440 | def test_requests_out_of_order_second_errors(self): 441 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_second_errors) 442 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 443 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 444 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 445 | #will flush here after 2 records 446 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 447 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 448 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 449 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 450 | #will flush here after 2 records 451 | 452 | #consume() can encounter an exception via check_send_exception in send() 453 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 454 | target_stitch.check_send_exception = fake_check_send_exception 455 | self.target_stitch.consume(self.queue) 456 | target_stitch.check_send_exception = self.og_check_send_exception 457 | our_exception = None 458 | try: 459 | finish_requests() 460 | except Exception as ex: 461 | our_exception = ex 462 | 463 | #the 2nd request returns immediately with a 400, triggering a TargetStitchException. 464 | #at this point, it is game over and it does NOT matter when or with what status the 1st request completes 465 | self.assertIsNotNone(our_exception) 466 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 467 | 468 | emitted_state = self.out.getvalue().strip().split('\n') 469 | self.assertEqual(1, len(emitted_state)) 470 | self.assertEqual('', emitted_state[0]) 471 | 472 | # 2 requests. 473 | # both with state. 
474 | # out of order 475 | # both requests errors 476 | def test_requests_out_of_order_both_errors(self): 477 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_both_error) 478 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 479 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 480 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 481 | #will flush here after 2 records 482 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 483 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 484 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 485 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 486 | #will flush here after 2 records 487 | 488 | #consume() can encounter an exception via check_send_exception in send() 489 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 490 | target_stitch.check_send_exception = fake_check_send_exception 491 | self.target_stitch.consume(self.queue) 492 | target_stitch.check_send_exception = self.og_check_send_exception 493 | our_exception = None 494 | try: 495 | finish_requests() 496 | except Exception as ex: 497 | our_exception = ex 498 | 499 | self.assertIsNotNone(our_exception) 500 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 501 | 502 | #no state is emitted 503 | self.assertEqual(self.out.getvalue(), '') 504 | 505 | def test_unparseable_json_response(self): 506 | target_stitch.OUR_SESSION = FakeSession(mock_unparsable_response_body_200) 507 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 508 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 509 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 510 | #will flush here after 2 records 511 | 512 | target_stitch.check_send_exception = fake_check_send_exception 513 | self.target_stitch.consume(self.queue) 514 | target_stitch.check_send_exception = self.og_check_send_exception 515 | try: 516 | finish_requests() 517 | except Exception as ex: 518 | our_exception = ex 519 | 520 | self.assertIsNotNone(our_exception) 521 | 522 | 523 | class StateOnly(unittest.TestCase): 524 | def setUp(self): 525 | token = None 526 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 527 | self.og_check_send_exception = target_stitch.check_send_exception 528 | self.out = io.StringIO() 529 | self.target_stitch = target_stitch.TargetStitch( 530 | [handler], self.out, 4000000, 1, 0) 531 | self.queue = [] 532 | target_stitch.SEND_EXCEPTION = None 533 | for f,s in target_stitch.PENDING_REQUESTS: 534 | try: 535 | f.cancel() 536 | except: 537 | pass 538 | 539 | target_stitch.PENDING_REQUESTS = [] 540 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 541 | target_stitch.SEND_EXCEPTION, 542 | target_stitch.PENDING_REQUESTS) 543 | target_stitch.CONFIG ={ 544 | 'token': "some-token", 545 | 'client_id': "some-client", 546 | 'disable_collection': True, 547 | 'connection_ns': "some-ns", 548 | 'batch_size_preferences' : { 549 | 'full_table_streams' : [], 550 
| 'batch_size_preference': None, 551 | 'user_batch_size_preference': None, 552 | }, 553 | 'turbo_boost_factor' : 10, 554 | 'small_batch_url' : "http://small-batch", 555 | 'big_batch_url' : "http://big-batch", 556 | } 557 | 558 | def test_state_only(self): 559 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 560 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 561 | #will flush here, because TargetStitch.time_last_batch_sent was set to 0 in setUp 562 | self.target_stitch.consume(self.queue) 563 | finish_requests() 564 | 565 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 566 | self.assertEqual(len(emitted_state), 1) 567 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 1}}}) 568 | 569 | 570 | class StateEdgeCases(unittest.TestCase): 571 | def setUp(self): 572 | token = None 573 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 574 | self.out = io.StringIO() 575 | self.target_stitch = target_stitch.TargetStitch( 576 | [handler], self.out, 4000000, 2, 100000) 577 | self.queue = [simplejson.dumps({"type": "SCHEMA", "stream": "chicken_stream", 578 | "key_properties": ["my_float"], 579 | "schema": {"type": "object", 580 | "properties": {"my_float": {"type": "number"}}}})] 581 | target_stitch.SEND_EXCEPTION = None 582 | target_stitch.PENDING_REQUESTS = [] 583 | 584 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 585 | target_stitch.SEND_EXCEPTION, 586 | target_stitch.PENDING_REQUESTS) 587 | 588 | target_stitch.CONFIG ={ 589 | 'token': "some-token", 590 | 'client_id': "some-client", 591 | 'disable_collection': True, 592 | 'connection_ns': "some-ns", 593 | 'batch_size_preferences' : { 594 | 'full_table_streams' : [], 595 | 'batch_size_preference': None, 596 | 'user_batch_size_preference': None, 597 | }, 598 | 'turbo_boost_factor' : 10, 599 | 'small_batch_url' : "http://small-batch", 600 | 'big_batch_url' : "http://big-batch", 601 | } 602 | 603 | 604 | def test_trailing_state_after_final_message(self): 605 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 606 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 607 | self.queue.append(json.dumps({"type":"STATE", 608 | "value":{"bookmarks":{"chicken_stream":{"id": 1 }}, 609 | 'currently_syncing' : 'chicken_stream'}})) 610 | 611 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 612 | #will flush here after 2 records 613 | self.queue.append(json.dumps({"type":"STATE", 614 | "value":{"bookmarks":{"chicken_stream":{"id": 2 }}, 615 | 'currently_syncing' : None}})) 616 | 617 | self.target_stitch.consume(self.queue) 618 | finish_requests() 619 | 620 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 621 | self.assertEqual(len(emitted_state), 2) 622 | self.assertEqual( emitted_state[0], 623 | {"bookmarks":{"chicken_stream":{"id": 1 }}, 624 | 'currently_syncing' : 'chicken_stream'}) 625 | self.assertEqual( emitted_state[1], 626 | {"bookmarks":{"chicken_stream":{"id": 2 }}, 627 | 'currently_syncing' : None}) 628 | 629 | def test_will_not_output_empty_state(self): 630 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 631 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 632 | self.queue.append(json.dumps({"type":"STATE", 633 | 
"value":{"bookmarks":{"chicken_stream":{"id": 1 }}, 634 | 'currently_syncing' : 'chicken_stream'}})) 635 | 636 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 637 | #will flush here after 2 records, state will reset to None 638 | 639 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Kyle"}})) 640 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Alice"}})) 641 | #will flush here after 2 records, but will NOT write blank state 642 | 643 | self.target_stitch.consume(self.queue) 644 | finish_requests() 645 | 646 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 647 | self.assertEqual(len(emitted_state), 1) 648 | self.assertEqual( emitted_state[0], 649 | {"bookmarks":{"chicken_stream":{"id": 1 }}, 650 | 'currently_syncing' : 'chicken_stream'}) 651 | 652 | class BufferingPerStreamConstraints(unittest.TestCase): 653 | def setUp(self): 654 | self.maxDiff = None 655 | token = None 656 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 3) 657 | 658 | self.og_check_send_exception = target_stitch.check_send_exception 659 | self.out = io.StringIO() 660 | self.target_stitch = target_stitch.TargetStitch( 661 | [handler], self.out, 500, 7, 100000) 662 | self.queue = [json.dumps({"type": "SCHEMA", "stream": "chicken_stream", 663 | "key_properties": ["id"], 664 | "schema": {"type": "object", 665 | "properties": {"id": {"type": "integer"}, 666 | "name": {"type": "string"}}}}), 667 | json.dumps({"type": "SCHEMA", "stream": "zebra_stream", 668 | "key_properties": ["id"], 669 | "schema": {"type": "object", 670 | "properties": {"id": {"type": "integer"}, 671 | "name": {"type": "string"}}}})] 672 | 673 | target_stitch.SEND_EXCEPTION = None 674 | for f,s in target_stitch.PENDING_REQUESTS: 675 | try: 676 | f.cancel() 677 | except: 678 | pass 679 | 680 | target_stitch.PENDING_REQUESTS = [] 681 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 682 | target_stitch.SEND_EXCEPTION, 683 | target_stitch.PENDING_REQUESTS) 684 | 685 | target_stitch.CONFIG ={ 686 | 'token': "some-token", 687 | 'client_id': "some-client", 688 | 'disable_collection': True, 689 | 'connection_ns': "some-ns", 690 | 'batch_size_preferences' : { 691 | 'full_table_streams' : [], 692 | 'batch_size_preference': None, 693 | 'user_batch_size_preference': None, 694 | }, 695 | 'turbo_boost_factor' : 10, 696 | 'small_batch_url' : "http://small-batch", 697 | 'big_batch_url' : "http://big-batch", 698 | } 699 | 700 | def test_flush_based_on_message_count(self): 701 | # Tests that the target will buffer records per stream. 
This will 702 | # allow the tap to alternate which streams it is emitting records 703 | # for without the target cutting small batches 704 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 705 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 706 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2, "name": "Paul"}})) 707 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 708 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 4, "name": "Cathy"}})) 709 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 5, "name": "Dan"}})) 710 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 6, "name": "A"}})) 711 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 7, "name": "B"}})) 712 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 7}, 713 | "zebra_stream": {"id": 6}}}})) 714 | # Should flush here 715 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 8, "name": "C"}})) 716 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 9, "name": "D"}})) 717 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 10, "name": "E"}})) 718 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 10}, 719 | "zebra_stream": {"id": 8}}}})) 720 | # Should flush here 721 | 722 | self.target_stitch.consume(self.queue) 723 | finish_requests() 724 | 725 | expected_messages = [ 726 | [{'action': 'upsert', 727 | 'data': {'id': 8, 'name': 'C'}}], 728 | [{'action': 'upsert', 729 | 'data': {'id': 9, 'name': 'D'}}, 730 | {'action': 'upsert', 731 | 'data': {'id': 10, 'name': 'E'}}], 732 | [{'action': 'upsert', 733 | 'data': {'id': 2, 'name': 'Paul'}}, 734 | {'action': 'upsert', 735 | 'data': {'id': 4, 'name': 'Cathy'}}, 736 | {'action': 'upsert', 737 | 'data': {'id': 6, 'name': 'A'}}], 738 | [{'action': 'upsert', 739 | 'data': {'id': 1, 'name': 'Mike'}}, 740 | {'action': 'upsert', 741 | 'data': {'id': 3, 'name': 'Harrsion'}}, 742 | {'action': 'upsert', 743 | 'data': {'id': 5, 'name': 'Dan'}}, 744 | {'action': 'upsert', 745 | 'data': {'id': 7, 'name': 'B'}},]] 746 | 747 | expected_state = [{"bookmarks": {"zebra_stream": {"id": 8}, "chicken_stream": {"id": 10}}}] 748 | 749 | # Should be broken into 4 batches 750 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 4) 751 | 752 | # Sort by length and remove sequence number to compare directly 753 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 754 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 755 | 756 | actual_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), self.out.getvalue().strip().split('\n'))) 757 | 758 | self.assertEqual(actual_messages, expected_messages) 759 | self.assertEqual(actual_state, expected_state) 760 | 761 | 762 | def test_flush_based_on_bytes(self): 763 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 764 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 765 | self.queue.append(json.dumps({"type": "RECORD", 
"stream": "zebra_stream", "record": {"id": 2, "name": "Paul"}})) 766 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 767 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 4, "name": "The byte limit should be across streams, so lets make lots of data on both streams"}})) 768 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 5, "name": "to force the target to exceed its byte limit"}})) 769 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 6, "name": "A"}})) 770 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 7, "name": "B"}})) 771 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 7}, 772 | "zebra_stream": {"id": 6}}}})) 773 | # Should flush here 774 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 8, "name": "C"}})) 775 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 9, "name": "D"}})) 776 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 10, "name": "E"}})) 777 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 10}, 778 | "zebra_stream": {"id": 8}}}})) 779 | # Should flush here 780 | 781 | self.target_stitch.consume(self.queue) 782 | finish_requests() 783 | 784 | expected_messages = [ 785 | [{'action': 'upsert', 'data': {'id': 1, 'name': 'Mike'}}, 786 | {'action': 'upsert', 'data': {'id': 3, 'name': 'Harrsion'}}, 787 | {'action': 'upsert', 788 | 'data': {'id': 5, 789 | 'name': 'to force the target to exceed its byte limit'}}], 790 | [{'action': 'upsert', 'data': {'id': 2, 'name': 'Paul'}}, 791 | {'action': 'upsert', 'data': {'id': 4, 'name': 'The byte limit should be across streams, so lets make lots of data on both streams'}}], 792 | [{'action': 'upsert', 'data': {'id': 6, 'name': 'A'}}, 793 | {'action': 'upsert', 'data': {'id': 8, 'name': 'C'}}], 794 | [{'action': 'upsert', 'data': {'id': 7, 'name': 'B'}}, 795 | {'action': 'upsert', 'data': {'id': 9, 'name': 'D'}}, 796 | {'action': 'upsert', 'data': {'id': 10, 'name': 'E'}}]] 797 | 798 | 799 | expected_state = [{"bookmarks": {"zebra_stream": {"id": 8}, "chicken_stream": {"id": 10}}}] 800 | 801 | # Should be broken into 4 batches 802 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 4) 803 | 804 | # Sort by length and remove sequence number to compare directly 805 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 806 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: ms[0]['data']['id'])] 807 | 808 | actual_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), self.out.getvalue().strip().split('\n'))) 809 | 810 | self.assertEqual(actual_messages, expected_messages) 811 | self.assertEqual(actual_state, expected_state) 812 | 813 | 814 | def test_state_works_when_streams_with_no_messages(self): 815 | # Test that target_stitch will emit state messages for a stream 816 | # even if the final stream in self.messages does not contain any 817 | # messages 818 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 819 | self.target_stitch.messages = collections.OrderedDict(self.target_stitch.messages) 820 | 821 | self.queue.append(json.dumps({ 822 | "type": "SCHEMA", 823 | "stream": 
"lion_stream", 824 | "key_properties": ["id"], 825 | "schema": {"type": "object", 826 | "properties": {"id": {"type": "integer"}, 827 | "name": {"type": "string"}}}})) 828 | 829 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 830 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2, "name": "Paul"}})) 831 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 832 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 4, "name": "Cathy"}})) 833 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 5, "name": "Dan"}})) 834 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 6, "name": "A"}})) 835 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 7, "name": "B"}})) 836 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 7}, 837 | "zebra_stream": {"id": 6}}}})) 838 | # Should flush here 839 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 8, "name": "C"}})) 840 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 9, "name": "D"}})) 841 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 10, "name": "E"}})) 842 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 10}, 843 | "zebra_stream": {"id": 8}}}})) 844 | # Should flush here 845 | 846 | self.target_stitch.consume(self.queue) 847 | finish_requests() 848 | 849 | expected_messages = [ 850 | [{'action': 'upsert', 851 | 'data': {'id': 8, 'name': 'C'}}], 852 | [{'action': 'upsert', 853 | 'data': {'id': 9, 'name': 'D'}}, 854 | {'action': 'upsert', 855 | 'data': {'id': 10, 'name': 'E'}}], 856 | [{'action': 'upsert', 857 | 'data': {'id': 2, 'name': 'Paul'}}, 858 | {'action': 'upsert', 859 | 'data': {'id': 4, 'name': 'Cathy'}}, 860 | {'action': 'upsert', 861 | 'data': {'id': 6, 'name': 'A'}}], 862 | [{'action': 'upsert', 863 | 'data': {'id': 1, 'name': 'Mike'}}, 864 | {'action': 'upsert', 865 | 'data': {'id': 3, 'name': 'Harrsion'}}, 866 | {'action': 'upsert', 867 | 'data': {'id': 5, 'name': 'Dan'}}, 868 | {'action': 'upsert', 869 | 'data': {'id': 7, 'name': 'B'}},]] 870 | 871 | expected_state = [{"bookmarks": {"zebra_stream": {"id": 8}, "chicken_stream": {"id": 10}}}] 872 | 873 | # Should be broken into 4 batches 874 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 4) 875 | 876 | # Sort by length and remove sequence number to compare directly 877 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 878 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 879 | 880 | actual_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), self.out.getvalue().strip().split('\n'))) 881 | 882 | self.assertEqual(actual_messages, expected_messages) 883 | self.assertEqual(actual_state, expected_state) 884 | 885 | 886 | class BufferingPerStreamNoStateOnFailure(unittest.TestCase): 887 | def setUp(self): 888 | time.sleep(20) 889 | self.maxDiff = None 890 | token = None 891 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 3) 892 | 893 | # Swap out the post_coroutine with a mocked one to fake failures 894 | 
self.actual_post_coroutine = target_stitch.post_coroutine 895 | target_stitch.post_coroutine = self.mock_post_coroutine 896 | 897 | self.messages_sent = 0 898 | 899 | self.og_check_send_exception = target_stitch.check_send_exception 900 | self.out = io.StringIO() 901 | self.target_stitch = target_stitch.TargetStitch( 902 | [handler], self.out, 4000000, 10, 100000) 903 | self.queue = [json.dumps({"type": "SCHEMA", "stream": "chicken_stream", 904 | "key_properties": ["id"], 905 | "schema": {"type": "object", 906 | "properties": {"id": {"type": "integer"}}}}), 907 | json.dumps({"type": "SCHEMA", "stream": "zebra_stream", 908 | "key_properties": ["id"], 909 | "schema": {"type": "object", 910 | "properties": {"id": {"type": "integer"}}}}), 911 | json.dumps({"type": "SCHEMA", "stream": "dog_stream", 912 | "key_properties": ["id"], 913 | "schema": {"type": "object", 914 | "properties": {"id": {"type": "integer"}}}})] 915 | 916 | target_stitch.SEND_EXCEPTION = None 917 | for f,s in target_stitch.PENDING_REQUESTS: 918 | try: 919 | f.cancel() 920 | except: 921 | pass 922 | 923 | target_stitch.PENDING_REQUESTS = [] 924 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 925 | target_stitch.SEND_EXCEPTION, 926 | target_stitch.PENDING_REQUESTS) 927 | 928 | target_stitch.CONFIG ={ 929 | 'token': "some-token", 930 | 'client_id': "some-client", 931 | 'disable_collection': True, 932 | 'connection_ns': "some-ns", 933 | 'batch_size_preferences' : { 934 | 'full_table_streams' : [], 935 | 'batch_size_preference': None, 936 | 'user_batch_size_preference': None, 937 | }, 938 | 'turbo_boost_factor' : 10, 939 | 'small_batch_url' : "http://small-batch", 940 | 'big_batch_url' : "http://big-batch", 941 | } 942 | 943 | 944 | def tearDown(self): 945 | target_stitch.post_coroutine = self.actual_post_coroutine 946 | 947 | async def mock_post_coroutine(self, url, headers, data, verify_ssl): 948 | LOGGER.info("Sending message number %s", self.messages_sent) 949 | self.messages_sent += 1 950 | if self.messages_sent == self.messages_until_error: 951 | return await self.wait_then_throw() 952 | else: 953 | return await self.actual_post_coroutine(url, headers, data, verify_ssl) 954 | 955 | @staticmethod 956 | async def wait_then_throw(): 957 | await asyncio.sleep(5) 958 | raise target_stitch.StitchClientResponseError(400, "Test exception") 959 | 960 | def test_state_interleaving_works(self): 961 | # Tests that the target will buffer records per stream. 
This will 962 | # allow the tap to alternate which streams it is emitting records 963 | # for without the target cutting small batches 964 | self.messages_until_error = 3 965 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 966 | 967 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1}})) 968 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}}}})) 969 | 970 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 1}})) 971 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}, 972 | "zebra_stream": {"id": 1}}}})) 973 | 974 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 1}})) 975 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}, 976 | "zebra_stream": {"id": 1}, 977 | "dog_stream": {"id": 1}}}})) 978 | 979 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2}})) 980 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 981 | "zebra_stream": {"id": 1}, 982 | "dog_stream": {"id": 1}}}})) 983 | 984 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2}})) 985 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 986 | "zebra_stream": {"id": 2}, 987 | "dog_stream": {"id": 1}}}})) 988 | 989 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 2}})) 990 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 991 | "zebra_stream": {"id": 2}, 992 | "dog_stream": {"id": 2}}}})) 993 | 994 | 995 | self.target_stitch.consume(self.queue) 996 | 997 | try: 998 | finish_requests() 999 | except: 1000 | pass 1001 | 1002 | # There should only be messages for the 2 streams because the 1003 | # third one should fail due to the mocking code 1004 | expected_messages = [[{'action': 'upsert', 'data': {'id': 1}}, 1005 | {'action': 'upsert', 'data': {'id': 2}}], 1006 | [{'action': 'upsert', 'data': {'id': 1}}, 1007 | {'action': 'upsert', 'data': {'id': 2}}]] 1008 | 1009 | expected_state = '' 1010 | 1011 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 2) 1012 | 1013 | # Sort by length and remove sequence number to compare directly 1014 | emitted_state = self.out.getvalue() 1015 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 1016 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 1017 | 1018 | self.assertEqual(actual_messages, expected_messages) 1019 | self.assertEqual(emitted_state, expected_state) 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | def test_state_interleaving_works_with_error_on_first(self): 1026 | '''Test that the target will not emit state if the first stream to be 1027 | batched fails ''' 1028 | 1029 | self.messages_until_error = 1 1030 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 1031 | 1032 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1}})) 1033 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}}}})) 1034 | 1035 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 1}})) 1036 | self.queue.append(json.dumps({"type": "STATE", 
"value": {"bookmarks": {"chicken_stream": {"id": 1}, 1037 | "zebra_stream": {"id": 1}}}})) 1038 | 1039 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 1}})) 1040 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}, 1041 | "zebra_stream": {"id": 1}, 1042 | "dog_stream": {"id": 1}}}})) 1043 | 1044 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2}})) 1045 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 1046 | "zebra_stream": {"id": 1}, 1047 | "dog_stream": {"id": 1}}}})) 1048 | 1049 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2}})) 1050 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 1051 | "zebra_stream": {"id": 2}, 1052 | "dog_stream": {"id": 1}}}})) 1053 | 1054 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 2}})) 1055 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 1056 | "zebra_stream": {"id": 2}, 1057 | "dog_stream": {"id": 2}}}})) 1058 | 1059 | 1060 | self.target_stitch.consume(self.queue) 1061 | 1062 | try: 1063 | finish_requests() 1064 | except: 1065 | pass 1066 | 1067 | # There should only be messages for the 2 streams because the 1068 | # third one should fail due to the mocking code 1069 | expected_messages = [[{'action': 'upsert', 'data': {'id': 1}}, 1070 | {'action': 'upsert', 'data': {'id': 2}}], 1071 | [{'action': 'upsert', 'data': {'id': 1}}, 1072 | {'action': 'upsert', 'data': {'id': 2}}]] 1073 | 1074 | expected_state = '' 1075 | 1076 | # Should be broken into 2 batches (because the third fails) 1077 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 2) 1078 | 1079 | # Sort by length and remove sequence number to compare directly 1080 | emitted_state = self.out.getvalue() 1081 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 1082 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 1083 | 1084 | self.assertEqual(actual_messages, expected_messages) 1085 | self.assertEqual(emitted_state, expected_state) 1086 | 1087 | 1088 | 1089 | if __name__== "__main__": 1090 | test1 = StateEdgeCases() 1091 | test1.setUp() 1092 | test1.test_will_not_output_empty_state() 1093 | # test1.test_requests_in_order() 1094 | -------------------------------------------------------------------------------- /tests/record_missing_key_property.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "test_record_missing_key_property", "key_properties": ["id"], "schema": {"type": "object", "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}} 2 | {"type": "RECORD", "stream": "test_record_missing_key_property", "record": {"name": "Mike"}} 3 | -------------------------------------------------------------------------------- /tests/test_target_stitch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import target_stitch 3 | import json 4 | import io 5 | import mock 6 | import sys 7 | import datetime 8 | import pytz 9 | import jsonschema 10 | import simplejson 11 | import decimal 12 | import re 13 | import time 14 | 15 | from decimal import Decimal 16 | from jsonschema import ValidationError, Draft4Validator, 
validators, FormatChecker 17 | from singer import ActivateVersionMessage, RecordMessage, utils, parse_message 18 | 19 | 20 | class DummyClient(object): 21 | 22 | def __init__(self): 23 | self.batches = [] 24 | 25 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names, state_writer, state): 26 | self.batches.append( 27 | {'messages': messages, 28 | 'schema': schema, 29 | 'key_names': key_names, 30 | 'bookmark_names': bookmark_names}) 31 | 32 | def message_queue(messages): 33 | return [json.dumps(m) for m in messages] 34 | 35 | def persist_all(recs): 36 | with DummyClient() as client: 37 | target_stitch.persist_lines(client, message_lines(recs)) 38 | return client.messages 39 | 40 | 41 | def state(i): 42 | return {"type": "STATE", "value": i} 43 | def record(i): 44 | return {"type": "RECORD", "stream": "foo", "record": {"i": i}} 45 | 46 | schema = {"type": "SCHEMA", 47 | "stream": "foo", 48 | "key_properties": ["i"], 49 | "schema": {"properties": {"i": {"type": "integer"}}} 50 | } 51 | 52 | def load_sample_lines(filename): 53 | with open('tests/' + filename) as fp: 54 | return [line for line in fp] 55 | 56 | 57 | class TestTargetStitch(unittest.TestCase): 58 | 59 | def setUp(self): 60 | self.client = DummyClient() 61 | self.out = io.StringIO() 62 | self.target_stitch = target_stitch.TargetStitch( 63 | [self.client], self.out, 4000000, 20000, 100000) 64 | 65 | def test_persist_lines_fails_without_key_properties(self): 66 | recs = [ 67 | {"type": "SCHEMA", 68 | "stream": "users", 69 | "schema": { 70 | "properties": { 71 | "id": {"type": "integer"}, 72 | "name": {"type": "string"}}}}] 73 | 74 | with self.assertRaises(Exception): 75 | target_stitch.consume(message_queue(recs)) 76 | 77 | def test_persist_lines_works_with_empty_key_properties(self): 78 | queue = load_sample_lines('empty_key_properties.json') 79 | self.target_stitch.consume(queue) 80 | self.assertEqual(len(self.client.batches), 1) 81 | self.assertEqual(self.client.batches[0]['key_names'], []) 82 | 83 | 84 | def test_persist_lines_sets_key_names(self): 85 | inputs = [ 86 | {"type": "SCHEMA", 87 | "stream": "users", 88 | "key_properties": ["id"], 89 | "schema": { 90 | "properties": { 91 | "id": {"type": "integer"}, 92 | "name": {"type": "string"}}}}, 93 | {"type": "RECORD", 94 | "stream": "users", 95 | "record": {"id": 1, "name": "mike"}}] 96 | 97 | self.target_stitch.consume(message_queue(inputs)) 98 | self.assertEqual(len(self.client.batches), 1) 99 | batch = self.client.batches[0] 100 | self.assertEqual( 101 | batch['schema'], 102 | { 103 | "properties": { 104 | "id": {"type": "integer"}, 105 | "name": {"type": "string"} 106 | } 107 | } 108 | ) 109 | 110 | self.assertEqual(batch['key_names'], ['id']) 111 | 112 | def test_persist_last_state_when_stream_ends_with_record(self): 113 | self.target_stitch.max_batch_records = 3 114 | inputs = [ 115 | schema, 116 | record(0), state(0), record(1), state(1), record(2), 117 | # flush state 1 118 | state(2), record(3), state(3), record(4), state(4), record(5), 119 | # flush state 4 120 | record(6), 121 | record(7), 122 | record(8), 123 | # flush empty states 124 | state(8), 125 | record(9), 126 | state(9), 127 | record(10)] 128 | 129 | self.target_stitch.consume(message_queue(inputs)) 130 | 131 | expected = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]] 132 | got = [[r.record['i'] for r in batch['messages']] for batch in self.client.batches] 133 | self.assertEqual(got, expected) 134 | 135 | def test_persist_last_state_when_stream_ends_with_state(self): 136 
| self.target_stitch.max_batch_records = 3 137 | inputs = [ 138 | schema, 139 | record(0), state(0), record(1), state(1), record(2), 140 | # flush state 1 141 | state(2), record(3), state(3), record(4), state(4), record(5), 142 | # flush state 4 143 | record(6), 144 | record(7), 145 | record(8), 146 | # flush empty states 147 | state(8), 148 | record(9), 149 | state(9), 150 | record(10), 151 | state(10)] 152 | 153 | self.target_stitch.consume(message_queue(inputs)) 154 | 155 | 156 | expected = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]] 157 | got = [[r.record['i'] for r in batch['messages']] for batch in self.client.batches] 158 | self.assertEqual(got, expected) 159 | 160 | def test_time_triggered_persist(self): 161 | self.target_stitch.batch_delay_seconds = -1 162 | self.target_stitch.max_batch_records = 10000 163 | inputs = [ 164 | schema, 165 | record(0), 166 | record(1), 167 | record(2)] 168 | self.target_stitch.consume(message_queue(inputs)) 169 | expected = [[0], [1], [2]] 170 | got = [[r.record['i'] for r in batch['messages']] for batch in self.client.batches] 171 | self.assertEqual(got, expected) 172 | 173 | def test_persist_lines_updates_schema(self): 174 | inputs = [ 175 | {"type": "SCHEMA", 176 | "stream": "users", 177 | "key_properties": ["id"], 178 | "schema": { 179 | "properties": { 180 | "id": {"type": "integer"}, 181 | "name": {"type": "string"}}}}, 182 | {"type": "RECORD", 183 | "stream": "users", 184 | "record": {"id": 1, "name": "mike"}}, 185 | {"type": "SCHEMA", 186 | "stream": "users", 187 | "key_properties": ["id"], 188 | "schema": { 189 | "properties": { 190 | "id": {"type": "string"}, 191 | "name": {"type": "string"}}}}, 192 | {"type": "RECORD", 193 | "stream": "users", 194 | "record": {"id": "1", "name": "mike"}}] 195 | 196 | self.target_stitch.consume(message_queue(inputs)) 197 | 198 | self.assertEqual(len(self.client.batches), 2) 199 | self.assertEqual(self.client.batches[0]['key_names'], ['id']) 200 | self.assertEqual(self.client.batches[0]['schema']['properties']['id']['type'], 'integer') 201 | self.assertEqual(self.client.batches[1]['schema']['properties']['id']['type'], 'string') 202 | 203 | def test_versioned_stream(self): 204 | queue = load_sample_lines('versioned_stream.json') 205 | self.target_stitch.consume(queue) 206 | 207 | batches = self.client.batches 208 | self.assertEqual(2, len(batches)) 209 | self.assertEqual(1, batches[0]['messages'][0].version) 210 | self.assertEqual(2, batches[1]['messages'][0].version) 211 | 212 | class TestSerialize(unittest.TestCase): 213 | 214 | def setUp(self): 215 | self.schema = { 216 | 'type': 'object', 217 | 'properties': { 218 | 'id': {'type': 'integer'}, 219 | 'color': {'type': 'string'} 220 | } 221 | } 222 | 223 | self.colors = ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet'] 224 | self.key_names = ['id'] 225 | self.bookmark_names = ['updated_at'] 226 | 227 | self.records = [{'id': i, 'color': color, 'updated_at': utils.strftime(utils.now())} 228 | for i, color in enumerate(self.colors)] 229 | self.messages = [RecordMessage(stream='colors', record=r) for r in self.records] 230 | self.messages.append(ActivateVersionMessage(stream='colors', version=1)) 231 | 232 | def serialize_with_limit(self, limit): 233 | return target_stitch.serialize(self.messages, self.schema, self.key_names, self.bookmark_names, limit, target_stitch.DEFAULT_MAX_BATCH_RECORDS) 234 | 235 | def unpack_colors(self, request_bodies): 236 | colors = [] 237 | for body in request_bodies: 238 | loaded = json.loads(body) 239 | for message 
in loaded['messages']: 240 | action = message['action'] 241 | if action == 'upsert': 242 | colors.append((action, message['data']['color'])) 243 | else: 244 | colors.append((action)) 245 | return colors 246 | 247 | def test_splits_batches(self): 248 | self.assertEqual(1, len(self.serialize_with_limit(2000))) 249 | self.assertEqual(2, len(self.serialize_with_limit(1000))) 250 | self.assertEqual(4, len(self.serialize_with_limit(500))) 251 | self.assertEqual(8, len(self.serialize_with_limit(385))) 252 | 253 | def test_raises_if_cant_stay_in_limit(self): 254 | data = 'a' * 21000000 255 | message = RecordMessage(stream='colors', record=data) 256 | with self.assertRaisesRegex(target_stitch.BatchTooLargeException, re.compile('the Stitch API limit of 20 Mb')): 257 | target_stitch.serialize([message], self.schema, self.key_names, self.bookmark_names, 4000000, target_stitch.DEFAULT_MAX_BATCH_RECORDS) 258 | 259 | def test_does_not_drop_records(self): 260 | expected = [ 261 | ('upsert', 'red'), 262 | ('upsert', 'orange'), 263 | ('upsert', 'yellow'), 264 | ('upsert', 'green'), 265 | ('upsert', 'blue'), 266 | ('upsert', 'indigo'), 267 | ('upsert', 'violet'), 268 | ('activate_version')] 269 | 270 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(2000))) 271 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(1000))) 272 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(500))) 273 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(385))) 274 | 275 | def test_serialize_time_extracted(self): 276 | """ Test that we're not corrupting timestamps with cross platform parsing. (Test case for OSX, specifically) """ 277 | expected = "1970-01-01T03:45:23.000000Z" 278 | test_time = datetime.datetime(1970, 1, 1, 3, 45, 23, tzinfo=pytz.utc) 279 | 280 | record = [RecordMessage("greetings",'{greeting: "hi"}', time_extracted=test_time)] 281 | schema = '{"type": "object", "properties": {"greeting": {"type": "string"}}}' 282 | batch = target_stitch.serialize(record, schema, [], [], 1000, target_stitch.DEFAULT_MAX_BATCH_RECORDS)[0] 283 | actual = json.loads(batch)["messages"][0]["time_extracted"] 284 | 285 | self.assertEqual(expected, actual) 286 | 287 | 288 | def create_raw_record(self, value): 289 | return '{"value": ' + value + '}' 290 | 291 | def create_raw_record_message(self,raw_record): 292 | return '{"type": "RECORD", "stream": "test", "record": ' + raw_record + '}' 293 | 294 | class TestDetermineStitchUrl(unittest.TestCase): 295 | def test_full_table_stream(self): 296 | big_batch_url = 'https://bigbatches.org' 297 | small_batch_url = 'https://smallbatch.mil' 298 | target_stitch.CONFIG = {'batch_size_preferences' : 299 | {'full_table_streams' : ['chickens'], 300 | 'batch_size_preference' : None, 301 | 'user_batch_size_preference' : None 302 | }, 303 | 'small_batch_url' : small_batch_url, 304 | 'big_batch_url' : big_batch_url} 305 | 306 | self.assertEqual(target_stitch.determine_stitch_url('chickens'), big_batch_url) 307 | 308 | def test_incremental_stream(self): 309 | big_batch_url = 'https://bigbatches.org' 310 | small_batch_url = 'https://smallbatch.mil' 311 | target_stitch.CONFIG = {'batch_size_preferences' : 312 | {'full_table_streams' : [], 313 | 'batch_size_preference' : None, 314 | 'user_batch_size_preference' : None 315 | }, 316 | 'small_batch_url' : small_batch_url, 317 | 'big_batch_url' : big_batch_url} 318 | 319 | self.assertEqual(target_stitch.determine_stitch_url('chickens'), small_batch_url) 320 | 321 | def 
test_big_batch_preference(self): 322 | big_batch_url = 'https://bigbatches.org' 323 | small_batch_url = 'https://smallbatch.mil' 324 | target_stitch.CONFIG = {'batch_size_preferences' : 325 | {'full_table_streams' : [], 326 | 'batch_size_preference' : 'bigbatch', 327 | 'user_batch_size_preference' : None 328 | }, 329 | 'small_batch_url' : small_batch_url, 330 | 'big_batch_url' : big_batch_url} 331 | 332 | self.assertEqual(target_stitch.determine_stitch_url('chickens'), big_batch_url) 333 | 334 | class TestSequenceNumbers(unittest.TestCase): 335 | def setUp(self): 336 | # NB: This is the historical width of the sequence number integer 337 | # - Generally, it's a combination of (timestamp + padded_row_index) for 19 digits 338 | # - This should be increased/decreased with care to prevent downstream issues 339 | self.STANDARD_SEQ_LENGTH = 19 340 | 341 | def test_generate_sequence_normal_batch(self): 342 | # Call with a sleep, to simulate the normal case (no ms collisions) 343 | seq1 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 344 | time.sleep(0.1) 345 | seq2 = target_stitch.generate_sequence(10,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 346 | time.sleep(0.1) 347 | seq3 = target_stitch.generate_sequence(999,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 348 | time.sleep(0.1) 349 | 350 | generated_seqs = [seq1,seq2,seq3] 351 | # Assert number's width for downstream 352 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 353 | # Assert they are all at least increasing 354 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 355 | # Assert no collisions 356 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 357 | 358 | def test_generate_sequence_single_record_batches(self): 359 | # Call without sleep and same message_num to create collisions reliably 360 | # This is the situation where multiple single record batches get cut in succession 361 | seq1 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 362 | seq2 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 363 | seq3 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 364 | 365 | generated_seqs = [seq1,seq2,seq3] 366 | 367 | # Assert number's width for downstream 368 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 369 | # Assert they are all at least increasing 370 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 371 | # Assert no collisions 372 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 373 | 374 | def test_generate_sequence_max_batch(self): 375 | # Call with an overshot max batch to ensure no duplication 376 | # - The target can consume more than max_batch before cutting a batch 377 | # - It should tolerate an order of magnitude greater records without repeat or extending the width 378 | max_batch = range(target_stitch.DEFAULT_MAX_BATCH_RECORDS * 10) 379 | 380 | generated_seqs = [target_stitch.generate_sequence(i,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 381 | for i in max_batch] 382 | 383 | # Assert number's width for downstream 384 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 385 | # Assert they are all at least increasing 386 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 387 | # Assert no collisions 388 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 389 | 390 | 391 | def test_generate_sequence_mixed_case(self): 392 | # Call with varying lengths of 
batches to ensure the widths mix 393 | regular_batch = [(i,target_stitch.DEFAULT_MAX_BATCH_RECORDS) for i in range(100)] 394 | single_record_batch = [(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS)] 395 | 396 | test_case = (single_record_batch + 397 | regular_batch + 398 | single_record_batch + 399 | single_record_batch + 400 | single_record_batch + 401 | regular_batch + 402 | single_record_batch) 403 | generated_seqs = [target_stitch.generate_sequence(*values) for values in test_case] 404 | 405 | # Assert number's width for downstream 406 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 407 | # Assert they are all at least increasing 408 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 409 | # Assert no collisions 410 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 411 | 412 | 413 | if __name__== "__main__": 414 | test1 = TestSerialize() 415 | test1.setUp() 416 | test1.test_raises_if_cant_stay_in_limit() 417 | -------------------------------------------------------------------------------- /tests/versioned_stream.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "users", "key_properties": ["id"], "schema": {"type": "object", "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}} 2 | {"type": "RECORD", "stream": "users", "version": 1, "record": {"id": 1, "name": "Sam"}} 3 | {"type": "RECORD", "stream": "users", "version": 1, "record": {"id": 2, "name": "Pat"}} 4 | {"type": "RECORD", "stream": "users", "version": 1, "record": {"id": 3, "name": "Alex"}} 5 | {"type": "ACTIVATE_VERSION", "stream": "users", "version": 1} 6 | {"type": "RECORD", "stream": "users", "version": 2, "record": {"id": 1, "name": "Samantha"}} 7 | {"type": "RECORD", "stream": "users", "version": 2, "record": {"id": 2, "name": "Patrick"}} 8 | {"type": "ACTIVATE_VERSION", "stream": "users", "version": 2} 9 | --------------------------------------------------------------------------------