├── .circleci └── config.yml ├── .github └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── produce-records.rb ├── setup.cfg ├── setup.py ├── target_config.json ├── target_stitch └── __init__.py └── tests ├── __init__.py ├── activate_version_tests.py ├── doesnt_validate.json ├── empty_key_properties.json ├── gate_mocks.py ├── integration_tests.py ├── record_missing_key_property.json ├── test_target_stitch.py └── versioned_stream.json /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | workflows: 4 | build: 5 | jobs: 6 | - build: 7 | context: 8 | - circleci-user 9 | 10 | jobs: 11 | build: 12 | docker: 13 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/sources-python:0.7.0 14 | steps: 15 | - checkout 16 | - run: 17 | name: 'Setup virtualenv' 18 | command: | 19 | pyenv global 3.9.6 20 | mkdir -p ~/.virtualenvs 21 | python3 -m venv ~/.virtualenvs/target-stitch 22 | source ~/.virtualenvs/target-stitch/bin/activate 23 | pip install -U pip setuptools 24 | pip install -e .[dev] 25 | pip install -U pylint 26 | - run: 27 | name: 'Run tests' 28 | command: | 29 | # Need to re-activate the virtualenv 30 | source ~/.virtualenvs/target-stitch/bin/activate 31 | nosetests -v tests/activate_version_tests.py 32 | nosetests -v --ignore-files=activate_version_tests.py 33 | #nosetests 34 | pylint target_stitch "--extension-pkg-whitelist=ciso8601" --max-positional-arguments=8 -d 'global-variable-not-assigned, consider-using-generator, broad-exception-raised, unused-argument' 35 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description of change 2 | (write a short description here or paste a link to JIRA) 3 | 4 | # QA steps 5 | - [ ] automated tests passing 6 | - [ ] manual qa steps passing (list below) 7 | 8 | # Risks 9 | 10 | # Rollback steps 11 | - revert this branch 12 | 13 | #### AI generated code 14 | https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code 15 | - [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | data-*.txt
92 | *~
93 | \#*
94 | .\#*
95 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | ## 4.0.1
4 | * Bump aiohttp from 3.8.5 to 3.11.9
5 | * Bump requests from 2.31.0 to 2.32.3 [#112](https://github.com/singer-io/target-stitch/pull/112)
6 | 
7 | ## 4.0.0
8 | * Bump singer-python to version `6.0.0`, which adds support for python `3.10+` but is no longer compatible with python `3.5`
9 | * Bumps requests and aiohttp libraries to more secure versions [#108](https://github.com/singer-io/target-stitch/pull/108)
10 | 
11 | ## 3.2.2
12 | * Remove unused dependencies [#107](https://github.com/singer-io/target-stitch/pull/107)
13 | 
14 | ## 3.2.1
15 | * Updated dependencies to support Python 3.9.6, deprecated support for Python 3.5.X. [#104](https://github.com/singer-io/target-stitch/pull/104)
16 | 
17 | ## 3.2.0
18 | * Log how many records appear in a batch and note the number of bytes [#98](https://github.com/singer-io/target-stitch/pull/98)
19 | 
20 | ## 3.1.1
21 | * Fix a bug related to buffering records per stream that would cause state to not be emitted during certain edge conditions [#96](https://github.com/singer-io/target-stitch/pull/96)
22 | 
23 | ## 3.1.0
24 | * Buffer records per stream so that changing streams does not flush records [#94](https://github.com/singer-io/target-stitch/pull/94)
25 | 
26 | ## 3.0.3
27 | * Generates sequence numbers based on nanosecond time to avoid collisions with small, async batches [#90](https://github.com/singer-io/target-stitch/pull/90)
28 | 
29 | ## 3.0.1
30 | * Removes requirement for `connection_ns` property.
31 | 
32 | ## 3.0.0
33 | * Adds new configuration properties - `small_batch_url`, `big_batch_url` and `batch_size_preferences` - for internal Stitch use.
34 | 
35 | ## 2.0.7
36 | * Any exception in the flush_state callback will set SEND_EXCEPTION, resulting in the termination of the main thread and process.
37 | 
38 | ## 2.0.5
39 | * Emits final state after all records have been pushed to Stitch, before exit [#71](https://github.com/singer-io/target-stitch/pull/71)
40 | 
41 | ## 1.8.1
42 | * Updates `requests` to version `2.20.0` in response to CVE-2018-18074
43 | 
44 | ## 1.7.6
45 | * Flush buffer if enough time has passed when state message is received [#57](https://github.com/singer-io/target-stitch/pull/57)
46 | 
47 | ## 1.7.5
48 | * Throw an error in the ValidationHandler if schema validation fails.
49 | 50 | ## 1.7.4 51 | * Generate unique sequence numbers based on the current time millis with an appended zero-padded message number 52 | 53 | ## 1.7.3 54 | * Update to singer-python==5.0.15 to use the change to `RecordMessage.asdict` for serialization of `time_extracted` 55 | 56 | ## 1.7.2 57 | * Updates serialize to format `time_extracted` in a cross platform way, using `singer.utils.strftime` 58 | 59 | ## 1.7.1 60 | * Allows the push to the Stitch API to bypass SSL verification if an env variable is set [#45](https://github.com/singer-io/target-stitch/pull/45) 61 | * Updates error message to clarify when a message is too large for the Stitch API [#47](https://github.com/singer-io/target-stitch/pull/47) 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 
49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. 
A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 
174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 
234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. 
A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 
348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 
464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. 
You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. 
If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # target-stitch 2 | 3 | Reads [Singer](https://singer.io) formatted data from stdin and persists it to the Stitch Import API. 4 | 5 | ## Install 6 | 7 | Requires Python 3.5.6 8 | 9 | ```bash 10 | › pip install target-stitch 11 | ``` 12 | 13 | ## Use 14 | 15 | target-stitch takes two types of input: 16 | 17 | 1. A config file containing your Stitch client id and access token 18 | 2. 
A stream of Singer-formatted data on stdin 19 | 20 | Create config file to contain your Stitch client id and token: 21 | 22 | ```json 23 | { 24 | "client_id" : 1234, 25 | "token" : "asdkjqbawsdciobasdpkjnqweobdclakjsdbcakbdsac", 26 | "small_batch_url": "https://api.stitchdata.com/v2/import/batch", 27 | "big_batch_url": "https://api.stitchdata.com/v2/import/batch", 28 | "batch_size_preferences": {} 29 | } 30 | ``` 31 | ```bash 32 | › tap-some-api | target-stitch --config config.json 33 | ``` 34 | 35 | where `tap-some-api` is [Singer Tap](https://singer.io). 36 | 37 | --- 38 | 39 | Copyright © 2017 Stitch 40 | -------------------------------------------------------------------------------- /produce-records.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | 3 | $TABLE_NAME='postgres_full_table_replication_test' 4 | records = File.open('records.json', 'w') 5 | 6 | schema = {"stream" => $TABLE_NAME, 7 | "bookmark_properties" => [], 8 | "key_properties" => ["id"], 9 | "schema" => { "selected" => true, 10 | "type" => "object", 11 | "properties" => { 12 | "our_real" => {"selected" => true, "inclusion" => "available", "type" => ["null", "number"]}, 13 | "our_smallint" => {"selected" => true, "minimum" => -32768, "maximum" => 32767, "inclusion" => "available", "type" => ["null", "integer"]}, 14 | "OUR DATE" => {"selected" => true, "type" => ["null", "string"], "inclusion" => "available", "format" => "date-time"}, 15 | "id" => {"selected" => true, "minimum" => -2147483648, "maximum" => 2147483647, "inclusion" => "automatic", "type" => ["integer"]}, 16 | "our_bigint" => {"selected" => true, "minimum" => -9223372036854775808, "maximum" => 9223372036854775807, "inclusion" => "available", "type" => ["null", "integer"]}, 17 | "our_integer" => {"selected" => true, "minimum" => -2147483648, "maximum" => 2147483647, "inclusion" => "available", "type" => ["null", "integer"]}, 18 | "our_boolean" => {"selected" => true, "inclusion" => "available", "type" => ["null", "boolean"]}, 19 | "our_double" => {"selected" => true, "inclusion" => "available", "type" => ["null", "number"]}, 20 | "our_json" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]}, 21 | "our_store" => {"selected" => true, "type" => ["null", "object"], "inclusion" => "available", "properties" => {}}, 22 | "our_decimal" => {"exclusiveMinimum" => true, "minimum" => -10000000000, "exclusiveMaximum" => true, "inclusion" => "available", "selected" => true, 23 | "multipleOf" => 0.01, "maximum" => 10000000000, "type" => ["null", "number"]}, 24 | "our_text" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]}, 25 | 26 | 27 | "our_real2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "number"]}, 28 | "our_smallint2" => {"selected" => true, "minimum" => -32768, "maximum" => 32767, "inclusion" => "available", "type" => ["null", "integer"]}, 29 | "OUR DATE2" => {"selected" => true, "type" => ["null", "string"], "inclusion" => "available", "format" => "date-time"}, 30 | "our_bigint2" => {"selected" => true, "minimum" => -9223372036854775808, "maximum" => 9223372036854775807, "inclusion" => "available", "type" => ["null", "integer"]}, 31 | "our_integer2" => {"selected" => true, "minimum" => -2147483648, "maximum" => 2147483647, "inclusion" => "available", "type" => ["null", "integer"]}, 32 | "our_boolean2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "boolean"]}, 33 | "our_double2" => {"selected" => 
true, "inclusion" => "available", "type" => ["null", "number"]}, 34 | "our_json2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]}, 35 | "our_store2" => {"selected" => true, "type" => ["null", "object"], "inclusion" => "available", "properties" => {}}, 36 | "our_decimal2" => {"exclusiveMinimum" => true, "minimum" => -10000000000, "exclusiveMaximum" => true, "inclusion" => "available", "selected" => true, 37 | "multipleOf" => 0.01, "maximum" => 10000000000, "type" => ["null", "number"]}, 38 | "our_text2" => {"selected" => true, "inclusion" => "available", "type" => ["null", "string"]} 39 | }}, 40 | "type" => "SCHEMA"} 41 | 42 | records.puts( schema.to_json() ) 43 | 44 | 1000000.times do |i| 45 | records.puts( { "stream" => $TABLE_NAME, 46 | "record" => { 47 | "our_real" => 1.2, 48 | "our_smallint" => 100, 49 | "OUR DATE" => "1998-03-04T00:00:00+00:00", 50 | "id" => i, 51 | "our_bigint" => 1000000, 52 | "our_integer" => 44100, 53 | "our_boolean" => true, 54 | "our_double" => 1.1, 55 | "our_json" => "{\"secret\" => 55}", 56 | "our_store" => {"name" => "betty", "dances" => "floor"}, 57 | "our_decimal" => 0.01, 58 | "our_text" => "some text", 59 | 60 | "our_real2" => 1.2, 61 | "our_smallint2" => 100, 62 | "OUR DATE2" => "1998-03-04T00:00:00+00:00", 63 | "our_bigint2" => 1000000, 64 | "our_integer2" => 44100, 65 | "our_boolean2" => true, 66 | "our_double2" => 1.1, 67 | "our_json2" => "{\"secret\" => 55}", 68 | "our_store2" => {"name" => "betty", "dances" => "floor"}, 69 | "our_decimal2" => 0.01, 70 | "our_text2" => " I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I watched c-beams glitter in the dark near the Tannhauser Gate. All those moments will be lost in time, like tears in rain. Time to die." 
}, 71 | "time_extracted" => "2019-06-18T17:10:05.878611Z", 72 | "version" => 1560877805878, "type" => "RECORD"}.to_json() ) 73 | if i % 10 == 0 74 | records.puts({"value" => {"bookmarks" => {"dev-public-postgres_full_table_replication_test" => 75 | {"last_replication_method" => "FULL_TABLE", "version" => 1561124881384, "xmin" => i}}, 76 | "currently_syncing" => "dev-public-postgres_full_table_replication_test"}, "type" => "STATE"}.to_json()) 77 | end 78 | end 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='target-stitch', 6 | version='4.0.1', 7 | description='Singer.io target for the Stitch API', 8 | author='Stitch', 9 | url='https://singer.io', 10 | classifiers=['Programming Language :: Python :: 3 :: Only'], 11 | py_modules=['target_stitch'], 12 | install_requires=[ 13 | 'jsonschema==2.6.0', 14 | 'mock==2.0.0', 15 | 'requests==2.32.3', 16 | 'singer-python==6.0.0', 17 | 'psutil==5.6.6', 18 | 'simplejson==3.11.1', 19 | 'aiohttp==3.11.9', 20 | 'ciso8601', 21 | ], 22 | extras_require={ 23 | 'dev': [ 24 | 'nose==1.3.7', 25 | 'astroid==2.1.0', 26 | 'pylint==2.1.1' 27 | ] 28 | }, 29 | entry_points=''' 30 | [console_scripts] 31 | target-stitch=target_stitch:main 32 | ''', 33 | packages=['target_stitch'], 34 | ) 35 | -------------------------------------------------------------------------------- /target_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "client_id": 3, 3 | "token": "some-token", 4 | "small_batch_url": "https://api.stitchdata.com/v2/import/batch", 5 | "big_batch_url": "https://api.stitchdata.com/v2/import/batch", 6 | "batch_size_preferences": {} 7 | } 8 | -------------------------------------------------------------------------------- /target_stitch/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # pylint: disable=too-many-arguments,invalid-name,too-many-nested-blocks,line-too-long,missing-docstring,global-statement, broad-except 3 | 4 | ''' 5 | Target for Stitch API. 
6 | ''' 7 | 8 | import argparse 9 | import copy 10 | import gzip 11 | import http.client 12 | import io 13 | import json 14 | import os 15 | import re 16 | import sys 17 | import time 18 | import urllib 19 | import functools 20 | 21 | from threading import Thread 22 | from contextlib import contextmanager 23 | from collections import namedtuple 24 | from datetime import datetime, timezone 25 | from decimal import Decimal, getcontext 26 | import asyncio 27 | import concurrent 28 | from pprint import pformat 29 | import simplejson 30 | import psutil 31 | 32 | import aiohttp 33 | from aiohttp.client_exceptions import ClientConnectorError, ClientResponseError 34 | 35 | from jsonschema import ValidationError, Draft4Validator, FormatChecker 36 | import pkg_resources 37 | import backoff 38 | 39 | import singer 40 | from singer import metrics 41 | import ciso8601 42 | 43 | LOGGER = singer.get_logger().getChild('target_stitch') 44 | 45 | # We use this to store schema and key properties from SCHEMA messages 46 | StreamMeta = namedtuple('StreamMeta', ['schema', 'key_properties', 'bookmark_properties']) 47 | 48 | BIGBATCH_MAX_BATCH_BYTES = 20000000 49 | DEFAULT_MAX_BATCH_BYTES = 4000000 50 | DEFAULT_MAX_BATCH_RECORDS = 20000 51 | MILLISECOND_SEQUENCE_MULTIPLIER = 1000 52 | NANOSECOND_SEQUENCE_MULTIPLIER = 1000000 53 | 54 | # This is our singleton aiohttp session 55 | OUR_SESSION = None 56 | 57 | # This datastructure contains our pending aiohttp requests. 58 | # The main thread will read from it. 59 | # The event loop thread will write to it by appending new requests to it and removing completed requests. 60 | PENDING_REQUESTS = [] 61 | 62 | # This variable holds any exceptions we have encountered sending data to the gate. 63 | # The main thread will read from it and terminate the target if an exception is present. 
64 | # The event loop thread will write to it after each aiohttp request completes 65 | SEND_EXCEPTION = None 66 | 67 | CONFIG = {} 68 | 69 | def start_loop(loop): 70 | asyncio.set_event_loop(loop) 71 | global OUR_SESSION 72 | timeout = aiohttp.ClientTimeout(sock_connect=60, sock_read=60) 73 | OUR_SESSION = aiohttp.ClientSession(connector=aiohttp.TCPConnector(loop=loop), timeout=timeout) 74 | loop.run_forever() 75 | 76 | new_loop = asyncio.new_event_loop() 77 | # new_loop.set_debug(True) 78 | t = Thread(target=start_loop, args=(new_loop,)) 79 | 80 | #The event loop thread should not keep the process alive after the main thread terminates 81 | t.daemon = True 82 | 83 | t.start() 84 | 85 | 86 | class TargetStitchException(Exception): 87 | '''A known exception for which we don't need to print a stack trace''' 88 | 89 | class StitchClientResponseError(Exception): 90 | def __init__(self, status, response_body): 91 | self.response_body = response_body 92 | self.status = status 93 | super().__init__() 94 | 95 | class MemoryReporter(Thread): 96 | '''Logs memory usage every 30 seconds''' 97 | 98 | def __init__(self): 99 | self.process = psutil.Process() 100 | super().__init__(name='memory_reporter', daemon=True) 101 | 102 | def run(self): 103 | while True: 104 | LOGGER.debug('Virtual memory usage: %.2f%% of total: %s', 105 | self.process.memory_percent(), 106 | self.process.memory_info()) 107 | time.sleep(30.0) 108 | 109 | 110 | class Timings: 111 | '''Gathers timing information for the three main steps of the Tap.''' 112 | def __init__(self): 113 | self.last_time = time.time() 114 | self.timings = { 115 | 'serializing': 0.0, 116 | 'posting': 0.0, 117 | None: 0.0 118 | } 119 | 120 | @contextmanager 121 | def mode(self, mode): 122 | '''We wrap the big steps of the Tap in this context manager to accumulate 123 | timing info.''' 124 | 125 | start = time.time() 126 | yield 127 | end = time.time() 128 | self.timings[None] += start - self.last_time 129 | self.timings[mode] += end - start 130 | self.last_time = end 131 | 132 | 133 | def log_timings(self): 134 | '''We call this with every flush to print out the accumulated timings''' 135 | LOGGER.debug('Timings: unspecified: %.3f; serializing: %.3f; posting: %.3f;', 136 | self.timings[None], 137 | self.timings['serializing'], 138 | self.timings['posting']) 139 | 140 | TIMINGS = Timings() 141 | 142 | 143 | class BatchTooLargeException(TargetStitchException): 144 | '''Exception for when the records and schema are so large that we can't 145 | create a batch with even one record.''' 146 | 147 | def _log_backoff(details): 148 | (_, exc, _) = sys.exc_info() 149 | LOGGER.info( 150 | 'Error sending data to Stitch. 
Sleeping %d seconds before trying again: %s',
151 |         details['wait'], exc)
152 | 
153 | def parse_config(config_location):
154 |     global CONFIG
155 |     CONFIG = json.load(config_location)
156 |     if not CONFIG.get('token'):
157 |         raise Exception('Configuration is missing required "token" field')
158 | 
159 |     if not CONFIG.get('client_id'):
160 |         raise Exception('Configuration is missing required "client_id"')
161 | 
162 |     if not isinstance(CONFIG.get('batch_size_preferences'), dict):
163 |         raise Exception('Configuration requires a "batch_size_preferences" dictionary')
164 | 
165 |     if not CONFIG['batch_size_preferences'].get('full_table_streams'):
166 |         CONFIG['batch_size_preferences']['full_table_streams'] = []
167 |     LOGGER.info('Using batch_size_preferences of %s', CONFIG['batch_size_preferences'])
168 | 
169 |     if not CONFIG.get('turbo_boost_factor'):
170 |         CONFIG['turbo_boost_factor'] = 1
171 | 
172 |     if CONFIG['turbo_boost_factor'] != 5:
173 |         LOGGER.info('Using turbo_boost_factor of %s', CONFIG['turbo_boost_factor'])
174 | 
175 |     if not CONFIG.get('small_batch_url'):
176 |         raise Exception('Configuration is missing required "small_batch_url"')
177 | 
178 |     if not CONFIG.get('big_batch_url'):
179 |         raise Exception('Configuration is missing required "big_batch_url"')
180 | 
181 | def determine_stitch_url(stream_name):
182 |     batch_size_prefs = CONFIG.get('batch_size_preferences')
183 |     if stream_name in batch_size_prefs.get('full_table_streams'):
184 |         return CONFIG.get('big_batch_url')
185 | 
186 |     # e.g. platform.heap requires S3 because it is full-table data
187 |     if batch_size_prefs.get('batch_size_preference') == 'bigbatch':
188 |         return CONFIG.get('big_batch_url')
189 | 
190 |     if batch_size_prefs.get('batch_size_preference') == 'smallbatch':
191 |         return CONFIG.get('small_batch_url')
192 | 
193 |     #NB> not implemented yet
194 |     if batch_size_prefs.get('user_batch_size_preference') == 'bigbatch':
195 |         return CONFIG.get('big_batch_url')
196 | 
197 |     #NB> not implemented yet
198 |     if batch_size_prefs.get('user_batch_size_preference') == 'smallbatch':
199 |         return CONFIG.get('small_batch_url')
200 | 
201 |     return CONFIG.get('small_batch_url')
202 | 
203 | 
204 | 
205 | class StitchHandler: # pylint: disable=too-few-public-methods
206 |     '''Sends messages to Stitch.'''
207 | 
208 |     def __init__(self, max_batch_bytes, max_batch_records):
209 |         self.token = CONFIG.get('token')
210 |         self.max_batch_bytes = max_batch_bytes
211 |         self.max_batch_records = max_batch_records
212 | 
213 |     @staticmethod
214 |     #this happens in the event loop
215 |     def flush_states(state_writer, future):
216 | 
217 |         global PENDING_REQUESTS
218 |         global SEND_EXCEPTION
219 | 
220 |         completed_count = 0
221 | 
222 |         #NB> if/when the first coroutine errors out, we will record it for examination by the main thread.
223 |         #if/when this happens, no further flushing of state should ever occur. the main thread, in fact,
224 |         #should shut down quickly after it spots the exception
225 |         if SEND_EXCEPTION is None:
226 |             SEND_EXCEPTION = future.exception()
227 | 
228 |         if SEND_EXCEPTION is not None:
229 |             LOGGER.info('FLUSH early exit because of SEND_EXCEPTION: %s', pformat(SEND_EXCEPTION))
230 |             return
231 | 
232 |         try:
233 |             for f, s in PENDING_REQUESTS:
234 |                 if f.done():
235 |                     completed_count = completed_count + 1
236 |                     #NB> this is a very important line.
237 |                     #NEVER blindly emit state just because a coroutine has completed.
238 | #if this were None, we would have just nuked the client's state 239 | if s: 240 | line = simplejson.dumps(s) 241 | state_writer.write(f"{line}\n") 242 | state_writer.flush() 243 | else: 244 | break 245 | 246 | PENDING_REQUESTS = PENDING_REQUESTS[completed_count:] 247 | 248 | except BaseException as err: 249 | SEND_EXCEPTION = err 250 | 251 | 252 | def headers(self): 253 | '''Return the headers based on the token''' 254 | return { 255 | 'Authorization': f'Bearer {self.token}', 256 | 'Content-Type': 'application/json' 257 | } 258 | 259 | def send(self, data, contains_activate_version, state_writer, state, stitch_url): 260 | '''Send the given data to Stitch, retrying on exceptions''' 261 | global PENDING_REQUESTS 262 | global SEND_EXCEPTION 263 | 264 | check_send_exception() 265 | 266 | headers = self.headers() 267 | verify_ssl = True 268 | if os.environ.get("TARGET_STITCH_SSL_VERIFY") == 'false': 269 | verify_ssl = False 270 | 271 | LOGGER.info("Sending batch of %d bytes to %s", len(data), stitch_url) 272 | 273 | #NB> before we send any activate_versions we must ensure that all PENDING_REQUESTS complete. 274 | #this is to ensure ordering in the case of Full Table replication where the Activate Version 275 | #must arrive AFTER all of the relevant data. 276 | if len(PENDING_REQUESTS) > 0 and contains_activate_version: 277 | LOGGER.info('Sending batch with ActivateVersion. Flushing PENDING_REQUESTS first') 278 | finish_requests() 279 | 280 | if len(PENDING_REQUESTS) >= CONFIG.get('turbo_boost_factor'): 281 | 282 | #wait for the first future to finish before resuming the main thread 283 | finish_requests(CONFIG.get('turbo_boost_factor') - 1) 284 | 285 | #NB> this schedules the task on the event loop thread. 286 | # it will be executed at some point in the future 287 | future = asyncio.run_coroutine_threadsafe(post_coroutine(stitch_url, headers, data, verify_ssl), new_loop) 288 | next_pending_request = (future, state) 289 | PENDING_REQUESTS.append(next_pending_request) 290 | future.add_done_callback(functools.partial(self.flush_states, state_writer)) 291 | 292 | 293 | def handle_state_only(self, state_writer=None, state=None): 294 | async def fake_future_fn(): 295 | pass 296 | 297 | global PENDING_REQUESTS 298 | #NB> no point in sending out this state if a previous request has failed 299 | check_send_exception() 300 | future = asyncio.run_coroutine_threadsafe(fake_future_fn(), new_loop) 301 | next_pending_request = (future, state) 302 | PENDING_REQUESTS.append(next_pending_request) 303 | 304 | future.add_done_callback(functools.partial(self.flush_states, state_writer)) 305 | 306 | 307 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names=None, state_writer=None, state=None): 308 | '''Handle messages by sending them to Stitch. 309 | 310 | If the serialized form of the messages is too large to fit into a 311 | single request this will break them up into multiple smaller 312 | requests. 
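Any request body that still exceeds DEFAULT_MAX_BATCH_BYTES after splitting (for example, a single very large record) is routed to the big_batch_url instead of the small batch endpoint.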
313 | 314 | ''' 315 | 316 | stitch_url = determine_stitch_url(messages[0].stream) 317 | LOGGER.info("Serializing batch with %d messages for table %s", len(messages), messages[0].stream) 318 | with TIMINGS.mode('serializing'): 319 | bodies = serialize(messages, 320 | schema, 321 | key_names, 322 | bookmark_names, 323 | self.max_batch_bytes, 324 | self.max_batch_records) 325 | 326 | LOGGER.debug('Split batch into %d requests', len(bodies)) 327 | for i, body in enumerate(bodies): 328 | with TIMINGS.mode('posting'): 329 | LOGGER.debug('Request %d of %d is %d bytes', i + 1, len(bodies), len(body)) 330 | if len(body) > DEFAULT_MAX_BATCH_BYTES: 331 | stitch_url = CONFIG.get('big_batch_url') 332 | 333 | flushable_state = None 334 | if i + 1 == len(bodies): 335 | flushable_state = state 336 | 337 | self.send(body, contains_activate_version, state_writer, flushable_state, stitch_url) 338 | 339 | # Write a singer.metrics.Counter and set the value to the count of records being sent 340 | with metrics.Counter(metrics.Metric.record_count, tags={"endpoint": messages[0].stream, 341 | "num_bytes": sum([len(body) for body in bodies])}) as c: 342 | c.value = len([m for m in messages if isinstance(m, singer.RecordMessage)]) 343 | 344 | 345 | 346 | class LoggingHandler: # pylint: disable=too-few-public-methods 347 | '''Logs records to a local output file.''' 348 | def __init__(self, output_file, max_batch_bytes, max_batch_records): 349 | self.output_file = output_file 350 | self.max_batch_bytes = max_batch_bytes 351 | self.max_batch_records = max_batch_records 352 | 353 | def handle_state_only(self, state_writer=None, state=None): 354 | LOGGER.info("LoggingHandler handle_state_only: %s", state) 355 | if state: 356 | line = simplejson.dumps(state) 357 | state_writer.write(f"{line}\n") 358 | state_writer.flush() 359 | 360 | 361 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names=None, state_writer=None, state=None): #pylint: disable=unused-argument 362 | '''Handles a batch of messages by saving them to a local output file. 363 | 364 | Serializes records in the same way StitchHandler does, so the 365 | output file should contain the exact request bodies that we would 366 | send to Stitch. 
367 | 368 | ''' 369 | LOGGER.info("LoggingHandler handle_batch") 370 | LOGGER.info("Saving batch with %d messages for table %s to %s", 371 | len(messages), messages[0].stream, self.output_file.name) 372 | for i, body in enumerate(serialize(messages, 373 | schema, 374 | key_names, 375 | bookmark_names, 376 | self.max_batch_bytes, 377 | self.max_batch_records)): 378 | LOGGER.debug("Request body %d is %d bytes", i, len(body)) 379 | self.output_file.write(body) 380 | self.output_file.write('\n') 381 | 382 | if state: 383 | line = simplejson.dumps(state) 384 | state_writer.write(f"{line}\n") 385 | state_writer.flush() 386 | 387 | 388 | 389 | class ValidatingHandler: # pylint: disable=too-few-public-methods 390 | '''Validates input messages against their schema.''' 391 | 392 | def __init__(self): 393 | getcontext().prec = 76 394 | 395 | def handle_state_only(self, state_writer=None, state=None): 396 | LOGGER.info("ValidatingHandler handle_state_only: %s", state) 397 | if state: 398 | line = simplejson.dumps(state) 399 | state_writer.write(f"{line}\n") 400 | state_writer.flush() 401 | 402 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names=None, state_writer=None, state=None): 403 | '''Handles messages by validating them against schema.''' 404 | LOGGER.info("ValidatingHandler handle_batch") 405 | validator = Draft4Validator(schema, format_checker=FormatChecker()) 406 | for i, message in enumerate(messages): 407 | if isinstance(message, singer.RecordMessage): 408 | try: 409 | validator.validate(message.record) 410 | if key_names: 411 | for k in key_names: 412 | if k not in message.record: 413 | raise TargetStitchException( 414 | f'Message {i} is missing key property {k}' 415 | ) 416 | except Exception as e: 417 | raise TargetStitchException( 418 | f'Record does not pass schema validation: {e}') from e 419 | 420 | # pylint: disable=undefined-loop-variable 421 | # NB: This seems incorrect as there's a chance message is not defined 422 | LOGGER.info('%s (%s): Batch is valid', 423 | messages[0].stream, 424 | len(messages)) 425 | if state: 426 | line = simplejson.dumps(state) 427 | state_writer.write(f"{line}\n") 428 | state_writer.flush() 429 | 430 | def generate_sequence(message_num, max_records): 431 | ''' 432 | Generates a unique sequence number based on the current time in nanoseconds 433 | with a zero-padded message number based on the index of the record within the 434 | magnitude of max_records. 435 | 436 | COMPATIBILITY: 437 | Maintains a historical width of 19 characters (with default `max_records`), in order 438 | to not overflow downstream processes that depend on the width of this number. 439 | 440 | Because of this requirement, `message_num` is modulo the difference between nanos 441 | and millis to maintain 19 characters. 
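For illustration, assuming NANOSECOND_SEQUENCE_MULTIPLIER is 1e9 and MILLISECOND_SEQUENCE_MULTIPLIER is 1e3: the base is the current time in nanoseconds and `message_num` is taken modulo 1,000,000, zero-padded, and appended to that base.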
442 | ''' 443 | nanosecond_sequence_base = str(int(time.time() * NANOSECOND_SEQUENCE_MULTIPLIER)) 444 | modulo = NANOSECOND_SEQUENCE_MULTIPLIER / MILLISECOND_SEQUENCE_MULTIPLIER 445 | zfill_width_mod = len(str(NANOSECOND_SEQUENCE_MULTIPLIER)) - len(str(MILLISECOND_SEQUENCE_MULTIPLIER)) 446 | 447 | # add an extra order of magnitude to account for the fact that we can 448 | # actually accept more than the max record count 449 | fill = len(str(10 * max_records)) - zfill_width_mod 450 | sequence_suffix = str(int(message_num % modulo)).zfill(fill) 451 | 452 | return int(nanosecond_sequence_base + sequence_suffix) 453 | 454 | def serialize(messages, schema, key_names, bookmark_names, max_bytes, max_records): 455 | '''Produces request bodies for Stitch. 456 | 457 | Builds a request body consisting of all the messages. Serializes it as 458 | JSON. If the result exceeds the request size limit, splits the batch 459 | in half and recurs. 460 | 461 | ''' 462 | serialized_messages = [] 463 | for idx, message in enumerate(messages): 464 | if isinstance(message, singer.RecordMessage): 465 | record_message = { 466 | 'action': 'upsert', 467 | 'data': message.record, 468 | 'sequence': generate_sequence(idx, max_records) 469 | } 470 | 471 | if message.time_extracted: 472 | #"%04Y-%m-%dT%H:%M:%S.%fZ" 473 | record_message['time_extracted'] = singer.utils.strftime(message.time_extracted) 474 | 475 | serialized_messages.append(record_message) 476 | elif isinstance(message, singer.ActivateVersionMessage): 477 | serialized_messages.append({ 478 | 'action': 'activate_version', 479 | 'sequence': generate_sequence(idx, max_records) 480 | }) 481 | 482 | body = { 483 | 'table_name': messages[0].stream, 484 | 'schema': schema, 485 | 'key_names': key_names, 486 | 'messages': serialized_messages 487 | } 488 | if messages[0].version is not None: 489 | body['table_version'] = messages[0].version 490 | 491 | if bookmark_names: 492 | body['bookmark_names'] = bookmark_names 493 | 494 | 495 | # We are not using Decimals for parsing here. We recognize that 496 | # exposes data to potential rounding errors. However, the Stitch API 497 | # as it is implemented currently is also subject to rounding errors. 498 | # This will affect very few data points and we have chosen to leave 499 | # conversion as is for now. 500 | 501 | serialized = simplejson.dumps(body) 502 | LOGGER.debug('Serialized %d messages into %d bytes', len(messages), len(serialized)) 503 | 504 | if len(serialized) < max_bytes: 505 | return [serialized] 506 | 507 | if len(messages) <= 1: 508 | if len(serialized) < BIGBATCH_MAX_BATCH_BYTES: 509 | return [serialized] 510 | raise BatchTooLargeException( 511 | f"A single record is larger than the Stitch API limit of {BIGBATCH_MAX_BATCH_BYTES // 1000000} Mb" 512 | ) 513 | 514 | 515 | pivot = len(messages) // 2 516 | l_half = serialize(messages[:pivot], schema, key_names, bookmark_names, max_bytes, max_records) 517 | r_half = serialize(messages[pivot:], schema, key_names, bookmark_names, max_bytes, max_records) 518 | return l_half + r_half 519 | 520 | 521 | class TargetStitch: 522 | '''Encapsulates most of the logic of target-stitch. 523 | 524 | Useful for unit testing. 
525 | 526 | ''' 527 | 528 | # pylint: disable=too-many-instance-attributes 529 | def __init__(self, # pylint: disable=too-many-arguments 530 | handlers, 531 | state_writer, 532 | max_batch_bytes, 533 | max_batch_records, 534 | batch_delay_seconds): 535 | self.messages = {} 536 | self.contains_activate_version = {} 537 | self.buffer_size_bytes = {} 538 | self.state = None 539 | 540 | # Mapping from stream name to {'schema': ..., 'key_names': ..., 'bookmark_names': ... } 541 | self.stream_meta = {} 542 | 543 | # Instance of StitchHandler 544 | self.handlers = handlers 545 | 546 | # Writer that we write state records to 547 | self.state_writer = state_writer 548 | 549 | # Batch size limits. Stored as properties here so we can easily 550 | # change for testing. 551 | self.max_batch_bytes = max_batch_bytes 552 | self.max_batch_records = max_batch_records 553 | 554 | # Minimum frequency to send a batch, used with self.time_last_batch_sent 555 | self.batch_delay_seconds = batch_delay_seconds 556 | 557 | # Time that the last batch was sent 558 | self.time_last_batch_sent = time.time() 559 | 560 | 561 | 562 | def flush_stream(self, stream, is_final_stream): 563 | '''Send all the buffered messages to Stitch.''' 564 | 565 | messages = self.messages[stream] 566 | stream_meta = self.stream_meta[stream] 567 | 568 | # NB: We only want to include the state on the final stream we are 569 | # batching because this will prevent the state from flushing until 570 | # all of the streams are flushed because the state is global for 571 | # all streams so if one of the streams fails to batch we cannot 572 | # flush the state 573 | if is_final_stream: 574 | state = self.state 575 | else: 576 | state = None 577 | 578 | for handler in self.handlers: 579 | handler.handle_batch(messages, 580 | self.contains_activate_version.get(stream, False), 581 | stream_meta.schema, 582 | stream_meta.key_properties, 583 | stream_meta.bookmark_properties, 584 | self.state_writer, 585 | state) 586 | 587 | self.time_last_batch_sent = time.time() 588 | self.contains_activate_version[stream] = False 589 | self.buffer_size_bytes[stream] = 0 590 | self.messages[stream] = [] 591 | # NB: We can only clear the state if this is the final stream 592 | # flush. Otherwise we risk clearing out the state before we can 593 | # even send it. 594 | if is_final_stream: 595 | self.state = None 596 | 597 | 598 | def flush(self): 599 | # Have to keep track of how many streams we have looked at so we 600 | # know when we are flushing the final stream 601 | messages_to_flush = { stream: messages for stream, messages in self.messages.items() if len(messages) > 0 } 602 | num_flushed = 0 603 | num_streams = len(messages_to_flush) 604 | for stream, messages in messages_to_flush.items(): 605 | num_flushed += 1 606 | is_final_stream = num_flushed == num_streams 607 | self.flush_stream(stream, is_final_stream) 608 | # NB> State is usually handled above but in the case there are no messages 609 | # we still want to ensure state is emitted. 610 | if num_flushed == 0 and self.state: 611 | for handler in self.handlers: 612 | handler.handle_state_only(self.state_writer, self.state) 613 | self.state = None 614 | TIMINGS.log_timings() 615 | 616 | 617 | 618 | def handle_line(self, line): 619 | 620 | '''Takes a raw line from stdin and handles it, updating state and possibly 621 | flushing the batch to the Gate and the state to the output 622 | stream. 
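SCHEMA messages trigger a flush and register stream metadata; RECORD and ACTIVATE_VERSION messages are buffered per stream; STATE messages update the pending state; any other message type is ignored.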
623 | 624 | ''' 625 | 626 | message = overloaded_parse_message(line) 627 | 628 | # If we got a Schema, set the schema and key properties for this 629 | # stream. Flush the batch, if there is one, in case the schema is 630 | # different. 631 | if isinstance(message, singer.SchemaMessage): 632 | self.flush() 633 | 634 | if message.stream not in self.messages: 635 | self.messages[message.stream] = [] 636 | self.stream_meta[message.stream] = StreamMeta( 637 | message.schema, 638 | message.key_properties, 639 | message.bookmark_properties) 640 | 641 | elif isinstance(message, (singer.RecordMessage, singer.ActivateVersionMessage)): 642 | current_stream = message.stream 643 | # NB> This previously would flush on a stream change. Because 644 | # we are now buffering records across streams we do not need 645 | # to flush on stream change 646 | if self.messages[current_stream] and (message.version != self.messages[current_stream][0].version): 647 | self.flush() 648 | 649 | self.messages[current_stream].append(message) 650 | self.buffer_size_bytes[current_stream] = self.buffer_size_bytes.get(current_stream, 0) + len(line) 651 | if isinstance(message, singer.ActivateVersionMessage): 652 | self.contains_activate_version[current_stream] = True 653 | 654 | num_bytes = sum(self.buffer_size_bytes.values()) 655 | num_messages = sum((len(messages) for messages in self.messages.values())) 656 | num_seconds = time.time() - self.time_last_batch_sent 657 | 658 | enough_bytes = num_bytes >= self.max_batch_bytes 659 | enough_messages = num_messages >= self.max_batch_records 660 | enough_time = num_seconds >= self.batch_delay_seconds 661 | if enough_bytes or enough_messages or enough_time: 662 | LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds', 663 | num_bytes, num_messages, num_seconds) 664 | self.flush() 665 | 666 | elif isinstance(message, singer.StateMessage): 667 | self.state = message.value 668 | 669 | # only check time since state message does not increase num_messages or 670 | # num_bytes for the batch 671 | num_seconds = time.time() - self.time_last_batch_sent 672 | 673 | if num_seconds >= self.batch_delay_seconds: 674 | LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds', 675 | sum(self.buffer_size_bytes.values()), 676 | sum(len(messages) for messages in self.messages.values()), num_seconds) 677 | self.flush() 678 | self.time_last_batch_sent = time.time() 679 | 680 | 681 | 682 | def consume(self, reader): 683 | '''Consume all the lines from the queue, flushing when done.''' 684 | for line in reader: 685 | self.handle_line(line) 686 | self.flush() 687 | 688 | 689 | def collect(): 690 | '''Send usage info to Stitch.''' 691 | 692 | try: 693 | version = pkg_resources.get_distribution('target-stitch').version 694 | conn = http.client.HTTPSConnection('collector.stitchdata.com', timeout=10) 695 | conn.connect() 696 | params = { 697 | 'e': 'se', 698 | 'aid': 'singer', 699 | 'se_ca': 'target-stitch', 700 | 'se_ac': 'open', 701 | 'se_la': version, 702 | } 703 | conn.request('GET', '/i?' 
+ urllib.parse.urlencode(params)) 704 | conn.getresponse() 705 | conn.close() 706 | except: # pylint: disable=bare-except 707 | LOGGER.debug('Collection request failed') 708 | 709 | 710 | def main_impl(): 711 | '''We wrap this function in main() to add exception handling''' 712 | parser = argparse.ArgumentParser() 713 | 714 | parser.add_argument( 715 | '-c', '--config', 716 | help='Config file', 717 | type=argparse.FileType('r')) 718 | parser.add_argument( 719 | '-n', '--dry-run', 720 | help='Dry run - Do not push data to Stitch', 721 | action='store_true') 722 | parser.add_argument( 723 | '-o', '--output-file', 724 | help='Save requests to this output file', 725 | type=argparse.FileType('w')) 726 | parser.add_argument( 727 | '-v', '--verbose', 728 | help='Produce debug-level logging', 729 | action='store_true') 730 | parser.add_argument( 731 | '-q', '--quiet', 732 | help='Suppress info-level logging', 733 | action='store_true') 734 | parser.add_argument('--max-batch-records', type=int, default=DEFAULT_MAX_BATCH_RECORDS) 735 | parser.add_argument('--max-batch-bytes', type=int, default=DEFAULT_MAX_BATCH_BYTES) 736 | parser.add_argument('--batch-delay-seconds', type=float, default=300.0) 737 | args = parser.parse_args() 738 | 739 | if args.verbose: 740 | LOGGER.setLevel('DEBUG') 741 | elif args.quiet: 742 | LOGGER.setLevel('WARNING') 743 | 744 | handlers = [] 745 | if args.output_file: 746 | handlers.append(LoggingHandler(args.output_file, 747 | args.max_batch_bytes, 748 | args.max_batch_records)) 749 | if args.dry_run: 750 | handlers.append(ValidatingHandler()) 751 | elif not args.config: 752 | parser.error("config file required if not in dry run mode") 753 | else: 754 | parse_config(args.config) 755 | 756 | if not CONFIG.get('disable_collection'): 757 | LOGGER.info('Sending version information to stitchdata.com. ' + 758 | 'To disable sending anonymous usage data, set ' + 759 | 'the config parameter "disable_collection" to true') 760 | Thread(target=collect).start() 761 | handlers.append(StitchHandler(args.max_batch_bytes, 762 | args.max_batch_records)) 763 | 764 | # queue = Queue(args.max_batch_records) 765 | reader = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 766 | target_stitch = TargetStitch(handlers, 767 | sys.stdout, 768 | args.max_batch_bytes, 769 | args.max_batch_records, 770 | args.batch_delay_seconds) 771 | target_stitch.consume(reader) 772 | 773 | #NB> we need to wait for this to be empty indicating that all of the 774 | #requests have been finished and their states flushed 775 | finish_requests() 776 | LOGGER.info("Requests complete, stopping loop") 777 | new_loop.call_soon_threadsafe(new_loop.stop) 778 | 779 | 780 | def finish_requests(max_count=0): 781 | global PENDING_REQUESTS 782 | while True: 783 | # LOGGER.info("Finishing %s requests:", len(PENDING_REQUESTS)) 784 | check_send_exception() 785 | if len(PENDING_REQUESTS) <= max_count: #pylint: disable=len-as-condition 786 | break 787 | time.sleep(1 / 1000.0) 788 | 789 | 790 | 791 | def check_send_exception(): 792 | try: 793 | global SEND_EXCEPTION 794 | if SEND_EXCEPTION: 795 | raise SEND_EXCEPTION 796 | 797 | # An StitchClientResponseError means we received > 2xx response 798 | # Try to parse the "message" from the 799 | # json body of the response, since Stitch should include 800 | # the human-oriented message in that field. If there are 801 | # any errors parsing the message, just include the 802 | # stringified response. 
803 | except StitchClientResponseError as exc: 804 | try: 805 | msg = f"{str(exc.status)}: {exc.response_body}" 806 | except Exception: # pylint: disable=bare-except 807 | LOGGER.exception('Exception while processing error response') 808 | msg = f'{exc}' 809 | raise TargetStitchException('Error persisting data to Stitch: ' + 810 | msg) from exc 811 | 812 | # A ClientConnectorError means we 813 | # couldn't even connect to Stitch. The exception is likely 814 | # to be very long and gross. Log the full details but just 815 | # include the summary in the critical error message. 816 | except ClientConnectorError as exc: 817 | LOGGER.exception(exc) 818 | raise TargetStitchException('Error connecting to Stitch') from exc 819 | 820 | except concurrent.futures._base.TimeoutError as exc: #pylint: disable=protected-access 821 | raise TargetStitchException("Timeout sending to Stitch") from exc 822 | 823 | 824 | def exception_is_4xx(ex): 825 | return 400 <= ex.status < 500 826 | 827 | @backoff.on_exception(backoff.expo, 828 | StitchClientResponseError, 829 | max_tries=5, 830 | giveup=exception_is_4xx, 831 | on_backoff=_log_backoff) 832 | async def post_coroutine(url, headers, data, verify_ssl): 833 | # LOGGER.info("POST starting: %s ssl(%s)", url, verify_ssl) 834 | global OUR_SESSION 835 | async with OUR_SESSION.post(url, headers=headers, data=data, raise_for_status=False, verify_ssl=verify_ssl) as response: 836 | result_body = None 837 | try: 838 | result_body = await response.json() 839 | except BaseException as ex: #pylint: disable=unused-variable 840 | raise StitchClientResponseError(response.status, "unable to parse response body as json") from ex 841 | 842 | if response.status // 100 != 2: 843 | raise StitchClientResponseError(response.status, result_body) 844 | 845 | return result_body 846 | 847 | def _required_key(msg, k): 848 | if k not in msg: 849 | raise Exception(f"Message is missing required key '{k}': {msg}") 850 | 851 | return msg[k] 852 | 853 | def overloaded_parse_message(msg): 854 | """Parse a message string into a Message object.""" 855 | 856 | # We are not using Decimals for parsing here. 857 | # We recognize that this exposes data to potentially 858 | # lossy conversions. However, this will affect 859 | # very few data points and we have chosen to 860 | # leave conversion as is for now. 
861 | obj = simplejson.loads(msg, use_decimal=True) 862 | msg_type = _required_key(obj, 'type') 863 | 864 | if msg_type == 'RECORD': 865 | time_extracted = obj.get('time_extracted') 866 | if time_extracted: 867 | try: 868 | time_extracted = ciso8601.parse_datetime(time_extracted) 869 | except Exception: 870 | time_extracted = None 871 | return singer.RecordMessage(stream=_required_key(obj, 'stream'), 872 | record=_required_key(obj, 'record'), 873 | version=obj.get('version'), 874 | time_extracted=time_extracted) 875 | 876 | if msg_type == 'SCHEMA': 877 | return singer.SchemaMessage(stream=_required_key(obj, 'stream'), 878 | schema=_required_key(obj, 'schema'), 879 | key_properties=_required_key(obj, 'key_properties'), 880 | bookmark_properties=obj.get('bookmark_properties')) 881 | 882 | if msg_type == 'STATE': 883 | return singer.StateMessage(value=_required_key(obj, 'value')) 884 | 885 | if msg_type == 'ACTIVATE_VERSION': 886 | return singer.ActivateVersionMessage(stream=_required_key(obj, 'stream'), 887 | version=_required_key(obj, 'version')) 888 | return None 889 | 890 | 891 | def main(): 892 | '''Main entry point''' 893 | try: 894 | MemoryReporter().start() 895 | main_impl() 896 | 897 | # If we catch an exception at the top level we want to log a CRITICAL 898 | # line to indicate the reason why we're terminating. Sometimes the 899 | # extended stack traces can be confusing and this provides a clear way 900 | # to call out the root cause. If it's a known TargetStitchException we 901 | # can suppress the stack trace, otherwise we should include the stack 902 | # trace for debugging purposes, so re-raise the exception. 903 | except TargetStitchException as exc: 904 | for line in str(exc).splitlines(): 905 | LOGGER.critical(line) 906 | sys.exit(1) 907 | except Exception as exc: 908 | LOGGER.critical(exc) 909 | raise exc 910 | 911 | if __name__ == '__main__': 912 | main() 913 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singer-io/target-stitch/a2a084ccdff70aeb0c313066e5ae523f183b67f1/tests/__init__.py -------------------------------------------------------------------------------- /tests/activate_version_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import target_stitch 3 | from target_stitch import StitchHandler, TargetStitchException, finish_requests 4 | import io 5 | import json 6 | import simplejson 7 | import asyncio 8 | 9 | try: 10 | from tests.gate_mocks import mock_out_of_order_all_200 11 | except ImportError: 12 | from gate_mocks import mock_out_of_order_all_200 13 | 14 | 15 | class FakePost: 16 | def __init__(self, requests_sent, makeFakeResponse): 17 | self.requests_sent = requests_sent 18 | self.makeFakeResponse = makeFakeResponse 19 | 20 | async def __aenter__(self): 21 | return self.makeFakeResponse(self.requests_sent) 22 | 23 | async def __aexit__(self, exc_type, exc, tb): 24 | await asyncio.sleep(1) 25 | 26 | class FakeSession: 27 | def __init__(self, makeFakeResponse): 28 | self.requests_sent = 0 29 | self.bodies_sent = [] 30 | self.makeFakeResponse = makeFakeResponse 31 | 32 | def post(self, url, *, data, **kwargs): 33 | self.requests_sent = self.requests_sent + 1 34 | self.bodies_sent.append(data) 35 | return FakePost(self.requests_sent, self.makeFakeResponse) 36 | 37 | class ActivateVersion(unittest.TestCase): 38 | def 
fake_flush_states(self, state_writer, future): 39 | self.flushed_state_count = self.flushed_state_count + 1 40 | 41 | if self.flushed_state_count == 1: 42 | #2nd request has not begun because it contains an ActivateVersion and must wait for 1 to complete 43 | if len(target_stitch.PENDING_REQUESTS) != 1: 44 | self.first_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong pending request count for first flush" 45 | 46 | if future != target_stitch.PENDING_REQUESTS[0][0]: 47 | self.first_flush_error = "ActivateVersion request should not have been issues until 1st request completed: received wrong future for first flush" 48 | 49 | if target_stitch.PENDING_REQUESTS[0][1] != {'bookmarks': {'chicken_stream': {'id': 1}}}: 50 | self.first_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong state for first flush" 51 | 52 | elif self.flushed_state_count == 2: 53 | if len(target_stitch.PENDING_REQUESTS) != 1: 54 | self.second_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong pending request count for second flush" 55 | 56 | if future != target_stitch.PENDING_REQUESTS[0][0]: 57 | self.second_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong future for second flush" 58 | 59 | if target_stitch.PENDING_REQUESTS[0][1] is not None: 60 | self.second_flush_error = "ActivateVersion request should not have been issues until 1st request completed: wrong state for second flush" 61 | 62 | else: 63 | raise Exception('flushed state should only have been called twice') 64 | 65 | self.og_flush_states(state_writer, future) 66 | 67 | 68 | def setUp(self): 69 | token = None 70 | self.first_flush_error = None 71 | self.second_flush_error = None 72 | 73 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 74 | 75 | self.out = io.StringIO() 76 | self.target_stitch = target_stitch.TargetStitch( 77 | [handler], self.out, 4000000, 2, 100000) 78 | self.queue = [simplejson.dumps({"type": "SCHEMA", "stream": "chicken_stream", 79 | "key_properties": ["my_float"], 80 | "schema": {"type": "object", 81 | "properties": {"my_float": {"type": "number"}}}})] 82 | target_stitch.SEND_EXCEPTION = None 83 | target_stitch.PENDING_REQUESTS = [] 84 | self.og_flush_states = StitchHandler.flush_states 85 | self.flushed_state_count = 0 86 | StitchHandler.flush_states = self.fake_flush_states 87 | 88 | target_stitch.CONFIG = { 89 | 'token': "some-token", 90 | 'client_id': "some-client", 91 | 'disable_collection': True, 92 | 'connection_ns': "some-ns", 93 | 'batch_size_preferences' : { 94 | 'full_table_streams' : [], 95 | 'batch_size_preference': None, 96 | 'user_batch_size_preference': None, 97 | }, 98 | 'turbo_boost_factor' : 10, 99 | 'small_batch_url' : "http://small-batch", 100 | 'big_batch_url' : "http://big-batch", 101 | } 102 | 103 | def test_activate_version_finishes_pending_requests(self): 104 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_all_200) 105 | #request 2 would ordinarily complete first because the mock_out_of_order_all_200, but because 106 | #request 2 contains an ACTIVATE_VERSION, it will not even be sent until request 1 completes 107 | 108 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "version":1, "record": {"id": 1, "name": "Mike"}})) 109 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 110 | #will flush here after 
2 records 111 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", 'version':1, "record": {"id": 2, "name": "Paul"}})) 112 | self.queue.append(json.dumps({"type":"ACTIVATE_VERSION", 'stream': 'chicken_stream', 'version': 1 })) 113 | #will flush here after 2 records 114 | 115 | 116 | self.target_stitch.consume(self.queue) 117 | finish_requests() 118 | self.assertEqual(self.first_flush_error, None, self.first_flush_error) 119 | self.assertEqual(self.second_flush_error, None, self.second_flush_error) 120 | 121 | 122 | if __name__== "__main__": 123 | test1 = ActivateVersion() 124 | test1.setUp() 125 | test1.test_activate_version_finishes_pending_requests() 126 | #test1.test_unparseable_json_response() 127 | -------------------------------------------------------------------------------- /tests/doesnt_validate.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "users", "schema": { "type": "object", "properties": { "name": { "type": "string" } }}, "key_properties": ["name"] } 2 | {"type": "RECORD", "stream": "users", "record": {"name": 1}} 3 | -------------------------------------------------------------------------------- /tests/empty_key_properties.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "test_empty_key_properties", "key_properties": [], "schema": {"type": "object", "properties": {"name": {"type": "string"}}}} 2 | {"type": "RECORD", "stream": "test_empty_key_properties", "record": {"name": "Mike"}} 3 | -------------------------------------------------------------------------------- /tests/gate_mocks.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | def mock_unparsable_response_body_200(requests_sent): 4 | class FakeResponse: 5 | def __init__(self, requests_sent): 6 | self.requests_sent = requests_sent 7 | 8 | async def json(self): 9 | self.status = 200 10 | raise Exception("bad json response") 11 | 12 | return FakeResponse(requests_sent) 13 | 14 | def mock_in_order_all_200(requests_sent): 15 | class FakeResponse: 16 | def __init__(self, requests_sent): 17 | self.requests_sent = requests_sent 18 | 19 | async def json(self): 20 | self.status = 200 21 | await asyncio.sleep(0) 22 | return {"status" : "finished request {}".format(requests_sent)} 23 | 24 | return FakeResponse(requests_sent) 25 | 26 | def mock_out_of_order_all_200(requests_sent): 27 | class FakeResponse: 28 | def __init__(self, requests_sent): 29 | self.requests_sent = requests_sent 30 | 31 | async def json(self): 32 | self.status = 200 33 | if self.requests_sent == 1: 34 | await asyncio.sleep(3) 35 | return {"status" : "finished request {}".format(requests_sent)} 36 | 37 | return FakeResponse(requests_sent) 38 | 39 | def mock_in_order_first_errors(requests_sent): 40 | class FakeResponse: 41 | def __init__(self, requests_sent): 42 | self.requests_sent = requests_sent 43 | 44 | async def json(self): 45 | if (self.requests_sent == 1): 46 | self.status = 400 47 | return {"status" : "finished request {}".format(requests_sent)} 48 | 49 | self.status = 200 50 | return {"status" : "finished request {}".format(requests_sent)} 51 | 52 | return FakeResponse(requests_sent) 53 | 54 | def mock_in_order_second_errors(requests_sent): 55 | class FakeResponse: 56 | def __init__(self, requests_sent): 57 | self.requests_sent = requests_sent 58 | 59 | async def json(self): 60 | if (self.requests_sent == 2): 61 | 
self.status = 400 62 | return {"status" : "finished request {}".format(requests_sent)} 63 | 64 | self.status = 200 65 | return {"status" : "finished request {}".format(requests_sent)} 66 | 67 | return FakeResponse(requests_sent) 68 | 69 | def mock_out_of_order_first_errors(requests_sent): 70 | class FakeResponse: 71 | def __init__(self, requests_sent): 72 | self.requests_sent = requests_sent 73 | 74 | async def json(self): 75 | if (self.requests_sent == 1): 76 | self.status = 400 77 | await asyncio.sleep(3) 78 | return {"status" : "finished request {}".format(requests_sent)} 79 | 80 | self.status = 200 81 | return {"status" : "finished request {}".format(requests_sent)} 82 | 83 | return FakeResponse(requests_sent) 84 | 85 | def mock_out_of_order_second_errors(requests_sent): 86 | class FakeResponse: 87 | def __init__(self, requests_sent): 88 | self.requests_sent = requests_sent 89 | 90 | async def json(self): 91 | if (self.requests_sent == 1): 92 | self.status = 200 93 | await asyncio.sleep(3) 94 | return {"status" : "finished request {}".format(requests_sent)} 95 | 96 | self.status = 400 97 | return {"status" : "finished request {}".format(requests_sent)} 98 | 99 | return FakeResponse(requests_sent) 100 | 101 | def mock_out_of_order_both_error(requests_sent): 102 | class FakeResponse: 103 | def __init__(self, requests_sent): 104 | self.requests_sent = requests_sent 105 | 106 | async def json(self): 107 | self.status = 400 108 | if (self.requests_sent == 1): 109 | await asyncio.sleep(10) 110 | return {"status" : "finished request {}".format(requests_sent)} 111 | 112 | return {"status" : "finished request {}".format(requests_sent)} 113 | 114 | return FakeResponse(requests_sent) 115 | 116 | 117 | def mock_in_order_both_error(requests_sent): 118 | class FakeResponse: 119 | def __init__(self, requests_sent): 120 | self.requests_sent = requests_sent 121 | 122 | async def json(self): 123 | self.status = 400 124 | return {"status" : "finished request {}".format(requests_sent)} 125 | 126 | return FakeResponse(requests_sent) 127 | -------------------------------------------------------------------------------- /tests/integration_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import singer 3 | import target_stitch 4 | from target_stitch import StitchHandler, TargetStitchException, finish_requests 5 | import io 6 | import os 7 | import json 8 | import asyncio 9 | import simplejson 10 | import collections 11 | import time 12 | from decimal import Decimal 13 | try: 14 | from tests.gate_mocks import ( 15 | mock_in_order_all_200, 16 | mock_out_of_order_all_200, 17 | mock_in_order_first_errors, 18 | mock_in_order_second_errors, 19 | mock_out_of_order_first_errors, 20 | mock_out_of_order_second_errors, 21 | mock_out_of_order_both_error, 22 | mock_in_order_both_error, 23 | mock_unparsable_response_body_200, 24 | ) 25 | except ImportError: 26 | from gate_mocks import ( 27 | mock_in_order_all_200, 28 | mock_out_of_order_all_200, 29 | mock_in_order_first_errors, 30 | mock_in_order_second_errors, 31 | mock_out_of_order_first_errors, 32 | mock_out_of_order_second_errors, 33 | mock_out_of_order_both_error, 34 | mock_in_order_both_error, 35 | mock_unparsable_response_body_200, 36 | ) 37 | 38 | from nose.tools import nottest 39 | 40 | LOGGER = singer.get_logger().getChild('target_stitch') 41 | 42 | def fake_check_send_exception(): 43 | return None 44 | 45 | def load_sample_lines(filename): 46 | with open('tests/' + filename) as fp: 47 | return 
[line for line in fp] 48 | 49 | class FakePost: 50 | def __init__(self, requests_sent, makeFakeResponse): 51 | self.requests_sent = requests_sent 52 | self.makeFakeResponse = makeFakeResponse 53 | 54 | async def __aenter__(self): 55 | return self.makeFakeResponse(self.requests_sent) 56 | 57 | async def __aexit__(self, exc_type, exc, tb): 58 | await asyncio.sleep(1) 59 | 60 | class FakeSession: 61 | def __init__(self, makeFakeResponse): 62 | self.requests_sent = 0 63 | self.urls = [] 64 | self.messages_sent = [] 65 | self.bodies_sent = [] 66 | self.makeFakeResponse = makeFakeResponse 67 | 68 | def post(self, url, *, data, **kwargs): 69 | data_json = simplejson.loads(data) 70 | self.messages_sent.append(data_json["messages"]) 71 | self.requests_sent = self.requests_sent + 1 72 | self.bodies_sent.append(data) 73 | self.urls.append(url) 74 | return FakePost(self.requests_sent, self.makeFakeResponse) 75 | 76 | 77 | class AsyncSerializeFloats(unittest.TestCase): 78 | def setUp(self): 79 | token = None 80 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 81 | 82 | self.out = io.StringIO() 83 | self.target_stitch = target_stitch.TargetStitch( 84 | [handler], self.out, 4000000, 2, 100000) 85 | self.queue = [simplejson.dumps({"type": "SCHEMA", "stream": "chicken_stream", 86 | "key_properties": ["my_float"], 87 | "schema": {"type": "object", 88 | "properties": {"my_float": {"type": "number"}}}})] 89 | target_stitch.SEND_EXCEPTION = None 90 | target_stitch.PENDING_REQUESTS = [] 91 | 92 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 93 | target_stitch.SEND_EXCEPTION, 94 | target_stitch.PENDING_REQUESTS) 95 | 96 | target_stitch.CONFIG = { 97 | 'token': "some-token", 98 | 'client_id': "some-client", 99 | 'disable_collection': True, 100 | 'connection_ns': "some-ns", 101 | 'batch_size_preferences' : { 102 | 'full_table_streams' : [], 103 | 'batch_size_preference': None, 104 | 'user_batch_size_preference': None, 105 | }, 106 | 'turbo_boost_factor' : 10, 107 | 'small_batch_url' : "http://small-batch", 108 | 'big_batch_url' : "http://big-batch", 109 | } 110 | 111 | 112 | def test_serialize_floats(self): 113 | floats = [ 114 | '-9999999999999999.9999999999999999999999', 115 | '-7187498962233394.3739812942138415666763', 116 | '9273972760690975.2044306442955715221042', 117 | '29515565286974.1188802122612813004366', 118 | '9176089101347578.2596296292040288441238', 119 | '-8416853039392703.306423225471199148379', 120 | '1285266411314091.3002668125515694162268', 121 | '6051872750342125.3812886238958681227336', 122 | '-1132031605459408.5571559429308939781468', 123 | '-6387836755056303.0038029604189860431045', 124 | '4526059300505414' 125 | ] 126 | 127 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 128 | for float_val in floats: 129 | self.queue.append(simplejson.dumps({"type": "RECORD", 130 | "stream": "chicken_stream", 131 | "record": {"my_float": Decimal(float_val)}})) 132 | 133 | 134 | self.queue.append(simplejson.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"my_float": Decimal(float_val) }}}})) 135 | 136 | self.target_stitch.consume(self.queue) 137 | finish_requests() 138 | 139 | output_record_floats = [] 140 | for batch in target_stitch.OUR_SESSION.bodies_sent: 141 | output_record_floats.extend([str(x['data']['my_float']) for x in simplejson.loads(batch, use_decimal=True)['messages']]) 142 | 143 | self.assertEqual(floats, output_record_floats) 144 | 145 | emitted_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), 
self.out.getvalue().strip().split('\n'))) 146 | self.assertEqual(len(emitted_state), 6) 147 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[0])}}}) 148 | self.assertEqual( emitted_state[1], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[2])}}}) 149 | self.assertEqual( emitted_state[2], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[4])}}}) 150 | self.assertEqual( emitted_state[3], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[6])}}}) 151 | self.assertEqual( emitted_state[4], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[8])}}}) 152 | self.assertEqual( emitted_state[5], {'bookmarks': {'chicken_stream': {'my_float': Decimal(floats[10])}}}) 153 | 154 | 155 | class AsyncPushToGate(unittest.TestCase): 156 | def setUp(self): 157 | token = None 158 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 159 | 160 | self.og_check_send_exception = target_stitch.check_send_exception 161 | self.out = io.StringIO() 162 | self.target_stitch = target_stitch.TargetStitch( 163 | [handler], self.out, 4000000, 2, 100000) 164 | self.queue = [json.dumps({"type": "SCHEMA", "stream": "chicken_stream", 165 | "key_properties": ["id"], 166 | "schema": {"type": "object", 167 | "properties": {"id": {"type": "integer"}, 168 | "name": {"type": "string"}}}})] 169 | 170 | target_stitch.SEND_EXCEPTION = None 171 | for f,s in target_stitch.PENDING_REQUESTS: 172 | try: 173 | f.cancel() 174 | except: 175 | pass 176 | 177 | target_stitch.PENDING_REQUESTS = [] 178 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 179 | target_stitch.SEND_EXCEPTION, 180 | target_stitch.PENDING_REQUESTS) 181 | 182 | target_stitch.CONFIG ={ 183 | 'token': "some-token", 184 | 'client_id': "some-client", 185 | 'disable_collection': True, 186 | 'connection_ns': "some-ns", 187 | 'batch_size_preferences' : { 188 | 'full_table_streams' : [], 189 | 'batch_size_preference': None, 190 | 'user_batch_size_preference': None, 191 | }, 192 | 'turbo_boost_factor' : 10, 193 | 'small_batch_url' : "http://small-batch", 194 | 'big_batch_url' : "http://big-batch", 195 | } 196 | 197 | # 2 requests 198 | # both with state 199 | # in order responses 200 | def test_requests_in_order(self): 201 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 202 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 203 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 204 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 205 | #will flush here after 2 records 206 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 207 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 208 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 209 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 210 | #will flush here after 2 records 211 | 212 | self.target_stitch.consume(self.queue) 213 | finish_requests() 214 | 215 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 216 | self.assertEqual(len(emitted_state), 2) 217 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 1}}}) 218 | 
self.assertEqual( emitted_state[1], {'bookmarks': {'chicken_stream': {'id': 3}}}) 219 | 220 | def test_request_to_big_batch_for_large_record(self): 221 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 222 | self.target_stitch.max_batch_records = 4 223 | self.target_stitch.handlers[0].max_batch_records = 4 224 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "M" * 5000000}})) 225 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 226 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 227 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 228 | #will flush here after 4 records 229 | 230 | self.target_stitch.consume(self.queue) 231 | finish_requests() 232 | self.assertEqual(target_stitch.OUR_SESSION.urls, [target_stitch.CONFIG["big_batch_url"], 233 | target_stitch.CONFIG["small_batch_url"]]) 234 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent[0]), 1) 235 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent[1]), 3) 236 | 237 | # 2 requests 238 | # last SENT request has state 239 | # in order 240 | def test_requests_in_order_first_has_no_state(self): 241 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 242 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 243 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 244 | #will flush here after 2 records 245 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 246 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 247 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 248 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 249 | #will flush here after 2 records 250 | 251 | self.target_stitch.consume(self.queue) 252 | finish_requests() 253 | 254 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 255 | self.assertEqual(len(emitted_state), 1) 256 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 3}}}) 257 | 258 | 259 | # 2 requests. 260 | # both with state. 
261 | # in order 262 | # first sent request errors 263 | def test_requests_in_order_first_errors(self): 264 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_first_errors) 265 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 266 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 267 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 268 | #will flush here after 2 records 269 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 270 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 271 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 272 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 273 | #will flush here after 2 records 274 | 275 | #consume() can encounter an exception via check_send_exception in send() 276 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 277 | target_stitch.check_send_exception = fake_check_send_exception 278 | self.target_stitch.consume(self.queue) 279 | target_stitch.check_send_exception = self.og_check_send_exception 280 | our_exception = None 281 | try: 282 | finish_requests() 283 | except Exception as ex: 284 | our_exception = ex 285 | 286 | self.assertIsNotNone(our_exception) 287 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 288 | 289 | #no state is emitted 290 | emitted_state = self.assertEqual(self.out.getvalue(), '') 291 | 292 | # 2 requests. 293 | # both with state. 294 | # in order 295 | # second SENT request errors 296 | def test_requests_in_order_second_errors(self): 297 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_second_errors) 298 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 299 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 300 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 301 | #will flush here after 2 records 302 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 303 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 304 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 305 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 306 | #will flush here after 2 records 307 | 308 | #consume() can encounter an exception via check_send_exception in send() 309 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 
310 | target_stitch.check_send_exception = fake_check_send_exception 311 | self.target_stitch.consume(self.queue) 312 | target_stitch.check_send_exception = self.og_check_send_exception 313 | 314 | our_exception = None 315 | try: 316 | finish_requests() 317 | except Exception as ex: 318 | our_exception = ex 319 | 320 | self.assertIsNotNone(our_exception) 321 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 322 | 323 | emitted_state = self.out.getvalue().strip().split('\n') 324 | self.assertEqual(1, len(emitted_state)) 325 | self.assertEqual({'bookmarks': {'chicken_stream': {'id': 1}}}, json.loads(emitted_state[0])) 326 | 327 | # 2 requests. 328 | # both with state. 329 | # in order 330 | # both requests errors 331 | def test_requests_in_order_both_errors(self): 332 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_both_error) 333 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 334 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 335 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 336 | #will flush here after 2 records 337 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 338 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 339 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 340 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 341 | #will flush here after 2 records 342 | 343 | #consume() can encounter an exception via check_send_exception in send() 344 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 345 | target_stitch.check_send_exception = fake_check_send_exception 346 | self.target_stitch.consume(self.queue) 347 | target_stitch.check_send_exception = self.og_check_send_exception 348 | our_exception = None 349 | try: 350 | finish_requests() 351 | except Exception as ex: 352 | our_exception = ex 353 | 354 | self.assertIsNotNone(our_exception) 355 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 356 | 357 | #no state is emitted 358 | self.assertEqual(self.out.getvalue(), '') 359 | 360 | 361 | 362 | 363 | # 2 requests 364 | # both with state. 
365 | # out of order responses 366 | def test_requests_out_of_order(self): 367 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_all_200) 368 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 369 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 370 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 371 | #will flush here after 2 records 372 | 373 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 374 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 375 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 376 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 377 | #will flush here after 2 records 378 | 379 | self.target_stitch.consume(self.queue) 380 | finish_requests() 381 | 382 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 383 | self.assertEqual(len(emitted_state), 2) 384 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 1}}}) 385 | self.assertEqual( emitted_state[1], {'bookmarks': {'chicken_stream': {'id': 3}}}) 386 | 387 | # 2 requests. 388 | # both with state. 389 | # out of order 390 | # first SENT request errors 391 | def test_requests_out_of_order_first_errors(self): 392 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_first_errors) 393 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 394 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 395 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 396 | #will flush here after 2 records 397 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 398 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 399 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 400 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 401 | #will flush here after 2 records 402 | 403 | #consume() can encounter an exception via check_send_exception in send() 404 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 405 | target_stitch.check_send_exception = fake_check_send_exception 406 | self.target_stitch.consume(self.queue) 407 | target_stitch.check_send_exception = self.og_check_send_exception 408 | our_exception = None 409 | try: 410 | finish_requests() 411 | except Exception as ex: 412 | our_exception = ex 413 | 414 | self.assertIsNotNone(our_exception) 415 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 416 | 417 | #no state is emitted 418 | self.assertEqual(self.out.getvalue(), '') 419 | 420 | # 2 requests. 421 | # both with state. 
422 | # out of order 423 | # second SENT request errors 424 | def out_of_order_second_errors(self, requests_sent): 425 | class FakeResponse: 426 | def __init__(self, requests_sent): 427 | self.requests_sent = requests_sent 428 | 429 | async def json(self): 430 | if (self.requests_sent == 1): 431 | self.status = 200 432 | await asyncio.sleep(3) 433 | return {"status" : "finished request {}".format(requests_sent)} 434 | 435 | self.status = 400 436 | return {"status" : "finished request {}".format(requests_sent)} 437 | 438 | return FakeResponse(requests_sent) 439 | 440 | def test_requests_out_of_order_second_errors(self): 441 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_second_errors) 442 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 443 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 444 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 445 | #will flush here after 2 records 446 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 447 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 448 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 449 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 450 | #will flush here after 2 records 451 | 452 | #consume() can encounter an exception via check_send_exception in send() 453 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 454 | target_stitch.check_send_exception = fake_check_send_exception 455 | self.target_stitch.consume(self.queue) 456 | target_stitch.check_send_exception = self.og_check_send_exception 457 | our_exception = None 458 | try: 459 | finish_requests() 460 | except Exception as ex: 461 | our_exception = ex 462 | 463 | #the 2nd request returns immediately with a 400, triggering a TargetStitchException. 464 | #at this point, it is game over and it does NOT matter when or with what status the 1st request completes 465 | self.assertIsNotNone(our_exception) 466 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 467 | 468 | emitted_state = self.out.getvalue().strip().split('\n') 469 | self.assertEqual(1, len(emitted_state)) 470 | self.assertEqual('', emitted_state[0]) 471 | 472 | # 2 requests. 473 | # both with state. 
474 | # out of order 475 | # both requests errors 476 | def test_requests_out_of_order_both_errors(self): 477 | target_stitch.OUR_SESSION = FakeSession(mock_out_of_order_both_error) 478 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 479 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 480 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 481 | #will flush here after 2 records 482 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 2 }}}})) 483 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 484 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 3 }}}})) 485 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Cathy"}})) 486 | #will flush here after 2 records 487 | 488 | #consume() can encounter an exception via check_send_exception in send() 489 | #if SEND_EXCEPTION has already been set by the coroutine it can blow up. 490 | target_stitch.check_send_exception = fake_check_send_exception 491 | self.target_stitch.consume(self.queue) 492 | target_stitch.check_send_exception = self.og_check_send_exception 493 | our_exception = None 494 | try: 495 | finish_requests() 496 | except Exception as ex: 497 | our_exception = ex 498 | 499 | self.assertIsNotNone(our_exception) 500 | self.assertTrue(isinstance(our_exception, TargetStitchException)) 501 | 502 | #no state is emitted 503 | self.assertEqual(self.out.getvalue(), '') 504 | 505 | def test_unparseable_json_response(self): 506 | target_stitch.OUR_SESSION = FakeSession(mock_unparsable_response_body_200) 507 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 508 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 509 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 510 | #will flush here after 2 records 511 | 512 | target_stitch.check_send_exception = fake_check_send_exception 513 | self.target_stitch.consume(self.queue) 514 | target_stitch.check_send_exception = self.og_check_send_exception 515 | try: 516 | finish_requests() 517 | except Exception as ex: 518 | our_exception = ex 519 | 520 | self.assertIsNotNone(our_exception) 521 | 522 | 523 | class StateOnly(unittest.TestCase): 524 | def setUp(self): 525 | token = None 526 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 527 | self.og_check_send_exception = target_stitch.check_send_exception 528 | self.out = io.StringIO() 529 | self.target_stitch = target_stitch.TargetStitch( 530 | [handler], self.out, 4000000, 1, 0) 531 | self.queue = [] 532 | target_stitch.SEND_EXCEPTION = None 533 | for f,s in target_stitch.PENDING_REQUESTS: 534 | try: 535 | f.cancel() 536 | except: 537 | pass 538 | 539 | target_stitch.PENDING_REQUESTS = [] 540 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 541 | target_stitch.SEND_EXCEPTION, 542 | target_stitch.PENDING_REQUESTS) 543 | target_stitch.CONFIG ={ 544 | 'token': "some-token", 545 | 'client_id': "some-client", 546 | 'disable_collection': True, 547 | 'connection_ns': "some-ns", 548 | 'batch_size_preferences' : { 549 | 'full_table_streams' : [], 550 
| 'batch_size_preference': None, 551 | 'user_batch_size_preference': None, 552 | }, 553 | 'turbo_boost_factor' : 10, 554 | 'small_batch_url' : "http://small-batch", 555 | 'big_batch_url' : "http://big-batch", 556 | } 557 | 558 | def test_state_only(self): 559 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 560 | self.queue.append(json.dumps({"type":"STATE", "value":{"bookmarks":{"chicken_stream":{"id": 1 }}}})) 561 | #will flush here, because TargetStitch.time_last_batch_sent was set to 0 in setUp 562 | self.target_stitch.consume(self.queue) 563 | finish_requests() 564 | 565 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 566 | self.assertEqual(len(emitted_state), 1) 567 | self.assertEqual( emitted_state[0], {'bookmarks': {'chicken_stream': {'id': 1}}}) 568 | 569 | 570 | class StateEdgeCases(unittest.TestCase): 571 | def setUp(self): 572 | token = None 573 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 2) 574 | self.out = io.StringIO() 575 | self.target_stitch = target_stitch.TargetStitch( 576 | [handler], self.out, 4000000, 2, 100000) 577 | self.queue = [simplejson.dumps({"type": "SCHEMA", "stream": "chicken_stream", 578 | "key_properties": ["my_float"], 579 | "schema": {"type": "object", 580 | "properties": {"my_float": {"type": "number"}}}})] 581 | target_stitch.SEND_EXCEPTION = None 582 | target_stitch.PENDING_REQUESTS = [] 583 | 584 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 585 | target_stitch.SEND_EXCEPTION, 586 | target_stitch.PENDING_REQUESTS) 587 | 588 | target_stitch.CONFIG ={ 589 | 'token': "some-token", 590 | 'client_id': "some-client", 591 | 'disable_collection': True, 592 | 'connection_ns': "some-ns", 593 | 'batch_size_preferences' : { 594 | 'full_table_streams' : [], 595 | 'batch_size_preference': None, 596 | 'user_batch_size_preference': None, 597 | }, 598 | 'turbo_boost_factor' : 10, 599 | 'small_batch_url' : "http://small-batch", 600 | 'big_batch_url' : "http://big-batch", 601 | } 602 | 603 | 604 | def test_trailing_state_after_final_message(self): 605 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 606 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 607 | self.queue.append(json.dumps({"type":"STATE", 608 | "value":{"bookmarks":{"chicken_stream":{"id": 1 }}, 609 | 'currently_syncing' : 'chicken_stream'}})) 610 | 611 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 612 | #will flush here after 2 records 613 | self.queue.append(json.dumps({"type":"STATE", 614 | "value":{"bookmarks":{"chicken_stream":{"id": 2 }}, 615 | 'currently_syncing' : None}})) 616 | 617 | self.target_stitch.consume(self.queue) 618 | finish_requests() 619 | 620 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 621 | self.assertEqual(len(emitted_state), 2) 622 | self.assertEqual( emitted_state[0], 623 | {"bookmarks":{"chicken_stream":{"id": 1 }}, 624 | 'currently_syncing' : 'chicken_stream'}) 625 | self.assertEqual( emitted_state[1], 626 | {"bookmarks":{"chicken_stream":{"id": 2 }}, 627 | 'currently_syncing' : None}) 628 | 629 | def test_will_not_output_empty_state(self): 630 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 631 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 632 | self.queue.append(json.dumps({"type":"STATE", 633 | 
"value":{"bookmarks":{"chicken_stream":{"id": 1 }}, 634 | 'currently_syncing' : 'chicken_stream'}})) 635 | 636 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2, "name": "Paul"}})) 637 | #will flush here after 2 records, state will reset to None 638 | 639 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Kyle"}})) 640 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 4, "name": "Alice"}})) 641 | #will flush here after 2 records, but will NOT write blank state 642 | 643 | self.target_stitch.consume(self.queue) 644 | finish_requests() 645 | 646 | emitted_state = list(map(json.loads, self.out.getvalue().strip().split('\n'))) 647 | self.assertEqual(len(emitted_state), 1) 648 | self.assertEqual( emitted_state[0], 649 | {"bookmarks":{"chicken_stream":{"id": 1 }}, 650 | 'currently_syncing' : 'chicken_stream'}) 651 | 652 | class BufferingPerStreamConstraints(unittest.TestCase): 653 | def setUp(self): 654 | self.maxDiff = None 655 | token = None 656 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 3) 657 | 658 | self.og_check_send_exception = target_stitch.check_send_exception 659 | self.out = io.StringIO() 660 | self.target_stitch = target_stitch.TargetStitch( 661 | [handler], self.out, 500, 7, 100000) 662 | self.queue = [json.dumps({"type": "SCHEMA", "stream": "chicken_stream", 663 | "key_properties": ["id"], 664 | "schema": {"type": "object", 665 | "properties": {"id": {"type": "integer"}, 666 | "name": {"type": "string"}}}}), 667 | json.dumps({"type": "SCHEMA", "stream": "zebra_stream", 668 | "key_properties": ["id"], 669 | "schema": {"type": "object", 670 | "properties": {"id": {"type": "integer"}, 671 | "name": {"type": "string"}}}})] 672 | 673 | target_stitch.SEND_EXCEPTION = None 674 | for f,s in target_stitch.PENDING_REQUESTS: 675 | try: 676 | f.cancel() 677 | except: 678 | pass 679 | 680 | target_stitch.PENDING_REQUESTS = [] 681 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 682 | target_stitch.SEND_EXCEPTION, 683 | target_stitch.PENDING_REQUESTS) 684 | 685 | target_stitch.CONFIG ={ 686 | 'token': "some-token", 687 | 'client_id': "some-client", 688 | 'disable_collection': True, 689 | 'connection_ns': "some-ns", 690 | 'batch_size_preferences' : { 691 | 'full_table_streams' : [], 692 | 'batch_size_preference': None, 693 | 'user_batch_size_preference': None, 694 | }, 695 | 'turbo_boost_factor' : 10, 696 | 'small_batch_url' : "http://small-batch", 697 | 'big_batch_url' : "http://big-batch", 698 | } 699 | 700 | def test_flush_based_on_message_count(self): 701 | # Tests that the target will buffer records per stream. 
This will 702 | # allow the tap to alternate which streams it is emitting records 703 | # for without the target cutting small batches 704 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 705 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 706 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2, "name": "Paul"}})) 707 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 708 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 4, "name": "Cathy"}})) 709 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 5, "name": "Dan"}})) 710 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 6, "name": "A"}})) 711 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 7, "name": "B"}})) 712 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 7}, 713 | "zebra_stream": {"id": 6}}}})) 714 | # Should flush here 715 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 8, "name": "C"}})) 716 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 9, "name": "D"}})) 717 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 10, "name": "E"}})) 718 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 10}, 719 | "zebra_stream": {"id": 8}}}})) 720 | # Should flush here 721 | 722 | self.target_stitch.consume(self.queue) 723 | finish_requests() 724 | 725 | expected_messages = [ 726 | [{'action': 'upsert', 727 | 'data': {'id': 8, 'name': 'C'}}], 728 | [{'action': 'upsert', 729 | 'data': {'id': 9, 'name': 'D'}}, 730 | {'action': 'upsert', 731 | 'data': {'id': 10, 'name': 'E'}}], 732 | [{'action': 'upsert', 733 | 'data': {'id': 2, 'name': 'Paul'}}, 734 | {'action': 'upsert', 735 | 'data': {'id': 4, 'name': 'Cathy'}}, 736 | {'action': 'upsert', 737 | 'data': {'id': 6, 'name': 'A'}}], 738 | [{'action': 'upsert', 739 | 'data': {'id': 1, 'name': 'Mike'}}, 740 | {'action': 'upsert', 741 | 'data': {'id': 3, 'name': 'Harrsion'}}, 742 | {'action': 'upsert', 743 | 'data': {'id': 5, 'name': 'Dan'}}, 744 | {'action': 'upsert', 745 | 'data': {'id': 7, 'name': 'B'}},]] 746 | 747 | expected_state = [{"bookmarks": {"zebra_stream": {"id": 8}, "chicken_stream": {"id": 10}}}] 748 | 749 | # Should be broken into 4 batches 750 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 4) 751 | 752 | # Sort by length and remove sequence number to compare directly 753 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 754 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 755 | 756 | actual_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), self.out.getvalue().strip().split('\n'))) 757 | 758 | self.assertEqual(actual_messages, expected_messages) 759 | self.assertEqual(actual_state, expected_state) 760 | 761 | 762 | def test_flush_based_on_bytes(self): 763 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 764 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 765 | self.queue.append(json.dumps({"type": "RECORD", 
"stream": "zebra_stream", "record": {"id": 2, "name": "Paul"}})) 766 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 767 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 4, "name": "The byte limit should be across streams, so lets make lots of data on both streams"}})) 768 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 5, "name": "to force the target to exceed its byte limit"}})) 769 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 6, "name": "A"}})) 770 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 7, "name": "B"}})) 771 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 7}, 772 | "zebra_stream": {"id": 6}}}})) 773 | # Should flush here 774 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 8, "name": "C"}})) 775 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 9, "name": "D"}})) 776 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 10, "name": "E"}})) 777 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 10}, 778 | "zebra_stream": {"id": 8}}}})) 779 | # Should flush here 780 | 781 | self.target_stitch.consume(self.queue) 782 | finish_requests() 783 | 784 | expected_messages = [ 785 | [{'action': 'upsert', 'data': {'id': 1, 'name': 'Mike'}}, 786 | {'action': 'upsert', 'data': {'id': 3, 'name': 'Harrsion'}}, 787 | {'action': 'upsert', 788 | 'data': {'id': 5, 789 | 'name': 'to force the target to exceed its byte limit'}}], 790 | [{'action': 'upsert', 'data': {'id': 2, 'name': 'Paul'}}, 791 | {'action': 'upsert', 'data': {'id': 4, 'name': 'The byte limit should be across streams, so lets make lots of data on both streams'}}], 792 | [{'action': 'upsert', 'data': {'id': 6, 'name': 'A'}}, 793 | {'action': 'upsert', 'data': {'id': 8, 'name': 'C'}}], 794 | [{'action': 'upsert', 'data': {'id': 7, 'name': 'B'}}, 795 | {'action': 'upsert', 'data': {'id': 9, 'name': 'D'}}, 796 | {'action': 'upsert', 'data': {'id': 10, 'name': 'E'}}]] 797 | 798 | 799 | expected_state = [{"bookmarks": {"zebra_stream": {"id": 8}, "chicken_stream": {"id": 10}}}] 800 | 801 | # Should be broken into 4 batches 802 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 4) 803 | 804 | # Sort by length and remove sequence number to compare directly 805 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 806 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: ms[0]['data']['id'])] 807 | 808 | actual_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), self.out.getvalue().strip().split('\n'))) 809 | 810 | self.assertEqual(actual_messages, expected_messages) 811 | self.assertEqual(actual_state, expected_state) 812 | 813 | 814 | def test_state_works_when_streams_with_no_messages(self): 815 | # Test that target_stitch will emit state messages for a stream 816 | # even if the final stream in self.messages does not contain any 817 | # messages 818 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 819 | self.target_stitch.messages = collections.OrderedDict(self.target_stitch.messages) 820 | 821 | self.queue.append(json.dumps({ 822 | "type": "SCHEMA", 823 | "stream": 
"lion_stream", 824 | "key_properties": ["id"], 825 | "schema": {"type": "object", 826 | "properties": {"id": {"type": "integer"}, 827 | "name": {"type": "string"}}}})) 828 | 829 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1, "name": "Mike"}})) 830 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2, "name": "Paul"}})) 831 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 3, "name": "Harrsion"}})) 832 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 4, "name": "Cathy"}})) 833 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 5, "name": "Dan"}})) 834 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 6, "name": "A"}})) 835 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 7, "name": "B"}})) 836 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 7}, 837 | "zebra_stream": {"id": 6}}}})) 838 | # Should flush here 839 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 8, "name": "C"}})) 840 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 9, "name": "D"}})) 841 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 10, "name": "E"}})) 842 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 10}, 843 | "zebra_stream": {"id": 8}}}})) 844 | # Should flush here 845 | 846 | self.target_stitch.consume(self.queue) 847 | finish_requests() 848 | 849 | expected_messages = [ 850 | [{'action': 'upsert', 851 | 'data': {'id': 8, 'name': 'C'}}], 852 | [{'action': 'upsert', 853 | 'data': {'id': 9, 'name': 'D'}}, 854 | {'action': 'upsert', 855 | 'data': {'id': 10, 'name': 'E'}}], 856 | [{'action': 'upsert', 857 | 'data': {'id': 2, 'name': 'Paul'}}, 858 | {'action': 'upsert', 859 | 'data': {'id': 4, 'name': 'Cathy'}}, 860 | {'action': 'upsert', 861 | 'data': {'id': 6, 'name': 'A'}}], 862 | [{'action': 'upsert', 863 | 'data': {'id': 1, 'name': 'Mike'}}, 864 | {'action': 'upsert', 865 | 'data': {'id': 3, 'name': 'Harrsion'}}, 866 | {'action': 'upsert', 867 | 'data': {'id': 5, 'name': 'Dan'}}, 868 | {'action': 'upsert', 869 | 'data': {'id': 7, 'name': 'B'}},]] 870 | 871 | expected_state = [{"bookmarks": {"zebra_stream": {"id": 8}, "chicken_stream": {"id": 10}}}] 872 | 873 | # Should be broken into 4 batches 874 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 4) 875 | 876 | # Sort by length and remove sequence number to compare directly 877 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 878 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 879 | 880 | actual_state = list(map(lambda x: simplejson.loads(x, use_decimal=True), self.out.getvalue().strip().split('\n'))) 881 | 882 | self.assertEqual(actual_messages, expected_messages) 883 | self.assertEqual(actual_state, expected_state) 884 | 885 | 886 | class BufferingPerStreamNoStateOnFailure(unittest.TestCase): 887 | def setUp(self): 888 | time.sleep(20) 889 | self.maxDiff = None 890 | token = None 891 | handler = StitchHandler(target_stitch.DEFAULT_MAX_BATCH_BYTES, 3) 892 | 893 | # Swap out the post_coroutine with a mocked one to fake failures 894 | 
self.actual_post_coroutine = target_stitch.post_coroutine 895 | target_stitch.post_coroutine = self.mock_post_coroutine 896 | 897 | self.messages_sent = 0 898 | 899 | self.og_check_send_exception = target_stitch.check_send_exception 900 | self.out = io.StringIO() 901 | self.target_stitch = target_stitch.TargetStitch( 902 | [handler], self.out, 4000000, 10, 100000) 903 | self.queue = [json.dumps({"type": "SCHEMA", "stream": "chicken_stream", 904 | "key_properties": ["id"], 905 | "schema": {"type": "object", 906 | "properties": {"id": {"type": "integer"}}}}), 907 | json.dumps({"type": "SCHEMA", "stream": "zebra_stream", 908 | "key_properties": ["id"], 909 | "schema": {"type": "object", 910 | "properties": {"id": {"type": "integer"}}}}), 911 | json.dumps({"type": "SCHEMA", "stream": "dog_stream", 912 | "key_properties": ["id"], 913 | "schema": {"type": "object", 914 | "properties": {"id": {"type": "integer"}}}})] 915 | 916 | target_stitch.SEND_EXCEPTION = None 917 | for f,s in target_stitch.PENDING_REQUESTS: 918 | try: 919 | f.cancel() 920 | except: 921 | pass 922 | 923 | target_stitch.PENDING_REQUESTS = [] 924 | LOGGER.info("cleaning SEND_EXCEPTIONS: %s AND PENDING_REQUESTS: %s", 925 | target_stitch.SEND_EXCEPTION, 926 | target_stitch.PENDING_REQUESTS) 927 | 928 | target_stitch.CONFIG ={ 929 | 'token': "some-token", 930 | 'client_id': "some-client", 931 | 'disable_collection': True, 932 | 'connection_ns': "some-ns", 933 | 'batch_size_preferences' : { 934 | 'full_table_streams' : [], 935 | 'batch_size_preference': None, 936 | 'user_batch_size_preference': None, 937 | }, 938 | 'turbo_boost_factor' : 10, 939 | 'small_batch_url' : "http://small-batch", 940 | 'big_batch_url' : "http://big-batch", 941 | } 942 | 943 | 944 | def tearDown(self): 945 | target_stitch.post_coroutine = self.actual_post_coroutine 946 | 947 | async def mock_post_coroutine(self, url, headers, data, verify_ssl): 948 | LOGGER.info("Sending message number %s", self.messages_sent) 949 | self.messages_sent += 1 950 | if self.messages_sent == self.messages_until_error: 951 | return await self.wait_then_throw() 952 | else: 953 | return await self.actual_post_coroutine(url, headers, data, verify_ssl) 954 | 955 | @staticmethod 956 | async def wait_then_throw(): 957 | await asyncio.sleep(5) 958 | raise target_stitch.StitchClientResponseError(400, "Test exception") 959 | 960 | def test_state_interleaving_works(self): 961 | # Tests that the target will buffer records per stream. 
This will 962 | # allow the tap to alternate which streams it is emitting records 963 | # for without the target cutting small batches 964 | self.messages_until_error = 3 965 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 966 | 967 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1}})) 968 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}}}})) 969 | 970 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 1}})) 971 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}, 972 | "zebra_stream": {"id": 1}}}})) 973 | 974 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 1}})) 975 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}, 976 | "zebra_stream": {"id": 1}, 977 | "dog_stream": {"id": 1}}}})) 978 | 979 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2}})) 980 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 981 | "zebra_stream": {"id": 1}, 982 | "dog_stream": {"id": 1}}}})) 983 | 984 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2}})) 985 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 986 | "zebra_stream": {"id": 2}, 987 | "dog_stream": {"id": 1}}}})) 988 | 989 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 2}})) 990 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 991 | "zebra_stream": {"id": 2}, 992 | "dog_stream": {"id": 2}}}})) 993 | 994 | 995 | self.target_stitch.consume(self.queue) 996 | 997 | try: 998 | finish_requests() 999 | except: 1000 | pass 1001 | 1002 | # There should only be messages for the 2 streams because the 1003 | # third one should fail due to the mocking code 1004 | expected_messages = [[{'action': 'upsert', 'data': {'id': 1}}, 1005 | {'action': 'upsert', 'data': {'id': 2}}], 1006 | [{'action': 'upsert', 'data': {'id': 1}}, 1007 | {'action': 'upsert', 'data': {'id': 2}}]] 1008 | 1009 | expected_state = '' 1010 | 1011 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 2) 1012 | 1013 | # Sort by length and remove sequence number to compare directly 1014 | emitted_state = self.out.getvalue() 1015 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 1016 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 1017 | 1018 | self.assertEqual(actual_messages, expected_messages) 1019 | self.assertEqual(emitted_state, expected_state) 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | def test_state_interleaving_works_with_error_on_first(self): 1026 | '''Test that the target will not emit state if the first stream to be 1027 | batched fails ''' 1028 | 1029 | self.messages_until_error = 1 1030 | target_stitch.OUR_SESSION = FakeSession(mock_in_order_all_200) 1031 | 1032 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 1}})) 1033 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}}}})) 1034 | 1035 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 1}})) 1036 | self.queue.append(json.dumps({"type": "STATE", 
"value": {"bookmarks": {"chicken_stream": {"id": 1}, 1037 | "zebra_stream": {"id": 1}}}})) 1038 | 1039 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 1}})) 1040 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 1}, 1041 | "zebra_stream": {"id": 1}, 1042 | "dog_stream": {"id": 1}}}})) 1043 | 1044 | self.queue.append(json.dumps({"type": "RECORD", "stream": "chicken_stream", "record": {"id": 2}})) 1045 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 1046 | "zebra_stream": {"id": 1}, 1047 | "dog_stream": {"id": 1}}}})) 1048 | 1049 | self.queue.append(json.dumps({"type": "RECORD", "stream": "zebra_stream", "record": {"id": 2}})) 1050 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 1051 | "zebra_stream": {"id": 2}, 1052 | "dog_stream": {"id": 1}}}})) 1053 | 1054 | self.queue.append(json.dumps({"type": "RECORD", "stream": "dog_stream", "record": {"id": 2}})) 1055 | self.queue.append(json.dumps({"type": "STATE", "value": {"bookmarks": {"chicken_stream": {"id": 2}, 1056 | "zebra_stream": {"id": 2}, 1057 | "dog_stream": {"id": 2}}}})) 1058 | 1059 | 1060 | self.target_stitch.consume(self.queue) 1061 | 1062 | try: 1063 | finish_requests() 1064 | except: 1065 | pass 1066 | 1067 | # There should only be messages for the 2 streams because the 1068 | # third one should fail due to the mocking code 1069 | expected_messages = [[{'action': 'upsert', 'data': {'id': 1}}, 1070 | {'action': 'upsert', 'data': {'id': 2}}], 1071 | [{'action': 'upsert', 'data': {'id': 1}}, 1072 | {'action': 'upsert', 'data': {'id': 2}}]] 1073 | 1074 | expected_state = '' 1075 | 1076 | # Should be broken into 2 batches (because the third fails) 1077 | self.assertEqual(len(target_stitch.OUR_SESSION.messages_sent), 2) 1078 | 1079 | # Sort by length and remove sequence number to compare directly 1080 | emitted_state = self.out.getvalue() 1081 | actual_messages = [[{key: m[key] for key in ["action","data"]} for m in ms] 1082 | for ms in sorted(target_stitch.OUR_SESSION.messages_sent, key=lambda ms: len(ms))] 1083 | 1084 | self.assertEqual(actual_messages, expected_messages) 1085 | self.assertEqual(emitted_state, expected_state) 1086 | 1087 | 1088 | 1089 | if __name__== "__main__": 1090 | test1 = StateEdgeCases() 1091 | test1.setUp() 1092 | test1.test_will_not_output_empty_state() 1093 | # test1.test_requests_in_order() 1094 | -------------------------------------------------------------------------------- /tests/record_missing_key_property.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "test_record_missing_key_property", "key_properties": ["id"], "schema": {"type": "object", "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}} 2 | {"type": "RECORD", "stream": "test_record_missing_key_property", "record": {"name": "Mike"}} 3 | -------------------------------------------------------------------------------- /tests/test_target_stitch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import target_stitch 3 | import json 4 | import io 5 | import mock 6 | import sys 7 | import datetime 8 | import pytz 9 | import jsonschema 10 | import simplejson 11 | import decimal 12 | import re 13 | import time 14 | 15 | from decimal import Decimal 16 | from jsonschema import ValidationError, Draft4Validator, 
validators, FormatChecker 17 | from singer import ActivateVersionMessage, RecordMessage, utils, parse_message 18 | 19 | 20 | class DummyClient(object): 21 | 22 | def __init__(self): 23 | self.batches = [] 24 | 25 | def handle_batch(self, messages, contains_activate_version, schema, key_names, bookmark_names, state_writer, state): 26 | self.batches.append( 27 | {'messages': messages, 28 | 'schema': schema, 29 | 'key_names': key_names, 30 | 'bookmark_names': bookmark_names}) 31 | 32 | def message_queue(messages): 33 | return [json.dumps(m) for m in messages] 34 | 35 | def persist_all(recs): 36 | with DummyClient() as client: 37 | target_stitch.persist_lines(client, message_lines(recs)) 38 | return client.messages 39 | 40 | 41 | def state(i): 42 | return {"type": "STATE", "value": i} 43 | def record(i): 44 | return {"type": "RECORD", "stream": "foo", "record": {"i": i}} 45 | 46 | schema = {"type": "SCHEMA", 47 | "stream": "foo", 48 | "key_properties": ["i"], 49 | "schema": {"properties": {"i": {"type": "integer"}}} 50 | } 51 | 52 | def load_sample_lines(filename): 53 | with open('tests/' + filename) as fp: 54 | return [line for line in fp] 55 | 56 | 57 | class TestTargetStitch(unittest.TestCase): 58 | 59 | def setUp(self): 60 | self.client = DummyClient() 61 | self.out = io.StringIO() 62 | self.target_stitch = target_stitch.TargetStitch( 63 | [self.client], self.out, 4000000, 20000, 100000) 64 | 65 | def test_persist_lines_fails_without_key_properties(self): 66 | recs = [ 67 | {"type": "SCHEMA", 68 | "stream": "users", 69 | "schema": { 70 | "properties": { 71 | "id": {"type": "integer"}, 72 | "name": {"type": "string"}}}}] 73 | 74 | with self.assertRaises(Exception): 75 | target_stitch.consume(message_queue(recs)) 76 | 77 | def test_persist_lines_works_with_empty_key_properties(self): 78 | queue = load_sample_lines('empty_key_properties.json') 79 | self.target_stitch.consume(queue) 80 | self.assertEqual(len(self.client.batches), 1) 81 | self.assertEqual(self.client.batches[0]['key_names'], []) 82 | 83 | 84 | def test_persist_lines_sets_key_names(self): 85 | inputs = [ 86 | {"type": "SCHEMA", 87 | "stream": "users", 88 | "key_properties": ["id"], 89 | "schema": { 90 | "properties": { 91 | "id": {"type": "integer"}, 92 | "name": {"type": "string"}}}}, 93 | {"type": "RECORD", 94 | "stream": "users", 95 | "record": {"id": 1, "name": "mike"}}] 96 | 97 | self.target_stitch.consume(message_queue(inputs)) 98 | self.assertEqual(len(self.client.batches), 1) 99 | batch = self.client.batches[0] 100 | self.assertEqual( 101 | batch['schema'], 102 | { 103 | "properties": { 104 | "id": {"type": "integer"}, 105 | "name": {"type": "string"} 106 | } 107 | } 108 | ) 109 | 110 | self.assertEqual(batch['key_names'], ['id']) 111 | 112 | def test_persist_last_state_when_stream_ends_with_record(self): 113 | self.target_stitch.max_batch_records = 3 114 | inputs = [ 115 | schema, 116 | record(0), state(0), record(1), state(1), record(2), 117 | # flush state 1 118 | state(2), record(3), state(3), record(4), state(4), record(5), 119 | # flush state 4 120 | record(6), 121 | record(7), 122 | record(8), 123 | # flush empty states 124 | state(8), 125 | record(9), 126 | state(9), 127 | record(10)] 128 | 129 | self.target_stitch.consume(message_queue(inputs)) 130 | 131 | expected = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]] 132 | got = [[r.record['i'] for r in batch['messages']] for batch in self.client.batches] 133 | self.assertEqual(got, expected) 134 | 135 | def test_persist_last_state_when_stream_ends_with_state(self): 136 
| self.target_stitch.max_batch_records = 3 137 | inputs = [ 138 | schema, 139 | record(0), state(0), record(1), state(1), record(2), 140 | # flush state 1 141 | state(2), record(3), state(3), record(4), state(4), record(5), 142 | # flush state 4 143 | record(6), 144 | record(7), 145 | record(8), 146 | # flush empty states 147 | state(8), 148 | record(9), 149 | state(9), 150 | record(10), 151 | state(10)] 152 | 153 | self.target_stitch.consume(message_queue(inputs)) 154 | 155 | 156 | expected = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]] 157 | got = [[r.record['i'] for r in batch['messages']] for batch in self.client.batches] 158 | self.assertEqual(got, expected) 159 | 160 | def test_time_triggered_persist(self): 161 | self.target_stitch.batch_delay_seconds = -1 162 | self.target_stitch.max_batch_records = 10000 163 | inputs = [ 164 | schema, 165 | record(0), 166 | record(1), 167 | record(2)] 168 | self.target_stitch.consume(message_queue(inputs)) 169 | expected = [[0], [1], [2]] 170 | got = [[r.record['i'] for r in batch['messages']] for batch in self.client.batches] 171 | self.assertEqual(got, expected) 172 | 173 | def test_persist_lines_updates_schema(self): 174 | inputs = [ 175 | {"type": "SCHEMA", 176 | "stream": "users", 177 | "key_properties": ["id"], 178 | "schema": { 179 | "properties": { 180 | "id": {"type": "integer"}, 181 | "name": {"type": "string"}}}}, 182 | {"type": "RECORD", 183 | "stream": "users", 184 | "record": {"id": 1, "name": "mike"}}, 185 | {"type": "SCHEMA", 186 | "stream": "users", 187 | "key_properties": ["id"], 188 | "schema": { 189 | "properties": { 190 | "id": {"type": "string"}, 191 | "name": {"type": "string"}}}}, 192 | {"type": "RECORD", 193 | "stream": "users", 194 | "record": {"id": "1", "name": "mike"}}] 195 | 196 | self.target_stitch.consume(message_queue(inputs)) 197 | 198 | self.assertEqual(len(self.client.batches), 2) 199 | self.assertEqual(self.client.batches[0]['key_names'], ['id']) 200 | self.assertEqual(self.client.batches[0]['schema']['properties']['id']['type'], 'integer') 201 | self.assertEqual(self.client.batches[1]['schema']['properties']['id']['type'], 'string') 202 | 203 | def test_versioned_stream(self): 204 | queue = load_sample_lines('versioned_stream.json') 205 | self.target_stitch.consume(queue) 206 | 207 | batches = self.client.batches 208 | self.assertEqual(2, len(batches)) 209 | self.assertEqual(1, batches[0]['messages'][0].version) 210 | self.assertEqual(2, batches[1]['messages'][0].version) 211 | 212 | class TestSerialize(unittest.TestCase): 213 | 214 | def setUp(self): 215 | self.schema = { 216 | 'type': 'object', 217 | 'properties': { 218 | 'id': {'type': 'integer'}, 219 | 'color': {'type': 'string'} 220 | } 221 | } 222 | 223 | self.colors = ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet'] 224 | self.key_names = ['id'] 225 | self.bookmark_names = ['updated_at'] 226 | 227 | self.records = [{'id': i, 'color': color, 'updated_at': utils.strftime(utils.now())} 228 | for i, color in enumerate(self.colors)] 229 | self.messages = [RecordMessage(stream='colors', record=r) for r in self.records] 230 | self.messages.append(ActivateVersionMessage(stream='colors', version=1)) 231 | 232 | def serialize_with_limit(self, limit): 233 | return target_stitch.serialize(self.messages, self.schema, self.key_names, self.bookmark_names, limit, target_stitch.DEFAULT_MAX_BATCH_RECORDS) 234 | 235 | def unpack_colors(self, request_bodies): 236 | colors = [] 237 | for body in request_bodies: 238 | loaded = json.loads(body) 239 | for message 
in loaded['messages']: 240 | action = message['action'] 241 | if action == 'upsert': 242 | colors.append((action, message['data']['color'])) 243 | else: 244 | colors.append((action)) 245 | return colors 246 | 247 | def test_splits_batches(self): 248 | self.assertEqual(1, len(self.serialize_with_limit(2000))) 249 | self.assertEqual(2, len(self.serialize_with_limit(1000))) 250 | self.assertEqual(4, len(self.serialize_with_limit(500))) 251 | self.assertEqual(8, len(self.serialize_with_limit(385))) 252 | 253 | def test_raises_if_cant_stay_in_limit(self): 254 | data = 'a' * 21000000 255 | message = RecordMessage(stream='colors', record=data) 256 | with self.assertRaisesRegex(target_stitch.BatchTooLargeException, re.compile('the Stitch API limit of 20 Mb')): 257 | target_stitch.serialize([message], self.schema, self.key_names, self.bookmark_names, 4000000, target_stitch.DEFAULT_MAX_BATCH_RECORDS) 258 | 259 | def test_does_not_drop_records(self): 260 | expected = [ 261 | ('upsert', 'red'), 262 | ('upsert', 'orange'), 263 | ('upsert', 'yellow'), 264 | ('upsert', 'green'), 265 | ('upsert', 'blue'), 266 | ('upsert', 'indigo'), 267 | ('upsert', 'violet'), 268 | ('activate_version')] 269 | 270 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(2000))) 271 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(1000))) 272 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(500))) 273 | self.assertEqual(expected, self.unpack_colors(self.serialize_with_limit(385))) 274 | 275 | def test_serialize_time_extracted(self): 276 | """ Test that we're not corrupting timestamps with cross platform parsing. (Test case for OSX, specifically) """ 277 | expected = "1970-01-01T03:45:23.000000Z" 278 | test_time = datetime.datetime(1970, 1, 1, 3, 45, 23, tzinfo=pytz.utc) 279 | 280 | record = [RecordMessage("greetings",'{greeting: "hi"}', time_extracted=test_time)] 281 | schema = '{"type": "object", "properties": {"greeting": {"type": "string"}}}' 282 | batch = target_stitch.serialize(record, schema, [], [], 1000, target_stitch.DEFAULT_MAX_BATCH_RECORDS)[0] 283 | actual = json.loads(batch)["messages"][0]["time_extracted"] 284 | 285 | self.assertEqual(expected, actual) 286 | 287 | 288 | def create_raw_record(self, value): 289 | return '{"value": ' + value + '}' 290 | 291 | def create_raw_record_message(self,raw_record): 292 | return '{"type": "RECORD", "stream": "test", "record": ' + raw_record + '}' 293 | 294 | class TestDetermineStitchUrl(unittest.TestCase): 295 | def test_full_table_stream(self): 296 | big_batch_url = 'https://bigbatches.org' 297 | small_batch_url = 'https://smallbatch.mil' 298 | target_stitch.CONFIG = {'batch_size_preferences' : 299 | {'full_table_streams' : ['chickens'], 300 | 'batch_size_preference' : None, 301 | 'user_batch_size_preference' : None 302 | }, 303 | 'small_batch_url' : small_batch_url, 304 | 'big_batch_url' : big_batch_url} 305 | 306 | self.assertEqual(target_stitch.determine_stitch_url('chickens'), big_batch_url) 307 | 308 | def test_incremental_stream(self): 309 | big_batch_url = 'https://bigbatches.org' 310 | small_batch_url = 'https://smallbatch.mil' 311 | target_stitch.CONFIG = {'batch_size_preferences' : 312 | {'full_table_streams' : [], 313 | 'batch_size_preference' : None, 314 | 'user_batch_size_preference' : None 315 | }, 316 | 'small_batch_url' : small_batch_url, 317 | 'big_batch_url' : big_batch_url} 318 | 319 | self.assertEqual(target_stitch.determine_stitch_url('chickens'), small_batch_url) 320 | 321 | def 
test_big_batch_preference(self): 322 | big_batch_url = 'https://bigbatches.org' 323 | small_batch_url = 'https://smallbatch.mil' 324 | target_stitch.CONFIG = {'batch_size_preferences' : 325 | {'full_table_streams' : [], 326 | 'batch_size_preference' : 'bigbatch', 327 | 'user_batch_size_preference' : None 328 | }, 329 | 'small_batch_url' : small_batch_url, 330 | 'big_batch_url' : big_batch_url} 331 | 332 | self.assertEqual(target_stitch.determine_stitch_url('chickens'), big_batch_url) 333 | 334 | class TestSequenceNumbers(unittest.TestCase): 335 | def setUp(self): 336 | # NB: This is the historical width of the sequence number integer 337 | # - Generally, it's a combination of (timestamp + padded_row_index) for 19 digits 338 | # - This should be increased/decreased with care to prevent downstream issues 339 | self.STANDARD_SEQ_LENGTH = 19 340 | 341 | def test_generate_sequence_normal_batch(self): 342 | # Call with a sleep, to simulate the normal case (no ms collisions) 343 | seq1 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 344 | time.sleep(0.1) 345 | seq2 = target_stitch.generate_sequence(10,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 346 | time.sleep(0.1) 347 | seq3 = target_stitch.generate_sequence(999,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 348 | time.sleep(0.1) 349 | 350 | generated_seqs = [seq1,seq2,seq3] 351 | # Assert number's width for downstream 352 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 353 | # Assert they are all at least increasing 354 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 355 | # Assert no collisions 356 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 357 | 358 | def test_generate_sequence_single_record_batches(self): 359 | # Call without sleep and same message_num to create collisions reliably 360 | # This is the situation where multiple single record batches get cut in succession 361 | seq1 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 362 | seq2 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 363 | seq3 = target_stitch.generate_sequence(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 364 | 365 | generated_seqs = [seq1,seq2,seq3] 366 | 367 | # Assert number's width for downstream 368 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 369 | # Assert they are all at least increasing 370 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 371 | # Assert no collisions 372 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 373 | 374 | def test_generate_sequence_max_batch(self): 375 | # Call with an overshot max batch to ensure no duplication 376 | # - The target can consume more than max_batch before cutting a batch 377 | # - It should tolerate an order of magnitude greater records without repeat or extending the width 378 | max_batch = range(target_stitch.DEFAULT_MAX_BATCH_RECORDS * 10) 379 | 380 | generated_seqs = [target_stitch.generate_sequence(i,target_stitch.DEFAULT_MAX_BATCH_RECORDS) 381 | for i in max_batch] 382 | 383 | # Assert number's width for downstream 384 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 385 | # Assert they are all at least increasing 386 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 387 | # Assert no collisions 388 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 389 | 390 | 391 | def test_generate_sequence_mixed_case(self): 392 | # Call with varying lengths of 
batches to ensure the widths mix 393 | regular_batch = [(i,target_stitch.DEFAULT_MAX_BATCH_RECORDS) for i in range(100)] 394 | single_record_batch = [(0,target_stitch.DEFAULT_MAX_BATCH_RECORDS)] 395 | 396 | test_case = (single_record_batch + 397 | regular_batch + 398 | single_record_batch + 399 | single_record_batch + 400 | single_record_batch + 401 | regular_batch + 402 | single_record_batch) 403 | generated_seqs = [target_stitch.generate_sequence(*values) for values in test_case] 404 | 405 | # Assert number's width for downstream 406 | [self.assertEqual(len(str(s)), self.STANDARD_SEQ_LENGTH) for s in generated_seqs] 407 | # Assert they are all at least increasing 408 | self.assertEqual(generated_seqs, sorted(generated_seqs)) 409 | # Assert no collisions 410 | self.assertEqual(len(generated_seqs), len(set(generated_seqs))) 411 | 412 | 413 | if __name__== "__main__": 414 | test1 = TestSerialize() 415 | test1.setUp() 416 | test1.test_raises_if_cant_stay_in_limit() 417 | -------------------------------------------------------------------------------- /tests/versioned_stream.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "users", "key_properties": ["id"], "schema": {"type": "object", "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}} 2 | {"type": "RECORD", "stream": "users", "version": 1, "record": {"id": 1, "name": "Sam"}} 3 | {"type": "RECORD", "stream": "users", "version": 1, "record": {"id": 2, "name": "Pat"}} 4 | {"type": "RECORD", "stream": "users", "version": 1, "record": {"id": 3, "name": "Alex"}} 5 | {"type": "ACTIVATE_VERSION", "stream": "users", "version": 1} 6 | {"type": "RECORD", "stream": "users", "version": 2, "record": {"id": 1, "name": "Samantha"}} 7 | {"type": "RECORD", "stream": "users", "version": 2, "record": {"id": 2, "name": "Patrick"}} 8 | {"type": "ACTIVATE_VERSION", "stream": "users", "version": 2} 9 | --------------------------------------------------------------------------------