├── .circleci ├── config.yml └── integration │ ├── tap-github │ ├── catalog.json │ └── config-template.json │ ├── tap-postgres │ └── config.json │ └── target-config.json ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── DECISIONS.md ├── LICENSE ├── README.md ├── docker-compose.yml ├── docker-entrypoint.sh ├── docs └── TableMetadata.md ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── target_postgres ├── __init__.py ├── denest.py ├── exceptions.py ├── json_schema.py ├── postgres.py ├── singer.py ├── singer_stream.py ├── sql_base.py ├── stream_tracker.py └── target_tools.py └── tests ├── conftest.py ├── migrations ├── data │ └── tap ├── scripts │ ├── install_schema_versions.sh │ ├── to_latest.sh │ └── to_target.sh └── test_migrations.py ├── unit ├── test_BufferedSingerStream.py ├── test_denest.py ├── test_json_schema.py ├── test_postgres.py ├── test_sandbox.py └── test_target_tools.py └── utils └── fixtures.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | filters: &filters 4 | filters: 5 | tags: 6 | only: /^v[0-9]+(\.[0-9]+)*$/ 7 | 8 | filters__tags: &filters__tags 9 | filters: 10 | branches: 11 | ignore: /.*/ 12 | tags: 13 | only: /^v[0-9]+(\.[0-9]+)*$/ 14 | 15 | workflows: 16 | test: 17 | jobs: 18 | - cache: 19 | <<: *filters 20 | - build: 21 | <<: *filters 22 | requires: 23 | - cache 24 | - test--15: 25 | <<: *filters 26 | requires: 27 | - cache 28 | - test--14: 29 | <<: *filters 30 | requires: 31 | - cache 32 | - test--13: 33 | <<: *filters 34 | requires: 35 | - cache 36 | - test--12: 37 | <<: *filters 38 | requires: 39 | - cache 40 | - test--11: 41 | <<: *filters 42 | requires: 43 | - cache 44 | - test--10: 45 | <<: *filters 46 | requires: 47 | - cache 48 | - test--migrations: 49 | <<: *filters 50 | requires: 51 | - cache 52 | - test--tap-github: 53 | <<: *filters 54 | requires: 55 | - test--12 56 | - test-release: 57 | <<: *filters__tags 58 | requires: 59 | - test--tap-github 60 | - test--migrations 61 | - build 62 | - approve-release: 63 | <<: *filters__tags 64 | type: approval 65 | requires: 66 | - test-release 67 | - release: 68 | <<: *filters__tags 69 | requires: 70 | - approve-release 71 | 72 | cache: &cache deps-v7-{{ checksum "poetry.lock" }}-{{ checksum ".circleci/config.yml" }} 73 | 74 | py: &py python:3.7.15-bullseye 75 | 76 | restore__cache: &restore__cache 77 | restore_cache: 78 | keys: 79 | - *cache 80 | 81 | # Simple checkout command to pull external forks. 82 | # The CircleCI util does not work without setting up SSH keys 83 | # which we technically do not need for open-source repos. 84 | checkout_command: &checkout_command 85 | run: 86 | name: checkout 87 | command: | 88 | git clone https://github.com/datamill-co/target-postgres . 
89 | if [[ "$CIRCLE_BRANCH" =~ ^pull\/* ]]; then 90 | git fetch origin refs/pull/${CIRCLE_PR_NUMBER}/head 91 | git checkout ${CIRCLE_SHA1} 92 | else 93 | git checkout ${CIRCLE_BRANCH} 94 | fi 95 | 96 | install_poetry: &install_poetry 97 | run: 98 | name: Install poetry 99 | command: | 100 | export POETRY_HOME=/opt/poetry 101 | python -m venv $POETRY_HOME 102 | $POETRY_HOME/bin/pip install -U pip 103 | $POETRY_HOME/bin/pip install poetry==1.2.2 104 | $POETRY_HOME/bin/poetry --version 105 | 106 | test__base: &test__base 107 | working_directory: /code/ 108 | steps: 109 | - *checkout_command 110 | - *restore__cache 111 | - *install_poetry 112 | - attach_workspace: 113 | at: "./" 114 | 115 | - run: 116 | name: Run Tests 117 | command: | 118 | source venv/target-postgres/bin/activate 119 | pytest --verbose tests/unit 120 | environment: 121 | POSTGRES_HOST: localhost 122 | POSTGRES_DATABASE: target_postgres_test 123 | POSTGRES_USERNAME: postgres 124 | POSTGRES_PASSWORD: postgres 125 | 126 | - store_artifacts: 127 | path: target/test-results 128 | destination: raw-test-output 129 | 130 | jobs: 131 | cache: 132 | working_directory: /code/ 133 | docker: 134 | - image: *py 135 | steps: 136 | - *checkout_command 137 | - *restore__cache 138 | - *install_poetry 139 | 140 | - run: 141 | name: Install target-postgres 142 | command: | 143 | python -m venv venv/target-postgres 144 | source venv/target-postgres/bin/activate 145 | pip install -U pip 146 | /opt/poetry/bin/poetry install --with tests 147 | deactivate 148 | 149 | - run: 150 | name: Install older versions of target-postgres for migration testing 151 | command: ./tests/migrations/scripts/install_schema_versions.sh 152 | 153 | - run: 154 | name: Install tap-github 155 | command: | 156 | python -m venv venv/tap-github 157 | source venv/tap-github/bin/activate 158 | pip install -U pip 159 | pip install git+https://github.com/MeltanoLabs/tap-github.git@v1.1.0 160 | deactivate 161 | 162 | - run: 163 | name: Install tap-postgres 164 | command: | 165 | python -m venv venv/tap-postgres 166 | source venv/tap-postgres/bin/activate 167 | pip install -U pip 168 | pip install tap-postgres 169 | deactivate 170 | 171 | - save_cache: 172 | key: *cache 173 | paths: 174 | - "./venv" 175 | - "/usr/local/bin" 176 | - "/usr/local/lib/python3.7/site-packages" 177 | - "/opt/poetry" 178 | 179 | - persist_to_workspace: 180 | root: "./" 181 | paths: 182 | - "./venv" 183 | 184 | test--15: 185 | <<: *test__base 186 | docker: 187 | - image: *py 188 | - image: postgres:15.0 189 | environment: 190 | POSTGRES_DB: target_postgres_test 191 | POSTGRES_PASSWORD: postgres 192 | 193 | test--14: 194 | <<: *test__base 195 | docker: 196 | - image: *py 197 | - image: postgres:14.5 198 | environment: 199 | POSTGRES_DB: target_postgres_test 200 | POSTGRES_PASSWORD: postgres 201 | 202 | test--13: 203 | <<: *test__base 204 | docker: 205 | - image: *py 206 | - image: postgres:13.8 207 | environment: 208 | POSTGRES_DB: target_postgres_test 209 | POSTGRES_PASSWORD: postgres 210 | 211 | test--12: 212 | <<: *test__base 213 | docker: 214 | - image: *py 215 | - image: postgres:12.12 216 | environment: 217 | POSTGRES_DB: target_postgres_test 218 | POSTGRES_PASSWORD: postgres 219 | 220 | test--11: 221 | <<: *test__base 222 | docker: 223 | - image: *py 224 | - image: postgres:11.17-bullseye 225 | environment: 226 | POSTGRES_DB: target_postgres_test 227 | POSTGRES_PASSWORD: postgres 228 | 229 | test--10: 230 | <<: *test__base 231 | docker: 232 | - image: *py 233 | - image: postgres:10.22-bullseye 234 | 
environment: 235 | POSTGRES_DB: target_postgres_test 236 | POSTGRES_PASSWORD: postgres 237 | 238 | test--tap-github: 239 | working_directory: /code/ 240 | docker: 241 | - image: *py 242 | - image: postgres:12.12 243 | environment: 244 | POSTGRES_DB: target_postgres_test 245 | POSTGRES_PASSWORD: postgres 246 | steps: 247 | - *checkout_command 248 | - *restore__cache 249 | - attach_workspace: 250 | at: "./" 251 | 252 | - run: 253 | name: Setup artifacts folder 254 | command: mkdir -p /code/artifacts/data 255 | 256 | - run: 257 | name: Setup tap-github 258 | working_directory: /code/.circleci/integration/tap-github 259 | command: sed "s/REPLACE_ME/$TAP_GITHUB_TOKEN/" config-template.json > config.json 260 | 261 | - run: 262 | name: Tap -> Data 263 | command: | 264 | source venv/tap-github/bin/activate 265 | cd /code/.circleci/integration/tap-github 266 | 267 | tap-github --config config.json --catalog catalog.json > /code/artifacts/data/tap 268 | 269 | deactivate 270 | 271 | - run: 272 | name: Data -> Target 273 | command: | 274 | source venv/target-postgres/bin/activate 275 | pip install -U pip 276 | /opt/poetry/bin/poetry install 277 | cd /code/.circleci/integration 278 | 279 | cat /code/artifacts/data/tap | target-postgres --config target-config.json 280 | 281 | deactivate 282 | 283 | - run: 284 | name: Target -> Data 285 | command: | 286 | source venv/tap-postgres/bin/activate 287 | cd /code/.circleci/integration/tap-postgres 288 | 289 | tap-postgres --config config.json --discover > tmp-properties.json 290 | 291 | ## Select _every_ table found in properties. 292 | ## row-count seems to only show up inside of the necessary metadata object...easier than multi-line-sed 293 | sed 's/"row-count": 0,/"row-count": 0,"selected":true,/g' tmp-properties.json > /code/artifacts/data/properties.json 294 | 295 | tap-postgres --config config.json --properties /code/artifacts/data/properties.json > /code/artifacts/data/target 296 | 297 | deactivate 298 | 299 | - run: 300 | name: Repeatability of Data -> Target 301 | command: | 302 | source venv/target-postgres/bin/activate 303 | pip install -U pip 304 | pip install . 305 | cd /code/.circleci/integration 306 | 307 | cat /code/artifacts/data/tap | target-postgres --config target-config.json 308 | 309 | deactivate 310 | 311 | cd /code/ 312 | 313 | source venv/tap-postgres/bin/activate 314 | cd /code/.circleci/integration/tap-postgres 315 | 316 | tap-postgres --config config.json --discover > tmp-properties.json 317 | 318 | ## Select _every_ table found in properties. 
319 | ## row-count seems to only show up inside of the necessary metadata object...easier than multi-line-sed 320 | sed 's/"row-count": 0,/"row-count": 0,"selected":true,/g' tmp-properties.json > /code/artifacts/data/properties.json 321 | 322 | tap-postgres --config config.json --properties /code/artifacts/data/properties.json > /code/artifacts/data/target.repeated 323 | 324 | deactivate 325 | 326 | ## TODO: compare repeated data to insure that we only changed _sdc values 327 | # diff /code/artifacts/data/target /code/artifacts/data/target.repeated 328 | 329 | - store_artifacts: 330 | path: /code/artifacts 331 | 332 | test--migrations: 333 | working_directory: /code/ 334 | docker: 335 | - image: *py 336 | - image: postgres:12.12 337 | environment: 338 | POSTGRES_DB: target_postgres_test 339 | POSTGRES_PASSWORD: postgres 340 | steps: 341 | - *checkout_command 342 | - *restore__cache 343 | - *install_poetry 344 | - attach_workspace: 345 | at: "./" 346 | 347 | - run: 348 | name: Run Tests 349 | command: | 350 | source venv/target-postgres/bin/activate 351 | pytest --verbose tests/migrations 352 | environment: 353 | POSTGRES_HOST: localhost 354 | POSTGRES_DATABASE: target_postgres_test 355 | POSTGRES_USERNAME: postgres 356 | POSTGRES_PASSWORD: postgres 357 | 358 | - store_artifacts: 359 | path: /code/tests/migrations/artifacts 360 | destination: raw-test-output 361 | 362 | build: 363 | working_directory: /code/ 364 | docker: 365 | - image: *py 366 | steps: 367 | - *checkout_command 368 | - *restore__cache 369 | - attach_workspace: 370 | at: "./" 371 | - run: 372 | name: Build distribution 373 | command: | 374 | source venv/target-postgres/bin/activate 375 | 376 | pip install -U pip 377 | pip install --upgrade setuptools wheel twine 378 | 379 | /opt/poetry/bin/poetry build 380 | 381 | deactivate 382 | 383 | - persist_to_workspace: 384 | root: "./" 385 | paths: 386 | - "./dist" 387 | 388 | test-release: 389 | working_directory: /code/ 390 | docker: 391 | - image: *py 392 | steps: 393 | - *checkout_command 394 | - *restore__cache 395 | - attach_workspace: 396 | at: "./" 397 | - run: 398 | name: Validate tag 399 | command: | 400 | export TAG=`echo $CIRCLE_TAG | sed 's/v//'` 401 | VERSION=`grep version pyproject.toml | sed 's/^.*version = "\(.*\)",.*$/\1/'` 402 | 403 | echo tag: $TAG equals version: $VERSION '?' 
404 | 405 | [[ $TAG == $VERSION ]] 406 | - run: 407 | name: Install upload tools 408 | command: pip install --upgrade twine 409 | - run: 410 | name: Test Publish 411 | environment: 412 | TWINE_USERNAME: datamill 413 | TWINE_REPOSITORY_URL: https://test.pypi.org/legacy/ 414 | command: | 415 | export TWINE_PASSWORD=$PYPI__PASSWORD__TEST 416 | twine upload ./dist/* 417 | 418 | release: 419 | working_directory: /code/ 420 | docker: 421 | - image: *py 422 | steps: 423 | - *checkout_command 424 | - *restore__cache 425 | - attach_workspace: 426 | at: "./" 427 | - run: 428 | name: Install upload tools 429 | command: pip install --upgrade twine 430 | - run: 431 | name: Publish 432 | environment: 433 | TWINE_USERNAME: datamill 434 | command: | 435 | export TWINE_PASSWORD=$PYPI__PASSWORD 436 | twine upload ./dist/* 437 | -------------------------------------------------------------------------------- /.circleci/integration/tap-github/config-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_date": "2022-04-12", 3 | "repositories": ["datamill-co/target-postgres"] 4 | } 5 | -------------------------------------------------------------------------------- /.circleci/integration/tap-postgres/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "dbname": "target_postgres_test", 3 | "host": "localhost", 4 | "port": "5432", 5 | "user": "postgres", 6 | "password": null, 7 | "default_replication_method": "FULL_TABLE" 8 | } -------------------------------------------------------------------------------- /.circleci/integration/target-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "postgres_database": "target_postgres_test", 3 | "postgres_username": "postgres" 4 | } 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .pytest_cache/ 92 | 93 | # Mac 94 | .DS_Store 95 | ._* 96 | 97 | # Singer files 98 | *.txt 99 | 100 | /venv--* 101 | /venv 102 | /tests/migrations/artifacts 103 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.2.4 4 | 5 | - **BUG FIX:** `multipleOf` validation 6 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/179) 7 | - Due to floating point errors in Python and JSONSchema, `multipleOf` 8 | validation has been failing. 9 | 10 | ## 0.2.3 11 | 12 | - **FEATURES:** 13 | - [`JSONSchema: anyOf` Support](https://github.com/datamill-co/target-postgres/pull/155) 14 | - Streamed `JSONSchema`s which include `anyOf` combinations should now be fully supported 15 | - This allows for full support of Stitch/Singer's `DateTime` string fallbacks. 16 | - [`JSONSchema`: allOf` Support](https://github.com/datamill-co/target-postgres/pull/154) 17 | - Streamed `JSONSchema`s which include `allOf` combinations should now be fully supported 18 | - Columns are persisted as normal. 19 | - This is _perceived_ to be most useful for merging objects, and putting in place things like `maxLength` etc. 20 | - **BUG FIX:** Buffer Flushing at frequent intervals/with small batches 21 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/169) 22 | - Buffer _size_ calculations relied upon some "sophisticated" logic for determining the "size" in 23 | memory of a Python object 24 | - The method used by Singer libraries is to simply use the size of the streamed `JSON` blob 25 | - Performance Improvement seen due to batches now being far larger and interactions with the remote 26 | being far fewer. 27 | - **BUG FIX:** `NULLABLE` not being _implied_ when field is missing from streamed `JSONSchema` 28 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/174) 29 | - If a field was persisted in remote, but then left _out_ of a subsequent streamed `JSONSchema`, we would fail 30 | - In this instance, the field is _implied_ to be `NULL`, but additionally, if values _are_ present for it 31 | in the streamed data, we _should_ persist it. 
32 | 33 | ## 0.2.2 34 | 35 | - **FEATURES:** 36 | - [Performance improvement for upserting data](https://github.com/datamill-co/target-postgres/pull/161) 37 | - Saw long running queries for some `SELECT COUNT(1)...` queries 38 | - Resulting in full table scans 39 | - These queries are _only_ being used for `is_table_empty`, therefore we can use a more efficient 40 | `SELECT EXISTS(...)` query which only needs a single row to be fetched 41 | 42 | ## 0.2.1 43 | 44 | - **FEATURES:** 45 | - [Performance improvement for upserting data](https://github.com/datamill-co/target-postgres/pull/152) 46 | - For large or even reasonably sized tables, trying to upsert the data was prohibitively slow 47 | - To mitigate this, we now add indexes to the columns used heavily during upserts 48 | - This change can be opted out of via the `add_upsert_indexes` config option 49 | - **NOTE**: This only affects installations post `0.2.1`, and will not upgrade/migrate existing installations 50 | - Support for latest PostgreSQL 12.0 51 | - PostgreSQL recently released 12.0, and we now have testing around it and can confirm that `target-postgres` 52 | _should_ function correctly for it! 53 | - **BUG FIX:** `STATE` messages being sent at the wrong time 54 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/149) 55 | - `STATE` messages were being output incorrectly for feeds which had many streams outputting at varying rates 56 | 57 | ## 0.2.0 58 | 59 | - **NOTE:** The `minor` version bump is not expected to have much effect on folks. This was done to signal the 60 | output change from the below bug fix. It is our impression not many are using this feature yet anyways. Since 61 | this was _not_ a `patch` change, we decided to make this a `minor` instead of `major` change to raise _less_ 62 | concern. Thank you for your patience! 63 | - **FEATURES:** 64 | - [Performance improvement for creating `tmp` tables necessary for uploading data](https://github.com/datamill-co/target-postgres/pull/147) 65 | - PostgreSQL dialects allow for creating a table identical to a parent table in a single command 66 | - [`CREATE TABLE <new_table> (LIKE <parent_table>);`](https://www.postgresql.org/docs/9.1/sql-createtable.html) 67 | - Previously we leveraged using our `upsert` helpers to create new tables. This resulted in _many_ calls 68 | to remote, of varying complexity. 69 | - **BUG FIX:** No `STATE` Message Wrapper necessary 70 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/142) 71 | - `STATE` messages are formatted as `{"value": ...}` 72 | - `target-postgres` emitted the _full_ message 73 | - The official `singer-target-template` doesn't write out that `value` "wrapper", and just writes 74 | the JSON blob contained in it 75 | - This fix makes `target-postgres` do the same 76 | 77 | ## 0.1.11 78 | 79 | - **BUG FIX:** `canonicalize_identifier` Not called on _all_ identifiers persisted to remote 80 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/144) 81 | - Presently, on column splits/name collisions, we add a suffix to an identifier 82 | - Previously, we did not canonicalize these suffixes 83 | - While this was not an issue for any `targets` currently in production, it was an issue 84 | for some up and coming `targets`.
85 | - This fix simply makes sure to call `canonicalize_identifier` before persisting an identifier to remote 86 | 87 | ## 0.1.10 88 | 89 | - **FEATURES:** 90 | - [Root Table Name Canonicalization](https://github.com/datamill-co/target-postgres/pull/131) 91 | - The `stream` name is used for the value of the root table name in Postgres 92 | - `stream` names are controlled exclusively by the tap and do _not_ have to meet many standards 93 | - Previously, only `stream` names which were lowercase, alphanumeric, etc. 94 | - Now, the `target` can canonicalize the root table name, allowing for the input `stream` name to be 95 | whatever the `tap` provides. 96 | 97 | ## 0.1.9 98 | 99 | - **Singer-Python:** bumped to latest _5.6.1_ 100 | - **Psycopg2:** bumped to latest _2.8.2_ 101 | - **FEATURES:** 102 | - [`STATE` Message support](https://github.com/datamill-co/target-postgres/pull/130) 103 | - Emits message only when all records buffered _before_ the `STATE` message have been persisted to remote. 104 | - [SSL Support for Postgres](https://github.com/datamill-co/target-postgres/pull/124) 105 | - Added config options for enabling/supporting SSL support. 106 | - **BUG FIX:** `ACTIVATE_VERSION` Messages did not flush buffer 107 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/135) 108 | - When we issue an activate version record, we presently do not flush the buffer after writing the batch. This results in more records being written to remote than need to be. 109 | - This results in no functionality change, and should not alleviate any _known_ bugs. 110 | - This should be purely performance related. 111 | 112 | ## 0.1.8 113 | 114 | - **Singer-Python:** bumped to latest 115 | - **Minor housekeeping:** 116 | - Updated container versions to latest 117 | - Updated README to reflect new versions of PostgreSQL Server 118 | 119 | ## 0.1.7 120 | 121 | - **BUG FIX:** A bug was identified for de-nesting. 122 | - [ISSUE LINK](https://github.com/datamill-co/target-postgres/issues/109) 123 | - [FAILING TESTS LINK](https://github.com/datamill-co/target-postgres/pull/110) 124 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/111) 125 | - Subtables with subtables did not serialize column names correctly 126 | - The column names ended up having the _table names_ (paths) prepended on them 127 | - Due to the denested table _schema_ and denested _records_ being different 128 | no information showed up in remote. 129 | - This bug was ultimately tracked down to the core denesting logic. 130 | - This will fix failing uploads which had **_nullable_** columns in subtables but 131 | no data was seen populating those columns. 132 | - The broken schema columns will still remain 133 | - Failing schemas which had **_non-null_** columns in subtables will still be broken 134 | - To fix will require dropping the associated tables, potentially resetting the entire 135 | `db`/`schema` 136 | 137 | ## 0.1.6 138 | 139 | - **BUG FIX:** A bug was identified for path to column serialization. 140 | - [LINK](https://github.com/datamill-co/target-postgres/pull/100) 141 | - A nullable properties which had _multiple_ JSONSchema types 142 | - ie, something like `[null, string, integer ...]` 143 | - Failed to find an appropriate column in remote to persist `None` values to. 
144 | - Found by usage of the [Hubspot Tap](https://github.com/singer-io/tap-hubspot) 145 | 146 | ## 0.1.5 147 | 148 | - **FEATURES:** 149 | - [Added the `persist_empty_tables`](https://github.com/datamill-co/target-postgres/pull/97) config option which allows the Target to create empty tables in Remote. 150 | 151 | ## 0.1.4 152 | 153 | - **BUG FIX:** A bug was identified in 0.1.3 with stream `key_properties` and canonicalization. 154 | - [LINK](https://github.com/datamill-co/target-postgres/pull/95) 155 | - Discovered and fixed by @mirelagrigoras 156 | - If the `key_properties` for a stream changed due to canonicalization, the stream would fail to persist due to: 157 | - the `persist_csv_rows` `key_properties` values would remain un-canonicalized and therefore cause issues once serialized into a SQL statement 158 | - the pre-checks for tables would break because no values could be pulled from the schema with un-canonicalized fields pulled out of the `key_properties` 159 | - **NOTE:** the `key_properties` metadata is saved with _raw_ field names. 160 | 161 | ## 0.1.3 162 | 163 | - **SCHEMA_VERSION: 1** 164 | - [LINK](https://github.com/datamill-co/target-postgres/pull/89) 165 | - Initialized a new field in remote table schemas `schema_version` 166 | - A migration in `PostgresTarget` handles updating this 167 | - **BUG FIX:** A bug was identified in 0.1.2 with column type splitting. 168 | - [LINK](https://github.com/datamill-co/target-postgres/pull/89) 169 | - A schema with a field of type `string` is persisted to remote 170 | - Later, the same field is of type `date-time` 171 | - The values for this field will _not_ be placed under a new column, but rather under the original `string` column 172 | - A schema with a field of type `date-time` is persisted to remote 173 | - Later, the same field is of type `string` 174 | - The original `date-time` column will be made `nullable` 175 | - The values for this field will fail to persist 176 | - **FEATURES:** 177 | - [Added the `logging_level`](https://github.com/datamill-co/target-postgres/pull/92) config option which uses standard Python Logger Levels to configure more details about what Target-Postgres is doing 178 | - Query level logging and timing 179 | - Table schema changes logging and timing 180 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | ## Setting up test environment 4 | 5 | ### Prerequisites 6 | 7 | ``` 8 | python3 -m virtualenv -p python3 venv 9 | source venv/bin/activate 10 | python3 -m pip install -e .[tests] 11 | # python3 -m pip install -e .\[tests\] <- might need to escape on zsh 12 | export POSTGRES_HOST=localhost 13 | export POSTGRES_DATABASE=target_postgres_test 14 | export POSTGRES_USERNAME=target_postgres_test 15 | export POSTGRES_PASSWORD=target_postgres_test 16 | ``` 17 | 18 | #### Database setup 19 | If you're not using the docker images for tests you'll need to set one up and 20 | configure a user on it.
21 | 22 | ``` 23 | $ psql template1; 24 | template1=# CREATE USER target_postgres_test WITH PASSWORD 'target_postgres_test'; 25 | template1=# CREATE DATABASE target_postgres_test WITH owner=target_postgres_test; 26 | template1=# GRANT ALL privileges ON DATABASE target_postgres_test TO target_postgres_test; 27 | ``` 28 | 29 | #### If psycopg2 install fails 30 | 31 | psycopg2 requires SSL and may fail the `pip install` process above 32 | 33 | ##### Installing openssl 34 | 35 | ###### OSX: 36 | 37 | One possible solution is to use [homebrew](https://brew.sh/): 38 | 39 | ``` 40 | brew install openssl@1.1 41 | export LDFLAGS="-L/usr/local/opt/openssl@1.1/lib" 42 | export CPPFLAGS="-I/usr/local/opt/openssl@1.1/include" 43 | python3 -m pip install -e .[tests] 44 | ``` 45 | 46 | ## Running tests 47 | Tests are written using [pytest](https://docs.pytest.org/). 48 | 49 | ``` 50 | cd <root of the checkout> 51 | python3 -m pytest tests/unit 52 | ``` 53 | 54 | Simply run the tests with pytest as a module when inside the root of the 55 | checkout; this ensures the `target_postgres/` module directory is found on the 56 | `PYTHONPATH`. 57 | -------------------------------------------------------------------------------- /DECISIONS.md: -------------------------------------------------------------------------------- 1 | # Decisions 2 | 3 | This document is intended to provide clarity on many of the decisions/rationalizations 4 | which exist inside of [Datamill's](https://datamill.co/) Target SQL project 5 | for [Singer](https://singer.io). 6 | 7 | ## Principles 8 | 9 | The guiding principles we try to adhere to herein as far as _how_ to reach a 10 | conclusion are: 11 | 12 | 1. When possible, make the resulting data/schema in the remote target consistent, no matter the ordering of potential messages 13 | - ie, if our decision would result in a random schema being produced in the remote target for no reasonable benefit, this is in violation 14 | 1. Do right by the common _majority_ of users 15 | 1. Make a best effort to prevent a user from having to intervene 16 | 1. Use [Stitch’s offering and documentation](https://www.stitchdata.com/docs) as best practice guidance 17 | 18 | ## Schema 19 | 20 | ### De-nesting 21 | 22 | #### What 23 | 24 | - [JSON Schema](https://json-schema.org/) allows for complex schemas which have non-literal (ie, compositional) elements 25 | - examples include: 26 | - `objects` (ie, `{'a': 1, 'b': 2 ...}`) 27 | - `array` (ie, `[1, 'a', 2, {4: False}]`) 28 | - `anyOf` 29 | - Standard SQL does not support compositional elements, but rather data which is highly structured in potentially many related tables 30 | - To overcome this, `target-sql` provides tooling which unpacks: 31 | - json `objects` into their parent record 32 | - json `arrays` as sub tables 33 | 34 | ```py 35 | # Stream `FOO` 36 | [ 37 | {'nested_object': { 38 | 'a': 1, 39 | 'b': 2 40 | }, 41 | 'nested_array': [ 42 | {'c': False, 'd': 'abc'}, 43 | {'c': True, 'd': 'xyz'} 44 | ] 45 | } 46 | ] 47 | 48 | 49 | # Results in: 50 | ## Table `foo` 51 | [ 52 | {'nested_object__a': 1, 53 | 'nested_object__b': 2} 54 | ] 55 | 56 | ## Table `foo__nested_array` 57 | [ 58 | {'c': False, 'd': 'abc'}, 59 | {'c': True, 'd': 'xyz'} 60 | ] 61 | 62 | ``` 63 | 64 | #### Why 65 | 66 | - This approach is inspired by what Stitch Data takes with `object`/`array` de-nesting.
67 | - The user experience for those using a SQL querying language is better for flat tables 68 | - as compared to something like [PostgreSQL's JSONB](https://www.postgresql.org/docs/9.4/datatype-json.html) support 69 | - Data warehouses tend to prefer [denormalized](https://en.wikipedia.org/wiki/Denormalization) structures while operational databases prefer normalized structures. We normalize the incoming structure so the user can choose what to do with the normalized raw data. Also it's easy to access and transform later than JSON blobs. 70 | 71 | ### Column Type Mismatch 72 | 73 | #### What 74 | 75 | 1. A field has been streamed to the remote target with type `integer` 76 | 1. A new field with the _same raw name_ as the remote column has been streamed but has type `boolean` 77 | - Data of type `boolean` cannot be placed into a column of type `integer` 78 | 1. `target-sql` has tooling which will: 79 | 1. rename the original column to `original_field_name__i` 80 | 1. make the renamed column `nullable` 81 | 1. create a new column of name `original_field_name__b` 82 | 1. stream new data to `original_field_name__b` 83 | - (to see a full list of type suffixes, please see: [`json_schema._shorthand_mappings`](https://github.com/datamill-co/target-postgres/blob/d626061d7a0e785f06b19589e1951637f2748262/target_postgres/json_schema.py#L283)) 84 | 85 | #### Why 86 | 87 | ***TL;DR:*** Instead of throwing a hard error and forcing users to do some manual 88 | transformation _before_ streaming data through `target-sql`, we chose a "best 89 | effort" approach to resolving the underlying error. 90 | 91 | By renaming and migrating the column we: 92 | 93 | - make the resulting structure in the database the same no matter whether we upload column `integer` _then_ column `boolean` or vice versa. 94 | - users learn of dependent views/columns blocking a type change _early_ 95 | 96 | ### Column Name Collision 97 | 98 | #### What 99 | 100 | 1. Field of name `foo` is streamed 101 | 1. Field of name `FOO` is then streamed 102 | 1. Since both of these names canonicalize to the same result (ie, `foo`), we have a name collision 103 | 1. When attempting to `upsert_table`, `SQLInterface` has to handle name collisions. To do this, it attaches a unique suffix to the name which _caused the collision_, not the original 104 | - The suffix is an auto-incrementing numerical value 105 | 106 | ```py 107 | # Field `foo` is streamed 108 | # Field `FOO` is streamed 109 | 110 | [ 111 | {'foo': 1, 112 | 'FOO': False, 113 | 'fOo': 4.0} 114 | ] 115 | 116 | # The resulting table will be: 117 | 118 | [ 119 | {'foo': 1, 120 | 'foo__1': False, 121 | 'foo__2': 4.0} 122 | ] 123 | 124 | ``` 125 | 126 | #### Why 127 | 128 | ***TL;DR:*** Instead of throwing a hard error and forcing users to do some manual 129 | transformation _before_ streaming data through `target-sql`, we chose a "best 130 | effort" approach to resolving the underlying error. 131 | 132 | - While this means that _ordering_ of fields/actions matters in regards to the final remote structure, users can observe their remote structure simply 133 | - Hashes have been used as suffixes in past, but it was determined that these were too confusing for end users. So while they allowed us to adhere to [principle](#principles) (1), it meant [principle](#principles) (2) was being ignored. 134 | - Additionally, we chose _not to_ prepend a numerical suffix to _all_ columns for the same reason. 
_Most_ users are not going to have name collisions, so instead of making the overall user experience worse, we chose to have a targeted solution to this particular edge case 135 | 136 | ### Column Name Length 137 | 138 | #### What 139 | 140 | - `SQLInterface` provides a single field called `IDENTIFIER_FIELD_LENGTH` which is to be overridden by the implementing class 141 | - Any column which is found to be excess of `IDENTIFIER_FIELD_LENGTH` is truncated to be no longer than `IDENTIFIER_FIELD_LENGTH` 142 | - All `collision` and `type` information is preserved in the truncation 143 | - ie, any values which are suffixed onto the name as `__...` 144 | - All original field/column names are preserved as a `column_mapping` 145 | 146 | #### Why 147 | 148 | ***TL;DR:*** Instead of throwing a hard error and forcing users to do some manual 149 | transformation _before_ streaming data through `target-sql`, we chose a "best 150 | effort" approach to resolving the underlying error. 151 | 152 | Most (all?) SQL targets we have encountered have length restrictions for identifiers 153 | in their schema. Since arbitrary JSON _does_ not have this same restriction, we needed 154 | a best effort mechanism for handling names which were either auto-generated and are 155 | too long, or user input fields which physically cannot fit into the remote target. 156 | 157 | As such, we chose to take the simplest method here for clarity. ie, truncate the 158 | original/generated name, and then proceed with collision support as normal. 159 | 160 | The implementing class is tasked with providing `canonicalize_identifier`, a method 161 | which when called is expected to _only_ transform a string identifier into another 162 | string identifier which contains only characters which are allowed by the remote target. 163 | 164 | ## Data De-nesting 165 | 166 | ### Objects 167 | 168 | #### What 169 | 170 | - `Objects` are unpacked into their parent table. 171 | - The unpacked fields are prefixed with the name of the `field` which originally contained the object. 172 | 173 | #### Why 174 | 175 | - This approach is inspired by what Stitch Data takes with `object` de-nesting. 176 | - The user experience for those using a SQL querying language is better for flat tables 177 | - as compared to something like [PostgreSQL's JSONB](https://www.postgresql.org/docs/9.4/datatype-json.html) support 178 | 179 | ### Arrays 180 | 181 | #### What 182 | 183 | - `Arrays` are unrolled as individual rows into a child table 184 | - The table name is constructed as `parent_table__field` 185 | 186 | #### Why 187 | 188 | - This approach is inspired by what Stitch Data takes with `array` de-nesting. 
189 | - The user experience for those using a SQL querying language is better for flat tables 190 | - as compared to something like [PostgreSQL's JSONB](https://www.postgresql.org/docs/9.4/datatype-json.html) support 191 | 192 | ## Queries 193 | 194 | ### What 195 | 196 | - When we write SQL at any given point, we have the option to use "latest" PostgreSQL features 197 | - We opt for features available from PostgreSQL 8.4.22 forward 198 | - We ***DO NOT*** support PostgreSQL 8.4.22 199 | - any features/bugs issues based on this will be weighed against this decision as far as effort to benefit 200 | 201 | ### Why 202 | 203 | - Supporting multiple versions of PostgreSQL has _thus far_ been fairly straightforward by adhering to only query support available in the _oldest_ version of supported PostgreSQL 204 | - By doing this, we only have one main code base, instead of many fractured versions which all employ the latest/greatest system functions/methods/tables/information schemas available 205 | - By using 8.4.22, supporting [Redshift](https://github.com/datamill-co/target-redshift) is made simpler 206 | - Redshift was originally split from [PostgreSQL 8.0.2](https://docs.aws.amazon.com/redshift/latest/dg/c_redshift-and-postgres-sql.html) 207 | - At some point, a _lot_ of work was done by AWS to make Redshift a "simple fork" of PostgreSQL 8.4 208 | - We do not _support_ PostgreSQL 8.4 simply because PostgreSQL does not support it anymore 209 | - Our _only_ benefit to making 8.4 query language our target is Redshift 210 | - When a new supported version of PostgreSQL comes along, and we undertake the effort to support it herein, if supporting it is simpler to do by breaking 8.4, we will move the necessary logic to [target-redshift](https://github.com/datamill-co/target-redshift) 211 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018-2021 Data Mill Services, LLC 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Target Postgres 2 | 3 | [![CircleCI](https://circleci.com/gh/datamill-co/target-postgres.svg?style=svg)](https://circleci.com/gh/datamill-co/target-postgres) 4 | 5 | [![PyPI version](https://badge.fury.io/py/singer-target-postgres.svg)](https://pypi.org/project/singer-target-postgres/) 6 | 7 | [![](https://img.shields.io/librariesio/github/datamill-co/target-postgres.svg)](https://libraries.io/github/datamill-co/target-postgres) 8 | 9 | A [Singer](https://singer.io/) postgres target, for use with Singer streams generated by Singer taps. 10 | 11 | ## Features 12 | 13 | - Creates SQL tables for [Singer](https://singer.io) streams 14 | - Denests objects flattening them into the parent object's table 15 | - Denests rows into separate tables 16 | - Adds columns and sub-tables as new fields are added to the stream [JSON Schema](https://json-schema.org/) 17 | - Full stream replication via record `version` and `ACTIVATE_VERSION` messages. 18 | 19 | ## Install 20 | 21 | 1. Add `libpq` dependency 22 | 23 | ```sh 24 | # macos 25 | brew install postgresql 26 | ``` 27 | ```sh 28 | # ubuntu 29 | sudo apt install libpq-dev 30 | ``` 31 | 32 | 1. install `singer-target-postgres` 33 | 34 | ```sh 35 | pip install singer-target-postgres 36 | ``` 37 | 38 | ## Usage 39 | 40 | 1. Follow the 41 | [Singer.io Best Practices](https://github.com/singer-io/getting-started/blob/master/docs/RUNNING_AND_DEVELOPING.md#running-a-singer-tap-with-a-singer-target) 42 | for setting up separate `tap` and `target` virtualenvs to avoid version 43 | conflicts. 44 | 45 | 1. Create a [config file](#configjson) at 46 | `~/singer.io/target_postgres_config.json` with postgres connection 47 | information and target postgres schema. 48 | 49 | ```json 50 | { 51 | "postgres_host": "localhost", 52 | "postgres_port": 5432, 53 | "postgres_database": "my_analytics", 54 | "postgres_username": "myuser", 55 | "postgres_password": "1234", 56 | "postgres_schema": "mytapname" 57 | } 58 | ``` 59 | 60 | 1. Run `target-postgres` against a [Singer](https://singer.io) tap. 61 | 62 | ```bash 63 | ~/.virtualenvs/tap-something/bin/tap-something \ 64 | | ~/.virtualenvs/target-postgres/bin/target-postgres \ 65 | --config ~/singer.io/target_postgres_config.json >> state.json 66 | ``` 67 | 68 | If you are running windows, the following is equivalent: 69 | 70 | ``` 71 | venvs\tap-exchangeratesapi\Scripts\tap-exchangeratesapi.exe | ^ 72 | venvs\target-postgresql\Scripts\target-postgres.exe ^ 73 | --config target_postgres_config.json 74 | ``` 75 | 76 | ### Config.json 77 | 78 | The fields available to be specified in the config file are specified 79 | here. 
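For illustration, a config that combines a few of the optional settings described in the table below might look like the following (all values, including the hypothetical `loader` role, are placeholders rather than recommendations):

```json
{
  "postgres_host": "db.example.com",
  "postgres_port": 5432,
  "postgres_database": "my_analytics",
  "postgres_username": "myuser",
  "postgres_password": "1234",
  "postgres_schema": "mytapname",
  "postgres_sslmode": "require",
  "invalid_records_threshold": 10,
  "max_batch_rows": 50000,
  "logging_level": "DEBUG",
  "before_run_sql": "SET ROLE loader;"
}
```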
80 | 81 | | Field | Type | Default | Details | 82 | | --------------------------- | --------------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 83 | | `postgres_host` | `["string", "null"]` | `"localhost"` | | 84 | | `postgres_port` | `["integer", "null"]` | `5432` | | 85 | | `postgres_database` | `["string"]` | `N/A` | | 86 | | `postgres_username` | `["string", "null"]` | `N/A` | | 87 | | `postgres_password` | `["string", "null"]` | `null` | | 88 | | `postgres_schema` | `["string", "null"]` | `"public"` | | 89 | | `postgres_sslmode` | `["string", "null"]` | `"prefer"` | Refer to the [libpq](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS) docs for more information about SSL | 90 | | `postgres_sslcert` | `["string", "null"]` | `"~/.postgresql/postgresql.crt"` | Only used if a SSL request w/ a client certificate is being made | 91 | | `postgres_sslkey` | `["string", "null"]` | `"~/.postgresql/postgresql.key"` | Only used if a SSL request w/ a client certificate is being made | 92 | | `postgres_sslrootcert` | `["string", "null"]` | `"~/.postgresql/root.crt"` | Used for authentication of a server SSL certificate | 93 | | `postgres_sslcrl` | `["string", "null"]` | `"~/.postgresql/root.crl"` | Used for authentication of a server SSL certificate | 94 | | `invalid_records_detect` | `["boolean", "null"]` | `true` | Include `false` in your config to disable `target-postgres` from crashing on invalid records | 95 | | `invalid_records_threshold` | `["integer", "null"]` | `0` | Include a positive value `n` in your config to allow for `target-postgres` to encounter at most `n` invalid records per stream before giving up. | 96 | | `disable_collection` | `["string", "null"]` | `false` | Include `true` in your config to disable [Singer Usage Logging](#usage-logging). | 97 | | `logging_level` | `["string", "null"]` | `"INFO"` | The level for logging. Set to `DEBUG` to get things like queries executed, timing of those queries, etc. See [Python's Logger Levels](https://docs.python.org/3/library/logging.html#levels) for information about valid values. | 98 | | `persist_empty_tables` | `["boolean", "null"]` | `False` | Whether the Target should create tables which have no records present in Remote. | 99 | | `max_batch_rows` | `["integer", "null"]` | `200000` | The maximum number of rows to buffer in memory before writing to the destination table in Postgres | 100 | | `max_buffer_size` | `["integer", "null"]` | `104857600` (100MB in bytes) | The maximum number of bytes to buffer in memory before writing to the destination table in Postgres | 101 | | `batch_detection_threshold` | `["integer", "null"]` | `5000`, or 1/40th `max_batch_rows` | How often, in rows received, to count the buffered rows and bytes to check if a flush is necessary. There's a slight performance penalty to checking the buffered records count or bytesize, so this controls how often this is polled in order to mitigate the penalty. This value is usually not necessary to set as the default is dynamically adjusted to check reasonably often. 
| 102 | | `state_support` | `["boolean", "null"]` | `True` | Whether the Target should emit `STATE` messages to stdout for further consumption. In this mode, which is on by default, STATE messages are buffered in memory until all the records that occurred before them are flushed according to the batch flushing schedule the target is configured with. | 103 | | `add_upsert_indexes` | `["boolean", "null"]` | `True` | Whether the Target should create column indexes on the important columns used during data loading. These indexes will make data loading slightly slower but the deduplication phase much faster. Defaults to on for better baseline performance. | 104 | | `before_run_sql` | `["string", "null"]` | `None` | Raw SQL statement(s) to execute as soon as the connection to Postgres is opened by the target. Useful for setup like `SET ROLE` or other connection state that is important. | 105 | | `after_run_sql` | `["string", "null"]` | `None` | Raw SQL statement(s) to execute as soon as the connection to Postgres is opened by the target. Useful for setup like `SET ROLE` or other connection state that is important. | 106 | | `before_run_sql_file` | `["string", "null"]` | `None` | Similar to `before_run_sql` but reads an external file instead of SQL in the JSON config file. | 107 | | `after_run_sql_file` | `["string", "null"]` | `None` | Similar to `after_run_sql` but reads an external file instead of SQL in the JSON config file. | 108 | | `application_name` | `["string", "null"]` | `None` | Set the postgresql `application_name` connection option to help with debugging, etc... | 109 | 110 | 111 | ### Supported Versions 112 | 113 | `target-postgres` only supports [JSON Schema Draft4](http://json-schema.org/specification-links.html#draft-4). 114 | While declaring a schema _is optional_, any input schema which declares a version 115 | other than 4 will be rejected. 116 | 117 | `target-postgres` supports all versions of PostgreSQL which are presently supported 118 | by the PostgreSQL Global Development Group. Our [CI config](https://github.com/datamill-co/target-postgres/blob/master/.circleci/config.yml) defines all versions we are currently supporting. 119 | 120 | | Version | Current minor | Supported | First Release | Final Release | 121 | | ------- | ------------- | --------- | ------------------ | ----------------- | 122 | | 15 | 15.0 | Yes | October 13, 2022 | November 11, 2027 | 123 | | 14 | 14.5 | Yes | September 30, 2021 | November 12, 2026 | 124 | | 13 | 13.8 | Yes | September 24, 2020 | November 13, 2025 | 125 | | 12 | 12.12 | Yes | October 3, 2019 | November 14, 2024 | 126 | | 11 | 11.17 | Yes | October 18, 2018 | November 9, 2023 | 127 | | 10 | 10.22 | Yes | October 5, 2017 | November 10, 2022 | 128 | 129 | _The above is copied from the [current list of versions](https://www.postgresql.org/support/versioning/) on Postgresql.org_ 130 | 131 | ## Known Limitations 132 | 133 | - Requires a [JSON Schema](https://json-schema.org/) for every stream. 134 | - Only string, string with date-time format, integer, number, boolean, 135 | object, and array types with or without null are supported. Arrays can 136 | have any of the other types listed, including objects as types within 137 | items. 
138 | - Example of JSON Schema types that work 139 | - `['number']` 140 | - `['string']` 141 | - `['string', 'null']` 142 | - Example of JSON Schema types that **DO NOT** work 143 | - `['string', 'integer']` 144 | - `['integer', 'number']` 145 | - `['any']` 146 | - `['null']` 147 | - JSON Schema combinations such as `anyOf` and `oneOf` are not supported. 148 | - JSON Schema \$ref is partially supported: 149 | - **_NOTE:_** The following limitations are known to **NOT** fail gracefully 150 | - Presently you cannot have any circular or recursive `$ref`s 151 | - `$ref`s must be present within the schema: 152 | - URIs do not work 153 | - if the `$ref` is broken, the behaviour is considered unexpected 154 | - Any values which are the `string` `NULL` will be streamed to PostgreSQL as the literal `null` 155 | - Table names are restricted to: 156 | - 63 characters in length 157 | - can only be composed of `_`, lowercase letters, numbers, `$` 158 | - cannot start with `$` 159 | - ASCII characters 160 | - Field/Column names are restricted to: 161 | - 63 characters in length 162 | - ASCII characters 163 | 164 | ## Indexes 165 | 166 | If the `add_upsert_indexes` config option is enabled, which it is by default, `target-postgres` adds indexes on the tables it creates for its own queries to be more performant. Specifically, `target-postgres` automatically adds indexes to the `_sdc_sequence` column and the `_sdc_level__id` columns, which are used heavily when inserting and upserting. 167 | 168 | `target-postgres` doesn't have any facilities for adding other indexes to the managed tables, so if more indexes are required, they should be added by another downstream tool, or by an administrator when necessary. Note that these indexes incur performance overhead to maintain as data is inserted. These indexes can also prevent `target-postgres` from dropping columns in the future if the schema of the table changes, in which case an administrator should drop the index so `target-postgres` is able to drop the columns it needs to. 169 | 170 | **Note**: Index adding is new as of version `0.2.1`, and `target-postgres` does not retroactively create indexes for tables it created before that time. If you want to add indexes to older tables `target-postgres` is loading data into, they should be added manually. 171 | 172 | ## Usage Logging 173 | 174 | [Singer.io](https://www.singer.io/) requires official taps and targets to collect anonymous usage data. This data is only used in aggregate to report on individual taps/targets, as well as the Singer community at-large. IP addresses are recorded to detect unique tap/target users but are not shared with third-parties. 175 | 176 | To disable anonymous data collection set `disable_collection` to `true` in the configuration JSON file. 177 | 178 | ## Developing 179 | 180 | `target-postgres` utilizes [poetry](https://python-poetry.org/docs/) for package 181 | management, and [PyTest](https://docs.pytest.org/en/latest/contents.html) for testing. 182 | 183 | ### Documentation 184 | 185 | See also: 186 | 187 | - [DECISIONS](./DECISIONS.md): A document containing high level explanations of various decisions and decision making paradigms. A good place to request more explanation/clarification on confusing things found herein.
188 | - [TableMetadata](./docs/TableMetadata.md): A document detailing some of the metadata necessary for `TargetPostgres` to function correctly on the Remote 189 | 190 | ### Docker 191 | 192 | If you have [Docker](https://www.docker.com/) and [Docker Compose](https://docs.docker.com/compose/) installed, you can 193 | easily run the following to get a local env setup quickly. 194 | 195 | ```sh 196 | $ docker-compose up -d --build 197 | $ docker logs -tf target-postgres_target-postgres_1 # You container names might differ 198 | ``` 199 | 200 | As soon as you see `INFO: Dev environment ready.` you can shell into the container and start running test commands: 201 | 202 | ```sh 203 | $ docker-compose exec target-postgres bash 204 | (target-postgres) root@...:/code# pytest 205 | ``` 206 | 207 | The environment inside the docker container has a virtualenv set up and activated, with an `--editable` install of `target-postgres` inside it and your local code mounted as a Docker volume. If you make changes on your host and re-run `pytest` any changes should be reflected inside the container. 208 | 209 | See the [PyTest](#pytest) commands below! 210 | 211 | ### DB 212 | 213 | To run the tests, you will need a PostgreSQL server running. 214 | 215 | **_NOTE:_** Testing assumes that you've exposed the traditional port `5432`. 216 | 217 | Make sure to set the following env vars for [PyTest](#pytest): 218 | 219 | ```sh 220 | $ EXPORT POSTGRES_HOST='' # Most likely 'localhost' 221 | $ EXPORT POSTGRES_DB='' # We use 'target_postgres_test' 222 | $ EXPORT POSTGRES_USER=' Fork -> New Branch(If needed) -> Pull Request -> Approval -> Merge 246 | 247 | Users can file an issue without submitting a pull request but be aware not all issues can or will be addressed. 248 | 249 | ## Sponsorship 250 | 251 | Target Postgres is sponsored by Data Mill (Data Mill Services, LLC) [datamill.co](https://datamill.co/). 252 | 253 | Data Mill helps organizations utilize modern data infrastructure and data science to power analytics, products, and services. 254 | 255 | --- 256 | 257 | Copyright Data Mill Services, LLC 2018 258 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | services: 3 | db: 4 | image: postgres:9.6.17 5 | environment: 6 | POSTGRES_DB: target_postgres_test 7 | POSTGRES_PASSWORD: postgres 8 | ports: 9 | - "5432:5432" 10 | 11 | target-postgres: 12 | image: python:3.7.7-stretch 13 | working_dir: /code 14 | entrypoint: /code/docker-entrypoint.sh 15 | environment: 16 | POSTGRES_HOST: db 17 | POSTGRES_DATABASE: target_postgres_test 18 | POSTGRES_USERNAME: postgres 19 | POSTGRES_PASSWORD: postgres 20 | volumes: 21 | - .:/code 22 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python -m venv venv/target-postgres 4 | tests/migrations/scripts/install_schema_versions.sh 5 | source /code/venv/target-postgres/bin/activate 6 | 7 | pip install -e .[tests] 8 | 9 | echo "source /code/venv/target-postgres/bin/activate" >> ~/.bashrc 10 | echo -e "\n\nINFO: Dev environment ready." 
11 | 12 | tail -f /dev/null 13 | -------------------------------------------------------------------------------- /docs/TableMetadata.md: -------------------------------------------------------------------------------- 1 | # Table Metadata 2 | 3 | `SQLInterface` relies upon more schema information than is normally able to be 4 | provided by a raw SQL schema. For instance, information about the original 5 | non-canonicalized name of a field gets lost when that field is normalized into 6 | a column. 7 | 8 | To achieve this, metadata is stored. For Target Postgres, this metadata is currently 9 | stored in a JSON Blob which is set onto each table's comment. 10 | 11 | This document details the structure of this structure. 12 | 13 | ## Table Comment Schema 14 | 15 | | Field | Type | Default | Details | 16 | | ----- | ---- | ------- | ------- | 17 | | `version` |`["string", "null"]` | `null` | The Singer table version to be used with `activate_version` | 18 | | `key_properties` | `["array", "null"]` | `null` | Array of `string`s representing the pks for the table. | 19 | | `mappings` | `["object", "null"]`| `null` | Mappings which take `current_column_name` to a `COLUMN_MAPPING` detailed below. | 20 | | `table_mappings` | `{'type': ["array", "null"], 'items': {'type': "$TABLE_MAPPING"}}`| `null` | Mappings which detail information about tables and their names. See `TABLE_MAPPING` below. | 21 | 22 | ## COLUMN_MAPPING 23 | 24 | | Field | Type | Default | Details | 25 | | ----- | ---- | ------- | ------- | 26 | | `from` | `["string"]` | `N/A` | The original name of the field/property this column represents | 27 | | `type` | `["array"]` | `N/A` | The `json_schema.type` of the `from` column | 28 | 29 | ## TABLE_MAPPING 30 | 31 | | Field | Type | Default | Details | 32 | | ----- | ---- | ------- | ------- | 33 | | `type` | `["string"]` | `TABLE` | The type of mapping present, which is always `TABLE` | 34 | | `from` | `{'type': ["array"], 'items': {'type': ["string"]}}` | `[]` | The fields/properties which lead to this (sub)table in the original schema. ie, the root table's path will always be `[]`, a table made from an array found at the property `foo` will be `[, "foo"]` etc. etc. 
| 35 | | `to` | `["string"]` | `N/A` | The table name which takes the `from` path `to` the target's representation | 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "singer-target-postgres" 3 | version = "0.2.4" 4 | authors = ["datamill"] 5 | description = "Singer.io target for loading data into postgres" 6 | readme = "README.md" 7 | homepage = "https://github.com/datamill-co/target-postgres" 8 | repository = "https://github.com/datamill-co/target-postgres" 9 | classifiers = [ 10 | "License :: OSI Approved :: MIT License", 11 | "Intended Audience :: Developers", 12 | "Operating System :: OS Independent", 13 | "Programming Language :: Python :: 3.7", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: Implementation :: CPython", 19 | "Topic :: Software Development :: Libraries :: Application Frameworks", 20 | ] 21 | license = "MIT" 22 | packages = [{include = "target_postgres"}] 23 | 24 | [project.urls] 25 | "Homepage" = "https://github.com/datamill-co/target-postgres" 26 | "Bug Tracker" = "https://github.com/datamill-co/target-postgres/issues" 27 | 28 | 29 | [tool.poetry.dependencies] 30 | python = ">=3.7" 31 | arrow = "^1.2.3" 32 | psycopg2-binary = "^2.9.5" 33 | singer-python = "^5.9.0" 34 | 35 | [tool.poetry.group.tests] 36 | optional = true 37 | 38 | [tool.poetry.group.tests.dependencies] 39 | chance = "^0.110" 40 | Faker = "^15.1.3" 41 | pytest = "^7.2.0" 42 | 43 | [build-system] 44 | requires = ["poetry-core"] 45 | build-backend = "poetry.core.masonry.api" 46 | 47 | [tool.poetry.scripts] 48 | # CLI declaration 49 | target-postgres = 'target_postgres:cli' 50 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | error 4 | ignore::UserWarning 5 | ignore:.*Using or importing the ABCs from:DeprecationWarning 6 | -------------------------------------------------------------------------------- /target_postgres/__init__.py: -------------------------------------------------------------------------------- 1 | from singer import utils 2 | import psycopg2 3 | 4 | from target_postgres.postgres import MillisLoggingConnection, PostgresTarget 5 | from target_postgres import target_tools 6 | 7 | REQUIRED_CONFIG_KEYS = [ 8 | 'postgres_database' 9 | ] 10 | 11 | 12 | def main(config, input_stream=None): 13 | with psycopg2.connect( 14 | connection_factory=MillisLoggingConnection, 15 | host=config.get('postgres_host', 'localhost'), 16 | port=config.get('postgres_port', 5432), 17 | dbname=config.get('postgres_database'), 18 | user=config.get('postgres_username'), 19 | password=config.get('postgres_password'), 20 | sslmode=config.get('postgres_sslmode'), 21 | sslcert=config.get('postgres_sslcert'), 22 | sslkey=config.get('postgres_sslkey'), 23 | sslrootcert=config.get('postgres_sslrootcert'), 24 | sslcrl=config.get('postgres_sslcrl'), 25 | application_name=config.get('application_name', 'target-postgres'), 26 | ) as connection: 27 | postgres_target = PostgresTarget( 28 | connection, 29 | postgres_schema=config.get('postgres_schema', 'public'), 30 | logging_level=config.get('logging_level'), 31 | 
persist_empty_tables=config.get('persist_empty_tables'), 32 | add_upsert_indexes=config.get('add_upsert_indexes', True), 33 | before_run_sql=config.get('before_run_sql'), 34 | after_run_sql=config.get('after_run_sql'), 35 | ) 36 | 37 | if input_stream: 38 | target_tools.stream_to_target(input_stream, postgres_target, config=config) 39 | else: 40 | target_tools.main(postgres_target) 41 | 42 | 43 | def cli(): 44 | args = utils.parse_args(REQUIRED_CONFIG_KEYS) 45 | 46 | main(args.config) 47 | -------------------------------------------------------------------------------- /target_postgres/denest.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from target_postgres import json_schema, singer 4 | 5 | 6 | def to_table_batches(schema, key_properties, records): 7 | """ 8 | Given a schema, and records, get all table schemas and records and prep them 9 | in a `table_batch`. 10 | 11 | :param schema: SingerStreamSchema 12 | :param key_properties: [string, ...] 13 | :param records: [{...}, ...] 14 | :return: [{'streamed_schema': TABLE_SCHEMA(local), 15 | 'records': [{(path_0, path_1, ...): 16 | (_json_schema_string_type, value), ...}, 17 | ...]}, 18 | ...] 19 | """ 20 | table_schemas = _get_streamed_table_schemas(schema, 21 | key_properties) 22 | 23 | table_records = _get_streamed_table_records(key_properties, 24 | records) 25 | writeable_batches = [] 26 | for table_json_schema in table_schemas: 27 | writeable_batches.append({'streamed_schema': table_json_schema, 28 | 'records': table_records.get(table_json_schema['path'], [])}) 29 | 30 | return writeable_batches 31 | 32 | 33 | def _get_streamed_table_schemas(schema, key_properties): 34 | """ 35 | Given a `schema` and `key_properties` return the denested/flattened TABLE_SCHEMA of 36 | the root table and each sub table. 37 | 38 | :param schema: SingerStreamSchema 39 | :param key_properties: [string, ...] 40 | :return: [TABLE_SCHEMA(denested_streamed_schema_0), ...] 41 | """ 42 | root_table_schema = json_schema.simplify(schema) 43 | 44 | subtables = {} 45 | key_prop_schemas = {} 46 | for key in key_properties: 47 | key_prop_schemas[key] = schema['properties'][key] 48 | _denest_schema(tuple(), root_table_schema, key_prop_schemas, subtables) 49 | 50 | ret = [_to_table_schema(tuple(), None, key_properties, root_table_schema['properties'])] 51 | for path, schema in subtables.items(): 52 | ret.append(_to_table_schema(path, schema['level'], schema['key_properties'], schema['properties'])) 53 | 54 | return ret 55 | 56 | 57 | def _to_table_schema(path, level, keys, properties): 58 | for key in keys: 59 | if not (key,) in properties: 60 | raise Exception('Unknown key "{}" found for table "{}". 
Known fields are: {}'.format( 61 | key, path, properties 62 | )) 63 | 64 | return {'type': 'TABLE_SCHEMA', 65 | 'path': path, 66 | 'level': level, 67 | 'key_properties': keys, 68 | 'mappings': [], 69 | 'schema': {'type': 'object', 70 | 'additionalProperties': False, 71 | 'properties': properties}} 72 | 73 | 74 | def _literal_only_schema(schema): 75 | 76 | ret_types = json_schema.get_type(schema) 77 | 78 | if json_schema.is_object(schema): 79 | ret_types.remove(json_schema.OBJECT) 80 | if json_schema.is_iterable(schema): 81 | ret_types.remove(json_schema.ARRAY) 82 | if json_schema.is_nullable(schema): 83 | ret_types.remove(json_schema.NULL) 84 | 85 | ret_schemas = [] 86 | for t in ret_types: 87 | s = deepcopy(schema) 88 | s['type'] = [t] 89 | 90 | if json_schema.is_nullable(schema): 91 | s = json_schema.make_nullable(s) 92 | 93 | ret_schemas.append(s) 94 | 95 | return { 96 | 'anyOf': ret_schemas 97 | } 98 | 99 | 100 | def _create_subtable(table_path, table_json_schema, key_prop_schemas, subtables, level): 101 | if json_schema.is_object(table_json_schema['items']): 102 | new_properties = table_json_schema['items']['properties'] 103 | else: 104 | new_properties = {singer.VALUE: table_json_schema['items']} 105 | 106 | key_properties = [] 107 | for pk, item_json_schema in key_prop_schemas.items(): 108 | key_properties.append(singer.SOURCE_PK_PREFIX + pk) 109 | new_properties[singer.SOURCE_PK_PREFIX + pk] = item_json_schema 110 | 111 | new_properties[singer.SEQUENCE] = { 112 | 'type': ['null', 'integer'] 113 | } 114 | 115 | for i in range(0, level + 1): 116 | new_properties[singer.LEVEL_FMT.format(i)] = { 117 | 'type': ['integer'] 118 | } 119 | 120 | new_schema = {'type': [json_schema.OBJECT], 121 | 'properties': new_properties, 122 | 'level': level, 123 | 'key_properties': key_properties} 124 | 125 | _denest_schema(table_path, new_schema, key_prop_schemas, subtables, level=level) 126 | 127 | subtables[table_path] = new_schema 128 | 129 | 130 | def _denest_schema__singular_schemas(table_json_schema): 131 | ret = [] 132 | assert json_schema.is_object(table_json_schema), 'Cannot denest non-object json_schema for tables. 
Passed: {}'.format(table_json_schema) 133 | 134 | for prop, sub_schema in table_json_schema['properties'].items(): 135 | singular_sub_schemas = [sub_schema] 136 | if json_schema.is_anyof(sub_schema): 137 | singular_sub_schemas = sub_schema['anyOf'] 138 | 139 | for s in singular_sub_schemas: 140 | assert json_schema.is_object(s) or json_schema.is_iterable(s) or json_schema.is_literal(s), \ 141 | 'Table schema cannot be denested due to: {} {}'.format( 142 | s, 143 | table_json_schema) 144 | 145 | ret.append((prop, s)) 146 | 147 | return ret 148 | 149 | 150 | def _denest_schema_helper( 151 | table_path, 152 | prop_path, 153 | table_json_schema, 154 | nullable, 155 | top_level_schema, 156 | key_prop_schemas, 157 | subtables, 158 | level): 159 | 160 | for prop, item_json_schema in _denest_schema__singular_schemas(table_json_schema): 161 | 162 | if json_schema.is_object(item_json_schema): 163 | _denest_schema_helper(table_path + (prop,), 164 | prop_path + (prop,), 165 | item_json_schema, 166 | nullable, 167 | top_level_schema, 168 | key_prop_schemas, 169 | subtables, 170 | level) 171 | 172 | elif json_schema.is_iterable(item_json_schema): 173 | _create_subtable(table_path + (prop,), 174 | item_json_schema, 175 | key_prop_schemas, 176 | subtables, 177 | level + 1) 178 | 179 | elif json_schema.is_literal(item_json_schema): 180 | if nullable: 181 | item_json_schema = json_schema.make_nullable(item_json_schema) 182 | 183 | p = prop_path + (prop,) 184 | if p in top_level_schema: 185 | top_level_schema[p]['anyOf'].append(item_json_schema) 186 | else: 187 | top_level_schema[p] = {'anyOf': [item_json_schema]} 188 | 189 | 190 | def _denest_schema( 191 | table_path, 192 | table_json_schema, 193 | key_prop_schemas, 194 | subtables, 195 | level=-1): 196 | 197 | new_properties = {} 198 | for prop, item_json_schema in _denest_schema__singular_schemas(table_json_schema): 199 | 200 | if json_schema.is_object(item_json_schema): 201 | _denest_schema_helper(table_path + (prop,), 202 | (prop,), 203 | item_json_schema, 204 | json_schema.is_nullable(item_json_schema), 205 | new_properties, 206 | key_prop_schemas, 207 | subtables, 208 | level) 209 | 210 | elif json_schema.is_iterable(item_json_schema): 211 | _create_subtable(table_path + (prop,), 212 | item_json_schema, 213 | key_prop_schemas, 214 | subtables, 215 | level + 1) 216 | 217 | elif json_schema.is_literal(item_json_schema): 218 | if (prop,) in new_properties: 219 | new_properties[(prop,)]['anyOf'].append(item_json_schema) 220 | else: 221 | new_properties[(prop,)] = {'anyOf': [item_json_schema]} 222 | 223 | 224 | table_json_schema['properties'] = new_properties 225 | 226 | 227 | def _get_streamed_table_records(key_properties, records): 228 | """ 229 | Flatten the given `records` into `table_records`. 230 | Maintains `key_properties`. 231 | into `table_records`. 232 | 233 | :param key_properties: [string, ...] 234 | :param records: [{...}, ...] 235 | :return: {TableName string: [{(path_0, path_1, ...): (_json_schema_string_type, value), ...}, ...], 236 | ...} 237 | """ 238 | 239 | records_map = {} 240 | _denest_records(tuple(), 241 | records, 242 | records_map, 243 | key_properties) 244 | 245 | return records_map 246 | 247 | 248 | def _denest_subrecord(table_path, 249 | prop_path, 250 | parent_record, 251 | record, 252 | records_map, 253 | key_properties, 254 | pk_fks, 255 | level): 256 | """""" 257 | """ 258 | {...} 259 | """ 260 | for prop, value in record.items(): 261 | """ 262 | str : {...} | [...] | ???None??? 
| 263 | """ 264 | 265 | if isinstance(value, dict): 266 | """ 267 | {...} 268 | """ 269 | _denest_subrecord(table_path + (prop,), 270 | prop_path + (prop,), 271 | parent_record, 272 | value, 273 | records_map, 274 | key_properties, 275 | pk_fks, 276 | level) 277 | 278 | elif isinstance(value, list): 279 | """ 280 | [...] 281 | """ 282 | _denest_records(table_path + (prop,), 283 | value, 284 | records_map, 285 | key_properties, 286 | pk_fks=pk_fks, 287 | level=level + 1) 288 | 289 | elif value is None: 290 | """ 291 | None 292 | """ 293 | continue 294 | 295 | else: 296 | """ 297 | 298 | """ 299 | parent_record[prop_path + (prop,)] = (json_schema.python_type(value), value) 300 | 301 | 302 | def _denest_record(table_path, record, records_map, key_properties, pk_fks, level): 303 | """""" 304 | """ 305 | {...} 306 | """ 307 | denested_record = {} 308 | for prop, value in record.items(): 309 | """ 310 | str : {...} | [...] | None | 311 | """ 312 | 313 | if isinstance(value, dict): 314 | """ 315 | {...} 316 | """ 317 | _denest_subrecord(table_path + (prop,), 318 | (prop,), 319 | denested_record, 320 | value, 321 | records_map, 322 | key_properties, 323 | pk_fks, 324 | level) 325 | 326 | elif isinstance(value, list): 327 | """ 328 | [...] 329 | """ 330 | _denest_records(table_path + (prop,), 331 | value, 332 | records_map, 333 | key_properties, 334 | pk_fks=pk_fks, 335 | level=level + 1) 336 | 337 | elif value is None: 338 | """ 339 | None 340 | """ 341 | continue 342 | 343 | else: 344 | """ 345 | 346 | """ 347 | denested_record[(prop,)] = (json_schema.python_type(value), value) 348 | 349 | if table_path not in records_map: 350 | records_map[table_path] = [] 351 | records_map[table_path].append(denested_record) 352 | 353 | 354 | def _denest_records(table_path, records, records_map, key_properties, pk_fks=None, level=-1): 355 | row_index = 0 356 | """ 357 | [{...} ...] | [[...] ...] | [literal ...] 358 | """ 359 | for record in records: 360 | if pk_fks: 361 | record_pk_fks = pk_fks.copy() 362 | record_pk_fks[singer.LEVEL_FMT.format(level)] = row_index 363 | 364 | if not isinstance(record, dict): 365 | """ 366 | [...] | literal 367 | """ 368 | record = {singer.VALUE: record} 369 | 370 | for key, value in record_pk_fks.items(): 371 | record[key] = value 372 | row_index += 1 373 | else: ## top level 374 | record_pk_fks = {} 375 | for key in key_properties: 376 | record_pk_fks[singer.SOURCE_PK_PREFIX + key] = record[key] 377 | if singer.SEQUENCE in record: 378 | record_pk_fks[singer.SEQUENCE] = record[singer.SEQUENCE] 379 | 380 | """ 381 | {...} 382 | """ 383 | _denest_record(table_path, record, records_map, key_properties, record_pk_fks, level) 384 | -------------------------------------------------------------------------------- /target_postgres/exceptions.py: -------------------------------------------------------------------------------- 1 | class JSONSchemaError(Exception): 2 | """ 3 | Raise this when there is an error with regards to an instance of JSON Schema 4 | """ 5 | 6 | 7 | class TargetError(Exception): 8 | """ 9 | Raise when there is an Exception streaming data to the target. 10 | """ 11 | 12 | 13 | class PostgresError(Exception): 14 | """ 15 | Raise this when there is an error with regards to Postgres streaming 16 | """ 17 | 18 | 19 | class SingerStreamError(Exception): 20 | """ 21 | Raise when there is an Exception with Singer Streams. 
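    For example (illustrative), `BufferedSingerStream.add_record_message` raises this when the
    number of invalid records reaches `invalid_records_threshold` while `invalid_records_detect` is enabled.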
22 | """ 23 | -------------------------------------------------------------------------------- /target_postgres/json_schema.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import decimal 3 | import json 4 | import re 5 | 6 | from jsonschema import Draft4Validator 7 | from jsonschema.exceptions import SchemaError 8 | from target_postgres.exceptions import JSONSchemaError 9 | 10 | NULL = 'null' 11 | OBJECT = 'object' 12 | ARRAY = 'array' 13 | INTEGER = 'integer' 14 | NUMBER = 'number' 15 | BOOLEAN = 'boolean' 16 | STRING = 'string' 17 | DATE_TIME_FORMAT = 'date-time' 18 | 19 | _PYTHON_TYPE_TO_JSON_SCHEMA = { 20 | int: INTEGER, 21 | float: NUMBER, 22 | bool: BOOLEAN, 23 | str: STRING, 24 | type(None): NULL, 25 | decimal.Decimal: NUMBER 26 | } 27 | 28 | 29 | def python_type(x): 30 | """ 31 | Given a value `x`, return its Python Type as a JSONSchema type. 32 | :param x: 33 | :return: 34 | """ 35 | if not type(x) in _PYTHON_TYPE_TO_JSON_SCHEMA: 36 | raise JSONSchemaError('Unknown type `{}`. Cannot translate to JSONSchema type.'.format( 37 | str(type(x)) 38 | )) 39 | return _PYTHON_TYPE_TO_JSON_SCHEMA[type(x)] 40 | 41 | 42 | def get_type(schema): 43 | """ 44 | Given a JSON Schema dict, extracts the simplified `type` value 45 | :param schema: dict, JSON Schema 46 | :return: [string ...] 47 | """ 48 | t = schema.get('type', None) 49 | if not t: 50 | return [OBJECT] 51 | 52 | if isinstance(t, str): 53 | return [t] 54 | 55 | return deepcopy(t) 56 | 57 | 58 | def simple_type(schema): 59 | """ 60 | Given a JSON Schema dict, extracts the simplified schema, ie, a schema which can only represent 61 | _one_ of the given types allowed (along with the Nullable modifier): 62 | - OBJECT 63 | - ARRAY 64 | - INTEGER 65 | - NUMBER 66 | - BOOLEAN 67 | - STRING 68 | - DATE_TIME 69 | 70 | :param schema: dict, JSON Schema 71 | :return: dict, JSON Schema 72 | """ 73 | t = get_type(schema) 74 | 75 | if is_datetime(schema): 76 | return {'type': t, 77 | 'format': DATE_TIME_FORMAT} 78 | 79 | return {'type': t} 80 | 81 | 82 | def _get_ref(schema, paths): 83 | if not paths: 84 | return schema 85 | 86 | if not paths[0] in schema: 87 | raise JSONSchemaError('`$ref` "{}" not found in provided JSON Schema'.format(paths[0])) 88 | 89 | return _get_ref(schema[paths[0]], paths[1:]) 90 | 91 | 92 | def get_ref(schema, ref): 93 | """ 94 | Given a JSON Schema dict, and a valid ref (`$ref`), get the JSON Schema from within schema 95 | :param schema: dict, JSON Schema 96 | :param ref: string 97 | :return: dict, JSON Schema 98 | :raises: Exception 99 | """ 100 | 101 | # Explicitly only allow absolute internally defined $ref's 102 | if not re.match(r'^#/.*', ref): 103 | raise JSONSchemaError('Invalid format for `$ref`: "{}"'.format(ref)) 104 | 105 | return _get_ref(schema, 106 | re.split('/', re.sub(r'^#/', '', ref))) 107 | 108 | 109 | def _is_ref(schema): 110 | """ 111 | Given a JSON Schema compatible dict, returns True when the schema implements `$ref` 112 | 113 | NOTE: `$ref` OVERRIDES all other keys present in a schema 114 | :param schema: 115 | :return: Boolean 116 | """ 117 | 118 | return '$ref' in schema 119 | 120 | 121 | def _is_allof(schema): 122 | """ 123 | Given a JSON Schema compatible dict, returns True when the schema implements `allOf`. 
124 | 125 | :param schema: 126 | :return: Boolean 127 | """ 128 | 129 | return not _is_ref(schema) and 'allOf' in schema 130 | 131 | 132 | def is_anyof(schema): 133 | """ 134 | Given a JSON Schema compatible dict, returns True when the schema implements `anyOf`. 135 | 136 | :param schema: 137 | :return: Boolean 138 | """ 139 | 140 | return not _is_ref(schema) and not _is_allof(schema) and 'anyOf' in schema 141 | 142 | 143 | def is_object(schema): 144 | """ 145 | Given a JSON Schema compatible dict, returns True when schema's type allows being an Object. 146 | :param schema: dict, JSON Schema 147 | :return: Boolean 148 | """ 149 | 150 | return not _is_ref(schema) and not is_anyof(schema) and not _is_allof(schema) \ 151 | and (OBJECT in get_type(schema) 152 | or 'properties' in schema 153 | or not schema) 154 | 155 | 156 | def is_iterable(schema): 157 | """ 158 | Given a JSON Schema compatible dict, returns True when schema's type allows being iterable (ie, 'array') 159 | :param schema: dict, JSON Schema 160 | :return: Boolean 161 | """ 162 | 163 | return not _is_ref(schema) \ 164 | and ARRAY in get_type(schema) \ 165 | and 'items' in schema 166 | 167 | 168 | def is_nullable(schema): 169 | """ 170 | Given a JSON Schema compatible dict, returns True when schema's type allows being 'null' 171 | :param schema: dict, JSON Schema 172 | :return: Boolean 173 | """ 174 | 175 | return NULL in get_type(schema) 176 | 177 | 178 | def is_literal(schema): 179 | """ 180 | Given a JSON Schema compatible dict, returns True when schema's type allows being a literal 181 | (ie, 'integer', 'number', etc.) 182 | :param schema: dict, JSON Schema 183 | :return: Boolean 184 | """ 185 | 186 | return not {STRING, INTEGER, NUMBER, BOOLEAN}.isdisjoint(set(get_type(schema))) 187 | 188 | 189 | def is_datetime(schema): 190 | """ 191 | Given a JSON Schema compatible dict, returns True when schema's type allows being a date-time 192 | :param schema: dict, JSON Schema 193 | :return: Boolean 194 | """ 195 | 196 | return STRING in get_type(schema) and schema.get('format') == DATE_TIME_FORMAT 197 | 198 | 199 | def make_nullable(schema): 200 | """ 201 | Given a JSON Schema dict, returns the dict but makes the `type` `null`able. 202 | `is_nullable` will return true on the output. 203 | :return: dict, JSON Schema 204 | """ 205 | t = get_type(schema) 206 | if NULL in t: 207 | return schema 208 | 209 | ret_schema = deepcopy(schema) 210 | ret_schema['type'] = t + [NULL] 211 | return ret_schema 212 | 213 | 214 | class Cachable(dict): 215 | ''' 216 | The simplified json_schemas we produce are idempotent. ie, if you simplify a simplified 217 | json_schema, it will return the same thing. We wrap the `dict` object with a few 218 | helpers which extend it so that we avoid recursion in some instances. 219 | ''' 220 | def __init__(self, raw_dict, simplified=True): 221 | self._c = None 222 | super(Cachable, self).__init__(self, **raw_dict) 223 | 224 | def __hash__(self): 225 | return self._comparator().__hash__() 226 | 227 | def deepcopy(self): 228 | s = deepcopy(self) 229 | s._c = self._c 230 | return s 231 | 232 | def _comparator(self): 233 | if not self._c: 234 | self._c = json.dumps(self, sort_keys=True) 235 | 236 | return self._c 237 | 238 | def __lt__(self, other): 239 | return self._comparator() < other._comparator() 240 | 241 | 242 | def _allof_sort_key(schema): 243 | ''' 244 | We prefer scalars over combinations. 245 | With scalars we prefer date-times over strings. 246 | With combinations, we prefer objects. 
247 | With all, we prefer nullables. 248 | ''' 249 | if is_nullable(schema): 250 | sort_value = 0 251 | else: 252 | sort_value = 1 253 | 254 | if is_datetime(schema): 255 | sort_value += 0 256 | elif is_literal(schema): 257 | sort_value += 10 258 | elif is_object(schema): 259 | sort_value += 100 260 | elif is_iterable(schema): 261 | sort_value += 200 262 | else: 263 | # Unknown schema...maybe a $ref? 264 | sort_value += 1000 265 | 266 | return sort_value 267 | 268 | 269 | def _simplify__allof__merge__objects(schemas): 270 | ret_schema = schemas[0] 271 | # Merge objects together preferring later allOfs over earlier 272 | next_schemas = schemas[1:] 273 | while next_schemas and is_object(next_schemas[0]): 274 | ret_schema['properties'] = { 275 | **ret_schema.get('properties', {}), 276 | **next_schemas[0].get('properties', {})} 277 | 278 | next_schemas = next_schemas[1:] 279 | 280 | return ret_schema 281 | 282 | 283 | def _simplify__allof__merge__iterables(root_schema, schemas): 284 | ret_schema = schemas[0] 285 | # Recurse on all of the item schemas to create a single item schema 286 | item_schemas = [] 287 | 288 | next_schemas = schemas 289 | while next_schemas and is_iterable(next_schemas[0]): 290 | item_schemas.append(next_schemas[0]['items']) 291 | 292 | next_schemas = next_schemas[1:] 293 | 294 | ret_schema['items'] = _helper_simplify(root_schema, {'allOf': item_schemas}) 295 | return ret_schema 296 | 297 | 298 | def _simplify__allof(root_schema, child_schema): 299 | simplified_schemas = [ 300 | _helper_simplify(root_schema, schema) 301 | for schema in child_schema['allOf']] 302 | schemas = sorted(simplified_schemas, key=_allof_sort_key) 303 | 304 | ret_schema = schemas[0] 305 | 306 | if is_object(ret_schema): 307 | return _simplify__allof__merge__objects(schemas) 308 | 309 | if is_iterable(ret_schema): 310 | return _simplify__allof__merge__iterables(root_schema, schemas) 311 | 312 | return ret_schema 313 | 314 | 315 | def _simplify__implicit_anyof(root_schema, schema): 316 | ''' 317 | Typically literals are simple and have at most two types, one of which being NULL. 318 | However, they _can_ have many types wrapped up inside them as an implicit `anyOf`. 319 | 320 | Since we support `anyOf`, it is simpler to unwrap and "flatten" this implicit 321 | combination type. 
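    For example (illustrative): the implicit combination

        {'type': ['integer', 'string', 'null']}

    is unwrapped into the equivalent explicit form

        {'anyOf': [{'type': ['integer', 'null']}, {'type': ['string', 'null']}]}

    with the nullability pushed down onto each branch.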
322 | ''' 323 | schemas = [] 324 | types = set(get_type(schema)) 325 | 326 | if types == {NULL}: 327 | return Cachable({'type': [NULL]}) 328 | 329 | types.discard(NULL) 330 | 331 | if is_datetime(schema): 332 | schemas.append(Cachable({ 333 | 'type': [STRING], 334 | 'format': DATE_TIME_FORMAT 335 | })) 336 | 337 | types.remove(STRING) 338 | 339 | if is_object(schema): 340 | properties = {} 341 | for field, field_json_schema in schema.get('properties', {}).items(): 342 | properties[field] = _helper_simplify(root_schema, field_json_schema) 343 | 344 | schemas.append({ 345 | 'type': [OBJECT], 346 | 'properties': properties 347 | }) 348 | 349 | types.discard(OBJECT) 350 | 351 | if is_iterable(schema): 352 | schemas.append({ 353 | 'type': [ARRAY], 354 | 'items': _helper_simplify(root_schema, schema.get('items', {})) 355 | }) 356 | 357 | types.remove(ARRAY) 358 | 359 | schemas += [{'type': [t]} for t in types] 360 | 361 | if is_nullable(schema): 362 | schemas = [make_nullable(s) for s in schemas] 363 | 364 | 365 | return _helper_simplify(root_schema, {'anyOf': [Cachable(s) for s in schemas]}) 366 | 367 | 368 | def _simplify__anyof(root_schema, schema): 369 | ''' 370 | `anyOf` clauses are merged/simplified according to the following rules (these _are_ recursive): 371 | 372 | - all literals are dedupped 373 | - all objects are merged into the same object schema, with sub-schemas being grouped as simplified `anyOf` schemas 374 | - all iterables' `items` schemas are merged as simplified `anyOf` schemas 375 | - all `anyOf`s are flattened to the topmost 376 | - if there is only a single element in an `anyOf`, that is denested 377 | - if any `anyOf`s are nullable, all are nullable 378 | ''' 379 | 380 | schemas = [ 381 | _helper_simplify(root_schema, schema) 382 | for schema in schema['anyOf']] 383 | 384 | literals = set() 385 | any_nullable = False 386 | any_merged_objects = False 387 | merged_object_properties = {} 388 | any_merged_iters = False 389 | merged_item_schemas = [] 390 | 391 | while schemas: 392 | sub_schema = schemas.pop() 393 | any_nullable = any_nullable or is_nullable(sub_schema) 394 | 395 | if is_literal(sub_schema): 396 | literals.add(sub_schema) 397 | 398 | elif is_anyof(sub_schema): 399 | # Flatten potentially deeply nested `anyOf`s 400 | schemas += sub_schema['anyOf'] 401 | 402 | elif is_object(sub_schema): 403 | any_merged_objects = True 404 | for k, s in sub_schema.get('properties', {}).items(): 405 | if k in merged_object_properties: 406 | merged_object_properties[k].append(s) 407 | else: 408 | merged_object_properties[k] = [s] 409 | 410 | elif is_iterable(sub_schema): 411 | any_merged_iters = True 412 | merged_item_schemas.append(sub_schema['items']) 413 | 414 | merged_schemas = set() 415 | for l in literals: 416 | s = l 417 | if any_nullable: 418 | s = make_nullable(l) 419 | 420 | merged_schemas.add(Cachable(s)) 421 | 422 | if any_merged_objects: 423 | for k, v in merged_object_properties.items(): 424 | merged_object_properties[k] = _helper_simplify(root_schema, {'anyOf': v}) 425 | 426 | s = { 427 | 'type': [OBJECT], 428 | 'properties': merged_object_properties 429 | } 430 | 431 | if any_nullable: 432 | s = make_nullable(s) 433 | 434 | merged_schemas.add(Cachable(s)) 435 | 436 | if any_merged_iters: 437 | merged_item_schemas = _helper_simplify(root_schema, {'anyOf': merged_item_schemas}) 438 | 439 | s = { 440 | 'type': [ARRAY], 441 | 'items': merged_item_schemas 442 | } 443 | 444 | if any_nullable: 445 | s = make_nullable(s) 446 | 447 | merged_schemas.add(Cachable(s)) 448 | 
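    # Illustrative note (added comment): a trivial combination such as
    # {'anyOf': [{'type': ['string']}, {'type': ['null']}]} collapses here into the single nullable
    # schema {'type': ['string', 'null']}; only genuinely distinct branches keep their `anyOf` wrapper.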
449 | if len(merged_schemas) == 1: 450 | return merged_schemas.pop() 451 | 452 | return Cachable({'anyOf': sorted(merged_schemas)}) 453 | 454 | 455 | def _helper_simplify(root_schema, child_schema): 456 | # We check this value to make simplify a noop for schemas which have _already_ been simplified 457 | if isinstance(child_schema, Cachable): 458 | return child_schema 459 | 460 | ## Refs override all other type definitions 461 | if _is_ref(child_schema): 462 | try: 463 | ret_schema = _helper_simplify(root_schema, get_ref(root_schema, child_schema['$ref'])) 464 | 465 | except RecursionError: 466 | raise JSONSchemaError('`$ref` path "{}" is recursive'.format(get_ref(root_schema, child_schema['$ref']))) 467 | 468 | elif _is_allof(child_schema): 469 | ret_schema = _simplify__allof(root_schema, child_schema) 470 | 471 | elif is_anyof(child_schema): 472 | ret_schema = _simplify__anyof(root_schema, child_schema) 473 | 474 | else: 475 | ret_schema = _simplify__implicit_anyof(root_schema, child_schema) 476 | 477 | if 'default' in child_schema: 478 | ret_schema['default'] = child_schema.get('default') 479 | 480 | return Cachable(ret_schema) 481 | 482 | 483 | def simplify(schema): 484 | """ 485 | Given a JSON Schema compatible dict, returns a simplified JSON Schema dict 486 | 487 | - Expands `$ref` fields to their reference 488 | - Expands `type` fields into array'ed type fields 489 | - Strips out all fields which are not `type`/`properties` 490 | 491 | :param schema: dict, JSON Schema 492 | :return: dict, JSON Schema 493 | :raises: Exception 494 | """ 495 | if isinstance(schema, Cachable): 496 | return schema.deepcopy() 497 | 498 | return _helper_simplify(schema, schema) 499 | 500 | 501 | def _valid_schema_version(schema): 502 | return '$schema' not in schema \ 503 | or schema['$schema'] == 'http://json-schema.org/draft-04/schema#' 504 | 505 | 506 | def _unexpected_validation_error(errors, exception): 507 | """ 508 | 509 | :param errors: [String, ...] 510 | :param exception: Exception 511 | :return: [String, ...] 512 | """ 513 | 514 | if not errors: 515 | return ['Unexpected exception encountered: {}'.format(str(exception))] 516 | 517 | return errors 518 | 519 | 520 | def validation_errors(schema): 521 | """ 522 | Given a dict, returns any known JSON Schema validation errors. If there are none, 523 | implies that the dict is a valid JSON Schema. 524 | :param schema: dict 525 | :return: [String, ...] 526 | """ 527 | 528 | errors = [] 529 | 530 | if not isinstance(schema, dict): 531 | errors.append('Parameter `schema` is not a dict, instead found: {}'.format(type(schema))) 532 | 533 | try: 534 | if not _valid_schema_version(schema): 535 | errors.append('Schema version must be Draft 4. 
Found: {}'.format('$schema')) 536 | except Exception as ex: 537 | errors = _unexpected_validation_error(errors, ex) 538 | 539 | try: 540 | Draft4Validator.check_schema(schema) 541 | except SchemaError as error: 542 | errors.append(str(error)) 543 | except Exception as ex: 544 | errors = _unexpected_validation_error(errors, ex) 545 | 546 | try: 547 | simplify(schema) 548 | except JSONSchemaError as error: 549 | errors.append(str(error)) 550 | except Exception as ex: 551 | errors = _unexpected_validation_error(errors, ex) 552 | 553 | return errors 554 | 555 | 556 | _shorthand_mapping = { 557 | NULL: '', 558 | 'string': 's', 559 | 'number': 'f', 560 | 'integer': 'i', 561 | 'boolean': 'b', 562 | 'date-time': 't' 563 | } 564 | 565 | 566 | def _type_shorthand(type_s): 567 | if isinstance(type_s, list): 568 | shorthand = '' 569 | for t in sorted(type_s): 570 | shorthand += _type_shorthand(t) 571 | return shorthand 572 | 573 | if not type_s in _shorthand_mapping: 574 | raise JSONSchemaError('Shorthand not available for type {}. Expected one of {}'.format( 575 | type_s, 576 | list(_shorthand_mapping.keys()) 577 | )) 578 | 579 | return _shorthand_mapping[type_s] 580 | 581 | 582 | def shorthand(schema): 583 | t = deepcopy(get_type(schema)) 584 | 585 | if 'format' in schema and 'date-time' == schema['format'] and STRING in t: 586 | t.remove(STRING) 587 | t.append('date-time') 588 | 589 | return _type_shorthand(t) 590 | -------------------------------------------------------------------------------- /target_postgres/singer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Module for Singer literals and helpers. 3 | ''' 4 | _PREFIX = '_sdc_' 5 | RECEIVED_AT = _PREFIX + 'received_at' 6 | BATCHED_AT = _PREFIX + 'batched_at' 7 | SEQUENCE = _PREFIX + 'sequence' 8 | TABLE_VERSION = _PREFIX + 'table_version' 9 | PK = _PREFIX + 'primary_key' 10 | SOURCE_PK_PREFIX = _PREFIX + 'source_key_' 11 | LEVEL_FMT = _PREFIX + 'level_{}_id' 12 | VALUE = _PREFIX + 'value' 13 | -------------------------------------------------------------------------------- /target_postgres/singer_stream.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import json 3 | import uuid 4 | 5 | import arrow 6 | from jsonschema import Draft4Validator, FormatChecker 7 | from jsonschema.exceptions import ValidationError 8 | 9 | from target_postgres import json_schema, singer 10 | from target_postgres.exceptions import SingerStreamError 11 | 12 | 13 | SINGER_RECEIVED_AT = '_sdc_received_at' 14 | SINGER_BATCHED_AT = '_sdc_batched_at' 15 | SINGER_SEQUENCE = '_sdc_sequence' 16 | SINGER_TABLE_VERSION = '_sdc_table_version' 17 | SINGER_PK = '_sdc_primary_key' 18 | SINGER_SOURCE_PK_PREFIX = '_sdc_source_key_' 19 | SINGER_LEVEL = '_sdc_level_{}_id' 20 | SINGER_VALUE = '_sdc_value' 21 | 22 | RAW_LINE_SIZE = '__raw_line_size' 23 | 24 | 25 | def get_line_size(line_data): 26 | return line_data.get(RAW_LINE_SIZE) or len(json.dumps(line_data)) 27 | 28 | 29 | class BufferedSingerStream(): 30 | def __init__(self, 31 | stream, 32 | schema, 33 | key_properties, 34 | *args, 35 | invalid_records_detect=None, 36 | invalid_records_threshold=None, 37 | max_rows=200000, 38 | max_buffer_size=104857600, # 100MB 39 | **kwargs): 40 | """ 41 | :param invalid_records_detect: Defaults to True when value is None 42 | :param invalid_records_threshold: Defaults to 0 when value is None 43 | """ 44 | self.schema = None 45 | self.key_properties = None 46 | self.validator 
= None 47 | self.update_schema(schema, key_properties) 48 | 49 | self.stream = stream 50 | self.invalid_records = [] 51 | self.max_rows = max_rows 52 | self.max_buffer_size = max_buffer_size 53 | 54 | self.invalid_records_detect = invalid_records_detect 55 | self.invalid_records_threshold = invalid_records_threshold 56 | 57 | if self.invalid_records_detect is None: 58 | self.invalid_records_detect = True 59 | if self.invalid_records_threshold is None: 60 | self.invalid_records_threshold = 0 61 | 62 | self.__buffer = [] 63 | self.__count = 0 64 | self.__size = 0 65 | self.__lifetime_max_version = None 66 | 67 | def update_schema(self, schema, key_properties): 68 | # In order to determine whether a value _is in_ properties _or not_ we need to flatten `$ref`s etc. 69 | self.schema = json_schema.simplify(schema) 70 | self.key_properties = deepcopy(key_properties) 71 | 72 | # The validator can handle _many_ more things than our simplified schema, and is, in general handled by third party code 73 | self.validator = Draft4Validator(schema, format_checker=FormatChecker()) 74 | 75 | properties = self.schema['properties'] 76 | 77 | if singer.RECEIVED_AT not in properties: 78 | properties[singer.RECEIVED_AT] = { 79 | 'type': ['null', 'string'], 80 | 'format': 'date-time' 81 | } 82 | 83 | if singer.SEQUENCE not in properties: 84 | properties[singer.SEQUENCE] = { 85 | 'type': ['null', 'integer'] 86 | } 87 | 88 | if singer.TABLE_VERSION not in properties: 89 | properties[singer.TABLE_VERSION] = { 90 | 'type': ['null', 'integer'] 91 | } 92 | 93 | if singer.BATCHED_AT not in properties: 94 | properties[singer.BATCHED_AT] = { 95 | 'type': ['null', 'string'], 96 | 'format': 'date-time' 97 | } 98 | 99 | if len(self.key_properties) == 0: 100 | self.use_uuid_pk = True 101 | self.key_properties = [singer.PK] 102 | properties[singer.PK] = { 103 | 'type': ['string'] 104 | } 105 | else: 106 | self.use_uuid_pk = False 107 | 108 | @property 109 | def count(self): 110 | return self.__count 111 | 112 | @property 113 | def buffer_full(self): 114 | if self.__count >= self.max_rows: 115 | return True 116 | 117 | if self.__count > 0: 118 | if self.__size >= self.max_buffer_size: 119 | return True 120 | 121 | return False 122 | 123 | @property 124 | def max_version(self): 125 | return self.__lifetime_max_version 126 | 127 | def __update_version(self, version): 128 | if version is None or (self.__lifetime_max_version is not None and self.__lifetime_max_version >= version): 129 | return None 130 | 131 | ## TODO: log warning about earlier records detected 132 | 133 | self.flush_buffer() 134 | self.__lifetime_max_version = version 135 | 136 | def add_record_message(self, record_message): 137 | add_record = True 138 | 139 | self.__update_version(record_message.get('version')) 140 | 141 | if self.__lifetime_max_version != record_message.get('version'): 142 | return None 143 | 144 | try: 145 | self.validator.validate(record_message['record']) 146 | except ValidationError as error: 147 | add_record = False 148 | self.invalid_records.append((error, record_message)) 149 | 150 | if add_record: 151 | self.__buffer.append(record_message) 152 | self.__size += get_line_size(record_message) 153 | self.__count += 1 154 | elif self.invalid_records_detect \ 155 | and len(self.invalid_records) >= self.invalid_records_threshold: 156 | raise SingerStreamError( 157 | 'Invalid records detected above threshold: {}. 
See `.args` for details.'.format( 158 | self.invalid_records_threshold), 159 | self.invalid_records) 160 | 161 | def peek_buffer(self): 162 | return self.__buffer 163 | 164 | def get_batch(self): 165 | current_time = arrow.get().format('YYYY-MM-DD HH:mm:ss.SSSSZZ') 166 | 167 | records = [] 168 | for record_message in self.peek_buffer(): 169 | record = record_message['record'] 170 | 171 | if 'version' in record_message: 172 | record[singer.TABLE_VERSION] = record_message['version'] 173 | 174 | if 'time_extracted' in record_message and record.get(singer.RECEIVED_AT) is None: 175 | record[singer.RECEIVED_AT] = record_message['time_extracted'] 176 | 177 | if self.use_uuid_pk and record.get(singer.PK) is None: 178 | record[singer.PK] = str(uuid.uuid4()) 179 | 180 | record[singer.BATCHED_AT] = current_time 181 | 182 | if 'sequence' in record_message: 183 | record[singer.SEQUENCE] = record_message['sequence'] 184 | else: 185 | record[singer.SEQUENCE] = arrow.get().int_timestamp 186 | 187 | records.append(record) 188 | 189 | return records 190 | 191 | def flush_buffer(self): 192 | _buffer = self.__buffer 193 | self.__buffer = [] 194 | self.__size = 0 195 | self.__count = 0 196 | return _buffer 197 | 198 | def peek_invalid_records(self): 199 | return self.invalid_records 200 | -------------------------------------------------------------------------------- /target_postgres/stream_tracker.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import json 3 | import singer.statediff as statediff 4 | import sys 5 | 6 | from target_postgres.exceptions import TargetError 7 | 8 | 9 | class StreamTracker: 10 | """ 11 | Object to track the BufferedSingerStream objects for each incoming stream to the target, and the STATE messages coming in. This object understands which streams need to be flushed before STATE messages can be safely emitted and does so. 12 | 13 | Because Singer taps don't have a standard way of expressing which streams correspond to which STATEs, the target can only safely 14 | emit a STATE message once all the records that came in prior to that STATE in the stream have been persisted. Because target-postgres buffers 15 | the records in BufferedSingerStreams, the STATE messages need to be delayed until all the records that came before them have been 16 | saved to the database from their buffers. 17 | """ 18 | 19 | def __init__(self, target, emit_states): 20 | self.target = target 21 | self.emit_states = emit_states 22 | 23 | self.streams = {} 24 | 25 | # dict of {'<stream_name>': number}, where the number is the message counter of the most recently received record for that stream. Will contain a value for all registered streams. 26 | self.stream_add_watermarks = {} 27 | 28 | # dict of {'<stream_name>': number}, where the number is the message counter of the most recently flushed record for that stream. Will contain a value for all registered streams.
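        # Illustrative example (added comment): after 100 messages, a stream's add watermark may be 100
        # while its flush watermark lags behind (e.g. 60) until its buffer is next written to Postgres.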
29 | self.stream_flush_watermarks = {} 30 | 31 | self.streams_added_to = set() # set of stream names which have seen records 32 | self.state_queue = deque() # contains dicts of {'state': <raw STATE message string>, 'watermark': number} 33 | self.message_counter = 0 34 | self.last_emitted_state = None 35 | 36 | def register_stream(self, stream, buffered_stream): 37 | self.streams[stream] = buffered_stream 38 | self.stream_flush_watermarks[stream] = 0 39 | 40 | def flush_stream(self, stream): 41 | self._write_batch_and_update_watermarks(stream) 42 | self._emit_safe_queued_states() 43 | 44 | def flush_streams(self, force=False): 45 | for (stream, stream_buffer) in self.streams.items(): 46 | if force or stream_buffer.buffer_full: 47 | self._write_batch_and_update_watermarks(stream) 48 | 49 | self._emit_safe_queued_states(force=force) 50 | 51 | def handle_state_message(self, line): 52 | if self.emit_states: 53 | self.state_queue.append({'state': line, 'watermark': self.message_counter}) 54 | self._emit_safe_queued_states() 55 | 56 | def handle_record_message(self, stream, line_data): 57 | if stream not in self.streams: 58 | raise TargetError('A record for stream {} was encountered before a corresponding schema'.format(stream)) 59 | 60 | self.message_counter += 1 61 | self.streams_added_to.add(stream) 62 | self.stream_add_watermarks[stream] = self.message_counter 63 | self.streams[stream].add_record_message(line_data) 64 | 65 | def _write_batch_and_update_watermarks(self, stream): 66 | stream_buffer = self.streams[stream] 67 | self.target.write_batch(stream_buffer) 68 | stream_buffer.flush_buffer() 69 | self.stream_flush_watermarks[stream] = self.stream_add_watermarks.get(stream, 0) 70 | 71 | def _emit_safe_queued_states(self, force=False): 72 | # State messages that occurred before the least recently flushed record are safe to emit. 73 | # If they occurred after some records that haven't yet been flushed, they aren't safe to emit. 74 | # Because records arrive at different rates from different streams, we take the earliest unflushed record 75 | # as the threshold for what STATE messages are safe to emit. We ignore the threshold of 0 for streams that 76 | # have been registered (via a SCHEMA message) but where no records have arrived yet.
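        # Illustrative example (added comment): if stream A has flushed through message 12 but stream B
        # only through message 7, the safe threshold is 7, so a STATE queued at watermark 9 is held back
        # until B's buffer is flushed past it.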
77 | valid_flush_watermarks = [] 78 | for stream, watermark in self.stream_flush_watermarks.items(): 79 | if stream in self.streams_added_to: 80 | valid_flush_watermarks.append(watermark) 81 | safe_flush_threshold = min(valid_flush_watermarks, default=0) 82 | 83 | # the STATE message that the target forwards 84 | emittable_state = None 85 | emittable_state_str = None 86 | while len(self.state_queue) > 0 and (force or self.state_queue[0]['watermark'] <= safe_flush_threshold): 87 | emittable_state_str = self.state_queue.popleft()['state'] 88 | 89 | if emittable_state_str is not None: 90 | emittable_state = json.loads(emittable_state_str)['value'] 91 | 92 | if emittable_state: 93 | if len(statediff.diff(emittable_state, self.last_emitted_state or {})) > 0: 94 | line = json.dumps(emittable_state) 95 | sys.stdout.write("{}\n".format(line)) 96 | sys.stdout.flush() 97 | 98 | self.last_emitted_state = emittable_state 99 | -------------------------------------------------------------------------------- /target_postgres/target_tools.py: -------------------------------------------------------------------------------- 1 | import http.client 2 | import io 3 | import json 4 | import pkg_resources 5 | import sys 6 | import threading 7 | import decimal 8 | import urllib.parse 9 | import singer 10 | from singer import utils, metadata, metrics 11 | 12 | from target_postgres import json_schema 13 | from target_postgres.exceptions import TargetError 14 | from target_postgres.singer_stream import BufferedSingerStream, RAW_LINE_SIZE 15 | from target_postgres.stream_tracker import StreamTracker 16 | 17 | LOGGER = singer.get_logger() 18 | 19 | 20 | def main(target): 21 | """ 22 | Given a target, stream stdin input as a text stream. 23 | :param target: object which implements `write_batch` and `activate_version` 24 | :return: None 25 | """ 26 | config = utils.parse_args([]).config 27 | input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 28 | stream_to_target(input_stream, target, config=config) 29 | 30 | return None 31 | 32 | 33 | def stream_to_target(stream, target, config={}): 34 | """ 35 | Persist `stream` to `target` with optional `config`. 36 | :param stream: iterator which represents a Singer data stream 37 | :param target: object which implements `write_batch` and `activate_version` 38 | :param config: [optional] configuration for buffers etc.
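        Illustrative keys read below include `max_batch_rows`, `max_batch_size`, `batch_detection_threshold`,
        `invalid_records_detect`, `invalid_records_threshold`, `state_support` and `disable_collection`.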
39 | :return: None 40 | """ 41 | 42 | state_support = config.get('state_support', True) 43 | state_tracker = StreamTracker(target, state_support) 44 | _run_sql_hook('before_run_sql', config, target) 45 | 46 | try: 47 | if not config.get('disable_collection', False): 48 | _async_send_usage_stats() 49 | 50 | invalid_records_detect = config.get('invalid_records_detect') 51 | invalid_records_threshold = config.get('invalid_records_threshold') 52 | max_batch_rows = config.get('max_batch_rows', 200000) 53 | max_batch_size = config.get('max_batch_size', 104857600) # 100MB 54 | batch_detection_threshold = config.get('batch_detection_threshold', max(max_batch_rows / 40, 50)) 55 | 56 | line_count = 0 57 | for line in stream: 58 | _line_handler(state_tracker, 59 | target, 60 | invalid_records_detect, 61 | invalid_records_threshold, 62 | max_batch_rows, 63 | max_batch_size, 64 | line 65 | ) 66 | if line_count > 0 and line_count % batch_detection_threshold == 0: 67 | state_tracker.flush_streams() 68 | line_count += 1 69 | 70 | state_tracker.flush_streams(force=True) 71 | _run_sql_hook('after_run_sql', config, target) 72 | 73 | return None 74 | 75 | except Exception as e: 76 | LOGGER.critical(e) 77 | raise e 78 | finally: 79 | _report_invalid_records(state_tracker.streams) 80 | 81 | 82 | def _report_invalid_records(streams): 83 | for stream_buffer in streams.values(): 84 | if stream_buffer.peek_invalid_records(): 85 | LOGGER.warning("Invalid records detected for stream {}: {}".format( 86 | stream_buffer.stream, 87 | stream_buffer.peek_invalid_records() 88 | )) 89 | 90 | 91 | def _line_handler(state_tracker, target, invalid_records_detect, invalid_records_threshold, max_batch_rows, 92 | max_batch_size, line): 93 | try: 94 | line_data = json.loads(line, parse_float=decimal.Decimal) 95 | except json.decoder.JSONDecodeError: 96 | LOGGER.error("Unable to parse JSON: {}".format(line)) 97 | raise 98 | 99 | if 'type' not in line_data: 100 | raise TargetError('`type` is a required key: {}'.format(line)) 101 | 102 | if line_data['type'] == 'SCHEMA': 103 | if 'stream' not in line_data: 104 | raise TargetError('`stream` is a required key: {}'.format(line)) 105 | 106 | stream = line_data['stream'] 107 | 108 | if 'schema' not in line_data: 109 | raise TargetError('`schema` is a required key: {}'.format(line)) 110 | 111 | schema = line_data['schema'] 112 | 113 | schema_validation_errors = json_schema.validation_errors(schema) 114 | if schema_validation_errors: 115 | raise TargetError('`schema` is an invalid JSON Schema instance: {}'.format(line), *schema_validation_errors) 116 | 117 | if 'key_properties' in line_data: 118 | key_properties = line_data['key_properties'] 119 | else: 120 | key_properties = None 121 | 122 | if stream not in state_tracker.streams: 123 | buffered_stream = BufferedSingerStream(stream, 124 | schema, 125 | key_properties, 126 | invalid_records_detect=invalid_records_detect, 127 | invalid_records_threshold=invalid_records_threshold) 128 | if max_batch_rows: 129 | buffered_stream.max_rows = max_batch_rows 130 | if max_batch_size: 131 | buffered_stream.max_buffer_size = max_batch_size 132 | 133 | state_tracker.register_stream(stream, buffered_stream) 134 | else: 135 | state_tracker.streams[stream].update_schema(schema, key_properties) 136 | elif line_data['type'] == 'RECORD': 137 | if 'stream' not in line_data: 138 | raise TargetError('`stream` is a required key: {}'.format(line)) 139 | 140 | line_data[RAW_LINE_SIZE] = len(line) 141 | state_tracker.handle_record_message(line_data['stream'], 
line_data) 142 | elif line_data['type'] == 'ACTIVATE_VERSION': 143 | if 'stream' not in line_data: 144 | raise TargetError('`stream` is a required key: {}'.format(line)) 145 | if 'version' not in line_data: 146 | raise TargetError('`version` is a required key: {}'.format(line)) 147 | if line_data['stream'] not in state_tracker.streams: 148 | raise TargetError('An ACTIVATE_VERSION for stream {} was encountered before a corresponding schema' 149 | .format(line_data['stream'])) 150 | 151 | stream_buffer = state_tracker.streams[line_data['stream']] 152 | state_tracker.flush_stream(line_data['stream']) 153 | target.activate_version(stream_buffer, line_data['version']) 154 | elif line_data['type'] == 'STATE': 155 | # pass the string instead of the deserialized object to save memory in the deque 156 | state_tracker.handle_state_message(line) 157 | else: 158 | raise TargetError('Unknown message type {} in message {}'.format( 159 | line_data['type'], 160 | line)) 161 | 162 | 163 | def _send_usage_stats(): 164 | try: 165 | version = pkg_resources.get_distribution('target-postgres').version 166 | # HTTPConnection is not a context manager; connect lazily via request() and close explicitly 167 | conn = http.client.HTTPConnection('collector.singer.io', timeout=10) 168 | params = {'e': 'se', 169 | 'aid': 'singer', 170 | 'se_ca': 'target-postgres', 171 | 'se_ac': 'open', 172 | 'se_la': version} 173 | conn.request('GET', '/i?' + urllib.parse.urlencode(params)) 174 | conn.getresponse() 175 | conn.close() 176 | except: 177 | LOGGER.debug('Collection request failed') 178 | 179 | 180 | def _async_send_usage_stats(): 181 | LOGGER.info('Sending version information to singer.io. ' + 182 | 'To disable sending anonymous usage data, set ' + 183 | 'the config parameter "disable_collection" to true') 184 | threading.Thread(target=_send_usage_stats).start() 185 | 186 | 187 | def _run_sql_hook(hook_name, config, target): 188 | if hook_name in config: 189 | with target.conn.cursor() as cur: 190 | cur.execute(config[hook_name]) 191 | LOGGER.debug('{} SQL executed'.format(hook_name)) 192 | 193 | hook_file = hook_name + '_file' 194 | if hook_file in config: 195 | with open(config[hook_file]) as f: 196 | with target.conn.cursor() as cur: 197 | cur.execute(f.read()) 198 | LOGGER.debug('{} SQL file executed'.format(hook_file)) 199 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append(os.path.join(os.path.dirname(__file__), 'utils')) 5 | -------------------------------------------------------------------------------- /tests/migrations/scripts/install_schema_versions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -x 3 | 4 | python -m venv /code/venv/target-postgres--schema0 5 | source /code/venv/target-postgres--schema0/bin/activate 6 | pip install "singer-target-postgres==0.1.2" 7 | deactivate 8 | 9 | python -m venv /code/venv/target-postgres--schema1 10 | source /code/venv/target-postgres--schema1/bin/activate 11 | pip install "singer-target-postgres==0.1.9" 12 | deactivate 13 | -------------------------------------------------------------------------------- /tests/migrations/scripts/to_latest.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -x 3 | 4 | cd /code 5 | 6 | source venv/target-postgres/bin/activate 7 | pip install -U pip 8 | /opt/poetry/bin/poetry install 9 | 10 | cat
tests/migrations/data/tap | target-postgres --config ${1} 11 | X="$?" 12 | 13 | deactivate 14 | 15 | exit ${X} 16 | -------------------------------------------------------------------------------- /tests/migrations/scripts/to_target.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -x 3 | 4 | cat /code/tests/migrations/data/tap | /code/venv/target-postgres--${1}/bin/target-postgres --config ${2} 5 | -------------------------------------------------------------------------------- /tests/migrations/test_migrations.py: -------------------------------------------------------------------------------- 1 | ''' 2 | If we assert that any upgrade to _latest_ from some older version should 3 | be met with chaining versions together... 4 | 5 | versions = [v0 v1 v2] 6 | 7 | v0 8 | v0 -> v1 9 | v1 10 | v1 -> v2 11 | v2 12 | ... 13 | vn-1 -> vn 14 | vn 15 | ''' 16 | 17 | from copy import deepcopy 18 | import json 19 | import os 20 | import pytest 21 | import subprocess 22 | 23 | import psycopg2 24 | from psycopg2 import sql 25 | 26 | from utils.fixtures import CONFIG, TEST_DB 27 | 28 | SCHEMA_PREFIX = "migration_testing__" 29 | FILE_PATH = "/code/tests/migrations/" 30 | 31 | 32 | def abs_path(relative_path): 33 | return FILE_PATH + relative_path 34 | 35 | 36 | def _cursor_list(cursor, idx=0): 37 | return [x[idx] for x in cursor.fetchall()] 38 | 39 | 40 | def list_schemas(): 41 | with psycopg2.connect(**TEST_DB) as conn: 42 | with conn.cursor() as cur: 43 | cur.execute( 44 | "SELECT schema_name FROM information_schema.schemata WHERE schema_name LIKE '{}%'".format( 45 | SCHEMA_PREFIX)) 46 | return _cursor_list(cur) 47 | 48 | 49 | def clear_schema(schema): 50 | with psycopg2.connect(**TEST_DB) as conn: 51 | with conn.cursor() as cur: 52 | cur.execute(sql.SQL( 53 | 'DROP SCHEMA IF EXISTS {} CASCADE;').format( 54 | sql.Identifier(schema))) 55 | 56 | 57 | def clear_db(): 58 | for schema in list_schemas(): 59 | clear_schema(schema) 60 | 61 | 62 | @pytest.fixture 63 | def db_cleanup(): 64 | clear_db() 65 | 66 | yield 67 | 68 | 69 | def create_schema(schema): 70 | name = SCHEMA_PREFIX + schema 71 | with psycopg2.connect(**TEST_DB) as conn: 72 | with conn.cursor() as cur: 73 | cur.execute(sql.SQL( 74 | 'CREATE SCHEMA IF NOT EXISTS {};').format( 75 | sql.Identifier(name))) 76 | 77 | return name 78 | 79 | 80 | def setup_config(version, psql_schema): 81 | os.makedirs(abs_path("artifacts"), exist_ok=True) 82 | 83 | config_path = abs_path("artifacts/config--{}.json".format(psql_schema)) 84 | 85 | if not os.path.exists(config_path): 86 | target_config = deepcopy(CONFIG) 87 | target_config['postgres_schema'] = psql_schema 88 | 89 | with open(config_path, 'w') as outfile: 90 | json.dump(target_config, outfile) 91 | 92 | return config_path 93 | 94 | 95 | def script_cmd(script, *args): 96 | cmd = [abs_path("scripts/{}.sh".format(script))] + list(args) 97 | 98 | p = subprocess.Popen(cmd) 99 | communication = p.communicate() 100 | if p.returncode: 101 | raise Exception(communication) 102 | 103 | return communication 104 | 105 | 106 | def tap_to_target(version, psql_schema): 107 | config_path = setup_config(version, psql_schema) 108 | 109 | if version == 'LATEST': 110 | return script_cmd("to_latest", config_path) 111 | 112 | return script_cmd("to_target", version, config_path) 113 | 114 | 115 | def _test_versions(versions): 116 | length = len(versions) 117 | for idx in range(length): 118 | version = versions[idx] 119 | if idx: 120 | prev_version = 
versions[idx - 1] 121 | schema = create_schema('{}_{}'.format(prev_version, version)) 122 | tap_to_target(prev_version, schema) 123 | tap_to_target(version, schema) 124 | 125 | schema = create_schema(version) 126 | tap_to_target(version, schema) 127 | 128 | 129 | def tables_in_schema(schema): 130 | with psycopg2.connect(**TEST_DB) as conn: 131 | with conn.cursor() as cur: 132 | cur.execute(sql.SQL( 133 | "SELECT table_name FROM information_schema.tables WHERE table_schema = {}" 134 | ).format(sql.Literal(schema))) 135 | return set(_cursor_list(cur)) 136 | 137 | 138 | def table_length(schema, table): 139 | with psycopg2.connect(**TEST_DB) as conn: 140 | with conn.cursor() as cur: 141 | cur.execute(sql.SQL( 142 | "SELECT count(*) FROM {}.{}" 143 | ).format( 144 | sql.Identifier(schema), 145 | sql.Identifier(table))) 146 | return cur.fetchone()[0] 147 | 148 | 149 | def assert_table_lengths_equal(schema_a, schema_b, table): 150 | assert table_length(schema_a, table) == table_length(schema_b, table), \ 151 | "Table {} in schemas {}, {} does not match in length".format(table, schema_a, schema_b) 152 | 153 | 154 | def assert_tables_equal(): 155 | schemas = list_schemas() 156 | tables = tables_in_schema(schemas[0]) 157 | 158 | for idx in range(1, len(schemas)): 159 | schema = schemas[idx] 160 | assert tables == tables_in_schema(schema), \ 161 | "Schema: {} differs from the rest. Processed {} of {}".format(schema, idx, len(schemas)) 162 | 163 | for table in tables: 164 | assert_table_lengths_equal(schemas[0], schema, table) 165 | 166 | 167 | def test(db_cleanup): 168 | _test_versions(['schema0', 'schema1', 'LATEST']) 169 | 170 | schemas = list_schemas() 171 | assert list_schemas(), "There should have been at least one generated schema..." 172 | 173 | tables = tables_in_schema(schemas[0]) 174 | assert tables, "There should have been at least one generated table..." 
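    # Illustrative note (added comment): each schema listed above was loaded either by a single
    # target version or by a chained pair of adjacent versions (e.g. schema0 then schema1), so the
    # assertion below checks that every schema ends up with the same tables and row counts.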
175 | 176 | assert_tables_equal() 177 | -------------------------------------------------------------------------------- /tests/unit/test_BufferedSingerStream.py: -------------------------------------------------------------------------------- 1 | from decimal import Decimal 2 | from copy import deepcopy 3 | 4 | import pytest 5 | 6 | from target_postgres import singer 7 | from target_postgres.singer_stream import BufferedSingerStream, SingerStreamError, RAW_LINE_SIZE 8 | 9 | from utils.fixtures import CatStream, InvalidCatStream, CATS_SCHEMA 10 | 11 | 12 | def missing_sdc_properties(stream_buffer): 13 | errors = [] 14 | for p in [singer.BATCHED_AT, singer.RECEIVED_AT, singer.SEQUENCE, singer.TABLE_VERSION]: 15 | if not p in stream_buffer.schema['properties']: 16 | errors.append({'_sdc': p, 17 | 'message': '`_sdc` missing'}) 18 | 19 | return errors 20 | 21 | 22 | def test_init(): 23 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 24 | CATS_SCHEMA['schema'], 25 | CATS_SCHEMA['key_properties']) 26 | 27 | assert singer_stream 28 | assert [] == missing_sdc_properties(singer_stream) 29 | 30 | 31 | def test_init__empty_key_properties(): 32 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 33 | CATS_SCHEMA['schema'], 34 | []) 35 | 36 | stream = CatStream(100) 37 | for _ in range(20): 38 | singer_stream.add_record_message(stream.generate_record_message()) 39 | 40 | assert singer_stream 41 | assert [] == missing_sdc_properties(singer_stream) 42 | assert [singer.PK] == singer_stream.key_properties 43 | 44 | rows_missing_pk = [] 45 | rows_checked = 0 46 | for r in singer_stream.get_batch(): 47 | if not r[singer.PK]: 48 | rows_missing_pk.append(r) 49 | 50 | rows_checked += 1 51 | 52 | assert rows_checked > 1 53 | assert [] == rows_missing_pk 54 | 55 | 56 | def test_add_record_message(): 57 | stream = CatStream(10) 58 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 59 | CATS_SCHEMA['schema'], 60 | CATS_SCHEMA['key_properties']) 61 | assert singer_stream.add_record_message(stream.generate_record_message()) is None 62 | assert not singer_stream.peek_invalid_records() 63 | assert [] == missing_sdc_properties(singer_stream) 64 | 65 | 66 | def test_add_record_message__invalid_record(): 67 | stream = InvalidCatStream(10) 68 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 69 | CATS_SCHEMA['schema'], 70 | CATS_SCHEMA['key_properties']) 71 | with pytest.raises(SingerStreamError): 72 | singer_stream.add_record_message(stream.generate_record_message()) 73 | 74 | assert singer_stream.peek_invalid_records() 75 | assert singer_stream.count == 0 76 | assert [] == missing_sdc_properties(singer_stream) 77 | 78 | 79 | SIMPLE_MULTIPLE_OF_VALID_SCHEMA = { 80 | 'properties': { 81 | 'multipleOfKey': { 82 | 'type': 'number', 83 | 'multipleOf': Decimal('1e-15') 84 | } 85 | } 86 | } 87 | 88 | SIMPLE_MULTIPLE_OF_INVALID_SCHEMA = { 89 | 'properties': { 90 | 'multipleOfKey': { 91 | 'type': 'number', 92 | 'multipleOf': 1e-15 93 | } 94 | } 95 | } 96 | 97 | def test_add_record_message__multipleOf(): 98 | stream_name = 'test' 99 | singer_stream = BufferedSingerStream(stream_name, 100 | deepcopy(SIMPLE_MULTIPLE_OF_VALID_SCHEMA), 101 | []) 102 | 103 | multiple_of_values = ['1', '2', '3', '4', '5', '1.1', '2.3', '1.23456789', '20', '100.1'] 104 | 105 | for value in multiple_of_values: 106 | singer_stream.add_record_message( 107 | { 108 | 'type': 'RECORD', 109 | 'stream': stream_name, 110 | 'record': {'multipleOfKey': Decimal(value)}, 111 | 'sequence': 0, 112 | RAW_LINE_SIZE: 100 113 | 
} 114 | ) 115 | 116 | assert not singer_stream.peek_invalid_records() 117 | assert singer_stream.count == len(multiple_of_values) 118 | 119 | 120 | def test_add_record_message__multipleOf_invalid_record(): 121 | stream_name = 'test' 122 | singer_stream = BufferedSingerStream(stream_name, 123 | deepcopy(SIMPLE_MULTIPLE_OF_INVALID_SCHEMA), 124 | []) 125 | 126 | multiple_of_values = [1, 2] 127 | 128 | for value in multiple_of_values: 129 | with pytest.raises(SingerStreamError): 130 | singer_stream.add_record_message( 131 | { 132 | 'type': 'RECORD', 133 | 'stream': stream_name, 134 | 'record': {'multipleOfKey': value}, 135 | 'sequence': 0, 136 | RAW_LINE_SIZE: 100 137 | } 138 | ) 139 | 140 | assert singer_stream.peek_invalid_records() 141 | assert singer_stream.count == 0 142 | 143 | 144 | SIMPLE_ALLOF_SCHEMA = { 145 | 'type': 'object', 146 | 'properties': { 147 | 'allOfKey': { 148 | 'allOf': [ 149 | { 'type': ['string'] }, 150 | { 'maxLength': 5 } 151 | ]}}} 152 | 153 | 154 | def test_add_record_message__allOf(): 155 | stream_name = 'test' 156 | singer_stream = BufferedSingerStream(stream_name, 157 | deepcopy(SIMPLE_ALLOF_SCHEMA), 158 | []) 159 | 160 | strs_shorter_than_6 = [ 161 | 'hello', 162 | 'I', 163 | 'am', 164 | 'a set', 165 | 'of', 166 | 'short', 167 | 'strs' 168 | ] 169 | 170 | for string in strs_shorter_than_6: 171 | singer_stream.add_record_message( 172 | { 173 | 'type': 'RECORD', 174 | 'stream': stream_name, 175 | 'record': {'allOfKey': string}, 176 | 'sequence': 0 177 | } 178 | ) 179 | 180 | assert not singer_stream.peek_invalid_records() 181 | assert singer_stream.count == len(strs_shorter_than_6) 182 | assert [] == missing_sdc_properties(singer_stream) 183 | 184 | 185 | def test_add_record_message__allOf__invalid_record(): 186 | stream_name = 'test' 187 | singer_stream = BufferedSingerStream(stream_name, 188 | deepcopy(SIMPLE_ALLOF_SCHEMA), 189 | []) 190 | 191 | with pytest.raises(SingerStreamError): 192 | singer_stream.add_record_message( 193 | { 194 | 'type': 'RECORD', 195 | 'stream': stream_name, 196 | 'record': {'allOfKey': 'this is a string which is much too long to be allowed'}, 197 | 'sequence': 0 198 | } 199 | ) 200 | 201 | assert singer_stream.peek_invalid_records() 202 | assert singer_stream.count == 0 203 | assert [] == missing_sdc_properties(singer_stream) 204 | 205 | 206 | def test_add_record_message__allOf__impossible_schema(): 207 | stream_name = 'test' 208 | 209 | schema = deepcopy(SIMPLE_ALLOF_SCHEMA) 210 | schema['properties']['allOfKey']['allOf'].append({'type': ['number']}) 211 | 212 | singer_stream = BufferedSingerStream(stream_name, 213 | schema, 214 | []) 215 | 216 | 217 | with pytest.raises(SingerStreamError): 218 | singer_stream.add_record_message( 219 | { 220 | 'type': 'RECORD', 221 | 'stream': stream_name, 222 | 'record': {'allOfKey': 'short'}, 223 | 'sequence': 0 224 | } 225 | ) 226 | with pytest.raises(SingerStreamError): 227 | singer_stream.add_record_message( 228 | { 229 | 'type': 'RECORD', 230 | 'stream': stream_name, 231 | 'record': {'allOfKey': 314159}, 232 | 'sequence': 0 233 | } 234 | ) 235 | 236 | assert singer_stream.peek_invalid_records() 237 | assert singer_stream.count == 0 238 | assert [] == missing_sdc_properties(singer_stream) 239 | 240 | 241 | def test_add_record_message__invalid_record__detection_off(): 242 | stream = InvalidCatStream(10) 243 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 244 | CATS_SCHEMA['schema'], 245 | CATS_SCHEMA['key_properties'], 246 | invalid_records_detect=False) 247 | 248 | 
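    # With `invalid_records_detect=False`, the invalid record below does not raise
    # SingerStreamError; it is only reported via `peek_invalid_records()`, and the
    # buffered record count stays at zero, as the assertions that follow verify.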
singer_stream.add_record_message(stream.generate_record_message()) 249 | 250 | assert singer_stream.peek_invalid_records() 251 | assert singer_stream.count == 0 252 | assert [] == missing_sdc_properties(singer_stream) 253 | 254 | 255 | def test_add_record_message__invalid_record__cross_threshold(): 256 | stream = InvalidCatStream(10) 257 | 258 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 259 | CATS_SCHEMA['schema'], 260 | CATS_SCHEMA['key_properties'], 261 | invalid_records_threshold=3) 262 | 263 | singer_stream.add_record_message(stream.generate_record_message()) 264 | singer_stream.add_record_message(stream.generate_record_message()) 265 | 266 | with pytest.raises(SingerStreamError): 267 | singer_stream.add_record_message(stream.generate_record_message()) 268 | 269 | assert singer_stream.peek_invalid_records() 270 | assert singer_stream.count == 0 271 | assert [] == missing_sdc_properties(singer_stream) 272 | 273 | 274 | def mocked_mock_write_batch(stream_buffer): 275 | stream_buffer.flush_buffer() 276 | 277 | 278 | def test_multiple_batches__by_rows(): 279 | stream = CatStream(100) 280 | 281 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 282 | CATS_SCHEMA['schema'], 283 | CATS_SCHEMA['key_properties'], 284 | max_rows=20) 285 | 286 | assert len(singer_stream.peek_buffer()) == 0 287 | 288 | while not singer_stream.buffer_full: 289 | singer_stream.add_record_message(stream.generate_record_message()) 290 | 291 | assert len(singer_stream.peek_buffer()) == 20 292 | assert [] == missing_sdc_properties(singer_stream) 293 | 294 | singer_stream.flush_buffer() 295 | 296 | assert len(singer_stream.peek_buffer()) == 0 297 | 298 | 299 | def test_multiple_batches__by_memory(): 300 | stream = CatStream(100) 301 | 302 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 303 | CATS_SCHEMA['schema'], 304 | CATS_SCHEMA['key_properties'], 305 | max_buffer_size=10) 306 | 307 | assert len(singer_stream.peek_buffer()) == 0 308 | 309 | while not singer_stream.buffer_full: 310 | singer_stream.add_record_message(stream.generate_record_message()) 311 | 312 | assert len(singer_stream.peek_buffer()) == 1 313 | assert [] == missing_sdc_properties(singer_stream) 314 | 315 | singer_stream.flush_buffer() 316 | 317 | assert len(singer_stream.peek_buffer()) == 0 318 | 319 | 320 | def test_multiple_batches__old_records__by_rows(): 321 | stream_oldest = CatStream(100, version=0) 322 | stream_middle_aged = CatStream(100, version=5) 323 | stream_latest = CatStream(100, version=10) 324 | 325 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 326 | CATS_SCHEMA['schema'], 327 | CATS_SCHEMA['key_properties'], 328 | max_rows=20) 329 | 330 | assert len(singer_stream.peek_buffer()) == 0 331 | 332 | while not singer_stream.buffer_full: 333 | singer_stream.add_record_message(stream_oldest.generate_record_message()) 334 | 335 | assert len(singer_stream.peek_buffer()) == 20 336 | 337 | singer_stream.flush_buffer() 338 | 339 | assert len(singer_stream.peek_buffer()) == 0 340 | 341 | singer_stream.add_record_message(stream_latest.generate_record_message()) 342 | 343 | assert len(singer_stream.peek_buffer()) == 1 344 | 345 | reasonable_cutoff = 1000 346 | while not singer_stream.buffer_full and reasonable_cutoff != 0: 347 | singer_stream.add_record_message(stream_middle_aged.generate_record_message()) 348 | reasonable_cutoff -= 1 349 | 350 | assert reasonable_cutoff == 0 351 | assert len(singer_stream.peek_buffer()) == 1 352 | assert [] == missing_sdc_properties(singer_stream) 353 | 354 | 
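# The batching tests above and below share one consumption pattern: add record
# messages until `buffer_full`, persist whatever `peek_buffer()` returns, then
# `flush_buffer()`. A minimal sketch of that loop, using only methods exercised in
# these tests; the `persist` callable is hypothetical and not part of this package.
def _drain_buffered_stream(record_messages, singer_stream, persist):
    for message in record_messages:
        singer_stream.add_record_message(message)
        if singer_stream.buffer_full:
            # hand the buffered rows to the (hypothetical) sink, then reset the buffer
            persist(singer_stream.peek_buffer())
            singer_stream.flush_buffer()
    # flush whatever remains once the input is exhausted
    if singer_stream.peek_buffer():
        persist(singer_stream.peek_buffer())
        singer_stream.flush_buffer()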
355 | def test_multiple_batches__old_records__by_memory(): 356 | stream_oldest = CatStream(100, version=0) 357 | stream_middle_aged = CatStream(100, version=5) 358 | stream_latest = CatStream(100, version=10) 359 | 360 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 361 | CATS_SCHEMA['schema'], 362 | CATS_SCHEMA['key_properties'], 363 | max_buffer_size=32768) 364 | 365 | assert len(singer_stream.peek_buffer()) == 0 366 | 367 | while not singer_stream.buffer_full: 368 | singer_stream.add_record_message(stream_oldest.generate_record_message()) 369 | 370 | assert len(singer_stream.peek_buffer()) > 0 371 | assert [] == missing_sdc_properties(singer_stream) 372 | 373 | singer_stream.flush_buffer() 374 | 375 | assert len(singer_stream.peek_buffer()) == 0 376 | 377 | singer_stream.add_record_message(stream_latest.generate_record_message()) 378 | 379 | assert len(singer_stream.peek_buffer()) == 1 380 | 381 | reasonable_cutoff = 1000 382 | while not singer_stream.buffer_full and reasonable_cutoff != 0: 383 | singer_stream.add_record_message(stream_middle_aged.generate_record_message()) 384 | reasonable_cutoff -= 1 385 | 386 | assert reasonable_cutoff == 0 387 | assert len(singer_stream.peek_buffer()) == 1 388 | assert [] == missing_sdc_properties(singer_stream) 389 | -------------------------------------------------------------------------------- /tests/unit/test_denest.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | from chance import chance 5 | 6 | from target_postgres import denest, json_schema, singer 7 | 8 | 9 | def non_path_properties(table_batch): 10 | errors = [] 11 | for p in table_batch['streamed_schema']['schema']['properties']: 12 | if not isinstance(p, tuple): 13 | errors.append({'x': p, 14 | 'message': '`x` is not a `tuple`'}) 15 | 16 | return errors 17 | 18 | 19 | def missing_key_properties(table_batch): 20 | errors = [] 21 | for p in table_batch['streamed_schema']['key_properties']: 22 | if not (p,) in table_batch['streamed_schema']['schema']['properties']: 23 | errors.append({'path': tuple(p), 24 | 'message': 'key_property missing'}) 25 | 26 | for p, s in table_batch['streamed_schema']['schema']['properties'].items(): 27 | if not json_schema.is_anyof(s): 28 | errors.append({ 29 | 'path': tuple(p), 30 | 'message': 'Expected anyOf json schema for propery schema, got: {}'.format(s) 31 | }) 32 | 33 | return errors 34 | 35 | 36 | def errors(table_batch): 37 | return non_path_properties(table_batch) + missing_key_properties(table_batch) 38 | 39 | 40 | def error_check_denest(schema, key_properties, records): 41 | denested = denest.to_table_batches(schema, key_properties, records) 42 | 43 | for table_batch in denested: 44 | assert [] == errors(table_batch) 45 | 46 | return denested 47 | 48 | 49 | def test_empty(): 50 | denested = error_check_denest({}, [], []) 51 | assert 1 == len(denested) 52 | assert [] == denested[0]['records'] 53 | assert [] == denested[0]['streamed_schema']['key_properties'] 54 | 55 | 56 | def test__schema__objects_add_fields(): 57 | denested = error_check_denest({'properties': 58 | {'a': {'type': 'integer'}, 59 | 'b': {'type': 'object', 60 | 'properties': { 61 | 'c': {'type': 'string'}, 62 | 'd': {'type': 'boolean'}}}}}, 63 | ['a'], 64 | []) 65 | 66 | assert 1 == len(denested) 67 | assert ('b', 'c') in denested[0]['streamed_schema']['schema']['properties'] 68 | assert ('b', 'd') in denested[0]['streamed_schema']['schema']['properties'] 69 | 70 | 71 | def 
random_object_schema(): 72 | length_of_path = random.randint(1, 50) 73 | path = [] 74 | schema = {'type': chance.pickone([json_schema.BOOLEAN, 75 | json_schema.INTEGER, 76 | json_schema.NUMBER, 77 | json_schema.STRING])} 78 | for _ in range(0, length_of_path): 79 | field = chance.string(pool='', length=0) 80 | schema = {'type': json_schema.OBJECT, 81 | 'properties': {field: schema}} 82 | path.append(field) 83 | 84 | return {'schema': schema, 85 | 'path': path[::-1]} 86 | 87 | 88 | def test__schema__nested_objects_add_fields(): 89 | for _ in range(0, 100): 90 | r = random_object_schema() 91 | denested = error_check_denest(r['schema'], 92 | [], 93 | []) 94 | 95 | print('r:', r) 96 | print() 97 | print('denested:', denested) 98 | 99 | assert 1 == len(denested) 100 | assert tuple(r['path']) in denested[0]['streamed_schema']['schema']['properties'] 101 | 102 | 103 | def test__schema__arrays_add_tables(): 104 | denested = error_check_denest({'properties': 105 | {'a': {'type': 'integer'}, 106 | 'b': {'type': 'array', 107 | 'items': {'properties': { 108 | 'c': {'type': 'string'}, 109 | 'd': {'type': 'boolean'}}}}}}, 110 | ['a'], 111 | []) 112 | assert 2 == len(denested) 113 | 114 | 115 | def random_array_schema(): 116 | length_of_path = random.randint(1, 50) 117 | path = [] 118 | schema = {'type': json_schema.ARRAY, 119 | 'items': {'type': chance.pickone([json_schema.BOOLEAN, 120 | json_schema.INTEGER, 121 | json_schema.NUMBER, 122 | json_schema.STRING])}} 123 | for _ in range(0, length_of_path): 124 | field = chance.string(pool='', length=0) 125 | schema = {'type': json_schema.ARRAY, 126 | 'items': {'type': json_schema.OBJECT, 127 | 'properties': {field: schema}}} 128 | path.append(field) 129 | 130 | schema = {'type': json_schema.OBJECT, 131 | 'properties': { 132 | 'root': schema}} 133 | path.append('root') 134 | 135 | return {'schema': schema, 136 | 'path': path[::-1]} 137 | 138 | 139 | def test__schema__nested_arrays_add_tables(): 140 | for _ in range(0, 100): 141 | r = random_array_schema() 142 | denested = error_check_denest(r['schema'], 143 | [], 144 | []) 145 | 146 | print('r:', r) 147 | print() 148 | print('denested:', denested) 149 | 150 | assert len(r['path']) + 1 == len(denested) 151 | 152 | table_path_accum = [] 153 | tables_checked = 0 154 | while True: 155 | found_table = False 156 | 157 | print('looking for a table with path:', table_path_accum) 158 | 159 | for table_batch in denested: 160 | if tuple(table_path_accum) == table_batch['streamed_schema']['path']: 161 | found_table = True 162 | break 163 | 164 | assert found_table 165 | print('...table found') 166 | 167 | tables_checked += 1 168 | 169 | if len(table_path_accum) == len(r['path']): 170 | break 171 | 172 | table_path_accum.append(r['path'][len(table_path_accum)]) 173 | 174 | ## Assert that we looked for every table path 175 | assert tables_checked == len(denested) 176 | 177 | 178 | NESTED_SCHEMA = { 179 | "properties": { 180 | "a": {"type": "object", 181 | "properties": { 182 | "b": { 183 | "type": "array", 184 | "items": { 185 | "type": "object", 186 | "properties": { 187 | "c": { 188 | "type": "object", 189 | "properties": { 190 | "d": {"type": "integer"}, 191 | "e": {"type": "array", 192 | "items": {"type": "object", 193 | "properties": { 194 | "f": {"type": "string"}, 195 | "g": {"type": "boolean"}}}}}}}}}}}}} 196 | 197 | NESTED_RECORDS = [{"a": {"b": []}}, 198 | {"a": {"b": [{"c": {"d": 1}}]}}, 199 | {"a": {"b": [{"c": {"d": 12}}, 200 | {"c": {"d": 123}}]}}, 201 | {"a": {"b": [{"c": {"d": 1234}}, 202 | {"c": {"d": 
12345}}, 203 | {"c": {"d": 123456}}]}}, 204 | {"a": {"b": [{"c": {"e": [{"f": "hello", 205 | "g": True}, 206 | {"f": "goodbye", 207 | "g": True}]}}]}}] 208 | 209 | 210 | def test__records__nested__tables(): 211 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 212 | 213 | print('denested:', denested) 214 | 215 | assert 3 == len(denested) 216 | 217 | for table_batch in denested: 218 | assert table_batch['streamed_schema']['path'] in \ 219 | {tuple(), 220 | ('a', 'b'), 221 | ('a', 'b', 'c', 'e')} 222 | 223 | 224 | def _get_table_batch_with_path(table_batches, path): 225 | for table_batch in table_batches: 226 | if path == table_batch['streamed_schema']['path']: 227 | return table_batch 228 | raise Exception('Could not find table_batch with path: {}'.format(path)) 229 | 230 | 231 | def test__records__nested__root_empty(): 232 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 233 | table_batch = _get_table_batch_with_path(denested, 234 | tuple()) 235 | 236 | assert {} == table_batch['streamed_schema']['schema']['properties'] 237 | 238 | assert 5 == len(table_batch['records']) 239 | 240 | for record in table_batch['records']: 241 | assert {} == record 242 | 243 | 244 | def test__records__nested__child_table__a_b(): 245 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 246 | table_batch = _get_table_batch_with_path(denested, 247 | ('a', 'b')) 248 | 249 | assert 1 == len(table_batch['streamed_schema']['schema']['properties'][('c', 'd')]['anyOf']) 250 | assert {'type': ['integer']} == table_batch['streamed_schema']['schema']['properties'][('c', 'd')]['anyOf'][0] 251 | 252 | assert 7 == len(table_batch['records']) 253 | 254 | for record in table_batch['records']: 255 | # Don't try to access key "('c', 'd')" if record is empty 256 | if record == {}: 257 | continue 258 | assert 'integer' == record[('c', 'd')][0] 259 | assert int == type(record[('c', 'd')][1]) 260 | 261 | 262 | def test__records__nested__child_table__a_b_c_e(): 263 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 264 | table_batch = _get_table_batch_with_path(denested, 265 | ('a', 'b', 'c', 'e')) 266 | 267 | assert 1 == len(table_batch['streamed_schema']['schema']['properties'][('f',)]['anyOf']) 268 | assert {'type': ['string']} == table_batch['streamed_schema']['schema']['properties'][('f',)]['anyOf'][0] 269 | assert 1 == len(table_batch['streamed_schema']['schema']['properties'][('g',)]['anyOf']) 270 | assert {'type': ['boolean']} == table_batch['streamed_schema']['schema']['properties'][('g',)]['anyOf'][0] 271 | 272 | assert 2 == len(table_batch['records']) 273 | 274 | for record in table_batch['records']: 275 | assert 'string' == record[('f',)][0] 276 | assert str == type(record[('f',)][1]) 277 | 278 | assert 'boolean' == record[('g',)][0] 279 | assert bool == type(record[('g',)][1]) 280 | 281 | 282 | def test__anyOf__schema__stitch_date_times(): 283 | denested = error_check_denest( 284 | {'properties': { 285 | 'a': { 286 | "anyOf": [ 287 | { 288 | "type": "string", 289 | "format": "date-time" 290 | }, 291 | {"type": ["string", "null"]}]}}}, 292 | [], 293 | []) 294 | table_batch = _get_table_batch_with_path(denested, tuple()) 295 | 296 | anyof_schemas = table_batch['streamed_schema']['schema']['properties'][('a',)]['anyOf'] 297 | 298 | assert 2 == len(anyof_schemas) 299 | assert 2 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 300 | assert 2 == len([x for x in anyof_schemas if json_schema.is_nullable(x)]) 301 | assert 1 == len([x for x in 
anyof_schemas if json_schema.is_datetime(x)]) 302 | 303 | def test__anyOf__schema__implicit_any_of(): 304 | denested = error_check_denest( 305 | { 306 | 'properties': { 307 | 'every_type': { 308 | 'type': ['integer', 'null', 'number', 'boolean', 'string', 'array', 'object'], 309 | 'items': {'type': 'integer'}, 310 | 'format': 'date-time', 311 | 'properties': { 312 | 'i': {'type': 'integer'}, 313 | 'n': {'type': 'number'}, 314 | 'b': {'type': 'boolean'} 315 | } 316 | } 317 | } 318 | }, 319 | [], 320 | []) 321 | assert 2 == len(denested) 322 | 323 | table_batch = _get_table_batch_with_path(denested, tuple()) 324 | denested_props = table_batch['streamed_schema']['schema']['properties'] 325 | 326 | assert 4 == len(denested_props) 327 | 328 | anyof_schemas = denested_props[('every_type',)]['anyOf'] 329 | 330 | assert 4 == len(anyof_schemas) 331 | assert 4 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 332 | assert 4 == len([x for x in anyof_schemas if json_schema.is_nullable(x)]) 333 | assert 1 == len([x for x in anyof_schemas if json_schema.is_datetime(x)]) 334 | 335 | 336 | def test__anyOf__schema__implicit_any_of__arrays(): 337 | denested = error_check_denest( 338 | { 339 | 'properties': { 340 | 'every_type': { 341 | 'type': ['null', 'string', 'array', 'object'], 342 | 'items': { 343 | 'anyOf': [ 344 | {'type': 'integer'}, 345 | {'type': 'number'}] 346 | }, 347 | 'format': 'date-time', 348 | 'properties': { 349 | 'i': {'type': 'integer'} 350 | } 351 | } 352 | } 353 | }, 354 | [], 355 | []) 356 | assert 2 == len(denested) 357 | 358 | table_batch = _get_table_batch_with_path(denested, ('every_type',)) 359 | denested_props = table_batch['streamed_schema']['schema']['properties'] 360 | anyof_schemas = denested_props[(singer.VALUE,)]['anyOf'] 361 | 362 | assert 2 == len(anyof_schemas) 363 | assert 2 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 364 | 365 | 366 | def test__anyOf__schema__implicit_any_of__objects(): 367 | denested = error_check_denest( 368 | { 369 | 'properties': { 370 | 'every_type': { 371 | 'type': ['integer', 'null', 'number', 'boolean', 'string', 'array', 'object'], 372 | 'items': {'type': 'integer'}, 373 | 'format': 'date-time', 374 | 'properties': { 375 | 'i': {'anyOf': [ 376 | {'type': 'integer'}, 377 | {'type': 'number'}, 378 | {'type': 'boolean'}] 379 | } 380 | } 381 | } 382 | } 383 | }, 384 | [], 385 | []) 386 | assert 2 == len(denested) 387 | 388 | table_batch = _get_table_batch_with_path(denested, tuple()) 389 | denested_props = table_batch['streamed_schema']['schema']['properties'] 390 | print(denested_props) 391 | anyof_schemas = denested_props[('every_type', 'i')]['anyOf'] 392 | 393 | assert 3 == len(anyof_schemas) 394 | assert 3 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 395 | -------------------------------------------------------------------------------- /tests/unit/test_sandbox.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import psycopg2 4 | import psycopg2.extras 5 | import pytest 6 | 7 | from utils.fixtures import CONFIG, db_cleanup, ListStream, TEST_DB 8 | from target_postgres import main 9 | 10 | 11 | def assert_tables_equal(cursor, expected_table_names): 12 | cursor.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'") 13 | tables = [] 14 | for table in cursor.fetchall(): 15 | tables.append(table[0]) 16 | 17 | assert (not tables and not expected_table_names) \ 18 | or set(tables) == expected_table_names 
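# All of the helpers in this module inspect `information_schema` for the `public`
# schema: `assert_tables_equal` above compares table names, while
# `assert_columns_equal` and `assert_count_equal` below compare
# (column_name, data_type, is_nullable) tuples and row counts respectively.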
19 | 20 | 21 | def assert_columns_equal(cursor, table_name, expected_column_tuples): 22 | cursor.execute("SELECT column_name, data_type, is_nullable FROM information_schema.columns " + \ 23 | "WHERE table_schema = 'public' and table_name = '{}';".format( 24 | table_name)) 25 | columns = cursor.fetchall() 26 | 27 | assert (not columns and not expected_column_tuples) \ 28 | or set(columns) == expected_column_tuples 29 | 30 | 31 | def assert_count_equal(cursor, table_name, n): 32 | cursor.execute('SELECT count(*) FROM "public"."{}"'.format(table_name)) 33 | assert cursor.fetchone()[0] == n 34 | 35 | 36 | class BigCommerceStream(ListStream): 37 | stream = [ 38 | {"type": "SCHEMA", 39 | "stream": "products", 40 | "schema": { 41 | "type": "object", 42 | "properties": {"id": {"type": "integer"}, 43 | "name": {"type": ["null", 44 | "string"]}, 45 | "type": {"type": ["null", 46 | "string"]}, 47 | "sku": {"type": ["null", 48 | "string"]}, 49 | "description": {"type": ["null", 50 | "string"]}, 51 | "weight": {"type": ["null", 52 | "integer"]}, 53 | "width": {"type": ["null", 54 | "integer"]}, 55 | "depth": {"type": ["null", 56 | "integer"]}, 57 | "height": {"type": ["null", 58 | "integer"]}, 59 | "price": {"type": ["null", 60 | "integer", 61 | "number"]}, 62 | "cost_price": {"type": ["null", 63 | "integer"]}, 64 | "retail_price": {"type": ["null", 65 | "integer"]}, 66 | "sale_price": {"type": ["null", 67 | "integer"]}, 68 | "map_price": {"type": ["null", 69 | "integer"]}, 70 | "tax_class_id": {"type": ["null", 71 | "integer"]}, 72 | "product_tax_code": {"type": ["null", 73 | "string"]}, 74 | "calculated_price": {"type": ["null", 75 | "integer", 76 | "number"]}, 77 | "categories": {"type": ["null", 78 | "array"], 79 | "items": {"type": ["null", 80 | "integer"]}}, 81 | "brand_id": {"type": ["null", 82 | "integer"]}, 83 | "option_set_id": {"type": ["null", 84 | "integer"]}, 85 | "option_set_display": {"type": ["null", 86 | "string"]}, 87 | "inventory_level": {"type": ["null", 88 | "integer"]}, 89 | "inventory_warning_level": {"type": ["null", 90 | "integer"]}, 91 | "inventory_tracking": {"type": ["null", 92 | "string"]}, 93 | "reviews_rating_sum": {"type": ["null", 94 | "integer"]}, 95 | "reviews_count": {"type": ["null", 96 | "integer"]}, 97 | "total_sold": {"type": ["null", 98 | "integer"]}, 99 | "fixed_cost_shipping_price": {"type": ["null", 100 | "integer"]}, 101 | "is_free_shipping": {"type": ["null", 102 | "boolean"]}, 103 | "is_visible": {"type": ["null", 104 | "boolean"]}, 105 | "is_featured": {"type": ["null", 106 | "boolean"]}, 107 | "related_products": {"type": ["null", 108 | "array"], 109 | "items": {"type": ["null", 110 | "integer"]}}, 111 | "warranty": {"type": ["null", 112 | "string"]}, 113 | "bin_picking_number": {"type": ["null", 114 | "string"]}, 115 | "layout_file": {"type": ["null", 116 | "string"]}, 117 | "upc": {"type": ["null", 118 | "string"]}, 119 | "mpn": {"type": ["null", 120 | "string"]}, 121 | "gtin": {"type": ["null", 122 | "string"]}, 123 | "search_keywords": {"type": ["null", 124 | "string"]}, 125 | "availability": {"type": ["null", 126 | "string"]}, 127 | "availability_description": {"type": ["null", 128 | "string"]}, 129 | "gift_wrapping_options_type": {"type": ["null", 130 | "string"]}, 131 | "sort_order": {"type": ["null", 132 | "integer"]}, 133 | "condition": {"type": ["null", 134 | "string"]}, 135 | "is_condition_shown": {"type": ["null", 136 | "boolean"]}, 137 | "order_quantity_minimum": {"type": ["null", 138 | "integer"]}, 139 | "order_quantity_maximum": {"type": 
["null", 140 | "integer"]}, 141 | "page_title": {"type": ["null", 142 | "string"]}, 143 | "meta_description": {"type": ["null", 144 | "string"]}, 145 | "date_created": {"type": "string", 146 | "format": "date-time"}, 147 | "date_modified": {"type": "string", 148 | "format": "date-time"}, 149 | "view_count": {"type": ["null", 150 | "integer"]}, 151 | "preorder_release_date": {"type": ["null", 152 | "string"], 153 | "format": "date-time"}, 154 | "preorder_message": {"type": ["null", 155 | "string"]}, 156 | "is_preorder_only": {"type": ["null", 157 | "boolean"]}, 158 | "is_price_hidden": {"type": ["null", 159 | "boolean"]}, 160 | "price_hidden_label": {"type": ["null", 161 | "string"]}, 162 | "custom_url": { 163 | "type": ["null", 164 | "object"], 165 | "properties": {"url": {"type": ["null", 166 | "string"]}, 167 | "is_customized": {"type": ["null", 168 | "boolean"]}}}, 169 | "base_variant_id": {"type": ["null", 170 | "integer"]}, 171 | "open_graph_type": {"type": ["null", 172 | "string"]}, 173 | "open_graph_title": {"type": ["null", 174 | "string"]}, 175 | "open_graph_description": {"type": ["null", 176 | "string"]}, 177 | "open_graph_use_meta_description": {"type": ["null", 178 | "boolean"]}, 179 | "open_graph_use_product_name": {"type": ["null", 180 | "boolean"]}, 181 | "open_graph_use_image": {"type": ["null", 182 | "boolean"]}}}, 183 | "key_properties": ["id"]}, 184 | {"type": "RECORD", 185 | "stream": "products", 186 | "record": {"id": 1, 187 | "name": "SAMPLE", 188 | "type": "physical", 189 | "sku": "very-sku-y", 190 | "description": "

some\nrandom\nhtml
", 191 | "weight": 123, 192 | "width": 0, 193 | "depth": 0, 194 | "height": 0, 195 | "price": 31.45, 196 | "cost_price": 0, 197 | "retail_price": 0, 198 | "sale_price": 0, 199 | "map_price": 0, 200 | "tax_class_id": 0, 201 | "product_tax_code": "", 202 | "calculated_price": 31.45, 203 | "categories": [32, 22, 21, 20], 204 | "brand_id": 42, 205 | "option_set_id": None, 206 | "option_set_display": "right", 207 | "inventory_level": 0, 208 | "inventory_warning_level": 0, 209 | "inventory_tracking": "none", 210 | "reviews_rating_sum": 0, 211 | "reviews_count": 0, 212 | "total_sold": 0, 213 | "fixed_cost_shipping_price": 0, 214 | "is_free_shipping": False, 215 | "is_visible": True, 216 | "is_featured": False, 217 | "related_products": [-1], 218 | "warranty": "", 219 | "bin_picking_number": "0", 220 | "layout_file": "a-product.html", 221 | "upc": "", 222 | "mpn": "", 223 | "gtin": "", 224 | "search_keywords": "", 225 | "availability": "available", 226 | "availability_description": "", 227 | "gift_wrapping_options_type": "any", 228 | "sort_order": 0, 229 | "condition": "New", 230 | "is_condition_shown": False, 231 | "order_quantity_minimum": 0, 232 | "order_quantity_maximum": 0, 233 | "page_title": "", 234 | "meta_description": "", 235 | "date_created": "2018-08-27T18:40:23.000000Z", 236 | "date_modified": "2018-08-27T20:45:53.000000Z", 237 | "view_count": 31, 238 | "preorder_release_date": None, 239 | "preorder_message": "0", 240 | "is_preorder_only": False, 241 | "is_price_hidden": False, 242 | "price_hidden_label": "0", 243 | "custom_url": {"url": "/SAMPLE/", 244 | "is_customized": False}, 245 | "base_variant_id": 77, 246 | "open_graph_type": "product", 247 | "open_graph_title": "", 248 | "open_graph_description": "", 249 | "open_graph_use_meta_description": True, 250 | "open_graph_use_product_name": True, 251 | "open_graph_use_image": True}}, 252 | {"type": "STATE", 253 | "value": {"bookmarks": {"products": "2018-11-17T21:26:50+00:00"}}}, 254 | {"type": "SCHEMA", 255 | "stream": "customers", 256 | "schema": { 257 | "properties": {"id": {"type": "integer"}, 258 | "company": {"type": ["null", 259 | "string"]}, 260 | "first_name": {"type": ["null", 261 | "string"]}, 262 | "last_name": {"type": ["null", 263 | "string"]}, 264 | "email": {"type": ["null", 265 | "string"]}, 266 | "phone": {"type": ["null", 267 | "string"]}, 268 | "form_fields": {"type": ["null"]}, 269 | "date_created": {"format": "date-time", 270 | "type": "string"}, 271 | "date_modified": {"format": "date-time", 272 | "type": "string"}, 273 | "store_credit": {"type": ["null", 274 | "string"]}, 275 | "registration_ip_address": {"type": ["null", 276 | "string"]}, 277 | "customer_group_id": {"type": ["null", 278 | "integer"]}, 279 | "notes": {"type": ["null", 280 | "string"]}, 281 | "tax_exempt_category": {"type": ["null", 282 | "string"]}, 283 | "reset_pass_on_login": {"type": ["null", 284 | "boolean"]}, 285 | "accepts_marketing": {"type": ["null", 286 | "boolean"]}, 287 | "addresses": { 288 | "properties": {"url": {"type": ["null", 289 | "string"]}, 290 | "resource": {"type": ["null", 291 | "string"]}}, 292 | "type": ["null", 293 | "object"]}}, 294 | "type": ["null", 295 | "object"]}, 296 | "key_properties": ["id"]}, 297 | {"type": "RECORD", 298 | "stream": "customers", 299 | "record": {"id": 1, 300 | "company": "", 301 | "first_name": "Data", 302 | "last_name": "Mill", 303 | "email": "test@test.com", 304 | "phone": "1231231234", 305 | "form_fields": None, 306 | "date_created": "2018-11-17T21:25:00.000000Z", 307 | "date_modified": 
"2018-11-17T21:25:01.000000Z", 308 | "store_credit": "0.0000", 309 | "registration_ip_address": "127.0.0.1", 310 | "customer_group_id": 0, 311 | "notes": "", 312 | "tax_exempt_category": "", 313 | "reset_pass_on_login": False, 314 | "accepts_marketing": False, 315 | "addresses": {"url": "https://api.bigcommerce.com/stores/some-unique-hash/v2/customers/1/addresses", 316 | "resource": "/customers/1/addresses"}}}, 317 | {"type": "STATE", 318 | "value": {"bookmarks": {"products": "2018-11-17T21:26:50+00:00", 319 | "customers": "2018-11-17T21:25:01+00:00"}}}] 320 | 321 | 322 | def test_bigcommerce__sandbox(db_cleanup): 323 | main(CONFIG, input_stream=BigCommerceStream()) 324 | 325 | with psycopg2.connect(**TEST_DB) as conn: 326 | with conn.cursor() as cur: 327 | assert_tables_equal(cur, 328 | {'products', 329 | 'customers', 330 | 'products__categories', 331 | 'products__related_products'}) 332 | 333 | ## form_fields should not show up as it can only be `null` 334 | assert_columns_equal(cur, 335 | 'customers', 336 | { 337 | ('_sdc_table_version', 'bigint', 'YES'), 338 | ('_sdc_received_at', 'timestamp with time zone', 'YES'), 339 | ('_sdc_sequence', 'bigint', 'YES'), 340 | ('_sdc_batched_at', 'timestamp with time zone', 'YES'), 341 | ('id', 'bigint', 'NO'), 342 | ('date_modified', 'timestamp with time zone', 'NO'), 343 | ('store_credit', 'text', 'YES'), 344 | ('notes', 'text', 'YES'), 345 | ('tax_exempt_category', 'text', 'YES'), 346 | ('email', 'text', 'YES'), 347 | ('company', 'text', 'YES'), 348 | ('customer_group_id', 'bigint', 'YES'), 349 | ('registration_ip_address', 'text', 'YES'), 350 | ('date_created', 'timestamp with time zone', 'NO'), 351 | ('accepts_marketing', 'boolean', 'YES'), 352 | ('addresses__resource', 'text', 'YES'), 353 | ('reset_pass_on_login', 'boolean', 'YES'), 354 | ('addresses__url', 'text', 'YES'), 355 | ('first_name', 'text', 'YES'), 356 | ('phone', 'text', 'YES'), 357 | ('last_name', 'text', 'YES') 358 | }) 359 | 360 | 361 | class HubspotStream(ListStream): 362 | stream = [ 363 | {"type": "SCHEMA", 364 | "stream": "deals", 365 | "schema": { 366 | "type": "object", 367 | "properties": { 368 | "properties": { 369 | "type": "object", 370 | "properties": { 371 | "num_contacted_notes": { 372 | "type": "object", 373 | "properties": { 374 | "value": { 375 | "type": ["null", "number", "string"] 376 | }}}}}}}, 377 | "key_properties": []}, 378 | {"type": "RECORD", 379 | "stream": "deals", 380 | "record": {}}, 381 | {"type": "RECORD", 382 | "stream": "deals", 383 | "record": { 384 | "properties": {}}}, 385 | {"type": "RECORD", 386 | "stream": "deals", 387 | "record": { 388 | "properties": { 389 | "num_contacted_notes": {}}}}, 390 | {"type": "RECORD", 391 | "stream": "deals", 392 | "record": { 393 | "properties": { 394 | "num_contacted_notes": { 395 | "value": None}}}}, 396 | {"type": "RECORD", 397 | "stream": "deals", 398 | "record": { 399 | "properties": { 400 | "num_contacted_notes": { 401 | "value": "helloworld"}}}}, 402 | {"type": "RECORD", 403 | "stream": "deals", 404 | "record": { 405 | "properties": { 406 | "num_contacted_notes": { 407 | "value": 12345}}}}, 408 | {"type": "RECORD", 409 | "stream": "deals", 410 | "record": { 411 | "properties": { 412 | "num_contacted_notes": { 413 | "value": 12345.6789}}}}] 414 | 415 | 416 | def test_hubspot__sandbox(db_cleanup): 417 | config = CONFIG.copy() 418 | config['persist_empty_tables'] = True 419 | main(config, input_stream=HubspotStream()) 420 | 421 | with psycopg2.connect(**TEST_DB) as conn: 422 | with conn.cursor() as cur: 423 
| assert_tables_equal(cur, 424 | {'deals'}) 425 | 426 | assert_columns_equal(cur, 427 | 'deals', 428 | { 429 | ('_sdc_table_version', 'bigint', 'YES'), 430 | ('_sdc_received_at', 'timestamp with time zone', 'YES'), 431 | ('_sdc_sequence', 'bigint', 'YES'), 432 | ('_sdc_primary_key', 'text', 'NO'), 433 | ('_sdc_batched_at', 'timestamp with time zone', 'YES'), 434 | ('properties__num_contacted_notes__value__f', 'double precision', 'YES'), 435 | ('properties__num_contacted_notes__value__s', 'text', 'YES') 436 | }) 437 | 438 | assert_count_equal(cur, 439 | 'deals', 440 | 7) 441 | -------------------------------------------------------------------------------- /tests/unit/test_target_tools.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import json 3 | 4 | from unittest.mock import patch 5 | import pytest 6 | 7 | from target_postgres import singer_stream 8 | from target_postgres import target_tools 9 | from target_postgres.sql_base import SQLInterface 10 | 11 | from utils.fixtures import CONFIG, CatStream, ListStream, InvalidCatStream, DogStream 12 | 13 | 14 | class Target(SQLInterface): 15 | IDENTIFIER_FIELD_LENGTH = 50 16 | 17 | def __init__(self): 18 | self.calls = {'write_batch': [], 'activate_version': []} 19 | 20 | def write_batch(self, stream_buffer): 21 | self.calls['write_batch'].append({'records_count': len(stream_buffer.peek_buffer())}) 22 | return None 23 | 24 | def activate_version(self, stream_buffer, version): 25 | self.calls['activate_version'].append({'records_count': len(stream_buffer.peek_buffer())}) 26 | return None 27 | 28 | 29 | def filtered_output(capsys): 30 | out, _ = capsys.readouterr() 31 | return list(filter(None, out.split('\n'))) 32 | 33 | 34 | def test_usage_stats(): 35 | config = deepcopy(CONFIG) 36 | assert config['disable_collection'] 37 | 38 | with patch.object(target_tools, 39 | '_async_send_usage_stats') as mock: 40 | target_tools.stream_to_target([], None, config=config) 41 | 42 | assert mock.call_count == 0 43 | 44 | config['disable_collection'] = False 45 | 46 | target_tools.stream_to_target([], None, config=config) 47 | 48 | assert mock.call_count == 1 49 | 50 | 51 | def test_loading__invalid__records(): 52 | with pytest.raises(singer_stream.SingerStreamError, match=r'.*'): 53 | target_tools.stream_to_target(InvalidCatStream(1), None, config=CONFIG) 54 | 55 | 56 | def test_loading__invalid__records__disable(): 57 | config = deepcopy(CONFIG) 58 | config['invalid_records_detect'] = False 59 | 60 | target = Target() 61 | 62 | target_tools.stream_to_target(InvalidCatStream(100), target, config=config) 63 | 64 | ## Since all `cat`s records were invalid, we could not persist them, hence, no calls made to `write_batch` 65 | assert len(target.calls['write_batch']) == 1 66 | assert target.calls['write_batch'][0]['records_count'] == 0 67 | 68 | 69 | def test_loading__invalid__records__threshold(): 70 | config = deepcopy(CONFIG) 71 | config['invalid_records_threshold'] = 10 72 | 73 | target = Target() 74 | 75 | with pytest.raises(singer_stream.SingerStreamError, match=r'.*.10*'): 76 | target_tools.stream_to_target(InvalidCatStream(20), target, config=config) 77 | 78 | assert len(target.calls['write_batch']) == 0 79 | 80 | 81 | def test_activate_version(): 82 | config = CONFIG.copy() 83 | config['max_batch_rows'] = 20 84 | config['batch_detection_threshold'] = 11 85 | 86 | records = [{"type": "RECORD", 87 | "stream": "abc", 88 | "record": {}, 89 | "version": 123}] * 
(config['batch_detection_threshold'] - 1) 90 | 91 | class TestStream(ListStream): 92 | stream = [ 93 | {"type": "SCHEMA", 94 | "stream": "abc", 95 | "schema": { 96 | "type": "object", 97 | "properties": { 98 | 'a': {'type': 'number'}}}, 99 | "key_properties": []} 100 | ] + records + [ 101 | {'type': 'ACTIVATE_VERSION', 102 | 'stream': "abc", 103 | 'version': 123} 104 | ] + records 105 | 106 | target = Target() 107 | 108 | target_tools.stream_to_target(TestStream(), target, config=config) 109 | 110 | rows_persisted = 0 111 | for call in target.calls['write_batch']: 112 | rows_persisted += call['records_count'] 113 | 114 | expected_rows = (2 * len(records)) 115 | assert rows_persisted == expected_rows 116 | 117 | 118 | def test_record_with_multiple_of(): 119 | values = [1, 1.0, 2, 2.0, 3, 7, 10.1] 120 | records = [] 121 | for value in values: 122 | records.append({ 123 | "type": "RECORD", 124 | "stream": "test", 125 | "record": {"multipleOfKey": value}, 126 | }) 127 | 128 | class TestStream(ListStream): 129 | stream = [ 130 | { 131 | "type": "SCHEMA", 132 | "stream": "test", 133 | "schema": { 134 | "properties": { 135 | "multipleOfKey": { 136 | "type": "number", 137 | "multipleOf": 1e-15 138 | } 139 | } 140 | }, 141 | "key_properties": [] 142 | } 143 | ] + records 144 | 145 | target = Target() 146 | 147 | target_tools.stream_to_target(TestStream(), target, config=CONFIG.copy()) 148 | 149 | expected_rows = len(records) 150 | rows_persisted = 0 151 | for call in target.calls['write_batch']: 152 | rows_persisted += call['records_count'] 153 | 154 | assert rows_persisted == expected_rows 155 | 156 | 157 | def test_state__capture(capsys): 158 | stream = [ 159 | json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}), 160 | json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})] 161 | 162 | target_tools.stream_to_target(stream, Target()) 163 | output = filtered_output(capsys) 164 | 165 | assert len(output) == 2 166 | assert json.loads(output[0])['test'] == 'state-1' 167 | assert json.loads(output[1])['test'] == 'state-2' 168 | 169 | 170 | def test_state__capture_can_be_disabled(capsys): 171 | stream = [ 172 | json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}), 173 | json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})] 174 | 175 | target_tools.stream_to_target(stream, Target(), {'state_support': False}) 176 | output = filtered_output(capsys) 177 | 178 | assert len(output) == 0 179 | 180 | 181 | def test_state__emits_only_messages_when_all_records_before_have_been_flushed(capsys): 182 | config = CONFIG.copy() 183 | config['max_batch_rows'] = 20 184 | config['batch_detection_threshold'] = 1 185 | rows = list(CatStream(100)) 186 | target = Target() 187 | 188 | def test_stream(): 189 | yield rows[0] 190 | for row in rows[slice(1, 5)]: 191 | yield row 192 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 193 | for row in rows[slice(6, 10)]: 194 | yield row 195 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}}) 196 | for row in rows[slice(11, 15)]: 197 | yield row 198 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-3'}}) 199 | 200 | # After some state messages but before the batch size has been hit no state messages should have been emitted 201 | assert len(target.calls['write_batch']) == 0 202 | output = filtered_output(capsys) 203 | assert output == [] 204 | 205 | for row in rows[slice(16, 25)]: 206 | yield row 207 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-4'}}) 208 | 209 | # After the batch size has been hit 
and a write_batch call was made, the most recent safe to emit state should have been emitted 210 | assert len(target.calls['write_batch']) == 1 211 | output = filtered_output(capsys) 212 | assert len(output) == 1 213 | assert json.loads(output[0])['test'] == 'state-3' 214 | 215 | for row in rows[slice(26, 31)]: 216 | yield row 217 | 218 | target_tools.stream_to_target(test_stream(), target, config=config) 219 | 220 | # The final state message should have been outputted after the last records were loaded 221 | output = filtered_output(capsys) 222 | assert len(output) == 1 223 | assert json.loads(output[0])['test'] == 'state-4' 224 | 225 | 226 | def test_state__emits_most_recent_state_when_final_flush_occurs(capsys): 227 | config = CONFIG.copy() 228 | config['max_batch_rows'] = 20 229 | config['batch_detection_threshold'] = 1 230 | rows = list(CatStream(5)) 231 | rows.append(json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})) 232 | 233 | target_tools.stream_to_target(rows, Target(), config=config) 234 | 235 | # The final state message should have been outputted after the last records were loaded despite not reaching 236 | # one full flushable batch 237 | output = filtered_output(capsys) 238 | assert len(output) == 1 239 | assert json.loads(output[0])['test'] == 'state-1' 240 | 241 | 242 | def test_state__doesnt_emit_when_only_one_of_several_streams_is_flushing(capsys): 243 | config = CONFIG.copy() 244 | config['max_batch_rows'] = 20 245 | config['batch_detection_threshold'] = 1 246 | cat_rows = list(CatStream(100)) 247 | dog_rows = list(DogStream(50)) 248 | target = Target() 249 | 250 | # Simulate one stream that yields a lot of records with another that yields few records and ensure both need to be flushed 251 | # before any state messages are emitted 252 | def test_stream(): 253 | yield cat_rows[0] 254 | yield dog_rows[0] 255 | for row in cat_rows[slice(1, 5)]: 256 | yield row 257 | for row in dog_rows[slice(1, 5)]: 258 | yield row 259 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 260 | 261 | for row in cat_rows[slice(6, 45)]: 262 | yield row 263 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}}) 264 | 265 | for row in cat_rows[slice(46, 65)]: 266 | yield row 267 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-3'}}) 268 | 269 | # After some state messages but before the batch size has been hit for both streams no state messages should have been emitted 270 | assert len(target.calls['write_batch']) == 3 271 | output = filtered_output(capsys) 272 | assert output == [] 273 | 274 | for row in dog_rows[slice(6, 25)]: 275 | yield row 276 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-4'}}) 277 | 278 | # After the batch size has been hit and a write_batch call was made, the most recent safe to emit state should have been emitted 279 | assert len(target.calls['write_batch']) == 4 280 | output = filtered_output(capsys) 281 | assert len(output) == 1 282 | assert json.loads(output[0])['test'] == 'state-2' 283 | 284 | target_tools.stream_to_target(test_stream(), target, config=config) 285 | 286 | # The final state message should have been outputted after the last dog records were loaded despite not reaching one full flushable batch 287 | output = filtered_output(capsys) 288 | assert len(output) == 1 289 | assert json.loads(output[0])['test'] == 'state-4' 290 | 291 | 292 | def test_state__emits_when_multiple_streams_are_registered_but_records_arrive_from_only_one(capsys): 293 | config = CONFIG.copy() 294 | 
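    # As in the other state tests, `max_batch_rows` caps how many records are buffered
    # per stream before a flush, and `batch_detection_threshold` controls how often the
    # flush/state-emission check runs; this reading is an interpretation inferred from
    # how the surrounding tests use these config keys.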
config['max_batch_rows'] = 20 295 | config['batch_detection_threshold'] = 1 296 | cat_rows = list(CatStream(100)) 297 | dog_rows = list(DogStream(50)) 298 | target = Target() 299 | 300 | # Simulate one stream that yields a lot of records with another that yields no records, and ensure that only the first 301 | # needs to be flushed before any state messages are emitted 302 | def test_stream(): 303 | yield cat_rows[0] 304 | yield dog_rows[0] 305 | for row in cat_rows[slice(1, 5)]: 306 | yield row 307 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 308 | 309 | for row in cat_rows[slice(6, 25)]: 310 | yield row 311 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}}) 312 | 313 | # After some state messages and only one of the registered streams has hit the batch size, the state message should be emitted, as there are no unflushed records from the other stream yet 314 | assert len(target.calls['write_batch']) == 1 315 | output = filtered_output(capsys) 316 | assert len(output) == 1 317 | assert json.loads(output[0])['test'] == 'state-1' 318 | 319 | 320 | target_tools.stream_to_target(test_stream(), target, config=config) 321 | 322 | # The final state message should have been outputted after the last dog records were loaded despite not reaching one full flushable batch 323 | output = filtered_output(capsys) 324 | assert len(output) == 1 325 | assert json.loads(output[0])['test'] == 'state-2' 326 | 327 | 328 | def test_state__doesnt_emit_when_it_isnt_different_than_the_previous_emission(capsys): 329 | config = CONFIG.copy() 330 | config['max_batch_rows'] = 5 331 | config['batch_detection_threshold'] = 1 332 | rows = list(CatStream(100)) 333 | target = Target() 334 | 335 | def test_stream(): 336 | yield rows[0] 337 | for row in rows[slice(1, 21)]: 338 | yield row 339 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 340 | output = filtered_output(capsys) 341 | assert len(output) == 1 342 | 343 | for row in rows[slice(22, 99)]: 344 | yield row 345 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 346 | 347 | output = filtered_output(capsys) 348 | assert len(output) == 0 349 | 350 | target_tools.stream_to_target(test_stream(), target, config=config) 351 | 352 | output = filtered_output(capsys) 353 | assert len(output) == 0 354 | -------------------------------------------------------------------------------- /tests/utils/fixtures.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | 5 | import pytest 6 | import psycopg2 7 | import arrow 8 | from faker import Faker 9 | from chance import chance 10 | 11 | CONFIG = { 12 | 'postgres_host': os.environ['POSTGRES_HOST'], 13 | 'postgres_database': os.environ['POSTGRES_DATABASE'], 14 | 'postgres_username': os.environ['POSTGRES_USERNAME'], 15 | 'disable_collection': True, 16 | 'logging_level': 'DEBUG' 17 | } 18 | 19 | TEST_DB = { 20 | 'host': CONFIG['postgres_host'], 21 | 'dbname': CONFIG['postgres_database'], 22 | 'user': CONFIG['postgres_username'] 23 | } 24 | 25 | fake = Faker() 26 | 27 | CATS_SCHEMA = { 28 | 'type': 'SCHEMA', 29 | 'stream': 'cats', 30 | 'schema': { 31 | 'additionalProperties': False, 32 | 'properties': { 33 | 'id': { 34 | 'type': 'integer' 35 | }, 36 | 'name': { 37 | 'type': ['string'] 38 | }, 39 | 'bio': { 40 | 'type': ['string'] 41 | }, 42 | 'paw_size': { 43 | 'type': ['integer'], 44 | 'default': 314159 45 | }, 46 | 'paw_colour': { 47 | 'type': 'string', 48 | 'default': '' 49 | }, 50 
| 'flea_check_complete': { 51 | 'type': ['boolean'], 52 | 'default': False 53 | }, 54 | 'pattern': { 55 | 'type': ['null', 'string'] 56 | }, 57 | 'age': { 58 | 'type': ['null', 'integer'] 59 | }, 60 | 'adoption': { 61 | 'type': ['object', 'null'], 62 | 'properties': { 63 | 'adopted_on': { 64 | 'type': ['null', 'string'], 65 | 'format': 'date-time' 66 | }, 67 | 'was_foster': { 68 | 'type': 'boolean' 69 | }, 70 | 'immunizations': { 71 | 'type': ['null', 'array'], 72 | 'items': { 73 | 'type': ['object'], 74 | 'properties': { 75 | 'type': { 76 | 'type': ['null', 'string'] 77 | }, 78 | 'date_administered': { 79 | 'type': ['null', 'string'], 80 | 'format': 'date-time' 81 | } 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } 88 | }, 89 | 'key_properties': ['id'] 90 | } 91 | 92 | 93 | class FakeStream(object): 94 | def __init__(self, 95 | n, 96 | *args, 97 | version=None, 98 | nested_count=0, 99 | duplicates=0, 100 | duplicate_sequence_delta=200, 101 | sequence=None, 102 | **kwargs): 103 | self.n = n 104 | self.wrote_schema = False 105 | self.id = 1 106 | self.nested_count = nested_count 107 | self.version = version 108 | self.wrote_activate_version = False 109 | self.records = [] 110 | self.duplicates = duplicates 111 | self.duplicates_written = 0 112 | self.duplicate_pks_used = [] 113 | self.record_message_count = 0 114 | if sequence: 115 | self.sequence = sequence 116 | else: 117 | self.sequence = arrow.get().int_timestamp 118 | self.duplicate_sequence_delta = duplicate_sequence_delta 119 | 120 | def duplicate(self, force=False): 121 | if self.duplicates > 0 and \ 122 | len(self.records) > 0 and \ 123 | self.duplicates_written < self.duplicates and \ 124 | (force or chance.boolean(likelihood=30)): 125 | self.duplicates_written += 1 126 | random_index = random.randint(0, len(self.records) - 1) 127 | record = self.records[random_index] 128 | self.duplicate_pks_used.append(record['id']) 129 | record_message = self.generate_record_message(record=record) 130 | record_message['sequence'] = self.sequence + self.duplicate_sequence_delta 131 | return record_message 132 | else: 133 | return False 134 | 135 | def generate_record_message(self, record=None): 136 | if not record: 137 | record = self.generate_record() 138 | self.id += 1 139 | 140 | self.records.append(record) 141 | message = { 142 | 'type': 'RECORD', 143 | 'stream': self.stream, 144 | 'record': record, 145 | 'sequence': self.sequence 146 | } 147 | 148 | if self.version is not None: 149 | message['version'] = self.version 150 | 151 | self.record_message_count += 1 152 | 153 | return message 154 | 155 | def activate_version(self): 156 | self.wrote_activate_version = True 157 | return { 158 | 'type': 'ACTIVATE_VERSION', 159 | 'stream': self.stream, 160 | 'version': self.version 161 | } 162 | 163 | def __iter__(self): 164 | return self 165 | 166 | def __next__(self): 167 | if not self.wrote_schema: 168 | self.wrote_schema = True 169 | return json.dumps(self.schema) 170 | if self.id <= self.n: 171 | dup = self.duplicate() 172 | if dup != False: 173 | return json.dumps(dup) 174 | return json.dumps(self.generate_record_message()) 175 | if self.id == self.n: 176 | dup = self.duplicate(force=True) 177 | if dup != False: 178 | return json.dumps(dup) 179 | if self.version is not None and self.wrote_activate_version == False: 180 | return json.dumps(self.activate_version()) 181 | raise StopIteration 182 | 183 | 184 | def fake_conjunctive_text(n): 185 | t = fake.text() 186 | for i in range(0, n): 187 | t = '{}, {} {}'.format( 188 | t[:-1], 189 | 
chance.pickone(['and', 'or', 'for', 'nor', 'but', 'yet', 'so']), 190 | fake.text()) 191 | return t 192 | 193 | 194 | class CatStream(FakeStream): 195 | stream = 'cats' 196 | schema = CATS_SCHEMA 197 | 198 | def generate_record(self): 199 | adoption = None 200 | if self.nested_count or chance.boolean(likelihood=70): 201 | immunizations = [] 202 | for i in range(0, self.nested_count or random.randint(0, 4)): 203 | immunizations.append({ 204 | 'type': chance.pickone(['FIV', 'Panleukopenia', 'Rabies', 'Feline Leukemia']), 205 | 'date_administered': chance.date(minyear=2012).isoformat() 206 | }) 207 | adoption = { 208 | 'adopted_on': chance.date(minyear=2012).isoformat(), 209 | 'was_foster': chance.boolean(), 210 | 'immunizations': immunizations 211 | } 212 | 213 | return { 214 | 'id': self.id, 215 | 'name': fake.first_name(), 216 | 'bio': fake_conjunctive_text(random.randint(0, 10)), 217 | 'pattern': chance.pickone(['Tabby', 'Tuxedo', 'Calico', 'Tortoiseshell']), 218 | 'age': random.randint(1, 15), 219 | 'adoption': adoption 220 | } 221 | 222 | 223 | class InvalidCatStream(CatStream): 224 | def generate_record(self): 225 | record = CatStream.generate_record(self) 226 | 227 | if chance.boolean(likelihood=50): 228 | record['adoption'] = ['invalid', 'adoption'] 229 | elif chance.boolean(likelihood=50): 230 | record['age'] = 'very invalid age' 231 | elif record['adoption'] and chance.boolean(likelihood=50): 232 | record['adoption']['immunizations'] = { 233 | 'type': chance.pickone(['a', 'b', 'c']), 234 | 'date_administered': ['clearly', 'not', 'a', 'date'] 235 | } 236 | else: 237 | record['name'] = 22 / 7 238 | 239 | return record 240 | 241 | 242 | NESTED_STREAM = { 243 | 'type': 'SCHEMA', 244 | 'stream': 'root', 245 | 'schema': { 246 | 'additionalProperties': False, 247 | 'properties': { 248 | 'id': { 249 | 'type': 'integer' 250 | }, 251 | ## TODO: Complex types defaulted 252 | # 'array_scalar_defaulted': { 253 | # 'type': 'array', 254 | # 'items': {'type': 'integer'}, 255 | # 'default': list(range(10)) 256 | # }, 257 | 'array_scalar': { 258 | 'type': 'array', 259 | 'items': {'type': 'integer'} 260 | }, 261 | 'array_of_array': { 262 | 'type': 'array', 263 | 'items': { 264 | 'type': 'array', 265 | 'items': { 266 | 'type': 'array', 267 | 'items': {'type': 'integer'} 268 | } 269 | } 270 | }, 271 | ## TODO: Complex types defaulted 272 | # 'object_defaulted': { 273 | # 'type': 'object', 274 | # 'properties': { 275 | # 'a': { 276 | # 'type': 'integer' 277 | # }, 278 | # 'b': { 279 | # 'type': 'integer' 280 | # }, 281 | # 'c': { 282 | # 'type': 'integer' 283 | # } 284 | # }, 285 | # 'default': {'a': 123, 'b': 456, 'c': 789} 286 | # }, 287 | 'object_of_object_0': { 288 | 'type': 'object', 289 | 'properties': { 290 | 'object_of_object_1': { 291 | 'type': 'object', 292 | 'properties': { 293 | 'object_of_object_2': { 294 | 'type': 'object', 295 | 'properties': { 296 | 'array_scalar': { 297 | 'type': 'array', 298 | 'items': { 299 | 'type': 'boolean' 300 | } 301 | }, 302 | 'a': { 303 | 'type': 'integer' 304 | }, 305 | 'b': { 306 | 'type': 'integer' 307 | }, 308 | 'c': { 309 | 'type': 'integer' 310 | } 311 | } 312 | } 313 | } 314 | } 315 | } 316 | }, 317 | 'null': { 318 | 'type': ['null', 'integer'] 319 | }, 320 | 'nested_null': { 321 | 'type': 'object', 322 | 'properties': { 323 | 'null': { 324 | 'type': ['null', 'integer'] 325 | } 326 | } 327 | } 328 | } 329 | }, 330 | 'key_properties': ['id'] 331 | } 332 | 333 | 334 | class NestedStream(FakeStream): 335 | stream = 'root' 336 | schema = NESTED_STREAM 337 | 
338 | def generate_record(self): 339 | null = None 340 | ## We use this trick so that we _always_ know we'll have both null and non-null values 341 | ## vs using something like chance here. 342 | if self.id % 2 == 0: 343 | null = 31415 344 | 345 | return { 346 | 'id': self.id, 347 | 'array_scalar': list(range(5)), 348 | 'array_of_array': [[[1, 2, 3], 349 | [4, 5, 6, 7, 8], 350 | [9, 10], 351 | []], 352 | [[10], 353 | [20, 30], 354 | [40, 50, 60], 355 | [70, 80, 90, 100]]], 356 | 'object_of_object_0': { 357 | 'object_of_object_1': { 358 | 'object_of_object_2': { 359 | 'array_scalar': [True, False, True, False, False], 360 | 'a': self.id, 361 | 'b': self.id, 362 | 'c': self.id 363 | } 364 | } 365 | }, 366 | 'null': null, 367 | 'nested_null': { 368 | 'null': null 369 | } 370 | } 371 | 372 | 373 | MULTI_TYPE = { 374 | 'type': 'SCHEMA', 375 | 'stream': 'root', 376 | 'schema': { 377 | 'additionalProperties': False, 378 | 'properties': { 379 | 'every_type': { 380 | 'type': ['null', 'integer', 'number', 'boolean', 'string', 'array', 'object'], 381 | 'items': {'type': 'integer'}, 382 | 'format': 'date-time', 383 | 'properties': { 384 | ## We use these field names to increase the difficulty for our column 385 | ## name collision functionality. ie, the denested values will not only 386 | ## conflict in terms of their denested _names_ but also, their types 387 | 'i': {'type': 'integer'}, 388 | 'f': {'type': 'number'}, 389 | 'b': {'type': 'boolean'} 390 | } 391 | }, 392 | 'number_which_only_comes_as_integer': { 393 | 'type': 'number' 394 | } 395 | } 396 | }, 397 | 'key_properties': [] 398 | } 399 | 400 | 401 | class MultiTypeStream(FakeStream): 402 | stream = 'root' 403 | schema = MULTI_TYPE 404 | 405 | def generate_record(self): 406 | value_null = None 407 | value_integer = random.randint(-314159265359, 314159265359) 408 | value_integer_as_number = float(random.randint(-314159265359, 314159265359)) 409 | value_number = random.uniform(-314159265359, 314159265359) 410 | value_boolean = chance.boolean() 411 | value_date_time_string = chance.date(minyear=2012).isoformat() 412 | value_array = [] 413 | for i in range(random.randint(0, 1000)): 414 | value_array.append(random.randint(-314, 314)) 415 | 416 | value_object = {'i': random.randint(-314159265359, 314159265359), 417 | 'n': random.uniform(-314159265359, 314159265359), 418 | 'b': chance.boolean()} 419 | 420 | return { 421 | 'every_type': chance.pickone( 422 | [value_null, 423 | value_integer, 424 | value_integer_as_number, 425 | value_number, 426 | value_boolean, 427 | value_date_time_string, 428 | value_array, 429 | value_object]), 430 | 'number_which_only_comes_as_integer': value_integer 431 | } 432 | 433 | 434 | class TypeChangeStream(FakeStream): 435 | stream = 'root' 436 | 437 | def __init__(self, n, starting_id): 438 | FakeStream.__init__(self, n) 439 | self.starting_id = starting_id 440 | self.changing_literal_type = chance.pickone(['integer', 'number', 'boolean', 'string', 'date-time']) 441 | type_def = {'type': self.changing_literal_type} 442 | 443 | if self.changing_literal_type == 'date-time': 444 | type_def = {'type': 'string', 445 | 'format': 'date-time'} 446 | 447 | print('TypeChangeStream chose:', type_def, 'id starting at:', self.id) 448 | self.schema = { 449 | 'type': 'SCHEMA', 450 | 'stream': 'root', 451 | 'schema': { 452 | 'additionalProperties': False, 453 | 'properties': { 454 | 'id': {'type': 'integer'}, 455 | 'changing_literal_type': type_def 456 | } 457 | }, 458 | 'key_properties': ['id'] 459 | } 460 | 461 | def 
generate_record(self): 462 | value = None 463 | if self.changing_literal_type == 'integer': 464 | value = random.randint(-314159265359, 314159265359) 465 | elif self.changing_literal_type == 'number': 466 | value = chance.pickone([random.uniform(-314159265359, 314159265359), 467 | float(random.randint(-314159265359, 314159265359)), 468 | random.randint(-314159265359, 314159265359)]) 469 | elif self.changing_literal_type == 'boolean': 470 | value = chance.boolean() 471 | elif self.changing_literal_type == 'string': 472 | value = chance.date(minyear=2012).isoformat() 473 | elif self.changing_literal_type == 'date-time': 474 | value = chance.date(minyear=2012).isoformat() 475 | else: 476 | raise Exception('Unknown changing_literal_type: `{}`'.format(self.changing_literal_type)) 477 | 478 | return { 479 | 'id': self.id + self.starting_id, 480 | 'changing_literal_type': value, 481 | } 482 | 483 | 484 | def clear_db(): 485 | with psycopg2.connect(**TEST_DB) as conn: 486 | with conn.cursor() as cur: 487 | cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'") 488 | drop_command = '' 489 | for table in cur.fetchall(): 490 | drop_command += 'DROP TABLE IF EXISTS ' + table[0] + ';' 491 | cur.execute('begin;' + 492 | drop_command + 493 | 'commit;') 494 | 495 | 496 | @pytest.fixture 497 | def db_cleanup(): 498 | clear_db() 499 | 500 | yield 501 | 502 | clear_db() 503 | 504 | 505 | class ListStream: 506 | idx = None 507 | stream = NotImplementedError() 508 | 509 | def __init__(self): 510 | self.idx = -1 511 | 512 | def __iter__(self): 513 | return self 514 | 515 | def __next__(self): 516 | self.idx += 1 517 | 518 | if self.idx < len(self.stream): 519 | return json.dumps(self.stream[self.idx]) 520 | 521 | raise StopIteration 522 | 523 | 524 | class DogStream(CatStream): 525 | stream = 'dogs' 526 | schema = CatStream.schema.copy() 527 | 528 | 529 | DogStream.schema['stream'] = 'dogs' --------------------------------------------------------------------------------
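
A minimal sketch of how these fixture streams behave when iterated, assuming the imports defined at the top of tests/utils/fixtures.py (such as json, arrow, chance and faker, which are outside this excerpt) and an illustrative import path: a FakeStream subclass yields JSON-serialized Singer messages, the SCHEMA message first, then RECORD messages, and finally an ACTIVATE_VERSION message when a version is supplied.

    import json

    # Illustrative import; the actual path depends on how tests/utils is put on sys.path.
    from fixtures import CatStream

    # Emit three cat RECORD messages plus a trailing ACTIVATE_VERSION message.
    stream = CatStream(3, version=1)
    messages = [json.loads(line) for line in stream]

    assert messages[0]['type'] == 'SCHEMA'                     # schema is always written first
    assert all(m['type'] == 'RECORD' for m in messages[1:-1])  # then the generated records
    assert messages[-1]['type'] == 'ACTIVATE_VERSION'          # emitted because version was set
    assert messages[-1]['version'] == 1

The sequence value defaults to arrow.get().int_timestamp unless an explicit sequence is passed, and duplicate() re-emits an already-written record with sequence + duplicate_sequence_delta, so duplicate handling can be exercised through the duplicates count and the force flag.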