├── .circleci ├── config.yml └── integration │ ├── tap-github │ ├── catalog.json │ └── config-template.json │ ├── tap-postgres │ └── config.json │ └── target-config.json ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── DECISIONS.md ├── LICENSE ├── README.md ├── docker-compose.yml ├── docker-entrypoint.sh ├── docs └── TableMetadata.md ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── target_postgres ├── __init__.py ├── denest.py ├── exceptions.py ├── json_schema.py ├── postgres.py ├── singer.py ├── singer_stream.py ├── sql_base.py ├── stream_tracker.py └── target_tools.py └── tests ├── conftest.py ├── migrations ├── data │ └── tap ├── scripts │ ├── install_schema_versions.sh │ ├── to_latest.sh │ └── to_target.sh └── test_migrations.py ├── unit ├── test_BufferedSingerStream.py ├── test_denest.py ├── test_json_schema.py ├── test_postgres.py ├── test_sandbox.py └── test_target_tools.py └── utils └── fixtures.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | filters: &filters 4 | filters: 5 | tags: 6 | only: /^v[0-9]+(\.[0-9]+)*$/ 7 | 8 | filters__tags: &filters__tags 9 | filters: 10 | branches: 11 | ignore: /.*/ 12 | tags: 13 | only: /^v[0-9]+(\.[0-9]+)*$/ 14 | 15 | workflows: 16 | test: 17 | jobs: 18 | - cache: 19 | <<: *filters 20 | - build: 21 | <<: *filters 22 | requires: 23 | - cache 24 | - test--15: 25 | <<: *filters 26 | requires: 27 | - cache 28 | - test--14: 29 | <<: *filters 30 | requires: 31 | - cache 32 | - test--13: 33 | <<: *filters 34 | requires: 35 | - cache 36 | - test--12: 37 | <<: *filters 38 | requires: 39 | - cache 40 | - test--11: 41 | <<: *filters 42 | requires: 43 | - cache 44 | - test--10: 45 | <<: *filters 46 | requires: 47 | - cache 48 | - test--migrations: 49 | <<: *filters 50 | requires: 51 | - cache 52 | - test--tap-github: 53 | <<: *filters 54 | requires: 55 | - test--12 56 | - test-release: 57 | <<: *filters__tags 58 | requires: 59 | - test--tap-github 60 | - test--migrations 61 | - build 62 | - approve-release: 63 | <<: *filters__tags 64 | type: approval 65 | requires: 66 | - test-release 67 | - release: 68 | <<: *filters__tags 69 | requires: 70 | - approve-release 71 | 72 | cache: &cache deps-v7-{{ checksum "poetry.lock" }}-{{ checksum ".circleci/config.yml" }} 73 | 74 | py: &py python:3.7.15-bullseye 75 | 76 | restore__cache: &restore__cache 77 | restore_cache: 78 | keys: 79 | - *cache 80 | 81 | # Simple checkout command to pull external forks. 82 | # The CircleCI util does not work without setting up SSH keys 83 | # which we technically do not need for open-source repos. 84 | checkout_command: &checkout_command 85 | run: 86 | name: checkout 87 | command: | 88 | git clone https://github.com/datamill-co/target-postgres . 
89 | if [[ "$CIRCLE_BRANCH" =~ ^pull\/* ]]; then 90 | git fetch origin refs/pull/${CIRCLE_PR_NUMBER}/head 91 | git checkout ${CIRCLE_SHA1} 92 | else 93 | git checkout ${CIRCLE_BRANCH} 94 | fi 95 | 96 | install_poetry: &install_poetry 97 | run: 98 | name: Install poetry 99 | command: | 100 | export POETRY_HOME=/opt/poetry 101 | python -m venv $POETRY_HOME 102 | $POETRY_HOME/bin/pip install -U pip 103 | $POETRY_HOME/bin/pip install poetry==1.2.2 104 | $POETRY_HOME/bin/poetry --version 105 | 106 | test__base: &test__base 107 | working_directory: /code/ 108 | steps: 109 | - *checkout_command 110 | - *restore__cache 111 | - *install_poetry 112 | - attach_workspace: 113 | at: "./" 114 | 115 | - run: 116 | name: Run Tests 117 | command: | 118 | source venv/target-postgres/bin/activate 119 | pytest --verbose tests/unit 120 | environment: 121 | POSTGRES_HOST: localhost 122 | POSTGRES_DATABASE: target_postgres_test 123 | POSTGRES_USERNAME: postgres 124 | POSTGRES_PASSWORD: postgres 125 | 126 | - store_artifacts: 127 | path: target/test-results 128 | destination: raw-test-output 129 | 130 | jobs: 131 | cache: 132 | working_directory: /code/ 133 | docker: 134 | - image: *py 135 | steps: 136 | - *checkout_command 137 | - *restore__cache 138 | - *install_poetry 139 | 140 | - run: 141 | name: Install target-postgres 142 | command: | 143 | python -m venv venv/target-postgres 144 | source venv/target-postgres/bin/activate 145 | pip install -U pip 146 | /opt/poetry/bin/poetry install --with tests 147 | deactivate 148 | 149 | - run: 150 | name: Install older versions of target-postgres for migration testing 151 | command: ./tests/migrations/scripts/install_schema_versions.sh 152 | 153 | - run: 154 | name: Install tap-github 155 | command: | 156 | python -m venv venv/tap-github 157 | source venv/tap-github/bin/activate 158 | pip install -U pip 159 | pip install git+https://github.com/MeltanoLabs/tap-github.git@v1.1.0 160 | deactivate 161 | 162 | - run: 163 | name: Install tap-postgres 164 | command: | 165 | python -m venv venv/tap-postgres 166 | source venv/tap-postgres/bin/activate 167 | pip install -U pip 168 | pip install tap-postgres 169 | deactivate 170 | 171 | - save_cache: 172 | key: *cache 173 | paths: 174 | - "./venv" 175 | - "/usr/local/bin" 176 | - "/usr/local/lib/python3.7/site-packages" 177 | - "/opt/poetry" 178 | 179 | - persist_to_workspace: 180 | root: "./" 181 | paths: 182 | - "./venv" 183 | 184 | test--15: 185 | <<: *test__base 186 | docker: 187 | - image: *py 188 | - image: postgres:15.0 189 | environment: 190 | POSTGRES_DB: target_postgres_test 191 | POSTGRES_PASSWORD: postgres 192 | 193 | test--14: 194 | <<: *test__base 195 | docker: 196 | - image: *py 197 | - image: postgres:14.5 198 | environment: 199 | POSTGRES_DB: target_postgres_test 200 | POSTGRES_PASSWORD: postgres 201 | 202 | test--13: 203 | <<: *test__base 204 | docker: 205 | - image: *py 206 | - image: postgres:13.8 207 | environment: 208 | POSTGRES_DB: target_postgres_test 209 | POSTGRES_PASSWORD: postgres 210 | 211 | test--12: 212 | <<: *test__base 213 | docker: 214 | - image: *py 215 | - image: postgres:12.12 216 | environment: 217 | POSTGRES_DB: target_postgres_test 218 | POSTGRES_PASSWORD: postgres 219 | 220 | test--11: 221 | <<: *test__base 222 | docker: 223 | - image: *py 224 | - image: postgres:11.17-bullseye 225 | environment: 226 | POSTGRES_DB: target_postgres_test 227 | POSTGRES_PASSWORD: postgres 228 | 229 | test--10: 230 | <<: *test__base 231 | docker: 232 | - image: *py 233 | - image: postgres:10.22-bullseye 234 | 
environment: 235 | POSTGRES_DB: target_postgres_test 236 | POSTGRES_PASSWORD: postgres 237 | 238 | test--tap-github: 239 | working_directory: /code/ 240 | docker: 241 | - image: *py 242 | - image: postgres:12.12 243 | environment: 244 | POSTGRES_DB: target_postgres_test 245 | POSTGRES_PASSWORD: postgres 246 | steps: 247 | - *checkout_command 248 | - *restore__cache 249 | - attach_workspace: 250 | at: "./" 251 | 252 | - run: 253 | name: Setup artifacts folder 254 | command: mkdir -p /code/artifacts/data 255 | 256 | - run: 257 | name: Setup tap-github 258 | working_directory: /code/.circleci/integration/tap-github 259 | command: sed "s/REPLACE_ME/$TAP_GITHUB_TOKEN/" config-template.json > config.json 260 | 261 | - run: 262 | name: Tap -> Data 263 | command: | 264 | source venv/tap-github/bin/activate 265 | cd /code/.circleci/integration/tap-github 266 | 267 | tap-github --config config.json --catalog catalog.json > /code/artifacts/data/tap 268 | 269 | deactivate 270 | 271 | - run: 272 | name: Data -> Target 273 | command: | 274 | source venv/target-postgres/bin/activate 275 | pip install -U pip 276 | /opt/poetry/bin/poetry install 277 | cd /code/.circleci/integration 278 | 279 | cat /code/artifacts/data/tap | target-postgres --config target-config.json 280 | 281 | deactivate 282 | 283 | - run: 284 | name: Target -> Data 285 | command: | 286 | source venv/tap-postgres/bin/activate 287 | cd /code/.circleci/integration/tap-postgres 288 | 289 | tap-postgres --config config.json --discover > tmp-properties.json 290 | 291 | ## Select _every_ table found in properties. 292 | ## row-count seems to only show up inside of the necessary metadata object...easier than multi-line-sed 293 | sed 's/"row-count": 0,/"row-count": 0,"selected":true,/g' tmp-properties.json > /code/artifacts/data/properties.json 294 | 295 | tap-postgres --config config.json --properties /code/artifacts/data/properties.json > /code/artifacts/data/target 296 | 297 | deactivate 298 | 299 | - run: 300 | name: Repeatability of Data -> Target 301 | command: | 302 | source venv/target-postgres/bin/activate 303 | pip install -U pip 304 | pip install . 305 | cd /code/.circleci/integration 306 | 307 | cat /code/artifacts/data/tap | target-postgres --config target-config.json 308 | 309 | deactivate 310 | 311 | cd /code/ 312 | 313 | source venv/tap-postgres/bin/activate 314 | cd /code/.circleci/integration/tap-postgres 315 | 316 | tap-postgres --config config.json --discover > tmp-properties.json 317 | 318 | ## Select _every_ table found in properties. 
319 | ## row-count seems to only show up inside of the necessary metadata object...easier than multi-line-sed 320 | sed 's/"row-count": 0,/"row-count": 0,"selected":true,/g' tmp-properties.json > /code/artifacts/data/properties.json 321 | 322 | tap-postgres --config config.json --properties /code/artifacts/data/properties.json > /code/artifacts/data/target.repeated 323 | 324 | deactivate 325 | 326 | ## TODO: compare repeated data to insure that we only changed _sdc values 327 | # diff /code/artifacts/data/target /code/artifacts/data/target.repeated 328 | 329 | - store_artifacts: 330 | path: /code/artifacts 331 | 332 | test--migrations: 333 | working_directory: /code/ 334 | docker: 335 | - image: *py 336 | - image: postgres:12.12 337 | environment: 338 | POSTGRES_DB: target_postgres_test 339 | POSTGRES_PASSWORD: postgres 340 | steps: 341 | - *checkout_command 342 | - *restore__cache 343 | - *install_poetry 344 | - attach_workspace: 345 | at: "./" 346 | 347 | - run: 348 | name: Run Tests 349 | command: | 350 | source venv/target-postgres/bin/activate 351 | pytest --verbose tests/migrations 352 | environment: 353 | POSTGRES_HOST: localhost 354 | POSTGRES_DATABASE: target_postgres_test 355 | POSTGRES_USERNAME: postgres 356 | POSTGRES_PASSWORD: postgres 357 | 358 | - store_artifacts: 359 | path: /code/tests/migrations/artifacts 360 | destination: raw-test-output 361 | 362 | build: 363 | working_directory: /code/ 364 | docker: 365 | - image: *py 366 | steps: 367 | - *checkout_command 368 | - *restore__cache 369 | - attach_workspace: 370 | at: "./" 371 | - run: 372 | name: Build distribution 373 | command: | 374 | source venv/target-postgres/bin/activate 375 | 376 | pip install -U pip 377 | pip install --upgrade setuptools wheel twine 378 | 379 | /opt/poetry/bin/poetry build 380 | 381 | deactivate 382 | 383 | - persist_to_workspace: 384 | root: "./" 385 | paths: 386 | - "./dist" 387 | 388 | test-release: 389 | working_directory: /code/ 390 | docker: 391 | - image: *py 392 | steps: 393 | - *checkout_command 394 | - *restore__cache 395 | - attach_workspace: 396 | at: "./" 397 | - run: 398 | name: Validate tag 399 | command: | 400 | export TAG=`echo $CIRCLE_TAG | sed 's/v//'` 401 | VERSION=`grep version pyproject.toml | sed 's/^.*version = "\(.*\)",.*$/\1/'` 402 | 403 | echo tag: $TAG equals version: $VERSION '?' 
404 | 405 | [[ $TAG == $VERSION ]] 406 | - run: 407 | name: Install upload tools 408 | command: pip install --upgrade twine 409 | - run: 410 | name: Test Publish 411 | environment: 412 | TWINE_USERNAME: datamill 413 | TWINE_REPOSITORY_URL: https://test.pypi.org/legacy/ 414 | command: | 415 | export TWINE_PASSWORD=$PYPI__PASSWORD__TEST 416 | twine upload ./dist/* 417 | 418 | release: 419 | working_directory: /code/ 420 | docker: 421 | - image: *py 422 | steps: 423 | - *checkout_command 424 | - *restore__cache 425 | - attach_workspace: 426 | at: "./" 427 | - run: 428 | name: Install upload tools 429 | command: pip install --upgrade twine 430 | - run: 431 | name: Publish 432 | environment: 433 | TWINE_USERNAME: datamill 434 | command: | 435 | export TWINE_PASSWORD=$PYPI__PASSWORD 436 | twine upload ./dist/* 437 | -------------------------------------------------------------------------------- /.circleci/integration/tap-github/config-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_date": "2022-04-12", 3 | "repositories": ["datamill-co/target-postgres"] 4 | } 5 | -------------------------------------------------------------------------------- /.circleci/integration/tap-postgres/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "dbname": "target_postgres_test", 3 | "host": "localhost", 4 | "port": "5432", 5 | "user": "postgres", 6 | "password": null, 7 | "default_replication_method": "FULL_TABLE" 8 | } -------------------------------------------------------------------------------- /.circleci/integration/target-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "postgres_database": "target_postgres_test", 3 | "postgres_username": "postgres" 4 | } 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .pytest_cache/ 92 | 93 | # Mac 94 | .DS_Store 95 | ._* 96 | 97 | # Singer files 98 | *.txt 99 | 100 | /venv--* 101 | /venv 102 | /tests/migrations/artifacts 103 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.2.4 4 | 5 | - **BUG FIX:** `multipleOf` validation 6 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/179) 7 | - Due to floating point errors in Python and JSONSchema, `multipleOf` 8 | validation has been failing. 9 | 10 | ## 0.2.3 11 | 12 | - **FEATURES:** 13 | - [`JSONSchema: anyOf` Support](https://github.com/datamill-co/target-postgres/pull/155) 14 | - Streamed `JSONSchema`s which include `anyOf` combinations should now be fully supported 15 | - This allows for full support of Stitch/Singer's `DateTime` string fallbacks. 16 | - [`JSONSchema`: allOf` Support](https://github.com/datamill-co/target-postgres/pull/154) 17 | - Streamed `JSONSchema`s which include `allOf` combinations should now be fully supported 18 | - Columns are persisted as normal. 19 | - This is _perceived_ to be most useful for merging objects, and putting in place things like `maxLength` etc. 20 | - **BUG FIX:** Buffer Flushing at frequent intervals/with small batches 21 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/169) 22 | - Buffer _size_ calculations relied upon some "sophisticated" logic for determining the "size" in 23 | memory of a Python object 24 | - The method used by Singer libraries is to simply use the size of the streamed `JSON` blob 25 | - Performance Improvement seen due to batches now being far larger and interactions with the remote 26 | being far fewer. 27 | - **BUG FIX:** `NULLABLE` not being _implied_ when field is missing from streamed `JSONSchema` 28 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/174) 29 | - If a field was persisted in remote, but then left _out_ of a subsequent streamed `JSONSchema`, we would fail 30 | - In this instance, the field is _implied_ to be `NULL`, but additionally, if values _are_ present for it 31 | in the streamed data, we _should_ persist it. 
32 | 33 | ## 0.2.2 34 | 35 | - **FEATURES:** 36 | - [Performance improvement for upserting data](https://github.com/datamill-co/target-postgres/pull/161) 37 | - Saw long running queries for some `SELECT COUNT(1)...` queries 38 | - Resulting in full table scans 39 | - These queries are _only_ being used for `is_table_empty`, therefore we can use a more efficient 40 | `SELECT EXISTS(...)` query which only needs a single row to be fetched 41 | 42 | ## 0.2.1 43 | 44 | - **FEATURES:** 45 | - [Performance improvement for upserting data](https://github.com/datamill-co/target-postgres/pull/152) 46 | - For large or even reasonably sized tables, trying to upsert the data was prohibitively slow 47 | - To mitigate this, we now add indexes to the columns used heavily during upserts 48 | - This change can be opted out of via the `add_upsert_indexes` config option 49 | - **NOTE**: This only affects installations post `0.2.1`, and will not upgrade/migrate existing installations 50 | - Support for latest PostgreSQL 12.0 51 | - PostgreSQL recently released 12.0, and we now have testing around it and can confirm that `target-postgres` 52 | _should_ function correctly for it! 53 | - **BUG FIX:** `STATE` messages being sent at the wrong time 54 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/149) 55 | - `STATE` messages were being output incorrectly for feeds which had many streams outputting at varying rates 56 | 57 | ## 0.2.0 58 | 59 | - **NOTE:** The `minor` version bump is not expected to have much effect on folks. This was done to signal the 60 | output change from the below bug fix. It is our impression not many are using this feature yet anyways. Since 61 | this was _not_ a `patch` change, we decided to make this a `minor` instead of `major` change to raise _less_ 62 | concern. Thank you for your patience! 63 | - **FEATURES:** 64 | - [Performance improvement for creating `tmp` tables necessary for uploading data](https://github.com/datamill-co/target-postgres/pull/147) 65 | - PostgreSQL dialects allow for creating a table identical to a parent table in a single command 66 | - [`CREATE TABLE <new_table> (LIKE <parent_table>);`](https://www.postgresql.org/docs/9.1/sql-createtable.html) 67 | - Previously we leveraged using our `upsert` helpers to create new tables. This resulted in _many_ calls 68 | to remote, of varying complexity. 69 | - **BUG FIX:** No `STATE` Message Wrapper necessary 70 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/142) 71 | - `STATE` messages are formatted as `{"value": ...}` 72 | - `target-postgres` emitted the _full_ message 73 | - The official `singer-target-template` doesn't write out that `value` "wrapper", and just writes 74 | the JSON blob contained in it 75 | - This fix makes `target-postgres` do the same 76 | 77 | ## 0.1.11 78 | 79 | - **BUG FIX:** `canonicalize_identifier` Not called on _all_ identifiers persisted to remote 80 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/144) 81 | - Presently, on column splits/name collisions, we add a suffix to an identifier 82 | - Previously, we did not canonicalize these suffixes 83 | - While this was not an issue for any `targets` currently in production, it was an issue 84 | for some up and coming `targets`.
85 | - This fix simply makes sure to call `canonicalize_identifier` before persisting an identifier to remote 86 | 87 | ## 0.1.10 88 | 89 | - **FEATURES:** 90 | - [Root Table Name Canonicalization](https://github.com/datamill-co/target-postgres/pull/131) 91 | - The `stream` name is used for the value of the root table name in Postgres 92 | - `stream` names are controlled exclusively by the tap and do _not_ have to meet many standards 93 | - Previously, only `stream` names which were lowercase, alphanumeric, etc. 94 | - Now, the `target` can canonicalize the root table name, allowing for the input `stream` name to be 95 | whatever the `tap` provides. 96 | 97 | ## 0.1.9 98 | 99 | - **Singer-Python:** bumped to latest _5.6.1_ 100 | - **Psycopg2:** bumped to latest _2.8.2_ 101 | - **FEATURES:** 102 | - [`STATE` Message support](https://github.com/datamill-co/target-postgres/pull/130) 103 | - Emits message only when all records buffered _before_ the `STATE` message have been persisted to remote. 104 | - [SSL Support for Postgres](https://github.com/datamill-co/target-postgres/pull/124) 105 | - Added config options for enabling/supporting SSL support. 106 | - **BUG FIX:** `ACTIVATE_VERSION` Messages did not flush buffer 107 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/135) 108 | - When we issue an activate version record, we presently do not flush the buffer after writing the batch. This results in more records being written to remote than need to be. 109 | - This results in no functionality change, and should not alleviate any _known_ bugs. 110 | - This should be purely performance related. 111 | 112 | ## 0.1.8 113 | 114 | - **Singer-Python:** bumped to latest 115 | - **Minor housekeeping:** 116 | - Updated container versions to latest 117 | - Updated README to reflect new versions of PostgreSQL Server 118 | 119 | ## 0.1.7 120 | 121 | - **BUG FIX:** A bug was identified for de-nesting. 122 | - [ISSUE LINK](https://github.com/datamill-co/target-postgres/issues/109) 123 | - [FAILING TESTS LINK](https://github.com/datamill-co/target-postgres/pull/110) 124 | - [FIX LINK](https://github.com/datamill-co/target-postgres/pull/111) 125 | - Subtables with subtables did not serialize column names correctly 126 | - The column names ended up having the _table names_ (paths) prepended on them 127 | - Due to the denested table _schema_ and denested _records_ being different 128 | no information showed up in remote. 129 | - This bug was ultimately tracked down to the core denesting logic. 130 | - This will fix failing uploads which had **_nullable_** columns in subtables but 131 | no data was seen populating those columns. 132 | - The broken schema columns will still remain 133 | - Failing schemas which had **_non-null_** columns in subtables will still be broken 134 | - To fix will require dropping the associated tables, potentially resetting the entire 135 | `db`/`schema` 136 | 137 | ## 0.1.6 138 | 139 | - **BUG FIX:** A bug was identified for path to column serialization. 140 | - [LINK](https://github.com/datamill-co/target-postgres/pull/100) 141 | - A nullable properties which had _multiple_ JSONSchema types 142 | - ie, something like `[null, string, integer ...]` 143 | - Failed to find an appropriate column in remote to persist `None` values to. 
144 | - Found by usage of the [Hubspot Tap](https://github.com/singer-io/tap-hubspot) 145 | 146 | ## 0.1.5 147 | 148 | - **FEATURES:** 149 | - [Added the `persist_empty_tables`](https://github.com/datamill-co/target-postgres/pull/97) config option which allows the Target to create empty tables in Remote. 150 | 151 | ## 0.1.4 152 | 153 | - **BUG FIX:** A bug was identified in 0.1.3 with stream `key_properties` and canonicalization. 154 | - [LINK](https://github.com/datamill-co/target-postgres/pull/95) 155 | - Discovered and fixed by @mirelagrigoras 156 | - If the `key_properties` for a stream changed due to canonicalization, the stream would fail to persist due to: 157 | - the `persist_csv_rows` `key_properties` values would remain un-canonicalized and therefore cause issues once serialized into a SQL statement 158 | - the pre-checks for tables would break because no values could be pulled from the schema with un-canonicalized fields pulled out of the `key_properties` 159 | - **NOTE:** the `key_properties` metadata is saved with _raw_ field names. 160 | 161 | ## 0.1.3 162 | 163 | - **SCHEMA_VERSION: 1** 164 | - [LINK](https://github.com/datamill-co/target-postgres/pull/89) 165 | - Initialized a new field in remote table schemas `schema_version` 166 | - A migration in `PostgresTarget` handles updating this 167 | - **BUG FIX:** A bug was identified in 0.1.2 with column type splitting. 168 | - [LINK](https://github.com/datamill-co/target-postgres/pull/89) 169 | - A schema with a field of type `string` is persisted to remote 170 | - Later, the same field is of type `date-time` 171 | - The values for this field will _not_ be placed under a new column, but rather under the original `string` column 172 | - A schema with a field of type `date-time` is persisted to remote 173 | - Later, the same field is of type `string` 174 | - The original `date-time` column will be made `nullable` 175 | - The values for this field will fail to persist 176 | - **FEATURES:** 177 | - [Added the `logging_level`](https://github.com/datamill-co/target-postgres/pull/92) config option which uses standard Python Logger Levels to configure more details about what Target-Postgres is doing 178 | - Query level logging and timing 179 | - Table schema changes logging and timing 180 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | ## Setting up test environment 4 | 5 | ### Prerequisites 6 | 7 | ``` 8 | python3 -m virtualenv -p python3 venv 9 | source venv/bin/activate 10 | python3 -m pip install -e .[tests] 11 | # python3 -m pip install -e .\[tests\] <- might need to escape on zsh 12 | export POSTGRES_HOST=localhost 13 | export POSTGRES_DATABASE=target_postgres_test 14 | export POSTGRES_USERNAME=target_postgres_test 15 | export POSTGRES_PASSWORD=target_postgres_test 16 | ``` 17 | 18 | #### Database setup 19 | If you're not using the docker images for tests you'll need to set one up and 20 | configure a user on it.
21 | 22 | ``` 23 | $ psql template1; 24 | template1=# CREATE USER target_postgres_test WITH PASSWORD 'target_postgres_test'; 25 | template1=# CREATE DATABASE target_postgres_test WITH owner=target_postgres_test; 26 | template1=# GRANT ALL privileges ON DATABASE target_postgres_test TO target_postgres_test; 27 | ``` 28 | 29 | #### If psycopg2 install fails 30 | 31 | psycopg2 requires SSL and may fail the `pip install` process above 32 | 33 | ##### Installing openssl 34 | 35 | ###### OSX: 36 | 37 | One possible solution is to use [homebrew](https://brew.sh/): 38 | 39 | ``` 40 | brew install openssl@1.1 41 | export LDFLAGS="-L/usr/local/opt/openssl@1.1/lib" 42 | export CPPFLAGS="-I/usr/local/opt/openssl@1.1/include" 43 | python3 -m pip install -e .[tests] 44 | ``` 45 | 46 | ## Running tests 47 | Tests are written using [pytest](https://docs.pytest.org/). 48 | 49 | ``` 50 | cd <root of the checkout> 51 | python3 -m pytest tests/unit 52 | ``` 53 | 54 | Simply run the tests with pytest as a module when inside the root of the 55 | checkout; this ensures the `target_postgres/` module directory is found on the 56 | `PYTHONPATH`. 57 | -------------------------------------------------------------------------------- /DECISIONS.md: -------------------------------------------------------------------------------- 1 | # Decisions 2 | 3 | This document is intended to provide clarity on many of the decisions/rationalizations 4 | which exist inside of [Datamill's](https://datamill.co/) Target SQL project 5 | for [Singer](https://singer.io). 6 | 7 | ## Principles 8 | 9 | The guiding principles we try to adhere to herein as far as _how_ to reach a 10 | conclusion are: 11 | 12 | 1. When possible, make the resulting data/schema in the remote target consistent, no matter the ordering of potential messages 13 | - ie, if our decision would result in a random schema being produced in the remote target for no reasonable benefit, this is in violation 14 | 1. Do right by the common _majority_ of users 15 | 1. Make a best effort to prevent a user from having to intervene 16 | 1. Use [Stitch’s offering and documentation](https://www.stitchdata.com/docs) as best practice guidance 17 | 18 | ## Schema 19 | 20 | ### De-nesting 21 | 22 | #### What 23 | 24 | - [JSON Schema](https://json-schema.org/) allows for complex schemas which have non-literal (ie, compositional) elements 25 | - examples include: 26 | - `objects` (ie, `{'a': 1, 'b': 2 ...}`) 27 | - `array` (ie, `[1, 'a', 2, {4: False}]`) 28 | - `anyOf` 29 | - Standard SQL does not support compositional elements, but rather data which is highly structured in potentially many related tables 30 | - To overcome this, `target-sql` provides tooling which unpacks: 31 | - json `objects` into their parent record 32 | - json `arrays` as sub tables 33 | 34 | ```py 35 | # Stream `FOO` 36 | [ 37 | {'nested_object': { 38 | 'a': 1, 39 | 'b': 2 40 | }, 41 | 'nested_array': [ 42 | {'c': False, 'd': 'abc'}, 43 | {'c': True, 'd': 'xyz'} 44 | ] 45 | } 46 | ] 47 | 48 | 49 | # Results in: 50 | ## Table `foo` 51 | [ 52 | {'nested_object__a': 1, 53 | 'nested_object__b': 2} 54 | ] 55 | 56 | ## Table `foo__nested_array` 57 | [ 58 | {'c': False, 'd': 'abc'}, 59 | {'c': True, 'd': 'xyz'} 60 | ] 61 | 62 | ``` 63 | 64 | #### Why 65 | 66 | - This approach is inspired by what Stitch Data takes with `object`/`array` de-nesting.
67 | - The user experience for those using a SQL querying language is better for flat tables 68 | - as compared to something like [PostgreSQL's JSONB](https://www.postgresql.org/docs/9.4/datatype-json.html) support 69 | - Data warehouses tend to prefer [denormalized](https://en.wikipedia.org/wiki/Denormalization) structures while operational databases prefer normalized structures. We normalize the incoming structure so the user can choose what to do with the normalized raw data. Also it's easy to access and transform later than JSON blobs. 70 | 71 | ### Column Type Mismatch 72 | 73 | #### What 74 | 75 | 1. A field has been streamed to the remote target with type `integer` 76 | 1. A new field with the _same raw name_ as the remote column has been streamed but has type `boolean` 77 | - Data of type `boolean` cannot be placed into a column of type `integer` 78 | 1. `target-sql` has tooling which will: 79 | 1. rename the original column to `original_field_name__i` 80 | 1. make the renamed column `nullable` 81 | 1. create a new column of name `original_field_name__b` 82 | 1. stream new data to `original_field_name__b` 83 | - (to see a full list of type suffixes, please see: [`json_schema._shorthand_mappings`](https://github.com/datamill-co/target-postgres/blob/d626061d7a0e785f06b19589e1951637f2748262/target_postgres/json_schema.py#L283)) 84 | 85 | #### Why 86 | 87 | ***TL;DR:*** Instead of throwing a hard error and forcing users to do some manual 88 | transformation _before_ streaming data through `target-sql`, we chose a "best 89 | effort" approach to resolving the underlying error. 90 | 91 | By renaming and migrating the column we: 92 | 93 | - make the resulting structure in the database the same no matter whether we upload column `integer` _then_ column `boolean` or vice versa. 94 | - users learn of dependent views/columns blocking a type change _early_ 95 | 96 | ### Column Name Collision 97 | 98 | #### What 99 | 100 | 1. Field of name `foo` is streamed 101 | 1. Field of name `FOO` is then streamed 102 | 1. Since both of these names canonicalize to the same result (ie, `foo`), we have a name collision 103 | 1. When attempting to `upsert_table`, `SQLInterface` has to handle name collisions. To do this, it attaches a unique suffix to the name which _caused the collision_, not the original 104 | - The suffix is an auto-incrementing numerical value 105 | 106 | ```py 107 | # Field `foo` is streamed 108 | # Field `FOO` is streamed 109 | 110 | [ 111 | {'foo': 1, 112 | 'FOO': False, 113 | 'fOo': 4.0} 114 | ] 115 | 116 | # The resulting table will be: 117 | 118 | [ 119 | {'foo': 1, 120 | 'foo__1': False, 121 | 'foo__2': 4.0} 122 | ] 123 | 124 | ``` 125 | 126 | #### Why 127 | 128 | ***TL;DR:*** Instead of throwing a hard error and forcing users to do some manual 129 | transformation _before_ streaming data through `target-sql`, we chose a "best 130 | effort" approach to resolving the underlying error. 131 | 132 | - While this means that _ordering_ of fields/actions matters in regards to the final remote structure, users can observe their remote structure simply 133 | - Hashes have been used as suffixes in past, but it was determined that these were too confusing for end users. So while they allowed us to adhere to [principle](#principles) (1), it meant [principle](#principles) (2) was being ignored. 134 | - Additionally, we chose _not to_ prepend a numerical suffix to _all_ columns for the same reason. 
_Most_ users are not going to have name collisions, so instead of making the overall user experience worse, we chose to have a targeted solution to this particular edge case 135 | 136 | ### Column Name Length 137 | 138 | #### What 139 | 140 | - `SQLInterface` provides a single field called `IDENTIFIER_FIELD_LENGTH` which is to be overridden by the implementing class 141 | - Any column which is found to be excess of `IDENTIFIER_FIELD_LENGTH` is truncated to be no longer than `IDENTIFIER_FIELD_LENGTH` 142 | - All `collision` and `type` information is preserved in the truncation 143 | - ie, any values which are suffixed onto the name as `__...` 144 | - All original field/column names are preserved as a `column_mapping` 145 | 146 | #### Why 147 | 148 | ***TL;DR:*** Instead of throwing a hard error and forcing users to do some manual 149 | transformation _before_ streaming data through `target-sql`, we chose a "best 150 | effort" approach to resolving the underlying error. 151 | 152 | Most (all?) SQL targets we have encountered have length restrictions for identifiers 153 | in their schema. Since arbitrary JSON _does_ not have this same restriction, we needed 154 | a best effort mechanism for handling names which were either auto-generated and are 155 | too long, or user input fields which physically cannot fit into the remote target. 156 | 157 | As such, we chose to take the simplest method here for clarity. ie, truncate the 158 | original/generated name, and then proceed with collision support as normal. 159 | 160 | The implementing class is tasked with providing `canonicalize_identifier`, a method 161 | which when called is expected to _only_ transform a string identifier into another 162 | string identifier which contains only characters which are allowed by the remote target. 163 | 164 | ## Data De-nesting 165 | 166 | ### Objects 167 | 168 | #### What 169 | 170 | - `Objects` are unpacked into their parent table. 171 | - The unpacked fields are prefixed with the name of the `field` which originally contained the object. 172 | 173 | #### Why 174 | 175 | - This approach is inspired by what Stitch Data takes with `object` de-nesting. 176 | - The user experience for those using a SQL querying language is better for flat tables 177 | - as compared to something like [PostgreSQL's JSONB](https://www.postgresql.org/docs/9.4/datatype-json.html) support 178 | 179 | ### Arrays 180 | 181 | #### What 182 | 183 | - `Arrays` are unrolled as individual rows into a child table 184 | - The table name is constructed as `parent_table__field` 185 | 186 | #### Why 187 | 188 | - This approach is inspired by what Stitch Data takes with `array` de-nesting. 
189 | - The user experience for those using a SQL querying language is better for flat tables 190 | - as compared to something like [PostgreSQL's JSONB](https://www.postgresql.org/docs/9.4/datatype-json.html) support 191 | 192 | ## Queries 193 | 194 | ### What 195 | 196 | - When we write SQL at any given point, we have the option to use "latest" PostgreSQL features 197 | - We opt for features available from PostgreSQL 8.4.22 forward 198 | - We ***DO NOT*** support PostgreSQL 8.4.22 199 | - any features/bugs issues based on this will be weighed against this decision as far as effort to benefit 200 | 201 | ### Why 202 | 203 | - Supporting multiple versions of PostgreSQL has _thus far_ been fairly straightforward by adhering to only query support available in the _oldest_ version of supported PostgreSQL 204 | - By doing this, we only have one main code base, instead of many fractured versions which all employ the latest/greatest system functions/methods/tables/information schemas available 205 | - By using 8.4.22, supporting [Redshift](https://github.com/datamill-co/target-redshift) is made simpler 206 | - Redshift was originally split from [PostgreSQL 8.0.2](https://docs.aws.amazon.com/redshift/latest/dg/c_redshift-and-postgres-sql.html) 207 | - At some point, a _lot_ of work was done by AWS to make Redshift a "simple fork" of PostgreSQL 8.4 208 | - We do not _support_ PostgreSQL 8.4 simply because PostgreSQL does not support it anymore 209 | - Our _only_ benefit to making 8.4 query language our target is Redshift 210 | - When a new supported version of PostgreSQL comes along, and we undertake the effort to support it herein, if supporting it is simpler to do by breaking 8.4, we will move the necessary logic to [target-redshift](https://github.com/datamill-co/target-redshift) 211 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018-2021 Data Mill Services, LLC 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Target Postgres 2 | 3 | [![CircleCI](https://circleci.com/gh/datamill-co/target-postgres.svg?style=svg)](https://circleci.com/gh/datamill-co/target-postgres) 4 | 5 | [![PyPI version](https://badge.fury.io/py/singer-target-postgres.svg)](https://pypi.org/project/singer-target-postgres/) 6 | 7 | [![](https://img.shields.io/librariesio/github/datamill-co/target-postgres.svg)](https://libraries.io/github/datamill-co/target-postgres) 8 | 9 | A [Singer](https://singer.io/) postgres target, for use with Singer streams generated by Singer taps. 10 | 11 | ## Features 12 | 13 | - Creates SQL tables for [Singer](https://singer.io) streams 14 | - Denests objects flattening them into the parent object's table 15 | - Denests rows into separate tables 16 | - Adds columns and sub-tables as new fields are added to the stream [JSON Schema](https://json-schema.org/) 17 | - Full stream replication via record `version` and `ACTIVATE_VERSION` messages. 18 | 19 | ## Install 20 | 21 | 1. Add `libpq` dependency 22 | 23 | ```sh 24 | # macos 25 | brew install postgresql 26 | ``` 27 | ```sh 28 | # ubuntu 29 | sudo apt install libpq-dev 30 | ``` 31 | 32 | 1. install `singer-target-postgres` 33 | 34 | ```sh 35 | pip install singer-target-postgres 36 | ``` 37 | 38 | ## Usage 39 | 40 | 1. Follow the 41 | [Singer.io Best Practices](https://github.com/singer-io/getting-started/blob/master/docs/RUNNING_AND_DEVELOPING.md#running-a-singer-tap-with-a-singer-target) 42 | for setting up separate `tap` and `target` virtualenvs to avoid version 43 | conflicts. 44 | 45 | 1. Create a [config file](#configjson) at 46 | `~/singer.io/target_postgres_config.json` with postgres connection 47 | information and target postgres schema. 48 | 49 | ```json 50 | { 51 | "postgres_host": "localhost", 52 | "postgres_port": 5432, 53 | "postgres_database": "my_analytics", 54 | "postgres_username": "myuser", 55 | "postgres_password": "1234", 56 | "postgres_schema": "mytapname" 57 | } 58 | ``` 59 | 60 | 1. Run `target-postgres` against a [Singer](https://singer.io) tap. 61 | 62 | ```bash 63 | ~/.virtualenvs/tap-something/bin/tap-something \ 64 | | ~/.virtualenvs/target-postgres/bin/target-postgres \ 65 | --config ~/singer.io/target_postgres_config.json >> state.json 66 | ``` 67 | 68 | If you are running windows, the following is equivalent: 69 | 70 | ``` 71 | venvs\tap-exchangeratesapi\Scripts\tap-exchangeratesapi.exe | ^ 72 | venvs\target-postgresql\Scripts\target-postgres.exe ^ 73 | --config target_postgres_config.json 74 | ``` 75 | 76 | ### Config.json 77 | 78 | The fields available to be specified in the config file are specified 79 | here. 
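For illustration, a config that combines a few of the optional settings described in the table below might look like the following (all values, including the hypothetical `loader` role, are placeholders rather than recommendations):

```json
{
  "postgres_host": "db.example.com",
  "postgres_port": 5432,
  "postgres_database": "my_analytics",
  "postgres_username": "myuser",
  "postgres_password": "1234",
  "postgres_schema": "mytapname",
  "postgres_sslmode": "require",
  "invalid_records_threshold": 10,
  "max_batch_rows": 50000,
  "logging_level": "DEBUG",
  "before_run_sql": "SET ROLE loader;"
}
```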
80 | 81 | | Field | Type | Default | Details | 82 | | --------------------------- | --------------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 83 | | `postgres_host` | `["string", "null"]` | `"localhost"` | | 84 | | `postgres_port` | `["integer", "null"]` | `5432` | | 85 | | `postgres_database` | `["string"]` | `N/A` | | 86 | | `postgres_username` | `["string", "null"]` | `N/A` | | 87 | | `postgres_password` | `["string", "null"]` | `null` | | 88 | | `postgres_schema` | `["string", "null"]` | `"public"` | | 89 | | `postgres_sslmode` | `["string", "null"]` | `"prefer"` | Refer to the [libpq](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS) docs for more information about SSL | 90 | | `postgres_sslcert` | `["string", "null"]` | `"~/.postgresql/postgresql.crt"` | Only used if a SSL request w/ a client certificate is being made | 91 | | `postgres_sslkey` | `["string", "null"]` | `"~/.postgresql/postgresql.key"` | Only used if a SSL request w/ a client certificate is being made | 92 | | `postgres_sslrootcert` | `["string", "null"]` | `"~/.postgresql/root.crt"` | Used for authentication of a server SSL certificate | 93 | | `postgres_sslcrl` | `["string", "null"]` | `"~/.postgresql/root.crl"` | Used for authentication of a server SSL certificate | 94 | | `invalid_records_detect` | `["boolean", "null"]` | `true` | Include `false` in your config to disable `target-postgres` from crashing on invalid records | 95 | | `invalid_records_threshold` | `["integer", "null"]` | `0` | Include a positive value `n` in your config to allow for `target-postgres` to encounter at most `n` invalid records per stream before giving up. | 96 | | `disable_collection` | `["string", "null"]` | `false` | Include `true` in your config to disable [Singer Usage Logging](#usage-logging). | 97 | | `logging_level` | `["string", "null"]` | `"INFO"` | The level for logging. Set to `DEBUG` to get things like queries executed, timing of those queries, etc. See [Python's Logger Levels](https://docs.python.org/3/library/logging.html#levels) for information about valid values. | 98 | | `persist_empty_tables` | `["boolean", "null"]` | `False` | Whether the Target should create tables which have no records present in Remote. | 99 | | `max_batch_rows` | `["integer", "null"]` | `200000` | The maximum number of rows to buffer in memory before writing to the destination table in Postgres | 100 | | `max_buffer_size` | `["integer", "null"]` | `104857600` (100MB in bytes) | The maximum number of bytes to buffer in memory before writing to the destination table in Postgres | 101 | | `batch_detection_threshold` | `["integer", "null"]` | `5000`, or 1/40th `max_batch_rows` | How often, in rows received, to count the buffered rows and bytes to check if a flush is necessary. There's a slight performance penalty to checking the buffered records count or bytesize, so this controls how often this is polled in order to mitigate the penalty. This value is usually not necessary to set as the default is dynamically adjusted to check reasonably often. 
| 102 | | `state_support` | `["boolean", "null"]` | `True` | Whether the Target should emit `STATE` messages to stdout for further consumption. In this mode, which is on by default, STATE messages are buffered in memory until all the records that occurred before them are flushed according to the batch flushing schedule the target is configured with. | 103 | | `add_upsert_indexes` | `["boolean", "null"]` | `True` | Whether the Target should create column indexes on the important columns used during data loading. These indexes will make data loading slightly slower but the deduplication phase much faster. Defaults to on for better baseline performance. | 104 | | `before_run_sql` | `["string", "null"]` | `None` | Raw SQL statement(s) to execute as soon as the connection to Postgres is opened by the target. Useful for setup like `SET ROLE` or other connection state that is important. | 105 | | `after_run_sql` | `["string", "null"]` | `None` | Raw SQL statement(s) to execute as soon as the connection to Postgres is opened by the target. Useful for setup like `SET ROLE` or other connection state that is important. | 106 | | `before_run_sql_file` | `["string", "null"]` | `None` | Similar to `before_run_sql` but reads an external file instead of SQL in the JSON config file. | 107 | | `after_run_sql_file` | `["string", "null"]` | `None` | Similar to `after_run_sql` but reads an external file instead of SQL in the JSON config file. | 108 | | `application_name` | `["string", "null"]` | `None` | Set the postgresql `application_name` connection option to help with debugging, etc... | 109 | 110 | 111 | ### Supported Versions 112 | 113 | `target-postgres` only supports [JSON Schema Draft4](http://json-schema.org/specification-links.html#draft-4). 114 | While declaring a schema _is optional_, any input schema which declares a version 115 | other than 4 will be rejected. 116 | 117 | `target-postgres` supports all versions of PostgreSQL which are presently supported 118 | by the PostgreSQL Global Development Group. Our [CI config](https://github.com/datamill-co/target-postgres/blob/master/.circleci/config.yml) defines all versions we are currently supporting. 119 | 120 | | Version | Current minor | Supported | First Release | Final Release | 121 | | ------- | ------------- | --------- | ------------------ | ----------------- | 122 | | 15 | 15.0 | Yes | October 13, 2022 | November 11, 2027 | 123 | | 14 | 14.5 | Yes | September 30, 2021 | November 12, 2026 | 124 | | 13 | 13.8 | Yes | September 24, 2020 | November 13, 2025 | 125 | | 12 | 12.12 | Yes | October 3, 2019 | November 14, 2024 | 126 | | 11 | 11.17 | Yes | October 18, 2018 | November 9, 2023 | 127 | | 10 | 10.22 | Yes | October 5, 2017 | November 10, 2022 | 128 | 129 | _The above is copied from the [current list of versions](https://www.postgresql.org/support/versioning/) on Postgresql.org_ 130 | 131 | ## Known Limitations 132 | 133 | - Requires a [JSON Schema](https://json-schema.org/) for every stream. 134 | - Only string, string with date-time format, integer, number, boolean, 135 | object, and array types with or without null are supported. Arrays can 136 | have any of the other types listed, including objects as types within 137 | items. 
138 | - Example of JSON Schema types that work 139 | - `['number']` 140 | - `['string']` 141 | - `['string', 'null']` 142 | - Example of JSON Schema types that **DO NOT** work 143 | - `['string', 'integer']` 144 | - `['integer', 'number']` 145 | - `['any']` 146 | - `['null']` 147 | - JSON Schema combinations such as `anyOf` and `oneOf` are not supported. 148 | - JSON Schema \$ref is partially supported: 149 | - **_NOTE:_** The following limitations are known to **NOT** fail gracefully 150 | - Presently you cannot have any circular or recursive `$ref`s 151 | - `$ref`s must be present within the schema: 152 | - URIs do not work 153 | - if the `$ref` is broken, the behaviour is considered unexpected 154 | - Any values which are the `string` `NULL` will be streamed to PostgreSQL as the literal `null` 155 | - Table names are restricted to: 156 | - 63 characters in length 157 | - can only be composed of `_`, lowercase letters, numbers, `$` 158 | - cannot start with `$` 159 | - ASCII characters 160 | - Field/Column names are restricted to: 161 | - 63 characters in length 162 | - ASCII characters 163 | 164 | ## Indexes 165 | 166 | If the `add_upsert_indexes` config option is enabled, which it is by default, `target-postgres` adds indexes on the tables it creates for its own queries to be more performant. Specifically, `target-postgres` automatically adds indexes to the `_sdc_sequence` column and the `_sdc_level__id` columns, which are used heavily when inserting and upserting. 167 | 168 | `target-postgres` doesn't have any facilities for adding other indexes to the managed tables, so if more indexes are required, they should be added by another downstream tool, or by an administrator when necessary. Note that these indexes incur performance overhead to maintain as data is inserted. These indexes can also prevent `target-postgres` from dropping columns in the future if the schema of the table changes, in which case an administrator should drop the index so `target-postgres` is able to drop the columns it needs to. 169 | 170 | **Note**: Index adding is new as of version `0.2.1`, and `target-postgres` does not retroactively create indexes for tables it created before that time. If you want to add indexes to older tables `target-postgres` is loading data into, they should be added manually. 171 | 172 | ## Usage Logging 173 | 174 | [Singer.io](https://www.singer.io/) requires official taps and targets to collect anonymous usage data. This data is only used in aggregate to report on individual taps/targets, as well as the Singer community at-large. IP addresses are recorded to detect unique tap/target users but are not shared with third-parties. 175 | 176 | To disable anonymous data collection set `disable_collection` to `true` in the configuration JSON file. 177 | 178 | ## Developing 179 | 180 | `target-postgres` utilizes [poetry](https://python-poetry.org/docs/) for package 181 | management, and [PyTest](https://docs.pytest.org/en/latest/contents.html) for testing. 182 | 183 | ### Documentation 184 | 185 | See also: 186 | 187 | - [DECISIONS](./DECISIONS.md): A document containing high level explanations of various decisions and decision making paradigms. A good place to request more explanation/clarification on confusing things found herein.
188 | - [TableMetadata](./docs/TableMetadata.md): A document detailing some of the metadata necessary for `TargetPostgres` to function correctly on the Remote 189 | 190 | ### Docker 191 | 192 | If you have [Docker](https://www.docker.com/) and [Docker Compose](https://docs.docker.com/compose/) installed, you can 193 | easily run the following to get a local env setup quickly. 194 | 195 | ```sh 196 | $ docker-compose up -d --build 197 | $ docker logs -tf target-postgres_target-postgres_1 # You container names might differ 198 | ``` 199 | 200 | As soon as you see `INFO: Dev environment ready.` you can shell into the container and start running test commands: 201 | 202 | ```sh 203 | $ docker-compose exec target-postgres bash 204 | (target-postgres) root@...:/code# pytest 205 | ``` 206 | 207 | The environment inside the docker container has a virtualenv set up and activated, with an `--editable` install of `target-postgres` inside it and your local code mounted as a Docker volume. If you make changes on your host and re-run `pytest` any changes should be reflected inside the container. 208 | 209 | See the [PyTest](#pytest) commands below! 210 | 211 | ### DB 212 | 213 | To run the tests, you will need a PostgreSQL server running. 214 | 215 | **_NOTE:_** Testing assumes that you've exposed the traditional port `5432`. 216 | 217 | Make sure to set the following env vars for [PyTest](#pytest): 218 | 219 | ```sh 220 | $ EXPORT POSTGRES_HOST='' # Most likely 'localhost' 221 | $ EXPORT POSTGRES_DB='' # We use 'target_postgres_test' 222 | $ EXPORT POSTGRES_USER=' Fork -> New Branch(If needed) -> Pull Request -> Approval -> Merge 246 | 247 | Users can file an issue without submitting a pull request but be aware not all issues can or will be addressed. 248 | 249 | ## Sponsorship 250 | 251 | Target Postgres is sponsored by Data Mill (Data Mill Services, LLC) [datamill.co](https://datamill.co/). 252 | 253 | Data Mill helps organizations utilize modern data infrastructure and data science to power analytics, products, and services. 254 | 255 | --- 256 | 257 | Copyright Data Mill Services, LLC 2018 258 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | services: 3 | db: 4 | image: postgres:9.6.17 5 | environment: 6 | POSTGRES_DB: target_postgres_test 7 | POSTGRES_PASSWORD: postgres 8 | ports: 9 | - "5432:5432" 10 | 11 | target-postgres: 12 | image: python:3.7.7-stretch 13 | working_dir: /code 14 | entrypoint: /code/docker-entrypoint.sh 15 | environment: 16 | POSTGRES_HOST: db 17 | POSTGRES_DATABASE: target_postgres_test 18 | POSTGRES_USERNAME: postgres 19 | POSTGRES_PASSWORD: postgres 20 | volumes: 21 | - .:/code 22 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python -m venv venv/target-postgres 4 | tests/migrations/scripts/install_schema_versions.sh 5 | source /code/venv/target-postgres/bin/activate 6 | 7 | pip install -e .[tests] 8 | 9 | echo "source /code/venv/target-postgres/bin/activate" >> ~/.bashrc 10 | echo -e "\n\nINFO: Dev environment ready." 
11 | 12 | tail -f /dev/null 13 | -------------------------------------------------------------------------------- /docs/TableMetadata.md: -------------------------------------------------------------------------------- 1 | # Table Metadata 2 | 3 | `SQLInterface` relies upon more schema information than is normally able to be 4 | provided by a raw SQL schema. For instance, information about the original 5 | non-canonicalized name of a field gets lost when that field is normalized into 6 | a column. 7 | 8 | To achieve this, metadata is stored. For Target Postgres, this metadata is currently 9 | stored in a JSON Blob which is set onto each table's comment. 10 | 11 | This document details the structure of this structure. 12 | 13 | ## Table Comment Schema 14 | 15 | | Field | Type | Default | Details | 16 | | ----- | ---- | ------- | ------- | 17 | | `version` |`["string", "null"]` | `null` | The Singer table version to be used with `activate_version` | 18 | | `key_properties` | `["array", "null"]` | `null` | Array of `string`s representing the pks for the table. | 19 | | `mappings` | `["object", "null"]`| `null` | Mappings which take `current_column_name` to a `COLUMN_MAPPING` detailed below. | 20 | | `table_mappings` | `{'type': ["array", "null"], 'items': {'type': "$TABLE_MAPPING"}}`| `null` | Mappings which detail information about tables and their names. See `TABLE_MAPPING` below. | 21 | 22 | ## COLUMN_MAPPING 23 | 24 | | Field | Type | Default | Details | 25 | | ----- | ---- | ------- | ------- | 26 | | `from` | `["string"]` | `N/A` | The original name of the field/property this column represents | 27 | | `type` | `["array"]` | `N/A` | The `json_schema.type` of the `from` column | 28 | 29 | ## TABLE_MAPPING 30 | 31 | | Field | Type | Default | Details | 32 | | ----- | ---- | ------- | ------- | 33 | | `type` | `["string"]` | `TABLE` | The type of mapping present, which is always `TABLE` | 34 | | `from` | `{'type': ["array"], 'items': {'type': ["string"]}}` | `[]` | The fields/properties which lead to this (sub)table in the original schema. ie, the root table's path will always be `[]`, a table made from an array found at the property `foo` will be `[, "foo"]` etc. etc. 
| 35 | | `to` | `["string"]` | `N/A` | The table name which takes the `from` path `to` the target's representation | 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "singer-target-postgres" 3 | version = "0.2.4" 4 | authors = ["datamill"] 5 | description = "Singer.io target for loading data into postgres" 6 | readme = "README.md" 7 | homepage = "https://github.com/datamill-co/target-postgres" 8 | repository = "https://github.com/datamill-co/target-postgres" 9 | classifiers = [ 10 | "License :: OSI Approved :: MIT License", 11 | "Intended Audience :: Developers", 12 | "Operating System :: OS Independent", 13 | "Programming Language :: Python :: 3.7", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: Implementation :: CPython", 19 | "Topic :: Software Development :: Libraries :: Application Frameworks", 20 | ] 21 | license = "MIT" 22 | packages = [{include = "target_postgres"}] 23 | 24 | [project.urls] 25 | "Homepage" = "https://github.com/datamill-co/target-postgres" 26 | "Bug Tracker" = "https://github.com/datamill-co/target-postgres/issues" 27 | 28 | 29 | [tool.poetry.dependencies] 30 | python = ">=3.7" 31 | arrow = "^1.2.3" 32 | psycopg2-binary = "^2.9.5" 33 | singer-python = "^5.9.0" 34 | 35 | [tool.poetry.group.tests] 36 | optional = true 37 | 38 | [tool.poetry.group.tests.dependencies] 39 | chance = "^0.110" 40 | Faker = "^15.1.3" 41 | pytest = "^7.2.0" 42 | 43 | [build-system] 44 | requires = ["poetry-core"] 45 | build-backend = "poetry.core.masonry.api" 46 | 47 | [tool.poetry.scripts] 48 | # CLI declaration 49 | target-postgres = 'target_postgres:cli' 50 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | error 4 | ignore::UserWarning 5 | ignore:.*Using or importing the ABCs from:DeprecationWarning 6 | -------------------------------------------------------------------------------- /target_postgres/__init__.py: -------------------------------------------------------------------------------- 1 | from singer import utils 2 | import psycopg2 3 | 4 | from target_postgres.postgres import MillisLoggingConnection, PostgresTarget 5 | from target_postgres import target_tools 6 | 7 | REQUIRED_CONFIG_KEYS = [ 8 | 'postgres_database' 9 | ] 10 | 11 | 12 | def main(config, input_stream=None): 13 | with psycopg2.connect( 14 | connection_factory=MillisLoggingConnection, 15 | host=config.get('postgres_host', 'localhost'), 16 | port=config.get('postgres_port', 5432), 17 | dbname=config.get('postgres_database'), 18 | user=config.get('postgres_username'), 19 | password=config.get('postgres_password'), 20 | sslmode=config.get('postgres_sslmode'), 21 | sslcert=config.get('postgres_sslcert'), 22 | sslkey=config.get('postgres_sslkey'), 23 | sslrootcert=config.get('postgres_sslrootcert'), 24 | sslcrl=config.get('postgres_sslcrl'), 25 | application_name=config.get('application_name', 'target-postgres'), 26 | ) as connection: 27 | postgres_target = PostgresTarget( 28 | connection, 29 | postgres_schema=config.get('postgres_schema', 'public'), 30 | logging_level=config.get('logging_level'), 31 | 
persist_empty_tables=config.get('persist_empty_tables'), 32 | add_upsert_indexes=config.get('add_upsert_indexes', True), 33 | before_run_sql=config.get('before_run_sql'), 34 | after_run_sql=config.get('after_run_sql'), 35 | ) 36 | 37 | if input_stream: 38 | target_tools.stream_to_target(input_stream, postgres_target, config=config) 39 | else: 40 | target_tools.main(postgres_target) 41 | 42 | 43 | def cli(): 44 | args = utils.parse_args(REQUIRED_CONFIG_KEYS) 45 | 46 | main(args.config) 47 | -------------------------------------------------------------------------------- /target_postgres/denest.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from target_postgres import json_schema, singer 4 | 5 | 6 | def to_table_batches(schema, key_properties, records): 7 | """ 8 | Given a schema, and records, get all table schemas and records and prep them 9 | in a `table_batch`. 10 | 11 | :param schema: SingerStreamSchema 12 | :param key_properties: [string, ...] 13 | :param records: [{...}, ...] 14 | :return: [{'streamed_schema': TABLE_SCHEMA(local), 15 | 'records': [{(path_0, path_1, ...): 16 | (_json_schema_string_type, value), ...}, 17 | ...]}, 18 | ...] 19 | """ 20 | table_schemas = _get_streamed_table_schemas(schema, 21 | key_properties) 22 | 23 | table_records = _get_streamed_table_records(key_properties, 24 | records) 25 | writeable_batches = [] 26 | for table_json_schema in table_schemas: 27 | writeable_batches.append({'streamed_schema': table_json_schema, 28 | 'records': table_records.get(table_json_schema['path'], [])}) 29 | 30 | return writeable_batches 31 | 32 | 33 | def _get_streamed_table_schemas(schema, key_properties): 34 | """ 35 | Given a `schema` and `key_properties` return the denested/flattened TABLE_SCHEMA of 36 | the root table and each sub table. 37 | 38 | :param schema: SingerStreamSchema 39 | :param key_properties: [string, ...] 40 | :return: [TABLE_SCHEMA(denested_streamed_schema_0), ...] 41 | """ 42 | root_table_schema = json_schema.simplify(schema) 43 | 44 | subtables = {} 45 | key_prop_schemas = {} 46 | for key in key_properties: 47 | key_prop_schemas[key] = schema['properties'][key] 48 | _denest_schema(tuple(), root_table_schema, key_prop_schemas, subtables) 49 | 50 | ret = [_to_table_schema(tuple(), None, key_properties, root_table_schema['properties'])] 51 | for path, schema in subtables.items(): 52 | ret.append(_to_table_schema(path, schema['level'], schema['key_properties'], schema['properties'])) 53 | 54 | return ret 55 | 56 | 57 | def _to_table_schema(path, level, keys, properties): 58 | for key in keys: 59 | if not (key,) in properties: 60 | raise Exception('Unknown key "{}" found for table "{}". 
Known fields are: {}'.format( 61 | key, path, properties 62 | )) 63 | 64 | return {'type': 'TABLE_SCHEMA', 65 | 'path': path, 66 | 'level': level, 67 | 'key_properties': keys, 68 | 'mappings': [], 69 | 'schema': {'type': 'object', 70 | 'additionalProperties': False, 71 | 'properties': properties}} 72 | 73 | 74 | def _literal_only_schema(schema): 75 | 76 | ret_types = json_schema.get_type(schema) 77 | 78 | if json_schema.is_object(schema): 79 | ret_types.remove(json_schema.OBJECT) 80 | if json_schema.is_iterable(schema): 81 | ret_types.remove(json_schema.ARRAY) 82 | if json_schema.is_nullable(schema): 83 | ret_types.remove(json_schema.NULL) 84 | 85 | ret_schemas = [] 86 | for t in ret_types: 87 | s = deepcopy(schema) 88 | s['type'] = [t] 89 | 90 | if json_schema.is_nullable(schema): 91 | s = json_schema.make_nullable(s) 92 | 93 | ret_schemas.append(s) 94 | 95 | return { 96 | 'anyOf': ret_schemas 97 | } 98 | 99 | 100 | def _create_subtable(table_path, table_json_schema, key_prop_schemas, subtables, level): 101 | if json_schema.is_object(table_json_schema['items']): 102 | new_properties = table_json_schema['items']['properties'] 103 | else: 104 | new_properties = {singer.VALUE: table_json_schema['items']} 105 | 106 | key_properties = [] 107 | for pk, item_json_schema in key_prop_schemas.items(): 108 | key_properties.append(singer.SOURCE_PK_PREFIX + pk) 109 | new_properties[singer.SOURCE_PK_PREFIX + pk] = item_json_schema 110 | 111 | new_properties[singer.SEQUENCE] = { 112 | 'type': ['null', 'integer'] 113 | } 114 | 115 | for i in range(0, level + 1): 116 | new_properties[singer.LEVEL_FMT.format(i)] = { 117 | 'type': ['integer'] 118 | } 119 | 120 | new_schema = {'type': [json_schema.OBJECT], 121 | 'properties': new_properties, 122 | 'level': level, 123 | 'key_properties': key_properties} 124 | 125 | _denest_schema(table_path, new_schema, key_prop_schemas, subtables, level=level) 126 | 127 | subtables[table_path] = new_schema 128 | 129 | 130 | def _denest_schema__singular_schemas(table_json_schema): 131 | ret = [] 132 | assert json_schema.is_object(table_json_schema), 'Cannot denest non-object json_schema for tables. 
Passed: {}'.format(table_json_schema) 133 | 134 | for prop, sub_schema in table_json_schema['properties'].items(): 135 | singular_sub_schemas = [sub_schema] 136 | if json_schema.is_anyof(sub_schema): 137 | singular_sub_schemas = sub_schema['anyOf'] 138 | 139 | for s in singular_sub_schemas: 140 | assert json_schema.is_object(s) or json_schema.is_iterable(s) or json_schema.is_literal(s), \ 141 | 'Table schema cannot be denested due to: {} {}'.format( 142 | s, 143 | table_json_schema) 144 | 145 | ret.append((prop, s)) 146 | 147 | return ret 148 | 149 | 150 | def _denest_schema_helper( 151 | table_path, 152 | prop_path, 153 | table_json_schema, 154 | nullable, 155 | top_level_schema, 156 | key_prop_schemas, 157 | subtables, 158 | level): 159 | 160 | for prop, item_json_schema in _denest_schema__singular_schemas(table_json_schema): 161 | 162 | if json_schema.is_object(item_json_schema): 163 | _denest_schema_helper(table_path + (prop,), 164 | prop_path + (prop,), 165 | item_json_schema, 166 | nullable, 167 | top_level_schema, 168 | key_prop_schemas, 169 | subtables, 170 | level) 171 | 172 | elif json_schema.is_iterable(item_json_schema): 173 | _create_subtable(table_path + (prop,), 174 | item_json_schema, 175 | key_prop_schemas, 176 | subtables, 177 | level + 1) 178 | 179 | elif json_schema.is_literal(item_json_schema): 180 | if nullable: 181 | item_json_schema = json_schema.make_nullable(item_json_schema) 182 | 183 | p = prop_path + (prop,) 184 | if p in top_level_schema: 185 | top_level_schema[p]['anyOf'].append(item_json_schema) 186 | else: 187 | top_level_schema[p] = {'anyOf': [item_json_schema]} 188 | 189 | 190 | def _denest_schema( 191 | table_path, 192 | table_json_schema, 193 | key_prop_schemas, 194 | subtables, 195 | level=-1): 196 | 197 | new_properties = {} 198 | for prop, item_json_schema in _denest_schema__singular_schemas(table_json_schema): 199 | 200 | if json_schema.is_object(item_json_schema): 201 | _denest_schema_helper(table_path + (prop,), 202 | (prop,), 203 | item_json_schema, 204 | json_schema.is_nullable(item_json_schema), 205 | new_properties, 206 | key_prop_schemas, 207 | subtables, 208 | level) 209 | 210 | elif json_schema.is_iterable(item_json_schema): 211 | _create_subtable(table_path + (prop,), 212 | item_json_schema, 213 | key_prop_schemas, 214 | subtables, 215 | level + 1) 216 | 217 | elif json_schema.is_literal(item_json_schema): 218 | if (prop,) in new_properties: 219 | new_properties[(prop,)]['anyOf'].append(item_json_schema) 220 | else: 221 | new_properties[(prop,)] = {'anyOf': [item_json_schema]} 222 | 223 | 224 | table_json_schema['properties'] = new_properties 225 | 226 | 227 | def _get_streamed_table_records(key_properties, records): 228 | """ 229 | Flatten the given `records` into `table_records`. 230 | Maintains `key_properties`. 231 | into `table_records`. 232 | 233 | :param key_properties: [string, ...] 234 | :param records: [{...}, ...] 235 | :return: {TableName string: [{(path_0, path_1, ...): (_json_schema_string_type, value), ...}, ...], 236 | ...} 237 | """ 238 | 239 | records_map = {} 240 | _denest_records(tuple(), 241 | records, 242 | records_map, 243 | key_properties) 244 | 245 | return records_map 246 | 247 | 248 | def _denest_subrecord(table_path, 249 | prop_path, 250 | parent_record, 251 | record, 252 | records_map, 253 | key_properties, 254 | pk_fks, 255 | level): 256 | """""" 257 | """ 258 | {...} 259 | """ 260 | for prop, value in record.items(): 261 | """ 262 | str : {...} | [...] | ???None??? 
| 263 | """ 264 | 265 | if isinstance(value, dict): 266 | """ 267 | {...} 268 | """ 269 | _denest_subrecord(table_path + (prop,), 270 | prop_path + (prop,), 271 | parent_record, 272 | value, 273 | records_map, 274 | key_properties, 275 | pk_fks, 276 | level) 277 | 278 | elif isinstance(value, list): 279 | """ 280 | [...] 281 | """ 282 | _denest_records(table_path + (prop,), 283 | value, 284 | records_map, 285 | key_properties, 286 | pk_fks=pk_fks, 287 | level=level + 1) 288 | 289 | elif value is None: 290 | """ 291 | None 292 | """ 293 | continue 294 | 295 | else: 296 | """ 297 | 298 | """ 299 | parent_record[prop_path + (prop,)] = (json_schema.python_type(value), value) 300 | 301 | 302 | def _denest_record(table_path, record, records_map, key_properties, pk_fks, level): 303 | """""" 304 | """ 305 | {...} 306 | """ 307 | denested_record = {} 308 | for prop, value in record.items(): 309 | """ 310 | str : {...} | [...] | None | 311 | """ 312 | 313 | if isinstance(value, dict): 314 | """ 315 | {...} 316 | """ 317 | _denest_subrecord(table_path + (prop,), 318 | (prop,), 319 | denested_record, 320 | value, 321 | records_map, 322 | key_properties, 323 | pk_fks, 324 | level) 325 | 326 | elif isinstance(value, list): 327 | """ 328 | [...] 329 | """ 330 | _denest_records(table_path + (prop,), 331 | value, 332 | records_map, 333 | key_properties, 334 | pk_fks=pk_fks, 335 | level=level + 1) 336 | 337 | elif value is None: 338 | """ 339 | None 340 | """ 341 | continue 342 | 343 | else: 344 | """ 345 | 346 | """ 347 | denested_record[(prop,)] = (json_schema.python_type(value), value) 348 | 349 | if table_path not in records_map: 350 | records_map[table_path] = [] 351 | records_map[table_path].append(denested_record) 352 | 353 | 354 | def _denest_records(table_path, records, records_map, key_properties, pk_fks=None, level=-1): 355 | row_index = 0 356 | """ 357 | [{...} ...] | [[...] ...] | [literal ...] 358 | """ 359 | for record in records: 360 | if pk_fks: 361 | record_pk_fks = pk_fks.copy() 362 | record_pk_fks[singer.LEVEL_FMT.format(level)] = row_index 363 | 364 | if not isinstance(record, dict): 365 | """ 366 | [...] | literal 367 | """ 368 | record = {singer.VALUE: record} 369 | 370 | for key, value in record_pk_fks.items(): 371 | record[key] = value 372 | row_index += 1 373 | else: ## top level 374 | record_pk_fks = {} 375 | for key in key_properties: 376 | record_pk_fks[singer.SOURCE_PK_PREFIX + key] = record[key] 377 | if singer.SEQUENCE in record: 378 | record_pk_fks[singer.SEQUENCE] = record[singer.SEQUENCE] 379 | 380 | """ 381 | {...} 382 | """ 383 | _denest_record(table_path, record, records_map, key_properties, record_pk_fks, level) 384 | -------------------------------------------------------------------------------- /target_postgres/exceptions.py: -------------------------------------------------------------------------------- 1 | class JSONSchemaError(Exception): 2 | """ 3 | Raise this when there is an error with regards to an instance of JSON Schema 4 | """ 5 | 6 | 7 | class TargetError(Exception): 8 | """ 9 | Raise when there is an Exception streaming data to the target. 10 | """ 11 | 12 | 13 | class PostgresError(Exception): 14 | """ 15 | Raise this when there is an error with regards to Postgres streaming 16 | """ 17 | 18 | 19 | class SingerStreamError(Exception): 20 | """ 21 | Raise when there is an Exception with Singer Streams. 
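    For example (illustrative), `BufferedSingerStream.add_record_message` raises this when the
    number of invalid records reaches `invalid_records_threshold` while `invalid_records_detect` is enabled.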
22 | """ 23 | -------------------------------------------------------------------------------- /target_postgres/json_schema.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import decimal 3 | import json 4 | import re 5 | 6 | from jsonschema import Draft4Validator 7 | from jsonschema.exceptions import SchemaError 8 | from target_postgres.exceptions import JSONSchemaError 9 | 10 | NULL = 'null' 11 | OBJECT = 'object' 12 | ARRAY = 'array' 13 | INTEGER = 'integer' 14 | NUMBER = 'number' 15 | BOOLEAN = 'boolean' 16 | STRING = 'string' 17 | DATE_TIME_FORMAT = 'date-time' 18 | 19 | _PYTHON_TYPE_TO_JSON_SCHEMA = { 20 | int: INTEGER, 21 | float: NUMBER, 22 | bool: BOOLEAN, 23 | str: STRING, 24 | type(None): NULL, 25 | decimal.Decimal: NUMBER 26 | } 27 | 28 | 29 | def python_type(x): 30 | """ 31 | Given a value `x`, return its Python Type as a JSONSchema type. 32 | :param x: 33 | :return: 34 | """ 35 | if not type(x) in _PYTHON_TYPE_TO_JSON_SCHEMA: 36 | raise JSONSchemaError('Unknown type `{}`. Cannot translate to JSONSchema type.'.format( 37 | str(type(x)) 38 | )) 39 | return _PYTHON_TYPE_TO_JSON_SCHEMA[type(x)] 40 | 41 | 42 | def get_type(schema): 43 | """ 44 | Given a JSON Schema dict, extracts the simplified `type` value 45 | :param schema: dict, JSON Schema 46 | :return: [string ...] 47 | """ 48 | t = schema.get('type', None) 49 | if not t: 50 | return [OBJECT] 51 | 52 | if isinstance(t, str): 53 | return [t] 54 | 55 | return deepcopy(t) 56 | 57 | 58 | def simple_type(schema): 59 | """ 60 | Given a JSON Schema dict, extracts the simplified schema, ie, a schema which can only represent 61 | _one_ of the given types allowed (along with the Nullable modifier): 62 | - OBJECT 63 | - ARRAY 64 | - INTEGER 65 | - NUMBER 66 | - BOOLEAN 67 | - STRING 68 | - DATE_TIME 69 | 70 | :param schema: dict, JSON Schema 71 | :return: dict, JSON Schema 72 | """ 73 | t = get_type(schema) 74 | 75 | if is_datetime(schema): 76 | return {'type': t, 77 | 'format': DATE_TIME_FORMAT} 78 | 79 | return {'type': t} 80 | 81 | 82 | def _get_ref(schema, paths): 83 | if not paths: 84 | return schema 85 | 86 | if not paths[0] in schema: 87 | raise JSONSchemaError('`$ref` "{}" not found in provided JSON Schema'.format(paths[0])) 88 | 89 | return _get_ref(schema[paths[0]], paths[1:]) 90 | 91 | 92 | def get_ref(schema, ref): 93 | """ 94 | Given a JSON Schema dict, and a valid ref (`$ref`), get the JSON Schema from within schema 95 | :param schema: dict, JSON Schema 96 | :param ref: string 97 | :return: dict, JSON Schema 98 | :raises: Exception 99 | """ 100 | 101 | # Explicitly only allow absolute internally defined $ref's 102 | if not re.match(r'^#/.*', ref): 103 | raise JSONSchemaError('Invalid format for `$ref`: "{}"'.format(ref)) 104 | 105 | return _get_ref(schema, 106 | re.split('/', re.sub(r'^#/', '', ref))) 107 | 108 | 109 | def _is_ref(schema): 110 | """ 111 | Given a JSON Schema compatible dict, returns True when the schema implements `$ref` 112 | 113 | NOTE: `$ref` OVERRIDES all other keys present in a schema 114 | :param schema: 115 | :return: Boolean 116 | """ 117 | 118 | return '$ref' in schema 119 | 120 | 121 | def _is_allof(schema): 122 | """ 123 | Given a JSON Schema compatible dict, returns True when the schema implements `allOf`. 
124 | 125 | :param schema: 126 | :return: Boolean 127 | """ 128 | 129 | return not _is_ref(schema) and 'allOf' in schema 130 | 131 | 132 | def is_anyof(schema): 133 | """ 134 | Given a JSON Schema compatible dict, returns True when the schema implements `anyOf`. 135 | 136 | :param schema: 137 | :return: Boolean 138 | """ 139 | 140 | return not _is_ref(schema) and not _is_allof(schema) and 'anyOf' in schema 141 | 142 | 143 | def is_object(schema): 144 | """ 145 | Given a JSON Schema compatible dict, returns True when schema's type allows being an Object. 146 | :param schema: dict, JSON Schema 147 | :return: Boolean 148 | """ 149 | 150 | return not _is_ref(schema) and not is_anyof(schema) and not _is_allof(schema) \ 151 | and (OBJECT in get_type(schema) 152 | or 'properties' in schema 153 | or not schema) 154 | 155 | 156 | def is_iterable(schema): 157 | """ 158 | Given a JSON Schema compatible dict, returns True when schema's type allows being iterable (ie, 'array') 159 | :param schema: dict, JSON Schema 160 | :return: Boolean 161 | """ 162 | 163 | return not _is_ref(schema) \ 164 | and ARRAY in get_type(schema) \ 165 | and 'items' in schema 166 | 167 | 168 | def is_nullable(schema): 169 | """ 170 | Given a JSON Schema compatible dict, returns True when schema's type allows being 'null' 171 | :param schema: dict, JSON Schema 172 | :return: Boolean 173 | """ 174 | 175 | return NULL in get_type(schema) 176 | 177 | 178 | def is_literal(schema): 179 | """ 180 | Given a JSON Schema compatible dict, returns True when schema's type allows being a literal 181 | (ie, 'integer', 'number', etc.) 182 | :param schema: dict, JSON Schema 183 | :return: Boolean 184 | """ 185 | 186 | return not {STRING, INTEGER, NUMBER, BOOLEAN}.isdisjoint(set(get_type(schema))) 187 | 188 | 189 | def is_datetime(schema): 190 | """ 191 | Given a JSON Schema compatible dict, returns True when schema's type allows being a date-time 192 | :param schema: dict, JSON Schema 193 | :return: Boolean 194 | """ 195 | 196 | return STRING in get_type(schema) and schema.get('format') == DATE_TIME_FORMAT 197 | 198 | 199 | def make_nullable(schema): 200 | """ 201 | Given a JSON Schema dict, returns the dict but makes the `type` `null`able. 202 | `is_nullable` will return true on the output. 203 | :return: dict, JSON Schema 204 | """ 205 | t = get_type(schema) 206 | if NULL in t: 207 | return schema 208 | 209 | ret_schema = deepcopy(schema) 210 | ret_schema['type'] = t + [NULL] 211 | return ret_schema 212 | 213 | 214 | class Cachable(dict): 215 | ''' 216 | The simplified json_schemas we produce are idempotent. ie, if you simplify a simplified 217 | json_schema, it will return the same thing. We wrap the `dict` object with a few 218 | helpers which extend it so that we avoid recursion in some instances. 219 | ''' 220 | def __init__(self, raw_dict, simplified=True): 221 | self._c = None 222 | super(Cachable, self).__init__(self, **raw_dict) 223 | 224 | def __hash__(self): 225 | return self._comparator().__hash__() 226 | 227 | def deepcopy(self): 228 | s = deepcopy(self) 229 | s._c = self._c 230 | return s 231 | 232 | def _comparator(self): 233 | if not self._c: 234 | self._c = json.dumps(self, sort_keys=True) 235 | 236 | return self._c 237 | 238 | def __lt__(self, other): 239 | return self._comparator() < other._comparator() 240 | 241 | 242 | def _allof_sort_key(schema): 243 | ''' 244 | We prefer scalars over combinations. 245 | With scalars we prefer date-times over strings. 246 | With combinations, we prefer objects. 
247 | With all, we prefer nullables. 248 | ''' 249 | if is_nullable(schema): 250 | sort_value = 0 251 | else: 252 | sort_value = 1 253 | 254 | if is_datetime(schema): 255 | sort_value += 0 256 | elif is_literal(schema): 257 | sort_value += 10 258 | elif is_object(schema): 259 | sort_value += 100 260 | elif is_iterable(schema): 261 | sort_value += 200 262 | else: 263 | # Unknown schema...maybe a $ref? 264 | sort_value += 1000 265 | 266 | return sort_value 267 | 268 | 269 | def _simplify__allof__merge__objects(schemas): 270 | ret_schema = schemas[0] 271 | # Merge objects together preferring later allOfs over earlier 272 | next_schemas = schemas[1:] 273 | while next_schemas and is_object(next_schemas[0]): 274 | ret_schema['properties'] = { 275 | **ret_schema.get('properties', {}), 276 | **next_schemas[0].get('properties', {})} 277 | 278 | next_schemas = next_schemas[1:] 279 | 280 | return ret_schema 281 | 282 | 283 | def _simplify__allof__merge__iterables(root_schema, schemas): 284 | ret_schema = schemas[0] 285 | # Recurse on all of the item schemas to create a single item schema 286 | item_schemas = [] 287 | 288 | next_schemas = schemas 289 | while next_schemas and is_iterable(next_schemas[0]): 290 | item_schemas.append(next_schemas[0]['items']) 291 | 292 | next_schemas = next_schemas[1:] 293 | 294 | ret_schema['items'] = _helper_simplify(root_schema, {'allOf': item_schemas}) 295 | return ret_schema 296 | 297 | 298 | def _simplify__allof(root_schema, child_schema): 299 | simplified_schemas = [ 300 | _helper_simplify(root_schema, schema) 301 | for schema in child_schema['allOf']] 302 | schemas = sorted(simplified_schemas, key=_allof_sort_key) 303 | 304 | ret_schema = schemas[0] 305 | 306 | if is_object(ret_schema): 307 | return _simplify__allof__merge__objects(schemas) 308 | 309 | if is_iterable(ret_schema): 310 | return _simplify__allof__merge__iterables(root_schema, schemas) 311 | 312 | return ret_schema 313 | 314 | 315 | def _simplify__implicit_anyof(root_schema, schema): 316 | ''' 317 | Typically literals are simple and have at most two types, one of which being NULL. 318 | However, they _can_ have many types wrapped up inside them as an implicit `anyOf`. 319 | 320 | Since we support `anyOf`, it is simpler to unwrap and "flatten" this implicit 321 | combination type. 
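    For example (illustrative): the implicit combination

        {'type': ['integer', 'string', 'null']}

    is unwrapped into the equivalent explicit form

        {'anyOf': [{'type': ['integer', 'null']}, {'type': ['string', 'null']}]}

    with the nullability pushed down onto each branch.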
322 | ''' 323 | schemas = [] 324 | types = set(get_type(schema)) 325 | 326 | if types == {NULL}: 327 | return Cachable({'type': [NULL]}) 328 | 329 | types.discard(NULL) 330 | 331 | if is_datetime(schema): 332 | schemas.append(Cachable({ 333 | 'type': [STRING], 334 | 'format': DATE_TIME_FORMAT 335 | })) 336 | 337 | types.remove(STRING) 338 | 339 | if is_object(schema): 340 | properties = {} 341 | for field, field_json_schema in schema.get('properties', {}).items(): 342 | properties[field] = _helper_simplify(root_schema, field_json_schema) 343 | 344 | schemas.append({ 345 | 'type': [OBJECT], 346 | 'properties': properties 347 | }) 348 | 349 | types.discard(OBJECT) 350 | 351 | if is_iterable(schema): 352 | schemas.append({ 353 | 'type': [ARRAY], 354 | 'items': _helper_simplify(root_schema, schema.get('items', {})) 355 | }) 356 | 357 | types.remove(ARRAY) 358 | 359 | schemas += [{'type': [t]} for t in types] 360 | 361 | if is_nullable(schema): 362 | schemas = [make_nullable(s) for s in schemas] 363 | 364 | 365 | return _helper_simplify(root_schema, {'anyOf': [Cachable(s) for s in schemas]}) 366 | 367 | 368 | def _simplify__anyof(root_schema, schema): 369 | ''' 370 | `anyOf` clauses are merged/simplified according to the following rules (these _are_ recursive): 371 | 372 | - all literals are dedupped 373 | - all objects are merged into the same object schema, with sub-schemas being grouped as simplified `anyOf` schemas 374 | - all iterables' `items` schemas are merged as simplified `anyOf` schemas 375 | - all `anyOf`s are flattened to the topmost 376 | - if there is only a single element in an `anyOf`, that is denested 377 | - if any `anyOf`s are nullable, all are nullable 378 | ''' 379 | 380 | schemas = [ 381 | _helper_simplify(root_schema, schema) 382 | for schema in schema['anyOf']] 383 | 384 | literals = set() 385 | any_nullable = False 386 | any_merged_objects = False 387 | merged_object_properties = {} 388 | any_merged_iters = False 389 | merged_item_schemas = [] 390 | 391 | while schemas: 392 | sub_schema = schemas.pop() 393 | any_nullable = any_nullable or is_nullable(sub_schema) 394 | 395 | if is_literal(sub_schema): 396 | literals.add(sub_schema) 397 | 398 | elif is_anyof(sub_schema): 399 | # Flatten potentially deeply nested `anyOf`s 400 | schemas += sub_schema['anyOf'] 401 | 402 | elif is_object(sub_schema): 403 | any_merged_objects = True 404 | for k, s in sub_schema.get('properties', {}).items(): 405 | if k in merged_object_properties: 406 | merged_object_properties[k].append(s) 407 | else: 408 | merged_object_properties[k] = [s] 409 | 410 | elif is_iterable(sub_schema): 411 | any_merged_iters = True 412 | merged_item_schemas.append(sub_schema['items']) 413 | 414 | merged_schemas = set() 415 | for l in literals: 416 | s = l 417 | if any_nullable: 418 | s = make_nullable(l) 419 | 420 | merged_schemas.add(Cachable(s)) 421 | 422 | if any_merged_objects: 423 | for k, v in merged_object_properties.items(): 424 | merged_object_properties[k] = _helper_simplify(root_schema, {'anyOf': v}) 425 | 426 | s = { 427 | 'type': [OBJECT], 428 | 'properties': merged_object_properties 429 | } 430 | 431 | if any_nullable: 432 | s = make_nullable(s) 433 | 434 | merged_schemas.add(Cachable(s)) 435 | 436 | if any_merged_iters: 437 | merged_item_schemas = _helper_simplify(root_schema, {'anyOf': merged_item_schemas}) 438 | 439 | s = { 440 | 'type': [ARRAY], 441 | 'items': merged_item_schemas 442 | } 443 | 444 | if any_nullable: 445 | s = make_nullable(s) 446 | 447 | merged_schemas.add(Cachable(s)) 448 | 
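    # Illustrative note (added comment): a trivial combination such as
    # {'anyOf': [{'type': ['string']}, {'type': ['null']}]} collapses here into the single nullable
    # schema {'type': ['string', 'null']}; only genuinely distinct branches keep their `anyOf` wrapper.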
449 | if len(merged_schemas) == 1: 450 | return merged_schemas.pop() 451 | 452 | return Cachable({'anyOf': sorted(merged_schemas)}) 453 | 454 | 455 | def _helper_simplify(root_schema, child_schema): 456 | # We check this value to make simplify a noop for schemas which have _already_ been simplified 457 | if isinstance(child_schema, Cachable): 458 | return child_schema 459 | 460 | ## Refs override all other type definitions 461 | if _is_ref(child_schema): 462 | try: 463 | ret_schema = _helper_simplify(root_schema, get_ref(root_schema, child_schema['$ref'])) 464 | 465 | except RecursionError: 466 | raise JSONSchemaError('`$ref` path "{}" is recursive'.format(get_ref(root_schema, child_schema['$ref']))) 467 | 468 | elif _is_allof(child_schema): 469 | ret_schema = _simplify__allof(root_schema, child_schema) 470 | 471 | elif is_anyof(child_schema): 472 | ret_schema = _simplify__anyof(root_schema, child_schema) 473 | 474 | else: 475 | ret_schema = _simplify__implicit_anyof(root_schema, child_schema) 476 | 477 | if 'default' in child_schema: 478 | ret_schema['default'] = child_schema.get('default') 479 | 480 | return Cachable(ret_schema) 481 | 482 | 483 | def simplify(schema): 484 | """ 485 | Given a JSON Schema compatible dict, returns a simplified JSON Schema dict 486 | 487 | - Expands `$ref` fields to their reference 488 | - Expands `type` fields into array'ed type fields 489 | - Strips out all fields which are not `type`/`properties` 490 | 491 | :param schema: dict, JSON Schema 492 | :return: dict, JSON Schema 493 | :raises: Exception 494 | """ 495 | if isinstance(schema, Cachable): 496 | return schema.deepcopy() 497 | 498 | return _helper_simplify(schema, schema) 499 | 500 | 501 | def _valid_schema_version(schema): 502 | return '$schema' not in schema \ 503 | or schema['$schema'] == 'http://json-schema.org/draft-04/schema#' 504 | 505 | 506 | def _unexpected_validation_error(errors, exception): 507 | """ 508 | 509 | :param errors: [String, ...] 510 | :param exception: Exception 511 | :return: [String, ...] 512 | """ 513 | 514 | if not errors: 515 | return ['Unexpected exception encountered: {}'.format(str(exception))] 516 | 517 | return errors 518 | 519 | 520 | def validation_errors(schema): 521 | """ 522 | Given a dict, returns any known JSON Schema validation errors. If there are none, 523 | implies that the dict is a valid JSON Schema. 524 | :param schema: dict 525 | :return: [String, ...] 526 | """ 527 | 528 | errors = [] 529 | 530 | if not isinstance(schema, dict): 531 | errors.append('Parameter `schema` is not a dict, instead found: {}'.format(type(schema))) 532 | 533 | try: 534 | if not _valid_schema_version(schema): 535 | errors.append('Schema version must be Draft 4. 
Found: {}'.format('$schema')) 536 | except Exception as ex: 537 | errors = _unexpected_validation_error(errors, ex) 538 | 539 | try: 540 | Draft4Validator.check_schema(schema) 541 | except SchemaError as error: 542 | errors.append(str(error)) 543 | except Exception as ex: 544 | errors = _unexpected_validation_error(errors, ex) 545 | 546 | try: 547 | simplify(schema) 548 | except JSONSchemaError as error: 549 | errors.append(str(error)) 550 | except Exception as ex: 551 | errors = _unexpected_validation_error(errors, ex) 552 | 553 | return errors 554 | 555 | 556 | _shorthand_mapping = { 557 | NULL: '', 558 | 'string': 's', 559 | 'number': 'f', 560 | 'integer': 'i', 561 | 'boolean': 'b', 562 | 'date-time': 't' 563 | } 564 | 565 | 566 | def _type_shorthand(type_s): 567 | if isinstance(type_s, list): 568 | shorthand = '' 569 | for t in sorted(type_s): 570 | shorthand += _type_shorthand(t) 571 | return shorthand 572 | 573 | if not type_s in _shorthand_mapping: 574 | raise JSONSchemaError('Shorthand not available for type {}. Expected one of {}'.format( 575 | type_s, 576 | list(_shorthand_mapping.keys()) 577 | )) 578 | 579 | return _shorthand_mapping[type_s] 580 | 581 | 582 | def shorthand(schema): 583 | t = deepcopy(get_type(schema)) 584 | 585 | if 'format' in schema and 'date-time' == schema['format'] and STRING in t: 586 | t.remove(STRING) 587 | t.append('date-time') 588 | 589 | return _type_shorthand(t) 590 | -------------------------------------------------------------------------------- /target_postgres/singer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Module for Singer literals and helpers. 3 | ''' 4 | _PREFIX = '_sdc_' 5 | RECEIVED_AT = _PREFIX + 'received_at' 6 | BATCHED_AT = _PREFIX + 'batched_at' 7 | SEQUENCE = _PREFIX + 'sequence' 8 | TABLE_VERSION = _PREFIX + 'table_version' 9 | PK = _PREFIX + 'primary_key' 10 | SOURCE_PK_PREFIX = _PREFIX + 'source_key_' 11 | LEVEL_FMT = _PREFIX + 'level_{}_id' 12 | VALUE = _PREFIX + 'value' 13 | -------------------------------------------------------------------------------- /target_postgres/singer_stream.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import json 3 | import uuid 4 | 5 | import arrow 6 | from jsonschema import Draft4Validator, FormatChecker 7 | from jsonschema.exceptions import ValidationError 8 | 9 | from target_postgres import json_schema, singer 10 | from target_postgres.exceptions import SingerStreamError 11 | 12 | 13 | SINGER_RECEIVED_AT = '_sdc_received_at' 14 | SINGER_BATCHED_AT = '_sdc_batched_at' 15 | SINGER_SEQUENCE = '_sdc_sequence' 16 | SINGER_TABLE_VERSION = '_sdc_table_version' 17 | SINGER_PK = '_sdc_primary_key' 18 | SINGER_SOURCE_PK_PREFIX = '_sdc_source_key_' 19 | SINGER_LEVEL = '_sdc_level_{}_id' 20 | SINGER_VALUE = '_sdc_value' 21 | 22 | RAW_LINE_SIZE = '__raw_line_size' 23 | 24 | 25 | def get_line_size(line_data): 26 | return line_data.get(RAW_LINE_SIZE) or len(json.dumps(line_data)) 27 | 28 | 29 | class BufferedSingerStream(): 30 | def __init__(self, 31 | stream, 32 | schema, 33 | key_properties, 34 | *args, 35 | invalid_records_detect=None, 36 | invalid_records_threshold=None, 37 | max_rows=200000, 38 | max_buffer_size=104857600, # 100MB 39 | **kwargs): 40 | """ 41 | :param invalid_records_detect: Defaults to True when value is None 42 | :param invalid_records_threshold: Defaults to 0 when value is None 43 | """ 44 | self.schema = None 45 | self.key_properties = None 46 | self.validator 
= None 47 | self.update_schema(schema, key_properties) 48 | 49 | self.stream = stream 50 | self.invalid_records = [] 51 | self.max_rows = max_rows 52 | self.max_buffer_size = max_buffer_size 53 | 54 | self.invalid_records_detect = invalid_records_detect 55 | self.invalid_records_threshold = invalid_records_threshold 56 | 57 | if self.invalid_records_detect is None: 58 | self.invalid_records_detect = True 59 | if self.invalid_records_threshold is None: 60 | self.invalid_records_threshold = 0 61 | 62 | self.__buffer = [] 63 | self.__count = 0 64 | self.__size = 0 65 | self.__lifetime_max_version = None 66 | 67 | def update_schema(self, schema, key_properties): 68 | # In order to determine whether a value _is in_ properties _or not_ we need to flatten `$ref`s etc. 69 | self.schema = json_schema.simplify(schema) 70 | self.key_properties = deepcopy(key_properties) 71 | 72 | # The validator can handle _many_ more things than our simplified schema, and is, in general handled by third party code 73 | self.validator = Draft4Validator(schema, format_checker=FormatChecker()) 74 | 75 | properties = self.schema['properties'] 76 | 77 | if singer.RECEIVED_AT not in properties: 78 | properties[singer.RECEIVED_AT] = { 79 | 'type': ['null', 'string'], 80 | 'format': 'date-time' 81 | } 82 | 83 | if singer.SEQUENCE not in properties: 84 | properties[singer.SEQUENCE] = { 85 | 'type': ['null', 'integer'] 86 | } 87 | 88 | if singer.TABLE_VERSION not in properties: 89 | properties[singer.TABLE_VERSION] = { 90 | 'type': ['null', 'integer'] 91 | } 92 | 93 | if singer.BATCHED_AT not in properties: 94 | properties[singer.BATCHED_AT] = { 95 | 'type': ['null', 'string'], 96 | 'format': 'date-time' 97 | } 98 | 99 | if len(self.key_properties) == 0: 100 | self.use_uuid_pk = True 101 | self.key_properties = [singer.PK] 102 | properties[singer.PK] = { 103 | 'type': ['string'] 104 | } 105 | else: 106 | self.use_uuid_pk = False 107 | 108 | @property 109 | def count(self): 110 | return self.__count 111 | 112 | @property 113 | def buffer_full(self): 114 | if self.__count >= self.max_rows: 115 | return True 116 | 117 | if self.__count > 0: 118 | if self.__size >= self.max_buffer_size: 119 | return True 120 | 121 | return False 122 | 123 | @property 124 | def max_version(self): 125 | return self.__lifetime_max_version 126 | 127 | def __update_version(self, version): 128 | if version is None or (self.__lifetime_max_version is not None and self.__lifetime_max_version >= version): 129 | return None 130 | 131 | ## TODO: log warning about earlier records detected 132 | 133 | self.flush_buffer() 134 | self.__lifetime_max_version = version 135 | 136 | def add_record_message(self, record_message): 137 | add_record = True 138 | 139 | self.__update_version(record_message.get('version')) 140 | 141 | if self.__lifetime_max_version != record_message.get('version'): 142 | return None 143 | 144 | try: 145 | self.validator.validate(record_message['record']) 146 | except ValidationError as error: 147 | add_record = False 148 | self.invalid_records.append((error, record_message)) 149 | 150 | if add_record: 151 | self.__buffer.append(record_message) 152 | self.__size += get_line_size(record_message) 153 | self.__count += 1 154 | elif self.invalid_records_detect \ 155 | and len(self.invalid_records) >= self.invalid_records_threshold: 156 | raise SingerStreamError( 157 | 'Invalid records detected above threshold: {}. 
See `.args` for details.'.format( 158 | self.invalid_records_threshold), 159 | self.invalid_records) 160 | 161 | def peek_buffer(self): 162 | return self.__buffer 163 | 164 | def get_batch(self): 165 | current_time = arrow.get().format('YYYY-MM-DD HH:mm:ss.SSSSZZ') 166 | 167 | records = [] 168 | for record_message in self.peek_buffer(): 169 | record = record_message['record'] 170 | 171 | if 'version' in record_message: 172 | record[singer.TABLE_VERSION] = record_message['version'] 173 | 174 | if 'time_extracted' in record_message and record.get(singer.RECEIVED_AT) is None: 175 | record[singer.RECEIVED_AT] = record_message['time_extracted'] 176 | 177 | if self.use_uuid_pk and record.get(singer.PK) is None: 178 | record[singer.PK] = str(uuid.uuid4()) 179 | 180 | record[singer.BATCHED_AT] = current_time 181 | 182 | if 'sequence' in record_message: 183 | record[singer.SEQUENCE] = record_message['sequence'] 184 | else: 185 | record[singer.SEQUENCE] = arrow.get().int_timestamp 186 | 187 | records.append(record) 188 | 189 | return records 190 | 191 | def flush_buffer(self): 192 | _buffer = self.__buffer 193 | self.__buffer = [] 194 | self.__size = 0 195 | self.__count = 0 196 | return _buffer 197 | 198 | def peek_invalid_records(self): 199 | return self.invalid_records 200 | -------------------------------------------------------------------------------- /target_postgres/stream_tracker.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import json 3 | import singer.statediff as statediff 4 | import sys 5 | 6 | from target_postgres.exceptions import TargetError 7 | 8 | 9 | class StreamTracker: 10 | """ 11 | Object to track the BufferedSingerStream objects for each incoming stream to the target, and the STATE messages coming in. This object understands which streams need to be flushed before STATE messages can be safely emitted and does so. 12 | 13 | Because Singer taps don't have a standard way of expressing which streams correspond to which STATEs, the target can only safely 14 | emit a STATE message once all the records that came in prior to that STATE in the stream have been persisted. Because target-postgres buffers 15 | the records in BufferedSingerStreams, the STATE messages need to be delayed until all the records that came before them have been 16 | saved to the database from their buffers. 17 | """ 18 | 19 | def __init__(self, target, emit_states): 20 | self.target = target 21 | self.emit_states = emit_states 22 | 23 | self.streams = {} 24 | 25 | # dict of {'<stream_name>': number}, where the number is the message counter of the most recently received record for that stream. Will contain a value for all registered streams. 26 | self.stream_add_watermarks = {} 27 | 28 | # dict of {'<stream_name>': number}, where the number is the message counter of the most recently flushed record for that stream. Will contain a value for all registered streams.
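        # Illustrative example (added comment): after 100 messages, a stream's add watermark may be 100
        # while its flush watermark lags behind (e.g. 60) until its buffer is next written to Postgres.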
29 | self.stream_flush_watermarks = {} 30 | 31 | self.streams_added_to = set() # set of stream names which have seen records 32 | self.state_queue = deque() # contains dicts of {'state': <raw STATE message string>, 'watermark': number} 33 | self.message_counter = 0 34 | self.last_emitted_state = None 35 | 36 | def register_stream(self, stream, buffered_stream): 37 | self.streams[stream] = buffered_stream 38 | self.stream_flush_watermarks[stream] = 0 39 | 40 | def flush_stream(self, stream): 41 | self._write_batch_and_update_watermarks(stream) 42 | self._emit_safe_queued_states() 43 | 44 | def flush_streams(self, force=False): 45 | for (stream, stream_buffer) in self.streams.items(): 46 | if force or stream_buffer.buffer_full: 47 | self._write_batch_and_update_watermarks(stream) 48 | 49 | self._emit_safe_queued_states(force=force) 50 | 51 | def handle_state_message(self, line): 52 | if self.emit_states: 53 | self.state_queue.append({'state': line, 'watermark': self.message_counter}) 54 | self._emit_safe_queued_states() 55 | 56 | def handle_record_message(self, stream, line_data): 57 | if stream not in self.streams: 58 | raise TargetError('A record for stream {} was encountered before a corresponding schema'.format(stream)) 59 | 60 | self.message_counter += 1 61 | self.streams_added_to.add(stream) 62 | self.stream_add_watermarks[stream] = self.message_counter 63 | self.streams[stream].add_record_message(line_data) 64 | 65 | def _write_batch_and_update_watermarks(self, stream): 66 | stream_buffer = self.streams[stream] 67 | self.target.write_batch(stream_buffer) 68 | stream_buffer.flush_buffer() 69 | self.stream_flush_watermarks[stream] = self.stream_add_watermarks.get(stream, 0) 70 | 71 | def _emit_safe_queued_states(self, force=False): 72 | # State messages that occurred before the least recently flushed record are safe to emit. 73 | # If they occurred after some records that haven't yet been flushed, they aren't safe to emit. 74 | # Because records arrive at different rates from different streams, we take the earliest unflushed record 75 | # as the threshold for what STATE messages are safe to emit. We ignore the threshold of 0 for streams that 76 | # have been registered (via a SCHEMA message) but where no records have arrived yet.
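        # Illustrative example (added comment): if stream A has flushed through message 12 but stream B
        # only through message 7, the safe threshold is 7, so a STATE queued at watermark 9 is held back
        # until B's buffer is flushed past it.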
77 | valid_flush_watermarks = [] 78 | for stream, watermark in self.stream_flush_watermarks.items(): 79 | if stream in self.streams_added_to: 80 | valid_flush_watermarks.append(watermark) 81 | safe_flush_threshold = min(valid_flush_watermarks, default=0) 82 | 83 | # the STATE message that the target forwards 84 | emittable_state = None 85 | emittable_state_str = None 86 | while len(self.state_queue) > 0 and (force or self.state_queue[0]['watermark'] <= safe_flush_threshold): 87 | emittable_state_str = self.state_queue.popleft()['state'] 88 | 89 | if emittable_state_str is not None: 90 | emittable_state = json.loads(emittable_state_str)['value'] 91 | 92 | if emittable_state: 93 | if len(statediff.diff(emittable_state, self.last_emitted_state or {})) > 0: 94 | line = json.dumps(emittable_state) 95 | sys.stdout.write("{}\n".format(line)) 96 | sys.stdout.flush() 97 | 98 | self.last_emitted_state = emittable_state 99 | -------------------------------------------------------------------------------- /target_postgres/target_tools.py: -------------------------------------------------------------------------------- 1 | import http.client 2 | import io 3 | import json 4 | import pkg_resources 5 | import sys 6 | import threading 7 | import decimal 8 | import urllib.parse 9 | import singer 10 | from singer import utils, metadata, metrics 11 | 12 | from target_postgres import json_schema 13 | from target_postgres.exceptions import TargetError 14 | from target_postgres.singer_stream import BufferedSingerStream, RAW_LINE_SIZE 15 | from target_postgres.stream_tracker import StreamTracker 16 | 17 | LOGGER = singer.get_logger() 18 | 19 | 20 | def main(target): 21 | """ 22 | Given a target, stream stdin input as a text stream. 23 | :param target: object which implements `write_batch` and `activate_version` 24 | :return: None 25 | """ 26 | config = utils.parse_args([]).config 27 | input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 28 | stream_to_target(input_stream, target, config=config) 29 | 30 | return None 31 | 32 | 33 | def stream_to_target(stream, target, config={}): 34 | """ 35 | Persist `stream` to `target` with optional `config`. 36 | :param stream: iterator which represents a Singer data stream 37 | :param target: object which implements `write_batch` and `activate_version` 38 | :param config: [optional] configuration for buffers etc.
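        Illustrative keys read below include `max_batch_rows`, `max_batch_size`, `batch_detection_threshold`,
        `invalid_records_detect`, `invalid_records_threshold`, `state_support` and `disable_collection`.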
39 | :return: None 40 | """ 41 | 42 | state_support = config.get('state_support', True) 43 | state_tracker = StreamTracker(target, state_support) 44 | _run_sql_hook('before_run_sql', config, target) 45 | 46 | try: 47 | if not config.get('disable_collection', False): 48 | _async_send_usage_stats() 49 | 50 | invalid_records_detect = config.get('invalid_records_detect') 51 | invalid_records_threshold = config.get('invalid_records_threshold') 52 | max_batch_rows = config.get('max_batch_rows', 200000) 53 | max_batch_size = config.get('max_batch_size', 104857600) # 100MB 54 | batch_detection_threshold = config.get('batch_detection_threshold', max(max_batch_rows / 40, 50)) 55 | 56 | line_count = 0 57 | for line in stream: 58 | _line_handler(state_tracker, 59 | target, 60 | invalid_records_detect, 61 | invalid_records_threshold, 62 | max_batch_rows, 63 | max_batch_size, 64 | line 65 | ) 66 | if line_count > 0 and line_count % batch_detection_threshold == 0: 67 | state_tracker.flush_streams() 68 | line_count += 1 69 | 70 | state_tracker.flush_streams(force=True) 71 | _run_sql_hook('after_run_sql', config, target) 72 | 73 | return None 74 | 75 | except Exception as e: 76 | LOGGER.critical(e) 77 | raise e 78 | finally: 79 | _report_invalid_records(state_tracker.streams) 80 | 81 | 82 | def _report_invalid_records(streams): 83 | for stream_buffer in streams.values(): 84 | if stream_buffer.peek_invalid_records(): 85 | LOGGER.warning("Invalid records detected for stream {}: {}".format( 86 | stream_buffer.stream, 87 | stream_buffer.peek_invalid_records() 88 | )) 89 | 90 | 91 | def _line_handler(state_tracker, target, invalid_records_detect, invalid_records_threshold, max_batch_rows, 92 | max_batch_size, line): 93 | try: 94 | line_data = json.loads(line, parse_float=decimal.Decimal) 95 | except json.decoder.JSONDecodeError: 96 | LOGGER.error("Unable to parse JSON: {}".format(line)) 97 | raise 98 | 99 | if 'type' not in line_data: 100 | raise TargetError('`type` is a required key: {}'.format(line)) 101 | 102 | if line_data['type'] == 'SCHEMA': 103 | if 'stream' not in line_data: 104 | raise TargetError('`stream` is a required key: {}'.format(line)) 105 | 106 | stream = line_data['stream'] 107 | 108 | if 'schema' not in line_data: 109 | raise TargetError('`schema` is a required key: {}'.format(line)) 110 | 111 | schema = line_data['schema'] 112 | 113 | schema_validation_errors = json_schema.validation_errors(schema) 114 | if schema_validation_errors: 115 | raise TargetError('`schema` is an invalid JSON Schema instance: {}'.format(line), *schema_validation_errors) 116 | 117 | if 'key_properties' in line_data: 118 | key_properties = line_data['key_properties'] 119 | else: 120 | key_properties = None 121 | 122 | if stream not in state_tracker.streams: 123 | buffered_stream = BufferedSingerStream(stream, 124 | schema, 125 | key_properties, 126 | invalid_records_detect=invalid_records_detect, 127 | invalid_records_threshold=invalid_records_threshold) 128 | if max_batch_rows: 129 | buffered_stream.max_rows = max_batch_rows 130 | if max_batch_size: 131 | buffered_stream.max_buffer_size = max_batch_size 132 | 133 | state_tracker.register_stream(stream, buffered_stream) 134 | else: 135 | state_tracker.streams[stream].update_schema(schema, key_properties) 136 | elif line_data['type'] == 'RECORD': 137 | if 'stream' not in line_data: 138 | raise TargetError('`stream` is a required key: {}'.format(line)) 139 | 140 | line_data[RAW_LINE_SIZE] = len(line) 141 | state_tracker.handle_record_message(line_data['stream'], 
line_data) 142 | elif line_data['type'] == 'ACTIVATE_VERSION': 143 | if 'stream' not in line_data: 144 | raise TargetError('`stream` is a required key: {}'.format(line)) 145 | if 'version' not in line_data: 146 | raise TargetError('`version` is a required key: {}'.format(line)) 147 | if line_data['stream'] not in state_tracker.streams: 148 | raise TargetError('An ACTIVATE_VERSION for stream {} was encountered before a corresponding schema' 149 | .format(line_data['stream'])) 150 | 151 | stream_buffer = state_tracker.streams[line_data['stream']] 152 | state_tracker.flush_stream(line_data['stream']) 153 | target.activate_version(stream_buffer, line_data['version']) 154 | elif line_data['type'] == 'STATE': 155 | # pass the string instead of the deserialized object to save memory in the deque 156 | state_tracker.handle_state_message(line) 157 | else: 158 | raise TargetError('Unknown message type {} in message {}'.format( 159 | line_data['type'], 160 | line)) 161 | 162 | 163 | def _send_usage_stats(): 164 | try: 165 | version = pkg_resources.get_distribution('target-postgres').version 166 | # HTTPConnection is not a context manager; connect lazily via request() and close explicitly 167 | conn = http.client.HTTPConnection('collector.singer.io', timeout=10) 168 | params = {'e': 'se', 169 | 'aid': 'singer', 170 | 'se_ca': 'target-postgres', 171 | 'se_ac': 'open', 172 | 'se_la': version} 173 | conn.request('GET', '/i?' + urllib.parse.urlencode(params)) 174 | conn.getresponse() 175 | conn.close() 176 | except: 177 | LOGGER.debug('Collection request failed') 178 | 179 | 180 | def _async_send_usage_stats(): 181 | LOGGER.info('Sending version information to singer.io. ' + 182 | 'To disable sending anonymous usage data, set ' + 183 | 'the config parameter "disable_collection" to true') 184 | threading.Thread(target=_send_usage_stats).start() 185 | 186 | 187 | def _run_sql_hook(hook_name, config, target): 188 | if hook_name in config: 189 | with target.conn.cursor() as cur: 190 | cur.execute(config[hook_name]) 191 | LOGGER.debug('{} SQL executed'.format(hook_name)) 192 | 193 | hook_file = hook_name + '_file' 194 | if hook_file in config: 195 | with open(config[hook_file]) as f: 196 | with target.conn.cursor() as cur: 197 | cur.execute(f.read()) 198 | LOGGER.debug('{} SQL file executed'.format(hook_file)) 199 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append(os.path.join(os.path.dirname(__file__), 'utils')) 5 | -------------------------------------------------------------------------------- /tests/migrations/scripts/install_schema_versions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -x 3 | 4 | python -m venv /code/venv/target-postgres--schema0 5 | source /code/venv/target-postgres--schema0/bin/activate 6 | pip install "singer-target-postgres==0.1.2" 7 | deactivate 8 | 9 | python -m venv /code/venv/target-postgres--schema1 10 | source /code/venv/target-postgres--schema1/bin/activate 11 | pip install "singer-target-postgres==0.1.9" 12 | deactivate 13 | -------------------------------------------------------------------------------- /tests/migrations/scripts/to_latest.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -x 3 | 4 | cd /code 5 | 6 | source venv/target-postgres/bin/activate 7 | pip install -U pip 8 | /opt/poetry/bin/poetry install 9 | 10 | cat
tests/migrations/data/tap | target-postgres --config ${1} 11 | X="$?" 12 | 13 | deactivate 14 | 15 | exit ${X} 16 | -------------------------------------------------------------------------------- /tests/migrations/scripts/to_target.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -x 3 | 4 | cat /code/tests/migrations/data/tap | /code/venv/target-postgres--${1}/bin/target-postgres --config ${2} 5 | -------------------------------------------------------------------------------- /tests/migrations/test_migrations.py: -------------------------------------------------------------------------------- 1 | ''' 2 | If we assert that any upgrade to _latest_ from some older version should 3 | be met with chaining versions together... 4 | 5 | versions = [v0 v1 v2] 6 | 7 | v0 8 | v0 -> v1 9 | v1 10 | v1 -> v2 11 | v2 12 | ... 13 | vn-1 -> vn 14 | vn 15 | ''' 16 | 17 | from copy import deepcopy 18 | import json 19 | import os 20 | import pytest 21 | import subprocess 22 | 23 | import psycopg2 24 | from psycopg2 import sql 25 | 26 | from utils.fixtures import CONFIG, TEST_DB 27 | 28 | SCHEMA_PREFIX = "migration_testing__" 29 | FILE_PATH = "/code/tests/migrations/" 30 | 31 | 32 | def abs_path(relative_path): 33 | return FILE_PATH + relative_path 34 | 35 | 36 | def _cursor_list(cursor, idx=0): 37 | return [x[idx] for x in cursor.fetchall()] 38 | 39 | 40 | def list_schemas(): 41 | with psycopg2.connect(**TEST_DB) as conn: 42 | with conn.cursor() as cur: 43 | cur.execute( 44 | "SELECT schema_name FROM information_schema.schemata WHERE schema_name LIKE '{}%'".format( 45 | SCHEMA_PREFIX)) 46 | return _cursor_list(cur) 47 | 48 | 49 | def clear_schema(schema): 50 | with psycopg2.connect(**TEST_DB) as conn: 51 | with conn.cursor() as cur: 52 | cur.execute(sql.SQL( 53 | 'DROP SCHEMA IF EXISTS {} CASCADE;').format( 54 | sql.Identifier(schema))) 55 | 56 | 57 | def clear_db(): 58 | for schema in list_schemas(): 59 | clear_schema(schema) 60 | 61 | 62 | @pytest.fixture 63 | def db_cleanup(): 64 | clear_db() 65 | 66 | yield 67 | 68 | 69 | def create_schema(schema): 70 | name = SCHEMA_PREFIX + schema 71 | with psycopg2.connect(**TEST_DB) as conn: 72 | with conn.cursor() as cur: 73 | cur.execute(sql.SQL( 74 | 'CREATE SCHEMA IF NOT EXISTS {};').format( 75 | sql.Identifier(name))) 76 | 77 | return name 78 | 79 | 80 | def setup_config(version, psql_schema): 81 | os.makedirs(abs_path("artifacts"), exist_ok=True) 82 | 83 | config_path = abs_path("artifacts/config--{}.json".format(psql_schema)) 84 | 85 | if not os.path.exists(config_path): 86 | target_config = deepcopy(CONFIG) 87 | target_config['postgres_schema'] = psql_schema 88 | 89 | with open(config_path, 'w') as outfile: 90 | json.dump(target_config, outfile) 91 | 92 | return config_path 93 | 94 | 95 | def script_cmd(script, *args): 96 | cmd = [abs_path("scripts/{}.sh".format(script))] + list(args) 97 | 98 | p = subprocess.Popen(cmd) 99 | communication = p.communicate() 100 | if p.returncode: 101 | raise Exception(communication) 102 | 103 | return communication 104 | 105 | 106 | def tap_to_target(version, psql_schema): 107 | config_path = setup_config(version, psql_schema) 108 | 109 | if version == 'LATEST': 110 | return script_cmd("to_latest", config_path) 111 | 112 | return script_cmd("to_target", version, config_path) 113 | 114 | 115 | def _test_versions(versions): 116 | length = len(versions) 117 | for idx in range(length): 118 | version = versions[idx] 119 | if idx: 120 | prev_version = 
versions[idx - 1] 121 | schema = create_schema('{}_{}'.format(prev_version, version)) 122 | tap_to_target(prev_version, schema) 123 | tap_to_target(version, schema) 124 | 125 | schema = create_schema(version) 126 | tap_to_target(version, schema) 127 | 128 | 129 | def tables_in_schema(schema): 130 | with psycopg2.connect(**TEST_DB) as conn: 131 | with conn.cursor() as cur: 132 | cur.execute(sql.SQL( 133 | "SELECT table_name FROM information_schema.tables WHERE table_schema = {}" 134 | ).format(sql.Literal(schema))) 135 | return set(_cursor_list(cur)) 136 | 137 | 138 | def table_length(schema, table): 139 | with psycopg2.connect(**TEST_DB) as conn: 140 | with conn.cursor() as cur: 141 | cur.execute(sql.SQL( 142 | "SELECT count(*) FROM {}.{}" 143 | ).format( 144 | sql.Identifier(schema), 145 | sql.Identifier(table))) 146 | return cur.fetchone()[0] 147 | 148 | 149 | def assert_table_lengths_equal(schema_a, schema_b, table): 150 | assert table_length(schema_a, table) == table_length(schema_b, table), \ 151 | "Table {} in schemas {}, {} does not match in length".format(table, schema_a, schema_b) 152 | 153 | 154 | def assert_tables_equal(): 155 | schemas = list_schemas() 156 | tables = tables_in_schema(schemas[0]) 157 | 158 | for idx in range(1, len(schemas)): 159 | schema = schemas[idx] 160 | assert tables == tables_in_schema(schema), \ 161 | "Schema: {} differs from the rest. Processed {} of {}".format(schema, idx, len(schemas)) 162 | 163 | for table in tables: 164 | assert_table_lengths_equal(schemas[0], schema, table) 165 | 166 | 167 | def test(db_cleanup): 168 | _test_versions(['schema0', 'schema1', 'LATEST']) 169 | 170 | schemas = list_schemas() 171 | assert list_schemas(), "There should have been at least one generated schema..." 172 | 173 | tables = tables_in_schema(schemas[0]) 174 | assert tables, "There should have been at least one generated table..." 
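    # Illustrative note (added comment): each schema listed above was loaded either by a single
    # target version or by a chained pair of adjacent versions (e.g. schema0 then schema1), so the
    # assertion below checks that every schema ends up with the same tables and row counts.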
175 | 176 | assert_tables_equal() 177 | -------------------------------------------------------------------------------- /tests/unit/test_BufferedSingerStream.py: -------------------------------------------------------------------------------- 1 | from decimal import Decimal 2 | from copy import deepcopy 3 | 4 | import pytest 5 | 6 | from target_postgres import singer 7 | from target_postgres.singer_stream import BufferedSingerStream, SingerStreamError, RAW_LINE_SIZE 8 | 9 | from utils.fixtures import CatStream, InvalidCatStream, CATS_SCHEMA 10 | 11 | 12 | def missing_sdc_properties(stream_buffer): 13 | errors = [] 14 | for p in [singer.BATCHED_AT, singer.RECEIVED_AT, singer.SEQUENCE, singer.TABLE_VERSION]: 15 | if not p in stream_buffer.schema['properties']: 16 | errors.append({'_sdc': p, 17 | 'message': '`_sdc` missing'}) 18 | 19 | return errors 20 | 21 | 22 | def test_init(): 23 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 24 | CATS_SCHEMA['schema'], 25 | CATS_SCHEMA['key_properties']) 26 | 27 | assert singer_stream 28 | assert [] == missing_sdc_properties(singer_stream) 29 | 30 | 31 | def test_init__empty_key_properties(): 32 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 33 | CATS_SCHEMA['schema'], 34 | []) 35 | 36 | stream = CatStream(100) 37 | for _ in range(20): 38 | singer_stream.add_record_message(stream.generate_record_message()) 39 | 40 | assert singer_stream 41 | assert [] == missing_sdc_properties(singer_stream) 42 | assert [singer.PK] == singer_stream.key_properties 43 | 44 | rows_missing_pk = [] 45 | rows_checked = 0 46 | for r in singer_stream.get_batch(): 47 | if not r[singer.PK]: 48 | rows_missing_pk.append(r) 49 | 50 | rows_checked += 1 51 | 52 | assert rows_checked > 1 53 | assert [] == rows_missing_pk 54 | 55 | 56 | def test_add_record_message(): 57 | stream = CatStream(10) 58 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 59 | CATS_SCHEMA['schema'], 60 | CATS_SCHEMA['key_properties']) 61 | assert singer_stream.add_record_message(stream.generate_record_message()) is None 62 | assert not singer_stream.peek_invalid_records() 63 | assert [] == missing_sdc_properties(singer_stream) 64 | 65 | 66 | def test_add_record_message__invalid_record(): 67 | stream = InvalidCatStream(10) 68 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 69 | CATS_SCHEMA['schema'], 70 | CATS_SCHEMA['key_properties']) 71 | with pytest.raises(SingerStreamError): 72 | singer_stream.add_record_message(stream.generate_record_message()) 73 | 74 | assert singer_stream.peek_invalid_records() 75 | assert singer_stream.count == 0 76 | assert [] == missing_sdc_properties(singer_stream) 77 | 78 | 79 | SIMPLE_MULTIPLE_OF_VALID_SCHEMA = { 80 | 'properties': { 81 | 'multipleOfKey': { 82 | 'type': 'number', 83 | 'multipleOf': Decimal('1e-15') 84 | } 85 | } 86 | } 87 | 88 | SIMPLE_MULTIPLE_OF_INVALID_SCHEMA = { 89 | 'properties': { 90 | 'multipleOfKey': { 91 | 'type': 'number', 92 | 'multipleOf': 1e-15 93 | } 94 | } 95 | } 96 | 97 | def test_add_record_message__multipleOf(): 98 | stream_name = 'test' 99 | singer_stream = BufferedSingerStream(stream_name, 100 | deepcopy(SIMPLE_MULTIPLE_OF_VALID_SCHEMA), 101 | []) 102 | 103 | multiple_of_values = ['1', '2', '3', '4', '5', '1.1', '2.3', '1.23456789', '20', '100.1'] 104 | 105 | for value in multiple_of_values: 106 | singer_stream.add_record_message( 107 | { 108 | 'type': 'RECORD', 109 | 'stream': stream_name, 110 | 'record': {'multipleOfKey': Decimal(value)}, 111 | 'sequence': 0, 112 | RAW_LINE_SIZE: 100 113 | 
} 114 | ) 115 | 116 | assert not singer_stream.peek_invalid_records() 117 | assert singer_stream.count == len(multiple_of_values) 118 | 119 | 120 | def test_add_record_message__multipleOf_invalid_record(): 121 | stream_name = 'test' 122 | singer_stream = BufferedSingerStream(stream_name, 123 | deepcopy(SIMPLE_MULTIPLE_OF_INVALID_SCHEMA), 124 | []) 125 | 126 | multiple_of_values = [1, 2] 127 | 128 | for value in multiple_of_values: 129 | with pytest.raises(SingerStreamError): 130 | singer_stream.add_record_message( 131 | { 132 | 'type': 'RECORD', 133 | 'stream': stream_name, 134 | 'record': {'multipleOfKey': value}, 135 | 'sequence': 0, 136 | RAW_LINE_SIZE: 100 137 | } 138 | ) 139 | 140 | assert singer_stream.peek_invalid_records() 141 | assert singer_stream.count == 0 142 | 143 | 144 | SIMPLE_ALLOF_SCHEMA = { 145 | 'type': 'object', 146 | 'properties': { 147 | 'allOfKey': { 148 | 'allOf': [ 149 | { 'type': ['string'] }, 150 | { 'maxLength': 5 } 151 | ]}}} 152 | 153 | 154 | def test_add_record_message__allOf(): 155 | stream_name = 'test' 156 | singer_stream = BufferedSingerStream(stream_name, 157 | deepcopy(SIMPLE_ALLOF_SCHEMA), 158 | []) 159 | 160 | strs_shorter_than_6 = [ 161 | 'hello', 162 | 'I', 163 | 'am', 164 | 'a set', 165 | 'of', 166 | 'short', 167 | 'strs' 168 | ] 169 | 170 | for string in strs_shorter_than_6: 171 | singer_stream.add_record_message( 172 | { 173 | 'type': 'RECORD', 174 | 'stream': stream_name, 175 | 'record': {'allOfKey': string}, 176 | 'sequence': 0 177 | } 178 | ) 179 | 180 | assert not singer_stream.peek_invalid_records() 181 | assert singer_stream.count == len(strs_shorter_than_6) 182 | assert [] == missing_sdc_properties(singer_stream) 183 | 184 | 185 | def test_add_record_message__allOf__invalid_record(): 186 | stream_name = 'test' 187 | singer_stream = BufferedSingerStream(stream_name, 188 | deepcopy(SIMPLE_ALLOF_SCHEMA), 189 | []) 190 | 191 | with pytest.raises(SingerStreamError): 192 | singer_stream.add_record_message( 193 | { 194 | 'type': 'RECORD', 195 | 'stream': stream_name, 196 | 'record': {'allOfKey': 'this is a string which is much too long to be allowed'}, 197 | 'sequence': 0 198 | } 199 | ) 200 | 201 | assert singer_stream.peek_invalid_records() 202 | assert singer_stream.count == 0 203 | assert [] == missing_sdc_properties(singer_stream) 204 | 205 | 206 | def test_add_record_message__allOf__impossible_schema(): 207 | stream_name = 'test' 208 | 209 | schema = deepcopy(SIMPLE_ALLOF_SCHEMA) 210 | schema['properties']['allOfKey']['allOf'].append({'type': ['number']}) 211 | 212 | singer_stream = BufferedSingerStream(stream_name, 213 | schema, 214 | []) 215 | 216 | 217 | with pytest.raises(SingerStreamError): 218 | singer_stream.add_record_message( 219 | { 220 | 'type': 'RECORD', 221 | 'stream': stream_name, 222 | 'record': {'allOfKey': 'short'}, 223 | 'sequence': 0 224 | } 225 | ) 226 | with pytest.raises(SingerStreamError): 227 | singer_stream.add_record_message( 228 | { 229 | 'type': 'RECORD', 230 | 'stream': stream_name, 231 | 'record': {'allOfKey': 314159}, 232 | 'sequence': 0 233 | } 234 | ) 235 | 236 | assert singer_stream.peek_invalid_records() 237 | assert singer_stream.count == 0 238 | assert [] == missing_sdc_properties(singer_stream) 239 | 240 | 241 | def test_add_record_message__invalid_record__detection_off(): 242 | stream = InvalidCatStream(10) 243 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 244 | CATS_SCHEMA['schema'], 245 | CATS_SCHEMA['key_properties'], 246 | invalid_records_detect=False) 247 | 248 | 
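    # With `invalid_records_detect=False`, the invalid record below does not raise
    # SingerStreamError; it is only reported via `peek_invalid_records()`, and the
    # buffered record count stays at zero, as the assertions that follow verify.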
singer_stream.add_record_message(stream.generate_record_message()) 249 | 250 | assert singer_stream.peek_invalid_records() 251 | assert singer_stream.count == 0 252 | assert [] == missing_sdc_properties(singer_stream) 253 | 254 | 255 | def test_add_record_message__invalid_record__cross_threshold(): 256 | stream = InvalidCatStream(10) 257 | 258 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 259 | CATS_SCHEMA['schema'], 260 | CATS_SCHEMA['key_properties'], 261 | invalid_records_threshold=3) 262 | 263 | singer_stream.add_record_message(stream.generate_record_message()) 264 | singer_stream.add_record_message(stream.generate_record_message()) 265 | 266 | with pytest.raises(SingerStreamError): 267 | singer_stream.add_record_message(stream.generate_record_message()) 268 | 269 | assert singer_stream.peek_invalid_records() 270 | assert singer_stream.count == 0 271 | assert [] == missing_sdc_properties(singer_stream) 272 | 273 | 274 | def mocked_mock_write_batch(stream_buffer): 275 | stream_buffer.flush_buffer() 276 | 277 | 278 | def test_multiple_batches__by_rows(): 279 | stream = CatStream(100) 280 | 281 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 282 | CATS_SCHEMA['schema'], 283 | CATS_SCHEMA['key_properties'], 284 | max_rows=20) 285 | 286 | assert len(singer_stream.peek_buffer()) == 0 287 | 288 | while not singer_stream.buffer_full: 289 | singer_stream.add_record_message(stream.generate_record_message()) 290 | 291 | assert len(singer_stream.peek_buffer()) == 20 292 | assert [] == missing_sdc_properties(singer_stream) 293 | 294 | singer_stream.flush_buffer() 295 | 296 | assert len(singer_stream.peek_buffer()) == 0 297 | 298 | 299 | def test_multiple_batches__by_memory(): 300 | stream = CatStream(100) 301 | 302 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 303 | CATS_SCHEMA['schema'], 304 | CATS_SCHEMA['key_properties'], 305 | max_buffer_size=10) 306 | 307 | assert len(singer_stream.peek_buffer()) == 0 308 | 309 | while not singer_stream.buffer_full: 310 | singer_stream.add_record_message(stream.generate_record_message()) 311 | 312 | assert len(singer_stream.peek_buffer()) == 1 313 | assert [] == missing_sdc_properties(singer_stream) 314 | 315 | singer_stream.flush_buffer() 316 | 317 | assert len(singer_stream.peek_buffer()) == 0 318 | 319 | 320 | def test_multiple_batches__old_records__by_rows(): 321 | stream_oldest = CatStream(100, version=0) 322 | stream_middle_aged = CatStream(100, version=5) 323 | stream_latest = CatStream(100, version=10) 324 | 325 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 326 | CATS_SCHEMA['schema'], 327 | CATS_SCHEMA['key_properties'], 328 | max_rows=20) 329 | 330 | assert len(singer_stream.peek_buffer()) == 0 331 | 332 | while not singer_stream.buffer_full: 333 | singer_stream.add_record_message(stream_oldest.generate_record_message()) 334 | 335 | assert len(singer_stream.peek_buffer()) == 20 336 | 337 | singer_stream.flush_buffer() 338 | 339 | assert len(singer_stream.peek_buffer()) == 0 340 | 341 | singer_stream.add_record_message(stream_latest.generate_record_message()) 342 | 343 | assert len(singer_stream.peek_buffer()) == 1 344 | 345 | reasonable_cutoff = 1000 346 | while not singer_stream.buffer_full and reasonable_cutoff != 0: 347 | singer_stream.add_record_message(stream_middle_aged.generate_record_message()) 348 | reasonable_cutoff -= 1 349 | 350 | assert reasonable_cutoff == 0 351 | assert len(singer_stream.peek_buffer()) == 1 352 | assert [] == missing_sdc_properties(singer_stream) 353 | 354 | 
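# The batching tests above and below share one consumption pattern: add record
# messages until `buffer_full`, persist whatever `peek_buffer()` returns, then
# `flush_buffer()`. A minimal sketch of that loop, using only methods exercised in
# these tests; the `persist` callable is hypothetical and not part of this package.
def _drain_buffered_stream(record_messages, singer_stream, persist):
    for message in record_messages:
        singer_stream.add_record_message(message)
        if singer_stream.buffer_full:
            # hand the buffered rows to the (hypothetical) sink, then reset the buffer
            persist(singer_stream.peek_buffer())
            singer_stream.flush_buffer()
    # flush whatever remains once the input is exhausted
    if singer_stream.peek_buffer():
        persist(singer_stream.peek_buffer())
        singer_stream.flush_buffer()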
355 | def test_multiple_batches__old_records__by_memory(): 356 | stream_oldest = CatStream(100, version=0) 357 | stream_middle_aged = CatStream(100, version=5) 358 | stream_latest = CatStream(100, version=10) 359 | 360 | singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'], 361 | CATS_SCHEMA['schema'], 362 | CATS_SCHEMA['key_properties'], 363 | max_buffer_size=32768) 364 | 365 | assert len(singer_stream.peek_buffer()) == 0 366 | 367 | while not singer_stream.buffer_full: 368 | singer_stream.add_record_message(stream_oldest.generate_record_message()) 369 | 370 | assert len(singer_stream.peek_buffer()) > 0 371 | assert [] == missing_sdc_properties(singer_stream) 372 | 373 | singer_stream.flush_buffer() 374 | 375 | assert len(singer_stream.peek_buffer()) == 0 376 | 377 | singer_stream.add_record_message(stream_latest.generate_record_message()) 378 | 379 | assert len(singer_stream.peek_buffer()) == 1 380 | 381 | reasonable_cutoff = 1000 382 | while not singer_stream.buffer_full and reasonable_cutoff != 0: 383 | singer_stream.add_record_message(stream_middle_aged.generate_record_message()) 384 | reasonable_cutoff -= 1 385 | 386 | assert reasonable_cutoff == 0 387 | assert len(singer_stream.peek_buffer()) == 1 388 | assert [] == missing_sdc_properties(singer_stream) 389 | -------------------------------------------------------------------------------- /tests/unit/test_denest.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | from chance import chance 5 | 6 | from target_postgres import denest, json_schema, singer 7 | 8 | 9 | def non_path_properties(table_batch): 10 | errors = [] 11 | for p in table_batch['streamed_schema']['schema']['properties']: 12 | if not isinstance(p, tuple): 13 | errors.append({'x': p, 14 | 'message': '`x` is not a `tuple`'}) 15 | 16 | return errors 17 | 18 | 19 | def missing_key_properties(table_batch): 20 | errors = [] 21 | for p in table_batch['streamed_schema']['key_properties']: 22 | if not (p,) in table_batch['streamed_schema']['schema']['properties']: 23 | errors.append({'path': tuple(p), 24 | 'message': 'key_property missing'}) 25 | 26 | for p, s in table_batch['streamed_schema']['schema']['properties'].items(): 27 | if not json_schema.is_anyof(s): 28 | errors.append({ 29 | 'path': tuple(p), 30 | 'message': 'Expected anyOf json schema for propery schema, got: {}'.format(s) 31 | }) 32 | 33 | return errors 34 | 35 | 36 | def errors(table_batch): 37 | return non_path_properties(table_batch) + missing_key_properties(table_batch) 38 | 39 | 40 | def error_check_denest(schema, key_properties, records): 41 | denested = denest.to_table_batches(schema, key_properties, records) 42 | 43 | for table_batch in denested: 44 | assert [] == errors(table_batch) 45 | 46 | return denested 47 | 48 | 49 | def test_empty(): 50 | denested = error_check_denest({}, [], []) 51 | assert 1 == len(denested) 52 | assert [] == denested[0]['records'] 53 | assert [] == denested[0]['streamed_schema']['key_properties'] 54 | 55 | 56 | def test__schema__objects_add_fields(): 57 | denested = error_check_denest({'properties': 58 | {'a': {'type': 'integer'}, 59 | 'b': {'type': 'object', 60 | 'properties': { 61 | 'c': {'type': 'string'}, 62 | 'd': {'type': 'boolean'}}}}}, 63 | ['a'], 64 | []) 65 | 66 | assert 1 == len(denested) 67 | assert ('b', 'c') in denested[0]['streamed_schema']['schema']['properties'] 68 | assert ('b', 'd') in denested[0]['streamed_schema']['schema']['properties'] 69 | 70 | 71 | def 
random_object_schema(): 72 | length_of_path = random.randint(1, 50) 73 | path = [] 74 | schema = {'type': chance.pickone([json_schema.BOOLEAN, 75 | json_schema.INTEGER, 76 | json_schema.NUMBER, 77 | json_schema.STRING])} 78 | for _ in range(0, length_of_path): 79 | field = chance.string(pool='', length=0) 80 | schema = {'type': json_schema.OBJECT, 81 | 'properties': {field: schema}} 82 | path.append(field) 83 | 84 | return {'schema': schema, 85 | 'path': path[::-1]} 86 | 87 | 88 | def test__schema__nested_objects_add_fields(): 89 | for _ in range(0, 100): 90 | r = random_object_schema() 91 | denested = error_check_denest(r['schema'], 92 | [], 93 | []) 94 | 95 | print('r:', r) 96 | print() 97 | print('denested:', denested) 98 | 99 | assert 1 == len(denested) 100 | assert tuple(r['path']) in denested[0]['streamed_schema']['schema']['properties'] 101 | 102 | 103 | def test__schema__arrays_add_tables(): 104 | denested = error_check_denest({'properties': 105 | {'a': {'type': 'integer'}, 106 | 'b': {'type': 'array', 107 | 'items': {'properties': { 108 | 'c': {'type': 'string'}, 109 | 'd': {'type': 'boolean'}}}}}}, 110 | ['a'], 111 | []) 112 | assert 2 == len(denested) 113 | 114 | 115 | def random_array_schema(): 116 | length_of_path = random.randint(1, 50) 117 | path = [] 118 | schema = {'type': json_schema.ARRAY, 119 | 'items': {'type': chance.pickone([json_schema.BOOLEAN, 120 | json_schema.INTEGER, 121 | json_schema.NUMBER, 122 | json_schema.STRING])}} 123 | for _ in range(0, length_of_path): 124 | field = chance.string(pool='', length=0) 125 | schema = {'type': json_schema.ARRAY, 126 | 'items': {'type': json_schema.OBJECT, 127 | 'properties': {field: schema}}} 128 | path.append(field) 129 | 130 | schema = {'type': json_schema.OBJECT, 131 | 'properties': { 132 | 'root': schema}} 133 | path.append('root') 134 | 135 | return {'schema': schema, 136 | 'path': path[::-1]} 137 | 138 | 139 | def test__schema__nested_arrays_add_tables(): 140 | for _ in range(0, 100): 141 | r = random_array_schema() 142 | denested = error_check_denest(r['schema'], 143 | [], 144 | []) 145 | 146 | print('r:', r) 147 | print() 148 | print('denested:', denested) 149 | 150 | assert len(r['path']) + 1 == len(denested) 151 | 152 | table_path_accum = [] 153 | tables_checked = 0 154 | while True: 155 | found_table = False 156 | 157 | print('looking for a table with path:', table_path_accum) 158 | 159 | for table_batch in denested: 160 | if tuple(table_path_accum) == table_batch['streamed_schema']['path']: 161 | found_table = True 162 | break 163 | 164 | assert found_table 165 | print('...table found') 166 | 167 | tables_checked += 1 168 | 169 | if len(table_path_accum) == len(r['path']): 170 | break 171 | 172 | table_path_accum.append(r['path'][len(table_path_accum)]) 173 | 174 | ## Assert that we looked for every table path 175 | assert tables_checked == len(denested) 176 | 177 | 178 | NESTED_SCHEMA = { 179 | "properties": { 180 | "a": {"type": "object", 181 | "properties": { 182 | "b": { 183 | "type": "array", 184 | "items": { 185 | "type": "object", 186 | "properties": { 187 | "c": { 188 | "type": "object", 189 | "properties": { 190 | "d": {"type": "integer"}, 191 | "e": {"type": "array", 192 | "items": {"type": "object", 193 | "properties": { 194 | "f": {"type": "string"}, 195 | "g": {"type": "boolean"}}}}}}}}}}}}} 196 | 197 | NESTED_RECORDS = [{"a": {"b": []}}, 198 | {"a": {"b": [{"c": {"d": 1}}]}}, 199 | {"a": {"b": [{"c": {"d": 12}}, 200 | {"c": {"d": 123}}]}}, 201 | {"a": {"b": [{"c": {"d": 1234}}, 202 | {"c": {"d": 
12345}}, 203 | {"c": {"d": 123456}}]}}, 204 | {"a": {"b": [{"c": {"e": [{"f": "hello", 205 | "g": True}, 206 | {"f": "goodbye", 207 | "g": True}]}}]}}] 208 | 209 | 210 | def test__records__nested__tables(): 211 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 212 | 213 | print('denested:', denested) 214 | 215 | assert 3 == len(denested) 216 | 217 | for table_batch in denested: 218 | assert table_batch['streamed_schema']['path'] in \ 219 | {tuple(), 220 | ('a', 'b'), 221 | ('a', 'b', 'c', 'e')} 222 | 223 | 224 | def _get_table_batch_with_path(table_batches, path): 225 | for table_batch in table_batches: 226 | if path == table_batch['streamed_schema']['path']: 227 | return table_batch 228 | raise Exception('Could not find table_batch with path: {}'.format(path)) 229 | 230 | 231 | def test__records__nested__root_empty(): 232 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 233 | table_batch = _get_table_batch_with_path(denested, 234 | tuple()) 235 | 236 | assert {} == table_batch['streamed_schema']['schema']['properties'] 237 | 238 | assert 5 == len(table_batch['records']) 239 | 240 | for record in table_batch['records']: 241 | assert {} == record 242 | 243 | 244 | def test__records__nested__child_table__a_b(): 245 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 246 | table_batch = _get_table_batch_with_path(denested, 247 | ('a', 'b')) 248 | 249 | assert 1 == len(table_batch['streamed_schema']['schema']['properties'][('c', 'd')]['anyOf']) 250 | assert {'type': ['integer']} == table_batch['streamed_schema']['schema']['properties'][('c', 'd')]['anyOf'][0] 251 | 252 | assert 7 == len(table_batch['records']) 253 | 254 | for record in table_batch['records']: 255 | # Don't try to access key "('c', 'd')" if record is empty 256 | if record == {}: 257 | continue 258 | assert 'integer' == record[('c', 'd')][0] 259 | assert int == type(record[('c', 'd')][1]) 260 | 261 | 262 | def test__records__nested__child_table__a_b_c_e(): 263 | denested = error_check_denest(NESTED_SCHEMA, [], NESTED_RECORDS) 264 | table_batch = _get_table_batch_with_path(denested, 265 | ('a', 'b', 'c', 'e')) 266 | 267 | assert 1 == len(table_batch['streamed_schema']['schema']['properties'][('f',)]['anyOf']) 268 | assert {'type': ['string']} == table_batch['streamed_schema']['schema']['properties'][('f',)]['anyOf'][0] 269 | assert 1 == len(table_batch['streamed_schema']['schema']['properties'][('g',)]['anyOf']) 270 | assert {'type': ['boolean']} == table_batch['streamed_schema']['schema']['properties'][('g',)]['anyOf'][0] 271 | 272 | assert 2 == len(table_batch['records']) 273 | 274 | for record in table_batch['records']: 275 | assert 'string' == record[('f',)][0] 276 | assert str == type(record[('f',)][1]) 277 | 278 | assert 'boolean' == record[('g',)][0] 279 | assert bool == type(record[('g',)][1]) 280 | 281 | 282 | def test__anyOf__schema__stitch_date_times(): 283 | denested = error_check_denest( 284 | {'properties': { 285 | 'a': { 286 | "anyOf": [ 287 | { 288 | "type": "string", 289 | "format": "date-time" 290 | }, 291 | {"type": ["string", "null"]}]}}}, 292 | [], 293 | []) 294 | table_batch = _get_table_batch_with_path(denested, tuple()) 295 | 296 | anyof_schemas = table_batch['streamed_schema']['schema']['properties'][('a',)]['anyOf'] 297 | 298 | assert 2 == len(anyof_schemas) 299 | assert 2 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 300 | assert 2 == len([x for x in anyof_schemas if json_schema.is_nullable(x)]) 301 | assert 1 == len([x for x in 
anyof_schemas if json_schema.is_datetime(x)]) 302 | 303 | def test__anyOf__schema__implicit_any_of(): 304 | denested = error_check_denest( 305 | { 306 | 'properties': { 307 | 'every_type': { 308 | 'type': ['integer', 'null', 'number', 'boolean', 'string', 'array', 'object'], 309 | 'items': {'type': 'integer'}, 310 | 'format': 'date-time', 311 | 'properties': { 312 | 'i': {'type': 'integer'}, 313 | 'n': {'type': 'number'}, 314 | 'b': {'type': 'boolean'} 315 | } 316 | } 317 | } 318 | }, 319 | [], 320 | []) 321 | assert 2 == len(denested) 322 | 323 | table_batch = _get_table_batch_with_path(denested, tuple()) 324 | denested_props = table_batch['streamed_schema']['schema']['properties'] 325 | 326 | assert 4 == len(denested_props) 327 | 328 | anyof_schemas = denested_props[('every_type',)]['anyOf'] 329 | 330 | assert 4 == len(anyof_schemas) 331 | assert 4 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 332 | assert 4 == len([x for x in anyof_schemas if json_schema.is_nullable(x)]) 333 | assert 1 == len([x for x in anyof_schemas if json_schema.is_datetime(x)]) 334 | 335 | 336 | def test__anyOf__schema__implicit_any_of__arrays(): 337 | denested = error_check_denest( 338 | { 339 | 'properties': { 340 | 'every_type': { 341 | 'type': ['null', 'string', 'array', 'object'], 342 | 'items': { 343 | 'anyOf': [ 344 | {'type': 'integer'}, 345 | {'type': 'number'}] 346 | }, 347 | 'format': 'date-time', 348 | 'properties': { 349 | 'i': {'type': 'integer'} 350 | } 351 | } 352 | } 353 | }, 354 | [], 355 | []) 356 | assert 2 == len(denested) 357 | 358 | table_batch = _get_table_batch_with_path(denested, ('every_type',)) 359 | denested_props = table_batch['streamed_schema']['schema']['properties'] 360 | anyof_schemas = denested_props[(singer.VALUE,)]['anyOf'] 361 | 362 | assert 2 == len(anyof_schemas) 363 | assert 2 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 364 | 365 | 366 | def test__anyOf__schema__implicit_any_of__objects(): 367 | denested = error_check_denest( 368 | { 369 | 'properties': { 370 | 'every_type': { 371 | 'type': ['integer', 'null', 'number', 'boolean', 'string', 'array', 'object'], 372 | 'items': {'type': 'integer'}, 373 | 'format': 'date-time', 374 | 'properties': { 375 | 'i': {'anyOf': [ 376 | {'type': 'integer'}, 377 | {'type': 'number'}, 378 | {'type': 'boolean'}] 379 | } 380 | } 381 | } 382 | } 383 | }, 384 | [], 385 | []) 386 | assert 2 == len(denested) 387 | 388 | table_batch = _get_table_batch_with_path(denested, tuple()) 389 | denested_props = table_batch['streamed_schema']['schema']['properties'] 390 | print(denested_props) 391 | anyof_schemas = denested_props[('every_type', 'i')]['anyOf'] 392 | 393 | assert 3 == len(anyof_schemas) 394 | assert 3 == len([x for x in anyof_schemas if json_schema.is_literal(x)]) 395 | -------------------------------------------------------------------------------- /tests/unit/test_sandbox.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import psycopg2 4 | import psycopg2.extras 5 | import pytest 6 | 7 | from utils.fixtures import CONFIG, db_cleanup, ListStream, TEST_DB 8 | from target_postgres import main 9 | 10 | 11 | def assert_tables_equal(cursor, expected_table_names): 12 | cursor.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'") 13 | tables = [] 14 | for table in cursor.fetchall(): 15 | tables.append(table[0]) 16 | 17 | assert (not tables and not expected_table_names) \ 18 | or set(tables) == expected_table_names 
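# All of the helpers in this module inspect `information_schema` for the `public`
# schema: `assert_tables_equal` above compares table names, while
# `assert_columns_equal` and `assert_count_equal` below compare
# (column_name, data_type, is_nullable) tuples and row counts respectively.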
19 | 20 | 21 | def assert_columns_equal(cursor, table_name, expected_column_tuples): 22 | cursor.execute("SELECT column_name, data_type, is_nullable FROM information_schema.columns " + \ 23 | "WHERE table_schema = 'public' and table_name = '{}';".format( 24 | table_name)) 25 | columns = cursor.fetchall() 26 | 27 | assert (not columns and not expected_column_tuples) \ 28 | or set(columns) == expected_column_tuples 29 | 30 | 31 | def assert_count_equal(cursor, table_name, n): 32 | cursor.execute('SELECT count(*) FROM "public"."{}"'.format(table_name)) 33 | assert cursor.fetchone()[0] == n 34 | 35 | 36 | class BigCommerceStream(ListStream): 37 | stream = [ 38 | {"type": "SCHEMA", 39 | "stream": "products", 40 | "schema": { 41 | "type": "object", 42 | "properties": {"id": {"type": "integer"}, 43 | "name": {"type": ["null", 44 | "string"]}, 45 | "type": {"type": ["null", 46 | "string"]}, 47 | "sku": {"type": ["null", 48 | "string"]}, 49 | "description": {"type": ["null", 50 | "string"]}, 51 | "weight": {"type": ["null", 52 | "integer"]}, 53 | "width": {"type": ["null", 54 | "integer"]}, 55 | "depth": {"type": ["null", 56 | "integer"]}, 57 | "height": {"type": ["null", 58 | "integer"]}, 59 | "price": {"type": ["null", 60 | "integer", 61 | "number"]}, 62 | "cost_price": {"type": ["null", 63 | "integer"]}, 64 | "retail_price": {"type": ["null", 65 | "integer"]}, 66 | "sale_price": {"type": ["null", 67 | "integer"]}, 68 | "map_price": {"type": ["null", 69 | "integer"]}, 70 | "tax_class_id": {"type": ["null", 71 | "integer"]}, 72 | "product_tax_code": {"type": ["null", 73 | "string"]}, 74 | "calculated_price": {"type": ["null", 75 | "integer", 76 | "number"]}, 77 | "categories": {"type": ["null", 78 | "array"], 79 | "items": {"type": ["null", 80 | "integer"]}}, 81 | "brand_id": {"type": ["null", 82 | "integer"]}, 83 | "option_set_id": {"type": ["null", 84 | "integer"]}, 85 | "option_set_display": {"type": ["null", 86 | "string"]}, 87 | "inventory_level": {"type": ["null", 88 | "integer"]}, 89 | "inventory_warning_level": {"type": ["null", 90 | "integer"]}, 91 | "inventory_tracking": {"type": ["null", 92 | "string"]}, 93 | "reviews_rating_sum": {"type": ["null", 94 | "integer"]}, 95 | "reviews_count": {"type": ["null", 96 | "integer"]}, 97 | "total_sold": {"type": ["null", 98 | "integer"]}, 99 | "fixed_cost_shipping_price": {"type": ["null", 100 | "integer"]}, 101 | "is_free_shipping": {"type": ["null", 102 | "boolean"]}, 103 | "is_visible": {"type": ["null", 104 | "boolean"]}, 105 | "is_featured": {"type": ["null", 106 | "boolean"]}, 107 | "related_products": {"type": ["null", 108 | "array"], 109 | "items": {"type": ["null", 110 | "integer"]}}, 111 | "warranty": {"type": ["null", 112 | "string"]}, 113 | "bin_picking_number": {"type": ["null", 114 | "string"]}, 115 | "layout_file": {"type": ["null", 116 | "string"]}, 117 | "upc": {"type": ["null", 118 | "string"]}, 119 | "mpn": {"type": ["null", 120 | "string"]}, 121 | "gtin": {"type": ["null", 122 | "string"]}, 123 | "search_keywords": {"type": ["null", 124 | "string"]}, 125 | "availability": {"type": ["null", 126 | "string"]}, 127 | "availability_description": {"type": ["null", 128 | "string"]}, 129 | "gift_wrapping_options_type": {"type": ["null", 130 | "string"]}, 131 | "sort_order": {"type": ["null", 132 | "integer"]}, 133 | "condition": {"type": ["null", 134 | "string"]}, 135 | "is_condition_shown": {"type": ["null", 136 | "boolean"]}, 137 | "order_quantity_minimum": {"type": ["null", 138 | "integer"]}, 139 | "order_quantity_maximum": {"type": 
["null", 140 | "integer"]}, 141 | "page_title": {"type": ["null", 142 | "string"]}, 143 | "meta_description": {"type": ["null", 144 | "string"]}, 145 | "date_created": {"type": "string", 146 | "format": "date-time"}, 147 | "date_modified": {"type": "string", 148 | "format": "date-time"}, 149 | "view_count": {"type": ["null", 150 | "integer"]}, 151 | "preorder_release_date": {"type": ["null", 152 | "string"], 153 | "format": "date-time"}, 154 | "preorder_message": {"type": ["null", 155 | "string"]}, 156 | "is_preorder_only": {"type": ["null", 157 | "boolean"]}, 158 | "is_price_hidden": {"type": ["null", 159 | "boolean"]}, 160 | "price_hidden_label": {"type": ["null", 161 | "string"]}, 162 | "custom_url": { 163 | "type": ["null", 164 | "object"], 165 | "properties": {"url": {"type": ["null", 166 | "string"]}, 167 | "is_customized": {"type": ["null", 168 | "boolean"]}}}, 169 | "base_variant_id": {"type": ["null", 170 | "integer"]}, 171 | "open_graph_type": {"type": ["null", 172 | "string"]}, 173 | "open_graph_title": {"type": ["null", 174 | "string"]}, 175 | "open_graph_description": {"type": ["null", 176 | "string"]}, 177 | "open_graph_use_meta_description": {"type": ["null", 178 | "boolean"]}, 179 | "open_graph_use_product_name": {"type": ["null", 180 | "boolean"]}, 181 | "open_graph_use_image": {"type": ["null", 182 | "boolean"]}}}, 183 | "key_properties": ["id"]}, 184 | {"type": "RECORD", 185 | "stream": "products", 186 | "record": {"id": 1, 187 | "name": "SAMPLE", 188 | "type": "physical", 189 | "sku": "very-sku-y", 190 | "description": "

some\nrandom\nhtml
", 191 | "weight": 123, 192 | "width": 0, 193 | "depth": 0, 194 | "height": 0, 195 | "price": 31.45, 196 | "cost_price": 0, 197 | "retail_price": 0, 198 | "sale_price": 0, 199 | "map_price": 0, 200 | "tax_class_id": 0, 201 | "product_tax_code": "", 202 | "calculated_price": 31.45, 203 | "categories": [32, 22, 21, 20], 204 | "brand_id": 42, 205 | "option_set_id": None, 206 | "option_set_display": "right", 207 | "inventory_level": 0, 208 | "inventory_warning_level": 0, 209 | "inventory_tracking": "none", 210 | "reviews_rating_sum": 0, 211 | "reviews_count": 0, 212 | "total_sold": 0, 213 | "fixed_cost_shipping_price": 0, 214 | "is_free_shipping": False, 215 | "is_visible": True, 216 | "is_featured": False, 217 | "related_products": [-1], 218 | "warranty": "", 219 | "bin_picking_number": "0", 220 | "layout_file": "a-product.html", 221 | "upc": "", 222 | "mpn": "", 223 | "gtin": "", 224 | "search_keywords": "", 225 | "availability": "available", 226 | "availability_description": "", 227 | "gift_wrapping_options_type": "any", 228 | "sort_order": 0, 229 | "condition": "New", 230 | "is_condition_shown": False, 231 | "order_quantity_minimum": 0, 232 | "order_quantity_maximum": 0, 233 | "page_title": "", 234 | "meta_description": "", 235 | "date_created": "2018-08-27T18:40:23.000000Z", 236 | "date_modified": "2018-08-27T20:45:53.000000Z", 237 | "view_count": 31, 238 | "preorder_release_date": None, 239 | "preorder_message": "0", 240 | "is_preorder_only": False, 241 | "is_price_hidden": False, 242 | "price_hidden_label": "0", 243 | "custom_url": {"url": "/SAMPLE/", 244 | "is_customized": False}, 245 | "base_variant_id": 77, 246 | "open_graph_type": "product", 247 | "open_graph_title": "", 248 | "open_graph_description": "", 249 | "open_graph_use_meta_description": True, 250 | "open_graph_use_product_name": True, 251 | "open_graph_use_image": True}}, 252 | {"type": "STATE", 253 | "value": {"bookmarks": {"products": "2018-11-17T21:26:50+00:00"}}}, 254 | {"type": "SCHEMA", 255 | "stream": "customers", 256 | "schema": { 257 | "properties": {"id": {"type": "integer"}, 258 | "company": {"type": ["null", 259 | "string"]}, 260 | "first_name": {"type": ["null", 261 | "string"]}, 262 | "last_name": {"type": ["null", 263 | "string"]}, 264 | "email": {"type": ["null", 265 | "string"]}, 266 | "phone": {"type": ["null", 267 | "string"]}, 268 | "form_fields": {"type": ["null"]}, 269 | "date_created": {"format": "date-time", 270 | "type": "string"}, 271 | "date_modified": {"format": "date-time", 272 | "type": "string"}, 273 | "store_credit": {"type": ["null", 274 | "string"]}, 275 | "registration_ip_address": {"type": ["null", 276 | "string"]}, 277 | "customer_group_id": {"type": ["null", 278 | "integer"]}, 279 | "notes": {"type": ["null", 280 | "string"]}, 281 | "tax_exempt_category": {"type": ["null", 282 | "string"]}, 283 | "reset_pass_on_login": {"type": ["null", 284 | "boolean"]}, 285 | "accepts_marketing": {"type": ["null", 286 | "boolean"]}, 287 | "addresses": { 288 | "properties": {"url": {"type": ["null", 289 | "string"]}, 290 | "resource": {"type": ["null", 291 | "string"]}}, 292 | "type": ["null", 293 | "object"]}}, 294 | "type": ["null", 295 | "object"]}, 296 | "key_properties": ["id"]}, 297 | {"type": "RECORD", 298 | "stream": "customers", 299 | "record": {"id": 1, 300 | "company": "", 301 | "first_name": "Data", 302 | "last_name": "Mill", 303 | "email": "test@test.com", 304 | "phone": "1231231234", 305 | "form_fields": None, 306 | "date_created": "2018-11-17T21:25:00.000000Z", 307 | "date_modified": 
"2018-11-17T21:25:01.000000Z", 308 | "store_credit": "0.0000", 309 | "registration_ip_address": "127.0.0.1", 310 | "customer_group_id": 0, 311 | "notes": "", 312 | "tax_exempt_category": "", 313 | "reset_pass_on_login": False, 314 | "accepts_marketing": False, 315 | "addresses": {"url": "https://api.bigcommerce.com/stores/some-unique-hash/v2/customers/1/addresses", 316 | "resource": "/customers/1/addresses"}}}, 317 | {"type": "STATE", 318 | "value": {"bookmarks": {"products": "2018-11-17T21:26:50+00:00", 319 | "customers": "2018-11-17T21:25:01+00:00"}}}] 320 | 321 | 322 | def test_bigcommerce__sandbox(db_cleanup): 323 | main(CONFIG, input_stream=BigCommerceStream()) 324 | 325 | with psycopg2.connect(**TEST_DB) as conn: 326 | with conn.cursor() as cur: 327 | assert_tables_equal(cur, 328 | {'products', 329 | 'customers', 330 | 'products__categories', 331 | 'products__related_products'}) 332 | 333 | ## form_fields should not show up as it can only be `null` 334 | assert_columns_equal(cur, 335 | 'customers', 336 | { 337 | ('_sdc_table_version', 'bigint', 'YES'), 338 | ('_sdc_received_at', 'timestamp with time zone', 'YES'), 339 | ('_sdc_sequence', 'bigint', 'YES'), 340 | ('_sdc_batched_at', 'timestamp with time zone', 'YES'), 341 | ('id', 'bigint', 'NO'), 342 | ('date_modified', 'timestamp with time zone', 'NO'), 343 | ('store_credit', 'text', 'YES'), 344 | ('notes', 'text', 'YES'), 345 | ('tax_exempt_category', 'text', 'YES'), 346 | ('email', 'text', 'YES'), 347 | ('company', 'text', 'YES'), 348 | ('customer_group_id', 'bigint', 'YES'), 349 | ('registration_ip_address', 'text', 'YES'), 350 | ('date_created', 'timestamp with time zone', 'NO'), 351 | ('accepts_marketing', 'boolean', 'YES'), 352 | ('addresses__resource', 'text', 'YES'), 353 | ('reset_pass_on_login', 'boolean', 'YES'), 354 | ('addresses__url', 'text', 'YES'), 355 | ('first_name', 'text', 'YES'), 356 | ('phone', 'text', 'YES'), 357 | ('last_name', 'text', 'YES') 358 | }) 359 | 360 | 361 | class HubspotStream(ListStream): 362 | stream = [ 363 | {"type": "SCHEMA", 364 | "stream": "deals", 365 | "schema": { 366 | "type": "object", 367 | "properties": { 368 | "properties": { 369 | "type": "object", 370 | "properties": { 371 | "num_contacted_notes": { 372 | "type": "object", 373 | "properties": { 374 | "value": { 375 | "type": ["null", "number", "string"] 376 | }}}}}}}, 377 | "key_properties": []}, 378 | {"type": "RECORD", 379 | "stream": "deals", 380 | "record": {}}, 381 | {"type": "RECORD", 382 | "stream": "deals", 383 | "record": { 384 | "properties": {}}}, 385 | {"type": "RECORD", 386 | "stream": "deals", 387 | "record": { 388 | "properties": { 389 | "num_contacted_notes": {}}}}, 390 | {"type": "RECORD", 391 | "stream": "deals", 392 | "record": { 393 | "properties": { 394 | "num_contacted_notes": { 395 | "value": None}}}}, 396 | {"type": "RECORD", 397 | "stream": "deals", 398 | "record": { 399 | "properties": { 400 | "num_contacted_notes": { 401 | "value": "helloworld"}}}}, 402 | {"type": "RECORD", 403 | "stream": "deals", 404 | "record": { 405 | "properties": { 406 | "num_contacted_notes": { 407 | "value": 12345}}}}, 408 | {"type": "RECORD", 409 | "stream": "deals", 410 | "record": { 411 | "properties": { 412 | "num_contacted_notes": { 413 | "value": 12345.6789}}}}] 414 | 415 | 416 | def test_hubspot__sandbox(db_cleanup): 417 | config = CONFIG.copy() 418 | config['persist_empty_tables'] = True 419 | main(config, input_stream=HubspotStream()) 420 | 421 | with psycopg2.connect(**TEST_DB) as conn: 422 | with conn.cursor() as cur: 423 
| assert_tables_equal(cur, 424 | {'deals'}) 425 | 426 | assert_columns_equal(cur, 427 | 'deals', 428 | { 429 | ('_sdc_table_version', 'bigint', 'YES'), 430 | ('_sdc_received_at', 'timestamp with time zone', 'YES'), 431 | ('_sdc_sequence', 'bigint', 'YES'), 432 | ('_sdc_primary_key', 'text', 'NO'), 433 | ('_sdc_batched_at', 'timestamp with time zone', 'YES'), 434 | ('properties__num_contacted_notes__value__f', 'double precision', 'YES'), 435 | ('properties__num_contacted_notes__value__s', 'text', 'YES') 436 | }) 437 | 438 | assert_count_equal(cur, 439 | 'deals', 440 | 7) 441 | -------------------------------------------------------------------------------- /tests/unit/test_target_tools.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import json 3 | 4 | from unittest.mock import patch 5 | import pytest 6 | 7 | from target_postgres import singer_stream 8 | from target_postgres import target_tools 9 | from target_postgres.sql_base import SQLInterface 10 | 11 | from utils.fixtures import CONFIG, CatStream, ListStream, InvalidCatStream, DogStream 12 | 13 | 14 | class Target(SQLInterface): 15 | IDENTIFIER_FIELD_LENGTH = 50 16 | 17 | def __init__(self): 18 | self.calls = {'write_batch': [], 'activate_version': []} 19 | 20 | def write_batch(self, stream_buffer): 21 | self.calls['write_batch'].append({'records_count': len(stream_buffer.peek_buffer())}) 22 | return None 23 | 24 | def activate_version(self, stream_buffer, version): 25 | self.calls['activate_version'].append({'records_count': len(stream_buffer.peek_buffer())}) 26 | return None 27 | 28 | 29 | def filtered_output(capsys): 30 | out, _ = capsys.readouterr() 31 | return list(filter(None, out.split('\n'))) 32 | 33 | 34 | def test_usage_stats(): 35 | config = deepcopy(CONFIG) 36 | assert config['disable_collection'] 37 | 38 | with patch.object(target_tools, 39 | '_async_send_usage_stats') as mock: 40 | target_tools.stream_to_target([], None, config=config) 41 | 42 | assert mock.call_count == 0 43 | 44 | config['disable_collection'] = False 45 | 46 | target_tools.stream_to_target([], None, config=config) 47 | 48 | assert mock.call_count == 1 49 | 50 | 51 | def test_loading__invalid__records(): 52 | with pytest.raises(singer_stream.SingerStreamError, match=r'.*'): 53 | target_tools.stream_to_target(InvalidCatStream(1), None, config=CONFIG) 54 | 55 | 56 | def test_loading__invalid__records__disable(): 57 | config = deepcopy(CONFIG) 58 | config['invalid_records_detect'] = False 59 | 60 | target = Target() 61 | 62 | target_tools.stream_to_target(InvalidCatStream(100), target, config=config) 63 | 64 | ## Since all `cat`s records were invalid, we could not persist them, hence, no calls made to `write_batch` 65 | assert len(target.calls['write_batch']) == 1 66 | assert target.calls['write_batch'][0]['records_count'] == 0 67 | 68 | 69 | def test_loading__invalid__records__threshold(): 70 | config = deepcopy(CONFIG) 71 | config['invalid_records_threshold'] = 10 72 | 73 | target = Target() 74 | 75 | with pytest.raises(singer_stream.SingerStreamError, match=r'.*.10*'): 76 | target_tools.stream_to_target(InvalidCatStream(20), target, config=config) 77 | 78 | assert len(target.calls['write_batch']) == 0 79 | 80 | 81 | def test_activate_version(): 82 | config = CONFIG.copy() 83 | config['max_batch_rows'] = 20 84 | config['batch_detection_threshold'] = 11 85 | 86 | records = [{"type": "RECORD", 87 | "stream": "abc", 88 | "record": {}, 89 | "version": 123}] * 
(config['batch_detection_threshold'] - 1) 90 | 91 | class TestStream(ListStream): 92 | stream = [ 93 | {"type": "SCHEMA", 94 | "stream": "abc", 95 | "schema": { 96 | "type": "object", 97 | "properties": { 98 | 'a': {'type': 'number'}}}, 99 | "key_properties": []} 100 | ] + records + [ 101 | {'type': 'ACTIVATE_VERSION', 102 | 'stream': "abc", 103 | 'version': 123} 104 | ] + records 105 | 106 | target = Target() 107 | 108 | target_tools.stream_to_target(TestStream(), target, config=config) 109 | 110 | rows_persisted = 0 111 | for call in target.calls['write_batch']: 112 | rows_persisted += call['records_count'] 113 | 114 | expected_rows = (2 * len(records)) 115 | assert rows_persisted == expected_rows 116 | 117 | 118 | def test_record_with_multiple_of(): 119 | values = [1, 1.0, 2, 2.0, 3, 7, 10.1] 120 | records = [] 121 | for value in values: 122 | records.append({ 123 | "type": "RECORD", 124 | "stream": "test", 125 | "record": {"multipleOfKey": value}, 126 | }) 127 | 128 | class TestStream(ListStream): 129 | stream = [ 130 | { 131 | "type": "SCHEMA", 132 | "stream": "test", 133 | "schema": { 134 | "properties": { 135 | "multipleOfKey": { 136 | "type": "number", 137 | "multipleOf": 1e-15 138 | } 139 | } 140 | }, 141 | "key_properties": [] 142 | } 143 | ] + records 144 | 145 | target = Target() 146 | 147 | target_tools.stream_to_target(TestStream(), target, config=CONFIG.copy()) 148 | 149 | expected_rows = len(records) 150 | rows_persisted = 0 151 | for call in target.calls['write_batch']: 152 | rows_persisted += call['records_count'] 153 | 154 | assert rows_persisted == expected_rows 155 | 156 | 157 | def test_state__capture(capsys): 158 | stream = [ 159 | json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}), 160 | json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})] 161 | 162 | target_tools.stream_to_target(stream, Target()) 163 | output = filtered_output(capsys) 164 | 165 | assert len(output) == 2 166 | assert json.loads(output[0])['test'] == 'state-1' 167 | assert json.loads(output[1])['test'] == 'state-2' 168 | 169 | 170 | def test_state__capture_can_be_disabled(capsys): 171 | stream = [ 172 | json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}), 173 | json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})] 174 | 175 | target_tools.stream_to_target(stream, Target(), {'state_support': False}) 176 | output = filtered_output(capsys) 177 | 178 | assert len(output) == 0 179 | 180 | 181 | def test_state__emits_only_messages_when_all_records_before_have_been_flushed(capsys): 182 | config = CONFIG.copy() 183 | config['max_batch_rows'] = 20 184 | config['batch_detection_threshold'] = 1 185 | rows = list(CatStream(100)) 186 | target = Target() 187 | 188 | def test_stream(): 189 | yield rows[0] 190 | for row in rows[slice(1, 5)]: 191 | yield row 192 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 193 | for row in rows[slice(6, 10)]: 194 | yield row 195 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}}) 196 | for row in rows[slice(11, 15)]: 197 | yield row 198 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-3'}}) 199 | 200 | # After some state messages but before the batch size has been hit no state messages should have been emitted 201 | assert len(target.calls['write_batch']) == 0 202 | output = filtered_output(capsys) 203 | assert output == [] 204 | 205 | for row in rows[slice(16, 25)]: 206 | yield row 207 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-4'}}) 208 | 209 | # After the batch size has been hit 
and a write_batch call was made, the most recent safe to emit state should have been emitted 210 | assert len(target.calls['write_batch']) == 1 211 | output = filtered_output(capsys) 212 | assert len(output) == 1 213 | assert json.loads(output[0])['test'] == 'state-3' 214 | 215 | for row in rows[slice(26, 31)]: 216 | yield row 217 | 218 | target_tools.stream_to_target(test_stream(), target, config=config) 219 | 220 | # The final state message should have been outputted after the last records were loaded 221 | output = filtered_output(capsys) 222 | assert len(output) == 1 223 | assert json.loads(output[0])['test'] == 'state-4' 224 | 225 | 226 | def test_state__emits_most_recent_state_when_final_flush_occurs(capsys): 227 | config = CONFIG.copy() 228 | config['max_batch_rows'] = 20 229 | config['batch_detection_threshold'] = 1 230 | rows = list(CatStream(5)) 231 | rows.append(json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})) 232 | 233 | target_tools.stream_to_target(rows, Target(), config=config) 234 | 235 | # The final state message should have been outputted after the last records were loaded despite not reaching 236 | # one full flushable batch 237 | output = filtered_output(capsys) 238 | assert len(output) == 1 239 | assert json.loads(output[0])['test'] == 'state-1' 240 | 241 | 242 | def test_state__doesnt_emit_when_only_one_of_several_streams_is_flushing(capsys): 243 | config = CONFIG.copy() 244 | config['max_batch_rows'] = 20 245 | config['batch_detection_threshold'] = 1 246 | cat_rows = list(CatStream(100)) 247 | dog_rows = list(DogStream(50)) 248 | target = Target() 249 | 250 | # Simulate one stream that yields a lot of records with another that yields few records and ensure both need to be flushed 251 | # before any state messages are emitted 252 | def test_stream(): 253 | yield cat_rows[0] 254 | yield dog_rows[0] 255 | for row in cat_rows[slice(1, 5)]: 256 | yield row 257 | for row in dog_rows[slice(1, 5)]: 258 | yield row 259 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 260 | 261 | for row in cat_rows[slice(6, 45)]: 262 | yield row 263 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}}) 264 | 265 | for row in cat_rows[slice(46, 65)]: 266 | yield row 267 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-3'}}) 268 | 269 | # After some state messages but before the batch size has been hit for both streams no state messages should have been emitted 270 | assert len(target.calls['write_batch']) == 3 271 | output = filtered_output(capsys) 272 | assert output == [] 273 | 274 | for row in dog_rows[slice(6, 25)]: 275 | yield row 276 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-4'}}) 277 | 278 | # After the batch size has been hit and a write_batch call was made, the most recent safe to emit state should have been emitted 279 | assert len(target.calls['write_batch']) == 4 280 | output = filtered_output(capsys) 281 | assert len(output) == 1 282 | assert json.loads(output[0])['test'] == 'state-2' 283 | 284 | target_tools.stream_to_target(test_stream(), target, config=config) 285 | 286 | # The final state message should have been outputted after the last dog records were loaded despite not reaching one full flushable batch 287 | output = filtered_output(capsys) 288 | assert len(output) == 1 289 | assert json.loads(output[0])['test'] == 'state-4' 290 | 291 | 292 | def test_state__emits_when_multiple_streams_are_registered_but_records_arrive_from_only_one(capsys): 293 | config = CONFIG.copy() 294 | 
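    # As in the other state tests, `max_batch_rows` caps how many records are buffered
    # per stream before a flush, and `batch_detection_threshold` controls how often the
    # flush/state-emission check runs; this reading is an interpretation inferred from
    # how the surrounding tests use these config keys.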
config['max_batch_rows'] = 20 295 | config['batch_detection_threshold'] = 1 296 | cat_rows = list(CatStream(100)) 297 | dog_rows = list(DogStream(50)) 298 | target = Target() 299 | 300 | # Simulate one stream that yields a lot of records with another that yields no records, and ensure that only the first 301 | # needs to be flushed before any state messages are emitted 302 | def test_stream(): 303 | yield cat_rows[0] 304 | yield dog_rows[0] 305 | for row in cat_rows[slice(1, 5)]: 306 | yield row 307 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 308 | 309 | for row in cat_rows[slice(6, 25)]: 310 | yield row 311 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}}) 312 | 313 | # After some state messages and only one of the registered streams has hit the batch size, the state message should be emitted, as there are no unflushed records from the other stream yet 314 | assert len(target.calls['write_batch']) == 1 315 | output = filtered_output(capsys) 316 | assert len(output) == 1 317 | assert json.loads(output[0])['test'] == 'state-1' 318 | 319 | 320 | target_tools.stream_to_target(test_stream(), target, config=config) 321 | 322 | # The final state message should have been outputted after the last dog records were loaded despite not reaching one full flushable batch 323 | output = filtered_output(capsys) 324 | assert len(output) == 1 325 | assert json.loads(output[0])['test'] == 'state-2' 326 | 327 | 328 | def test_state__doesnt_emit_when_it_isnt_different_than_the_previous_emission(capsys): 329 | config = CONFIG.copy() 330 | config['max_batch_rows'] = 5 331 | config['batch_detection_threshold'] = 1 332 | rows = list(CatStream(100)) 333 | target = Target() 334 | 335 | def test_stream(): 336 | yield rows[0] 337 | for row in rows[slice(1, 21)]: 338 | yield row 339 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 340 | output = filtered_output(capsys) 341 | assert len(output) == 1 342 | 343 | for row in rows[slice(22, 99)]: 344 | yield row 345 | yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}) 346 | 347 | output = filtered_output(capsys) 348 | assert len(output) == 0 349 | 350 | target_tools.stream_to_target(test_stream(), target, config=config) 351 | 352 | output = filtered_output(capsys) 353 | assert len(output) == 0 354 | -------------------------------------------------------------------------------- /tests/utils/fixtures.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | 5 | import pytest 6 | import psycopg2 7 | import arrow 8 | from faker import Faker 9 | from chance import chance 10 | 11 | CONFIG = { 12 | 'postgres_host': os.environ['POSTGRES_HOST'], 13 | 'postgres_database': os.environ['POSTGRES_DATABASE'], 14 | 'postgres_username': os.environ['POSTGRES_USERNAME'], 15 | 'disable_collection': True, 16 | 'logging_level': 'DEBUG' 17 | } 18 | 19 | TEST_DB = { 20 | 'host': CONFIG['postgres_host'], 21 | 'dbname': CONFIG['postgres_database'], 22 | 'user': CONFIG['postgres_username'] 23 | } 24 | 25 | fake = Faker() 26 | 27 | CATS_SCHEMA = { 28 | 'type': 'SCHEMA', 29 | 'stream': 'cats', 30 | 'schema': { 31 | 'additionalProperties': False, 32 | 'properties': { 33 | 'id': { 34 | 'type': 'integer' 35 | }, 36 | 'name': { 37 | 'type': ['string'] 38 | }, 39 | 'bio': { 40 | 'type': ['string'] 41 | }, 42 | 'paw_size': { 43 | 'type': ['integer'], 44 | 'default': 314159 45 | }, 46 | 'paw_colour': { 47 | 'type': 'string', 48 | 'default': '' 49 | }, 50 
| 'flea_check_complete': { 51 | 'type': ['boolean'], 52 | 'default': False 53 | }, 54 | 'pattern': { 55 | 'type': ['null', 'string'] 56 | }, 57 | 'age': { 58 | 'type': ['null', 'integer'] 59 | }, 60 | 'adoption': { 61 | 'type': ['object', 'null'], 62 | 'properties': { 63 | 'adopted_on': { 64 | 'type': ['null', 'string'], 65 | 'format': 'date-time' 66 | }, 67 | 'was_foster': { 68 | 'type': 'boolean' 69 | }, 70 | 'immunizations': { 71 | 'type': ['null', 'array'], 72 | 'items': { 73 | 'type': ['object'], 74 | 'properties': { 75 | 'type': { 76 | 'type': ['null', 'string'] 77 | }, 78 | 'date_administered': { 79 | 'type': ['null', 'string'], 80 | 'format': 'date-time' 81 | } 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } 88 | }, 89 | 'key_properties': ['id'] 90 | } 91 | 92 | 93 | class FakeStream(object): 94 | def __init__(self, 95 | n, 96 | *args, 97 | version=None, 98 | nested_count=0, 99 | duplicates=0, 100 | duplicate_sequence_delta=200, 101 | sequence=None, 102 | **kwargs): 103 | self.n = n 104 | self.wrote_schema = False 105 | self.id = 1 106 | self.nested_count = nested_count 107 | self.version = version 108 | self.wrote_activate_version = False 109 | self.records = [] 110 | self.duplicates = duplicates 111 | self.duplicates_written = 0 112 | self.duplicate_pks_used = [] 113 | self.record_message_count = 0 114 | if sequence: 115 | self.sequence = sequence 116 | else: 117 | self.sequence = arrow.get().int_timestamp 118 | self.duplicate_sequence_delta = duplicate_sequence_delta 119 | 120 | def duplicate(self, force=False): 121 | if self.duplicates > 0 and \ 122 | len(self.records) > 0 and \ 123 | self.duplicates_written < self.duplicates and \ 124 | (force or chance.boolean(likelihood=30)): 125 | self.duplicates_written += 1 126 | random_index = random.randint(0, len(self.records) - 1) 127 | record = self.records[random_index] 128 | self.duplicate_pks_used.append(record['id']) 129 | record_message = self.generate_record_message(record=record) 130 | record_message['sequence'] = self.sequence + self.duplicate_sequence_delta 131 | return record_message 132 | else: 133 | return False 134 | 135 | def generate_record_message(self, record=None): 136 | if not record: 137 | record = self.generate_record() 138 | self.id += 1 139 | 140 | self.records.append(record) 141 | message = { 142 | 'type': 'RECORD', 143 | 'stream': self.stream, 144 | 'record': record, 145 | 'sequence': self.sequence 146 | } 147 | 148 | if self.version is not None: 149 | message['version'] = self.version 150 | 151 | self.record_message_count += 1 152 | 153 | return message 154 | 155 | def activate_version(self): 156 | self.wrote_activate_version = True 157 | return { 158 | 'type': 'ACTIVATE_VERSION', 159 | 'stream': self.stream, 160 | 'version': self.version 161 | } 162 | 163 | def __iter__(self): 164 | return self 165 | 166 | def __next__(self): 167 | if not self.wrote_schema: 168 | self.wrote_schema = True 169 | return json.dumps(self.schema) 170 | if self.id <= self.n: 171 | dup = self.duplicate() 172 | if dup != False: 173 | return json.dumps(dup) 174 | return json.dumps(self.generate_record_message()) 175 | if self.id == self.n: 176 | dup = self.duplicate(force=True) 177 | if dup != False: 178 | return json.dumps(dup) 179 | if self.version is not None and self.wrote_activate_version == False: 180 | return json.dumps(self.activate_version()) 181 | raise StopIteration 182 | 183 | 184 | def fake_conjunctive_text(n): 185 | t = fake.text() 186 | for i in range(0, n): 187 | t = '{}, {} {}'.format( 188 | t[:-1], 189 | 
chance.pickone(['and', 'or', 'for', 'nor', 'but', 'yet', 'so']), 190 | fake.text()) 191 | return t 192 | 193 | 194 | class CatStream(FakeStream): 195 | stream = 'cats' 196 | schema = CATS_SCHEMA 197 | 198 | def generate_record(self): 199 | adoption = None 200 | if self.nested_count or chance.boolean(likelihood=70): 201 | immunizations = [] 202 | for i in range(0, self.nested_count or random.randint(0, 4)): 203 | immunizations.append({ 204 | 'type': chance.pickone(['FIV', 'Panleukopenia', 'Rabies', 'Feline Leukemia']), 205 | 'date_administered': chance.date(minyear=2012).isoformat() 206 | }) 207 | adoption = { 208 | 'adopted_on': chance.date(minyear=2012).isoformat(), 209 | 'was_foster': chance.boolean(), 210 | 'immunizations': immunizations 211 | } 212 | 213 | return { 214 | 'id': self.id, 215 | 'name': fake.first_name(), 216 | 'bio': fake_conjunctive_text(random.randint(0, 10)), 217 | 'pattern': chance.pickone(['Tabby', 'Tuxedo', 'Calico', 'Tortoiseshell']), 218 | 'age': random.randint(1, 15), 219 | 'adoption': adoption 220 | } 221 | 222 | 223 | class InvalidCatStream(CatStream): 224 | def generate_record(self): 225 | record = CatStream.generate_record(self) 226 | 227 | if chance.boolean(likelihood=50): 228 | record['adoption'] = ['invalid', 'adoption'] 229 | elif chance.boolean(likelihood=50): 230 | record['age'] = 'very invalid age' 231 | elif record['adoption'] and chance.boolean(likelihood=50): 232 | record['adoption']['immunizations'] = { 233 | 'type': chance.pickone(['a', 'b', 'c']), 234 | 'date_administered': ['clearly', 'not', 'a', 'date'] 235 | } 236 | else: 237 | record['name'] = 22 / 7 238 | 239 | return record 240 | 241 | 242 | NESTED_STREAM = { 243 | 'type': 'SCHEMA', 244 | 'stream': 'root', 245 | 'schema': { 246 | 'additionalProperties': False, 247 | 'properties': { 248 | 'id': { 249 | 'type': 'integer' 250 | }, 251 | ## TODO: Complex types defaulted 252 | # 'array_scalar_defaulted': { 253 | # 'type': 'array', 254 | # 'items': {'type': 'integer'}, 255 | # 'default': list(range(10)) 256 | # }, 257 | 'array_scalar': { 258 | 'type': 'array', 259 | 'items': {'type': 'integer'} 260 | }, 261 | 'array_of_array': { 262 | 'type': 'array', 263 | 'items': { 264 | 'type': 'array', 265 | 'items': { 266 | 'type': 'array', 267 | 'items': {'type': 'integer'} 268 | } 269 | } 270 | }, 271 | ## TODO: Complex types defaulted 272 | # 'object_defaulted': { 273 | # 'type': 'object', 274 | # 'properties': { 275 | # 'a': { 276 | # 'type': 'integer' 277 | # }, 278 | # 'b': { 279 | # 'type': 'integer' 280 | # }, 281 | # 'c': { 282 | # 'type': 'integer' 283 | # } 284 | # }, 285 | # 'default': {'a': 123, 'b': 456, 'c': 789} 286 | # }, 287 | 'object_of_object_0': { 288 | 'type': 'object', 289 | 'properties': { 290 | 'object_of_object_1': { 291 | 'type': 'object', 292 | 'properties': { 293 | 'object_of_object_2': { 294 | 'type': 'object', 295 | 'properties': { 296 | 'array_scalar': { 297 | 'type': 'array', 298 | 'items': { 299 | 'type': 'boolean' 300 | } 301 | }, 302 | 'a': { 303 | 'type': 'integer' 304 | }, 305 | 'b': { 306 | 'type': 'integer' 307 | }, 308 | 'c': { 309 | 'type': 'integer' 310 | } 311 | } 312 | } 313 | } 314 | } 315 | } 316 | }, 317 | 'null': { 318 | 'type': ['null', 'integer'] 319 | }, 320 | 'nested_null': { 321 | 'type': 'object', 322 | 'properties': { 323 | 'null': { 324 | 'type': ['null', 'integer'] 325 | } 326 | } 327 | } 328 | } 329 | }, 330 | 'key_properties': ['id'] 331 | } 332 | 333 | 334 | class NestedStream(FakeStream): 335 | stream = 'root' 336 | schema = NESTED_STREAM 337 | 
338 | def generate_record(self): 339 | null = None 340 | ## We use this trick so that we _always_ know we'll have both null and non-null values 341 | ## vs using something like chance here. 342 | if self.id % 2 == 0: 343 | null = 31415 344 | 345 | return { 346 | 'id': self.id, 347 | 'array_scalar': list(range(5)), 348 | 'array_of_array': [[[1, 2, 3], 349 | [4, 5, 6, 7, 8], 350 | [9, 10], 351 | []], 352 | [[10], 353 | [20, 30], 354 | [40, 50, 60], 355 | [70, 80, 90, 100]]], 356 | 'object_of_object_0': { 357 | 'object_of_object_1': { 358 | 'object_of_object_2': { 359 | 'array_scalar': [True, False, True, False, False], 360 | 'a': self.id, 361 | 'b': self.id, 362 | 'c': self.id 363 | } 364 | } 365 | }, 366 | 'null': null, 367 | 'nested_null': { 368 | 'null': null 369 | } 370 | } 371 | 372 | 373 | MULTI_TYPE = { 374 | 'type': 'SCHEMA', 375 | 'stream': 'root', 376 | 'schema': { 377 | 'additionalProperties': False, 378 | 'properties': { 379 | 'every_type': { 380 | 'type': ['null', 'integer', 'number', 'boolean', 'string', 'array', 'object'], 381 | 'items': {'type': 'integer'}, 382 | 'format': 'date-time', 383 | 'properties': { 384 | ## We use these field names to increase the difficulty for our column 385 | ## name collision functionality. ie, the denested values will not only 386 | ## conflict in terms of their denested _names_ but also, their types 387 | 'i': {'type': 'integer'}, 388 | 'f': {'type': 'number'}, 389 | 'b': {'type': 'boolean'} 390 | } 391 | }, 392 | 'number_which_only_comes_as_integer': { 393 | 'type': 'number' 394 | } 395 | } 396 | }, 397 | 'key_properties': [] 398 | } 399 | 400 | 401 | class MultiTypeStream(FakeStream): 402 | stream = 'root' 403 | schema = MULTI_TYPE 404 | 405 | def generate_record(self): 406 | value_null = None 407 | value_integer = random.randint(-314159265359, 314159265359) 408 | value_integer_as_number = float(random.randint(-314159265359, 314159265359)) 409 | value_number = random.uniform(-314159265359, 314159265359) 410 | value_boolean = chance.boolean() 411 | value_date_time_string = chance.date(minyear=2012).isoformat() 412 | value_array = [] 413 | for i in range(random.randint(0, 1000)): 414 | value_array.append(random.randint(-314, 314)) 415 | 416 | value_object = {'i': random.randint(-314159265359, 314159265359), 417 | 'n': random.uniform(-314159265359, 314159265359), 418 | 'b': chance.boolean()} 419 | 420 | return { 421 | 'every_type': chance.pickone( 422 | [value_null, 423 | value_integer, 424 | value_integer_as_number, 425 | value_number, 426 | value_boolean, 427 | value_date_time_string, 428 | value_array, 429 | value_object]), 430 | 'number_which_only_comes_as_integer': value_integer 431 | } 432 | 433 | 434 | class TypeChangeStream(FakeStream): 435 | stream = 'root' 436 | 437 | def __init__(self, n, starting_id): 438 | FakeStream.__init__(self, n) 439 | self.starting_id = starting_id 440 | self.changing_literal_type = chance.pickone(['integer', 'number', 'boolean', 'string', 'date-time']) 441 | type_def = {'type': self.changing_literal_type} 442 | 443 | if self.changing_literal_type == 'date-time': 444 | type_def = {'type': 'string', 445 | 'format': 'date-time'} 446 | 447 | print('TypeChangeStream chose:', type_def, 'id starting at:', self.id) 448 | self.schema = { 449 | 'type': 'SCHEMA', 450 | 'stream': 'root', 451 | 'schema': { 452 | 'additionalProperties': False, 453 | 'properties': { 454 | 'id': {'type': 'integer'}, 455 | 'changing_literal_type': type_def 456 | } 457 | }, 458 | 'key_properties': ['id'] 459 | } 460 | 461 | def 
generate_record(self): 462 | value = None 463 | if self.changing_literal_type == 'integer': 464 | value = random.randint(-314159265359, 314159265359) 465 | elif self.changing_literal_type == 'number': 466 | value = chance.pickone([random.uniform(-314159265359, 314159265359), 467 | float(random.randint(-314159265359, 314159265359)), 468 | random.randint(-314159265359, 314159265359)]) 469 | elif self.changing_literal_type == 'boolean': 470 | value = chance.boolean() 471 | elif self.changing_literal_type == 'string': 472 | value = chance.date(minyear=2012).isoformat() 473 | elif self.changing_literal_type == 'date-time': 474 | value = chance.date(minyear=2012).isoformat() 475 | else: 476 | raise Exception('Unknown changing_literal_type: `{}`'.format(self.changing_literal_type)) 477 | 478 | return { 479 | 'id': self.id + self.starting_id, 480 | 'changing_literal_type': value, 481 | } 482 | 483 | 484 | def clear_db(): 485 | with psycopg2.connect(**TEST_DB) as conn: 486 | with conn.cursor() as cur: 487 | cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'") 488 | drop_command = '' 489 | for table in cur.fetchall(): 490 | drop_command += 'DROP TABLE IF EXISTS ' + table[0] + ';' 491 | cur.execute('begin;' + 492 | drop_command + 493 | 'commit;') 494 | 495 | 496 | @pytest.fixture 497 | def db_cleanup(): 498 | clear_db() 499 | 500 | yield 501 | 502 | clear_db() 503 | 504 | 505 | class ListStream: 506 | idx = None 507 | stream = NotImplementedError() 508 | 509 | def __init__(self): 510 | self.idx = -1 511 | 512 | def __iter__(self): 513 | return self 514 | 515 | def __next__(self): 516 | self.idx += 1 517 | 518 | if self.idx < len(self.stream): 519 | return json.dumps(self.stream[self.idx]) 520 | 521 | raise StopIteration 522 | 523 | 524 | class DogStream(CatStream): 525 | stream = 'dogs' 526 | schema = CatStream.schema.copy() 527 | 528 | 529 | DogStream.schema['stream'] = 'dogs' --------------------------------------------------------------------------------
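
A minimal sketch of how these fixture streams behave when iterated, assuming the imports defined at the top of tests/utils/fixtures.py (such as json, arrow, chance and faker, which are outside this excerpt) and an illustrative import path: a FakeStream subclass yields JSON-serialized Singer messages, the SCHEMA message first, then RECORD messages, and finally an ACTIVATE_VERSION message when a version is supplied.

    import json

    # Illustrative import; the actual path depends on how tests/utils is put on sys.path.
    from fixtures import CatStream

    # Emit three cat RECORD messages plus a trailing ACTIVATE_VERSION message.
    stream = CatStream(3, version=1)
    messages = [json.loads(line) for line in stream]

    assert messages[0]['type'] == 'SCHEMA'                     # schema is always written first
    assert all(m['type'] == 'RECORD' for m in messages[1:-1])  # then the generated records
    assert messages[-1]['type'] == 'ACTIVATE_VERSION'          # emitted because version was set
    assert messages[-1]['version'] == 1

The sequence value defaults to arrow.get().int_timestamp unless an explicit sequence is passed, and duplicate() re-emits an already-written record with sequence + duplicate_sequence_delta, so duplicate handling can be exercised through the duplicates count and the force flag.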