├── .circleci └── config.yml ├── .github └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── circleci-local ├── tap-mssql ├── test └── test-db ├── project.clj ├── resources └── log4j.properties ├── spikes ├── 000-notes.md ├── 001-connection-test │ ├── connection-test.py │ └── create_sample_db ├── 002-connection-test-clj │ ├── project.clj │ ├── src │ │ └── connection_test_clj │ │ │ └── core.clj │ └── target │ │ ├── classes │ │ └── META-INF │ │ │ └── maven │ │ │ └── connection-test-clj │ │ │ └── connection-test-clj │ │ │ └── pom.properties │ │ └── stale │ │ └── extract-native.dependencies ├── 003-dev-circle-testing-options │ ├── .lein-repl-history │ ├── 000-notes.md │ ├── project.clj │ ├── src │ │ └── dev_circle_testing_options │ │ │ └── core.clj │ └── target │ │ ├── classes │ │ └── META-INF │ │ │ └── maven │ │ │ └── dev-circle-testing-options │ │ │ └── dev-circle-testing-options │ │ │ └── pom.properties │ │ └── stale │ │ └── extract-native.dependencies ├── 004-using-mssql-cli │ └── 000-notes.md ├── 005-central-concept │ ├── 000-notes.md │ └── tap-const │ │ ├── .gitignore │ │ ├── project.clj │ │ └── src │ │ └── tap_const │ │ ├── core.clj │ │ ├── in_situ_println_with_state_managed_by_paginator.clj │ │ ├── in_situ_println_with_state_managed_by_record.clj │ │ ├── lazy_seq.clj │ │ ├── test_in_situ_println_with_state_managed_by_paginator.clj │ │ ├── test_in_situ_println_with_state_managed_by_record.clj │ │ └── test_lazy_seq.clj ├── 006-memory-management │ └── 000-notes.md ├── 006-permissions │ ├── 000-notes.md │ ├── tap-const │ │ ├── .gitignore │ │ ├── project.clj │ │ └── src │ │ │ └── tap_const │ │ │ ├── core.clj │ │ │ ├── in_situ_println_with_state_managed_by_paginator.clj │ │ │ ├── in_situ_println_with_state_managed_by_record.clj │ │ │ ├── lazy_seq.clj │ │ │ ├── test_in_situ_println_with_state_managed_by_paginator.clj │ │ │ ├── test_in_situ_println_with_state_managed_by_record.clj │ │ │ └── test_lazy_seq.clj │ └── test-db 
├── 007-full-table-syncing-strategy │ ├── 000-notes.md │ └── test-db ├── 008-how-to-read-cdc-log │ ├── 000-notes.md │ ├── tap-const │ │ ├── .gitignore │ │ ├── project.clj │ │ └── src │ │ │ └── tap_const │ │ │ └── core.clj │ └── test-db ├── 009-lockdown-config │ └── 000-notes.md ├── 010-permissions │ └── 000-notes.md └── 011-initial-full-table-to-cdc-transition-strategy │ ├── 000-notes.md │ ├── tap-const │ ├── .gitignore │ ├── project.clj │ └── src │ │ └── tap_const │ │ └── core.clj │ └── test-db ├── src └── tap_mssql │ ├── catalog.clj │ ├── config.clj │ ├── core.clj │ ├── serialized_catalog.clj │ ├── singer │ ├── bookmarks.clj │ ├── fields.clj │ ├── messages.clj │ ├── parse.clj │ ├── schema.clj │ └── transform.clj │ ├── sync_strategies │ ├── common.clj │ ├── full.clj │ ├── incremental.clj │ └── logical.clj │ └── utils.clj ├── test └── tap_mssql │ ├── core_sync_test.clj │ ├── core_test.clj │ ├── discover_config_test.clj │ ├── discover_empty_catalog_test.clj │ ├── discover_permissions_test.clj │ ├── discover_populated_catalog_datatyping_test.clj │ ├── discover_populated_catalog_metadata_test.clj │ ├── discover_populated_catalog_test.clj │ ├── discover_populated_catalog_with_multiple_non_system_databases_test.clj │ ├── discover_sync_precision_test.clj │ ├── discover_tables_in_schema_test.clj │ ├── messages_test.clj │ ├── sync_full_table_test.clj │ ├── sync_incremental_test.clj │ ├── sync_interruptible_full_table_test.clj │ ├── sync_log_based_test.clj │ ├── sync_strategies │ └── common_test.clj │ ├── test_utils.clj │ └── try_read_only_utils_test.clj └── tests ├── base.py ├── database.py ├── spec.py ├── test_automatic_fields.py ├── test_discovery_data_types.py ├── test_discovery_multiple_dbs.py ├── test_discovery_names.py ├── test_discovery_pks.py ├── test_discovery_unsupported_pks.py ├── test_full_replication.py ├── test_full_table_interrupted.py ├── test_log_based_interruped_replication.py ├── test_log_based_missing_user_permissions.py ├── 
test_missing_user_select_permission.py ├── test_mssql_drop_multiple_tables.py ├── test_mssql_log_based_no_pk.py ├── test_saas_stream.py ├── test_sync_full_datetime.py ├── test_sync_full_decimal.py ├── test_sync_full_float.py ├── test_sync_full_integers.py ├── test_sync_full_multiple_dbs.py ├── test_sync_full_names.py ├── test_sync_full_others.py ├── test_sync_full_pks.py ├── test_sync_full_strings.py ├── test_sync_incremental_datetime.py ├── test_sync_incremental_decimal.py ├── test_sync_incremental_float.py ├── test_sync_incremental_integers.py ├── test_sync_incremental_others.py ├── test_sync_incremental_pks.py ├── test_sync_logical_current_log_version_null.py ├── test_sync_logical_datetime.py ├── test_sync_logical_decimal.py ├── test_sync_logical_float.py ├── test_sync_logical_integers.py ├── test_sync_logical_min_valid_version_null.py ├── test_sync_logical_multiple_dbs.py ├── test_sync_logical_names.py ├── test_sync_logical_others.py ├── test_sync_logical_pks.py ├── test_sync_logical_rowversion.py ├── test_sync_logical_view.py ├── test_table_reset_incremental.py └── test_table_reset_logical.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | slack: circleci/slack@3.4.2 4 | 5 | jobs: 6 | build: 7 | docker: 8 | - image: clojure:lein-2.5.3 9 | - image: mcr.microsoft.com/mssql/server:2022-CU18-ubuntu-22.04 10 | environment: 11 | ACCEPT_EULA: Y 12 | SA_PASSWORD: Password1! 
13 | steps: 14 | - checkout 15 | - restore_cache: 16 | key: dependency-cache-{{ checksum "project.clj" }} 17 | - run: 18 | name: 'Install Dockerize' 19 | command: wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 20 | environment: 21 | DOCKERIZE_VERSION: v0.3.0 22 | - run: 23 | name: 'Wait for MSSQL Docker' 24 | command: | 25 | dockerize -wait tcp://127.0.0.1:1433 -timeout 1m 26 | sleep 5 27 | - run: 28 | name: Test 29 | command: | 30 | bin/test 31 | - save_cache: 32 | key: dependency-cache-{{ checksum "project.clj" }} 33 | paths: 34 | - /root/.m2 35 | - slack/notify-on-failure: 36 | only_for_branches: master 37 | 38 | tap_tester: 39 | docker: 40 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-clj 41 | - image: mcr.microsoft.com/mssql/server:2022-CU18-ubuntu-22.04 42 | environment: 43 | ACCEPT_EULA: Y 44 | SA_PASSWORD: Password1! 
45 | steps: 46 | - checkout 47 | - run: 48 | name: 'Install Dockerize' 49 | command: wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 50 | environment: 51 | DOCKERIZE_VERSION: v0.3.0 52 | - run: 53 | name: 'Wait for MSSQL Docker' 54 | command: | 55 | dockerize -wait tcp://127.0.0.1:1433 -timeout 1m 56 | sleep 5 57 | - run: 58 | name: 'Tap Tester' 59 | command: | 60 | cd /root/project 61 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh 62 | source dev_env.sh 63 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-mssql/tap_tester_sandbox tap-mssql.env 64 | source tap-mssql.env 65 | lein deps 66 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 67 | pip install numpy==1.21.2 68 | mkdir /tmp/${CIRCLE_PROJECT_REPONAME} 69 | export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} 70 | run-test --tap=/root/project/bin/tap-mssql tests 71 | - slack/notify-on-failure: 72 | only_for_branches: master 73 | - store_artifacts: 74 | path: /tmp/tap-mssql 75 | 76 | workflows: 77 | version: 2 78 | build_and_test: &commit_jobs 79 | jobs: 80 | - build: 81 | context: 82 | - circleci-user 83 | - tier-1-tap-user 84 | - tap_tester: 85 | context: 86 | - circleci-user 87 | - tier-1-tap-user 88 | requires: 89 | - build 90 | build_daily: 91 | <<: *commit_jobs 92 | triggers: 93 | - schedule: 94 | cron: "0 1 * * *" 95 | filters: 96 | branches: 97 | only: 98 | - master 99 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description of change 2 | (write a short description here or paste a link to JIRA) 3 | 4 | # QA steps 5 | - [ ] automated tests 
passing 6 | - [ ] manual qa steps passing (list below) 7 | 8 | # Risks 9 | 10 | # Rollback steps 11 | - revert this branch 12 | 13 | #### AI generated code 14 | https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code 15 | - [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .nrepl-port 2 | .lein-failures 3 | target/ 4 | todo.org 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 1.8.0 4 | * Retry logic for the Deadlock error [#91](https://github.com/singer-io/tap-mssql/pull/91) 5 | 6 | ## 1.7.0 7 | * Remove `maxLength` from the catalog [#88](https://github.com/singer-io/tap-mssql/pull/88) 8 | 9 | ## 1.6.13 10 | * Removes rowversion and replication keys as valid interruptible bookmarks for logical replication, ensuring the use of a Primary Key 11 | * Adds additional check for valid Primary Key before beginning logical replication rather than after completing initial historical replication 12 | * Updates full table interruptible bookmark key priority to optimize index efficiency by using PK if available before falling back to rowversion if available 13 | * Adds additional tests for full-table interruptible 14 | * [#86](https://github.com/singer-io/tap-mssql/pull/86) 15 | 16 | ## 1.6.12 17 | * Update singer-clojure and logging dependencies [#84](https://github.com/singer-io/tap-mssql/pull/84) 18 | 19 | ## 1.6.11 20 | * Fix RowVersion states for logical syncs [#76](https://github.com/singer-io/tap-mssql/pull/76) 21 | 22 | ## 1.6.10 23 | * Update `nrepl` server binding from `0.0.0.0` to `127.0.0.1` [#62](https://github.com/singer-io/tap-mssql/pull/62) 24 | 
25 | ## 1.6.9 26 | * Quote the `ORDER BY` columns in incremental queries [#59](https://github.com/singer-io/tap-mssql/pull/59) 27 | 28 | ## 1.6.8 29 | * Attempt readonly connections on `jdbc/reducible-query` and `jdbc/query` calls in Full Table and Incremental Syncs [#58](https://github.com/singer-io/tap-mssql/pull/58) 30 | 31 | ## 1.6.7 32 | * Apply approaches from 1.6.5 and 1.6.6 to only try `ApplicationIntent=ReadOnly` for query-based connections, and fall-back to not read only if the check fails. [#55](https://github.com/singer-io/tap-mssql/pull/55) 33 | 34 | ## 1.6.6 35 | * Handle cases where `ApplicationIntent=ReadOnly` is not doable for log-based sync's initial full table [#53](https://github.com/singer-io/tap-mssql/pull/53) 36 | 37 | ## 1.6.5 38 | * Only adds `ApplicationIntent=ReadOnly` to query-based connections due to an issue with Change Tracking and secondary read replicas not supporting it [#52](https://github.com/singer-io/tap-mssql/pull/52) 39 | 40 | ## 1.6.4 41 | * Adds `ApplicationIntent=ReadOnly` to the connection string [#50](https://github.com/singer-io/tap-mssql/pull/50) 42 | 43 | ## 1.6.3 44 | * Properly check whether change tracking is enabled on a DB in Azure [#41](https://github.com/singer-io/tap-mssql/pull/41) 45 | 46 | ## 1.6.2 47 | * Warn on permissions errors when discovering schemas if the user doesn't have access [#33](https://github.com/singer-io/tap-mssql/pull/33) 48 | 49 | ## 1.6.1 50 | * Fix bugs with Views being interrupted during a full table sync [#28](https://github.com/singer-io/tap-mssql/pull/28) 51 | 52 | ## 1.6.0 53 | * Configure ResultSet options for concurrency mode and cursor type [#25](https://github.com/singer-io/tap-mssql/pull/25) 54 | 55 | ## 1.5.2 56 | * Clarify error messages in edge case when change tracking is not available [#24](https://github.com/singer-io/tap-mssql/pull/24) 57 | * Fix edge case where two identically named tables in different schemas have different change tracking status 
[#24](https://github.com/singer-io/tap-mssql/pull/24) 58 | 59 | ## 1.5.1 60 | * Fix issue where some datetime types were having issues when approaching year 0. [#22](https://github.com/singer-io/tap-mssql/pull/22) 61 | 62 | ## 1.5.0 63 | * During query generation surround schema names with square brackets to allow reserved words in schemas. [#20](https://github.com/singer-io/tap-mssql/pull/20) 64 | 65 | ## 1.4.5 66 | * Added a fallback value for `_sdc_deleted_at` when running a log based sync. 67 | * The tap also logs when this happens. 68 | * [#18](https://github.com/singer-io/tap-mssql/pull/18) 69 | 70 | ## 1.4.4 71 | * Fixes a bug where during discovery, for columns of type `binary` the tap was writing the schema as a string, but not transforming the data to a string, instead emitting it as a byte array. [#16](https://github.com/singer-io/tap-mssql/pull/16) 72 | 73 | ## 1.4.3 74 | * Fix a bug where timestamp column bookmarks cause an exception when resuming full-table [#15](https://github.com/singer-io/tap-mssql/pull/15) 75 | 76 | ## 1.4.2 77 | * Extract database from config replacing it with an empty string if it is nil [#13](https://github.com/singer-io/tap-mssql/pull/13) 78 | 79 | ## 1.4.1 80 | * Removed max length from binary-type JSON schemas [#8](https://github.com/singer-io/tap-mssql/pull/8/) 81 | 82 | ## 1.4.0 83 | * Add support for connecting to Named Instances by omitting port and adding the instance name in the host field [#4](https://github.com/singer-io/tap-mssql/pull/4) 84 | 85 | ## 1.3.1 86 | * Fix discovery query to correctly type `sys.partitions.rows` as `bigint` [#5](https://github.com/singer-io/tap-mssql/pull/5) 87 | 88 | ## 1.3.0 89 | * Add support for money and smallmoney columns [#1](https://github.com/singer-io/tap-mssql/pull/1) 90 | 91 | ## 1.2.3 92 | * Fix the sql query generation for full table interruptible syncs with composite pks [#53](https://github.com/stitchdata/tap-mssql/pull/53) 93 | 94 | ## 1.2.2 95 | * Fix a bug with a View's 
key_properties being improperly set [#51](https://github.com/stitchdata/tap-mssql/pull/51) 96 | * Add an assertion to ensure log-based replication has the primary keys needed to replicate [#50](https://github.com/stitchdata/tap-mssql/pull/50) 97 | 98 | ## 1.2.1 99 | * Make _sdc_deleted_at nullable [commit](https://github.com/stitchdata/tap-mssql/commit/e95170bab642da301346cdf56485f8778d32ad2b) 100 | 101 | ## 1.2.0 102 | * Add support for datetime2, datetimeoffset, and smalldatetime [#49](https://github.com/stitchdata/tap-mssql/pull/49) 103 | 104 | ## 1.1.0 105 | * Add support for numeric and decimal identity columns [#48](https://github.com/stitchdata/tap-mssql/pull/48) 106 | 107 | ## 1.0.0 108 | * GA Release 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tap-mssql 2 | 3 | [![CircleCI](https://circleci.com/gh/singer-io/tap-mssql.svg?style=svg)](https://circleci.com/gh/singer-io/tap-mssql) 4 | 5 | [Singer](https://www.singer.io/) tap that extracts data from a [Microsoft SQL Server (MSSQL)](https://www.microsoft.com/en-us/sql-server/default.aspx) database and produces JSON-formatted data following the [Singer spec](https://github.com/singer-io/getting-started/blob/master/docs/SPEC.md). 6 | 7 | ## Requirements 8 | 9 | This tap is written in Clojure, and as such, requires the JVM. It has been consistently tested to run using `OpenJDK 8`, which can be installed on Ubuntu using these commands. 10 | 11 | ``` 12 | apt-get update && apt-get install -y openjdk-8-jdk 13 | ``` 14 | 15 | Associated tooling required to use the scripts in this repository follow. 
(Running the latest versions) 16 | 17 | - [**Leiningen**](https://leiningen.org/) 18 | - [**Docker (for integration tests)**](https://www.docker.com/) 19 | - [**MSSQL CLI (to connect to test database)**](https://docs.microsoft.com/en-us/sql/tools/mssql-cli?view=sql-server-2017) 20 | 21 | ## Quick Start 22 | 23 | ``` 24 | $ bin/tap-mssql --config config.json --discover > catalog.json 25 | $ bin/tap-mssql --config config.json --catalog catalog.json --state state.json | target... 26 | ``` 27 | 28 | ## Usage 29 | 30 | In the `bin` folder, there are a few utility scripts to simplify interacting with this tap. Many of these scripts rely on some environment variables being set, see "Testing Infrastructure Design" for more information. 31 | 32 | **bin/tap-mssql** - This script wraps the `lein` command to run the tap from source code. It is analogous to the command installed by setuptools in Python taps. 33 | 34 | As this is a Clojure tap, it supports a non-standard mode of operation by passing the `--repl` flag. This will start an NREPL server and log the port that it is running on to connect from an IDE for REPL driven development. It is compatible with all other command-line arguments, or can be used on its own. If the tap is invoked in discovery or sync mode along with `--repl`, the process will be kept alive after the usual Singer process is completed. 35 | 36 | ``` 37 | Example: 38 | # Discovery 39 | $ bin/tap-mssql --config config.json --discover > catalog.json 40 | 41 | # Sync 42 | $ bin/tap-mssql --config config.json --catalog catalog.json --state state.json 43 | 44 | # REPL Mode 45 | $ bin/tap-mssql --config config.json --repl 46 | ``` 47 | 48 | **bin/test** - This script wraps `lein test` in order to run the Clojure unit and integration tests against a database running locally. 
49 | 50 | ``` 51 | Example: 52 | $ bin/test 53 | ``` 54 | 55 | **bin/test-db** - This script uses docker to run a SQL Server container locally that can be used to run the unit tests against. See the usage text for more information. 56 | 57 | Note: It also depends on the `mssql-cli` tool being installed in order to use the `connect` option. 58 | 59 | To install `mssql-cli`: 60 | - create a virtualenv 61 | - source your new virtualenv 62 | - `pip install mssql-cli` 63 | 64 | Before running `bin/test-db connect`, just make sure your virtualenv is sourced. 65 | 66 | ``` 67 | Example: 68 | $ bin/test-db start 69 | $ bin/test-db connect 70 | $ bin/test-db stop 71 | ``` 72 | 73 | **bin/circleci-local** - This script wraps the [`circleci` CLI tool](https://circleci.com/docs/2.0/local-cli/) to run the Clojure unit and integration tests in the way CircleCI does, on localhost. 74 | 75 | ``` 76 | Example: 77 | $ bin/circleci-local 78 | ``` 79 | 80 | ## Testing Infrastructure Design 81 | 82 | Each actor (developer, CI, etc.) needs their own testing infrastructure so 83 | that development can proceed and be verified independently of each other. 84 | In order to provide this isolation, we've migrated towards a Docker-based 85 | solution. 86 | 87 | A script, `bin/test-db` has been provided that will honor several 88 | environment variables and manage the container required by the development 89 | and testing. 90 | 91 | The environment variables are: 92 | 93 | | name | description | 94 | | --- | --- | 95 | | `STITCH_TAP_MSSQL_TEST_DATABASE_USER` | The admin user that should be used to connect to the test database (for docker, this is SA) | 96 | | `STITCH_TAP_MSSQL_TEST_DATABASE_PASSWORD` | The password for the user (if docker, the SA user will be configured with this password) | 97 | | `STITCH_TAP_MSSQL_TEST_DATABASE_PORT` | The port for hosting the server. 
(Default 1433)| 98 | 99 | To interact with the container, these commands are available: 100 | 101 | `bin/test-db start` - Starts the container under the name `sql1` 102 | 103 | `bin/test-db connect` - Uses `mssql-cli` to open a shell to the local MSSQL instance 104 | 105 | `bin/test-db stop` - Tears down and removes the container 106 | 107 | **Note:** There is no volume binding, so all of the data and state in the 108 | running container is entirely ephemeral 109 | 110 | ## Observed error messages: 111 | 112 | ``` 113 | # Bad Host Message 114 | 115 | The TCP/IP connection to the host charnock.org, port 51552 has failed. 116 | Error: "connect timed out. Verify the connection properties. Make sure 117 | that an instance of SQL Server is running on the host and accepting 118 | TCP/IP connections at the port. Make sure that TCP connections to the 119 | port are not blocked by a firewall.". 120 | 121 | # Unspecified azure server error message 122 | 123 | Cannot open server "127.0.0.1" requested by the login. The login 124 | failed. ClientConnectionId:33b6ae38-254a-483b-ba24-04d69828fe0c 125 | 126 | 127 | # Bad dbname error message 128 | 129 | Login failed for user 'foo'. 130 | ClientConnectionId:4c47c255-a330-4bc9-94bd-039c592a8a31 131 | 132 | # Database does not exist 133 | 134 | Cannot open database "foo" requested by the login. The login 135 | failed. 
ClientConnectionId:f6e2df79-1d72-4df3-8c38-2a9e7a349003 136 | ``` 137 | 138 | --- 139 | 140 | Copyright © 2019 Stitch 141 | -------------------------------------------------------------------------------- /bin/circleci-local: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z $STITCH_TAP_MSSQL_TEST_DATABASE_USER ]] \ 4 | || [[ -z $STITCH_TAP_MSSQL_TEST_DATABASE_PASSWORD ]] 5 | then 6 | printf 'Missing required environment variables\n' >&2 7 | exit 1 8 | fi 9 | 10 | circleci local execute \ 11 | -e "STITCH_TAP_MSSQL_TEST_DATABASE_USER=${STITCH_TAP_MSSQL_TEST_DATABASE_USER}" \ 12 | -e "STITCH_TAP_MSSQL_TEST_DATABASE_PASSWORD=${STITCH_TAP_MSSQL_TEST_DATABASE_PASSWORD}" 13 | -------------------------------------------------------------------------------- /bin/tap-mssql: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ $HOSTNAME = taps-* ]] && [[ $1 == --repl ]] 4 | then 5 | source /etc/default/tap-mssql 6 | if [[ -z $STITCH_TAP_MSSQL_TEST_DATABASE_USER ]] \ 7 | || [[ -z $STITCH_TAP_MSSQL_TEST_DATABASE_PASSWORD ]] 8 | then 9 | printf 'Missing required environment variables. 
Have you source /etc/default/tap-mssql?\n' >&2 10 | exit 1 11 | fi 12 | fi 13 | 14 | cd "${0%/*}" 15 | 16 | lein run -m tap-mssql.core "$@" 17 | -------------------------------------------------------------------------------- /bin/test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z $STITCH_TAP_MSSQL_TEST_DATABASE_USER ]] \ 4 | || [[ -z $STITCH_TAP_MSSQL_TEST_DATABASE_PASSWORD ]] 5 | then 6 | cat < select @@version 57 | # 2> go 58 | 59 | # ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 60 | # Microsoft SQL Server 2017 (RTM-CU13) (KB4466404) - 14.0.3048.4 (X64) 61 | # Nov 30 2018 12:57:58 62 | # Copyright (C) 2017 Microsoft Corporation 63 | # Developer Edition (64-bit) on Linux (Ubuntu 16.04.4 LTS) 64 | 65 | # (1 rows affected) 66 | 67 | import pyodbc 68 | server = 'localhost' 69 | database = 'SampleDB' 70 | username = 'sa' 71 | password = 'password123!' 72 | cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';PORT=1443;DATABASE='+database+';UID='+username+';PWD='+ password) 73 | cursor = cnxn.cursor() 74 | 75 | print ('Inserting a new row into table') 76 | #Insert Query 77 | tsql = "INSERT INTO Employees (Name, Location) VALUES (?,?);" 78 | with cursor.execute(tsql,'Jake','United States'): 79 | print ('Successfuly Inserted!') 80 | 81 | 82 | #Update Query 83 | print ('Updating Location for Nikita') 84 | tsql = "UPDATE Employees SET Location = ? WHERE Name = ?" 85 | with cursor.execute(tsql,'Sweden','Nikita'): 86 | print ('Successfuly Updated!') 87 | 88 | 89 | #Delete Query 90 | print ('Deleting user Jared') 91 | tsql = "DELETE FROM Employees WHERE Name = ?" 
92 | with cursor.execute(tsql,'Jared'): 93 | print ('Successfuly Deleted!') 94 | 95 | 96 | #Select Query 97 | print ('Reading data from table') 98 | tsql = "SELECT Name, Location FROM Employees;" 99 | with cursor.execute(tsql): 100 | row = cursor.fetchone() 101 | while row: 102 | print (str(row[0]) + " " + str(row[1])) 103 | row = cursor.fetchone() 104 | -------------------------------------------------------------------------------- /spikes/001-connection-test/create_sample_db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sqlcmd -S localhost -U sa -P 'password123!' -Q "CREATE DATABASE SampleDB;" 4 | sqlcmd -S localhost -U sa -P 'password123!' -Q "USE SampleDB; CREATE TABLE Employees (Id INT IDENTITY(1,1) NOT NULL PRIMARY KEY, Name NVARCHAR(50), Location NVARCHAR(50));" 5 | sqlcmd -S localhost -U sa -P 'password123!' -Q "USE SampleDB; INSERT INTO Employees (Name, Location) VALUES (N'Jared', N'Australia'), (N'Nikita', N'India'), (N'Tom', N'Germany');" 6 | -------------------------------------------------------------------------------- /spikes/002-connection-test-clj/project.clj: -------------------------------------------------------------------------------- 1 | (defproject connection-test-clj "0.0.1-SNAPSHOT" 2 | :dependencies [[org.clojure/clojure "1.10.0"] 3 | [org.clojure/java.jdbc "0.7.9"] 4 | [com.microsoft.sqlserver/mssql-jdbc "7.2.1.jre8"]] 5 | :profiles {:system {:java-cmd "/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java"}}) 6 | -------------------------------------------------------------------------------- /spikes/002-connection-test-clj/src/connection_test_clj/core.clj: -------------------------------------------------------------------------------- 1 | ;;; For https://stitchdata.atlassian.net/browse/SRCE-515 2 | 3 | (ns connection-test-clj.core 4 | (:require [clojure.java.jdbc :as jdbc])) 5 | 6 | (def db-spec 7 | {:dbtype "sqlserver" 8 | :dbname "SampleDB" 9 | :user "sa" 10 | :password 
"password123!"}) 11 | 12 | (jdbc/query db-spec ["select @@version"]) 13 | ;; ({: "Microsoft SQL Server 2017 (RTM-CU13) (KB4466404) - 14.0.3048.4 (X64) \n\tNov 30 2018 12:57:58 \n\tCopyright (C) 2017 Microsoft Corporation\n\tDeveloper Edition (64-bit) on Linux (Ubuntu 16.04.4 LTS)"}) 14 | -------------------------------------------------------------------------------- /spikes/002-connection-test-clj/target/classes/META-INF/maven/connection-test-clj/connection-test-clj/pom.properties: -------------------------------------------------------------------------------- 1 | #Leiningen 2 | #Thu Apr 18 17:42:54 UTC 2019 3 | version=0.0.1-SNAPSHOT 4 | groupId=connection-test-clj 5 | artifactId=connection-test-clj 6 | -------------------------------------------------------------------------------- /spikes/002-connection-test-clj/target/stale/extract-native.dependencies: -------------------------------------------------------------------------------- 1 | ([:dependencies ([cider/cider-nrepl "0.17.0"] [org.clojure/tools.nrepl "0.2.13" :exclusions [org.clojure/clojure]] [org.clojure/clojure "1.10.0"] [org.clojure/java.jdbc "0.7.9"] [com.microsoft.sqlserver/mssql-jdbc "7.2.1.jre8"] [org.clojure/tools.nrepl "0.2.10" :exclusions [org.clojure/clojure]] [clojure-complete "0.2.3" :exclusions [org.clojure/clojure]] [criterium/criterium "0.4.4"] [org.clojure/tools.trace "0.7.9"])]) -------------------------------------------------------------------------------- /spikes/003-dev-circle-testing-options/.lein-repl-history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singer-io/tap-mssql/1769d799d0c16572a7efdc00d0d2d4a20eb80ebe/spikes/003-dev-circle-testing-options/.lein-repl-history -------------------------------------------------------------------------------- /spikes/003-dev-circle-testing-options/000-notes.md: -------------------------------------------------------------------------------- 1 | Dev/Circle/tap-tester 
mssql instance options 2 | ============================================ 3 | 4 | Investigate options for getting test instances for mssql. 5 | 6 | Primary options (in order of preference) are: 7 | 8 | 1. Cloud Hosted 9 | 1. Docker 10 | 1. Linux Installation 11 | 12 | Requirements 13 | ------------ 14 | 15 | - Allow running all supported versions 16 | - Allow turning CDC on/off 17 | 18 | AWS RDS 19 | ------- 20 | 21 | From 22 | [the docs](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.SQLServer.CommonDBATasks.CDC.html) 23 | 24 | > Using Change Data Capture 25 | > 26 | > Amazon RDS supports change data capture (CDC) for your DB instances 27 | > running Microsoft SQL Server. CDC captures changes that are made to the 28 | > data in your tables. It stores metadata about each change, which you can 29 | > access later. For more information about how CDC works, see Change Data 30 | > Capture in the Microsoft documentation. 31 | > 32 | > Before you use CDC with your Amazon RDS DB instances, enable it in the 33 | > database by running `msdb.dbo.rds_cdc_enable_db`. After CDC is enabled, 34 | > any user who is `db_owner` of that database can enable or disable CDC on 35 | > tables in that database. 36 | > 37 | > **Important** 38 | > 39 | > During restores, CDC will be disabled. All of the related metadata is 40 | > automatically removed from the database. This applies to snapshot 41 | > restores, point-in-time restores, and SQL Server Native restores from S3. 42 | > After performing one of these types of restores, you can re-enable CDC and 43 | > re-specify tables to track. 44 | > 45 | > ``` 46 | > --Enable CDC for RDS DB Instance 47 | > exec msdb.dbo.rds_cdc_enable_db '' 48 | > ``` 49 | > 50 | > To disable CDC, `msdb.dbo.rds_cdc_disable_db` run . 51 | > 52 | > ``` 53 | > --Disable CDC for RDS DB Instance 54 | > exec msdb.dbo.rds_cdc_disable_db '' 55 | > ``` 56 | 57 | It looks to be impossible to set up CDC on an express edition DB. 
58 | 59 | ``` 60 | vagrant@taps-tvisher1:~$ mssql-cli -U spike_tap_mssql -P spike_tap_mssql -S spike-tap-mssql.cqaqbfvfo67k.us-east-1.rds.amazonaws.com 61 | Version: 0.15.0 62 | Mail: sqlcli@microsoft.com 63 | Home: http://github.com/dbcli/mssql-cli 64 | master> \d 65 | OBJECT is required. Usage '\d OBJECT'. 66 | 67 | Time: 0.000s 68 | master> \ld 69 | +----------+ 70 | | name | 71 | |----------| 72 | | master | 73 | | tempdb | 74 | | model | 75 | | msdb | 76 | | rdsadmin | 77 | +----------+ 78 | (5 rows affected) 79 | Time: 1.750s (a second) 80 | master> create database "spike_tap_mssql" 81 | Commands completed successfully. 82 | Time: 2.157s (2 seconds) 83 | master> exec msdb.dbo.rds_cdc_enable_db 'spike_tap_mssql' 84 | Msg 50000, Level 16, State 1, Procedure msdb.dbo.rds_cdc_enable_db, Line 70 85 | This instance of SQL Server is the Express Edition (64-bit). Change data capture is only available in the Enterprise, Developer, Enterprise Evaluation, and Standard editions. 86 | Time: 1.131s (a second) 87 | ``` 88 | 89 | We can create the following server editions in RDS: 90 | 91 | > SQL Server Express Edition 92 | > Affordable database management system that supports database sizes up to 10 GiB. 93 | > 94 | > SQL Server Web Edition 95 | > In accordance with Microsoft's licensing policies, it can only be used to support public and Internet-accessible webpages, websites, web applications, and web services. 96 | > 97 | > SQL Server Standard Edition 98 | > Core data management and business intelligence capabilities for mission-critical applications and mixed workloads. 99 | > 100 | > SQL Server Enterprise Edition 101 | > Comprehensive high-end capabilities for mission-critical applications with demanding database workloads and business intelligence requirements. 102 | 103 | See [the 104 | docs](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/SQLServer.Concepts.General.Licensing.html) 105 | for further information. 
106 | 107 | Presumably this indicates successfully enabling CDC in RDS (against a 108 | 'Standard Edition' database). 109 | 110 | ``` 111 | vagrant@taps-tvisher1:~$ mssql-cli -U spike_tap_mssql -P spike_tap_mssql -S spike-tap-mssql-2.cqaqbfvfo67k.us-east-1.rds.amazonaws.com 112 | Version: 0.15.0 113 | Mail: sqlcli@microsoft.com 114 | Home: http://github.com/dbcli/mssql-cli 115 | master> create database "spike_tap_mssql" 116 | Commands completed successfully. 117 | Time: 0.359s 118 | master> exec msdb.dbo.rds_cdc_enable_db 'spike_tap_mssql' 119 | CDC enabled on database spike_tap_mssql 120 | Time: 0.512s 121 | master> 122 | ``` 123 | 124 | Based on testing RDS works so we'll go with that since it was our 125 | preferred option. 126 | -------------------------------------------------------------------------------- /spikes/003-dev-circle-testing-options/project.clj: -------------------------------------------------------------------------------- 1 | (defproject dev-circle-testing-options "0.0.1-SNAPSHOT" 2 | :dependencies [[org.clojure/clojure "1.10.0"] 3 | [org.clojure/java.jdbc "0.7.9"] 4 | [com.microsoft.sqlserver/mssql-jdbc "7.2.1.jre8"]] 5 | :profiles {:system {:java-cmd "/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java"}}) 6 | -------------------------------------------------------------------------------- /spikes/003-dev-circle-testing-options/src/dev_circle_testing_options/core.clj: -------------------------------------------------------------------------------- 1 | (ns dev-circle-testing-options 2 | (:require [clojure.java.jdbc :as jdbc])) 3 | 4 | (def db-spec 5 | {:dbtype "sqlserver" 6 | :dbname "spike_tap_mssql" 7 | :host "spike-tap-mssql-2.cqaqbfvfo67k.us-east-1.rds.amazonaws.com" 8 | :user "spike_tap_mssql" 9 | :password "spike_tap_mssql"}) 10 | 11 | (jdbc/query db-spec ["select @@version"]) 12 | ;; ({: "Microsoft SQL Server 2017 (RTM-CU13-OD) (KB4483666) - 14.0.3049.1 (X64) \n\tDec 15 2018 11:16:42 \n\tCopyright (C) 2017 Microsoft Corporation\n\tStandard 
Edition (64-bit) on Windows Server 2016 Datacenter 10.0 (Build 14393: ) (Hypervisor)\n"}) 13 | 14 | (.getDatabaseMajorVersion (.getMetaData (jdbc/get-connection db-spec))) 15 | ;; 14 16 | (.getDatabaseMinorVersion (.getMetaData (jdbc/get-connection db-spec))) 17 | ;; 0 18 | -------------------------------------------------------------------------------- /spikes/003-dev-circle-testing-options/target/classes/META-INF/maven/dev-circle-testing-options/dev-circle-testing-options/pom.properties: -------------------------------------------------------------------------------- 1 | #Leiningen 2 | #Thu Apr 18 17:44:07 UTC 2019 3 | version=0.0.1-SNAPSHOT 4 | groupId=dev-circle-testing-options 5 | artifactId=dev-circle-testing-options 6 | -------------------------------------------------------------------------------- /spikes/003-dev-circle-testing-options/target/stale/extract-native.dependencies: -------------------------------------------------------------------------------- 1 | ([:dependencies ([cider/cider-nrepl "0.17.0"] [org.clojure/tools.nrepl "0.2.13" :exclusions [org.clojure/clojure]] [org.clojure/clojure "1.10.0"] [org.clojure/java.jdbc "0.7.9"] [com.microsoft.sqlserver/mssql-jdbc "7.2.1.jre8"] [org.clojure/tools.nrepl "0.2.10" :exclusions [org.clojure/clojure]] [clojure-complete "0.2.3" :exclusions [org.clojure/clojure]] [criterium/criterium "0.4.4"] [org.clojure/tools.trace "0.7.9"])]) -------------------------------------------------------------------------------- /spikes/004-using-mssql-cli/000-notes.md: -------------------------------------------------------------------------------- 1 | https://github.com/dbcli/mssql-cli seems like a useful tool for 2 | interacting with mssql instances from a linux machine. 3 | 4 | I was able to install it from microsoft's apt repo. 
5 | 6 | ``` 7 | vagrant@taps-tvisher1:~$ apt-cache policy mssql-cli 8 | mssql-cli: 9 | Installed: 0.15.0-1 10 | Candidate: 0.15.0-1 11 | Version table: 12 | *** 0.15.0-1 500 13 | 500 https://packages.microsoft.com/ubuntu/16.04/prod xenial/main amd64 Packages 14 | 100 /var/lib/dpkg/status 15 | 0.14.0-1 500 16 | 500 https://packages.microsoft.com/ubuntu/16.04/prod xenial/main amd64 Packages 17 | 0.13.0-1 500 18 | 500 https://packages.microsoft.com/ubuntu/16.04/prod xenial/main amd64 Packages 19 | 0.12.0-1 500 20 | 500 https://packages.microsoft.com/ubuntu/16.04/prod xenial/main amd64 Packages 21 | 0.11.0-1 500 22 | 500 https://packages.microsoft.com/ubuntu/16.04/prod xenial/main amd64 Packages 23 | 0.10.0.dev1804041738-1 500 24 | 500 https://packages.microsoft.com/ubuntu/16.04/prod xenial/main amd64 Packages 25 | 0.10.0-1 500 26 | 500 https://packages.microsoft.com/ubuntu/16.04/prod xenial/main amd64 Packages 27 | ``` 28 | 29 | From https://github.com/dbcli/mssql-cli/blob/master/doc/installation/linux.md#ubuntu-1604 30 | 31 | ``` 32 | # Import the public repository GPG keys 33 | wget -qO- https://packages.microsoft.com/keys/microsoft.asc | sudo apt-key add - 34 | 35 | # Register the Microsoft Ubuntu repository 36 | sudo curl -o /etc/apt/sources.list.d/microsoft.list "https://packages.microsoft.com/config/ubuntu/$(lsb_release -sr)/prod.list" 37 | 38 | # Update the list of products 39 | sudo apt-get update 40 | 41 | # Install mssql-cli 42 | sudo apt-get install mssql-cli 43 | 44 | # Start mssql-cli 45 | mssql-cli 46 | ``` 47 | -------------------------------------------------------------------------------- /spikes/005-central-concept/000-notes.md: -------------------------------------------------------------------------------- 1 | Establish Central Concept 2 | 3 | For https://stitchdata.atlassian.net/browse/SRCE-861 4 | 5 | > How do we want to structure the stream output? 6 | > 7 | > lazy sequences processed centrally? 8 | > 9 | > core.async? 
10 | > 11 | > in-situ stateful (println based) 12 | > 13 | > Be mindful of: 14 | > 15 | > State messages that should only be emitted after records are sent 16 | > 17 | > Schema messages that need to be emitted before records 18 | > 19 | > Records being sent in order 20 | > 21 | > Transformation 22 | > 23 | > State updates (in memory) 24 | 25 | The results of this spike should be considered 26 | [the lazy-seq implementation](https://github.com/stitchdata/tap-mssql/blob/bd1a53447b53f14b211d04c1b5b35da149005063/spikes/005-central-concept/tap-const/src/tap_const/lazy_seq.clj#L1) 27 | which satisfies all the requirements. 28 | 29 | -------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/.gitignore: -------------------------------------------------------------------------------- 1 | .nrepl-port 2 | .lein-repl-history 3 | target/ 4 | -------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tap-const "0.0.1-SNAPSHOT" 2 | ;; 1.9.0 is the max we can get without bumping CIDER and we can't bump 3 | ;; CIDER until we can bump Java everywhere. 
4 | :dependencies [[org.clojure/clojure "1.9.0"] 5 | [org.clojure/data.json "0.2.6"] 6 | [org.clojure/tools.nrepl "0.2.13" 7 | :exclusions [org.clojure/clojure]] 8 | [cider/cider-nrepl "0.17.0"]] 9 | :profiles {:system {:java-cmd "/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java"}}) 10 | -------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/src/tap_const/core.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.core 2 | (:require [clojure.tools.nrepl.server :as nrepl-server]) 3 | (:gen-class)) 4 | 5 | (defn nrepl-handler 6 | [] 7 | (require 'cider.nrepl) 8 | (ns-resolve 'cider.nrepl 'cider-nrepl-handler)) 9 | 10 | (defonce the-nrepl-server 11 | (nrepl-server/start-server :bind "127.0.0.1" 12 | :handler (nrepl-handler))) 13 | 14 | (defn log-infof 15 | [message-format & args] 16 | (binding [*out* *err*] 17 | (println (apply format 18 | (str "INFO " message-format) 19 | args)))) 20 | 21 | (defn -main 22 | [& args] 23 | (log-infof "INFO Started nrepl server at %s" 24 | (.getLocalSocketAddress (:server-socket the-nrepl-server))) 25 | (spit ".nrepl-port" (:port the-nrepl-server)) 26 | (.start (Thread. #((loop [] 27 | (Thread/sleep 1000) 28 | (recur)))))) 29 | -------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/src/tap_const/in_situ_println_with_state_managed_by_paginator.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.in-situ-println-with-state-managed-by-paginator 2 | "tap-const.in-situ-println-with-state-managed-by-paginator is for 3 | exploring the idea of in-situ println statements being the primary way 4 | to write singer messages in a clojure tap. The primary difference 5 | between `record` and `paginator` is to show what can be done to liberate 6 | state messages from record messages. 
7 | 8 | The idea is to make sync-record operate on a `new-state` arg and a 9 | `record` arg, returning the new-state on a successful write. Then, the 10 | thing processing that record can write state messages at whatever 11 | interval it feels comfortable with." 12 | (:require [clojure.data.json :as json] 13 | [clojure.string :as string] 14 | [tap-const.core :refer [log-infof]] 15 | [clojure.tools.nrepl.server :as nrepl-server]) 16 | (:gen-class)) 17 | 18 | (def source (let [contents (partition-all 19 | 2 20 | [{:id 1 21 | :date "2019-04-19T09:40:50Z"} 22 | {:id 2 23 | :date "2019-04-19T09:41:40Z"} 24 | {:id 3 25 | :date "2019-04-19T09:41:50Z"} 26 | {:id 4 27 | :date "2019-04-19T09:41:54Z"} 28 | {:id 5 29 | :date "2019-04-19T09:41:58Z"} 30 | {:id 6 31 | :date "2019-04-19T09:42:06Z"} 32 | {:id 7 33 | :date "2019-04-19T09:42:12Z"} 34 | {:id 8 35 | :date "2019-04-19T09:42:15Z"} 36 | {:id 9 37 | :date "2019-04-19T09:42:19Z"} 38 | {:id 10 39 | :date "2019-04-19T09:42:23Z"} 40 | {:id 11 41 | :date "2019-04-19T15:10:10"}])] 42 | {:stream1 contents 43 | :skipped-stream2 contents 44 | :stream3 contents})) 45 | 46 | (defn make-record 47 | [stream rec] 48 | {:stream (name stream) 49 | :record rec}) 50 | 51 | (defn make-message 52 | [type-kw value] 53 | (let [singer-type (string/upper-case (name type-kw))] 54 | (assoc value :type singer-type))) 55 | 56 | (defn write-message 57 | [type-kw value] 58 | (println 59 | (json/write-str 60 | (make-message type-kw value)))) 61 | 62 | (defn sync-record 63 | "Function of current state, rec → new state based on record" 64 | [stream-name new-state rec] 65 | ;; `new-bookmark` being passed here is a bit awkward but I think it's 66 | ;; what makes the most sense. Really the record _may_ be useful for 67 | ;; deriving the new state but it's the thing that's calling sync-record 68 | ;; that actually is likely to be able to make that decision. 
If you need 69 | ;; to derive the new state from the record then you'd do it in the 70 | ;; calling function based on the rec before you passed it in here. The 71 | ;; reason you can't just do that here is because there's lots of 72 | ;; bookmarks that aren't derived from the record itself. 73 | (write-message :record (make-record stream-name rec)) 74 | new-state) 75 | 76 | (defn record-seq 77 | [data] 78 | (mapcat identity data)) 79 | 80 | (defn make-state 81 | [state stream-name bookmark] 82 | (assoc-in state [:bookmarks stream-name :position] bookmark)) 83 | 84 | (defn sync-stream 85 | "catalog, stream-name, current state → new state" 86 | [catalog stream-name state] 87 | (log-infof "Syncing stream %s" stream-name) 88 | (write-message 89 | :schema 90 | (get-in catalog [:streams stream-name])) 91 | ;; This is the logic that would be replaced with real sync logic. Most 92 | ;; notably the thing responsible for writing messages is the thing that 93 | ;; must be responsible for writing state messages. 94 | 95 | ;; data here is 'page's of records 96 | (let [data (source stream-name) 97 | current-bookmark (get-in state 98 | [:bookmarks 99 | stream-name 100 | :position] 101 | 0) 102 | state-emission-rate 10 103 | bookmarks (drop (+ 1 current-bookmark) (range))] 104 | ;; The idea of sync-stream is still catalog, stream-name, current 105 | ;; state → new state but the design differs from the record emission 106 | ;; strategy. There, you essentially have to emit state every time you 107 | ;; emit a record because every time you emit a record you don't have 108 | ;; any context about what you had emitted before. 109 | ;; 110 | ;; The primary change here is that now instead of a reduction over 111 | ;; state we're simply mapping over state-to-be and records which 112 | ;; become the state-to-be once they're written. This lets us have very 113 | ;; granular states but only emit them every so often and could be 114 | ;; configured in whatever way we saw fit. 
If bookmarks needed to be 115 | ;; generated from the record rather than outside of the record then 116 | ;; they could be generated via mapping over the records rather than 117 | ;; generated ex nihilo as we're doing it here. 118 | 119 | ;; Take either the last state emitted or (if nothing was emitted) the 120 | ;; state as it was entered. 121 | (or (last 122 | (map (fn [state-page] 123 | (let [latest-state (last state-page)] 124 | (write-message 125 | :state {:value latest-state}) 126 | latest-state)) 127 | ;; partition into state pages for processing. 128 | (partition-all 129 | state-emission-rate 130 | (map (partial sync-record stream-name) 131 | ;; make new states over all the bookmarks. The new 132 | ;; state must be the state as it should be _after_ the 133 | ;; record is written. 134 | (map (partial make-state state stream-name) bookmarks) 135 | ;; Move to the current bookmark in the data, this 136 | ;; allows us to ignore the pagination because we can 137 | ;; express it as a lazy-seq. 138 | (drop current-bookmark (record-seq data)))))) 139 | state))) 140 | 141 | (defn selected? 142 | [stream-name] 143 | ((complement #{:skipped-stream2}) stream-name)) 144 | 145 | (defn maybe-sync-stream 146 | [catalog state stream-name] 147 | (if (selected? stream-name) 148 | (sync-stream catalog stream-name state) 149 | (do (log-infof "Skipping stream %s" 150 | stream-name) 151 | state))) 152 | 153 | (defn do-sync 154 | [catalog state] 155 | ;; To make parallel you need to move state to an atom. 
156 | (reduce (partial maybe-sync-stream catalog) 157 | state 158 | (keys source))) 159 | -------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/src/tap_const/in_situ_println_with_state_managed_by_record.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.in-situ-println-with-state-managed-by-record 2 | "tap-const.core is for exploring the idea of in-situ println 3 | statements being the primary way to write singer messages in a clojure 4 | tap" 5 | (:require [clojure.data.json :as json] 6 | [clojure.string :as string] 7 | [tap-const.core :refer [log-infof]] 8 | [clojure.tools.nrepl.server :as nrepl-server]) 9 | (:gen-class)) 10 | 11 | (def source (let [contents (partition-all 12 | 2 13 | [{:id 1 14 | :date "2019-04-19T09:40:50Z"} 15 | {:id 2 16 | :date "2019-04-19T09:41:40Z"} 17 | {:id 3 18 | :date "2019-04-19T09:41:50Z"} 19 | {:id 4 20 | :date "2019-04-19T09:41:54Z"} 21 | {:id 5 22 | :date "2019-04-19T09:41:58Z"} 23 | {:id 6 24 | :date "2019-04-19T09:42:06Z"} 25 | {:id 7 26 | :date "2019-04-19T09:42:12Z"} 27 | {:id 8 28 | :date "2019-04-19T09:42:15Z"} 29 | {:id 9 30 | :date "2019-04-19T09:42:19Z"} 31 | {:id 10 32 | :date "2019-04-19T09:42:23Z"} 33 | {:id 11 34 | :date "2019-04-19T15:10:10"}])] 35 | {:stream1 contents 36 | :skipped-stream2 contents 37 | :stream3 contents})) 38 | 39 | (defn make-record 40 | [stream rec] 41 | {:stream (name stream) 42 | :record rec}) 43 | 44 | (defn make-message 45 | [type-kw value] 46 | (let [singer-type (string/upper-case (name type-kw))] 47 | (assoc value :type singer-type))) 48 | 49 | (defn write-message 50 | [type-kw value] 51 | (println 52 | (json/write-str 53 | (make-message type-kw value)))) 54 | 55 | (defn sync-record 56 | "Function of current state, rec → new state based on record" 57 | [stream-name state new-bookmark rec] 58 | ;; `new-bookmark` being passed here is a bit awkward but I think it's 59 | ;; what 
makes the most sense. Really the record _may_ be useful for 60 | ;; deriving the new state but it's the thing that's calling sync-record 61 | ;; that actually is likely to be able to make that decision. If you need 62 | ;; to derive the new state from the record then you'd do it in the 63 | ;; calling function based on the rec before you passed it in here. The 64 | ;; reason you can't just do that here is because there's lots of 65 | ;; bookmarks that aren't derived from the record itself. 66 | (write-message :record (make-record stream-name rec)) 67 | (assoc-in state [:bookmarks stream-name :position] new-bookmark)) 68 | 69 | (defn record-seq 70 | [data] 71 | (mapcat identity data)) 72 | 73 | (defn sync-stream 74 | "catalog, stream-name, current state → new state" 75 | [catalog stream-name state] 76 | (log-infof "Syncing stream %s" stream-name) 77 | (write-message 78 | :schema 79 | (get-in catalog [:streams stream-name])) 80 | ;; This is the logic that would be replaced with real sync logic. Most 81 | ;; notably the thing responsible for writing messages is the thing that 82 | ;; must be responsible for writing state messages. 83 | (let [data (source stream-name) 84 | current-bookmark (get-in state 85 | [:bookmarks 86 | stream-name 87 | :position] 88 | 0)] 89 | (let [new-state 90 | (reduce (partial apply sync-record stream-name) 91 | state 92 | (map vector 93 | (drop current-bookmark 94 | (range)) 95 | (drop current-bookmark 96 | (record-seq data))))] 97 | ;; FIXME not good enough. One state message at the end of syncing 98 | ;; the entire stream. Need to emit state message at end of every 99 | ;; 'page' at least. 100 | (write-message :state {:value new-state}) 101 | new-state))) 102 | 103 | (defn selected? 104 | [stream-name] 105 | ((complement #{:skipped-stream2}) stream-name)) 106 | 107 | (defn maybe-sync-stream 108 | [catalog state stream-name] 109 | (if (selected? 
stream-name) 110 | (sync-stream catalog stream-name state) 111 | (do (log-infof "Skipping stream %s" 112 | stream-name) 113 | state))) 114 | 115 | (defn do-sync 116 | [catalog state] 117 | ;; To make parallel you need to move state to an atom. 118 | (reduce (partial maybe-sync-stream catalog) 119 | state 120 | (keys source))) 121 | -------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/src/tap_const/test_in_situ_println_with_state_managed_by_paginator.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.test-in-situ-println-with-state-managed-by-paginator 2 | "Code for development runs of do-sync etc." 3 | (:require [tap-const.in-situ-println-with-state-managed-by-paginator :refer :all])) 4 | 5 | (def catalog (let [schema {:type "object" 6 | :properties {:id {:type "integer"} 7 | :date {:type "string" 8 | :format "date-time"}}}] 9 | (reduce (fn [catalog stream-name] 10 | (assoc-in catalog [:streams stream-name] 11 | {:stream stream-name 12 | :key_properties ["id"] 13 | :schema schema})) 14 | {} 15 | (keys source)))) 16 | 17 | (require '[clojure.java.io :as io]) 18 | 19 | (def initial-state {:bookmarks {:stream1 {:position 3} 20 | :stream3 {:position 7}}}) 21 | 22 | (defmacro with-out-and-err-to-dev-null 23 | [& body] 24 | `(let [null-out# (io/writer 25 | (proxy [java.io.OutputStream] [] 26 | (write [& args#])))] 27 | (binding [*err* null-out# 28 | *out* null-out#] 29 | ~@body))) 30 | 31 | (comment 32 | (do-sync catalog initial-state) 33 | ) 34 | 35 | ;; This is a convient way to run the sync through target-stitch in 36 | ;; dry-run mode. 
37 | (require '[clojure.java.shell :refer [sh]] 38 | '[clojure.string :as string] 39 | '[clojure.data.json :as json]) 40 | (defn run-test 41 | [state] 42 | (let [results (sh 43 | "/usr/local/share/virtualenvs/target-stitch/bin/target-stitch" 44 | "--dry-run" 45 | :in (with-out-str (do-sync catalog state)))] 46 | (-> results 47 | (update :err string/split #"\n") 48 | (update :out (comp (partial map (fn [s] 49 | (try (json/read-str s) 50 | (catch Exception _ s)))) 51 | #(string/split % #"\n")))))) 52 | 53 | (comment 54 | (string/split "" #"\n") 55 | (with-out-and-err-to-dev-null 56 | (run-test initial-state)) 57 | (run-test initial-state) 58 | ;; => {:exit 0, 59 | ;; :out 60 | ;; ({"bookmarks" {"stream3" {"position" 5}, "stream1" {"position" 10}}} 61 | ;; {"bookmarks" {"stream3" {"position" 10}, "stream1" {"position" 10}}}), 62 | ;; :err 63 | ;; ["INFO stream1 (7): Batch is valid" 64 | ;; "INFO stream3 (5): Batch is valid" 65 | ;; "INFO Exiting normally"]} 66 | ) 67 | -------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/src/tap_const/test_in_situ_println_with_state_managed_by_record.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.test-in-situ-println-with-state-managed-by-paginator 2 | "Code for development runs of do-sync etc." 
3 | (:require [tap-const.in-situ-println-with-state-managed-by-record :refer :all])) 4 | 5 | (def catalog (let [schema {:type "object" 6 | :properties {:id {:type "integer"} 7 | :date {:type "string" 8 | :format "date-time"}}}] 9 | (reduce (fn [catalog stream-name] 10 | (assoc-in catalog [:streams stream-name] 11 | {:stream stream-name 12 | :key_properties ["id"] 13 | :schema schema})) 14 | {} 15 | (keys source)))) 16 | 17 | (require '[clojure.java.io :as io]) 18 | 19 | (def initial-state {:bookmarks {:stream1 {:position 3} 20 | :stream3 {:position 7}}}) 21 | 22 | (defmacro with-out-and-err-to-dev-null 23 | [& body] 24 | `(let [null-out# (io/writer 25 | (proxy [java.io.OutputStream] [] 26 | (write [& args#])))] 27 | (binding [*err* null-out# 28 | *out* null-out#] 29 | ~@body))) 30 | 31 | (comment 32 | (with-out-and-err-to-dev-null 33 | (do-sync catalog initial-state)) 34 | ) 35 | 36 | ;; This is a convient way to run the sync through target-stitch in 37 | ;; dry-run mode. 38 | (require '[clojure.java.shell :refer [sh]] 39 | '[clojure.string :as string] 40 | '[clojure.data.json :as json]) 41 | (defn run-test 42 | [state] 43 | (let [results (sh 44 | "/usr/local/share/virtualenvs/target-stitch/bin/target-stitch" 45 | "--dry-run" 46 | :in (with-out-str (do-sync catalog state)))] 47 | (-> results 48 | (update :err string/split #"\n") 49 | (update :out (comp (partial map (fn [s] 50 | (try (json/read-str s) 51 | (catch Exception _ s)))) 52 | #(string/split % #"\n")))))) 53 | 54 | (comment 55 | (string/split "" #"\n") 56 | (with-out-and-err-to-dev-null 57 | (run-test initial-state)) 58 | (run-test initial-state) 59 | ;; => {:exit 0, 60 | ;; :out 61 | ;; ({"bookmarks" {"stream3" {"position" 5}, "stream1" {"position" 10}}} 62 | ;; {"bookmarks" {"stream3" {"position" 10}, "stream1" {"position" 10}}}), 63 | ;; :err 64 | ;; ["INFO stream1 (7): Batch is valid" 65 | ;; "INFO stream3 (5): Batch is valid" 66 | ;; "INFO Exiting normally"]} 67 | ) 68 | 
-------------------------------------------------------------------------------- /spikes/005-central-concept/tap-const/src/tap_const/test_lazy_seq.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.test-lazy-seq 2 | (:require [tap-const.lazy-seq :refer :all]) 3 | (:gen-class)) 4 | 5 | (def catalog (let [schema {:type "object" 6 | :properties {:id {:type "integer"} 7 | :date {:type "string" 8 | :format "date-time"}}}] 9 | (reduce (fn [catalog stream-name] 10 | (assoc-in catalog [:streams stream-name] 11 | {:stream stream-name 12 | :key_properties ["id"] 13 | :schema schema})) 14 | {} 15 | (keys source)))) 16 | 17 | (def initial-state {:bookmarks {:stream1 {:id 37} 18 | :stream3 {:id 775}}}) 19 | 20 | (def config {}) 21 | 22 | (require '[clojure.java.io :as io]) 23 | 24 | ;; This is a convient way to run the sync through target-stitch in 25 | ;; dry-run mode. 26 | (require '[clojure.java.shell :refer [sh]] 27 | '[clojure.string :as string] 28 | '[clojure.data.json :as json]) 29 | (defn run-test 30 | [state] 31 | (let [results (sh 32 | "/usr/local/share/virtualenvs/target-stitch/bin/target-stitch" 33 | "--dry-run" 34 | :in (with-out-str (do-sync (assoc config :max-records 1000) 35 | catalog 36 | state)))] 37 | (-> results 38 | (update :err string/split #"\n") 39 | (update :out (comp (partial map (fn [s] 40 | (try (json/read-str s) 41 | (catch Exception _ s)))) 42 | #(string/split % #"\n")))))) 43 | 44 | (comment 45 | (do-sync (assoc config 46 | :max-records 1000 47 | :page-size 100) 48 | catalog 49 | initial-state) 50 | 51 | (with-out-and-err-to-dev-null 52 | (run-test initial-state)) 53 | ;; => {:exit 0, 54 | ;; :out 55 | ;; ({"bookmarks" {"stream3" {"id" 775}, "stream1" {"id" 1036}}} 56 | ;; {"value" {"bookmarks" {"stream3" {"id" 775}, "stream1" {"id" 1036}}}, 57 | ;; "type" "state", 58 | ;; "bookmarks" {"stream3" {"id" 999}}}), 59 | ;; :err 60 | ;; ["INFO stream1 (1000): Batch is valid" 61 | ;; "INFO stream3 
(1000): Batch is valid" 62 | ;; "INFO Exiting normally"]} 63 | (run-test initial-state) 64 | ) 65 | 66 | (defn -main [& args] 67 | (do-sync catalog initial-state)) 68 | -------------------------------------------------------------------------------- /spikes/006-memory-management/000-notes.md: -------------------------------------------------------------------------------- 1 | See 2 | [the community docs](http://clojure-doc.org/articles/ecosystem/java_jdbc/using_sql.html#processing-a-result-set-lazily) 3 | for a reference on lazy evaluation. 4 | -------------------------------------------------------------------------------- /spikes/006-permissions/000-notes.md: -------------------------------------------------------------------------------- 1 | This spike is for establishing a decent minimal set of permissions for how 2 | to do discovery and syncing for mssql. 3 | 4 | From the stitch docs: 5 | 6 | > To set up MSSQL in Stitch, you need: 7 | > 8 | > **Permissions in MSSQL that allow you to create/manage users.** This is 9 | > required to create the Stitch database user. 10 | > 11 | > A server that: 12 | > 13 | > - Uses case-insensitive collation. More info about collation can be 14 | > found here in Microsoft’s documentation. 15 | > - Allows connections over TCP/IP 16 | > - Allows mixed mode authentication 17 | > 18 | > **Make sure your server is set up properly before continuing.** If you 19 | > need some help figuring out your hosting details, we recommend looping 20 | > in a member of your engineering team. 
21 | -------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/.gitignore: -------------------------------------------------------------------------------- 1 | .nrepl-port 2 | .lein-repl-history 3 | target/ 4 | -------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tap-const "0.0.1-SNAPSHOT" 2 | ;; 1.9.0 is the max we can get without bumping CIDER and we can't bump 3 | ;; CIDER until we can bump Java everywhere. 4 | :dependencies [[org.clojure/clojure "1.9.0"] 5 | [org.clojure/java.jdbc "0.7.9"] 6 | [com.microsoft.sqlserver/mssql-jdbc "7.2.1.jre8"] 7 | [org.clojure/tools.nrepl "0.2.13" 8 | :exclusions [org.clojure/clojure]] 9 | [cider/cider-nrepl "0.17.0"]] 10 | :profiles {:system {:java-cmd "/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java"}}) 11 | -------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/src/tap_const/core.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.core 2 | (:require [clojure.tools.nrepl.server :as nrepl-server]) 3 | (:gen-class)) 4 | 5 | (defn nrepl-handler 6 | [] 7 | (require 'cider.nrepl) 8 | (ns-resolve 'cider.nrepl 'cider-nrepl-handler)) 9 | 10 | (defonce the-nrepl-server 11 | (nrepl-server/start-server :bind "127.0.0.1" 12 | :handler (nrepl-handler))) 13 | 14 | (defn log-infof 15 | [message-format & args] 16 | (binding [*out* *err*] 17 | (println (apply format 18 | (str "INFO " message-format) 19 | args)))) 20 | 21 | (defn -main 22 | [& args] 23 | (log-infof "INFO Started nrepl server at %s" 24 | (.getLocalSocketAddress (:server-socket the-nrepl-server))) 25 | (spit ".nrepl-port" (:port the-nrepl-server)) 26 | (.start (Thread. 
#((loop [] 27 | (Thread/sleep 1000) 28 | (recur)))))) 29 | -------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/src/tap_const/in_situ_println_with_state_managed_by_paginator.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.in-situ-println-with-state-managed-by-paginator 2 | "tap-const.in-situ-println-with-state-managed-by-paginator is for 3 | exploring the idea of in-situ println statements being the primary way 4 | to write singer messages in a clojure tap. The primary difference 5 | between `record` and `paginator` is to show what can be done to liberate 6 | state messages from record messages. 7 | 8 | The idea is to make sync-record operate on a `new-state` arg and a 9 | `record` arg, returning the new-state on a successful write. Then, the 10 | thing processing that record can write state messages at whatever 11 | interval it feels comfortable with." 12 | (:require [clojure.data.json :as json] 13 | [clojure.string :as string] 14 | [tap-const.core :refer [log-infof]] 15 | [clojure.tools.nrepl.server :as nrepl-server]) 16 | (:gen-class)) 17 | 18 | (def source (let [contents (partition-all 19 | 2 20 | [{:id 1 21 | :date "2019-04-19T09:40:50Z"} 22 | {:id 2 23 | :date "2019-04-19T09:41:40Z"} 24 | {:id 3 25 | :date "2019-04-19T09:41:50Z"} 26 | {:id 4 27 | :date "2019-04-19T09:41:54Z"} 28 | {:id 5 29 | :date "2019-04-19T09:41:58Z"} 30 | {:id 6 31 | :date "2019-04-19T09:42:06Z"} 32 | {:id 7 33 | :date "2019-04-19T09:42:12Z"} 34 | {:id 8 35 | :date "2019-04-19T09:42:15Z"} 36 | {:id 9 37 | :date "2019-04-19T09:42:19Z"} 38 | {:id 10 39 | :date "2019-04-19T09:42:23Z"} 40 | {:id 11 41 | :date "2019-04-19T15:10:10"}])] 42 | {:stream1 contents 43 | :skipped-stream2 contents 44 | :stream3 contents})) 45 | 46 | (defn make-record 47 | [stream rec] 48 | {:stream (name stream) 49 | :record rec}) 50 | 51 | (defn make-message 52 | [type-kw value] 53 | (let 
[singer-type (string/upper-case (name type-kw))] 54 | (assoc value :type singer-type))) 55 | 56 | (defn write-message 57 | [type-kw value] 58 | (println 59 | (json/write-str 60 | (make-message type-kw value)))) 61 | 62 | (defn sync-record 63 | "Function of current state, rec → new state based on record" 64 | [stream-name new-state rec] 65 | ;; `new-bookmark` being passed here is a bit awkward but I think it's 66 | ;; what makes the most sense. Really the record _may_ be useful for 67 | ;; deriving the new state but it's the thing that's calling sync-record 68 | ;; that actually is likely to be able to make that decision. If you need 69 | ;; to derive the new state from the record then you'd do it in the 70 | ;; calling function based on the rec before you passed it in here. The 71 | ;; reason you can't just do that here is because there's lots of 72 | ;; bookmarks that aren't derived from the record itself. 73 | (write-message :record (make-record stream-name rec)) 74 | new-state) 75 | 76 | (defn record-seq 77 | [data] 78 | (mapcat identity data)) 79 | 80 | (defn make-state 81 | [state stream-name bookmark] 82 | (assoc-in state [:bookmarks stream-name :position] bookmark)) 83 | 84 | (defn sync-stream 85 | "catalog, stream-name, current state → new state" 86 | [catalog stream-name state] 87 | (log-infof "Syncing stream %s" stream-name) 88 | (write-message 89 | :schema 90 | (get-in catalog [:streams stream-name])) 91 | ;; This is the logic that would be replaced with real sync logic. Most 92 | ;; notably the thing responsible for writing messages is the thing that 93 | ;; must be responsible for writing state messages. 
94 | 95 | ;; data here is 'page's of records 96 | (let [data (source stream-name) 97 | current-bookmark (get-in state 98 | [:bookmarks 99 | stream-name 100 | :position] 101 | 0) 102 | state-emission-rate 10 103 | bookmarks (drop (+ 1 current-bookmark) (range))] 104 | ;; The idea of sync-stream is still catalog, stream-name, current 105 | ;; state → new state but the design differs from the record emission 106 | ;; strategy. There, you essentially have to emit state every time you 107 | ;; emit a record because every time you emit a record you don't have 108 | ;; any context about what you had emitted before. 109 | ;; 110 | ;; The primary change here is that now instead of a reduction over 111 | ;; state we're simply mapping over state-to-be and records which 112 | ;; become the state-to-be once they're written. This lets us have very 113 | ;; granular states but only emit them every so often and could be 114 | ;; configured in whatever way we saw fit. If bookmarks needed to be 115 | ;; generated from the record rather than outside of the record then 116 | ;; they could be generated via mapping over the records rather than 117 | ;; generated ex nihilo as we're doing it here. 118 | 119 | ;; Take either the last state emitted or (if nothing was emitted) the 120 | ;; state as it was entered. 121 | (or (last 122 | (map (fn [state-page] 123 | (let [latest-state (last state-page)] 124 | (write-message 125 | :state {:value latest-state}) 126 | latest-state)) 127 | ;; partition into state pages for processing. 128 | (partition-all 129 | state-emission-rate 130 | (map (partial sync-record stream-name) 131 | ;; make new states over all the bookmarks. The new 132 | ;; state must be the state as it should be _after_ the 133 | ;; record is written. 134 | (map (partial make-state state stream-name) bookmarks) 135 | ;; Move to the current bookmark in the data, this 136 | ;; allows us to ignore the pagination because we can 137 | ;; express it as a lazy-seq. 
138 | (drop current-bookmark (record-seq data)))))) 139 | state))) 140 | 141 | (defn selected? 142 | [stream-name] 143 | ((complement #{:skipped-stream2}) stream-name)) 144 | 145 | (defn maybe-sync-stream 146 | [catalog state stream-name] 147 | (if (selected? stream-name) 148 | (sync-stream catalog stream-name state) 149 | (do (log-infof "Skipping stream %s" 150 | stream-name) 151 | state))) 152 | 153 | (defn do-sync 154 | [catalog state] 155 | ;; To make parallel you need to move state to an atom. 156 | (reduce (partial maybe-sync-stream catalog) 157 | state 158 | (keys source))) 159 | -------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/src/tap_const/in_situ_println_with_state_managed_by_record.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.in-situ-println-with-state-managed-by-record 2 | "tap-const.core is for exploring the idea of in-situ println 3 | statements being the primary way to write singer messages in a clojure 4 | tap" 5 | (:require [clojure.data.json :as json] 6 | [clojure.string :as string] 7 | [tap-const.core :refer [log-infof]] 8 | [clojure.tools.nrepl.server :as nrepl-server]) 9 | (:gen-class)) 10 | 11 | (def source (let [contents (partition-all 12 | 2 13 | [{:id 1 14 | :date "2019-04-19T09:40:50Z"} 15 | {:id 2 16 | :date "2019-04-19T09:41:40Z"} 17 | {:id 3 18 | :date "2019-04-19T09:41:50Z"} 19 | {:id 4 20 | :date "2019-04-19T09:41:54Z"} 21 | {:id 5 22 | :date "2019-04-19T09:41:58Z"} 23 | {:id 6 24 | :date "2019-04-19T09:42:06Z"} 25 | {:id 7 26 | :date "2019-04-19T09:42:12Z"} 27 | {:id 8 28 | :date "2019-04-19T09:42:15Z"} 29 | {:id 9 30 | :date "2019-04-19T09:42:19Z"} 31 | {:id 10 32 | :date "2019-04-19T09:42:23Z"} 33 | {:id 11 34 | :date "2019-04-19T15:10:10"}])] 35 | {:stream1 contents 36 | :skipped-stream2 contents 37 | :stream3 contents})) 38 | 39 | (defn make-record 40 | [stream rec] 41 | {:stream (name stream) 42 | :record 
rec}) 43 | 44 | (defn make-message 45 | [type-kw value] 46 | (let [singer-type (string/upper-case (name type-kw))] 47 | (assoc value :type singer-type))) 48 | 49 | (defn write-message 50 | [type-kw value] 51 | (println 52 | (json/write-str 53 | (make-message type-kw value)))) 54 | 55 | (defn sync-record 56 | "Function of current state, rec → new state based on record" 57 | [stream-name state new-bookmark rec] 58 | ;; `new-bookmark` being passed here is a bit awkward but I think it's 59 | ;; what makes the most sense. Really the record _may_ be useful for 60 | ;; deriving the new state but it's the thing that's calling sync-record 61 | ;; that actually is likely to be able to make that decision. If you need 62 | ;; to derive the new state from the record then you'd do it in the 63 | ;; calling function based on the rec before you passed it in here. The 64 | ;; reason you can't just do that here is because there's lots of 65 | ;; bookmarks that aren't derived from the record itself. 66 | (write-message :record (make-record stream-name rec)) 67 | (assoc-in state [:bookmarks stream-name :position] new-bookmark)) 68 | 69 | (defn record-seq 70 | [data] 71 | (mapcat identity data)) 72 | 73 | (defn sync-stream 74 | "catalog, stream-name, current state → new state" 75 | [catalog stream-name state] 76 | (log-infof "Syncing stream %s" stream-name) 77 | (write-message 78 | :schema 79 | (get-in catalog [:streams stream-name])) 80 | ;; This is the logic that would be replaced with real sync logic. Most 81 | ;; notably the thing responsible for writing messages is the thing that 82 | ;; must be responsible for writing state messages. 
83 | (let [data (source stream-name) 84 | current-bookmark (get-in state 85 | [:bookmarks 86 | stream-name 87 | :position] 88 | 0)] 89 | (let [new-state 90 | (reduce (partial apply sync-record stream-name) 91 | state 92 | (map vector 93 | (drop current-bookmark 94 | (range)) 95 | (drop current-bookmark 96 | (record-seq data))))] 97 | ;; FIXME not good enough. One state message at the end of syncing 98 | ;; the entire stream. Need to emit state message at end of every 99 | ;; 'page' at least. 100 | (write-message :state {:value new-state}) 101 | new-state))) 102 | 103 | (defn selected? 104 | [stream-name] 105 | ((complement #{:skipped-stream2}) stream-name)) 106 | 107 | (defn maybe-sync-stream 108 | [catalog state stream-name] 109 | (if (selected? stream-name) 110 | (sync-stream catalog stream-name state) 111 | (do (log-infof "Skipping stream %s" 112 | stream-name) 113 | state))) 114 | 115 | (defn do-sync 116 | [catalog state] 117 | ;; To make parallel you need to move state to an atom. 118 | (reduce (partial maybe-sync-stream catalog) 119 | state 120 | (keys source))) 121 | -------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/src/tap_const/test_in_situ_println_with_state_managed_by_paginator.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.test-in-situ-println-with-state-managed-by-paginator 2 | "Code for development runs of do-sync etc." 
3 | (:require [tap-const.in-situ-println-with-state-managed-by-paginator :refer :all])) 4 | 5 | (def catalog (let [schema {:type "object" 6 | :properties {:id {:type "integer"} 7 | :date {:type "string" 8 | :format "date-time"}}}] 9 | (reduce (fn [catalog stream-name] 10 | (assoc-in catalog [:streams stream-name] 11 | {:stream stream-name 12 | :key_properties ["id"] 13 | :schema schema})) 14 | {} 15 | (keys source)))) 16 | 17 | (require '[clojure.java.io :as io]) 18 | 19 | (def initial-state {:bookmarks {:stream1 {:position 3} 20 | :stream3 {:position 7}}}) 21 | 22 | (defmacro with-out-and-err-to-dev-null 23 | [& body] 24 | `(let [null-out# (io/writer 25 | (proxy [java.io.OutputStream] [] 26 | (write [& args#])))] 27 | (binding [*err* null-out# 28 | *out* null-out#] 29 | ~@body))) 30 | 31 | (comment 32 | (do-sync catalog initial-state) 33 | ) 34 | 35 | ;; This is a convient way to run the sync through target-stitch in 36 | ;; dry-run mode. 37 | (require '[clojure.java.shell :refer [sh]] 38 | '[clojure.string :as string] 39 | '[clojure.data.json :as json]) 40 | (defn run-test 41 | [state] 42 | (let [results (sh 43 | "/usr/local/share/virtualenvs/target-stitch/bin/target-stitch" 44 | "--dry-run" 45 | :in (with-out-str (do-sync catalog state)))] 46 | (-> results 47 | (update :err string/split #"\n") 48 | (update :out (comp (partial map (fn [s] 49 | (try (json/read-str s) 50 | (catch Exception _ s)))) 51 | #(string/split % #"\n")))))) 52 | 53 | (comment 54 | (string/split "" #"\n") 55 | (with-out-and-err-to-dev-null 56 | (run-test initial-state)) 57 | (run-test initial-state) 58 | ;; => {:exit 0, 59 | ;; :out 60 | ;; ({"bookmarks" {"stream3" {"position" 5}, "stream1" {"position" 10}}} 61 | ;; {"bookmarks" {"stream3" {"position" 10}, "stream1" {"position" 10}}}), 62 | ;; :err 63 | ;; ["INFO stream1 (7): Batch is valid" 64 | ;; "INFO stream3 (5): Batch is valid" 65 | ;; "INFO Exiting normally"]} 66 | ) 67 | 
-------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/src/tap_const/test_in_situ_println_with_state_managed_by_record.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.test-in-situ-println-with-state-managed-by-record 2 | "Code for development runs of do-sync etc." 3 | (:require [tap-const.in-situ-println-with-state-managed-by-record :refer :all])) 4 | 5 | (def catalog (let [schema {:type "object" 6 | :properties {:id {:type "integer"} 7 | :date {:type "string" 8 | :format "date-time"}}}] 9 | (reduce (fn [catalog stream-name] 10 | (assoc-in catalog [:streams stream-name] 11 | {:stream stream-name 12 | :key_properties ["id"] 13 | :schema schema})) 14 | {} 15 | (keys source)))) 16 | 17 | (require '[clojure.java.io :as io]) 18 | 19 | (def initial-state {:bookmarks {:stream1 {:position 3} 20 | :stream3 {:position 7}}}) 21 | 22 | (defmacro with-out-and-err-to-dev-null 23 | [& body] 24 | `(let [null-out# (io/writer 25 | (proxy [java.io.OutputStream] [] 26 | (write [& args#])))] 27 | (binding [*err* null-out# 28 | *out* null-out#] 29 | ~@body))) 30 | 31 | (comment 32 | (with-out-and-err-to-dev-null 33 | (do-sync catalog initial-state)) 34 | ) 35 | 36 | ;; This is a convenient way to run the sync through target-stitch in 37 | ;; dry-run mode.
38 | (require '[clojure.java.shell :refer [sh]] 39 | '[clojure.string :as string] 40 | '[clojure.data.json :as json]) 41 | (defn run-test 42 | [state] 43 | (let [results (sh 44 | "/usr/local/share/virtualenvs/target-stitch/bin/target-stitch" 45 | "--dry-run" 46 | :in (with-out-str (do-sync catalog state)))] 47 | (-> results 48 | (update :err string/split #"\n") 49 | (update :out (comp (partial map (fn [s] 50 | (try (json/read-str s) 51 | (catch Exception _ s)))) 52 | #(string/split % #"\n")))))) 53 | 54 | (comment 55 | (string/split "" #"\n") 56 | (with-out-and-err-to-dev-null 57 | (run-test initial-state)) 58 | (run-test initial-state) 59 | ;; => {:exit 0, 60 | ;; :out 61 | ;; ({"bookmarks" {"stream3" {"position" 5}, "stream1" {"position" 10}}} 62 | ;; {"bookmarks" {"stream3" {"position" 10}, "stream1" {"position" 10}}}), 63 | ;; :err 64 | ;; ["INFO stream1 (7): Batch is valid" 65 | ;; "INFO stream3 (5): Batch is valid" 66 | ;; "INFO Exiting normally"]} 67 | ) 68 | -------------------------------------------------------------------------------- /spikes/006-permissions/tap-const/src/tap_const/test_lazy_seq.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.test-lazy-seq 2 | (:require [tap-const.lazy-seq :refer :all]) 3 | (:gen-class)) 4 | 5 | (def catalog (let [schema {:type "object" 6 | :properties {:id {:type "integer"} 7 | :date {:type "string" 8 | :format "date-time"}}}] 9 | (reduce (fn [catalog stream-name] 10 | (assoc-in catalog [:streams stream-name] 11 | {:stream stream-name 12 | :key_properties ["id"] 13 | :schema schema})) 14 | {} 15 | (keys source)))) 16 | 17 | (def initial-state {:bookmarks {:stream1 {:id 37} 18 | :stream3 {:id 775}}}) 19 | 20 | (def config {}) 21 | 22 | (require '[clojure.java.io :as io]) 23 | 24 | ;; This is a convient way to run the sync through target-stitch in 25 | ;; dry-run mode. 
26 | (require '[clojure.java.shell :refer [sh]] 27 | '[clojure.string :as string] 28 | '[clojure.data.json :as json]) 29 | (defn run-test 30 | [state] 31 | (let [results (sh 32 | "/usr/local/share/virtualenvs/target-stitch/bin/target-stitch" 33 | "--dry-run" 34 | :in (with-out-str (do-sync (assoc config :max-records 1000) 35 | catalog 36 | state)))] 37 | (-> results 38 | (update :err string/split #"\n") 39 | (update :out (comp (partial map (fn [s] 40 | (try (json/read-str s) 41 | (catch Exception _ s)))) 42 | #(string/split % #"\n")))))) 43 | 44 | (comment 45 | (do-sync (assoc config 46 | :max-records 1000 47 | :page-size 100) 48 | catalog 49 | initial-state) 50 | 51 | (with-out-and-err-to-dev-null 52 | (run-test initial-state)) 53 | ;; => {:exit 0, 54 | ;; :out 55 | ;; ({"bookmarks" {"stream3" {"id" 775}, "stream1" {"id" 1036}}} 56 | ;; {"value" {"bookmarks" {"stream3" {"id" 775}, "stream1" {"id" 1036}}}, 57 | ;; "type" "state", 58 | ;; "bookmarks" {"stream3" {"id" 999}}}), 59 | ;; :err 60 | ;; ["INFO stream1 (1000): Batch is valid" 61 | ;; "INFO stream3 (1000): Batch is valid" 62 | ;; "INFO Exiting normally"]} 63 | (run-test initial-state) 64 | ) 65 | 66 | (defn -main [& args] 67 | (do-sync catalog initial-state)) 68 | -------------------------------------------------------------------------------- /spikes/006-permissions/test-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o pipefail 4 | 5 | logf() { 6 | printf "$1\n" "${@:2}" >&2 7 | } 8 | 9 | warnf() { 10 | logf "WARN: ${1}" "${@:2}" 11 | } 12 | 13 | infof() { 14 | logf "INFO: ${1}" "${@:2}" 15 | } 16 | 17 | errorf() { 18 | logf "ERROR: ${1}" "${@:2}" 19 | exit 1 20 | } 21 | 22 | while : 23 | do 24 | case $1 in 25 | destroy) 26 | command=destroy 27 | ;; 28 | create) 29 | command=create 30 | ;; 31 | describe) 32 | command=describe 33 | ;; 34 | -l) 35 | long_output=true 36 | ;; 37 | -?*|--?*=?*) 38 | warnf 'Unknown option (ignored): %s\n' 
"$1" 39 | ;; 40 | *) 41 | break 42 | esac 43 | shift 44 | done 45 | 46 | identifier="$(hostname)-test-mssql" 47 | 48 | describe_instance() { 49 | local jq_filter 50 | jq_filter='.DBInstances[] 51 | | {address: .Endpoint.Address, 52 | identifier: .DBInstanceIdentifier, 53 | status: .DBInstanceStatus, 54 | master_username: .MasterUsername, 55 | master_password: .MasterUsername}' 56 | if [[ $long_output == true ]] || [[ $1 == full ]] 57 | then 58 | jq_filter='.DBInstances[]' 59 | fi 60 | aws rds describe-db-instances --db-instance-identifier "${identifier}" \ 61 | | jq "$jq_filter" 62 | } 63 | 64 | test_db_not_online() { 65 | db_instance_status=$(aws rds describe-db-instances --db-instance-identifier "${identifier}" \ 66 | | jq -r '.DBInstances[].DBInstanceStatus') 67 | if (( $? != 0 )) 68 | then 69 | errorf 'Failed to describe %s' "$identifier" 70 | fi 71 | [[ $db_instance_status != available ]] 72 | } 73 | 74 | instance_status() { 75 | describe_instance full | jq -r '.DBInstanceStatus' 76 | } 77 | 78 | case $command in 79 | destroy) 80 | if describe_instance >/dev/null 81 | then 82 | infof 'Destroying %s' "$identifier" 83 | if [[ $(instance_status) != deleting ]] 84 | then 85 | if ! aws --region us-east-1 \ 86 | rds delete-db-instance \ 87 | --db-instance-identifier "$identifier" \ 88 | --skip-final-snapshot 89 | then 90 | errorf "Failed to initiate destruction of %s" "$identifier" 91 | fi 92 | fi 93 | while describe_instance >/dev/null 94 | do 95 | infof "Waiting for %s to be destroyed for %s seconds (current status: %s)" \ 96 | "$identifier" "$SECONDS" "$(instance_status)" 97 | sleep 30 98 | done 99 | else 100 | infof '%s is already offline' "$identifier" 101 | fi 102 | ;; 103 | create) 104 | if test_db_not_online 105 | then 106 | if ! describe_instance >/dev/null 107 | then 108 | logf "Creating db instance %s" "$identifier" 109 | if ! 
aws --region us-east-1 \ 110 | rds create-db-instance \ 111 | --db-instance-identifier "$identifier" \ 112 | --db-instance-class db.m4.large \ 113 | --engine sqlserver-se \ 114 | --allocated-storage 100 \ 115 | --master-username spike_mssql \ 116 | --master-user-password spike_mssql \ 117 | --backup-retention-period 0 \ 118 | --license-model license-included 119 | then 120 | errorf "Failed to initiate creation of %s" "$identifier" 121 | fi 122 | fi 123 | while test_db_not_online 124 | do 125 | infof "Waiting for db instance to come up for %s seconds (current status: %s)" \ 126 | "$SECONDS" \ 127 | "$(instance_status)" 128 | sleep 30 129 | done 130 | fi 131 | 132 | infof 'DB instance %s created' "$identifier" 133 | describe_instance 134 | ;; 135 | describe) 136 | infof 'Describing instance %s' "$identifier" 137 | describe_instance 138 | ;; 139 | *) 140 | errorf "Unknown or no command passed" 141 | esac 142 | 143 | -------------------------------------------------------------------------------- /spikes/007-full-table-syncing-strategy/000-notes.md: -------------------------------------------------------------------------------- 1 | Postgres has a concept of xmin associated with each row which makes every 2 | table able to be synced incrementally. Does that concept exist in mssql? 3 | Can we lean on it to do incremental full table or do we need to take the 4 | mysql approach of requiring orderable primary keys. 5 | 6 | ----------- 7 | 8 | It appears that SQL Server does not have an automatically available `xmin` 9 | column so the strategy for doing interruptible full table syncing will 10 | have to be the same as 11 | [tap-mysql](https://github.com/singer-io/tap-mysql/blob/5b466c2a4dc0d81a6cf66d1a0c740237cc6212b0/tap_mysql/sync_strategies/full_table.py#L48-L82). 12 | 13 | One tidbit from my research would be that if there’s a `rowversion` column 14 | on a table we should probably use that to capture updates and inserts to a 15 | full table sync. 
That kind of column doesn't exist in postgres. 16 | -------------------------------------------------------------------------------- /spikes/007-full-table-syncing-strategy/test-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o pipefail 4 | 5 | logf() { 6 | printf "$1\n" "${@:2}" >&2 7 | } 8 | 9 | warnf() { 10 | logf "WARN: ${1}" "${@:2}" 11 | } 12 | 13 | infof() { 14 | logf "INFO: ${1}" "${@:2}" 15 | } 16 | 17 | errorf() { 18 | logf "ERROR: ${1}" "${@:2}" 19 | exit 1 20 | } 21 | 22 | while : 23 | do 24 | case $1 in 25 | destroy) 26 | command=destroy 27 | ;; 28 | create) 29 | command=create 30 | ;; 31 | describe) 32 | command=describe 33 | ;; 34 | connect) 35 | command=connect 36 | ;; 37 | -l) 38 | long_output=true 39 | ;; 40 | -?*|--?*=?*) 41 | warnf 'Unknown option (ignored): %s\n' "$1" 42 | ;; 43 | *) 44 | break 45 | esac 46 | shift 47 | done 48 | 49 | identifier="$(hostname)-test-mssql" 50 | 51 | describe_instance() { 52 | local jq_filter 53 | jq_filter='.DBInstances[] 54 | | {address: .Endpoint.Address, 55 | identifier: .DBInstanceIdentifier, 56 | status: .DBInstanceStatus, 57 | master_username: .MasterUsername, 58 | master_password: .MasterUsername}' 59 | if [[ $long_output == true ]] || [[ $1 == full ]] 60 | then 61 | jq_filter='.DBInstances[]' 62 | fi 63 | aws rds describe-db-instances --db-instance-identifier "${identifier}" \ 64 | | jq "$jq_filter" 65 | } 66 | 67 | test_db_not_online() { 68 | db_instance_status=$(aws rds describe-db-instances --db-instance-identifier "${identifier}" \ 69 | | jq -r '.DBInstances[].DBInstanceStatus') 70 | if (( $? 
!= 0 )) 71 | then 72 | errorf 'Failed to describe %s' "$identifier" 73 | fi 74 | [[ $db_instance_status != available ]] 75 | } 76 | 77 | instance_status() { 78 | describe_instance full | jq -r '.DBInstanceStatus' 79 | } 80 | 81 | case $command in 82 | destroy) 83 | if describe_instance >/dev/null 84 | then 85 | infof 'Destroying %s' "$identifier" 86 | if [[ $(instance_status) != deleting ]] 87 | then 88 | if ! aws --region us-east-1 \ 89 | rds delete-db-instance \ 90 | --db-instance-identifier "$identifier" \ 91 | --skip-final-snapshot 92 | then 93 | errorf "Failed to initiate destruction of %s" "$identifier" 94 | fi 95 | fi 96 | while describe_instance >/dev/null 97 | do 98 | infof "Waiting for %s to be destroyed for %s seconds (current status: %s)" \ 99 | "$identifier" "$SECONDS" "$(instance_status)" 100 | sleep 30 101 | done 102 | else 103 | infof '%s is already offline' "$identifier" 104 | fi 105 | ;; 106 | create) 107 | if test_db_not_online 108 | then 109 | if ! describe_instance >/dev/null 110 | then 111 | logf "Creating db instance %s" "$identifier" 112 | if ! 
aws --region us-east-1 \ 113 | rds create-db-instance \ 114 | --vpc-security-group-ids sg-089b5962b0c44592f \ 115 | --db-instance-identifier "$identifier" \ 116 | --db-instance-class db.m4.large \ 117 | --engine sqlserver-se \ 118 | --allocated-storage 100 \ 119 | --master-username spike_mssql \ 120 | --master-user-password spike_mssql \ 121 | --backup-retention-period 0 \ 122 | --no-auto-minor-version-upgrade \ 123 | --license-model license-included 124 | then 125 | errorf "Failed to initiate creation of %s" "$identifier" 126 | fi 127 | fi 128 | while test_db_not_online 129 | do 130 | infof "Waiting for db instance to come up for %s seconds (current status: %s)" \ 131 | "$SECONDS" \ 132 | "$(instance_status)" 133 | sleep 30 134 | done 135 | fi 136 | 137 | infof 'DB instance %s created' "$identifier" 138 | describe_instance 139 | ;; 140 | describe) 141 | infof 'Describing instance %s' "$identifier" 142 | describe_instance 143 | ;; 144 | connect) 145 | infof 'Connecting to %s' "$identifier" 146 | instance_definition=$(describe_instance full) 147 | if (( 0 != $? )) 148 | then 149 | errorf 'Could not describe %s' "$identifier" 150 | fi 151 | exec mssql-cli -U spike_mssql -P spike_mssql -S "$(jq -r '.Endpoint.Address' <<<"$instance_definition")" 152 | ;; 153 | *) 154 | errorf "Unknown or no command passed" 155 | esac 156 | -------------------------------------------------------------------------------- /spikes/008-how-to-read-cdc-log/000-notes.md: -------------------------------------------------------------------------------- 1 | ``` 2 | master> create database "spike_tap_mssql" 3 | Commands completed successfully. 4 | Time: 1.431s (a second) 5 | master> exec msdb.dbo.rds_cdc_enable_db 'spike_tap_mssql' 6 | CDC enabled on database spike_tap_mssql 7 | Time: 0.558s 8 | master> use "spike_tap_mssql"; 9 | Commands completed successfully. 10 | Time: 0.257s 11 | spike_tap_mssql> create table foo (id int primary key); 12 | Commands completed successfully. 
13 | Time: 0.258s 14 | spike_tap_mssql> insert into foo (id) values (1), (2), (3); 15 | (3 rows affected) 16 | Time: 0.257s 17 | spike_tap_mssql> update foo set id = 4 where id = 2; 18 | (1 row affected) 19 | Time: 0.259s 20 | spike_tap_mssql> select * from foo; 21 | +------+ 22 | | id | 23 | |------| 24 | | 1 | 25 | | 3 | 26 | | 4 | 27 | +------+ 28 | (3 rows affected) 29 | Time: 0.365s 30 | spike_tap_mssql> exec sys.sp_cdc_enable_table @source_schema = 'dbo', @source_name = 'foo', @role_name = null; 31 | Job 'cdc.spike_tap_mssql_capture' started successfully. 32 | Time: 7.435s (7 seconds) 33 | ``` 34 | 35 | Schemas are locked once the CDC table is created. 36 | https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/about-change-data-capture-sql-server?view=sql-server-2017 37 | Could potentially use the DDL log to capture updates here. 38 | https://docs.microsoft.com/en-us/sql/relational-databases/system-stored-procedures/sys-sp-cdc-get-ddl-history-transact-sql?view=sql-server-2017 39 | 40 | We may need to support netchanges. 41 | https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/work-with-change-data-sql-server?view=sql-server-2017 42 | 43 | > The function cdc.fn_cdc_get_net_changes_ is generated 44 | > when the parameter @supports_net_changes is set to 1 when the source 45 | > table is enabled. 46 | > 47 | > Note: 48 | > 49 | > This option is only supported if the source table has a defined primary 50 | > key or if the parameter @index_name has been used to identify a unique 51 | > index. 52 | > 53 | > The netchanges function returns one change per modified source table 54 | > row. If more than one change is logged for the row during the specified 55 | > interval, the column values will reflect the final contents of the row. 
56 | > To correctly identify the operation that is necessary to update the 57 | > target environment, the TVF must consider both the initial operation on 58 | > the row during the interval and the final operation on the row. When the 59 | > row filter option 'all' is specified, the operations that are returned 60 | > by a net changes query will either be insert, delete, or update (new 61 | > values). This option always returns the update mask as null because 62 | > there is a cost associated with computing an aggregate mask. If you 63 | > require an aggregate mask that reflects all changes to a row, use the 64 | > 'all with mask' option. If downstream processing does not require 65 | > inserts and updates to be distinguished, use the 'all with merge' 66 | > option. In this case, the operation value will only take on two values: 67 | > 1 for delete and 5 for an operation that could be either an insert or an 68 | > update. This option eliminates the additional processing needed to 69 | > determine whether the derived operation should be an insert or an 70 | > update, and can improve the performance of the query when this 71 | > differentiation is not necessary. 
72 | 73 | ``` 74 | spike_tap_mssql> select * from cdc.fn_cdc_get_all_changes_dbo_foo(sys.fn_cdc_get_min_lsn ( 'dbo_foo' ), sys.fn_cdc_get_max_lsn (), N'all'); 75 | (0 rows affected) 76 | Time: 0.409s 77 | spike_tap_mssql> insert into foo (id) values (7), (8), (9); 78 | (3 rows affected) 79 | Time: 0.262s 80 | spike_tap_mssql> update foo set id = 2 where id = 4; 81 | (1 row affected) 82 | Time: 0.257s 83 | spike_tap_mssql> insert into foo (id) values (4), (5), (6); 84 | (3 rows affected) 85 | Time: 0.260s 86 | spike_tap_mssql> select * from cdc.fn_cdc_get_all_changes_dbo_foo(sys.fn_cdc_get_min_lsn ( 'dbo_foo' ), sys.fn_cdc_get_max_lsn (), N'all'); 87 | +------------------------+------------------------+----------------+------------------+------+ 88 | | __$start_lsn | __$seqval | __$operation | __$update_mask | id | 89 | |------------------------+------------------------+----------------+------------------+------| 90 | | 0x00000028000008570005 | 0x00000028000008570002 | 2 | 0x01 | 7 | 91 | | 0x00000028000008570005 | 0x00000028000008570003 | 2 | 0x01 | 8 | 92 | | 0x00000028000008570005 | 0x00000028000008570004 | 2 | 0x01 | 9 | 93 | | 0x00000028000008670006 | 0x00000028000008670002 | 1 | 0x01 | 4 | 94 | | 0x00000028000008670006 | 0x00000028000008670002 | 2 | 0x01 | 2 | 95 | +------------------------+------------------------+----------------+------------------+------+ 96 | (5 rows affected) 97 | Time: 0.366s 98 | spike_tap_mssql> select * from cdc.fn_cdc_get_all_changes_dbo_foo(sys.fn_cdc_get_min_lsn ( 'dbo_foo' ), sys.fn_cdc_get_max_lsn (), N'all'); 99 | +------------------------+------------------------+----------------+------------------+------+ 100 | | __$start_lsn | __$seqval | __$operation | __$update_mask | id | 101 | |------------------------+------------------------+----------------+------------------+------| 102 | | 0x00000028000008570005 | 0x00000028000008570002 | 2 | 0x01 | 7 | 103 | | 0x00000028000008570005 | 0x00000028000008570003 | 2 | 0x01 | 8 | 104 | | 
0x00000028000008570005 | 0x00000028000008570004 | 2 | 0x01 | 9 | 105 | | 0x00000028000008670006 | 0x00000028000008670002 | 1 | 0x01 | 4 | 106 | | 0x00000028000008670006 | 0x00000028000008670002 | 2 | 0x01 | 2 | 107 | | 0x000000280000086F0005 | 0x000000280000086F0002 | 2 | 0x01 | 4 | 108 | | 0x000000280000086F0005 | 0x000000280000086F0003 | 2 | 0x01 | 5 | 109 | | 0x000000280000086F0005 | 0x000000280000086F0004 | 2 | 0x01 | 6 | 110 | +------------------------+------------------------+----------------+------------------+------+ 111 | (8 rows affected) 112 | Time: 0.368s 113 | spike_tap_mssql> select * from cdc.fn_cdc_get_all_changes_dbo_foo(sys.fn_cdc_get_min_lsn ( 'dbo_foo' ), sys.fn_cdc_get_max_lsn (), N'all'); 114 | +------------------------+------------------------+----------------+------------------+------+ 115 | | __$start_lsn | __$seqval | __$operation | __$update_mask | id | 116 | |------------------------+------------------------+----------------+------------------+------| 117 | | 0x00000028000008570005 | 0x00000028000008570002 | 2 | 0x01 | 7 | 118 | | 0x00000028000008570005 | 0x00000028000008570003 | 2 | 0x01 | 8 | 119 | | 0x00000028000008570005 | 0x00000028000008570004 | 2 | 0x01 | 9 | 120 | | 0x00000028000008670006 | 0x00000028000008670002 | 1 | 0x01 | 4 | 121 | | 0x00000028000008670006 | 0x00000028000008670002 | 2 | 0x01 | 2 | 122 | | 0x000000280000086F0005 | 0x000000280000086F0002 | 2 | 0x01 | 4 | 123 | | 0x000000280000086F0005 | 0x000000280000086F0003 | 2 | 0x01 | 5 | 124 | | 0x000000280000086F0005 | 0x000000280000086F0004 | 2 | 0x01 | 6 | 125 | +------------------------+------------------------+----------------+------------------+------+ 126 | (8 rows affected) 127 | Time: 0.367s 128 | spike_tap_mssql> 129 | ``` 130 | -------------------------------------------------------------------------------- /spikes/008-how-to-read-cdc-log/tap-const/.gitignore: -------------------------------------------------------------------------------- 1 | .nrepl-port 2 | 
.lein-repl-history 3 | target/ 4 | -------------------------------------------------------------------------------- /spikes/008-how-to-read-cdc-log/tap-const/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tap-const "0.0.1-SNAPSHOT" 2 | ;; 1.9.0 is the max we can get without bumping CIDER and we can't bump 3 | ;; CIDER until we can bump Java everywhere. 4 | :dependencies [[org.clojure/clojure "1.9.0"] 5 | [org.clojure/data.json "0.2.6"] 6 | [org.clojure/java.jdbc "0.7.9"] 7 | [com.microsoft.sqlserver/mssql-jdbc "7.2.1.jre8"] 8 | [org.clojure/tools.nrepl "0.2.13" 9 | :exclusions [org.clojure/clojure]] 10 | [cider/cider-nrepl "0.17.0"]] 11 | :profiles {:system {:java-cmd "/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java"}}) 12 | -------------------------------------------------------------------------------- /spikes/008-how-to-read-cdc-log/tap-const/src/tap_const/core.clj: -------------------------------------------------------------------------------- 1 | (ns tap-const.core 2 | (:require [clojure.tools.nrepl.server :as nrepl-server] 3 | [clojure.java.jdbc :as jdbc]) 4 | (:gen-class)) 5 | 6 | (def db-spec {:dbtype "sqlserver" 7 | :dbname "spike_tap_mssql" 8 | :host "taps-tvisher1-test-mssql.cqaqbfvfo67k.us-east-1.rds.amazonaws.com" 9 | :user "spike_mssql" 10 | :password "spike_mssql"}) 11 | 12 | (comment 13 | (jdbc/query db-spec ["select * from foo"]) 14 | 15 | (jdbc/db-query-with-resultset 16 | db-spec 17 | "exec sys.sp_cdc_help_change_data_capture" 18 | (comp doall jdbc/result-set-seq)) 19 | ;; => ({:index_name "PK__foo__3213E83FA6198C88", 20 | ;; :index_column_list "[id]", 21 | ;; :object_id 1269579561, 22 | ;; :start_lsn [0, 0, 0, 36, 0, 0, 10, 24, 0, 66], 23 | ;; :source_table "foo", 24 | ;; :create_date #inst "2019-04-23T19:33:47.923000000-00:00", 25 | ;; :captured_column_list "[id]", 26 | ;; :role_name nil, 27 | ;; :supports_net_changes true, 28 | ;; :source_object_id 1237579447, 29 | ;; 
:has_drop_pending nil, 30 | ;; :end_lsn nil, 31 | ;; :filegroup_name nil, 32 | ;; :capture_instance "dbo_foo", 33 | ;; :source_schema "dbo"}) 34 | 35 | ;; https://docs.microsoft.com/en-us/sql/relational-databases/system-stored-procedures/sys-sp-cdc-help-change-data-capture-transact-sql?view=sql-server-2017#remarks 36 | ;; 37 | ;; Remarks 38 | ;; 39 | ;; When both source_schema and source_name default to NULL, or are 40 | ;; explicitly set the NULL, this stored procedure returns information 41 | ;; for all of the database capture instances that the caller has SELECT 42 | ;; access to. When source_schema and source_name are non-NULL, only 43 | ;; information on the specific named enabled table is returned. 44 | ;; 45 | ;; Could make that a decent way for us to get information about all the 46 | ;; tables we have CDC access for. 47 | 48 | (jdbc/db-query-with-resultset 49 | db-spec 50 | "exec sys.sp_cdc_help_change_data_capture @source_schema = 'dbo', @source_name = 'foo'" 51 | (comp doall jdbc/result-set-seq)) 52 | ;; => ({:index_name "PK__foo__3213E83FA6198C88", 53 | ;; :index_column_list "[id]", 54 | ;; :object_id 1269579561, 55 | ;; :start_lsn [0, 0, 0, 36, 0, 0, 10, 24, 0, 66], 56 | ;; :source_table "foo", 57 | ;; :create_date #inst "2019-04-23T19:33:47.923000000-00:00", 58 | ;; :captured_column_list "[id]", 59 | ;; :role_name nil, 60 | ;; :supports_net_changes true, 61 | ;; :source_object_id 1237579447, 62 | ;; :has_drop_pending nil, 63 | ;; :end_lsn nil, 64 | ;; :filegroup_name nil, 65 | ;; :capture_instance "dbo_foo", 66 | ;; :source_schema "dbo"}) 67 | 68 | (jdbc/query db-spec ["select sys.fn_cdc_get_min_lsn ( 'dbo_foo' ) as min_lsn"]) 69 | ;; => ({:min_lsn [0, 0, 0, 36, 0, 0, 10, 24, 0, 66]}) 70 | 71 | (jdbc/query db-spec ["select sys.fn_cdc_get_max_lsn () as max_lsn"]) 72 | ;; => ({:max_lsn [0, 0, 0, 40, 0, 0, 8, -13, 0, 1]}) 73 | 74 | (jdbc/query db-spec ["select * from cdc.fn_cdc_get_all_changes_dbo_foo(sys.fn_cdc_get_min_lsn ( 'dbo_foo' ), 
sys.fn_cdc_get_max_lsn (), 'all');"]) 75 | ;; => ({:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 87, 0, 5], 76 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 87, 0, 2], 77 | ;; :__$operation 2, 78 | ;; :__$update_mask [1], 79 | ;; :id 7} 80 | ;; {:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 87, 0, 5], 81 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 87, 0, 3], 82 | ;; :__$operation 2, 83 | ;; :__$update_mask [1], 84 | ;; :id 8} 85 | ;; {:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 87, 0, 5], 86 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 87, 0, 4], 87 | ;; :__$operation 2, 88 | ;; :__$update_mask [1], 89 | ;; :id 9} 90 | ;; {:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 103, 0, 6], 91 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 103, 0, 2], 92 | ;; :__$operation 1, 93 | ;; :__$update_mask [1], 94 | ;; :id 4} 95 | ;; {:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 103, 0, 6], 96 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 103, 0, 2], 97 | ;; :__$operation 2, 98 | ;; :__$update_mask [1], 99 | ;; :id 2} 100 | ;; {:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 111, 0, 5], 101 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 111, 0, 2], 102 | ;; :__$operation 2, 103 | ;; :__$update_mask [1], 104 | ;; :id 4} 105 | ;; {:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 111, 0, 5], 106 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 111, 0, 3], 107 | ;; :__$operation 2, 108 | ;; :__$update_mask [1], 109 | ;; :id 5} 110 | ;; {:__$start_lsn [0, 0, 0, 40, 0, 0, 8, 111, 0, 5], 111 | ;; :__$seqval [0, 0, 0, 40, 0, 0, 8, 111, 0, 4], 112 | ;; :__$operation 2, 113 | ;; :__$update_mask [1], 114 | ;; :id 6}) 115 | 116 | ) 117 | 118 | 119 | 120 | (defn nrepl-handler 121 | [] 122 | (require 'cider.nrepl) 123 | (ns-resolve 'cider.nrepl 'cider-nrepl-handler)) 124 | 125 | (defonce the-nrepl-server 126 | (nrepl-server/start-server :bind "127.0.0.1" 127 | :handler (nrepl-handler))) 128 | 129 | (defn log-infof 130 | [message-format & args] 131 | (binding [*out* *err*] 132 | (println (apply format 133 | (str "INFO " message-format) 134 | args)))) 135 | 136 | (defn -main 137 | [& args] 138 | (log-infof 
# Succeed (exit 0) when the RDS instance is not yet in the "available"
# state. A failed describe call (e.g. the instance does not exist yet)
# must only warn: the `create` path relies on this function reporting
# "not online" for a missing instance, so aborting here (the old errorf)
# made `create` impossible for a brand-new instance. This matches the
# spike-011 copy of this script, which already uses warnf.
test_db_not_online() {
    db_instance_status=$(aws rds describe-db-instances --db-instance-identifier "${identifier}" \
                             | jq -r '.DBInstances[].DBInstanceStatus')
    if (( $? != 0 ))
    then
        warnf 'Failed to describe %s' "$identifier"
    fi
    # Empty status (describe failed) also compares as "not available".
    [[ $db_instance_status != available ]]
}
aws --region us-east-1 \ 113 | rds create-db-instance \ 114 | --vpc-security-group-ids sg-089b5962b0c44592f \ 115 | --db-instance-identifier "$identifier" \ 116 | --db-instance-class db.m4.large \ 117 | --engine sqlserver-se \ 118 | --allocated-storage 100 \ 119 | --master-username spike_mssql \ 120 | --master-user-password spike_mssql \ 121 | --backup-retention-period 0 \ 122 | --no-auto-minor-version-upgrade \ 123 | --license-model license-included 124 | then 125 | errorf "Failed to initiate creation of %s" "$identifier" 126 | fi 127 | fi 128 | while test_db_not_online 129 | do 130 | infof "Waiting for db instance to come up for %s seconds (current status: %s)" \ 131 | "$SECONDS" \ 132 | "$(instance_status)" 133 | sleep 30 134 | done 135 | fi 136 | 137 | infof 'DB instance %s created' "$identifier" 138 | describe_instance 139 | ;; 140 | describe) 141 | infof 'Describing instance %s' "$identifier" 142 | describe_instance 143 | ;; 144 | connect) 145 | infof 'Connecting to %s' "$identifier" 146 | instance_definition=$(describe_instance full) 147 | if (( 0 != $? )) 148 | then 149 | errorf 'Could not describe %s' "$identifier" 150 | fi 151 | exec mssql-cli -U spike_mssql -P spike_mssql -S "$(jq -r '.Endpoint.Address' <<<"$instance_definition")" 152 | ;; 153 | *) 154 | errorf "Unknown or no command passed" 155 | esac 156 | -------------------------------------------------------------------------------- /spikes/009-lockdown-config/000-notes.md: -------------------------------------------------------------------------------- 1 | The config values should probably be basically a union between tap-mysql/tap-postgresql 2 | and db reps mssql. 
3 | 4 | ``` 5 | +------------------------------+----------------------+-------------+---------------+-----------------+-------------+---------------------------------------------------------------------+---------------+---------+ 6 | | name | environment_variable | is_required | setup_step_id | system_provided | tap_mutable | json_schema | property_type | ordinal | 7 | +------------------------------+----------------------+-------------+---------------+-----------------+-------------+---------------------------------------------------------------------+---------------+---------+ 8 | | host | NULL | 1 | 47 | 0 | 0 | {"type":"string","anyOf":[{"format":"hostname"},{"format":"ipv4"}]} | user_provided | NULL | 9 | | port | NULL | 1 | 47 | 0 | 0 | {"type":"string","pattern":"^\\d+"} | user_provided | NULL | 10 | | user | NULL | 1 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 11 | | password | NULL | 1 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 12 | | database | NULL | 0 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 13 | | image_version | NULL | 1 | 47 | 1 | 0 | NULL | read_only | NULL | 14 | | frequency_in_minutes | NULL | 0 | 47 | 0 | 0 | {"type": "string", "pattern": "^1$|^30$|^60$|^360$|^720$|^1440$"} | user_provided | NULL | 15 | | anchor_time | NULL | 0 | 47 | 0 | 0 | {"type": "string", "format": "date-time"} | user_provided | NULL | 16 | | cron_expression | NULL | 0 | 47 | 0 | 0 | NULL | user_provided | NULL | 17 | 18 | # Introduce ssh stuff as late as possible 19 | # | ssh | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^(true|false)"} | user_provided | NULL | 20 | # | ssh_host | NULL | 0 | 47 | 0 | 0 | {"type":"string","anyOf":[{"format":"hostname"},{"format":"ipv4"}]} | user_provided | NULL | 21 | # | ssh_port | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^\\d+"} | user_provided | NULL | 22 | # | ssh_user | NULL | 0 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 23 | # | ssl | NULL | 0 | 47 | 0 | 0 
| {"type":"string","pattern":"^(true|false)"} | user_provided | NULL | 24 | 25 | # Looks to be used between mysql and postgres so maybe useful here? Should introduce later though. 26 | # | filter_dbs | NULL | 0 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 27 | # Looks to be unused anywhere 28 | # | use_log_based_replication | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^(true|false)$"} | user_provided | NULL | 29 | # Looks to be specific to mysql 30 | # | server_id | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^\\d+$"} | user_provided | NULL | 31 | # Introduce the following three as late as possible 32 | # | ssl_cert | NULL | 0 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 33 | # | ssl_key | NULL | 0 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 34 | # | ssl_ca | NULL | 0 | 47 | 0 | 0 | {"type":"string"} | user_provided | NULL | 35 | 36 | # Looks to be specific to mysql 37 | # | check_hostname | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^(true|false)"} | user_provided | NULL | 38 | # Looks to be specific to mysql 39 | # | verify_mode | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^(true|false)"} | user_provided | NULL | 40 | # Doesn't appear to be used in the tap but it's in the docs? 
41 | # | ssl_client_auth_enabled | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^(true|false)"} | user_provided | NULL | 42 | # Unused 43 | # | allow_non_auto_increment_pks | NULL | 0 | 47 | 0 | 0 | {"type":"string","pattern":"^(true|false)$"} | user_provided | NULL | 44 | +------------------------------+----------------------+-------------+---------------+-----------------+-------------+---------------------------------------------------------------------+---------------+---------+ 45 | ``` 46 | -------------------------------------------------------------------------------- /spikes/010-permissions/000-notes.md: -------------------------------------------------------------------------------- 1 | This spike is for establishing a decent minimal set of permissions for how 2 | to do discovery and syncing for mssql. 3 | 4 | From the stitch docs: 5 | 6 | > To set up MSSQL in Stitch, you need: 7 | > 8 | > **Permissions in MSSQL that allow you to create/manage users.** This is 9 | > required to create the Stitch database user. 10 | > 11 | > A server that: 12 | > 13 | > - Uses case-insensitive collation. More info about collation can be 14 | > found here in Microsoft’s documentation. 15 | > - Allows connections over TCP/IP 16 | > - Allows mixed mode authentication 17 | > 18 | > **Make sure your server is set up properly before continuing.** If you 19 | > need some help figuring out your hosting details, we recommend looping 20 | > in a member of your engineering team. 
21 | -------------------------------------------------------------------------------- /spikes/011-initial-full-table-to-cdc-transition-strategy/000-notes.md: -------------------------------------------------------------------------------- 1 | Spike 011: Initial Full Table to CDC Transition Strategy 2 | ======================================================== 3 | 4 | Should be informed by 5 | [tap-postgres's](https://github.com/singer-io/tap-postgres) and 6 | [tap-mysql's](https://github.com/singer-io/tap-mysql) strategy. 7 | 8 | Need to be sure to send an 9 | [initial](https://github.com/singer-io/tap-postgres/blob/390fc1148ff70dff40509992ac78c112363cf323/tap_postgres/sync_strategies/full_table.py#L95) 10 | [activate version message](https://github.com/singer-io/tap-mysql/blob/5b466c2a4dc0d81a6cf66d1a0c740237cc6212b0/tap_mysql/sync_strategies/full_table.py#L205-L208) 11 | on the initial full table so data trickles in. 12 | 13 | Essentially, `tap-postgres` and `tap-mysql` save or elide some state to 14 | indicate that they're doing their initial sync or their initial sync is 15 | completed. I think it would be slightly more elegant in the Clojure world 16 | to always 'do' the full table sync, but based on the current state emit an 17 | empty sequence of records since we're actually done. 18 | 19 | In the case of `tap-postgres`, the state in question is `xmin`. During the 20 | full table sync it's 21 | [written to at intervals until the sync finishes](https://github.com/singer-io/tap-postgres/blob/390fc1148ff70dff40509992ac78c112363cf323/tap_postgres/sync_strategies/full_table.py#L138-L148). 22 | If the sync is interrupted it is 23 | [used to indicate that the sync should resume](https://github.com/singer-io/tap-postgres/blob/390fc1148ff70dff40509992ac78c112363cf323/tap_postgres/sync_strategies/full_table.py#L118). 
24 | Once the sync has completed 25 | [it's cleared](https://github.com/singer-io/tap-postgres/blob/390fc1148ff70dff40509992ac78c112363cf323/tap_postgres/__init__.py#L570). 26 | 27 | In the case of `tap-mysql`, the state in question is a conglomeration of 28 | [`log_file`, `log_pos`, `max_pk_values`, and `last_pk_fetched`](https://github.com/singer-io/tap-mysql/blob/5b466c2a4dc0d81a6cf66d1a0c740237cc6212b0/tap_mysql/__init__.py#L347-L367). 29 | If `log_file` and `log_pos` are present and `max_pk_values` and 30 | `last_pk_fetched` are not then the initial sync is done. This is managed 31 | by 32 | [`do_sync_historical_binlog`](https://github.com/singer-io/tap-mysql/blob/5b466c2a4dc0d81a6cf66d1a0c740237cc6212b0/tap_mysql/__init__.py#L528-L599). 33 | 34 | The goal of the code here is to have a unit test essentiall that shows 35 | state mutating appropriately. No matter how we solve this problem it will 36 | have something to do with state as that's all we get passed in between 37 | runs. 38 | -------------------------------------------------------------------------------- /spikes/011-initial-full-table-to-cdc-transition-strategy/tap-const/.gitignore: -------------------------------------------------------------------------------- 1 | .nrepl-port 2 | .lein-repl-history 3 | target/ 4 | -------------------------------------------------------------------------------- /spikes/011-initial-full-table-to-cdc-transition-strategy/tap-const/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tap-const "0.0.1-SNAPSHOT" 2 | ;; 1.9.0 is the max we can get without bumping CIDER and we can't bump 3 | ;; CIDER until we can bump Java everywhere. 
4 | :dependencies [[org.clojure/clojure "1.9.0"] 5 | [org.clojure/data.json "0.2.6"] 6 | [org.clojure/java.jdbc "0.7.9"] 7 | [com.microsoft.sqlserver/mssql-jdbc "7.2.1.jre8"] 8 | [org.clojure/tools.nrepl "0.2.13" 9 | :exclusions [org.clojure/clojure]] 10 | [cider/cider-nrepl "0.17.0"]] 11 | :profiles {:system {:java-cmd "/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java"}}) 12 | -------------------------------------------------------------------------------- /spikes/011-initial-full-table-to-cdc-transition-strategy/test-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o pipefail 4 | 5 | logf() { 6 | printf "$1\n" "${@:2}" >&2 7 | } 8 | 9 | warnf() { 10 | logf "WARN: ${1}" "${@:2}" 11 | } 12 | 13 | infof() { 14 | logf "INFO: ${1}" "${@:2}" 15 | } 16 | 17 | errorf() { 18 | logf "ERROR: ${1}" "${@:2}" 19 | exit 1 20 | } 21 | 22 | while : 23 | do 24 | case $1 in 25 | destroy) 26 | command=destroy 27 | ;; 28 | create) 29 | command=create 30 | ;; 31 | describe) 32 | command=describe 33 | ;; 34 | connect) 35 | command=connect 36 | ;; 37 | -l) 38 | long_output=true 39 | ;; 40 | -?*|--?*=?*) 41 | warnf 'Unknown option (ignored): %s\n' "$1" 42 | ;; 43 | *) 44 | break 45 | esac 46 | shift 47 | done 48 | 49 | identifier="$(hostname)-test-mssql" 50 | 51 | describe_instance() { 52 | local jq_filter 53 | jq_filter='.DBInstances[] 54 | | {address: .Endpoint.Address, 55 | identifier: .DBInstanceIdentifier, 56 | status: .DBInstanceStatus, 57 | master_username: .MasterUsername, 58 | master_password: .MasterUsername}' 59 | if [[ $long_output == true ]] || [[ $1 == full ]] 60 | then 61 | jq_filter='.DBInstances[]' 62 | fi 63 | aws rds describe-db-instances --db-instance-identifier "${identifier}" \ 64 | | jq "$jq_filter" 65 | } 66 | 67 | test_db_not_online() { 68 | db_instance_status=$(aws rds describe-db-instances --db-instance-identifier "${identifier}" \ 69 | | jq -r '.DBInstances[].DBInstanceStatus') 70 | if (( 
$? != 0 )) 71 | then 72 | warnf 'Failed to describe %s' "$identifier" 73 | fi 74 | [[ $db_instance_status != available ]] 75 | } 76 | 77 | instance_status() { 78 | describe_instance full | jq -r '.DBInstanceStatus' 79 | } 80 | 81 | case $command in 82 | destroy) 83 | if describe_instance >/dev/null 84 | then 85 | infof 'Destroying %s' "$identifier" 86 | if [[ $(instance_status) != deleting ]] 87 | then 88 | if ! aws --region us-east-1 \ 89 | rds delete-db-instance \ 90 | --db-instance-identifier "$identifier" \ 91 | --skip-final-snapshot 92 | then 93 | errorf "Failed to initiate destruction of %s" "$identifier" 94 | fi 95 | fi 96 | while describe_instance >/dev/null 97 | do 98 | infof "Waiting for %s to be destroyed for %s seconds (current status: %s)" \ 99 | "$identifier" "$SECONDS" "$(instance_status)" 100 | sleep 30 101 | done 102 | else 103 | infof '%s is already offline' "$identifier" 104 | fi 105 | ;; 106 | create) 107 | if test_db_not_online 108 | then 109 | if ! describe_instance >/dev/null 110 | then 111 | logf "Creating db instance %s" "$identifier" 112 | if ! 
(ns tap-mssql.config
  (:require [tap-mssql.utils :refer [try-read-only]]
            [clojure.tools.logging :as log]
            [clojure.java.jdbc :as jdbc]))

(defn check-connection
  "Verifies that the instance described by conn-map is reachable by
  listing its catalogs. Returns conn-map unchanged so this composes
  inline; any JDBC exception propagates to the caller."
  [conn-map]
  ;; A defn body is an implicit do, so the previous explicit `do`
  ;; wrapper was redundant and has been removed.
  (jdbc/with-db-metadata [md conn-map]
    (jdbc/metadata-result (.getCatalogs md)))
  (log/info "Successfully connected to the instance")
  conn-map)

(defn ->conn-map*
  "Builds a clojure.java.jdbc connection map from the tap's string-keyed
  config map. When is-readonly? is true the connection is validated via
  try-read-only, otherwise validated directly. Returns the conn map on
  success (check-connection logs and passes it through)."
  ([config]
   (->conn-map* config false))
  ([config is-readonly?]
   (let [conn-map (cond-> {:dbtype "sqlserver"
                           :dbname (or (config "database") "") ;; database is optional - if omitted it is set to an empty string
                           :host (config "host")
                           :port (or (config "port") 0) ;; port is optional - if omitted it is set to 0 for a dynamic port
                           :password (config "password")
                           :user (config "user")}

                    (= "true" (config "ssl"))
                    ;; TODO: The only way I can get a test failure is by
                    ;; changing the code to say ":trustServerCertificate
                    ;; false". In which case, truststores need to be
                    ;; specified. This is for the "correct" way of doing
                    ;; things, where we are validating SSL, but for now,
                    ;; leaving the certificate unverified should work.
                    (assoc ;; Based on the [docs][1], we believe that
                     ;; setting `authentication` to anything but
                     ;; `NotSpecified` (the default) activates SSL
                     ;; for the connection and have verified that by
                     ;; setting `trustServerCertificate` to `false`
                     ;; with `authentication` set to `SqlPassword`
                     ;; and observing SSL handshake errors. Because
                     ;; of this, we don't believe it's necessary to
                     ;; set `encrypt` to `true` as it used to be
                     ;; prior to Driver version 6.0.
                     ;;
                     ;; [1]: https://docs.microsoft.com/en-us/sql/connect/jdbc/setting-the-connection-properties?view=sql-server-2017
                     :authentication "SqlPassword"
                     :trustServerCertificate false))]
     ;; returns conn-map and logs on successful connection
     (if is-readonly?
       (try-read-only [test-conn conn-map]
         (check-connection test-conn))
       (check-connection conn-map)))))

;; Memoized so repeated lookups for the same config reuse the already
;; validated connection map instead of re-checking the connection.
(def ->conn-map (memoize ->conn-map*))
(defn valid-primary-keys?
  "Returns true when none of the stream's primary-key columns are marked
  \"unsupported\" in the catalog metadata; otherwise throws ex-info
  naming the offending column(s)."
  [catalog stream-name]
  (let [stream-metadata (get-in catalog ["streams" stream-name "metadata"])
        primary-keys    (stream-metadata "table-key-properties")
        unsupported     (filter (fn [pk]
                                  (= "unsupported"
                                     (get-in stream-metadata
                                             ["properties" pk "inclusion"])))
                                primary-keys)]
    (when (seq unsupported)
      (throw (ex-info (format "Stream %s has unsupported primary key(s): %s"
                              stream-name
                              (string/join ", " unsupported))
                      {})))
    true))
(defn -main
  "Tap entry point. Starts an nrepl server for live debugging, parses the
  CLI options, then dispatches to discovery (--discover) or sync
  (--catalog). Exits 0 on success, 1 from the finally block on failure,
  unless --repl was passed (in which case the process stays up)."
  [& args]
  (let [the-nrepl-server (start-nrepl-server args)]
    ;; This and the other defs here are not accidental. These are required
    ;; to be able to easily debug a running process that you didn't already
    ;; intend to repl into.
    (def args args)
    (try
      (let [{{:keys [discover repl config catalog state]} :options}
            (parse-opts args)]
        (set-include-db-and-schema-names-in-messages! config)
        (cond
          discover
          (do-discovery config)

          catalog
          (do-sync config catalog state)

          :else
          ;; FIXME: (show-help)?
          nil)
        (log/info "Tap Finished")
        (maybe-stop-nrepl-server args the-nrepl-server)
        ;; NOTE(review): System/exit does not return, so on this success
        ;; path the finally block's exit-with-1 should never be reached
        ;; — confirm before relying on the exit codes.
        (when (not (repl-arg-passed? args))
          (System/exit 0)))

      (catch Throwable ex
        ;; NOTE(review): "Occured" is misspelled but left as-is; this is a
        ;; runtime log string that downstream consumers may match exactly.
        (singer-log/log-fatal "Fatal Error Occured" ex))
      (finally
        ;; If we somehow skip the catch block, we need to always at least exit if not --repl
        (maybe-stop-nrepl-server args the-nrepl-server)
        (when (not (repl-arg-passed? args))
          (System/exit 1))))))
(defn deserialize-streams
  "Deserializes every stream in the serialized catalog and indexes the
  results into a map keyed by the stream's tap_stream_id."
  [serialized-streams]
  (into {}
        (map (fn [serialized]
               (let [stream (deserialize-stream serialized)]
                 [(stream "tap_stream_id") stream])))
        serialized-streams))
(defn selected-field?
  "Truthy when a [field-name field-metadata] map entry should be synced:
  the user explicitly selected it, the field is automatically included,
  or it is selected-by-default and the user expressed no preference."
  [[field-name field-metadata]]
  (let [explicitly-selected (field-metadata "selected")
        automatic?          (= "automatic" (field-metadata "inclusion"))
        defaulted?          (and (field-metadata "selected-by-default")
                                 (not (contains? field-metadata "selected")))]
    (or explicitly-selected automatic? defaulted?)))
                                metadata-properties)
        selected-field-names (map (comp name first) selected-fields)]
    selected-field-names))
--------------------------------------------------------------------------------
/src/tap_mssql/singer/messages.clj:
--------------------------------------------------------------------------------
(ns tap-mssql.singer.messages
  (:require [tap-mssql.singer.schema :as singer-schema]
            [tap-mssql.singer.transform :as singer-transform]
            [clojure.data.json :as json]))

(defn now []
  ;; To redef in tests
  (System/currentTimeMillis))

(defn valid?
  "Validates a Singer message map: type must be one of SCHEMA / STATE /
  RECORD / ACTIVATE_VERSION and the type-specific payload key must be
  present (truthy)."
  [message]
  (and (#{"SCHEMA" "STATE" "RECORD" "ACTIVATE_VERSION"} (message "type"))
       (case (message "type")
         "SCHEMA"
         (message "schema")

         "STATE"
         (message "value")

         "RECORD"
         (message "record")

         "ACTIVE_VERSION"
         (message "version"))))

;; ISO8601-style formatter with microsecond precision and a zone offset.
(def df (-> (java.time.format.DateTimeFormatterBuilder.)
            (.appendPattern "yyyy-MM-dd'T'HH:mm:ss.SSSSSSX")
            (.toFormatter)))

(defn- parse-timestamp-to-string [ts]
  ;; Formats a java.sql.Timestamp as an ISO8601 UTC string, trimming
  ;; trailing zero sub-second digits (see inline notes).
  (-> ts
      (.toLocalDateTime)
      (.atOffset java.time.ZoneOffset/UTC)
      (.format df)
      (.replace "000Z" "Z") ;; replacing microseconds because dates are saved as bookmarks and mssql does not support string datetimes being more precise than the column type
      (.replace ".000Z" "Z") ;; same with milliseconds
      ))

;; date - 0001-01-01 through 9999-12-31
;; datetime - 1753-01-01 through 9999-12-31 and 00:00:00 through 23:59:59.997 and no TZ
;; datetime2 - 0001-01-01 through 9999-12-31 and 00:00:00 through 23:59:59.9999999 and no TZ
;; datetimeoffset - 0001-01-01 through 9999-12-31 and 00:00:00 through 23:59:59.9999999 and -14:00 through +14:00
;; smalldatetime - 1900-01-01 through 2079-06-06 and 00:00:00 through 23:59:59 and no TZ
;; time - 00:00:00.0000000 through 23:59:59.9999999

;; 1) The tap should always write ISO8601 dates
;;
;; 2) In the absence of time, we should add 00:00:00
;; 3) In the absence of a TZ, we should add Z (assume UTC)
;; 4) In the presence of a TZ, we should emit with the appropriate +/- 00:00
;; 5) In the absence of a date, we should emit the time in the format HH:mm:ss.ffffffff
(defn serialize-datetimes
  "json/write-str :value-fn — converts SQL Server date/time JDBC values to
  ISO8601-style strings; any other value passes through unchanged."
  [k v]
  (condp contains? (type v)
    #{java.sql.Timestamp} ;; Java type for datetime, datetime2, and smalldatetime column types
    (parse-timestamp-to-string v)

    #{microsoft.sql.DateTimeOffset} ;; Java type for datetimeoffset columns
    (-> v
        (.getTimestamp)
        (parse-timestamp-to-string))

    #{java.sql.Time java.sql.Date} ;; Java type for Time/Date columns respectively
    (.toString v)

    v))

;; When true, stream names in emitted messages use the fully-qualified
;; tap_stream_id instead of the bare table name.
(def include-db-and-schema-names-in-messages? (atom false))

(defn calculate-destination-stream-name
  "Picks the stream name emitted in messages: the tap_stream_id when
  include-db-and-schema-names-in-messages? is set, else the table name."
  [stream-name catalog]
  (if @include-db-and-schema-names-in-messages?
    (get-in catalog ["streams" stream-name "tap_stream_id"])
    (get-in catalog ["streams" stream-name "table_name"])))

(defn write!
  "Serializes a valid Singer message to JSON and prints it to stdout."
  [message]
  {:pre [(valid? message)]}
  (-> message
      (json/write-str :value-fn serialize-datetimes)
      println))

(defn write-schema!
  "Builds and writes the SCHEMA message for a stream."
  [catalog stream-name]
  ;; TODO: Make sure that unsupported values are written with an empty schema
  (-> {"type" "SCHEMA"
       "stream" (calculate-destination-stream-name stream-name catalog)
       ;; views use view-key-properties (defaulting to []), tables use
       ;; table-key-properties
       "key_properties" (if (get-in catalog ["streams" stream-name "metadata" "is-view"])
                          (get-in catalog ["streams" stream-name "metadata" "view-key-properties"] [])
                          (get-in catalog ["streams" stream-name "metadata" "table-key-properties"]))
       "schema" (get-in catalog ["streams" stream-name "schema"])}
      (singer-schema/maybe-add-bookmark-properties-to-schema catalog stream-name)
      (singer-schema/maybe-add-deleted-at-to-schema catalog stream-name)
      (singer-schema/make-unsupported-schemas-empty catalog stream-name)
      write!))

(defn write-state!
  "Writes a STATE message carrying `state` and returns `state`."
  [stream-name state]
  (write! {"type" "STATE"
           "stream" stream-name
           "value" state})
  ;; This is very important. This function needs to return state so that
  ;; the outer reduce can pass it in to the next iteration.
  state)

(defn write-record!
  "Transforms `record` per the catalog metadata and emits a RECORD message,
  including the stream version when one is bookmarked in state."
  [stream-name state record catalog]
  (let [transformed-record (singer-transform/transform catalog stream-name record)
        record-message {"type" "RECORD"
                        "stream" (calculate-destination-stream-name stream-name catalog)
                        "record" transformed-record}
        version (get-in state ["bookmarks" stream-name "version"])]
    (if (nil? version)
      (write! record-message)
      (write! (assoc record-message "version" version)))))

(defn write-activate-version!
  "Emits an ACTIVATE_VERSION message with the stream's bookmarked version
  and returns `state`."
  [stream-name catalog state]
  (write! {"type" "ACTIVATE_VERSION"
           "stream" (calculate-destination-stream-name stream-name catalog)
           "version" (get-in state
                             ["bookmarks" stream-name "version"])})
  ;; This must return state, as it appears in the pipeline of a sync
  state)

(defn maybe-write-activate-version!
124 | "Writes activate version message if not in state" 125 | [stream-name replication-method catalog state] 126 | (let [version-bookmark (get-in state ["bookmarks" stream-name "version"]) 127 | resuming? (get-in state ["bookmarks" 128 | stream-name "last_pk_fetched"] nil) 129 | new-state (condp contains? replication-method 130 | #{"FULL_TABLE"} 131 | (if resuming? 132 | state 133 | (assoc-in state 134 | ["bookmarks" stream-name "version"] 135 | (now))) 136 | 137 | #{"INCREMENTAL" "LOG_BASED"} 138 | (if version-bookmark 139 | state 140 | (assoc-in state 141 | ["bookmarks" stream-name "version"] 142 | (now))) 143 | 144 | (throw (IllegalArgumentException. (format "Replication Method for stream %s is invalid: %s" 145 | stream-name 146 | replication-method))))] 147 | ;; Write an activate_version message when we havent and its full table 148 | (when (and (nil? version-bookmark) 149 | (contains? #{"FULL_TABLE" "LOG_BASED"} replication-method)) 150 | (write-activate-version! stream-name catalog new-state)) 151 | new-state)) 152 | 153 | (def records-since-last-state (atom 0)) 154 | 155 | (def record-buffer-size 100) 156 | 157 | (defn write-state-buffered! [stream-name state] 158 | (swap! records-since-last-state inc) 159 | (if (> @records-since-last-state record-buffer-size) 160 | (do 161 | (reset! records-since-last-state 0) 162 | (write-state! 
       stream-name state))
    state))
--------------------------------------------------------------------------------
/src/tap_mssql/singer/parse.clj:
--------------------------------------------------------------------------------
(ns tap-mssql.singer.parse
  (:require [clojure.data.json :as json]
            [clojure.java.io :as io]))

(defn slurp-json
  "Reads and parses a JSON file (anything io/reader accepts)."
  [f]
  (-> f
      io/reader
      json/read))

(defn config
  "This function exists as a test seam"
  [config-file]
  (slurp-json config-file))


(defn state
  "This function exists as a test seam and for the post condition"
  [state-file]
  {:post [(map? %)]}
  (slurp-json state-file))

(defn catalog
  "This function exists as a test seam"
  [catalog-file]
  (slurp-json catalog-file))
--------------------------------------------------------------------------------
/src/tap_mssql/singer/schema.clj:
--------------------------------------------------------------------------------
(ns tap-mssql.singer.schema)

(defn make-unsupported-schemas-empty
  "Replaces the schema of every property marked inclusion=unsupported in
  the catalog metadata with an empty schema {} in the SCHEMA message."
  [schema-message catalog stream-name]
  (let [schema-keys (get-in catalog ["streams" stream-name "metadata" "properties"])
        unsupported-keys (map first (filter #(= "unsupported" ((second %) "inclusion"))
                                            (seq schema-keys)))]
    (reduce (fn [msg x] (assoc-in msg ["schema" "properties" x] {}))
            schema-message
            unsupported-keys)))

(defn maybe-add-deleted-at-to-schema
  "For LOG_BASED streams, adds the _sdc_deleted_at date-time property to
  the schema; otherwise returns the message unchanged."
  [schema-message catalog stream-name]
  (if (= "LOG_BASED" (get-in catalog ["streams" stream-name "metadata" "replication-method"]))
    (assoc-in schema-message ["schema" "properties" "_sdc_deleted_at"] {"type" ["string" "null"]
                                                                        "format" "date-time"})
    schema-message))

(defn maybe-add-bookmark-properties-to-schema
  "Adds bookmark_properties (the stream's replication key) to the SCHEMA
  message when a replication key is configured."
  [schema-message catalog stream-name]
  ;; Add or don't and return message
  (let [replication-key (get-in catalog ["streams"
                                         stream-name
                                         "metadata"
"replication-key"])] 23 | (if replication-key 24 | (assoc schema-message "bookmark_properties" [replication-key]) 25 | schema-message))) 26 | -------------------------------------------------------------------------------- /src/tap_mssql/singer/transform.clj: -------------------------------------------------------------------------------- 1 | (ns tap-mssql.singer.transform 2 | (:require [clojure.string :as string])) 3 | 4 | (defn transform-binary [binary] 5 | (when binary 6 | (apply str "0x" (map (comp string/upper-case 7 | (partial format "%02x")) 8 | binary)))) 9 | 10 | (defn transform-date [^java.sql.Date date] 11 | (when date 12 | (str date "T00:00:00+00:00"))) 13 | 14 | (defn transform-field [catalog stream-name [k v]] 15 | (condp contains? (get-in catalog ["streams" stream-name "metadata" "properties" k "sql-datatype"]) 16 | #{"timestamp" "varbinary" "binary"} 17 | [k (transform-binary v)] 18 | 19 | #{"date"} 20 | [k (transform-date v)] 21 | 22 | [k v])) 23 | 24 | (defn transform [catalog stream-name record] 25 | (into {} (map (partial transform-field catalog stream-name) record))) 26 | -------------------------------------------------------------------------------- /src/tap_mssql/sync_strategies/common.clj: -------------------------------------------------------------------------------- 1 | (ns tap-mssql.sync-strategies.common 2 | (:require [clojure.string :as string]) 3 | (:import [com.microsoft.sqlserver.jdbc SQLServerResultSet])) 4 | 5 | (def result-set-opts {:raw? true 6 | ;; Using SQLServerResultSet/TYPE_SS_SERVER_CURSOR_FORWARD_ONLY raises: 7 | ;; com.microsoft.sqlserver.jdbc.TDSParser throwUnexpectedTokenException 8 | :result-type SQLServerResultSet/TYPE_FORWARD_ONLY 9 | :concurrency SQLServerResultSet/CONCUR_READ_ONLY}) 10 | 11 | ;; Square brackets or quotes can be used interchangeably to sanitize names. Square brackets 12 | ;; are used by SSMS so we are using the same pattern. 
(defn sanitize-names
  "Used for escaping column or table names that contain special characters or reserved words"
  [table-name]
  ;; escape any closing bracket by doubling it, then wrap in [ ]
  (format "[%s]" (-> table-name
                     (string/replace "]" "]]"))))
--------------------------------------------------------------------------------
/src/tap_mssql/sync_strategies/incremental.clj:
--------------------------------------------------------------------------------
(ns tap-mssql.sync-strategies.incremental
  (:require [tap-mssql.config :as config]
            [tap-mssql.utils :refer [try-read-only]]
            [tap-mssql.singer.fields :as singer-fields]
            [tap-mssql.singer.bookmarks :as singer-bookmarks]
            [tap-mssql.singer.messages :as singer-messages]
            [tap-mssql.sync-strategies.common :as common]
            [clojure.tools.logging :as log]
            [clojure.string :as string]
            [clojure.java.jdbc :as jdbc]))

(defn build-incremental-sync-query
  "Builds a parameterized SELECT for an incremental sync: the selected
  columns, ordered by the replication key, with a `replication-key >= ?`
  clause bound to the bookmarked value when one exists and the bookmarked
  key still matches the catalog's replication key."
  [stream-name schema-name table-name record-keys replication-key state]
  {:pre [(not (empty? record-keys))]} ;; Is there more incoming state that we think is worth asserting?
  (let [replication-key-name (get-in state ["bookmarks" stream-name "replication_key_name"])
        replication-key-value (get-in state ["bookmarks" stream-name "replication_key_value"])
        bookmarking-clause (format "%s >= ?" (common/sanitize-names replication-key))
        add-where-clause? (and (some? replication-key-value)
                               (= replication-key replication-key-name)) ;; if the replication-key in metadata changes, we negate our bookmark
        where-clause (when add-where-clause?
                       (str " WHERE " bookmarking-clause))
        order-by (str " ORDER BY " (common/sanitize-names replication-key))
        sql-params [(str (format "SELECT %s FROM %s.%s"
                                 (string/join ", " (map common/sanitize-names record-keys))
                                 (common/sanitize-names schema-name)
                                 (common/sanitize-names table-name))
                         where-clause
                         order-by)]]
    (if add-where-clause?
      ;; append the bookmark value as the bound parameter for the WHERE clause
      (concat sql-params
              [replication-key-value])
      sql-params)))

(defn sync-and-write-messages!
  "Syncs all records, states, returns the latest state. Ensures that the
  bookmark we have for this stream matches our understanding of the fields
  defined in the catalog that are bookmark-able."
  [config catalog stream-name state]
  (let [dbname (get-in catalog ["streams" stream-name "metadata" "database-name"])
        record-keys (singer-fields/get-selected-fields catalog stream-name)
        table-name (get-in catalog ["streams" stream-name "table_name"])
        schema-name (get-in catalog ["streams" stream-name "metadata" "schema-name"])
        replication-key (get-in catalog ["streams" stream-name "metadata" "replication-key"])
        sql-params (build-incremental-sync-query stream-name
                                                 schema-name
                                                 table-name
                                                 record-keys
                                                 replication-key
                                                 state)]
    (log/infof "Executing query: %s" (pr-str sql-params))
    (try-read-only [conn-map (assoc (config/->conn-map config true)
                                    :dbname dbname)]
      ;; Stream the result set, writing a RECORD per row and a buffered
      ;; STATE; the reduce threads the evolving state through each row.
      (reduce (fn [acc result]
                (let [record (select-keys result record-keys)]
                  (singer-messages/write-record! stream-name acc record catalog)
                  (->> (singer-bookmarks/update-state stream-name replication-key record acc)
                       (singer-messages/write-state-buffered! stream-name))))
              state
              (jdbc/reducible-query conn-map
                                    sql-params
                                    common/result-set-opts)))))

(defn sync!
  "Entry point for an incremental sync of one stream: activate version,
  emit current state, sync records, then activate version again."
  [config catalog stream-name state]
  (->> state
       (singer-messages/write-activate-version! stream-name catalog)
       (singer-messages/write-state! stream-name)
       (sync-and-write-messages! config catalog stream-name)
       (singer-messages/write-activate-version!
        stream-name catalog)))
--------------------------------------------------------------------------------
/src/tap_mssql/utils.clj:
--------------------------------------------------------------------------------
(ns tap-mssql.utils)

(defmacro try-read-only
  "
  Note: This macro is structured similar to if-let.

  Tries macro body with ApplicationIntent set, then without (if first
  fails). The db-spec used is defined in the binding supplied to this
  macro.

  Example:

  (try-read-only [a-db-spec-binding (config/->conn-map config)]
    (jdbc/query a-db-spec-binding
                \"SELECT 'this should work with read-only, if possible'\"))
  "
  [bindings & body]
  (assert (vector? bindings) "try-read-only requires a vector for its binding.")
  (assert (= 2 (count bindings)) "try-read-only requires exactly 2 forms in binding vector")
  (let [inner-name (bindings 0)
        binding-val (bindings 1)]
    ;; First attempt binds the spec with ApplicationIntent=ReadOnly; on a
    ;; SQLServerException the loop retries once without it, re-throwing if
    ;; the retry also fails.
    `(let [db-spec-initial# ~binding-val]
       (loop [~inner-name (assoc db-spec-initial# :ApplicationIntent "ReadOnly")
              should-retry# true]
         (if-let [result# (try
                            ~@body
                            (catch com.microsoft.sqlserver.jdbc.SQLServerException ex#
                              (when-not should-retry# (throw ex#))))]
           result#
           (recur (dissoc ~inner-name :ApplicationIntent) false))))))
--------------------------------------------------------------------------------
/test/tap_mssql/core_test.clj:
--------------------------------------------------------------------------------
(ns tap-mssql.core-test
  (:require [tap-mssql.catalog :as catalog]
            [tap-mssql.serialized-catalog :as serialized-catalog]
            [tap-mssql.config :as config]
            [clojure.test :refer [is deftest]]
            [tap-mssql.core :refer :all]))

(defn get-serialized-catalog-entry
  "Finds the serialized stream entry whose tap_stream_id equals stream-name."
  [serialized-catalog stream-name]
  (first (filter (comp (partial = stream-name)
                       #(get % "tap_stream_id"))
                 (serialized-catalog "streams"))))

(defn
get-serialized-catalog-metadata-for-breadcrumb 14 | [serialized-catalog-entry breadcrumb] 15 | ((first 16 | (filter (comp (partial = breadcrumb) 17 | #(get % "breadcrumb")) 18 | (serialized-catalog-entry "metadata"))) 19 | "metadata")) 20 | 21 | (deftest add-int-column-to-catalog 22 | (is (= ["integer" "null"] 23 | (let [catalog (catalog/add-column nil {:table_name "theologians" 24 | :table_cat "test" 25 | :table_schem "bar" 26 | :column_name "name" 27 | :type_name "int" 28 | :primary-key? false 29 | :is-view? false})] 30 | (get-in catalog 31 | ["streams" "test_bar_theologians" "schema" "properties" "name" "type"]))))) 32 | 33 | (deftest catalog->serialized-catalog-test 34 | (let [catalog (reduce catalog/add-column nil [{:table_name "catalog_test" 35 | :column_name "id" 36 | :type_name "int" 37 | :primary-key? false 38 | :is-view? false} 39 | {:table_name "unsupported_data_types" 40 | :column_name "rowversion" 41 | :type_name "rowversion" 42 | :is_nullable "YES" 43 | :unsupported? true}]) 44 | serialized-catalog (catalog/->serialized-catalog catalog)] 45 | (is (= catalog (serialized-catalog/->catalog serialized-catalog))) 46 | ;; Specific Structure 47 | (is (map? (catalog "streams"))) 48 | (is (every? (comp map? #(get % "metadata")) (vals (catalog "streams")))) 49 | (is (sequential? (serialized-catalog "streams"))) 50 | (is (every? (comp sequential? #(get % "metadata")) (serialized-catalog "streams"))) 51 | ;; Unsupported Type Replacement 52 | (is (and (contains? (get (serialized-catalog/->catalog serialized-catalog) 53 | "streams") 54 | "null_null_unsupported_data_types") 55 | (nil? 
(get-in (serialized-catalog/->catalog serialized-catalog) 56 | ["streams" "null_null_unsupported_data_types" "schema" "properties" "rowversion"])))) 57 | (is (= {} (get-in (get-serialized-catalog-entry serialized-catalog "null_null_unsupported_data_types") 58 | ["schema" "properties" "rowversion"]))) 59 | (is (= {"inclusion" "unsupported", 60 | "sql-datatype" "rowversion", 61 | "selected-by-default" false} 62 | (get-serialized-catalog-metadata-for-breadcrumb 63 | (get-serialized-catalog-entry serialized-catalog "null_null_unsupported_data_types") 64 | ["properties" "rowversion"])))) 65 | ) 66 | 67 | (deftest catalog->serialized-catalog-invalid-characters-test 68 | (let [catalog (reduce catalog/add-column nil [{:table_name "invalid_characters" 69 | :column_name "invalid_characters_ !#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" 70 | :type_name "int" 71 | :primary-key? false 72 | :is-view? false} 73 | {:table_name "invalid_characters" 74 | :column_name "invalid_characters_ !\"#$%&'()*+,-./:;<=>?@\\^_`{|}~" 75 | :type_name "int"}]) 76 | serialized-catalog (catalog/->serialized-catalog catalog)] 77 | (is (= catalog (serialized-catalog/->catalog serialized-catalog))) 78 | ;; Property Validation 79 | (is (= {"type" ["integer" "null"], "minimum" -2147483648, "maximum" 2147483647} 80 | (get-in (serialized-catalog/->catalog serialized-catalog) 81 | ["streams" "null_null_invalid_characters" "schema" "properties" "invalid_characters_ !#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"]))) 82 | (is (= {"type" ["integer" "null"], "minimum" -2147483648, "maximum" 2147483647} 83 | (get-in (serialized-catalog/->catalog serialized-catalog) 84 | ["streams" "null_null_invalid_characters" "schema" "properties" "invalid_characters_ !\"#$%&'()*+,-./:;<=>?@\\^_`{|}~"]))) 85 | (is (= {"type" ["integer" "null"], "minimum" -2147483648, "maximum" 2147483647} 86 | (get-in (get-serialized-catalog-entry serialized-catalog "null_null_invalid_characters") 87 | ["schema" "properties" "invalid_characters_ 
!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"]))) 88 | (is (= {"type" ["integer" "null"], "minimum" -2147483648, "maximum" 2147483647} 89 | (get-in (get-serialized-catalog-entry serialized-catalog "null_null_invalid_characters") 90 | ["schema" "properties" "invalid_characters_ !\"#$%&'()*+,-./:;<=>?@\\^_`{|}~"]))) 91 | ;; Metadata Validation 92 | (is (= {"inclusion" "available", "sql-datatype" "int", "selected-by-default" true} 93 | (get-in (serialized-catalog/->catalog serialized-catalog) 94 | ["streams" "null_null_invalid_characters" "metadata" "properties" "invalid_characters_ !#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"]))) 95 | (is (= {"inclusion" "available", "sql-datatype" "int", "selected-by-default" true} 96 | (get-in (serialized-catalog/->catalog serialized-catalog) 97 | ["streams" "null_null_invalid_characters" "metadata" "properties" "invalid_characters_ !\"#$%&'()*+,-./:;<=>?@\\^_`{|}~"]))) 98 | (is (= {"inclusion" "available", "sql-datatype" "int", "selected-by-default" true} 99 | (get-serialized-catalog-metadata-for-breadcrumb 100 | (get-serialized-catalog-entry serialized-catalog "null_null_invalid_characters") 101 | ["properties" "invalid_characters_ !#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"]))) 102 | (is (= {"inclusion" "available", "sql-datatype" "int", "selected-by-default" true} 103 | (get-serialized-catalog-metadata-for-breadcrumb 104 | (get-serialized-catalog-entry serialized-catalog "null_null_invalid_characters") 105 | ["properties" "invalid_characters_ !\"#$%&'()*+,-./:;<=>?@\\^_`{|}~"]))))) 106 | 107 | (deftest verify-extra-arguments-does-not-throw 108 | (is (parse-opts ["--properties" "foo"]))) 109 | 110 | (deftest type-name->type-name-lookup 111 | ;; Won't change an arbitrary type 112 | (is (= "mytype" (catalog/type-name->type-name-lookup "mytype"))) 113 | ;; Handles in identity fields 114 | (is (= "int" (catalog/type-name->type-name-lookup "int identity"))) 115 | ;; Handles numeric identity fields 116 | (is (= "numeric" (catalog/type-name->type-name-lookup 
"numeric() identity"))) 117 | ;; Replaces only at end of string 118 | (is (= "myidentity()type" (catalog/type-name->type-name-lookup "myidentity()type"))) 119 | ) 120 | 121 | (comment 122 | ;; Run all loaded tests 123 | (do 124 | (require '[clojure.string :as string]) 125 | (apply clojure.test/run-tests (->> (all-ns) 126 | (map ns-name) 127 | (filter #(string/starts-with? % "tap-mssql.")) 128 | (filter #(string/ends-with? % "-test"))))) 129 | 130 | (clojure.test/run-tests *ns*) 131 | ) 132 | -------------------------------------------------------------------------------- /test/tap_mssql/discover_empty_catalog_test.clj: -------------------------------------------------------------------------------- 1 | (ns tap-mssql.discover-empty-catalog-test 2 | (:require [clojure.test :refer [is deftest use-fixtures]] 3 | [clojure.java.io :as io] 4 | [clojure.java.jdbc :as jdbc] 5 | [clojure.set :as set] 6 | [clojure.string :as string] 7 | [tap-mssql.core :refer :all] 8 | [tap-mssql.catalog :as catalog] 9 | [tap-mssql.config :as config] 10 | [tap-mssql.test-utils :refer [with-out-and-err-to-dev-null 11 | test-db-config 12 | test-db-configs 13 | with-matrix-assertions]])) 14 | 15 | (defn get-destroy-database-command 16 | [database] 17 | (format "DROP DATABASE %s" (:table_cat database))) 18 | 19 | (defn maybe-destroy-test-db 20 | [config] 21 | (let [destroy-database-commands (->> (catalog/get-databases config) 22 | (filter catalog/non-system-database?) 
23 | (map get-destroy-database-command))] 24 | (let [db-spec (config/->conn-map config)] 25 | (jdbc/db-do-commands db-spec destroy-database-commands)))) 26 | 27 | (defn create-test-db 28 | [config] 29 | (let [db-spec (config/->conn-map config)] 30 | (jdbc/db-do-commands db-spec ["CREATE DATABASE empty_database"]))) 31 | 32 | (defn test-db-fixture [f config] 33 | (with-out-and-err-to-dev-null 34 | (maybe-destroy-test-db config) 35 | (create-test-db config) 36 | (f))) 37 | 38 | (deftest ^:integration verify-throw-on-empty-catalog 39 | (with-matrix-assertions test-db-configs test-db-fixture 40 | (is (thrown-with-msg? java.lang.Exception 41 | #"Empty Catalog: did not discover any streams" 42 | (catalog/discover test-db-config))))) 43 | 44 | (comment 45 | ;; TODO Can these be helper functions? 46 | ;; Clear all tests from namespace 47 | (map (comp (partial ns-unmap *ns*) #(.sym %)) (filter (comp :test meta) (vals (ns-publics *ns*)))) 48 | ;; Clear entire namespace 49 | (map (comp (partial ns-unmap *ns*) #(.sym %)) (vals (ns-publics *ns*))) 50 | ) 51 | -------------------------------------------------------------------------------- /test/tap_mssql/discover_permissions_test.clj: -------------------------------------------------------------------------------- 1 | (ns tap-mssql.discover-permissions-test 2 | (:require [tap-mssql.catalog :as catalog] 3 | [tap-mssql.config :as config] 4 | [clojure.test :refer [is deftest use-fixtures]] 5 | [clojure.java.io :as io] 6 | [clojure.java.jdbc :as jdbc] 7 | [clojure.set :as set] 8 | [clojure.string :as string] 9 | [tap-mssql.core :refer :all] 10 | [tap-mssql.test-utils :refer [with-out-and-err-to-dev-null 11 | test-db-config]])) 12 | 13 | (defn get-destroy-database-command 14 | [database] 15 | (format "DROP DATABASE IF EXISTS %s" (:table_cat database))) 16 | 17 | (defn maybe-destroy-test-db 18 | [] 19 | (let [destroy-database-commands (->> [{:table_cat "not_authorized_database"} 20 | {:table_cat "database_with_a_table"} 21 | 
{:table_cat "not_authorized_database_too"}] 22 | (filter catalog/non-system-database?) 23 | (map get-destroy-database-command))] 24 | (let [db-spec (config/->conn-map test-db-config)] 25 | (jdbc/db-do-commands db-spec destroy-database-commands)))) 26 | 27 | (defn create-test-db 28 | [] 29 | (let [db-spec (config/->conn-map test-db-config)] 30 | (jdbc/db-do-commands db-spec ["CREATE DATABASE not_authorized_database" 31 | "CREATE DATABASE database_with_a_table" 32 | "CREATE DATABASE not_authorized_database_too"]) 33 | ;; Create Tables 34 | (jdbc/db-do-commands (assoc db-spec :dbname "database_with_a_table") 35 | [(jdbc/create-table-ddl :empty_table [[:id "int"]])]) 36 | (jdbc/db-do-commands (assoc db-spec :dbname "not_authorized_database") 37 | [(jdbc/create-table-ddl :empty_table [[:id "int"]])]) 38 | (jdbc/db-do-commands (assoc db-spec :dbname "not_authorized_database_too") 39 | [(jdbc/create-table-ddl :empty_table [[:id "int"]])]) 40 | ;; Create view/tvf for fun 41 | (jdbc/db-do-commands (assoc db-spec :dbname "database_with_a_table") 42 | ["CREATE VIEW empty_table_ids 43 | AS 44 | SELECT id FROM empty_table"]) 45 | (jdbc/db-do-commands (assoc db-spec :dbname "database_with_a_table") 46 | ["CREATE FUNCTION table_valued_test(@input_value int) 47 | RETURNS @result table (a_value int) 48 | AS 49 | BEGIN 50 | INSERT INTO @result VALUES(@input_value + 1) 51 | RETURN 52 | END"]) 53 | ;; Create User if not exists 54 | (when (empty? (jdbc/query (assoc db-spec :dbname "database_with_a_table") 55 | "SELECT principal_id FROM sys.server_principals WHERE name = 'SingerTestUser'")) 56 | (jdbc/db-do-commands (assoc db-spec :dbname "database_with_a_table") 57 | ["CREATE LOGIN SingerTestUser WITH PASSWORD = 'ABCD12345$%'"])) 58 | (when (empty? 
(jdbc/query (assoc db-spec :dbname "database_with_a_table") 59 | "SELECT principal_id FROM sys.database_principals WHERE name = 'SingerTestUser'")) 60 | (jdbc/db-do-commands (assoc db-spec :dbname "database_with_a_table") 61 | ["CREATE USER SingerTestUser FOR LOGIN SingerTestUser" 62 | "GRANT SELECT ON dbo.empty_table TO SingerTestUser" 63 | "GRANT SELECT ON dbo.empty_table_ids TO SingerTestUser"])))) 64 | 65 | (defn test-db-fixture [f] 66 | (with-out-and-err-to-dev-null 67 | (maybe-destroy-test-db) 68 | (create-test-db) 69 | (f))) 70 | 71 | (use-fixtures :each test-db-fixture) 72 | 73 | (deftest ^:integration verify-populated-catalog 74 | (is (let [stream-names (set (map #(get % "stream") (vals ((catalog/discover (assoc test-db-config 75 | "user" "SingerTestUser" 76 | "password" "ABCD12345$%")) 77 | "streams"))))] 78 | (stream-names "empty_table"))) 79 | (is (let [stream-names (set (map #(get % "stream") (vals ((catalog/discover (assoc test-db-config 80 | "user" "SingerTestUser" 81 | "password" "ABCD12345$%")) 82 | "streams"))))] 83 | (stream-names "empty_table_ids"))) 84 | ;; Table-Valued functions should not be discovered 85 | (is (nil? (let [stream-names (set (map #(get % "stream") (vals ((catalog/discover (assoc test-db-config 86 | "user" "SingerTestUser" 87 | "password" "ABCD12345$%")) 88 | "streams"))))] 89 | (stream-names "table_valued_test")))) 90 | 91 | ;; Should not discover tables in non-permitted databases 92 | (is (let [discovered-dbs (->> (get (catalog/discover (assoc test-db-config 93 | "user" "SingerTestUser" 94 | "password" "ABCD12345$%")) "streams") 95 | (map (fn [[_ schema]] (get-in schema ["metadata" "database-name"]))) 96 | set)] 97 | (empty? 
(clojure.set/intersection 98 | discovered-dbs 99 | #{"not_authorized_database" "not_authorized_database_too"}))))) 100 | -------------------------------------------------------------------------------- /test/tap_mssql/discover_populated_catalog_metadata_test.clj: -------------------------------------------------------------------------------- 1 | (ns tap-mssql.discover-populated-catalog-metadata-test 2 | (:require 3 | [tap-mssql.catalog :as catalog] 4 | [tap-mssql.config :as config] 5 | [clojure.test :refer [is deftest use-fixtures]] 6 | [clojure.java.io :as io] 7 | [clojure.java.jdbc :as jdbc] 8 | [clojure.set :as set] 9 | [clojure.string :as string] 10 | [tap-mssql.core :refer :all] 11 | [tap-mssql.test-utils :refer [with-out-and-err-to-dev-null 12 | test-db-config]])) 13 | 14 | (defn get-destroy-database-command 15 | [database] 16 | (format "DROP DATABASE %s" (:table_cat database))) 17 | 18 | (defn maybe-destroy-test-db 19 | [] 20 | (let [destroy-database-commands (->> (catalog/get-databases test-db-config) 21 | (filter catalog/non-system-database?) 
22 | (map get-destroy-database-command))] 23 | (let [db-spec (config/->conn-map test-db-config)] 24 | (jdbc/db-do-commands db-spec destroy-database-commands)))) 25 | 26 | (defn create-test-db 27 | [] 28 | (let [db-spec (config/->conn-map test-db-config)] 29 | (jdbc/db-do-commands db-spec ["CREATE DATABASE database_for_metadata"]) 30 | (jdbc/db-do-commands (assoc db-spec :dbname "database_for_metadata") 31 | [(jdbc/create-table-ddl :table_with_a_primary_key [[:id "int primary key"] 32 | [:name "varchar"]])]) 33 | (jdbc/db-do-commands (assoc db-spec :dbname "database_for_metadata") 34 | ["CREATE TABLE table_with_a_composite_key (id int, col_b varchar, name varchar, primary key (id, col_b))"]) 35 | (jdbc/db-do-commands (assoc db-spec :dbname "database_for_metadata") 36 | ["CREATE VIEW view_of_table_with_a_primary_key_id 37 | AS 38 | SELECT id FROM table_with_a_primary_key"]))) 39 | 40 | (defn populate-data 41 | [] 42 | (jdbc/insert! (-> (config/->conn-map test-db-config) 43 | (assoc :dbname "database_for_metadata")) 44 | "dbo.table_with_a_primary_key" 45 | {:id 1 :name "t"})) 46 | 47 | (defn test-db-fixture [f] 48 | (with-out-and-err-to-dev-null 49 | (maybe-destroy-test-db) 50 | (create-test-db) 51 | (populate-data) 52 | (f))) 53 | 54 | (use-fixtures :each test-db-fixture) 55 | 56 | (deftest ^:integration verify-metadata 57 | (is (= "automatic" 58 | (get-in (catalog/discover test-db-config) 59 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "properties" "id" "inclusion"]))) 60 | (is (= "int" 61 | (get-in (catalog/discover test-db-config) 62 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "properties" "id" "sql-datatype"]))) 63 | (is (= true 64 | (get-in (catalog/discover test-db-config) 65 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "properties" "id" "selected-by-default"]))) 66 | (is (= "available" 67 | (get-in (catalog/discover test-db-config) 68 | ["streams" 
"database_for_metadata_dbo_table_with_a_primary_key" "metadata" "properties" "name" "inclusion"]))) 69 | (is (= "varchar" 70 | (get-in (catalog/discover test-db-config) 71 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "properties" "name" "sql-datatype"]))) 72 | (is (= true 73 | (get-in (catalog/discover test-db-config) 74 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "properties" "name" "selected-by-default"]))) 75 | (is (= "database_for_metadata" 76 | (get-in (catalog/discover test-db-config) 77 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "database-name"]))) 78 | (is (= "dbo" 79 | (get-in (catalog/discover test-db-config) 80 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "schema-name"]))) 81 | (is (= false 82 | (get-in (catalog/discover test-db-config) 83 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "is-view"]))) 84 | (is (= #{"id"} 85 | (get-in (catalog/discover test-db-config) 86 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "table-key-properties"]))) 87 | (is (= true 88 | (get-in (catalog/discover test-db-config) 89 | ["streams" "database_for_metadata_dbo_view_of_table_with_a_primary_key_id" "metadata" "is-view"]))) 90 | (is (= 1 91 | (get-in (catalog/discover test-db-config) 92 | ["streams" "database_for_metadata_dbo_table_with_a_primary_key" "metadata" "row-count"]))) 93 | (is (= 0 94 | (get-in (catalog/discover test-db-config) 95 | ["streams" "database_for_metadata_dbo_view_of_table_with_a_primary_key_id" "metadata" "row-count"])))) 96 | 97 | (deftest ^:integration verify-metadata-when-composite-keys 98 | (let [catalog (catalog/discover test-db-config)] 99 | (is (= #{"id" "col_b"} 100 | (get-in catalog 101 | ["streams" "database_for_metadata_dbo_table_with_a_composite_key" "metadata" "table-key-properties"]))))) 102 | 
(defn get-destroy-database-command
  "Builds the DROP DATABASE statement for one database-metadata row, which
  carries the database name under :table_cat."
  [{:keys [table_cat]}]
  (format "DROP DATABASE %s" table_cat))
(deftest ^:integration verify-populated-catalog
  ;; Discover once instead of once per assertion (each discovery is a full
  ;; round trip to the database) and reuse the derived stream-name set,
  ;; which the original duplicated in all three `is` forms.
  (let [stream-names (->> (vals ((catalog/discover test-db-config) "streams"))
                          (map #(get % "stream"))
                          set)]
    (is (stream-names "empty_table"))
    (is (stream-names "empty_table_ids"))
    ;; Table-valued functions should not be discovered.
    (is (nil? (stream-names "table_valued_test")))))
(deftest ^:integration verify-populated-catalog
  ;; Discover once instead of once per assertion (each discovery is a full
  ;; round trip to the database) and reuse the derived stream-name set,
  ;; which the original duplicated in all three `is` forms. Streams from
  ;; every non-system database should be present.
  (let [stream-names (->> (vals ((catalog/discover test-db-config) "streams"))
                          (map #(get % "stream"))
                          set)]
    (is (stream-names "empty_table"))
    (is (stream-names "empty_table_ids"))
    (is (stream-names "another_empty_table"))))
(deftest ^:integration verify-populated-catalog-with-schema
  ;; The original ran discovery four times (once per assertion); run it once
  ;; and assert against the cached catalog.
  (let [discovered (catalog/discover test-db-config)
        streams    (vals (discovered "streams"))]
    ;; Both tables are discovered regardless of schema.
    (is (= #{"data_table" "data_table_without_schema"}
           (set (map #(get % "stream") streams))))
    ;; tap_stream_id embeds database and schema names.
    (is (= #{"database_with_schema_schema_with_table_data_table"
             "database_with_schema_dbo_data_table_without_schema"}
           (set (map #(get % "tap_stream_id") streams))))
    ;; schema-name metadata reflects the owning schema ("dbo" when none given).
    (is (= "schema_with_table"
           (get-in discovered ["streams"
                               "database_with_schema_schema_with_table_data_table"
                               "metadata"
                               "schema-name"])))
    (is (= "dbo"
           (get-in discovered ["streams"
                               "database_with_schema_dbo_data_table_without_schema"
                               "metadata"
                               "schema-name"])))))
(deftest view-primary-key-test
  ;; SCHEMA messages must take key_properties from "view-key-properties"
  ;; when the stream is a view and from "table-key-properties" when it is a
  ;; table; "replication-key" is surfaced as bookmark_properties either way.
  (let [stream-name "test-stream"
        ;; Catalog entry for a view: table-key-properties deliberately empty.
        view-catalog {"streams" {stream-name {"metadata" {"is-view" true
                                                          "view-key-properties" ["2"]
                                                          "table-key-properties" []
                                                          "replication-key" "4"}
                                              "schema" {"type" "object"
                                                        "properties" {"id" {"type" ["null" "integer"]}}}
                                              "tap_stream_id" stream-name
                                              "table_name" stream-name}}}
        view-expected-schema-message {"type" "SCHEMA"
                                      "stream" "test-stream"
                                      "key_properties" ["2"]
                                      "schema" {"type" "object"
                                                "properties" {"id" {"type" ["null"
                                                                            "integer"]}}}
                                      "bookmark_properties" ["4"]}
        view-actual-schema-message (first (get-messages-from-output view-catalog stream-name))
        ;; Catalog entry for a plain table: key comes from table-key-properties.
        table-catalog {"streams" {stream-name {"metadata" {"is-view" false
                                                           "table-key-properties" ["3"]
                                                           "replication-key" "4"}
                                               "schema" {"type" "object"
                                                         "properties" {"id" {"type" ["null" "integer"]}}}
                                               "tap_stream_id" stream-name
                                               "table_name" stream-name}}}
        table-expected-schema-message {"type" "SCHEMA"
                                       "stream" "test-stream"
                                       "key_properties" ["3"]
                                       "schema" {"type" "object"
                                                 "properties" {"id" {"type" ["null"
                                                                             "integer"]}}}
                                       "bookmark_properties" ["4"]}
        table-actual-schema-message (first (get-messages-from-output table-catalog stream-name))]
    (is (= table-actual-schema-message
           table-expected-schema-message))
    (is (= view-actual-schema-message
           view-expected-schema-message))))
(defmacro with-out-and-err-to-dev-null
  "Evaluates `body` with console output suppressed: *out* and *err* are
  rebound to a writer over a no-op OutputStream, and clojure.tools.logging's
  log* is redefined to a no-op for the dynamic extent of the body so log
  output is silenced too. Returns the value of the last form in `body`."
  [& body]
  `(let [null-out# (io/writer
                    ;; An OutputStream whose write discards every byte.
                    (proxy [java.io.OutputStream] []
                      (write [& args#])))]
     (binding [*err* null-out#
               *out* null-out#]
       (let [no-op# (constantly nil)]
         ;; with-redefs (not binding) so logging calls on any thread spawned
         ;; inside body are silenced as well.
         (with-redefs [log/log* no-op#]
           ~@body)))))
(defn sql-server-exception
  "Builds a SQLServerException instance for tests, with message
  \"__TEST_BOOM__\" and a nested cause. The (String, Throwable) constructor
  is fetched reflectively and forced accessible via setAccessible —
  presumably because it is not public in the JDBC driver (confirm against
  the driver version in use)."
  []
  (.newInstance (doto (.getDeclaredConstructor com.microsoft.sqlserver.jdbc.SQLServerException
                                               (into-array [String Throwable]))
                  (.setAccessible true))
                (object-array ["__TEST_BOOM__"
                               (Exception. "Inner BOOM")])))
;; If the body keeps throwing even after the retry path, try-read-only must
;; eventually give up and rethrow the underlying SQLServerException rather
;; than looping forever.
(deftest ^:integration verify-application-intent-only-unset-if-body-fails-continuously
  (is (thrown-with-msg?
       com.microsoft.sqlserver.jdbc.SQLServerException
       #"__TEST_BOOM__"
       (try-read-only [db-spec {:ApplicationIntent "ReadOnly"}]
                      (throw (sql-server-exception))))))
"maximum": 255, "minimum": 0}, 41 | "bit": {"type": ["boolean", "null"]}, 42 | "real": {"type": ["number", "null"]}, 43 | "float": {"type": ["number", "null"]}, 44 | "numeric": {'multipleOf': 10 ** 0, 45 | "type": ["number", "null"], 46 | 'maximum': 10 ** 18, 'exclusiveMinimum': True, 47 | 'minimum': -10 ** 18, 'exclusiveMaximum': True}, 48 | "decimal": {'multipleOf': 10 ** 0, 49 | "type": ["number", "null"], 50 | 'maximum': 10 ** 18, 'exclusiveMinimum': True, 51 | 'minimum': -10 ** 18, 'exclusiveMaximum': True}, 52 | "char": {"type": ["string", "null"], "minLength": 2}, # TODO this is just for char(2) 53 | "varchar": {"type": ["string", "null"], "minLength": 0}, # TODO this is just for varchar(800) 54 | "nvarchar": {"type": ["string", "null"], "minLength": 0}, # TODO this is just for nvarchar(800) 55 | "date": {"type": ["string", "null"], 'format': 'date-time'}, 56 | "datetime": {"type": ["string", "null"], 'format': 'date-time'}, 57 | "time": {"type": ["string", "null"]}, 58 | } 59 | 60 | for precision in range(1, 39): 61 | for scale in range(precision + 1): 62 | DATATYPE_SCHEMAS["numeric({},{})".format(precision, scale)] = { 63 | 'multipleOf': 10 ** (0 - scale), 64 | "type": ["number", "null"], 65 | 'maximum': 10 ** (precision - scale), 'exclusiveMinimum': True, 66 | 'minimum': -10 ** (precision - scale), 'exclusiveMaximum': True 67 | } 68 | 69 | for precision in range(1, 39): 70 | for scale in range(precision + 1): 71 | DATATYPE_SCHEMAS["decimal({},{})".format(precision, scale)] = { 72 | 'multipleOf': 10 ** (0 - scale), 73 | "type": ["number", "null"], 74 | 'maximum': 10 ** (precision - scale), 'exclusiveMinimum': True, 75 | 'minimum': -10 ** (precision - scale), 'exclusiveMaximum': True 76 | } 77 | 78 | # TODO - BUG https://stitchdata.atlassian.net/browse/SRCE-1008 79 | SUPPORTED_DATATYPES = [ 80 | "bigint", "int", "smallint", "tinyint", "bit", "real", "float", 81 | "date", "datetime", "time", "datetime2", "datetimeoffset", "smalldatetime", 82 | "char", 
"varchar", "varchar(max)", "nchar", "nvarchar", "nvarchar(max)", 83 | "binary", "varbinary", "varbinary(max)", "uniqueidentifier", "timestamp", "rowversion", 84 | "numeric", "decimal", "money", "smallmoney" 85 | ] 86 | SUPPORTED_DATATYPES.extend([ 87 | "numeric({0},{1})".format(precision, scale) 88 | for precision in range(1, 39) 89 | for scale in range(precision + 1) 90 | ]) 91 | SUPPORTED_DATATYPES.extend([ 92 | "decimal({0},{1})".format(precision, scale) 93 | for precision in range(1, 39) 94 | for scale in range(precision + 1) 95 | ]) 96 | SUPPORTED_DATATYPES.extend(["float({0})".format(bits + 1) for bits in range(53)]) 97 | SUPPORTED_DATATYPES.extend(["char({0})".format(chars + 1) for chars in range(8000)]) 98 | SUPPORTED_DATATYPES.extend(["varchar({0})".format(chars + 1) for chars in range(8000)]) 99 | SUPPORTED_DATATYPES.extend(["nchar({0})".format(chars + 1) for chars in range(4000)]) 100 | SUPPORTED_DATATYPES.extend(["nvarchar({0})".format(chars + 1) for chars in range(4000)]) 101 | SUPPORTED_DATATYPES.extend(["binary({0})".format(chars + 1) for chars in range(8000)]) 102 | SUPPORTED_DATATYPES.extend(["varbinary({0})".format(chars + 1) for chars in range(8000)]) 103 | 104 | CONFIGURATION_ENVIRONMENT = { 105 | "properties": { 106 | "user": "STITCH_TAP_MSSQL_TEST_DATABASE_USER", 107 | "port": "STITCH_TAP_MSSQL_TEST_DATABASE_PORT" 108 | }, 109 | "credentials": { 110 | "password": "STITCH_TAP_MSSQL_TEST_DATABASE_PASSWORD", 111 | } 112 | } 113 | TEST_DB_HOST="localhost" 114 | 115 | @staticmethod 116 | def tap_name(): 117 | """The name of the tap""" 118 | return "mssql" 119 | 120 | @staticmethod 121 | def get_type(): 122 | """the expected url route ending""" 123 | return "platform.mssql" 124 | 125 | def get_properties(self, original: bool = True): 126 | """Configuration properties required for the tap.""" 127 | properties_env = self.CONFIGURATION_ENVIRONMENT['properties'] 128 | return_value = {k: os.getenv(v) for k, v in properties_env.items()} 129 | 
return_value['host'] = 'localhost' 130 | return_value['include_schemas_in_destination_stream_name'] = 'true' 131 | return return_value 132 | 133 | def get_credentials(self): 134 | """Authentication information for the test account""" 135 | credentials_env = self.CONFIGURATION_ENVIRONMENT['credentials'] 136 | return {k: os.getenv(v) for k, v in credentials_env.items()} 137 | 138 | def expected_metadata(self): 139 | """The expected streams and metadata about the streams""" 140 | 141 | default = { 142 | self.REPLICATION_KEYS: {"updated_at"}, 143 | self.PRIMARY_KEYS: {"id"}, 144 | self.AVAILABLE_FIELDS: {}, # added, need to add to template 145 | self.UNAVAILABLE_FIELDS: {}, # added, need to add to template 146 | self.REPLICATION_METHOD: self.FULL, 147 | self.API_LIMIT: 250 148 | } 149 | 150 | meta = default.copy() 151 | meta.update({self.FOREIGN_KEYS: {"owner_id", "owner_resource"}}) 152 | 153 | return { 154 | "full": default 155 | } 156 | -------------------------------------------------------------------------------- /tests/test_discovery_unsupported_pks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test tap discovery TODO this test appears to be incomplete 3 | """ 4 | 5 | from tap_tester import menagerie, LOGGER 6 | 7 | from database import drop_all_user_databases, create_database, \ 8 | create_table, mssql_cursor_context_manager 9 | 10 | from base import BaseTapTest 11 | 12 | 13 | class DiscoveryTestUnsupportedKeys(BaseTapTest): 14 | """ Test the tap discovery """ 15 | 16 | EXPECTED_METADATA = dict() 17 | 18 | def name(self): 19 | return "{}_discovery_test_unsupported_pks".format(super().name()) 20 | 21 | @classmethod 22 | def discovery_expected_metadata(cls): 23 | """The expected streams and metadata about the streams""" 24 | 25 | return cls.EXPECTED_METADATA 26 | 27 | @classmethod 28 | def setUpClass(cls) -> None: 29 | """Create the expected schema in the test database""" 30 | # drop_all_user_databases() 31 | # 
    def test_run(self):
        """Run discovery against a fresh connection and expect it to succeed.

        Default test setup:
          * remove previous connections (with the same name),
          * create a new connection (with the properties and credentials above),
          * run discovery and ensure it completes successfully.

        NOTE(review): setUpClass is entirely commented out, so no tables are
        created and EXPECTED_METADATA stays empty — per the module docstring's
        TODO, this currently only smoke-tests that discovery runs.
        """
        LOGGER.info("running test %s", self.name())
        self.create_connection()
    @classmethod
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database.

        Builds two tables in a fresh ``log_based_no_pk`` database:
        ``int_data`` (primary key, change tracking enabled) and
        ``int_data_no_pk`` (no primary key, tracking not enabled) — the
        latter is the one expected to make a LOG_BASED sync fail in
        ``test_run``.
        """
        drop_all_user_databases()
        # NOTE(review): these module-level globals (database_name, schema_name,
        # query_list) look like template leftovers; class attributes would be
        # safer. Kept as-is since behavior must not change here.
        global database_name
        database_name = "log_based_no_pk"
        global schema_name
        schema_name = "dbo"

        # Rows for the tracked table and for the table without a primary key.
        int_values = [(0, 0, False), (1, 255, True), (2, 42, None), (3, 230, False)]
        int_values_no_pk = [(1, 1, True), (1, 1, True), (1, 2, False)]

        # JSON schema expected from discovery for the int_data stream.
        int_schema = {
            'type': 'object',
            'properties': {
                'MyTinyIntColumn': {
                    'type': ['integer', 'null'],
                    'minimum': 0,
                    'maximum': 255,
                    'inclusion': 'available',
                    'selected': True},
                'pk': {
                    'type': ['integer'],
                    'minimum': -2147483648,
                    'maximum': 2147483647,
                    'inclusion': 'automatic',
                    'selected': True},
                'my_boolean': {
                    'type': ['boolean', 'null'],
                    'inclusion': 'available',
                    'selected': True}},
            'selected': True}

        # Only int_data appears here: int_data_no_pk is expected to be
        # rejected by log-based replication, not synced.
        cls.EXPECTED_METADATA = {
            'int_data': {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': int_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': 'int_data',
                'fields': [
                    {'pk': {'sql-datatype': 'int', 'selected-by-default': True, 'inclusion': 'automatic'}},
                    {'MyTinyIntColumn': {'sql-datatype': 'tinyint', 'selected-by-default': True,
                                         'inclusion': 'available'}},
                    {'my_boolean': {'sql-datatype': 'bit', 'selected-by-default': True, 'inclusion': 'available'}}],
                'schema': int_schema}
        }

        global query_list
        query_list = list(create_database(database_name, "Latin1_General_CS_AS"))
        query_list.extend(enable_database_tracking(database_name))

        # int_data: primary key + change tracking -> valid for LOG_BASED.
        table_name = "int_data"
        column_name = ["pk", "MyTinyIntColumn", "my_boolean"]
        column_type = ["int", "tinyint", "bit"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key, tracking=True))
        query_list.extend(insert(database_name, schema_name, table_name, int_values))

        # int_data_no_pk: no primary key and no tracking -> LOG_BASED sync
        # of this stream should fail with a fatal error.
        table_name = "int_data_no_pk"
        column_name = ["pk", "MyTinyIntColumn", "my_boolean"]
        column_type = ["int", "tinyint", "bit"]
        primary_key = {}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        query_list.extend(insert(database_name, schema_name, table_name, int_values_no_pk))

        # Execute every accumulated statement in one cursor session.
        mssql_cursor_context_manager(*query_list)

        # Point the base class's expected_metadata hook at this classmethod.
        cls.expected_metadata = cls.discovery_expected_metadata
def test_stream_schema():
    """
    Template objective — per the original note, likely too much work for its
    value when an alpha tester is available.

    PREREQUISITE
    For EACH stream, populate the target endpoint with data in every field,
    using each data type allowed for that field. For instance, if an
    optional field takes a float in 0..100, sample data should include at
    least the endpoints of the range and null:
        • 0
        • 0.00001
        • 100
        • 99.9999999
        • "100" (if possible)
        • null value
        • no key (if possible)

    For date-times, try as many formats as possible to make sure they are
    handled correctly, e.g.:
        • 2018-04-25T13:51:12-04:00
        • 20080915T155300
        • 20080915T155300Z
        • 2008-09-15
        • null
        • ""

    TEST OBJECTIVES:
    >> • Run a sync with all fields selected and verify there are no errors.
    • Verify the sync captured the setup data.
    • Verify every field is present in the target data for the stream.
    """
 87 | #
 88 | #     TEST OBJECTIVES:
 89 | #         • Run a sync with NO fields selected (assuming there is at least 1
 90 | #           automatic field, or 1 selected if there are no automatic fields)
 91 | #           and verify that the number of records sent to the target
 92 | #           exceeds the limit
 93 | #         • Verify that automatic fields (or 1 selected)
 94 | #           are the ONLY ones present in the target data for the stream
 95 | #           (plus fields where selected-by-default metadata is set to true)
 96 | #     """
 97 | 
 98 | 
 99 | def test_field_conflicts():
100 |     """
101 |     PHASE 2
102 | 
103 |     Test the business rules around field selection.
104 |     Verify that selecting field 1 means you can't select field 2
105 |     """
106 | 
107 | 
108 | # def test_stream_inc_bookmarks():
109 | #     """
110 | #     SOME VALUE - NOT SURE IF WE NEED TO TEST BOOKMARK USE. NEED TO ENSURE THE BOOKMARK IS SET CORRECTLY
111 | #     PREREQUISITE
112 | #     For EACH stream that is incrementally replicated there are multiple rows of data.
113 | #
114 | #     NOTE: It is typical that there will be an automatic field that is used as the
115 | #           replication key (bookmark).  If this is not the case, and there are no
116 | #           automatic fields the sync should select a single field that is not
117 | #           the replication key. (For example S3 uses a replication key that is the
118 | #           file modified date and may not have a primary key, in this case select
119 | #           a single field that isn't relevant to the modified date)
120 | #
121 | #     TEST STEPS
122 | #         • NOTE the start time of the test (for test_start_date below).
123 | #         • Run a sync with NO fields selected for each stream
124 | #           so that the bookmark is up to date
125 | #         • For each stream, update a subset of the records for the stream
126 | #           and note the number n where 0 < n < Total records
127 | #         • For each stream, insert m records, 0 < m
128 | #         • NOTE n and m for each stream (for test_start_date below).
129 | #         • Run another sync
130 | #
131 | #     TEST OBJECTIVES:
132 | #         • Verify that the number of records returned in the second sync
133 | #           is equal to n + m + 1 above for each stream. (The tap can find updated
134 | #           records and does not pick up records that were not updated.)
135 | #     """
136 | 
137 | 
138 | # def test_start_date():
139 | #     """
140 | #     PREREQUISITE
141 | #     The test_stream_inc_bookmarks test has been run and no other data has been
142 | #     modified in the dev/test account since that test has been run
143 | #     and you have the start time from that test and the n + m values
144 | #     for each stream for that test.
145 | #
146 | #     TEST STEPS
147 | #         • Run a sync with the start date set to
148 | #           the start time of the test_stream_inc_bookmarks test
149 | #
150 | #     TEST OBJECTIVES:
151 | #         • Verify that the number of records returned in the sync
152 | #           is equal to n + m above for each stream.
153 | #     """
154 | 
155 | #
156 | # def test_stream_full():
157 | #     """
158 | #     TEST BASED ON NOT SAVING STATE
159 | #     PREREQUISITE
160 | #     For EACH stream that is fully replicated there are multiple rows of data.
161 | #
162 | #     TEST STEPS
163 | #         • Run a sync for each stream
164 | #         • For each stream, update a subset of the records for the stream
165 | #           and note the number n where 0 < n < Total records
166 | #         • For each stream, insert m records, 0 < m
167 | #         • Run another sync
168 | #
169 | #     TEST OBJECTIVES:
170 | #         • Verify that the number of records returned in the second sync
171 | #           is equal to the number of records in the first sync + m
172 | #     """
173 | 
174 | 
175 | def test_error_handling():
176 |     """TBD"""
177 | 
178 | 
179 | def test_performance():
180 |     """
181 |     TBD - This is probably not necessary in tap-tester as it would cause longer
182 |     run times.  This is something that would be useful to do manually.
183 | 
184 |     Run a large set of data and ensure you don't run into memory leaks, disk
185 |     space issues, or runs that take an excessively long time.
186 | """ 187 | --------------------------------------------------------------------------------