├── .circleci └── config.yml ├── .github └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── bin ├── populate_test_database.py └── test-db ├── setup.py ├── spikes ├── atlas_setup.md ├── dbreps_and_hackathon_review.md ├── local_mongo_setup.md ├── pymongo_spike.py └── supported_versions_spike.md ├── tap_mongodb ├── __init__.py └── sync_strategies │ ├── common.py │ ├── full_table.py │ ├── incremental.py │ └── oplog.py └── tests ├── __init__.py ├── mongodb_common.py ├── test_mongodb_cname_restrictions.py ├── test_mongodb_configurable_properties.py ├── test_mongodb_datatype.py ├── test_mongodb_discovery.py ├── test_mongodb_fname_restrictions.py ├── test_mongodb_full_table.py ├── test_mongodb_full_table_id.py ├── test_mongodb_full_table_interruptible.py ├── test_mongodb_id_pk_variations.py ├── test_mongodb_incremental.py ├── test_mongodb_incremental_open_transactions.py ├── test_mongodb_index.py ├── test_mongodb_log_based_interruptible.py ├── test_mongodb_name_restrictions.py ├── test_mongodb_namespace_restrictions.py ├── test_mongodb_oplog.py ├── test_mongodb_oplog_aged_out.py ├── test_mongodb_oplog_bookmarks.py ├── test_mongodb_projection.py ├── test_mongodb_table_reset_inc.py ├── test_mongodb_table_reset_log.py ├── test_mongodb_views.py └── unittests └── test_common.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | slack: circleci/slack@3.4.2 4 | 5 | executors: 6 | tap_tester_mongo_4_4: 7 | docker: 8 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 9 | - image: singerio/mongo:4.4-bionic 10 | environment: 11 | MONGO_INITDB_ROOT_USERNAME: dev 12 | MONGO_INITDB_ROOT_PASSWORD: Password1 13 | command: [mongod, --replSet, rs0, --keyFile, /opt/mongo/keyfile] 14 | tap_tester_mongo_5_0: 15 | docker: 16 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 17 | - image: singerio/mongo:5.0 18 | environment: 19 | MONGO_INITDB_ROOT_USERNAME: dev 20 | MONGO_INITDB_ROOT_PASSWORD: Password1 21 | command: [mongod, --replSet, rs0, --keyFile, /opt/mongo/keyfile] 22 | tap_tester_mongo_6_0: 23 | docker: 24 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 25 | - image: singerio/mongo:6.0 26 | environment: 27 | MONGO_INITDB_ROOT_USERNAME: dev 28 | MONGO_INITDB_ROOT_PASSWORD: Password1 29 | command: [mongod, --replSet, rs0, --keyFile, /opt/mongo/keyfile] 30 | 31 | jobs: 32 | build: 33 | executor: tap_tester_mongo_4_4 34 | steps: 35 | - checkout 36 | - run: 37 | name: 'Install Dockerize' 38 | command: | 39 | wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 40 | tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 41 | rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 42 | environment: 43 | DOCKERIZE_VERSION: v0.3.0 44 | - run: 45 | name: 'Wait for Mongo' 46 | command: | 47 | dockerize -wait tcp://127.0.0.1:27017 -timeout 1m 48 | sleep 10 49 | - run: 50 | name: 'Setup Mongo' 51 | command: | 52 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox tap-tester.env 53 | source tap-tester.env 54 | wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | apt-key add - 55 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/5.0 multiverse" \ 56 | | tee 
/etc/apt/sources.list.d/mongodb-org-5.0.list 57 | apt-get update 58 | apt-get install -y mongodb-org-shell mongodb-mongosh mongodb-org 59 | mongosh -u $TAP_MONGODB_USER \ 60 | -p $TAP_MONGODB_PASSWORD \ 61 | --authenticationDatabase admin \ 62 | --eval "rs.initiate({_id: \"rs0\", members: [{_id: 0, host: \"$TAP_MONGODB_HOST:$TAP_MONGODB_PORT\"}]})" 63 | - run: 64 | name: 'Setup virtual env' 65 | command: | 66 | pyenv local 3.9.6 67 | python3 -mvenv /usr/local/share/virtualenvs/tap-mongodb 68 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 69 | pip install -U 'pip==23.2' 'setuptools==68.0.0' 70 | pip install .[dev] 71 | - run: 72 | name: 'pylint' 73 | command: | 74 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 75 | make test 76 | - run: 77 | name: "Unit Tests" 78 | command: | 79 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 80 | pip install pymongo==4.4.0 nose2 81 | nose2 -v -s tests/unittests/ 82 | - run: 83 | name: 'Integration Tests' 84 | command: | 85 | source tap-tester.env 86 | mkdir /tmp/${CIRCLE_PROJECT_REPONAME} 87 | export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} 88 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 89 | pip install pymongo==4.4.0 90 | run-test --tap=tap-mongodb tests 91 | - run: 92 | name: 'Get Curl' 93 | command: | 94 | apt update 95 | apt install -y curl 96 | - slack/notify-on-failure: 97 | only_for_branches: master 98 | - store_artifacts: 99 | path: /tmp/tap-mongodb 100 | build_mongo_5_0: 101 | executor: tap_tester_mongo_5_0 102 | steps: 103 | - checkout 104 | - run: 105 | name: 'Install Dockerize' 106 | command: | 107 | wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 108 | tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 109 | rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 110 | environment: 111 | DOCKERIZE_VERSION: v0.3.0 112 | - run: 113 | name: 'Wait for Mongo' 114 | command: | 115 | dockerize -wait tcp://127.0.0.1:27017 -timeout 1m 116 | sleep 10 117 | - run: 118 | name: 'Setup Mongo' 119 | command: | 120 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox tap-tester.env 121 | source tap-tester.env 122 | wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | apt-key add - 123 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/5.0 multiverse" \ 124 | | tee /etc/apt/sources.list.d/mongodb-org-5.0.list 125 | apt-get update 126 | apt-get install -y mongodb-org-shell mongodb-mongosh mongodb-org 127 | mongosh -u $TAP_MONGODB_USER \ 128 | -p $TAP_MONGODB_PASSWORD \ 129 | --authenticationDatabase admin \ 130 | --eval "rs.initiate({_id: \"rs0\", members: [{_id: 0, host: \"$TAP_MONGODB_HOST:$TAP_MONGODB_PORT\"}]})" 131 | - run: 132 | name: 'Setup virtual env' 133 | command: | 134 | pyenv local 3.9.6 135 | python3 -mvenv /usr/local/share/virtualenvs/tap-mongodb 136 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 137 | pip install -U 'pip==23.2' 'setuptools==68.0.0' 138 | pip install .[dev] 139 | - run: 140 | name: 'pylint' 141 | command: | 142 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 143 | make test 144 | - run: 145 | name: "Unit Tests" 146 | command: | 147 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 148 | pip install pymongo==4.4.0 nose2 149 | nose2 -v -s tests/unittests/ 150 | - run: 151 | name: 'Integration Tests' 152 | command: | 153 | source 
tap-tester.env 154 | mkdir /tmp/${CIRCLE_PROJECT_REPONAME} 155 | export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} 156 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 157 | pip install pymongo==4.4.0 158 | run-test --tap=tap-mongodb tests 159 | - run: 160 | name: 'Get Curl' 161 | command: | 162 | apt update 163 | apt install -y curl 164 | - slack/notify-on-failure: 165 | only_for_branches: master 166 | - store_artifacts: 167 | path: /tmp/tap-mongodb 168 | build_mongo_6_0: 169 | executor: tap_tester_mongo_6_0 170 | steps: 171 | - checkout 172 | - run: 173 | name: 'Install Dockerize' 174 | command: | 175 | wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 176 | tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 177 | rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 178 | environment: 179 | DOCKERIZE_VERSION: v0.3.0 180 | - run: 181 | name: 'Wait for Mongo' 182 | command: | 183 | dockerize -wait tcp://127.0.0.1:27017 -timeout 1m 184 | sleep 10 185 | - run: 186 | name: 'Setup Mongo' 187 | command: | 188 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox tap-tester.env 189 | source tap-tester.env 190 | wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | apt-key add - 191 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/6.0 multiverse" \ 192 | | tee /etc/apt/sources.list.d/mongodb-org-6.0.list 193 | apt-get update 194 | apt-get install -y mongodb-org-shell mongodb-mongosh mongodb-org 195 | mongosh -u $TAP_MONGODB_USER \ 196 | -p $TAP_MONGODB_PASSWORD \ 197 | --authenticationDatabase admin \ 198 | --eval "rs.initiate({_id: \"rs0\", members: [{_id: 0, host: \"$TAP_MONGODB_HOST:$TAP_MONGODB_PORT\"}]})" 199 | - run: 200 | name: 'Setup virtual env' 201 | command: | 202 | pyenv local 3.9.6 203 | python3 -mvenv /usr/local/share/virtualenvs/tap-mongodb 204 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 205 | pip install -U 'pip==23.2' 'setuptools==68.0.0' 206 | pip install .[dev] 207 | - run: 208 | name: 'pylint' 209 | command: | 210 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 211 | make test 212 | - run: 213 | name: "Unit Tests" 214 | command: | 215 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 216 | pip install pymongo==4.4.0 nose2 217 | nose2 -v -s tests/unittests/ 218 | - run: 219 | name: 'Integration Tests' 220 | command: | 221 | source tap-tester.env 222 | mkdir /tmp/${CIRCLE_PROJECT_REPONAME} 223 | export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} 224 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 225 | pip install pymongo==4.4.0 226 | run-test --tap=tap-mongodb tests 227 | - run: 228 | name: 'Get Curl' 229 | command: | 230 | apt update 231 | apt install -y curl 232 | - slack/notify-on-failure: 233 | only_for_branches: master 234 | - store_artifacts: 235 | path: /tmp/tap-mongodb 236 | 237 | workflows: 238 | version: 2 239 | commit: &commit_jobs 240 | jobs: 241 | - build_mongo_5_0: 242 | context: 243 | - circleci-user 244 | - tier-1-tap-user 245 | requires: 246 | - build 247 | - build_mongo_6_0: 248 | context: 249 | - circleci-user 250 | - tier-1-tap-user 251 | requires: 252 | - build_mongo_5_0 253 | - build: 254 | context: 255 | - circleci-user 256 | - tier-1-tap-user 257 | 258 | build_daily: 259 | <<: *commit_jobs 260 | triggers: 261 | - schedule: 262 | cron: "0 1 * * *" 263 | filters: 264 | branches: 265 
| only: 266 | - master 267 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description of change 2 | (write a short description here or paste a link to JIRA) 3 | 4 | # QA steps 5 | - [ ] automated tests passing 6 | - [ ] manual qa steps passing (list below) 7 | 8 | # Risks 9 | 10 | # Rollback steps 11 | - revert this branch 12 | 13 | #### AI generated code 14 | https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code 15 | - [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Emacs 104 | .tramp_history 105 | 106 | config.json 107 | state.json 108 | properties.json 109 | catalog.json 110 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 3.1.4 4 | * Update pymongo to 4.10.1 [#121](https://github.com/singer-io/tap-mongodb/pull/121) 5 | 6 | ## 3.1.3 7 | * Remove deprecated terminaltables dependency [#120](https://github.com/singer-io/tap-mongodb/pull/120) 8 | 9 | ## 3.1.2 10 | * Allows to specify string values as projection [#94](https://github.com/singer-io/tap-mongodb/pull/94) 11 | 12 | ## 3.1.1 13 | * Gracefully fallback to not using a session if sessions are not supported by the mongo server [#112](https://github.com/singer-io/tap-mongodb/pull/112) 14 | 15 | ## 3.1.0 16 | * Updates to run on python 3.11.7 [#111](https://github.com/singer-io/tap-mongodb/pull/111) 17 | 18 | ## 3.0.3 19 | * Refresh the session every 10 
minutes during oplog queries [#110](https://github.com/singer-io/tap-mongodb/pull/110) 20 | 21 | ## 3.0.2 22 | * Fix an issue with connection params when SSL is used [#107](https://github.com/singer-io/tap-mongodb/pull/107) 23 | 24 | ## 3.0.1 25 | * Fix issue with SSH tunnel connections by connecting directly to a MongoDB node instead of allowing PyMongo to automatically discover replica sets [#105](https://github.com/singer-io/tap-mongodb/pull/105) 26 | 27 | ## 3.0.0 28 | * Upgrade PyMongo to v4.3+ [#99](https://github.com/singer-io/tap-mongodb/pull/99) 29 | * Fix uuid transformation [#100](https://github.com/singer-io/tap-mongodb/pull/100) 30 | * Fix empty projection [#102](https://github.com/singer-io/tap-mongodb/pull/102) 31 | 32 | ## 2.1.3 33 | * Fix a bug in Full Table sync that caused a sync to fail if document contained invalid BSON[#95](https://github.com/singer-io/tap-mongodb/pull/95) 34 | 35 | ## 2.1.2 36 | * Update pymongo to v3.12.3 [#81](https://github.com/singer-io/tap-mongodb/pull/81) 37 | 38 | ## 2.1.1 39 | * Fix bug in oplog bookmarking where the bookmark would not advance due to fencepost querying finding a single record [#80](https://github.com/singer-io/tap-mongodb/pull/80) 40 | 41 | ## 2.1.0 42 | * Optimize oplog extractions to only query for the selected tables [#78](https://github.com/singer-io/tap-mongodb/pull/78) 43 | 44 | ## 2.0.1 45 | * Modify `get_databases` function to return a unique list of databases [#58](https://github.com/singer-io/tap-mongodb/pull/58) 46 | 47 | ## 2.0.0 48 | * Build and write schema messages [#40](https://github.com/singer-io/tap-mongodb/pull/40) The main changes are: 49 | 1. date-time fields will have a `"type": "string", "format": "date-time"` schema that will cause them to get loaded as date-times instead of strings 50 | 2. decimal fields will have a `"type": "number", "multipleOf": 1e-34` schema written 51 | 3. double fields will have a `"type": "number"` schema written that should prevent them from splitting between doubles/decimals depending on the precision 52 | 53 | ## 1.1.0 54 | * Add optional `verify_mode` config value to replace the assumptions in version 1.0.4 [#38](https://github.com/singer-io/tap-mongodb/pull/38) 55 | 56 | ## 1.0.4 57 | * Add support for turning off ssl cert validation when using a ssh tunnel [#36](https://github.com/singer-io/tap-mongodb/pull/36) 58 | 59 | ## 1.0.3 60 | * Add support for floats as replication keys [#34](https://github.com/singer-io/tap-mongodb/pull/34) 61 | 62 | ## 1.0.2 63 | * Add support for DBRefs [#32](https://github.com/singer-io/tap-mongodb/pull/32) 64 | 65 | ## 1.0.1 66 | * Discover collections in the `admin` database and add support for `Int64` as a replication key type [#30](https://github.com/singer-io/tap-mongodb/pull/30) 67 | 68 | ## 1.0.0 69 | * Release out of Beta [#29](https://github.com/singer-io/tap-mongodb/pull/29) 70 | 71 | ## 0.3.0 72 | * Add support for UUID types in replication keys and records [#27](https://github.com/singer-io/tap-mongodb/pull/27) 73 | 74 | ## 0.2.2 75 | * Improve invalid datetime handling [#25](https://github.com/singer-io/tap-mongodb/pull/25) 76 | 77 | ## 0.2.1 78 | * Clear stream state if replication method changes [#24](https://github.com/singer-io/tap-mongodb/pull/24) 79 | 80 | ## 0.2.0 81 | * Improve Oplog query performance by using only a timestamp and the `oplog_replay` arg. 
[#23](https://github.com/singer-io/tap-mongodb/pull/23) 82 | 83 | ## 0.1.11 84 | * Only bookmark latest ts on first sync for oplog [#22](https://github.com/singer-io/tap-mongodb/pull/22) 85 | 86 | ## 0.1.10 87 | * Fix for additional empty string projections [#21](https://github.com/singer-io/tap-mongodb/pull/21) 88 | 89 | ## 0.1.9 90 | * Make tap robust against projection that is empty string 91 | * Actually respect `INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME` prop 92 | * [#20](https://github.com/singer-io/tap-mongodb/pull/20) 93 | 94 | ## 0.1.8 95 | * Prefer secondary when connecting to Mongo [#19](https://github.com/singer-io/tap-mongodb/pull/19) 96 | 97 | ## 0.1.7 98 | * Full Table syncs can handle empty collections [#18](https://github.com/singer-io/tap-mongodb/pull/18) 99 | 100 | ## 0.1.6 101 | * Fix a bug with supporting bookmarks of ObjectId [#17](https://github.com/singer-io/tap-mongodb/pull/17) 102 | 103 | ## 0.1.5 104 | * Check for cases when the Oplog may have aged out and execute a full resync [#16](https://github.com/singer-io/tap-mongodb/pull/16) 105 | 106 | ## 0.1.4 107 | * Get global oplog timestamp instead of collection-specific [#15](https://github.com/singer-io/tap-mongodb/pull/15) 108 | 109 | ## 0.1.3 110 | * Support several new types for the `_id` column aside from ObjectID [#14](https://github.com/singer-io/tap-mongodb/pull/14) 111 | 112 | ## 0.1.2 113 | * Encode bytes back to base64 strings as we do not know the encodings [#13](https://github.com/singer-io/tap-mongodb/pull/13) 114 | 115 | ## 0.1.1 116 | * During key-based incremental sync, if replication-key changes, wipe state and resync table [#10](https://github.com/singer-io/tap-mongodb/pull/10) 117 | * Only support replication keys of types `datetime`, `timestamp`, `integer`, `ObjectId` [#10](https://github.com/singer-io/tap-mongodb/pull/10) 118 | * Only discover databases the user has read access for [#11](https://github.com/singer-io/tap-mongodb/pull/11) 119 | 120 | ## 0.1.0 121 | * Added key-based incremental sync [commit](https://github.com/singer-io/tap-mongodb/commit/b618b11d91e111680f70b402c6e94c9bf40c7b8f) 122 | 123 | ## 0.0.5 124 | * Fixed bug in oplog projections [commit](https://github.com/singer-io/tap-mongodb/commit/b400836678440499d4a15fb7d5b0a40a13e3342e) 125 | 126 | ## 0.0.4 127 | * Fixed bug in oplog projections [commit](https://github.com/singer-io/tap-mongodb/commit/527287e69661e9dbce3f05696b269025d0fc4034) 128 | * Added metric log printout at end of tap run [commit](https://github.com/singer-io/tap-mongodb/commit/d0403d82028b1dcc9ba306b52b2103ef00188b7d) 129 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := test 2 | 3 | test: 4 | pylint tap_mongodb tap_mongodb/sync_strategies -d missing-docstring,fixme,duplicate-code,line-too-long,too-many-statements,too-many-locals,consider-using-f-string,consider-using-from-import,broad-exception-raised,superfluous-parens,consider-using-generator,use-yield-from 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tap-mongodb 2 | 3 | This is a [Singer](https://singer.io) tap that produces JSON-formatted data following the [Singer spec](https://github.com/singer-io/getting-started/blob/master/SPEC.md) from a MongoDB source. 
4 | 5 | ## Set up Virtual Environment 6 | ``` 7 | python3 -m venv ~/.virtualenvs/tap-mongodb 8 | source ~/.virtualenvs/tap-mongodb/bin/activate 9 | ``` 10 | 11 | ## Install tap 12 | ``` 13 | pip install -U pip setuptools 14 | pip install tap-mongodb 15 | ``` 16 | 17 | ## Set up Config file 18 | Create a json file called `config.json`, with the following contents: 19 | ``` 20 | { 21 | "password": "", 22 | "user": "", 23 | "host": "", 24 | "port": "", 25 | "database": "" 26 | } 27 | ``` 28 | The following parameters are optional for your config file: 29 | 30 | | Name | Type | Description | 31 | | -----|------|------------ | 32 | | `replica_set` | string | name of replica set | 33 | |`ssl` | Boolean | can be set to true to connect using ssl | 34 | | `include_schema_in_destination_stream_name` | Boolean | forces the stream names to take the form `<database_name>_<collection_name>` instead of `<collection_name>`| 35 | 36 | All of the attributes in the config example above are required by the tap to connect to your mongo instance. 37 | 38 | ## Run in discovery mode 39 | Run the following command and redirect the output into the catalog file: 40 | ``` 41 | tap-mongodb --config ~/config.json --discover > ~/catalog.json 42 | ``` 43 | 44 | Your catalog file should now look like this: 45 | ``` 46 | { 47 | "streams": [ 48 | { 49 | "table_name": "", 50 | "tap_stream_id": "", 51 | "metadata": [ 52 | { 53 | "breadcrumb": [], 54 | "metadata": { 55 | "row-count":, 56 | "is-view": , 57 | "database-name": "", 58 | "table-key-properties": [ 59 | "_id" 60 | ], 61 | "valid-replication-keys": [ 62 | "_id" 63 | ] 64 | } 65 | } 66 | ], 67 | "stream": "", 68 | "schema": { 69 | "type": "object" 70 | } 71 | } 72 | ] 73 | } 74 | ``` 75 | 76 | ## Edit Catalog file 77 | ### Using valid json, edit the catalog.json file 78 | To select a stream, add the following to the stream's metadata: 79 | ``` 80 | "selected": true, 81 | "replication-method": , 82 | ``` 83 | 84 | The replication method must be either `FULL_TABLE` or `LOG_BASED` 85 | 86 | To add a projection to a stream, add the following to the stream's metadata field: 87 | ``` 88 | "tap-mongodb.projection": 89 | ``` 90 | 91 | For example, if you were to edit the example stream to select the stream as well as add a projection, catalog.json should look like this: 92 | ``` 93 | { 94 | "streams": [ 95 | { 96 | "table_name": "
", 97 | "tap_stream_id": "", 98 | "metadata": [ 99 | { 100 | "breadcrumb": [], 101 | "metadata": { 102 | "row-count": , 103 | "is-view": , 104 | "database-name": "", 105 | "table-key-properties": [ 106 | "_id" 107 | ], 108 | "valid-replication-keys": [ 109 | "_id" 110 | ], 111 | "selected": true, 112 | "replication-method": "", 113 | "tap-mongodb.projection": "" 114 | } 115 | } 116 | ], 117 | "stream": "", 118 | "schema": { 119 | "type": "object" 120 | } 121 | } 122 | ] 123 | } 124 | 125 | ``` 126 | ## Run in sync mode: 127 | `tap-mongodb --config ~/config.json --catalog ~/catalog.json` 128 | 129 | The tap will write bookmarks to stdout which can be captured and passed as an optional `--state state.json` parameter to the tap for the next sync. 130 | 131 | ## Supplemental MongoDB Info 132 | 133 | ### Local MongoDB Setup 134 | If you haven't yet set up a local mongodb client, follow [these instructions](https://github.com/singer-io/tap-mongodb/blob/master/spikes/local_mongo_setup.md) 135 | 136 | --- 137 | 138 | Copyright © 2019 Stitch 139 | -------------------------------------------------------------------------------- /bin/populate_test_database.py: -------------------------------------------------------------------------------- 1 | import pymongo # requires dnspython package as well 2 | import sys 3 | import bson 4 | import datetime 5 | import re 6 | import pprint 7 | import time 8 | import decimal 9 | import string 10 | import random 11 | 12 | 13 | 14 | 15 | #------ Local mongo server ------ 16 | username = sys.argv[1] 17 | password = sys.argv[2] 18 | host= '127.0.0.1' 19 | auth_source = 'test' 20 | ssl = False 21 | client = pymongo.MongoClient(host=host, username=username, password=password, port=27017, authSource=auth_source, ssl=ssl) 22 | 23 | databases = { 24 | "simple_db": ["simple_coll_1", "simple_coll_2"], 25 | "datatype_db": ["datatype_coll_1", "datatype_coll_2"], 26 | } 27 | 28 | 29 | ############# Drop all dbs/collections ############# 30 | for db_name, colls in databases.items(): 31 | for coll_name in colls: 32 | print("---- Dropping database: " + db_name + ", collection: " + coll_name + " ----") 33 | client[db_name][coll_name].drop() 34 | 35 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 36 | return ''.join(random.choice(chars) for x in range(size)) 37 | 38 | def generate_simple_coll_docs(num_docs): 39 | docs = [] 40 | for int_value in range(num_docs): 41 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 42 | return docs 43 | 44 | 45 | ############# Add simple collections ############# 46 | # simple_coll_1 has 50 documents 47 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 48 | 49 | # simple_coll_2 has 100 documents 50 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 51 | 52 | 53 | ############# Add datatype collections ############# 54 | pattern = re.compile('.*') 55 | regex = bson.Regex.from_native(pattern) 56 | regex.flags ^= re.UNICODE 57 | 58 | datatype_doc = { 59 | "double_field": 4.3, 60 | "string_field": "a sample string", 61 | "object_field" : { 62 | "obj_field_1_key": "obj_field_1_val", 63 | "obj_field_2_key": "obj_field_2_val" 64 | }, 65 | "array_field" : [ 66 | "array_item_1", 67 | "array_item_2", 68 | "array_item_3" 69 | ], 70 | "binary_data_field" : b"a binary string", 71 | "object_id_field": bson.objectid.ObjectId(b'123456789123'), 72 | "boolean_field" : True, 73 | "date_field" : datetime.datetime.now(), 74 | "null_field": 
None, 75 | "regex_field" : regex, 76 | "32_bit_integer_field" : 32, 77 | "timestamp_field" : bson.timestamp.Timestamp(int(time.time()), 1), 78 | "64_bit_integer_field" : 34359738368, 79 | "decimal_field" : bson.Decimal128(decimal.Decimal('1.34')), 80 | "javaScript_field" : bson.code.Code("var x, y, z;"), 81 | "javaScript_with_scope_field" : bson.code.Code("function incrementX() { x++; }", scope={"x": 1}), 82 | "min_key_field" : bson.min_key.MinKey(), # MinKey/MaxKey must be instances, not the classes, to be BSON-encodable 83 | "max_key_field" : bson.max_key.MaxKey() 84 | } 85 | 86 | client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc) 87 | client["datatype_db"]["datatype_coll_2"].insert_one(datatype_doc) 88 | 89 | print("\nPrinting database contents") 90 | for db_name in client.list_database_names(): 91 | if db_name in ['admin', 'config', 'local']: 92 | continue 93 | for collection_name in client[db_name].list_collection_names(): 94 | print('\n---- Database: '+ db_name +', Collection: ' + collection_name + " ----") 95 | for doc in client[db_name][collection_name].find(): 96 | print(doc) 97 | 98 | 99 | -------------------------------------------------------------------------------- /bin/test-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | import subprocess 6 | import time 7 | from argparse import RawTextHelpFormatter 8 | 9 | # singerio images have required keyfile 10 | image_name = "singerio/mongo" 11 | 12 | # organize command options based on image_tag 13 | command_opts = { # top level keys = supported image_tag versions, values = shell 14 | '4.2-bionic': 'mongo', # version 5.0.15 TODO Remove? 15 | '4.4-bionic': 'mongo', # version 4.4.6, also supports mongosh? 16 | '5.0': 'mongosh', # version 5.0.15, also supports mongo 17 | '6.0': 'mongosh', # version 6.0.4 18 | } 19 | 20 | def start_container(name, image_tag): 21 | 22 | START_COMMAND = """ 23 | sudo docker run -e "MONGO_INITDB_ROOT_USERNAME={0}" -e "MONGO_INITDB_ROOT_PASSWORD={1}" \ 24 | -p {2}:{2} --name {3} \ 25 | -d {4}:{5} \ 26 | --auth \ 27 | --keyFile /opt/mongo/keyfile --replSet rs0 28 | """.format(os.getenv('TAP_MONGODB_USER'), 29 | os.getenv('TAP_MONGODB_PASSWORD'), 30 | os.getenv('TAP_MONGODB_PORT'), 31 | name, 32 | image_name, 33 | image_tag) 34 | 35 | print("Starting Docker process mongo1 using command: {}".format(START_COMMAND)) 36 | 37 | proc = subprocess.run(START_COMMAND, shell=True) 38 | if proc.returncode != 0: 39 | sys.exit("Exited with code: {}, the docker process failed to start.".format(proc.returncode)) 40 | print("Process started successfully. 
Starting Oplog replication.") 41 | 42 | # Sleeping to allow Mongo enough time to start up 43 | time.sleep(5) 44 | 45 | ip_addr = get_ip_addr(name) 46 | # If using image_version <=4.4.0-bionic use mongo, not mongosh 47 | CONFIGURE_COMMAND = """ 48 | docker exec {} {} --host {} test -u {} -p {} --authenticationDatabase admin --eval {} 49 | """.format( 50 | name, 51 | command_opts[image_tag], 52 | ip_addr, 53 | os.getenv('TAP_MONGODB_USER'), 54 | os.getenv('TAP_MONGODB_PASSWORD'), 55 | '\'rs.initiate({_id: "rs0", members: [{_id: 0, host: "127.0.0.1:27017"}]})\'') 56 | print("Initiate replSet using: {}".format(CONFIGURE_COMMAND)) 57 | proc = subprocess.run(CONFIGURE_COMMAND, shell=True) 58 | if proc.returncode != 0: 59 | sys.exit("Exited with code: {}, the docker command failed.".format(proc.returncode)) 60 | print("Oplog configured correctly.") 61 | 62 | def get_ip_addr(name): 63 | IP_ADDR_COMMAND = "docker inspect {} | jq -r .[].NetworkSettings.IPAddress" 64 | print("Retrieving IP addr of mongodb container") 65 | ip_addr = subprocess.check_output(IP_ADDR_COMMAND.format(name), shell=True).decode('utf-8').rstrip() 66 | print(ip_addr) 67 | return ip_addr 68 | 69 | def stop_container(name): 70 | STOP_COMMAND = "sudo docker stop {0} && sudo docker rm {0}" 71 | 72 | print("Stopping Docker process {}".format(name)) 73 | proc = subprocess.run(STOP_COMMAND.format(name), shell=True) 74 | if proc.returncode != 0: 75 | sys.exit("Exited with code: {}, the docker process failed to stop.".format(proc.returncode)) 76 | print("Process stopped successfully") 77 | 78 | def connect_to_db(name, image_tag): 79 | CONNECT_COMMAND = "docker run -it --rm {}:{} {} --host {} test -u {} -p {} --authenticationDatabase admin" 80 | ip_addr = get_ip_addr(name) 81 | 82 | print("Attempting to connect to running container using a mongo container") 83 | # Note: Shell is determined based on user provided image_tag, connect may fail if the shell 84 | # associated with the user provided image_tag is not supported by the running DB version. 85 | connect_command_format = CONNECT_COMMAND.format(image_name, 86 | image_tag, 87 | command_opts[image_tag], 88 | ip_addr, 89 | os.getenv('TAP_MONGODB_USER'), 90 | os.getenv('TAP_MONGODB_PASSWORD')) 91 | print(connect_command_format) 92 | # NB: Using call instead of run here because it is blocking 93 | # This returns only an exit code. 94 | returncode = subprocess.call(connect_command_format, 95 | shell=True) 96 | if returncode != 0: 97 | sys.exit("Exited with code: {}, could not connect.".format(returncode)) 98 | 99 | DESCRIPTION = """ 100 | Manage docker instance for tap-mongodb testing. 
101 | 102 | Uses environment variables: 103 | TAP_MONGODB_USER 104 | TAP_MONGODB_PASSWORD 105 | """ 106 | parser = argparse.ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 107 | parser.add_argument('action', choices=['start','stop', 'connect'], help='action to perform with the container') 108 | parser.add_argument('--name', help="name assigned to running docker process", default='mongo1') 109 | parser.add_argument('--image-tag', choices=command_opts.keys(), help='Supported image tags, default=6.0', default='6.0') 110 | 111 | def main(): 112 | parsed_args = parser.parse_args() 113 | # Potential arguments to add: pull, changing docker cointainer, changing password 114 | if parsed_args.action == 'start': 115 | start_container(parsed_args.name, parsed_args.image_tag) 116 | elif parsed_args.action == 'stop': 117 | stop_container(parsed_args.name) 118 | elif parsed_args.action == 'connect': 119 | connect_to_db(parsed_args.name, parsed_args.image_tag) 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='tap-mongodb', 6 | version='3.1.4', 7 | description='Singer.io tap for extracting data from MongoDB', 8 | author='Stitch', 9 | url='https://singer.io', 10 | classifiers=['Programming Language :: Python :: 3 :: Only'], 11 | py_modules=['tap_mongodb'], 12 | install_requires=[ 13 | 'singer-python==6.0.0', 14 | 'pymongo==4.10.1', 15 | 'tzlocal==2.0.0', 16 | ], 17 | extras_require={ 18 | 'dev': [ 19 | 'pylint', 20 | 'nose2', 21 | 'ipdb' 22 | ] 23 | }, 24 | entry_points=''' 25 | [console_scripts] 26 | tap-mongodb=tap_mongodb:main 27 | ''', 28 | packages=['tap_mongodb', 'tap_mongodb.sync_strategies'], 29 | 30 | ) 31 | -------------------------------------------------------------------------------- /spikes/atlas_setup.md: -------------------------------------------------------------------------------- 1 | # MongoDB Atlas Spike 2 | Atlas is MongoDB's cloud service that can be hosted on aws. We already have an Atlas account that can be accessed by: 3 | 1. Go to https://cloud.mongodb.com 4 | 2. 
Creds in 1pass 5 | 6 | ### Cluster Tiers 7 | | Tier | RAM | Storage | vCPU | Price/month | 8 | | ---- | --- | ------- | ---- | ---------- | 9 | | M0 | Shared | 512 MB | Shared | Free | 10 | | M2 | Shared | 2 GB | Shared | $9 | 11 | | M5 | Shared | 5 GB | Shared | $25 | 12 | | M10 | 2 GB | 10 GB | 1 vCPU | ~$58 | 13 | | M20 | 4 GB | 20 GB | 2 vCPU | ~$144 | 14 | 15 | M0, M2 shared, and M5 shared tiers have limitations 16 | - Each account can only have one M0 cluster 17 | - Can only use Mongo version 4.0 18 | - Cannot configure memory or storage size 19 | 20 | ### Possible Setups 21 | - Have one cluster with a separate db for dev/circle 22 | - Atlas allows, at a minimum, up to 100 connections 23 | - Have one cluster for dev, one for circle 24 | - Use projects -- Atlas projects are meant for separate, isolated environments 25 | 26 | Suggest having a separate project for dev & circle 27 | - each having a single cluster 28 | - https://docs.atlas.mongodb.com/tutorial/manage-projects/ 29 | - can add more if we start running into collisions (dev, circle, dev-tap-tester, harrison-tap-tester, etc) 30 | 31 | ### Connecting to Atlas 32 | - Download mongo community edition 33 | - [Installation Instructions](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/) 34 | - Installs mongo shell and a number of other modules 35 | 36 | #### Using Mongo Shell 37 | To connect to our M0 free tier cluster (username/password in 1Pass): 38 | ``` 39 | mongo mongodb+srv://stitch-upwjw.mongodb.net/test -u -p 40 | ``` 41 | 42 | ## Other Options Considered 43 | - Set up local mongo server 44 | - Seems like a lot more overhead, especially when using circle 45 | - Set up Mongo Stack on AWS (spins up EC2 instance with Mongo) 46 | - Much more expensive (~$12,000/year!) 47 | 48 | -------------------------------------------------------------------------------- /spikes/dbreps_and_hackathon_review.md: -------------------------------------------------------------------------------- 1 | # Review of dbreps and Chris C's hackathon project 2 | 3 | ## dbreps 4 | [sync_table](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj) 5 | file and 6 | [sync_structure](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj) 7 | file for mongo 8 | 9 | ### notes 10 | - retrieves fields to decide which fields are "bookmarkable" [get-index code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj#L29)[bookmarkable code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj#L39-L50) 11 | - retrieves row count for each collection [code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj#L70) 12 | - when opening cursor 13 | - specifies `QUERYOPTION_SLAVEOK` [code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj#L47) 14 | - when turned on, read queries will be directed to slave servers instead of the primary server 15 | - specifies `batch_size` 
[code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj#L68) 16 | - Uses the max of (2, 16 MB), and calls getDynamicFetchSize, so will ultimately set fetch size to 8 MB 17 | - pymongo uses a default of 1 MB, may want to look into changing this for efficiency 18 | - uses projections [code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj#L64-L71) 19 | 20 | ## Hackathon 21 | - Supports op-log and full-table rep 22 | - Client accepts authsource (db name) which defaults to 'admin' 23 | - we did not do this in our spike, should do it in tap 24 | - Discovery 25 | - ignores 26 | - dbs = ['admin', 'system', 'local'] 27 | - collections = ['system.indexes'] 28 | - does not discover fields, only writes `database-name` and `row-count` metadata 29 | - 'schema': { 30 | 'type': 'object' 31 | } 32 | - Sync 33 | - Prioritizes streams by: 34 | - Currently Syncing 35 | - Streams without state 36 | - streams with state 37 | - Non-oplog streams 38 | - Uses `custom-select-clause` metadata for a stream to get the select statement 39 | - streams that don't have this are skipped 40 | - whitelisting is done post select, we should use projections instead of this 41 | - oplog streams 42 | - works similar to other db taps 43 | - whitelisting again performed post select 44 | - Generally seems like a good starting point for our tap 45 | 46 | 47 | -------------------------------------------------------------------------------- /spikes/local_mongo_setup.md: -------------------------------------------------------------------------------- 1 | # Local MongoDB Setup 2 | 3 | ### Install MongoDB Community Edition 4 | Follow MongoDB Manual directions to install MongoDB Community Edition on ubuntu [[3.2](https://docs.mongodb.com/v3.2/tutorial/install-mongodb-on-ubuntu/), [3.4](https://docs.mongodb.com/v3.4/tutorial/install-mongodb-on-ubuntu/), [3.6](https://docs.mongodb.com/v3.6/tutorial/install-mongodb-on-ubuntu/), [4.0](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/)] 5 | 6 | ### Add users, roles, authentication 7 | Follow steps 1-5 of these [instructions](https://docs.mongodb.com/manual/tutorial/enable-authentication/) to add a user administrator 8 | 9 | After step 5, run the following commands to create: 10 | - a user `stitch_root` that can enable oplog 11 | 12 | ``` 13 | use admin 14 | db.createUser( 15 | { 16 | user: "stitch_root", 17 | pwd: "", 18 | roles: [{role: "root", db: "admin"}] 19 | } 20 | ) 21 | ``` 22 | 23 | - a user `stitch_dev` that can create/read/write new dbs and access oplog 24 | 25 | ``` 26 | use test 27 | db.createUser( 28 | { 29 | user: "stitch_dev", 30 | pwd: "", 31 | roles: [ { role: "readWriteAnyDatabase", db: "admin" }, {role: "read", db: "local"} ] 32 | } 33 | ) 34 | ``` 35 | 36 | ### Enable Oplog 37 | 1. Edit `/etc/mongod.conf` and add a replica set: 38 | 39 | Assume superuser: 40 | ``` 41 | sudo su 42 | ``` 43 | 44 | Uncomment replication and add `replSetName` (indented) in `/etc/mongod.conf`: 45 | ``` 46 | replication: 47 | replSetName: rs0 48 | ``` 49 | 50 | Return to normal user with `C-d` 51 | 52 | 2. Restart mongod and pass it the --config flag: 53 | ``` 54 | sudo mongod --auth --config /etc/mongod.conf 55 | ``` 56 | 57 | 3. 
Initiate replica set 58 | 59 | Connect to shell as `stitch_root` user: 60 | 61 | ``` 62 | mongo --port 27017 -u stitch_root -p --authenticationDatabase admin 63 | ``` 64 | 65 | and initiate replica set: 66 | ``` 67 | rs.initiate({_id: "rs0", members: [{_id: 0, host: "127.0.0.1:27017"}]}) 68 | ``` 69 | 70 | 4. Check out that oplog 71 | 72 | Disconnect from shell and reconnect as `stitch_dev` user; 73 | 74 | ``` 75 | mongo --port 27017 -u stitch_dev -p --authenticationDatabase test 76 | ``` 77 | 78 | switch to local 79 | ``` 80 | use local 81 | ``` 82 | 83 | view oplog rows 84 | ``` 85 | db.oplog.rs.find() 86 | ``` 87 | 88 | ### Connect with shell 89 | Can now connect to Mongo via the mongo shell with: 90 | ``` 91 | mongo --host localhost --port 27017 --authenticationDatabase --username --password 92 | ``` 93 | -------------------------------------------------------------------------------- /spikes/pymongo_spike.py: -------------------------------------------------------------------------------- 1 | import pymongo # requires dnspython package as well 2 | import sys 3 | 4 | #------------------------ Setup Client ------------------------ 5 | 6 | #----- Atlas using connection string ----- 7 | #username = sys.argv[1] 8 | #password = sys.argv[2] 9 | #host = 'stitch-upwjw.mongodb.net' 10 | # connection_string = "mongodb+srv://{}:{}@{}/test".format(username, password, host) 11 | # client = pymongo.MongoClient(connection_string) 12 | 13 | #----- Atlas using connection props ----- 14 | # username = sys.argv[1] 15 | # password = sys.argv[2] 16 | # host=['stitch-shard-00-00-upwjw.mongodb.net', 17 | # 'stitch-shard-00-01-upwjw.mongodb.net', 18 | # 'stitch-shard-00-02-upwjw.mongodb.net'] 19 | # ssl = True # client must have ssl=True to connect to atlas cluster 20 | # client = pymongo.MongoClient(host=host, username=username, password=password, port=27017, ssl=True) 21 | 22 | #------ Local mongo server ------ 23 | username = sys.argv[1] 24 | password = sys.argv[2] 25 | host= '127.0.0.1' 26 | auth_source = 'test' 27 | ssl = False 28 | client = pymongo.MongoClient(host=host, username=username, password=password, port=27017, authSource=auth_source, ssl=ssl) 29 | 30 | # Get connection Info 31 | print("\nConnecting to MongoDB version " + client.server_info()['version']) 32 | 33 | # List dbs 34 | print("\nShowing Initial Databases...") 35 | print(client.list_database_names()) 36 | 37 | 38 | # Make db and collection 39 | # Note: MongoDB waits until you have created a collection (table), with at least one document (record) before it actually creates the database (and collection). 
40 | print("\nAdding database=spike_db and collection=sources_team_members...") 41 | spike_db = client["spike_db"] 42 | sources_team_members_coll = spike_db["sources_team_members"] 43 | 44 | # Add one document to collection 45 | print ("\nAdding nick to collection=sources_team_members...") 46 | sources_team_members_coll.insert_one({"name": "Nick", "membersince": 2018}) 47 | 48 | # Add multiple documents to collection 49 | print("\nAdding everyone else to collection=sources_team_members...") 50 | sources_team_members_coll.insert_many([{"name": "Jacob", "membersince": 2019, "my_object": {"nested_field": "some_value"}}, 51 | {"name": "Collin", "membersince": 2019}, 52 | {"name": "Dan", "membersince": 2017}, 53 | {"name": "Kyle", "membersince": 2016}, 54 | {"name": "Andy", "membersince": 2018}, 55 | {"name": "Brian", "membersince": 2014}, 56 | {"name": "Harrison", "membersince": 2018}]) 57 | 58 | 59 | print("\nShowing Databases...") 60 | print(client.list_database_names()) 61 | 62 | print("\nShowing collections in db=spike_db...") 63 | print(spike_db.list_collection_names()) 64 | 65 | print("\nShowing all documents in sources_team_members_coll...") 66 | for doc in sources_team_members_coll.find(): 67 | print(doc) 68 | 69 | print("\nShowing documents where membersince > 2016...") 70 | for doc in sources_team_members_coll.find({"membersince": {"$gt": 2016}}): 71 | print(doc) 72 | 73 | print("\nShow only name and id...") 74 | for doc in sources_team_members_coll.find({}, {"name": 1}): 75 | print(doc) 76 | 77 | print("\nShow only name...") 78 | for doc in sources_team_members_coll.find({}, {"name": 1, "_id": 0}): 79 | print(doc) 80 | 81 | print("\nUpdating Nick's membersince from 2017->2018...") 82 | update_result = sources_team_members_coll.update_one({"name": "Nick"}, {"$set": {"membersince": 2017}}) 83 | for doc in sources_team_members_coll.find(): 84 | print(doc) 85 | 86 | print("\nUpdating to add team field to all documents...") 87 | update_result = sources_team_members_coll.update_many({}, {"$set": {"team": "sources"}}) 88 | for doc in sources_team_members_coll.find(): 89 | print(doc) 90 | 91 | print("\nRemoving Harrison because he is NOT part of the team...") 92 | delete_result = sources_team_members_coll.delete_many({"name": "Harrison"}) 93 | for doc in sources_team_members_coll.find(): 94 | print(doc) 95 | 96 | oplog = client.local.oplog.rs 97 | first = oplog.find().sort('$natural', pymongo.ASCENDING).limit(-1).next() 98 | ts = first['ts'] 99 | 100 | should_print_oplog = True 101 | if should_print_oplog: 102 | print('\nPrinting oplog rows...') 103 | 104 | with client.local.oplog.rs.find({'ts': {'$gt': ts}}, 105 | oplog_replay=True) as cursor: 106 | for row in cursor: 107 | if row['op'] in ['i', 'u', 'd']: 108 | print({k: row[k] if row.get(k) else '' for k in ['o', 'o2', 'ns', 'op']}) 109 | 110 | 111 | print("\nDeleting the collection and database...") 112 | sources_team_members_coll.drop() 113 | 114 | print("\nShowing Databases...") 115 | print(client.list_database_names()) 116 | -------------------------------------------------------------------------------- /spikes/supported_versions_spike.md: -------------------------------------------------------------------------------- 1 | # tap-mongodb suppored versions & flavors spike 2 | 3 | ## Connecting to mongodb (shell and via pymongo) 4 | ### Mongo Shell 5 | Newer versions of the mongo shell should be backwards compatible for the 6 | commands we'll be running. 
Any new features (mostly helper stuff) 7 | introduced to the mongo shell won't work with previous versions of 8 | mongodb. 9 | 10 | ### PyMongo 11 | 12 | Mongo officially supports versions 3.4, 3.6, 4.0. They are ending support for 3.4 in Jan 2020 13 | 14 | According to the 15 | [Pymongo docs on compatibility](https://docs.mongodb.com/ecosystem/drivers/pymongo/#compatibility), 16 | pymongo version 3.7/3.8 supports 17 | - 4.0 18 | - 3.6 19 | - 3.4 20 | - 3.2 21 | - 3.0 22 | - 2.6 23 | 24 | **We believe this means that any major differences in the client mongo version should be handled by pymongo** 25 | 26 | ## Replica sets and sharded clusters 27 | - A replica set is a cluster of MongoDB servers that implements 28 | replication and automated failover. It is MongoDB's recommended replication 29 | strategy 30 | - With sharding, each shard contains a subset of sharded data for a 31 | sharded cluster. Together, the cluster's shards hold the entire data set 32 | for the cluster. 33 | - Users, clients, or applications should only directly connect to a 34 | shard to perform local administrative and maintenance operations. 35 | - As of MongoDB 3.6, shards must be deployed as a replica set to provide 36 | redundancy and high availability. 37 | - [Docs on shards](https://docs.mongodb.com/manual/core/sharded-cluster-shards/) 38 | - [Docs on replication](https://docs.mongodb.com/manual/replication/) 39 | - Basically, we should connect to the cluster, never to an individual 40 | shard/replica (see the connection sketch at the end of this doc) 41 | 42 | ## Mongo-as-a-service mLab? 43 | - mLab is "not accepting new customers" and migrating existing ones to 44 | Atlas (what we test with) 45 | - It looks like there was no difference though in the way you connected 46 | to it via shell/driver 47 | 48 | ## Test Instance versions 49 | 50 | For Atlas, the free version (and M2/M5 shared clusters) default to the 51 | latest version. You can choose the version for the M10 (paid) clusters, so 52 | we'll have the ability to spin up test clusters for different versions if 53 | we choose (and pay). 54 | 55 | We recommend testing on the latest version since we believe pymongo will 56 | handle version differences within the tap. If we uncover major bugs due to 57 | version differences, we can consider spinning up multiple clusters on 58 | different versions to test with. 
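As a quick illustration of the point above about connecting to the cluster rather than to an individual shard or replica, here is a minimal pymongo sketch. The cluster hostname is hypothetical and the credentials are assumed to come from environment variables; like `pymongo_spike.py`, a `mongodb+srv://` URI also needs the dnspython package installed.

```python
import os
from urllib.parse import quote_plus

import pymongo  # mongodb+srv URIs also require the dnspython package

# Hypothetical Atlas-style SRV hostname: the SRV record lets pymongo discover
# the replica set members / mongos routers itself, so the client never points
# at a single shard or replica directly.
uri = "mongodb+srv://{user}:{password}@cluster0.example.mongodb.net/test".format(
    user=quote_plus(os.environ["TAP_MONGODB_USER"]),
    password=quote_plus(os.environ["TAP_MONGODB_PASSWORD"]),
)

client = pymongo.MongoClient(uri)
print("Connected to MongoDB version " + client.server_info()["version"])
```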
59 | -------------------------------------------------------------------------------- /tap_mongodb/sync_strategies/full_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import copy 3 | import time 4 | import pymongo 5 | import singer 6 | from singer import metadata, utils 7 | import tap_mongodb.sync_strategies.common as common 8 | 9 | LOGGER = singer.get_logger() 10 | 11 | def get_max_id_value(collection, projection=None): 12 | if projection is None: 13 | row = collection.find_one(sort=[("_id", pymongo.DESCENDING)]) 14 | else: 15 | row = collection.find_one(sort=[("_id", pymongo.DESCENDING)], 16 | projection=projection) 17 | if row: 18 | return row['_id'] 19 | 20 | LOGGER.info("No max id found for collection: collection is likely empty") 21 | return None 22 | 23 | 24 | # pylint: disable=too-many-locals,invalid-name,too-many-statements 25 | def sync_collection(client, stream, state, projection): 26 | tap_stream_id = stream['tap_stream_id'] 27 | LOGGER.info('Starting full table sync for %s', tap_stream_id) 28 | 29 | md_map = metadata.to_map(stream['metadata']) 30 | database_name = metadata.get(md_map, (), 'database-name') 31 | 32 | db = client[database_name] 33 | collection = db[stream['stream']] 34 | 35 | #before writing the table version to state, check if we had one to begin with 36 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 37 | 38 | # last run was interrupted if there is a last_id_fetched bookmark 39 | was_interrupted = singer.get_bookmark(state, 40 | stream['tap_stream_id'], 41 | 'last_id_fetched') is not None 42 | 43 | #pick a new table version if last run wasn't interrupted 44 | if was_interrupted: 45 | stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 46 | else: 47 | stream_version = int(time.time() * 1000) 48 | 49 | state = singer.write_bookmark(state, 50 | stream['tap_stream_id'], 51 | 'version', 52 | stream_version) 53 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 54 | 55 | activate_version_message = singer.ActivateVersionMessage( 56 | stream=common.calculate_destination_stream_name(stream), 57 | version=stream_version 58 | ) 59 | 60 | # For the initial replication, emit an ACTIVATE_VERSION message 61 | # at the beginning so the records show up right away. 
62 | if first_run: 63 | singer.write_message(activate_version_message) 64 | 65 | if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'): 66 | # There is a bookmark 67 | max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value') 68 | max_id_type = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type') 69 | max_id_value = common.string_to_class(max_id_value, max_id_type) 70 | else: 71 | max_id_value = get_max_id_value(collection, projection) 72 | 73 | last_id_fetched = singer.get_bookmark(state, 74 | stream['tap_stream_id'], 75 | 'last_id_fetched') 76 | 77 | if max_id_value: 78 | # Write the bookmark if max_id_value is defined 79 | state = singer.write_bookmark(state, 80 | stream['tap_stream_id'], 81 | 'max_id_value', 82 | common.class_to_string(max_id_value, 83 | max_id_value.__class__.__name__)) 84 | state = singer.write_bookmark(state, 85 | stream['tap_stream_id'], 86 | 'max_id_type', 87 | max_id_value.__class__.__name__) 88 | 89 | find_filter = {'$lte': max_id_value} 90 | if last_id_fetched: 91 | last_id_fetched_type = singer.get_bookmark(state, 92 | stream['tap_stream_id'], 93 | 'last_id_fetched_type') 94 | find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type) 95 | 96 | query_message = 'Querying {} with:\n\tFind Parameters: {}'.format( 97 | stream['tap_stream_id'], 98 | find_filter) 99 | if projection: 100 | query_message += '\n\tProjection: {}'.format(projection) 101 | # pylint: disable=logging-format-interpolation 102 | LOGGER.info(query_message) 103 | 104 | 105 | with collection.find({'_id': find_filter}, 106 | projection, 107 | sort=[("_id", pymongo.ASCENDING)]) as cursor: 108 | rows_saved = 0 109 | time_extracted = utils.now() 110 | start_time = time.time() 111 | 112 | schema = {"type": "object", "properties": {}} 113 | for row in cursor: 114 | rows_saved += 1 115 | 116 | schema_build_start_time = time.time() 117 | if common.row_to_schema(schema, row): 118 | singer.write_message(singer.SchemaMessage( 119 | stream=common.calculate_destination_stream_name(stream), 120 | schema=schema, 121 | key_properties=['_id'])) 122 | common.SCHEMA_COUNT[stream['tap_stream_id']] += 1 123 | common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time 124 | 125 | record_message = common.row_to_singer_record(stream, 126 | row, 127 | stream_version, 128 | time_extracted) 129 | 130 | singer.write_message(record_message) 131 | 132 | state = singer.write_bookmark(state, 133 | stream['tap_stream_id'], 134 | 'last_id_fetched', 135 | common.class_to_string(row['_id'], 136 | row['_id'].__class__.__name__)) 137 | state = singer.write_bookmark(state, 138 | stream['tap_stream_id'], 139 | 'last_id_fetched_type', 140 | row['_id'].__class__.__name__) 141 | 142 | 143 | if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0: 144 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 145 | 146 | common.COUNTS[tap_stream_id] += rows_saved 147 | common.TIMES[tap_stream_id] += time.time()-start_time 148 | 149 | # clear max pk value and last pk fetched upon successful sync 150 | singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value') 151 | singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type') 152 | singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched') 153 | singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type') 154 | 155 | state = singer.write_bookmark(state, 156 | stream['tap_stream_id'], 157 | 'initial_full_table_complete', 158 | 
True) 159 | 160 | singer.write_message(activate_version_message) 161 | 162 | LOGGER.info('Synced {} records for {}'.format(rows_saved, tap_stream_id)) 163 | -------------------------------------------------------------------------------- /tap_mongodb/sync_strategies/incremental.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import copy 3 | import time 4 | import pymongo 5 | import singer 6 | from singer import metadata, utils 7 | import tap_mongodb.sync_strategies.common as common 8 | 9 | LOGGER = singer.get_logger() 10 | 11 | 12 | def update_bookmark(row, state, tap_stream_id, replication_key_name): 13 | replication_key_value = row.get(replication_key_name) 14 | if replication_key_value: 15 | replication_key_type = replication_key_value.__class__.__name__ 16 | 17 | replication_key_value_bookmark = common.class_to_string(replication_key_value, 18 | replication_key_type) 19 | state = singer.write_bookmark(state, 20 | tap_stream_id, 21 | 'replication_key_value', 22 | replication_key_value_bookmark) 23 | state = singer.write_bookmark(state, 24 | tap_stream_id, 25 | 'replication_key_type', 26 | replication_key_type) 27 | 28 | # pylint: disable=too-many-locals, too-many-statements 29 | def sync_collection(client, stream, state, projection): 30 | tap_stream_id = stream['tap_stream_id'] 31 | LOGGER.info('Starting incremental sync for %s', tap_stream_id) 32 | 33 | stream_metadata = metadata.to_map(stream['metadata']).get(()) 34 | collection = client[stream_metadata['database-name']][stream['stream']] 35 | 36 | #before writing the table version to state, check if we had one to begin with 37 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 38 | 39 | #pick a new table version if last run wasn't interrupted 40 | if first_run: 41 | stream_version = int(time.time() * 1000) 42 | else: 43 | stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 44 | 45 | state = singer.write_bookmark(state, 46 | stream['tap_stream_id'], 47 | 'version', 48 | stream_version) 49 | 50 | activate_version_message = singer.ActivateVersionMessage( 51 | stream=common.calculate_destination_stream_name(stream), 52 | version=stream_version 53 | ) 54 | 55 | 56 | # For the initial replication, emit an ACTIVATE_VERSION message 57 | # at the beginning so the records show up right away. 
58 | if first_run: 59 | singer.write_message(activate_version_message) 60 | 61 | # get replication key, and bookmarked value/type 62 | stream_state = state.get('bookmarks', {}).get(tap_stream_id, {}) 63 | 64 | replication_key_name = stream_metadata.get('replication-key') 65 | replication_key_value_bookmark = stream_state.get('replication_key_value') 66 | 67 | # write state message 68 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 69 | 70 | # create query 71 | find_filter = {} 72 | if replication_key_value_bookmark: 73 | find_filter[replication_key_name] = {} 74 | find_filter[replication_key_name]['$gte'] = \ 75 | common.string_to_class(replication_key_value_bookmark, 76 | stream_state.get('replication_key_type')) 77 | 78 | # log query 79 | query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter) 80 | if projection: 81 | query_message += '\n\tProjection: {}'.format(projection) 82 | LOGGER.info(query_message) 83 | 84 | 85 | # query collection 86 | schema = {"type": "object", "properties": {}} 87 | with collection.find(find_filter, 88 | projection, 89 | sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor: 90 | rows_saved = 0 91 | time_extracted = utils.now() 92 | start_time = time.time() 93 | 94 | for row in cursor: 95 | schema_build_start_time = time.time() 96 | if common.row_to_schema(schema, row): 97 | singer.write_message(singer.SchemaMessage( 98 | stream=common.calculate_destination_stream_name(stream), 99 | schema=schema, 100 | key_properties=['_id'])) 101 | common.SCHEMA_COUNT[tap_stream_id] += 1 102 | common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time 103 | 104 | 105 | record_message = common.row_to_singer_record(stream, 106 | row, 107 | stream_version, 108 | time_extracted) 109 | 110 | # gen_schema = common.row_to_schema_message(schema, record_message.record, row) 111 | # if DeepDiff(schema, gen_schema, ignore_order=True) != {}: 112 | # emit gen_schema 113 | # schema = gen_schema 114 | singer.write_message(record_message) 115 | rows_saved += 1 116 | 117 | update_bookmark(row, state, tap_stream_id, replication_key_name) 118 | 119 | if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0: 120 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 121 | 122 | 123 | common.COUNTS[tap_stream_id] += rows_saved 124 | common.TIMES[tap_stream_id] += time.time()-start_time 125 | 126 | singer.write_message(activate_version_message) 127 | 128 | LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id) 129 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singer-io/tap-mongodb/9db2c92efc089272ab17bdeaa41bcfe8da82d12d/tests/__init__.py -------------------------------------------------------------------------------- /tests/mongodb_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | from tap_tester.logger import LOGGER 4 | 5 | 6 | def ensure_environment_variables_set(): 7 | missing_envs = [x for x in ['TAP_MONGODB_HOST', 8 | 'TAP_MONGODB_USER', 9 | 'TAP_MONGODB_PASSWORD', 10 | 'TAP_MONGODB_PORT', 11 | 'TAP_MONGODB_DBNAME'] if os.getenv(x) is None] 12 | if len(missing_envs) != 0: 13 | raise Exception(f"set environment variables: {missing_envs}") 14 | 15 | ########################################################################## 16 | ### Database 
Interactions 17 | ########################################################################## 18 | 19 | def get_test_connection(): 20 | username = os.getenv('TAP_MONGODB_USER') 21 | password = os.getenv('TAP_MONGODB_PASSWORD') 22 | host= os.getenv('TAP_MONGODB_HOST') 23 | auth_source = os.getenv('TAP_MONGODB_DBNAME') 24 | port = int(os.getenv('TAP_MONGODB_PORT')) 25 | ssl = False 26 | conn = pymongo.MongoClient(host=host, username=username, password=password, port=port, 27 | authSource=auth_source, ssl=ssl, uuidRepresentation='standard') 28 | return conn 29 | 30 | def drop_all_collections(client): 31 | ############# Drop all dbs/collections ############# 32 | for db_name in client.list_database_names(): 33 | if db_name in ['config', 'local', 'system']: 34 | continue 35 | for collection_name in client[db_name].list_collection_names(): 36 | if collection_name in ['system.views', 'system.version', 'system.keys', 'system.users']: 37 | continue 38 | LOGGER.info("Dropping database: " + db_name + ", collection: " + collection_name) 39 | client[db_name][collection_name].drop() 40 | -------------------------------------------------------------------------------- /tests/test_mongodb_cname_restrictions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | from bson import ObjectId 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | class MongoDBCollectionNameRestrictions(unittest.TestCase): 24 | ''' Test edge case collection name restrictions per the documentation (leading '_' or digit) 25 | Reference https://jira.talendforge.org/browse/TDL-18990 for details ''' 26 | 27 | def setUp(self): 28 | 29 | ensure_environment_variables_set() 30 | 31 | with get_test_connection() as client: 32 | ############# Drop all dbs/collections ############# 33 | drop_all_collections(client) 34 | 35 | ############# Add simple collections ############# 36 | # 1_simple_coll has 50 documents 37 | client["simple_db"]["1_simple_coll"].insert_many(generate_simple_coll_docs(50)) 38 | 39 | # _simple_coll_2 has 100 documents 40 | client["simple_db"]["_simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 41 | 42 | def expected_check_streams(self): 43 | return { 44 | 'simple_db-1_simple_coll', 45 | 'simple_db-_simple_coll_2', 46 | } 47 | 48 | def expected_pks(self): 49 | return { 50 | '1_simple_coll': {'_id'}, 51 | '_simple_coll_2': {'_id'}, 52 | } 53 | 54 | def expected_row_counts(self): 55 | return { 56 | '1_simple_coll': 50, 57 | '_simple_coll_2': 100, 58 | } 59 | 60 | def expected_sync_streams(self): 61 | return { 62 | '1_simple_coll', 63 | '_simple_coll_2' 64 | } 65 | 66 | def name(self): 67 | return "tap_tester_mongodb_cname_restrict" 68 | 69 | def tap_name(self): 70 | return "tap-mongodb" 71 | 72 | def get_type(self): 73 | return "platform.mongodb" 74 | 75 | def get_credentials(self): 76 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 77 | 78 | def get_properties(self): 79 | return {'host' 
: os.getenv('TAP_MONGODB_HOST'), 80 | 'port' : os.getenv('TAP_MONGODB_PORT'), 81 | 'user' : os.getenv('TAP_MONGODB_USER'), 82 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 83 | } 84 | 85 | 86 | def test_run(self): 87 | 88 | conn_id = connections.ensure_connection(self) 89 | 90 | # --------------------------------- 91 | # ----------- Discovery ---------- 92 | # --------------------------------- 93 | 94 | # run in discovery mode 95 | check_job_name = runner.run_check_mode(self, conn_id) 96 | 97 | # verify check exit codes 98 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 99 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 100 | 101 | # verify the tap discovered the right streams 102 | found_catalogs = menagerie.get_catalogs(conn_id) 103 | 104 | # assert we find the correct streams 105 | self.assertEqual(self.expected_check_streams(), 106 | {c['tap_stream_id'] for c in found_catalogs}) 107 | 108 | for tap_stream_id in self.expected_check_streams(): 109 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 110 | 111 | # assert that the pks are correct 112 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 113 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 114 | 115 | # assert that the row counts are correct 116 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 117 | found_stream.get('metadata', {}).get('row-count')) 118 | 119 | # ---------------------------------------- 120 | # ----------- Initial Full Table --------- 121 | # ---------------------------------------- 122 | 123 | # Select 1_simple_coll and _simple_coll_2 streams and add replication method metadata 124 | for stream_catalog in found_catalogs: 125 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 126 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 127 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 128 | stream_catalog, 129 | annotated_schema, 130 | additional_md) 131 | 132 | # Run sync 133 | sync_job_name = runner.run_sync_mode(self, conn_id) 134 | 135 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 136 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 137 | 138 | # verify the persisted schema was correct 139 | records_by_stream = runner.get_records_from_target_output() 140 | 141 | # assert that each of the streams that we synced are the ones that we expect to see 142 | record_count_by_stream = runner.examine_target_output_file(self, 143 | conn_id, 144 | self.expected_sync_streams(), 145 | self.expected_pks()) 146 | 147 | # Verify that the full table was synced 148 | for tap_stream_id in self.expected_sync_streams(): 149 | self.assertGreaterEqual(record_count_by_stream[tap_stream_id],self.expected_row_counts()[tap_stream_id]) 150 | 151 | # Verify that we have 'initial_full_table_complete' bookmark 152 | state = menagerie.get_state(conn_id) 153 | first_versions = {} 154 | 155 | for tap_stream_id in self.expected_check_streams(): 156 | # assert that the state has an initial_full_table_complete == True 157 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 158 | # assert that there is a version bookmark in state 159 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 160 | self.assertIsNotNone(first_versions[tap_stream_id]) 161 | # Verify that we have a oplog_ts_time and 
oplog_ts_inc bookmark 162 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 163 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 164 | 165 | 166 | changed_ids = set() 167 | with get_test_connection() as client: 168 | # Delete two documents for each collection 169 | 170 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 0})[0]['_id']) 171 | client["simple_db"]["1_simple_coll"].delete_one({'int_field': 0}) 172 | 173 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 1})[0]['_id']) 174 | client["simple_db"]["1_simple_coll"].delete_one({'int_field': 1}) 175 | 176 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 0})[0]['_id']) 177 | client["simple_db"]["_simple_coll_2"].delete_one({'int_field': 0}) 178 | 179 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 1})[0]['_id']) 180 | client["simple_db"]["_simple_coll_2"].delete_one({'int_field': 1}) 181 | 182 | # Update two documents for each collection 183 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 48})[0]['_id']) 184 | client["simple_db"]["1_simple_coll"].update_one({'int_field': 48},{'$set': {'int_field': -1}}) 185 | 186 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 49})[0]['_id']) 187 | client["simple_db"]["1_simple_coll"].update_one({'int_field': 49},{'$set': {'int_field': -1}}) 188 | 189 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 98})[0]['_id']) 190 | client["simple_db"]["_simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 191 | 192 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 99})[0]['_id']) 193 | client["simple_db"]["_simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 194 | 195 | # Insert two documents for each collection 196 | client["simple_db"]["1_simple_coll"].insert_one({"int_field": 50, "string_field": random_string_generator()}) 197 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 50})[0]['_id']) 198 | 199 | client["simple_db"]["1_simple_coll"].insert_one({"int_field": 51, "string_field": random_string_generator()}) 200 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 51})[0]['_id']) 201 | 202 | client["simple_db"]["_simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 203 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 100})[0]['_id']) 204 | 205 | client["simple_db"]["_simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 206 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 101})[0]['_id']) 207 | 208 | # ------------------------------------------- 209 | # ----------- Subsequent Oplog Sync --------- 210 | # ------------------------------------------- 211 | 212 | # Run sync 213 | sync_job_name = runner.run_sync_mode(self, conn_id) 214 | 215 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 216 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 217 | 218 | # verify the persisted schema was correct 219 | messages_by_stream = runner.get_records_from_target_output() 220 | records_by_stream = {} 221 | for stream_name in self.expected_sync_streams(): 222 | records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] 223 | if x.get('action') == 'upsert'] 224 | 225 | # assert that each of the streams 
that we synced are the ones that we expect to see 226 | record_count_by_stream = runner.examine_target_output_file(self, 227 | conn_id, 228 | self.expected_sync_streams(), 229 | self.expected_pks()) 230 | 231 | # Verify that we got at least 6 records due to changes 232 | # (could be more due to overlap in gte oplog clause) 233 | for k,v in record_count_by_stream.items(): 234 | self.assertGreaterEqual(v, 6) 235 | 236 | # Verify that we got 2 records with _SDC_DELETED_AT 237 | for stream in self.expected_sync_streams(): 238 | self.assertEqual(2, len([x['data'] for x in records_by_stream[stream] 239 | if x['data'].get('_sdc_deleted_at')])) 240 | # Verify that the _id of the records sent are the same set as the 241 | # _ids of the documents changed 242 | actual_ids = {ObjectId(x['data']['_id']) for stream in self.expected_sync_streams() 243 | for x in records_by_stream[stream]} 244 | self.assertEqual(changed_ids, actual_ids) 245 | -------------------------------------------------------------------------------- /tests/test_mongodb_configurable_properties.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | 6 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 7 | from tap_tester import connections, menagerie, runner 8 | 9 | RECORD_COUNT = {} 10 | 11 | 12 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 13 | return ''.join(random.choice(chars) for x in range(size)) 14 | 15 | 16 | def generate_simple_coll_docs(num_docs): 17 | docs = [] 18 | for int_value in range(num_docs): 19 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 20 | return docs 21 | 22 | 23 | class MongoDBConfigurableProperty(unittest.TestCase): 24 | 25 | def setUp(self): 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | # drop all dbs/collections 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 25 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(25)) 34 | 35 | # simple_coll_2 has 50 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(50)) 37 | 38 | def name(self): 39 | return "tap_tester_mongodb_configurable_property" 40 | 41 | def tap_name(self): 42 | return "tap-mongodb" 43 | 44 | def get_type(self): 45 | return "platform.mongodb" 46 | 47 | def get_credentials(self): 48 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 49 | 50 | def expected_check_streams(self): 51 | return { 52 | 'simple_db-simple_coll_1', 53 | 'simple_db-simple_coll_2' 54 | } 55 | 56 | def expected_pks_log_based(self): 57 | return { 58 | 'simple_coll_1': {'_id'}, 59 | 'simple_coll_2': {'_id'} 60 | } 61 | 62 | def expected_pks_include_schemas(self): 63 | return { 64 | 'simple_db_simple_coll_1': {'_id'}, 65 | 'simple_db_simple_coll_2': {'_id'} 66 | } 67 | 68 | def expected_row_counts_log_based(self): 69 | return { 70 | 'simple_coll_1': 25, 71 | 'simple_coll_2': 50 72 | } 73 | 74 | def expected_row_counts_include_schemas(self): 75 | return { 76 | 'simple_db_simple_coll_1': 25, 77 | 'simple_db_simple_coll_2': 50 78 | } 79 | 80 | def expected_sync_streams_include_schemas(self): 81 | return { 82 | 'simple_db_simple_coll_1', 83 | 'simple_db_simple_coll_2' 84 | } 85 | 86 | def expected_sync_streams_log_based(self): 87 | return { 88 | 'simple_coll_1', 89 | 'simple_coll_2' 90 | } 91 | 92 | def 
run_test(self): 93 | 94 | conn_id = connections.ensure_connection(self) 95 | 96 | #original_properties=False) 97 | 98 | # ------------------------------- 99 | # ----------- Discovery ---------- 100 | # ------------------------------- 101 | 102 | # run in discovery mode 103 | check_job_name = runner.run_check_mode(self, conn_id) 104 | 105 | # verify check exit codes 106 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 107 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 108 | 109 | # verify the tap discovered the right streams 110 | found_catalogs = menagerie.get_catalogs(conn_id) 111 | 112 | # assert we find the correct streams 113 | self.assertEqual(self.expected_check_streams(), 114 | {c['tap_stream_id'] for c in found_catalogs}) 115 | 116 | # ------------------------------------------- 117 | # ----------- First full Table Sync --------- 118 | # ------------------------------------------- 119 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 120 | for stream_catalog in found_catalogs: 121 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 122 | additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'FULL_TABLE'}}] 123 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 124 | stream_catalog, 125 | annotated_schema, 126 | additional_md) 127 | 128 | # run full table sync 129 | sync_job_name = runner.run_sync_mode(self, conn_id) 130 | 131 | # check exit status 132 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 133 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 134 | 135 | return conn_id 136 | 137 | 138 | class MongoDBUseLogBasedReplication(MongoDBConfigurableProperty): 139 | 140 | def name(self): 141 | return "tt_mongodb_config_prop_log_based" 142 | 143 | def get_credentials(self): 144 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 145 | 146 | def get_properties(self): 147 | return {'host': os.getenv('TAP_MONGODB_HOST'), 148 | 'port': os.getenv('TAP_MONGODB_PORT'), 149 | 'user': os.getenv('TAP_MONGODB_USER'), 150 | 'database': os.getenv('TAP_MONGODB_DBNAME'), 151 | 'use_log_based_replication': 'true' 152 | } 153 | 154 | def test_run(self): 155 | conn_id = self.run_test() 156 | 157 | # streams that we synced are the ones that we expect to see 158 | records_by_stream = runner.get_records_from_target_output() 159 | record_count_by_stream = runner.examine_target_output_file(self, 160 | conn_id, 161 | self.expected_sync_streams_log_based(), 162 | self.expected_pks_log_based()) 163 | 164 | # assert that we get the correct number of records for each stream 165 | self.assertEqual(self.expected_row_counts_log_based(), record_count_by_stream) 166 | 167 | 168 | class MongoDBIncludeSchema(MongoDBConfigurableProperty): 169 | 170 | def name(self): 171 | return "tt_mongodb_config_prop_inc_schema" 172 | 173 | def get_properties(self): 174 | return {'host': os.getenv('TAP_MONGODB_HOST'), 175 | 'port': os.getenv('TAP_MONGODB_PORT'), 176 | 'user': os.getenv('TAP_MONGODB_USER'), 177 | 'database': os.getenv('TAP_MONGODB_DBNAME'), 178 | 'include_schemas_in_destination_stream_name': 'true' 179 | } 180 | 181 | def test_run(self): 182 | conn_id = self.run_test() 183 | 184 | # streams that we synced are the ones that we expect to see 185 | records_by_stream = runner.get_records_from_target_output() 186 | record_count_by_stream = runner.examine_target_output_file(self, 187 | conn_id, 188 | 
self.expected_sync_streams_include_schemas(), 189 | self.expected_pks_include_schemas()) 190 | 191 | # assert that we get the correct number of records for each stream 192 | self.assertEqual(self.expected_row_counts_include_schemas(), record_count_by_stream) 193 | -------------------------------------------------------------------------------- /tests/test_mongodb_datatype.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import datetime 3 | import decimal 4 | import os 5 | import re 6 | import subprocess 7 | import unittest 8 | import uuid 9 | 10 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 11 | from tap_tester import connections, menagerie, runner 12 | from tap_tester.logger import LOGGER 13 | 14 | 15 | RECORD_COUNT = {} 16 | 17 | 18 | def run_mongodb_javascript(database, js, mongo_version): 19 | """ 20 | Runs arbitrary javascript against the test Mongo instance. This is 21 | useful for setting up situations that Python can't handle (e.g., 22 | datetime with year 0) for testing. 23 | """ 24 | LOGGER.info("Running '{}' against database '{}'".format(js, database)) 25 | 26 | mongo_shell = "mongosh" if int(mongo_version.split(".")[0]) > 5 else "mongo" 27 | cmd = [mongo_shell, "-u", os.getenv('TAP_MONGODB_USER'), "-p", os.getenv('TAP_MONGODB_PASSWORD'), "--authenticationDatabase", os.getenv('TAP_MONGODB_DBNAME'), database, "--eval", "eval('{}')".format(js)] 28 | subprocess.run(cmd) 29 | 30 | 31 | class MongoDBDatatype(unittest.TestCase): 32 | # To compare large dictionaries 33 | maxDiff = None 34 | 35 | def setUp(self): 36 | ensure_environment_variables_set() 37 | 38 | with get_test_connection() as client: 39 | ############# Drop all dbs/collections ############# 40 | drop_all_collections(client) 41 | 42 | ############# Add datatype collections ############# 43 | pattern = re.compile('.*') 44 | regex = bson.Regex.from_native(pattern) 45 | regex.flags ^= re.UNICODE 46 | 47 | datatype_doc = { 48 | "double_field": 4.3, 49 | "string_field": "a sample string", 50 | "object_field" : { 51 | "obj_field_1_key": "obj_field_1_val", 52 | "obj_field_2_key": "obj_field_2_val" 53 | }, 54 | "array_field" : [ 55 | "array_item_1", 56 | "array_item_2", 57 | "array_item_3" 58 | ], 59 | "binary_data_field" : bson.Binary(b"a binary string"), 60 | "object_id_field": bson.objectid.ObjectId(b'123456789123'), 61 | "boolean_field" : True, 62 | "date_field" : datetime.datetime(2019, 8, 15, 19, 29, 14, 578000), 63 | "null_field": None, 64 | "regex_field" : regex, 65 | "32_bit_integer_field" : 32, 66 | "timestamp_field" : bson.timestamp.Timestamp(1565897157, 1), 67 | "64_bit_integer_field" : 34359738368, 68 | "decimal_field" : bson.Decimal128(decimal.Decimal('1.34')), 69 | "javaScript_field" : bson.code.Code("var x, y, z;"), 70 | "javaScript_with_scope_field" : bson.code.Code("function incrementX() { x++; }", scope={"x": 1}), 71 | "min_key_field" : bson.min_key.MinKey, 72 | "max_key_field" : bson.max_key.MaxKey, 73 | "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec72820c4'), 74 | "dbref_field": bson.dbref.DBRef("some_collection", bson.objectid.ObjectId(b'123456789123'), database='some_database') 75 | } 76 | 77 | client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc) 78 | 79 | # NB: Insert an invalid datetime to confirm that works correctly 80 | mongodb_version = client.server_info()["version"] 81 | run_mongodb_javascript(database="datatype_db", 82 | js="db.invalid_datatype_coll.insert({ 
\"date_field\": new ISODate(\"0000-01-01T00:00:00.000Z\") });", 83 | mongo_version=mongodb_version) 84 | 85 | def expected_check_streams(self): 86 | return { 87 | 'datatype_db-datatype_coll_1', 88 | 'datatype_db-invalid_datatype_coll' 89 | } 90 | 91 | def expected_pks(self): 92 | return { 93 | 'datatype_coll_1': {'_id'}, 94 | 'invalid_datatype_coll': {'_id'} 95 | } 96 | 97 | def expected_row_counts(self): 98 | return { 99 | 'datatype_coll_1': 1, 100 | 'invalid_datatype_coll': 1 101 | } 102 | 103 | 104 | def expected_sync_streams(self): 105 | return { 106 | 'datatype_coll_1', 107 | 'invalid_datatype_coll' 108 | } 109 | 110 | def name(self): 111 | return "tap_tester_mongodb_datatype" 112 | 113 | def tap_name(self): 114 | return "tap-mongodb" 115 | 116 | def get_type(self): 117 | return "platform.mongodb" 118 | 119 | def get_credentials(self): 120 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 121 | 122 | def get_properties(self): 123 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 124 | 'port' : os.getenv('TAP_MONGODB_PORT'), 125 | 'user' : os.getenv('TAP_MONGODB_USER'), 126 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 127 | } 128 | 129 | 130 | def test_run(self): 131 | 132 | conn_id = connections.ensure_connection(self) 133 | 134 | # ------------------------------- 135 | # ----------- Discovery ---------- 136 | # ------------------------------- 137 | 138 | # run in discovery mode 139 | check_job_name = runner.run_check_mode(self, conn_id) 140 | 141 | # verify check exit codes 142 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 143 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 144 | 145 | # verify the tap discovered the right streams 146 | found_catalogs = menagerie.get_catalogs(conn_id) 147 | 148 | # assert we find the correct streams 149 | self.assertEqual(self.expected_check_streams(), 150 | {c['tap_stream_id'] for c in found_catalogs}) 151 | 152 | 153 | 154 | for tap_stream_id in self.expected_check_streams(): 155 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 156 | 157 | # assert that the pks are correct 158 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 159 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 160 | 161 | # assert that the row counts are correct 162 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 163 | found_stream.get('metadata', {}).get('row-count')) 164 | 165 | # ----------------------------------- 166 | # ----------- Full Table Sync --------- 167 | # ----------------------------------- 168 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 169 | for stream_catalog in found_catalogs: 170 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 171 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 172 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 173 | stream_catalog, 174 | annotated_schema, 175 | additional_md) 176 | 177 | # run full table sync 178 | sync_job_name = runner.run_sync_mode(self, conn_id) 179 | 180 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 181 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 182 | 183 | # verify the persisted schema was correct 184 | records_by_stream = runner.get_records_from_target_output() 185 | 186 | # assert that each of the streams that we synced are the ones that we expect to see 
187 | record_count_by_stream = runner.examine_target_output_file(self, 188 | conn_id, 189 | self.expected_sync_streams(), 190 | self.expected_pks()) 191 | 192 | # assert that we get the correct number of records for each stream 193 | self.assertEqual(self.expected_row_counts(),record_count_by_stream) 194 | 195 | # assert that an activate_version_message is first and last message sent for each stream 196 | for stream_name in self.expected_sync_streams(): 197 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 198 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 199 | 200 | state = menagerie.get_state(conn_id) 201 | 202 | first_versions = {} 203 | 204 | for tap_stream_id in self.expected_check_streams(): 205 | # assert that the state has an initial_full_table_complete == True 206 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 207 | # assert that there is a version bookmark in state 208 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 209 | self.assertIsNotNone(first_versions[tap_stream_id]) 210 | 211 | record_id = None 212 | with get_test_connection() as client: 213 | record_id = str([x for x in client['datatype_db']['datatype_coll_1'].find()][0]['_id']) 214 | 215 | 216 | expected_record = { 217 | "javaScript_field": "var x, y, z;", 218 | "timestamp_field": "2019-08-15T19:25:57.000000Z", 219 | "_id": record_id, 220 | "date_field": "2019-08-15T19:29:14.578000Z", 221 | "string_field": "a sample string", 222 | "object_field": {"obj_field_2_key": "obj_field_2_val", 223 | "obj_field_1_key": "obj_field_1_val"}, 224 | "null_field": None, 225 | "regex_field": {"flags": 0, "pattern": ".*"}, 226 | "object_id_field": "313233343536373839313233", 227 | "64_bit_integer_field": 34359738368, 228 | "32_bit_integer_field": 32, 229 | "array_field": ["array_item_1", 230 | "array_item_2", 231 | "array_item_3"], 232 | "binary_data_field": "YSBiaW5hcnkgc3RyaW5n", 233 | "javaScript_with_scope_field": {"scope": "{'x': 1}", 234 | "value": "function incrementX() { x++; }"}, 235 | "double_field": decimal.Decimal('4.3'), 236 | "boolean_field": True, 237 | "decimal_field": decimal.Decimal('1.34'), 238 | 'uuid_field': "3e139ff5-d622-45c6-bf9e-1dfec72820c4", 239 | "dbref_field": {"id": "313233343536373839313233", 240 | "database": "some_database", 241 | "collection": "some_collection"} 242 | } 243 | 244 | dict_keys = list(expected_record.keys()) 245 | dict_keys.sort() 246 | 247 | self.assertEquals({i: expected_record[i] for i in dict_keys}, 248 | {i: records_by_stream['datatype_coll_1']['messages'][1]['data'][i] for i in dict_keys}) 249 | -------------------------------------------------------------------------------- /tests/test_mongodb_discovery.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import datetime 3 | import decimal 4 | import os 5 | import pymongo 6 | import random 7 | import re 8 | import string 9 | import time 10 | import unittest 11 | 12 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 13 | from tap_tester import connections, menagerie, runner 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, 
"string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBDiscovery(unittest.TestCase): 25 | AUTOMATIC = "automatic" 26 | UNSUPPORTED = "unsupported" 27 | VALID_REPLICATION_KEYS = "valid-replication-keys" 28 | PRIMARY_KEYS = "table-key-properties" 29 | FORCED_REPLICATION_METHOD = "forced-replication-method" 30 | INCREMENTAL = "INCREMENTAL" 31 | FULL_TABLE = "FULL_TABLE" 32 | LOG_BASED = "LOG_BASED" 33 | 34 | def setUp(self): 35 | 36 | ensure_environment_variables_set() 37 | 38 | with get_test_connection() as client: 39 | # drop all dbs/collections 40 | drop_all_collections(client) 41 | 42 | # simple_coll_1 has 50 documents 43 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 44 | 45 | # simple_coll_2 has 100 documents 46 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 47 | 48 | # admin_coll_1 has 50 documents 49 | client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50)) 50 | 51 | # create view on simple_coll_1 52 | client["simple_db"].command(bson.son.SON([("create", "simple_view_1"), ("viewOn", "simple_coll_1"), ("pipeline", [])])) 53 | 54 | # collections with same names as others in different dbs 55 | client["simple_db_2"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 56 | client["simple_db_2"]["SIMPLE_COLL_1"].insert_many(generate_simple_coll_docs(50)) 57 | 58 | # collections with special characters in names 59 | client["special_db"]["hebrew_ישראל"].insert_many(generate_simple_coll_docs(50)) 60 | client['special_db']['hello!world?'].insert_many(generate_simple_coll_docs(50)) 61 | 62 | # Add datatype collections 63 | pattern = re.compile('.*') 64 | regex = bson.Regex.from_native(pattern) 65 | regex.flags ^= re.UNICODE 66 | datatype_doc = { 67 | "double_field": 4.3, 68 | "string_field": "a sample string", 69 | "object_field" : { 70 | "obj_field_1_key": "obj_field_1_val", 71 | "obj_field_2_key": "obj_field_2_val" 72 | }, 73 | "array_field" : [ 74 | "array_item_1", 75 | "array_item_2", 76 | "array_item_3" 77 | ], 78 | "binary_data_field" : b"a binary string", 79 | "object_id_field": bson.objectid.ObjectId(b'123456789123'), 80 | "boolean_field" : True, 81 | "date_field" : datetime.datetime.now(), 82 | "null_field": None, 83 | "regex_field" : regex, 84 | "32_bit_integer_field" : 32, 85 | "timestamp_field" : bson.timestamp.Timestamp(int(time.time()), 1), 86 | "64_bit_integer_field" : 34359738368, 87 | "decimal_field" : bson.Decimal128(decimal.Decimal('1.34')), 88 | "javaScript_field" : bson.code.Code("var x, y, z;"), 89 | "javaScript_with_scope_field" : bson.code.Code("function incrementX() { x++; }", scope={"x": 1}), 90 | "min_key_field" : bson.min_key.MinKey, 91 | "max_key_field" : bson.max_key.MaxKey 92 | } 93 | client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc) 94 | 95 | client["datatype_db"]["datatype_coll_2"].insert_one(datatype_doc) 96 | client["datatype_db"]["datatype_coll_2"].create_index([("date_field", pymongo.ASCENDING)]) 97 | client["datatype_db"]["datatype_coll_2"].create_index([("timestamp_field", pymongo.ASCENDING)]) 98 | client["datatype_db"]["datatype_coll_2"].create_index([("32_bit_integer_field", pymongo.ASCENDING)]) 99 | client["datatype_db"]["datatype_coll_2"].create_index([("64_bit_integer_field", pymongo.ASCENDING)]) 100 | 101 | def expected_check_streams(self): 102 | return { 103 | 'simple_db-simple_coll_1', 104 | 'simple_db-simple_coll_2', 105 | 'simple_db_2-simple_coll_1', 106 | 'simple_db_2-SIMPLE_COLL_1', 107 | 
'admin-admin_coll_1', 108 | #'simple_db-simple_view_1', 109 | 'datatype_db-datatype_coll_1', 110 | 'datatype_db-datatype_coll_2', 111 | 'special_db-hebrew_ישראל', 112 | 'special_db-hello!world?' 113 | } 114 | 115 | def expected_primary_keys(self): 116 | """Defaults to '_id' in discovery, standard ObjectId(), any value can be provided (TODO where?)""" 117 | return { 118 | stream: {'_id'} 119 | for stream in self.expected_check_streams() 120 | } 121 | def expected_replication_keys(self): 122 | return { 123 | 'simple_db-simple_coll_1': {'_id'}, 124 | 'simple_db-simple_coll_2': {'_id'}, 125 | 'simple_db_2-simple_coll_1': {'_id'}, 126 | 'simple_db_2-SIMPLE_COLL_1': {'_id'}, 127 | 'admin-admin_coll_1': {'_id'}, 128 | #'simple_db-simple_view_1': {'_id'}, 129 | 'datatype_db-datatype_coll_1': { 130 | '_id', 131 | }, 132 | 'datatype_db-datatype_coll_2': { 133 | '_id', 134 | 'date_field', 135 | 'timestamp_field', 136 | '32_bit_integer_field', 137 | '64_bit_integer_field', 138 | }, 139 | 'special_db-hebrew_ישראל': {'_id'}, 140 | 'special_db-hello!world?': {'_id'}, 141 | } 142 | 143 | def expected_row_counts(self): 144 | return { 145 | 'simple_db-simple_coll_1': 50, 146 | 'simple_db-simple_coll_2': 100, 147 | 'simple_db_2-simple_coll_1': 50, 148 | 'simple_db_2-SIMPLE_COLL_1': 50, 149 | 'admin-admin_coll_1': 50, 150 | #'simple_db-simple_view_1': 50, 151 | 'datatype_db-datatype_coll_1': 1, 152 | 'datatype_db-datatype_coll_2': 1, 153 | 'special_db-hebrew_ישראל': 50, 154 | 'special_db-hello!world?': 50 155 | } 156 | 157 | def expected_table_names(self): 158 | return { 159 | 'simple_coll_1', 160 | 'simple_coll_2', 161 | 'SIMPLE_COLL_1', 162 | 'admin_coll_1', 163 | 'datatype_coll_1', 164 | 'datatype_coll_2', 165 | 'hebrew_ישראל', 166 | 'hello!world?' 167 | } 168 | 169 | def name(self): 170 | return "mongodb_discovery" 171 | 172 | def tap_name(self): 173 | return "tap-mongodb" 174 | 175 | def get_type(self): 176 | return "platform.mongodb" 177 | 178 | def get_credentials(self): 179 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 180 | 181 | def get_properties(self): 182 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 183 | 'port' : os.getenv('TAP_MONGODB_PORT'), 184 | 'user' : os.getenv('TAP_MONGODB_USER'), 185 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 186 | } 187 | 188 | def test_run(self): 189 | conn_id = connections.ensure_connection(self) 190 | 191 | # run in check mode 192 | check_job_name = runner.run_check_mode(self, conn_id) 193 | 194 | # check exit codes 195 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 196 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 197 | 198 | # Verify a catalog was produced by discovery 199 | catalog = menagerie.get_catalog(conn_id) 200 | self.assertGreater(len(catalog), 0) 201 | 202 | # Verify stream_name entries match the expected table names 203 | stream_catalogs = catalog['streams'] 204 | stream_names = {catalog['stream_name'] for catalog in stream_catalogs} 205 | self.assertSetEqual(self.expected_table_names(), stream_names) 206 | 207 | # Verify tap_stream_id entries follow naming convention - 208 | stream_ids = {catalog['tap_stream_id'] for catalog in stream_catalogs} 209 | self.assertSetEqual(self.expected_check_streams(), stream_ids) 210 | 211 | # Stream level assertions 212 | for stream in self.expected_check_streams(): 213 | with self.subTest(stream=stream): 214 | 215 | # gathering expectations 216 | expected_primary_keys = self.expected_primary_keys()[stream] 217 | expected_replication_keys = 
self.expected_replication_keys()[stream] 218 | expected_row_count = self.expected_row_counts()[stream] 219 | 220 | # collecting actual values... 221 | stream_catalog = [catalog for catalog in stream_catalogs 222 | if catalog["tap_stream_id"] == stream][0] 223 | schema_and_metadata = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 224 | stream_metadata = schema_and_metadata["metadata"] 225 | empty_breadcrumb_metadata = [item for item in stream_metadata if item.get("breadcrumb") == []] 226 | stream_properties = empty_breadcrumb_metadata[0]['metadata'] 227 | actual_primary_keys = set(stream_properties.get(self.PRIMARY_KEYS, [])) 228 | actual_replication_keys = set(stream_properties.get(self.VALID_REPLICATION_KEYS, [])) 229 | actual_replication_method = stream_properties.get(self.FORCED_REPLICATION_METHOD) 230 | actual_stream_inclusion = stream_properties.get('inclusion') 231 | actual_field_inclusions = set( 232 | item.get("metadata").get("inclusion") 233 | for item in stream_metadata 234 | if item.get("breadcrumb", []) != [] 235 | ) 236 | actual_fields_to_datatypes = { 237 | item['breadcrumb'][1]: item['metadata'].get('sql-datatype') 238 | for item in stream_metadata if item.get('breadcrumb') != [] 239 | } 240 | 241 | # Verify there is only 1 top level breadcrumb in metadata 242 | self.assertEqual(1, len(empty_breadcrumb_metadata)) 243 | 244 | # Verify replication key(s) match expectations 245 | self.assertSetEqual(expected_replication_keys, actual_replication_keys) 246 | 247 | # Verify primary key(s) match expectations 248 | self.assertSetEqual(expected_primary_keys, actual_primary_keys) 249 | 250 | # Verify no field-level inclusion exists 251 | self.assertSetEqual(set(), actual_field_inclusions) 252 | 253 | # Verify row-count metadata matches expectations 254 | self.assertEqual(expected_row_count, stream_properties['row-count']) 255 | 256 | # Verify selected metadata is None for all streams 257 | self.assertIsNone(stream_properties.get('selected')) 258 | 259 | # Verify is-view metadata is False 260 | self.assertFalse(stream_properties['is-view']) 261 | 262 | # Verify no forced-replication-method is present in metadata 263 | self.assertNotIn(self.FORCED_REPLICATION_METHOD, stream_properties.keys()) 264 | 265 | # Verify database-name is consistent with the tap_stream_id 266 | tap_stream_id_db_prefix = stream_catalog['tap_stream_id'].split('-')[0] 267 | self.assertEqual(tap_stream_id_db_prefix, stream_properties['database-name']) 268 | 269 | # Verify schema types match expectations 270 | self.assertDictEqual({'type': 'object'}, stream_catalog['schema']) 271 | -------------------------------------------------------------------------------- /tests/test_mongodb_full_table.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import os 3 | import pdb 4 | import random 5 | import string 6 | import unittest 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBFullTable(unittest.TestCase): 25 | def 
setUp(self): 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | # drop all dbs/collections 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # create view on simple_coll_1 36 | client["simple_db"].command(bson.son.SON([("create", "simple_view_1"), ("viewOn", "simple_coll_1"), ("pipeline", [])])) 37 | 38 | # simple_coll_2 has 100 documents 39 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 40 | 41 | # admin_coll_1 has 50 documents 42 | client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50)) 43 | 44 | # simple_coll_3 is an empty collection 45 | client["simple_db"].create_collection("simple_coll_3") 46 | 47 | # simple_coll_4 has documents with special chars and a lot of nesting 48 | client["simple_db"]["simple_coll_4"].insert_one({"hebrew_ישרא": "hebrew_ישרא"}) 49 | client["simple_db"]["simple_coll_4"].insert_one({"hebrew_ישרא": 2}) 50 | client["simple_db"]["simple_coll_4"].insert_one({"another_hebrew_ישראל": "another_hebrew_ישרא"}) 51 | nested_doc = {"field0": {}} 52 | current_doc = nested_doc 53 | for i in range(1, 101): 54 | current_doc["field{}".format(i-1)]["field{}".format(i)] = {} 55 | current_doc = current_doc["field{}".format(i-1)] 56 | current_doc["field100"] = "some_value" 57 | client["simple_db"]["simple_coll_4"].insert_one(nested_doc) 58 | 59 | max_col_doc = {} 60 | for x in range(1600): 61 | max_col_doc['col_{}'.format(x)] = x 62 | client["simple_db"]["simple_coll_4"].insert_one(max_col_doc) 63 | 64 | 65 | 66 | 67 | def tap_stream_id_to_stream(self): 68 | return { 69 | 'simple_db-simple_coll_1': 'simple_db_simple_coll_1', 70 | 'simple_db-simple_coll_2': 'simple_db_simple_coll_2', 71 | 'simple_db-simple_coll_3': 'simple_db_simple_coll_3', 72 | 'simple_db-simple_coll_4': 'simple_db_simple_coll_4', 73 | 'admin-admin_coll_1': 'admin_admin_coll_1' 74 | } 75 | 76 | def expected_check_streams(self): 77 | return { 78 | 'simple_db-simple_coll_1', 79 | 'simple_db-simple_coll_2', 80 | 'simple_db-simple_coll_3', 81 | 'simple_db-simple_coll_4', 82 | 'admin-admin_coll_1' 83 | } 84 | 85 | def expected_pks(self): 86 | return { 87 | 'simple_db_simple_coll_1': {'_id'}, 88 | 'simple_db_simple_coll_2': {'_id'}, 89 | 'simple_db_simple_coll_3': {'_id'}, 90 | 'simple_db_simple_coll_4': {'_id'}, 91 | 'admin_admin_coll_1': {'_id'} 92 | } 93 | 94 | def expected_row_counts(self): 95 | return { 96 | 'simple_db_simple_coll_1': 50, 97 | 'simple_db_simple_coll_2': 100, 98 | 'simple_db_simple_coll_3': 0, 99 | 'simple_db_simple_coll_4': 5, 100 | 'admin_admin_coll_1': 50 101 | } 102 | 103 | def expected_sync_streams(self): 104 | return { 105 | 'simple_db_simple_coll_1', 106 | 'simple_db_simple_coll_2', 107 | 'simple_db_simple_coll_3', 108 | 'simple_db_simple_coll_4', 109 | 'admin_admin_coll_1' 110 | } 111 | 112 | def name(self): 113 | return "tap_tester_mongodb_full_table" 114 | 115 | def tap_name(self): 116 | return "tap-mongodb" 117 | 118 | def get_type(self): 119 | return "platform.mongodb" 120 | 121 | def get_credentials(self): 122 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 123 | 124 | def get_properties(self): 125 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 126 | 'port' : os.getenv('TAP_MONGODB_PORT'), 127 | 'user' : os.getenv('TAP_MONGODB_USER'), 128 | 'database' : os.getenv('TAP_MONGODB_DBNAME'), 129 | 'include_schemas_in_destination_stream_name': 'true' 130 | } 131 | 132 
| def test_run(self): 133 | 134 | conn_id = connections.ensure_connection(self) 135 | 136 | # ------------------------------- 137 | # ----------- Discovery ---------- 138 | # ------------------------------- 139 | 140 | # run in discovery mode 141 | check_job_name = runner.run_check_mode(self, conn_id) 142 | 143 | # verify check exit codes 144 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 145 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 146 | 147 | # verify the tap discovered the right streams 148 | found_catalogs = menagerie.get_catalogs(conn_id) 149 | 150 | # assert we find the correct streams 151 | self.assertEqual(self.expected_check_streams(), 152 | {c['tap_stream_id'] for c in found_catalogs}) 153 | 154 | # ------------------------------------------- 155 | # ----------- First full Table Sync --------- 156 | # ------------------------------------------- 157 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 158 | for stream_catalog in found_catalogs: 159 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 160 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 161 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 162 | stream_catalog, 163 | annotated_schema, 164 | additional_md) 165 | 166 | # run full table sync 167 | sync_job_name = runner.run_sync_mode(self, conn_id) 168 | 169 | # check exit status 170 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 171 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 172 | 173 | # streams that we synced are the ones that we expect to see 174 | records_by_stream = runner.get_records_from_target_output() 175 | record_count_by_stream = runner.examine_target_output_file(self, 176 | conn_id, 177 | self.expected_sync_streams(), 178 | self.expected_pks()) 179 | 180 | # assert that we get the correct number of records for each stream 181 | self.assertEqual(self.expected_row_counts(),record_count_by_stream) 182 | 183 | # assert that an activate_version_message is first and last message sent for each stream 184 | for stream_name in self.expected_sync_streams(): 185 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 186 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 187 | 188 | state = menagerie.get_state(conn_id) 189 | 190 | first_versions = {} 191 | 192 | for tap_stream_id in self.expected_check_streams(): 193 | 194 | # state has an initial_full_table_complete == True 195 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 196 | 197 | # there is a version bookmark in state 198 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 199 | self.assertIsNotNone(first_versions[tap_stream_id]) 200 | 201 | # ------------------------------------------- 202 | # ----------- Second full Table Sync --------- 203 | # ------------------------------------------- 204 | with get_test_connection() as client: 205 | # update existing documents in the collection to make sure we get the updates as well in the next sync 206 | doc_to_update = client["simple_db"]["simple_coll_1"].find_one() 207 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 999}}) 208 | 209 | doc_to_update = client["simple_db"]["simple_coll_2"].find_one() 210 | 
client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 888}}) 211 | 212 | doc_to_update = client["admin"]["admin_coll_1"].find_one() 213 | client["admin"]["admin_coll_1"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 777}}) 214 | 215 | # add 2 rows and run full table again, make sure we get initial number + 2 216 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(2)) 217 | 218 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(2)) 219 | 220 | client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(2)) 221 | 222 | sync_job_name = runner.run_sync_mode(self, conn_id) 223 | 224 | # check exit status 225 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 226 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 227 | 228 | # verify the persisted schema was correct 229 | records_by_stream = runner.get_records_from_target_output() 230 | 231 | # assert that each of the streams that we synced are the ones that we expect to see 232 | record_count_by_stream = runner.examine_target_output_file(self, 233 | conn_id, 234 | self.expected_sync_streams(), 235 | self.expected_pks()) 236 | 237 | state = menagerie.get_state(conn_id) 238 | 239 | # Verify that menagerie state does not include a key for currently syncing 240 | self.assertIsNone(state['currently_syncing']) 241 | 242 | # Verify that menagerie state does not include a key for oplog based syncing 243 | self.assertNotIn('oplog', state) 244 | 245 | # assert that we have correct number of records (including the two new records and the update which is to be resynced) 246 | new_expected_row_counts = {k: v+2 for k, v in self.expected_row_counts().items() if k not in ['simple_db_simple_coll_3', 247 | 'simple_db_simple_coll_4']} 248 | new_expected_row_counts['simple_db_simple_coll_3']=0 249 | new_expected_row_counts['simple_db_simple_coll_4']=5 250 | self.assertEqual(new_expected_row_counts, record_count_by_stream) 251 | 252 | # assert that we only have an ActivateVersionMessage as the last message and not the first 253 | for stream_name in self.expected_sync_streams(): 254 | if len(records_by_stream[stream_name]['messages']) > 1: 255 | self.assertNotEqual('activate_version', records_by_stream[stream_name]['messages'][0]['action'], stream_name + "failed") 256 | self.assertEqual('upsert', records_by_stream[stream_name]['messages'][0]['action'], stream_name + "failed") 257 | self.assertEqual('activate_version', records_by_stream[stream_name]['messages'][-1]['action'], stream_name + "failed") 258 | 259 | second_versions = {} 260 | for tap_stream_id in self.expected_check_streams(): 261 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 262 | 263 | # state has an initial_full_table_complete == True 264 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 265 | 266 | # version bookmark 267 | second_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 268 | self.assertIsNotNone(second_versions[tap_stream_id]) 269 | 270 | # version in this state is different than that of the previous state 271 | self.assertNotEqual(first_versions[tap_stream_id], second_versions[tap_stream_id]) 272 | 273 | # version which is larger than the previous target version 274 | self.assertGreater(second_versions[tap_stream_id], first_versions[tap_stream_id]) 275 | 276 | # verify that menagerie state does include the version which 
matches the target version 277 | self.assertEqual(records_by_stream[self.tap_stream_id_to_stream()[tap_stream_id]]['table_version'], second_versions[tap_stream_id]) 278 | -------------------------------------------------------------------------------- /tests/test_mongodb_full_table_id.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import bson 3 | import os 4 | import random 5 | import string 6 | import time 7 | import unittest 8 | 9 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 10 | from tap_tester import connections, menagerie, runner 11 | 12 | 13 | RECORD_COUNT = {} 14 | 15 | 16 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 17 | return ''.join(random.choice(chars) for x in range(size)) 18 | 19 | def generate_simple_coll_docs(num_docs): 20 | docs = [] 21 | for int_value in range(num_docs): 22 | docs.append({"_id": int_value, "int_field": int_value, "string_field": random_string_generator()}) 23 | return docs 24 | 25 | def generate_simple_binary_coll_docs(num_docs): 26 | docs = [] 27 | for int_value in range(num_docs): 28 | docs.append({"_id": bson.Binary("test {}".format(int_value).encode()), "int_field": int_value, "string_field": random_string_generator()}) 29 | return docs 30 | 31 | 32 | class MongoDBFullTableID(unittest.TestCase): 33 | def setUp(self): 34 | ensure_environment_variables_set() 35 | 36 | with get_test_connection() as client: 37 | # drop all dbs/collections 38 | drop_all_collections(client) 39 | 40 | # simple_coll_1 has 50 documents, id is an integer instead of ObjectId 41 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 42 | 43 | # simple_coll_2 has 100 documents, id is an integer instead of ObjectId 44 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_binary_coll_docs(50)) 45 | 46 | def expected_check_streams(self): 47 | return { 48 | 'simple_db-simple_coll_1', 49 | 'simple_db-simple_coll_2' 50 | } 51 | 52 | def expected_pks(self): 53 | return { 54 | 'simple_coll_1': {'_id'}, 55 | 'simple_coll_2': {'_id'} 56 | } 57 | 58 | def expected_row_counts(self): 59 | return { 60 | 'simple_coll_1': 50, 61 | 'simple_coll_2': 50 62 | } 63 | 64 | def expected_sync_streams(self): 65 | return { 66 | 'simple_coll_1', 67 | 'simple_coll_2' 68 | } 69 | 70 | def name(self): 71 | return "tap_tester_mongodb_full_table_id" 72 | 73 | def tap_name(self): 74 | return "tap-mongodb" 75 | 76 | def get_type(self): 77 | return "platform.mongodb" 78 | 79 | def get_credentials(self): 80 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 81 | 82 | def get_properties(self): 83 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 84 | 'port' : os.getenv('TAP_MONGODB_PORT'), 85 | 'user' : os.getenv('TAP_MONGODB_USER'), 86 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 87 | } 88 | 89 | def test_run(self): 90 | 91 | conn_id = connections.ensure_connection(self) 92 | 93 | # ------------------------------- 94 | # ----------- Discovery ---------- 95 | # ------------------------------- 96 | 97 | # run in discovery mode 98 | check_job_name = runner.run_check_mode(self, conn_id) 99 | 100 | # verify check exit codes 101 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 102 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 103 | 104 | # verify the tap discovered the right streams 105 | found_catalogs = menagerie.get_catalogs(conn_id) 106 | 107 | # assert we find the 
correct streams 108 | self.assertEqual(self.expected_check_streams(), 109 | {c['tap_stream_id'] for c in found_catalogs}) 110 | 111 | for tap_stream_id in self.expected_check_streams(): 112 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 113 | 114 | # assert that the pks are correct 115 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 116 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 117 | 118 | # assert that the row counts are correct 119 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 120 | found_stream.get('metadata', {}).get('row-count')) 121 | 122 | # ----------------------------------- 123 | # ----------- Full Table Sync --------- 124 | # ----------------------------------- 125 | # select simple_coll_1 stream and add replication method metadata 126 | for stream_catalog in found_catalogs: 127 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 128 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 129 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 130 | stream_catalog, 131 | annotated_schema, 132 | additional_md) 133 | # synthesize interrupted state 134 | interrupted_state = { 135 | 'currently_syncing' : 'simple_db-simple_coll_1', 136 | 'bookmarks' : {'simple_db-simple_coll_1': { 'max_id_value': 49, 137 | 'max_id_type': 'int', 138 | 'initial_full_table_complete': False, 139 | 'last_id_fetched': 25, 140 | 'last_id_fetched_type': 'int', 141 | 'version': int(time.time() * 1000)}, 142 | 'simple_db-simple_coll_2': { 'max_id_value': base64.b64encode("test {}".format(49).encode()), 143 | 'max_id_type': 'bytes', 144 | 'initial_full_table_complete': False, 145 | 'last_id_fetched': base64.b64encode("test {}".format(25).encode()), 146 | 'last_id_fetched_type': 'bytes', 147 | 'version': int(time.time() * 1000)}}} 148 | 149 | # update existing documents in collection with int_field value less than 25, and verify they do not come up in the sync 150 | # update existing documents in collection with int_field value greater than 25, and verify they come up in the sync 151 | with get_test_connection() as client: 152 | # find_one() is going to retreive the first document in the collection 153 | doc_to_update_1 = client["simple_db"]["simple_coll_1"].find_one() 154 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}}) 155 | 156 | doc_to_update_2 = client["simple_db"]["simple_coll_2"].find_one() 157 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}}) 158 | 159 | doc_to_update_3 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 30}) 160 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}}) 161 | 162 | doc_to_update_4 = client["simple_db"]["simple_coll_2"].find_one({"int_field": 40}) 163 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}}) 164 | 165 | menagerie.set_state(conn_id, interrupted_state) 166 | runner.run_sync_mode(self, conn_id) 167 | 168 | # streams that we synced are the ones that we expect to see 169 | records_by_stream = runner.get_records_from_target_output() 170 | record_count_by_stream = runner.examine_target_output_file(self, 171 | conn_id, 172 | self.expected_sync_streams(), 173 | 
self.expected_pks()) 174 | 175 | # ActivateVersionMessage as the last message and not the first 176 | for stream_name in self.expected_sync_streams(): 177 | self.assertNotEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 178 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 179 | 180 | # _id of the first record sync'd for each stream is the bookmarked 181 | # last_id_fetched from the interrupted_state passed to the tap 182 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][0]['data']['_id'], 183 | int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['last_id_fetched'])) 184 | 185 | # _id of the last record sync'd for each stream is the bookmarked 186 | # max_id_value from the interrupted_state passed to the tap 187 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][-2]['data']['_id'], 188 | int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['max_id_value'])) 189 | 190 | # verify we are not seeing any documents which were updated having id < 25 191 | self.assertNotEqual(999, records_by_stream['simple_coll_1']['messages'][0]['data']['int_field']) 192 | self.assertNotEqual(888, records_by_stream['simple_coll_2']['messages'][0]['data']['int_field']) 193 | 194 | int_value = False 195 | for x in records_by_stream['simple_coll_1']['messages'][:-1]: 196 | # We are not considering the last element of this list because it does not have 'data' 197 | if int(x['data']['int_field']) == 999: 198 | int_value = True 199 | self.assertEqual(False, int_value) 200 | 201 | int_value2 = False 202 | for x in records_by_stream['simple_coll_2']['messages'][:-1]: 203 | if x['data']['int_field'] == 888: 204 | int_value2 = True 205 | self.assertEqual(False, int_value2) 206 | 207 | # verify we are seeing the documents which were updated having id > 25 208 | # we are picking the 5th and 15th element in the list because we updated the 30th and 40th document, (doc starting with 25) 209 | self.assertEqual(777, records_by_stream['simple_coll_1']['messages'][5]['data']['int_field']) 210 | self.assertEqual(666, records_by_stream['simple_coll_2']['messages'][15]['data']['int_field']) 211 | 212 | # assert that final state has no last_id_fetched and max_id_value bookmarks 213 | final_state = menagerie.get_state(conn_id) 214 | for tap_stream_id in self.expected_check_streams(): 215 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('last_id_fetched')) 216 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('max_id_value')) 217 | -------------------------------------------------------------------------------- /tests/test_mongodb_full_table_interruptible.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | import random 4 | import string 5 | import time 6 | import unittest 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBFullTableInterruptible(unittest.TestCase): 25 | def
setUp(self): 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | # drop all dbs/collections 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # simple_coll_2 has 100 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 37 | 38 | def expected_check_streams(self): 39 | return { 40 | 'simple_db-simple_coll_1', 41 | 'simple_db-simple_coll_2', 42 | } 43 | 44 | def expected_pks(self): 45 | return { 46 | 'simple_coll_1': {'_id'}, 47 | 'simple_coll_2': {'_id'}, 48 | } 49 | 50 | def expected_row_counts(self): 51 | return { 52 | 'simple_coll_1': 25, 53 | 'simple_coll_2': 50, 54 | } 55 | 56 | def expected_sync_streams(self): 57 | return { 58 | 'simple_coll_1', 59 | 'simple_coll_2' 60 | } 61 | 62 | def name(self): 63 | return "tap_tester_mongodb_full_table_interruptible" 64 | 65 | def tap_name(self): 66 | return "tap-mongodb" 67 | 68 | def get_type(self): 69 | return "platform.mongodb" 70 | 71 | def get_credentials(self): 72 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 73 | 74 | def get_properties(self): 75 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 76 | 'port' : os.getenv('TAP_MONGODB_PORT'), 77 | 'user' : os.getenv('TAP_MONGODB_USER'), 78 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 79 | } 80 | 81 | def test_run(self): 82 | 83 | conn_id = connections.ensure_connection(self) 84 | 85 | # ------------------------------- 86 | # ----------- Discovery ---------- 87 | # ------------------------------- 88 | 89 | # run in discovery mode 90 | check_job_name = runner.run_check_mode(self, conn_id) 91 | 92 | # verify check exit codes 93 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 94 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 95 | 96 | # verify the tap discovered the right streams 97 | found_catalogs = menagerie.get_catalogs(conn_id) 98 | 99 | # assert we find the correct streams 100 | self.assertEqual(self.expected_check_streams(), 101 | {c['tap_stream_id'] for c in found_catalogs}) 102 | 103 | # ----------------------------------- 104 | # ----------- Full Table Sync --------- 105 | # ----------------------------------- 106 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 107 | for stream_catalog in found_catalogs: 108 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 109 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 110 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 111 | stream_catalog, 112 | annotated_schema, 113 | additional_md) 114 | # Synthesize interrupted state 115 | interrupted_state = { 116 | 'currently_syncing' : 'simple_db-simple_coll_1', 117 | 'bookmarks' : {} 118 | } 119 | 120 | versions = {} 121 | with get_test_connection() as client: 122 | for stream_name in self.expected_sync_streams(): 123 | rows = [x for x in client['simple_db'][stream_name].find(sort=[("_id", pymongo.ASCENDING)])] 124 | # set last_id_fetched to middle point of table 125 | last_id_fetched = str(rows[int(len(rows)/2)]['_id']) 126 | max_id_value = str(rows[-1]['_id']) 127 | 128 | tap_stream_id = 'simple_db-'+stream_name 129 | version = int(time.time() * 1000) 130 | interrupted_state['bookmarks'][tap_stream_id] = { 131 | 'max_id_value': max_id_value, 132 | 'max_id_type': 'ObjectId', 133 | 
'initial_full_table_complete': False, 134 | 'last_id_fetched': last_id_fetched, 135 | 'last_id_fetched_type': 'ObjectId', 136 | 'version': version 137 | } 138 | versions[tap_stream_id] = version 139 | 140 | # update existing documents that fall before the interrupted midpoint of each collection, and verify they do not come up in the sync 141 | # update existing documents that fall after the interrupted midpoint of each collection, and verify they do come up in the sync 142 | 143 | # find_one() is going to retrieve the first document in the collection 144 | doc_to_update_1 = client["simple_db"]["simple_coll_1"].find_one() 145 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}}) 146 | 147 | doc_to_update_2 = client["simple_db"]["simple_coll_2"].find_one() 148 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}}) 149 | 150 | doc_to_update_3 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 30}) 151 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}}) 152 | 153 | doc_to_update_4 = client["simple_db"]["simple_coll_2"].find_one({"int_field": 80}) 154 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}}) 155 | 156 | 157 | menagerie.set_state(conn_id, interrupted_state) 158 | 159 | runner.run_sync_mode(self, conn_id) 160 | 161 | # streams that we synced are the ones that we expect to see 162 | record_count_by_stream = runner.examine_target_output_file(self, 163 | conn_id, 164 | self.expected_sync_streams(), 165 | self.expected_pks()) 166 | 167 | # record counts 168 | records_by_stream = runner.get_records_from_target_output() 169 | self.assertEqual(self.expected_row_counts(), record_count_by_stream) 170 | 171 | # ActivateVersionMessage as the last message and not the first 172 | for stream_name in self.expected_sync_streams(): 173 | self.assertNotEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 174 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 175 | 176 | # _id of the first record sync'd for each stream is the bookmarked 177 | # last_id_fetched from the interrupted_state passed to the tap 178 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][0]['data']['_id'], 179 | interrupted_state['bookmarks']['simple_db-simple_coll_1']['last_id_fetched']) 180 | self.assertEqual(records_by_stream['simple_coll_2']['messages'][0]['data']['_id'], 181 | interrupted_state['bookmarks']['simple_db-simple_coll_2']['last_id_fetched']) 182 | 183 | # _id of the last record sync'd for each stream is the bookmarked 184 | # max_id_value from the interrupted_state passed to the tap 185 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][-2]['data']['_id'], 186 | interrupted_state['bookmarks']['simple_db-simple_coll_1']['max_id_value']) 187 | self.assertEqual(records_by_stream['simple_coll_2']['messages'][-2]['data']['_id'], 188 | interrupted_state['bookmarks']['simple_db-simple_coll_2']['max_id_value']) 189 | 190 | # verify we are not seeing any documents which were updated having id < interrupted id value 191 | # checking just the first document value 192 | self.assertNotEqual(999, records_by_stream['simple_coll_1']['messages'][0]['data']['int_field']) 193 | self.assertNotEqual(888, records_by_stream['simple_coll_2']['messages'][0]['data']['int_field']) 194
| # verify the update to the pre-midpoint document is not present in any synced record of simple_coll_1 195 | int_value = False 196 | for x in records_by_stream['simple_coll_1']['messages'][:-1]: 197 | # We are not considering the last element of this list because it does not have 'data' 198 | if int(x['data']['int_field']) == 999: 199 | int_value = True 200 | self.assertEqual(False, int_value) 201 | # verify the update to the pre-midpoint document is not present in any synced record of simple_coll_2 202 | int_value2 = False 203 | for x in records_by_stream['simple_coll_2']['messages'][:-1]: 204 | if x['data']['int_field'] == 888: 205 | int_value2 = True 206 | self.assertEqual(False, int_value2) 207 | 208 | # verify we are seeing the documents which were updated having id > interrupted id value 209 | # we pick the 5th element of simple_coll_1 (resumes at doc 25, doc 30 was updated) and the 30th element of simple_coll_2 (resumes at doc 50, doc 80 was updated) 210 | self.assertEqual(777, records_by_stream['simple_coll_1']['messages'][5]['data']['int_field']) 211 | self.assertEqual(666, records_by_stream['simple_coll_2']['messages'][30]['data']['int_field']) 212 | 213 | # assert that final state has no last_id_fetched and max_id_value bookmarks 214 | final_state = menagerie.get_state(conn_id) 215 | for tap_stream_id in self.expected_check_streams(): 216 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('last_id_fetched')) 217 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('max_id_value')) 218 | 219 | state = menagerie.get_state(conn_id) 220 | for tap_stream_id, stream_bookmarks in state.get('bookmarks', {}).items(): 221 | self.assertTrue(stream_bookmarks.get('initial_full_table_complete', False)) 222 | -------------------------------------------------------------------------------- /tests/test_mongodb_id_pk_variations.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import datetime 3 | import decimal 4 | import os 5 | import random 6 | import string 7 | import unittest 8 | from bson.decimal128 import Decimal128 9 | 10 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 11 | from tap_tester import connections, menagerie, runner 12 | 13 | 14 | RECORD_COUNT = {} 15 | 16 | replication_method = ["INCREMENTAL", "FULL_TABLE", "LOG_BASED"] 17 | 18 | 19 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 20 | return ''.join(random.choice(chars) for x in range(size)) 21 | 22 | 23 | def generate_docs_no_id(num_docs): 24 | docs = [] 25 | for int_value in range(num_docs): 26 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 27 | return docs 28 | 29 | 30 | def generate_docs_int_id(num_docs): 31 | docs = [] 32 | for int_value in range(num_docs): 33 | docs.append({"_id": int_value, "string_field": random_string_generator()}) 34 | return docs 35 | 36 | 37 | def generate_docs_double_id(): 38 | docs = [] 39 | docs.append({"_id": 546.43, "string_field": random_string_generator()}) 40 | docs.append({"_id": 555.56, "string_field": random_string_generator()}) 41 | return docs 42 | 43 | 44 | def generate_docs_string_id(): 45 | docs = [] 46 | docs.append({"_id": 'primary_key', "string_field": random_string_generator()}) 47 | docs.append({"_id": 'secondary_key', "string_field": random_string_generator()}) 48 | return docs 49 | 50 | 51 | def generate_docs_binary_id(): 52 | docs = [] 53 | docs.append({"_id": 0b10101011, "string_field": random_string_generator()}) 54 | docs.append({"_id": 0b10101000,
"string_field": random_string_generator()}) 55 | return docs 56 | 57 | 58 | def generate_docs_boolean_id(): 59 | docs = [] 60 | docs.append({"_id": True, "string_field": random_string_generator()}) 61 | docs.append({"_id": False, "string_field": random_string_generator()}) 62 | return docs 63 | 64 | 65 | def generate_docs_date_id(): 66 | docs = [] 67 | d1 = datetime.datetime.utcnow() - datetime.timedelta(days=1) 68 | d2 = datetime.datetime.utcnow() 69 | docs.append({"_id": d1, "string_field": random_string_generator()}) 70 | docs.append({"_id": d2, "string_field": random_string_generator()}) 71 | return docs 72 | 73 | 74 | def generate_docs_32_bit_int_id(): 75 | docs = [] 76 | docs.append({'_id': 2147483640, 'string_field': random_string_generator()}) 77 | docs.append({'_id': 2147483620, 'string_field': random_string_generator()}) 78 | return docs 79 | 80 | 81 | def generate_docs_64_bit_int_id(): 82 | docs = [] 83 | docs.append({'_id': 9223372036854775800, 'string_field': random_string_generator()}) 84 | docs.append({'_id': 9223372036854775799, 'string_field': random_string_generator()}) 85 | return docs 86 | 87 | 88 | def generate_docs_128_decimal_id(): 89 | docs = [] 90 | docs.append({'_id': bson.Decimal128(decimal.Decimal('1.34')), 'string_field': random_string_generator()}) 91 | docs.append({'_id': bson.Decimal128(decimal.Decimal('2.34')), 'string_field': random_string_generator()}) 92 | return docs 93 | 94 | 95 | class MongoDbPrimaryKeyIdVariation(unittest.TestCase): 96 | 97 | def setUp(self): 98 | ensure_environment_variables_set() 99 | 100 | with get_test_connection() as client: 101 | # drop all dbs/collections 102 | drop_all_collections(client) 103 | 104 | # create collections for all the different variants for _id 105 | client["simple_db"]["coll_with_no_id"].insert_many(generate_docs_no_id(5)) 106 | client["simple_db"]["coll_with_int_id"].insert_many(generate_docs_int_id(5)) 107 | client["simple_db"]["coll_with_double_id"].insert_many(generate_docs_double_id()) 108 | client["simple_db"]["coll_with_string_id"].insert_many(generate_docs_string_id()) 109 | client["simple_db"]["coll_with_binary_id"].insert_many(generate_docs_binary_id()) 110 | client["simple_db"]["coll_with_date_id"].insert_many(generate_docs_date_id()) 111 | client["simple_db"]["coll_with_32_bit_int_id"].insert_many(generate_docs_32_bit_int_id()) 112 | client["simple_db"]["coll_with_64_bit_int_id"].insert_many(generate_docs_64_bit_int_id()) 113 | 114 | def expected_check_streams(self): 115 | return { 116 | 'simple_db-coll_with_no_id', 117 | 'simple_db-coll_with_int_id', 118 | 'simple_db-coll_with_double_id', 119 | 'simple_db-coll_with_string_id', 120 | 'simple_db-coll_with_binary_id', 121 | 'simple_db-coll_with_date_id', 122 | 'simple_db-coll_with_32_bit_int_id', 123 | 'simple_db-coll_with_64_bit_int_id' 124 | } 125 | 126 | def expected_pks(self): 127 | return { 128 | 'coll_with_no_id': {'_id'}, 129 | 'coll_with_int_id': {'_id'}, 130 | 'coll_with_double_id': {'_id'}, 131 | 'coll_with_string_id': {'_id'}, 132 | 'coll_with_binary_id': {'_id'}, 133 | 'coll_with_date_id': {'_id'}, 134 | 'coll_with_32_bit_int_id': {'_id'}, 135 | 'coll_with_64_bit_int_id': {'_id'} 136 | } 137 | 138 | def expected_sync_streams(self): 139 | return { 140 | 'coll_with_no_id', 141 | 'coll_with_int_id', 142 | 'coll_with_double_id', 143 | 'coll_with_string_id', 144 | 'coll_with_binary_id', 145 | 'coll_with_date_id', 146 | 'coll_with_32_bit_int_id', 147 | 'coll_with_64_bit_int_id' 148 | } 149 | 150 | def expected_record_count(self): 151 | return 
{'coll_with_double_id': 2, 152 | 'coll_with_32_bit_int_id': 2, 153 | 'coll_with_64_bit_int_id': 2, 154 | 'coll_with_no_id': 5, 155 | 'coll_with_binary_id': 2, 156 | 'coll_with_string_id': 2, 157 | 'coll_with_date_id': 2, 158 | 'coll_with_int_id': 5 159 | } 160 | 161 | def expected_pk_values(self): 162 | return { 163 | 'coll_with_string_id': ['primary_key', 'secondary_key'], 164 | 'coll_with_binary_id': [171, 168], 165 | 'coll_with_no_id': [], 166 | 'coll_with_64_bit_int_id': [9223372036854775800, 9223372036854775799], 167 | 'coll_with_int_id': [0, 1, 2, 3, 4], 168 | 'coll_with_32_bit_int_id': [2147483640, 2147483620], 169 | 'coll_with_date_id': [datetime.datetime.utcnow() - datetime.timedelta(days=1), datetime.datetime.utcnow()], 170 | 'coll_with_double_id': [decimal.Decimal('546.43'), decimal.Decimal('555.56')] 171 | } 172 | 173 | def expected_pk_datatype(self): 174 | return { 175 | 'coll_with_string_id': str, 176 | 'coll_with_binary_id': int, 177 | 'coll_with_no_id': [], 178 | 'coll_with_64_bit_int_id': int, 179 | 'coll_with_int_id': int, 180 | 'coll_with_32_bit_int_id': int, 181 | 'coll_with_date_id': [datetime.datetime.utcnow() - datetime.timedelta(days=1), datetime.datetime.utcnow()], 182 | 'coll_with_double_id': decimal.Decimal 183 | } 184 | 185 | def name(self): 186 | return "tap_tester_mongodb_id_pk_variations" 187 | 188 | def tap_name(self): 189 | return "tap-mongodb" 190 | 191 | def get_type(self): 192 | return "platform.mongodb" 193 | 194 | def get_credentials(self): 195 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 196 | 197 | def get_properties(self): 198 | return {'host': os.getenv('TAP_MONGODB_HOST'), 199 | 'port': os.getenv('TAP_MONGODB_PORT'), 200 | 'user': os.getenv('TAP_MONGODB_USER'), 201 | 'database': os.getenv('TAP_MONGODB_DBNAME') 202 | } 203 | 204 | def test_run(self): 205 | ''' 206 | Running the test with all the available replication methods 207 | ''' 208 | 209 | for replication in replication_method: 210 | if replication != 'INCREMENTAL': 211 | additional_metadata = [{"breadcrumb": [], "metadata": {'replication-method': replication}}] 212 | else: 213 | additional_metadata = [{"breadcrumb": [], "metadata": {'replication-method': replication, 'replication-key': '_id'}}] 214 | self.run_test(additional_metadata) 215 | 216 | def run_test(self, additional_metadata): 217 | 218 | conn_id = connections.ensure_connection(self) 219 | 220 | # ------------------------------- 221 | # ----------- Discovery ---------- 222 | # ------------------------------- 223 | 224 | # run in discovery mode 225 | check_job_name = runner.run_check_mode(self, conn_id) 226 | 227 | # verify check exit codes 228 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 229 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 230 | 231 | # verify the tap discovered the right streams 232 | found_catalogs = menagerie.get_catalogs(conn_id) 233 | 234 | # assert we find the correct streams 235 | self.assertEqual(self.expected_check_streams(), 236 | {c['tap_stream_id'] for c in found_catalogs}) 237 | 238 | # ----------------------------------- 239 | # -----------Initial Full Table Sync --------- 240 | # ----------------------------------- 241 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 242 | for stream_catalog in found_catalogs: 243 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 244 | additional_md = additional_metadata 245 | selected_metadata = 
connections.select_catalog_and_fields_via_metadata(conn_id, 246 | stream_catalog, 247 | annotated_schema, 248 | additional_md) 249 | # verify _id is marked in metadata as table-key-property 250 | self.assertEqual(stream_catalog['metadata']['table-key-properties'][0], '_id') 251 | 252 | runner.run_sync_mode(self, conn_id) 253 | 254 | # streams that we synced are the ones that we expect to see 255 | record_count_by_stream = runner.examine_target_output_file(self, 256 | conn_id, 257 | self.expected_sync_streams(), 258 | self.expected_pks()) 259 | 260 | records_by_stream = runner.get_records_from_target_output() 261 | 262 | # verify if we are capturing all the data for all the streams 263 | self.assertEqual(record_count_by_stream, self.expected_record_count()) 264 | 265 | # verify the values of primary key and the datatype in the replicated records 266 | for stream in records_by_stream.keys(): 267 | if stream not in ['coll_with_date_id', 'coll_with_no_id']: 268 | for records in [rec['data'] for rec in records_by_stream[stream]['messages'] if rec.get('action') == 'upsert']: 269 | self.assertIn(records['_id'], self.expected_pk_values()[stream]) 270 | self.assertIsInstance(records['_id'], self.expected_pk_datatype()[stream]) 271 | -------------------------------------------------------------------------------- /tests/test_mongodb_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | import random 4 | import string 5 | import unittest 6 | from pymongo import ASCENDING 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | populated_string_fields = {f"string_field_{i}": random_string_generator() for i in range(1, 64)} 21 | for int_value in range(num_docs): 22 | docs.append({"int_field": int_value, **populated_string_fields}) 23 | return docs 24 | 25 | class MongoDBOplog(unittest.TestCase): 26 | def setUp(self): 27 | 28 | ensure_environment_variables_set() 29 | 30 | with get_test_connection() as client: 31 | ############# Drop all dbs/collections ############# 32 | drop_all_collections(client) 33 | 34 | ############# Add simple collections ############# 35 | # simple_coll_1 has 50 documents 36 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 37 | 38 | # simple_coll_2 has 100 documents 39 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 40 | 41 | for index in self.expected_string_fields(): 42 | client["simple_db"]["simple_coll_1"].create_index(index) 43 | 44 | # # max 32 fields in a compound index (NO PLANS TO SUPPORT THIS IN THE TAP) 45 | # client["simple_db"]["simple_coll_1"].create_index([ 46 | # ('string_field', pymongo.ASCENDING), ('string_field_02', pymongo.ASCENDING), 47 | # ('string_field_03', pymongo.ASCENDING), ('string_field_04', pymongo.ASCENDING), 48 | # ('string_field_05', pymongo.ASCENDING), ('string_field_06', pymongo.ASCENDING), 49 | # ('string_field_07', pymongo.ASCENDING), ('string_field_08', pymongo.ASCENDING), 50 | # ('string_field_09', pymongo.ASCENDING), ('string_field_10', pymongo.ASCENDING), 51 | # ('string_field_11', pymongo.ASCENDING), ('string_field_12', 
pymongo.ASCENDING), 52 | # ('string_field_13', pymongo.ASCENDING), ('string_field_14', pymongo.ASCENDING), 53 | # ('string_field_15', pymongo.ASCENDING), ('string_field_16', pymongo.ASCENDING), 54 | # ('string_field_17', pymongo.ASCENDING), ('string_field_18', pymongo.ASCENDING), 55 | # ('string_field_19', pymongo.ASCENDING), ('string_field_20', pymongo.ASCENDING), 56 | # ('string_field_21', pymongo.ASCENDING), ('string_field_22', pymongo.ASCENDING), 57 | # ('string_field_23', pymongo.ASCENDING), ('string_field_24', pymongo.ASCENDING), 58 | # ('string_field_25', pymongo.ASCENDING), ('string_field_26', pymongo.ASCENDING), 59 | # ('string_field_27', pymongo.ASCENDING), ('string_field_28', pymongo.ASCENDING), 60 | # ('string_field_29', pymongo.ASCENDING), ('string_field_30', pymongo.ASCENDING), 61 | # ('string_field_31', pymongo.ASCENDING), ('string_field_32', pymongo.ASCENDING)]) 62 | 63 | self.index_info = client["simple_db"]["simple_coll_1"].index_information() 64 | 65 | def expected_check_streams(self): 66 | return { 67 | 'simple_db-simple_coll_1', 68 | 'simple_db-simple_coll_2', 69 | } 70 | 71 | def expected_pks(self): 72 | return { 73 | 'simple_coll_1': {'_id'}, 74 | 'simple_coll_2': {'_id'}, 75 | } 76 | 77 | def expected_row_counts(self): 78 | return { 79 | 'simple_coll_1': 50, 80 | 'simple_coll_2': 100, 81 | } 82 | 83 | def expected_sync_streams(self): 84 | return { 85 | 'simple_coll_1', 86 | 'simple_coll_2' 87 | } 88 | 89 | def name(self): 90 | return "tap_tester_mongodb_index" 91 | 92 | def tap_name(self): 93 | return "tap-mongodb" 94 | 95 | def get_type(self): 96 | return "platform.mongodb" 97 | 98 | def get_credentials(self): 99 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 100 | 101 | def get_properties(self): 102 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 103 | 'port' : os.getenv('TAP_MONGODB_PORT'), 104 | 'user' : os.getenv('TAP_MONGODB_USER'), 105 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 106 | } 107 | 108 | def expected_string_fields(self): 109 | # Max index count = 64. 
63 strings + '_id' 110 | return {f"string_field_{i}" for i in range(1, 64)} 111 | 112 | 113 | def test_run(self): 114 | 115 | conn_id = connections.ensure_connection(self) 116 | 117 | # ----------------------------------- 118 | # ----------- Discovery ------------ 119 | # ----------------------------------- 120 | 121 | # run in discovery mode 122 | check_job_name = runner.run_check_mode(self, conn_id) 123 | 124 | # verify check exit codes 125 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 126 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 127 | 128 | # verify the tap discovered the right streams 129 | found_catalogs = menagerie.get_catalogs(conn_id) 130 | 131 | # assert we find the correct streams 132 | self.assertEqual(self.expected_check_streams(), 133 | {c['tap_stream_id'] for c in found_catalogs}) 134 | 135 | for tap_stream_id in self.expected_check_streams(): 136 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 137 | 138 | # assert that the pks are correct 139 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 140 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 141 | 142 | # assert that the row counts are correct 143 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 144 | found_stream.get('metadata', {}).get('row-count')) 145 | 146 | # no plans for tap to support compound index, may not appear in valid-replication-keys list 147 | discovered_replication_keys = found_catalogs[0]['metadata']['valid-replication-keys'] 148 | for field in self.expected_string_fields(): 149 | self.assertIn(field, discovered_replication_keys) 150 | self.assertIn('_id', discovered_replication_keys) 151 | self.assertEqual(64, len(discovered_replication_keys)) 152 | -------------------------------------------------------------------------------- /tests/test_mongodb_oplog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | from bson import ObjectId 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | class MongoDBOplog(unittest.TestCase): 24 | def setUp(self): 25 | 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | ############# Drop all dbs/collections ############# 30 | drop_all_collections(client) 31 | 32 | ############# Add simple collections ############# 33 | # simple_coll_1 has 50 documents 34 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 35 | 36 | # simple_coll_2 has 100 documents 37 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 38 | 39 | 40 | 41 | 42 | def expected_check_streams(self): 43 | return { 44 | 'simple_db-simple_coll_1', 45 | 'simple_db-simple_coll_2', 46 | } 47 | 48 | def expected_pks(self): 49 | return { 50 | 'simple_coll_1': {'_id'}, 51 | 'simple_coll_2': {'_id'}, 52 | } 53 | 54 | def expected_row_counts(self): 
55 | return { 56 | 'simple_coll_1': 50, 57 | 'simple_coll_2': 100, 58 | } 59 | 60 | 61 | def expected_sync_streams(self): 62 | return { 63 | 'simple_coll_1', 64 | 'simple_coll_2' 65 | } 66 | 67 | def name(self): 68 | return "tap_tester_mongodb_oplog" 69 | 70 | def tap_name(self): 71 | return "tap-mongodb" 72 | 73 | def get_type(self): 74 | return "platform.mongodb" 75 | 76 | def get_credentials(self): 77 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 78 | 79 | def get_properties(self): 80 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 81 | 'port' : os.getenv('TAP_MONGODB_PORT'), 82 | 'user' : os.getenv('TAP_MONGODB_USER'), 83 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 84 | } 85 | 86 | 87 | def test_run(self): 88 | 89 | conn_id = connections.ensure_connection(self) 90 | 91 | # ------------------------------- 92 | # ----------- Discovery ---------- 93 | # ------------------------------- 94 | 95 | # run in discovery mode 96 | check_job_name = runner.run_check_mode(self, conn_id) 97 | 98 | # verify check exit codes 99 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 100 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 101 | 102 | # verify the tap discovered the right streams 103 | found_catalogs = menagerie.get_catalogs(conn_id) 104 | 105 | # assert we find the correct streams 106 | self.assertEqual(self.expected_check_streams(), 107 | {c['tap_stream_id'] for c in found_catalogs}) 108 | 109 | 110 | 111 | for tap_stream_id in self.expected_check_streams(): 112 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 113 | 114 | # assert that the pks are correct 115 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 116 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 117 | 118 | # assert that the row counts are correct 119 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 120 | found_stream.get('metadata', {}).get('row-count')) 121 | 122 | # ----------------------------------- 123 | # ----------- Initial Full Table --------- 124 | # ----------------------------------- 125 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 126 | for stream_catalog in found_catalogs: 127 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 128 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 129 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 130 | stream_catalog, 131 | annotated_schema, 132 | additional_md) 133 | 134 | # Run sync 135 | sync_job_name = runner.run_sync_mode(self, conn_id) 136 | 137 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 138 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 139 | 140 | 141 | # verify the persisted schema was correct 142 | records_by_stream = runner.get_records_from_target_output() 143 | 144 | # assert that each of the streams that we synced are the ones that we expect to see 145 | record_count_by_stream = runner.examine_target_output_file(self, 146 | conn_id, 147 | self.expected_sync_streams(), 148 | self.expected_pks()) 149 | 150 | # Verify that the full table was synced 151 | for tap_stream_id in self.expected_sync_streams(): 152 | self.assertGreaterEqual(record_count_by_stream[tap_stream_id],self.expected_row_counts()[tap_stream_id]) 153 | 154 | # Verify that we have 'initial_full_table_complete' bookmark 155 | state = 
menagerie.get_state(conn_id) 156 | first_versions = {} 157 | 158 | for tap_stream_id in self.expected_check_streams(): 159 | # assert that the state has an initial_full_table_complete == True 160 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 161 | # assert that there is a version bookmark in state 162 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 163 | self.assertIsNotNone(first_versions[tap_stream_id]) 164 | # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark 165 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 166 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 167 | 168 | 169 | changed_ids = set() 170 | with get_test_connection() as client: 171 | # Delete two documents for each collection 172 | 173 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 0})[0]['_id']) 174 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0}) 175 | 176 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 1})[0]['_id']) 177 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1}) 178 | 179 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 0})[0]['_id']) 180 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) 181 | 182 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 1})[0]['_id']) 183 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) 184 | 185 | # Update two documents for each collection 186 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 48})[0]['_id']) 187 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 48},{'$set': {'int_field': -1}}) 188 | 189 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 49})[0]['_id']) 190 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 49},{'$set': {'int_field': -1}}) 191 | 192 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 98})[0]['_id']) 193 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 194 | 195 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 99})[0]['_id']) 196 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 197 | 198 | # Insert two documents for each collection 199 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 50, "string_field": random_string_generator()}) 200 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 50})[0]['_id']) 201 | 202 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 51, "string_field": random_string_generator()}) 203 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 51})[0]['_id']) 204 | 205 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 206 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 100})[0]['_id']) 207 | 208 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 209 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 101})[0]['_id']) 210 | 211 | # ----------------------------------- 212 | # ----------- Subsequent Oplog Sync --------- 213 | # ----------------------------------- 214 | 215 | # Run sync 216 | 217 | sync_job_name = runner.run_sync_mode(self, conn_id) 218 | 219 | exit_status = 
menagerie.get_exit_status(conn_id, sync_job_name) 220 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 221 | 222 | 223 | # verify the persisted schema was correct 224 | messages_by_stream = runner.get_records_from_target_output() 225 | records_by_stream = {} 226 | for stream_name in self.expected_sync_streams(): 227 | records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 228 | 229 | 230 | # assert that each of the streams that we synced are the ones that we expect to see 231 | record_count_by_stream = runner.examine_target_output_file(self, 232 | conn_id, 233 | self.expected_sync_streams(), 234 | self.expected_pks()) 235 | 236 | # Verify that we got at least 6 records due to changes 237 | # (could be more due to overlap in gte oplog clause) 238 | for k,v in record_count_by_stream.items(): 239 | self.assertGreaterEqual(v, 6) 240 | 241 | # Verify that we got 2 records with _SDC_DELETED_AT 242 | for stream in self.expected_sync_streams(): 243 | self.assertEqual(2, len([x['data'] for x in records_by_stream[stream] 244 | if x['data'].get('_sdc_deleted_at')])) 245 | 246 | # Verify that the _id of the records sent are the same set as the 247 | # _ids of the documents changed 248 | actual_ids = {ObjectId(x['data']['_id']) for stream in self.expected_sync_streams() 249 | for x in records_by_stream[stream]} 250 | self.assertEqual(changed_ids, actual_ids) 251 | -------------------------------------------------------------------------------- /tests/test_mongodb_oplog_aged_out.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import time 5 | import unittest 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | class MongoDBOplogAgedOut(unittest.TestCase): 24 | def setUp(self): 25 | ensure_environment_variables_set() 26 | 27 | with get_test_connection() as client: 28 | ############# Drop all dbs/collections ############# 29 | drop_all_collections(client) 30 | 31 | ############# Add simple collections ############ 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | 36 | 37 | def expected_check_streams(self): 38 | return { 39 | 'simple_db-simple_coll_1' 40 | } 41 | 42 | def expected_pks(self): 43 | return { 44 | 'simple_coll_1': {'_id'} 45 | } 46 | 47 | def expected_row_counts(self): 48 | return { 49 | 'simple_coll_1': 50 50 | } 51 | 52 | 53 | def expected_sync_streams(self): 54 | return { 55 | 'simple_coll_1' 56 | } 57 | 58 | def name(self): 59 | return "tap_tester_mongodb_oplog_aged_out" 60 | 61 | def tap_name(self): 62 | return "tap-mongodb" 63 | 64 | def get_type(self): 65 | return "platform.mongodb" 66 | 67 | def get_credentials(self): 68 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 69 | 70 | def get_properties(self): 71 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 72 | 'port' : os.getenv('TAP_MONGODB_PORT'), 73 | 
'user' : os.getenv('TAP_MONGODB_USER'), 74 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 75 | } 76 | 77 | 78 | def test_run(self): 79 | 80 | conn_id = connections.ensure_connection(self) 81 | 82 | # ------------------------------- 83 | # ----------- Discovery ---------- 84 | # ------------------------------- 85 | 86 | # run in discovery mode 87 | check_job_name = runner.run_check_mode(self, conn_id) 88 | 89 | # verify check exit codes 90 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 91 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 92 | 93 | # verify the tap discovered the right streams 94 | found_catalogs = menagerie.get_catalogs(conn_id) 95 | 96 | # assert we find the correct streams 97 | self.assertEqual(self.expected_check_streams(), 98 | {c['tap_stream_id'] for c in found_catalogs}) 99 | 100 | 101 | 102 | for tap_stream_id in self.expected_check_streams(): 103 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 104 | 105 | # assert that the pks are correct 106 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 107 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 108 | 109 | # assert that the row counts are correct 110 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 111 | found_stream.get('metadata', {}).get('row-count')) 112 | 113 | # ----------------------------------- 114 | # ----------- Full Table Sync --------- 115 | # ----------------------------------- 116 | # Select the simple_coll_1 stream and add replication method metadata 117 | for stream_catalog in found_catalogs: 118 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 119 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 120 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 121 | stream_catalog, 122 | annotated_schema, 123 | additional_md) 124 | # Synthesize interrupted state 125 | original_version = int(time.time() * 1000) 126 | interrupted_state = { 127 | 'currently_syncing' : 'simple_db-simple_coll_1', 128 | 'bookmarks' : { 129 | 'simple_db-simple_coll_1': { 130 | 'version': original_version, 131 | 'initial_full_table_complete': True, 132 | 'oplog_ts_time': 1, 133 | 'oplog_ts_inc': 0 134 | } 135 | } 136 | } 137 | 138 | menagerie.set_state(conn_id, interrupted_state) 139 | 140 | # The tap should detect that the oplog bookmark has aged out and execute a full resync 141 | runner.run_sync_mode(self, conn_id) 142 | 143 | # verify the persisted schema was correct 144 | records_by_stream = runner.get_records_from_target_output() 145 | 146 | # assert that each of the streams that we synced are the ones that we expect to see 147 | record_count_by_stream = runner.examine_target_output_file(self, 148 | conn_id, 149 | self.expected_sync_streams(), 150 | self.expected_pks()) 151 | 152 | # assert that the resync emits an ActivateVersionMessage as both the first and the last message 153 | for stream_name in self.expected_sync_streams(): 154 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 155 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][51]['action']) 156 | 157 | 158 | # assert that the resync assigned a new table version 159 | final_state = menagerie.get_state(conn_id) 160 | self.assertNotEqual(original_version, final_state.get('bookmarks', {}).get('simple_db-simple_coll_1',
{}).get('version')) 161 | 162 | # assert that all rows in the collection were sync'd 163 | for stream_id, row_count in self.expected_row_counts().items(): 164 | self.assertGreaterEqual(record_count_by_stream[stream_id], row_count) 165 | 166 | # assert that each stream has a initial_full_table_complete=True bookmark 167 | self.assertIsNotNone(final_state.get('bookmarks', {}).get('simple_db-simple_coll_1', {}).get('oplog_ts_time')) 168 | self.assertIsNotNone(final_state.get('bookmarks', {}).get('simple_db-simple_coll_1', {}).get('oplog_ts_inc')) 169 | self.assertTrue(final_state.get('bookmarks', {}).get('simple_db-simple_coll_1', {}).get('initial_full_table_complete')) 170 | -------------------------------------------------------------------------------- /tests/test_mongodb_oplog_bookmarks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | import random 4 | import string 5 | import time 6 | import unittest 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBOplogBookmarks(unittest.TestCase): 25 | def setUp(self): 26 | 27 | ensure_environment_variables_set() 28 | 29 | with get_test_connection() as client: 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # simple_coll_2 has 100 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 37 | 38 | 39 | def expected_check_streams(self): 40 | return { 41 | 'simple_db-simple_coll_1', 42 | 'simple_db-simple_coll_2', 43 | } 44 | 45 | def expected_pks(self): 46 | return { 47 | 'simple_coll_1': {'_id'}, 48 | 'simple_coll_2': {'_id'}, 49 | } 50 | 51 | def expected_row_counts(self): 52 | return { 53 | 'simple_coll_1': 50, 54 | 'simple_coll_2': 100, 55 | 56 | } 57 | 58 | 59 | def expected_sync_streams(self): 60 | return { 61 | 'simple_coll_1', 62 | 'simple_coll_2', 63 | } 64 | 65 | def name(self): 66 | return "tap_tester_mongodb_oplog_bookmarks" 67 | 68 | def tap_name(self): 69 | return "tap-mongodb" 70 | 71 | def get_type(self): 72 | return "platform.mongodb" 73 | 74 | def get_credentials(self): 75 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 76 | 77 | def get_properties(self): 78 | return { 79 | 'host' : os.getenv('TAP_MONGODB_HOST'), 80 | 'port' : os.getenv('TAP_MONGODB_PORT'), 81 | 'user' : os.getenv('TAP_MONGODB_USER'), 82 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 83 | } 84 | 85 | 86 | def test_run(self): 87 | 88 | conn_id = connections.ensure_connection(self) 89 | 90 | # ------------------------------- 91 | # ----------- Discovery ---------- 92 | # ------------------------------- 93 | 94 | # run in discovery mode 95 | check_job_name = runner.run_check_mode(self, conn_id) 96 | 97 | # verify check exit codes 98 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 99 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 100 | 101 | # verify the 
tap discovered the right streams 102 | found_catalogs = menagerie.get_catalogs(conn_id) 103 | 104 | # assert we find the correct streams 105 | self.assertEqual(self.expected_check_streams(), 106 | {c['tap_stream_id'] for c in found_catalogs}) 107 | 108 | for tap_stream_id in self.expected_check_streams(): 109 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 110 | 111 | # assert that the pks are correct 112 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 113 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 114 | 115 | # assert that the row counts are correct 116 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 117 | found_stream.get('metadata', {}).get('row-count')) 118 | 119 | # ----------------------------------- 120 | # ----------- Initial Full Table --------- 121 | # ----------------------------------- 122 | # Select simple_coll_1 and add replication method metadata 123 | additional_md = [{ "breadcrumb" : [], 124 | "metadata" : {'replication-method' : 'LOG_BASED'}}] 125 | for stream_catalog in found_catalogs: 126 | if stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_1': 127 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 128 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 129 | stream_catalog, 130 | annotated_schema, 131 | additional_md) 132 | 133 | # Run sync 134 | sync_job_name = runner.run_sync_mode(self, conn_id) 135 | 136 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 137 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 138 | 139 | 140 | # verify the persisted schema was correct 141 | records_by_stream = runner.get_records_from_target_output() 142 | 143 | # assert that each of the streams that we synced are the ones that we expect to see 144 | record_count_by_stream = runner.examine_target_output_file(self, 145 | conn_id, 146 | self.expected_sync_streams(), 147 | self.expected_pks()) 148 | 149 | # Verify that the full table was synced 150 | tap_stream_id = 'simple_db-simple_coll_1' 151 | self.assertGreaterEqual(record_count_by_stream['simple_coll_1'], 152 | self.expected_row_counts()['simple_coll_1']) 153 | 154 | # Verify that we have 'initial_full_table_complete' bookmark 155 | state = menagerie.get_state(conn_id) 156 | first_versions = {} 157 | 158 | # assert that the state has an initial_full_table_complete == True 159 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 160 | # assert that there is a version bookmark in state 161 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 162 | self.assertIsNotNone(first_versions[tap_stream_id]) 163 | # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark 164 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 165 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 166 | 167 | 168 | 169 | # Insert records to coll_1 to get the bookmark to be a ts on coll_1 170 | with get_test_connection() as client: 171 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 172 | sync_job_name = runner.run_sync_mode(self, conn_id) 173 | 174 | 175 | changed_ids = set() 176 | with get_test_connection() as client: 177 | # Make changes to not selected collection 178 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 0})[0]['_id']) 179 | 
client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) 180 | 181 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 1})[0]['_id']) 182 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) 183 | 184 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 98})[0]['_id']) 185 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 186 | 187 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 99})[0]['_id']) 188 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 189 | 190 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 191 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 100})[0]['_id']) 192 | 193 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 194 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 101})[0]['_id']) 195 | 196 | # ----------------------------------- 197 | # ----------- Subsequent Oplog Sync --------- 198 | # ----------------------------------- 199 | 200 | # Run sync 201 | sync_job_name = runner.run_sync_mode(self, conn_id) 202 | 203 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 204 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 205 | 206 | # verify the persisted schema was correct 207 | messages_by_stream = runner.get_records_from_target_output() 208 | records_by_stream = { 209 | 'simple_coll_1': [x 210 | for x in messages_by_stream['simple_coll_1']['messages'] 211 | if x.get('action') == 'upsert'] 212 | } 213 | 214 | # assert that each of the streams that we synced are the ones that we expect to see 215 | record_count_by_stream = runner.examine_target_output_file(self, 216 | conn_id, 217 | self.expected_sync_streams(), 218 | self.expected_pks()) 219 | 220 | # 1 record due to fencepost querying on oplog ts 221 | self.assertEqual(1, record_count_by_stream['simple_coll_1']) 222 | 223 | final_state = menagerie.get_state(conn_id) 224 | 225 | with get_test_connection() as client: 226 | row = client.local.oplog.rs.find_one(sort=[('$natural', pymongo.DESCENDING)]) 227 | latest_oplog_ts = row.get('ts') 228 | 229 | self.assertEqual( 230 | (latest_oplog_ts.time, latest_oplog_ts.inc), 231 | (final_state['bookmarks']['simple_db-simple_coll_1']['oplog_ts_time'], 232 | final_state['bookmarks']['simple_db-simple_coll_1']['oplog_ts_inc']) 233 | ) 234 | -------------------------------------------------------------------------------- /tests/test_mongodb_projection.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import string 5 | import unittest 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | 24 | class MongoDBProjection(unittest.TestCase): 25 | 26 | def setUpDatabase(self): 27 | 
ensure_environment_variables_set() 28 | 29 | with get_test_connection() as client: 30 | ############# Drop all dbs/collections ############# 31 | drop_all_collections(client) 32 | 33 | ############# Add simple collections ############# 34 | # simple_coll_1 has 50 documents 35 | 36 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 37 | 38 | # simple_coll_2 has 100 documents 39 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 40 | 41 | def setUp(self): 42 | pass 43 | 44 | 45 | def expected_check_streams(self): 46 | return { 47 | 'simple_db-simple_coll_1', 48 | 'simple_db-simple_coll_2', 49 | } 50 | 51 | def expected_pks(self): 52 | return { 53 | 'simple_coll_1': {'_id'}, 54 | 'simple_coll_2': {'_id'}, 55 | } 56 | 57 | def expected_row_counts(self): 58 | return { 59 | 'simple_coll_1': 50, 60 | 'simple_coll_2': 100, 61 | } 62 | 63 | 64 | def expected_sync_streams(self): 65 | return { 66 | 'simple_coll_1', 67 | 'simple_coll_2' 68 | } 69 | 70 | def projection_expected_keys_list(self): 71 | return [ 72 | { 73 | "projection": {"int_field": 1}, 74 | "expected_keys": [{"_id", "int_field"}, 75 | {"_id", "_sdc_deleted_at"}] 76 | }, 77 | { 78 | "projection": {"int_field": 1, "_id": 1}, 79 | "expected_keys": [{"_id", "int_field"}, 80 | {"_id", "_sdc_deleted_at"}] 81 | }, 82 | { 83 | "projection": {"int_field": 0}, 84 | "expected_keys": [{"_id", "string_field"}, 85 | {"_id", "_sdc_deleted_at"}] 86 | }, 87 | { 88 | "projection": {"_id": 1}, 89 | "expected_keys": [{"_id"}, 90 | {"_id", "_sdc_deleted_at"}] 91 | }, 92 | { 93 | "projection": {}, 94 | "expected_keys": [{"_id", "string_field", "int_field"}, 95 | {"_id", "_sdc_deleted_at"}] 96 | }, 97 | { 98 | "projection": None, 99 | "expected_keys": [{"_id", "string_field", "int_field"}, 100 | {"_id", "_sdc_deleted_at"}] 101 | }, 102 | { 103 | "projection": "", 104 | "expected_keys": [{"_id", "string_field", "int_field"}, 105 | {"_id", "_sdc_deleted_at"}] 106 | } 107 | ] 108 | 109 | def name(self): 110 | return "tap_tester_mongodb_projection" 111 | 112 | def tap_name(self): 113 | return "tap-mongodb" 114 | 115 | def get_type(self): 116 | return "platform.mongodb" 117 | 118 | def get_credentials(self): 119 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 120 | 121 | def get_properties(self): 122 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 123 | 'port' : os.getenv('TAP_MONGODB_PORT'), 124 | 'user' : os.getenv('TAP_MONGODB_USER'), 125 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 126 | } 127 | 128 | def modify_database(self): 129 | with get_test_connection() as client: 130 | # Delete two documents for each collection 131 | 132 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0}) 133 | 134 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1}) 135 | 136 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) 137 | 138 | 139 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) 140 | 141 | # Update two documents for each collection 142 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 48},{'$set': {'int_field': -1}}) 143 | 144 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 49},{'$set': {'int_field': -1}}) 145 | 146 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 147 | 148 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 149 | 150 | # Insert two documents for each collection 151 | 
client["simple_db"]["simple_coll_1"].insert_one({"int_field": 50, "string_field": random_string_generator()}) 152 | 153 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 51, "string_field": random_string_generator()}) 154 | 155 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 156 | 157 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 158 | 159 | 160 | def run_single_projection(self, projection_mapping): 161 | self.setUpDatabase() 162 | conn_id = connections.ensure_connection(self) 163 | 164 | # ------------------------------- 165 | # ----------- Discovery ---------- 166 | # ------------------------------- 167 | 168 | # run in discovery mode 169 | check_job_name = runner.run_check_mode(self, conn_id) 170 | 171 | # verify check exit codes 172 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 173 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 174 | 175 | # verify the tap discovered the right streams 176 | found_catalogs = menagerie.get_catalogs(conn_id) 177 | 178 | # assert we find the correct streams 179 | self.assertEqual(self.expected_check_streams(), 180 | {c['tap_stream_id'] for c in found_catalogs}) 181 | 182 | 183 | 184 | for tap_stream_id in self.expected_check_streams(): 185 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 186 | 187 | # assert that the pks are correct 188 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 189 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 190 | 191 | # assert that the row counts are correct 192 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 193 | found_stream.get('metadata', {}).get('row-count')) 194 | 195 | # ----------------------------------- 196 | # ----------- Initial Full Table --------- 197 | # ----------------------------------- 198 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 199 | for stream_catalog in found_catalogs: 200 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 201 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 202 | if projection_mapping['projection'] is not None: 203 | additional_md[0]['metadata']['tap_mongodb.projection'] = json.dumps(projection_mapping['projection']) 204 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 205 | stream_catalog, 206 | annotated_schema, 207 | additional_md) 208 | 209 | # Run sync 210 | sync_job_name = runner.run_sync_mode(self, conn_id) 211 | 212 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 213 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 214 | 215 | 216 | # verify the persisted schema was correct 217 | messages_by_stream = runner.get_records_from_target_output() 218 | 219 | 220 | for stream_name in self.expected_sync_streams(): 221 | stream_records = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 222 | #actual_keys = set() 223 | 224 | for record in stream_records: 225 | # BUG TDL-23609. 
Pymongo v4.3+ returns entire document for empty projection 226 | if projection_mapping['projection'] == {}: 227 | continue 228 | 229 | self.assertIn(record['data'].keys(), projection_mapping['expected_keys']) 230 | #actual_keys = actual_keys.union(set(record['data'].keys())) 231 | 232 | #self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys'])) 233 | 234 | self.modify_database() 235 | 236 | # ----------------------------------- 237 | # ----------- Subsequent Oplog Sync --------- 238 | # ----------------------------------- 239 | 240 | # Run sync 241 | sync_job_name = runner.run_sync_mode(self, conn_id) 242 | 243 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 244 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 245 | 246 | 247 | # verify the persisted schema was correct 248 | messages_by_stream = runner.get_records_from_target_output() 249 | 250 | for stream_name in self.expected_sync_streams(): 251 | stream_records = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 252 | #actual_keys = set() 253 | for record in stream_records: 254 | # BUG TDL-23609. Pymongo v4.3+ returns entire document for empty projection 255 | if projection_mapping['projection'] == {}: 256 | continue 257 | 258 | self.assertIn(record['data'].keys(), projection_mapping['expected_keys']) 259 | #actual_keys = actual_keys.union(set(record['data'].keys())) 260 | #self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys'])) 261 | 262 | 263 | def test_run(self): 264 | for projection_mapping in self.projection_expected_keys_list(): 265 | self.run_single_projection(projection_mapping) 266 | -------------------------------------------------------------------------------- /tests/test_mongodb_table_reset_log.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | 6 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 7 | from tap_tester import connections, menagerie, runner 8 | 9 | 10 | RECORD_COUNT = {} 11 | 12 | 13 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 14 | return ''.join(random.choice(chars) for x in range(size)) 15 | 16 | def generate_simple_coll_docs(num_docs): 17 | docs = [] 18 | for int_value in range(num_docs): 19 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 20 | return docs 21 | 22 | class MongoDBTableResetLog(unittest.TestCase): 23 | def setUp(self): 24 | 25 | ensure_environment_variables_set() 26 | 27 | with get_test_connection() as client: 28 | ############# Drop all dbs/collections ############# 29 | drop_all_collections(client) 30 | 31 | ############# Add simple collections ############# 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # simple_coll_2 has 100 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 37 | 38 | 39 | def expected_check_streams(self): 40 | return { 41 | 'simple_db-simple_coll_1', 42 | 'simple_db-simple_coll_2', 43 | } 44 | 45 | def expected_pks(self): 46 | return { 47 | 'simple_coll_1': {'_id'}, 48 | 'simple_coll_2': {'_id'}, 49 | } 50 | 51 | def expected_row_counts(self): 52 | return { 53 | 'simple_coll_1': 50, 54 | 'simple_coll_2': 100, 55 | } 56 | 57 | def expected_sync_streams(self): 58 | return { 59 | 'simple_coll_1', 60 | 
'simple_coll_2' 61 | } 62 | 63 | def name(self): 64 | return "tap_tester_mongodb_table_reset_log" 65 | 66 | def tap_name(self): 67 | return "tap-mongodb" 68 | 69 | def get_type(self): 70 | return "platform.mongodb" 71 | 72 | def get_credentials(self): 73 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 74 | 75 | def get_properties(self): 76 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 77 | 'port' : os.getenv('TAP_MONGODB_PORT'), 78 | 'user' : os.getenv('TAP_MONGODB_USER'), 79 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 80 | } 81 | 82 | 83 | def test_run(self): 84 | 85 | conn_id = connections.ensure_connection(self) 86 | 87 | # --------------------------------- 88 | # ----------- Discovery ---------- 89 | # --------------------------------- 90 | 91 | # run in discovery mode 92 | check_job_name = runner.run_check_mode(self, conn_id) 93 | 94 | # verify check exit codes 95 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 96 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 97 | 98 | # verify the tap discovered the right streams 99 | found_catalogs = menagerie.get_catalogs(conn_id) 100 | 101 | # assert we find the correct streams 102 | self.assertEqual(self.expected_check_streams(), 103 | {c['tap_stream_id'] for c in found_catalogs}) 104 | 105 | 106 | for tap_stream_id in self.expected_check_streams(): 107 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 108 | 109 | # assert that the pks are correct 110 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 111 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 112 | 113 | # assert that the row counts are correct 114 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 115 | found_stream.get('metadata', {}).get('row-count')) 116 | 117 | # ---------------------------------------- 118 | # ----------- Initial Full Table --------- 119 | # ---------------------------------------- 120 | 121 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 122 | for stream_catalog in found_catalogs: 123 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 124 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 125 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 126 | stream_catalog, 127 | annotated_schema, 128 | additional_md) 129 | 130 | # Run sync 131 | sync_job_name = runner.run_sync_mode(self, conn_id) 132 | 133 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 134 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 135 | 136 | # verify the persisted schema was correct 137 | records_by_stream = runner.get_records_from_target_output() 138 | 139 | # assert that each of the streams that we synced are the ones that we expect to see 140 | record_count_by_stream = runner.examine_target_output_file(self, 141 | conn_id, 142 | self.expected_sync_streams(), 143 | self.expected_pks()) 144 | 145 | # Verify that the full table was synced 146 | for tap_stream_id in self.expected_sync_streams(): 147 | self.assertGreaterEqual(record_count_by_stream[tap_stream_id],self.expected_row_counts()[tap_stream_id]) 148 | 149 | # manipulate state to simulate table reset 150 | state = menagerie.get_state(conn_id) 151 | reset_stream = 'simple_db-simple_coll_2' 152 | state['bookmarks'].pop(reset_stream) 153 | menagerie.set_state(conn_id, state) 154 | 155 | 156 | # 
------------------------------------------- 157 | # ----------- Subsequent Oplog Sync --------- 158 | # ------------------------------------------- 159 | 160 | # Run sync 161 | sync_job_name = runner.run_sync_mode(self, conn_id) 162 | 163 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 164 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 165 | 166 | # Verify that we have 'initial_full_table_complete' bookmark 167 | state = menagerie.get_state(conn_id) 168 | first_versions = {} 169 | 170 | for tap_stream_id in self.expected_check_streams(): 171 | # assert that the state has an initial_full_table_complete == True 172 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 173 | # assert that there is a version bookmark in state 174 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 175 | self.assertIsNotNone(first_versions[tap_stream_id]) 176 | # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark 177 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 178 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 179 | 180 | # verify the persisted schema was correct 181 | messages_by_stream = runner.get_records_from_target_output() 182 | records_by_stream = {} 183 | for stream_name in self.expected_sync_streams(): 184 | records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 185 | 186 | # assert that each of the streams that we synced are the ones that we expect to see 187 | record_count_by_stream = runner.examine_target_output_file(self, 188 | conn_id, 189 | self.expected_sync_streams(), 190 | self.expected_pks()) 191 | 192 | # Verify the expected number of records per table 193 | for k,v in record_count_by_stream.items(): 194 | if k == 'simple_coll_1': 195 | self.assertEqual(v, 0) # not reset 196 | if k == 'simple_coll_2': 197 | self.assertEqual(v, 100) # reset stream 198 | -------------------------------------------------------------------------------- /tests/test_mongodb_views.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import os 3 | import random 4 | import string 5 | import unittest 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 12 | return ''.join(random.choice(chars) for x in range(size)) 13 | 14 | 15 | def generate_simple_coll_questions(num_docs): 16 | docs = [] 17 | for int_value in range(num_docs): 18 | docs.append({"question_id": int_value, "question": random_string_generator()}) 19 | return docs 20 | 21 | 22 | def generate_simple_coll_answers(num_docs): 23 | docs = [] 24 | for int_value in range(num_docs): 25 | docs.append({"answer_id": int_value, "answer": random_string_generator()}) 26 | return docs 27 | 28 | 29 | class MongoDBViewDiscovery(unittest.TestCase): 30 | 31 | def setUp(self): 32 | 33 | ensure_environment_variables_set() 34 | 35 | with get_test_connection() as client: 36 | # drop all dbs/collections 37 | drop_all_collections(client) 38 | 39 | # questions has 20 documents 40 | client["simple_db"]["questions"].insert_many(generate_simple_coll_questions(20)) 41 | 42 | # answers has 30 documents 43 | client["simple_db"]["answers"].insert_many(generate_simple_coll_answers(30)) 44 | 45 | # 
create view on questions 46 | client["simple_db"].command(bson.son.SON([("create", "question_view"), ("viewOn", "questions"), ("pipeline", [])])) 47 | 48 | # create a view by combining two collections 49 | client["simple_db"].create_collection( 50 | 'combined_view', 51 | viewOn='questions', 52 | pipeline=[{ 53 | '$lookup': { 54 | 'from': 'answers', 55 | 'localField': 'question_id', 56 | 'foreignField': 'answer_id', 57 | 'as': 'combined_view_final' 58 | } 59 | }] 60 | ) 61 | 62 | def name(self): 63 | return "tap_tester_mongodb_views" 64 | 65 | def tap_name(self): 66 | return "tap-mongodb" 67 | 68 | def get_type(self): 69 | return "platform.mongodb" 70 | 71 | def get_credentials(self): 72 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 73 | 74 | def get_properties(self): 75 | return {'host': os.getenv('TAP_MONGODB_HOST'), 76 | 'port': os.getenv('TAP_MONGODB_PORT'), 77 | 'user': os.getenv('TAP_MONGODB_USER'), 78 | 'database': os.getenv('TAP_MONGODB_DBNAME'), 79 | 'include_schemas_in_destination_stream_name': 'true' 80 | } 81 | 82 | def expected_check_streams(self): 83 | return {'simple_db-questions', 84 | 'simple_db-answers'} 85 | 86 | def expected_pks(self): 87 | return { 88 | 'simple_db_questions': {'_id'}, 89 | 'simple_db_answers': {'_id'} 90 | } 91 | 92 | def expected_row_counts(self): 93 | return { 94 | 'simple_db_questions': 20, 95 | 'simple_db_answers': 30 96 | } 97 | 98 | def expected_sync_streams(self): 99 | return { 100 | 'simple_db_questions', 101 | 'simple_db_answers' 102 | } 103 | 104 | def test_run(self): 105 | 106 | conn_id = connections.ensure_connection(self) 107 | 108 | # ------------------------------- 109 | # ----------- Discovery ---------- 110 | # ------------------------------- 111 | 112 | # run in discovery mode 113 | check_job_name = runner.run_check_mode(self, conn_id) 114 | 115 | # verify check exit codes 116 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 117 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 118 | 119 | # verify the tap discovered the right streams 120 | found_catalogs = menagerie.get_catalogs(conn_id) 121 | 122 | # validate that the views are not discovered by the tap 123 | discovered_streams = set([catalog['tap_stream_id'] for catalog in found_catalogs]) 124 | self.assertEqual(discovered_streams, self.expected_check_streams()) 125 | 126 | # validate the discovered streams are not views 127 | for stream_catalog in found_catalogs: 128 | self.assertEqual(stream_catalog['metadata']['is-view'], False) 129 | 130 | for stream_catalog in found_catalogs: 131 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 132 | additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'FULL_TABLE'}}] 133 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 134 | stream_catalog, 135 | annotated_schema, 136 | additional_md) 137 | 138 | # run full table sync 139 | sync_job_name = runner.run_sync_mode(self, conn_id) 140 | 141 | # check exit status 142 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 143 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 144 | 145 | # streams that we synced are the ones that we expect to see 146 | records_by_stream = runner.get_records_from_target_output() 147 | record_count_by_stream = runner.examine_target_output_file(self, 148 | conn_id, 149 | self.expected_sync_streams(), 150 | self.expected_pks()) 151 | 152 | # assert that we get the correct number of records for each 
stream 153 | self.assertEqual(self.expected_row_counts(), record_count_by_stream) 154 | --------------------------------------------------------------------------------
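
Note on the oplog bookmark assertions earlier in this listing: the subsequent-sync check compares the saved oplog_ts_time / oplog_ts_inc bookmark against the newest entry in local.oplog.rs. The sketch below is a hypothetical illustration of how such a bookmark could be used to resume reading the oplog with pymongo; the connection string and the bookmark values are assumptions, and this is not the tap's actual sync code.

    import pymongo
    from bson.timestamp import Timestamp

    # Hypothetical bookmark mirroring the oplog_ts_time / oplog_ts_inc keys asserted in the tests.
    bookmark = {'oplog_ts_time': 1700000000, 'oplog_ts_inc': 1}

    # Assumed local replica-set test instance; adjust credentials/host as needed.
    client = pymongo.MongoClient("mongodb://dev:Password1@127.0.0.1:27017/admin")
    resume_ts = Timestamp(bookmark['oplog_ts_time'], bookmark['oplog_ts_inc'])

    # A $gte query starting at the bookmarked timestamp re-reads the bookmarked entry
    # itself, which is one way a single extra "fencepost" record can appear even when
    # no documents changed between syncs.
    for entry in client.local.oplog.rs.find({'ts': {'$gte': resume_ts}}).sort('$natural', pymongo.ASCENDING):
        print(entry['op'], entry.get('ns'), entry['ts'])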
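
The projection test (tests/test_mongodb_projection.py) skips its key assertions when the projection is {} because of the bug referenced in its comments (TDL-23609: pymongo 4.3+ returns the entire document for an empty projection). Assuming the same simple_db.simple_coll_1 fixture and a local test instance, a minimal sketch of the difference that comment describes:

    import pymongo

    # Assumed local test instance with the simple_db fixture loaded.
    client = pymongo.MongoClient("mongodb://dev:Password1@127.0.0.1:27017/admin")
    coll = client['simple_db']['simple_coll_1']

    # Explicit inclusion projection: only _id and int_field are returned.
    doc = coll.find_one({}, projection={'int_field': 1})
    print(sorted(doc.keys()))   # ['_id', 'int_field']

    # Empty-dict projection: per the TDL-23609 note, pymongo 4.3+ treats this as
    # "no projection", so the whole document comes back.
    doc = coll.find_one({}, projection={})
    print(sorted(doc.keys()))   # ['_id', 'int_field', 'string_field'] on pymongo 4.3+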
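
The view test (tests/test_mongodb_views.py) asserts that question_view and combined_view are excluded from discovery and that every discovered stream reports is-view as False. MongoDB itself distinguishes views from collections in listCollections output; the sketch below shows how to inspect that directly with pymongo against the same fixture (connection details assumed; this is not necessarily how the tap performs discovery):

    import pymongo

    # Assumed local test instance with the questions/answers fixture and views created.
    client = pymongo.MongoClient("mongodb://dev:Password1@127.0.0.1:27017/admin")
    db = client['simple_db']

    # listCollections reports a 'type' of either 'collection' or 'view' per namespace.
    for info in db.list_collections():
        print(info['name'], info['type'])

    # Server-side filter that returns only real collections, excluding the views.
    print(db.list_collection_names(filter={'type': 'collection'}))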