├── .circleci └── config.yml ├── .github └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── bin ├── populate_test_database.py └── test-db ├── setup.py ├── spikes ├── atlas_setup.md ├── dbreps_and_hackathon_review.md ├── local_mongo_setup.md ├── pymongo_spike.py └── supported_versions_spike.md ├── tap_mongodb ├── __init__.py └── sync_strategies │ ├── common.py │ ├── full_table.py │ ├── incremental.py │ └── oplog.py └── tests ├── __init__.py ├── mongodb_common.py ├── test_mongodb_cname_restrictions.py ├── test_mongodb_configurable_properties.py ├── test_mongodb_datatype.py ├── test_mongodb_discovery.py ├── test_mongodb_fname_restrictions.py ├── test_mongodb_full_table.py ├── test_mongodb_full_table_id.py ├── test_mongodb_full_table_interruptible.py ├── test_mongodb_id_pk_variations.py ├── test_mongodb_incremental.py ├── test_mongodb_incremental_open_transactions.py ├── test_mongodb_index.py ├── test_mongodb_log_based_interruptible.py ├── test_mongodb_name_restrictions.py ├── test_mongodb_namespace_restrictions.py ├── test_mongodb_oplog.py ├── test_mongodb_oplog_aged_out.py ├── test_mongodb_oplog_bookmarks.py ├── test_mongodb_projection.py ├── test_mongodb_table_reset_inc.py ├── test_mongodb_table_reset_log.py ├── test_mongodb_views.py └── unittests └── test_common.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | slack: circleci/slack@3.4.2 4 | 5 | executors: 6 | tap_tester_mongo_4_4: 7 | docker: 8 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 9 | - image: singerio/mongo:4.4-bionic 10 | environment: 11 | MONGO_INITDB_ROOT_USERNAME: dev 12 | MONGO_INITDB_ROOT_PASSWORD: Password1 13 | command: [mongod, --replSet, rs0, --keyFile, /opt/mongo/keyfile] 14 | tap_tester_mongo_5_0: 15 | docker: 16 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 17 | - image: singerio/mongo:5.0 18 | environment: 19 | MONGO_INITDB_ROOT_USERNAME: dev 20 | MONGO_INITDB_ROOT_PASSWORD: Password1 21 | command: [mongod, --replSet, rs0, --keyFile, /opt/mongo/keyfile] 22 | tap_tester_mongo_6_0: 23 | docker: 24 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 25 | - image: singerio/mongo:6.0 26 | environment: 27 | MONGO_INITDB_ROOT_USERNAME: dev 28 | MONGO_INITDB_ROOT_PASSWORD: Password1 29 | command: [mongod, --replSet, rs0, --keyFile, /opt/mongo/keyfile] 30 | 31 | jobs: 32 | build: 33 | executor: tap_tester_mongo_4_4 34 | steps: 35 | - checkout 36 | - run: 37 | name: 'Install Dockerize' 38 | command: | 39 | wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 40 | tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 41 | rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 42 | environment: 43 | DOCKERIZE_VERSION: v0.3.0 44 | - run: 45 | name: 'Wait for Mongo' 46 | command: | 47 | dockerize -wait tcp://127.0.0.1:27017 -timeout 1m 48 | sleep 10 49 | - run: 50 | name: 'Setup Mongo' 51 | command: | 52 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox tap-tester.env 53 | source tap-tester.env 54 | wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | apt-key add - 55 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/5.0 multiverse" \ 56 | | tee 
/etc/apt/sources.list.d/mongodb-org-5.0.list 57 | apt-get update 58 | apt-get install -y mongodb-org-shell mongodb-mongosh mongodb-org 59 | mongosh -u $TAP_MONGODB_USER \ 60 | -p $TAP_MONGODB_PASSWORD \ 61 | --authenticationDatabase admin \ 62 | --eval "rs.initiate({_id: \"rs0\", members: [{_id: 0, host: \"$TAP_MONGODB_HOST:$TAP_MONGODB_PORT\"}]})" 63 | - run: 64 | name: 'Setup virtual env' 65 | command: | 66 | pyenv local 3.9.6 67 | python3 -mvenv /usr/local/share/virtualenvs/tap-mongodb 68 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 69 | pip install -U 'pip==23.2' 'setuptools==68.0.0' 70 | pip install .[dev] 71 | - run: 72 | name: 'pylint' 73 | command: | 74 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 75 | make test 76 | - run: 77 | name: "Unit Tests" 78 | command: | 79 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 80 | pip install pymongo==4.4.0 nose2 81 | nose2 -v -s tests/unittests/ 82 | - run: 83 | name: 'Integration Tests' 84 | command: | 85 | source tap-tester.env 86 | mkdir /tmp/${CIRCLE_PROJECT_REPONAME} 87 | export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} 88 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 89 | pip install pymongo==4.4.0 90 | run-test --tap=tap-mongodb tests 91 | - run: 92 | name: 'Get Curl' 93 | command: | 94 | apt update 95 | apt install -y curl 96 | - slack/notify-on-failure: 97 | only_for_branches: master 98 | - store_artifacts: 99 | path: /tmp/tap-mongodb 100 | build_mongo_5_0: 101 | executor: tap_tester_mongo_5_0 102 | steps: 103 | - checkout 104 | - run: 105 | name: 'Install Dockerize' 106 | command: | 107 | wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 108 | tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 109 | rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 110 | environment: 111 | DOCKERIZE_VERSION: v0.3.0 112 | - run: 113 | name: 'Wait for Mongo' 114 | command: | 115 | dockerize -wait tcp://127.0.0.1:27017 -timeout 1m 116 | sleep 10 117 | - run: 118 | name: 'Setup Mongo' 119 | command: | 120 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox tap-tester.env 121 | source tap-tester.env 122 | wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | apt-key add - 123 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/5.0 multiverse" \ 124 | | tee /etc/apt/sources.list.d/mongodb-org-5.0.list 125 | apt-get update 126 | apt-get install -y mongodb-org-shell mongodb-mongosh mongodb-org 127 | mongosh -u $TAP_MONGODB_USER \ 128 | -p $TAP_MONGODB_PASSWORD \ 129 | --authenticationDatabase admin \ 130 | --eval "rs.initiate({_id: \"rs0\", members: [{_id: 0, host: \"$TAP_MONGODB_HOST:$TAP_MONGODB_PORT\"}]})" 131 | - run: 132 | name: 'Setup virtual env' 133 | command: | 134 | pyenv local 3.9.6 135 | python3 -mvenv /usr/local/share/virtualenvs/tap-mongodb 136 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 137 | pip install -U 'pip==23.2' 'setuptools==68.0.0' 138 | pip install .[dev] 139 | - run: 140 | name: 'pylint' 141 | command: | 142 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 143 | make test 144 | - run: 145 | name: "Unit Tests" 146 | command: | 147 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 148 | pip install pymongo==4.4.0 nose2 149 | nose2 -v -s tests/unittests/ 150 | - run: 151 | name: 'Integration Tests' 152 | command: | 153 | source 
tap-tester.env 154 | mkdir /tmp/${CIRCLE_PROJECT_REPONAME} 155 | export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} 156 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 157 | pip install pymongo==4.4.0 158 | run-test --tap=tap-mongodb tests 159 | - run: 160 | name: 'Get Curl' 161 | command: | 162 | apt update 163 | apt install -y curl 164 | - slack/notify-on-failure: 165 | only_for_branches: master 166 | - store_artifacts: 167 | path: /tmp/tap-mongodb 168 | build_mongo_6_0: 169 | executor: tap_tester_mongo_6_0 170 | steps: 171 | - checkout 172 | - run: 173 | name: 'Install Dockerize' 174 | command: | 175 | wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 176 | tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 177 | rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 178 | environment: 179 | DOCKERIZE_VERSION: v0.3.0 180 | - run: 181 | name: 'Wait for Mongo' 182 | command: | 183 | dockerize -wait tcp://127.0.0.1:27017 -timeout 1m 184 | sleep 10 185 | - run: 186 | name: 'Setup Mongo' 187 | command: | 188 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox tap-tester.env 189 | source tap-tester.env 190 | wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | apt-key add - 191 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/6.0 multiverse" \ 192 | | tee /etc/apt/sources.list.d/mongodb-org-6.0.list 193 | apt-get update 194 | apt-get install -y mongodb-org-shell mongodb-mongosh mongodb-org 195 | mongosh -u $TAP_MONGODB_USER \ 196 | -p $TAP_MONGODB_PASSWORD \ 197 | --authenticationDatabase admin \ 198 | --eval "rs.initiate({_id: \"rs0\", members: [{_id: 0, host: \"$TAP_MONGODB_HOST:$TAP_MONGODB_PORT\"}]})" 199 | - run: 200 | name: 'Setup virtual env' 201 | command: | 202 | pyenv local 3.9.6 203 | python3 -mvenv /usr/local/share/virtualenvs/tap-mongodb 204 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 205 | pip install -U 'pip==23.2' 'setuptools==68.0.0' 206 | pip install .[dev] 207 | - run: 208 | name: 'pylint' 209 | command: | 210 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 211 | make test 212 | - run: 213 | name: "Unit Tests" 214 | command: | 215 | source /usr/local/share/virtualenvs/tap-mongodb/bin/activate 216 | pip install pymongo==4.4.0 nose2 217 | nose2 -v -s tests/unittests/ 218 | - run: 219 | name: 'Integration Tests' 220 | command: | 221 | source tap-tester.env 222 | mkdir /tmp/${CIRCLE_PROJECT_REPONAME} 223 | export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} 224 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 225 | pip install pymongo==4.4.0 226 | run-test --tap=tap-mongodb tests 227 | - run: 228 | name: 'Get Curl' 229 | command: | 230 | apt update 231 | apt install -y curl 232 | - slack/notify-on-failure: 233 | only_for_branches: master 234 | - store_artifacts: 235 | path: /tmp/tap-mongodb 236 | 237 | workflows: 238 | version: 2 239 | commit: &commit_jobs 240 | jobs: 241 | - build_mongo_5_0: 242 | context: 243 | - circleci-user 244 | - tier-1-tap-user 245 | requires: 246 | - build 247 | - build_mongo_6_0: 248 | context: 249 | - circleci-user 250 | - tier-1-tap-user 251 | requires: 252 | - build_mongo_5_0 253 | - build: 254 | context: 255 | - circleci-user 256 | - tier-1-tap-user 257 | 258 | build_daily: 259 | <<: *commit_jobs 260 | triggers: 261 | - schedule: 262 | cron: "0 1 * * *" 263 | filters: 264 | branches: 265 
| only: 266 | - master 267 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description of change 2 | (write a short description here or paste a link to JIRA) 3 | 4 | # QA steps 5 | - [ ] automated tests passing 6 | - [ ] manual qa steps passing (list below) 7 | 8 | # Risks 9 | 10 | # Rollback steps 11 | - revert this branch 12 | 13 | #### AI generated code 14 | https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code 15 | - [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Emacs 104 | .tramp_history 105 | 106 | config.json 107 | state.json 108 | properties.json 109 | catalog.json 110 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 3.1.4 4 | * Update pymongo to 4.10.1 [#121](https://github.com/singer-io/tap-mongodb/pull/121) 5 | 6 | ## 3.1.3 7 | * Remove deprecated terminaltables dependency [#120](https://github.com/singer-io/tap-mongodb/pull/120) 8 | 9 | ## 3.1.2 10 | * Allows to specify string values as projection [#94](https://github.com/singer-io/tap-mongodb/pull/94) 11 | 12 | ## 3.1.1 13 | * Gracefully fallback to not using a session if sessions are not supported by the mongo server [#112](https://github.com/singer-io/tap-mongodb/pull/112) 14 | 15 | ## 3.1.0 16 | * Updates to run on python 3.11.7 [#111](https://github.com/singer-io/tap-mongodb/pull/111) 17 | 18 | ## 3.0.3 19 | * Refresh the session every 10 
minutes during oplog queries [#110](https://github.com/singer-io/tap-mongodb/pull/110) 20 | 21 | ## 3.0.2 22 | * Fix an issue with connection params when SSL is used [#107](https://github.com/singer-io/tap-mongodb/pull/107) 23 | 24 | ## 3.0.1 25 | * Fix issue with SSH tunnel connections by connecting directly to a MongoDB node instead of allowing PyMongo to automatically discover replica sets [#105](https://github.com/singer-io/tap-mongodb/pull/105) 26 | 27 | ## 3.0.0 28 | * Upgrade PyMongo to v4.3+ [#99](https://github.com/singer-io/tap-mongodb/pull/99) 29 | * Fix uuid transformation [#100](https://github.com/singer-io/tap-mongodb/pull/100) 30 | * Fix empty projection [#102](https://github.com/singer-io/tap-mongodb/pull/102) 31 | 32 | ## 2.1.3 33 | * Fix a bug in Full Table sync that caused a sync to fail if document contained invalid BSON[#95](https://github.com/singer-io/tap-mongodb/pull/95) 34 | 35 | ## 2.1.2 36 | * Update pymongo to v3.12.3 [#81](https://github.com/singer-io/tap-mongodb/pull/81) 37 | 38 | ## 2.1.1 39 | * Fix bug in oplog bookmarking where the bookmark would not advance due to fencepost querying finding a single record [#80](https://github.com/singer-io/tap-mongodb/pull/80) 40 | 41 | ## 2.1.0 42 | * Optimize oplog extractions to only query for the selected tables [#78](https://github.com/singer-io/tap-mongodb/pull/78) 43 | 44 | ## 2.0.1 45 | * Modify `get_databases` function to return a unique list of databases [#58](https://github.com/singer-io/tap-mongodb/pull/58) 46 | 47 | ## 2.0.0 48 | * Build and write schema messages [#40](https://github.com/singer-io/tap-mongodb/pull/40) The main changes are: 49 | 1. date-time fields will have a `"type": "string", "format": "date-time"` schema that will cause them to get loaded as date-times instead of strings 50 | 2. decimal fields will have a `"type": "number", "multipleOf": 1e-34` schema written 51 | 3. double fields will have a `"type": "number"` schema written that should prevent them from splitting between doubles/decimals depending on the precision 52 | 53 | ## 1.1.0 54 | * Add optional `verify_mode` config value to replace the assumptions in version 1.0.4 [#38](https://github.com/singer-io/tap-mongodb/pull/38) 55 | 56 | ## 1.0.4 57 | * Add support for turning off ssl cert validation when using a ssh tunnel [#36](https://github.com/singer-io/tap-mongodb/pull/36) 58 | 59 | ## 1.0.3 60 | * Add support for floats as replication keys [#34](https://github.com/singer-io/tap-mongodb/pull/34) 61 | 62 | ## 1.0.2 63 | * Add support for DBRefs [#32](https://github.com/singer-io/tap-mongodb/pull/32) 64 | 65 | ## 1.0.1 66 | * Discover collections in the `admin` database and add support for `Int64` as a replication key type [#30](https://github.com/singer-io/tap-mongodb/pull/30) 67 | 68 | ## 1.0.0 69 | * Release out of Beta [#29](https://github.com/singer-io/tap-mongodb/pull/29) 70 | 71 | ## 0.3.0 72 | * Add support for UUID types in replication keys and records [#27](https://github.com/singer-io/tap-mongodb/pull/27) 73 | 74 | ## 0.2.2 75 | * Improve invalid datetime handling [#25](https://github.com/singer-io/tap-mongodb/pull/25) 76 | 77 | ## 0.2.1 78 | * Clear stream state if replication method changes [#24](https://github.com/singer-io/tap-mongodb/pull/24) 79 | 80 | ## 0.2.0 81 | * Improve Oplog query performance by using only a timestamp and the `oplog_replay` arg. 
[#23](https://github.com/singer-io/tap-mongodb/pull/23) 82 | 83 | ## 0.1.11 84 | * Only bookmark latest ts on first sync for oplog [#22](https://github.com/singer-io/tap-mongodb/pull/22) 85 | 86 | ## 0.1.10 87 | * Fix for additional empty string projections [#21](https://github.com/singer-io/tap-mongodb/pull/21) 88 | 89 | ## 0.1.9 90 | * Make tap robust against projection that is empty string 91 | * Actually respect `INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME` prop 92 | * [#20](https://github.com/singer-io/tap-mongodb/pull/20) 93 | 94 | ## 0.1.8 95 | * Prefer secondary when connecting to Mongo [#19](https://github.com/singer-io/tap-mongodb/pull/19) 96 | 97 | ## 0.1.7 98 | * Full Table syncs can handle empty collections [#18](https://github.com/singer-io/tap-mongodb/pull/18) 99 | 100 | ## 0.1.6 101 | * Fix a bug with supporting bookmarks of ObjectId [#17](https://github.com/singer-io/tap-mongodb/pull/17) 102 | 103 | ## 0.1.5 104 | * Check for cases when the Oplog may have aged out and execute a full resync [#16](https://github.com/singer-io/tap-mongodb/pull/16) 105 | 106 | ## 0.1.4 107 | * Get global oplog timestamp instead of collection-specific [#15](https://github.com/singer-io/tap-mongodb/pull/15) 108 | 109 | ## 0.1.3 110 | * Support several new types for the `_id` column aside from ObjectID [#14](https://github.com/singer-io/tap-mongodb/pull/14) 111 | 112 | ## 0.1.2 113 | * Encode bytes back to base64 strings as we do not know the encodings [#13](https://github.com/singer-io/tap-mongodb/pull/13) 114 | 115 | ## 0.1.1 116 | * During key-based incremental sync, if replication-key changes, wipe state and resync table [#10](https://github.com/singer-io/tap-mongodb/pull/10) 117 | * Only support replication keys of types `datetime`, `timestamp`, `integer`, `ObjectId` [#10](https://github.com/singer-io/tap-mongodb/pull/10) 118 | * Only discover databases the user has read access for [#11](https://github.com/singer-io/tap-mongodb/pull/11) 119 | 120 | ## 0.1.0 121 | * Added key-based incremental sync [commit](https://github.com/singer-io/tap-mongodb/commit/b618b11d91e111680f70b402c6e94c9bf40c7b8f) 122 | 123 | ## 0.0.5 124 | * Fixed bug in oplog projections [commit](https://github.com/singer-io/tap-mongodb/commit/b400836678440499d4a15fb7d5b0a40a13e3342e) 125 | 126 | ## 0.0.4 127 | * Fixed bug in oplog projections [commit](https://github.com/singer-io/tap-mongodb/commit/527287e69661e9dbce3f05696b269025d0fc4034) 128 | * Added metric log printout at end of tap run [commit](https://github.com/singer-io/tap-mongodb/commit/d0403d82028b1dcc9ba306b52b2103ef00188b7d) 129 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := test 2 | 3 | test: 4 | pylint tap_mongodb tap_mongodb/sync_strategies -d missing-docstring,fixme,duplicate-code,line-too-long,too-many-statements,too-many-locals,consider-using-f-string,consider-using-from-import,broad-exception-raised,superfluous-parens,consider-using-generator,use-yield-from 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tap-mongodb 2 | 3 | This is a [Singer](https://singer.io) tap that produces JSON-formatted data following the [Singer spec](https://github.com/singer-io/getting-started/blob/master/SPEC.md) from a MongoDB source. 
4 | 5 | ## Set up Virtual Environment 6 | ``` 7 | python3 -m venv ~/.virtualenvs/tap-mongodb 8 | source ~/.virtualenvs/tap-mongodb/bin/activate 9 | ``` 10 | 11 | ## Install tap 12 | ``` 13 | pip install -U pip setuptools 14 | pip install tap-mongodb 15 | ``` 16 | 17 | ## Set up Config file 18 | Create a json file called `config.json`, with the following contents: 19 | ``` 20 | { 21 | "password": "", 22 | "user": "", 23 | "host": "", 24 | "port": "", 25 | "database": "" 26 | } 27 | ``` 28 | The following parameters are optional for your config file: 29 | 30 | | Name | Type | Description | 31 | | -----|------|------------ | 32 | | `replica_set` | string | name of replica set | 33 | |`ssl` | Boolean | can be set to true to connect using ssl | 34 | | `include_schema_in_destination_stream_name` | Boolean | forces the stream names to take the form `<database_name>_<collection_name>` instead of `<collection_name>`| 35 | 36 | All of the attributes in the config example above are required by the tap to connect to your mongo instance. 37 | 38 | ## Run in discovery mode 39 | Run the following command and redirect the output into the catalog file: 40 | ``` 41 | tap-mongodb --config ~/config.json --discover > ~/catalog.json 42 | ``` 43 | 44 | Your catalog file should now look like this: 45 | ``` 46 | { 47 | "streams": [ 48 | { 49 | "table_name": "", 50 | "tap_stream_id": "", 51 | "metadata": [ 52 | { 53 | "breadcrumb": [], 54 | "metadata": { 55 | "row-count":, 56 | "is-view": , 57 | "database-name": "", 58 | "table-key-properties": [ 59 | "_id" 60 | ], 61 | "valid-replication-keys": [ 62 | "_id" 63 | ] 64 | } 65 | } 66 | ], 67 | "stream": "", 68 | "schema": { 69 | "type": "object" 70 | } 71 | } 72 | ] 73 | } 74 | ``` 75 | 76 | ## Edit Catalog file 77 | ### Using valid json, edit the catalog.json file 78 | To select a stream, add the following to the stream's metadata: 79 | ``` 80 | "selected": true, 81 | "replication-method": , 82 | ``` 83 | 84 | The replication method must be either `FULL_TABLE` or `LOG_BASED` 85 | 86 | To add a projection to a stream, add the following to the stream's metadata field: 87 | ``` 88 | "tap-mongodb.projection": 89 | ``` 90 | 91 | For example, if you were to edit the example stream to select the stream as well as add a projection, catalog.json should look like this: 92 | ``` 93 | { 94 | "streams": [ 95 | { 96 | "table_name": "
", 97 | "tap_stream_id": "", 98 | "metadata": [ 99 | { 100 | "breadcrumb": [], 101 | "metadata": { 102 | "row-count": , 103 | "is-view": , 104 | "database-name": "", 105 | "table-key-properties": [ 106 | "_id" 107 | ], 108 | "valid-replication-keys": [ 109 | "_id" 110 | ], 111 | "selected": true, 112 | "replication-method": "", 113 | "tap-mongodb.projection": "" 114 | } 115 | } 116 | ], 117 | "stream": "", 118 | "schema": { 119 | "type": "object" 120 | } 121 | } 122 | ] 123 | } 124 | 125 | ``` 126 | ## Run in sync mode: 127 | `tap-mongodb --config ~/config.json --catalog ~/catalog.json` 128 | 129 | The tap will write bookmarks to stdout which can be captured and passed as an optional `--state state.json` parameter to the tap for the next sync. 130 | 131 | ## Supplemental MongoDB Info 132 | 133 | ### Local MongoDB Setup 134 | If you haven't yet set up a local mongodb client, follow [these instructions](https://github.com/singer-io/tap-mongodb/blob/master/spikes/local_mongo_setup.md) 135 | 136 | --- 137 | 138 | Copyright © 2019 Stitch 139 | -------------------------------------------------------------------------------- /bin/populate_test_database.py: -------------------------------------------------------------------------------- 1 | import pymongo # requires dnspython package as well 2 | import sys 3 | import bson 4 | import datetime 5 | import re 6 | import pprint 7 | import time 8 | import decimal 9 | import string 10 | import random 11 | 12 | 13 | 14 | 15 | #------ Local mongo server ------ 16 | username = sys.argv[1] 17 | password = sys.argv[2] 18 | host= '127.0.0.1' 19 | auth_source = 'test' 20 | ssl = False 21 | client = pymongo.MongoClient(host=host, username=username, password=password, port=27017, authSource=auth_source, ssl=ssl) 22 | 23 | databases = { 24 | "simple_db": ["simple_coll_1", "simple_coll_2"], 25 | "datatype_db": ["datatype_coll_1", "datatype_coll_2"], 26 | } 27 | 28 | 29 | ############# Drop all dbs/collections ############# 30 | for db_name, colls in databases.items(): 31 | for coll_name in colls: 32 | print("---- Dropping database: " + db_name + ", collection: " + coll_name + " ----") 33 | client[db_name][coll_name].drop() 34 | 35 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 36 | return ''.join(random.choice(chars) for x in range(size)) 37 | 38 | def generate_simple_coll_docs(num_docs): 39 | docs = [] 40 | for int_value in range(num_docs): 41 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 42 | return docs 43 | 44 | 45 | ############# Add simple collections ############# 46 | # simple_coll_1 has 50 documents 47 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 48 | 49 | # simple_coll_2 has 100 documents 50 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 51 | 52 | 53 | ############# Add datatype collections ############# 54 | pattern = re.compile('.*') 55 | regex = bson.Regex.from_native(pattern) 56 | regex.flags ^= re.UNICODE 57 | 58 | datatype_doc = { 59 | "double_field": 4.3, 60 | "string_field": "a sample string", 61 | "object_field" : { 62 | "obj_field_1_key": "obj_field_1_val", 63 | "obj_field_2_key": "obj_field_2_val" 64 | }, 65 | "array_field" : [ 66 | "array_item_1", 67 | "array_item_2", 68 | "array_item_3" 69 | ], 70 | "binary_data_field" : b"a binary string", 71 | "object_id_field": bson.objectid.ObjectId(b'123456789123'), 72 | "boolean_field" : True, 73 | "date_field" : datetime.datetime.now(), 74 | "null_field": 
None, 75 | "regex_field" : regex, 76 | "32_bit_integer_field" : 32, 77 | "timestamp_field" : bson.timestamp.Timestamp(int(time.time()), 1), 78 | "64_bit_integer_field" : 34359738368, 79 | "decimal_field" : bson.Decimal128(decimal.Decimal('1.34')), 80 | "javaScript_field" : bson.code.Code("var x, y, z;"), 81 | "javaScript_with_scope_field" : bson.code.Code("function incrementX() { x++; }", scope={"x": 1}), 82 | "min_key_field" : bson.min_key.MinKey(), # MinKey/MaxKey must be instances, not the classes, to be BSON-encodable 83 | "max_key_field" : bson.max_key.MaxKey() 84 | } 85 | 86 | client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc) 87 | client["datatype_db"]["datatype_coll_2"].insert_one(datatype_doc) 88 | 89 | print("\nPrinting database contents") 90 | for db_name in client.list_database_names(): 91 | if db_name in ['admin', 'config', 'local']: 92 | continue 93 | for collection_name in client[db_name].list_collection_names(): 94 | print('\n---- Database: '+ db_name +', Collection: ' + collection_name + " ----") 95 | for doc in client[db_name][collection_name].find(): 96 | print(doc) 97 | 98 | 99 | -------------------------------------------------------------------------------- /bin/test-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | import subprocess 6 | import time 7 | from argparse import RawTextHelpFormatter 8 | 9 | # singerio images have required keyfile 10 | image_name = "singerio/mongo" 11 | 12 | # organize command options based on image_tag 13 | command_opts = { # top level keys = supported image_tag versions, values = shell 14 | '4.2-bionic': 'mongo', # version 5.0.15 TODO Remove? 15 | '4.4-bionic': 'mongo', # version 4.4.6, also supports mongosh? 16 | '5.0': 'mongosh', # version 5.0.15, also supports mongo 17 | '6.0': 'mongosh', # version 6.0.4 18 | } 19 | 20 | def start_container(name, image_tag): 21 | 22 | START_COMMAND = """ 23 | sudo docker run -e "MONGO_INITDB_ROOT_USERNAME={0}" -e "MONGO_INITDB_ROOT_PASSWORD={1}" \ 24 | -p {2}:{2} --name {3} \ 25 | -d {4}:{5} \ 26 | --auth \ 27 | --keyFile /opt/mongo/keyfile --replSet rs0 28 | """.format(os.getenv('TAP_MONGODB_USER'), 29 | os.getenv('TAP_MONGODB_PASSWORD'), 30 | os.getenv('TAP_MONGODB_PORT'), 31 | name, 32 | image_name, 33 | image_tag) 34 | 35 | print("Starting Docker process mongo1 using command: {}".format(START_COMMAND)) 36 | 37 | proc = subprocess.run(START_COMMAND, shell=True) 38 | if proc.returncode != 0: 39 | sys.exit("Exited with code: {}, the docker process failed to start.".format(proc.returncode)) 40 | print("Process started successfully. 
Starting Oplog replication.") 41 | 42 | # Sleeping to allow Mongo enough time to start up 43 | time.sleep(5) 44 | 45 | ip_addr = get_ip_addr(name) 46 | # If using image_version <=4.4.0-bionic use mongo, not mongosh 47 | CONFIGURE_COMMAND = """ 48 | docker exec {} {} --host {} test -u {} -p {} --authenticationDatabase admin --eval {} 49 | """.format( 50 | name, 51 | command_opts[image_tag], 52 | ip_addr, 53 | os.getenv('TAP_MONGODB_USER'), 54 | os.getenv('TAP_MONGODB_PASSWORD'), 55 | '\'rs.initiate({_id: "rs0", members: [{_id: 0, host: "127.0.0.1:27017"}]})\'') 56 | print("Initiate replSet using: {}".format(CONFIGURE_COMMAND)) 57 | proc = subprocess.run(CONFIGURE_COMMAND, shell=True) 58 | if proc.returncode != 0: 59 | sys.exit("Exited with code: {}, the docker command failed.".format(proc.returncode)) 60 | print("Oplog configured correctly.") 61 | 62 | def get_ip_addr(name): 63 | IP_ADDR_COMMAND = "docker inspect {} | jq -r .[].NetworkSettings.IPAddress" 64 | print("Retrieving IP addr of mongodb container") 65 | ip_addr = subprocess.check_output(IP_ADDR_COMMAND.format(name), shell=True).decode('utf-8').rstrip() 66 | print(ip_addr) 67 | return ip_addr 68 | 69 | def stop_container(name): 70 | STOP_COMMAND = "sudo docker stop {0} && sudo docker rm {0}" 71 | 72 | print("Stopping Docker process {}".format(name)) 73 | proc = subprocess.run(STOP_COMMAND.format(name), shell=True) 74 | if proc.returncode != 0: 75 | sys.exit("Exited with code: {}, the docker process failed to stop.".format(proc.returncode)) 76 | print("Process stopped successfully") 77 | 78 | def connect_to_db(name, image_tag): 79 | CONNECT_COMMAND = "docker run -it --rm {}:{} {} --host {} test -u {} -p {} --authenticationDatabase admin" 80 | ip_addr = get_ip_addr(name) 81 | 82 | print("Attempting to connect to running container using a mongo container") 83 | # Note: Shell is determined based on user provided image_tag, connect may fail if the shell 84 | # associated with the user provided image_tag is not supported by the running DB version. 85 | connect_command_format = CONNECT_COMMAND.format(image_name, 86 | image_tag, 87 | command_opts[image_tag], 88 | ip_addr, 89 | os.getenv('TAP_MONGODB_USER'), 90 | os.getenv('TAP_MONGODB_PASSWORD')) 91 | print(connect_command_format) 92 | # NB: Using call instead of run here because it is blocking 93 | # This returns only an exit code. 94 | returncode = subprocess.call(connect_command_format, 95 | shell=True) 96 | if returncode != 0: 97 | sys.exit("Exited with code: {}, could not connect.".format(returncode)) 98 | 99 | DESCRIPTION = """ 100 | Manage docker instance for tap-mongodb testing. 
101 | 102 | Uses environment variables: 103 | TAP_MONGODB_USER 104 | TAP_MONGODB_PASSWORD 105 | """ 106 | parser = argparse.ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 107 | parser.add_argument('action', choices=['start','stop', 'connect'], help='action to perform with the container') 108 | parser.add_argument('--name', help="name assigned to running docker process", default='mongo1') 109 | parser.add_argument('--image-tag', choices=command_opts.keys(), help='Supported image tags, default=6.0', default='6.0') 110 | 111 | def main(): 112 | parsed_args = parser.parse_args() 113 | # Potential arguments to add: pull, changing docker cointainer, changing password 114 | if parsed_args.action == 'start': 115 | start_container(parsed_args.name, parsed_args.image_tag) 116 | elif parsed_args.action == 'stop': 117 | stop_container(parsed_args.name) 118 | elif parsed_args.action == 'connect': 119 | connect_to_db(parsed_args.name, parsed_args.image_tag) 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='tap-mongodb', 6 | version='3.1.4', 7 | description='Singer.io tap for extracting data from MongoDB', 8 | author='Stitch', 9 | url='https://singer.io', 10 | classifiers=['Programming Language :: Python :: 3 :: Only'], 11 | py_modules=['tap_mongodb'], 12 | install_requires=[ 13 | 'singer-python==6.0.0', 14 | 'pymongo==4.10.1', 15 | 'tzlocal==2.0.0', 16 | ], 17 | extras_require={ 18 | 'dev': [ 19 | 'pylint', 20 | 'nose2', 21 | 'ipdb' 22 | ] 23 | }, 24 | entry_points=''' 25 | [console_scripts] 26 | tap-mongodb=tap_mongodb:main 27 | ''', 28 | packages=['tap_mongodb', 'tap_mongodb.sync_strategies'], 29 | 30 | ) 31 | -------------------------------------------------------------------------------- /spikes/atlas_setup.md: -------------------------------------------------------------------------------- 1 | # MongoDB Atlas Spike 2 | Atlas is MongoDB's cloud service that can be hosted on aws. We already have an Atlas account that can be accessed by: 3 | 1. Go to https://cloud.mongodb.com 4 | 2. 
Creds in 1pass 5 | 6 | ### Cluster Tiers 7 | | Tier | RAM | Storage | vCPU | Price/month | 8 | | ---- | --- | ------- | ---- | ---------- | 9 | | M0 | Shared | 512 MB | Shared | Free | 10 | | M2 | Shared | 2 GB | Shared | $9 | 11 | | M5 | Shared | 5 GB | Shared | $25 | 12 | | M10 | 2 GB | 10 GB | 1 vCPU | ~$58 | 13 | | M20 | 4 GB | 20 GB | 2 vCPU | ~$144 | 14 | 15 | M0, M2 shared, and M5 shared tiers have limitations 16 | - Each account can only have one M0 cluster 17 | - Can only use Mongo version 4.0 18 | - Cannot configure memory or storage size 19 | 20 | ### Possible Setups 21 | - Have one cluster with a separate db for dev/circle 22 | - Atlas allows, at a minimum, up to 100 connections 23 | - Have one cluster for dev, one for circle 24 | - Use projects -- Atlas projects are meant for separate, isolated environments 25 | 26 | Suggest having a separate project for dev & circle 27 | - each having a single cluster 28 | - https://docs.atlas.mongodb.com/tutorial/manage-projects/ 29 | - can add more if we start running into collisions (dev, circle, dev-tap-tester, harrison-tap-tester, etc) 30 | 31 | ### Connecting to Atlas 32 | - Download mongo community edition 33 | - [Installation Instructions](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/) 34 | - Installs mongo shell and a number of other modules 35 | 36 | #### Using Mongo Shell 37 | To connect to our M0 free tier cluster (username/password in 1Pass): 38 | ``` 39 | mongo mongodb+srv://stitch-upwjw.mongodb.net/test -u -p 40 | ``` 41 | 42 | ## Other Options Considered 43 | - Set up local mongo server 44 | - Seems like a lot more overhead, especially when using circle 45 | - Set up Mongo Stack on AWS (spins up EC2 instance with Mongo) 46 | - Much more expensive (~$12,000/year!) 47 | 48 | -------------------------------------------------------------------------------- /spikes/dbreps_and_hackathon_review.md: -------------------------------------------------------------------------------- 1 | # Review of dbreps and Chris C's hackathon project 2 | 3 | ## dbreps 4 | [sync_table](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj) 5 | file and 6 | [sync_structure](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj) 7 | file for mongo 8 | 9 | ### notes 10 | - retrieves fields to decide which fields are "bookmarkable" [get-index code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj#L29)[bookmarkable code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj#L39-L50) 11 | - retrieves row count for each collection [code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_structure/mongodb.clj#L70) 12 | - when opening cursor 13 | - specifies `QUERYOPTION_SLAVEOK` [code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj#L47) 14 | - when turned on, read queries will be directed to slave servers instead of the primary server 15 | - specifies `batch_size` 
[code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj#L68) 16 | - Uses the max of (2, 16 MB), and calls getDynamicFetchSize, so will ultimately set fetch size to 8 MB 17 | - pymongo uses a default of 1 MB, may want to look into changing this for efficiency 18 | - uses projections [code](https://github.com/stitchdata/db-replicators/blob/3764f905a76952324c9f9b8ff8e1545fe9cd8113/src/com/rjmetrics/dbreplicator/worker/methods/sync_table/mongodb.clj#L64-L71) 19 | 20 | ## Hackathon 21 | - Supports op-log and full-table rep 22 | - Client accepts authsource (db name) which defaults to 'admin' 23 | - we did not do this in our spike, should do it in tap 24 | - Discovery 25 | - ignores 26 | - dbs = ['admin', 'system', 'local'] 27 | - collections = ['system.indexes'] 28 | - does not discover fields, only writes `database-name` and `row-count` metadata 29 | - 'schema': { 30 | 'type': 'object' 31 | } 32 | - Sync 33 | - Prioritizes streams by: 34 | - Currently Syncing 35 | - Streams without state 36 | - streams with state 37 | - Non-oplog streams 38 | - Uses `custom-select-clause` metadata for a stream to get the select statement 39 | - streams that don't have this are skipped 40 | - whitelisting is done post select, we should use projections instead of this 41 | - oplog streams 42 | - works similar to other db taps 43 | - whitelisting again performed post select 44 | - Generally seems like a good starting point for our tap 45 | 46 | 47 | -------------------------------------------------------------------------------- /spikes/local_mongo_setup.md: -------------------------------------------------------------------------------- 1 | # Local MongoDB Setup 2 | 3 | ### Install MongoDB Community Edition 4 | Follow MongoDB Manual directions to install MongoDB Community Edition on ubuntu [[3.2](https://docs.mongodb.com/v3.2/tutorial/install-mongodb-on-ubuntu/), [3.4](https://docs.mongodb.com/v3.4/tutorial/install-mongodb-on-ubuntu/), [3.6](https://docs.mongodb.com/v3.6/tutorial/install-mongodb-on-ubuntu/), [4.0](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/)] 5 | 6 | ### Add users, roles, authentication 7 | Follow steps 1-5 of these [instructions](https://docs.mongodb.com/manual/tutorial/enable-authentication/) to add a user administrator 8 | 9 | After step 5, run the following commands to create: 10 | - a user `stitch_root` that can enable oplog 11 | 12 | ``` 13 | use admin 14 | db.createUser( 15 | { 16 | user: "stitch_root", 17 | pwd: "", 18 | roles: [{role: "root", db: "admin"}] 19 | } 20 | ) 21 | ``` 22 | 23 | - a user `stitch_dev` that can create/read/write new dbs and access oplog 24 | 25 | ``` 26 | use test 27 | db.createUser( 28 | { 29 | user: "stitch_dev", 30 | pwd: "", 31 | roles: [ { role: "readWriteAnyDatabase", db: "admin" }, {role: "read", db: "local"} ] 32 | } 33 | ) 34 | ``` 35 | 36 | ### Enable Oplog 37 | 1. Edit `/etc/mongod.conf` and add a replica set: 38 | 39 | Assume superuser: 40 | ``` 41 | sudo su 42 | ``` 43 | 44 | Uncomment replication and add `replSetName` (indented) in `/etc/mongod.conf`: 45 | ``` 46 | replication: 47 | replSetName: rs0 48 | ``` 49 | 50 | Return to normal user with `C-d` 51 | 52 | 2. Restart mongod and pass it the --config flag: 53 | ``` 54 | sudo mongod --auth --config /etc/mongod.conf 55 | ``` 56 | 57 | 3. 
Initiate replica set 58 | 59 | Connect to shell as `stitch_root` user: 60 | 61 | ``` 62 | mongo --port 27017 -u stitch_root -p --authenticationDatabase admin 63 | ``` 64 | 65 | and initiate replica set: 66 | ``` 67 | rs.initiate({_id: "rs0", members: [{_id: 0, host: "127.0.0.1:27017"}]}) 68 | ``` 69 | 70 | 4. Check out that oplog 71 | 72 | Disconnect from shell and reconnect as `stitch_dev` user; 73 | 74 | ``` 75 | mongo --port 27017 -u stitch_dev -p --authenticationDatabase test 76 | ``` 77 | 78 | switch to local 79 | ``` 80 | use local 81 | ``` 82 | 83 | view oplog rows 84 | ``` 85 | db.oplog.rs.find() 86 | ``` 87 | 88 | ### Connect with shell 89 | Can now connect to Mongo via the mongo shell with: 90 | ``` 91 | mongo --host localhost --port 27017 --authenticationDatabase --username --password 92 | ``` 93 | -------------------------------------------------------------------------------- /spikes/pymongo_spike.py: -------------------------------------------------------------------------------- 1 | import pymongo # requires dnspython package as well 2 | import sys 3 | 4 | #------------------------ Setup Client ------------------------ 5 | 6 | #----- Atlas using connection string ----- 7 | #username = sys.argv[1] 8 | #password = sys.argv[2] 9 | #host = 'stitch-upwjw.mongodb.net' 10 | # connection_string = "mongodb+srv://{}:{}@{}/test".format(username, password, host) 11 | # client = pymongo.MongoClient(connection_string) 12 | 13 | #----- Atlas using connection props ----- 14 | # username = sys.argv[1] 15 | # password = sys.argv[2] 16 | # host=['stitch-shard-00-00-upwjw.mongodb.net', 17 | # 'stitch-shard-00-01-upwjw.mongodb.net', 18 | # 'stitch-shard-00-02-upwjw.mongodb.net'] 19 | # ssl = True # client must have ssl=True to connect to atlas cluster 20 | # client = pymongo.MongoClient(host=host, username=username, password=password, port=27017, ssl=True) 21 | 22 | #------ Local mongo server ------ 23 | username = sys.argv[1] 24 | password = sys.argv[2] 25 | host= '127.0.0.1' 26 | auth_source = 'test' 27 | ssl = False 28 | client = pymongo.MongoClient(host=host, username=username, password=password, port=27017, authSource=auth_source, ssl=ssl) 29 | 30 | # Get connection Info 31 | print("\nConnecting to MongoDB version " + client.server_info()['version']) 32 | 33 | # List dbs 34 | print("\nShowing Initial Databases...") 35 | print(client.list_database_names()) 36 | 37 | 38 | # Make db and collection 39 | # Note: MongoDB waits until you have created a collection (table), with at least one document (record) before it actually creates the database (and collection). 
40 | print("\nAdding database=spike_db and collection=sources_team_members...") 41 | spike_db = client["spike_db"] 42 | sources_team_members_coll = spike_db["sources_team_members"] 43 | 44 | # Add one document to collection 45 | print ("\nAdding nick to collection=sources_team_members...") 46 | sources_team_members_coll.insert_one({"name": "Nick", "membersince": 2018}) 47 | 48 | # Add multiple documents to collection 49 | print("\nAdding everyone else to collection=sources_team_members...") 50 | sources_team_members_coll.insert_many([{"name": "Jacob", "membersince": 2019, "my_object": {"nested_field": "some_value"}}, 51 | {"name": "Collin", "membersince": 2019}, 52 | {"name": "Dan", "membersince": 2017}, 53 | {"name": "Kyle", "membersince": 2016}, 54 | {"name": "Andy", "membersince": 2018}, 55 | {"name": "Brian", "membersince": 2014}, 56 | {"name": "Harrison", "membersince": 2018}]) 57 | 58 | 59 | print("\nShowing Databases...") 60 | print(client.list_database_names()) 61 | 62 | print("\nShowing collections in db=spike_db...") 63 | print(spike_db.list_collection_names()) 64 | 65 | print("\nShowing all documents in sources_team_members_coll...") 66 | for doc in sources_team_members_coll.find(): 67 | print(doc) 68 | 69 | print("\nShowing documents where membersince > 2016...") 70 | for doc in sources_team_members_coll.find({"membersince": {"$gt": 2016}}): 71 | print(doc) 72 | 73 | print("\nShow only name and id...") 74 | for doc in sources_team_members_coll.find({}, {"name": 1}): 75 | print(doc) 76 | 77 | print("\nShow only name...") 78 | for doc in sources_team_members_coll.find({}, {"name": 1, "_id": 0}): 79 | print(doc) 80 | 81 | print("\nUpdating Nick's membersince from 2017->2018...") 82 | update_result = sources_team_members_coll.update_one({"name": "Nick"}, {"$set": {"membersince": 2017}}) 83 | for doc in sources_team_members_coll.find(): 84 | print(doc) 85 | 86 | print("\nUpdating to add team field to all documents...") 87 | update_result = sources_team_members_coll.update_many({}, {"$set": {"team": "sources"}}) 88 | for doc in sources_team_members_coll.find(): 89 | print(doc) 90 | 91 | print("\nRemoving Harrison because he is NOT part of the team...") 92 | delete_result = sources_team_members_coll.delete_many({"name": "Harrison"}) 93 | for doc in sources_team_members_coll.find(): 94 | print(doc) 95 | 96 | oplog = client.local.oplog.rs 97 | first = oplog.find().sort('$natural', pymongo.ASCENDING).limit(-1).next() 98 | ts = first['ts'] 99 | 100 | should_print_oplog = True 101 | if should_print_oplog: 102 | print('\nPrinting oplog rows...') 103 | 104 | with client.local.oplog.rs.find({'ts': {'$gt': ts}}, 105 | oplog_replay=True) as cursor: 106 | for row in cursor: 107 | if row['op'] in ['i', 'u', 'd']: 108 | print({k: row[k] if row.get(k) else '' for k in ['o', 'o2', 'ns', 'op']}) 109 | 110 | 111 | print("\nDeleting the collection and database...") 112 | sources_team_members_coll.drop() 113 | 114 | print("\nShowing Databases...") 115 | print(client.list_database_names()) 116 | -------------------------------------------------------------------------------- /spikes/supported_versions_spike.md: -------------------------------------------------------------------------------- 1 | # tap-mongodb suppored versions & flavors spike 2 | 3 | ## Connecting to mongodb (shell and via pymongo) 4 | ### Mongo Shell 5 | Newer versions of the mongo shell should be backwards compatible for the 6 | commands we'll be running. 
Any new features (mostly helper stuff) 7 | introduced to the mongo shell won't work with previous versions of 8 | mongodb. 9 | 10 | ### PyMongo 11 | 12 | Mongo officially supports versions 3.4, 3.6, 4.0. They are ending support for 3.4 in Jan 2020 13 | 14 | According to the 15 | [Pymongo docs on compatibility](https://docs.mongodb.com/ecosystem/drivers/pymongo/#compatibility), 16 | pymongo version 3.7/3.8 supports 17 | - 4.0 18 | - 3.6 19 | - 3.4 20 | - 3.2 21 | - 3.0 22 | - 2.6 23 | 24 | **We believe this means that any major differences in the client mongo version should be handled by pymongo** 25 | 26 | ## Replica sets and sharded clusters 27 | - A replica set is a cluster of MongoDB servers that implements 28 | replication and automated failover. It is MongoDB's recommended replication 29 | strategy 30 | - With sharding, each shard contains a subset of sharded data for a 31 | sharded cluster. Together, the cluster's shards hold the entire data set 32 | for the cluster. 33 | - Users, clients, or applications should only directly connect to a 34 | shard to perform local administrative and maintenance operations. 35 | - As of MongoDB 3.6, shards must be deployed as a replica set to provide 36 | redundancy and high availability. 37 | - [Docs on shards](https://docs.mongodb.com/manual/core/sharded-cluster-shards/) 38 | - [Docs on replication](https://docs.mongodb.com/manual/replication/) 39 | - Basically, we should connect to the cluster, never to an individual 40 | shard/replica (see the connection sketch at the end of this doc) 41 | 42 | ## Mongo-as-a-service mLab? 43 | - mLab is "not accepting new customers" and migrating existing ones to 44 | Atlas (what we test with) 45 | - It looks like there was no difference though in the way you connected 46 | to it via shell/driver 47 | 48 | ## Test Instance versions 49 | 50 | For Atlas, the free version (and M2/M5 shared clusters) default to the 51 | latest version. You can choose the version for the M10 (paid) clusters, so 52 | we'll have the ability to spin up test clusters for different versions if 53 | we choose (and pay). 54 | 55 | We recommend testing on the latest version since we believe pymongo will 56 | handle version differences within the tap. If we uncover major bugs due to 57 | version differences, we can consider spinning up multiple clusters on 58 | different versions to test with. 
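As a quick illustration of the point above about connecting to the cluster rather than to an individual shard or replica, here is a minimal pymongo sketch. The cluster hostname is hypothetical and the credentials are assumed to come from environment variables; like `pymongo_spike.py`, a `mongodb+srv://` URI also needs the dnspython package installed.

```python
import os
from urllib.parse import quote_plus

import pymongo  # mongodb+srv URIs also require the dnspython package

# Hypothetical Atlas-style SRV hostname: the SRV record lets pymongo discover
# the replica set members / mongos routers itself, so the client never points
# at a single shard or replica directly.
uri = "mongodb+srv://{user}:{password}@cluster0.example.mongodb.net/test".format(
    user=quote_plus(os.environ["TAP_MONGODB_USER"]),
    password=quote_plus(os.environ["TAP_MONGODB_PASSWORD"]),
)

client = pymongo.MongoClient(uri)
print("Connected to MongoDB version " + client.server_info()["version"])
```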
59 | -------------------------------------------------------------------------------- /tap_mongodb/sync_strategies/full_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import copy 3 | import time 4 | import pymongo 5 | import singer 6 | from singer import metadata, utils 7 | import tap_mongodb.sync_strategies.common as common 8 | 9 | LOGGER = singer.get_logger() 10 | 11 | def get_max_id_value(collection, projection=None): 12 | if projection is None: 13 | row = collection.find_one(sort=[("_id", pymongo.DESCENDING)]) 14 | else: 15 | row = collection.find_one(sort=[("_id", pymongo.DESCENDING)], 16 | projection=projection) 17 | if row: 18 | return row['_id'] 19 | 20 | LOGGER.info("No max id found for collection: collection is likely empty") 21 | return None 22 | 23 | 24 | # pylint: disable=too-many-locals,invalid-name,too-many-statements 25 | def sync_collection(client, stream, state, projection): 26 | tap_stream_id = stream['tap_stream_id'] 27 | LOGGER.info('Starting full table sync for %s', tap_stream_id) 28 | 29 | md_map = metadata.to_map(stream['metadata']) 30 | database_name = metadata.get(md_map, (), 'database-name') 31 | 32 | db = client[database_name] 33 | collection = db[stream['stream']] 34 | 35 | #before writing the table version to state, check if we had one to begin with 36 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 37 | 38 | # last run was interrupted if there is a last_id_fetched bookmark 39 | was_interrupted = singer.get_bookmark(state, 40 | stream['tap_stream_id'], 41 | 'last_id_fetched') is not None 42 | 43 | #pick a new table version if last run wasn't interrupted 44 | if was_interrupted: 45 | stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 46 | else: 47 | stream_version = int(time.time() * 1000) 48 | 49 | state = singer.write_bookmark(state, 50 | stream['tap_stream_id'], 51 | 'version', 52 | stream_version) 53 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 54 | 55 | activate_version_message = singer.ActivateVersionMessage( 56 | stream=common.calculate_destination_stream_name(stream), 57 | version=stream_version 58 | ) 59 | 60 | # For the initial replication, emit an ACTIVATE_VERSION message 61 | # at the beginning so the records show up right away. 
62 | if first_run: 63 | singer.write_message(activate_version_message) 64 | 65 | if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'): 66 | # There is a bookmark 67 | max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value') 68 | max_id_type = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type') 69 | max_id_value = common.string_to_class(max_id_value, max_id_type) 70 | else: 71 | max_id_value = get_max_id_value(collection, projection) 72 | 73 | last_id_fetched = singer.get_bookmark(state, 74 | stream['tap_stream_id'], 75 | 'last_id_fetched') 76 | 77 | if max_id_value: 78 | # Write the bookmark if max_id_value is defined 79 | state = singer.write_bookmark(state, 80 | stream['tap_stream_id'], 81 | 'max_id_value', 82 | common.class_to_string(max_id_value, 83 | max_id_value.__class__.__name__)) 84 | state = singer.write_bookmark(state, 85 | stream['tap_stream_id'], 86 | 'max_id_type', 87 | max_id_value.__class__.__name__) 88 | 89 | find_filter = {'$lte': max_id_value} 90 | if last_id_fetched: 91 | last_id_fetched_type = singer.get_bookmark(state, 92 | stream['tap_stream_id'], 93 | 'last_id_fetched_type') 94 | find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type) 95 | 96 | query_message = 'Querying {} with:\n\tFind Parameters: {}'.format( 97 | stream['tap_stream_id'], 98 | find_filter) 99 | if projection: 100 | query_message += '\n\tProjection: {}'.format(projection) 101 | # pylint: disable=logging-format-interpolation 102 | LOGGER.info(query_message) 103 | 104 | 105 | with collection.find({'_id': find_filter}, 106 | projection, 107 | sort=[("_id", pymongo.ASCENDING)]) as cursor: 108 | rows_saved = 0 109 | time_extracted = utils.now() 110 | start_time = time.time() 111 | 112 | schema = {"type": "object", "properties": {}} 113 | for row in cursor: 114 | rows_saved += 1 115 | 116 | schema_build_start_time = time.time() 117 | if common.row_to_schema(schema, row): 118 | singer.write_message(singer.SchemaMessage( 119 | stream=common.calculate_destination_stream_name(stream), 120 | schema=schema, 121 | key_properties=['_id'])) 122 | common.SCHEMA_COUNT[stream['tap_stream_id']] += 1 123 | common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time 124 | 125 | record_message = common.row_to_singer_record(stream, 126 | row, 127 | stream_version, 128 | time_extracted) 129 | 130 | singer.write_message(record_message) 131 | 132 | state = singer.write_bookmark(state, 133 | stream['tap_stream_id'], 134 | 'last_id_fetched', 135 | common.class_to_string(row['_id'], 136 | row['_id'].__class__.__name__)) 137 | state = singer.write_bookmark(state, 138 | stream['tap_stream_id'], 139 | 'last_id_fetched_type', 140 | row['_id'].__class__.__name__) 141 | 142 | 143 | if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0: 144 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 145 | 146 | common.COUNTS[tap_stream_id] += rows_saved 147 | common.TIMES[tap_stream_id] += time.time()-start_time 148 | 149 | # clear max pk value and last pk fetched upon successful sync 150 | singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value') 151 | singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type') 152 | singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched') 153 | singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type') 154 | 155 | state = singer.write_bookmark(state, 156 | stream['tap_stream_id'], 157 | 'initial_full_table_complete', 158 | 
True) 159 | 160 | singer.write_message(activate_version_message) 161 | 162 | LOGGER.info('Synced {} records for {}'.format(rows_saved, tap_stream_id)) 163 | -------------------------------------------------------------------------------- /tap_mongodb/sync_strategies/incremental.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import copy 3 | import time 4 | import pymongo 5 | import singer 6 | from singer import metadata, utils 7 | import tap_mongodb.sync_strategies.common as common 8 | 9 | LOGGER = singer.get_logger() 10 | 11 | 12 | def update_bookmark(row, state, tap_stream_id, replication_key_name): 13 | replication_key_value = row.get(replication_key_name) 14 | if replication_key_value: 15 | replication_key_type = replication_key_value.__class__.__name__ 16 | 17 | replication_key_value_bookmark = common.class_to_string(replication_key_value, 18 | replication_key_type) 19 | state = singer.write_bookmark(state, 20 | tap_stream_id, 21 | 'replication_key_value', 22 | replication_key_value_bookmark) 23 | state = singer.write_bookmark(state, 24 | tap_stream_id, 25 | 'replication_key_type', 26 | replication_key_type) 27 | 28 | # pylint: disable=too-many-locals, too-many-statements 29 | def sync_collection(client, stream, state, projection): 30 | tap_stream_id = stream['tap_stream_id'] 31 | LOGGER.info('Starting incremental sync for %s', tap_stream_id) 32 | 33 | stream_metadata = metadata.to_map(stream['metadata']).get(()) 34 | collection = client[stream_metadata['database-name']][stream['stream']] 35 | 36 | #before writing the table version to state, check if we had one to begin with 37 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 38 | 39 | #pick a new table version if last run wasn't interrupted 40 | if first_run: 41 | stream_version = int(time.time() * 1000) 42 | else: 43 | stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 44 | 45 | state = singer.write_bookmark(state, 46 | stream['tap_stream_id'], 47 | 'version', 48 | stream_version) 49 | 50 | activate_version_message = singer.ActivateVersionMessage( 51 | stream=common.calculate_destination_stream_name(stream), 52 | version=stream_version 53 | ) 54 | 55 | 56 | # For the initial replication, emit an ACTIVATE_VERSION message 57 | # at the beginning so the records show up right away. 
58 | if first_run: 59 | singer.write_message(activate_version_message) 60 | 61 | # get replication key, and bookmarked value/type 62 | stream_state = state.get('bookmarks', {}).get(tap_stream_id, {}) 63 | 64 | replication_key_name = stream_metadata.get('replication-key') 65 | replication_key_value_bookmark = stream_state.get('replication_key_value') 66 | 67 | # write state message 68 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 69 | 70 | # create query 71 | find_filter = {} 72 | if replication_key_value_bookmark: 73 | find_filter[replication_key_name] = {} 74 | find_filter[replication_key_name]['$gte'] = \ 75 | common.string_to_class(replication_key_value_bookmark, 76 | stream_state.get('replication_key_type')) 77 | 78 | # log query 79 | query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter) 80 | if projection: 81 | query_message += '\n\tProjection: {}'.format(projection) 82 | LOGGER.info(query_message) 83 | 84 | 85 | # query collection 86 | schema = {"type": "object", "properties": {}} 87 | with collection.find(find_filter, 88 | projection, 89 | sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor: 90 | rows_saved = 0 91 | time_extracted = utils.now() 92 | start_time = time.time() 93 | 94 | for row in cursor: 95 | schema_build_start_time = time.time() 96 | if common.row_to_schema(schema, row): 97 | singer.write_message(singer.SchemaMessage( 98 | stream=common.calculate_destination_stream_name(stream), 99 | schema=schema, 100 | key_properties=['_id'])) 101 | common.SCHEMA_COUNT[tap_stream_id] += 1 102 | common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time 103 | 104 | 105 | record_message = common.row_to_singer_record(stream, 106 | row, 107 | stream_version, 108 | time_extracted) 109 | 110 | # gen_schema = common.row_to_schema_message(schema, record_message.record, row) 111 | # if DeepDiff(schema, gen_schema, ignore_order=True) != {}: 112 | # emit gen_schema 113 | # schema = gen_schema 114 | singer.write_message(record_message) 115 | rows_saved += 1 116 | 117 | update_bookmark(row, state, tap_stream_id, replication_key_name) 118 | 119 | if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0: 120 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 121 | 122 | 123 | common.COUNTS[tap_stream_id] += rows_saved 124 | common.TIMES[tap_stream_id] += time.time()-start_time 125 | 126 | singer.write_message(activate_version_message) 127 | 128 | LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id) 129 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singer-io/tap-mongodb/9db2c92efc089272ab17bdeaa41bcfe8da82d12d/tests/__init__.py -------------------------------------------------------------------------------- /tests/mongodb_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | from tap_tester.logger import LOGGER 4 | 5 | 6 | def ensure_environment_variables_set(): 7 | missing_envs = [x for x in ['TAP_MONGODB_HOST', 8 | 'TAP_MONGODB_USER', 9 | 'TAP_MONGODB_PASSWORD', 10 | 'TAP_MONGODB_PORT', 11 | 'TAP_MONGODB_DBNAME'] if os.getenv(x) is None] 12 | if len(missing_envs) != 0: 13 | raise Exception(f"set environment variables: {missing_envs}") 14 | 15 | ########################################################################## 16 | ### Database 
Interactions 17 | ########################################################################## 18 | 19 | def get_test_connection(): 20 | username = os.getenv('TAP_MONGODB_USER') 21 | password = os.getenv('TAP_MONGODB_PASSWORD') 22 | host= os.getenv('TAP_MONGODB_HOST') 23 | auth_source = os.getenv('TAP_MONGODB_DBNAME') 24 | port = int(os.getenv('TAP_MONGODB_PORT')) 25 | ssl = False 26 | conn = pymongo.MongoClient(host=host, username=username, password=password, port=port, 27 | authSource=auth_source, ssl=ssl, uuidRepresentation='standard') 28 | return conn 29 | 30 | def drop_all_collections(client): 31 | ############# Drop all dbs/collections ############# 32 | for db_name in client.list_database_names(): 33 | if db_name in ['config', 'local', 'system']: 34 | continue 35 | for collection_name in client[db_name].list_collection_names(): 36 | if collection_name in ['system.views', 'system.version', 'system.keys', 'system.users']: 37 | continue 38 | LOGGER.info("Dropping database: " + db_name + ", collection: " + collection_name) 39 | client[db_name][collection_name].drop() 40 | -------------------------------------------------------------------------------- /tests/test_mongodb_cname_restrictions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | from bson import ObjectId 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | class MongoDBCollectionNameRestrictions(unittest.TestCase): 24 | ''' Test edge case collection name restrictions per the documentation (leading '_' or digit) 25 | Reference https://jira.talendforge.org/browse/TDL-18990 for details ''' 26 | 27 | def setUp(self): 28 | 29 | ensure_environment_variables_set() 30 | 31 | with get_test_connection() as client: 32 | ############# Drop all dbs/collections ############# 33 | drop_all_collections(client) 34 | 35 | ############# Add simple collections ############# 36 | # 1_simple_coll has 50 documents 37 | client["simple_db"]["1_simple_coll"].insert_many(generate_simple_coll_docs(50)) 38 | 39 | # _simple_coll_2 has 100 documents 40 | client["simple_db"]["_simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 41 | 42 | def expected_check_streams(self): 43 | return { 44 | 'simple_db-1_simple_coll', 45 | 'simple_db-_simple_coll_2', 46 | } 47 | 48 | def expected_pks(self): 49 | return { 50 | '1_simple_coll': {'_id'}, 51 | '_simple_coll_2': {'_id'}, 52 | } 53 | 54 | def expected_row_counts(self): 55 | return { 56 | '1_simple_coll': 50, 57 | '_simple_coll_2': 100, 58 | } 59 | 60 | def expected_sync_streams(self): 61 | return { 62 | '1_simple_coll', 63 | '_simple_coll_2' 64 | } 65 | 66 | def name(self): 67 | return "tap_tester_mongodb_cname_restrict" 68 | 69 | def tap_name(self): 70 | return "tap-mongodb" 71 | 72 | def get_type(self): 73 | return "platform.mongodb" 74 | 75 | def get_credentials(self): 76 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 77 | 78 | def get_properties(self): 79 | return {'host' 
: os.getenv('TAP_MONGODB_HOST'), 80 | 'port' : os.getenv('TAP_MONGODB_PORT'), 81 | 'user' : os.getenv('TAP_MONGODB_USER'), 82 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 83 | } 84 | 85 | 86 | def test_run(self): 87 | 88 | conn_id = connections.ensure_connection(self) 89 | 90 | # --------------------------------- 91 | # ----------- Discovery ---------- 92 | # --------------------------------- 93 | 94 | # run in discovery mode 95 | check_job_name = runner.run_check_mode(self, conn_id) 96 | 97 | # verify check exit codes 98 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 99 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 100 | 101 | # verify the tap discovered the right streams 102 | found_catalogs = menagerie.get_catalogs(conn_id) 103 | 104 | # assert we find the correct streams 105 | self.assertEqual(self.expected_check_streams(), 106 | {c['tap_stream_id'] for c in found_catalogs}) 107 | 108 | for tap_stream_id in self.expected_check_streams(): 109 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 110 | 111 | # assert that the pks are correct 112 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 113 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 114 | 115 | # assert that the row counts are correct 116 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 117 | found_stream.get('metadata', {}).get('row-count')) 118 | 119 | # ---------------------------------------- 120 | # ----------- Initial Full Table --------- 121 | # ---------------------------------------- 122 | 123 | # Select 1_simple_coll and _simple_coll_2 streams and add replication method metadata 124 | for stream_catalog in found_catalogs: 125 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 126 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 127 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 128 | stream_catalog, 129 | annotated_schema, 130 | additional_md) 131 | 132 | # Run sync 133 | sync_job_name = runner.run_sync_mode(self, conn_id) 134 | 135 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 136 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 137 | 138 | # verify the persisted schema was correct 139 | records_by_stream = runner.get_records_from_target_output() 140 | 141 | # assert that each of the streams that we synced are the ones that we expect to see 142 | record_count_by_stream = runner.examine_target_output_file(self, 143 | conn_id, 144 | self.expected_sync_streams(), 145 | self.expected_pks()) 146 | 147 | # Verify that the full table was synced 148 | for tap_stream_id in self.expected_sync_streams(): 149 | self.assertGreaterEqual(record_count_by_stream[tap_stream_id],self.expected_row_counts()[tap_stream_id]) 150 | 151 | # Verify that we have 'initial_full_table_complete' bookmark 152 | state = menagerie.get_state(conn_id) 153 | first_versions = {} 154 | 155 | for tap_stream_id in self.expected_check_streams(): 156 | # assert that the state has an initial_full_table_complete == True 157 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 158 | # assert that there is a version bookmark in state 159 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 160 | self.assertIsNotNone(first_versions[tap_stream_id]) 161 | # Verify that we have a oplog_ts_time and 
oplog_ts_inc bookmark 162 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 163 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 164 | 165 | 166 | changed_ids = set() 167 | with get_test_connection() as client: 168 | # Delete two documents for each collection 169 | 170 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 0})[0]['_id']) 171 | client["simple_db"]["1_simple_coll"].delete_one({'int_field': 0}) 172 | 173 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 1})[0]['_id']) 174 | client["simple_db"]["1_simple_coll"].delete_one({'int_field': 1}) 175 | 176 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 0})[0]['_id']) 177 | client["simple_db"]["_simple_coll_2"].delete_one({'int_field': 0}) 178 | 179 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 1})[0]['_id']) 180 | client["simple_db"]["_simple_coll_2"].delete_one({'int_field': 1}) 181 | 182 | # Update two documents for each collection 183 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 48})[0]['_id']) 184 | client["simple_db"]["1_simple_coll"].update_one({'int_field': 48},{'$set': {'int_field': -1}}) 185 | 186 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 49})[0]['_id']) 187 | client["simple_db"]["1_simple_coll"].update_one({'int_field': 49},{'$set': {'int_field': -1}}) 188 | 189 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 98})[0]['_id']) 190 | client["simple_db"]["_simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 191 | 192 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 99})[0]['_id']) 193 | client["simple_db"]["_simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 194 | 195 | # Insert two documents for each collection 196 | client["simple_db"]["1_simple_coll"].insert_one({"int_field": 50, "string_field": random_string_generator()}) 197 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 50})[0]['_id']) 198 | 199 | client["simple_db"]["1_simple_coll"].insert_one({"int_field": 51, "string_field": random_string_generator()}) 200 | changed_ids.add(client['simple_db']['1_simple_coll'].find({'int_field': 51})[0]['_id']) 201 | 202 | client["simple_db"]["_simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 203 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 100})[0]['_id']) 204 | 205 | client["simple_db"]["_simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 206 | changed_ids.add(client['simple_db']['_simple_coll_2'].find({'int_field': 101})[0]['_id']) 207 | 208 | # ------------------------------------------- 209 | # ----------- Subsequent Oplog Sync --------- 210 | # ------------------------------------------- 211 | 212 | # Run sync 213 | sync_job_name = runner.run_sync_mode(self, conn_id) 214 | 215 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 216 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 217 | 218 | # verify the persisted schema was correct 219 | messages_by_stream = runner.get_records_from_target_output() 220 | records_by_stream = {} 221 | for stream_name in self.expected_sync_streams(): 222 | records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] 223 | if x.get('action') == 'upsert'] 224 | 225 | # assert that each of the streams 
that we synced are the ones that we expect to see 226 | record_count_by_stream = runner.examine_target_output_file(self, 227 | conn_id, 228 | self.expected_sync_streams(), 229 | self.expected_pks()) 230 | 231 | # Verify that we got at least 6 records due to changes 232 | # (could be more due to overlap in gte oplog clause) 233 | for k,v in record_count_by_stream.items(): 234 | self.assertGreaterEqual(v, 6) 235 | 236 | # Verify that we got 2 records with _SDC_DELETED_AT 237 | for stream in self.expected_sync_streams(): 238 | self.assertEqual(2, len([x['data'] for x in records_by_stream[stream] 239 | if x['data'].get('_sdc_deleted_at')])) 240 | # Verify that the _id of the records sent are the same set as the 241 | # _ids of the documents changed 242 | actual_ids = {ObjectId(x['data']['_id']) for stream in self.expected_sync_streams() 243 | for x in records_by_stream[stream]} 244 | self.assertEqual(changed_ids, actual_ids) 245 | -------------------------------------------------------------------------------- /tests/test_mongodb_configurable_properties.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | 6 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 7 | from tap_tester import connections, menagerie, runner 8 | 9 | RECORD_COUNT = {} 10 | 11 | 12 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 13 | return ''.join(random.choice(chars) for x in range(size)) 14 | 15 | 16 | def generate_simple_coll_docs(num_docs): 17 | docs = [] 18 | for int_value in range(num_docs): 19 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 20 | return docs 21 | 22 | 23 | class MongoDBConfigurableProperty(unittest.TestCase): 24 | 25 | def setUp(self): 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | # drop all dbs/collections 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 25 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(25)) 34 | 35 | # simple_coll_2 has 50 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(50)) 37 | 38 | def name(self): 39 | return "tap_tester_mongodb_configurable_property" 40 | 41 | def tap_name(self): 42 | return "tap-mongodb" 43 | 44 | def get_type(self): 45 | return "platform.mongodb" 46 | 47 | def get_credentials(self): 48 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 49 | 50 | def expected_check_streams(self): 51 | return { 52 | 'simple_db-simple_coll_1', 53 | 'simple_db-simple_coll_2' 54 | } 55 | 56 | def expected_pks_log_based(self): 57 | return { 58 | 'simple_coll_1': {'_id'}, 59 | 'simple_coll_2': {'_id'} 60 | } 61 | 62 | def expected_pks_include_schemas(self): 63 | return { 64 | 'simple_db_simple_coll_1': {'_id'}, 65 | 'simple_db_simple_coll_2': {'_id'} 66 | } 67 | 68 | def expected_row_counts_log_based(self): 69 | return { 70 | 'simple_coll_1': 25, 71 | 'simple_coll_2': 50 72 | } 73 | 74 | def expected_row_counts_include_schemas(self): 75 | return { 76 | 'simple_db_simple_coll_1': 25, 77 | 'simple_db_simple_coll_2': 50 78 | } 79 | 80 | def expected_sync_streams_include_schemas(self): 81 | return { 82 | 'simple_db_simple_coll_1', 83 | 'simple_db_simple_coll_2' 84 | } 85 | 86 | def expected_sync_streams_log_based(self): 87 | return { 88 | 'simple_coll_1', 89 | 'simple_coll_2' 90 | } 91 | 92 | def 
run_test(self): 93 | 94 | conn_id = connections.ensure_connection(self) 95 | 96 | #original_properties=False) 97 | 98 | # ------------------------------- 99 | # ----------- Discovery ---------- 100 | # ------------------------------- 101 | 102 | # run in discovery mode 103 | check_job_name = runner.run_check_mode(self, conn_id) 104 | 105 | # verify check exit codes 106 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 107 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 108 | 109 | # verify the tap discovered the right streams 110 | found_catalogs = menagerie.get_catalogs(conn_id) 111 | 112 | # assert we find the correct streams 113 | self.assertEqual(self.expected_check_streams(), 114 | {c['tap_stream_id'] for c in found_catalogs}) 115 | 116 | # ------------------------------------------- 117 | # ----------- First full Table Sync --------- 118 | # ------------------------------------------- 119 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 120 | for stream_catalog in found_catalogs: 121 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 122 | additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'FULL_TABLE'}}] 123 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 124 | stream_catalog, 125 | annotated_schema, 126 | additional_md) 127 | 128 | # run full table sync 129 | sync_job_name = runner.run_sync_mode(self, conn_id) 130 | 131 | # check exit status 132 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 133 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 134 | 135 | return conn_id 136 | 137 | 138 | class MongoDBUseLogBasedReplication(MongoDBConfigurableProperty): 139 | 140 | def name(self): 141 | return "tt_mongodb_config_prop_log_based" 142 | 143 | def get_credentials(self): 144 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 145 | 146 | def get_properties(self): 147 | return {'host': os.getenv('TAP_MONGODB_HOST'), 148 | 'port': os.getenv('TAP_MONGODB_PORT'), 149 | 'user': os.getenv('TAP_MONGODB_USER'), 150 | 'database': os.getenv('TAP_MONGODB_DBNAME'), 151 | 'use_log_based_replication': 'true' 152 | } 153 | 154 | def test_run(self): 155 | conn_id = self.run_test() 156 | 157 | # streams that we synced are the ones that we expect to see 158 | records_by_stream = runner.get_records_from_target_output() 159 | record_count_by_stream = runner.examine_target_output_file(self, 160 | conn_id, 161 | self.expected_sync_streams_log_based(), 162 | self.expected_pks_log_based()) 163 | 164 | # assert that we get the correct number of records for each stream 165 | self.assertEqual(self.expected_row_counts_log_based(), record_count_by_stream) 166 | 167 | 168 | class MongoDBIncludeSchema(MongoDBConfigurableProperty): 169 | 170 | def name(self): 171 | return "tt_mongodb_config_prop_inc_schema" 172 | 173 | def get_properties(self): 174 | return {'host': os.getenv('TAP_MONGODB_HOST'), 175 | 'port': os.getenv('TAP_MONGODB_PORT'), 176 | 'user': os.getenv('TAP_MONGODB_USER'), 177 | 'database': os.getenv('TAP_MONGODB_DBNAME'), 178 | 'include_schemas_in_destination_stream_name': 'true' 179 | } 180 | 181 | def test_run(self): 182 | conn_id = self.run_test() 183 | 184 | # streams that we synced are the ones that we expect to see 185 | records_by_stream = runner.get_records_from_target_output() 186 | record_count_by_stream = runner.examine_target_output_file(self, 187 | conn_id, 188 | 
self.expected_sync_streams_include_schemas(), 189 | self.expected_pks_include_schemas()) 190 | 191 | # assert that we get the correct number of records for each stream 192 | self.assertEqual(self.expected_row_counts_include_schemas(), record_count_by_stream) 193 | -------------------------------------------------------------------------------- /tests/test_mongodb_datatype.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import datetime 3 | import decimal 4 | import os 5 | import re 6 | import subprocess 7 | import unittest 8 | import uuid 9 | 10 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 11 | from tap_tester import connections, menagerie, runner 12 | from tap_tester.logger import LOGGER 13 | 14 | 15 | RECORD_COUNT = {} 16 | 17 | 18 | def run_mongodb_javascript(database, js, mongo_version): 19 | """ 20 | Runs arbitrary javascript against the test Mongo instance. This is 21 | useful for setting up situations that Python can't handle (e.g., 22 | datetime with year 0) for testing. 23 | """ 24 | LOGGER.info("Running '{}' against database '{}'".format(js, database)) 25 | 26 | mongo_shell = "mongosh" if int(mongo_version.split(".")[0]) > 5 else "mongo" 27 | cmd = [mongo_shell, "-u", os.getenv('TAP_MONGODB_USER'), "-p", os.getenv('TAP_MONGODB_PASSWORD'), "--authenticationDatabase", os.getenv('TAP_MONGODB_DBNAME'), database, "--eval", "eval('{}')".format(js)] 28 | subprocess.run(cmd) 29 | 30 | 31 | class MongoDBDatatype(unittest.TestCase): 32 | # To compare large dictionaries 33 | maxDiff = None 34 | 35 | def setUp(self): 36 | ensure_environment_variables_set() 37 | 38 | with get_test_connection() as client: 39 | ############# Drop all dbs/collections ############# 40 | drop_all_collections(client) 41 | 42 | ############# Add datatype collections ############# 43 | pattern = re.compile('.*') 44 | regex = bson.Regex.from_native(pattern) 45 | regex.flags ^= re.UNICODE 46 | 47 | datatype_doc = { 48 | "double_field": 4.3, 49 | "string_field": "a sample string", 50 | "object_field" : { 51 | "obj_field_1_key": "obj_field_1_val", 52 | "obj_field_2_key": "obj_field_2_val" 53 | }, 54 | "array_field" : [ 55 | "array_item_1", 56 | "array_item_2", 57 | "array_item_3" 58 | ], 59 | "binary_data_field" : bson.Binary(b"a binary string"), 60 | "object_id_field": bson.objectid.ObjectId(b'123456789123'), 61 | "boolean_field" : True, 62 | "date_field" : datetime.datetime(2019, 8, 15, 19, 29, 14, 578000), 63 | "null_field": None, 64 | "regex_field" : regex, 65 | "32_bit_integer_field" : 32, 66 | "timestamp_field" : bson.timestamp.Timestamp(1565897157, 1), 67 | "64_bit_integer_field" : 34359738368, 68 | "decimal_field" : bson.Decimal128(decimal.Decimal('1.34')), 69 | "javaScript_field" : bson.code.Code("var x, y, z;"), 70 | "javaScript_with_scope_field" : bson.code.Code("function incrementX() { x++; }", scope={"x": 1}), 71 | "min_key_field" : bson.min_key.MinKey, 72 | "max_key_field" : bson.max_key.MaxKey, 73 | "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec72820c4'), 74 | "dbref_field": bson.dbref.DBRef("some_collection", bson.objectid.ObjectId(b'123456789123'), database='some_database') 75 | } 76 | 77 | client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc) 78 | 79 | # NB: Insert an invalid datetime to confirm that works correctly 80 | mongodb_version = client.server_info()["version"] 81 | run_mongodb_javascript(database="datatype_db", 82 | js="db.invalid_datatype_coll.insert({ 
\"date_field\": new ISODate(\"0000-01-01T00:00:00.000Z\") });", 83 | mongo_version=mongodb_version) 84 | 85 | def expected_check_streams(self): 86 | return { 87 | 'datatype_db-datatype_coll_1', 88 | 'datatype_db-invalid_datatype_coll' 89 | } 90 | 91 | def expected_pks(self): 92 | return { 93 | 'datatype_coll_1': {'_id'}, 94 | 'invalid_datatype_coll': {'_id'} 95 | } 96 | 97 | def expected_row_counts(self): 98 | return { 99 | 'datatype_coll_1': 1, 100 | 'invalid_datatype_coll': 1 101 | } 102 | 103 | 104 | def expected_sync_streams(self): 105 | return { 106 | 'datatype_coll_1', 107 | 'invalid_datatype_coll' 108 | } 109 | 110 | def name(self): 111 | return "tap_tester_mongodb_datatype" 112 | 113 | def tap_name(self): 114 | return "tap-mongodb" 115 | 116 | def get_type(self): 117 | return "platform.mongodb" 118 | 119 | def get_credentials(self): 120 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 121 | 122 | def get_properties(self): 123 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 124 | 'port' : os.getenv('TAP_MONGODB_PORT'), 125 | 'user' : os.getenv('TAP_MONGODB_USER'), 126 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 127 | } 128 | 129 | 130 | def test_run(self): 131 | 132 | conn_id = connections.ensure_connection(self) 133 | 134 | # ------------------------------- 135 | # ----------- Discovery ---------- 136 | # ------------------------------- 137 | 138 | # run in discovery mode 139 | check_job_name = runner.run_check_mode(self, conn_id) 140 | 141 | # verify check exit codes 142 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 143 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 144 | 145 | # verify the tap discovered the right streams 146 | found_catalogs = menagerie.get_catalogs(conn_id) 147 | 148 | # assert we find the correct streams 149 | self.assertEqual(self.expected_check_streams(), 150 | {c['tap_stream_id'] for c in found_catalogs}) 151 | 152 | 153 | 154 | for tap_stream_id in self.expected_check_streams(): 155 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 156 | 157 | # assert that the pks are correct 158 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 159 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 160 | 161 | # assert that the row counts are correct 162 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 163 | found_stream.get('metadata', {}).get('row-count')) 164 | 165 | # ----------------------------------- 166 | # ----------- Full Table Sync --------- 167 | # ----------------------------------- 168 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 169 | for stream_catalog in found_catalogs: 170 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 171 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 172 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 173 | stream_catalog, 174 | annotated_schema, 175 | additional_md) 176 | 177 | # run full table sync 178 | sync_job_name = runner.run_sync_mode(self, conn_id) 179 | 180 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 181 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 182 | 183 | # verify the persisted schema was correct 184 | records_by_stream = runner.get_records_from_target_output() 185 | 186 | # assert that each of the streams that we synced are the ones that we expect to see 
187 | record_count_by_stream = runner.examine_target_output_file(self, 188 | conn_id, 189 | self.expected_sync_streams(), 190 | self.expected_pks()) 191 | 192 | # assert that we get the correct number of records for each stream 193 | self.assertEqual(self.expected_row_counts(),record_count_by_stream) 194 | 195 | # assert that an activate_version_message is first and last message sent for each stream 196 | for stream_name in self.expected_sync_streams(): 197 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 198 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 199 | 200 | state = menagerie.get_state(conn_id) 201 | 202 | first_versions = {} 203 | 204 | for tap_stream_id in self.expected_check_streams(): 205 | # assert that the state has an initial_full_table_complete == True 206 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 207 | # assert that there is a version bookmark in state 208 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 209 | self.assertIsNotNone(first_versions[tap_stream_id]) 210 | 211 | record_id = None 212 | with get_test_connection() as client: 213 | record_id = str([x for x in client['datatype_db']['datatype_coll_1'].find()][0]['_id']) 214 | 215 | 216 | expected_record = { 217 | "javaScript_field": "var x, y, z;", 218 | "timestamp_field": "2019-08-15T19:25:57.000000Z", 219 | "_id": record_id, 220 | "date_field": "2019-08-15T19:29:14.578000Z", 221 | "string_field": "a sample string", 222 | "object_field": {"obj_field_2_key": "obj_field_2_val", 223 | "obj_field_1_key": "obj_field_1_val"}, 224 | "null_field": None, 225 | "regex_field": {"flags": 0, "pattern": ".*"}, 226 | "object_id_field": "313233343536373839313233", 227 | "64_bit_integer_field": 34359738368, 228 | "32_bit_integer_field": 32, 229 | "array_field": ["array_item_1", 230 | "array_item_2", 231 | "array_item_3"], 232 | "binary_data_field": "YSBiaW5hcnkgc3RyaW5n", 233 | "javaScript_with_scope_field": {"scope": "{'x': 1}", 234 | "value": "function incrementX() { x++; }"}, 235 | "double_field": decimal.Decimal('4.3'), 236 | "boolean_field": True, 237 | "decimal_field": decimal.Decimal('1.34'), 238 | 'uuid_field': "3e139ff5-d622-45c6-bf9e-1dfec72820c4", 239 | "dbref_field": {"id": "313233343536373839313233", 240 | "database": "some_database", 241 | "collection": "some_collection"} 242 | } 243 | 244 | dict_keys = list(expected_record.keys()) 245 | dict_keys.sort() 246 | 247 | self.assertEquals({i: expected_record[i] for i in dict_keys}, 248 | {i: records_by_stream['datatype_coll_1']['messages'][1]['data'][i] for i in dict_keys}) 249 | -------------------------------------------------------------------------------- /tests/test_mongodb_discovery.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import datetime 3 | import decimal 4 | import os 5 | import pymongo 6 | import random 7 | import re 8 | import string 9 | import time 10 | import unittest 11 | 12 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 13 | from tap_tester import connections, menagerie, runner 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, 
"string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBDiscovery(unittest.TestCase): 25 | AUTOMATIC = "automatic" 26 | UNSUPPORTED = "unsupported" 27 | VALID_REPLICATION_KEYS = "valid-replication-keys" 28 | PRIMARY_KEYS = "table-key-properties" 29 | FORCED_REPLICATION_METHOD = "forced-replication-method" 30 | INCREMENTAL = "INCREMENTAL" 31 | FULL_TABLE = "FULL_TABLE" 32 | LOG_BASED = "LOG_BASED" 33 | 34 | def setUp(self): 35 | 36 | ensure_environment_variables_set() 37 | 38 | with get_test_connection() as client: 39 | # drop all dbs/collections 40 | drop_all_collections(client) 41 | 42 | # simple_coll_1 has 50 documents 43 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 44 | 45 | # simple_coll_2 has 100 documents 46 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 47 | 48 | # admin_coll_1 has 50 documents 49 | client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50)) 50 | 51 | # create view on simple_coll_1 52 | client["simple_db"].command(bson.son.SON([("create", "simple_view_1"), ("viewOn", "simple_coll_1"), ("pipeline", [])])) 53 | 54 | # collections with same names as others in different dbs 55 | client["simple_db_2"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 56 | client["simple_db_2"]["SIMPLE_COLL_1"].insert_many(generate_simple_coll_docs(50)) 57 | 58 | # collections with special characters in names 59 | client["special_db"]["hebrew_ישראל"].insert_many(generate_simple_coll_docs(50)) 60 | client['special_db']['hello!world?'].insert_many(generate_simple_coll_docs(50)) 61 | 62 | # Add datatype collections 63 | pattern = re.compile('.*') 64 | regex = bson.Regex.from_native(pattern) 65 | regex.flags ^= re.UNICODE 66 | datatype_doc = { 67 | "double_field": 4.3, 68 | "string_field": "a sample string", 69 | "object_field" : { 70 | "obj_field_1_key": "obj_field_1_val", 71 | "obj_field_2_key": "obj_field_2_val" 72 | }, 73 | "array_field" : [ 74 | "array_item_1", 75 | "array_item_2", 76 | "array_item_3" 77 | ], 78 | "binary_data_field" : b"a binary string", 79 | "object_id_field": bson.objectid.ObjectId(b'123456789123'), 80 | "boolean_field" : True, 81 | "date_field" : datetime.datetime.now(), 82 | "null_field": None, 83 | "regex_field" : regex, 84 | "32_bit_integer_field" : 32, 85 | "timestamp_field" : bson.timestamp.Timestamp(int(time.time()), 1), 86 | "64_bit_integer_field" : 34359738368, 87 | "decimal_field" : bson.Decimal128(decimal.Decimal('1.34')), 88 | "javaScript_field" : bson.code.Code("var x, y, z;"), 89 | "javaScript_with_scope_field" : bson.code.Code("function incrementX() { x++; }", scope={"x": 1}), 90 | "min_key_field" : bson.min_key.MinKey, 91 | "max_key_field" : bson.max_key.MaxKey 92 | } 93 | client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc) 94 | 95 | client["datatype_db"]["datatype_coll_2"].insert_one(datatype_doc) 96 | client["datatype_db"]["datatype_coll_2"].create_index([("date_field", pymongo.ASCENDING)]) 97 | client["datatype_db"]["datatype_coll_2"].create_index([("timestamp_field", pymongo.ASCENDING)]) 98 | client["datatype_db"]["datatype_coll_2"].create_index([("32_bit_integer_field", pymongo.ASCENDING)]) 99 | client["datatype_db"]["datatype_coll_2"].create_index([("64_bit_integer_field", pymongo.ASCENDING)]) 100 | 101 | def expected_check_streams(self): 102 | return { 103 | 'simple_db-simple_coll_1', 104 | 'simple_db-simple_coll_2', 105 | 'simple_db_2-simple_coll_1', 106 | 'simple_db_2-SIMPLE_COLL_1', 107 | 
'admin-admin_coll_1', 108 | #'simple_db-simple_view_1', 109 | 'datatype_db-datatype_coll_1', 110 | 'datatype_db-datatype_coll_2', 111 | 'special_db-hebrew_ישראל', 112 | 'special_db-hello!world?' 113 | } 114 | 115 | def expected_primary_keys(self): 116 | """Defaults to '_id' in discovery, standard ObjectId(), any value can be provided (TODO where?)""" 117 | return { 118 | stream: {'_id'} 119 | for stream in self.expected_check_streams() 120 | } 121 | def expected_replication_keys(self): 122 | return { 123 | 'simple_db-simple_coll_1': {'_id'}, 124 | 'simple_db-simple_coll_2': {'_id'}, 125 | 'simple_db_2-simple_coll_1': {'_id'}, 126 | 'simple_db_2-SIMPLE_COLL_1': {'_id'}, 127 | 'admin-admin_coll_1': {'_id'}, 128 | #'simple_db-simple_view_1': {'_id'}, 129 | 'datatype_db-datatype_coll_1': { 130 | '_id', 131 | }, 132 | 'datatype_db-datatype_coll_2': { 133 | '_id', 134 | 'date_field', 135 | 'timestamp_field', 136 | '32_bit_integer_field', 137 | '64_bit_integer_field', 138 | }, 139 | 'special_db-hebrew_ישראל': {'_id'}, 140 | 'special_db-hello!world?': {'_id'}, 141 | } 142 | 143 | def expected_row_counts(self): 144 | return { 145 | 'simple_db-simple_coll_1': 50, 146 | 'simple_db-simple_coll_2': 100, 147 | 'simple_db_2-simple_coll_1': 50, 148 | 'simple_db_2-SIMPLE_COLL_1': 50, 149 | 'admin-admin_coll_1': 50, 150 | #'simple_db-simple_view_1': 50, 151 | 'datatype_db-datatype_coll_1': 1, 152 | 'datatype_db-datatype_coll_2': 1, 153 | 'special_db-hebrew_ישראל': 50, 154 | 'special_db-hello!world?': 50 155 | } 156 | 157 | def expected_table_names(self): 158 | return { 159 | 'simple_coll_1', 160 | 'simple_coll_2', 161 | 'SIMPLE_COLL_1', 162 | 'admin_coll_1', 163 | 'datatype_coll_1', 164 | 'datatype_coll_2', 165 | 'hebrew_ישראל', 166 | 'hello!world?' 167 | } 168 | 169 | def name(self): 170 | return "mongodb_discovery" 171 | 172 | def tap_name(self): 173 | return "tap-mongodb" 174 | 175 | def get_type(self): 176 | return "platform.mongodb" 177 | 178 | def get_credentials(self): 179 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 180 | 181 | def get_properties(self): 182 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 183 | 'port' : os.getenv('TAP_MONGODB_PORT'), 184 | 'user' : os.getenv('TAP_MONGODB_USER'), 185 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 186 | } 187 | 188 | def test_run(self): 189 | conn_id = connections.ensure_connection(self) 190 | 191 | # run in check mode 192 | check_job_name = runner.run_check_mode(self, conn_id) 193 | 194 | # check exit codes 195 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 196 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 197 | 198 | # Verify a catalog was produced by discovery 199 | catalog = menagerie.get_catalog(conn_id) 200 | self.assertGreater(len(catalog), 0) 201 | 202 | # Verify stream_name entries match the expected table names 203 | stream_catalogs = catalog['streams'] 204 | stream_names = {catalog['stream_name'] for catalog in stream_catalogs} 205 | self.assertSetEqual(self.expected_table_names(), stream_names) 206 | 207 | # Verify tap_stream_id entries follow naming convention - 208 | stream_ids = {catalog['tap_stream_id'] for catalog in stream_catalogs} 209 | self.assertSetEqual(self.expected_check_streams(), stream_ids) 210 | 211 | # Stream level assertions 212 | for stream in self.expected_check_streams(): 213 | with self.subTest(stream=stream): 214 | 215 | # gathering expectations 216 | expected_primary_keys = self.expected_primary_keys()[stream] 217 | expected_replication_keys = 
self.expected_replication_keys()[stream] 218 | expected_row_count = self.expected_row_counts()[stream] 219 | 220 | # collecting actual values... 221 | stream_catalog = [catalog for catalog in stream_catalogs 222 | if catalog["tap_stream_id"] == stream][0] 223 | schema_and_metadata = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 224 | stream_metadata = schema_and_metadata["metadata"] 225 | empty_breadcrumb_metadata = [item for item in stream_metadata if item.get("breadcrumb") == []] 226 | stream_properties = empty_breadcrumb_metadata[0]['metadata'] 227 | actual_primary_keys = set(stream_properties.get(self.PRIMARY_KEYS, [])) 228 | actual_replication_keys = set(stream_properties.get(self.VALID_REPLICATION_KEYS, [])) 229 | actual_replication_method = stream_properties.get(self.FORCED_REPLICATION_METHOD) 230 | actual_stream_inclusion = stream_properties.get('inclusion') 231 | actual_field_inclusions = set( 232 | item.get("metadata").get("inclusion") 233 | for item in stream_metadata 234 | if item.get("breadcrumb", []) != [] 235 | ) 236 | actual_fields_to_datatypes = { 237 | item['breadcrumb'][1]: item['metadata'].get('sql-datatype') 238 | for item in stream_metadata if item.get('breadcrumb') != [] 239 | } 240 | 241 | # Verify there is only 1 top level breadcrumb in metadata 242 | self.assertEqual(1, len(empty_breadcrumb_metadata)) 243 | 244 | # Verify replication key(s) match expectations 245 | self.assertSetEqual(expected_replication_keys, actual_replication_keys) 246 | 247 | # Verify primary key(s) match expectations 248 | self.assertSetEqual(expected_primary_keys, actual_primary_keys) 249 | 250 | # Verify no field-level inclusion exists 251 | self.assertSetEqual(set(), actual_field_inclusions) 252 | 253 | # Verify row-count metadata matches expectations 254 | self.assertEqual(expected_row_count, stream_properties['row-count']) 255 | 256 | # Verify selected metadata is None for all streams 257 | self.assertIsNone(stream_properties.get('selected')) 258 | 259 | # Verify is-view metadata is False 260 | self.assertFalse(stream_properties['is-view']) 261 | 262 | # Verify no forced-replication-method is present in metadata 263 | self.assertNotIn(self.FORCED_REPLICATION_METHOD, stream_properties.keys()) 264 | 265 | # Verify database-name is consistent with the tap_stream_id 266 | tap_stream_id_db_prefix = stream_catalog['tap_stream_id'].split('-')[0] 267 | self.assertEqual(tap_stream_id_db_prefix, stream_properties['database-name']) 268 | 269 | # Verify schema types match expectations 270 | self.assertDictEqual({'type': 'object'}, stream_catalog['schema']) 271 | -------------------------------------------------------------------------------- /tests/test_mongodb_full_table.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import os 3 | import pdb 4 | import random 5 | import string 6 | import unittest 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBFullTable(unittest.TestCase): 25 | def 
setUp(self): 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | # drop all dbs/collections 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # create view on simple_coll_1 36 | client["simple_db"].command(bson.son.SON([("create", "simple_view_1"), ("viewOn", "simple_coll_1"), ("pipeline", [])])) 37 | 38 | # simple_coll_2 has 100 documents 39 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 40 | 41 | # admin_coll_1 has 50 documents 42 | client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50)) 43 | 44 | # simple_coll_3 is an empty collection 45 | client["simple_db"].create_collection("simple_coll_3") 46 | 47 | # simple_coll_4 has documents with special chars and a lot of nesting 48 | client["simple_db"]["simple_coll_4"].insert_one({"hebrew_ישרא": "hebrew_ישרא"}) 49 | client["simple_db"]["simple_coll_4"].insert_one({"hebrew_ישרא": 2}) 50 | client["simple_db"]["simple_coll_4"].insert_one({"another_hebrew_ישראל": "another_hebrew_ישרא"}) 51 | nested_doc = {"field0": {}} 52 | current_doc = nested_doc 53 | for i in range(1, 101): 54 | current_doc["field{}".format(i-1)]["field{}".format(i)] = {} 55 | current_doc = current_doc["field{}".format(i-1)] 56 | current_doc["field100"] = "some_value" 57 | client["simple_db"]["simple_coll_4"].insert_one(nested_doc) 58 | 59 | max_col_doc = {} 60 | for x in range(1600): 61 | max_col_doc['col_{}'.format(x)] = x 62 | client["simple_db"]["simple_coll_4"].insert_one(max_col_doc) 63 | 64 | 65 | 66 | 67 | def tap_stream_id_to_stream(self): 68 | return { 69 | 'simple_db-simple_coll_1': 'simple_db_simple_coll_1', 70 | 'simple_db-simple_coll_2': 'simple_db_simple_coll_2', 71 | 'simple_db-simple_coll_3': 'simple_db_simple_coll_3', 72 | 'simple_db-simple_coll_4': 'simple_db_simple_coll_4', 73 | 'admin-admin_coll_1': 'admin_admin_coll_1' 74 | } 75 | 76 | def expected_check_streams(self): 77 | return { 78 | 'simple_db-simple_coll_1', 79 | 'simple_db-simple_coll_2', 80 | 'simple_db-simple_coll_3', 81 | 'simple_db-simple_coll_4', 82 | 'admin-admin_coll_1' 83 | } 84 | 85 | def expected_pks(self): 86 | return { 87 | 'simple_db_simple_coll_1': {'_id'}, 88 | 'simple_db_simple_coll_2': {'_id'}, 89 | 'simple_db_simple_coll_3': {'_id'}, 90 | 'simple_db_simple_coll_4': {'_id'}, 91 | 'admin_admin_coll_1': {'_id'} 92 | } 93 | 94 | def expected_row_counts(self): 95 | return { 96 | 'simple_db_simple_coll_1': 50, 97 | 'simple_db_simple_coll_2': 100, 98 | 'simple_db_simple_coll_3': 0, 99 | 'simple_db_simple_coll_4': 5, 100 | 'admin_admin_coll_1': 50 101 | } 102 | 103 | def expected_sync_streams(self): 104 | return { 105 | 'simple_db_simple_coll_1', 106 | 'simple_db_simple_coll_2', 107 | 'simple_db_simple_coll_3', 108 | 'simple_db_simple_coll_4', 109 | 'admin_admin_coll_1' 110 | } 111 | 112 | def name(self): 113 | return "tap_tester_mongodb_full_table" 114 | 115 | def tap_name(self): 116 | return "tap-mongodb" 117 | 118 | def get_type(self): 119 | return "platform.mongodb" 120 | 121 | def get_credentials(self): 122 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 123 | 124 | def get_properties(self): 125 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 126 | 'port' : os.getenv('TAP_MONGODB_PORT'), 127 | 'user' : os.getenv('TAP_MONGODB_USER'), 128 | 'database' : os.getenv('TAP_MONGODB_DBNAME'), 129 | 'include_schemas_in_destination_stream_name': 'true' 130 | } 131 | 132 
| def test_run(self): 133 | 134 | conn_id = connections.ensure_connection(self) 135 | 136 | # ------------------------------- 137 | # ----------- Discovery ---------- 138 | # ------------------------------- 139 | 140 | # run in discovery mode 141 | check_job_name = runner.run_check_mode(self, conn_id) 142 | 143 | # verify check exit codes 144 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 145 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 146 | 147 | # verify the tap discovered the right streams 148 | found_catalogs = menagerie.get_catalogs(conn_id) 149 | 150 | # assert we find the correct streams 151 | self.assertEqual(self.expected_check_streams(), 152 | {c['tap_stream_id'] for c in found_catalogs}) 153 | 154 | # ------------------------------------------- 155 | # ----------- First full Table Sync --------- 156 | # ------------------------------------------- 157 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 158 | for stream_catalog in found_catalogs: 159 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 160 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 161 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 162 | stream_catalog, 163 | annotated_schema, 164 | additional_md) 165 | 166 | # run full table sync 167 | sync_job_name = runner.run_sync_mode(self, conn_id) 168 | 169 | # check exit status 170 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 171 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 172 | 173 | # streams that we synced are the ones that we expect to see 174 | records_by_stream = runner.get_records_from_target_output() 175 | record_count_by_stream = runner.examine_target_output_file(self, 176 | conn_id, 177 | self.expected_sync_streams(), 178 | self.expected_pks()) 179 | 180 | # assert that we get the correct number of records for each stream 181 | self.assertEqual(self.expected_row_counts(),record_count_by_stream) 182 | 183 | # assert that an activate_version_message is first and last message sent for each stream 184 | for stream_name in self.expected_sync_streams(): 185 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 186 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 187 | 188 | state = menagerie.get_state(conn_id) 189 | 190 | first_versions = {} 191 | 192 | for tap_stream_id in self.expected_check_streams(): 193 | 194 | # state has an initial_full_table_complete == True 195 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 196 | 197 | # there is a version bookmark in state 198 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 199 | self.assertIsNotNone(first_versions[tap_stream_id]) 200 | 201 | # ------------------------------------------- 202 | # ----------- Second full Table Sync --------- 203 | # ------------------------------------------- 204 | with get_test_connection() as client: 205 | # update existing documents in the collection to make sure we get the updates as well in the next sync 206 | doc_to_update = client["simple_db"]["simple_coll_1"].find_one() 207 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 999}}) 208 | 209 | doc_to_update = client["simple_db"]["simple_coll_2"].find_one() 210 | 
client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 888}}) 211 | 212 | doc_to_update = client["admin"]["admin_coll_1"].find_one() 213 | client["admin"]["admin_coll_1"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 777}}) 214 | 215 | # add 2 rows and run full table again, make sure we get initial number + 2 216 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(2)) 217 | 218 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(2)) 219 | 220 | client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(2)) 221 | 222 | sync_job_name = runner.run_sync_mode(self, conn_id) 223 | 224 | # check exit status 225 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 226 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 227 | 228 | # verify the persisted schema was correct 229 | records_by_stream = runner.get_records_from_target_output() 230 | 231 | # assert that each of the streams that we synced are the ones that we expect to see 232 | record_count_by_stream = runner.examine_target_output_file(self, 233 | conn_id, 234 | self.expected_sync_streams(), 235 | self.expected_pks()) 236 | 237 | state = menagerie.get_state(conn_id) 238 | 239 | # Verify that menagerie state does not include a key for currently syncing 240 | self.assertIsNone(state['currently_syncing']) 241 | 242 | # Verify that menagerie state does not include a key for oplog based syncing 243 | self.assertNotIn('oplog', state) 244 | 245 | # assert that we have correct number of records (including the two new records and the update which is to be resynced) 246 | new_expected_row_counts = {k: v+2 for k, v in self.expected_row_counts().items() if k not in ['simple_db_simple_coll_3', 247 | 'simple_db_simple_coll_4']} 248 | new_expected_row_counts['simple_db_simple_coll_3']=0 249 | new_expected_row_counts['simple_db_simple_coll_4']=5 250 | self.assertEqual(new_expected_row_counts, record_count_by_stream) 251 | 252 | # assert that we only have an ActivateVersionMessage as the last message and not the first 253 | for stream_name in self.expected_sync_streams(): 254 | if len(records_by_stream[stream_name]['messages']) > 1: 255 | self.assertNotEqual('activate_version', records_by_stream[stream_name]['messages'][0]['action'], stream_name + "failed") 256 | self.assertEqual('upsert', records_by_stream[stream_name]['messages'][0]['action'], stream_name + "failed") 257 | self.assertEqual('activate_version', records_by_stream[stream_name]['messages'][-1]['action'], stream_name + "failed") 258 | 259 | second_versions = {} 260 | for tap_stream_id in self.expected_check_streams(): 261 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 262 | 263 | # state has an initial_full_table_complete == True 264 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 265 | 266 | # version bookmark 267 | second_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 268 | self.assertIsNotNone(second_versions[tap_stream_id]) 269 | 270 | # version in this state is different than that of the previous state 271 | self.assertNotEqual(first_versions[tap_stream_id], second_versions[tap_stream_id]) 272 | 273 | # version which is larger than the previous target version 274 | self.assertGreater(second_versions[tap_stream_id], first_versions[tap_stream_id]) 275 | 276 | # verify that menagerie state does include the version which 
matches the target version 277 | self.assertEqual(records_by_stream[self.tap_stream_id_to_stream()[tap_stream_id]]['table_version'], second_versions[tap_stream_id]) 278 | -------------------------------------------------------------------------------- /tests/test_mongodb_full_table_id.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import bson 3 | import os 4 | import random 5 | import string 6 | import time 7 | import unittest 8 | 9 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 10 | from tap_tester import connections, menagerie, runner 11 | 12 | 13 | RECORD_COUNT = {} 14 | 15 | 16 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 17 | return ''.join(random.choice(chars) for x in range(size)) 18 | 19 | def generate_simple_coll_docs(num_docs): 20 | docs = [] 21 | for int_value in range(num_docs): 22 | docs.append({"_id": int_value, "int_field": int_value, "string_field": random_string_generator()}) 23 | return docs 24 | 25 | def generate_simple_binary_coll_docs(num_docs): 26 | docs = [] 27 | for int_value in range(num_docs): 28 | docs.append({"_id": bson.Binary("test {}".format(int_value).encode()), "int_field": int_value, "string_field": random_string_generator()}) 29 | return docs 30 | 31 | 32 | class MongoDBFullTableID(unittest.TestCase): 33 | def setUp(self): 34 | ensure_environment_variables_set() 35 | 36 | with get_test_connection() as client: 37 | # drop all dbs/collections 38 | drop_all_collections(client) 39 | 40 | # simple_coll_1 has 50 documents, id is an integer instead of ObjectId 41 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 42 | 43 | # simple_coll_2 has 100 documents, id is an integer instead of ObjectId 44 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_binary_coll_docs(50)) 45 | 46 | def expected_check_streams(self): 47 | return { 48 | 'simple_db-simple_coll_1', 49 | 'simple_db-simple_coll_2' 50 | } 51 | 52 | def expected_pks(self): 53 | return { 54 | 'simple_coll_1': {'_id'}, 55 | 'simple_coll_2': {'_id'} 56 | } 57 | 58 | def expected_row_counts(self): 59 | return { 60 | 'simple_coll_1': 50, 61 | 'simple_coll_2': 50 62 | } 63 | 64 | def expected_sync_streams(self): 65 | return { 66 | 'simple_coll_1', 67 | 'simple_coll_2' 68 | } 69 | 70 | def name(self): 71 | return "tap_tester_mongodb_full_table_id" 72 | 73 | def tap_name(self): 74 | return "tap-mongodb" 75 | 76 | def get_type(self): 77 | return "platform.mongodb" 78 | 79 | def get_credentials(self): 80 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 81 | 82 | def get_properties(self): 83 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 84 | 'port' : os.getenv('TAP_MONGODB_PORT'), 85 | 'user' : os.getenv('TAP_MONGODB_USER'), 86 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 87 | } 88 | 89 | def test_run(self): 90 | 91 | conn_id = connections.ensure_connection(self) 92 | 93 | # ------------------------------- 94 | # ----------- Discovery ---------- 95 | # ------------------------------- 96 | 97 | # run in discovery mode 98 | check_job_name = runner.run_check_mode(self, conn_id) 99 | 100 | # verify check exit codes 101 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 102 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 103 | 104 | # verify the tap discovered the right streams 105 | found_catalogs = menagerie.get_catalogs(conn_id) 106 | 107 | # assert we find the 
correct streams 108 | self.assertEqual(self.expected_check_streams(), 109 | {c['tap_stream_id'] for c in found_catalogs}) 110 | 111 | for tap_stream_id in self.expected_check_streams(): 112 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 113 | 114 | # assert that the pks are correct 115 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 116 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 117 | 118 | # assert that the row counts are correct 119 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 120 | found_stream.get('metadata', {}).get('row-count')) 121 | 122 | # ----------------------------------- 123 | # ----------- Full Table Sync --------- 124 | # ----------------------------------- 125 | # select simple_coll_1 stream and add replication method metadata 126 | for stream_catalog in found_catalogs: 127 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 128 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 129 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 130 | stream_catalog, 131 | annotated_schema, 132 | additional_md) 133 | # synthesize interrupted state 134 | interrupted_state = { 135 | 'currently_syncing' : 'simple_db-simple_coll_1', 136 | 'bookmarks' : {'simple_db-simple_coll_1': { 'max_id_value': 49, 137 | 'max_id_type': 'int', 138 | 'initial_full_table_complete': False, 139 | 'last_id_fetched': 25, 140 | 'last_id_fetched_type': 'int', 141 | 'version': int(time.time() * 1000)}, 142 | 'simple_db-simple_coll_2': { 'max_id_value': base64.b64encode("test {}".format(49).encode()), 143 | 'max_id_type': 'bytes', 144 | 'initial_full_table_complete': False, 145 | 'last_id_fetched': base64.b64encode("test {}".format(25).encode()), 146 | 'last_id_fetched_type': 'bytes', 147 | 'version': int(time.time() * 1000)}}} 148 | 149 | # update existing documents in collection with int_field value less than 25, and verify they do not come up in the sync 150 | # update existing documents in collection with int_field value greater than 25, and verify they come up in the sync 151 | with get_test_connection() as client: 152 | # find_one() is going to retreive the first document in the collection 153 | doc_to_update_1 = client["simple_db"]["simple_coll_1"].find_one() 154 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}}) 155 | 156 | doc_to_update_2 = client["simple_db"]["simple_coll_2"].find_one() 157 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}}) 158 | 159 | doc_to_update_3 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 30}) 160 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}}) 161 | 162 | doc_to_update_4 = client["simple_db"]["simple_coll_2"].find_one({"int_field": 40}) 163 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}}) 164 | 165 | menagerie.set_state(conn_id, interrupted_state) 166 | runner.run_sync_mode(self, conn_id) 167 | 168 | # streams that we synced are the ones that we expect to see 169 | records_by_stream = runner.get_records_from_target_output() 170 | record_count_by_stream = runner.examine_target_output_file(self, 171 | conn_id, 172 | self.expected_sync_streams(), 173 | 
self.expected_pks()) 174 | 175 | # ActivateVersionMessage as the last message and not the first 176 | for stream_name in self.expected_sync_streams(): 177 | self.assertNotEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 178 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 179 | 180 | # _id of the first record sync'd for each stream is the bookmarked 181 | # last_id_fetched from the interrupted_state passed to the tap 182 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][0]['data']['_id'], 183 | int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['last_id_fetched'])) 184 | 185 | # _id of the last record sync'd for each stream is the bookmarked 186 | # max_id_value from the interrupted_state passed to the tap 187 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][-2]['data']['_id'], 188 | int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['max_id_value'])) 189 | 190 | # verify we are not seeing any documents which were updated having id < 25 191 | self.assertNotEqual(999, records_by_stream['simple_coll_1']['messages'][0]['data']['int_field']) 192 | self.assertNotEqual(888, records_by_stream['simple_coll_2']['messages'][0]['data']['int_field']) 193 | 194 | int_value = False 195 | for x in records_by_stream['simple_coll_1']['messages'][:-1]: 196 | # We are not considering the last element of this list because it does not have 'data' 197 | if int(x['data']['int_field']) == 999: 198 | int_value = True 199 | self.assertEqual(False, int_value) 200 | 201 | int_value2 = False 202 | for x in records_by_stream['simple_coll_2']['messages'][:-1]: 203 | if x['data']['int_field'] == 888: 204 | int_value2 = True 205 | self.assertEqual(False, int_value2) 206 | 207 | # verify we are seeing the documents which were updated having id > 25 208 | # we are picking the 5th and 15th element in the list because we updated the 30th and 40th document, (doc starting with 25) 209 | self.assertEqual(777, records_by_stream['simple_coll_1']['messages'][5]['data']['int_field']) 210 | self.assertEqual(666, records_by_stream['simple_coll_2']['messages'][15]['data']['int_field']) 211 | 212 | # assert that final state has no last_id_fetched and max_id_value bookmarks 213 | final_state = menagerie.get_state(conn_id) 214 | for tap_stream_id in self.expected_check_streams(): 215 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('last_id_fetched')) 216 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('max_id_value')) 217 | -------------------------------------------------------------------------------- /tests/test_mongodb_full_table_interruptible.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | import random 4 | import string 5 | import time 6 | import unittest 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBFullTableInterruptible(unittest.TestCase): 25 | def
setUp(self): 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | # drop all dbs/collections 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # simple_coll_2 has 100 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 37 | 38 | def expected_check_streams(self): 39 | return { 40 | 'simple_db-simple_coll_1', 41 | 'simple_db-simple_coll_2', 42 | } 43 | 44 | def expected_pks(self): 45 | return { 46 | 'simple_coll_1': {'_id'}, 47 | 'simple_coll_2': {'_id'}, 48 | } 49 | 50 | def expected_row_counts(self): 51 | return { 52 | 'simple_coll_1': 25, 53 | 'simple_coll_2': 50, 54 | } 55 | 56 | def expected_sync_streams(self): 57 | return { 58 | 'simple_coll_1', 59 | 'simple_coll_2' 60 | } 61 | 62 | def name(self): 63 | return "tap_tester_mongodb_full_table_interruptible" 64 | 65 | def tap_name(self): 66 | return "tap-mongodb" 67 | 68 | def get_type(self): 69 | return "platform.mongodb" 70 | 71 | def get_credentials(self): 72 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 73 | 74 | def get_properties(self): 75 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 76 | 'port' : os.getenv('TAP_MONGODB_PORT'), 77 | 'user' : os.getenv('TAP_MONGODB_USER'), 78 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 79 | } 80 | 81 | def test_run(self): 82 | 83 | conn_id = connections.ensure_connection(self) 84 | 85 | # ------------------------------- 86 | # ----------- Discovery ---------- 87 | # ------------------------------- 88 | 89 | # run in discovery mode 90 | check_job_name = runner.run_check_mode(self, conn_id) 91 | 92 | # verify check exit codes 93 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 94 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 95 | 96 | # verify the tap discovered the right streams 97 | found_catalogs = menagerie.get_catalogs(conn_id) 98 | 99 | # assert we find the correct streams 100 | self.assertEqual(self.expected_check_streams(), 101 | {c['tap_stream_id'] for c in found_catalogs}) 102 | 103 | # ----------------------------------- 104 | # ----------- Full Table Sync --------- 105 | # ----------------------------------- 106 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 107 | for stream_catalog in found_catalogs: 108 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 109 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 110 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 111 | stream_catalog, 112 | annotated_schema, 113 | additional_md) 114 | # Synthesize interrupted state 115 | interrupted_state = { 116 | 'currently_syncing' : 'simple_db-simple_coll_1', 117 | 'bookmarks' : {} 118 | } 119 | 120 | versions = {} 121 | with get_test_connection() as client: 122 | for stream_name in self.expected_sync_streams(): 123 | rows = [x for x in client['simple_db'][stream_name].find(sort=[("_id", pymongo.ASCENDING)])] 124 | # set last_id_fetched to middle point of table 125 | last_id_fetched = str(rows[int(len(rows)/2)]['_id']) 126 | max_id_value = str(rows[-1]['_id']) 127 | 128 | tap_stream_id = 'simple_db-'+stream_name 129 | version = int(time.time() * 1000) 130 | interrupted_state['bookmarks'][tap_stream_id] = { 131 | 'max_id_value': max_id_value, 132 | 'max_id_type': 'ObjectId', 133 | 
'initial_full_table_complete': False, 134 | 'last_id_fetched': last_id_fetched, 135 | 'last_id_fetched_type': 'ObjectId', 136 | 'version': version 137 | } 138 | versions[tap_stream_id] = version 139 | 140 | # update existing documents that fall before the interrupted midpoint of each collection, and verify they do not come up in the sync 141 | # update existing documents that fall after the interrupted midpoint of each collection, and verify they do come up in the sync 142 | 143 | # find_one() is going to retrieve the first document in the collection 144 | doc_to_update_1 = client["simple_db"]["simple_coll_1"].find_one() 145 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}}) 146 | 147 | doc_to_update_2 = client["simple_db"]["simple_coll_2"].find_one() 148 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}}) 149 | 150 | doc_to_update_3 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 30}) 151 | client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}}) 152 | 153 | doc_to_update_4 = client["simple_db"]["simple_coll_2"].find_one({"int_field": 80}) 154 | client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}}) 155 | 156 | 157 | menagerie.set_state(conn_id, interrupted_state) 158 | 159 | runner.run_sync_mode(self, conn_id) 160 | 161 | # streams that we synced are the ones that we expect to see 162 | record_count_by_stream = runner.examine_target_output_file(self, 163 | conn_id, 164 | self.expected_sync_streams(), 165 | self.expected_pks()) 166 | 167 | # record counts 168 | records_by_stream = runner.get_records_from_target_output() 169 | self.assertEqual(self.expected_row_counts(), record_count_by_stream) 170 | 171 | # ActivateVersionMessage as the last message and not the first 172 | for stream_name in self.expected_sync_streams(): 173 | self.assertNotEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 174 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action']) 175 | 176 | # _id of the first record sync'd for each stream is the bookmarked 177 | # last_id_fetched from the interrupted_state passed to the tap 178 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][0]['data']['_id'], 179 | interrupted_state['bookmarks']['simple_db-simple_coll_1']['last_id_fetched']) 180 | self.assertEqual(records_by_stream['simple_coll_2']['messages'][0]['data']['_id'], 181 | interrupted_state['bookmarks']['simple_db-simple_coll_2']['last_id_fetched']) 182 | 183 | # _id of the last record sync'd for each stream is the bookmarked 184 | # max_id_value from the interrupted_state passed to the tap 185 | self.assertEqual(records_by_stream['simple_coll_1']['messages'][-2]['data']['_id'], 186 | interrupted_state['bookmarks']['simple_db-simple_coll_1']['max_id_value']) 187 | self.assertEqual(records_by_stream['simple_coll_2']['messages'][-2]['data']['_id'], 188 | interrupted_state['bookmarks']['simple_db-simple_coll_2']['max_id_value']) 189 | 190 | # verify we are not seeing any documents which were updated having id < interrupted id value 191 | # checking just the first document value 192 | self.assertNotEqual(999, records_by_stream['simple_coll_1']['messages'][0]['data']['int_field']) 193 | self.assertNotEqual(888, records_by_stream['simple_coll_2']['messages'][0]['data']['int_field']) 194
| # verify the update to the pre-midpoint document is not present in any synced record of simple_coll_1 195 | int_value = False 196 | for x in records_by_stream['simple_coll_1']['messages'][:-1]: 197 | # We are not considering the last element of this list because it does not have 'data' 198 | if int(x['data']['int_field']) == 999: 199 | int_value = True 200 | self.assertEqual(False, int_value) 201 | # verify the update to the pre-midpoint document is not present in any synced record of simple_coll_2 202 | int_value2 = False 203 | for x in records_by_stream['simple_coll_2']['messages'][:-1]: 204 | if x['data']['int_field'] == 888: 205 | int_value2 = True 206 | self.assertEqual(False, int_value2) 207 | 208 | # verify we are seeing the documents which were updated having id > interrupted id value 209 | # we pick the 5th element of simple_coll_1 (resumes at doc 25, doc 30 was updated) and the 30th element of simple_coll_2 (resumes at doc 50, doc 80 was updated) 210 | self.assertEqual(777, records_by_stream['simple_coll_1']['messages'][5]['data']['int_field']) 211 | self.assertEqual(666, records_by_stream['simple_coll_2']['messages'][30]['data']['int_field']) 212 | 213 | # assert that final state has no last_id_fetched and max_id_value bookmarks 214 | final_state = menagerie.get_state(conn_id) 215 | for tap_stream_id in self.expected_check_streams(): 216 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('last_id_fetched')) 217 | self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('max_id_value')) 218 | 219 | state = menagerie.get_state(conn_id) 220 | for tap_stream_id, stream_bookmarks in state.get('bookmarks', {}).items(): 221 | self.assertTrue(stream_bookmarks.get('initial_full_table_complete', False)) 222 | -------------------------------------------------------------------------------- /tests/test_mongodb_id_pk_variations.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import datetime 3 | import decimal 4 | import os 5 | import random 6 | import string 7 | import unittest 8 | from bson.decimal128 import Decimal128 9 | 10 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 11 | from tap_tester import connections, menagerie, runner 12 | 13 | 14 | RECORD_COUNT = {} 15 | 16 | replication_method = ["INCREMENTAL", "FULL_TABLE", "LOG_BASED"] 17 | 18 | 19 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 20 | return ''.join(random.choice(chars) for x in range(size)) 21 | 22 | 23 | def generate_docs_no_id(num_docs): 24 | docs = [] 25 | for int_value in range(num_docs): 26 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 27 | return docs 28 | 29 | 30 | def generate_docs_int_id(num_docs): 31 | docs = [] 32 | for int_value in range(num_docs): 33 | docs.append({"_id": int_value, "string_field": random_string_generator()}) 34 | return docs 35 | 36 | 37 | def generate_docs_double_id(): 38 | docs = [] 39 | docs.append({"_id": 546.43, "string_field": random_string_generator()}) 40 | docs.append({"_id": 555.56, "string_field": random_string_generator()}) 41 | return docs 42 | 43 | 44 | def generate_docs_string_id(): 45 | docs = [] 46 | docs.append({"_id": 'primary_key', "string_field": random_string_generator()}) 47 | docs.append({"_id": 'secondary_key', "string_field": random_string_generator()}) 48 | return docs 49 | 50 | 51 | def generate_docs_binary_id(): 52 | docs = [] 53 | docs.append({"_id": 0b10101011, "string_field": random_string_generator()}) 54 | docs.append({"_id": 0b10101000,
"string_field": random_string_generator()}) 55 | return docs 56 | 57 | 58 | def generate_docs_boolean_id(): 59 | docs = [] 60 | docs.append({"_id": True, "string_field": random_string_generator()}) 61 | docs.append({"_id": False, "string_field": random_string_generator()}) 62 | return docs 63 | 64 | 65 | def generate_docs_date_id(): 66 | docs = [] 67 | d1 = datetime.datetime.utcnow() - datetime.timedelta(days=1) 68 | d2 = datetime.datetime.utcnow() 69 | docs.append({"_id": d1, "string_field": random_string_generator()}) 70 | docs.append({"_id": d2, "string_field": random_string_generator()}) 71 | return docs 72 | 73 | 74 | def generate_docs_32_bit_int_id(): 75 | docs = [] 76 | docs.append({'_id': 2147483640, 'string_field': random_string_generator()}) 77 | docs.append({'_id': 2147483620, 'string_field': random_string_generator()}) 78 | return docs 79 | 80 | 81 | def generate_docs_64_bit_int_id(): 82 | docs = [] 83 | docs.append({'_id': 9223372036854775800, 'string_field': random_string_generator()}) 84 | docs.append({'_id': 9223372036854775799, 'string_field': random_string_generator()}) 85 | return docs 86 | 87 | 88 | def generate_docs_128_decimal_id(): 89 | docs = [] 90 | docs.append({'_id': bson.Decimal128(decimal.Decimal('1.34')), 'string_field': random_string_generator()}) 91 | docs.append({'_id': bson.Decimal128(decimal.Decimal('2.34')), 'string_field': random_string_generator()}) 92 | return docs 93 | 94 | 95 | class MongoDbPrimaryKeyIdVariation(unittest.TestCase): 96 | 97 | def setUp(self): 98 | ensure_environment_variables_set() 99 | 100 | with get_test_connection() as client: 101 | # drop all dbs/collections 102 | drop_all_collections(client) 103 | 104 | # create collections for all the different variants for _id 105 | client["simple_db"]["coll_with_no_id"].insert_many(generate_docs_no_id(5)) 106 | client["simple_db"]["coll_with_int_id"].insert_many(generate_docs_int_id(5)) 107 | client["simple_db"]["coll_with_double_id"].insert_many(generate_docs_double_id()) 108 | client["simple_db"]["coll_with_string_id"].insert_many(generate_docs_string_id()) 109 | client["simple_db"]["coll_with_binary_id"].insert_many(generate_docs_binary_id()) 110 | client["simple_db"]["coll_with_date_id"].insert_many(generate_docs_date_id()) 111 | client["simple_db"]["coll_with_32_bit_int_id"].insert_many(generate_docs_32_bit_int_id()) 112 | client["simple_db"]["coll_with_64_bit_int_id"].insert_many(generate_docs_64_bit_int_id()) 113 | 114 | def expected_check_streams(self): 115 | return { 116 | 'simple_db-coll_with_no_id', 117 | 'simple_db-coll_with_int_id', 118 | 'simple_db-coll_with_double_id', 119 | 'simple_db-coll_with_string_id', 120 | 'simple_db-coll_with_binary_id', 121 | 'simple_db-coll_with_date_id', 122 | 'simple_db-coll_with_32_bit_int_id', 123 | 'simple_db-coll_with_64_bit_int_id' 124 | } 125 | 126 | def expected_pks(self): 127 | return { 128 | 'coll_with_no_id': {'_id'}, 129 | 'coll_with_int_id': {'_id'}, 130 | 'coll_with_double_id': {'_id'}, 131 | 'coll_with_string_id': {'_id'}, 132 | 'coll_with_binary_id': {'_id'}, 133 | 'coll_with_date_id': {'_id'}, 134 | 'coll_with_32_bit_int_id': {'_id'}, 135 | 'coll_with_64_bit_int_id': {'_id'} 136 | } 137 | 138 | def expected_sync_streams(self): 139 | return { 140 | 'coll_with_no_id', 141 | 'coll_with_int_id', 142 | 'coll_with_double_id', 143 | 'coll_with_string_id', 144 | 'coll_with_binary_id', 145 | 'coll_with_date_id', 146 | 'coll_with_32_bit_int_id', 147 | 'coll_with_64_bit_int_id' 148 | } 149 | 150 | def expected_record_count(self): 151 | return 
{'coll_with_double_id': 2, 152 | 'coll_with_32_bit_int_id': 2, 153 | 'coll_with_64_bit_int_id': 2, 154 | 'coll_with_no_id': 5, 155 | 'coll_with_binary_id': 2, 156 | 'coll_with_string_id': 2, 157 | 'coll_with_date_id': 2, 158 | 'coll_with_int_id': 5 159 | } 160 | 161 | def expected_pk_values(self): 162 | return { 163 | 'coll_with_string_id': ['primary_key', 'secondary_key'], 164 | 'coll_with_binary_id': [171, 168], 165 | 'coll_with_no_id': [], 166 | 'coll_with_64_bit_int_id': [9223372036854775800, 9223372036854775799], 167 | 'coll_with_int_id': [0, 1, 2, 3, 4], 168 | 'coll_with_32_bit_int_id': [2147483640, 2147483620], 169 | 'coll_with_date_id': [datetime.datetime.utcnow() - datetime.timedelta(days=1), datetime.datetime.utcnow()], 170 | 'coll_with_double_id': [decimal.Decimal('546.43'), decimal.Decimal('555.56')] 171 | } 172 | 173 | def expected_pk_datatype(self): 174 | return { 175 | 'coll_with_string_id': str, 176 | 'coll_with_binary_id': int, 177 | 'coll_with_no_id': [], 178 | 'coll_with_64_bit_int_id': int, 179 | 'coll_with_int_id': int, 180 | 'coll_with_32_bit_int_id': int, 181 | 'coll_with_date_id': [datetime.datetime.utcnow() - datetime.timedelta(days=1), datetime.datetime.utcnow()], 182 | 'coll_with_double_id': decimal.Decimal 183 | } 184 | 185 | def name(self): 186 | return "tap_tester_mongodb_id_pk_variations" 187 | 188 | def tap_name(self): 189 | return "tap-mongodb" 190 | 191 | def get_type(self): 192 | return "platform.mongodb" 193 | 194 | def get_credentials(self): 195 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 196 | 197 | def get_properties(self): 198 | return {'host': os.getenv('TAP_MONGODB_HOST'), 199 | 'port': os.getenv('TAP_MONGODB_PORT'), 200 | 'user': os.getenv('TAP_MONGODB_USER'), 201 | 'database': os.getenv('TAP_MONGODB_DBNAME') 202 | } 203 | 204 | def test_run(self): 205 | ''' 206 | Running the test with all the available replication methods 207 | ''' 208 | 209 | for replication in replication_method: 210 | if replication != 'INCREMENTAL': 211 | additional_metadata = [{"breadcrumb": [], "metadata": {'replication-method': replication}}] 212 | else: 213 | additional_metadata = [{"breadcrumb": [], "metadata": {'replication-method': replication, 'replication-key': '_id'}}] 214 | self.run_test(additional_metadata) 215 | 216 | def run_test(self, additional_metadata): 217 | 218 | conn_id = connections.ensure_connection(self) 219 | 220 | # ------------------------------- 221 | # ----------- Discovery ---------- 222 | # ------------------------------- 223 | 224 | # run in discovery mode 225 | check_job_name = runner.run_check_mode(self, conn_id) 226 | 227 | # verify check exit codes 228 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 229 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 230 | 231 | # verify the tap discovered the right streams 232 | found_catalogs = menagerie.get_catalogs(conn_id) 233 | 234 | # assert we find the correct streams 235 | self.assertEqual(self.expected_check_streams(), 236 | {c['tap_stream_id'] for c in found_catalogs}) 237 | 238 | # ----------------------------------- 239 | # -----------Initial Full Table Sync --------- 240 | # ----------------------------------- 241 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 242 | for stream_catalog in found_catalogs: 243 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 244 | additional_md = additional_metadata 245 | selected_metadata = 
connections.select_catalog_and_fields_via_metadata(conn_id, 246 | stream_catalog, 247 | annotated_schema, 248 | additional_md) 249 | # verify _id is marked in metadata as table-key-property 250 | self.assertEqual(stream_catalog['metadata']['table-key-properties'][0], '_id') 251 | 252 | runner.run_sync_mode(self, conn_id) 253 | 254 | # streams that we synced are the ones that we expect to see 255 | record_count_by_stream = runner.examine_target_output_file(self, 256 | conn_id, 257 | self.expected_sync_streams(), 258 | self.expected_pks()) 259 | 260 | records_by_stream = runner.get_records_from_target_output() 261 | 262 | # verify if we are capturing all the data for all the streams 263 | self.assertEqual(record_count_by_stream, self.expected_record_count()) 264 | 265 | # verify the values of primary key and the datatype in the replicated records 266 | for stream in records_by_stream.keys(): 267 | if stream not in ['coll_with_date_id', 'coll_with_no_id']: 268 | for records in [rec['data'] for rec in records_by_stream[stream]['messages'] if rec.get('action') == 'upsert']: 269 | self.assertIn(records['_id'], self.expected_pk_values()[stream]) 270 | self.assertIsInstance(records['_id'], self.expected_pk_datatype()[stream]) 271 | -------------------------------------------------------------------------------- /tests/test_mongodb_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | import random 4 | import string 5 | import unittest 6 | from pymongo import ASCENDING 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | populated_string_fields = {f"string_field_{i}": random_string_generator() for i in range(1, 64)} 21 | for int_value in range(num_docs): 22 | docs.append({"int_field": int_value, **populated_string_fields}) 23 | return docs 24 | 25 | class MongoDBOplog(unittest.TestCase): 26 | def setUp(self): 27 | 28 | ensure_environment_variables_set() 29 | 30 | with get_test_connection() as client: 31 | ############# Drop all dbs/collections ############# 32 | drop_all_collections(client) 33 | 34 | ############# Add simple collections ############# 35 | # simple_coll_1 has 50 documents 36 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 37 | 38 | # simple_coll_2 has 100 documents 39 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 40 | 41 | for index in self.expected_string_fields(): 42 | client["simple_db"]["simple_coll_1"].create_index(index) 43 | 44 | # # max 32 fields in a compound index (NO PLANS TO SUPPORT THIS IN THE TAP) 45 | # client["simple_db"]["simple_coll_1"].create_index([ 46 | # ('string_field', pymongo.ASCENDING), ('string_field_02', pymongo.ASCENDING), 47 | # ('string_field_03', pymongo.ASCENDING), ('string_field_04', pymongo.ASCENDING), 48 | # ('string_field_05', pymongo.ASCENDING), ('string_field_06', pymongo.ASCENDING), 49 | # ('string_field_07', pymongo.ASCENDING), ('string_field_08', pymongo.ASCENDING), 50 | # ('string_field_09', pymongo.ASCENDING), ('string_field_10', pymongo.ASCENDING), 51 | # ('string_field_11', pymongo.ASCENDING), ('string_field_12', 
pymongo.ASCENDING), 52 | # ('string_field_13', pymongo.ASCENDING), ('string_field_14', pymongo.ASCENDING), 53 | # ('string_field_15', pymongo.ASCENDING), ('string_field_16', pymongo.ASCENDING), 54 | # ('string_field_17', pymongo.ASCENDING), ('string_field_18', pymongo.ASCENDING), 55 | # ('string_field_19', pymongo.ASCENDING), ('string_field_20', pymongo.ASCENDING), 56 | # ('string_field_21', pymongo.ASCENDING), ('string_field_22', pymongo.ASCENDING), 57 | # ('string_field_23', pymongo.ASCENDING), ('string_field_24', pymongo.ASCENDING), 58 | # ('string_field_25', pymongo.ASCENDING), ('string_field_26', pymongo.ASCENDING), 59 | # ('string_field_27', pymongo.ASCENDING), ('string_field_28', pymongo.ASCENDING), 60 | # ('string_field_29', pymongo.ASCENDING), ('string_field_30', pymongo.ASCENDING), 61 | # ('string_field_31', pymongo.ASCENDING), ('string_field_32', pymongo.ASCENDING)]) 62 | 63 | self.index_info = client["simple_db"]["simple_coll_1"].index_information() 64 | 65 | def expected_check_streams(self): 66 | return { 67 | 'simple_db-simple_coll_1', 68 | 'simple_db-simple_coll_2', 69 | } 70 | 71 | def expected_pks(self): 72 | return { 73 | 'simple_coll_1': {'_id'}, 74 | 'simple_coll_2': {'_id'}, 75 | } 76 | 77 | def expected_row_counts(self): 78 | return { 79 | 'simple_coll_1': 50, 80 | 'simple_coll_2': 100, 81 | } 82 | 83 | def expected_sync_streams(self): 84 | return { 85 | 'simple_coll_1', 86 | 'simple_coll_2' 87 | } 88 | 89 | def name(self): 90 | return "tap_tester_mongodb_index" 91 | 92 | def tap_name(self): 93 | return "tap-mongodb" 94 | 95 | def get_type(self): 96 | return "platform.mongodb" 97 | 98 | def get_credentials(self): 99 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 100 | 101 | def get_properties(self): 102 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 103 | 'port' : os.getenv('TAP_MONGODB_PORT'), 104 | 'user' : os.getenv('TAP_MONGODB_USER'), 105 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 106 | } 107 | 108 | def expected_string_fields(self): 109 | # Max index count = 64. 
63 strings + '_id' 110 | return {f"string_field_{i}" for i in range(1, 64)} 111 | 112 | 113 | def test_run(self): 114 | 115 | conn_id = connections.ensure_connection(self) 116 | 117 | # ----------------------------------- 118 | # ----------- Discovery ------------ 119 | # ----------------------------------- 120 | 121 | # run in discovery mode 122 | check_job_name = runner.run_check_mode(self, conn_id) 123 | 124 | # verify check exit codes 125 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 126 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 127 | 128 | # verify the tap discovered the right streams 129 | found_catalogs = menagerie.get_catalogs(conn_id) 130 | 131 | # assert we find the correct streams 132 | self.assertEqual(self.expected_check_streams(), 133 | {c['tap_stream_id'] for c in found_catalogs}) 134 | 135 | for tap_stream_id in self.expected_check_streams(): 136 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 137 | 138 | # assert that the pks are correct 139 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 140 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 141 | 142 | # assert that the row counts are correct 143 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 144 | found_stream.get('metadata', {}).get('row-count')) 145 | 146 | # no plans for tap to support compound index, may not appear in valid-replication-keys list 147 | discovered_replication_keys = found_catalogs[0]['metadata']['valid-replication-keys'] 148 | for field in self.expected_string_fields(): 149 | self.assertIn(field, discovered_replication_keys) 150 | self.assertIn('_id', discovered_replication_keys) 151 | self.assertEqual(64, len(discovered_replication_keys)) 152 | -------------------------------------------------------------------------------- /tests/test_mongodb_oplog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | from bson import ObjectId 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | class MongoDBOplog(unittest.TestCase): 24 | def setUp(self): 25 | 26 | ensure_environment_variables_set() 27 | 28 | with get_test_connection() as client: 29 | ############# Drop all dbs/collections ############# 30 | drop_all_collections(client) 31 | 32 | ############# Add simple collections ############# 33 | # simple_coll_1 has 50 documents 34 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 35 | 36 | # simple_coll_2 has 100 documents 37 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 38 | 39 | 40 | 41 | 42 | def expected_check_streams(self): 43 | return { 44 | 'simple_db-simple_coll_1', 45 | 'simple_db-simple_coll_2', 46 | } 47 | 48 | def expected_pks(self): 49 | return { 50 | 'simple_coll_1': {'_id'}, 51 | 'simple_coll_2': {'_id'}, 52 | } 53 | 54 | def expected_row_counts(self): 
55 | return { 56 | 'simple_coll_1': 50, 57 | 'simple_coll_2': 100, 58 | } 59 | 60 | 61 | def expected_sync_streams(self): 62 | return { 63 | 'simple_coll_1', 64 | 'simple_coll_2' 65 | } 66 | 67 | def name(self): 68 | return "tap_tester_mongodb_oplog" 69 | 70 | def tap_name(self): 71 | return "tap-mongodb" 72 | 73 | def get_type(self): 74 | return "platform.mongodb" 75 | 76 | def get_credentials(self): 77 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 78 | 79 | def get_properties(self): 80 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 81 | 'port' : os.getenv('TAP_MONGODB_PORT'), 82 | 'user' : os.getenv('TAP_MONGODB_USER'), 83 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 84 | } 85 | 86 | 87 | def test_run(self): 88 | 89 | conn_id = connections.ensure_connection(self) 90 | 91 | # ------------------------------- 92 | # ----------- Discovery ---------- 93 | # ------------------------------- 94 | 95 | # run in discovery mode 96 | check_job_name = runner.run_check_mode(self, conn_id) 97 | 98 | # verify check exit codes 99 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 100 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 101 | 102 | # verify the tap discovered the right streams 103 | found_catalogs = menagerie.get_catalogs(conn_id) 104 | 105 | # assert we find the correct streams 106 | self.assertEqual(self.expected_check_streams(), 107 | {c['tap_stream_id'] for c in found_catalogs}) 108 | 109 | 110 | 111 | for tap_stream_id in self.expected_check_streams(): 112 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 113 | 114 | # assert that the pks are correct 115 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 116 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 117 | 118 | # assert that the row counts are correct 119 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 120 | found_stream.get('metadata', {}).get('row-count')) 121 | 122 | # ----------------------------------- 123 | # ----------- Initial Full Table --------- 124 | # ----------------------------------- 125 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 126 | for stream_catalog in found_catalogs: 127 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 128 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 129 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 130 | stream_catalog, 131 | annotated_schema, 132 | additional_md) 133 | 134 | # Run sync 135 | sync_job_name = runner.run_sync_mode(self, conn_id) 136 | 137 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 138 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 139 | 140 | 141 | # verify the persisted schema was correct 142 | records_by_stream = runner.get_records_from_target_output() 143 | 144 | # assert that each of the streams that we synced are the ones that we expect to see 145 | record_count_by_stream = runner.examine_target_output_file(self, 146 | conn_id, 147 | self.expected_sync_streams(), 148 | self.expected_pks()) 149 | 150 | # Verify that the full table was synced 151 | for tap_stream_id in self.expected_sync_streams(): 152 | self.assertGreaterEqual(record_count_by_stream[tap_stream_id],self.expected_row_counts()[tap_stream_id]) 153 | 154 | # Verify that we have 'initial_full_table_complete' bookmark 155 | state = 
menagerie.get_state(conn_id) 156 | first_versions = {} 157 | 158 | for tap_stream_id in self.expected_check_streams(): 159 | # assert that the state has an initial_full_table_complete == True 160 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 161 | # assert that there is a version bookmark in state 162 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 163 | self.assertIsNotNone(first_versions[tap_stream_id]) 164 | # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark 165 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 166 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 167 | 168 | 169 | changed_ids = set() 170 | with get_test_connection() as client: 171 | # Delete two documents for each collection 172 | 173 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 0})[0]['_id']) 174 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0}) 175 | 176 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 1})[0]['_id']) 177 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1}) 178 | 179 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 0})[0]['_id']) 180 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) 181 | 182 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 1})[0]['_id']) 183 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) 184 | 185 | # Update two documents for each collection 186 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 48})[0]['_id']) 187 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 48},{'$set': {'int_field': -1}}) 188 | 189 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 49})[0]['_id']) 190 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 49},{'$set': {'int_field': -1}}) 191 | 192 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 98})[0]['_id']) 193 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 194 | 195 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 99})[0]['_id']) 196 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 197 | 198 | # Insert two documents for each collection 199 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 50, "string_field": random_string_generator()}) 200 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 50})[0]['_id']) 201 | 202 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 51, "string_field": random_string_generator()}) 203 | changed_ids.add(client['simple_db']['simple_coll_1'].find({'int_field': 51})[0]['_id']) 204 | 205 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 206 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 100})[0]['_id']) 207 | 208 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 209 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 101})[0]['_id']) 210 | 211 | # ----------------------------------- 212 | # ----------- Subsequent Oplog Sync --------- 213 | # ----------------------------------- 214 | 215 | # Run sync 216 | 217 | sync_job_name = runner.run_sync_mode(self, conn_id) 218 | 219 | exit_status = 
menagerie.get_exit_status(conn_id, sync_job_name) 220 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 221 | 222 | 223 | # verify the persisted schema was correct 224 | messages_by_stream = runner.get_records_from_target_output() 225 | records_by_stream = {} 226 | for stream_name in self.expected_sync_streams(): 227 | records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 228 | 229 | 230 | # assert that each of the streams that we synced are the ones that we expect to see 231 | record_count_by_stream = runner.examine_target_output_file(self, 232 | conn_id, 233 | self.expected_sync_streams(), 234 | self.expected_pks()) 235 | 236 | # Verify that we got at least 6 records due to changes 237 | # (could be more due to overlap in gte oplog clause) 238 | for k,v in record_count_by_stream.items(): 239 | self.assertGreaterEqual(v, 6) 240 | 241 | # Verify that we got 2 records with _SDC_DELETED_AT 242 | for stream in self.expected_sync_streams(): 243 | self.assertEqual(2, len([x['data'] for x in records_by_stream[stream] 244 | if x['data'].get('_sdc_deleted_at')])) 245 | 246 | # Verify that the _id of the records sent are the same set as the 247 | # _ids of the documents changed 248 | actual_ids = {ObjectId(x['data']['_id']) for stream in self.expected_sync_streams() 249 | for x in records_by_stream[stream]} 250 | self.assertEqual(changed_ids, actual_ids) 251 | -------------------------------------------------------------------------------- /tests/test_mongodb_oplog_aged_out.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import time 5 | import unittest 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | class MongoDBOplogAgedOut(unittest.TestCase): 24 | def setUp(self): 25 | ensure_environment_variables_set() 26 | 27 | with get_test_connection() as client: 28 | ############# Drop all dbs/collections ############# 29 | drop_all_collections(client) 30 | 31 | ############# Add simple collections ############ 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | 36 | 37 | def expected_check_streams(self): 38 | return { 39 | 'simple_db-simple_coll_1' 40 | } 41 | 42 | def expected_pks(self): 43 | return { 44 | 'simple_coll_1': {'_id'} 45 | } 46 | 47 | def expected_row_counts(self): 48 | return { 49 | 'simple_coll_1': 50 50 | } 51 | 52 | 53 | def expected_sync_streams(self): 54 | return { 55 | 'simple_coll_1' 56 | } 57 | 58 | def name(self): 59 | return "tap_tester_mongodb_oplog_aged_out" 60 | 61 | def tap_name(self): 62 | return "tap-mongodb" 63 | 64 | def get_type(self): 65 | return "platform.mongodb" 66 | 67 | def get_credentials(self): 68 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 69 | 70 | def get_properties(self): 71 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 72 | 'port' : os.getenv('TAP_MONGODB_PORT'), 73 | 
'user' : os.getenv('TAP_MONGODB_USER'), 74 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 75 | } 76 | 77 | 78 | def test_run(self): 79 | 80 | conn_id = connections.ensure_connection(self) 81 | 82 | # ------------------------------- 83 | # ----------- Discovery ---------- 84 | # ------------------------------- 85 | 86 | # run in discovery mode 87 | check_job_name = runner.run_check_mode(self, conn_id) 88 | 89 | # verify check exit codes 90 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 91 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 92 | 93 | # verify the tap discovered the right streams 94 | found_catalogs = menagerie.get_catalogs(conn_id) 95 | 96 | # assert we find the correct streams 97 | self.assertEqual(self.expected_check_streams(), 98 | {c['tap_stream_id'] for c in found_catalogs}) 99 | 100 | 101 | 102 | for tap_stream_id in self.expected_check_streams(): 103 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 104 | 105 | # assert that the pks are correct 106 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 107 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 108 | 109 | # assert that the row counts are correct 110 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 111 | found_stream.get('metadata', {}).get('row-count')) 112 | 113 | # ----------------------------------- 114 | # ----------- Full Table Sync --------- 115 | # ----------------------------------- 116 | # Select the simple_coll_1 stream and add replication method metadata 117 | for stream_catalog in found_catalogs: 118 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 119 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 120 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 121 | stream_catalog, 122 | annotated_schema, 123 | additional_md) 124 | # Synthesize interrupted state 125 | original_version = int(time.time() * 1000) 126 | interrupted_state = { 127 | 'currently_syncing' : 'simple_db-simple_coll_1', 128 | 'bookmarks' : { 129 | 'simple_db-simple_coll_1': { 130 | 'version': original_version, 131 | 'initial_full_table_complete': True, 132 | 'oplog_ts_time': 1, 133 | 'oplog_ts_inc': 0 134 | } 135 | } 136 | } 137 | 138 | menagerie.set_state(conn_id, interrupted_state) 139 | 140 | # The tap should detect that the oplog bookmark has aged out and execute a full resync 141 | runner.run_sync_mode(self, conn_id) 142 | 143 | # verify the persisted schema was correct 144 | records_by_stream = runner.get_records_from_target_output() 145 | 146 | # assert that each of the streams that we synced are the ones that we expect to see 147 | record_count_by_stream = runner.examine_target_output_file(self, 148 | conn_id, 149 | self.expected_sync_streams(), 150 | self.expected_pks()) 151 | 152 | # assert that the resync emits an ActivateVersionMessage as both the first and the last message 153 | for stream_name in self.expected_sync_streams(): 154 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action']) 155 | self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][51]['action']) 156 | 157 | 158 | # assert that the resync assigned a new table version 159 | final_state = menagerie.get_state(conn_id) 160 | self.assertNotEqual(original_version, final_state.get('bookmarks', {}).get('simple_db-simple_coll_1',
{}).get('version')) 161 | 162 | # assert that all rows in the collection were sync'd 163 | for stream_id, row_count in self.expected_row_counts().items(): 164 | self.assertGreaterEqual(record_count_by_stream[stream_id], row_count) 165 | 166 | # assert that each stream has a initial_full_table_complete=True bookmark 167 | self.assertIsNotNone(final_state.get('bookmarks', {}).get('simple_db-simple_coll_1', {}).get('oplog_ts_time')) 168 | self.assertIsNotNone(final_state.get('bookmarks', {}).get('simple_db-simple_coll_1', {}).get('oplog_ts_inc')) 169 | self.assertTrue(final_state.get('bookmarks', {}).get('simple_db-simple_coll_1', {}).get('initial_full_table_complete')) 170 | -------------------------------------------------------------------------------- /tests/test_mongodb_oplog_bookmarks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pymongo 3 | import random 4 | import string 5 | import time 6 | import unittest 7 | 8 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 9 | from tap_tester import connections, menagerie, runner 10 | 11 | 12 | RECORD_COUNT = {} 13 | 14 | 15 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 16 | return ''.join(random.choice(chars) for x in range(size)) 17 | 18 | def generate_simple_coll_docs(num_docs): 19 | docs = [] 20 | for int_value in range(num_docs): 21 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 22 | return docs 23 | 24 | class MongoDBOplogBookmarks(unittest.TestCase): 25 | def setUp(self): 26 | 27 | ensure_environment_variables_set() 28 | 29 | with get_test_connection() as client: 30 | drop_all_collections(client) 31 | 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # simple_coll_2 has 100 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 37 | 38 | 39 | def expected_check_streams(self): 40 | return { 41 | 'simple_db-simple_coll_1', 42 | 'simple_db-simple_coll_2', 43 | } 44 | 45 | def expected_pks(self): 46 | return { 47 | 'simple_coll_1': {'_id'}, 48 | 'simple_coll_2': {'_id'}, 49 | } 50 | 51 | def expected_row_counts(self): 52 | return { 53 | 'simple_coll_1': 50, 54 | 'simple_coll_2': 100, 55 | 56 | } 57 | 58 | 59 | def expected_sync_streams(self): 60 | return { 61 | 'simple_coll_1', 62 | 'simple_coll_2', 63 | } 64 | 65 | def name(self): 66 | return "tap_tester_mongodb_oplog_bookmarks" 67 | 68 | def tap_name(self): 69 | return "tap-mongodb" 70 | 71 | def get_type(self): 72 | return "platform.mongodb" 73 | 74 | def get_credentials(self): 75 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 76 | 77 | def get_properties(self): 78 | return { 79 | 'host' : os.getenv('TAP_MONGODB_HOST'), 80 | 'port' : os.getenv('TAP_MONGODB_PORT'), 81 | 'user' : os.getenv('TAP_MONGODB_USER'), 82 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 83 | } 84 | 85 | 86 | def test_run(self): 87 | 88 | conn_id = connections.ensure_connection(self) 89 | 90 | # ------------------------------- 91 | # ----------- Discovery ---------- 92 | # ------------------------------- 93 | 94 | # run in discovery mode 95 | check_job_name = runner.run_check_mode(self, conn_id) 96 | 97 | # verify check exit codes 98 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 99 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 100 | 101 | # verify the 
tap discovered the right streams 102 | found_catalogs = menagerie.get_catalogs(conn_id) 103 | 104 | # assert we find the correct streams 105 | self.assertEqual(self.expected_check_streams(), 106 | {c['tap_stream_id'] for c in found_catalogs}) 107 | 108 | for tap_stream_id in self.expected_check_streams(): 109 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 110 | 111 | # assert that the pks are correct 112 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 113 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 114 | 115 | # assert that the row counts are correct 116 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 117 | found_stream.get('metadata', {}).get('row-count')) 118 | 119 | # ----------------------------------- 120 | # ----------- Initial Full Table --------- 121 | # ----------------------------------- 122 | # Select simple_coll_1 and add replication method metadata 123 | additional_md = [{ "breadcrumb" : [], 124 | "metadata" : {'replication-method' : 'LOG_BASED'}}] 125 | for stream_catalog in found_catalogs: 126 | if stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_1': 127 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 128 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 129 | stream_catalog, 130 | annotated_schema, 131 | additional_md) 132 | 133 | # Run sync 134 | sync_job_name = runner.run_sync_mode(self, conn_id) 135 | 136 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 137 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 138 | 139 | 140 | # verify the persisted schema was correct 141 | records_by_stream = runner.get_records_from_target_output() 142 | 143 | # assert that each of the streams that we synced are the ones that we expect to see 144 | record_count_by_stream = runner.examine_target_output_file(self, 145 | conn_id, 146 | self.expected_sync_streams(), 147 | self.expected_pks()) 148 | 149 | # Verify that the full table was synced 150 | tap_stream_id = 'simple_db-simple_coll_1' 151 | self.assertGreaterEqual(record_count_by_stream['simple_coll_1'], 152 | self.expected_row_counts()['simple_coll_1']) 153 | 154 | # Verify that we have 'initial_full_table_complete' bookmark 155 | state = menagerie.get_state(conn_id) 156 | first_versions = {} 157 | 158 | # assert that the state has an initial_full_table_complete == True 159 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 160 | # assert that there is a version bookmark in state 161 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 162 | self.assertIsNotNone(first_versions[tap_stream_id]) 163 | # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark 164 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 165 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 166 | 167 | 168 | 169 | # Insert records to coll_1 to get the bookmark to be a ts on coll_1 170 | with get_test_connection() as client: 171 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 172 | sync_job_name = runner.run_sync_mode(self, conn_id) 173 | 174 | 175 | changed_ids = set() 176 | with get_test_connection() as client: 177 | # Make changes to not selected collection 178 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 0})[0]['_id']) 179 | 
client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) 180 | 181 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 1})[0]['_id']) 182 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) 183 | 184 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 98})[0]['_id']) 185 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 186 | 187 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 99})[0]['_id']) 188 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 189 | 190 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 191 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 100})[0]['_id']) 192 | 193 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 194 | changed_ids.add(client['simple_db']['simple_coll_2'].find({'int_field': 101})[0]['_id']) 195 | 196 | # ----------------------------------- 197 | # ----------- Subsequent Oplog Sync --------- 198 | # ----------------------------------- 199 | 200 | # Run sync 201 | sync_job_name = runner.run_sync_mode(self, conn_id) 202 | 203 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 204 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 205 | 206 | # verify the persisted schema was correct 207 | messages_by_stream = runner.get_records_from_target_output() 208 | records_by_stream = { 209 | 'simple_coll_1': [x 210 | for x in messages_by_stream['simple_coll_1']['messages'] 211 | if x.get('action') == 'upsert'] 212 | } 213 | 214 | # assert that each of the streams that we synced are the ones that we expect to see 215 | record_count_by_stream = runner.examine_target_output_file(self, 216 | conn_id, 217 | self.expected_sync_streams(), 218 | self.expected_pks()) 219 | 220 | # 1 record due to fencepost querying on oplog ts 221 | self.assertEqual(1, record_count_by_stream['simple_coll_1']) 222 | 223 | final_state = menagerie.get_state(conn_id) 224 | 225 | with get_test_connection() as client: 226 | row = client.local.oplog.rs.find_one(sort=[('$natural', pymongo.DESCENDING)]) 227 | latest_oplog_ts = row.get('ts') 228 | 229 | self.assertEqual( 230 | (latest_oplog_ts.time, latest_oplog_ts.inc), 231 | (final_state['bookmarks']['simple_db-simple_coll_1']['oplog_ts_time'], 232 | final_state['bookmarks']['simple_db-simple_coll_1']['oplog_ts_inc']) 233 | ) 234 | -------------------------------------------------------------------------------- /tests/test_mongodb_projection.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import string 5 | import unittest 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | RECORD_COUNT = {} 12 | 13 | 14 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 15 | return ''.join(random.choice(chars) for x in range(size)) 16 | 17 | def generate_simple_coll_docs(num_docs): 18 | docs = [] 19 | for int_value in range(num_docs): 20 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 21 | return docs 22 | 23 | 24 | class MongoDBProjection(unittest.TestCase): 25 | 26 | def setUpDatabase(self): 27 | 
ensure_environment_variables_set() 28 | 29 | with get_test_connection() as client: 30 | ############# Drop all dbs/collections ############# 31 | drop_all_collections(client) 32 | 33 | ############# Add simple collections ############# 34 | # simple_coll_1 has 50 documents 35 | 36 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 37 | 38 | # simple_coll_2 has 100 documents 39 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 40 | 41 | def setUp(self): 42 | pass 43 | 44 | 45 | def expected_check_streams(self): 46 | return { 47 | 'simple_db-simple_coll_1', 48 | 'simple_db-simple_coll_2', 49 | } 50 | 51 | def expected_pks(self): 52 | return { 53 | 'simple_coll_1': {'_id'}, 54 | 'simple_coll_2': {'_id'}, 55 | } 56 | 57 | def expected_row_counts(self): 58 | return { 59 | 'simple_coll_1': 50, 60 | 'simple_coll_2': 100, 61 | } 62 | 63 | 64 | def expected_sync_streams(self): 65 | return { 66 | 'simple_coll_1', 67 | 'simple_coll_2' 68 | } 69 | 70 | def projection_expected_keys_list(self): 71 | return [ 72 | { 73 | "projection": {"int_field": 1}, 74 | "expected_keys": [{"_id", "int_field"}, 75 | {"_id", "_sdc_deleted_at"}] 76 | }, 77 | { 78 | "projection": {"int_field": 1, "_id": 1}, 79 | "expected_keys": [{"_id", "int_field"}, 80 | {"_id", "_sdc_deleted_at"}] 81 | }, 82 | { 83 | "projection": {"int_field": 0}, 84 | "expected_keys": [{"_id", "string_field"}, 85 | {"_id", "_sdc_deleted_at"}] 86 | }, 87 | { 88 | "projection": {"_id": 1}, 89 | "expected_keys": [{"_id"}, 90 | {"_id", "_sdc_deleted_at"}] 91 | }, 92 | { 93 | "projection": {}, 94 | "expected_keys": [{"_id", "string_field", "int_field"}, 95 | {"_id", "_sdc_deleted_at"}] 96 | }, 97 | { 98 | "projection": None, 99 | "expected_keys": [{"_id", "string_field", "int_field"}, 100 | {"_id", "_sdc_deleted_at"}] 101 | }, 102 | { 103 | "projection": "", 104 | "expected_keys": [{"_id", "string_field", "int_field"}, 105 | {"_id", "_sdc_deleted_at"}] 106 | } 107 | ] 108 | 109 | def name(self): 110 | return "tap_tester_mongodb_projection" 111 | 112 | def tap_name(self): 113 | return "tap-mongodb" 114 | 115 | def get_type(self): 116 | return "platform.mongodb" 117 | 118 | def get_credentials(self): 119 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 120 | 121 | def get_properties(self): 122 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 123 | 'port' : os.getenv('TAP_MONGODB_PORT'), 124 | 'user' : os.getenv('TAP_MONGODB_USER'), 125 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 126 | } 127 | 128 | def modify_database(self): 129 | with get_test_connection() as client: 130 | # Delete two documents for each collection 131 | 132 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0}) 133 | 134 | client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1}) 135 | 136 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) 137 | 138 | 139 | client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) 140 | 141 | # Update two documents for each collection 142 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 48},{'$set': {'int_field': -1}}) 143 | 144 | client["simple_db"]["simple_coll_1"].update_one({'int_field': 49},{'$set': {'int_field': -1}}) 145 | 146 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 98},{'$set': {'int_field': -1}}) 147 | 148 | client["simple_db"]["simple_coll_2"].update_one({'int_field': 99},{'$set': {'int_field': -1}}) 149 | 150 | # Insert two documents for each collection 151 | 
client["simple_db"]["simple_coll_1"].insert_one({"int_field": 50, "string_field": random_string_generator()}) 152 | 153 | client["simple_db"]["simple_coll_1"].insert_one({"int_field": 51, "string_field": random_string_generator()}) 154 | 155 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 100, "string_field": random_string_generator()}) 156 | 157 | client["simple_db"]["simple_coll_2"].insert_one({"int_field": 101, "string_field": random_string_generator()}) 158 | 159 | 160 | def run_single_projection(self, projection_mapping): 161 | self.setUpDatabase() 162 | conn_id = connections.ensure_connection(self) 163 | 164 | # ------------------------------- 165 | # ----------- Discovery ---------- 166 | # ------------------------------- 167 | 168 | # run in discovery mode 169 | check_job_name = runner.run_check_mode(self, conn_id) 170 | 171 | # verify check exit codes 172 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 173 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 174 | 175 | # verify the tap discovered the right streams 176 | found_catalogs = menagerie.get_catalogs(conn_id) 177 | 178 | # assert we find the correct streams 179 | self.assertEqual(self.expected_check_streams(), 180 | {c['tap_stream_id'] for c in found_catalogs}) 181 | 182 | 183 | 184 | for tap_stream_id in self.expected_check_streams(): 185 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 186 | 187 | # assert that the pks are correct 188 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 189 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 190 | 191 | # assert that the row counts are correct 192 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 193 | found_stream.get('metadata', {}).get('row-count')) 194 | 195 | # ----------------------------------- 196 | # ----------- Initial Full Table --------- 197 | # ----------------------------------- 198 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 199 | for stream_catalog in found_catalogs: 200 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 201 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 202 | if projection_mapping['projection'] is not None: 203 | additional_md[0]['metadata']['tap_mongodb.projection'] = json.dumps(projection_mapping['projection']) 204 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 205 | stream_catalog, 206 | annotated_schema, 207 | additional_md) 208 | 209 | # Run sync 210 | sync_job_name = runner.run_sync_mode(self, conn_id) 211 | 212 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 213 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 214 | 215 | 216 | # verify the persisted schema was correct 217 | messages_by_stream = runner.get_records_from_target_output() 218 | 219 | 220 | for stream_name in self.expected_sync_streams(): 221 | stream_records = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 222 | #actual_keys = set() 223 | 224 | for record in stream_records: 225 | # BUG TDL-23609. 
Pymongo v4.3+ returns entire document for empty projection 226 | if projection_mapping['projection'] == {}: 227 | continue 228 | 229 | self.assertIn(record['data'].keys(), projection_mapping['expected_keys']) 230 | #actual_keys = actual_keys.union(set(record['data'].keys())) 231 | 232 | #self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys'])) 233 | 234 | self.modify_database() 235 | 236 | # ----------------------------------- 237 | # ----------- Subsequent Oplog Sync --------- 238 | # ----------------------------------- 239 | 240 | # Run sync 241 | sync_job_name = runner.run_sync_mode(self, conn_id) 242 | 243 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 244 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 245 | 246 | 247 | # verify the persisted schema was correct 248 | messages_by_stream = runner.get_records_from_target_output() 249 | 250 | for stream_name in self.expected_sync_streams(): 251 | stream_records = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 252 | #actual_keys = set() 253 | for record in stream_records: 254 | # BUG TDL-23609. Pymongo v4.3+ returns entire document for empty projection 255 | if projection_mapping['projection'] == {}: 256 | continue 257 | 258 | self.assertIn(record['data'].keys(), projection_mapping['expected_keys']) 259 | #actual_keys = actual_keys.union(set(record['data'].keys())) 260 | #self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys'])) 261 | 262 | 263 | def test_run(self): 264 | for projection_mapping in self.projection_expected_keys_list(): 265 | self.run_single_projection(projection_mapping) 266 | -------------------------------------------------------------------------------- /tests/test_mongodb_table_reset_log.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import unittest 5 | 6 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 7 | from tap_tester import connections, menagerie, runner 8 | 9 | 10 | RECORD_COUNT = {} 11 | 12 | 13 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 14 | return ''.join(random.choice(chars) for x in range(size)) 15 | 16 | def generate_simple_coll_docs(num_docs): 17 | docs = [] 18 | for int_value in range(num_docs): 19 | docs.append({"int_field": int_value, "string_field": random_string_generator()}) 20 | return docs 21 | 22 | class MongoDBTableResetLog(unittest.TestCase): 23 | def setUp(self): 24 | 25 | ensure_environment_variables_set() 26 | 27 | with get_test_connection() as client: 28 | ############# Drop all dbs/collections ############# 29 | drop_all_collections(client) 30 | 31 | ############# Add simple collections ############# 32 | # simple_coll_1 has 50 documents 33 | client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50)) 34 | 35 | # simple_coll_2 has 100 documents 36 | client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100)) 37 | 38 | 39 | def expected_check_streams(self): 40 | return { 41 | 'simple_db-simple_coll_1', 42 | 'simple_db-simple_coll_2', 43 | } 44 | 45 | def expected_pks(self): 46 | return { 47 | 'simple_coll_1': {'_id'}, 48 | 'simple_coll_2': {'_id'}, 49 | } 50 | 51 | def expected_row_counts(self): 52 | return { 53 | 'simple_coll_1': 50, 54 | 'simple_coll_2': 100, 55 | } 56 | 57 | def expected_sync_streams(self): 58 | return { 59 | 'simple_coll_1', 60 | 
'simple_coll_2' 61 | } 62 | 63 | def name(self): 64 | return "tap_tester_mongodb_table_reset_log" 65 | 66 | def tap_name(self): 67 | return "tap-mongodb" 68 | 69 | def get_type(self): 70 | return "platform.mongodb" 71 | 72 | def get_credentials(self): 73 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 74 | 75 | def get_properties(self): 76 | return {'host' : os.getenv('TAP_MONGODB_HOST'), 77 | 'port' : os.getenv('TAP_MONGODB_PORT'), 78 | 'user' : os.getenv('TAP_MONGODB_USER'), 79 | 'database' : os.getenv('TAP_MONGODB_DBNAME') 80 | } 81 | 82 | 83 | def test_run(self): 84 | 85 | conn_id = connections.ensure_connection(self) 86 | 87 | # --------------------------------- 88 | # ----------- Discovery ---------- 89 | # --------------------------------- 90 | 91 | # run in discovery mode 92 | check_job_name = runner.run_check_mode(self, conn_id) 93 | 94 | # verify check exit codes 95 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 96 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 97 | 98 | # verify the tap discovered the right streams 99 | found_catalogs = menagerie.get_catalogs(conn_id) 100 | 101 | # assert we find the correct streams 102 | self.assertEqual(self.expected_check_streams(), 103 | {c['tap_stream_id'] for c in found_catalogs}) 104 | 105 | 106 | for tap_stream_id in self.expected_check_streams(): 107 | found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0] 108 | 109 | # assert that the pks are correct 110 | self.assertEqual(self.expected_pks()[found_stream['stream_name']], 111 | set(found_stream.get('metadata', {}).get('table-key-properties'))) 112 | 113 | # assert that the row counts are correct 114 | self.assertEqual(self.expected_row_counts()[found_stream['stream_name']], 115 | found_stream.get('metadata', {}).get('row-count')) 116 | 117 | # ---------------------------------------- 118 | # ----------- Initial Full Table --------- 119 | # ---------------------------------------- 120 | 121 | # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata 122 | for stream_catalog in found_catalogs: 123 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 124 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 125 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 126 | stream_catalog, 127 | annotated_schema, 128 | additional_md) 129 | 130 | # Run sync 131 | sync_job_name = runner.run_sync_mode(self, conn_id) 132 | 133 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 134 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 135 | 136 | # verify the persisted schema was correct 137 | records_by_stream = runner.get_records_from_target_output() 138 | 139 | # assert that each of the streams that we synced are the ones that we expect to see 140 | record_count_by_stream = runner.examine_target_output_file(self, 141 | conn_id, 142 | self.expected_sync_streams(), 143 | self.expected_pks()) 144 | 145 | # Verify that the full table was synced 146 | for tap_stream_id in self.expected_sync_streams(): 147 | self.assertGreaterEqual(record_count_by_stream[tap_stream_id],self.expected_row_counts()[tap_stream_id]) 148 | 149 | # manipulate state to simulate table reset 150 | state = menagerie.get_state(conn_id) 151 | reset_stream = 'simple_db-simple_coll_2' 152 | state['bookmarks'].pop(reset_stream) 153 | menagerie.set_state(conn_id, state) 154 | 155 | 156 | # 
------------------------------------------- 157 | # ----------- Subsequent Oplog Sync --------- 158 | # ------------------------------------------- 159 | 160 | # Run sync 161 | sync_job_name = runner.run_sync_mode(self, conn_id) 162 | 163 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 164 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 165 | 166 | # Verify that we have 'initial_full_table_complete' bookmark 167 | state = menagerie.get_state(conn_id) 168 | first_versions = {} 169 | 170 | for tap_stream_id in self.expected_check_streams(): 171 | # assert that the state has an initial_full_table_complete == True 172 | self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete']) 173 | # assert that there is a version bookmark in state 174 | first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version'] 175 | self.assertIsNotNone(first_versions[tap_stream_id]) 176 | # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark 177 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_time']) 178 | self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc']) 179 | 180 | # verify the persisted schema was correct 181 | messages_by_stream = runner.get_records_from_target_output() 182 | records_by_stream = {} 183 | for stream_name in self.expected_sync_streams(): 184 | records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert'] 185 | 186 | # assert that each of the streams that we synced are the ones that we expect to see 187 | record_count_by_stream = runner.examine_target_output_file(self, 188 | conn_id, 189 | self.expected_sync_streams(), 190 | self.expected_pks()) 191 | 192 | # Verify the expected number of records per table 193 | for k,v in record_count_by_stream.items(): 194 | if k == 'simple_coll_1': 195 | self.assertEqual(v, 0) # not reset 196 | if k == 'simple_coll_2': 197 | self.assertEqual(v, 100) # reset stream 198 | -------------------------------------------------------------------------------- /tests/test_mongodb_views.py: -------------------------------------------------------------------------------- 1 | import bson 2 | import os 3 | import random 4 | import string 5 | import unittest 6 | 7 | from mongodb_common import drop_all_collections, get_test_connection, ensure_environment_variables_set 8 | from tap_tester import connections, menagerie, runner 9 | 10 | 11 | def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits): 12 | return ''.join(random.choice(chars) for x in range(size)) 13 | 14 | 15 | def generate_simple_coll_questions(num_docs): 16 | docs = [] 17 | for int_value in range(num_docs): 18 | docs.append({"question_id": int_value, "question": random_string_generator()}) 19 | return docs 20 | 21 | 22 | def generate_simple_coll_answers(num_docs): 23 | docs = [] 24 | for int_value in range(num_docs): 25 | docs.append({"answer_id": int_value, "answer": random_string_generator()}) 26 | return docs 27 | 28 | 29 | class MongoDBViewDiscovery(unittest.TestCase): 30 | 31 | def setUp(self): 32 | 33 | ensure_environment_variables_set() 34 | 35 | with get_test_connection() as client: 36 | # drop all dbs/collections 37 | drop_all_collections(client) 38 | 39 | # questions has 20 documents 40 | client["simple_db"]["questions"].insert_many(generate_simple_coll_questions(20)) 41 | 42 | # answers has 30 documents 43 | client["simple_db"]["answers"].insert_many(generate_simple_coll_answers(30)) 44 | 45 | # 
create view on questions 46 | client["simple_db"].command(bson.son.SON([("create", "question_view"), ("viewOn", "questions"), ("pipeline", [])])) 47 | 48 | # create a view by combining two collections 49 | client["simple_db"].create_collection( 50 | 'combined_view', 51 | viewOn='questions', 52 | pipeline=[{ 53 | '$lookup': { 54 | 'from': 'answers', 55 | 'localField': 'question_id', 56 | 'foreignField': 'answer_id', 57 | 'as': 'combined_view_final' 58 | } 59 | }] 60 | ) 61 | 62 | def name(self): 63 | return "tap_tester_mongodb_views" 64 | 65 | def tap_name(self): 66 | return "tap-mongodb" 67 | 68 | def get_type(self): 69 | return "platform.mongodb" 70 | 71 | def get_credentials(self): 72 | return {'password': os.getenv('TAP_MONGODB_PASSWORD')} 73 | 74 | def get_properties(self): 75 | return {'host': os.getenv('TAP_MONGODB_HOST'), 76 | 'port': os.getenv('TAP_MONGODB_PORT'), 77 | 'user': os.getenv('TAP_MONGODB_USER'), 78 | 'database': os.getenv('TAP_MONGODB_DBNAME'), 79 | 'include_schemas_in_destination_stream_name': 'true' 80 | } 81 | 82 | def expected_check_streams(self): 83 | return {'simple_db-questions', 84 | 'simple_db-answers'} 85 | 86 | def expected_pks(self): 87 | return { 88 | 'simple_db_questions': {'_id'}, 89 | 'simple_db_answers': {'_id'} 90 | } 91 | 92 | def expected_row_counts(self): 93 | return { 94 | 'simple_db_questions': 20, 95 | 'simple_db_answers': 30 96 | } 97 | 98 | def expected_sync_streams(self): 99 | return { 100 | 'simple_db_questions', 101 | 'simple_db_answers' 102 | } 103 | 104 | def test_run(self): 105 | 106 | conn_id = connections.ensure_connection(self) 107 | 108 | # ------------------------------- 109 | # ----------- Discovery ---------- 110 | # ------------------------------- 111 | 112 | # run in discovery mode 113 | check_job_name = runner.run_check_mode(self, conn_id) 114 | 115 | # verify check exit codes 116 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 117 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 118 | 119 | # verify the tap discovered the right streams 120 | found_catalogs = menagerie.get_catalogs(conn_id) 121 | 122 | # validate that the views are not discovered by the tap 123 | discovered_streams = set([catalog['tap_stream_id'] for catalog in found_catalogs]) 124 | self.assertEqual(discovered_streams, self.expected_check_streams()) 125 | 126 | # validate the discovered streams are not views 127 | for stream_catalog in found_catalogs: 128 | self.assertEqual(stream_catalog['metadata']['is-view'], False) 129 | 130 | for stream_catalog in found_catalogs: 131 | annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id']) 132 | additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'FULL_TABLE'}}] 133 | selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, 134 | stream_catalog, 135 | annotated_schema, 136 | additional_md) 137 | 138 | # run full table sync 139 | sync_job_name = runner.run_sync_mode(self, conn_id) 140 | 141 | # check exit status 142 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 143 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 144 | 145 | # streams that we synced are the ones that we expect to see 146 | records_by_stream = runner.get_records_from_target_output() 147 | record_count_by_stream = runner.examine_target_output_file(self, 148 | conn_id, 149 | self.expected_sync_streams(), 150 | self.expected_pks()) 151 | 152 | # assert that we get the correct number of records for each 
stream 153 | self.assertEqual(self.expected_row_counts(), record_count_by_stream) 154 | --------------------------------------------------------------------------------
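
Note on the oplog bookmark assertions earlier in this listing: the subsequent-sync check compares the saved oplog_ts_time / oplog_ts_inc bookmark against the newest entry in local.oplog.rs. The sketch below is a hypothetical illustration of how such a bookmark could be used to resume reading the oplog with pymongo; the connection string and the bookmark values are assumptions, and this is not the tap's actual sync code.

    import pymongo
    from bson.timestamp import Timestamp

    # Hypothetical bookmark mirroring the oplog_ts_time / oplog_ts_inc keys asserted in the tests.
    bookmark = {'oplog_ts_time': 1700000000, 'oplog_ts_inc': 1}

    # Assumed local replica-set test instance; adjust credentials/host as needed.
    client = pymongo.MongoClient("mongodb://dev:Password1@127.0.0.1:27017/admin")
    resume_ts = Timestamp(bookmark['oplog_ts_time'], bookmark['oplog_ts_inc'])

    # A $gte query starting at the bookmarked timestamp re-reads the bookmarked entry
    # itself, which is one way a single extra "fencepost" record can appear even when
    # no documents changed between syncs.
    for entry in client.local.oplog.rs.find({'ts': {'$gte': resume_ts}}).sort('$natural', pymongo.ASCENDING):
        print(entry['op'], entry.get('ns'), entry['ts'])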
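
The projection test (tests/test_mongodb_projection.py) skips its key assertions when the projection is {} because of the bug referenced in its comments (TDL-23609: pymongo 4.3+ returns the entire document for an empty projection). Assuming the same simple_db.simple_coll_1 fixture and a local test instance, a minimal sketch of the difference that comment describes:

    import pymongo

    # Assumed local test instance with the simple_db fixture loaded.
    client = pymongo.MongoClient("mongodb://dev:Password1@127.0.0.1:27017/admin")
    coll = client['simple_db']['simple_coll_1']

    # Explicit inclusion projection: only _id and int_field are returned.
    doc = coll.find_one({}, projection={'int_field': 1})
    print(sorted(doc.keys()))   # ['_id', 'int_field']

    # Empty-dict projection: per the TDL-23609 note, pymongo 4.3+ treats this as
    # "no projection", so the whole document comes back.
    doc = coll.find_one({}, projection={})
    print(sorted(doc.keys()))   # ['_id', 'int_field', 'string_field'] on pymongo 4.3+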
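
The view test (tests/test_mongodb_views.py) asserts that question_view and combined_view are excluded from discovery and that every discovered stream reports is-view as False. MongoDB itself distinguishes views from collections in listCollections output; the sketch below shows how to inspect that directly with pymongo against the same fixture (connection details assumed; this is not necessarily how the tap performs discovery):

    import pymongo

    # Assumed local test instance with the questions/answers fixture and views created.
    client = pymongo.MongoClient("mongodb://dev:Password1@127.0.0.1:27017/admin")
    db = client['simple_db']

    # listCollections reports a 'type' of either 'collection' or 'view' per namespace.
    for info in db.list_collections():
        print(info['name'], info['type'])

    # Server-side filter that returns only real collections, excluding the views.
    print(db.list_collection_names(filter={'type': 'collection'}))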