├── .gitignore ├── CHANGELOG.rst ├── LICENSE ├── README.rst ├── config.json ├── ez_setup.py ├── mongo_connector ├── __init__.py ├── command_helper.py ├── compat.py ├── config.py ├── connector.py ├── constants.py ├── doc_managers │ ├── __init__.py │ ├── algolia_doc_manager.py │ ├── doc_manager_base.py │ ├── doc_manager_simulator.py │ ├── elastic_doc_manager.py │ ├── formatters.py │ ├── mongo_doc_manager.py │ ├── schema.xml │ └── solr_doc_manager.py ├── errors.py ├── gridfs_file.py ├── locking_dict.py ├── oplog_manager.py └── util.py ├── scripts └── mongo-connector ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── lib └── dummy.pwd ├── setup_cluster.py ├── test_algolia.py ├── test_algolia_doc_manager.py ├── test_command_replication.py ├── test_config.py ├── test_connector_sharded.py ├── test_elastic.py ├── test_elastic_doc_manager.py ├── test_formatters.py ├── test_gridfs_file.py ├── test_mongo.py ├── test_mongo_connector.py ├── test_mongo_doc_manager.py ├── test_oplog_manager.py ├── test_oplog_manager_sharded.py ├── test_rollbacks.py ├── test_solr.py ├── test_solr_doc_manager.py ├── test_synchronizer.py ├── test_util.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | .DS_Store 4 | /test/data 5 | /test/logs 6 | /dist 7 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Version 2.0.3 5 | ------------- 6 | 7 | Version 2.0.3 requires that the PyMongo version installed be in the range [2.7.2, 3.0). It also adds more fine-grained control over log levels. 8 | 9 | Version 2.0.2 10 | ------------- 11 | 12 | Version 2.0.2 fixes the following issues: 13 | 14 | - Fix configuring timezone-aware datetimes (--tz-aware). 15 | - Fix password file from the command line (--password-file). 16 | - Automatically escape certain characters from field names in documents sent to Solr. 17 | - Add a lot more testing around the configuration file and command-line options. 18 | 19 | Version 2.0.1 20 | ------------- 21 | 22 | Version 2.0.1 fixes filtering by namespace (--namespace-set, namespaces.include). 23 | 24 | Version 2.0 25 | ---------------- 26 | 27 | Version 2.0 is a major version of Mongo Connector and includes breaking changes, new features, and bug fixes. 28 | 29 | Improvements 30 | ~~~~~~~~~~~~ 31 | 32 | - SSL certificates may now be given to Mongo Connector to validate connections to MongoDB. 33 | - A new JSON configuration file makes configuring and starting Mongo Connector as a system service much easier. 34 | - The `setup.py` file can now install Mongo Connector as a service automatically. 35 | - Support for replicating files in GridFS. 36 | - Allow DocManagers to be distributed as separate packages, rather than needing a fork or pull request. 37 | - DocManagers may handle arbitrary database commands in the oplog. 38 | 39 | Bug Fixes 40 | ~~~~~~~~~ 41 | 42 | - Adding an element beyond the end of an array in MongoDB no longer throws an exception. 43 | - All errors that cause Mongo Connector to exit are written to the log. 44 | - Automatically use all-lowercase index names when targeting Elasticsearch. 45 | 46 | Breaking Changes 47 | ~~~~~~~~~~~~~~~~ 48 | 49 | - The constructor signatures for OplogThread and Connector have changed: 50 | - The `u_key` and `target_url` keyword arguments have been removed from the constructor for Connector. 
51 | - `target_url` is gone from the OplogThread constructor. 52 | - The `doc_manager` keyword argument in the constructors for Connector and OplogThread is now called `doc_managers`. 53 | - The `doc_managers` keyword argument in Connector takes a list of **instances** of `DocManager`, rather than a list of strings corresponding to files that define DocManagers. 54 | - ConnectorError has been removed. Exceptions that occur when constructing Connector will be passed on to the caller. 55 | - The DocManagerBase class moved from mongo_connector.doc_managers to mongo_connector.doc_managers.doc_manager_base. 56 | - The exception_wrapper function moved from mongo_connector.doc_managers to mongo_connector.util. 57 | - The arguments to many DocManager methods have changed. For an up-to-date overview of how to write a custom DocManager, see the `Writing Your Own DocManager wiki page `__. A synopsis: 58 | - The `remove` method now takes a document id, namespace, and a timestamp instead of a whole document. 59 | - The `upsert`, `bulk_upsert`, and `update` methods all take two additional arguments: namespace and timestamp. 60 | 61 | Version 1.3.1 62 | ------------- 63 | 64 | Version 1.3.1 contains mostly bug fixes and adds timezone-aware timestamp support. Bugs fixed include: 65 | 66 | - Fixes for update operations to Solr. 67 | - Re-insert documents that were deleted before a rollback. 68 | - Catch a few additional exceptions sometimes thrown by the Elasticsearch Python driver. 69 | 70 | 71 | Version 1.3 72 | ----------- 73 | 74 | Version 1.3 fixes many issues and adds a couple of minor features. Highlights include: 75 | 76 | - Use proper updates instead of upserting the most recent version of a document. 77 | 78 | .. Warning:: Update operations require the ``_source`` field to be enabled in Elasticsearch. 79 | 80 | - Fix many issues relating to sending BSON types to external drivers, such as for Elasticsearch and Solr. 81 | - Fix several issues related to using a unique key other than ``_id``. 82 | - Support all UTF8 database and collection names. 83 | - Keep namespace and timestamp metadata in a separate Elasticsearch index. 84 | - Documentation overhaul for using Mongo Connector with Elasticsearch. 85 | - New ``--continue-on-error`` flag for collection dumps. 86 | - ``_id`` is no longer duplicated in ``_source`` field in Elasticsearch. 87 | 88 | Version 1.2.1 89 | ------------- 90 | 91 | Version 1.2.1 fixes some trivial installation issues and renames the CHANGELOG to CHANGELOG.rst. 92 | 93 | Version 1.2 94 | ----------- 95 | 96 | Version 1.2 is a major release with a large number of fixes since the last release on PyPI. It also includes a number of improvements for use with Solr and ElasticSearch. 97 | 98 | Improvements 99 | ~~~~~~~~~~~~ 100 | 101 | - Ability to have multiple targets of replication 102 | - Ability to upsert documents containing arrays and nested documents with the Solr DocManager 103 | - Upserts during a collection dump may happen in bulk, resulting in a performance boost 104 | - mongo-connector does not commit writes in target systems by default, resulting in a performance boost 105 | 106 | .. Warning:: This new behavior may give unexpected delays before 107 | documents are committed in the target system. Most 108 | indexing systems provide some way of configuring how 109 | often changes should be committed. Please see the relevant 110 | wiki articles for `Solr 111 | `_ 112 | and `ElasticSearch 113 | `_ 114 | for more information on configuring commit behavior for 115 | your system.
Note that MongoDB as a target system is 116 | unaffected by this change. 117 | 118 | - Addition of ``auto-commit-interval`` to the command-line options 119 | - Ability to change the destination namespace of upserted documents 120 | - Ability to restrict the fields upserted in documents 121 | - Memory footprint reduced 122 | - Collection dumps may happen in batch, resulting in huge performance gains 123 | 124 | Fixes 125 | ~~~~~ 126 | 127 | - Fix for unexpected exit during chunk migrations and orphan documents in MongoDB 128 | - Fix installation problems due to namespace issues 129 | 130 | .. Warning:: RENAME of ``mongo_connector.py`` module to 131 | ``connector.py``. Thus, if you should need to import the 132 | ``Connector`` object, you now should do 133 | ``from mongo_connector.connector import Connector`` 134 | 135 | - Fix user-specified unique keys in Solr and ElasticSearch DocManagers 136 | - Fix for keyboard exit taking large amounts of time to be effective 137 | 138 | Version 1.1.1 139 | ------------- 140 | 141 | This was the first release of mongo-connector. 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | **WARNING**: This connector is deprecated; please use an `API Client `__. 2 | 3 | For complete documentation, check out the `Mongo Connector Wiki `__. 4 | 5 | DISCLAIMER 6 | ---------- 7 | 8 | Please note: all tools/scripts in this repo are released for use "AS IS" without any warranties of any kind, including, but not limited to their installation, use, or performance.
We disclaim any and all warranties, either express or implied, including but not limited to any warranty of noninfringement, merchantability, and/or fitness for a particular purpose. We do not warrant that the technology will meet your requirements, that the operation thereof will be uninterrupted or error-free, or that any errors will be corrected. 9 | Any use of these scripts and tools is at your own risk. There is no guarantee that they have been through thorough testing in a comparable environment and we are not responsible for any damage or data loss incurred with their use. 10 | You are responsible for reviewing and testing any scripts you run thoroughly before use in any non-testing environment. 11 | 12 | System Overview 13 | --------------- 14 | 15 | mongo-connector creates a pipeline from a MongoDB cluster to one or more 16 | target systems, such as Solr, Elasticsearch, or another MongoDB cluster. 17 | By tailing the MongoDB oplog, it replicates operations from MongoDB to 18 | these systems in real-time. It has been tested with Python 2.6, 2.7, 19 | 3.3, and 3.4. Detailed documentation is available on the 20 | `wiki `__. 21 | 22 | Getting Started 23 | --------------- 24 | 25 | Installation 26 | ~~~~~~~~~~~~ 27 | 28 | You can install the development version of mongo-connector 29 | manually:: 30 | 31 | git clone https://github.com/algolia/mongo-connector.git 32 | cd mongo-connector 33 | python setup.py install 34 | 35 | You may have to run ``python setup.py install`` with ``sudo``, depending 36 | on where you're installing mongo-connector and what privileges you have. 37 | 38 | Using mongo-connector 39 | ~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | mongo-connector replicates operations from the MongoDB oplog, so a 42 | `replica 43 | set `__ 44 | must be running before startup. For development purposes, you may find 45 | it convenient to run a one-node replica set (note that this is **not** 46 | recommended for production):: 47 | 48 | mongod --replSet myDevReplSet 49 | 50 | To initialize your server as a replica set, run the following command in 51 | the mongo shell:: 52 | 53 | rs.initiate() 54 | 55 | Once the replica set is running, you may start mongo-connector. The 56 | simplest invocation resembles the following:: 57 | 58 | mongo-connector -m <mongodb server hostname>:<replica set port> \ 59 | -t <replication endpoint URL, e.g. http://localhost:8983/solr> \ 60 | -d <name of doc manager, e.g., solr_doc_manager> 61 | 62 | mongo-connector has many other options besides those demonstrated above. 63 | To get a full listing with descriptions, try ``mongo-connector --help``. 64 | You can also use mongo-connector with a `configuration file `__. 65 | 66 | Usage With Algolia 67 | ------------------ 68 | 69 | The simplest way to synchronize a collection ``myCollection`` from db ``myDb`` to index ``MyIndex`` is:: 70 | 71 | mongo-connector -m localhost:27017 -n myDb.myCollection -d algolia_doc_manager -t MyApplicationID:MyApiKey:MyIndex 72 | 73 | **Note**: If you synchronize multiple collections with multiple indexes, do not forget to specify a specific connector configuration file for each index using the ``-o config.txt`` option (a config.txt file is created by default).
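For instance, two collections could be synchronized by two separate connector processes, each writing its progress to its own file (the collection and index names below are hypothetical)::

    mongo-connector -m localhost:27017 -n myDb.users -d algolia_doc_manager -t MyApplicationID:MyApiKey:UsersIndex -o users_config.txt
    mongo-connector -m localhost:27017 -n myDb.orders -d algolia_doc_manager -t MyApplicationID:MyApiKey:OrdersIndex -o orders_config.txt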
74 | 75 | Attributes remapping 76 | ~~~~~~~~~~~~~~~~~~~~ 77 | 78 | If you want to map an attribute to a specific index field, you can configure it by creating an 79 | ``algolia_remap_INDEXNAME.json`` JSON configuration file at the root of the mongo-connector folder:: 80 | 81 | { 82 | "user.email": "email" 83 | } 84 | 85 | Alternatively, you can use Python-style subscript notation:: 86 | 87 | { 88 | "['user']['email']": "['email']" 89 | } 90 | 91 | **Note**: 92 | 93 | - The remapping operation will run first. 94 | 95 | Example 96 | """"""" 97 | 98 | Consider the following object:: 99 | 100 | { 101 | "user": { "email": "my@algolia.com" } 102 | } 103 | 104 | The connector will send:: 105 | 106 | { 107 | "email": "my@algolia.com" 108 | } 109 | 110 | Attributes filtering 111 | ~~~~~~~~~~~~~~~~~~~~ 112 | 113 | You can filter the attributes sent to Algolia by creating an ``algolia_fields_INDEXNAME.json`` JSON configuration file:: 114 | 115 | { 116 | "<attribute1>": "_$ < 0", 117 | "<attribute2>": "" 118 | } 119 | 120 | Considering the following object:: 121 | 122 | { 123 | "<attribute1>" : 1, 124 | "<attribute2>" : 2 125 | } 126 | 127 | The connector will send:: 128 | 129 | { 130 | "<attribute2>" : 2 131 | } 132 | 133 | 134 | **Note**: 135 | 136 | - ``_$`` represents the value of the field. 137 | - An empty value for the check of a field is ``True``. 138 | - You can put any line of Python in the value of a field. 139 | - The filtering operation will run between remapping and post-processing. 140 | 141 | Filter an array attribute sent to Algolia 142 | """"""""""""""""""""""""""""""""""""""""" 143 | 144 | To select all elements from attribute ``<attribute>`` matching a specific condition:: 145 | 146 | { 147 | "<attribute>": "re.match(r'algolia', _$, re.I)" 148 | } 149 | 150 | Considering the following object:: 151 | 152 | { 153 | "<attribute>" : ["algolia", "AlGoLiA", "alogia"] 154 | } 155 | 156 | The connector will send:: 157 | 158 | { 159 | "<attribute>": ["algolia", "AlGoLiA"] 160 | } 161 | 162 | Filter an object attribute in an array sent to Algolia 163 | """""""""""""""""""""""""""""""""""""""""""""""""""""" 164 | 165 | To select all elements from attribute ``status`` matching a specific condition:: 166 | 167 | { 168 | "status": { "action": "", "outdated" : "_$ == false" } 169 | } 170 | 171 | Considering the following object:: 172 | 173 | { 174 | "status" : [ 175 | {"action": "send", "outdated": "true"}, 176 | {"action": "in transit", "outdated": true}, 177 | {"action": "receive", "outdated": false} 178 | ] 179 | } 180 | 181 | The connector will send:: 182 | 183 | { 184 | "status": [{"action": "receive", "outdated": false}] 185 | } 186 | 187 | Advanced nested objects filtering 188 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 189 | 190 | If you want to send a ``<attribute>`` attribute matching advanced filtering conditions, you can use:: 191 | 192 | { 193 | "<attribute>": { "_all_" : "or", "neg": "_$ < 0", "pos": "_$ > 0"} 194 | } 195 | 196 | Considering the following object:: 197 | 198 | { 199 | "<attribute>": { "neg": 42, "pos": 42} 200 | } 201 | 202 | The connector will send:: 203 | 204 | { 205 | "<attribute>": { "pos": 42} 206 | } 207 | 208 | Post processing 209 | ~~~~~~~~~~~~~~~ 210 | 211 | You can modify the attributes sent to Algolia by creating an ``algolia_postproc_INDEXNAME.py`` Python script file:: 212 | 213 | if (_$.get("<attribute>") == 0): 214 | _$["<attribute>"] = False 215 | else: 216 | _$["<attribute>"] = True 217 | 218 | **Note**: 219 | 220 | - ``_$`` represents the record. 221 | - The post-processing operation will run last.
222 | 223 | Considering the following object:: 224 | 225 | { 226 | "<attribute>": 0 227 | } 228 | 229 | The connector will send:: 230 | 231 | { 232 | "<attribute>": false 233 | } 234 | 235 | 236 | Usage With Solr 237 | --------------- 238 | 239 | There is an example Solr schema called 240 | `schema.xml `__, 241 | which provides several field definitions on which mongo-connector 242 | relies, including: 243 | 244 | - ``_id``, the default unique key for documents in MongoDB (this may be 245 | changed with the ``--unique-key`` option) 246 | - ``ns``, the namespace from which the document came 247 | - ``_ts``, the timestamp from the oplog entry that last modified the 248 | document 249 | 250 | The sample XML schema is designed to work with the tests. For a more 251 | complete guide to adding fields, review the `Solr 252 | documentation `__. 253 | 254 | You may also want to jump to the mongo-connector `Solr 255 | wiki `__ 256 | for more detailed information on using mongo-connector with Solr. 257 | 258 | Troubleshooting 259 | --------------- 260 | 261 | **Installation** 262 | 263 | Some users have experienced trouble installing mongo-connector, noting 264 | error messages like the following:: 265 | 266 | Processing elasticsearch-0.4.4.tar.gz 267 | Running elasticsearch-0.4.4/setup.py -q bdist_egg --dist-dir /tmp/easy_install-gg9U5p/elasticsearch-0.4.4/egg-dist-tmp-vajGnd 268 | error: /tmp/easy_install-gg9U5p/elasticsearch-0.4.4/README.rst: No such file or directory 269 | 270 | The workaround for this is to make sure you have a recent version of 271 | ``setuptools`` installed. Any version *after* 0.6.26 should do the 272 | trick:: 273 | 274 | pip install --upgrade setuptools 275 | 276 | **Running mongo-connector after a long time** 277 | 278 | If you want to jump-start into using mongo-connector with a particular target system, check out: 279 | 280 | - `Usage with Solr `__ 281 | - `Usage with Elasticsearch `__ 282 | - `Usage with MongoDB `__ 283 | 284 | Troubleshooting/Questions 285 | ------------------------- 286 | 287 | Having trouble with installation? Have a question about Mongo Connector? 288 | Your question or problem may be answered in the `FAQ `__ or in the `wiki `__. 289 | If you can't find the answer to your question or problem there, feel free to `open an issue `__ on Mongo Connector's GitHub page.
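To make the attribute-filtering rules described under *Attributes filtering* above concrete, here is a minimal, hypothetical Python sketch of how expressions from an ``algolia_fields_INDEXNAME.json`` file could be evaluated against a record. The function and variable names are assumptions for illustration; the connector's actual implementation may differ::

    import re

    def _matches(expr, value):
        # An empty expression means "always keep this field".
        if expr == "":
            return True
        # "_$" is not a valid Python identifier, so substitute a real name
        # before evaluating the expression (an assumption of this sketch).
        return bool(eval(expr.replace("_$", "_value_"),
                         {"re": re, "_value_": value}))

    def apply_fields_filter(record, filters):
        filtered = {}
        for attr, expr in filters.items():
            if attr not in record:
                continue
            value = record[attr]
            if isinstance(value, list):
                # For array attributes, keep only the matching elements.
                kept = [v for v in value if _matches(expr, v)]
                if kept:
                    filtered[attr] = kept
            elif _matches(expr, value):
                filtered[attr] = value
        return filtered

    # apply_fields_filter({"price": -3, "qty": 2},
    #                     {"price": "_$ < 0", "qty": ""})
    # returns {"price": -3, "qty": 2}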
290 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "__comment__": "Configuration options starting with '__' are disabled", 3 | "__comment__": "To enable them, remove the preceding '__'", 4 | 5 | "mainAddress": "localhost:27017", 6 | "oplogFile": "/var/log/mongo-connector/oplog.timestamp", 7 | "noDump": false, 8 | "batchSize": -1, 9 | "verbosity": 0, 10 | "continueOnError": false, 11 | 12 | "logging": { 13 | "type": "file", 14 | "filename": "/var/log/mongo-connector/mongo-connector.log", 15 | "__rotationWhen": "D", 16 | "__rotationInterval": 1, 17 | "__rotationBackups": 10, 18 | 19 | "__type": "syslog", 20 | "__host": "localhost:512" 21 | }, 22 | 23 | "authentication": { 24 | "__adminUsername": "username", 25 | "__password": "password", 26 | "__passwordFile": "mongo-connector.pwd" 27 | }, 28 | 29 | "__comment__": "For more information about SSL with MongoDB, please see http://docs.mongodb.org/manual/tutorial/configure-ssl-clients/", 30 | "__ssl": { 31 | "__sslCertfile": "Path to certificate to identify the local connection against MongoDB", 32 | "__sslKeyfile": "Path to the private key for sslCertfile. Not necessary if already included in sslCertfile.", 33 | "__sslCACerts": "Path to concatenated set of certificate authority certificates to validate the other side of the connection", 34 | "__sslCertificatePolicy": "Policy for validating SSL certificates provided from the other end of the connection. Possible values are 'required' (require and validate certificates), 'optional' (validate but don't require a certificate), and 'ignored' (ignore certificates)." 35 | }, 36 | 37 | "__fields": ["field1", "field2", "field3"], 38 | 39 | "namespaces": { 40 | "__include": ["db.source1", "db.source2"], 41 | "__mapping": { 42 | "db.source1": "db.dest1", 43 | "db.source2": "db.dest2" 44 | }, 45 | "__gridfs": ["db.fs"] 46 | }, 47 | 48 | "docManagers": [ 49 | { 50 | "docManager": "elastic_doc_manager", 51 | "targetURL": "localhost:9200", 52 | "__bulkSize": 1000, 53 | "__uniqueKey": "_id", 54 | "__autoCommitInterval": null 55 | } 56 | ] 57 | } 58 | -------------------------------------------------------------------------------- /ez_setup.py: -------------------------------------------------------------------------------- 1 | #!python 2 | """Bootstrap setuptools installation 3 | 4 | If you want to use setuptools in your package's setup.py, just include this 5 | file in the same directory with it, and add this to the top of your setup.py:: 6 | 7 | from ez_setup import use_setuptools 8 | use_setuptools() 9 | 10 | If you want to require a specific version of setuptools, set a download 11 | mirror, or use an alternate download directory, you can do so by supplying 12 | the appropriate options to ``use_setuptools()``. 13 | 14 | This file can also be run as a script to install or upgrade setuptools. 
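For example, pinning the version and mirror explicitly (the values shown
are simply this module's own defaults)::

    from ez_setup import use_setuptools
    use_setuptools(
        version="0.9.7",
        download_base="https://pypi.python.org/packages/source/s/setuptools/")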
15 | """ 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import tarfile 21 | import optparse 22 | import subprocess 23 | 24 | from distutils import log 25 | 26 | try: 27 | from site import USER_SITE 28 | except ImportError: 29 | USER_SITE = None 30 | 31 | DEFAULT_VERSION = "0.9.7" 32 | DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" 33 | 34 | def _python_cmd(*args): 35 | args = (sys.executable,) + args 36 | return subprocess.call(args) == 0 37 | 38 | def _install(tarball, install_args=()): 39 | # extracting the tarball 40 | tmpdir = tempfile.mkdtemp() 41 | log.warn('Extracting in %s', tmpdir) 42 | old_wd = os.getcwd() 43 | try: 44 | os.chdir(tmpdir) 45 | tar = tarfile.open(tarball) 46 | _extractall(tar) 47 | tar.close() 48 | 49 | # going in the directory 50 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 51 | os.chdir(subdir) 52 | log.warn('Now working in %s', subdir) 53 | 54 | # installing 55 | log.warn('Installing Setuptools') 56 | if not _python_cmd('setup.py', 'install', *install_args): 57 | log.warn('Something went wrong during the installation.') 58 | log.warn('See the error message above.') 59 | # exitcode will be 2 60 | return 2 61 | finally: 62 | os.chdir(old_wd) 63 | shutil.rmtree(tmpdir) 64 | 65 | 66 | def _build_egg(egg, tarball, to_dir): 67 | # extracting the tarball 68 | tmpdir = tempfile.mkdtemp() 69 | log.warn('Extracting in %s', tmpdir) 70 | old_wd = os.getcwd() 71 | try: 72 | os.chdir(tmpdir) 73 | tar = tarfile.open(tarball) 74 | _extractall(tar) 75 | tar.close() 76 | 77 | # going in the directory 78 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 79 | os.chdir(subdir) 80 | log.warn('Now working in %s', subdir) 81 | 82 | # building an egg 83 | log.warn('Building a Setuptools egg in %s', to_dir) 84 | _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) 85 | 86 | finally: 87 | os.chdir(old_wd) 88 | shutil.rmtree(tmpdir) 89 | # returning the result 90 | log.warn(egg) 91 | if not os.path.exists(egg): 92 | raise IOError('Could not build the egg.') 93 | 94 | 95 | def _do_download(version, download_base, to_dir, download_delay): 96 | egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' 97 | % (version, sys.version_info[0], sys.version_info[1])) 98 | if not os.path.exists(egg): 99 | tarball = download_setuptools(version, download_base, 100 | to_dir, download_delay) 101 | _build_egg(egg, tarball, to_dir) 102 | sys.path.insert(0, egg) 103 | import setuptools 104 | setuptools.bootstrap_install_from = egg 105 | 106 | 107 | def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 108 | to_dir=os.curdir, download_delay=15): 109 | # making sure we use the absolute path 110 | to_dir = os.path.abspath(to_dir) 111 | was_imported = 'pkg_resources' in sys.modules or \ 112 | 'setuptools' in sys.modules 113 | try: 114 | import pkg_resources 115 | except ImportError: 116 | return _do_download(version, download_base, to_dir, download_delay) 117 | try: 118 | pkg_resources.require("setuptools>=" + version) 119 | return 120 | except pkg_resources.VersionConflict: 121 | e = sys.exc_info()[1] 122 | if was_imported: 123 | sys.stderr.write( 124 | "The required version of setuptools (>=%s) is not available,\n" 125 | "and can't be installed while this script is running. Please\n" 126 | "install a more recent version first, using\n" 127 | "'easy_install -U setuptools'." 
128 | "\n\n(Currently using %r)\n" % (version, e.args[0])) 129 | sys.exit(2) 130 | else: 131 | del pkg_resources, sys.modules['pkg_resources'] # reload ok 132 | return _do_download(version, download_base, to_dir, 133 | download_delay) 134 | except pkg_resources.DistributionNotFound: 135 | return _do_download(version, download_base, to_dir, 136 | download_delay) 137 | 138 | 139 | def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 140 | to_dir=os.curdir, delay=15): 141 | """Download setuptools from a specified location and return its filename 142 | 143 | `version` should be a valid setuptools version number that is available 144 | as an egg for download under the `download_base` URL (which should end 145 | with a '/'). `to_dir` is the directory where the egg will be downloaded. 146 | `delay` is the number of seconds to pause before an actual download 147 | attempt. 148 | """ 149 | # making sure we use the absolute path 150 | to_dir = os.path.abspath(to_dir) 151 | try: 152 | from urllib.request import urlopen 153 | except ImportError: 154 | from urllib2 import urlopen 155 | tgz_name = "setuptools-%s.tar.gz" % version 156 | url = download_base + tgz_name 157 | saveto = os.path.join(to_dir, tgz_name) 158 | src = dst = None 159 | if not os.path.exists(saveto): # Avoid repeated downloads 160 | try: 161 | log.warn("Downloading %s", url) 162 | src = urlopen(url) 163 | # Read/write all in one block, so we don't create a corrupt file 164 | # if the download is interrupted. 165 | data = src.read() 166 | dst = open(saveto, "wb") 167 | dst.write(data) 168 | finally: 169 | if src: 170 | src.close() 171 | if dst: 172 | dst.close() 173 | return os.path.realpath(saveto) 174 | 175 | 176 | def _extractall(self, path=".", members=None): 177 | """Extract all members from the archive to the current working 178 | directory and set owner, modification time and permissions on 179 | directories afterwards. `path' specifies a different directory 180 | to extract to. `members' is optional and must be a subset of the 181 | list returned by getmembers(). 182 | """ 183 | import copy 184 | import operator 185 | from tarfile import ExtractError 186 | directories = [] 187 | 188 | if members is None: 189 | members = self 190 | 191 | for tarinfo in members: 192 | if tarinfo.isdir(): 193 | # Extract directories with a safe mode. 194 | directories.append(tarinfo) 195 | tarinfo = copy.copy(tarinfo) 196 | tarinfo.mode = 448 # decimal for oct 0700 197 | self.extract(tarinfo, path) 198 | 199 | # Reverse sort directories. 200 | if sys.version_info < (2, 4): 201 | def sorter(dir1, dir2): 202 | return cmp(dir1.name, dir2.name) 203 | directories.sort(sorter) 204 | directories.reverse() 205 | else: 206 | directories.sort(key=operator.attrgetter('name'), reverse=True) 207 | 208 | # Set correct owner, mtime and filemode on directories. 
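    # Because of the reverse sort above, entries for subdirectories come
    # before their parent directories, so a parent's restored mtime and
    # permissions are not clobbered by later operations inside it.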
209 | for tarinfo in directories: 210 | dirpath = os.path.join(path, tarinfo.name) 211 | try: 212 | self.chown(tarinfo, dirpath) 213 | self.utime(tarinfo, dirpath) 214 | self.chmod(tarinfo, dirpath) 215 | except ExtractError: 216 | e = sys.exc_info()[1] 217 | if self.errorlevel > 1: 218 | raise 219 | else: 220 | self._dbg(1, "tarfile: %s" % e) 221 | 222 | 223 | def _build_install_args(options): 224 | """ 225 | Build the arguments to 'python setup.py install' on the setuptools package 226 | """ 227 | install_args = [] 228 | if options.user_install: 229 | if sys.version_info < (2, 6): 230 | log.warn("--user requires Python 2.6 or later") 231 | raise SystemExit(1) 232 | install_args.append('--user') 233 | return install_args 234 | 235 | def _parse_args(): 236 | """ 237 | Parse the command line for options 238 | """ 239 | parser = optparse.OptionParser() 240 | parser.add_option( 241 | '--user', dest='user_install', action='store_true', default=False, 242 | help='install in user site package (requires Python 2.6 or later)') 243 | parser.add_option( 244 | '--download-base', dest='download_base', metavar="URL", 245 | default=DEFAULT_URL, 246 | help='alternative URL from where to download the setuptools package') 247 | options, args = parser.parse_args() 248 | # positional arguments are ignored 249 | return options 250 | 251 | def main(version=DEFAULT_VERSION): 252 | """Install or upgrade setuptools and EasyInstall""" 253 | options = _parse_args() 254 | tarball = download_setuptools(download_base=options.download_base) 255 | return _install(tarball, _build_install_args(options)) 256 | 257 | if __name__ == '__main__': 258 | sys.exit(main()) 259 | -------------------------------------------------------------------------------- /mongo_connector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from pkgutil import extend_path 15 | __path__ = extend_path(__path__, __name__) 16 | -------------------------------------------------------------------------------- /mongo_connector/command_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Preprocesses the oplog command entries. 
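As a hypothetical illustration (names assumed, not from the original module):
with namespace_set=['db.source'] and dest_mapping={'db.source': 'newdb.dest'},
map_namespace('db.source') returns 'newdb.dest', map_db('db') returns
['newdb'], and map_collection('db', 'source') returns ('newdb', 'dest').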
16 | """ 17 | 18 | import logging 19 | import mongo_connector.errors 20 | 21 | LOG = logging.getLogger(__name__) 22 | 23 | 24 | class CommandHelper(object): 25 | def __init__(self, namespace_set=[], dest_mapping={}): 26 | self.namespace_set = namespace_set 27 | self.dest_mapping = dest_mapping 28 | 29 | # Create a db to db mapping from the namespace mapping. 30 | db_pairs = set((ns.split('.')[0], 31 | self.map_namespace(ns).split('.')[0]) 32 | for ns in self.namespace_set) 33 | targets = set() 34 | for _, dest in db_pairs: 35 | if dest in targets: 36 | dbs = [src2 for src2, dest2 in db_pairs 37 | if dest == dest2] 38 | raise mongo_connector.errors.MongoConnectorError( 39 | "Database mapping is not one-to-one." 40 | " %s %s have collections mapped to %s" 41 | % (", ".join(dbs), 42 | "both" if len(dbs) == 2 else "all", 43 | dest)) 44 | else: 45 | targets.add(dest) 46 | 47 | self.db_mapping = {} 48 | for src, dest in db_pairs: 49 | arr = self.db_mapping.get(src, []) 50 | arr.append(dest) 51 | self.db_mapping[src] = arr 52 | 53 | # Applies the namespace mapping to a database. 54 | # Individual collections in a database can be mapped to 55 | # different target databases, so map_db can return multiple results. 56 | def map_db(self, db): 57 | if self.db_mapping: 58 | return self.db_mapping.get(db, []) 59 | else: 60 | return [db] 61 | 62 | # Applies the namespace mapping to a "db.collection" string 63 | def map_namespace(self, ns): 64 | if not self.namespace_set: 65 | return ns 66 | elif ns not in self.namespace_set: 67 | return None 68 | else: 69 | return self.dest_mapping.get(ns, ns) 70 | 71 | # Applies the namespace mapping to a db and collection 72 | def map_collection(self, db, coll): 73 | ns = self.map_namespace(db + '.' + coll) 74 | if ns: 75 | return tuple(ns.split('.', 1)) 76 | else: 77 | return None, None 78 | -------------------------------------------------------------------------------- /mongo_connector/compat.py: -------------------------------------------------------------------------------- 1 | """Utilities for maintaining portability between various Python versions""" 2 | 3 | import sys 4 | 5 | PY3 = (sys.version_info[0] == 3) 6 | 7 | if PY3: 8 | def reraise(exctype, value, trace=None): 9 | raise exctype(str(value)).with_traceback(trace) 10 | 11 | def is_string(x): 12 | return isinstance(x, str) 13 | 14 | from urllib.request import Request 15 | from urllib.request import urlopen 16 | from urllib.parse import urlencode 17 | from urllib.error import URLError 18 | from urllib.error import HTTPError 19 | 20 | def u(s): 21 | return str(s) 22 | 23 | else: 24 | exec("""def reraise(exctype, value, trace=None): 25 | raise exctype, str(value), trace""") 26 | 27 | def is_string(x): 28 | return isinstance(x, basestring) 29 | 30 | from urllib import urlencode 31 | from urllib2 import Request 32 | from urllib2 import urlopen 33 | from urllib2 import URLError 34 | from urllib2 import HTTPError 35 | 36 | def u(s): 37 | return unicode(s) 38 | -------------------------------------------------------------------------------- /mongo_connector/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import json 15 | import logging 16 | import optparse 17 | import sys 18 | 19 | from mongo_connector import compat, errors 20 | from mongo_connector.compat import reraise 21 | 22 | 23 | def default_apply_function(option, cli_values): 24 | first_value = list(cli_values.values())[0] 25 | if first_value is not None: 26 | option.value = first_value 27 | 28 | 29 | class Option(object): 30 | """A config file option which can be overwritten on the command line. 31 | 32 | config_key is the corresponding field in the JSON config file. 33 | 34 | apply_function has the following signature: 35 | def apply_function(option, cli_values): 36 | # modify option.value ... 37 | 38 | When apply_function is invoked, option.value will be set to the 39 | value given in the config file (or the default value). 40 | 41 | apply_function reads the cli_values and modifies option.value accordingly 42 | """ 43 | 44 | def __init__(self, config_key=None, default=None, type=None, 45 | apply_function=default_apply_function): 46 | self.config_key = config_key 47 | self.value = default 48 | self.type = type 49 | self.apply_function = apply_function 50 | 51 | self.cli_names = [] 52 | self.cli_options = [] 53 | 54 | def validate_type(self): 55 | if self.type == str: 56 | return compat.is_string(self.value) 57 | else: 58 | return isinstance(self.value, 59 | self.type) 60 | 61 | def add_cli(self, *args, **kwargs): 62 | """Add a command line argument. 63 | 64 | All of the given arguments will be passed directly to 65 | optparse.OptionParser().add_option 66 | """ 67 | self.cli_options.append((args, kwargs)) 68 | 69 | 70 | class Config(object): 71 | """Manages command line application configuration. 72 | 73 | conf = Config(options) 74 | conf.parse_args() 75 | value = conf['key'] 76 | value2 = conf['key1.key2'] # same as conf['key1']['key2'] 77 | """ 78 | 79 | def __init__(self, options): 80 | self.options = options 81 | 82 | self.config_key_to_option = dict( 83 | [(option.config_key, option) for option in self.options]) 84 | 85 | def parse_args(self, argv=None): 86 | """Parses command line arguments from sys.argv (or the given argv). 87 | 88 | Does the following: 89 | 1. Parses command line arguments 90 | 2. Loads config file into options (if config file specified) 91 | 3.
calls option.apply_function with the parsed cli_values 92 | """ 93 | 94 | # parse the command line options 95 | parser = optparse.OptionParser() 96 | for option in self.options: 97 | for args, kwargs in option.cli_options: 98 | cli_option = parser.add_option(*args, **kwargs) 99 | option.cli_names.append(cli_option.dest) 100 | parsed_options, args = parser.parse_args(argv) 101 | if args: 102 | raise errors.InvalidConfiguration( 103 | 'The following command line arguments are not recognized: ' 104 | + ', '.join(args)) 105 | 106 | # load the config file 107 | if parsed_options.config_file: 108 | try: 109 | with open(parsed_options.config_file) as f: 110 | self.load_json(f.read()) 111 | except (OSError, IOError, ValueError): 112 | reraise(errors.InvalidConfiguration, *sys.exc_info()[1:]) 113 | 114 | # apply the command line arguments 115 | values = parsed_options.__dict__ 116 | for option in self.options: 117 | option.apply_function( 118 | option, dict((k, values.get(k)) for k in option.cli_names)) 119 | 120 | def __getitem__(self, key): 121 | keys = key.split('.') 122 | cur = self.config_key_to_option[keys[0]].value 123 | for k in keys[1:]: 124 | if cur is not None: 125 | if isinstance(cur, dict): 126 | cur = cur.get(k) 127 | else: 128 | cur = None 129 | return cur 130 | 131 | def load_json(self, text): 132 | parsed_config = json.loads(text) 133 | for k in parsed_config: 134 | option = self.config_key_to_option.get(k) 135 | if option: 136 | # load into option.value 137 | if isinstance(parsed_config[k], dict): 138 | for k2 in parsed_config[k]: 139 | option.value[k2] = parsed_config[k][k2] 140 | else: 141 | option.value = parsed_config[k] 142 | 143 | # type check 144 | if not option.validate_type(): 145 | raise errors.InvalidConfiguration( 146 | "%s should be a %r, %r was given!" % 147 | (option.config_key, 148 | option.type.__name__, 149 | type(option.value).__name__)) 150 | else: 151 | if not k.startswith("__"): 152 | logging.warning("Unrecognized option: %s" % k) 153 | -------------------------------------------------------------------------------- /mongo_connector/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Maximum # of documents to process before recording timestamp 16 | # default = -1 (no maximum) 17 | DEFAULT_BATCH_SIZE = -1 18 | 19 | # Interval in seconds between doc manager flushes (i.e. auto commit) 20 | # default = None (never auto commit) 21 | DEFAULT_COMMIT_INTERVAL = None 22 | 23 | # Maximum # of documents to send in a single bulk request through a 24 | # DocManager. 25 | DEFAULT_MAX_BULK = 1000 26 | 27 | # The default MongoDB field that will serve as the unique key for the 28 | # target system. 29 | DEFAULT_UNIQUE_KEY = "_id" 30 | 31 | # Default host and facility for logging to the syslog. 
32 | DEFAULT_SYSLOG_HOST = "localhost:512" 33 | DEFAULT_SYSLOG_FACILITY = "user" 34 | 35 | # ROTATING LOGFILE 36 | # The type of interval 37 | # (seconds, minutes, hours... c.f. logging.handlers.TimedRotatingFileHandler) 38 | DEFAULT_LOGFILE_WHEN = "midnight" 39 | # The rollover interval 40 | DEFAULT_LOGFILE_INTERVAL = 1 41 | # Number of log files to keep 42 | DEFAULT_LOGFILE_BACKUPCOUNT = 7 43 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from pkgutil import extend_path 15 | __path__ = extend_path(__path__, __name__) 16 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/doc_manager_base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | from mongo_connector.compat import reraise 17 | from mongo_connector.errors import UpdateDoesNotApply 18 | 19 | 20 | class DocManagerBase(object): 21 | """Base class for all DocManager implementations.""" 22 | 23 | def apply_update(self, doc, update_spec): 24 | """Apply an update operation to a document.""" 25 | 26 | # Helper to cast a key for a list or dict, or raise ValueError 27 | def _convert_or_raise(container, key): 28 | if isinstance(container, dict): 29 | return key 30 | elif isinstance(container, list): 31 | return int(key) 32 | else: 33 | raise ValueError 34 | 35 | # Helper to retrieve (and/or create) 36 | # a dot-separated path within a document. 37 | def _retrieve_path(container, path, create=False): 38 | looking_at = container 39 | for part in path: 40 | if isinstance(looking_at, dict): 41 | if create and not part in looking_at: 42 | looking_at[part] = {} 43 | looking_at = looking_at[part] 44 | elif isinstance(looking_at, list): 45 | index = int(part) 46 | # Do we need to create additional space in the array? 47 | if create and len(looking_at) <= index: 48 | # Fill buckets with None up to the index we need. 49 | looking_at.extend( 50 | [None] * (index - len(looking_at))) 51 | # Bucket we need gets the empty dictionary. 
52 | looking_at.append({}) 53 | looking_at = looking_at[index] 54 | else: 55 | raise ValueError 56 | return looking_at 57 | 58 | # wholesale document replacement 59 | if not "$set" in update_spec and not "$unset" in update_spec: 60 | # update spec contains the new document in its entirety 61 | return update_spec 62 | else: 63 | try: 64 | # $set 65 | for to_set in update_spec.get("$set", []): 66 | value = update_spec['$set'][to_set] 67 | if '.' in to_set: 68 | path = to_set.split(".") 69 | where = _retrieve_path(doc, path[:-1], create=True) 70 | wl = len(where) 71 | index = _convert_or_raise(where, path[-1]) 72 | if isinstance(where, list) and index >= wl: 73 | where.extend([None] * (index + 1 - wl)) 74 | where[index] = value 75 | else: 76 | doc[to_set] = value 77 | 78 | # $unset 79 | for to_unset in update_spec.get("$unset", []): 80 | if '.' in to_unset: 81 | path = to_unset.split(".") 82 | where = _retrieve_path(doc, path[:-1]) 83 | where.pop(_convert_or_raise(where, path[-1])) 84 | else: 85 | doc.pop(to_unset) 86 | except (KeyError, ValueError, AttributeError, IndexError): 87 | exc_t, exc_v, exc_tb = sys.exc_info() 88 | reraise(UpdateDoesNotApply, 89 | "Cannot apply update %r to %r" % (update_spec, doc), 90 | exc_tb) 91 | return doc 92 | 93 | def bulk_upsert(self, docs, namespace, timestamp): 94 | """Upsert each document in a set of documents. 95 | 96 | This method may be overridden to upsert many documents at once. 97 | """ 98 | for doc in docs: 99 | self.upsert(doc, namespace, timestamp) 100 | 101 | def update(self, doc, update_spec, namespace, timestamp): 102 | """Update a document. 103 | 104 | ``update_spec`` is the update operation as provided by an oplog record 105 | in the "o" field. 106 | """ 107 | raise NotImplementedError 108 | 109 | def upsert(self, document, namespace, timestamp): 110 | """(Re-)insert a document.""" 111 | raise NotImplementedError 112 | 113 | def remove(self, document_id, namespace, timestamp): 114 | """Remove a document. 115 | 116 | ``document_id`` is a dict that provides the id of the document 117 | to be removed. ``namespace`` and ``timestamp`` provide the database + 118 | collection name and the timestamp from the corresponding oplog entry. 119 | """ 120 | raise NotImplementedError 121 | 122 | def insert_file(self, f, namespace, timestamp): 123 | """Insert a file from GridFS.""" 124 | raise NotImplementedError 125 | 126 | def handle_command(self, command_doc, namespace, timestamp): 127 | """Handle a MongoDB command.""" 128 | raise NotImplementedError 129 | 130 | def search(self, start_ts, end_ts): 131 | """Get an iterable of documents that were inserted, updated, or deleted 132 | between ``start_ts`` and ``end_ts``. 133 | """ 134 | raise NotImplementedError 135 | 136 | def commit(self): 137 | """Commit all outstanding writes.""" 138 | raise NotImplementedError 139 | 140 | def get_last_doc(self): 141 | """Get the document that was modified or deleted most recently.""" 142 | raise NotImplementedError 143 | 144 | def stop(self): 145 | """Stop all threads started by this DocManager.""" 146 | raise NotImplementedError 147 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/doc_manager_simulator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """A class to serve as proxy for the target engine for testing. 16 | 17 | Receives documents from the oplog worker threads and indexes them 18 | into the backend. 19 | 20 | Please look at the Solr and ElasticSearch doc manager classes for a sample 21 | implementation with real systems. 22 | """ 23 | 24 | from threading import RLock 25 | 26 | from mongo_connector import constants 27 | from mongo_connector.errors import OperationFailed 28 | from mongo_connector.doc_managers.doc_manager_base import DocManagerBase 29 | from mongo_connector.compat import u 30 | 31 | 32 | class DocumentStore(dict): 33 | 34 | def __init__(self): 35 | self._lock = RLock() 36 | 37 | def __getitem__(self, key): 38 | with self._lock: 39 | return super(DocumentStore, self).__getitem__(key) 40 | 41 | def __setitem__(self, key, value): 42 | with self._lock: 43 | return super(DocumentStore, self).__setitem__(key, value) 44 | 45 | def __iter__(self): 46 | def __myiter__(): 47 | with self._lock: 48 | for item in super(DocumentStore, self).__iter__(): 49 | yield item 50 | return __myiter__() 51 | 52 | 53 | class Entry(object): 54 | 55 | def __init__(self, doc, ns, ts): 56 | self.doc, self.ns, self.ts = doc, ns, ts 57 | self._id = self.doc['_id'] 58 | 59 | @property 60 | def meta_dict(self): 61 | return {'_id': self._id, 'ns': self.ns, '_ts': self.ts} 62 | 63 | @property 64 | def merged_dict(self): 65 | d = self.doc.copy() 66 | d.update(**self.meta_dict) 67 | return d 68 | 69 | def update(self, ns, ts): 70 | self.ns, self.ts = ns, ts 71 | 72 | 73 | class DocManager(DocManagerBase): 74 | """BackendSimulator emulates both a target DocManager and a server. 75 | 76 | The DocManager class creates a connection to the backend engine and 77 | adds/removes documents, and in the case of rollback, searches for them. 78 | 79 | The reason for storing id/doc pairs as opposed to doc's is so that multiple 80 | updates to the same doc reflect the most up to date version as opposed to 81 | multiple, slightly different versions of a doc. 82 | """ 83 | 84 | def __init__(self, url=None, unique_key='_id', 85 | auto_commit_interval=None, 86 | chunk_size=constants.DEFAULT_MAX_BULK, **kwargs): 87 | """Creates a dictionary to hold document id keys mapped to the 88 | documents as values. 89 | """ 90 | self.unique_key = unique_key 91 | self.auto_commit_interval = auto_commit_interval 92 | self.doc_dict = DocumentStore() 93 | self.url = url 94 | self.chunk_size = chunk_size 95 | self.kwargs = kwargs 96 | 97 | def stop(self): 98 | """Stops any running threads in the DocManager. 99 | """ 100 | pass 101 | 102 | def update(self, document_id, update_spec, namespace, timestamp): 103 | """Apply updates given in update_spec to the document whose id 104 | matches that of doc. 105 | 106 | """ 107 | document = self.doc_dict[document_id].doc 108 | updated = self.apply_update(document, update_spec) 109 | updated[self.unique_key] = updated.pop("_id") 110 | self.upsert(updated, namespace, timestamp) 111 | return updated 112 | 113 | def upsert(self, doc, namespace, timestamp): 114 | """Adds a document to the doc dict. 
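
        For example (sketch): after ``upsert({'_id': 1, 'a': 2}, 'db.coll', 5)``,
        ``doc_dict[1].merged_dict`` is
        ``{'_id': 1, 'a': 2, 'ns': 'db.coll', '_ts': 5}``.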
115 | """ 116 | 117 | # Allow exceptions to be triggered (for testing purposes) 118 | if doc.get('_upsert_exception'): 119 | raise Exception("upsert exception") 120 | 121 | doc_id = doc["_id"] 122 | self.doc_dict[doc_id] = Entry(doc=doc, ns=namespace, ts=timestamp) 123 | 124 | def insert_file(self, f, namespace, timestamp): 125 | """Inserts a file to the doc dict. 126 | """ 127 | doc = f.get_metadata() 128 | doc['content'] = f.read() 129 | self.doc_dict[f._id] = Entry(doc=doc, ns=namespace, ts=timestamp) 130 | 131 | def remove(self, document_id, namespace, timestamp): 132 | """Removes the document from the doc dict. 133 | """ 134 | try: 135 | entry = self.doc_dict[document_id] 136 | entry.doc = None 137 | entry.update(namespace, timestamp) 138 | except KeyError: 139 | raise OperationFailed("Document does not exist: %s" 140 | % u(document_id)) 141 | 142 | def search(self, start_ts, end_ts): 143 | """Searches through all documents and finds all documents that were 144 | modified or deleted within the range. 145 | 146 | Since we have very few documents in the doc dict when this is called, 147 | linear search is fine. This method is only used by rollbacks to query 148 | all the documents in the target engine within a certain timestamp 149 | window. The input will be two longs (converted from Bson timestamp) 150 | which specify the time range. The start_ts refers to the timestamp 151 | of the last oplog entry after a rollback. The end_ts is the timestamp 152 | of the last document committed to the backend. 153 | """ 154 | for _id in self.doc_dict: 155 | entry = self.doc_dict[_id] 156 | if entry.ts <= end_ts or entry.ts >= start_ts: 157 | yield entry.meta_dict 158 | 159 | def commit(self): 160 | """Simply passes since we're not using an engine that needs commiting. 161 | """ 162 | pass 163 | 164 | def get_last_doc(self): 165 | """Searches through the doc dict to find the document that was 166 | modified or deleted most recently.""" 167 | return max(self.doc_dict.values(), key=lambda x: x.ts).meta_dict 168 | 169 | def handle_command(self, command_doc, namespace, timestamp): 170 | pass 171 | 172 | def _search(self): 173 | """Returns all documents in the doc dict. 174 | 175 | This function is not a part of the DocManager API, and is only used 176 | to simulate searching all documents from a backend. 177 | """ 178 | results = [] 179 | for _id in self.doc_dict: 180 | entry = self.doc_dict[_id] 181 | if entry.doc is not None: 182 | results.append(entry.merged_dict) 183 | return results 184 | 185 | def _delete(self): 186 | """Deletes all documents. 187 | 188 | This function is not a part of the DocManager API, and is only used 189 | to simulate deleting all documents from a backend. 190 | """ 191 | self.doc_dict = {} 192 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/elastic_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Elasticsearch implementation of the DocManager interface. 16 | 17 | Receives documents from an OplogThread and takes the appropriate actions on 18 | Elasticsearch. 19 | """ 20 | import base64 21 | import logging 22 | 23 | from threading import Timer 24 | 25 | import bson.json_util 26 | 27 | from elasticsearch import Elasticsearch, exceptions as es_exceptions 28 | from elasticsearch.helpers import scan, streaming_bulk 29 | 30 | from mongo_connector import errors 31 | from mongo_connector.compat import u 32 | from mongo_connector.constants import (DEFAULT_COMMIT_INTERVAL, 33 | DEFAULT_MAX_BULK) 34 | from mongo_connector.util import exception_wrapper, retry_until_ok 35 | from mongo_connector.doc_managers.doc_manager_base import DocManagerBase 36 | from mongo_connector.doc_managers.formatters import DefaultDocumentFormatter 37 | 38 | wrap_exceptions = exception_wrapper({ 39 | es_exceptions.ConnectionError: errors.ConnectionFailed, 40 | es_exceptions.TransportError: errors.OperationFailed, 41 | es_exceptions.NotFoundError: errors.OperationFailed, 42 | es_exceptions.RequestError: errors.OperationFailed}) 43 | 44 | LOG = logging.getLogger(__name__) 45 | 46 | 47 | class DocManager(DocManagerBase): 48 | """Elasticsearch implementation of the DocManager interface. 49 | 50 | Receives documents from an OplogThread and takes the appropriate actions on 51 | Elasticsearch. 52 | """ 53 | 54 | def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL, 55 | unique_key='_id', chunk_size=DEFAULT_MAX_BULK, 56 | meta_index_name="mongodb_meta", meta_type="mongodb_meta", 57 | attachment_field="content", **kwargs): 58 | self.elastic = Elasticsearch(hosts=[url]) 59 | self.auto_commit_interval = auto_commit_interval 60 | self.meta_index_name = meta_index_name 61 | self.meta_type = meta_type 62 | self.unique_key = unique_key 63 | self.chunk_size = chunk_size 64 | if self.auto_commit_interval not in [None, 0]: 65 | self.run_auto_commit() 66 | self._formatter = DefaultDocumentFormatter() 67 | 68 | self.has_attachment_mapping = False 69 | self.attachment_field = attachment_field 70 | 71 | def _index_and_mapping(self, namespace): 72 | """Helper method for getting the index and type from a namespace.""" 73 | index, doc_type = namespace.split('.', 1) 74 | return index.lower(), doc_type 75 | 76 | def stop(self): 77 | """Stop the auto-commit thread.""" 78 | self.auto_commit_interval = None 79 | 80 | def apply_update(self, doc, update_spec): 81 | if "$set" not in update_spec and "$unset" not in update_spec: 82 | # Don't try to add ns and _ts fields back in from doc 83 | return update_spec 84 | return super(DocManager, self).apply_update(doc, update_spec) 85 | 86 | @wrap_exceptions 87 | def handle_command(self, doc, namespace, timestamp): 88 | db = namespace.split('.', 1)[0] 89 | if doc.get('dropDatabase'): 90 | dbs = self.command_helper.map_db(db) 91 | for _db in dbs: 92 | self.elastic.indices.delete(index=_db.lower()) 93 | 94 | if doc.get('renameCollection'): 95 | raise errors.OperationFailed( 96 | "elastic_doc_manager does not support renaming a mapping.") 97 | 98 | if doc.get('create'): 99 | db, coll = self.command_helper.map_collection(db, doc['create']) 100 | if db and coll: 101 | self.elastic.indices.put_mapping( 102 | index=db.lower(), doc_type=coll, 103 | body={ 104 | "_source": {"enabled": True} 105 | }) 106 | 107 | if doc.get('drop'): 108 | db, coll = 
self.command_helper.map_collection(db, doc['drop'])
109 |             if db and coll:
110 |                 self.elastic.indices.delete_mapping(index=db.lower(),
111 |                                                     doc_type=coll)
112 | 
113 |     @wrap_exceptions
114 |     def update(self, document_id, update_spec, namespace, timestamp):
115 |         """Apply updates given in update_spec to the document whose id
116 |         matches that of doc.
117 |         """
118 |         self.commit()
119 |         index, doc_type = self._index_and_mapping(namespace)
120 |         document = self.elastic.get(index=index, doc_type=doc_type,
121 |                                     id=u(document_id))
122 |         updated = self.apply_update(document['_source'], update_spec)
123 |         # _id is immutable in MongoDB, so won't have changed in update
124 |         updated['_id'] = document['_id']
125 |         self.upsert(updated, namespace, timestamp)
126 |         # upsert() strips metadata, so only _id + fields in _source still here
127 |         return updated
128 | 
129 |     @wrap_exceptions
130 |     def upsert(self, doc, namespace, timestamp):
131 |         """Insert a document into Elasticsearch."""
132 |         index, doc_type = self._index_and_mapping(namespace)
133 |         # No need to duplicate '_id' in source document
134 |         doc_id = u(doc.pop("_id"))
135 |         metadata = {
136 |             "ns": namespace,
137 |             "_ts": timestamp
138 |         }
139 |         # Index the source document, using lowercase namespace as index name.
140 |         self.elastic.index(index=index, doc_type=doc_type,
141 |                            body=self._formatter.format_document(doc), id=doc_id,
142 |                            refresh=(self.auto_commit_interval == 0))
143 |         # Index document metadata with original namespace (mixed upper/lower).
144 |         self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
145 |                            body=bson.json_util.dumps(metadata), id=doc_id,
146 |                            refresh=(self.auto_commit_interval == 0))
147 |         # Leave _id, since it's part of the original document
148 |         doc['_id'] = doc_id
149 | 
150 |     @wrap_exceptions
151 |     def bulk_upsert(self, docs, namespace, timestamp):
152 |         """Insert multiple documents into Elasticsearch."""
153 |         index, doc_type = self._index_and_mapping(namespace)
154 |         def docs_to_upsert():
155 |             doc = None
156 |             for doc in docs:
157 |                 # Remove metadata and redundant _id
158 |                 doc_id = u(doc.pop("_id"))
159 |                 document_action = {
160 |                     "_index": index,
161 |                     "_type": doc_type,
162 |                     "_id": doc_id,
163 |                     "_source": self._formatter.format_document(doc)
164 |                 }
165 |                 document_meta = {
166 |                     "_index": self.meta_index_name,
167 |                     "_type": self.meta_type,
168 |                     "_id": doc_id,
169 |                     "_source": {
170 |                         "ns": namespace,
171 |                         "_ts": timestamp
172 |                     }
173 |                 }
174 |                 yield document_action
175 |                 yield document_meta
176 |             if not doc:
177 |                 raise errors.EmptyDocsError(
178 |                     "Cannot upsert an empty sequence of "
179 |                     "documents into Elasticsearch")
180 |         try:
181 |             kw = {}
182 |             if self.chunk_size > 0:
183 |                 kw['chunk_size'] = self.chunk_size
184 | 
185 |             responses = streaming_bulk(client=self.elastic,
186 |                                        actions=docs_to_upsert(),
187 |                                        **kw)
188 | 
189 |             for ok, resp in responses:
190 |                 if not ok:
191 |                     LOG.error(
192 |                         "Could not bulk-upsert document "
193 |                         "into Elasticsearch: %r" % resp)
194 |             if self.auto_commit_interval == 0:
195 |                 self.commit()
196 |         except errors.EmptyDocsError:
197 |             # This can happen when mongo-connector starts up, there is no
198 |             # config file, but nothing to dump
199 |             pass
200 | 
201 |     @wrap_exceptions
202 |     def insert_file(self, f, namespace, timestamp):
203 |         doc = f.get_metadata()
204 |         doc_id = str(doc.pop('_id'))
205 |         index, doc_type = self._index_and_mapping(namespace)
206 | 
207 |         # make sure that elasticsearch treats it like a file
208 |         if not self.has_attachment_mapping:
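            # Assumes the Elasticsearch "mapper-attachments" plugin is
            # installed: the "attachment" mapping type does not exist without
            # it, so this put_mapping call would otherwise fail.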
209 | body = { 210 | "properties": { 211 | self.attachment_field: {"type": "attachment"} 212 | } 213 | } 214 | self.elastic.indices.put_mapping(index=index, 215 | doc_type=doc_type, 216 | body=body) 217 | self.has_attachment_mapping = True 218 | 219 | metadata = { 220 | 'ns': namespace, 221 | '_ts': timestamp, 222 | } 223 | 224 | doc = self._formatter.format_document(doc) 225 | doc[self.attachment_field] = base64.b64encode(f.read()).decode() 226 | 227 | self.elastic.index(index=index, doc_type=doc_type, 228 | body=doc, id=doc_id, 229 | refresh=(self.auto_commit_interval == 0)) 230 | self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type, 231 | body=bson.json_util.dumps(metadata), id=doc_id, 232 | refresh=(self.auto_commit_interval == 0)) 233 | 234 | @wrap_exceptions 235 | def remove(self, document_id, namespace, timestamp): 236 | """Remove a document from Elasticsearch.""" 237 | index, doc_type = self._index_and_mapping(namespace) 238 | self.elastic.delete(index=index, doc_type=doc_type, 239 | id=u(document_id), 240 | refresh=(self.auto_commit_interval == 0)) 241 | self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type, 242 | id=u(document_id), 243 | refresh=(self.auto_commit_interval == 0)) 244 | 245 | @wrap_exceptions 246 | def _stream_search(self, *args, **kwargs): 247 | """Helper method for iterating over ES search results.""" 248 | for hit in scan(self.elastic, query=kwargs.pop('body', None), 249 | scroll='10m', **kwargs): 250 | hit['_source']['_id'] = hit['_id'] 251 | yield hit['_source'] 252 | 253 | def search(self, start_ts, end_ts): 254 | """Query Elasticsearch for documents in a time range. 255 | 256 | This method is used to find documents that may be in conflict during 257 | a rollback event in MongoDB. 258 | """ 259 | return self._stream_search( 260 | index=self.meta_index_name, 261 | body={ 262 | "query": { 263 | "filtered": { 264 | "filter": { 265 | "range": { 266 | "_ts": {"gte": start_ts, "lte": end_ts} 267 | } 268 | } 269 | } 270 | } 271 | }) 272 | 273 | def commit(self): 274 | """Refresh all Elasticsearch indexes.""" 275 | retry_until_ok(self.elastic.indices.refresh, index="") 276 | 277 | def run_auto_commit(self): 278 | """Periodically commit to the Elastic server.""" 279 | self.elastic.indices.refresh() 280 | if self.auto_commit_interval not in [None, 0]: 281 | Timer(self.auto_commit_interval, self.run_auto_commit).start() 282 | 283 | @wrap_exceptions 284 | def get_last_doc(self): 285 | """Get the most recently modified document from Elasticsearch. 286 | 287 | This method is used to help define a time window within which documents 288 | may be in conflict after a MongoDB rollback. 
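
        The search below sorts the metadata index on ``_ts`` descending and
        requests a single hit, so the document returned (if any) carries the
        greatest ``_ts`` known to Elasticsearch.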
289 | """ 290 | try: 291 | result = self.elastic.search( 292 | index=self.meta_index_name, 293 | body={ 294 | "query": {"match_all": {}}, 295 | "sort": [{"_ts": "desc"}], 296 | }, 297 | size=1 298 | )["hits"]["hits"] 299 | for r in result: 300 | r['_source']['_id'] = r['_id'] 301 | return r['_source'] 302 | except es_exceptions.RequestError: 303 | # no documents so ES returns 400 because of undefined _ts mapping 304 | return None 305 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/formatters.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import datetime 3 | import re 4 | 5 | from uuid import UUID 6 | from math import isnan, isinf 7 | 8 | import logging 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | import bson 13 | import bson.json_util 14 | 15 | from mongo_connector.compat import PY3 16 | 17 | if PY3: 18 | long = int 19 | unicode = str 20 | 21 | RE_TYPE = type(re.compile("")) 22 | try: 23 | from bson.regex import Regex 24 | RE_TYPES = (RE_TYPE, Regex) 25 | except ImportError: 26 | RE_TYPES = (RE_TYPE,) 27 | 28 | 29 | class DocumentFormatter(object): 30 | """Interface for classes that can transform documents to conform to 31 | external drivers' expectations. 32 | """ 33 | 34 | def transform_value(self, value): 35 | """Transform a leaf-node in a document. 36 | 37 | This method may be overridden to provide custom handling for specific 38 | types of values. 39 | """ 40 | raise NotImplementedError 41 | 42 | def transform_element(self, key, value): 43 | """Transform a single key-value pair within a document. 44 | 45 | This method may be overridden to provide custom handling for specific 46 | types of values. This method should return an iterator over the 47 | resulting key-value pairs. 48 | """ 49 | raise NotImplementedError 50 | 51 | def format_document(self, document): 52 | """Format a document in preparation to be sent to an external driver.""" 53 | raise NotImplementedError 54 | 55 | 56 | class DefaultDocumentFormatter(DocumentFormatter): 57 | """Basic DocumentFormatter that preserves numbers, base64-encodes binary, 58 | and stringifies everything else. 
59 | """ 60 | 61 | def transform_value(self, value): 62 | # This is largely taken from bson.json_util.default, though not the same 63 | # so we don't modify the structure of the document 64 | if isinstance(value, dict): 65 | return self.format_document(value) 66 | elif isinstance(value, list): 67 | return [self.transform_value(v) for v in value] 68 | if isinstance(value, RE_TYPES): 69 | flags = "" 70 | if value.flags & re.IGNORECASE: 71 | flags += "i" 72 | if value.flags & re.LOCALE: 73 | flags += "l" 74 | if value.flags & re.MULTILINE: 75 | flags += "m" 76 | if value.flags & re.DOTALL: 77 | flags += "s" 78 | if value.flags & re.UNICODE: 79 | flags += "u" 80 | if value.flags & re.VERBOSE: 81 | flags += "x" 82 | pattern = value.pattern 83 | # quasi-JavaScript notation (may include non-standard flags) 84 | return '/%s/%s' % (pattern, flags) 85 | elif (isinstance(value, bson.Binary) or 86 | (PY3 and isinstance(value, bytes))): 87 | # Just include body of binary data without subtype 88 | return base64.b64encode(value).decode() 89 | elif isinstance(value, UUID): 90 | return value.hex 91 | elif isinstance(value, (int, long, float)): 92 | if isnan(value): 93 | raise ValueError("nan") 94 | elif isinf(value): 95 | raise ValueError("inf") 96 | return value 97 | elif isinstance(value, datetime.datetime): 98 | return value 99 | elif value is None: 100 | return value 101 | # Default 102 | return unicode(value) 103 | 104 | def transform_element(self, key, value): 105 | try: 106 | new_value = self.transform_value(value) 107 | yield key, new_value 108 | except ValueError as e: 109 | LOG.warn("Invalid value for key: %s as %s" 110 | % (key, str(e))) 111 | 112 | def format_document(self, document): 113 | def _kernel(doc): 114 | for key in doc: 115 | value = doc[key] 116 | for new_k, new_v in self.transform_element(key, value): 117 | yield new_k, new_v 118 | return dict(_kernel(document)) 119 | 120 | 121 | class DocumentFlattener(DefaultDocumentFormatter): 122 | """Formatter that completely flattens documents and unwinds arrays: 123 | 124 | An example: 125 | {"a": 2, 126 | "b": { 127 | "c": { 128 | "d": 5 129 | } 130 | }, 131 | "e": [6, 7, 8] 132 | } 133 | 134 | becomes: 135 | {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8} 136 | 137 | """ 138 | 139 | def transform_element(self, key, value): 140 | if isinstance(value, list): 141 | for li, lv in enumerate(value): 142 | for inner_k, inner_v in self.transform_element( 143 | "%s.%s" % (key, li), lv): 144 | yield inner_k, inner_v 145 | elif isinstance(value, dict): 146 | formatted = self.format_document(value) 147 | for doc_key in formatted: 148 | yield "%s.%s" % (key, doc_key), formatted[doc_key] 149 | else: 150 | # We assume that transform_value will return a 'flat' value, 151 | # not a list or dict 152 | yield key, self.transform_value(value) 153 | 154 | def format_document(self, document): 155 | def flatten(doc, path): 156 | top_level = (len(path) == 0) 157 | if not top_level: 158 | path_string = ".".join(path) 159 | for k in doc: 160 | v = doc[k] 161 | if isinstance(v, dict): 162 | path.append(k) 163 | for inner_k, inner_v in flatten(v, path): 164 | yield inner_k, inner_v 165 | path.pop() 166 | else: 167 | transformed = self.transform_element(k, v) 168 | for new_k, new_v in transformed: 169 | if top_level: 170 | yield new_k, new_v 171 | else: 172 | yield "%s.%s" % (path_string, new_k), new_v 173 | return dict(flatten(document, [])) 174 | -------------------------------------------------------------------------------- 
/mongo_connector/doc_managers/mongo_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Receives documents from the oplog worker threads and indexes them 16 | into the backend. 17 | 18 | This file is a document manager for MongoDB, but the intent 19 | is that this file can be used as an example to add on different backends. 20 | To extend this to other systems, simply implement the exact same class and 21 | replace the method definitions with API calls for the desired backend. 22 | """ 23 | 24 | import logging 25 | import pymongo 26 | 27 | from gridfs import GridFS 28 | from mongo_connector import errors, constants 29 | from mongo_connector.util import exception_wrapper 30 | from mongo_connector.doc_managers.doc_manager_base import DocManagerBase 31 | 32 | wrap_exceptions = exception_wrapper({ 33 | pymongo.errors.ConnectionFailure: errors.ConnectionFailed, 34 | pymongo.errors.OperationFailure: errors.OperationFailed}) 35 | 36 | LOG = logging.getLogger(__name__) 37 | 38 | 39 | class DocManager(DocManagerBase): 40 | """The DocManager class creates a connection to the backend engine and 41 | adds/removes documents, and in the case of rollback, searches for them. 42 | 43 | The reason for storing id/doc pairs as opposed to doc's is so that 44 | multiple updates to the same doc reflect the most up to date version as 45 | opposed to multiple, slightly different versions of a doc. 46 | 47 | We are using MongoDB native fields for _id and ns, but we also store 48 | them as fields in the document, due to compatibility issues. 49 | """ 50 | 51 | def __init__(self, url, **kwargs): 52 | """ Verify URL and establish a connection. 
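
        ``url`` may be anything accepted by ``pymongo.MongoClient``, e.g.
        ``'localhost:27017'`` or a full ``mongodb://`` connection URI.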
53 | """ 54 | try: 55 | self.mongo = pymongo.MongoClient(url) 56 | except pymongo.errors.InvalidURI: 57 | raise errors.ConnectionFailed("Invalid URI for MongoDB") 58 | except pymongo.errors.ConnectionFailure: 59 | raise errors.ConnectionFailed("Failed to connect to MongoDB") 60 | self.namespace_set = kwargs.get("namespace_set") 61 | self.chunk_size = kwargs.get('chunk_size', constants.DEFAULT_MAX_BULK) 62 | 63 | def _db_and_collection(self, namespace): 64 | return namespace.split('.', 1) 65 | 66 | @wrap_exceptions 67 | def _namespaces(self): 68 | """Provides the list of namespaces being replicated to MongoDB 69 | """ 70 | if self.namespace_set: 71 | return self.namespace_set 72 | 73 | user_namespaces = [] 74 | db_list = self.mongo.database_names() 75 | for database in db_list: 76 | if database == "config" or database == "local": 77 | continue 78 | coll_list = self.mongo[database].collection_names() 79 | for coll in coll_list: 80 | if coll.startswith("system"): 81 | continue 82 | namespace = "%s.%s" % (database, coll) 83 | user_namespaces.append(namespace) 84 | return user_namespaces 85 | 86 | def stop(self): 87 | """Stops any running threads 88 | """ 89 | LOG.info( 90 | "Mongo DocManager Stopped: If you will not target this system " 91 | "again with mongo-connector then you may drop the database " 92 | "__mongo_connector, which holds metadata for Mongo Connector." 93 | ) 94 | 95 | @wrap_exceptions 96 | def handle_command(self, doc, namespace, timestamp): 97 | db, _ = self._db_and_collection(namespace) 98 | if doc.get('dropDatabase'): 99 | for new_db in self.command_helper.map_db(db): 100 | self.mongo.drop_database(db) 101 | 102 | if doc.get('renameCollection'): 103 | a = self.command_helper.map_namespace(doc['renameCollection']) 104 | b = self.command_helper.map_namespace(doc['to']) 105 | if a and b: 106 | self.mongo.admin.command( 107 | "renameCollection", a, to=b) 108 | 109 | if doc.get('create'): 110 | new_db, coll = self.command_helper.map_collection( 111 | db, doc['create']) 112 | if new_db: 113 | self.mongo[new_db].create_collection(coll) 114 | 115 | if doc.get('drop'): 116 | new_db, coll = self.command_helper.map_collection( 117 | db, doc['drop']) 118 | if new_db: 119 | self.mongo[new_db].drop_collection(coll) 120 | 121 | @wrap_exceptions 122 | def update(self, document_id, update_spec, namespace, timestamp): 123 | """Apply updates given in update_spec to the document whose id 124 | matches that of doc. 
125 | 
126 |         """
127 |         db, coll = self._db_and_collection(namespace)
128 |         updated = self.mongo[db][coll].find_and_modify(
129 |             {'_id': document_id},
130 |             update_spec,
131 |             new=True
132 |         )
133 |         return updated
134 | 
135 |     @wrap_exceptions
136 |     def upsert(self, doc, namespace, timestamp):
137 |         """Update or insert a document into Mongo
138 |         """
139 |         database, coll = self._db_and_collection(namespace)
140 | 
141 |         self.mongo["__mongo_connector"][namespace].save({
142 |             '_id': doc['_id'],
143 |             "_ts": timestamp,
144 |             "ns": namespace
145 |         })
146 |         self.mongo[database][coll].save(doc)
147 | 
148 |     @wrap_exceptions
149 |     def bulk_upsert(self, docs, namespace, timestamp):
150 |         def iterate_chunks():
151 |             dbname, collname = self._db_and_collection(namespace)
152 |             collection = self.mongo[dbname][collname]
153 |             meta_collection = self.mongo['__mongo_connector'][namespace]
154 |             more_chunks = True
155 |             while more_chunks:
156 |                 bulk = collection.initialize_ordered_bulk_op()
157 |                 bulk_meta = meta_collection.initialize_ordered_bulk_op()
158 |                 for i in range(self.chunk_size):
159 |                     try:
160 |                         doc = next(docs)
161 |                         selector = {'_id': doc['_id']}
162 |                         bulk.find(selector).upsert().replace_one(doc)
163 |                         bulk_meta.find(selector).upsert().replace_one({
164 |                             '_id': doc['_id'],
165 |                             'ns': namespace,
166 |                             '_ts': timestamp
167 |                         })
168 |                     except StopIteration:
169 |                         more_chunks = False
170 |                         if i > 0:
171 |                             yield bulk, bulk_meta
172 |                         break
173 |                 if more_chunks:
174 |                     yield bulk, bulk_meta
175 | 
176 |         for bulk_op, meta_bulk_op in iterate_chunks():
177 |             try:
178 |                 bulk_op.execute()
179 |                 meta_bulk_op.execute()
180 |             except pymongo.errors.DuplicateKeyError as e:
181 |                 LOG.warn('Continuing after DuplicateKeyError: '
182 |                          + str(e))
183 | 
184 |     @wrap_exceptions
185 |     def remove(self, document_id, namespace, timestamp):
186 |         """Removes a document from Mongo.
187 | 
188 |         ``document_id`` is the ``_id`` of the document to delete. Its metadata
189 |         entry (and any associated GridFS content) is removed as well.
190 |         """
191 |         database, coll = self._db_and_collection(namespace)
192 | 
193 |         doc2 = self.mongo['__mongo_connector'][namespace].find_and_modify(
194 |             {'_id': document_id}, remove=True)
195 |         if (doc2 and doc2.get('gridfs_id')):
196 |             GridFS(self.mongo[database], coll).delete(doc2['gridfs_id'])
197 |         else:
198 |             self.mongo[database][coll].remove({'_id': document_id})
199 | 
200 |     @wrap_exceptions
201 |     def insert_file(self, f, namespace, timestamp):
202 |         database, coll = self._db_and_collection(namespace)
203 | 
204 |         file_id = GridFS(self.mongo[database], coll).put(f, filename=f.filename)
205 |         self.mongo["__mongo_connector"][namespace].save({
206 |             '_id': f._id,
207 |             '_ts': timestamp,
208 |             'ns': namespace,
209 |             'gridfs_id': file_id
210 |         })
211 | 
212 |     @wrap_exceptions
213 |     def search(self, start_ts, end_ts):
214 |         """Called to query Mongo for documents in a time range.
215 |         """
216 |         for namespace in self._namespaces():
217 |             database, coll = self._db_and_collection(namespace)
218 |             for ts_ns_doc in self.mongo["__mongo_connector"][namespace].find(
219 |                     {'_ts': {'$lte': end_ts,
220 |                              '$gte': start_ts}}
221 |             ):
222 |                 yield ts_ns_doc
223 | 
224 |     def commit(self):
225 |         """ Performs a commit (a no-op for MongoDB)
226 |         """
227 |         return
228 | 
229 |     @wrap_exceptions
230 |     def get_last_doc(self):
231 |         """Returns the last document stored in Mongo.
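
        Scans the per-namespace metadata collections under
        ``__mongo_connector`` and returns the entry with the greatest ``_ts``.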
232 | """ 233 | def docs_by_ts(): 234 | for namespace in self._namespaces(): 235 | database, coll = self._db_and_collection(namespace) 236 | mc_coll = self.mongo["__mongo_connector"][namespace] 237 | for ts_ns_doc in mc_coll.find(limit=1).sort('_ts', -1): 238 | yield ts_ns_doc 239 | 240 | return max(docs_by_ts(), key=lambda x: x["_ts"]) 241 | -------------------------------------------------------------------------------- /mongo_connector/errors.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Exceptions raised by the mongo_connector package.""" 16 | 17 | 18 | class MongoConnectorError(Exception): 19 | """Base class for all exceptions in the mongo_connector package 20 | """ 21 | 22 | 23 | class ConnectionFailed(MongoConnectorError): 24 | """Raised when mongo-connector can't connect to target system 25 | """ 26 | 27 | 28 | class OperationFailed(MongoConnectorError): 29 | """Raised for failed commands on the destination database 30 | """ 31 | 32 | 33 | class InvalidConfiguration(MongoConnectorError): 34 | """Raised when the user specifies an invalid configuration 35 | """ 36 | 37 | 38 | class EmptyDocsError(MongoConnectorError): 39 | """Raised on attempts to upsert empty sequences of documents 40 | """ 41 | 42 | 43 | class UpdateDoesNotApply(OperationFailed): 44 | """Raised when an update operation cannot be applied to a document.""" 45 | -------------------------------------------------------------------------------- /mongo_connector/gridfs_file.py: -------------------------------------------------------------------------------- 1 | import gridfs 2 | 3 | from mongo_connector import errors, util 4 | 5 | wrap_exceptions = util.exception_wrapper({ 6 | gridfs.errors.CorruptGridFile: errors.OperationFailed 7 | }) 8 | 9 | 10 | class GridFSFile(object): 11 | @wrap_exceptions 12 | def __init__(self, collection, doc): 13 | self._id = doc['_id'] 14 | self.f = gridfs.GridOut(collection, file_document=doc) 15 | self.filename = self.f.filename 16 | self.length = self.f.length 17 | self.upload_date = self.f.upload_date 18 | self.md5 = self.f.md5 19 | 20 | def get_metadata(self): 21 | result = { 22 | '_id': self._id, 23 | 'upload_date': self.upload_date, 24 | 'md5': self.md5, 25 | } 26 | if self.filename is not None: 27 | result['filename'] = self.filename 28 | return result 29 | 30 | def __len__(self): 31 | return self.length 32 | 33 | @wrap_exceptions 34 | def read(self, n=-1): 35 | return self.f.read(n) 36 | -------------------------------------------------------------------------------- /mongo_connector/locking_dict.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | 4 | class LockingDict(): 5 | 6 | def __init__(self): 7 | 8 | self.dict = {} 9 | self.lock = threading.Lock() 10 | 11 | def __enter__(self): 12 | self.acquire_lock() 13 | return self 14 | 15 | def 
__exit__(self, type, value, traceback):
16 |         self.release_lock()
17 | 
18 |     def get_dict(self):
19 |         return self.dict
20 | 
21 |     def acquire_lock(self):
22 |         self.lock.acquire()
23 | 
24 |     def release_lock(self):
25 |         self.lock.release()
26 | 
-------------------------------------------------------------------------------- /mongo_connector/util.py: --------------------------------------------------------------------------------
1 | # Copyright 2013-2014 MongoDB, Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """A set of utilities used throughout the mongo-connector
16 | """
17 | 
18 | import logging
19 | import sys
20 | import time
21 | 
22 | from bson.timestamp import Timestamp
23 | from mongo_connector.compat import reraise
24 | 
25 | LOG = logging.getLogger(__name__)
26 | 
27 | 
28 | def exception_wrapper(mapping):
29 |     def decorator(f):
30 |         def wrapped(*args, **kwargs):
31 |             try:
32 |                 return f(*args, **kwargs)
33 |             except:
34 |                 exc_type, exc_value, exc_tb = sys.exc_info()
35 |                 new_type = mapping.get(exc_type)
36 |                 if new_type is None:
37 |                     raise
38 |                 reraise(new_type, exc_value, exc_tb)
39 |         return wrapped
40 |     return decorator
41 | 
42 | 
43 | def bson_ts_to_long(timestamp):
44 |     """Convert BSON timestamp into integer.
45 | 
46 |     Conversion rule is based from the specs
47 |     (http://bsonspec.org/#/specification).
48 |     """
49 |     return ((timestamp.time << 32) + timestamp.inc)
50 | 
51 | 
52 | def long_to_bson_ts(val):
53 |     """Convert integer into BSON timestamp.
54 |     """
55 |     seconds = val >> 32
56 |     increment = val & 0xffffffff
57 | 
58 |     return Timestamp(seconds, increment)
59 | 
60 | 
61 | def retry_until_ok(func, *args, **kwargs):
62 |     """Retry code block until it succeeds.
63 | 
64 |     If it does not succeed within 120 attempts, the function re-raises any
65 |     error the function raised on its last attempt.
66 | 
67 |     """
68 | 
69 |     count = 0
70 |     while True:
71 |         try:
72 |             return func(*args, **kwargs)
73 |         except Exception:
74 |             count += 1
75 |             if count > 120:
76 |                 LOG.exception('Call to %s failed too many times in '
77 |                               'retry_until_ok', func)
78 |                 raise
79 |             time.sleep(1)
80 | 
81 | 
82 | def log_fatal_exceptions(func):
83 |     def wrapped(*args, **kwargs):
84 |         try:
85 |             func(*args, **kwargs)
86 |         except Exception:
87 |             LOG.exception("Fatal Exception")
88 |             raise
89 |     return wrapped
90 | 
-------------------------------------------------------------------------------- /scripts/mongo-connector: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # mongo-connector        Start Mongo Connector
4 | #
5 | # chkconfig: 345 90 25
6 | # description: Mongo Connector replicates data from MongoDB to external
7 | #              database systems.
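#
# Typical usage (illustrative): edit /etc/mongo-connector.json, then run
# "service mongo-connector start"; the config and pidfile paths are set below.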
8 | 
9 | ### BEGIN INIT INFO
10 | # Provides:          mongo-connector
11 | # Default-Start:     3 4 5
12 | # Default-Stop:      0 1 6
13 | # Required-Start:
14 | # Required-Stop:
15 | # Short-Description: Start up Mongo Connector
16 | # Description:       Mongo Connector replicates data from MongoDB to external
17 | #                    database systems.
18 | ### END INIT INFO
19 | 
20 | # source function library
21 | DEBIAN=
22 | if [ -f /lib/lsb/init-functions ]; then
23 |     DEBIAN=1
24 |     . /lib/lsb/init-functions
25 | else
26 |     . /etc/rc.d/init.d/functions
27 | fi
28 | 
29 | RETVAL=0
30 | 
31 | pidfile=/var/run/mongo-connector.pid
32 | config=/etc/mongo-connector.json
33 | error=/var/log/mongo-connector/mongo-connector.err
34 | 
35 | mc="/usr/bin/python -m mongo_connector.connector \
36 | -c $config >/dev/null 2> $error & \
37 | echo "'$!'" > $pidfile"
38 | 
39 | start()
40 | {
41 |     echo "starting mongo-connector "
42 |     if [ "$DEBIAN" ]; then
43 |         /sbin/start-stop-daemon --start --exec /bin/sh -p $pidfile -- -c "$mc"
44 |     else
45 |         daemon --pidfile $pidfile "$mc"
46 |     fi
47 | 
48 |     RETVAL=$?
49 |     if [ $RETVAL -eq 0 ]; then
50 |         echo "done."
51 |     else
52 |         echo "failed. Please check exit code and logs for more information"
53 |     fi
54 | 
55 |     return $RETVAL
56 | }
57 | 
58 | stop()
59 | {
60 |     echo "stopping mongo-connector: "
61 |     killproc -p $pidfile
62 | 
63 |     RETVAL=$?
64 |     if [ $RETVAL -eq 0 ]; then
65 |         echo "done."
66 |         rm -f $pidfile
67 |     else
68 |         echo "failed. Please check exit code and logs for more information"
69 |     fi
70 |     return $RETVAL
71 | }
72 | 
73 | restart() {
74 |     $0 stop
75 |     $0 start
76 | }
77 | 
78 | check_status() {
79 |     if [ "$DEBIAN" ]; then
80 |         echo -n "mongo-connector "
81 |         status_of_proc -p $pidfile mongo-connector
82 |     else
83 |         status -p $pidfile mongo-connector
84 |     fi
85 | }
86 | 
87 | case "$1" in
88 |     start)
89 |         start
90 |         ;;
91 |     stop)
92 |         stop
93 |         ;;
94 |     restart)
95 |         restart
96 |         ;;
97 |     status)
98 |         check_status
99 |         ;;
100 |     *)
101 |         echo $"Usage: $0 {start|stop|restart|status}"
102 |         RETVAL=2
103 | esac
104 | exit $RETVAL
105 | 
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | # Copyright 2013-2014 MongoDB, Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | classifiers = """\ 16 | Development Status :: 4 - Beta 17 | Intended Audience :: Developers 18 | License :: OSI Approved :: Apache Software License 19 | Programming Language :: Python :: 2.6 20 | Programming Language :: Python :: 2.7 21 | Programming Language :: Python :: 3.3 22 | Programming Language :: Python :: 3.4 23 | Topic :: Database 24 | Topic :: Software Development :: Libraries :: Python Modules 25 | Operating System :: Unix 26 | Operating System :: MacOS :: MacOS X 27 | Operating System :: Microsoft :: Windows 28 | Operating System :: POSIX 29 | """ 30 | 31 | import os 32 | import platform 33 | import sys 34 | from distutils.core import Command 35 | from distutils.dir_util import mkpath, remove_tree 36 | from distutils.file_util import copy_file 37 | try: 38 | from setuptools import setup 39 | except ImportError: 40 | from ez_setup import use_setuptools 41 | use_setuptools() 42 | from setuptools import setup 43 | 44 | extra_opts = {"test_suite": "tests", 45 | "tests_require": ["mongo-orchestration>=0.2", "requests>=2.5.1"]} 46 | 47 | if sys.version_info[:2] == (2, 6): 48 | # Need unittest2 to run unittests in Python 2.6 49 | extra_opts["tests_require"].append("unittest2") 50 | extra_opts["test_suite"] = "unittest2.collector" 51 | 52 | try: 53 | with open("README.rst", "r") as fd: 54 | extra_opts['long_description'] = fd.read() 55 | except IOError: 56 | pass # Install without README.rst 57 | 58 | 59 | class InstallService(Command): 60 | description = "Installs Mongo Connector as a Linux system daemon" 61 | 62 | user_options = [] 63 | 64 | def initialize_options(self): 65 | pass 66 | 67 | def finalize_options(self): 68 | pass 69 | 70 | def run(self): 71 | if platform.system() != 'Linux': 72 | print("Must be running Linux") 73 | elif os.geteuid() > 0: 74 | print("Must be root user") 75 | else: 76 | mkpath("/var/log/mongo-connector") 77 | mkpath("/etc/init.d") 78 | copy_file("./config.json", "/etc/mongo-connector.json") 79 | copy_file("./scripts/mongo-connector", 80 | "/etc/init.d/mongo-connector") 81 | 82 | 83 | class UninstallService(Command): 84 | description = "Uninstalls Mongo Connector as a Linux system daemon" 85 | 86 | user_options = [] 87 | 88 | def initialize_options(self): 89 | pass 90 | 91 | def finalize_options(self): 92 | pass 93 | 94 | def remove_file(self, path): 95 | if os.path.exists(path): 96 | os.remove(path) 97 | print("removing '%s'" % path) 98 | 99 | def run(self): 100 | if platform.system() != 'Linux': 101 | print("Must be running Linux") 102 | elif os.geteuid() > 0: 103 | print("Must be root user") 104 | else: 105 | if os.path.exists("/var/log/mongo-connector"): 106 | remove_tree("/var/log/mongo-connector") 107 | self.remove_file("/etc/mongo-connector.json") 108 | self.remove_file("/etc/init.d/mongo-connector") 109 | 110 | extra_opts['cmdclass'] = { 111 | "install_service": InstallService, 112 | "uninstall_service": UninstallService 113 | } 114 | 115 | setup(name='mongo-connector', 116 | version="2.1.dev0", 117 | author="MongoDB, Inc.", 118 | author_email='mongodb-user@googlegroups.com', 119 | description='Mongo Connector', 120 | keywords=['mongo-connector', 'mongo', 'mongodb', 'solr', 'elasticsearch'], 121 | url='https://github.com/10gen-labs/mongo-connector', 122 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 123 | platforms=["any"], 124 | classifiers=filter(None, classifiers.split("\n")), 125 | install_requires=['pymongo >= 2.7.2, < 3.0.0', 126 | 'pysolr >= 3.1.0', 127 | 'elasticsearch >= 1.2', 128 | 'algoliasearch >= 1.5.4'], 129 
| packages=["mongo_connector", "mongo_connector.doc_managers"], 130 | package_data={ 131 | 'mongo_connector.doc_managers': ['schema.xml'] 132 | }, 133 | entry_points={ 134 | 'console_scripts': [ 135 | 'mongo-connector = mongo_connector.connector:main', 136 | ], 137 | }, 138 | **extra_opts 139 | ) 140 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import logging 15 | import os 16 | import sys 17 | 18 | logging.basicConfig(stream=sys.stdout) 19 | 20 | if sys.version_info[0] == 3: 21 | unicode = str 22 | 23 | if sys.version_info[:2] == (2, 6): 24 | import unittest2 as unittest 25 | from unittest2.case import SkipTest 26 | else: 27 | import unittest 28 | from unittest.case import SkipTest 29 | 30 | # Configurable hosts and ports used in the tests 31 | elastic_host = unicode(os.environ.get("ES_HOST", 'localhost')) 32 | elastic_port = unicode(os.environ.get("ES_PORT", 9200)) 33 | elastic_pair = '%s:%s' % (elastic_host, elastic_port) 34 | solr_url = unicode(os.environ.get('SOLR_URL', 'http://localhost:8983/solr')) 35 | db_user = unicode(os.environ.get("DB_USER", "")) 36 | db_password = unicode(os.environ.get("DB_PASSWORD", "")) 37 | # Extra keyword options to provide to Connector. 38 | connector_opts = {} 39 | if db_user: 40 | connector_opts = {'auth_username': db_user, 'auth_key': db_password} 41 | 42 | # Document count for stress tests 43 | STRESS_COUNT = 100 44 | 45 | # Test namespace, timestamp arguments 46 | TESTARGS = ('test.test', 1) 47 | -------------------------------------------------------------------------------- /tests/lib/dummy.pwd: -------------------------------------------------------------------------------- 1 | secret -------------------------------------------------------------------------------- /tests/setup_cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
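# These helpers drive a mongo-orchestration server (assumed reachable at
# MO_ADDRESS, "localhost:8889" by default) over its REST API to start MongoDB
# servers, replica sets, and sharded clusters for the test suite.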
14 | 
15 | import atexit
16 | import itertools
17 | import os
18 | 
19 | import pymongo
20 | import requests
21 | 
22 | from tests import db_user, db_password
23 | 
24 | _mo_address = os.environ.get("MO_ADDRESS", "localhost:8889")
25 | _mongo_start_port = int(os.environ.get("MONGO_PORT", 27017))
26 | _free_port = itertools.count(_mongo_start_port)
27 | 
28 | DEFAULT_OPTIONS = {
29 |     'logappend': True,
30 |     'setParameter': {'enableTestCommands': 1}
31 | }
32 | 
33 | 
34 | _post_request_template = {}
35 | if db_user and db_password:
36 |     _post_request_template = {'login': db_user, 'password': db_password}
37 | 
38 | 
39 | def _proc_params(mongos=False):
40 |     params = dict(port=next(_free_port), **DEFAULT_OPTIONS)
41 |     if not mongos:
42 |         params['smallfiles'] = True
43 |         params['noprealloc'] = True
44 |         params['nojournal'] = True
45 | 
46 |     return params
47 | 
48 | 
49 | def _mo_url(resource, *args):
50 |     return 'http://' + '/'.join([_mo_address, resource] + list(args))
51 | 
52 | 
53 | @atexit.register
54 | def kill_all():
55 |     clusters = requests.get(_mo_url('sharded_clusters')).json()
56 |     repl_sets = requests.get(_mo_url('replica_sets')).json()
57 |     servers = requests.get(_mo_url('servers')).json()
58 |     for cluster in clusters['sharded_clusters']:
59 |         requests.delete(_mo_url('sharded_clusters', cluster['id']))
60 |     for rs in repl_sets['replica_sets']:
61 |         requests.delete(_mo_url('replica_sets', rs['id']))
62 |     for server in servers['servers']:
63 |         requests.delete(_mo_url('servers', server['id']))
64 | 
65 | 
66 | class MCTestObject(object):
67 | 
68 |     def get_config(self):
69 |         raise NotImplementedError
70 | 
71 |     def _make_post_request(self):
72 |         config = _post_request_template.copy()
73 |         config.update(self.get_config())
74 |         return requests.post(
75 |             _mo_url(self._resource), timeout=None, json=config).json()
76 | 
77 |     def client(self, **kwargs):
78 |         client = pymongo.MongoClient(self.uri, **kwargs)
79 |         if db_user:
80 |             client.admin.authenticate(db_user, db_password)
81 |         return client
82 | 
83 |     def stop(self):
84 |         requests.delete(_mo_url(self._resource, self.id))
85 | 
86 | 
87 | class Server(MCTestObject):
88 | 
89 |     _resource = 'servers'
90 | 
91 |     def __init__(self, id=None, uri=None):
92 |         self.id = id
93 |         self.uri = uri
94 | 
95 |     def get_config(self):
96 |         return {'name': 'mongod', 'procParams': _proc_params()}
97 | 
98 |     def start(self):
99 |         if self.id is None:
100 |             response = self._make_post_request()
101 |             self.id = response['id']
102 |             self.uri = response.get('mongodb_auth_uri', response['mongodb_uri'])
103 |         else:
104 |             requests.post(
105 |                 _mo_url('servers', self.id), timeout=None,
106 |                 json={'action': 'start'}
107 |             )
108 |         return self
109 | 
110 |     def stop(self, destroy=True):
111 |         if destroy:
112 |             super(Server, self).stop()
113 |         else:
114 |             requests.post(_mo_url('servers', self.id), timeout=None,
115 |                           json={'action': 'stop'})
116 | 
117 | 
118 | class ReplicaSet(MCTestObject):
119 | 
120 |     _resource = 'replica_sets'
121 | 
122 |     def __init__(self, id=None, uri=None, primary=None, secondary=None):
123 |         self.id = id
124 |         self.uri = uri
125 |         self.primary = primary
126 |         self.secondary = secondary
127 | 
128 |     def get_config(self):
129 |         return {
130 |             'members': [
131 |                 {'procParams': _proc_params()},
132 |                 {'procParams': _proc_params()},
133 |                 {'rsParams': {'arbiterOnly': True},
134 |                  'procParams': _proc_params()}
135 |             ]
136 |         }
137 | 
138 |     def _init_from_response(self, response):
139 |         self.id = response['id']
140 |         self.uri = response.get('mongodb_auth_uri', response['mongodb_uri'])
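        # MongoDB replica set member states: 1 is PRIMARY, 2 is SECONDARY.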
141 | for member in response['members']: 142 | if member['state'] == 1: 143 | self.primary = Server(member['server_id'], member['host']) 144 | elif member['state'] == 2: 145 | self.secondary = Server(member['server_id'], member['host']) 146 | return self 147 | 148 | def start(self): 149 | # We never need to restart a replica set, only start new ones. 150 | return self._init_from_response(self._make_post_request()) 151 | 152 | 153 | class ShardedCluster(MCTestObject): 154 | 155 | _resource = 'sharded_clusters' 156 | 157 | def __init__(self): 158 | self.id = None 159 | self.uri = None 160 | self.shards = [] 161 | 162 | def get_config(self): 163 | return { 164 | 'shards': [ 165 | {'id': 'demo-set-0', 'shardParams': ReplicaSet().get_config()}, 166 | {'id': 'demo-set-1', 'shardParams': ReplicaSet().get_config()} 167 | ], 168 | 'routers': [_proc_params(mongos=True)], 169 | 'configsvrs': [_proc_params()] 170 | } 171 | 172 | def start(self): 173 | # We never need to restart a sharded cluster, only start new ones. 174 | response = self._make_post_request() 175 | for shard in response['shards']: 176 | if shard['id'] == 'demo-set-0': 177 | repl1_id = shard['_id'] 178 | elif shard['id'] == 'demo-set-1': 179 | repl2_id = shard['_id'] 180 | shard1 = requests.get(_mo_url('replica_sets', repl1_id)).json() 181 | shard2 = requests.get(_mo_url('replica_sets', repl2_id)).json() 182 | self.id = response['id'] 183 | self.uri = response.get('mongodb_auth_uri', response['mongodb_uri']) 184 | self.shards = [ReplicaSet()._init_from_response(resp) 185 | for resp in (shard1, shard2)] 186 | return self 187 | -------------------------------------------------------------------------------- /tests/test_algolia.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
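# These tests exercise a live Algolia application: the ALGOLIA_APPLICATION_ID
# and ALGOLIA_API_KEY environment variables must be set (see setUpClass
# below), and the 'test_mongo_connector' index is cleared before each test.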
14 | 15 | """Integration tests for mongo-connector + Algolia.""" 16 | """Integration tests for mongo-connector + Elasticsearch.""" 17 | import base64 18 | import os 19 | import sys 20 | import time 21 | 22 | from algoliasearch import algoliasearch 23 | from gridfs import GridFS 24 | 25 | sys.path[0:0] = [""] 26 | 27 | from tests import elastic_pair 28 | from tests.setup_cluster import ReplicaSet 29 | from mongo_connector.doc_managers.algolia_doc_manager import DocManager 30 | from mongo_connector.connector import Connector 31 | from mongo_connector.util import retry_until_ok 32 | from tests.util import assert_soon 33 | from tests import unittest 34 | 35 | class AlgoliaTestCase(unittest.TestCase): 36 | """Base class for all Algolia TestCases.""" 37 | 38 | @classmethod 39 | def setUpClass(cls): 40 | cls.algolia_client = algoliasearch.Client(os.environ['ALGOLIA_APPLICATION_ID'], os.environ['ALGOLIA_API_KEY']) 41 | cls.algolia_doc = DocManager('%s:%s:%s' % (os.environ['ALGOLIA_APPLICATION_ID'], os.environ['ALGOLIA_API_KEY'], 'test_mongo_connector'), auto_commit=False) 42 | 43 | def setUp(self): 44 | self.algolia_index = self.algolia_client.initIndex('test_mongo_connector') 45 | self.algolia_index.clearIndex() 46 | res = self.algolia_index.setSettings({ 'hitsPerPage': 20 }) # work-around empty settings 47 | self.algolia_index.waitTask(res['taskID']) 48 | 49 | def tearDown(self): 50 | self.algolia_client.deleteIndex('test_mongo_connector') 51 | 52 | if __name__ == '__main__': 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /tests/test_algolia_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Unit tests for the Algolia DocManager.""" 16 | import base64 17 | import sys 18 | import time 19 | 20 | sys.path[0:0] = [""] 21 | 22 | from tests import elastic_pair, unittest, TESTARGS 23 | from tests.test_algolia import AlgoliaTestCase 24 | from tests.test_gridfs_file import MockGridFSFile 25 | 26 | from mongo_connector.command_helper import CommandHelper 27 | from mongo_connector.doc_managers.algolia_doc_manager import DocManager 28 | 29 | class AlgoliaDocManagerTester(AlgoliaTestCase): 30 | """Unit tests for the Algolia DocManager.""" 31 | 32 | def test_update(self): 33 | """Test the update method.""" 34 | doc = {"_id": '1', "a": 1, "b": 2} 35 | self.algolia_doc.upsert(doc) 36 | self.algolia_doc.commit(True) 37 | # $set only 38 | update_spec = {"$set": {"a": 1, "b": 2}} 39 | self.algolia_doc.update(doc, update_spec) 40 | self.algolia_doc.commit(True) 41 | doc = self.algolia_index.getObject('1') 42 | self.assertEqual(doc, {"_id": '1', "objectID": '1', "a": 1, "b": 2}) 43 | # $unset only 44 | update_spec = {"$unset": {"a": True}} 45 | self.algolia_doc.update(doc, update_spec) 46 | self.algolia_doc.commit(True) 47 | doc = self.algolia_index.getObject('1') 48 | self.assertEqual(doc, {"_id": '1', "objectID": '1', "b": 2, "a": None}) 49 | # mixed $set/$unset 50 | update_spec = {"$unset": {"b": True}, "$set": {"c": 3}} 51 | self.algolia_doc.update(doc, update_spec) 52 | self.algolia_doc.commit(True) 53 | doc = self.algolia_index.getObject('1') 54 | self.assertEqual(doc, {"_id": '1', "objectID": '1', "c": 3, "a": None, "b": None}) 55 | 56 | def test_upsert(self): 57 | """Test the upsert method.""" 58 | docc = {'_id': '1', 'name': 'John'} 59 | self.algolia_doc.upsert(docc) 60 | self.algolia_doc.commit(True) 61 | res = self.algolia_index.search('')["hits"] 62 | for doc in res: 63 | self.assertEqual(doc['_id'], '1') 64 | self.assertEqual(doc['name'], 'John') 65 | 66 | def test_bulk_upsert(self): 67 | """Test the bulk_upsert method.""" 68 | self.algolia_doc.bulk_upsert([], *TESTARGS) 69 | self.algolia_doc.commit(True) 70 | 71 | docs = ({"_id": i} for i in range(100)) 72 | self.algolia_doc.bulk_upsert(docs, *TESTARGS) 73 | self.algolia_doc.commit(True) 74 | res = self.algolia_index.search('', { 'hitsPerPage': 101 })["hits"] 75 | returned_ids = sorted(int(doc["_id"]) for doc in res) 76 | self.assertEqual(len(returned_ids), 100) 77 | for i, r in enumerate(returned_ids): 78 | self.assertEqual(r, i) 79 | 80 | docs = ({"_id": i, "weight": 2*i} for i in range(100)) 81 | self.algolia_doc.bulk_upsert(docs, *TESTARGS) 82 | self.algolia_doc.commit(True) 83 | 84 | res = self.algolia_index.search('', { 'hitsPerPage': 101 })["hits"] 85 | returned_ids = sorted(int(doc["weight"]) for doc in res) 86 | self.assertEqual(len(returned_ids), 100) 87 | for i, r in enumerate(returned_ids): 88 | self.assertEqual(r, 2*i) 89 | 90 | def test_remove(self): 91 | """Test the remove method.""" 92 | docc = {'_id': '1', 'name': 'John'} 93 | self.algolia_doc.upsert(docc) 94 | self.algolia_doc.commit(True) 95 | res = self.algolia_index.search('')["hits"] 96 | self.assertEqual(len(res), 1) 97 | 98 | self.algolia_doc.remove(docc) 99 | self.algolia_doc.commit(True) 100 | res = self.algolia_index.search('')["hits"] 101 | self.assertEqual(len(res), 0) 102 | 103 | @unittest.skip("WIP") 104 | def test_get_last_doc(self): 105 | """Test the get_last_doc method. 106 | 107 | Make sure we can retrieve the document most recently modified from Algolia. 
108 | """ 109 | base = self.algolia_doc.get_last_doc() 110 | ts = base.get("_ts", 0) if base else 0 111 | docc = {'_id': '4', 'name': 'Hare', '_ts': ts+3, 'ns': 'test.test'} 112 | self.algolia_doc.upsert(docc) 113 | docc = {'_id': '5', 'name': 'Tortoise', '_ts': ts+2, 'ns': 'test.test'} 114 | self.algolia_doc.upsert(docc) 115 | docc = {'_id': '6', 'name': 'Mr T.', '_ts': ts+1, 'ns': 'test.test'} 116 | self.algolia_doc.upsert(docc) 117 | self.algolia_doc.commit(True) 118 | 119 | self.assertEqual(self.algolia_index.search('')['nbHits'], 3) 120 | doc = self.elastic_doc.get_last_doc() 121 | self.assertEqual(doc['_id'], '4') 122 | 123 | docc = {'_id': '6', 'name': 'HareTwin', '_ts': ts+4, 'ns': 'test.test'} 124 | self.elastic_doc.upsert(docc) 125 | self.algolia_doc.commit(True) 126 | 127 | doc = self.elastic_doc.get_last_doc() 128 | self.assertEqual(doc['_id'], '6') 129 | self.assertEqual(self.algolia_index.search('')['nbHits'], 3) 130 | 131 | 132 | if __name__ == '__main__': 133 | unittest.main() 134 | -------------------------------------------------------------------------------- /tests/test_command_replication.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Test replication of commands 16 | """ 17 | 18 | import sys 19 | 20 | sys.path[0:0] = [""] 21 | 22 | import pymongo 23 | 24 | from mongo_connector import errors 25 | from mongo_connector.command_helper import CommandHelper 26 | from mongo_connector.doc_managers.doc_manager_base import DocManagerBase 27 | from mongo_connector.locking_dict import LockingDict 28 | from mongo_connector.oplog_manager import OplogThread 29 | from tests import unittest 30 | from tests.setup_cluster import ReplicaSet 31 | from tests.util import assert_soon 32 | 33 | 34 | class CommandLoggerDocManager(DocManagerBase): 35 | def __init__(self, url=None, **kwargs): 36 | self.commands = [] 37 | 38 | def stop(self): 39 | pass 40 | 41 | def upsert(self, doc, namespace, timestamp): 42 | pass 43 | 44 | def remove(self, document_id, namespace, timestamp): 45 | pass 46 | 47 | def commit(self): 48 | pass 49 | 50 | def handle_command(self, doc, namespace, timestamp): 51 | self.commands.append(doc) 52 | 53 | 54 | class TestCommandReplication(unittest.TestCase): 55 | def setUp(self): 56 | self.repl_set = ReplicaSet().start() 57 | self.primary_conn = self.repl_set.client() 58 | self.oplog_progress = LockingDict() 59 | self.opman = None 60 | 61 | def tearDown(self): 62 | try: 63 | if self.opman: 64 | self.opman.join() 65 | except RuntimeError: 66 | pass 67 | self.primary_conn.close() 68 | self.repl_set.stop() 69 | 70 | def initOplogThread(self, namespace_set=[], dest_mapping={}): 71 | self.docman = CommandLoggerDocManager() 72 | self.docman.command_helper = CommandHelper(namespace_set, dest_mapping) 73 | self.opman = OplogThread( 74 | primary_client=self.primary_conn, 75 | doc_managers=(self.docman,), 76 | oplog_progress_dict=self.oplog_progress, 77 | ns_set=namespace_set, 78 | dest_mapping=dest_mapping, 79 | collection_dump=False 80 | ) 81 | self.opman.start() 82 | 83 | def test_command_helper(self): 84 | # Databases cannot be merged 85 | mapping = { 86 | 'a.x': 'c.x', 87 | 'b.x': 'c.y' 88 | } 89 | self.assertRaises(errors.MongoConnectorError, 90 | CommandHelper, 91 | list(mapping), mapping) 92 | 93 | mapping = { 94 | 'a.x': 'b.x', 95 | 'a.y': 'c.y' 96 | } 97 | helper = CommandHelper(list(mapping) + ['a.z'], mapping) 98 | 99 | self.assertEqual(set(helper.map_db('a')), set(['a', 'b', 'c'])) 100 | self.assertEqual(helper.map_db('d'), []) 101 | 102 | self.assertEqual(helper.map_namespace('a.x'), 'b.x') 103 | self.assertEqual(helper.map_namespace('a.z'), 'a.z') 104 | self.assertEqual(helper.map_namespace('d.x'), None) 105 | 106 | self.assertEqual(helper.map_collection('a', 'x'), ('b', 'x')) 107 | self.assertEqual(helper.map_collection('a', 'z'), ('a', 'z')) 108 | self.assertEqual(helper.map_collection('d', 'x'), (None, None)) 109 | 110 | def test_create_collection(self): 111 | self.initOplogThread() 112 | pymongo.collection.Collection( 113 | self.primary_conn['test'], 'test', create=True) 114 | assert_soon(lambda: self.docman.commands) 115 | self.assertEqual(self.docman.commands[0], {'create': 'test'}) 116 | 117 | def test_create_collection_skipped(self): 118 | self.initOplogThread(['test.test']) 119 | 120 | pymongo.collection.Collection( 121 | self.primary_conn['test2'], 'test2', create=True) 122 | pymongo.collection.Collection( 123 | self.primary_conn['test'], 'test', create=True) 124 | 125 | assert_soon(lambda: self.docman.commands) 126 | self.assertEqual(len(self.docman.commands), 1) 127 | self.assertEqual(self.docman.commands[0], {'create': 'test'}) 128 | 129 | def test_drop_collection(self): 130 | 
self.initOplogThread() 131 | coll = pymongo.collection.Collection( 132 | self.primary_conn['test'], 'test', create=True) 133 | coll.drop() 134 | assert_soon(lambda: len(self.docman.commands) == 2) 135 | self.assertEqual(self.docman.commands[1], {'drop': 'test'}) 136 | 137 | def test_drop_database(self): 138 | self.initOplogThread() 139 | pymongo.collection.Collection( 140 | self.primary_conn['test'], 'test', create=True) 141 | self.primary_conn.drop_database('test') 142 | assert_soon(lambda: len(self.docman.commands) == 2) 143 | self.assertEqual(self.docman.commands[1], {'dropDatabase': 1}) 144 | 145 | def test_rename_collection(self): 146 | self.initOplogThread() 147 | coll = pymongo.collection.Collection( 148 | self.primary_conn['test'], 'test', create=True) 149 | coll.rename('test2') 150 | assert_soon(lambda: len(self.docman.commands) == 2) 151 | self.assertEqual( 152 | self.docman.commands[1], 153 | {'renameCollection': 'test.test', 'to': 'test.test2'}) 154 | 155 | 156 | if __name__ == '__main__': 157 | unittest.main() 158 | -------------------------------------------------------------------------------- /tests/test_connector_sharded.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from mongo_connector.connector import Connector 4 | from mongo_connector.doc_managers.doc_manager_simulator import DocManager 5 | 6 | from tests import unittest, SkipTest, db_user, db_password 7 | from tests.setup_cluster import ShardedCluster 8 | from tests.util import assert_soon 9 | 10 | 11 | class TestConnectorSharded(unittest.TestCase): 12 | 13 | def setUp(self): 14 | if not (db_user and db_password): 15 | raise SkipTest('Need to set a user/password to test this.') 16 | self.cluster = ShardedCluster().start() 17 | 18 | def tearDown(self): 19 | try: 20 | os.unlink('oplog.timestamp') 21 | except OSError: 22 | pass 23 | self.cluster.stop() 24 | 25 | def test_start_with_auth(self): 26 | dm = DocManager() 27 | connector = Connector( 28 | mongo_address=self.cluster.uri, 29 | doc_managers=[dm], 30 | auth_username=db_user, 31 | auth_key=db_password 32 | ) 33 | connector.start() 34 | 35 | # Insert some documents into the sharded cluster. These 36 | # should go to the DocManager, and the connector should not 37 | # have an auth failure. 38 | self.cluster.client().test.test.insert({'auth_failure': False}) 39 | assert_soon(lambda: len(dm._search()) > 0) 40 | 41 | connector.join() 42 | -------------------------------------------------------------------------------- /tests/test_elastic.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
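# These tests lean on assert_soon from tests/util.py, which polls a predicate
# until it holds or a timeout expires. A minimal sketch of the idea, assuming
# a one-second poll interval and roughly a one-minute timeout (tests/util.py
# is authoritative):


def _assert_soon_sketch(condition, message=None, max_tries=60):
    """Poll `condition` once per second, failing after `max_tries` tries."""
    import time
    for _ in range(max_tries):
        if condition():
            return
        time.sleep(1)
    raise AssertionError(message or "condition never became true")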
14 | 15 | """Integration tests for mongo-connector + Elasticsearch.""" 16 | import base64 17 | import os 18 | import sys 19 | import time 20 | 21 | from elasticsearch import Elasticsearch 22 | from gridfs import GridFS 23 | 24 | sys.path[0:0] = [""] 25 | 26 | from tests import elastic_pair 27 | from tests.setup_cluster import ReplicaSet 28 | from mongo_connector.doc_managers.elastic_doc_manager import DocManager 29 | from mongo_connector.connector import Connector 30 | from mongo_connector.util import retry_until_ok 31 | from tests.util import assert_soon 32 | from tests import unittest 33 | 34 | 35 | class ElasticsearchTestCase(unittest.TestCase): 36 | """Base class for all ES TestCases.""" 37 | 38 | @classmethod 39 | def setUpClass(cls): 40 | cls.elastic_conn = Elasticsearch(hosts=[elastic_pair]) 41 | cls.elastic_doc = DocManager(elastic_pair, 42 | auto_commit_interval=0) 43 | 44 | def setUp(self): 45 | # Create target index in elasticsearch 46 | self.elastic_conn.indices.create(index='test', ignore=400) 47 | self.elastic_conn.cluster.health(wait_for_status='yellow', 48 | index='test') 49 | 50 | def tearDown(self): 51 | self.elastic_conn.indices.delete(index='test', ignore=404) 52 | 53 | def _search(self, query=None): 54 | query = query or {"match_all": {}} 55 | return self.elastic_doc._stream_search( 56 | index="test", doc_type='test', 57 | body={"query": query} 58 | ) 59 | 60 | def _count(self): 61 | return self.elastic_conn.count(index='test')['count'] 62 | 63 | def _remove(self): 64 | self.elastic_conn.indices.delete_mapping( 65 | index="test", doc_type='test' 66 | ) 67 | self.elastic_conn.indices.refresh(index="test") 68 | 69 | def _mappings(self, index='_all'): 70 | mappings = self.elastic_conn.indices.get_mapping(index=index) 71 | if index in mappings: 72 | return list(mappings[index]['mappings'].keys()) 73 | return [] 74 | 75 | def _indices(self): 76 | return list(self.elastic_conn.indices.stats()['indices'].keys()) 77 | 78 | 79 | class TestElastic(ElasticsearchTestCase): 80 | """Integration tests for mongo-connector + Elasticsearch.""" 81 | 82 | @classmethod 83 | def setUpClass(cls): 84 | """Start the cluster.""" 85 | super(TestElastic, cls).setUpClass() 86 | cls.repl_set = ReplicaSet().start() 87 | cls.conn = cls.repl_set.client() 88 | 89 | @classmethod 90 | def tearDownClass(cls): 91 | """Kill the cluster.""" 92 | cls.repl_set.stop() 93 | 94 | def tearDown(self): 95 | """Stop the Connector thread.""" 96 | super(TestElastic, self).tearDown() 97 | self.connector.join() 98 | 99 | def setUp(self): 100 | """Start a new Connector for each test.""" 101 | super(TestElastic, self).setUp() 102 | try: 103 | os.unlink("oplog.timestamp") 104 | except OSError: 105 | pass 106 | docman = DocManager(elastic_pair) 107 | self.connector = Connector( 108 | mongo_address=self.repl_set.uri, 109 | ns_set=['test.test'], 110 | doc_managers=(docman,), 111 | gridfs_set=['test.test'] 112 | ) 113 | 114 | self.conn.test.test.drop() 115 | self.conn.test.test.files.drop() 116 | self.conn.test.test.chunks.drop() 117 | 118 | self.connector.start() 119 | assert_soon(lambda: len(self.connector.shard_set) > 0) 120 | assert_soon(lambda: self._count() == 0) 121 | 122 | def test_insert(self): 123 | """Test insert operations.""" 124 | self.conn['test']['test'].insert({'name': 'paulie'}) 125 | assert_soon(lambda: self._count() > 0) 126 | result_set_1 = list(self._search()) 127 | self.assertEqual(len(result_set_1), 1) 128 | result_set_2 = self.conn['test']['test'].find_one() 129 | for item in result_set_1: 130 | 
self.assertEqual(item['_id'], str(result_set_2['_id'])) 131 | self.assertEqual(item['name'], result_set_2['name']) 132 | 133 | def test_remove(self): 134 | """Tests remove operations.""" 135 | self.conn['test']['test'].insert({'name': 'paulie'}) 136 | assert_soon(lambda: self._count() == 1) 137 | self.conn['test']['test'].remove({'name': 'paulie'}) 138 | assert_soon(lambda: self._count() != 1) 139 | self.assertEqual(self._count(), 0) 140 | 141 | def test_insert_file(self): 142 | """Tests inserting a gridfs file 143 | """ 144 | fs = GridFS(self.conn['test'], 'test') 145 | test_data = b"test_insert_file test file" 146 | id = fs.put(test_data, filename="test.txt", encoding='utf8') 147 | assert_soon(lambda: self._count() > 0) 148 | 149 | query = {"match": {"_all": "test_insert_file"}} 150 | res = list(self._search(query)) 151 | self.assertEqual(len(res), 1) 152 | doc = res[0] 153 | self.assertEqual(doc['filename'], 'test.txt') 154 | self.assertEqual(doc['_id'], str(id)) 155 | self.assertEqual(base64.b64decode(doc['content']), test_data) 156 | 157 | def test_remove_file(self): 158 | fs = GridFS(self.conn['test'], 'test') 159 | id = fs.put("test file", filename="test.txt", encoding='utf8') 160 | assert_soon(lambda: self._count() == 1) 161 | fs.delete(id) 162 | assert_soon(lambda: self._count() == 0) 163 | 164 | def test_update(self): 165 | """Test update operations.""" 166 | # Insert 167 | self.conn.test.test.insert({"a": 0}) 168 | assert_soon(lambda: sum(1 for _ in self._search()) == 1) 169 | 170 | def check_update(update_spec): 171 | updated = self.conn.test.test.find_and_modify( 172 | {"a": 0}, 173 | update_spec, 174 | new=True 175 | ) 176 | # Stringify _id to match what will be retrieved from ES 177 | updated['_id'] = str(updated['_id']) 178 | # Allow some time for update to propagate 179 | time.sleep(1) 180 | replicated = next(self._search()) 181 | self.assertEqual(replicated, updated) 182 | 183 | # Update by adding a field. Note that ES can't mix types within an array 184 | check_update({"$set": {"b": [{"c": 10}, {"d": 11}]}}) 185 | 186 | # Update by setting an attribute of a sub-document beyond end of array. 187 | check_update({"$set": {"b.10.c": 42}}) 188 | 189 | # Update by changing a value within a sub-document (contains array) 190 | check_update({"$inc": {"b.0.c": 1}}) 191 | 192 | # Update by changing the value within an array 193 | check_update({"$inc": {"b.1.f": 12}}) 194 | 195 | # Update by adding new bucket to list 196 | check_update({"$push": {"b": {"e": 12}}}) 197 | 198 | # Update by changing an entire sub-document 199 | check_update({"$set": {"b.0": {"e": 4}}}) 200 | 201 | # Update by adding a sub-document 202 | check_update({"$set": {"b": {"0": {"c": 100}}}}) 203 | 204 | # Update whole document 205 | check_update({"a": 0, "b": {"1": {"d": 10000}}}) 206 | 207 | def test_rollback(self): 208 | """Test behavior during a MongoDB rollback. 209 | 210 | We force a rollback by adding a doc, killing the primary, 211 | adding another doc, killing the new primary, and then 212 | restarting both. 
213 | """ 214 | primary_conn = self.repl_set.primary.client() 215 | 216 | self.conn['test']['test'].insert({'name': 'paul'}) 217 | condition1 = lambda: self.conn['test']['test'].find( 218 | {'name': 'paul'}).count() == 1 219 | condition2 = lambda: self._count() == 1 220 | assert_soon(condition1) 221 | assert_soon(condition2) 222 | 223 | self.repl_set.primary.stop(destroy=False) 224 | 225 | new_primary_conn = self.repl_set.secondary.client() 226 | 227 | admin = new_primary_conn['admin'] 228 | assert_soon(lambda: admin.command("isMaster")['ismaster']) 229 | time.sleep(5) 230 | retry_until_ok(self.conn.test.test.insert, 231 | {'name': 'pauline'}) 232 | assert_soon(lambda: self._count() == 2) 233 | result_set_1 = list(self._search()) 234 | result_set_2 = self.conn['test']['test'].find_one({'name': 'pauline'}) 235 | self.assertEqual(len(result_set_1), 2) 236 | #make sure pauline is there 237 | for item in result_set_1: 238 | if item['name'] == 'pauline': 239 | self.assertEqual(item['_id'], str(result_set_2['_id'])) 240 | self.repl_set.secondary.stop(destroy=False) 241 | 242 | self.repl_set.primary.start() 243 | while primary_conn['admin'].command("isMaster")['ismaster'] is False: 244 | time.sleep(1) 245 | 246 | self.repl_set.secondary.start() 247 | 248 | time.sleep(2) 249 | result_set_1 = list(self._search()) 250 | self.assertEqual(len(result_set_1), 1) 251 | 252 | for item in result_set_1: 253 | self.assertEqual(item['name'], 'paul') 254 | find_cursor = retry_until_ok(self.conn['test']['test'].find) 255 | self.assertEqual(retry_until_ok(find_cursor.count), 1) 256 | 257 | def test_bad_int_value(self): 258 | self.conn.test.test.insert({ 259 | 'inf': float('inf'), 'nan': float('nan'), 260 | 'still_exists': True}) 261 | assert_soon(lambda: self._count() > 0) 262 | for doc in self._search(): 263 | self.assertNotIn('inf', doc) 264 | self.assertNotIn('nan', doc) 265 | self.assertTrue(doc['still_exists']) 266 | 267 | if __name__ == '__main__': 268 | unittest.main() 269 | -------------------------------------------------------------------------------- /tests/test_elastic_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Unit tests for the Elastic DocManager.""" 16 | import base64 17 | import sys 18 | import time 19 | 20 | sys.path[0:0] = [""] 21 | 22 | from tests import elastic_pair, unittest, TESTARGS 23 | from tests.test_elastic import ElasticsearchTestCase 24 | from tests.test_gridfs_file import MockGridFSFile 25 | 26 | from mongo_connector.command_helper import CommandHelper 27 | from mongo_connector.doc_managers.elastic_doc_manager import DocManager 28 | 29 | 30 | class TestElasticDocManager(ElasticsearchTestCase): 31 | """Unit tests for the Elastic DocManager.""" 32 | 33 | def test_update(self): 34 | """Test the update method.""" 35 | doc_id = 1 36 | doc = {"_id": doc_id, "a": 1, "b": 2} 37 | self.elastic_doc.upsert(doc, *TESTARGS) 38 | # $set only 39 | update_spec = {"$set": {"a": 1, "b": 2}} 40 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 41 | self.assertEqual(doc, {"_id": '1', "a": 1, "b": 2}) 42 | # $unset only 43 | update_spec = {"$unset": {"a": True}} 44 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 45 | self.assertEqual(doc, {"_id": '1', "b": 2}) 46 | # mixed $set/$unset 47 | update_spec = {"$unset": {"b": True}, "$set": {"c": 3}} 48 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 49 | self.assertEqual(doc, {"_id": '1', "c": 3}) 50 | 51 | def test_upsert(self): 52 | """Test the upsert method.""" 53 | docc = {'_id': '1', 'name': 'John'} 54 | self.elastic_doc.upsert(docc, *TESTARGS) 55 | res = self.elastic_conn.search( 56 | index="test", doc_type='test', 57 | body={"query": {"match_all": {}}} 58 | )["hits"]["hits"] 59 | for doc in res: 60 | self.assertEqual(doc['_id'], '1') 61 | self.assertEqual(doc['_source']['name'], 'John') 62 | 63 | def test_bulk_upsert(self): 64 | """Test the bulk_upsert method.""" 65 | self.elastic_doc.bulk_upsert([], *TESTARGS) 66 | 67 | docs = ({"_id": i} for i in range(1000)) 68 | self.elastic_doc.bulk_upsert(docs, *TESTARGS) 69 | self.elastic_doc.commit() 70 | res = self.elastic_conn.search( 71 | index="test", doc_type='test', 72 | body={"query": {"match_all": {}}}, 73 | size=1001 74 | )["hits"]["hits"] 75 | returned_ids = sorted(int(doc["_id"]) for doc in res) 76 | self.assertEqual(len(returned_ids), 1000) 77 | for i, r in enumerate(returned_ids): 78 | self.assertEqual(r, i) 79 | 80 | docs = ({"_id": i, "weight": 2*i} for i in range(1000)) 81 | self.elastic_doc.bulk_upsert(docs, *TESTARGS) 82 | 83 | res = self.elastic_conn.search( 84 | index="test", doc_type='test', 85 | body={"query": {"match_all": {}}}, 86 | size=1001 87 | )["hits"]["hits"] 88 | returned_ids = sorted(int(doc["_source"]["weight"]) for doc in res) 89 | self.assertEqual(len(returned_ids), 1000) 90 | for i, r in enumerate(returned_ids): 91 | self.assertEqual(r, 2*i) 92 | 93 | def test_remove(self): 94 | """Test the remove method.""" 95 | docc = {'_id': '1', 'name': 'John'} 96 | self.elastic_doc.upsert(docc, *TESTARGS) 97 | res = self.elastic_conn.search( 98 | index="test", doc_type='test', 99 | body={"query": {"match_all": {}}} 100 | )["hits"]["hits"] 101 | res = [x["_source"] for x in res] 102 | self.assertEqual(len(res), 1) 103 | 104 | self.elastic_doc.remove(docc['_id'], *TESTARGS) 105 | res = self.elastic_conn.search( 106 | index="test", doc_type='test', 107 | body={"query": {"match_all": {}}} 108 | )["hits"]["hits"] 109 | res = [x["_source"] for x in res] 110 | self.assertEqual(len(res), 0) 111 | 112 | def test_insert_file(self): 113 | """Ensure we can properly insert a file into ElasticSearch 114 | """ 115 | test_data = ' 
'.join(str(x) for x in range(100000)).encode('utf8')
116 |         docc = {
117 |             '_id': 'test_id',
118 |             'filename': 'test_filename',
119 |             'upload_date': 5,
120 |             'md5': 'test_md5'
121 |         }
122 |         self.elastic_doc.insert_file(
123 |             MockGridFSFile(docc, test_data), *TESTARGS)
124 |         res = self._search()
125 |         for doc in res:
126 |             self.assertEqual(doc['_id'], docc['_id'])
127 |             self.assertEqual(doc['filename'], docc['filename'])
128 |             self.assertEqual(base64.b64decode(doc['content']),
129 |                              test_data.strip())
130 | 
131 |     def test_remove_file(self):
132 |         test_data = b'hello world'
133 |         docc = {
134 |             '_id': 'test_id',
135 |             '_ts': 10,
136 |             'ns': 'test.test',
137 |             'filename': 'test_filename',
138 |             'upload_date': 5,
139 |             'md5': 'test_md5'
140 |         }
141 | 
142 |         self.elastic_doc.insert_file(
143 |             MockGridFSFile(docc, test_data), *TESTARGS)
144 |         res = list(self._search())
145 |         self.assertEqual(len(res), 1)
146 | 
147 |         self.elastic_doc.remove('test_id', *TESTARGS)
148 |         res = list(self._search())
149 |         self.assertEqual(len(res), 0)
150 | 
151 |     def test_search(self):
152 |         """Test the search method.
153 | 
154 |         Make sure we can retrieve documents last modified within a time range.
155 |         """
156 |         docc = {'_id': '1', 'name': 'John'}
157 |         self.elastic_doc.upsert(docc, 'test.test', 5767301236327972865)
158 |         docc2 = {'_id': '2', 'name': 'John Paul'}
159 |         self.elastic_doc.upsert(docc2, 'test.test', 5767301236327972866)
160 |         docc3 = {'_id': '3', 'name': 'Paul'}
161 |         self.elastic_doc.upsert(docc3, 'test.test', 5767301236327972870)
162 |         search = list(self.elastic_doc.search(5767301236327972865,
163 |                                               5767301236327972866))
164 |         self.assertEqual(len(search), 2)
165 |         result_ids = [result.get("_id") for result in search]
166 |         self.assertIn('1', result_ids)
167 |         self.assertIn('2', result_ids)
168 | 
169 |     def test_elastic_commit(self):
170 |         """Test the auto_commit_interval attribute."""
171 |         docc = {'_id': '3', 'name': 'Waldo'}
172 |         docman = DocManager(elastic_pair)
173 |         # test cases:
174 |         # None = no autocommit
175 |         # 0 = commit immediately
176 |         # x > 0 = commit within x seconds
177 |         for autocommit_interval in [None, 0, 1, 2]:
178 |             docman.auto_commit_interval = autocommit_interval
179 |             docman.upsert(docc, *TESTARGS)
180 |             if autocommit_interval is None:
181 |                 docman.commit()
182 |             else:
183 |                 # Allow just a little extra time
184 |                 time.sleep(autocommit_interval + 1)
185 |             results = list(self._search())
186 |             self.assertEqual(len(results), 1,
187 |                              "should commit document with "
188 |                              "auto_commit_interval = %s" % str(
189 |                                  autocommit_interval))
190 |             self.assertEqual(results[0]["name"], "Waldo")
191 |             self._remove()
192 |         docman.stop()
193 | 
194 |     def test_get_last_doc(self):
195 |         """Test the get_last_doc method.
196 | 
197 |         Make sure we can retrieve the document most recently modified from ES.
198 | """ 199 | base = self.elastic_doc.get_last_doc() 200 | ts = base.get("_ts", 0) if base else 0 201 | docc = {'_id': '4', 'name': 'Hare'} 202 | self.elastic_doc.upsert(docc, 'test.test', ts + 3) 203 | docc = {'_id': '5', 'name': 'Tortoise'} 204 | self.elastic_doc.upsert(docc, 'test.test', ts + 2) 205 | docc = {'_id': '6', 'name': 'Mr T.'} 206 | self.elastic_doc.upsert(docc, 'test.test', ts + 1) 207 | 208 | self.assertEqual( 209 | self.elastic_doc.elastic.count(index="test")['count'], 3) 210 | doc = self.elastic_doc.get_last_doc() 211 | self.assertEqual(doc['_id'], '4') 212 | 213 | docc = {'_id': '6', 'name': 'HareTwin'} 214 | self.elastic_doc.upsert(docc, 'test.test', ts + 4) 215 | doc = self.elastic_doc.get_last_doc() 216 | self.assertEqual(doc['_id'], '6') 217 | self.assertEqual( 218 | self.elastic_doc.elastic.count(index="test")['count'], 3) 219 | 220 | def test_commands(self): 221 | cmd_args = ('test.$cmd', 1) 222 | self.elastic_doc.command_helper = CommandHelper() 223 | 224 | self.elastic_doc.handle_command({'create': 'test2'}, *cmd_args) 225 | time.sleep(1) 226 | self.assertIn('test2', self._mappings('test')) 227 | 228 | self.elastic_doc.handle_command({'drop': 'test2'}, *cmd_args) 229 | time.sleep(1) 230 | self.assertNotIn('test2', self._mappings('test')) 231 | 232 | self.elastic_doc.handle_command({'create': 'test2'}, *cmd_args) 233 | self.elastic_doc.handle_command({'create': 'test3'}, *cmd_args) 234 | time.sleep(1) 235 | self.elastic_doc.handle_command({'dropDatabase': 1}, *cmd_args) 236 | time.sleep(1) 237 | self.assertNotIn('test', self._indices()) 238 | self.assertNotIn('test2', self._mappings()) 239 | self.assertNotIn('test3', self._mappings()) 240 | 241 | 242 | if __name__ == '__main__': 243 | unittest.main() 244 | -------------------------------------------------------------------------------- /tests/test_formatters.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | import sys 4 | import uuid 5 | 6 | import bson 7 | 8 | sys.path[0:0] = [""] 9 | 10 | from mongo_connector.compat import PY3 11 | from mongo_connector.doc_managers.formatters import ( 12 | DefaultDocumentFormatter, DocumentFlattener) 13 | from tests import unittest 14 | 15 | 16 | class TestFormatters(unittest.TestCase): 17 | 18 | @classmethod 19 | def setUpClass(cls): 20 | # Some test values to use 21 | cls.bin1 = bson.Binary(b"\x00hello\x00", 0) 22 | cls.bin2 = b'\x00hello\x00' 23 | cls.xuuid = uuid.uuid4() 24 | cls.oid = bson.ObjectId() 25 | cls.regex = re.compile("hello", re.VERBOSE | re.MULTILINE) 26 | cls.lst = [cls.regex, cls.bin1, cls.bin2, cls.xuuid, cls.oid] 27 | cls.date = datetime.datetime.now() 28 | cls.doc = {'r': cls.regex, 'b1': cls.bin1, 'b2': cls.bin2, 29 | 'uuid': cls.xuuid, 'oid': cls.oid, 'd': cls.date} 30 | cls.doc_nested = {"doc": cls.doc} 31 | cls.doc_list = {"list": [cls.doc, cls.doc_nested, cls.lst]} 32 | 33 | def test_types(self): 34 | trans = DefaultDocumentFormatter().transform_value 35 | 36 | # regex 37 | _, patt, flags = trans(self.regex).rsplit("/") 38 | self.assertIn('x', flags) 39 | self.assertIn('m', flags) 40 | self.assertNotIn('l', flags) 41 | self.assertEqual(patt, 'hello') 42 | 43 | # binary 44 | self.assertEqual(trans(self.bin1), 'AGhlbGxvAA==') 45 | if PY3: 46 | self.assertEqual(trans(self.bin2), 'AGhlbGxvAA==') 47 | else: 48 | self.assertEqual(trans(self.bin2), self.bin2) 49 | 50 | # datetime 51 | self.assertEqual(trans(self.date), self.date) 52 | 53 | # UUID 54 | 
self.assertEqual(trans(self.xuuid), self.xuuid.hex) 55 | 56 | # Other type 57 | self.assertEqual(trans(self.oid), str(self.oid)) 58 | 59 | # Compound types 60 | transformed = trans(self.doc) 61 | for k, v in self.doc.items(): 62 | self.assertEqual(trans(v), transformed[k]) 63 | for el1, el2 in zip(self.lst, map(trans, self.lst)): 64 | self.assertEqual(trans(el1), el2) 65 | 66 | # Infinity/NaN 67 | self.assertRaises(ValueError, trans, float('inf')) 68 | self.assertRaises(ValueError, trans, float('nan')) 69 | 70 | def test_default_formatter(self): 71 | formatter = DefaultDocumentFormatter() 72 | 73 | def check_format(document): 74 | transformed = dict((k, formatter.transform_value(v)) 75 | for k, v in document.items()) 76 | self.assertEqual(transformed, formatter.format_document(document)) 77 | 78 | # Flat 79 | check_format(self.doc) 80 | 81 | # Nested 82 | check_format(self.doc_nested) 83 | 84 | # With a list 85 | check_format(self.doc_list) 86 | 87 | def test_flattener(self): 88 | formatter = DocumentFlattener() 89 | 90 | # Flat already 91 | transformed = dict((k, formatter.transform_value(v)) 92 | for k, v in self.doc.items()) 93 | self.assertEqual(transformed, formatter.format_document(self.doc)) 94 | 95 | # Nested 96 | transformed2 = formatter.format_document(self.doc_nested) 97 | constructed = dict(("doc.%s" % k, formatter.transform_value(v)) 98 | for k, v in self.doc.items()) 99 | self.assertEqual(transformed2, constructed) 100 | 101 | # With a list 102 | constructed1 = dict(("list.0.%s" % k, formatter.transform_value(v)) 103 | for k, v in self.doc.items()) 104 | constructed2 = dict(("list.1.%s" % k, v) 105 | for k, v in transformed2.items()) 106 | constructed3 = dict(("list.2.%d" % i, formatter.transform_value(v)) 107 | for i, v in enumerate(self.lst)) 108 | constructed1.update(constructed2) 109 | constructed1.update(constructed3) 110 | self.assertEqual(formatter.format_document(self.doc_list), constructed1) 111 | -------------------------------------------------------------------------------- /tests/test_gridfs_file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
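# Background for these tests: GridFS splits every file into a metadata
# document in `<prefix>.files` plus binary chunks in `<prefix>.chunks`, linked
# by `files_id`; test_missing_chunk below deletes chunks directly to simulate
# a corrupt file. Illustrative document shapes (fields abbreviated):
#
#     # test.fs.files
#     {'_id': ObjectId('...'), 'filename': 'hello.txt', 'length': 11,
#      'md5': '...', 'uploadDate': datetime.datetime(...)}
#
#     # test.fs.chunks
#     {'_id': ObjectId('...'), 'files_id': ObjectId('...'),  # -> files._id
#      'n': 0, 'data': Binary(b'hello world')}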
14 | 15 | import sys 16 | 17 | import gridfs 18 | 19 | sys.path[0:0] = [""] 20 | 21 | from mongo_connector.gridfs_file import GridFSFile 22 | from mongo_connector import errors 23 | from tests import unittest 24 | from tests.setup_cluster import ReplicaSet 25 | 26 | 27 | class MockGridFSFile: 28 | def __init__(self, doc, data): 29 | self._id = doc['_id'] 30 | self.filename = doc['filename'] 31 | self.upload_date = doc['upload_date'] 32 | self.md5 = doc['md5'] 33 | self.data = data 34 | self.length = len(self.data) 35 | self.pos = 0 36 | 37 | def get_metadata(self): 38 | return { 39 | '_id': self._id, 40 | 'filename': self.filename, 41 | 'upload_date': self.upload_date, 42 | 'md5': self.md5 43 | } 44 | 45 | def __len__(self): 46 | return self.length 47 | 48 | def read(self, n=-1): 49 | if n < 0 or self.pos + n > self.length: 50 | n = self.length - self.pos 51 | s = self.data[self.pos:self.pos+n] 52 | self.pos += n 53 | return s 54 | 55 | 56 | class TestGridFSFile(unittest.TestCase): 57 | 58 | @classmethod 59 | def setUpClass(cls): 60 | # Start up a replica set and connect to it 61 | cls.repl_set = ReplicaSet().start() 62 | cls.main_connection = cls.repl_set.client() 63 | 64 | @classmethod 65 | def tearDownClass(cls): 66 | cls.main_connection.close() 67 | cls.repl_set.stop() 68 | 69 | def setUp(self): 70 | # clear existing data 71 | self.main_connection.drop_database("test") 72 | self.collection = self.main_connection.test.fs 73 | self.fs = gridfs.GridFS(self.main_connection.test) 74 | 75 | def get_file(self, doc): 76 | return GridFSFile(self.collection, doc) 77 | 78 | def test_insert(self): 79 | def test_insert_file(data, filename, read_size): 80 | # insert file 81 | id = self.fs.put(data, filename=filename, encoding='utf8') 82 | doc = self.collection.files.find_one(id) 83 | f = self.get_file(doc) 84 | 85 | # test metadata 86 | self.assertEqual(id, f._id) 87 | self.assertEqual(filename, f.filename) 88 | 89 | # test data 90 | result = [] 91 | while True: 92 | s = f.read(read_size) 93 | if len(s) > 0: 94 | result.append(s.decode('utf8')) 95 | if read_size >= 0: 96 | self.assertLessEqual(len(s), read_size) 97 | else: 98 | break 99 | result = "".join(result) 100 | self.assertEqual(f.length, len(result)) 101 | self.assertEqual(data, result) 102 | 103 | # test with 1-chunk files 104 | test_insert_file("hello world", "hello.txt", -1) 105 | test_insert_file("hello world 2", "hello.txt", 10) 106 | test_insert_file("hello world 3", "hello.txt", 100) 107 | 108 | # test with multiple-chunk files 109 | size = 4 * 1024 * 1024 110 | bigger = "".join([chr(ord('a') + (n % 26)) for n in range(size)]) 111 | test_insert_file(bigger, "bigger.txt", -1) 112 | test_insert_file(bigger, "bigger.txt", 1024) 113 | test_insert_file(bigger, "bigger.txt", 1024 * 1024) 114 | 115 | def test_missing_chunk(self): 116 | data = "test data" 117 | id = self.fs.put(data, encoding='utf8') 118 | doc = self.collection.files.find_one(id) 119 | f = self.get_file(doc) 120 | 121 | self.main_connection['test']['fs.chunks'].remove({ 122 | 'files_id': id 123 | }) 124 | 125 | self.assertRaises(errors.OperationFailed, f.read) 126 | 127 | 128 | if __name__ == '__main__': 129 | unittest.main() 130 | -------------------------------------------------------------------------------- /tests/test_mongo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Test mongo using the synchronizer, i.e. as it would be used by a
16 | user
17 | """
18 | import os
19 | import sys
20 | import time
21 | 
22 | from gridfs import GridFS
23 | 
24 | sys.path[0:0] = [""]
25 | 
26 | from tests.setup_cluster import ReplicaSet, Server
27 | from mongo_connector.doc_managers.mongo_doc_manager import DocManager
28 | from mongo_connector.connector import Connector
29 | from mongo_connector.util import retry_until_ok
30 | from tests import unittest, connector_opts
31 | from tests.util import assert_soon
32 | 
33 | 
34 | class MongoTestCase(unittest.TestCase):
35 | 
36 |     @classmethod
37 |     def setUpClass(cls):
38 |         cls.standalone = Server().start()
39 |         cls.mongo_doc = DocManager(cls.standalone.uri)
40 |         cls.mongo_conn = cls.standalone.client()
41 |         cls.mongo = cls.mongo_conn['test']['test']
42 | 
43 |     @classmethod
44 |     def tearDownClass(cls):
45 |         cls.standalone.stop()
46 | 
47 |     def _search(self, **kwargs):
48 |         for doc in self.mongo.find(**kwargs):
49 |             yield doc
50 | 
51 |         fs = GridFS(self.mongo_conn['test'], 'test')
52 |         for doc in self.mongo_conn['__mongo_connector']['test.test'].find():
53 |             if doc.get('gridfs_id'):
54 |                 for f in fs.find({'_id': doc['gridfs_id']}):
55 |                     doc['filename'] = f.filename
56 |                     doc['content'] = f.read()
57 |                     yield doc
58 | 
59 |     def _remove(self):
60 |         self.mongo_conn['test']['test'].drop()
61 |         self.mongo_conn['test']['test.files'].drop()
62 |         self.mongo_conn['test']['test.chunks'].drop()
63 | 
64 | 
65 | class TestMongo(MongoTestCase):
66 |     """ Tests the mongo instance
67 |     """
68 | 
69 |     @classmethod
70 |     def setUpClass(cls):
71 |         MongoTestCase.setUpClass()
72 |         cls.repl_set = ReplicaSet().start()
73 |         cls.conn = cls.repl_set.client()
74 | 
75 |     @classmethod
76 |     def tearDownClass(cls):
77 |         """ Kills cluster instance
78 |         """
79 |         MongoTestCase.tearDownClass()
80 |         cls.repl_set.stop()
81 | 
82 |     def tearDown(self):
83 |         self.connector.join()
84 | 
85 |     def setUp(self):
86 |         try:
87 |             os.unlink("oplog.timestamp")
88 |         except OSError:
89 |             pass
90 |         self._remove()
91 |         self.connector = Connector(
92 |             mongo_address=self.repl_set.uri,
93 |             ns_set=['test.test'],
94 |             doc_managers=(self.mongo_doc,),
95 |             gridfs_set=['test.test'],
96 |             **connector_opts
97 |         )
98 | 
99 |         self.conn.test.test.drop()
100 |         self.conn.test.test.files.drop()
101 |         self.conn.test.test.chunks.drop()
102 | 
103 |         self.connector.start()
104 |         assert_soon(lambda: len(self.connector.shard_set) > 0)
105 |         assert_soon(lambda: sum(1 for _ in self._search()) == 0)
106 | 
107 |     def test_insert(self):
108 |         """Tests insert
109 |         """
110 | 
111 |         self.conn['test']['test'].insert({'name': 'paulie'})
112 |         assert_soon(lambda: sum(1 for _ in self._search()) == 1)
113 |         result_set_1 = list(self._search())
114 |         self.assertEqual(len(result_set_1), 1)
115 |         result_set_2 = self.conn['test']['test'].find_one()
116 |         for item in result_set_1:
117 |             self.assertEqual(item['_id'], result_set_2['_id'])
118
| self.assertEqual(item['name'], result_set_2['name']) 119 | 120 | def test_remove(self): 121 | """Tests remove 122 | """ 123 | 124 | self.conn['test']['test'].insert({'name': 'paulie'}) 125 | assert_soon(lambda: sum(1 for _ in self._search()) == 1) 126 | self.conn['test']['test'].remove({'name': 'paulie'}) 127 | assert_soon(lambda: sum(1 for _ in self._search()) != 1) 128 | self.assertEqual(sum(1 for _ in self._search()), 0) 129 | 130 | def test_insert_file(self): 131 | """Tests inserting a gridfs file 132 | """ 133 | fs = GridFS(self.conn['test'], 'test') 134 | test_data = b"test_insert_file test file" 135 | id = fs.put(test_data, filename="test.txt", encoding='utf8') 136 | assert_soon(lambda: sum(1 for _ in self._search()) > 0) 137 | 138 | res = list(self._search()) 139 | self.assertEqual(len(res), 1) 140 | doc = res[0] 141 | self.assertEqual(doc['filename'], 'test.txt') 142 | self.assertEqual(doc['_id'], id) 143 | self.assertEqual(doc['content'], test_data) 144 | 145 | def test_remove_file(self): 146 | fs = GridFS(self.conn['test'], 'test') 147 | id = fs.put("test file", filename="test.txt", encoding='utf8') 148 | assert_soon(lambda: sum(1 for _ in self._search()) == 1) 149 | fs.delete(id) 150 | assert_soon(lambda: sum(1 for _ in self._search()) == 0) 151 | 152 | def test_update(self): 153 | """Test update operations.""" 154 | # Insert 155 | self.conn.test.test.insert({"a": 0}) 156 | assert_soon(lambda: sum(1 for _ in self._search()) == 1) 157 | 158 | def check_update(update_spec): 159 | updated = self.conn.test.test.find_and_modify( 160 | {"a": 0}, 161 | update_spec, 162 | new=True 163 | ) 164 | # Allow some time for update to propagate 165 | time.sleep(2) 166 | replicated = self.mongo_doc.mongo.test.test.find_one({"a": 0}) 167 | self.assertEqual(replicated, updated) 168 | 169 | # Update by adding a field 170 | check_update({"$set": {"b": [{"c": 10}, {"d": 11}]}}) 171 | 172 | # Update by setting an attribute of a sub-document beyond end of array. 173 | check_update({"$set": {"b.10.c": 42}}) 174 | 175 | # Update by changing a value within a sub-document (contains array) 176 | check_update({"$inc": {"b.0.c": 1}}) 177 | 178 | # Update by changing the value within an array 179 | check_update({"$inc": {"b.1.f": 12}}) 180 | 181 | # Update by adding new bucket to list 182 | check_update({"$push": {"b": {"e": 12}}}) 183 | 184 | # Update by changing an entire sub-document 185 | check_update({"$set": {"b.0": {"e": 4}}}) 186 | 187 | # Update by adding a sub-document 188 | check_update({"$set": {"b": {"0": {"c": 100}}}}) 189 | 190 | # Update whole document 191 | check_update({"a": 0, "b": {"1": {"d": 10000}}}) 192 | 193 | def test_rollback(self): 194 | """Tests rollback. We force a rollback by adding a doc, killing the 195 | primary, adding another doc, killing the new primary, and then 196 | restarting both. 
197 | """ 198 | primary_conn = self.repl_set.primary.client() 199 | self.conn['test']['test'].insert({'name': 'paul'}) 200 | condition = lambda: self.conn['test']['test'].find_one( 201 | {'name': 'paul'}) is not None 202 | assert_soon(condition) 203 | assert_soon(lambda: sum(1 for _ in self._search()) == 1) 204 | 205 | self.repl_set.primary.stop(destroy=False) 206 | new_primary_conn = self.repl_set.secondary.client() 207 | admin = new_primary_conn['admin'] 208 | condition = lambda: admin.command("isMaster")['ismaster'] 209 | assert_soon(lambda: retry_until_ok(condition)) 210 | 211 | retry_until_ok(self.conn.test.test.insert, 212 | {'name': 'pauline'}) 213 | assert_soon(lambda: sum(1 for _ in self._search()) == 2) 214 | result_set_1 = list(self._search()) 215 | result_set_2 = self.conn['test']['test'].find_one({'name': 'pauline'}) 216 | self.assertEqual(len(result_set_1), 2) 217 | #make sure pauline is there 218 | for item in result_set_1: 219 | if item['name'] == 'pauline': 220 | self.assertEqual(item['_id'], result_set_2['_id']) 221 | self.repl_set.secondary.stop(destroy=False) 222 | 223 | self.repl_set.primary.start() 224 | assert_soon( 225 | lambda: primary_conn['admin'].command("isMaster")['ismaster']) 226 | 227 | self.repl_set.secondary.start() 228 | 229 | time.sleep(2) 230 | result_set_1 = list(self._search()) 231 | self.assertEqual(len(result_set_1), 1) 232 | for item in result_set_1: 233 | self.assertEqual(item['name'], 'paul') 234 | find_cursor = retry_until_ok(self.conn['test']['test'].find) 235 | self.assertEqual(retry_until_ok(find_cursor.count), 1) 236 | 237 | 238 | if __name__ == '__main__': 239 | unittest.main() 240 | -------------------------------------------------------------------------------- /tests/test_mongo_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Tests methods for mongo_connector 16 | """ 17 | 18 | import json 19 | import os 20 | import sys 21 | import time 22 | 23 | from bson.timestamp import Timestamp 24 | 25 | sys.path[0:0] = [""] 26 | 27 | from mongo_connector.connector import Connector 28 | from tests import unittest, connector_opts 29 | from tests.setup_cluster import ReplicaSet 30 | from mongo_connector.util import long_to_bson_ts 31 | 32 | 33 | class TestMongoConnector(unittest.TestCase): 34 | """ Test Class for the Mongo Connector 35 | """ 36 | 37 | @classmethod 38 | def setUpClass(cls): 39 | """ Initializes the cluster 40 | """ 41 | try: 42 | os.unlink("oplog.timestamp") 43 | except OSError: 44 | pass 45 | open("oplog.timestamp", "w").close() 46 | cls.repl_set = ReplicaSet().start() 47 | 48 | @classmethod 49 | def tearDownClass(cls): 50 | """ Kills cluster instance 51 | """ 52 | cls.repl_set.stop() 53 | 54 | def test_connector(self): 55 | """Test whether the connector initiates properly 56 | """ 57 | conn = Connector( 58 | mongo_address=self.repl_set.uri, 59 | ns_set=['test.test'], 60 | **connector_opts 61 | ) 62 | conn.start() 63 | 64 | while len(conn.shard_set) != 1: 65 | time.sleep(2) 66 | conn.join() 67 | 68 | self.assertFalse(conn.can_run) 69 | time.sleep(5) 70 | for thread in conn.shard_set.values(): 71 | self.assertFalse(thread.running) 72 | 73 | def test_write_oplog_progress(self): 74 | """Test write_oplog_progress under several circumstances 75 | """ 76 | try: 77 | os.unlink("temp_oplog.timestamp") 78 | except OSError: 79 | pass 80 | open("temp_oplog.timestamp", "w").close() 81 | conn = Connector( 82 | mongo_address=self.repl_set.uri, 83 | oplog_checkpoint="temp_oplog.timestamp", 84 | ns_set=['test.test'], 85 | **connector_opts 86 | ) 87 | 88 | #test that None is returned if there is no config file specified. 
89 |         self.assertEqual(conn.write_oplog_progress(), None)
90 | 
91 |         conn.oplog_progress.get_dict()[1] = Timestamp(12, 34)
92 |         # pretend to insert a thread/timestamp pair
93 |         conn.write_oplog_progress()
94 | 
95 |         data = json.load(open("temp_oplog.timestamp", 'r'))
96 |         self.assertEqual(1, int(data[0]))
97 |         self.assertEqual(long_to_bson_ts(int(data[1])), Timestamp(12, 34))
98 | 
99 |         # ensure the temp file was deleted
100 |         self.assertFalse(os.path.exists("temp_oplog.timestamp" + '~'))
101 | 
102 |         # ensure that updates work properly
103 |         conn.oplog_progress.get_dict()[1] = Timestamp(44, 22)
104 |         conn.write_oplog_progress()
105 | 
106 |         config_file = open("temp_oplog.timestamp", 'r')
107 |         data = json.load(config_file)
108 |         self.assertEqual(1, int(data[0]))
109 |         self.assertEqual(long_to_bson_ts(int(data[1])), Timestamp(44, 22))
110 | 
111 |         config_file.close()
112 |         os.unlink("temp_oplog.timestamp")
113 | 
114 |     def test_read_oplog_progress(self):
115 |         """Test read_oplog_progress
116 |         """
117 | 
118 |         conn = Connector(
119 |             mongo_address=self.repl_set.uri,
120 |             oplog_checkpoint=None,
121 |             ns_set=['test.test'],
122 |             **connector_opts
123 |         )
124 | 
125 |         # testing with no file
126 |         self.assertEqual(conn.read_oplog_progress(), None)
127 | 
128 |         try:
129 |             os.unlink("temp_oplog.timestamp")
130 |         except OSError:
131 |             pass
132 |         open("temp_oplog.timestamp", "w").close()
133 | 
134 |         conn.oplog_checkpoint = "temp_oplog.timestamp"
135 | 
136 |         # testing with empty file
137 |         self.assertEqual(conn.read_oplog_progress(), None)
138 | 
139 |         oplog_dict = conn.oplog_progress.get_dict()
140 | 
141 |         # add a value to the file, delete the dict, and then read in the value
142 |         oplog_dict['oplog1'] = Timestamp(12, 34)
143 |         conn.write_oplog_progress()
144 |         del oplog_dict['oplog1']
145 | 
146 |         self.assertEqual(len(oplog_dict), 0)
147 | 
148 |         conn.read_oplog_progress()
149 |         oplog_dict = conn.oplog_progress.get_dict()
150 | 
151 |         self.assertIn('oplog1', oplog_dict)
152 |         self.assertEqual(oplog_dict['oplog1'], Timestamp(12, 34))
153 | 
154 |         oplog_dict['oplog1'] = Timestamp(55, 11)
155 | 
156 |         # reading the file again should overwrite the in-memory value
157 |         conn.read_oplog_progress()
158 |         self.assertEqual(oplog_dict['oplog1'], Timestamp(12, 34))
159 | 
160 |         os.unlink("temp_oplog.timestamp")
161 | 
162 | 
163 | if __name__ == '__main__':
164 |     unittest.main()
165 | 
--------------------------------------------------------------------------------
/tests/test_mongo_doc_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright 2013-2014 MongoDB, Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
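# The Mongo DocManager keeps its replication bookkeeping out of the target
# documents, in a separate `__mongo_connector` database (dropped by setUp
# below). A hedged sketch of one metadata document, inferred from the fields
# tests/test_mongo.py reads back; the exact shape is defined in
# mongo_connector/doc_managers/mongo_doc_manager.py:
#
#     # __mongo_connector['test.test']
#     {'_id': '1', 'ns': 'test.test', '_ts': 5767301236327972865,
#      'gridfs_id': ObjectId('...')}  # gridfs_id present only for GridFS files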
14 | 15 | """Tests each of the functions in mongo_doc_manager 16 | """ 17 | 18 | import sys 19 | 20 | sys.path[0:0] = [""] 21 | 22 | from mongo_connector.command_helper import CommandHelper 23 | from mongo_connector.doc_managers.mongo_doc_manager import DocManager 24 | from tests import unittest, TESTARGS 25 | from tests.test_gridfs_file import MockGridFSFile 26 | from tests.test_mongo import MongoTestCase 27 | 28 | 29 | class TestMongoDocManager(MongoTestCase): 30 | """Test class for MongoDocManager 31 | """ 32 | 33 | @classmethod 34 | def setUpClass(cls): 35 | MongoTestCase.setUpClass() 36 | cls.namespaces_inc = ["test.test_include1", "test.test_include2"] 37 | cls.namespaces_exc = ["test.test_exclude1", "test.test_exclude2"] 38 | cls.choosy_docman = DocManager( 39 | cls.standalone.uri, 40 | namespace_set=TestMongoDocManager.namespaces_inc 41 | ) 42 | 43 | def setUp(self): 44 | """Empty Mongo at the start of every test 45 | """ 46 | 47 | self.mongo_conn.drop_database("__mongo_connector") 48 | self._remove() 49 | 50 | conn = self.standalone.client() 51 | for ns in self.namespaces_inc + self.namespaces_exc: 52 | db, coll = ns.split('.', 1) 53 | conn[db][coll].remove() 54 | 55 | def test_namespaces(self): 56 | """Ensure that a DocManager instantiated with a namespace set 57 | has the correct namespaces 58 | """ 59 | 60 | self.assertEqual(set(self.namespaces_inc), 61 | set(self.choosy_docman._namespaces())) 62 | 63 | def test_update(self): 64 | doc_id = '1' 65 | doc = {"_id": doc_id, "a": 1, "b": 2} 66 | self.mongo.insert(doc) 67 | # $set only 68 | update_spec = {"$set": {"a": 1, "b": 2}} 69 | doc = self.choosy_docman.update(doc_id, update_spec, *TESTARGS) 70 | self.assertEqual(doc, {"_id": doc_id, "a": 1, "b": 2}) 71 | # $unset only 72 | update_spec = {"$unset": {"a": True}} 73 | doc = self.choosy_docman.update(doc_id, update_spec, *TESTARGS) 74 | self.assertEqual(doc, {"_id": doc_id, "b": 2}) 75 | # mixed $set/$unset 76 | update_spec = {"$unset": {"b": True}, "$set": {"c": 3}} 77 | doc = self.choosy_docman.update(doc_id, update_spec, *TESTARGS) 78 | self.assertEqual(doc, {"_id": doc_id, "c": 3}) 79 | 80 | def test_upsert(self): 81 | """Ensure we can properly insert into Mongo via DocManager. 82 | """ 83 | 84 | docc = {'_id': '1', 'name': 'John'} 85 | self.mongo_doc.upsert(docc, *TESTARGS) 86 | res = list(self._search()) 87 | self.assertEqual(len(res), 1) 88 | for doc in res: 89 | self.assertEqual(doc['_id'], '1') 90 | self.assertEqual(doc['name'], 'John') 91 | 92 | docc = {'_id': '1', 'name': 'Paul'} 93 | self.mongo_doc.upsert(docc, *TESTARGS) 94 | res = list(self._search()) 95 | self.assertEqual(len(res), 1) 96 | for doc in res: 97 | self.assertEqual(doc['_id'], '1') 98 | self.assertEqual(doc['name'], 'Paul') 99 | 100 | def test_bulk_upsert(self): 101 | """Test the bulk_upsert method.""" 102 | docs = ({"_id": i} for i in range(1000)) 103 | self.mongo_doc.bulk_upsert(docs, *TESTARGS) 104 | res = list(self._search(sort=[('_id', 1)])) 105 | self.assertEqual(len(res), 1000) 106 | for i, r in enumerate(res): 107 | self.assertEqual(r['_id'], i) 108 | 109 | docs = ({"_id": i, "weight": 2*i} for i in range(1000)) 110 | self.mongo_doc.bulk_upsert(docs, *TESTARGS) 111 | 112 | res = list(self._search(sort=[('_id', 1)])) 113 | self.assertEqual(len(res), 1000) 114 | for i, r in enumerate(res): 115 | self.assertEqual(r['weight'], 2*i) 116 | 117 | def test_remove(self): 118 | """Ensure we can properly delete from Mongo via DocManager. 
119 | """ 120 | 121 | docc = {'_id': '1', 'name': 'John'} 122 | self.mongo_doc.upsert(docc, *TESTARGS) 123 | self.assertEqual(len(list(self._search())), 1) 124 | self.mongo_doc.remove(docc['_id'], *TESTARGS) 125 | self.assertEqual(len(list(self._search())), 0) 126 | 127 | def test_insert_file(self): 128 | # Drop database, so that mongo_doc's client refreshes its index cache. 129 | self.mongo_doc.mongo.drop_database('test') 130 | test_data = ' '.join(str(x) for x in range(100000)).encode('utf8') 131 | docc = { 132 | '_id': 'test_id', 133 | 'filename': 'test_filename', 134 | 'upload_date': 5, 135 | 'md5': 'test_md5' 136 | } 137 | self.mongo_doc.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 138 | res = self._search() 139 | for doc in res: 140 | self.assertEqual(doc['_id'], docc['_id']) 141 | self.assertEqual(doc['filename'], docc['filename']) 142 | self.assertEqual(doc['content'], test_data) 143 | 144 | def test_remove_file(self): 145 | # Drop database, so that mongo_doc's client refreshes its index cache. 146 | self.mongo_doc.mongo.drop_database('test') 147 | test_data = b'hello world' 148 | docc = { 149 | '_id': 'test_id', 150 | 'filename': 'test_filename', 151 | 'upload_date': 5, 152 | 'md5': 'test_md5' 153 | } 154 | 155 | self.mongo_doc.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 156 | res = list(self._search()) 157 | self.assertEqual(len(res), 1) 158 | 159 | self.mongo_doc.remove(docc['_id'], *TESTARGS) 160 | res = list(self._search()) 161 | self.assertEqual(len(res), 0) 162 | 163 | def test_search(self): 164 | """Query Mongo for docs in a timestamp range. 165 | 166 | We use API and DocManager's search(start_ts,end_ts), and then compare. 167 | """ 168 | 169 | docc = {'_id': '1', 'name': 'John'} 170 | self.mongo_doc.upsert(docc, 'test.test', 5767301236327972865) 171 | docc2 = {'_id': '2', 'name': 'John Paul'} 172 | self.mongo_doc.upsert(docc2, 'test.test', 5767301236327972866) 173 | docc3 = {'_id': '3', 'name': 'Paul'} 174 | self.mongo_doc.upsert(docc3, 'test.test', 5767301236327972870) 175 | search = list(self.mongo_doc.search(5767301236327972865, 176 | 5767301236327972866)) 177 | self.assertEqual(len(search), 2) 178 | result_id = [result.get("_id") for result in search] 179 | self.assertIn('1', result_id) 180 | self.assertIn('2', result_id) 181 | 182 | def test_search_namespaces(self): 183 | """Test search within timestamp range with a given namespace set 184 | """ 185 | 186 | for ns in self.namespaces_inc: 187 | for i in range(100): 188 | self.choosy_docman.upsert({"_id": i}, ns, i) 189 | for ns in self.namespaces_exc: 190 | for i in range(100): 191 | self.choosy_docman.upsert({"_id": -i}, ns, i) 192 | 193 | results = list(self.choosy_docman.search(0, 49)) 194 | self.assertEqual(len(results), 100) 195 | for r in results: 196 | self.assertGreaterEqual(r['_id'], 0) 197 | 198 | def test_get_last_doc(self): 199 | """Insert documents, verify that get_last_doc() returns the one with 200 | the latest timestamp. 
201 | """ 202 | docc = {'_id': '4', 'name': 'Hare'} 203 | self.mongo_doc.upsert(docc, 'test.test', 3) 204 | docc = {'_id': '5', 'name': 'Tortoise'} 205 | self.mongo_doc.upsert(docc, 'test.test', 2) 206 | docc = {'_id': '6', 'name': 'Mr T.'} 207 | self.mongo_doc.upsert(docc, 'test.test', 1) 208 | doc = self.mongo_doc.get_last_doc() 209 | self.assertEqual(doc['_id'], '4') 210 | docc = {'_id': '6', 'name': 'HareTwin'} 211 | self.mongo_doc.upsert(docc, 'test.test', 4) 212 | doc = self.mongo_doc.get_last_doc() 213 | self.assertEqual(doc['_id'], '6') 214 | 215 | def test_get_last_doc_namespaces(self): 216 | """Ensure that get_last_doc returns the latest document in one of 217 | the given namespaces 218 | """ 219 | 220 | # latest document is not in included namespace 221 | for i in range(100): 222 | ns = (self.namespaces_inc, self.namespaces_exc)[i % 2][0] 223 | self.choosy_docman.upsert({"_id": i}, ns, i) 224 | last_doc = self.choosy_docman.get_last_doc() 225 | # Even value for _id means ns was in self.namespaces_inc. 226 | self.assertEqual(last_doc["_id"], 98) 227 | 228 | # remove latest document so last doc is in included namespace, 229 | # shouldn't change result 230 | db, coll = self.namespaces_inc[0].split(".", 1) 231 | self.standalone.client()[db][coll].remove({"_id": 99}) 232 | last_doc = self.choosy_docman.get_last_doc() 233 | self.assertEqual(last_doc["_id"], 98) 234 | 235 | def test_commands(self): 236 | self.mongo_doc.command_helper = CommandHelper() 237 | 238 | # create test thing, assert 239 | self.mongo_doc.handle_command({'create': 'test'}, *TESTARGS) 240 | self.assertIn('test', self.mongo_conn['test'].collection_names()) 241 | 242 | self.mongo_doc.handle_command( 243 | {'renameCollection': 'test.test', 'to': 'test.test2'}, 244 | 'admin.$cmd', 1) 245 | self.assertNotIn('test', self.mongo_conn['test'].collection_names()) 246 | self.assertIn('test2', self.mongo_conn['test'].collection_names()) 247 | 248 | self.mongo_doc.handle_command({'drop': 'test2'}, 'test.$cmd', 1) 249 | self.assertNotIn('test2', self.mongo_conn['test'].collection_names()) 250 | 251 | self.assertIn('test', self.mongo_conn.database_names()) 252 | self.mongo_doc.handle_command({'dropDatabase': 1}, 'test.$cmd', 1) 253 | self.assertNotIn('test', self.mongo_conn.database_names()) 254 | 255 | 256 | if __name__ == '__main__': 257 | unittest.main() 258 | -------------------------------------------------------------------------------- /tests/test_rollbacks.py: -------------------------------------------------------------------------------- 1 | """Test Mongo Connector's behavior when its source MongoDB system is 2 | experiencing a rollback. 
3 | 
4 | """
5 | 
6 | import os
7 | import sys
8 | import time
9 | 
10 | from pymongo.read_preferences import ReadPreference
11 | from pymongo import MongoClient
12 | 
13 | sys.path[0:0] = [""]
14 | 
15 | from mongo_connector.util import retry_until_ok
16 | from mongo_connector.locking_dict import LockingDict
17 | from mongo_connector.doc_managers.doc_manager_simulator import DocManager
18 | from mongo_connector.oplog_manager import OplogThread
19 | 
20 | from tests import unittest, STRESS_COUNT
21 | from tests.util import assert_soon
22 | from tests.setup_cluster import ReplicaSet
23 | 
24 | 
25 | class TestRollbacks(unittest.TestCase):
26 | 
27 |     def tearDown(self):
28 |         self.repl_set.stop()
29 | 
30 |     def setUp(self):
31 |         # Create a new oplog progress file
32 |         try:
33 |             os.unlink("oplog.timestamp")
34 |         except OSError:
35 |             pass
36 |         open("oplog.timestamp", "w").close()
37 | 
38 |         # Start a replica set
39 |         self.repl_set = ReplicaSet().start()
40 |         # Connection to the replica set as a whole
41 |         self.main_conn = self.repl_set.client()
42 |         # Connection to the primary specifically
43 |         self.primary_conn = self.repl_set.primary.client()
44 |         # Connection to the secondary specifically
45 |         self.secondary_conn = self.repl_set.secondary.client(
46 |             read_preference=ReadPreference.SECONDARY_PREFERRED)
47 | 
48 |         # Wipe any test data
49 |         self.main_conn["test"]["mc"].drop()
50 | 
51 |         # Oplog thread
52 |         doc_manager = DocManager()
53 |         oplog_progress = LockingDict()
54 |         self.opman = OplogThread(
55 |             primary_client=self.main_conn,
56 |             doc_managers=(doc_manager,),
57 |             oplog_progress_dict=oplog_progress,
58 |             ns_set=["test.mc"]
59 |         )
60 | 
61 |     def test_single_target(self):
62 |         """Test with a single replication target"""
63 | 
64 |         self.opman.start()
65 | 
66 |         # Insert first document with primary up
67 |         self.main_conn["test"]["mc"].insert({"i": 0})
68 |         self.assertEqual(self.primary_conn["test"]["mc"].find().count(), 1)
69 | 
70 |         # Make sure the insert is replicated
71 |         secondary = self.secondary_conn
72 |         assert_soon(lambda: secondary["test"]["mc"].count() == 1,
73 |                     "first write didn't replicate to secondary")
74 | 
75 |         # Kill the primary
76 |         self.repl_set.primary.stop(destroy=False)
77 | 
78 |         # Wait for the secondary to be promoted
79 |         assert_soon(lambda: secondary["admin"].command("isMaster")["ismaster"])
80 | 
81 |         # Insert another document. This will be rolled back later
82 |         retry_until_ok(self.main_conn["test"]["mc"].insert, {"i": 1})
83 |         self.assertEqual(secondary["test"]["mc"].count(), 2)
84 | 
85 |         # Wait for replication to doc manager
86 |         assert_soon(lambda: len(self.opman.doc_managers[0]._search()) == 2,
87 |                     "not all writes were replicated to doc manager")
88 | 
89 |         # Kill the new primary
90 |         self.repl_set.secondary.stop(destroy=False)
91 | 
92 |         # Start both servers back up
93 |         self.repl_set.primary.start()
94 |         primary_admin = self.primary_conn["admin"]
95 |         assert_soon(lambda: primary_admin.command("isMaster")["ismaster"],
96 |                     "restarted primary never resumed primary status")
97 |         self.repl_set.secondary.start()
98 |         assert_soon(lambda: retry_until_ok(secondary.admin.command,
99 |                                            'replSetGetStatus')['myState'] == 2,
100 |                     "restarted secondary never resumed secondary status")
101 |         assert_soon(lambda:
102 |                     retry_until_ok(self.main_conn.test.mc.find().count) > 0,
103 |                     "documents not found after primary/secondary restarted")
104 | 
105 |         # Only the first document should exist in MongoDB
106 |         self.assertEqual(self.main_conn["test"]["mc"].count(), 1)
107 |         self.assertEqual(self.main_conn["test"]["mc"].find_one()["i"], 0)
108 | 
109 |         # The same should hold for the doc manager
110 |         doc_manager = self.opman.doc_managers[0]
111 |         assert_soon(lambda: len(doc_manager._search()) == 1,
112 |                     'documents never rolled back in doc manager.')
113 |         self.assertEqual(doc_manager._search()[0]["i"], 0)
114 | 
115 |         # cleanup
116 |         self.opman.join()
117 | 
118 |     def test_many_targets(self):
119 |         """Test with several replication targets"""
120 | 
121 |         # OplogThread has multiple doc managers
122 |         doc_managers = [DocManager(), DocManager(), DocManager()]
123 |         self.opman.doc_managers = doc_managers
124 | 
125 |         self.opman.start()
126 | 
127 |         # Insert a document with the primary up
128 |         self.main_conn["test"]["mc"].insert({"i": 0})
129 |         self.assertEqual(self.primary_conn["test"]["mc"].count(), 1)
130 | 
131 |         # Make sure the insert is replicated
132 |         secondary = self.secondary_conn
133 |         assert_soon(lambda: secondary["test"]["mc"].count() == 1,
134 |                     "first write didn't replicate to secondary")
135 | 
136 |         # Kill the primary
137 |         self.repl_set.primary.stop(destroy=False)
138 | 
139 |         # Wait for the secondary to be promoted
140 |         assert_soon(lambda: secondary.admin.command("isMaster")['ismaster'],
141 |                     'secondary was never promoted')
142 | 
143 |         # Insert more documents. These will be rolled back later.
144 |         # Some of these documents will be manually removed from
145 |         # certain doc managers, to emulate the effect of certain
146 |         # target systems being ahead/behind others
147 |         secondary_ids = []
148 |         for i in range(1, 10):
149 |             secondary_ids.append(
150 |                 retry_until_ok(self.main_conn["test"]["mc"].insert,
151 |                                {"i": i}))
152 |         self.assertEqual(self.secondary_conn["test"]["mc"].count(), 10)
153 | 
154 |         # Wait for replication to the doc managers
155 |         def docmans_done():
156 |             for dm in self.opman.doc_managers:
157 |                 if len(dm._search()) != 10:
158 |                     return False
159 |             return True
160 |         assert_soon(docmans_done,
161 |                     "not all writes were replicated to doc managers")
162 | 
163 |         # Remove some documents from the doc managers to simulate
164 |         # uneven replication
165 |         ts = self.opman.doc_managers[0].get_last_doc()['_ts']
166 |         for id in secondary_ids[8:]:
167 |             self.opman.doc_managers[1].remove(id, 'test.mc', ts)
168 |         for id in secondary_ids[2:]:
169 |             self.opman.doc_managers[2].remove(id, 'test.mc', ts)
170 | 
171 |         # Kill the new primary
172 |         self.repl_set.secondary.stop(destroy=False)
173 | 
174 |         # Start both servers back up
175 |         self.repl_set.primary.start()
176 |         primary_admin = self.primary_conn["admin"]
177 |         assert_soon(lambda: primary_admin.command("isMaster")['ismaster'],
178 |                     'restarted primary never resumed primary status')
179 |         self.repl_set.secondary.start()
180 |         assert_soon(lambda: retry_until_ok(secondary.admin.command,
181 |                                            'replSetGetStatus')['myState'] == 2,
182 |                     "restarted secondary never resumed secondary status")
183 |         assert_soon(lambda:
184 |                     retry_until_ok(self.primary_conn.test.mc.find().count) > 0,
185 |                     "documents not found after primary/secondary restarted")
186 | 
187 |         # Only the first document should exist in MongoDB
188 |         self.assertEqual(self.primary_conn["test"]["mc"].count(), 1)
189 |         self.assertEqual(self.primary_conn["test"]["mc"].find_one()["i"], 0)
190 | 
191 |         # Give OplogThread some time to catch up
192 |         time.sleep(10)
193 | 
194 |         # The same should hold for the doc managers
195 |         for dm in self.opman.doc_managers:
196 |             self.assertEqual(len(dm._search()), 1)
197 |             self.assertEqual(dm._search()[0]["i"], 0)
198 | 
199 |         self.opman.join()
200 | 
201 |     def test_deletions(self):
202 |         """Test rolling back 'd' (delete) operations"""
203 | 
204 |         self.opman.start()
205 | 
206 |         # Insert two documents, wait till they replicate to the secondary
207 |         self.main_conn["test"]["mc"].insert({"i": 0})
208 |         self.main_conn["test"]["mc"].insert({"i": 1})
209 |         self.assertEqual(self.primary_conn["test"]["mc"].find().count(), 2)
210 |         assert_soon(lambda: self.secondary_conn["test"]["mc"].count() == 2,
211 |                     "first write didn't replicate to secondary")
212 | 
213 |         # Kill the primary, wait for secondary to be promoted
214 |         self.repl_set.primary.stop(destroy=False)
215 |         assert_soon(lambda: self.secondary_conn["admin"]
216 |                     .command("isMaster")["ismaster"])
217 | 
218 |         # Delete first document
219 |         retry_until_ok(self.main_conn["test"]["mc"].remove, {"i": 0})
220 |         self.assertEqual(self.secondary_conn["test"]["mc"].count(), 1)
221 | 
222 |         # Wait for replication to doc manager
223 |         assert_soon(lambda: len(self.opman.doc_managers[0]._search()) == 1,
224 |                     "delete was not replicated to doc manager")
225 | 
226 |         # Kill the new primary
227 |         self.repl_set.secondary.stop(destroy=False)
228 | 
229 |         # Start both servers back up
230 |         self.repl_set.primary.start()
231 |         primary_admin = self.primary_conn["admin"]
232 |         assert_soon(lambda: primary_admin.command("isMaster")["ismaster"],
233 |                     "restarted primary never resumed primary status")
234 |         self.repl_set.secondary.start()
235 |         assert_soon(lambda: retry_until_ok(self.secondary_conn.admin.command,
236 |                                            'replSetGetStatus')['myState'] == 2,
237 |                     "restarted secondary never resumed secondary status")
238 | 
239 |         # Both documents should exist in mongo
240 |         assert_soon(lambda: retry_until_ok(
241 |             self.main_conn["test"]["mc"].count) == 2)
242 | 
243 |         # Both documents should exist in the doc manager
244 |         doc_manager = self.opman.doc_managers[0]
245 |         docs = list(doc_manager._search())
246 |         self.assertEqual(len(docs), 2,
247 |                          "Expected two documents, but got %r" % docs)
248 | 
249 |         self.opman.join()
250 | 
251 |     def test_stressed_rollback(self):
252 |         """Stress test for a rollback with many documents."""
253 |         self.opman.start()
254 | 
255 |         c = self.main_conn.test.mc
256 |         docman = self.opman.doc_managers[0]
257 | 
258 |         c.insert({'i': i} for i in range(STRESS_COUNT))
259 |         assert_soon(lambda: c.count() == STRESS_COUNT)
260 |         condition = lambda: len(docman._search()) == STRESS_COUNT
261 |         assert_soon(condition, ("Was expecting %d documents in DocManager, "
262 |                                 "but %d found instead."
263 |                                 % (STRESS_COUNT, len(docman._search()))))
264 | 
265 |         primary_conn = self.repl_set.primary.client()
266 |         self.repl_set.primary.stop(destroy=False)
267 |         new_primary_conn = self.repl_set.secondary.client()
268 | 
269 |         admin = new_primary_conn.admin
270 |         assert_soon(
271 |             lambda: retry_until_ok(admin.command, "isMaster")['ismaster'])
272 | 
273 |         retry_until_ok(c.insert,
274 |                        [{'i': str(STRESS_COUNT + i)}
275 |                         for i in range(STRESS_COUNT)])
276 |         assert_soon(lambda: len(docman._search()) == c.count())
277 | 
278 |         self.repl_set.secondary.stop(destroy=False)
279 | 
280 |         self.repl_set.primary.start()
281 |         admin = primary_conn.admin
282 |         assert_soon(
283 |             lambda: retry_until_ok(admin.command, "isMaster")['ismaster'])
284 |         self.repl_set.secondary.start()
285 | 
286 |         assert_soon(lambda: retry_until_ok(c.count) == STRESS_COUNT)
287 |         assert_soon(condition, ("Was expecting %d documents in DocManager, "
288 |                                 "but %d found instead."
289 |                                 % (STRESS_COUNT, len(docman._search()))))
290 | 
291 |         self.opman.join()
292 | 
--------------------------------------------------------------------------------
/tests/test_solr_doc_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright 2013-2014 MongoDB, Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import datetime
16 | import sys
17 | import time
18 | 
19 | sys.path[0:0] = [""]
20 | 
21 | from mongo_connector.command_helper import CommandHelper
22 | from mongo_connector.doc_managers.solr_doc_manager import DocManager
23 | 
24 | from tests import unittest, TESTARGS, solr_url
25 | from tests.test_gridfs_file import MockGridFSFile
26 | from tests.test_solr import SolrTestCase
27 | 
28 | 
29 | class TestSolrDocManager(SolrTestCase):
30 |     """Test class for SolrDocManager
31 |     """
32 | 
33 |     def setUp(self):
34 |         """Empty Solr at the start of every test
35 |         """
36 |         self._remove()
37 | 
38 |     def test_update(self):
39 |         doc_id = '1'
40 |         doc = {"_id": doc_id, "title": "abc", "description": "def"}
41 |         self.docman.upsert(doc, *TESTARGS)
42 |         # $set only
43 |         update_spec = {"$set": {"title": "qaz", "description": "wsx"}}
44 |         doc = self.docman.update(doc_id, update_spec, *TESTARGS)
45 |         expected = {"_id": doc_id, "title": "qaz", "description": "wsx"}
46 |         # We can't use assertEqual here, because Solr adds some
47 |         # additional fields like _version_ to all documents
48 |         for k, v in expected.items():
49 |             self.assertEqual(doc[k], v)
50 | 
51 |         # $unset only
52 |         update_spec = {"$unset": {"title": True}}
53 |         doc = self.docman.update(doc_id, update_spec, *TESTARGS)
54 |         expected = {"_id": '1', "description": "wsx"}
55 |         for k, v in expected.items():
56 |             self.assertEqual(doc[k], v)
57 |         self.assertNotIn("title", doc)
58 | 
59 |         # mixed $set/$unset
60 |         update_spec = {"$unset": {"description": True},
61 |                        "$set": {"subject": "edc"}}
62 |         doc = self.docman.update(doc_id, update_spec, *TESTARGS)
63 |         expected = {"_id": '1', "subject": "edc"}
64 |         for k, v in expected.items():
65 |             self.assertEqual(doc[k], v)
66 |         self.assertNotIn("description", doc)
67 | 
68 |     def test_upsert(self):
69 |         """Ensure we can properly insert into Solr via DocManager.
70 |         """
71 |         # Test upsert
72 |         docc = {'_id': '1', 'name': 'John'}
73 |         self.docman.upsert(docc, *TESTARGS)
74 |         res = self.solr_conn.search('*:*')
75 |         for doc in res:
76 |             self.assertTrue(doc['_id'] == '1' and doc['name'] == 'John')
77 | 
78 |         docc = {'_id': '1', 'name': 'Paul'}
79 |         self.docman.upsert(docc, *TESTARGS)
80 |         res = self.solr_conn.search('*:*')
81 |         for doc in res:
82 |             self.assertTrue(doc['_id'] == '1' and doc['name'] == 'Paul')
83 | 
84 |     def test_bulk_upsert(self):
85 |         """Ensure we can properly insert many documents at once into
86 |         Solr via DocManager
87 | 
88 |         """
89 |         self.docman.bulk_upsert([], *TESTARGS)
90 | 
91 |         docs = ({"_id": i} for i in range(1000))
92 |         self.docman.bulk_upsert(docs, *TESTARGS)
93 | 
94 |         res = sorted(int(x["_id"])
95 |                      for x in self.solr_conn.search("*:*", rows=1001))
96 |         self.assertEqual(len(res), 1000)
97 |         for i, r in enumerate(res):
98 |             self.assertEqual(r, i)
99 | 
100 |         docs = ({"_id": i, "weight": 2*i} for i in range(1000))
101 |         self.docman.bulk_upsert(docs, *TESTARGS)
102 | 
103 |         res = sorted(int(x["weight"])
104 |                      for x in self.solr_conn.search("*:*", rows=1001))
105 |         self.assertEqual(len(res), 1000)
106 |         for i, r in enumerate(res):
107 |             self.assertEqual(r, 2*i)
108 | 
109 |     def test_remove(self):
110 |         """Ensure we can properly delete from Solr via DocManager.
111 | """ 112 | #test remove 113 | docc = {'_id': '1', 'name': 'John'} 114 | self.docman.upsert(docc, *TESTARGS) 115 | res = self.solr_conn.search('*:*') 116 | self.assertEqual(len(res), 1) 117 | 118 | self.docman.remove(docc['_id'], *TESTARGS) 119 | res = self.solr_conn.search('*:*') 120 | self.assertEqual(len(res), 0) 121 | 122 | def test_insert_file(self): 123 | """Ensure we can properly insert a file into Solr via DocManager. 124 | """ 125 | test_data = ' '.join(str(x) for x in range(100000)).encode('utf8') 126 | docc = { 127 | '_id': 'test_id', 128 | 'filename': 'test_filename', 129 | 'upload_date': datetime.datetime.now(), 130 | 'md5': 'test_md5' 131 | } 132 | self.docman.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 133 | res = self.solr_conn.search('*:*') 134 | for doc in res: 135 | self.assertEqual(doc['_id'], docc['_id']) 136 | self.assertEqual(doc['filename'], docc['filename']) 137 | self.assertIn(test_data.strip(), 138 | doc['content'][0].strip().encode('utf8')) 139 | 140 | def test_remove_file(self): 141 | test_data = b'hello world' 142 | docc = { 143 | '_id': 'test_id', 144 | 'filename': 'test_filename', 145 | 'upload_date': datetime.datetime.now(), 146 | 'md5': 'test_md5' 147 | } 148 | 149 | self.docman.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 150 | res = self.solr_conn.search('*:*') 151 | self.assertEqual(len(res), 1) 152 | 153 | self.docman.remove(docc['_id'], *TESTARGS) 154 | res = self.solr_conn.search('*:*') 155 | self.assertEqual(len(res), 0) 156 | 157 | def test_search(self): 158 | """Query Solr for docs in a timestamp range. 159 | 160 | We use API and DocManager's search(start_ts,end_ts), and then compare. 161 | """ 162 | #test search 163 | docc = {'_id': '1', 'name': 'John'} 164 | self.docman.upsert(docc, 'test.test', 5767301236327972865) 165 | docc = {'_id': '2', 'name': 'John Paul'} 166 | self.docman.upsert(docc, 'test.test', 5767301236327972866) 167 | docc = {'_id': '3', 'name': 'Paul'} 168 | self.docman.upsert(docc, 'test.test', 5767301236327972870) 169 | search = list(self.docman.search(5767301236327972865, 170 | 5767301236327972866)) 171 | self.assertEqual(2, len(search), 172 | 'Should find two documents in timestamp range.') 173 | result_names = [result.get("name") for result in search] 174 | self.assertIn('John', result_names) 175 | self.assertIn('John Paul', result_names) 176 | 177 | def test_solr_commit(self): 178 | """Test that documents get properly added to Solr. 179 | """ 180 | docman = DocManager(solr_url) 181 | # test cases: 182 | # None = no autocommit 183 | # 0 = commit immediately 184 | # x > 0 = commit within x seconds 185 | for autocommit_interval in [None, 0, 1, 2]: 186 | docman.auto_commit_interval = autocommit_interval 187 | docman.upsert({'_id': '3', 'name': 'Waldo'}, *TESTARGS) 188 | if autocommit_interval is None: 189 | docman.commit() 190 | else: 191 | # Allow just a little extra time 192 | time.sleep(autocommit_interval + 1) 193 | results = list(self._search("name:Waldo")) 194 | self.assertEqual(len(results), 1, 195 | "should commit document with " 196 | "auto_commit_interval = %s" % str( 197 | autocommit_interval)) 198 | self.assertEqual(results[0]["name"], "Waldo") 199 | self._remove() 200 | 201 | def test_get_last_doc(self): 202 | """Insert documents, Verify the doc with the latest timestamp. 
203 | """ 204 | #test get last doc 205 | docc = {'_id': '4', 'name': 'Hare'} 206 | self.docman.upsert(docc, 'test.test', 2) 207 | docc = {'_id': '5', 'name': 'Tortoise'} 208 | self.docman.upsert(docc, 'test.test', 1) 209 | doc = self.docman.get_last_doc() 210 | self.assertTrue(doc['_id'] == '4') 211 | 212 | docc = {'_id': '6', 'name': 'HareTwin', 'ts': '2'} 213 | doc = self.docman.get_last_doc() 214 | self.assertTrue(doc['_id'] == '4' or doc['_id'] == '6') 215 | 216 | def test_commands(self): 217 | self.docman.command_helper = CommandHelper() 218 | 219 | def count_ns(ns): 220 | return sum(1 for _ in self._search("ns:%s" % ns)) 221 | 222 | self.docman.upsert({'_id': '1', 'test': 'data'}, *TESTARGS) 223 | self.assertEqual(count_ns("test.test"), 1) 224 | 225 | self.docman.handle_command({'drop': 'test'}, *TESTARGS) 226 | time.sleep(1) 227 | self.assertEqual(count_ns("test.test"), 0) 228 | 229 | self.docman.upsert({'_id': '2', 'test': 'data'}, 'test.test2', '2') 230 | self.docman.upsert({'_id': '3', 'test': 'data'}, 'test.test3', '3') 231 | self.docman.handle_command({'dropDatabase': 1}, 'test.$cmd', 1) 232 | time.sleep(1) 233 | self.assertEqual(count_ns("test.test2"), 0) 234 | self.assertEqual(count_ns("test.test3"), 0) 235 | 236 | 237 | if __name__ == '__main__': 238 | unittest.main() 239 | -------------------------------------------------------------------------------- /tests/test_synchronizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 
15 | """Test synchronizer using DocManagerSimulator
16 | """
17 | import os
18 | import sys
19 | import time
20 | 
21 | sys.path[0:0] = [""]
22 | 
23 | from mongo_connector.connector import Connector
24 | from tests import unittest, connector_opts
25 | from tests.setup_cluster import ReplicaSet
26 | from tests.util import assert_soon
27 | 
28 | 
29 | class TestSynchronizer(unittest.TestCase):
30 |     """ Tests the synchronizers
31 |     """
32 | 
33 |     @classmethod
34 |     def setUpClass(cls):
35 |         """ Initializes the cluster
36 |         """
37 |         try:
38 |             os.unlink("oplog.timestamp")
39 |         except OSError:
40 |             pass
41 |         open("oplog.timestamp", "w").close()
42 | 
43 |         cls.repl_set = ReplicaSet().start()
44 |         cls.conn = cls.repl_set.client()
45 |         cls.connector = Connector(
46 |             mongo_address=cls.repl_set.uri,
47 |             ns_set=['test.test'],
48 |             **connector_opts
49 |         )
50 |         cls.synchronizer = cls.connector.doc_managers[0]
51 |         cls.connector.start()
52 |         assert_soon(lambda: len(cls.connector.shard_set) != 0)
53 | 
54 |     @classmethod
55 |     def tearDownClass(cls):
56 |         """ Tears down connector
57 |         """
58 |         cls.connector.join()
59 |         cls.repl_set.stop()
60 | 
61 |     def setUp(self):
62 |         """ Clears the db
63 |         """
64 |         self.conn['test']['test'].remove()
65 |         assert_soon(lambda: len(self.synchronizer._search()) == 0)
66 | 
67 |     def test_insert(self):
68 |         """Tests insert
69 |         """
70 |         self.conn['test']['test'].insert({'name': 'paulie'})
71 |         while (len(self.synchronizer._search()) == 0):
72 |             time.sleep(1)
73 |         result_set_1 = self.synchronizer._search()
74 |         self.assertEqual(len(result_set_1), 1)
75 |         result_set_2 = self.conn['test']['test'].find_one()
76 |         for item in result_set_1:
77 |             self.assertEqual(item['_id'], result_set_2['_id'])
78 |             self.assertEqual(item['name'], result_set_2['name'])
79 | 
80 |     def test_ns_set(self):
81 |         self.conn.test.other.insert({"replicated": False})
82 |         results = self.synchronizer._search()
83 |         self.assertEqual(len(results), 0,
84 |                          "Should not replicate outside of test.test namespace")
85 | 
86 |     def test_remove(self):
87 |         """Tests remove
88 |         """
89 |         self.conn['test']['test'].insert({'name': 'paulie'})
90 |         while (len(self.synchronizer._search()) != 1):
91 |             time.sleep(1)
92 |         self.conn['test']['test'].remove({'name': 'paulie'})
93 | 
94 |         while (len(self.synchronizer._search()) == 1):
95 |             time.sleep(1)
96 |         result_set_1 = self.synchronizer._search()
97 |         self.assertEqual(len(result_set_1), 0)
98 | 
99 |     def test_update(self):
100 |         """Test that Connector can replicate updates successfully."""
101 |         doc = {"a": 1, "b": 2}
102 |         self.conn.test.test.insert(doc)
103 |         selector = {"_id": doc['_id']}
104 | 
105 |         def update_and_retrieve(update_spec):
106 |             self.conn.test.test.update(selector, update_spec)
107 |             # Give the connector some time to perform update
108 |             time.sleep(1)
109 |             return self.synchronizer._search()[0]
110 | 
111 |         # Update whole document
112 |         doc = update_and_retrieve({"a": 1, "b": 2, "c": 10})
113 |         self.assertEqual(doc['a'], 1)
114 |         self.assertEqual(doc['b'], 2)
115 |         self.assertEqual(doc['c'], 10)
116 | 
117 |         # $set only
118 |         doc = update_and_retrieve({"$set": {"b": 4}})
119 |         self.assertEqual(doc['a'], 1)
120 |         self.assertEqual(doc['b'], 4)
121 | 
122 |         # $unset only
123 |         doc = update_and_retrieve({"$unset": {"a": True}})
124 |         self.assertNotIn('a', doc)
125 |         self.assertEqual(doc['b'], 4)
126 | 
127 |         # mixed $set/$unset
128 |         doc = update_and_retrieve({"$unset": {"b": True}, "$set": {"c": 3}})
129 |         self.assertEqual(doc['c'], 3)
130 |         self.assertNotIn('b', doc)
131 | 
132 |         # Ensure update works when fields are given
133 |         opthread = self.connector.shard_set[0]
134 |         opthread.fields = ['a', 'b', 'c']
135 |         try:
136 |             doc = update_and_retrieve({"$set": {"d": 10}})
137 |             self.assertEqual(self.conn.test.test.find_one(doc['_id'])['d'], 10)
138 |             self.assertNotIn('d', doc)
139 |             doc = update_and_retrieve({"$set": {"a": 10}})
140 |             self.assertEqual(doc['a'], 10)
141 |         finally:
142 |             # cleanup
143 |             opthread.fields = None
144 | 
145 | 
146 | if __name__ == '__main__':
147 |     unittest.main()
148 | 
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2013-2014 MongoDB, Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Tests methods in util.py
16 | """
17 | import sys
18 | 
19 | from bson import timestamp
20 | 
21 | sys.path[0:0] = [""]
22 | 
23 | from mongo_connector.util import (bson_ts_to_long,
24 |                                   long_to_bson_ts,
25 |                                   retry_until_ok)
26 | from tests import unittest
27 | 
28 | 
29 | def err_func():
30 |     """Helper function for retry_until_ok test
31 |     """
32 | 
33 |     err_func.counter += 1
34 |     if err_func.counter == 3:
35 |         return True
36 |     else:
37 |         raise TypeError
38 | 
39 | err_func.counter = 0
40 | 
41 | 
42 | class TestUtil(unittest.TestCase):
43 |     """ Tests the utils
44 |     """
45 | 
46 |     def test_bson_ts_to_long(self):
47 |         """Test bson_ts_to_long and long_to_bson_ts
48 |         """
49 | 
50 |         tstamp = timestamp.Timestamp(0x12345678, 0x90abcdef)
51 | 
52 |         self.assertEqual(0x1234567890abcdef,
53 |                          bson_ts_to_long(tstamp))
54 |         self.assertEqual(long_to_bson_ts(0x1234567890abcdef),
55 |                          tstamp)
56 | 
57 |     def test_retry_until_ok(self):
58 |         """Test retry_until_ok
59 |         """
60 | 
61 |         self.assertTrue(retry_until_ok(err_func))
62 |         self.assertEqual(err_func.counter, 3)
63 | 
64 | 
65 | if __name__ == '__main__':
66 | 
67 |     unittest.main()
68 | 
--------------------------------------------------------------------------------
/tests/util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2013-2014 MongoDB, Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Utilities for mongo-connector tests. There are no actual tests in here.
16 | """ 17 | 18 | import time 19 | 20 | 21 | def wait_for(condition, max_tries=60): 22 | """Wait for a condition to be true up to a maximum number of tries 23 | """ 24 | while not condition() and max_tries > 1: 25 | time.sleep(1) 26 | max_tries -= 1 27 | return condition() 28 | 29 | 30 | def assert_soon(condition, message=None, max_tries=60): 31 | """Assert that a condition eventually evaluates to True after at most 32 | max_tries number of attempts 33 | 34 | """ 35 | if not wait_for(condition, max_tries=max_tries): 36 | raise AssertionError(message or "") 37 | --------------------------------------------------------------------------------