├── .flake8
├── .gitignore
├── .travis.yml
├── CHANGELOG.rst
├── LICENSE
├── README.rst
├── mongo_connector
│   ├── __init__.py
│   └── doc_managers
│       ├── __init__.py
│       └── elastic2_doc_manager.py
├── pyproject.toml
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── test_elastic2.py
│   └── test_elastic2_doc_manager.py
└── tox.ini

/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # defer to black http://bit.ly/2CKCP8W
3 | max-line-length = 88
4 | ignore =
5 |     E203
6 |     W503
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yougov/elastic2-doc-manager/ad92138d1fd6656bb2e71cb5cc840f9ba0109c49/.gitignore
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: python
3 |
4 | # Elasticsearch 5 requires Java 8
5 | addons:
6 |   apt:
7 |     packages:
8 |       - oracle-java8-set-default
9 |
10 | env:
11 |   global:
12 |     - ELASTIC_2_URL=http://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/tar/elasticsearch/2.4.3/elasticsearch-2.4.3.tar.gz
13 |     - ELASTIC_5_URL=http://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.1.1.tar.gz
14 |     # We need to manually set JAVA_HOME to Java 8 too.
15 |     - JAVA_HOME=/usr/lib/jvm/java-8-oracle
16 |
17 | jobs:
18 |   fast_finish: true
19 |   include:
20 |     - python: &latest_py3 3.6
21 |       env: MONGODB=3.2.11 ELASTIC=5.1.1 ELASTIC_URL=$ELASTIC_5_URL TOXENV=elastic5
22 |     - python: 3.4
23 |       env: MONGODB=2.6.12 ELASTIC=2.4.3 ELASTIC_URL=$ELASTIC_2_URL TOXENV=elastic2
24 |     - python: 3.5
25 |       env: MONGODB=2.4.14 ELASTIC=2.4.3 ELASTIC_URL=$ELASTIC_2_URL TOXENV=elastic2
26 |     - stage: deploy
27 |       if: tag IS present
28 |       python: *latest_py3
29 |       install: skip
30 |       script: skip
31 |       deploy:
32 |         provider: pypi
33 |         on:
34 |           tags: true
35 |           all_branches: true
36 |         user: jaraco
37 |         password:
38 |           secure: rx5Yz5qT+wPWiT+wOn0+U3F4G0Kk/rouMkJeOnGNxb/NHWwc9iFqeLRRw5rWyF6COPMuKh32MlMIdL7EQNPyadEZ9djtPWC5csJYNLJAH/VC/+vZd5ZEwbsp1BYB+dZqUSUf7+G1yANvi4jd6x8zK1M9FHtL+LrWda/t2gsJePIx9rzm5PVHZO9GX9+ljb+pKP2Pk768BqL43z8tPOmlue/ONiC5OEc1Bd2mmxSm3ObaFqAJyr8F05GF90SNxZ8E+eCZFLGG4os5ul3yqjCTAFx9edRgXZVZmBInp6nzF0R69PZvJQj/+yyIUcZdrpZra9Glnc5tXNCx2EA9qChMD0aTsFyWvtRTk3AREzd9Ph4oSJ25jxjdDfaGkNDJvMIp7WeGLwuYpZtQ/IKlA7Nvjt5IcYYvzLWWpsA/EamBIglJn1SyYRmbXo2j3xymMF1jVskXgNO/nt9U9rPQApPUVXjuTGUsu1tbqzQG1Hnm+Mo3vbwkmj4DYoa7XcE+WXs2Mtk5cHsMdlBapnEB9bn5reuK4cvSOidbZNgsm8ObI6Yk6pRU/gTiy5wBcH4NGy9vvK0IxFT2/I+q83HtJk/JRu1m7ImRYEluG+0y5HLi444rY2zcrdGq54UFdvLbgXkxJsj91au7NHyvukbZ3KchvK2937rrn3WgBcD24T0qfEc=
39 |         distributions: dists
40 |         skip_cleanup: true
41 |
42 | install:
43 |   - java -version
44 |   - # Install Elasticsearch with mapper-attachments
45 |   - wget $ELASTIC_URL
46 |   - tar -xvf elasticsearch-${ELASTIC}.tar.gz
47 |   - export PATH=${PWD}/elasticsearch-${ELASTIC}/bin/:${PATH}
48 |   - echo 'y' | elasticsearch-plugin install mapper-attachments || echo 'y' | plugin install mapper-attachments
49 |   - elasticsearch --version
50 |   - # Install MongoDB
51 |   - wget http://fastdl.mongodb.org/linux/mongodb-linux-x86_64-${MONGODB}.tgz
52 |   - tar -zxf mongodb-linux-x86_64-${MONGODB}.tgz
53 |   - export PATH=${PWD}/mongodb-linux-x86_64-${MONGODB}/bin/:${PATH}
54 |   - mongod --version
55 |   - pip install -U tox tox-venv
56 |
57 | before_script:
58 |
- elasticsearch > temp.txt & 59 | - sleep 10 60 | 61 | script: 62 | - tox 63 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Version 1.0.0 5 | ------------- 6 | 7 | - Drop support for Python 3.3 and earlier, including Python 2. 8 | 9 | Version 0.4.0 10 | ------------- 11 | 12 | - Remove reliance on ``mongo_connector.compat``. 13 | 14 | Version 0.3.0 15 | ------------- 16 | 17 | - Support for Elasticsearch 5.x. 18 | - Significant performance improvements because operations are buffered. 19 | - BulkIndexErrors are now caught and reraised as OperationFailed. 20 | 21 | Version 0.2.0 22 | ------------- 23 | 24 | - Bug fix for namespace information saved in the mongo-connector metadata index. 25 | - Support AWS Elasticsearch Service Request Signing. 26 | 27 | Version 0.1.0 28 | ------------- 29 | 30 | This was the first release of elastic-doc-manager. 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | elastic2-doc-manager 3 | ==================== 4 | 5 | The mongo-connector project originated as a MongoDB mongo-labs 6 | project and is now community-maintained under the custody of YouGov, Plc. 7 | 8 | .. 
image:: https://travis-ci.org/yougov/elastic2-doc-manager.svg?branch=master
9 |    :alt: View build status
10 |    :target: https://travis-ci.org/yougov/elastic2-doc-manager
11 |
12 | Getting Started
13 | ===============
14 |
15 | This package is a document manager for
16 | `mongo-connector <https://github.com/yougov/mongo-connector>`_ that
17 | targets Elasticsearch versions 2.x and 5.x -- don't let the name fool you!
18 | For information on running mongo-connector with Elasticsearch, please see the
19 | `MongoConnector Usage with Elasticsearch
20 | <https://github.com/yougov/mongo-connector/wiki/Usage%20with%20ElasticSearch>`_
21 | wiki page.
22 |
23 | Installation
24 | ============
25 |
26 | The installation of the elastic2-doc-manager depends on which version of
27 | Elasticsearch you are targeting.
28 |
29 | Elasticsearch 1.x
30 | -----------------
31 |
32 | This is the document manager for Elasticsearch 2.x and 5.x. If you
33 | want to target Elasticsearch 1.x, please install the
34 | `elastic-doc-manager <https://github.com/yougov/elastic-doc-manager>`_.
35 |
36 | Elasticsearch 2.x
37 | -----------------
38 |
39 | For use with an Elasticsearch 2.x server, install with
40 | `pip <https://pip.pypa.io>`__::
41 |
42 |     pip install 'elastic2-doc-manager[elastic2]'
43 |
44 | Elasticsearch 5.x
45 | -----------------
46 |
47 | For use with an Elasticsearch 5.x server, install with::
48 |
49 |     pip install 'elastic2-doc-manager[elastic5]'
50 |
51 | .. note:: Version 0.3.0 added support for Elasticsearch 5.x.
52 |
53 |
54 | Amazon Elasticsearch Service
55 | ----------------------------
56 |
57 | To use with Amazon Elasticsearch Service, you must install the required AWS
58 | dependencies along with the version of Elasticsearch::
59 |
60 |     pip install 'elastic2-doc-manager[elastic2,aws]'
61 |
62 |
63 | Development
64 | -----------
65 |
66 | You can also install the development version of elastic2-doc-manager
67 | manually::
68 |
69 |     git clone https://github.com/yougov/elastic2-doc-manager.git
70 |     pip install -e './elastic2-doc-manager[elastic2]'
71 |
72 | You may have to run ``pip`` with ``sudo``, depending on where you're
73 | installing and what privileges you have.
74 |
75 | .. note:: Before mongo-connector version 2.2.2, the elastic
76 |    doc manager was packaged with mongo-connector and only supported
77 |    Elasticsearch 1.x.
78 |
79 | Running the tests
80 | -----------------
81 | Requirements
82 | ~~~~~~~~~~~~
83 |
84 | 1. Copy of the Elastic 2.x Document Manager GitHub repository
85 |
86 | The tests are not included in the package from PyPI and can only be acquired
87 | by cloning this repository on GitHub::
88 |
89 |     git clone https://github.com/yougov/elastic2-doc-manager
90 |
91 | 2. Tox
92 |
93 | Install `tox <https://tox.readthedocs.io>`_.
94 |
95 | 3. Environment variables
96 |
97 | There are a few influential environment variables that affect the tests. These
98 | are defined in ``tox.ini``.
99 |
100 | All the tests live in the ``tests`` directory.
101 |
102 | Running tests on the command-line
103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104 |
105 | While the tests take care of setting up and tearing down MongoDB clusters on
106 | their own, make sure to start Elasticsearch before doing a full test run!
107 |
108 | You can run all the tests with one command (this works in all supported Python versions)::
109 |
110 |     tox
111 |
112 | Error messages
113 | ~~~~~~~~~~~~~~
114 |
115 | Some of the tests are meant to generate lots of ``ERROR``-level log messages,
116 | especially the rollback tests. mongo-connector logs exceptions it encounters
117 | while iterating the cursor in the oplog, so we see these in the console output
118 | while MongoDB clusters are being torn apart in the tests.
As long as all the 119 | tests pass with an `OK` message, all is well. 120 | -------------------------------------------------------------------------------- /mongo_connector/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | 3 | __path__ = extend_path(__path__, __name__) 4 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | 3 | __path__ = extend_path(__path__, __name__) 4 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/elastic2_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Elasticsearch implementation of the DocManager interface. 16 | 17 | Receives documents from an OplogThread and takes the appropriate actions on 18 | Elasticsearch. 19 | """ 20 | import base64 21 | import logging 22 | import threading 23 | import time 24 | import warnings 25 | 26 | import bson.json_util 27 | 28 | try: 29 | __import__("elasticsearch") 30 | except ImportError: 31 | raise ImportError( 32 | "Error: elasticsearch (https://pypi.python.org/pypi/elasticsearch) " 33 | "version 2.x or 5.x is not installed.\n" 34 | "Install with:\n" 35 | " pip install elastic2-doc-manager[elastic2]\n" 36 | "or:\n" 37 | " pip install elastic2-doc-manager[elastic5]\n" 38 | ) 39 | 40 | from elasticsearch import ( 41 | Elasticsearch, 42 | exceptions as es_exceptions, 43 | connection as es_connection, 44 | ) 45 | from elasticsearch.helpers import bulk, scan, streaming_bulk, BulkIndexError 46 | 47 | import importlib_metadata 48 | 49 | from mongo_connector import errors 50 | from mongo_connector.constants import DEFAULT_COMMIT_INTERVAL, DEFAULT_MAX_BULK 51 | from mongo_connector.util import exception_wrapper, retry_until_ok 52 | from mongo_connector.doc_managers.doc_manager_base import DocManagerBase 53 | from mongo_connector.doc_managers.formatters import DefaultDocumentFormatter 54 | 55 | _HAS_AWS = True 56 | try: 57 | from boto3 import session 58 | from requests_aws_sign import AWSV4Sign 59 | except ImportError: 60 | _HAS_AWS = False 61 | 62 | wrap_exceptions = exception_wrapper( 63 | { 64 | BulkIndexError: errors.OperationFailed, 65 | es_exceptions.ConnectionError: errors.ConnectionFailed, 66 | es_exceptions.TransportError: errors.OperationFailed, 67 | es_exceptions.NotFoundError: errors.OperationFailed, 68 | es_exceptions.RequestError: errors.OperationFailed, 69 | } 70 | ) 71 | 72 | LOG = logging.getLogger(__name__) 73 | 74 | DEFAULT_SEND_INTERVAL = 5 75 | """The default interval in seconds to send buffered operations.""" 76 | 77 | DEFAULT_AWS_REGION = "us-east-1" 78 | 79 | __version__ = 
importlib_metadata.version("elastic2_doc_manager") 80 | 81 | 82 | def convert_aws_args(aws_args): 83 | """Convert old style options into arguments to boto3.session.Session.""" 84 | if not isinstance(aws_args, dict): 85 | raise errors.InvalidConfiguration( 86 | 'Elastic DocManager config option "aws" must be a dict' 87 | ) 88 | old_session_kwargs = dict( 89 | region="region_name", 90 | access_id="aws_access_key_id", 91 | secret_key="aws_secret_access_key", 92 | ) 93 | new_kwargs = {} 94 | for arg in aws_args: 95 | if arg in old_session_kwargs: 96 | new_kwargs[old_session_kwargs[arg]] = aws_args[arg] 97 | else: 98 | new_kwargs[arg] = aws_args[arg] 99 | return new_kwargs 100 | 101 | 102 | def create_aws_auth(aws_args): 103 | try: 104 | aws_session = session.Session(**convert_aws_args(aws_args)) 105 | except TypeError as exc: 106 | raise errors.InvalidConfiguration( 107 | "Elastic DocManager unknown aws config option: %s" % (exc,) 108 | ) 109 | return AWSV4Sign( 110 | aws_session.get_credentials(), 111 | aws_session.region_name or DEFAULT_AWS_REGION, 112 | "es", 113 | ) 114 | 115 | 116 | class AutoCommiter(threading.Thread): 117 | """Thread that periodically sends buffered operations to Elastic. 118 | 119 | :Parameters: 120 | - `docman`: The Elasticsearch DocManager. 121 | - `send_interval`: Number of seconds to wait before sending buffered 122 | operations to Elasticsearch. Set to None or 0 to disable. 123 | - `commit_interval`: Number of seconds to wait before committing 124 | buffered operations to Elasticsearch. Set to None or 0 to disable. 125 | - `sleep_interval`: Number of seconds to sleep. 126 | """ 127 | 128 | def __init__(self, docman, send_interval, commit_interval, sleep_interval=1): 129 | super(AutoCommiter, self).__init__() 130 | self._docman = docman 131 | # Change `None` intervals to 0 132 | self._send_interval = send_interval if send_interval else 0 133 | self._commit_interval = commit_interval if commit_interval else 0 134 | self._should_auto_send = self._send_interval > 0 135 | self._should_auto_commit = self._commit_interval > 0 136 | self._sleep_interval = max(sleep_interval, 1) 137 | self._stopped = False 138 | self.daemon = True 139 | 140 | def join(self, timeout=None): 141 | self._stopped = True 142 | super(AutoCommiter, self).join(timeout=timeout) 143 | 144 | def run(self): 145 | """Periodically sends buffered operations and/or commit. 146 | """ 147 | if not self._should_auto_commit and not self._should_auto_send: 148 | return 149 | last_send, last_commit = 0, 0 150 | while not self._stopped: 151 | if self._should_auto_commit: 152 | if last_commit > self._commit_interval: 153 | self._docman.commit() 154 | # commit also sends so reset both 155 | last_send, last_commit = 0, 0 156 | # Give a chance to exit the loop 157 | if self._stopped: 158 | break 159 | 160 | if self._should_auto_send: 161 | if last_send > self._send_interval: 162 | self._docman.send_buffered_operations() 163 | last_send = 0 164 | time.sleep(self._sleep_interval) 165 | last_send += self._sleep_interval 166 | last_commit += self._sleep_interval 167 | 168 | 169 | class DocManager(DocManagerBase): 170 | """Elasticsearch implementation of the DocManager interface. 171 | 172 | Receives documents from an OplogThread and takes the appropriate actions on 173 | Elasticsearch. 
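
    A minimal standalone sketch (hypothetical values; in normal operation
    mongo-connector constructs this class from its configuration and
    drives it itself)::

        dm = DocManager("localhost:9200", auto_commit_interval=0)
        # namespace is "database.collection"; the final argument is an
        # oplog timestamp
        dm.upsert({"_id": 1, "name": "paulie"}, "test.test", 1)
        dm.commit()
        dm.stop()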
174 | """ 175 | 176 | def __init__( 177 | self, 178 | url, 179 | auto_commit_interval=DEFAULT_COMMIT_INTERVAL, 180 | unique_key="_id", 181 | chunk_size=DEFAULT_MAX_BULK, 182 | meta_index_name="mongodb_meta", 183 | meta_type="mongodb_meta", 184 | attachment_field="content", 185 | **kwargs 186 | ): 187 | client_options = kwargs.get("clientOptions", {}) 188 | if "aws" in kwargs: 189 | if not _HAS_AWS: 190 | raise errors.InvalidConfiguration( 191 | "aws extras must be installed to sign Elasticsearch " 192 | "requests. Install with: " 193 | "pip install elastic2-doc-manager[aws]" 194 | ) 195 | client_options["http_auth"] = create_aws_auth(kwargs["aws"]) 196 | client_options["use_ssl"] = True 197 | client_options["verify_certs"] = True 198 | client_options["connection_class"] = es_connection.RequestsHttpConnection 199 | if type(url) is not list: 200 | url = [url] 201 | self.elastic = Elasticsearch(hosts=url, **client_options) 202 | 203 | self._formatter = DefaultDocumentFormatter() 204 | self.BulkBuffer = BulkBuffer(self) 205 | 206 | # As bulk operation can be done in another thread 207 | # lock is needed to prevent access to BulkBuffer 208 | # while commiting documents to Elasticsearch 209 | # It is because BulkBuffer might get outdated 210 | # docs from Elasticsearch if bulk is still ongoing 211 | self.lock = threading.Lock() 212 | 213 | self.auto_commit_interval = auto_commit_interval 214 | self.auto_send_interval = kwargs.get("autoSendInterval", DEFAULT_SEND_INTERVAL) 215 | self.meta_index_name = meta_index_name 216 | self.meta_type = meta_type 217 | self.unique_key = unique_key 218 | self.chunk_size = chunk_size 219 | self.has_attachment_mapping = False 220 | self.attachment_field = attachment_field 221 | self.auto_commiter = AutoCommiter( 222 | self, self.auto_send_interval, self.auto_commit_interval 223 | ) 224 | self.auto_commiter.start() 225 | 226 | def _index_and_mapping(self, namespace): 227 | """Helper method for getting the index and type from a namespace.""" 228 | index, doc_type = namespace.split(".", 1) 229 | return index.lower(), doc_type 230 | 231 | def stop(self): 232 | """Stop the auto-commit thread.""" 233 | self.auto_commiter.join() 234 | self.auto_commit_interval = 0 235 | # Commit any remaining docs from buffer 236 | self.commit() 237 | 238 | def apply_update(self, doc, update_spec): 239 | if "$set" not in update_spec and "$unset" not in update_spec: 240 | # Don't try to add ns and _ts fields back in from doc 241 | return update_spec 242 | return super(DocManager, self).apply_update(doc, update_spec) 243 | 244 | @wrap_exceptions 245 | def handle_command(self, doc, namespace, timestamp): 246 | # Flush buffer before handle command 247 | self.commit() 248 | db = namespace.split(".", 1)[0] 249 | if doc.get("dropDatabase"): 250 | dbs = self.command_helper.map_db(db) 251 | for _db in dbs: 252 | self.elastic.indices.delete(index=_db.lower()) 253 | 254 | if doc.get("renameCollection"): 255 | raise errors.OperationFailed( 256 | "elastic_doc_manager does not support renaming a mapping." 257 | ) 258 | 259 | if doc.get("create"): 260 | db, coll = self.command_helper.map_collection(db, doc["create"]) 261 | if db and coll: 262 | self.elastic.indices.put_mapping( 263 | index=db.lower(), doc_type=coll, body={"_source": {"enabled": True}} 264 | ) 265 | 266 | if doc.get("drop"): 267 | db, coll = self.command_helper.map_collection(db, doc["drop"]) 268 | if db and coll: 269 | # This will delete the items in coll, but not get rid of the 270 | # mapping. 
271 |                 warnings.warn(
272 |                     "Deleting all documents of type %s on index %s. "
273 |                     "The mapping definition will persist and must be "
274 |                     "removed manually." % (coll, db)
275 |                 )
276 |                 responses = streaming_bulk(
277 |                     self.elastic,
278 |                     (
279 |                         dict(result, _op_type="delete")
280 |                         for result in scan(
281 |                             self.elastic, index=db.lower(), doc_type=coll
282 |                         )
283 |                     ),
284 |                 )
285 |                 for ok, resp in responses:
286 |                     if not ok:
287 |                         LOG.error(
288 |                             "Error occurred while deleting ElasticSearch "
289 |                             "document during handling of 'drop' command: %r" % resp
290 |                         )
291 |
292 |     @wrap_exceptions
293 |     def update(self, document_id, update_spec, namespace, timestamp):
294 |         """Apply updates given in update_spec to the document whose id
295 |         matches that of doc.
296 |         """
297 |
298 |         index, doc_type = self._index_and_mapping(namespace)
299 |         with self.lock:
300 |             # Check if the document source is stored in the local buffer
301 |             document = self.BulkBuffer.get_from_sources(
302 |                 index, doc_type, str(document_id)
303 |             )
304 |         if document:
305 |             # Document source collected from the local buffer;
306 |             # perform apply_update on it and then it will be
307 |             # ready for committing to Elasticsearch
308 |             updated = self.apply_update(document, update_spec)
309 |             # _id is immutable in MongoDB, so won't have changed in update
310 |             updated["_id"] = document_id
311 |             self.upsert(updated, namespace, timestamp)
312 |         else:
313 |             # Document source needs to be retrieved from Elasticsearch
314 |             # before performing the update. Pass update_spec to the upsert function.
315 |             updated = {"_id": document_id}
316 |             self.upsert(updated, namespace, timestamp, update_spec)
317 |         # upsert() strips metadata, so only _id + fields in _source remain here
318 |         return updated
319 |
320 |     @wrap_exceptions
321 |     def upsert(self, doc, namespace, timestamp, update_spec=None):
322 |         """Insert a document into Elasticsearch."""
323 |         index, doc_type = self._index_and_mapping(namespace)
324 |         # No need to duplicate '_id' in the source document
325 |         doc_id = str(doc.pop("_id"))
326 |         metadata = {"ns": namespace, "_ts": timestamp}
327 |
328 |         # Index the source document, using the lowercase namespace as index name.
329 |         action = {
330 |             "_op_type": "index",
331 |             "_index": index,
332 |             "_type": doc_type,
333 |             "_id": doc_id,
334 |             "_source": self._formatter.format_document(doc),
335 |         }
336 |         # Index document metadata with the original namespace (mixed upper/lower).
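        # This companion metadata entry (ns + _ts) is what search() and
        # get_last_doc() later query when resolving MongoDB rollbacks.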
337 | meta_action = { 338 | "_op_type": "index", 339 | "_index": self.meta_index_name, 340 | "_type": self.meta_type, 341 | "_id": doc_id, 342 | "_source": bson.json_util.dumps(metadata), 343 | } 344 | 345 | self.index(action, meta_action, doc, update_spec) 346 | 347 | # Leave _id, since it's part of the original document 348 | doc["_id"] = doc_id 349 | 350 | @wrap_exceptions 351 | def bulk_upsert(self, docs, namespace, timestamp): 352 | """Insert multiple documents into Elasticsearch.""" 353 | 354 | def docs_to_upsert(): 355 | doc = None 356 | for doc in docs: 357 | # Remove metadata and redundant _id 358 | index, doc_type = self._index_and_mapping(namespace) 359 | doc_id = str(doc.pop("_id")) 360 | document_action = { 361 | "_index": index, 362 | "_type": doc_type, 363 | "_id": doc_id, 364 | "_source": self._formatter.format_document(doc), 365 | } 366 | document_meta = { 367 | "_index": self.meta_index_name, 368 | "_type": self.meta_type, 369 | "_id": doc_id, 370 | "_source": {"ns": namespace, "_ts": timestamp}, 371 | } 372 | yield document_action 373 | yield document_meta 374 | if doc is None: 375 | raise errors.EmptyDocsError( 376 | "Cannot upsert an empty sequence of " 377 | "documents into Elastic Search" 378 | ) 379 | 380 | try: 381 | kw = {} 382 | if self.chunk_size > 0: 383 | kw["chunk_size"] = self.chunk_size 384 | 385 | responses = streaming_bulk( 386 | client=self.elastic, actions=docs_to_upsert(), **kw 387 | ) 388 | 389 | for ok, resp in responses: 390 | if not ok: 391 | LOG.error( 392 | "Could not bulk-upsert document " 393 | "into ElasticSearch: %r" % resp 394 | ) 395 | if self.auto_commit_interval == 0: 396 | self.commit() 397 | except errors.EmptyDocsError: 398 | # This can happen when mongo-connector starts up, there is no 399 | # config file, but nothing to dump 400 | pass 401 | 402 | @wrap_exceptions 403 | def insert_file(self, f, namespace, timestamp): 404 | doc = f.get_metadata() 405 | doc_id = str(doc.pop("_id")) 406 | index, doc_type = self._index_and_mapping(namespace) 407 | 408 | # make sure that elasticsearch treats it like a file 409 | if not self.has_attachment_mapping: 410 | body = {"properties": {self.attachment_field: {"type": "attachment"}}} 411 | self.elastic.indices.put_mapping(index=index, doc_type=doc_type, body=body) 412 | self.has_attachment_mapping = True 413 | 414 | metadata = {"ns": namespace, "_ts": timestamp} 415 | 416 | doc = self._formatter.format_document(doc) 417 | doc[self.attachment_field] = base64.b64encode(f.read()).decode() 418 | 419 | action = { 420 | "_op_type": "index", 421 | "_index": index, 422 | "_type": doc_type, 423 | "_id": doc_id, 424 | "_source": doc, 425 | } 426 | meta_action = { 427 | "_op_type": "index", 428 | "_index": self.meta_index_name, 429 | "_type": self.meta_type, 430 | "_id": doc_id, 431 | "_source": bson.json_util.dumps(metadata), 432 | } 433 | 434 | self.index(action, meta_action) 435 | 436 | @wrap_exceptions 437 | def remove(self, document_id, namespace, timestamp): 438 | """Remove a document from Elasticsearch.""" 439 | index, doc_type = self._index_and_mapping(namespace) 440 | 441 | action = { 442 | "_op_type": "delete", 443 | "_index": index, 444 | "_type": doc_type, 445 | "_id": str(document_id), 446 | } 447 | 448 | meta_action = { 449 | "_op_type": "delete", 450 | "_index": self.meta_index_name, 451 | "_type": self.meta_type, 452 | "_id": str(document_id), 453 | } 454 | 455 | self.index(action, meta_action) 456 | 457 | @wrap_exceptions 458 | def _stream_search(self, *args, **kwargs): 459 | """Helper method 
for iterating over ES search results.""" 460 | for hit in scan( 461 | self.elastic, query=kwargs.pop("body", None), scroll="10m", **kwargs 462 | ): 463 | hit["_source"]["_id"] = hit["_id"] 464 | yield hit["_source"] 465 | 466 | def search(self, start_ts, end_ts): 467 | """Query Elasticsearch for documents in a time range. 468 | 469 | This method is used to find documents that may be in conflict during 470 | a rollback event in MongoDB. 471 | """ 472 | return self._stream_search( 473 | index=self.meta_index_name, 474 | body={"query": {"range": {"_ts": {"gte": start_ts, "lte": end_ts}}}}, 475 | ) 476 | 477 | def index(self, action, meta_action, doc_source=None, update_spec=None): 478 | with self.lock: 479 | self.BulkBuffer.add_upsert(action, meta_action, doc_source, update_spec) 480 | 481 | # Divide by two to account for meta actions 482 | if ( 483 | len(self.BulkBuffer.action_buffer) / 2 >= self.chunk_size 484 | or self.auto_commit_interval == 0 485 | ): 486 | self.commit() 487 | 488 | def send_buffered_operations(self): 489 | """Send buffered operations to Elasticsearch. 490 | 491 | This method is periodically called by the AutoCommitThread. 492 | """ 493 | with self.lock: 494 | try: 495 | action_buffer = self.BulkBuffer.get_buffer() 496 | if action_buffer: 497 | successes, errors = bulk(self.elastic, action_buffer) 498 | LOG.debug( 499 | "Bulk request finished, successfully sent %d " "operations", 500 | successes, 501 | ) 502 | if errors: 503 | LOG.error("Bulk request finished with errors: %r", errors) 504 | except es_exceptions.ElasticsearchException: 505 | LOG.exception("Bulk request failed with exception") 506 | 507 | def commit(self): 508 | """Send buffered requests and refresh all indexes.""" 509 | self.send_buffered_operations() 510 | retry_until_ok(self.elastic.indices.refresh, index="") 511 | 512 | @wrap_exceptions 513 | def get_last_doc(self): 514 | """Get the most recently modified document from Elasticsearch. 515 | 516 | This method is used to help define a time window within which documents 517 | may be in conflict after a MongoDB rollback. 
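        The newest entry in the metadata index (sorted by ``_ts``
        descending) is returned, or None when the index holds no
        documents yet.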
518 | """ 519 | try: 520 | result = self.elastic.search( 521 | index=self.meta_index_name, 522 | body={"query": {"match_all": {}}, "sort": [{"_ts": "desc"}]}, 523 | size=1, 524 | )["hits"]["hits"] 525 | for r in result: 526 | r["_source"]["_id"] = r["_id"] 527 | return r["_source"] 528 | except es_exceptions.RequestError: 529 | # no documents so ES returns 400 because of undefined _ts mapping 530 | return None 531 | 532 | 533 | class BulkBuffer(object): 534 | def __init__(self, docman): 535 | 536 | # Parent object 537 | self.docman = docman 538 | 539 | # Action buffer for bulk indexing 540 | self.action_buffer = [] 541 | 542 | # Docs to update 543 | # Dict stores all documents for which firstly 544 | # source has to be retrieved from Elasticsearch 545 | # and then apply_update needs to be performed 546 | # Format: [ (doc, update_spec, action_buffer_index, get_from_ES) ] 547 | self.doc_to_update = [] 548 | 549 | # Below dictionary contains ids of documents 550 | # which need to be retrieved from Elasticsearch 551 | # It prevents from getting same document multiple times from ES 552 | # Format: {"_index": {"_type": {"_id": True}}} 553 | self.doc_to_get = {} 554 | 555 | # Dictionary of sources 556 | # Format: {"_index": {"_type": {"_id": {"_source": actual_source}}}} 557 | self.sources = {} 558 | 559 | def add_upsert(self, action, meta_action, doc_source, update_spec): 560 | """ 561 | Function which stores sources for "insert" actions 562 | and decide if for "update" action has to add docs to 563 | get source buffer 564 | """ 565 | 566 | # Whenever update_spec is provided to this method 567 | # it means that doc source needs to be retrieved 568 | # from Elasticsearch. It means also that source 569 | # is not stored in local buffer 570 | if update_spec: 571 | self.bulk_index(action, meta_action) 572 | 573 | # -1 -> to get latest index number 574 | # -1 -> to get action instead of meta_action 575 | # Update document based on source retrieved from ES 576 | self.add_doc_to_update(action, update_spec, len(self.action_buffer) - 2) 577 | else: 578 | # Insert and update operations provide source 579 | # Store it in local buffer and use for comming updates 580 | # inside same buffer 581 | # add_to_sources will not be called for delete operation 582 | # as it does not provide doc_source 583 | if doc_source: 584 | self.add_to_sources(action, doc_source) 585 | self.bulk_index(action, meta_action) 586 | 587 | def add_doc_to_update(self, action, update_spec, action_buffer_index): 588 | """ 589 | Prepare document for update based on Elasticsearch response. 590 | Set flag if document needs to be retrieved from Elasticsearch 591 | """ 592 | 593 | doc = { 594 | "_index": action["_index"], 595 | "_type": action["_type"], 596 | "_id": action["_id"], 597 | } 598 | 599 | # If get_from_ES == True -> get document's source from Elasticsearch 600 | get_from_ES = self.should_get_id(action) 601 | self.doc_to_update.append((doc, update_spec, action_buffer_index, get_from_ES)) 602 | 603 | def should_get_id(self, action): 604 | """ 605 | Mark document to retrieve its source from Elasticsearch. 
606 | Returns: 607 | True - if marking document for the first time in this bulk 608 | False - if document has been already marked 609 | """ 610 | mapping_ids = self.doc_to_get.setdefault(action["_index"], {}).setdefault( 611 | action["_type"], set() 612 | ) 613 | if action["_id"] in mapping_ids: 614 | # There is an update on this id already 615 | return False 616 | else: 617 | mapping_ids.add(action["_id"]) 618 | return True 619 | 620 | def get_docs_sources_from_ES(self): 621 | """Get document sources using MGET elasticsearch API""" 622 | docs = [doc for doc, _, _, get_from_ES in self.doc_to_update if get_from_ES] 623 | if docs: 624 | documents = self.docman.elastic.mget(body={"docs": docs}, realtime=True) 625 | return iter(documents["docs"]) 626 | else: 627 | return iter([]) 628 | 629 | @wrap_exceptions 630 | def update_sources(self): 631 | """Update local sources based on response from Elasticsearch""" 632 | ES_documents = self.get_docs_sources_from_ES() 633 | 634 | for doc, update_spec, action_buffer_index, get_from_ES in self.doc_to_update: 635 | if get_from_ES: 636 | # Update source based on response from ES 637 | ES_doc = next(ES_documents) 638 | if ES_doc["found"]: 639 | source = ES_doc["_source"] 640 | else: 641 | # Document not found in elasticsearch, 642 | # Seems like something went wrong during replication 643 | LOG.error( 644 | "mGET: Document id: %s has not been found " 645 | "in Elasticsearch. Due to that " 646 | "following update failed: %s", 647 | doc["_id"], 648 | update_spec, 649 | ) 650 | self.reset_action(action_buffer_index) 651 | continue 652 | else: 653 | # Get source stored locally before applying update 654 | # as it is up-to-date 655 | source = self.get_from_sources(doc["_index"], doc["_type"], doc["_id"]) 656 | if not source: 657 | LOG.error( 658 | "mGET: Document id: %s has not been found " 659 | "in local sources. 
Due to that following " 660 | "update failed: %s", 661 | doc["_id"], 662 | update_spec, 663 | ) 664 | self.reset_action(action_buffer_index) 665 | continue 666 | 667 | updated = self.docman.apply_update(source, update_spec) 668 | 669 | # Remove _id field from source 670 | if "_id" in updated: 671 | del updated["_id"] 672 | 673 | # Everytime update locally stored sources to keep them up-to-date 674 | self.add_to_sources(doc, updated) 675 | 676 | self.action_buffer[action_buffer_index][ 677 | "_source" 678 | ] = self.docman._formatter.format_document(updated) 679 | 680 | # Remove empty actions if there were errors 681 | self.action_buffer = [ 682 | each_action for each_action in self.action_buffer if each_action 683 | ] 684 | 685 | def reset_action(self, action_buffer_index): 686 | """Reset specific action as update failed""" 687 | self.action_buffer[action_buffer_index] = {} 688 | self.action_buffer[action_buffer_index + 1] = {} 689 | 690 | def add_to_sources(self, action, doc_source): 691 | """Store sources locally""" 692 | mapping = self.sources.setdefault(action["_index"], {}).setdefault( 693 | action["_type"], {} 694 | ) 695 | mapping[action["_id"]] = doc_source 696 | 697 | def get_from_sources(self, index, doc_type, document_id): 698 | """Get source stored locally""" 699 | return self.sources.get(index, {}).get(doc_type, {}).get(document_id, {}) 700 | 701 | def bulk_index(self, action, meta_action): 702 | self.action_buffer.append(action) 703 | self.action_buffer.append(meta_action) 704 | 705 | def clean_up(self): 706 | """Do clean-up before returning buffer""" 707 | self.action_buffer = [] 708 | self.sources = {} 709 | self.doc_to_get = {} 710 | self.doc_to_update = [] 711 | 712 | def get_buffer(self): 713 | """Get buffer which needs to be bulked to elasticsearch""" 714 | 715 | # Get sources for documents which are in Elasticsearch 716 | # and they are not in local buffer 717 | if self.doc_to_update: 718 | self.update_sources() 719 | 720 | ES_buffer = self.action_buffer 721 | self.clean_up() 722 | return ES_buffer 723 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=34.4", "wheel", "setuptools_scm>=1.15"] 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | dists = clean --all sdist bdist_wheel 3 | 4 | [bdist_wheel] 5 | universal = 1 6 | 7 | [metadata] 8 | long_description = file:README.rst 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name="elastic2-doc-manager", 5 | use_scm_version=True, 6 | maintainer="mongodb", 7 | description="Elastic2 plugin for mongo-connector", 8 | platforms=["any"], 9 | author="anna herlihy", 10 | author_email="mongodb-user@googlegroups.com", 11 | url="https://github.com/mongodb-labs/elastic2-doc-manager", 12 | install_requires=["mongo-connector>=2.5.0", "importlib_metadata"], 13 | python_requires=">=3.4", 14 | extras_require={ 15 | "aws": ["boto3 >= 1.4.0", "requests-aws-sign >= 0.1.2"], 16 | "elastic2": ["elasticsearch>=2.0.0,<3.0.0"], 17 | "elastic5": ["elasticsearch>=5.0.0,<6.0.0"], 18 | }, 19 | packages=["mongo_connector", "mongo_connector.doc_managers"], 20 
| classifiers=[ 21 | "Development Status :: 4 - Beta", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: Apache Software License", 24 | "Programming Language :: Python :: 3", 25 | "Topic :: Database", 26 | "Topic :: Software Development :: Libraries :: Python Modules", 27 | "Operating System :: Unix", 28 | "Operating System :: MacOS :: MacOS X", 29 | "Operating System :: Microsoft :: Windows", 30 | "Operating System :: POSIX", 31 | ], 32 | keywords=["mongo-connector", "mongodb", "elastic", "elasticsearch"], 33 | setup_requires=["setuptools_scm>=1.15"], 34 | ) 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import logging 15 | import os 16 | import sys 17 | 18 | 19 | logging.basicConfig(stream=sys.stdout) 20 | 21 | elastic_host = str(os.environ.get("ES_HOST", "localhost")) 22 | elastic_port = str(os.environ.get("ES_PORT", 9200)) 23 | elastic_pair = "%s:%s" % (elastic_host, elastic_port) 24 | elastic_nodes = [elastic_pair, "%s:%s" % (elastic_host, str(int(elastic_port) + 1))] 25 | -------------------------------------------------------------------------------- /tests/test_elastic2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
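# These integration tests expect a reachable Elasticsearch node;
# tests/__init__.py reads the ES_HOST and ES_PORT environment variables
# (defaulting to localhost:9200).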
14 | 15 | """Integration tests for mongo-connector + Elasticsearch 2.x.""" 16 | import base64 17 | import os 18 | import time 19 | import unittest 20 | 21 | from bson import SON 22 | from elasticsearch import Elasticsearch 23 | from elasticsearch.helpers import bulk, scan 24 | from gridfs import GridFS 25 | 26 | from mongo_connector.connector import Connector 27 | from mongo_connector.doc_managers.elastic2_doc_manager import DocManager 28 | from mongo_connector.test_utils import ReplicaSet, assert_soon, close_client 29 | 30 | from mongo_connector.util import retry_until_ok 31 | from tests import elastic_pair, elastic_nodes 32 | 33 | 34 | class ElasticsearchTestCase(unittest.TestCase): 35 | """Base class for all ES TestCases.""" 36 | 37 | @classmethod 38 | def setUpClass(cls): 39 | cls.elastic_conn = Elasticsearch(hosts=[elastic_pair]) 40 | 41 | def setUp(self): 42 | # Create target index in elasticsearch 43 | self.elastic_conn.indices.create(index="test", ignore=400) 44 | self.elastic_conn.cluster.health(wait_for_status="yellow", index="test") 45 | self.elastic_doc = DocManager(elastic_pair, auto_commit_interval=0) 46 | 47 | def tearDown(self): 48 | self.elastic_conn.indices.delete(index="test", ignore=404) 49 | self.elastic_doc.stop() 50 | 51 | def _search(self, query=None): 52 | query = query or {"match_all": {}} 53 | return self.elastic_doc._stream_search( 54 | index="test", doc_type="test", body={"query": query} 55 | ) 56 | 57 | def _count(self): 58 | return self.elastic_conn.count(index="test")["count"] 59 | 60 | def _remove(self): 61 | bulk_deletes = [] 62 | for result in scan(self.elastic_conn, index="test", doc_type="test"): 63 | result["_op_type"] = "delete" 64 | bulk_deletes.append(result) 65 | bulk(self.elastic_conn, bulk_deletes) 66 | 67 | def _mappings(self, index="_all"): 68 | mappings = self.elastic_conn.indices.get_mapping(index=index) 69 | if index in mappings: 70 | return list(mappings[index]["mappings"].keys()) 71 | return [] 72 | 73 | def _indices(self): 74 | return list(self.elastic_conn.indices.stats()["indices"].keys()) 75 | 76 | 77 | class TestElastic(ElasticsearchTestCase): 78 | """Integration tests for mongo-connector + Elasticsearch.""" 79 | 80 | @classmethod 81 | def setUpClass(cls): 82 | """Start the cluster.""" 83 | super(TestElastic, cls).setUpClass() 84 | cls.repl_set = ReplicaSet().start() 85 | cls.conn = cls.repl_set.client() 86 | 87 | @classmethod 88 | def tearDownClass(cls): 89 | """Kill the cluster.""" 90 | close_client(cls.conn) 91 | cls.repl_set.stop() 92 | 93 | def tearDown(self): 94 | """Stop the Connector thread.""" 95 | super(TestElastic, self).tearDown() 96 | self.connector.join() 97 | 98 | def setUp(self): 99 | """Start a new Connector for each test.""" 100 | super(TestElastic, self).setUp() 101 | try: 102 | os.unlink("oplog.timestamp") 103 | except OSError: 104 | pass 105 | self.connector = Connector( 106 | mongo_address=self.repl_set.uri, 107 | ns_set=["test.test"], 108 | doc_managers=(self.elastic_doc,), 109 | gridfs_set=["test.test"], 110 | ) 111 | 112 | self.conn.test.test.drop() 113 | self.conn.test.test.files.drop() 114 | self.conn.test.test.chunks.drop() 115 | 116 | self.connector.start() 117 | assert_soon(lambda: len(self.connector.shard_set) > 0) 118 | assert_soon(lambda: self._count() == 0) 119 | 120 | def test_insert(self): 121 | """Test insert operations.""" 122 | self.conn["test"]["test"].insert_one({"name": "paulie"}) 123 | assert_soon(lambda: self._count() > 0) 124 | result_set_1 = list(self._search()) 125 | 
self.assertEqual(len(result_set_1), 1) 126 | result_set_2 = self.conn["test"]["test"].find_one() 127 | for item in result_set_1: 128 | self.assertEqual(item["_id"], str(result_set_2["_id"])) 129 | self.assertEqual(item["name"], result_set_2["name"]) 130 | 131 | def test_remove(self): 132 | """Tests remove operations.""" 133 | self.conn["test"]["test"].insert_one({"name": "paulie"}) 134 | assert_soon(lambda: self._count() == 1) 135 | self.conn["test"]["test"].delete_one({"name": "paulie"}) 136 | assert_soon(lambda: self._count() != 1) 137 | self.assertEqual(self._count(), 0) 138 | 139 | def test_insert_file(self): 140 | """Tests inserting a gridfs file 141 | """ 142 | fs = GridFS(self.conn["test"], "test") 143 | test_data = b"test_insert_file test file" 144 | id = fs.put(test_data, filename="test.txt", encoding="utf8") 145 | assert_soon(lambda: self._count() > 0) 146 | 147 | query = {"match": {"_all": "test_insert_file"}} 148 | res = list(self._search(query)) 149 | self.assertEqual(len(res), 1) 150 | doc = res[0] 151 | self.assertEqual(doc["filename"], "test.txt") 152 | self.assertEqual(doc["_id"], str(id)) 153 | self.assertEqual(base64.b64decode(doc["content"]), test_data) 154 | 155 | def test_remove_file(self): 156 | fs = GridFS(self.conn["test"], "test") 157 | id = fs.put("test file", filename="test.txt", encoding="utf8") 158 | assert_soon(lambda: self._count() == 1) 159 | fs.delete(id) 160 | assert_soon(lambda: self._count() == 0) 161 | 162 | def test_update(self): 163 | """Test update operations.""" 164 | # Insert 165 | self.conn.test.test.insert_one({"a": 0}) 166 | assert_soon(lambda: sum(1 for _ in self._search()) == 1) 167 | 168 | def check_update(update_spec): 169 | updated = self.conn.test.command( 170 | SON( 171 | [ 172 | ("findAndModify", "test"), 173 | ("query", {"a": 0}), 174 | ("update", update_spec), 175 | ("new", True), 176 | ] 177 | ) 178 | )["value"] 179 | # Stringify _id to match what will be retrieved from ES 180 | updated["_id"] = str(updated["_id"]) 181 | assert_soon(lambda: next(self._search()) == updated) 182 | 183 | # Update by adding a field. Note that ES can't mix types within an array 184 | check_update({"$set": {"b": [{"c": 10}, {"d": 11}]}}) 185 | 186 | # Update by setting an attribute of a sub-document beyond end of array. 187 | check_update({"$set": {"b.10.c": 42}}) 188 | 189 | # Update by changing a value within a sub-document (contains array) 190 | check_update({"$inc": {"b.0.c": 1}}) 191 | 192 | # Update by changing the value within an array 193 | check_update({"$inc": {"b.1.f": 12}}) 194 | 195 | # Update by adding new bucket to list 196 | check_update({"$push": {"b": {"e": 12}}}) 197 | 198 | # Update by changing an entire sub-document 199 | check_update({"$set": {"b.0": {"e": 4}}}) 200 | 201 | # Update by adding a sub-document 202 | check_update({"$set": {"b": {"0": {"c": 100}}}}) 203 | 204 | # Update whole document 205 | check_update({"a": 0, "b": {"1": {"d": 10000}}}) 206 | 207 | def test_rollback(self): 208 | """Test behavior during a MongoDB rollback. 209 | 210 | We force a rollback by adding a doc, killing the primary, 211 | adding another doc, killing the new primary, and then 212 | restarting both. 
213 | """ 214 | primary_conn = self.repl_set.primary.client() 215 | 216 | # This doc can be picked up in the collection dump 217 | self.conn["test"]["test"].insert_one({"name": "paul"}) 218 | condition1 = ( 219 | lambda: self.conn["test"]["test"].find({"name": "paul"}).count() == 1 220 | ) 221 | 222 | def condition2(): 223 | return self._count() == 1 224 | 225 | assert_soon(condition1) 226 | assert_soon(condition2) 227 | 228 | # This doc is definitely not picked up by collection dump 229 | self.conn["test"]["test"].insert_one({"name": "pauly"}) 230 | 231 | self.repl_set.primary.stop(destroy=False) 232 | 233 | new_primary_conn = self.repl_set.secondary.client() 234 | 235 | admin = new_primary_conn["admin"] 236 | assert_soon(lambda: admin.command("isMaster")["ismaster"]) 237 | time.sleep(5) 238 | retry_until_ok(self.conn.test.test.insert_one, {"name": "pauline"}) 239 | assert_soon(lambda: self._count() == 3) 240 | result_set_1 = list(self._search()) 241 | result_set_2 = self.conn["test"]["test"].find_one({"name": "pauline"}) 242 | self.assertEqual(len(result_set_1), 3) 243 | # make sure pauline is there 244 | for item in result_set_1: 245 | if item["name"] == "pauline": 246 | self.assertEqual(item["_id"], str(result_set_2["_id"])) 247 | self.repl_set.secondary.stop(destroy=False) 248 | 249 | self.repl_set.primary.start() 250 | while primary_conn["admin"].command("isMaster")["ismaster"] is False: 251 | time.sleep(1) 252 | 253 | self.repl_set.secondary.start() 254 | 255 | time.sleep(2) 256 | result_set_1 = list(self._search()) 257 | self.assertEqual(len(result_set_1), 2) 258 | 259 | if result_set_1[0]["name"] == "paul": 260 | self.assertEqual(result_set_1[1]["name"], "pauly") 261 | elif result_set_1[0]["name"] == "pauly": 262 | self.assertEqual(result_set_1[1]["name"], "paul") 263 | else: 264 | self.assertTrue(0, "Unknown document retrieved") 265 | 266 | find_cursor = retry_until_ok(self.conn["test"]["test"].find) 267 | self.assertEqual(retry_until_ok(find_cursor.count), 2) 268 | 269 | def test_bad_int_value(self): 270 | self.conn.test.test.insert_one( 271 | {"inf": float("inf"), "nan": float("nan"), "still_exists": True} 272 | ) 273 | assert_soon(lambda: self._count() > 0) 274 | for doc in self._search(): 275 | self.assertNotIn("inf", doc) 276 | self.assertNotIn("nan", doc) 277 | self.assertTrue(doc["still_exists"]) 278 | 279 | 280 | class TestElasticMultipleHosts(unittest.TestCase): 281 | """Integration tests for mongo-connector + Elasticsearch Cluster.""" 282 | 283 | def test_multiple_hosts(self): 284 | elastic_doc = DocManager(elastic_nodes) 285 | self.assertEqual(len(elastic_doc.elastic.transport.hosts), 2) 286 | 287 | 288 | if __name__ == "__main__": 289 | unittest.main() 290 | -------------------------------------------------------------------------------- /tests/test_elastic2_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Unit tests for the Elastic2 DocManager.""" 16 | import base64 17 | import time 18 | import unittest 19 | 20 | from functools import wraps 21 | 22 | from mongo_connector import errors 23 | from mongo_connector.command_helper import CommandHelper 24 | from mongo_connector.doc_managers.elastic2_doc_manager import ( 25 | DocManager, 26 | _HAS_AWS, 27 | convert_aws_args, 28 | create_aws_auth, 29 | ) 30 | from mongo_connector.test_utils import MockGridFSFile, TESTARGS 31 | from mongo_connector.util import retry_until_ok 32 | 33 | from tests import elastic_pair 34 | from tests.test_elastic2 import ElasticsearchTestCase 35 | 36 | 37 | def disable_auto_refresh(func): 38 | """Disable default 1 second auto refresh in Elasticsearch for a test. 39 | 40 | https://www.elastic.co/guide/en/elasticsearch/reference/current/indices 41 | -update-settings.html 42 | """ 43 | 44 | @wraps(func) 45 | def _disable_auto_refresh(self, *args, **kwargs): 46 | try: 47 | self.elastic_conn.indices.put_settings( 48 | index="test", body={"index": {"refresh_interval": "-1"}} 49 | ) 50 | return func(self, *args, **kwargs) 51 | finally: 52 | self.elastic_conn.indices.put_settings( 53 | index="test", body={"index": {"refresh_interval": "1s"}} 54 | ) 55 | 56 | return _disable_auto_refresh 57 | 58 | 59 | class TestElasticDocManager(ElasticsearchTestCase): 60 | """Unit tests for the Elastic DocManager.""" 61 | 62 | def test_update(self): 63 | """Test the update method using locally stored source""" 64 | 65 | # If testing with BulkBuffer, auto_commit_interval 66 | # needs to be None to not clear locally stored sources 67 | self.elastic_doc.auto_commit_interval = None 68 | 69 | doc_id = 1 70 | doc = {"_id": doc_id, "a": 1, "b": 2} 71 | self.elastic_doc.upsert(doc, *TESTARGS) 72 | 73 | # $set only 74 | update_spec = {"$set": {"a": 1, "b": 2}} 75 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 76 | 77 | self.assertEqual(doc, {"_id": "1", "a": 1, "b": 2}) 78 | # $unset only 79 | update_spec = {"$unset": {"a": True}} 80 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 81 | self.assertEqual(doc, {"_id": "1", "b": 2}) 82 | # mixed $set/$unset 83 | update_spec = {"$unset": {"b": True}, "$set": {"c": 3}} 84 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 85 | self.assertEqual(doc, {"_id": "1", "c": 3}) 86 | 87 | # Commit doc to Elasticsearch and get it from there 88 | # to test if BulkBuffer works fine 89 | self.elastic_doc.commit() 90 | res = self._search() 91 | self.assertEqual(doc, next(res)) 92 | 93 | # set auto_commit_interval back to 0 94 | self.elastic_doc.auto_commit_interval = 0 95 | 96 | def test_upsert(self): 97 | """Test the upsert method.""" 98 | docc = {"_id": "1", "name": "John"} 99 | self.elastic_doc.upsert(docc, *TESTARGS) 100 | res = self.elastic_conn.search( 101 | index="test", doc_type="test", body={"query": {"match_all": {}}} 102 | )["hits"]["hits"] 103 | for doc in res: 104 | self.assertEqual(doc["_id"], "1") 105 | self.assertEqual(doc["_source"]["name"], "John") 106 | 107 | def test_update_using_ES(self): 108 | """ 109 | Test the update method and getting sources for update 110 | for Elasticsearch 111 | """ 112 | 113 | # If testing with BulkBuffer, auto_commit_interval 114 | # needs to be None to not clear locally stored sources 115 | self.elastic_doc.auto_commit_interval = None 116 | 117 | doc_id = 1 118 | doc = {"_id": doc_id, "a": 1, "b": 2} 119 
| self.elastic_doc.upsert(doc, *TESTARGS) 120 | self.elastic_doc.commit() 121 | 122 | update_spec = {"$set": {"a": 1, "b": 2}} 123 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 124 | 125 | update_spec = {"$set": {"a": 10, "b": 20}} 126 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 127 | 128 | update_spec = {"$set": {"a": 100, "b": 200}} 129 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 130 | 131 | # Commit doc to Elasticsearch and get it from there 132 | # to test if BulkBuffer works fine 133 | doc["a"] = 100 134 | doc["b"] = 200 135 | self.elastic_doc.commit() 136 | res = self._search() 137 | self.assertEqual(doc, next(res)) 138 | 139 | # set auto_commit_interval back to 0 140 | self.elastic_doc.auto_commit_interval = 0 141 | 142 | def test_upsert_with_updates(self): 143 | """Test the upsert method with multi updates 144 | and clearing buffer (commit) after each update.""" 145 | 146 | doc_id = 1 147 | docc = {"_id": doc_id, "name": "John"} 148 | self.elastic_doc.upsert(docc, *TESTARGS) 149 | 150 | update_spec = {"$set": {"a": 1, "b": 2}} 151 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 152 | 153 | update_spec = {"$set": {"a": 2, "b": 3, "c": ["test"]}} 154 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 155 | 156 | update_spec = {"$set": {"c": ["test", "test2"]}} 157 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 158 | 159 | res = self.elastic_conn.search( 160 | index="test", doc_type="test", body={"query": {"match_all": {}}} 161 | )["hits"]["hits"] 162 | 163 | for doc in res: 164 | self.assertEqual(doc["_id"], "1") 165 | self.assertEqual(doc["_source"]["name"], "John") 166 | self.assertEqual(doc["_source"]["a"], 2) 167 | self.assertEqual(doc["_source"]["b"], 3) 168 | self.assertEqual(doc["_source"]["c"], ["test", "test2"]) 169 | 170 | def test_bulk_upsert(self): 171 | """Test the bulk_upsert method.""" 172 | self.elastic_doc.bulk_upsert([], *TESTARGS) 173 | 174 | docs = ({"_id": i} for i in range(1000)) 175 | self.elastic_doc.bulk_upsert(docs, *TESTARGS) 176 | self.elastic_doc.commit() 177 | returned_ids = sorted(int(doc["_id"]) for doc in self._search()) 178 | self.assertEqual(self._count(), 1000) 179 | self.assertEqual(len(returned_ids), 1000) 180 | for i, r in enumerate(returned_ids): 181 | self.assertEqual(r, i) 182 | 183 | docs = ({"_id": i, "weight": 2 * i} for i in range(1000)) 184 | self.elastic_doc.bulk_upsert(docs, *TESTARGS) 185 | 186 | returned_ids = sorted(int(doc["weight"]) for doc in self._search()) 187 | self.assertEqual(len(returned_ids), 1000) 188 | for i, r in enumerate(returned_ids): 189 | self.assertEqual(r, 2 * i) 190 | 191 | def test_remove(self): 192 | """Test the remove method.""" 193 | docc = {"_id": "1", "name": "John"} 194 | self.elastic_doc.upsert(docc, *TESTARGS) 195 | res = self.elastic_conn.search( 196 | index="test", doc_type="test", body={"query": {"match_all": {}}} 197 | )["hits"]["hits"] 198 | res = [x["_source"] for x in res] 199 | self.assertEqual(len(res), 1) 200 | 201 | self.elastic_doc.remove(docc["_id"], *TESTARGS) 202 | res = self.elastic_conn.search( 203 | index="test", doc_type="test", body={"query": {"match_all": {}}} 204 | )["hits"]["hits"] 205 | res = [x["_source"] for x in res] 206 | self.assertEqual(len(res), 0) 207 | 208 | def test_insert_file(self): 209 | """Ensure we can properly insert a file into ElasticSearch 210 | """ 211 | test_data = " ".join(str(x) for x in range(100000)).encode("utf8") 212 | docc = { 213 | "_id": "test_id", 214 | "filename": "test_filename", 
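            # The remaining fields mirror GridFS file metadata; MockGridFSFile
            # exposes them as attributes the way a real GridOut object would.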
215 | "upload_date": 5, 216 | "md5": "test_md5", 217 | } 218 | self.elastic_doc.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 219 | res = self._search() 220 | for doc in res: 221 | self.assertEqual(doc["_id"], docc["_id"]) 222 | self.assertEqual(doc["filename"], docc["filename"]) 223 | self.assertEqual(base64.b64decode(doc["content"]), test_data.strip()) 224 | 225 | def test_remove_file(self): 226 | test_data = b"hello world" 227 | docc = { 228 | "_id": "test_id", 229 | "_ts": 10, 230 | "ns": "test.test", 231 | "filename": "test_filename", 232 | "upload_date": 5, 233 | "md5": "test_md5", 234 | } 235 | 236 | self.elastic_doc.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 237 | res = list(self._search()) 238 | self.assertEqual(len(res), 1) 239 | 240 | self.elastic_doc.remove("test_id", *TESTARGS) 241 | res = list(self._search()) 242 | self.assertEqual(len(res), 0) 243 | 244 | def test_search(self): 245 | """Test the search method. 246 | 247 | Make sure we can retrieve documents last modified within a time range. 248 | """ 249 | docc = {"_id": "1", "name": "John"} 250 | self.elastic_doc.upsert(docc, "test.test", 5767301236327972865) 251 | docc2 = {"_id": "2", "name": "John Paul"} 252 | self.elastic_doc.upsert(docc2, "test.test", 5767301236327972866) 253 | docc3 = {"_id": "3", "name": "Paul"} 254 | self.elastic_doc.upsert(docc3, "test.test", 5767301236327972870) 255 | search = list(self.elastic_doc.search(5767301236327972865, 5767301236327972866)) 256 | self.assertEqual(len(search), 2) 257 | result_ids = [result.get("_id") for result in search] 258 | self.assertIn("1", result_ids) 259 | self.assertIn("2", result_ids) 260 | 261 | @disable_auto_refresh 262 | def test_elastic_commit(self): 263 | """Test the auto_commit_interval attribute.""" 264 | doc = {"_id": "3", "name": "Waldo"} 265 | 266 | # test cases: 267 | # None = no autocommit 268 | # 0 = commit immediately 269 | # x > 0 = commit within x seconds 270 | for commit_interval in [None, 0, 2, 8]: 271 | docman = DocManager(elastic_pair, auto_commit_interval=commit_interval) 272 | docman.upsert(doc, *TESTARGS) 273 | if commit_interval: 274 | # Allow just a little extra time 275 | time.sleep(commit_interval + 2) 276 | results = list(self._search()) 277 | if commit_interval is None: 278 | self.assertEqual( 279 | len(results), 280 | 0, 281 | "should not commit document with " "auto_commit_interval = None", 282 | ) 283 | else: 284 | self.assertEqual( 285 | len(results), 286 | 1, 287 | "should commit document with " 288 | "auto_commit_interval = %s" % (commit_interval,), 289 | ) 290 | self.assertEqual(results[0]["name"], "Waldo") 291 | docman.stop() 292 | self._remove() 293 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 294 | 295 | @disable_auto_refresh 296 | def test_auto_send_interval(self): 297 | """Test the auto_send_interval 298 | 299 | auto_send_interval should control the amount of time to wait before 300 | sending (but not committing) buffered operations. 
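        With auto refresh disabled (see the decorator), sent documents only
        become searchable after an explicit index refresh, which is how this
        test tells "sent but not committed" apart from "committed".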
301 | """ 302 | doc = {"_id": "3", "name": "Waldo"} 303 | 304 | # test cases: 305 | # None, 0 = no auto send 306 | # x > 0 = send buffered operations within x seconds 307 | for send_interval in [None, 0, 3, 8]: 308 | docman = DocManager( 309 | elastic_pair, autoSendInterval=send_interval, auto_commit_interval=None 310 | ) 311 | docman.upsert(doc, *TESTARGS) 312 | if send_interval: 313 | # Allow just a little extra time 314 | time.sleep(send_interval + 2) 315 | results = list(self._search()) 316 | self.assertEqual( 317 | len(results), 318 | 0, 319 | "documents should not be commited with " 320 | "auto_commit_interval=None and auto_commit_interval=%s" 321 | % (send_interval,), 322 | ) 323 | # Commit the possibly sent changes and search again 324 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 325 | results = list(self._search()) 326 | if not send_interval: 327 | self.assertEqual( 328 | len(results), 329 | 0, 330 | "should not send document with auto_send_interval=%s" 331 | % (send_interval,), 332 | ) 333 | else: 334 | self.assertEqual( 335 | len(results), 336 | 1, 337 | "should send document with auto_send_interval=%s" 338 | % (send_interval,), 339 | ) 340 | self.assertEqual(results[0]["name"], "Waldo") 341 | docman.stop() 342 | self._remove() 343 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 344 | 345 | def test_get_last_doc(self): 346 | """Test the get_last_doc method. 347 | 348 | Make sure we can retrieve the document most recently modified from ES. 349 | """ 350 | base = self.elastic_doc.get_last_doc() 351 | ts = base.get("_ts", 0) if base else 0 352 | docc = {"_id": "4", "name": "Hare"} 353 | self.elastic_doc.upsert(docc, "test.test", ts + 3) 354 | docc = {"_id": "5", "name": "Tortoise"} 355 | self.elastic_doc.upsert(docc, "test.test", ts + 2) 356 | docc = {"_id": "6", "name": "Mr T."} 357 | self.elastic_doc.upsert(docc, "test.test", ts + 1) 358 | 359 | self.assertEqual(self.elastic_doc.elastic.count(index="test")["count"], 3) 360 | doc = self.elastic_doc.get_last_doc() 361 | self.assertEqual(doc["_id"], "4") 362 | 363 | docc = {"_id": "6", "name": "HareTwin"} 364 | self.elastic_doc.upsert(docc, "test.test", ts + 4) 365 | doc = self.elastic_doc.get_last_doc() 366 | self.assertEqual(doc["_id"], "6") 367 | self.assertEqual(self.elastic_doc.elastic.count(index="test")["count"], 3) 368 | 369 | def test_commands(self): 370 | cmd_args = ("test.$cmd", 1) 371 | self.elastic_doc.command_helper = CommandHelper() 372 | 373 | self.elastic_doc.handle_command({"create": "test2"}, *cmd_args) 374 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 375 | self.assertIn("test2", self._mappings("test")) 376 | 377 | docs = [ 378 | {"_id": 0, "name": "ted"}, 379 | {"_id": 1, "name": "marsha"}, 380 | {"_id": 2, "name": "nikolas"}, 381 | ] 382 | self.elastic_doc.upsert(docs[0], "test.test2", 1) 383 | self.elastic_doc.upsert(docs[1], "test.test2", 1) 384 | self.elastic_doc.upsert(docs[2], "test.test2", 1) 385 | 386 | # Commit upserted docs as they are in buffer 387 | self.elastic_doc.commit() 388 | 389 | res = list( 390 | self.elastic_doc._stream_search( 391 | index="test", doc_type="test2", body={"query": {"match_all": {}}} 392 | ) 393 | ) 394 | for d in docs: 395 | self.assertTrue(d in res) 396 | 397 | self.elastic_doc.handle_command({"drop": "test2"}, *cmd_args) 398 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 399 | res = list( 400 | self.elastic_doc._stream_search( 401 | index="test", doc_type="test2", body={"query": {"match_all": {}}} 402 | ) 403 | ) 
404 |         self.assertEqual(0, len(res))
405 | 
406 |         self.elastic_doc.handle_command({"create": "test2"}, *cmd_args)
407 |         self.elastic_doc.handle_command({"create": "test3"}, *cmd_args)
408 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
409 |         self.elastic_doc.handle_command({"dropDatabase": 1}, *cmd_args)
410 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
411 |         self.assertNotIn("test", self._indices())
412 |         self.assertNotIn("test2", self._mappings())
413 |         self.assertNotIn("test3", self._mappings())
414 | 
415 |     def test_buffer_and_drop(self):
416 |         """Insert a document and drop the collection while it is buffered."""
417 | 
418 |         self.elastic_doc.command_helper = CommandHelper()
419 | 
420 |         self.elastic_doc.auto_commit_interval = None
421 |         index = "test3"
422 |         doc_type = "foo"
423 |         cmd_args = ("%s.%s" % (index, doc_type), 1)
424 | 
425 |         doc_id = 1
426 |         doc = {"_id": doc_id, "name": "bar"}
427 |         self.elastic_doc.upsert(doc, *cmd_args)
428 | 
429 |         self.elastic_doc.handle_command({"drop": doc_type}, *cmd_args)
430 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
431 | 
432 |         # Commit should be called before the command is handled,
433 |         # which means the buffer should be empty
434 |         self.assertFalse(self.elastic_doc.BulkBuffer.get_buffer())
435 | 
436 |         # After the drop, the search below should return no results
437 |         res = list(
438 |             self.elastic_doc._stream_search(
439 |                 index=index, doc_type=doc_type, body={"query": {"match_all": {}}}
440 |             )
441 |         )
442 |         self.assertFalse(res)
443 | 
444 |         # Test dropDatabase as well.
445 |         # First, add the document to the database again;
446 |         # this time update the doc as well
447 |         self.elastic_doc.upsert(doc, *cmd_args)
448 |         update_spec = {"$set": {"name": "foo2"}}
449 |         self.elastic_doc.update(doc_id, update_spec, *cmd_args)
450 |         self.elastic_doc.handle_command({"dropDatabase": 1}, *cmd_args)
451 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
452 |         self.assertFalse(self.elastic_doc.BulkBuffer.get_buffer())
453 |         self.assertNotIn(index, self._mappings())
454 | 
455 |         # set auto_commit_interval back to 0
456 |         self.elastic_doc.auto_commit_interval = 0
457 | 
458 | 
459 | class TestElasticDocManagerAWS(unittest.TestCase):
460 |     @unittest.skipIf(_HAS_AWS, "Cannot test with AWS extension installed")
461 |     def test_aws_raises_invalid_configuration(self):
462 |         with self.assertRaises(errors.InvalidConfiguration):
463 |             DocManager("notimportant", aws={})
464 | 
465 |     def test_convert_aws_args(self):
466 |         aws_args = dict(
467 |             region_name="name",
468 |             aws_access_key_id="id",
469 |             aws_secret_access_key="key",
470 |             aws_session_token="token",
471 |             profile_name="profile_name",
472 |         )
473 |         self.assertEqual(convert_aws_args(aws_args), aws_args)
474 | 
475 |     def test_convert_aws_args_raises_invalid_configuration(self):
476 |         with self.assertRaises(errors.InvalidConfiguration):
477 |             convert_aws_args("not_dict")
478 | 
479 |     def test_convert_aws_args_old_options(self):
480 |         self.assertEqual(
481 |             convert_aws_args(dict(region="name", access_id="id", secret_key="key")),
482 |             dict(
483 |                 region_name="name", aws_access_key_id="id", aws_secret_access_key="key"
484 |             ),
485 |         )
486 | 
487 |     @unittest.skipUnless(_HAS_AWS, "Cannot test without AWS extension")
488 |     def test_create_aws_auth_raises_invalid_configuration(self):
489 |         with self.assertRaises(errors.InvalidConfiguration):
490 |             create_aws_auth({"unknown_option": ""})
491 | 
492 | 
493 | if __name__ == "__main__":
494 |     unittest.main()
495 | 
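The update specs exercised by test_update above follow MongoDB's operator semantics: $set adds or overwrites fields, $unset removes them, and a spec with no operators replaces the whole document. A minimal sketch of that behavior, assuming a hypothetical apply_update_spec helper that is not part of the elastic2-doc-manager API (the real DocManager delegates spec handling to mongo-connector internals, and dotted paths such as "b.0.c" are omitted here):

    def apply_update_spec(doc, update_spec):
        # Hypothetical helper for illustration only.
        if "$set" not in update_spec and "$unset" not in update_spec:
            # No update operators: replace the whole document, keeping _id.
            replacement = dict(update_spec)
            replacement["_id"] = doc["_id"]
            return replacement
        updated = dict(doc)
        for field, value in update_spec.get("$set", {}).items():
            updated[field] = value
        for field in update_spec.get("$unset", {}):
            updated.pop(field, None)
        return updated

    # Mirrors the assertions in TestElasticDocManager.test_update:
    doc = {"_id": "1", "a": 1, "b": 2}
    assert apply_update_spec(doc, {"$unset": {"a": True}}) == {"_id": "1", "b": 2}
    assert apply_update_spec(doc, {"$set": {"c": 3}, "$unset": {"b": True}}) == {
        "_id": "1",
        "a": 1,
        "c": 3,
    }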
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [testenv] 2 | deps = 3 | mongo-orchestration>=0.6.7,<1.0 4 | requests>=2.5.1 5 | unittest2; python_version < "2.7" 6 | commands_pre = 7 | mongo-orchestration start -p 20000 -b localhost 8 | setenv = 9 | MO_ADDRESS = localhost:20000 10 | commands = 11 | !py26: python -m unittest discover tests 12 | py26: python -m unittest2 discover tests 13 | commands_post = 14 | mongo-orchestration stop 15 | extras = 16 | elastic2: elastic2 17 | elastic5: elastic5 18 | passenv = 19 | # the username to use if running the tests with authentication enabled 20 | DB_USER 21 | # the password for the above 22 | DB_PASSWORD 23 | # the starting port for running MongoDB. Future nodes will be started on sequentially increasing ports 24 | MONGO_PORT 25 | # the hostname on which Elasticsearch is running 26 | ES_HOST 27 | # the port for the above 28 | ES_PORT 29 | --------------------------------------------------------------------------------
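The passenv variables above are how the suite locates external services at run time. A plausible sketch of how the test package could turn ES_HOST/ES_PORT into the elastic_pair and elastic_nodes values imported by the tests; the actual tests/__init__.py may compute these differently:

    import os

    # Assumed reconstruction, not the actual tests/__init__.py.
    elastic_host = os.environ.get("ES_HOST", "localhost")
    elastic_port = int(os.environ.get("ES_PORT", 9200))
    elastic_pair = "%s:%d" % (elastic_host, elastic_port)
    # A second address so TestElasticMultipleHosts sees two hosts.
    elastic_nodes = [elastic_pair, "%s:%d" % (elastic_host, elastic_port + 1)]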