├── .flake8
├── .gitignore
├── .travis.yml
├── CHANGELOG.rst
├── LICENSE
├── README.rst
├── mongo_connector
│   ├── __init__.py
│   └── doc_managers
│       ├── __init__.py
│       └── elastic2_doc_manager.py
├── pyproject.toml
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── test_elastic2.py
│   └── test_elastic2_doc_manager.py
└── tox.ini

/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # defer to black http://bit.ly/2CKCP8W
3 | max-line-length = 88
4 | ignore =
5 |     E203
6 |     W503
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yougov/elastic2-doc-manager/ad92138d1fd6656bb2e71cb5cc840f9ba0109c49/.gitignore
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: python
3 |
4 | # Elasticsearch 5 requires Java 8
5 | addons:
6 |   apt:
7 |     packages:
8 |       - oracle-java8-set-default
9 |
10 | env:
11 |   global:
12 |     - ELASTIC_2_URL=http://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/tar/elasticsearch/2.4.3/elasticsearch-2.4.3.tar.gz
13 |     - ELASTIC_5_URL=http://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.1.1.tar.gz
14 |     # We need to manually set JAVA_HOME to Java 8 too.
15 |     - JAVA_HOME=/usr/lib/jvm/java-8-oracle
16 |
17 | jobs:
18 |   fast_finish: true
19 |   include:
20 |     - python: &latest_py3 3.6
21 |       env: MONGODB=3.2.11 ELASTIC=5.1.1 ELASTIC_URL=$ELASTIC_5_URL TOXENV=elastic5
22 |     - python: 3.4
23 |       env: MONGODB=2.6.12 ELASTIC=2.4.3 ELASTIC_URL=$ELASTIC_2_URL TOXENV=elastic2
24 |     - python: 3.5
25 |       env: MONGODB=2.4.14 ELASTIC=2.4.3 ELASTIC_URL=$ELASTIC_2_URL TOXENV=elastic2
26 |     - stage: deploy
27 |       if: tag IS present
28 |       python: *latest_py3
29 |       install: skip
30 |       script: skip
31 |       deploy:
32 |         provider: pypi
33 |         on:
34 |           tags: true
35 |           all_branches: true
36 |         user: jaraco
37 |         password:
38 |           secure: rx5Yz5qT+wPWiT+wOn0+U3F4G0Kk/rouMkJeOnGNxb/NHWwc9iFqeLRRw5rWyF6COPMuKh32MlMIdL7EQNPyadEZ9djtPWC5csJYNLJAH/VC/+vZd5ZEwbsp1BYB+dZqUSUf7+G1yANvi4jd6x8zK1M9FHtL+LrWda/t2gsJePIx9rzm5PVHZO9GX9+ljb+pKP2Pk768BqL43z8tPOmlue/ONiC5OEc1Bd2mmxSm3ObaFqAJyr8F05GF90SNxZ8E+eCZFLGG4os5ul3yqjCTAFx9edRgXZVZmBInp6nzF0R69PZvJQj/+yyIUcZdrpZra9Glnc5tXNCx2EA9qChMD0aTsFyWvtRTk3AREzd9Ph4oSJ25jxjdDfaGkNDJvMIp7WeGLwuYpZtQ/IKlA7Nvjt5IcYYvzLWWpsA/EamBIglJn1SyYRmbXo2j3xymMF1jVskXgNO/nt9U9rPQApPUVXjuTGUsu1tbqzQG1Hnm+Mo3vbwkmj4DYoa7XcE+WXs2Mtk5cHsMdlBapnEB9bn5reuK4cvSOidbZNgsm8ObI6Yk6pRU/gTiy5wBcH4NGy9vvK0IxFT2/I+q83HtJk/JRu1m7ImRYEluG+0y5HLi444rY2zcrdGq54UFdvLbgXkxJsj91au7NHyvukbZ3KchvK2937rrn3WgBcD24T0qfEc=
39 |         distributions: dists
40 |         skip_cleanup: true
41 |
42 | install:
43 |   - java -version
44 |   - # Install Elasticsearch with mapper-attachments
45 |   - wget $ELASTIC_URL
46 |   - tar -xvf elasticsearch-${ELASTIC}.tar.gz
47 |   - export PATH=${PWD}/elasticsearch-${ELASTIC}/bin/:${PATH}
48 |   - echo 'y' | elasticsearch-plugin install mapper-attachments || echo 'y' | plugin install mapper-attachments
49 |   - elasticsearch --version
50 |   - # Install MongoDB
51 |   - wget http://fastdl.mongodb.org/linux/mongodb-linux-x86_64-${MONGODB}.tgz
52 |   - tar -zxf mongodb-linux-x86_64-${MONGODB}.tgz
53 |   - export PATH=${PWD}/mongodb-linux-x86_64-${MONGODB}/bin/:${PATH}
54 |   - mongod --version
55 |   - pip install -U tox tox-venv
56 |
57 | before_script:
58 |
- elasticsearch > temp.txt & 59 | - sleep 10 60 | 61 | script: 62 | - tox 63 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Version 1.0.0 5 | ------------- 6 | 7 | - Drop support for Python 3.3 and earlier, including Python 2. 8 | 9 | Version 0.4.0 10 | ------------- 11 | 12 | - Remove reliance on ``mongo_connector.compat``. 13 | 14 | Version 0.3.0 15 | ------------- 16 | 17 | - Support for Elasticsearch 5.x. 18 | - Significant performance improvements because operations are buffered. 19 | - BulkIndexErrors are now caught and reraised as OperationFailed. 20 | 21 | Version 0.2.0 22 | ------------- 23 | 24 | - Bug fix for namespace information saved in the mongo-connector metadata index. 25 | - Support AWS Elasticsearch Service Request Signing. 26 | 27 | Version 0.1.0 28 | ------------- 29 | 30 | This was the first release of elastic-doc-manager. 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | elastic2-doc-manager 3 | ==================== 4 | 5 | The mongo-connector project originated as a MongoDB mongo-labs 6 | project and is now community-maintained under the custody of YouGov, Plc. 7 | 8 | .. 
image:: https://travis-ci.org/yougov/elastic2-doc-manager.svg?branch=master
9 |    :alt: View build status
10 |    :target: https://travis-ci.org/yougov/elastic2-doc-manager
11 |
12 | Getting Started
13 | ===============
14 |
15 | This package is a document manager for
16 | `mongo-connector <https://github.com/yougov/mongo-connector>`_ that
17 | targets Elasticsearch versions 2.x and 5.x -- don't let the name fool you!
18 | For information on running mongo-connector with Elasticsearch, please see the
19 | `MongoConnector Usage with Elasticsearch
20 | <https://github.com/yougov/mongo-connector/wiki/Usage%20with%20ElasticSearch>`_
21 | wiki page.
22 |
23 | Installation
24 | ============
25 |
26 | The installation of the elastic2-doc-manager depends on which version of
27 | Elasticsearch you are targeting.
28 |
29 | Elasticsearch 1.x
30 | -----------------
31 |
32 | This is the document manager for Elasticsearch 2.x and 5.x. If you
33 | want to target Elasticsearch 1.x, please install the
34 | `elastic-doc-manager <https://github.com/yougov/elastic-doc-manager>`_.
35 |
36 | Elasticsearch 2.x
37 | -----------------
38 |
39 | For use with an Elasticsearch 2.x server, install with
40 | `pip <https://pip.pypa.io>`__::
41 |
42 |     pip install 'elastic2-doc-manager[elastic2]'
43 |
44 | Elasticsearch 5.x
45 | -----------------
46 |
47 | For use with an Elasticsearch 5.x server, install with::
48 |
49 |     pip install 'elastic2-doc-manager[elastic5]'
50 |
51 | .. note:: Version 0.3.0 added support for Elasticsearch 5.x.
52 |
53 |
54 | Amazon Elasticsearch Service
55 | ----------------------------
56 |
57 | To use with Amazon Elasticsearch Service, you must install the required AWS
58 | dependencies along with the version of Elasticsearch::
59 |
60 |     pip install 'elastic2-doc-manager[elastic2,aws]'
61 |
62 |
63 | Development
64 | -----------
65 |
66 | You can also install the development version of elastic2-doc-manager
67 | manually::
68 |
69 |     git clone https://github.com/yougov/elastic2-doc-manager.git
70 |     pip install -e './elastic2-doc-manager[elastic2]'
71 |
72 | You may have to run ``pip`` with ``sudo``, depending on where you're
73 | installing and what privileges you have.
74 |
75 | .. note:: Before mongo-connector version 2.2.2, the elastic
76 |    doc manager was packaged with mongo-connector and only supported
77 |    Elasticsearch 1.x.
78 |
79 | Running the tests
80 | -----------------
81 | Requirements
82 | ~~~~~~~~~~~~
83 |
84 | 1. Copy of the Elastic 2.x Document Manager GitHub repository
85 |
86 | The tests are not included in the package from PyPI and can only be acquired
87 | by cloning this repository on GitHub::
88 |
89 |     git clone https://github.com/yougov/elastic2-doc-manager
90 |
91 | 2. Tox
92 |
93 | Install `tox <https://tox.readthedocs.io>`_.
94 |
95 | 3. Environment variables
96 |
97 | There are a few influential environment variables that affect the tests. These
98 | are defined in ``tox.ini``.
99 |
100 | All the tests live in the ``tests`` directory.
101 |
102 | Running tests on the command-line
103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104 |
105 | While the tests take care of setting up and tearing down MongoDB clusters on
106 | their own, make sure to start Elasticsearch before doing a full test run!
107 |
108 | You can run all the tests with one command (this works in all supported Python versions)::
109 |
110 |     tox
111 |
112 | Error messages
113 | ~~~~~~~~~~~~~~
114 |
115 | Some of the tests are meant to generate lots of ``ERROR``-level log messages,
116 | especially the rollback tests. mongo-connector logs exceptions it encounters
117 | while iterating the cursor in the oplog, so we see these in the console output
118 | while MongoDB clusters are being torn apart in the tests.
As long as all the 119 | tests pass with an `OK` message, all is well. 120 | -------------------------------------------------------------------------------- /mongo_connector/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | 3 | __path__ = extend_path(__path__, __name__) 4 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | 3 | __path__ = extend_path(__path__, __name__) 4 | -------------------------------------------------------------------------------- /mongo_connector/doc_managers/elastic2_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Elasticsearch implementation of the DocManager interface. 16 | 17 | Receives documents from an OplogThread and takes the appropriate actions on 18 | Elasticsearch. 19 | """ 20 | import base64 21 | import logging 22 | import threading 23 | import time 24 | import warnings 25 | 26 | import bson.json_util 27 | 28 | try: 29 | __import__("elasticsearch") 30 | except ImportError: 31 | raise ImportError( 32 | "Error: elasticsearch (https://pypi.python.org/pypi/elasticsearch) " 33 | "version 2.x or 5.x is not installed.\n" 34 | "Install with:\n" 35 | " pip install elastic2-doc-manager[elastic2]\n" 36 | "or:\n" 37 | " pip install elastic2-doc-manager[elastic5]\n" 38 | ) 39 | 40 | from elasticsearch import ( 41 | Elasticsearch, 42 | exceptions as es_exceptions, 43 | connection as es_connection, 44 | ) 45 | from elasticsearch.helpers import bulk, scan, streaming_bulk, BulkIndexError 46 | 47 | import importlib_metadata 48 | 49 | from mongo_connector import errors 50 | from mongo_connector.constants import DEFAULT_COMMIT_INTERVAL, DEFAULT_MAX_BULK 51 | from mongo_connector.util import exception_wrapper, retry_until_ok 52 | from mongo_connector.doc_managers.doc_manager_base import DocManagerBase 53 | from mongo_connector.doc_managers.formatters import DefaultDocumentFormatter 54 | 55 | _HAS_AWS = True 56 | try: 57 | from boto3 import session 58 | from requests_aws_sign import AWSV4Sign 59 | except ImportError: 60 | _HAS_AWS = False 61 | 62 | wrap_exceptions = exception_wrapper( 63 | { 64 | BulkIndexError: errors.OperationFailed, 65 | es_exceptions.ConnectionError: errors.ConnectionFailed, 66 | es_exceptions.TransportError: errors.OperationFailed, 67 | es_exceptions.NotFoundError: errors.OperationFailed, 68 | es_exceptions.RequestError: errors.OperationFailed, 69 | } 70 | ) 71 | 72 | LOG = logging.getLogger(__name__) 73 | 74 | DEFAULT_SEND_INTERVAL = 5 75 | """The default interval in seconds to send buffered operations.""" 76 | 77 | DEFAULT_AWS_REGION = "us-east-1" 78 | 79 | __version__ = 
importlib_metadata.version("elastic2_doc_manager") 80 | 81 | 82 | def convert_aws_args(aws_args): 83 | """Convert old style options into arguments to boto3.session.Session.""" 84 | if not isinstance(aws_args, dict): 85 | raise errors.InvalidConfiguration( 86 | 'Elastic DocManager config option "aws" must be a dict' 87 | ) 88 | old_session_kwargs = dict( 89 | region="region_name", 90 | access_id="aws_access_key_id", 91 | secret_key="aws_secret_access_key", 92 | ) 93 | new_kwargs = {} 94 | for arg in aws_args: 95 | if arg in old_session_kwargs: 96 | new_kwargs[old_session_kwargs[arg]] = aws_args[arg] 97 | else: 98 | new_kwargs[arg] = aws_args[arg] 99 | return new_kwargs 100 | 101 | 102 | def create_aws_auth(aws_args): 103 | try: 104 | aws_session = session.Session(**convert_aws_args(aws_args)) 105 | except TypeError as exc: 106 | raise errors.InvalidConfiguration( 107 | "Elastic DocManager unknown aws config option: %s" % (exc,) 108 | ) 109 | return AWSV4Sign( 110 | aws_session.get_credentials(), 111 | aws_session.region_name or DEFAULT_AWS_REGION, 112 | "es", 113 | ) 114 | 115 | 116 | class AutoCommiter(threading.Thread): 117 | """Thread that periodically sends buffered operations to Elastic. 118 | 119 | :Parameters: 120 | - `docman`: The Elasticsearch DocManager. 121 | - `send_interval`: Number of seconds to wait before sending buffered 122 | operations to Elasticsearch. Set to None or 0 to disable. 123 | - `commit_interval`: Number of seconds to wait before committing 124 | buffered operations to Elasticsearch. Set to None or 0 to disable. 125 | - `sleep_interval`: Number of seconds to sleep. 126 | """ 127 | 128 | def __init__(self, docman, send_interval, commit_interval, sleep_interval=1): 129 | super(AutoCommiter, self).__init__() 130 | self._docman = docman 131 | # Change `None` intervals to 0 132 | self._send_interval = send_interval if send_interval else 0 133 | self._commit_interval = commit_interval if commit_interval else 0 134 | self._should_auto_send = self._send_interval > 0 135 | self._should_auto_commit = self._commit_interval > 0 136 | self._sleep_interval = max(sleep_interval, 1) 137 | self._stopped = False 138 | self.daemon = True 139 | 140 | def join(self, timeout=None): 141 | self._stopped = True 142 | super(AutoCommiter, self).join(timeout=timeout) 143 | 144 | def run(self): 145 | """Periodically sends buffered operations and/or commit. 146 | """ 147 | if not self._should_auto_commit and not self._should_auto_send: 148 | return 149 | last_send, last_commit = 0, 0 150 | while not self._stopped: 151 | if self._should_auto_commit: 152 | if last_commit > self._commit_interval: 153 | self._docman.commit() 154 | # commit also sends so reset both 155 | last_send, last_commit = 0, 0 156 | # Give a chance to exit the loop 157 | if self._stopped: 158 | break 159 | 160 | if self._should_auto_send: 161 | if last_send > self._send_interval: 162 | self._docman.send_buffered_operations() 163 | last_send = 0 164 | time.sleep(self._sleep_interval) 165 | last_send += self._sleep_interval 166 | last_commit += self._sleep_interval 167 | 168 | 169 | class DocManager(DocManagerBase): 170 | """Elasticsearch implementation of the DocManager interface. 171 | 172 | Receives documents from an OplogThread and takes the appropriate actions on 173 | Elasticsearch. 
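
    A minimal standalone sketch (hypothetical values; in normal operation
    mongo-connector constructs this class from its configuration and
    drives it itself)::

        dm = DocManager("localhost:9200", auto_commit_interval=0)
        # namespace is "database.collection"; the final argument is an
        # oplog timestamp
        dm.upsert({"_id": 1, "name": "paulie"}, "test.test", 1)
        dm.commit()
        dm.stop()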
174 | """ 175 | 176 | def __init__( 177 | self, 178 | url, 179 | auto_commit_interval=DEFAULT_COMMIT_INTERVAL, 180 | unique_key="_id", 181 | chunk_size=DEFAULT_MAX_BULK, 182 | meta_index_name="mongodb_meta", 183 | meta_type="mongodb_meta", 184 | attachment_field="content", 185 | **kwargs 186 | ): 187 | client_options = kwargs.get("clientOptions", {}) 188 | if "aws" in kwargs: 189 | if not _HAS_AWS: 190 | raise errors.InvalidConfiguration( 191 | "aws extras must be installed to sign Elasticsearch " 192 | "requests. Install with: " 193 | "pip install elastic2-doc-manager[aws]" 194 | ) 195 | client_options["http_auth"] = create_aws_auth(kwargs["aws"]) 196 | client_options["use_ssl"] = True 197 | client_options["verify_certs"] = True 198 | client_options["connection_class"] = es_connection.RequestsHttpConnection 199 | if type(url) is not list: 200 | url = [url] 201 | self.elastic = Elasticsearch(hosts=url, **client_options) 202 | 203 | self._formatter = DefaultDocumentFormatter() 204 | self.BulkBuffer = BulkBuffer(self) 205 | 206 | # As bulk operation can be done in another thread 207 | # lock is needed to prevent access to BulkBuffer 208 | # while commiting documents to Elasticsearch 209 | # It is because BulkBuffer might get outdated 210 | # docs from Elasticsearch if bulk is still ongoing 211 | self.lock = threading.Lock() 212 | 213 | self.auto_commit_interval = auto_commit_interval 214 | self.auto_send_interval = kwargs.get("autoSendInterval", DEFAULT_SEND_INTERVAL) 215 | self.meta_index_name = meta_index_name 216 | self.meta_type = meta_type 217 | self.unique_key = unique_key 218 | self.chunk_size = chunk_size 219 | self.has_attachment_mapping = False 220 | self.attachment_field = attachment_field 221 | self.auto_commiter = AutoCommiter( 222 | self, self.auto_send_interval, self.auto_commit_interval 223 | ) 224 | self.auto_commiter.start() 225 | 226 | def _index_and_mapping(self, namespace): 227 | """Helper method for getting the index and type from a namespace.""" 228 | index, doc_type = namespace.split(".", 1) 229 | return index.lower(), doc_type 230 | 231 | def stop(self): 232 | """Stop the auto-commit thread.""" 233 | self.auto_commiter.join() 234 | self.auto_commit_interval = 0 235 | # Commit any remaining docs from buffer 236 | self.commit() 237 | 238 | def apply_update(self, doc, update_spec): 239 | if "$set" not in update_spec and "$unset" not in update_spec: 240 | # Don't try to add ns and _ts fields back in from doc 241 | return update_spec 242 | return super(DocManager, self).apply_update(doc, update_spec) 243 | 244 | @wrap_exceptions 245 | def handle_command(self, doc, namespace, timestamp): 246 | # Flush buffer before handle command 247 | self.commit() 248 | db = namespace.split(".", 1)[0] 249 | if doc.get("dropDatabase"): 250 | dbs = self.command_helper.map_db(db) 251 | for _db in dbs: 252 | self.elastic.indices.delete(index=_db.lower()) 253 | 254 | if doc.get("renameCollection"): 255 | raise errors.OperationFailed( 256 | "elastic_doc_manager does not support renaming a mapping." 257 | ) 258 | 259 | if doc.get("create"): 260 | db, coll = self.command_helper.map_collection(db, doc["create"]) 261 | if db and coll: 262 | self.elastic.indices.put_mapping( 263 | index=db.lower(), doc_type=coll, body={"_source": {"enabled": True}} 264 | ) 265 | 266 | if doc.get("drop"): 267 | db, coll = self.command_helper.map_collection(db, doc["drop"]) 268 | if db and coll: 269 | # This will delete the items in coll, but not get rid of the 270 | # mapping. 
271 |                 warnings.warn(
272 |                     "Deleting all documents of type %s on index %s. "
273 |                     "The mapping definition will persist and must be "
274 |                     "removed manually." % (coll, db)
275 |                 )
276 |                 responses = streaming_bulk(
277 |                     self.elastic,
278 |                     (
279 |                         dict(result, _op_type="delete")
280 |                         for result in scan(
281 |                             self.elastic, index=db.lower(), doc_type=coll
282 |                         )
283 |                     ),
284 |                 )
285 |                 for ok, resp in responses:
286 |                     if not ok:
287 |                         LOG.error(
288 |                             "Error occurred while deleting ElasticSearch "
289 |                             "document during handling of 'drop' command: %r" % resp
290 |                         )
291 |
292 |     @wrap_exceptions
293 |     def update(self, document_id, update_spec, namespace, timestamp):
294 |         """Apply updates given in update_spec to the document whose id
295 |         matches that of doc.
296 |         """
297 |
298 |         index, doc_type = self._index_and_mapping(namespace)
299 |         with self.lock:
300 |             # Check if the document source is stored in the local buffer
301 |             document = self.BulkBuffer.get_from_sources(
302 |                 index, doc_type, str(document_id)
303 |             )
304 |         if document:
305 |             # Document source collected from the local buffer;
306 |             # perform apply_update on it and then it will be
307 |             # ready for committing to Elasticsearch
308 |             updated = self.apply_update(document, update_spec)
309 |             # _id is immutable in MongoDB, so won't have changed in update
310 |             updated["_id"] = document_id
311 |             self.upsert(updated, namespace, timestamp)
312 |         else:
313 |             # Document source needs to be retrieved from Elasticsearch
314 |             # before performing the update. Pass update_spec to the upsert function.
315 |             updated = {"_id": document_id}
316 |             self.upsert(updated, namespace, timestamp, update_spec)
317 |         # upsert() strips metadata, so only _id + fields in _source remain here
318 |         return updated
319 |
320 |     @wrap_exceptions
321 |     def upsert(self, doc, namespace, timestamp, update_spec=None):
322 |         """Insert a document into Elasticsearch."""
323 |         index, doc_type = self._index_and_mapping(namespace)
324 |         # No need to duplicate '_id' in the source document
325 |         doc_id = str(doc.pop("_id"))
326 |         metadata = {"ns": namespace, "_ts": timestamp}
327 |
328 |         # Index the source document, using the lowercase namespace as index name.
329 |         action = {
330 |             "_op_type": "index",
331 |             "_index": index,
332 |             "_type": doc_type,
333 |             "_id": doc_id,
334 |             "_source": self._formatter.format_document(doc),
335 |         }
336 |         # Index document metadata with the original namespace (mixed upper/lower).
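        # This companion metadata entry (ns + _ts) is what search() and
        # get_last_doc() later query when resolving MongoDB rollbacks.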
337 | meta_action = { 338 | "_op_type": "index", 339 | "_index": self.meta_index_name, 340 | "_type": self.meta_type, 341 | "_id": doc_id, 342 | "_source": bson.json_util.dumps(metadata), 343 | } 344 | 345 | self.index(action, meta_action, doc, update_spec) 346 | 347 | # Leave _id, since it's part of the original document 348 | doc["_id"] = doc_id 349 | 350 | @wrap_exceptions 351 | def bulk_upsert(self, docs, namespace, timestamp): 352 | """Insert multiple documents into Elasticsearch.""" 353 | 354 | def docs_to_upsert(): 355 | doc = None 356 | for doc in docs: 357 | # Remove metadata and redundant _id 358 | index, doc_type = self._index_and_mapping(namespace) 359 | doc_id = str(doc.pop("_id")) 360 | document_action = { 361 | "_index": index, 362 | "_type": doc_type, 363 | "_id": doc_id, 364 | "_source": self._formatter.format_document(doc), 365 | } 366 | document_meta = { 367 | "_index": self.meta_index_name, 368 | "_type": self.meta_type, 369 | "_id": doc_id, 370 | "_source": {"ns": namespace, "_ts": timestamp}, 371 | } 372 | yield document_action 373 | yield document_meta 374 | if doc is None: 375 | raise errors.EmptyDocsError( 376 | "Cannot upsert an empty sequence of " 377 | "documents into Elastic Search" 378 | ) 379 | 380 | try: 381 | kw = {} 382 | if self.chunk_size > 0: 383 | kw["chunk_size"] = self.chunk_size 384 | 385 | responses = streaming_bulk( 386 | client=self.elastic, actions=docs_to_upsert(), **kw 387 | ) 388 | 389 | for ok, resp in responses: 390 | if not ok: 391 | LOG.error( 392 | "Could not bulk-upsert document " 393 | "into ElasticSearch: %r" % resp 394 | ) 395 | if self.auto_commit_interval == 0: 396 | self.commit() 397 | except errors.EmptyDocsError: 398 | # This can happen when mongo-connector starts up, there is no 399 | # config file, but nothing to dump 400 | pass 401 | 402 | @wrap_exceptions 403 | def insert_file(self, f, namespace, timestamp): 404 | doc = f.get_metadata() 405 | doc_id = str(doc.pop("_id")) 406 | index, doc_type = self._index_and_mapping(namespace) 407 | 408 | # make sure that elasticsearch treats it like a file 409 | if not self.has_attachment_mapping: 410 | body = {"properties": {self.attachment_field: {"type": "attachment"}}} 411 | self.elastic.indices.put_mapping(index=index, doc_type=doc_type, body=body) 412 | self.has_attachment_mapping = True 413 | 414 | metadata = {"ns": namespace, "_ts": timestamp} 415 | 416 | doc = self._formatter.format_document(doc) 417 | doc[self.attachment_field] = base64.b64encode(f.read()).decode() 418 | 419 | action = { 420 | "_op_type": "index", 421 | "_index": index, 422 | "_type": doc_type, 423 | "_id": doc_id, 424 | "_source": doc, 425 | } 426 | meta_action = { 427 | "_op_type": "index", 428 | "_index": self.meta_index_name, 429 | "_type": self.meta_type, 430 | "_id": doc_id, 431 | "_source": bson.json_util.dumps(metadata), 432 | } 433 | 434 | self.index(action, meta_action) 435 | 436 | @wrap_exceptions 437 | def remove(self, document_id, namespace, timestamp): 438 | """Remove a document from Elasticsearch.""" 439 | index, doc_type = self._index_and_mapping(namespace) 440 | 441 | action = { 442 | "_op_type": "delete", 443 | "_index": index, 444 | "_type": doc_type, 445 | "_id": str(document_id), 446 | } 447 | 448 | meta_action = { 449 | "_op_type": "delete", 450 | "_index": self.meta_index_name, 451 | "_type": self.meta_type, 452 | "_id": str(document_id), 453 | } 454 | 455 | self.index(action, meta_action) 456 | 457 | @wrap_exceptions 458 | def _stream_search(self, *args, **kwargs): 459 | """Helper method 
for iterating over ES search results.""" 460 | for hit in scan( 461 | self.elastic, query=kwargs.pop("body", None), scroll="10m", **kwargs 462 | ): 463 | hit["_source"]["_id"] = hit["_id"] 464 | yield hit["_source"] 465 | 466 | def search(self, start_ts, end_ts): 467 | """Query Elasticsearch for documents in a time range. 468 | 469 | This method is used to find documents that may be in conflict during 470 | a rollback event in MongoDB. 471 | """ 472 | return self._stream_search( 473 | index=self.meta_index_name, 474 | body={"query": {"range": {"_ts": {"gte": start_ts, "lte": end_ts}}}}, 475 | ) 476 | 477 | def index(self, action, meta_action, doc_source=None, update_spec=None): 478 | with self.lock: 479 | self.BulkBuffer.add_upsert(action, meta_action, doc_source, update_spec) 480 | 481 | # Divide by two to account for meta actions 482 | if ( 483 | len(self.BulkBuffer.action_buffer) / 2 >= self.chunk_size 484 | or self.auto_commit_interval == 0 485 | ): 486 | self.commit() 487 | 488 | def send_buffered_operations(self): 489 | """Send buffered operations to Elasticsearch. 490 | 491 | This method is periodically called by the AutoCommitThread. 492 | """ 493 | with self.lock: 494 | try: 495 | action_buffer = self.BulkBuffer.get_buffer() 496 | if action_buffer: 497 | successes, errors = bulk(self.elastic, action_buffer) 498 | LOG.debug( 499 | "Bulk request finished, successfully sent %d " "operations", 500 | successes, 501 | ) 502 | if errors: 503 | LOG.error("Bulk request finished with errors: %r", errors) 504 | except es_exceptions.ElasticsearchException: 505 | LOG.exception("Bulk request failed with exception") 506 | 507 | def commit(self): 508 | """Send buffered requests and refresh all indexes.""" 509 | self.send_buffered_operations() 510 | retry_until_ok(self.elastic.indices.refresh, index="") 511 | 512 | @wrap_exceptions 513 | def get_last_doc(self): 514 | """Get the most recently modified document from Elasticsearch. 515 | 516 | This method is used to help define a time window within which documents 517 | may be in conflict after a MongoDB rollback. 
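        The newest entry in the metadata index (sorted by ``_ts``
        descending) is returned, or None when the index holds no
        documents yet.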
518 | """ 519 | try: 520 | result = self.elastic.search( 521 | index=self.meta_index_name, 522 | body={"query": {"match_all": {}}, "sort": [{"_ts": "desc"}]}, 523 | size=1, 524 | )["hits"]["hits"] 525 | for r in result: 526 | r["_source"]["_id"] = r["_id"] 527 | return r["_source"] 528 | except es_exceptions.RequestError: 529 | # no documents so ES returns 400 because of undefined _ts mapping 530 | return None 531 | 532 | 533 | class BulkBuffer(object): 534 | def __init__(self, docman): 535 | 536 | # Parent object 537 | self.docman = docman 538 | 539 | # Action buffer for bulk indexing 540 | self.action_buffer = [] 541 | 542 | # Docs to update 543 | # Dict stores all documents for which firstly 544 | # source has to be retrieved from Elasticsearch 545 | # and then apply_update needs to be performed 546 | # Format: [ (doc, update_spec, action_buffer_index, get_from_ES) ] 547 | self.doc_to_update = [] 548 | 549 | # Below dictionary contains ids of documents 550 | # which need to be retrieved from Elasticsearch 551 | # It prevents from getting same document multiple times from ES 552 | # Format: {"_index": {"_type": {"_id": True}}} 553 | self.doc_to_get = {} 554 | 555 | # Dictionary of sources 556 | # Format: {"_index": {"_type": {"_id": {"_source": actual_source}}}} 557 | self.sources = {} 558 | 559 | def add_upsert(self, action, meta_action, doc_source, update_spec): 560 | """ 561 | Function which stores sources for "insert" actions 562 | and decide if for "update" action has to add docs to 563 | get source buffer 564 | """ 565 | 566 | # Whenever update_spec is provided to this method 567 | # it means that doc source needs to be retrieved 568 | # from Elasticsearch. It means also that source 569 | # is not stored in local buffer 570 | if update_spec: 571 | self.bulk_index(action, meta_action) 572 | 573 | # -1 -> to get latest index number 574 | # -1 -> to get action instead of meta_action 575 | # Update document based on source retrieved from ES 576 | self.add_doc_to_update(action, update_spec, len(self.action_buffer) - 2) 577 | else: 578 | # Insert and update operations provide source 579 | # Store it in local buffer and use for comming updates 580 | # inside same buffer 581 | # add_to_sources will not be called for delete operation 582 | # as it does not provide doc_source 583 | if doc_source: 584 | self.add_to_sources(action, doc_source) 585 | self.bulk_index(action, meta_action) 586 | 587 | def add_doc_to_update(self, action, update_spec, action_buffer_index): 588 | """ 589 | Prepare document for update based on Elasticsearch response. 590 | Set flag if document needs to be retrieved from Elasticsearch 591 | """ 592 | 593 | doc = { 594 | "_index": action["_index"], 595 | "_type": action["_type"], 596 | "_id": action["_id"], 597 | } 598 | 599 | # If get_from_ES == True -> get document's source from Elasticsearch 600 | get_from_ES = self.should_get_id(action) 601 | self.doc_to_update.append((doc, update_spec, action_buffer_index, get_from_ES)) 602 | 603 | def should_get_id(self, action): 604 | """ 605 | Mark document to retrieve its source from Elasticsearch. 
606 | Returns: 607 | True - if marking document for the first time in this bulk 608 | False - if document has been already marked 609 | """ 610 | mapping_ids = self.doc_to_get.setdefault(action["_index"], {}).setdefault( 611 | action["_type"], set() 612 | ) 613 | if action["_id"] in mapping_ids: 614 | # There is an update on this id already 615 | return False 616 | else: 617 | mapping_ids.add(action["_id"]) 618 | return True 619 | 620 | def get_docs_sources_from_ES(self): 621 | """Get document sources using MGET elasticsearch API""" 622 | docs = [doc for doc, _, _, get_from_ES in self.doc_to_update if get_from_ES] 623 | if docs: 624 | documents = self.docman.elastic.mget(body={"docs": docs}, realtime=True) 625 | return iter(documents["docs"]) 626 | else: 627 | return iter([]) 628 | 629 | @wrap_exceptions 630 | def update_sources(self): 631 | """Update local sources based on response from Elasticsearch""" 632 | ES_documents = self.get_docs_sources_from_ES() 633 | 634 | for doc, update_spec, action_buffer_index, get_from_ES in self.doc_to_update: 635 | if get_from_ES: 636 | # Update source based on response from ES 637 | ES_doc = next(ES_documents) 638 | if ES_doc["found"]: 639 | source = ES_doc["_source"] 640 | else: 641 | # Document not found in elasticsearch, 642 | # Seems like something went wrong during replication 643 | LOG.error( 644 | "mGET: Document id: %s has not been found " 645 | "in Elasticsearch. Due to that " 646 | "following update failed: %s", 647 | doc["_id"], 648 | update_spec, 649 | ) 650 | self.reset_action(action_buffer_index) 651 | continue 652 | else: 653 | # Get source stored locally before applying update 654 | # as it is up-to-date 655 | source = self.get_from_sources(doc["_index"], doc["_type"], doc["_id"]) 656 | if not source: 657 | LOG.error( 658 | "mGET: Document id: %s has not been found " 659 | "in local sources. 
Due to that following " 660 | "update failed: %s", 661 | doc["_id"], 662 | update_spec, 663 | ) 664 | self.reset_action(action_buffer_index) 665 | continue 666 | 667 | updated = self.docman.apply_update(source, update_spec) 668 | 669 | # Remove _id field from source 670 | if "_id" in updated: 671 | del updated["_id"] 672 | 673 | # Everytime update locally stored sources to keep them up-to-date 674 | self.add_to_sources(doc, updated) 675 | 676 | self.action_buffer[action_buffer_index][ 677 | "_source" 678 | ] = self.docman._formatter.format_document(updated) 679 | 680 | # Remove empty actions if there were errors 681 | self.action_buffer = [ 682 | each_action for each_action in self.action_buffer if each_action 683 | ] 684 | 685 | def reset_action(self, action_buffer_index): 686 | """Reset specific action as update failed""" 687 | self.action_buffer[action_buffer_index] = {} 688 | self.action_buffer[action_buffer_index + 1] = {} 689 | 690 | def add_to_sources(self, action, doc_source): 691 | """Store sources locally""" 692 | mapping = self.sources.setdefault(action["_index"], {}).setdefault( 693 | action["_type"], {} 694 | ) 695 | mapping[action["_id"]] = doc_source 696 | 697 | def get_from_sources(self, index, doc_type, document_id): 698 | """Get source stored locally""" 699 | return self.sources.get(index, {}).get(doc_type, {}).get(document_id, {}) 700 | 701 | def bulk_index(self, action, meta_action): 702 | self.action_buffer.append(action) 703 | self.action_buffer.append(meta_action) 704 | 705 | def clean_up(self): 706 | """Do clean-up before returning buffer""" 707 | self.action_buffer = [] 708 | self.sources = {} 709 | self.doc_to_get = {} 710 | self.doc_to_update = [] 711 | 712 | def get_buffer(self): 713 | """Get buffer which needs to be bulked to elasticsearch""" 714 | 715 | # Get sources for documents which are in Elasticsearch 716 | # and they are not in local buffer 717 | if self.doc_to_update: 718 | self.update_sources() 719 | 720 | ES_buffer = self.action_buffer 721 | self.clean_up() 722 | return ES_buffer 723 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=34.4", "wheel", "setuptools_scm>=1.15"] 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | dists = clean --all sdist bdist_wheel 3 | 4 | [bdist_wheel] 5 | universal = 1 6 | 7 | [metadata] 8 | long_description = file:README.rst 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name="elastic2-doc-manager", 5 | use_scm_version=True, 6 | maintainer="mongodb", 7 | description="Elastic2 plugin for mongo-connector", 8 | platforms=["any"], 9 | author="anna herlihy", 10 | author_email="mongodb-user@googlegroups.com", 11 | url="https://github.com/mongodb-labs/elastic2-doc-manager", 12 | install_requires=["mongo-connector>=2.5.0", "importlib_metadata"], 13 | python_requires=">=3.4", 14 | extras_require={ 15 | "aws": ["boto3 >= 1.4.0", "requests-aws-sign >= 0.1.2"], 16 | "elastic2": ["elasticsearch>=2.0.0,<3.0.0"], 17 | "elastic5": ["elasticsearch>=5.0.0,<6.0.0"], 18 | }, 19 | packages=["mongo_connector", "mongo_connector.doc_managers"], 20 
| classifiers=[ 21 | "Development Status :: 4 - Beta", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: Apache Software License", 24 | "Programming Language :: Python :: 3", 25 | "Topic :: Database", 26 | "Topic :: Software Development :: Libraries :: Python Modules", 27 | "Operating System :: Unix", 28 | "Operating System :: MacOS :: MacOS X", 29 | "Operating System :: Microsoft :: Windows", 30 | "Operating System :: POSIX", 31 | ], 32 | keywords=["mongo-connector", "mongodb", "elastic", "elasticsearch"], 33 | setup_requires=["setuptools_scm>=1.15"], 34 | ) 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import logging 15 | import os 16 | import sys 17 | 18 | 19 | logging.basicConfig(stream=sys.stdout) 20 | 21 | elastic_host = str(os.environ.get("ES_HOST", "localhost")) 22 | elastic_port = str(os.environ.get("ES_PORT", 9200)) 23 | elastic_pair = "%s:%s" % (elastic_host, elastic_port) 24 | elastic_nodes = [elastic_pair, "%s:%s" % (elastic_host, str(int(elastic_port) + 1))] 25 | -------------------------------------------------------------------------------- /tests/test_elastic2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
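# These integration tests expect a reachable Elasticsearch node;
# tests/__init__.py reads the ES_HOST and ES_PORT environment variables
# (defaulting to localhost:9200).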
14 | 15 | """Integration tests for mongo-connector + Elasticsearch 2.x.""" 16 | import base64 17 | import os 18 | import time 19 | import unittest 20 | 21 | from bson import SON 22 | from elasticsearch import Elasticsearch 23 | from elasticsearch.helpers import bulk, scan 24 | from gridfs import GridFS 25 | 26 | from mongo_connector.connector import Connector 27 | from mongo_connector.doc_managers.elastic2_doc_manager import DocManager 28 | from mongo_connector.test_utils import ReplicaSet, assert_soon, close_client 29 | 30 | from mongo_connector.util import retry_until_ok 31 | from tests import elastic_pair, elastic_nodes 32 | 33 | 34 | class ElasticsearchTestCase(unittest.TestCase): 35 | """Base class for all ES TestCases.""" 36 | 37 | @classmethod 38 | def setUpClass(cls): 39 | cls.elastic_conn = Elasticsearch(hosts=[elastic_pair]) 40 | 41 | def setUp(self): 42 | # Create target index in elasticsearch 43 | self.elastic_conn.indices.create(index="test", ignore=400) 44 | self.elastic_conn.cluster.health(wait_for_status="yellow", index="test") 45 | self.elastic_doc = DocManager(elastic_pair, auto_commit_interval=0) 46 | 47 | def tearDown(self): 48 | self.elastic_conn.indices.delete(index="test", ignore=404) 49 | self.elastic_doc.stop() 50 | 51 | def _search(self, query=None): 52 | query = query or {"match_all": {}} 53 | return self.elastic_doc._stream_search( 54 | index="test", doc_type="test", body={"query": query} 55 | ) 56 | 57 | def _count(self): 58 | return self.elastic_conn.count(index="test")["count"] 59 | 60 | def _remove(self): 61 | bulk_deletes = [] 62 | for result in scan(self.elastic_conn, index="test", doc_type="test"): 63 | result["_op_type"] = "delete" 64 | bulk_deletes.append(result) 65 | bulk(self.elastic_conn, bulk_deletes) 66 | 67 | def _mappings(self, index="_all"): 68 | mappings = self.elastic_conn.indices.get_mapping(index=index) 69 | if index in mappings: 70 | return list(mappings[index]["mappings"].keys()) 71 | return [] 72 | 73 | def _indices(self): 74 | return list(self.elastic_conn.indices.stats()["indices"].keys()) 75 | 76 | 77 | class TestElastic(ElasticsearchTestCase): 78 | """Integration tests for mongo-connector + Elasticsearch.""" 79 | 80 | @classmethod 81 | def setUpClass(cls): 82 | """Start the cluster.""" 83 | super(TestElastic, cls).setUpClass() 84 | cls.repl_set = ReplicaSet().start() 85 | cls.conn = cls.repl_set.client() 86 | 87 | @classmethod 88 | def tearDownClass(cls): 89 | """Kill the cluster.""" 90 | close_client(cls.conn) 91 | cls.repl_set.stop() 92 | 93 | def tearDown(self): 94 | """Stop the Connector thread.""" 95 | super(TestElastic, self).tearDown() 96 | self.connector.join() 97 | 98 | def setUp(self): 99 | """Start a new Connector for each test.""" 100 | super(TestElastic, self).setUp() 101 | try: 102 | os.unlink("oplog.timestamp") 103 | except OSError: 104 | pass 105 | self.connector = Connector( 106 | mongo_address=self.repl_set.uri, 107 | ns_set=["test.test"], 108 | doc_managers=(self.elastic_doc,), 109 | gridfs_set=["test.test"], 110 | ) 111 | 112 | self.conn.test.test.drop() 113 | self.conn.test.test.files.drop() 114 | self.conn.test.test.chunks.drop() 115 | 116 | self.connector.start() 117 | assert_soon(lambda: len(self.connector.shard_set) > 0) 118 | assert_soon(lambda: self._count() == 0) 119 | 120 | def test_insert(self): 121 | """Test insert operations.""" 122 | self.conn["test"]["test"].insert_one({"name": "paulie"}) 123 | assert_soon(lambda: self._count() > 0) 124 | result_set_1 = list(self._search()) 125 | 
self.assertEqual(len(result_set_1), 1) 126 | result_set_2 = self.conn["test"]["test"].find_one() 127 | for item in result_set_1: 128 | self.assertEqual(item["_id"], str(result_set_2["_id"])) 129 | self.assertEqual(item["name"], result_set_2["name"]) 130 | 131 | def test_remove(self): 132 | """Tests remove operations.""" 133 | self.conn["test"]["test"].insert_one({"name": "paulie"}) 134 | assert_soon(lambda: self._count() == 1) 135 | self.conn["test"]["test"].delete_one({"name": "paulie"}) 136 | assert_soon(lambda: self._count() != 1) 137 | self.assertEqual(self._count(), 0) 138 | 139 | def test_insert_file(self): 140 | """Tests inserting a gridfs file 141 | """ 142 | fs = GridFS(self.conn["test"], "test") 143 | test_data = b"test_insert_file test file" 144 | id = fs.put(test_data, filename="test.txt", encoding="utf8") 145 | assert_soon(lambda: self._count() > 0) 146 | 147 | query = {"match": {"_all": "test_insert_file"}} 148 | res = list(self._search(query)) 149 | self.assertEqual(len(res), 1) 150 | doc = res[0] 151 | self.assertEqual(doc["filename"], "test.txt") 152 | self.assertEqual(doc["_id"], str(id)) 153 | self.assertEqual(base64.b64decode(doc["content"]), test_data) 154 | 155 | def test_remove_file(self): 156 | fs = GridFS(self.conn["test"], "test") 157 | id = fs.put("test file", filename="test.txt", encoding="utf8") 158 | assert_soon(lambda: self._count() == 1) 159 | fs.delete(id) 160 | assert_soon(lambda: self._count() == 0) 161 | 162 | def test_update(self): 163 | """Test update operations.""" 164 | # Insert 165 | self.conn.test.test.insert_one({"a": 0}) 166 | assert_soon(lambda: sum(1 for _ in self._search()) == 1) 167 | 168 | def check_update(update_spec): 169 | updated = self.conn.test.command( 170 | SON( 171 | [ 172 | ("findAndModify", "test"), 173 | ("query", {"a": 0}), 174 | ("update", update_spec), 175 | ("new", True), 176 | ] 177 | ) 178 | )["value"] 179 | # Stringify _id to match what will be retrieved from ES 180 | updated["_id"] = str(updated["_id"]) 181 | assert_soon(lambda: next(self._search()) == updated) 182 | 183 | # Update by adding a field. Note that ES can't mix types within an array 184 | check_update({"$set": {"b": [{"c": 10}, {"d": 11}]}}) 185 | 186 | # Update by setting an attribute of a sub-document beyond end of array. 187 | check_update({"$set": {"b.10.c": 42}}) 188 | 189 | # Update by changing a value within a sub-document (contains array) 190 | check_update({"$inc": {"b.0.c": 1}}) 191 | 192 | # Update by changing the value within an array 193 | check_update({"$inc": {"b.1.f": 12}}) 194 | 195 | # Update by adding new bucket to list 196 | check_update({"$push": {"b": {"e": 12}}}) 197 | 198 | # Update by changing an entire sub-document 199 | check_update({"$set": {"b.0": {"e": 4}}}) 200 | 201 | # Update by adding a sub-document 202 | check_update({"$set": {"b": {"0": {"c": 100}}}}) 203 | 204 | # Update whole document 205 | check_update({"a": 0, "b": {"1": {"d": 10000}}}) 206 | 207 | def test_rollback(self): 208 | """Test behavior during a MongoDB rollback. 209 | 210 | We force a rollback by adding a doc, killing the primary, 211 | adding another doc, killing the new primary, and then 212 | restarting both. 
213 | """ 214 | primary_conn = self.repl_set.primary.client() 215 | 216 | # This doc can be picked up in the collection dump 217 | self.conn["test"]["test"].insert_one({"name": "paul"}) 218 | condition1 = ( 219 | lambda: self.conn["test"]["test"].find({"name": "paul"}).count() == 1 220 | ) 221 | 222 | def condition2(): 223 | return self._count() == 1 224 | 225 | assert_soon(condition1) 226 | assert_soon(condition2) 227 | 228 | # This doc is definitely not picked up by collection dump 229 | self.conn["test"]["test"].insert_one({"name": "pauly"}) 230 | 231 | self.repl_set.primary.stop(destroy=False) 232 | 233 | new_primary_conn = self.repl_set.secondary.client() 234 | 235 | admin = new_primary_conn["admin"] 236 | assert_soon(lambda: admin.command("isMaster")["ismaster"]) 237 | time.sleep(5) 238 | retry_until_ok(self.conn.test.test.insert_one, {"name": "pauline"}) 239 | assert_soon(lambda: self._count() == 3) 240 | result_set_1 = list(self._search()) 241 | result_set_2 = self.conn["test"]["test"].find_one({"name": "pauline"}) 242 | self.assertEqual(len(result_set_1), 3) 243 | # make sure pauline is there 244 | for item in result_set_1: 245 | if item["name"] == "pauline": 246 | self.assertEqual(item["_id"], str(result_set_2["_id"])) 247 | self.repl_set.secondary.stop(destroy=False) 248 | 249 | self.repl_set.primary.start() 250 | while primary_conn["admin"].command("isMaster")["ismaster"] is False: 251 | time.sleep(1) 252 | 253 | self.repl_set.secondary.start() 254 | 255 | time.sleep(2) 256 | result_set_1 = list(self._search()) 257 | self.assertEqual(len(result_set_1), 2) 258 | 259 | if result_set_1[0]["name"] == "paul": 260 | self.assertEqual(result_set_1[1]["name"], "pauly") 261 | elif result_set_1[0]["name"] == "pauly": 262 | self.assertEqual(result_set_1[1]["name"], "paul") 263 | else: 264 | self.assertTrue(0, "Unknown document retrieved") 265 | 266 | find_cursor = retry_until_ok(self.conn["test"]["test"].find) 267 | self.assertEqual(retry_until_ok(find_cursor.count), 2) 268 | 269 | def test_bad_int_value(self): 270 | self.conn.test.test.insert_one( 271 | {"inf": float("inf"), "nan": float("nan"), "still_exists": True} 272 | ) 273 | assert_soon(lambda: self._count() > 0) 274 | for doc in self._search(): 275 | self.assertNotIn("inf", doc) 276 | self.assertNotIn("nan", doc) 277 | self.assertTrue(doc["still_exists"]) 278 | 279 | 280 | class TestElasticMultipleHosts(unittest.TestCase): 281 | """Integration tests for mongo-connector + Elasticsearch Cluster.""" 282 | 283 | def test_multiple_hosts(self): 284 | elastic_doc = DocManager(elastic_nodes) 285 | self.assertEqual(len(elastic_doc.elastic.transport.hosts), 2) 286 | 287 | 288 | if __name__ == "__main__": 289 | unittest.main() 290 | -------------------------------------------------------------------------------- /tests/test_elastic2_doc_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 MongoDB, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Unit tests for the Elastic2 DocManager.""" 16 | import base64 17 | import time 18 | import unittest 19 | 20 | from functools import wraps 21 | 22 | from mongo_connector import errors 23 | from mongo_connector.command_helper import CommandHelper 24 | from mongo_connector.doc_managers.elastic2_doc_manager import ( 25 | DocManager, 26 | _HAS_AWS, 27 | convert_aws_args, 28 | create_aws_auth, 29 | ) 30 | from mongo_connector.test_utils import MockGridFSFile, TESTARGS 31 | from mongo_connector.util import retry_until_ok 32 | 33 | from tests import elastic_pair 34 | from tests.test_elastic2 import ElasticsearchTestCase 35 | 36 | 37 | def disable_auto_refresh(func): 38 | """Disable default 1 second auto refresh in Elasticsearch for a test. 39 | 40 | https://www.elastic.co/guide/en/elasticsearch/reference/current/indices 41 | -update-settings.html 42 | """ 43 | 44 | @wraps(func) 45 | def _disable_auto_refresh(self, *args, **kwargs): 46 | try: 47 | self.elastic_conn.indices.put_settings( 48 | index="test", body={"index": {"refresh_interval": "-1"}} 49 | ) 50 | return func(self, *args, **kwargs) 51 | finally: 52 | self.elastic_conn.indices.put_settings( 53 | index="test", body={"index": {"refresh_interval": "1s"}} 54 | ) 55 | 56 | return _disable_auto_refresh 57 | 58 | 59 | class TestElasticDocManager(ElasticsearchTestCase): 60 | """Unit tests for the Elastic DocManager.""" 61 | 62 | def test_update(self): 63 | """Test the update method using locally stored source""" 64 | 65 | # If testing with BulkBuffer, auto_commit_interval 66 | # needs to be None to not clear locally stored sources 67 | self.elastic_doc.auto_commit_interval = None 68 | 69 | doc_id = 1 70 | doc = {"_id": doc_id, "a": 1, "b": 2} 71 | self.elastic_doc.upsert(doc, *TESTARGS) 72 | 73 | # $set only 74 | update_spec = {"$set": {"a": 1, "b": 2}} 75 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 76 | 77 | self.assertEqual(doc, {"_id": "1", "a": 1, "b": 2}) 78 | # $unset only 79 | update_spec = {"$unset": {"a": True}} 80 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 81 | self.assertEqual(doc, {"_id": "1", "b": 2}) 82 | # mixed $set/$unset 83 | update_spec = {"$unset": {"b": True}, "$set": {"c": 3}} 84 | doc = self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 85 | self.assertEqual(doc, {"_id": "1", "c": 3}) 86 | 87 | # Commit doc to Elasticsearch and get it from there 88 | # to test if BulkBuffer works fine 89 | self.elastic_doc.commit() 90 | res = self._search() 91 | self.assertEqual(doc, next(res)) 92 | 93 | # set auto_commit_interval back to 0 94 | self.elastic_doc.auto_commit_interval = 0 95 | 96 | def test_upsert(self): 97 | """Test the upsert method.""" 98 | docc = {"_id": "1", "name": "John"} 99 | self.elastic_doc.upsert(docc, *TESTARGS) 100 | res = self.elastic_conn.search( 101 | index="test", doc_type="test", body={"query": {"match_all": {}}} 102 | )["hits"]["hits"] 103 | for doc in res: 104 | self.assertEqual(doc["_id"], "1") 105 | self.assertEqual(doc["_source"]["name"], "John") 106 | 107 | def test_update_using_ES(self): 108 | """ 109 | Test the update method and getting sources for update 110 | for Elasticsearch 111 | """ 112 | 113 | # If testing with BulkBuffer, auto_commit_interval 114 | # needs to be None to not clear locally stored sources 115 | self.elastic_doc.auto_commit_interval = None 116 | 117 | doc_id = 1 118 | doc = {"_id": doc_id, "a": 1, "b": 2} 119 
| self.elastic_doc.upsert(doc, *TESTARGS) 120 | self.elastic_doc.commit() 121 | 122 | update_spec = {"$set": {"a": 1, "b": 2}} 123 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 124 | 125 | update_spec = {"$set": {"a": 10, "b": 20}} 126 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 127 | 128 | update_spec = {"$set": {"a": 100, "b": 200}} 129 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 130 | 131 | # Commit doc to Elasticsearch and get it from there 132 | # to test if BulkBuffer works fine 133 | doc["a"] = 100 134 | doc["b"] = 200 135 | self.elastic_doc.commit() 136 | res = self._search() 137 | self.assertEqual(doc, next(res)) 138 | 139 | # set auto_commit_interval back to 0 140 | self.elastic_doc.auto_commit_interval = 0 141 | 142 | def test_upsert_with_updates(self): 143 | """Test the upsert method with multi updates 144 | and clearing buffer (commit) after each update.""" 145 | 146 | doc_id = 1 147 | docc = {"_id": doc_id, "name": "John"} 148 | self.elastic_doc.upsert(docc, *TESTARGS) 149 | 150 | update_spec = {"$set": {"a": 1, "b": 2}} 151 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 152 | 153 | update_spec = {"$set": {"a": 2, "b": 3, "c": ["test"]}} 154 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 155 | 156 | update_spec = {"$set": {"c": ["test", "test2"]}} 157 | self.elastic_doc.update(doc_id, update_spec, *TESTARGS) 158 | 159 | res = self.elastic_conn.search( 160 | index="test", doc_type="test", body={"query": {"match_all": {}}} 161 | )["hits"]["hits"] 162 | 163 | for doc in res: 164 | self.assertEqual(doc["_id"], "1") 165 | self.assertEqual(doc["_source"]["name"], "John") 166 | self.assertEqual(doc["_source"]["a"], 2) 167 | self.assertEqual(doc["_source"]["b"], 3) 168 | self.assertEqual(doc["_source"]["c"], ["test", "test2"]) 169 | 170 | def test_bulk_upsert(self): 171 | """Test the bulk_upsert method.""" 172 | self.elastic_doc.bulk_upsert([], *TESTARGS) 173 | 174 | docs = ({"_id": i} for i in range(1000)) 175 | self.elastic_doc.bulk_upsert(docs, *TESTARGS) 176 | self.elastic_doc.commit() 177 | returned_ids = sorted(int(doc["_id"]) for doc in self._search()) 178 | self.assertEqual(self._count(), 1000) 179 | self.assertEqual(len(returned_ids), 1000) 180 | for i, r in enumerate(returned_ids): 181 | self.assertEqual(r, i) 182 | 183 | docs = ({"_id": i, "weight": 2 * i} for i in range(1000)) 184 | self.elastic_doc.bulk_upsert(docs, *TESTARGS) 185 | 186 | returned_ids = sorted(int(doc["weight"]) for doc in self._search()) 187 | self.assertEqual(len(returned_ids), 1000) 188 | for i, r in enumerate(returned_ids): 189 | self.assertEqual(r, 2 * i) 190 | 191 | def test_remove(self): 192 | """Test the remove method.""" 193 | docc = {"_id": "1", "name": "John"} 194 | self.elastic_doc.upsert(docc, *TESTARGS) 195 | res = self.elastic_conn.search( 196 | index="test", doc_type="test", body={"query": {"match_all": {}}} 197 | )["hits"]["hits"] 198 | res = [x["_source"] for x in res] 199 | self.assertEqual(len(res), 1) 200 | 201 | self.elastic_doc.remove(docc["_id"], *TESTARGS) 202 | res = self.elastic_conn.search( 203 | index="test", doc_type="test", body={"query": {"match_all": {}}} 204 | )["hits"]["hits"] 205 | res = [x["_source"] for x in res] 206 | self.assertEqual(len(res), 0) 207 | 208 | def test_insert_file(self): 209 | """Ensure we can properly insert a file into ElasticSearch 210 | """ 211 | test_data = " ".join(str(x) for x in range(100000)).encode("utf8") 212 | docc = { 213 | "_id": "test_id", 214 | "filename": "test_filename", 
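            # The remaining fields mirror GridFS file metadata; MockGridFSFile
            # exposes them as attributes the way a real GridOut object would.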
215 | "upload_date": 5, 216 | "md5": "test_md5", 217 | } 218 | self.elastic_doc.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 219 | res = self._search() 220 | for doc in res: 221 | self.assertEqual(doc["_id"], docc["_id"]) 222 | self.assertEqual(doc["filename"], docc["filename"]) 223 | self.assertEqual(base64.b64decode(doc["content"]), test_data.strip()) 224 | 225 | def test_remove_file(self): 226 | test_data = b"hello world" 227 | docc = { 228 | "_id": "test_id", 229 | "_ts": 10, 230 | "ns": "test.test", 231 | "filename": "test_filename", 232 | "upload_date": 5, 233 | "md5": "test_md5", 234 | } 235 | 236 | self.elastic_doc.insert_file(MockGridFSFile(docc, test_data), *TESTARGS) 237 | res = list(self._search()) 238 | self.assertEqual(len(res), 1) 239 | 240 | self.elastic_doc.remove("test_id", *TESTARGS) 241 | res = list(self._search()) 242 | self.assertEqual(len(res), 0) 243 | 244 | def test_search(self): 245 | """Test the search method. 246 | 247 | Make sure we can retrieve documents last modified within a time range. 248 | """ 249 | docc = {"_id": "1", "name": "John"} 250 | self.elastic_doc.upsert(docc, "test.test", 5767301236327972865) 251 | docc2 = {"_id": "2", "name": "John Paul"} 252 | self.elastic_doc.upsert(docc2, "test.test", 5767301236327972866) 253 | docc3 = {"_id": "3", "name": "Paul"} 254 | self.elastic_doc.upsert(docc3, "test.test", 5767301236327972870) 255 | search = list(self.elastic_doc.search(5767301236327972865, 5767301236327972866)) 256 | self.assertEqual(len(search), 2) 257 | result_ids = [result.get("_id") for result in search] 258 | self.assertIn("1", result_ids) 259 | self.assertIn("2", result_ids) 260 | 261 | @disable_auto_refresh 262 | def test_elastic_commit(self): 263 | """Test the auto_commit_interval attribute.""" 264 | doc = {"_id": "3", "name": "Waldo"} 265 | 266 | # test cases: 267 | # None = no autocommit 268 | # 0 = commit immediately 269 | # x > 0 = commit within x seconds 270 | for commit_interval in [None, 0, 2, 8]: 271 | docman = DocManager(elastic_pair, auto_commit_interval=commit_interval) 272 | docman.upsert(doc, *TESTARGS) 273 | if commit_interval: 274 | # Allow just a little extra time 275 | time.sleep(commit_interval + 2) 276 | results = list(self._search()) 277 | if commit_interval is None: 278 | self.assertEqual( 279 | len(results), 280 | 0, 281 | "should not commit document with " "auto_commit_interval = None", 282 | ) 283 | else: 284 | self.assertEqual( 285 | len(results), 286 | 1, 287 | "should commit document with " 288 | "auto_commit_interval = %s" % (commit_interval,), 289 | ) 290 | self.assertEqual(results[0]["name"], "Waldo") 291 | docman.stop() 292 | self._remove() 293 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 294 | 295 | @disable_auto_refresh 296 | def test_auto_send_interval(self): 297 | """Test the auto_send_interval 298 | 299 | auto_send_interval should control the amount of time to wait before 300 | sending (but not committing) buffered operations. 
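        With auto refresh disabled (see the decorator), sent documents only
        become searchable after an explicit index refresh, which is how this
        test tells "sent but not committed" apart from "committed".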
301 | """ 302 | doc = {"_id": "3", "name": "Waldo"} 303 | 304 | # test cases: 305 | # None, 0 = no auto send 306 | # x > 0 = send buffered operations within x seconds 307 | for send_interval in [None, 0, 3, 8]: 308 | docman = DocManager( 309 | elastic_pair, autoSendInterval=send_interval, auto_commit_interval=None 310 | ) 311 | docman.upsert(doc, *TESTARGS) 312 | if send_interval: 313 | # Allow just a little extra time 314 | time.sleep(send_interval + 2) 315 | results = list(self._search()) 316 | self.assertEqual( 317 | len(results), 318 | 0, 319 | "documents should not be commited with " 320 | "auto_commit_interval=None and auto_commit_interval=%s" 321 | % (send_interval,), 322 | ) 323 | # Commit the possibly sent changes and search again 324 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 325 | results = list(self._search()) 326 | if not send_interval: 327 | self.assertEqual( 328 | len(results), 329 | 0, 330 | "should not send document with auto_send_interval=%s" 331 | % (send_interval,), 332 | ) 333 | else: 334 | self.assertEqual( 335 | len(results), 336 | 1, 337 | "should send document with auto_send_interval=%s" 338 | % (send_interval,), 339 | ) 340 | self.assertEqual(results[0]["name"], "Waldo") 341 | docman.stop() 342 | self._remove() 343 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 344 | 345 | def test_get_last_doc(self): 346 | """Test the get_last_doc method. 347 | 348 | Make sure we can retrieve the document most recently modified from ES. 349 | """ 350 | base = self.elastic_doc.get_last_doc() 351 | ts = base.get("_ts", 0) if base else 0 352 | docc = {"_id": "4", "name": "Hare"} 353 | self.elastic_doc.upsert(docc, "test.test", ts + 3) 354 | docc = {"_id": "5", "name": "Tortoise"} 355 | self.elastic_doc.upsert(docc, "test.test", ts + 2) 356 | docc = {"_id": "6", "name": "Mr T."} 357 | self.elastic_doc.upsert(docc, "test.test", ts + 1) 358 | 359 | self.assertEqual(self.elastic_doc.elastic.count(index="test")["count"], 3) 360 | doc = self.elastic_doc.get_last_doc() 361 | self.assertEqual(doc["_id"], "4") 362 | 363 | docc = {"_id": "6", "name": "HareTwin"} 364 | self.elastic_doc.upsert(docc, "test.test", ts + 4) 365 | doc = self.elastic_doc.get_last_doc() 366 | self.assertEqual(doc["_id"], "6") 367 | self.assertEqual(self.elastic_doc.elastic.count(index="test")["count"], 3) 368 | 369 | def test_commands(self): 370 | cmd_args = ("test.$cmd", 1) 371 | self.elastic_doc.command_helper = CommandHelper() 372 | 373 | self.elastic_doc.handle_command({"create": "test2"}, *cmd_args) 374 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 375 | self.assertIn("test2", self._mappings("test")) 376 | 377 | docs = [ 378 | {"_id": 0, "name": "ted"}, 379 | {"_id": 1, "name": "marsha"}, 380 | {"_id": 2, "name": "nikolas"}, 381 | ] 382 | self.elastic_doc.upsert(docs[0], "test.test2", 1) 383 | self.elastic_doc.upsert(docs[1], "test.test2", 1) 384 | self.elastic_doc.upsert(docs[2], "test.test2", 1) 385 | 386 | # Commit upserted docs as they are in buffer 387 | self.elastic_doc.commit() 388 | 389 | res = list( 390 | self.elastic_doc._stream_search( 391 | index="test", doc_type="test2", body={"query": {"match_all": {}}} 392 | ) 393 | ) 394 | for d in docs: 395 | self.assertTrue(d in res) 396 | 397 | self.elastic_doc.handle_command({"drop": "test2"}, *cmd_args) 398 | retry_until_ok(self.elastic_conn.indices.refresh, index="") 399 | res = list( 400 | self.elastic_doc._stream_search( 401 | index="test", doc_type="test2", body={"query": {"match_all": {}}} 402 | ) 403 | ) 
404 |         self.assertEqual(0, len(res))
405 | 
406 |         self.elastic_doc.handle_command({"create": "test2"}, *cmd_args)
407 |         self.elastic_doc.handle_command({"create": "test3"}, *cmd_args)
408 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
409 |         self.elastic_doc.handle_command({"dropDatabase": 1}, *cmd_args)
410 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
411 |         self.assertNotIn("test", self._indices())
412 |         self.assertNotIn("test2", self._mappings())
413 |         self.assertNotIn("test3", self._mappings())
414 | 
415 |     def test_buffer_and_drop(self):
416 |         """Insert a document and drop the collection while it is buffered."""
417 | 
418 |         self.elastic_doc.command_helper = CommandHelper()
419 | 
420 |         self.elastic_doc.auto_commit_interval = None
421 |         index = "test3"
422 |         doc_type = "foo"
423 |         cmd_args = ("%s.%s" % (index, doc_type), 1)
424 | 
425 |         doc_id = 1
426 |         doc = {"_id": doc_id, "name": "bar"}
427 |         self.elastic_doc.upsert(doc, *cmd_args)
428 | 
429 |         self.elastic_doc.handle_command({"drop": doc_type}, *cmd_args)
430 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
431 | 
432 |         # Commit should be called before the command is handled,
433 |         # which means the buffer should be empty
434 |         self.assertFalse(self.elastic_doc.BulkBuffer.get_buffer())
435 | 
436 |         # After the drop, the search below should return no results
437 |         res = list(
438 |             self.elastic_doc._stream_search(
439 |                 index=index, doc_type=doc_type, body={"query": {"match_all": {}}}
440 |             )
441 |         )
442 |         self.assertFalse(res)
443 | 
444 |         # Test dropDatabase as well.
445 |         # First, add the document to the database again;
446 |         # this time update the doc as well
447 |         self.elastic_doc.upsert(doc, *cmd_args)
448 |         update_spec = {"$set": {"name": "foo2"}}
449 |         self.elastic_doc.update(doc_id, update_spec, *cmd_args)
450 |         self.elastic_doc.handle_command({"dropDatabase": 1}, *cmd_args)
451 |         retry_until_ok(self.elastic_conn.indices.refresh, index="")
452 |         self.assertFalse(self.elastic_doc.BulkBuffer.get_buffer())
453 |         self.assertNotIn(index, self._mappings())
454 | 
455 |         # set auto_commit_interval back to 0
456 |         self.elastic_doc.auto_commit_interval = 0
457 | 
458 | 
459 | class TestElasticDocManagerAWS(unittest.TestCase):
460 |     @unittest.skipIf(_HAS_AWS, "Cannot test with AWS extension installed")
461 |     def test_aws_raises_invalid_configuration(self):
462 |         with self.assertRaises(errors.InvalidConfiguration):
463 |             DocManager("notimportant", aws={})
464 | 
465 |     def test_convert_aws_args(self):
466 |         aws_args = dict(
467 |             region_name="name",
468 |             aws_access_key_id="id",
469 |             aws_secret_access_key="key",
470 |             aws_session_token="token",
471 |             profile_name="profile_name",
472 |         )
473 |         self.assertEqual(convert_aws_args(aws_args), aws_args)
474 | 
475 |     def test_convert_aws_args_raises_invalid_configuration(self):
476 |         with self.assertRaises(errors.InvalidConfiguration):
477 |             convert_aws_args("not_dict")
478 | 
479 |     def test_convert_aws_args_old_options(self):
480 |         self.assertEqual(
481 |             convert_aws_args(dict(region="name", access_id="id", secret_key="key")),
482 |             dict(
483 |                 region_name="name", aws_access_key_id="id", aws_secret_access_key="key"
484 |             ),
485 |         )
486 | 
487 |     @unittest.skipUnless(_HAS_AWS, "Cannot test without AWS extension")
488 |     def test_create_aws_auth_raises_invalid_configuration(self):
489 |         with self.assertRaises(errors.InvalidConfiguration):
490 |             create_aws_auth({"unknown_option": ""})
491 | 
492 | 
493 | if __name__ == "__main__":
494 |     unittest.main()
495 | 
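The update specs exercised by test_update above follow MongoDB's operator semantics: $set adds or overwrites fields, $unset removes them, and a spec with no operators replaces the whole document. A minimal sketch of that behavior, assuming a hypothetical apply_update_spec helper that is not part of the elastic2-doc-manager API (the real DocManager delegates spec handling to mongo-connector internals, and dotted paths such as "b.0.c" are omitted here):

    def apply_update_spec(doc, update_spec):
        # Hypothetical helper for illustration only.
        if "$set" not in update_spec and "$unset" not in update_spec:
            # No update operators: replace the whole document, keeping _id.
            replacement = dict(update_spec)
            replacement["_id"] = doc["_id"]
            return replacement
        updated = dict(doc)
        for field, value in update_spec.get("$set", {}).items():
            updated[field] = value
        for field in update_spec.get("$unset", {}):
            updated.pop(field, None)
        return updated

    # Mirrors the assertions in TestElasticDocManager.test_update:
    doc = {"_id": "1", "a": 1, "b": 2}
    assert apply_update_spec(doc, {"$unset": {"a": True}}) == {"_id": "1", "b": 2}
    assert apply_update_spec(doc, {"$set": {"c": 3}, "$unset": {"b": True}}) == {
        "_id": "1",
        "a": 1,
        "c": 3,
    }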
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [testenv] 2 | deps = 3 | mongo-orchestration>=0.6.7,<1.0 4 | requests>=2.5.1 5 | unittest2; python_version < "2.7" 6 | commands_pre = 7 | mongo-orchestration start -p 20000 -b localhost 8 | setenv = 9 | MO_ADDRESS = localhost:20000 10 | commands = 11 | !py26: python -m unittest discover tests 12 | py26: python -m unittest2 discover tests 13 | commands_post = 14 | mongo-orchestration stop 15 | extras = 16 | elastic2: elastic2 17 | elastic5: elastic5 18 | passenv = 19 | # the username to use if running the tests with authentication enabled 20 | DB_USER 21 | # the password for the above 22 | DB_PASSWORD 23 | # the starting port for running MongoDB. Future nodes will be started on sequentially increasing ports 24 | MONGO_PORT 25 | # the hostname on which Elasticsearch is running 26 | ES_HOST 27 | # the port for the above 28 | ES_PORT 29 | --------------------------------------------------------------------------------
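The passenv variables above are how the suite locates external services at run time. A plausible sketch of how the test package could turn ES_HOST/ES_PORT into the elastic_pair and elastic_nodes values imported by the tests; the actual tests/__init__.py may compute these differently:

    import os

    # Assumed reconstruction, not the actual tests/__init__.py.
    elastic_host = os.environ.get("ES_HOST", "localhost")
    elastic_port = int(os.environ.get("ES_PORT", 9200))
    elastic_pair = "%s:%d" % (elastic_host, elastic_port)
    # A second address so TestElasticMultipleHosts sees two hosts.
    elastic_nodes = [elastic_pair, "%s:%d" % (elastic_host, elastic_port + 1)]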