├── .gitignore ├── LICENSE.txt ├── README.md ├── _config.yml ├── check.py ├── example ├── es_conf.toml └── mongo_conf.toml ├── mongosync ├── __init__.py ├── command_options.py ├── common_syncer.py ├── config.py ├── config_file.py ├── data_filter.py ├── doc_utils.py ├── es │ ├── __init__.py │ ├── handler.py │ └── syncer.py ├── logger.py ├── mongo │ ├── __init__.py │ ├── handler.py │ └── syncer.py ├── mongo_utils.py ├── multi_oplog_replayer.py ├── optime_logger.py └── progress_logger.py ├── requirements.txt └── sync.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # 39 | *.pyc 40 | tags 41 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2020 caosiyang 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py-mongo-sync 2 | 3 | It synchronizes data from a replica set to another MongoDB deployment, e.g., standalone, replica set, and sharded cluster. 4 | 5 | It's oplog-based and provides a realtime data synchronization. 6 | 7 | It's written in Python 2.7. 8 | 9 | ## Support 10 | 11 | - MongoDB 2.4 12 | - MongoDB 2.6 13 | - MongoDB 3.0 14 | - MongoDB 3.2 15 | - MongoDB 3.4 16 | 17 | ## Features 18 | 19 | - initial sync and oplog based incremental sync 20 | - sync the specified databases and collections 21 | - concurrent oplog replaying 22 | 23 | ## Requirements 24 | 25 | See [requirements](./requirements.txt) for details. 26 | 27 | - gevent 28 | - toml 29 | - mmh3 30 | - pymongo 31 | 32 | Always use pymongo 3.5.1. 
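A quick sanity check at startup can catch a mismatched driver early. The snippet below is only an illustration and is not part of the project:

```python
# Illustrative only: abort early if the installed pymongo is not the pinned release.
import pymongo

if pymongo.version != '3.5.1':
    raise RuntimeError('py-mongo-sync expects pymongo 3.5.1, found %s' % pymongo.version)
```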
33 |
34 | Refer to [https://api.mongodb.com/python/3.6.0/changelog.html](https://api.mongodb.com/python/3.6.0/changelog.html)
35 |
36 | > Version 3.6 adds support for MongoDB 3.6, drops support for CPython 3.3 (PyPy3 is still supported), and drops support for MongoDB versions older than 2.6. If connecting to a MongoDB 2.4 server or older, PyMongo now throws a ConfigurationError.
37 |
38 | ## Notice
39 |
40 | - source **MUST** be a replica set
41 | - system databases are ignored
42 |     - admin
43 |     - local
44 | - system collections are ignored
45 |     - system.\*
46 | - create users on the destination manually if necessary
47 | - authenticate as an administrator if authentication is enabled on the source
48 | - geospatial indexes are not supported
49 |
50 | If the source is a sharded cluster:
51 |
52 | - first, stop the balancer
53 | - then, start a separate sync process for each shard
54 |
55 | ## Configurations
56 |
57 | Use [TOML](https://github.com/toml-lang/toml) as the configuration file format.
58 |
59 | Refer to [mongo_conf.toml](example/mongo_conf.toml).
60 |
61 | ### src
62 |
63 | Source config items.
64 |
65 | - src.hosts - hostportstr of a member of the replica set
66 | - src.username - username
67 | - src.password - password
68 | - src.authdb - authentication database
69 |
70 | ### dst
71 |
72 | Destination config items (set `dst.type = "es"` for an Elasticsearch destination; MongoDB is the default).
73 |
74 | - dst.hosts
75 | - dst.authdb
76 | - dst.username
77 | - dst.password
78 |
79 | ### sync
80 |
81 | Custom options for synchronization.
82 |
83 | `sync.dbs` specifies the databases to sync.
84 | `sync.dbs.colls` specifies the collections to sync.
85 |
86 | - sync.dbs - databases to sync; all databases are synced if not specified
87 | - sync.dbs.db - source database name
88 | - sync.dbs.rename_db - destination database name; stays the same if not specified
89 | - sync.dbs.colls - collections to sync; all collections are synced if not specified
90 |
91 | `coll` in a `sync.dbs.colls` element specifies the collection to sync.
92 | `fields` in a `sync.dbs.colls` element specifies the fields of that collection to sync.
93 |
94 | ### log
95 |
96 | - log.filepath - log file path; write to stdout if empty or not set
97 |
98 | ## Usage
99 |
100 | Command options have functional limitations.
101 | It is strongly recommended to use a config file.
102 |
103 | ### sync
104 |
105 | ```bash
106 | usage: sync.py [-h] [-f [CONFIG]] [--src [SRC]] [--src-authdb [SRC_AUTHDB]]
107 |                [--src-username [SRC_USERNAME]] [--src-password [SRC_PASSWORD]]
108 |                [--dst [DST]] [--dst-authdb [DST_AUTHDB]]
109 |                [--dst-username [DST_USERNAME]] [--dst-password [DST_PASSWORD]]
110 |                [--start-optime [START_OPTIME]]
111 |                [--optime-logfile [OPTIME_LOGFILE]] [--logfile [LOGFILE]]
112 |
113 | Sync data from a replica-set to another MongoDB/Elasticsearch.
114 | 115 | optional arguments: 116 | -h, --help show this help message and exit 117 | -f [CONFIG], --config [CONFIG] 118 | configuration file, note that command options will 119 | override items in config file 120 | --src [SRC] source should be hostportstr of a replica-set member 121 | --src-authdb [SRC_AUTHDB] 122 | src authentication database, default is 'admin' 123 | --src-username [SRC_USERNAME] 124 | src username 125 | --src-password [SRC_PASSWORD] 126 | src password 127 | --dst [DST] destination should be hostportstr of a mongos or 128 | mongod instance 129 | --dst-authdb [DST_AUTHDB] 130 | dst authentication database, default is 'admin', for 131 | MongoDB 132 | --dst-username [DST_USERNAME] 133 | dst username, for MongoDB 134 | --dst-password [DST_PASSWORD] 135 | dst password, for MongoDB 136 | --start-optime [START_OPTIME] 137 | timestamp in second, indicates oplog based increment 138 | sync 139 | --optime-logfile [OPTIME_LOGFILE] 140 | optime log file path, use this as start optime if 141 | without '--start-optime' 142 | --logfile [LOGFILE] log file path 143 | 144 | ``` 145 | 146 | ## TODO List 147 | 148 | - [ ] command options tuning 149 | - [ ] config file format tuning 150 | - [ ] sync sharding config (enableSharding & shardCollection) 151 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-hacker -------------------------------------------------------------------------------- /check.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pymongo 3 | from mongosync.command_options import CheckCommandOptions 4 | 5 | 6 | def connect(uri): 7 | mc = pymongo.MongoClient(uri, connect=True, serverSelectionTimeoutMS=3000) 8 | return mc 9 | 10 | 11 | def info(s): 12 | print s 13 | 14 | 15 | def warn(s): 16 | print '\033[01;33;40m%s\033[0m' % s 17 | 18 | 19 | def error(s): 20 | print '\033[01;31;40m%s\033[0m' % s 21 | 22 | 23 | def get_standard_index_name(index_items): 24 | """ User can specify any name for a index. 25 | We should generate a standard name for a index and then compare them. 
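    e.g. (illustrative) an index spec of [('uid', 1), ('ts', -1)] is normalized to
    'uid_1_ts_-1', so indexes created with different user-defined names can still be compared.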
26 | """ 27 | index_keys = [] 28 | for key, direction in index_items['key']: 29 | if isinstance(direction, int) or isinstance(direction, long) or isinstance(direction, float): 30 | index_keys.append('%s_%d' % (key, int(direction))) 31 | elif isinstance(direction, str) or isinstance(direction, unicode): 32 | index_keys.append('%s_%s' % (key, direction)) 33 | else: 34 | print 'invalid direction for', index_items['key'] 35 | sys.exit(1) 36 | return '_'.join(index_keys) 37 | 38 | 39 | ignore_dbs = ['admin', 'config', 'local'] 40 | ignore_colls = ['system.users', 'system.profile'] 41 | 42 | 43 | if __name__ == '__main__': 44 | conf = CheckCommandOptions.parse() 45 | 46 | print '=' * 48 47 | print 'origin : %s' % conf.src_uri 48 | print 'target : %s' % conf.dst_uri 49 | print 'dbs : %s' % conf.dbs 50 | print 'src db : %s' % conf.src_db 51 | print 'dst db : %s' % conf.dst_db 52 | print '=' * 48 53 | 54 | src_mc = connect(conf.src_uri) 55 | dst_mc = connect(conf.dst_uri) 56 | 57 | rename_db_mode = False 58 | if conf.src_db and conf.dst_db: 59 | assert conf.dbs == [] 60 | conf.dbs.append(conf.src_db) 61 | rename_db_mode = True 62 | else: 63 | conf.dbs = src_mc.database_names() 64 | 65 | src_version = src_mc['admin'].command('serverStatus')['version'] 66 | dst_version = dst_mc['admin'].command('serverStatus')['version'] 67 | 68 | if src_version.startswith('2') and dst_version.startswith('3'): 69 | ignore_colls.append('system.indexes') 70 | 71 | # check data 72 | data_pass = True 73 | print '-' * 88 74 | print '%s%s%s%s' % ('RESULT'.ljust(8), 'COLL'.ljust(48), 'ORIGIN'.rjust(16), 'TARGET'.rjust(16)) 75 | print '-' * 88 76 | for dbname in sorted(src_mc.database_names()): 77 | if dbname in ignore_dbs: 78 | continue 79 | if dbname not in conf.dbs: 80 | continue 81 | for collname in sorted(src_mc[dbname].collection_names(include_system_collections=False)): 82 | if collname in ignore_colls: 83 | continue 84 | if rename_db_mode: 85 | assert dbname == conf.src_db 86 | ddb = conf.dst_db 87 | else: 88 | ddb = dbname 89 | src_coll_cnt = src_mc[dbname][collname].count() 90 | dst_coll_cnt = dst_mc[ddb][collname].count() 91 | if src_coll_cnt == dst_coll_cnt: 92 | res = 'OK' 93 | info('%s%s%s%s' % (res.ljust(8), (dbname + '.' + collname).ljust(48), str(src_coll_cnt).rjust(16), str(dst_coll_cnt).rjust(16))) 94 | else: 95 | res = 'ERR' 96 | data_pass = False 97 | warn('%s%s%s%s' % (res.ljust(8), (dbname + '.' 
+ collname).ljust(48), str(src_coll_cnt).rjust(16), str(dst_coll_cnt).rjust(16))) 98 | print '-' * 96 99 | 100 | # check index 101 | index_pass = True 102 | print '-' * 120 103 | print '%s%s%s' % ('RES'.ljust(8), 'COLL'.ljust(48), 'INDEX'.rjust(64)) 104 | print '-' * 120 105 | for dbname in sorted(src_mc.database_names()): 106 | if dbname in ignore_dbs: 107 | continue 108 | if dbname not in conf.dbs: 109 | continue 110 | for collname in sorted(src_mc[dbname].collection_names()): 111 | if collname in ignore_colls: 112 | continue 113 | if rename_db_mode: 114 | assert dbname == conf.src_db 115 | ddb = conf.dst_db 116 | else: 117 | ddb = dbname 118 | src_index_info = src_mc[dbname][collname].index_information() 119 | dst_index_info = dst_mc[ddb][collname].index_information() 120 | src_index_names = set() 121 | dst_index_names = set() 122 | for index_items in src_index_info.itervalues(): 123 | index_name = get_standard_index_name(index_items) 124 | src_index_names.add(index_name) 125 | for index_items in dst_index_info.itervalues(): 126 | index_name = get_standard_index_name(index_items) 127 | dst_index_names.add(index_name) 128 | for index_name in src_index_names: 129 | if index_name in dst_index_names: 130 | res = 'OK' 131 | info('%s%s%s' % (res.ljust(8), (dbname + '.' + collname).ljust(48), index_name.rjust(64))) 132 | else: 133 | res = 'ERR' 134 | index_pass = False 135 | warn('%s%s%s' % (res.ljust(8), (dbname + '.' + collname).ljust(48), index_name.rjust(64))) 136 | print '-' * 120 137 | 138 | if data_pass: 139 | info('data: SUCCESS') 140 | else: 141 | error('data: FAILED') 142 | 143 | if index_pass: 144 | info('index: SUCCESS') 145 | else: 146 | error('index: FAILED') 147 | -------------------------------------------------------------------------------- /example/es_conf.toml: -------------------------------------------------------------------------------- 1 | # source config 2 | [src] 3 | hosts = "127.0.0.1:27017" # hostportstr of a member of replica set 4 | authdb = "admin" 5 | username = "yourusername" 6 | password = "yourpassword" 7 | 8 | # destination config 9 | [dst] 10 | type = "es" 11 | hosts = [ 12 | "your_es_host1:9200", 13 | "your_es_host2:9200", 14 | "your_es_host3:9200", 15 | "your_es_host4:9200", 16 | "your_es_host5:9200" 17 | ] 18 | 19 | # sync config 20 | [sync] 21 | 22 | # sync database "test0" 23 | [[sync.dbs]] 24 | db = "test0" 25 | 26 | # sync database (test1 => test11) 27 | [[sync.dbs]] 28 | db = "test1" 29 | rename_db = "test11" 30 | 31 | # sync collections (test2.coll0 => test22.coll0, test2.coll1 => test22.coll1) 32 | [[sync.dbs]] 33 | db = "test2" 34 | rename_db = "test22" 35 | colls = [ "coll0", "coll1" ] 36 | 37 | # sync collections with specfied fields 38 | [[sync.dbs]] 39 | db = "test3" 40 | rename_db = "test33" 41 | colls = [ 42 | { coll = "coll2", fields = [ "f0", "f1" ] }, 43 | { coll = "coll3", fields = [ "f2", "f3" ] } 44 | ] 45 | 46 | # log config 47 | [log] 48 | filepath = "sync.log" # write to stdout if empty or not set 49 | -------------------------------------------------------------------------------- /example/mongo_conf.toml: -------------------------------------------------------------------------------- 1 | # source config 2 | [src] 3 | hosts = "127.0.0.1:27017" # hostportstr of a member of replica set 4 | authdb = "admin" 5 | username = "yourusername" 6 | password = "yourpassword" 7 | 8 | # destination config 9 | [dst] 10 | hosts = "127.0.0.1:27018" # hostportstr of standalone, mongos or a member of replica set 11 | authdb = "admin" 12 | username 
= "yourusername" 13 | password = "yourpassword" 14 | 15 | # sync config 16 | [sync] 17 | # dbs specifies databases to sync 18 | # colls specifies collections to sync 19 | # if not set dbs, sync all collections 20 | dbs = [ 21 | # sync a database (test0) 22 | { db = "test0" }, 23 | 24 | # sync collections (test1.coll0, test1.coll1) 25 | { db = "test1", colls = [ "coll0", "coll1" ] }, 26 | 27 | # sync a database and rename (test2 => test22) 28 | { db = "test2", rename_db = "test22" }, 29 | 30 | # sync collections and rename (test3.coll2 => test33.coll2, test3.coll3 => test33.coll3) 31 | { db = "test3", rename_db = "test33", colls = [ "coll2", "coll3" ] } 32 | ] 33 | 34 | # log config 35 | [log] 36 | filepath = "sync.log" # write to stdout if empty or not set 37 | -------------------------------------------------------------------------------- /mongosync/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caosiyang/py-mongo-sync/fda1a798f976d41e47e993c618bcc0f2e60c7449/mongosync/__init__.py -------------------------------------------------------------------------------- /mongosync/command_options.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from bson.timestamp import Timestamp 4 | from mongosync.config import Config, CheckConfig 5 | from mongosync.config_file import ConfigFile 6 | from mongosync.mongo_utils import parse_hostportstr 7 | from mongosync.optime_logger import OptimeLogger 8 | 9 | 10 | class CommandOptions(object): 11 | """ Command options. 12 | """ 13 | @staticmethod 14 | def parse(): 15 | """ Parse command options and generate config. 16 | """ 17 | conf = Config() 18 | 19 | parser = argparse.ArgumentParser(description='Sync data from a replica-set to another MongoDB/Elasticsearch.') 20 | parser.add_argument('-f', '--config', nargs='?', required=False, help='configuration file, note that command options will override items in config file') 21 | parser.add_argument('--src', nargs='?', required=False, help='source should be hostportstr of a replica-set member') 22 | parser.add_argument('--src-authdb', nargs='?', required=False, help="src authentication database, default is 'admin'") 23 | parser.add_argument('--src-username', nargs='?', required=False, help='src username') 24 | parser.add_argument('--src-password', nargs='?', required=False, help='src password') 25 | parser.add_argument('--dst', nargs='?', required=False, help='destination should be hostportstr of a mongos or mongod instance') 26 | parser.add_argument('--dst-authdb', nargs='?', required=False, help="dst authentication database, default is 'admin', for MongoDB") 27 | parser.add_argument('--dst-username', nargs='?', required=False, help='dst username, for MongoDB') 28 | parser.add_argument('--dst-password', nargs='?', required=False, help='dst password, for MongoDB') 29 | parser.add_argument('--start-optime', type=int, nargs='?', required=False, help='timestamp in second, indicates oplog based increment sync') 30 | parser.add_argument('--optime-logfile', nargs='?', required=False, help="optime log file path, use this as start optime if without '--start-optime'") 31 | parser.add_argument('--logfile', nargs='?', required=False, help='log file path') 32 | 33 | args = parser.parse_args() 34 | 35 | if args.config is not None: 36 | conf = ConfigFile.load(args.config) 37 | if args.src is not None: 38 | conf.src_conf.hosts = args.src 39 | if args.src_authdb is not None: 40 | 
conf.src_conf.authdb = args.src_authdb 41 | if args.src_username is not None: 42 | conf.src_conf.username = args.src_username 43 | if args.src_password is not None: 44 | conf.src_conf.password = args.src_password 45 | if args.dst is not None: 46 | conf.dst_conf.hosts = args.dst 47 | if args.dst_authdb is not None: 48 | conf.dst_conf.authdb = args.dst_authdb 49 | if args.dst_username is not None: 50 | conf.dst_conf.username = args.dst_username 51 | if args.dst_password is not None: 52 | conf.dst_conf.password = args.dst_password 53 | if args.start_optime is not None: 54 | conf.start_optime = Timestamp(args.start_optime, 0) 55 | if args.optime_logfile is not None: 56 | conf.optime_logfilepath = args.optime_logfile 57 | if args.start_optime is None: 58 | optime_logger = OptimeLogger(args.optime_logfile) 59 | conf.start_optime = optime_logger.read() 60 | if args.logfile is not None: 61 | conf.logfilepath = args.logfile 62 | 63 | return conf 64 | 65 | 66 | class CheckCommandOptions(object): 67 | """ Check command options. 68 | """ 69 | @staticmethod 70 | def parse(): 71 | """ Parse command options and generate config. 72 | """ 73 | parser = argparse.ArgumentParser(description='Check data consistency including data and indexes.') 74 | parser.add_argument('--origin', nargs='?', metavar='URI', required=True, help='origin mongos URI') 75 | parser.add_argument('--target', nargs='?', metavar='URI', required=True, help='target mongos URI') 76 | parser.add_argument('--dbs', nargs='+', required=False, help='databases to check') 77 | parser.add_argument('--src-db', nargs='?', required=False, help="database to check in origin, work with '--dst-db', conflicts with '--dbs'") 78 | parser.add_argument('--dst-db', nargs='?', required=False, help="database to check in target, work with '--src-db', conflicts with '--dbs'") 79 | 80 | args = vars(parser.parse_args()) 81 | 82 | conf = CheckConfig() 83 | conf.src_uri = args['origin'] 84 | conf.dst_uri = args['target'] 85 | if args['dbs'] is not None: 86 | conf.dbs = args['dbs'] 87 | if args['src_db'] is not None: 88 | conf.src_db = args['src_db'] 89 | if args['dst_db'] is not None: 90 | conf.dst_db = args['dst_db'] 91 | 92 | if conf.dbs and (conf.src_db or conf.dst_db): 93 | print "Terminated, conflict command options found" 94 | sys.exit(1) 95 | if conf.src_db and not conf.dst_db: 96 | print "Terminated, require command option '--dst-db'" 97 | sys.exit(1) 98 | if conf.dst_db and not conf.src_db: 99 | print "Terminated, require command option '--src-db'" 100 | sys.exit(1) 101 | 102 | return conf 103 | -------------------------------------------------------------------------------- /mongosync/common_syncer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import datetime 4 | import exceptions 5 | import gevent 6 | from mongosync.config import Config 7 | from mongosync.logger import Logger 8 | from mongosync.mongo_utils import get_optime 9 | from mongosync.optime_logger import OptimeLogger 10 | from mongosync.progress_logger import LoggerThread 11 | 12 | log = Logger.get() 13 | 14 | 15 | class Stage(object): 16 | """ 17 | - post_initial_sync 18 | Catching up oplogs until to time point that initial sync was done. 19 | If unique index existed, duplicate key error might occur in this stage. 20 | """ 21 | stopped = 0 22 | initial_sync = 1 23 | post_initial_sync = 2 24 | oplog_sync = 3 25 | 26 | 27 | class CommonSyncer(object): 28 | """ Common database synchronizer. 
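    It collects the collections selected by the data filter, runs the initial sync
    (small collections concurrently, large collections split into partitions) and
    then replays the oplog; the storage-specific work is delegated to subclasses.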
29 | 30 | Specific database synchronizer should implement the following methods: 31 | - __init__ 32 | - _initial_sync 33 | - _sync_collection 34 | - _sync_large_collection 35 | - _replay_oplog 36 | """ 37 | def __init__(self, conf): 38 | if not isinstance(conf, Config): 39 | raise RuntimeError('invalid config type') 40 | self._conf = conf 41 | 42 | self._ignore_dbs = ['admin', 'local'] 43 | self._ignore_colls = ['system.indexes', 'system.profile', 'system.users'] 44 | 45 | if conf.optime_logfilepath: 46 | self._optime_logger = OptimeLogger(conf.optime_logfilepath) 47 | else: 48 | self._optime_logger = None 49 | self._optime_log_interval = 10 # default 10s 50 | self._last_optime = None # optime of the last oplog was applied 51 | self._last_optime_logtime = time.time() 52 | 53 | self._log_interval = 2 # default 2s 54 | self._last_logtime = time.time() # use in oplog replay 55 | 56 | # for large collections 57 | self._n_workers = 8 # multi-process 58 | self._large_coll_docs = 1000000 # 100w 59 | 60 | self._initial_sync_start_optime = None 61 | self._initial_sync_end_optime = None 62 | 63 | self._stage = Stage.stopped 64 | self._oplog_batchsize = 1000 65 | 66 | @property 67 | def from_to(self): 68 | return "%s => %s" % (self._conf.src_hostportstr, self._conf.dst_hostportstr) 69 | 70 | @property 71 | def log_interval(self): 72 | return self._log_interval 73 | 74 | @log_interval.setter 75 | def log_interval(self, n_secs): 76 | if n_secs < 0: 77 | n_secs = 0 78 | self._log_interval = n_secs 79 | 80 | def run(self): 81 | """ Start to sync. 82 | """ 83 | # never drop database automatically 84 | # clear data manually if necessary 85 | try: 86 | self._sync() 87 | except exceptions.KeyboardInterrupt: 88 | log.info('keyboard interrupt') 89 | 90 | def _sync(self): 91 | """ Sync databases and oplog. 92 | """ 93 | if self._conf.start_optime: 94 | log.info("locating oplog, it will take a while") 95 | doc = self._src.client()['local']['oplog.rs'].find_one({'ts': {'$gte': self._conf.start_optime}}) 96 | if not doc: 97 | log.error('oplog is stale') 98 | return 99 | start_optime = doc['ts'] 100 | log.info('start timestamp is %s actually' % start_optime) 101 | self._stage = Stage.oplog_sync 102 | self._replay_oplog(start_optime) 103 | else: 104 | # initial sync 105 | log.info('step into stage: initial_sync') 106 | self._initial_sync_start_optime = get_optime(self._src.client()) 107 | self._stage = Stage.initial_sync 108 | self._initial_sync() 109 | 110 | # markup post initial sync 111 | log.info('step into stage: post_initial_sync') 112 | self._stage = Stage.post_initial_sync 113 | self._initial_sync_end_optime = get_optime(self._src.client()) 114 | 115 | # oplog sync 116 | if self._optime_logger: 117 | self._optime_logger.write(self._initial_sync_start_optime) 118 | self._replay_oplog(self._initial_sync_start_optime) 119 | 120 | def _collect_colls(self): 121 | """ Collect collections to sync. 122 | """ 123 | colls = [] 124 | for dbname in self._src.client().database_names(): 125 | if dbname in self._ignore_dbs: 126 | continue 127 | if not self._conf.data_filter.valid_db(dbname): 128 | continue 129 | for collname in self._src.client()[dbname].collection_names(include_system_collections=False): 130 | if collname in self._ignore_colls: 131 | continue 132 | if not self._conf.data_filter.valid_coll(dbname, collname): 133 | continue 134 | colls.append((dbname, collname)) 135 | return colls 136 | 137 | def _split_coll(self, namespace_tuple, n_partitions): 138 | """ Split a collection into n partitions. 
139 | 140 | Return a list of split points. 141 | 142 | splitPointCount = partitionCount - 1 143 | splitPointCount = keyTotalCount / (keyCount + 1) 144 | keyCount = maxChunkSize / (2 * avgObjSize) 145 | => 146 | maxChunkSize = (keyTotalCount / (partionCount - 1) - 1) * 2 * avgObjSize 147 | 148 | Note: maxChunkObjects is default 250000. 149 | """ 150 | if n_partitions <= 1: 151 | raise RuntimeError('n_partitions need greater than 1, but %s' % n_partitions) 152 | 153 | dbname, collname = namespace_tuple 154 | ns = '.'.join(namespace_tuple) 155 | db = self._src.client()[dbname] 156 | collstats = db.command('collstats', collname) 157 | 158 | if 'avgObjSize' not in collstats: # empty collection 159 | return [] 160 | 161 | n_points = n_partitions - 1 162 | max_chunk_size = ((collstats['count'] / (n_partitions - 1) - 1) * 2 * collstats['avgObjSize']) / 1024 / 1024 163 | 164 | if max_chunk_size <= 0: 165 | return [] 166 | 167 | res = db.command('splitVector', ns, keyPattern={'_id': 1}, maxSplitPoints=n_points, maxChunkSize=max_chunk_size, maxChunkObjects=collstats['count']) 168 | 169 | if res['ok'] != 1: 170 | return [] 171 | else: 172 | return [doc['_id'] for doc in res['splitKeys']] 173 | 174 | def _initial_sync(self): 175 | """ Initial sync. 176 | """ 177 | def classify(ns_tuple, large_colls, small_colls): 178 | """ Find out large and small collections. 179 | """ 180 | if self._is_large_collection(ns_tuple): 181 | points = self._split_coll(ns_tuple, self._n_workers) 182 | if points: 183 | large_colls.append((ns_tuple, points)) 184 | else: 185 | small_colls.append(ns_tuple) 186 | else: 187 | small_colls.append(ns_tuple) 188 | 189 | large_colls = [] 190 | small_colls = [] 191 | 192 | pool = gevent.pool.Pool(8) 193 | colls = self._collect_colls() 194 | for ns in colls: 195 | dbname, collname = ns 196 | log.info('%d\t%s.%s' % (self._src.client()[dbname][collname].count(), dbname, collname)) 197 | pool.spawn(classify, ns, large_colls, small_colls) 198 | pool.join() 199 | 200 | if len(large_colls) + len(small_colls) != len(colls): 201 | raise RuntimeError('classify collections error') 202 | 203 | log.info('large collections: %s' % ['.'.join(ns) for ns, points in large_colls]) 204 | log.info('small collections: %s' % ['.'.join(ns) for ns in small_colls]) 205 | 206 | # create progress logger 207 | self._progress_logger = LoggerThread(len(colls)) 208 | self._progress_logger.start() 209 | 210 | # small collections first 211 | pool = gevent.pool.Pool(8) 212 | for res in pool.imap(self._sync_collection, small_colls): 213 | if res is not None: 214 | sys.exit(1) 215 | 216 | # then large collections 217 | for ns, points in large_colls: 218 | self._sync_large_collection(ns, points) 219 | 220 | def _sync_collection(self, namespace_tuple): 221 | """ Sync a collection until success. 222 | """ 223 | raise NotImplementedError('you should implement %s.%s' % (self.__class__.__name__, self._sync_collection.__name__)) 224 | 225 | def _is_large_collection(self, namespace_tuple): 226 | """ Check if large collection or not. 227 | """ 228 | dbname, collname = namespace_tuple 229 | return self._src.client()[dbname][collname].count() > self._large_coll_docs 230 | 231 | def _sync_large_collection(self, namespace_tuple): 232 | """ Sync large collection until success. 233 | """ 234 | raise NotImplementedError('you should implement %s.%s' % (self.__class__.__name__, self._sync_large_collection.__name__)) 235 | 236 | def _replay_oplog(self, oplog_start): 237 | """ Replay oplog. 
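        Concrete syncers are expected to tail 'local.oplog.rs' on the source from
        'oplog_start' and apply every matching entry to the destination.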
238 | """ 239 | raise NotImplementedError('you should implement %s.%s' % (self.__class__.__name__, self._replay_oplog.__name__)) 240 | 241 | def _log_progress(self, tag=''): 242 | """ Print progress periodically. 243 | """ 244 | now = time.time() 245 | if now - self._last_logtime >= self._log_interval: 246 | delay = now - self._last_optime.time 247 | time_unit = 'second' if delay <= 1 else 'seconds' 248 | if tag: 249 | log.info('%s - sync to %s - %d %s delay - %s - %s' % (self.from_to, 250 | datetime.datetime.fromtimestamp(self._last_optime.time), 251 | delay, 252 | time_unit, 253 | self._last_optime, 254 | tag)) 255 | else: 256 | log.info('%s - sync to %s - %d %s delay - %s' % (self.from_to, 257 | datetime.datetime.fromtimestamp(self._last_optime.time), 258 | delay, 259 | time_unit, 260 | self._last_optime)) 261 | self._last_logtime = now 262 | 263 | def _log_optime(self, optime): 264 | """ Record optime periodically. 265 | """ 266 | if not self._optime_logger: 267 | return 268 | now = time.time() 269 | if now - self._last_optime_logtime >= self._optime_log_interval: 270 | self._optime_logger.write(optime) 271 | self._last_optime_logtime = now 272 | log.info("flush optime into file '%s': %s" % (self._optime_logger.filepath, optime)) 273 | -------------------------------------------------------------------------------- /mongosync/config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | import pymongo 4 | from mongosync.mongo_utils import get_version 5 | from mongosync.data_filter import DataFilter 6 | 7 | 8 | class CheckConfig(object): 9 | def __init__(self): 10 | self.src_uri = '' 11 | self.dst_uri = '' 12 | self.dbs = [] 13 | self.src_db = '' 14 | self.dst_db = '' 15 | 16 | 17 | class MongoConfig(object): 18 | def __init__(self, hosts, authdb, username, password): 19 | self.hosts = hosts 20 | self.authdb = authdb 21 | self.username = username 22 | self.password = password 23 | 24 | 25 | class EsConfig(object): 26 | def __init__(self, hosts): 27 | self.hosts = hosts 28 | 29 | 30 | class Config(object): 31 | """ Configuration. 
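    Holds the source/destination connection settings, the data filter, the database
    rename mapping (dbmap), the per-collection field projection (fieldmap) and the
    start optime / log file options.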
32 | """ 33 | def __init__(self): 34 | self.src_conf = None 35 | self.dst_conf = None 36 | 37 | self.data_filter = DataFilter() 38 | 39 | # rename mapping 40 | self.dbmap = {} 41 | 42 | # fields {'ns' : frozenset(['field0', 'field1'])} 43 | self.fieldmap = {} 44 | 45 | self.start_optime = None 46 | self.optime_logfilepath = '' 47 | self.logfilepath = '' 48 | 49 | @property 50 | def src_hostportstr(self): 51 | return self.hostportstr(self.src_conf.hosts) 52 | 53 | @property 54 | def dst_hostportstr(self): 55 | return self.hostportstr(self.dst_conf.hosts) 56 | 57 | @property 58 | def dbmap_str(self): 59 | return ', '.join(['%s => %s' % (k, v) for k, v in self.dbmap.iteritems()]) 60 | 61 | @property 62 | def fieldmap_str(self): 63 | return ', '.join(['%s {%s}' % (k, ', '.join(v)) for k, v in self.fieldmap.iteritems()]) 64 | 65 | def db_mapping(self, dbname): 66 | mapping_dbname = self.dbmap.get(dbname.strip()) 67 | return mapping_dbname if mapping_dbname else dbname 68 | 69 | def db_coll_mapping(self, dbname, collname): 70 | return self.db_mapping(dbname.strip()), collname.strip() 71 | 72 | def ns_mapping(self, dbname, collname): 73 | return '%s.%s' % (self.db_mapping(dbname.strip()), collname.strip()) 74 | 75 | def hostportstr(self, hosts): 76 | if isinstance(hosts, str) or isinstance(hosts, unicode): 77 | return hosts 78 | elif isinstance(hosts, list): 79 | return ', '.join(hosts) 80 | 81 | def info(self, logger): 82 | """ Output to logfile or stdout. 83 | """ 84 | if isinstance(logger, logging.Logger): 85 | f = lambda s: logger.info(s) 86 | elif isinstance(logger, file): 87 | f = lambda s: logger.write('%s\n' % s) 88 | else: 89 | raise Exception('error logger') 90 | 91 | f('================================================') 92 | f('src hostportstr : %s' % self.src_hostportstr) 93 | f('src authdb : %s' % self.src_conf.authdb) 94 | f('src username : %s' % self.src_conf.username) 95 | f('src password : %s' % self.src_conf.password) 96 | if isinstance(self.src_conf.hosts, str) or isinstance(self.src_conf.hosts, unicode): 97 | f('src db version : %s' % get_version(self.src_conf.hosts)) 98 | 99 | f('dst hostportstr : %s' % self.dst_hostportstr) 100 | if isinstance(self.dst_conf, MongoConfig): 101 | if isinstance(self.dst_conf.hosts, str) or isinstance(self.dst_conf.hosts, unicode): 102 | f('dst authdb : %s' % self.dst_conf.authdb) 103 | f('dst username : %s' % self.dst_conf.username) 104 | f('dst password : %s' % self.dst_conf.password) 105 | f('dst db version : %s' % get_version(self.dst_conf.hosts)) 106 | 107 | f('databases : %s' % ', '.join(self.data_filter._related_dbs)) 108 | f('collections : %s' % ', '.join(self.data_filter._include_colls)) 109 | f('db mapping : %s' % self.dbmap_str) 110 | f('fileds : %s' % self.fieldmap_str) 111 | 112 | f('start optime : %s' % self.start_optime) 113 | f('optime logfile : %s' % self.optime_logfilepath) 114 | f('log filepath : %s' % self.logfilepath) 115 | f('pymongo version : %s' % pymongo.version) 116 | f('================================================') 117 | -------------------------------------------------------------------------------- /mongosync/config_file.py: -------------------------------------------------------------------------------- 1 | import toml 2 | from bson.timestamp import Timestamp 3 | from mongosync.config import Config, MongoConfig, EsConfig 4 | from mongosync.mongo_utils import gen_namespace 5 | 6 | 7 | class ConfigFile(object): 8 | @staticmethod 9 | def load(filepath): 10 | """ Load config file and generate conf. 
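        The TOML file must provide [src] and [dst] tables; the optional [sync] and
        [log] tables control database/collection filtering, field projection and
        logging (see example/mongo_conf.toml and example/es_conf.toml).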
11 | """ 12 | conf = Config() 13 | tml = toml.load(filepath) 14 | conf.src_conf = MongoConfig(tml['src']['hosts'], 15 | tml['src'].get('authdb', 'admin'), 16 | tml['src'].get('username', ''), 17 | tml['src'].get('password', '')) 18 | 19 | if type not in tml['dst'] or tml['dst']['type'] == 'mongo': 20 | conf.dst_conf = MongoConfig(tml['dst']['hosts'], 21 | tml['dst'].get('authdb', 'admin'), 22 | tml['dst'].get('username', ''), 23 | tml['dst'].get('password', '')) 24 | elif tml['dst']['type'] == 'es': 25 | conf.dst_conf = EsConfig(tml['dst']['hosts']) 26 | else: 27 | raise Exception('invalid dst.type') 28 | 29 | if 'sync' in tml and 'dbs' in tml['sync']: 30 | for dbentry in tml['sync']['dbs']: 31 | if 'db' not in dbentry: 32 | raise Exception("'db' is missing in sync.dbs") 33 | if not dbentry['db']: 34 | raise Exception("'db' is empty in sync.dbs") 35 | dbname = dbentry['db'].strip() 36 | rename_db = dbentry['rename_db'].strip() if 'rename_db' in dbentry else "" 37 | 38 | # update db map 39 | if dbname and rename_db: 40 | if dbname in conf.dbmap: 41 | raise Exception('duplicate dbname in sync.dbs: %s' % dbname) 42 | conf.dbmap[dbname] = rename_db 43 | 44 | if 'colls' in dbentry and dbentry['colls']: 45 | for collentry in dbentry['colls']: 46 | if isinstance(collentry, str) or isinstance(collentry, unicode): 47 | collname = collentry.strip() 48 | ns = gen_namespace(dbname, collname) 49 | conf.data_filter.add_include_coll(ns) 50 | elif isinstance(collentry, dict): 51 | if 'coll' not in collentry: 52 | raise Exception("'coll' is missing in sync.dbs.colls") 53 | if not collentry['coll']: 54 | raise Exception("'coll' is empty in sync.dbs.colls") 55 | 56 | collname = collentry['coll'].strip() 57 | fields = frozenset([f.strip() for f in collentry['fields']] if 'fields' in collentry else []) 58 | 59 | # update coll filter 60 | ns = gen_namespace(dbname, collname) 61 | conf.data_filter.add_include_coll(ns) 62 | 63 | # update fields 64 | if fields: 65 | if ns in conf.fieldmap: 66 | raise Exception("duplicate collname in sync.dbs.colls: %s" % ns) 67 | conf.fieldmap[ns] = fields 68 | else: 69 | raise Exception('invalid entry in sync.dbs.colls: %s' % collentry) 70 | else: 71 | # update coll filter 72 | conf.data_filter.add_include_coll(gen_namespace(dbname, '*')) 73 | 74 | if 'sync' in tml and 'start_optime' in tml['sync']: 75 | conf.start_optime = Timestamp(tml['sync']['start_optime'], 0) 76 | 77 | if 'log' in tml and 'filepath' in tml['log']: 78 | conf.logfilepath = tml['log']['filepath'] 79 | 80 | return conf 81 | -------------------------------------------------------------------------------- /mongosync/data_filter.py: -------------------------------------------------------------------------------- 1 | from mongo_utils import parse_namespace, gen_namespace 2 | 3 | 4 | class DataFilter(object): 5 | """ Filter for database and collection. 
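    An include entry of the form 'db.*' matches every collection of that database,
    while 'db.coll' matches a single collection; an empty filter accepts everything.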
6 | """ 7 | def __init__(self): 8 | self._include_colls = set() 9 | self._related_dbs = set() 10 | 11 | def add_include_coll(self, ns): 12 | self._include_colls.add(ns) 13 | self._related_dbs.add(ns.split('.', 1)[0]) 14 | 15 | def add_include_colls(self, ns_list): 16 | for ns in ns_list: 17 | self.add_include_coll(ns) 18 | 19 | def valid_db(self, dbname): 20 | if not self._related_dbs: 21 | return True 22 | else: 23 | return dbname in self._related_dbs 24 | 25 | def valid_coll(self, dbname, collname): 26 | if not self._include_colls: 27 | return True 28 | else: 29 | if '%s.*' % dbname in self._include_colls: 30 | return True 31 | return gen_namespace(dbname, collname) in self._include_colls 32 | 33 | def valid_ns(self, ns): 34 | dbname, collname = parse_namespace(ns) 35 | return self.valid_coll(dbname, collname) 36 | 37 | def valid_index(self, dbname, collname): 38 | return self.valid_coll(dbname, collname) 39 | 40 | def valid_oplog(self, oplog): 41 | if not self._include_colls: 42 | return True 43 | op = oplog['op'] 44 | ns = oplog['ns'] 45 | if op == 'n': 46 | return False 47 | elif op == 'c': 48 | dbname, _ = parse_namespace(ns) 49 | return dbname in self._related_dbs 50 | else: 51 | return self.valid_ns(ns) 52 | 53 | @property 54 | def active(self): 55 | return True if self._include_colls else False 56 | 57 | @property 58 | def include_colls(self): 59 | return self._include_colls 60 | 61 | 62 | # test case 63 | if __name__ == '__main__': 64 | f = DataFilter() 65 | f.add_include_colls(['db0.*']) 66 | f.add_include_colls(['db1.coll']) 67 | assert f.valid_db('db0') 68 | assert f.valid_db('db1') 69 | assert f.valid_coll('db0', 'coll') 70 | assert f.valid_coll('dbx', 'coll') is False 71 | assert f.valid_coll('db1', 'coll') 72 | assert f.valid_coll('db1', 'collx') is False 73 | assert f.valid_ns('db0.coll') 74 | assert f.valid_ns('dbx.coll') is False 75 | assert f.valid_ns('db1.coll') 76 | assert f.valid_ns('db1.collx') is False 77 | assert f.valid_index('db0', 'coll') 78 | assert f.valid_index('dbx', 'coll') is False 79 | assert f.valid_index('db1', 'coll') 80 | assert f.valid_index('db1', 'collx') is False 81 | 82 | oplog4 = {'op': 'i', 'ns': 'db0.coll'} 83 | oplog5 = {'op': 'u', 'ns': 'db1.coll'} 84 | oplog6 = {'op': 'd', 'ns': 'db1.collx'} 85 | oplog7 = {'op': 'c', 'ns': 'db0.$cmd'} 86 | oplog8 = {'op': 'c', 'ns': 'db1.$cmd'} 87 | oplog9 = {'op': 'c', 'ns': 'dbx.$cmd'} 88 | assert f.valid_oplog(oplog4) 89 | assert f.valid_oplog(oplog5) 90 | assert f.valid_oplog(oplog6) is False 91 | assert f.valid_oplog(oplog7) 92 | assert f.valid_oplog(oplog8) 93 | assert f.valid_oplog(oplog9) is False 94 | 95 | print 'test cases all pass' 96 | -------------------------------------------------------------------------------- /mongosync/doc_utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 4 | def doc_flat_to_nested(key_list, val): 5 | """ Convert a flat keys and value into a nested document. 6 | e.g.: 7 | 8 | { a.b.c: 1 } 9 | 10 | => 11 | 12 | { a: { b: { c: 1 } } } 13 | """ 14 | res = {} 15 | if len(key_list) > 1: 16 | res[key_list[0]] = doc_flat_to_nested(key_list[1:], val) 17 | elif len(key_list) == 1: 18 | res[key_list[0]] = val 19 | else: 20 | raise Exception('invalid key_list @%s' % doc_flat_to_nested.__name__) 21 | return res 22 | 23 | 24 | def get_val_by_flat_keys(doc, key_list): 25 | """ Get value through flat keys from a nested document. 
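    e.g.:

    get_val_by_flat_keys({'a': {'b': {'c': 1}}}, ['a', 'b', 'c']) => 1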
26 | """ 27 | res = None 28 | if len(key_list) > 1: 29 | res = get_val_by_flat_keys(doc[key_list[0]], key_list[1:]) 30 | elif len(key_list) == 1: 31 | res = doc[key_list[0]] 32 | else: 33 | raise Exception('invalid key_list @%s' % get_val_by_flat_keys.__name__) 34 | return res 35 | 36 | 37 | def gen_doc_with_fields(doc, include_fields): 38 | """ Generate document with the specfied fields. 39 | """ 40 | res = {} 41 | for f in include_fields: 42 | try: 43 | keylist = f.split('.') 44 | val = get_val_by_flat_keys(doc, keylist) 45 | nested = doc_flat_to_nested(keylist, val) 46 | res.update(nested) 47 | except KeyError: 48 | pass 49 | return res 50 | 51 | 52 | def merge_doc(doc1, doc2): 53 | """ Merge doc2 into doc1. 54 | """ 55 | for k, v in doc2.iteritems(): 56 | if isinstance(v, collections.Mapping): 57 | doc1[k] = merge_doc(doc1.get(k, {}), v) 58 | else: 59 | doc1[k] = v 60 | return doc1 61 | 62 | 63 | if __name__ == '__main__': 64 | doc = {'a': {'b': {'c': 1, 'd': 2}}} 65 | doc1 = {'a': {'b': {'c': 1}}} 66 | doc2 = {'a': {'b': {'d': 2}}} 67 | 68 | assert doc_flat_to_nested('a.b.c'.split('.'), 1) == doc1 69 | assert doc_flat_to_nested('a.b.d'.split('.'), 2) == doc2 70 | 71 | assert get_val_by_flat_keys(doc1, 'a.b.c'.split('.')) == 1 72 | assert get_val_by_flat_keys(doc2, 'a.b.d'.split('.')) == 2 73 | 74 | assert gen_doc_with_fields(doc, ['a.b.c']) == doc1 75 | assert gen_doc_with_fields(doc, ['a.b.d']) == doc2 76 | 77 | print 'test cases all pass' 78 | -------------------------------------------------------------------------------- /mongosync/es/__init__.py: -------------------------------------------------------------------------------- 1 | raise ImportError('MODULE IS DEPRECATED: %s' % __name__) 2 | -------------------------------------------------------------------------------- /mongosync/es/handler.py: -------------------------------------------------------------------------------- 1 | import time 2 | import elasticsearch 3 | import elasticsearch.helpers 4 | from mongosync.config import EsConfig 5 | from mongosync.logger import Logger 6 | 7 | log = Logger.get() 8 | 9 | 10 | class EsHandler(object): 11 | def __init__(self, conf): 12 | if not isinstance(conf, EsConfig): 13 | raise Exception('expect EsConfig') 14 | self._conf = conf 15 | self._es = None 16 | 17 | def __del__(self): 18 | self.close() 19 | 20 | def connect(self): 21 | self._es = elasticsearch.Elasticsearch(self._conf.hosts, timeout=600) 22 | return self._es.ping() 23 | 24 | def reconnect(self): 25 | while True: 26 | res = self.connect() 27 | if not res: 28 | time.sleep(1) 29 | continue 30 | return 31 | 32 | def close(self): 33 | self._es = None 34 | 35 | def client(self): 36 | return self._es 37 | 38 | def bulk_write(self, actions): 39 | try: 40 | elasticsearch.helpers.bulk(client=self._es, actions=actions) 41 | except Exception as e: 42 | log.error('bulk write failed: %s' % e) 43 | -------------------------------------------------------------------------------- /mongosync/es/syncer.py: -------------------------------------------------------------------------------- 1 | import time 2 | import gevent 3 | import pymongo 4 | import bson 5 | import elasticsearch 6 | import elasticsearch.helpers 7 | from mongosync.logger import Logger 8 | from mongosync.common_syncer import CommonSyncer 9 | from mongosync.config import MongoConfig, EsConfig 10 | from mongosync.doc_utils import gen_doc_with_fields, doc_flat_to_nested, merge_doc 11 | from mongosync.mongo_utils import parse_namespace, gen_namespace 12 | from mongosync.mongo.handler 
import MongoHandler 13 | from mongosync.es.handler import EsHandler 14 | 15 | log = Logger.get() 16 | 17 | 18 | class EsSyncer(CommonSyncer): 19 | """ Elasticsearch synchronizer. 20 | """ 21 | def __init__(self, conf): 22 | CommonSyncer.__init__(self, conf) 23 | 24 | if not isinstance(self._conf.src_conf, MongoConfig): 25 | raise Exception('invalid src config type') 26 | self._src = MongoHandler(self._conf.src_conf) 27 | if not self._src.connect(): 28 | raise Exception('connect to mongodb(src) failed: %s' % self._conf.src_hostportstr) 29 | 30 | if not isinstance(self._conf.dst_conf, EsConfig): 31 | raise Exception('invalid dst config type') 32 | self._dst = EsHandler(self._conf.dst_conf) 33 | if not self._dst.connect(): 34 | raise Exception('connect to elasticsearch(dst) failed: %s' % self._conf.dst_hostportstr) 35 | 36 | self._action_buf = [] # used to bulk write oplogs 37 | self._last_bulk_optime = None 38 | 39 | def _action_buf_full(self): 40 | return len(self._action_buf) >= 40 41 | 42 | def _sync_database(self, dbname): 43 | """ Sync a database. 44 | """ 45 | log.info("sync database '%s'" % dbname) 46 | # create index 47 | idxname = self._conf.db_mapping(dbname) 48 | if self._dst.client().indices.exists(index=idxname): 49 | log.info('index already existed: %s' % idxname) 50 | else: 51 | log.info('create index: %s' % idxname) 52 | self._dst.client().indices.create(index=idxname) 53 | self._sync_collections(dbname) 54 | 55 | def _sync_collection(self, namespace_tuple): 56 | """ Sync a collection until success. 57 | """ 58 | src_dbname, src_collname = namespace_tuple[0], namespace_tuple[1] 59 | idxname, typename = self._conf.db_coll_mapping(src_dbname, src_collname) 60 | fields = self._conf.fieldmap.get(gen_namespace(src_dbname, src_collname)) 61 | 62 | while True: 63 | try: 64 | log.info("sync collection '%s.%s' => '%s.%s'" % (src_dbname, src_collname, idxname, typename)) 65 | cursor = self._src.client()[src_dbname][src_collname].find(filter=None, 66 | cursor_type=pymongo.cursor.CursorType.EXHAUST, 67 | no_cursor_timeout=True, 68 | modifiers={'$snapshot': True}) 69 | count = cursor.count() 70 | if count == 0: 71 | log.info(' skip empty collection') 72 | return 73 | 74 | n = 0 75 | actions = [] 76 | actions_max = 20 77 | groups = [] 78 | groups_max = 10 79 | 80 | for doc in cursor: 81 | id = str(doc['_id']) 82 | del doc['_id'] 83 | source = gen_doc_with_fields(doc, fields) if fields else doc 84 | if source: 85 | actions.append({'_op_type': 'index', '_index': idxname, '_type': typename, '_id': id, '_source': source}) 86 | if len(actions) == actions_max: 87 | groups.append(actions) 88 | actions = [] 89 | if len(groups) == groups_max: 90 | threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(groups_max)] 91 | gevent.joinall(threads, raise_error=True) 92 | groups = [] 93 | 94 | n += 1 95 | if n % 1000 == 0: 96 | log.info(' %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100)) 97 | 98 | if len(groups) > 0: 99 | threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(len(groups))] 100 | gevent.joinall(threads, raise_error=True) 101 | if len(actions) > 0: 102 | elasticsearch.helpers.bulk(client=self._dst.client(), actions=actions) 103 | 104 | log.info(' %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100)) 105 | return 106 | except pymongo.errors.AutoReconnect: 107 | self._src.reconnect() 108 | 109 | def _replay_oplog(self, oplog_start): 110 | """ Replay oplog. 
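        Tails 'local.oplog.rs' from 'oplog_start' and translates insert/update/delete
        entries into Elasticsearch bulk actions ('index'/'update'/'delete'), flushing
        the action buffer whenever it fills up.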
111 | """ 112 | self._last_bulk_optime = oplog_start 113 | 114 | n_total = 0 115 | n_skip = 0 116 | 117 | while True: 118 | # try to get cursor until success 119 | try: 120 | host, port = self._src.client().address 121 | log.info('try to sync oplog from %s on %s:%d' % (self._last_bulk_optime, host, port)) 122 | # set codec options to guarantee the order of keys in command 123 | coll = self._src.client()['local'].get_collection('oplog.rs', 124 | codec_options=bson.codec_options.CodecOptions(document_class=bson.son.SON)) 125 | cursor = coll.find({'ts': {'$gte': oplog_start}}, 126 | cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT, 127 | no_cursor_timeout=True) 128 | 129 | # New in version 3.2 130 | # src_version = mongo_utils.get_version(self._src.client()) 131 | # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'): 132 | # cursor.max_await_time_ms(1000) 133 | 134 | valid_start_optime = False # need to validate 135 | 136 | while True: 137 | try: 138 | if not cursor.alive: 139 | log.error('cursor is dead') 140 | raise pymongo.errors.AutoReconnect 141 | 142 | oplog = cursor.next() 143 | n_total += 1 144 | 145 | if not valid_start_optime: 146 | if oplog['ts'] == oplog_start: 147 | log.info('oplog is ok: %s' % oplog_start) 148 | valid_start_optime = True 149 | else: 150 | log.error('oplog %s is stale, terminate' % oplog_start) 151 | return 152 | 153 | # validate oplog 154 | if not self._conf.data_filter.valid_oplog(oplog): 155 | n_skip += 1 156 | self._last_optime = oplog['ts'] 157 | continue 158 | 159 | op = oplog['op'] 160 | ns = oplog['ns'] 161 | 162 | if op == 'i': # insert 163 | dbname, collname = parse_namespace(ns) 164 | idxname, typename = self._conf.db_coll_mapping(dbname, collname) 165 | fields = self._conf.fieldmap.get(gen_namespace(dbname, collname)) 166 | 167 | doc = oplog['o'] 168 | id = str(doc['_id']) 169 | del doc['_id'] 170 | if fields: 171 | doc = gen_doc_with_fields(doc, fields) 172 | if doc: 173 | self._action_buf.append({'_op_type': 'index', '_index': idxname, '_type': typename, '_id': id, '_source': doc}) 174 | 175 | elif op == 'u': # update 176 | dbname, collname = parse_namespace(ns) 177 | idxname, typename = self._conf.db_coll_mapping(dbname, collname) 178 | fields = self._conf.fieldmap.get(gen_namespace(dbname, collname)) 179 | 180 | id = str(oplog['o2']['_id']) 181 | 182 | if '$set' in oplog['o']: 183 | doc = {} 184 | for k, v in oplog['o']['$set'].iteritems(): 185 | if not fields or k in fields: 186 | sub_doc = doc_flat_to_nested(k.split('.'), v) 187 | merge_doc(doc, sub_doc) 188 | if doc: 189 | self._action_buf.append({'_op_type': 'update', 190 | '_index': idxname, 191 | '_type': typename, 192 | '_id': id, 193 | '_retry_on_conflict': 3, 194 | 'doc': doc, 195 | 'doc_as_upsert': True}) 196 | 197 | if '$unset' in oplog['o']: 198 | script_statements = [] 199 | for keypath in oplog['o']['$unset'].iterkeys(): 200 | if not fields or keypath in fields: 201 | pos = keypath.rfind('.') 202 | if pos >= 0: 203 | script_statements.append('ctx._source.%s.remove("%s")' % (keypath[:pos], keypath[pos+1:])) 204 | else: 205 | script_statements.append('ctx._source.remove("%s")' % keypath) 206 | if script_statements: 207 | doc = {'script': '; '.join(script_statements)} 208 | self._action_buf.append({'_op_type': 'update', 209 | '_index': idxname, 210 | '_type': typename, 211 | '_id': id, 212 | '_retry_on_conflict': 3, 213 | 'script': doc['script']}) 214 | 215 | if '$set' not in oplog['o'] and '$unset' not in oplog['o']: 216 | log.warn('unexpect oplog: %s', oplog['o']) 217 | 
218 | elif op == 'd': # delete 219 | dbname, collname = parse_namespace(ns) 220 | idxname, typename = self._conf.db_coll_mapping(dbname, collname) 221 | id = str(oplog['o']['_id']) 222 | self._action_buf.append({'_op_type': 'delete', '_index': idxname, '_type': typename, '_id': id}) 223 | 224 | elif op == 'c': # command 225 | dbname, _ = parse_namespace(ns) 226 | idxname = self._conf.db_mapping(dbname) 227 | if 'drop' in oplog['o']: 228 | # TODO 229 | # how to delete type? 230 | pass 231 | log.warn('you should implement document type deletion.') 232 | if 'dropDatabase' in oplog['o']: 233 | # delete index 234 | self._dst.client().indices.delete(index=idxname) 235 | 236 | elif op == 'n': # no-op 237 | pass 238 | else: 239 | log.error('invalid optype: %s' % oplog) 240 | 241 | # flush 242 | if self._action_buf_full(): 243 | self._dst.bulk_write(self._action_buf) 244 | self._action_buf = [] 245 | self._last_bulk_optime = oplog['ts'] 246 | 247 | self._last_optime = oplog['ts'] 248 | self._log_optime(oplog['ts']) 249 | self._log_progress() 250 | except StopIteration as e: 251 | # flush 252 | if len(self._action_buf) > 0: 253 | self._dst.bulk_write(self._action_buf) 254 | self._action_buf = [] 255 | self._last_bulk_optime = self._last_optime 256 | self._log_optime(self._last_optime) 257 | self._log_progress('latest') 258 | time.sleep(0.1) 259 | except pymongo.errors.AutoReconnect as e: 260 | log.error(e) 261 | self._src.reconnect() 262 | break 263 | except elasticsearch.helpers.BulkIndexError as e: 264 | log.error(e) 265 | self._action_buf = [] 266 | except IndexError as e: 267 | log.error(e) 268 | log.error('%s not found, terminate' % oplog_start) 269 | return 270 | -------------------------------------------------------------------------------- /mongosync/logger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | import logging.handlers 4 | 5 | 6 | class Logger(object): 7 | """ Global logger. 8 | """ 9 | @staticmethod 10 | def init(filepath): 11 | """ Init logger. 12 | """ 13 | logger = logging.getLogger('py-mongo-sync') 14 | logger.setLevel(logging.INFO) 15 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 16 | if filepath: 17 | handler_log = logging.handlers.RotatingFileHandler(filepath, mode='a', maxBytes=1024*1024*100, backupCount=3) 18 | handler_log.setFormatter(formatter) 19 | handler_log.setLevel(logging.INFO) 20 | logger.addHandler(handler_log) 21 | else: 22 | handler_stdout = logging.StreamHandler(sys.stdout) 23 | handler_stdout.setFormatter(formatter) 24 | handler_stdout.setLevel(logging.INFO) 25 | logger.addHandler(handler_stdout) 26 | 27 | @staticmethod 28 | def get(): 29 | """ Get logger. 
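        Call Logger.init(filepath) once at startup; every subsequent call returns
        the same 'py-mongo-sync' logger instance.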
30 | """ 31 | return logging.getLogger('py-mongo-sync') 32 | -------------------------------------------------------------------------------- /mongosync/mongo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caosiyang/py-mongo-sync/fda1a798f976d41e47e993c618bcc0f2e60c7449/mongosync/mongo/__init__.py -------------------------------------------------------------------------------- /mongosync/mongo/handler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import pymongo 4 | import bson 5 | from mongosync import mongo_utils 6 | from mongosync.config import MongoConfig 7 | from mongosync.logger import Logger 8 | 9 | log = Logger.get() 10 | 11 | 12 | class MongoHandler(object): 13 | def __init__(self, conf): 14 | if not isinstance(conf, MongoConfig): 15 | raise Exception('expect MongoConfig') 16 | self._conf = conf 17 | self._mc = None 18 | 19 | def __del__(self): 20 | self.close() 21 | 22 | def connect(self): 23 | """ Connect to server. 24 | """ 25 | try: 26 | if isinstance(self._conf.hosts, unicode): 27 | host, port = mongo_utils.parse_hostportstr(self._conf.hosts) 28 | self._mc = mongo_utils.connect(host, port, 29 | authdb=self._conf.authdb, 30 | username=self._conf.username, 31 | password=self._conf.password) 32 | self._mc.admin.command('ismaster') 33 | return True 34 | elif isinstance(self._conf.__hosts, list): 35 | # TODO 36 | return False 37 | except Exception as e: 38 | log.error('connect failed: %s' % e) 39 | return False 40 | 41 | def reconnect(self): 42 | """ Try to reconnect until success. 43 | """ 44 | while True: 45 | try: 46 | self.close() 47 | self.connect() 48 | self.client().admin.command('ismaster') 49 | return 50 | except Exception as e: 51 | log.error('reconnect failed: %s' % e) 52 | time.sleep(1) 53 | 54 | def close(self): 55 | """ Close connection. 56 | """ 57 | if self._mc: 58 | self._mc.close() 59 | self._mc = None 60 | 61 | def client(self): 62 | return self._mc 63 | 64 | def create_index(self, dbname, collname, keys, **options): 65 | """ Create index. 66 | """ 67 | while True: 68 | try: 69 | self._mc[dbname][collname].create_index(keys, **options) 70 | return 71 | except pymongo.errors.AutoReconnect as e: 72 | log.error('%s' % e) 73 | self.reconnect() 74 | 75 | def bulk_write(self, dbname, collname, reqs, ordered=True, ignore_duplicate_key_error=False): 76 | """ Bulk write until success. 
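        On AutoReconnect the whole batch is retried; on any other bulk error each
        request is replayed one by one, optionally ignoring duplicate key errors.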
77 | """ 78 | while True: 79 | try: 80 | self._mc[dbname][collname].bulk_write(reqs, 81 | ordered=ordered, 82 | bypass_document_validation=False) 83 | return 84 | except pymongo.errors.AutoReconnect as e: 85 | log.error('%s' % e) 86 | self.reconnect() 87 | except Exception as e: 88 | log.error('bulk write failed: %s' % e) 89 | # retry to write one by one 90 | for req in reqs: 91 | while True: 92 | try: 93 | if isinstance(req, pymongo.ReplaceOne): 94 | mc[dbname][collname].replace_one(req._filter, req._doc, upsert=req._upsert) 95 | elif isinstance(req, pymongo.InsertOne): 96 | mc[dbname][collname].insert_one(req._doc) 97 | elif isinstance(req, pymongo.UpdateOne): 98 | mc[dbname][collname].update_one(req._filter, req._doc, upsert=req._upsert) 99 | elif isinstance(req, pymongo.DeleteOne): 100 | mc[dbname][collname].delete_one(req._filter) 101 | else: 102 | log.error('invalid req: %s' % req) 103 | sys.exit(1) 104 | break 105 | except pymongo.errors.AutoReconnect as e: 106 | log.error('%s' % e) 107 | self.reconnect() 108 | continue 109 | except pymongo.errors.DuplicateKeyError as e: 110 | if ignore_duplicate_key_error: 111 | log.info('ignore duplicate key error: %s: %s' % (e, req)) 112 | break 113 | else: 114 | log.error('%s: %s' % (e, req)) 115 | sys.exit(1) 116 | except Exception as e: 117 | # generally it's an odd oplog that program cannot process 118 | # so abort it and bugfix 119 | log.error('%s when excuting %s on %s.%s' % (e, req, dbname, collname)) 120 | sys.exit(1) 121 | 122 | def tail_oplog(self, start_optime=None, await_time_ms=None): 123 | """ Return a tailable curosr of local.oplog.rs from the specified optime. 124 | """ 125 | # set codec options to guarantee the order of keys in command 126 | coll = self._mc['local'].get_collection('oplog.rs', 127 | codec_options=bson.codec_options.CodecOptions(document_class=bson.son.SON)) 128 | cursor = coll.find({'fromMigrate': {'$exists': False}, 'ts': {'$gte': start_optime}}, 129 | cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT, 130 | no_cursor_timeout=True) 131 | # New in version 3.2 132 | # src_version = mongo_utils.get_version(self._mc) 133 | # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'): 134 | # cursor.max_await_time_ms(1000) 135 | return cursor 136 | 137 | def apply_oplog(self, oplog, ignore_duplicate_key_error=False): 138 | """ Apply oplog. 139 | """ 140 | dbname, collname = mongo_utils.parse_namespace(oplog['ns']) 141 | while True: 142 | try: 143 | op = oplog['op'] # 'n' or 'i' or 'u' or 'c' or 'd' 144 | if op == 'i': # insert 145 | if '_id' in oplog['o']: 146 | self._mc[dbname][collname].replace_one({'_id': oplog['o']['_id']}, oplog['o'], upsert=True) 147 | else: 148 | # create index 149 | # insert into db.system.indexes 150 | self._mc[dbname][collname].insert(oplog['o'], check_keys=False) 151 | elif op == 'u': # update 152 | self._mc[dbname][collname].update(oplog['o2'], oplog['o']) 153 | elif op == 'd': # delete 154 | self._mc[dbname][collname].delete_one(oplog['o']) 155 | elif op == 'c': # command 156 | # FIX ISSUE #4 and #5 157 | # if use '--colls' option to sync target collections, 158 | # running command that belongs to exclusive brother collections in the same database may failed. 159 | # Just skip it. 
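                    # e.g. a 'drop' of a collection that was excluded from the sync does not
                    # exist on the destination, so the command may raise OperationFailure;
                    # it is logged below and the oplog entry is skipped.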
160 |                     try:
161 |                         self._mc[dbname].command(oplog['o'])
162 |                     except pymongo.errors.OperationFailure as e:
163 |                         log.info('%s: %s' % (e, oplog))
164 |                 elif op == 'n':  # no-op
165 |                     pass
166 |                 else:
167 |                     log.error('invalid op: %s' % oplog)
168 |                 return
169 |             except pymongo.errors.AutoReconnect as e:
170 |                 self.reconnect()
171 |                 continue
172 |             except pymongo.errors.DuplicateKeyError as e:
173 |                 if ignore_duplicate_key_error:
174 |                     log.info('ignore duplicate key error: %s: %s' % (e, oplog))
175 |                     break
176 |                 else:
177 |                     log.error('%s: %s' % (e, oplog))
178 |                     sys.exit(1)
179 |             except pymongo.errors.WriteError as e:
180 |                 log.error('%s' % e)
181 | 
182 |                 # For case:
183 |                 # Update the values of shard key fields when syncing from replica set to sharded cluster.
184 |                 #
185 |                 # Once you shard a collection, the shard key and the shard key values are immutable.
186 |                 # Reference: https://docs.mongodb.com/manual/core/sharding-shard-key/
187 |                 if self._mc.is_mongos and oplog['op'] == 'u' and 'the (immutable) field' in str(e):
188 |                     old_doc = self._mc[dbname][collname].find_one(oplog['o2'])
189 |                     if not old_doc:
190 |                         log.error('replay update failed: document not found: %s' % oplog['o2'])
191 |                         sys.exit(1)
192 |                     if '$set' in oplog['o']:
193 |                         old_doc.update(oplog['o']['$set'])
194 |                         new_doc = old_doc
195 |                     else:
196 |                         new_doc = oplog['o']
197 | 
198 |                     # TODO: this needs a transaction to delete the old document and insert the new one
199 | 
200 |                     # delete old document
201 |                     res = self._mc[dbname][collname].delete_one(oplog['o2'])
202 |                     if res.deleted_count != 1:
203 |                         log.error('replay update failed: delete old document failed: %s' % oplog['o2'])
204 |                         sys.exit(1)
205 |                     # insert new document
206 |                     res = self._mc[dbname][collname].insert_one(new_doc)
207 |                     if not res.inserted_id:
208 |                         log.error('replay update failed: insert new document failed: %s' % new_doc)
209 |                         sys.exit(1)
210 | 
--------------------------------------------------------------------------------
/mongosync/mongo/syncer.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import multiprocessing
  3 | import gevent
  4 | import pymongo
  5 | from mongosync import mongo_utils
  6 | from mongosync.logger import Logger
  7 | from mongosync.config import MongoConfig
  8 | from mongosync.common_syncer import CommonSyncer, Stage
  9 | from mongosync.mongo.handler import MongoHandler
 10 | from mongosync.multi_oplog_replayer import MultiOplogReplayer
 11 | 
 12 | log = Logger.get()
 13 | 
 14 | 
 15 | class MongoSyncer(CommonSyncer):
 16 |     """ MongoDB synchronizer.
 17 |     """
 18 |     def __init__(self, conf):
 19 |         CommonSyncer.__init__(self, conf)
 20 | 
 21 |         if not isinstance(self._conf.src_conf, MongoConfig):
 22 |             raise RuntimeError('invalid src config type')
 23 |         self._src = MongoHandler(self._conf.src_conf)
 24 |         if not self._src.connect():
 25 |             raise RuntimeError('connect to mongodb(src) failed: %s' % self._conf.src_hostportstr)
 26 |         if not isinstance(self._conf.dst_conf, MongoConfig):
 27 |             raise RuntimeError('invalid dst config type')
 28 |         self._dst = MongoHandler(self._conf.dst_conf)
 29 |         if not self._dst.connect():
 30 |             raise RuntimeError('connect to mongodb(dst) failed: %s' % self._conf.dst_hostportstr)
 31 |         self._multi_oplog_replayer = MultiOplogReplayer(self._dst, 10)
 32 | 
 33 |     def _create_index(self, namespace_tuple):
 34 |         """ Create indexes.
 35 |         """
 36 |         def format(key_direction_list):
 37 |             """ Format key and direction of index.
38 | """ 39 | res = [] 40 | for key, direction in key_direction_list: 41 | if isinstance(direction, float) or isinstance(direction, long): 42 | direction = int(direction) 43 | res.append((key, direction)) 44 | return res 45 | 46 | dbname, collname = namespace_tuple 47 | dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname) 48 | index_info = self._src.client()[dbname][collname].index_information() 49 | for name, info in index_info.iteritems(): 50 | keys = info['key'] 51 | options = {} 52 | options['name'] = name 53 | if 'unique' in info: 54 | options['unique'] = info['unique'] 55 | if 'sparse' in info: 56 | options['sparse'] = info['sparse'] 57 | if 'expireAfterSeconds' in info: 58 | options['expireAfterSeconds'] = info['expireAfterSeconds'] 59 | if 'partialFilterExpression' in info: 60 | options['partialFilterExpression'] = info['partialFilterExpression'] 61 | if 'dropDups' in info: 62 | options['dropDups'] = info['dropDups'] 63 | 64 | ## create indexes before import documents, so not need 'background' option 65 | # if 'background' in info: 66 | # options['background'] = info['background'] 67 | 68 | # for text index 69 | if 'weights' in info: 70 | options['weights'] = info['weights'] 71 | if 'default_language' in info: 72 | options['default_language'] = info['default_language'] 73 | if 'language_override' in info: 74 | options['language_override'] = info['language_override'] 75 | 76 | self._dst.create_index(dst_dbname, dst_collname, format(keys), **options) 77 | 78 | def _sync_collection(self, namespace_tuple): 79 | """ Sync a collection until success. 80 | """ 81 | # create indexes first 82 | self._create_index(namespace_tuple) 83 | 84 | src_dbname, src_collname = namespace_tuple 85 | dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname) 86 | src_ns = '%s.%s' % (src_dbname, src_collname) 87 | 88 | total = self._src.client()[src_dbname][src_collname].count() 89 | self._progress_logger.register(src_ns, total) 90 | 91 | while True: 92 | try: 93 | cursor = self._src.client()[src_dbname][src_collname].find(filter=None, 94 | cursor_type=pymongo.cursor.CursorType.EXHAUST, 95 | no_cursor_timeout=True, 96 | modifiers={'$snapshot': True}) 97 | 98 | reqs = [] 99 | reqs_max = 100 100 | groups = [] 101 | groups_max = 10 102 | n = 0 103 | 104 | for doc in cursor: 105 | reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True)) 106 | if len(reqs) == reqs_max: 107 | groups.append(reqs) 108 | reqs = [] 109 | if len(groups) == groups_max: 110 | threads = [gevent.spawn(self._dst.bulk_write, dst_dbname, dst_collname, groups[i], ordered=False, ignore_duplicate_key_error=True) for i in xrange(groups_max)] 111 | gevent.joinall(threads, raise_error=True) 112 | groups = [] 113 | 114 | n += 1 115 | if n % 10000 == 0: 116 | self._progress_logger.add(src_ns, n) 117 | n = 0 118 | 119 | if len(groups) > 0: 120 | threads = [gevent.spawn(self._dst.bulk_write, dst_dbname, dst_collname, groups[i], ordered=False, ignore_duplicate_key_error=True) for i in xrange(len(groups))] 121 | gevent.joinall(threads, raise_error=True) 122 | if len(reqs) > 0: 123 | self._dst.bulk_write(dst_dbname, dst_collname, reqs, ordered=False, ignore_duplicate_key_error=True) 124 | 125 | self._progress_logger.add(src_ns, n, done=True) 126 | return 127 | except pymongo.errors.AutoReconnect: 128 | self._src.reconnect() 129 | 130 | def _sync_large_collection(self, namespace_tuple, split_points): 131 | """ Sync large collection. 
132 | """ 133 | # create indexes first 134 | self._create_index(namespace_tuple) 135 | 136 | dbname, collname = namespace_tuple 137 | ns = '.'.join(namespace_tuple) 138 | 139 | log.info('pending to sync %s with %d processes' % (ns, len(split_points) + 1)) 140 | 141 | coll = self._src.client()[dbname][collname] 142 | total = coll.count() 143 | self._progress_logger.register(ns, total) 144 | 145 | prog_q = multiprocessing.Queue() 146 | res_q = multiprocessing.Queue() 147 | 148 | proc_logging = multiprocessing.Process(target=logging_progress, args=(ns, total, prog_q)) 149 | proc_logging.start() 150 | 151 | queries = [] 152 | lower_bound = None 153 | for point in split_points: 154 | if lower_bound is None: 155 | queries.append({'_id': {'$lt': point}}) 156 | else: 157 | queries.append({'_id': {'$gte': lower_bound, '$lt': point}}) 158 | lower_bound = point 159 | queries.append({'_id': {'$gte': lower_bound}}) 160 | 161 | procs = [] 162 | for query in queries: 163 | p = multiprocessing.Process(target=self._sync_collection_with_query, args=(namespace_tuple, query, prog_q, res_q)) 164 | p.start() 165 | procs.append(p) 166 | log.info('start process %s with query %s' % (p.name, query)) 167 | 168 | for p in procs: 169 | p.join() 170 | 171 | n_docs = 0 172 | for p in procs: 173 | n_docs += res_q.get() 174 | self._progress_logger.add(ns, n_docs, done=True) 175 | 176 | prog_q.put(True) 177 | prog_q.close() 178 | prog_q.join_thread() 179 | proc_logging.join() 180 | 181 | def _sync_collection_with_query(self, namespace_tuple, query, prog_q, res_q): 182 | """ Sync collection with query. 183 | """ 184 | self._src.reconnect() 185 | self._dst.reconnect() 186 | 187 | src_dbname, src_collname = namespace_tuple 188 | dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname) 189 | 190 | while True: 191 | try: 192 | cursor = self._src.client()[src_dbname][src_collname].find(filter=query, 193 | cursor_type=pymongo.cursor.CursorType.EXHAUST, 194 | no_cursor_timeout=True, 195 | # snapshot cause blocking, maybe bug 196 | # modifiers={'$snapshot': True} 197 | ) 198 | total = 0 199 | n = 0 200 | reqs = [] 201 | reqs_max = 100 202 | groups = [] 203 | groups_max = 10 204 | 205 | for doc in cursor: 206 | reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True)) 207 | if len(reqs) == reqs_max: 208 | groups.append(reqs) 209 | reqs = [] 210 | if len(groups) == groups_max: 211 | threads = [gevent.spawn(self._dst.bulk_write, dst_dbname, dst_collname, groups[i], ordered=False, ignore_duplicate_key_error=True) for i in xrange(groups_max)] 212 | gevent.joinall(threads, raise_error=True) 213 | groups = [] 214 | 215 | n += 1 216 | total += 1 217 | if n % 10000 == 0: 218 | prog_q.put(n) 219 | n = 0 220 | 221 | if len(groups) > 0: 222 | threads = [gevent.spawn(self._dst.bulk_write, dst_dbname, dst_collname, groups[i], ordered=False, ignore_duplicate_key_error=True) for i in xrange(len(groups))] 223 | gevent.joinall(threads, raise_error=True) 224 | if len(reqs) > 0: 225 | self._dst.bulk_write(dst_dbname, dst_collname, reqs, ordered=False, ignore_duplicate_key_error=True) 226 | 227 | if n > 0: 228 | prog_q.put(n) 229 | res_q.put(total) 230 | 231 | prog_q.close() 232 | prog_q.join_thread() 233 | res_q.close() 234 | res_q.join_thread() 235 | return 236 | except pymongo.errors.AutoReconnect: 237 | self._src.reconnect() 238 | 239 | def _replay_oplog(self, start_optime): 240 | """ Replay oplog. 
241 | """ 242 | self._last_optime = start_optime 243 | 244 | n_total = 0 245 | n_skip = 0 246 | 247 | while True: 248 | try: 249 | start_optime_valid = False 250 | need_log = False 251 | host, port = self._src.client().address 252 | log.info('try to sync oplog from %s on %s:%d' % (self._last_optime, host, port)) 253 | cursor = self._src.tail_oplog(start_optime) 254 | except IndexError as e: 255 | log.error(e) 256 | log.error('%s not found, terminate' % self._last_optime) 257 | return 258 | except Exception as e: 259 | log.error('get oplog cursor failed: %s' % e) 260 | continue 261 | 262 | # loop: read and apply oplog 263 | while True: 264 | try: 265 | if need_log: 266 | self._log_optime(self._last_optime) 267 | self._log_progress() 268 | need_log = False 269 | 270 | if not cursor.alive: 271 | log.error('cursor is dead') 272 | raise pymongo.errors.AutoReconnect 273 | 274 | oplog = cursor.next() 275 | n_total += 1 276 | 277 | # check start optime once 278 | if not start_optime_valid: 279 | if oplog['ts'] == self._last_optime: 280 | log.info('oplog is ok: %s' % self._last_optime) 281 | start_optime_valid = True 282 | else: 283 | log.error('oplog %s is stale, terminate' % self._last_optime) 284 | return 285 | 286 | if oplog['op'] == 'n': # no-op 287 | self._last_optime = oplog['ts'] 288 | need_log = True 289 | continue 290 | 291 | # validate oplog 292 | if not self._conf.data_filter.valid_oplog(oplog): 293 | n_skip += 1 294 | self._last_optime = oplog['ts'] 295 | need_log = True 296 | continue 297 | 298 | dbname, collname = mongo_utils.parse_namespace(oplog['ns']) 299 | dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname) 300 | if dst_dbname != dbname or dst_collname != collname: 301 | oplog['ns'] = '%s.%s' % (dst_dbname, dst_collname) 302 | 303 | if self._stage == Stage.post_initial_sync: 304 | if self._multi_oplog_replayer: 305 | if mongo_utils.is_command(oplog): 306 | self._multi_oplog_replayer.apply(ignore_duplicate_key_error=True) 307 | self._multi_oplog_replayer.clear() 308 | self._dst.apply_oplog(oplog) 309 | self._last_optime = oplog['ts'] 310 | need_log = True 311 | else: 312 | self._multi_oplog_replayer.push(oplog) 313 | if oplog['ts'] == self._initial_sync_end_optime or self._multi_oplog_replayer.count() == self._oplog_batchsize: 314 | self._multi_oplog_replayer.apply(ignore_duplicate_key_error=True) 315 | self._multi_oplog_replayer.clear() 316 | self._last_optime = oplog['ts'] 317 | need_log = True 318 | else: 319 | self._dst.apply_oplog(oplog, ignore_duplicate_key_error=True) 320 | self._last_optime = oplog['ts'] 321 | need_log = True 322 | 323 | if oplog['ts'] == self._initial_sync_end_optime: 324 | log.info('step into stage: oplog_sync') 325 | self._stage = Stage.oplog_sync 326 | else: 327 | if self._multi_oplog_replayer: 328 | if mongo_utils.is_command(oplog): 329 | self._multi_oplog_replayer.apply() 330 | self._multi_oplog_replayer.clear() 331 | self._dst.apply_oplog(oplog) 332 | self._last_optime = oplog['ts'] 333 | need_log = True 334 | else: 335 | self._multi_oplog_replayer.push(oplog) 336 | if self._multi_oplog_replayer.count() == self._oplog_batchsize: 337 | self._multi_oplog_replayer.apply() 338 | self._multi_oplog_replayer.clear() 339 | self._last_optime = oplog['ts'] 340 | need_log = True 341 | else: 342 | self._dst.apply_oplog(oplog) 343 | self._last_optime = oplog['ts'] 344 | need_log = True 345 | except StopIteration as e: 346 | if self._multi_oplog_replayer and self._multi_oplog_replayer.count() > 0: 347 | self._multi_oplog_replayer.apply() 348 | 
                        self._multi_oplog_replayer.clear()
349 |                         self._last_optime = self._multi_oplog_replayer.last_optime()
350 |                         need_log = True
351 |                     # no more oplogs, wait a moment
352 |                     time.sleep(0.1)
353 |                     self._log_optime(self._last_optime)
354 |                     self._log_progress('latest')
355 |                 except pymongo.errors.DuplicateKeyError as e:
356 |                     if self._stage == Stage.oplog_sync:
357 |                         log.error(e)
358 |                         log.error('terminate')
359 |                         return
360 |                     else:
361 |                         log.error('ignore duplicate key error: %s' % e)
362 |                         continue
363 |                 except pymongo.errors.AutoReconnect as e:
364 |                     log.error(e)
365 |                     self._src.reconnect()
366 |                     break
367 | 
368 | 
369 | def logging_progress(ns, total, prog_q):
370 |     curr = 0
371 |     while True:
372 |         m = prog_q.get()
373 |         if isinstance(m, bool):
374 |             return
375 |         curr += m
376 |         s = '\t%s\t%d/%d\t[%.2f%%]' % (
377 |             ns,
378 |             curr,
379 |             total,
380 |             float(curr)/total*100 if total > 0 else float(curr+1)/(total+1)*100)
381 |         log.info(s)
382 | 
--------------------------------------------------------------------------------
/mongosync/mongo_utils.py:
--------------------------------------------------------------------------------
  1 | import pymongo
  2 | import bson
  3 | 
  4 | 
  5 | def gen_uri(hosts, username=None, password=None, authdb='admin'):
  6 |     def parse(hosts):
  7 |         if isinstance(hosts, str) or isinstance(hosts, unicode):
  8 |             return hosts
  9 |         if isinstance(hosts, list) or isinstance(hosts, tuple):
 10 |             hostportstrs = []
 11 |             for host in hosts:
 12 |                 if isinstance(host, str) or isinstance(host, unicode):
 13 |                     hostportstrs.append(host)
 14 |                     continue
 15 |                 if isinstance(host, tuple):
 16 |                     hostportstrs.append(parse_tuple(host))
 17 |                     continue
 18 |             return ','.join(hostportstrs)
 19 |         raise Exception('invalid hosts: %s' % hosts)
 20 | 
 21 |     def parse_tuple(host_port_tuple):
 22 |         """ host is string and port is int.
 23 |         """
 24 |         if not isinstance(host_port_tuple, tuple):
 25 |             raise Exception('not a tuple: %s' % (host_port_tuple,))
 26 |         if len(host_port_tuple) != 2:
 27 |             raise Exception('invalid tuple length: %s' % (host_port_tuple,))
 28 |         host, port = host_port_tuple
 29 |         if not isinstance(host, str) and not isinstance(host, unicode):
 30 |             raise Exception('invalid host in tuple: %s' % (host_port_tuple,))
 31 |         if not isinstance(port, int):
 32 |             raise Exception('invalid port in tuple: %s' % (host_port_tuple,))
 33 |         return '%s:%d' % (host, port)
 34 | 
 35 |     if username and password and authdb:
 36 |         return 'mongodb://%s:%s@%s/%s' % (username, password, parse(hosts), authdb)
 37 |     else:
 38 |         return 'mongodb://%s' % parse(hosts)
 39 | 
 40 | 
 41 | def connect(host, port, **kwargs):
 42 |     """ Connect and return an available handler.
 43 |         Recognize replica set automatically.
 44 |         Authenticate automatically if necessary.
45 | 46 | default: 47 | authdb = admin 48 | read_preference = PRIMARY 49 | w = 1 50 | """ 51 | authdb = kwargs.get('authdb', 'admin') # default authdb is 'admin' 52 | username = kwargs.get('username', '') 53 | password = kwargs.get('password', '') 54 | w = kwargs.get('w', 1) 55 | replset_name = get_replica_set_name(host, port, **kwargs) 56 | if replset_name: 57 | mc = pymongo.MongoClient(host=host, 58 | port=port, 59 | document_class=bson.son.SON, 60 | connect=True, 61 | serverSelectionTimeoutMS=3000, 62 | replicaSet=replset_name, 63 | read_preference=pymongo.read_preferences.ReadPreference.PRIMARY, 64 | w=w) 65 | else: 66 | mc = pymongo.MongoClient(host, 67 | port, 68 | document_class=bson.son.SON, 69 | connect=True, 70 | serverSelectionTimeoutMS=3000, 71 | w=w) 72 | if username and password and authdb: 73 | # raise exception if auth failed here 74 | mc[authdb].authenticate(username, password) 75 | return mc 76 | 77 | 78 | def get_version(arg): 79 | """ Get version. 80 | """ 81 | if isinstance(arg, pymongo.MongoClient): 82 | return arg.server_info()['version'] 83 | elif isinstance(arg, str) or isinstance(arg, unicode): 84 | host, port = parse_hostportstr(arg) 85 | with pymongo.MongoClient(host, port, connect=True, serverSelectionTimeoutMS=3000) as mc: 86 | return mc.server_info()['version'] 87 | elif isinstance(arg, tuple): 88 | with pymongo.MongoClient(arg[0], arg[1], connect=True, serverSelectionTimeoutMS=3000) as mc: 89 | return mc.server_info()['version'] 90 | else: 91 | raise Exception('invalid argument type @%s' % get_version.__name__) 92 | 93 | 94 | def get_replica_set_name(host, port, **kwargs): 95 | """ Get replica set name. 96 | Return a empty string if it's not a replica set. 97 | Raise exception if execute failed. 98 | """ 99 | try: 100 | username = kwargs.get('username', '') 101 | password = kwargs.get('password', '') 102 | authdb = kwargs.get('authdb', 'admin') 103 | mc = pymongo.MongoClient(host, port, connect=True, serverSelectionTimeoutMS=3000) 104 | if username and password and authdb: 105 | mc[authdb].authenticate(username, password) 106 | status = mc.admin.command({'replSetGetStatus': 1}) 107 | mc.close() 108 | if status['ok'] == 1: 109 | return status['set'] 110 | else: 111 | return '' 112 | except pymongo.errors.OperationFailure: 113 | return '' 114 | 115 | 116 | def get_primary(host, port, **kwargs): 117 | """ Get host, port, replsetName of the primary node. 118 | """ 119 | try: 120 | username = kwargs.get('username', '') 121 | password = kwargs.get('password', '') 122 | authdb = kwargs.get('authdb', 'admin') 123 | mc = pymongo.MongoClient(host, port, connect=True, serverSelectionTimeoutMS=3000) 124 | if username and password and authdb: 125 | mc[authdb].authenticate(username, password) 126 | status = mc.admin.command({'replSetGetStatus': 1}) 127 | mc.close() 128 | if status['ok'] == 1: 129 | for member in status['members']: 130 | if member['stateStr'] == 'PRIMARY': 131 | hostportstr = member['name'] 132 | host = hostportstr.split(':')[0] 133 | port = int(hostportstr.split(':')[1]) 134 | replset_name = status['set'] 135 | return host, port, replset_name 136 | else: 137 | raise Exception('no primary in replica set') 138 | except Exception as e: 139 | raise Exception('get_primary %s' % e) 140 | 141 | 142 | def get_optime(mc): 143 | """ Get optime of primary in the replica set. 144 | 145 | Changed in version 3.2. 
146 |     If using protocolVersion: 1, optime returns a document that contains:
147 |         - ts, the Timestamp of the last operation applied to this member of the replica set from the oplog.
148 |         - t, the term in which the last applied operation was originally generated on the primary.
149 |     If using protocolVersion: 0, optime returns the Timestamp of the last operation applied to this member of the replica set from the oplog.
150 | 
151 |     Refer to https://docs.mongodb.com/manual/reference/command/replSetGetStatus/
152 |     """
153 |     rs_status = mc['admin'].command({'replSetGetStatus': 1})
154 |     members = rs_status.get('members')
155 |     if not members:
156 |         raise Exception('no member in replica set')
157 |     for member in rs_status['members']:
158 |         role = member.get('stateStr')
159 |         if role == 'PRIMARY':
160 |             optime = member.get('optime')
161 |             if isinstance(optime, dict) and 'ts' in optime:  # for MongoDB v3.2
162 |                 return optime['ts']
163 |             else:
164 |                 return optime
165 |     raise Exception('no primary in replica set')
166 | 
167 | 
168 | def get_optime_tokumx(mc):
169 |     """ Get optime of primary in the replica set.
170 |     """
171 |     rs_status = mc['admin'].command({'replSetGetStatus': 1})
172 |     members = rs_status.get('members')
173 |     if members:
174 |         for member in members:
175 |             role = member.get('stateStr')
176 |             if role == 'PRIMARY':
177 |                 optime = member.get('optimeDate')
178 |                 return optime
179 |     return None
180 | 
181 | 
182 | def parse_namespace(ns):
183 |     """ Parse namespace.
184 |     """
185 |     res = ns.split('.', 1)
186 |     return res[0], res[1]
187 | 
188 | 
189 | def gen_namespace(dbname, collname):
190 |     """ Generate namespace.
191 |     """
192 |     return '%s.%s' % (dbname, collname)
193 | 
194 | 
195 | def parse_hostportstr(hostportstr):
196 |     """ Parse hostportstr like 'xxx.xxx.xxx.xxx:xxx'
197 |     """
198 |     host = hostportstr.split(':')[0]
199 |     port = int(hostportstr.split(':')[1])
200 |     return host, port
201 | 
202 | 
203 | def collect_server_info(host, port):
204 |     """ Collect general information of server.
205 |     """
206 |     info = {}
207 |     with pymongo.MongoClient(host, port, connect=True, serverSelectionTimeoutMS=3000) as mc:
208 |         info['version'] = mc.server_info()['version']
209 |     return info
210 | 
211 | 
212 | def version_higher_or_equal(v1, v2):
213 |     """ Check if v1 is higher than or equal to v2.
214 |     """
215 |     t1 = tuple(int(val) for val in v1.split('.'))
216 |     t2 = tuple(int(val) for val in v2.split('.'))
217 |     return t1 >= t2
218 | 
219 | 
220 | def is_command(oplog):
221 |     """ Check if oplog is a command.
222 |     """
223 |     op = oplog['op']
224 |     # createIndex() could insert a document without _id into *.system.indexes
225 |     if op == 'c' or (op == 'i' and '_id' not in oplog['o']):
226 |         return True
227 |     return False
228 | 
--------------------------------------------------------------------------------
/mongosync/multi_oplog_replayer.py:
--------------------------------------------------------------------------------
  1 | import pymongo
  2 | import gevent.pool
  3 | import mmh3
  4 | from mongosync import mongo_utils
  5 | from mongosync.mongo.handler import MongoHandler
  6 | from mongosync.logger import Logger
  7 | 
  8 | log = Logger.get()
  9 | 
 10 | 
 11 | class OplogVector(object):
 12 |     """ A set of oplogs with the same namespace.
 13 |     """
 14 |     def __init__(self, dbname, collname):
 15 |         self._dbname = dbname
 16 |         self._collname = collname
 17 |         self._oplogs = []
 18 | 
 19 | 
 20 | class MultiOplogReplayer(object):
 21 |     """ Concurrent oplog replayer for MongoDB.
22 | """ 23 | def __init__(self, mongo_handler, n_writers=10, batch_size=40): 24 | """ 25 | Parameter: 26 | - n_writers: maximum coroutine count 27 | - batch_size: maximum oplog count in a batch, 40 is empiric value 28 | """ 29 | assert isinstance(mongo_handler, MongoHandler) 30 | assert n_writers > 0 31 | assert batch_size > 0 32 | self._mongo_handler = mongo_handler # type of MongoHandler 33 | self._pool = gevent.pool.Pool(n_writers) 34 | self._batch_size = batch_size 35 | self._map = {} 36 | self._count = 0 37 | self._last_optime = None 38 | 39 | def clear(self): 40 | """ Clear oplogs. 41 | """ 42 | self._map.clear() 43 | self._count = 0 44 | 45 | def push(self, oplog): 46 | """ Push oplog and group by namespace. 47 | """ 48 | ns = oplog['ns'] 49 | if ns not in self._map: 50 | self._map[ns] = [] 51 | self._map[ns].append(oplog) 52 | self._count += 1 53 | self._last_optime = oplog['ts'] 54 | 55 | def apply(self, ignore_duplicate_key_error=False): 56 | """ Apply oplogs. 57 | """ 58 | oplog_vecs = [] 59 | for ns, oplogs in self._map.iteritems(): 60 | dbname, collname = mongo_utils.parse_namespace(ns) 61 | n = len(oplogs) / self._batch_size + 1 62 | if n == 1: 63 | vec = OplogVector(dbname, collname) 64 | for oplog in oplogs: 65 | op = self.__convert(oplog) 66 | assert op is not None 67 | vec._oplogs.append(op) 68 | oplog_vecs.append(vec) 69 | else: 70 | vecs = [OplogVector(dbname, collname) for i in xrange(n)] 71 | for oplog in oplogs: 72 | op = self.__convert(oplog) 73 | assert op is not None 74 | # filter of UpdateOne/ReplaceOne/DeleteOne is {'_id': ObjectID} 75 | # @ref https://github.com/mongodb/mongo-python-driver/blob/master/pymongo/operations.py 76 | m = self.__hash(op._filter['_id']) 77 | vecs[m % n]._oplogs.append(op) 78 | oplog_vecs.extend(vecs) 79 | 80 | for vec in oplog_vecs: 81 | if vec._oplogs: 82 | self._pool.spawn(self._mongo_handler.bulk_write, 83 | vec._dbname, 84 | vec._collname, 85 | vec._oplogs, 86 | ignore_duplicate_key_error=ignore_duplicate_key_error) 87 | self._pool.join() 88 | 89 | def count(self): 90 | """ Return count of oplogs. 91 | """ 92 | return self._count 93 | 94 | def last_optime(self): 95 | """ Return timestamp of the last oplog. 96 | """ 97 | return self._last_optime 98 | 99 | def __convert(self, oplog): 100 | """ Convert oplog to operation that supports bulk write. 101 | """ 102 | op = oplog['op'] 103 | if op == 'u': 104 | # it could be an update or replace 105 | # @ref https://docs.mongodb.com/manual/reference/limits/#naming-restrictions 106 | is_update = False 107 | for key in oplog['o'].iterkeys(): 108 | if key[0] == '$': 109 | is_update = True 110 | break 111 | if is_update: 112 | return pymongo.operations.UpdateOne({'_id': oplog['o2']['_id']}, oplog['o']) 113 | else: 114 | return pymongo.operations.ReplaceOne({'_id': oplog['o2']['_id']}, oplog['o'], upsert=True) 115 | elif op == 'i': 116 | return pymongo.operations.ReplaceOne({'_id': oplog['o']['_id']}, oplog['o'], upsert=True) 117 | elif op == 'd': 118 | return pymongo.operations.DeleteOne({'_id': oplog['o']['_id']}) 119 | else: 120 | log.error('invaid op: %s' % oplog) 121 | return None 122 | 123 | def __hash(self, oid): 124 | """ Hash ObjectID with murmurhash3. 
125 | """ 126 | try: 127 | # str(oid) may contain non-ascii characters 128 | m = mmh3.hash(str(oid), signed=False) 129 | except Exception as e: 130 | m = 0 131 | return m 132 | -------------------------------------------------------------------------------- /mongosync/optime_logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | from bson.timestamp import Timestamp 4 | 5 | 6 | class OptimeLogger(object): 7 | """ Record optime in file. 8 | """ 9 | def __init__(self, filepath): 10 | assert filepath 11 | assert isinstance(filepath, str) or isinstance(filepath, unicode) 12 | if not os.path.exists(filepath): 13 | os.mknod(filepath) 14 | assert os.path.isfile(filepath) 15 | self._filepath = filepath 16 | self._fd = open(filepath, 'rb+') 17 | 18 | def __del__(self): 19 | self._fd.close() 20 | 21 | def write(self, optime): 22 | """ Write optime. 23 | """ 24 | self._fd.seek(0, os.SEEK_SET) 25 | time_data = struct.pack('I', optime.time) 26 | inc_data = struct.pack('I', optime.inc) 27 | self._fd.write(time_data) 28 | self._fd.write(inc_data) 29 | self._fd.flush() 30 | 31 | def read(self): 32 | """ Read optime. 33 | Return optime if OK else None. 34 | """ 35 | if self.filesize != 8: 36 | return None 37 | self._fd.seek(0, os.SEEK_SET) 38 | time = struct.unpack('I', self._fd.read(4))[0] 39 | inc = struct.unpack('I', self._fd.read(4))[0] 40 | return Timestamp(time, inc) 41 | 42 | @property 43 | def filesize(self): 44 | """ Return the length of file. 45 | """ 46 | self._fd.seek(0, os.SEEK_END) 47 | return self._fd.tell() 48 | 49 | @property 50 | def filepath(self): 51 | """ Return filepath. 52 | """ 53 | return self._filepath 54 | 55 | 56 | if __name__ == '__main__': 57 | optime_logger = OptimeLogger('optimelog.tmp.0') 58 | optime_logger.write(Timestamp(0, 1)) 59 | optime = optime_logger.read() 60 | assert optime is not None 61 | assert optime.time == 0 62 | assert optime.inc == 1 63 | assert optime_logger.filesize == 8 64 | 65 | optime_logger = OptimeLogger('optimelog.tmp.1') 66 | optime_logger.write(Timestamp(4294967295, 2)) 67 | optime_logger = OptimeLogger('optimelog.tmp.1') 68 | optime = optime_logger.read() 69 | assert optime is not None 70 | assert optime.time == 4294967295 71 | assert optime.inc == 2 72 | assert optime_logger.filesize == 8 73 | 74 | optime_logger = OptimeLogger('optimelog.tmp.emtpy') 75 | optime = optime_logger.read() 76 | assert optime is None 77 | assert optime_logger.filesize == 0 78 | print 'test pass' 79 | -------------------------------------------------------------------------------- /mongosync/progress_logger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import multiprocessing 4 | import threading 5 | import Queue 6 | from mongosync.logger import Logger 7 | 8 | log = Logger.get() 9 | 10 | 11 | class Message(object): 12 | """ Progress change message. 13 | """ 14 | def __init__(self, ns, cnt, done): 15 | self.ns = ns 16 | self.cnt = cnt 17 | self.done = done 18 | 19 | 20 | class Progress(object): 21 | """ Progress attibutes. 22 | """ 23 | def __init__(self, ns, total): 24 | self.ns = ns 25 | self.curr = 0 26 | self.total = total 27 | self.start_time = time.time() 28 | self.done = False 29 | 30 | 31 | class LoggerThread(threading.Thread): 32 | """ Logger thread. 
33 | """ 34 | def __init__(self, n_colls, **kwargs): 35 | self._n_colls = n_colls 36 | self._q = Queue.Queue() 37 | self._ns_map = {} 38 | super(LoggerThread, self).__init__(**kwargs) 39 | 40 | def run(self): 41 | n_colls_done = 0 42 | while n_colls_done < self._n_colls: 43 | m = self._q.get() 44 | if m.ns not in self._ns_map: 45 | raise Exception('missing namespace: %s' % m.ns) 46 | self._ns_map[m.ns].curr += m.cnt 47 | prog = self._ns_map[m.ns] 48 | s = '\t%s\t%d/%d\t[%.2f%%]' % ( 49 | prog.ns, 50 | prog.curr, 51 | prog.total, 52 | float(prog.curr)/prog.total*100 if prog.total > 0 else float(prog.curr+1)/(prog.total+1)*100) 53 | 54 | if not m.done: 55 | log.info(s) 56 | else: 57 | log.info('[ OK ] ' + s) 58 | n_colls_done += 1 59 | time_used = time.time() - prog.start_time 60 | sys.stdout.write('\r\33[K') 61 | sys.stdout.write('\r[\033[32m OK \033[0m]\t[%d/%d]\t%s\t%d/%d\t%.1fs\n' % (n_colls_done, self._n_colls, m.ns, prog.curr, prog.total, time_used)) 62 | sys.stdout.flush() 63 | del self._ns_map[m.ns] 64 | 65 | # s = '' 66 | # for ns, prog in self._ns_map.iteritems(): 67 | # s += '|| %s %d/%d %.1f%% ' % (ns, prog.curr, prog.total, float(prog.curr)/prog.total*100) 68 | # if len(s) > 0: 69 | # s += '||' 70 | # sys.stdout.write('\r%s' % s) 71 | # sys.stdout.flush() 72 | 73 | log.info('ProgressLogger thread %s exit' % threading.currentThread().name) 74 | 75 | def register(self, ns, total): 76 | """ Register collection. 77 | """ 78 | if ns in self._ns_map: 79 | raise Exception('duplicate collection %s' % ns) 80 | self._ns_map[ns] = Progress(ns, total) 81 | 82 | def add(self, ns, count, done=False): 83 | """ Update progress. 84 | """ 85 | self._q.put(Message(ns, count, done)) 86 | 87 | 88 | class LoggerProcess(multiprocessing.Process): 89 | """ Logger progress. 90 | """ 91 | def __init__(self, n_colls, **kwargs): 92 | self._n_colls = n_colls 93 | self._q = multiprocessing.Queue() 94 | self._ns_map = multiprocessing.Manager().dict() 95 | super(LoggerProcess, self).__init__(**kwargs) 96 | 97 | def run(self): 98 | n_colls_done = 0 99 | while n_colls_done < self._n_colls: 100 | m = self._q.get() 101 | if m.ns not in self._ns_map: 102 | raise Exception('missing namespace: %s' % m.ns) 103 | self._ns_map[m.ns].curr += m.cnt 104 | prog = self._ns_map[m.ns] 105 | s = '\t%s\t%d/%d\t[%.2f%%]' % ( 106 | prog.ns, 107 | prog.curr, 108 | prog.total, 109 | float(prog.curr)/prog.total*100 if prog.total > 0 else float(prog.curr+1)/(prog.total+1)*100) 110 | 111 | if not m.done: 112 | log.info(s) 113 | else: 114 | log.info('[ OK ] ' + s) 115 | n_colls_done += 1 116 | time_used = time.time() - prog.start_time 117 | sys.stdout.write('\r\33[K') 118 | sys.stdout.write('\r[\033[32m OK \033[0m]\t[%d/%d]\t%s\t%d/%d\t%.1fs\n' % (n_colls_done, self._n_colls, m.ns, prog.curr, prog.total, time_used)) 119 | sys.stdout.flush() 120 | del self._ns_map[m.ns] 121 | 122 | # s = '' 123 | # for ns, prog in self._ns_map.iteritems(): 124 | # s += '|| %s %d/%d %.1f%% ' % (ns, prog.curr, prog.total, float(prog.curr)/prog.total*100) 125 | # if len(s) > 0: 126 | # s += '||' 127 | # sys.stdout.write('\r%s' % s) 128 | # sys.stdout.flush() 129 | 130 | log.info('ProgressLogger process %s exit' % multiprocessing.current_process().name) 131 | 132 | def register(self, ns, total): 133 | """ Register collection. 134 | """ 135 | if ns in self._ns_map: 136 | raise Exception('duplicate collection %s' % ns) 137 | self._ns_map[ns] = Progress(ns, total) 138 | 139 | def add(self, ns, count, done=False): 140 | """ Update progress. 
141 | """ 142 | self._q.put(Message(ns, count, done)) 143 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gevent==1.4.0 2 | toml==0.10.0 3 | mmh3==2.5.1 4 | pymongo==3.5.1 5 | -------------------------------------------------------------------------------- /sync.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # summary: MongoDB sync tool 5 | # author: caosiyang 6 | # date: 2013/09/16 7 | 8 | from gevent import monkey 9 | monkey.patch_all() 10 | 11 | import sys 12 | from mongosync.command_options import CommandOptions 13 | from mongosync.config import MongoConfig, EsConfig 14 | from mongosync.logger import Logger 15 | 16 | if __name__ == '__main__': 17 | conf = CommandOptions.parse() 18 | Logger.init(conf.logfilepath) 19 | log = Logger.get() 20 | 21 | conf.info(log) 22 | if conf.logfilepath: 23 | conf.info(sys.stdout) 24 | 25 | if isinstance(conf.dst_conf, MongoConfig): 26 | from mongosync.mongo.syncer import MongoSyncer 27 | syncer = MongoSyncer(conf) 28 | syncer.run() 29 | elif isinstance(conf.dst_conf, EsConfig): 30 | from mongosync.es.syncer import EsSyncer 31 | syncer = EsSyncer(conf) 32 | syncer.run() 33 | else: 34 | raise Exception('invalid config type') 35 | 36 | log.info('exit') 37 | --------------------------------------------------------------------------------