├── hooks
│   ├── __init__.py
│   └── es_data_mapping.py
├── .gitignore
├── run_op_emitter.py
├── run_es_consumer.py
├── config.json.default
├── README.md
├── scrutineer.py
├── elasticsearch_api.py
└── oplog.py

--------------------------------------------------------------------------------
/hooks/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
config.json
*.pyc

--------------------------------------------------------------------------------
/hooks/es_data_mapping.py:
--------------------------------------------------------------------------------
# This hook remaps a mongo document into whatever you want indexed in ElasticSearch.
# The default is a pass-through: you probably don't need to change anything here.
def remap(document):
    return document
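
The pass-through hook above is the extension point for reshaping documents before they reach the index. A minimal sketch of a customized hook; the field names (first_name, last_name, password) are hypothetical and not part of this repo:

```python
# a hedged sketch of a customized hooks/es_data_mapping.py; the field names
# (first_name, last_name, password) are hypothetical, not from this repo.
def remap(document):
    indexed = dict(document)
    # drop a field that should never be searchable
    indexed.pop("password", None)
    # derive a field that is convenient to query on
    if "first_name" in indexed and "last_name" in indexed:
        indexed["full_name"] = indexed["first_name"] + " " + indexed["last_name"]
    return indexed
```
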
--------------------------------------------------------------------------------
/run_op_emitter.py:
--------------------------------------------------------------------------------
from oplog import Oplog
from multiprocessing import Process

def run():
    op_emitter = Oplog()
    Process(target=op_emitter.start).start()

if __name__ == "__main__":
    run()

--------------------------------------------------------------------------------
/run_es_consumer.py:
--------------------------------------------------------------------------------
from elasticsearch_api import ElasticSearch
from multiprocessing import Process

def run():
    es = ElasticSearch()
    Process(target=es.start).start()

if __name__ == "__main__":
    run()

--------------------------------------------------------------------------------
/config.json.default:
--------------------------------------------------------------------------------
{
  "elasticsearch":{
    "connectionString":"",
    "index":""
  },
  "types_we_care_about":[
    "",
    ""
  ],
  "oplog":{
    "host":"",
    "port":,
    "db":"",
    "collection":""
  },
  "data":{
    "host":"",
    "port":,
    "db":""
  },
  "queue_purge_frequency":2
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
<pre>
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2012 Oren Mazor

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
</pre>
I wrote this tool so that I can consume an extremely high-bandwidth MongoDB cluster through an ElasticSearch (Lucene) index cluster. Everything needs to be real time (i.e. the ES index has to stay as close as possible to the state of the MongoDB cluster), and all of the work in this repository is aimed at that goal. The op emitter (run_op_emitter.py) tails the MongoDB oplog and pushes insert/update operations onto a local ZeroMQ socket; the ES consumer (run_es_consumer.py) pulls those operations off the socket and bulk-indexes them into ElasticSearch.
Current versions of guaranteed compatibility: ElasticSearch 0.19.2, MongoDB 2.0.4, Python 2.7.2
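
To run it: copy config.json.default to config.json, fill in the ElasticSearch and MongoDB connection details, then start the two halves: run_op_emitter.py tails the oplog and run_es_consumer.py feeds ElasticSearch. A rough sketch that starts both halves from one script for local testing (the repo itself ships them as separate entry points, and it assumes config.json has already been filled in):

<pre>
# a sketch only: the repo runs run_op_emitter.py and run_es_consumer.py separately;
# this just starts both halves from one place for local testing.
from multiprocessing import Process
from oplog import Oplog
from elasticsearch_api import ElasticSearch

if __name__ == "__main__":
    Process(target=ElasticSearch().start).start()
    Process(target=Oplog().start).start()
</pre>
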
--------------------------------------------------------------------------------
/scrutineer.py:
--------------------------------------------------------------------------------
# sometimes you gotta check that everything's correct: walk the ES index and
# reconcile it against what is actually in mongo.
from oplog import Oplog
from elasticsearch_api import ElasticSearch
from json import loads
import pdb

# drop into the debugger first so the reconciliation below can be stepped through by hand
pdb.set_trace()

config = loads(open("config.json").read())

# the oplog emitter uses the mongo collection name as the ES doctype,
# so the same name works on both sides here
doctype = "DeliveryEvent"
db = Oplog().get_actual_database()
collection = db[doctype]
es = ElasticSearch()

query = {"query": {"bool": {"must": [{"term": {"TY": 0}}], "must_not": [], "should": []}},
         "from": 0, "size": 50, "sort": [], "facets": {}}

# work from the ES side of things, to avoid blocking mongo. iterate over all of
# the records in ES and make sure they're up to date with what is in mongo.
for record in es.scroll_search(doctype, query):
    docs = collection.find({"_id": record['_source']['OID']})
    if docs.count() == 0:
        # the document exists in ES but not in mongo, so delete it from ES
        result = es.delete(record["_id"], index=config["elasticsearch"]["index"], doctype=doctype, bulk=True)
        print "deleted " + record["_id"] + " from ES because it's not in mongo"
    else:
        # this should never be above 1, but iterate just in case
        for doc in docs:
            # write your own sync code here. this is mine: reindex when TY has drifted.
            if doc['TY'] != record['_source']['TY']:
                result = es.index(doc, index=config["elasticsearch"]["index"], doctype=doctype, bulk=True)
                print "indexed a new version of " + record['_id'] + " because it's stale"

# flush anything still sitting in the bulk queue
if es.activity_queue:
    es.purge_queue()
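
Before paying for a full reconciliation pass, a cheaper spot check is sometimes enough. A rough sketch using count_matches from elasticsearch_api.py, assuming config.json is filled in and that the mongo collection shares the DeliveryEvent name used as the ES doctype above:

```python
# a rough spot check: compare total counts on both sides before running the full
# scrutineer pass. DeliveryEvent as the collection/doctype name is an assumption.
from oplog import Oplog
from elasticsearch_api import ElasticSearch

doctype = "DeliveryEvent"
db = Oplog().get_actual_database()
es = ElasticSearch()

es_total = es.count_matches(doctype, {"query": {"match_all": {}}, "size": 0})
mongo_total = db[doctype].count()
print "ES holds %d documents, mongo holds %d" % (es_total, mongo_total)
```
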
--------------------------------------------------------------------------------
/elasticsearch_api.py:
--------------------------------------------------------------------------------
from json import dumps, loads
from hashlib import sha224
from datetime import datetime
from requests import post, get
import zmq

class ElasticSearch:

    def __init__(self):
        # index and delete actions share one bulk queue; deletes are flushed
        # immediately while index actions are batched (see the thresholds below)
        self.activity_queue = []
        self.last_queue_purge = datetime.now()
        self.config = loads(open("config.json").read())

    def start(self):
        # pull operations off the zmq socket that the oplog emitter pushes to.
        # only index operations are emitted right now (see oplog.py), so the
        # "op" field of the message is not inspected.
        context = zmq.Context()
        op_queue = context.socket(zmq.PULL)
        op_queue.connect("tcp://127.0.0.1:5555")
        print "started listening"
        while True:
            message = op_queue.recv()
            operation = loads(message)
            self.index(index=operation["index"], document=operation["data"], doctype=operation["doctype"], bulk=True)

    # each document needs a totally unique id: if two documents ever collide,
    # one will silently overwrite the other in the index.
    def generate_document_id(self, items):
        return sha224("".join(map(str, items))).hexdigest()

    def index(self, document, index, doctype, bulk=True):
        if bulk:
            index_command = {"index": {"_index": index, "_type": doctype, "_id": self.generate_document_id([document["MID"], document["OID"]])}}
            message = dumps(index_command) + "\n" + dumps(document) + "\n"
            self.activity_queue.append(message)

            # is it time to purge the queue?
            if len(self.activity_queue) > 1000:
            #if (datetime.now() - self.last_queue_purge).seconds >= self.config["queue_purge_frequency"]:
                self.purge_queue()
                self.last_queue_purge = datetime.now()
        else:
            pass

    def delete(self, documentID, index, doctype, bulk=True, routing=None):
        if bulk:
            if routing == None:
                delete_command = {"delete": {"_index": index, "_type": doctype, "_id": documentID}}
            else:
                delete_command = {"delete": {"_index": index, "_type": doctype, "_id": documentID, "_routing": routing}}

            self.activity_queue.append(dumps(delete_command) + "\n")

            # deletes always flush the queue immediately (this condition is always true)
            if len(self.activity_queue) >= 0:
            #if (datetime.now() - self.last_queue_purge).seconds >= self.config["queue_purge_frequency"]:
                self.purge_queue()
                self.last_queue_purge = datetime.now()
        else:
            pass

    def count_matches(self, doctype, query):
        req = get(self.config["elasticsearch"]["connectionString"] + self.config["elasticsearch"]["index"] + "/" + doctype + "/_search", data=dumps(query))
        if req.status_code == 200:
            return loads(req.content)["hits"]["total"]

        return 0

    def scroll_search(self, doctype, query):
        # open a scan+scroll search and yield every hit, page by page
        req = get(self.config["elasticsearch"]["connectionString"] + self.config["elasticsearch"]["index"] + "/" + doctype + "/_search?search_type=scan&scroll=10m", data=dumps(query))

        if not req.status_code == 200:
            raise Exception(req.content)

        res = loads(req.content)
        total_hits = res['hits']['total']
        print "got back " + str(total_hits) + " results"
        scroll_id = res['_scroll_id']

        while True:
            new_req = get(self.config["elasticsearch"]["connectionString"] + '_search/scroll?scroll=10m', data=str(scroll_id))
            if not new_req.status_code == 200:
                raise Exception(new_req.content)

            new_res = loads(new_req.content)
            hits = new_res['hits']['hits']
            if not hits:
                # an empty page means the scroll is exhausted
                break

            # use the scroll id handed back with each page for the next request
            scroll_id = new_res['_scroll_id']
            for esdoc in hits:
                yield esdoc

    def purge_queue(self):
        if not self.activity_queue:
            return
        message = "".join(self.activity_queue)
        #print message

        # what if ES is down? we should wait on it.
        res = post(self.config["elasticsearch"]["connectionString"] + self.config["elasticsearch"]["index"] + "/_bulk", data=message)
        print res.content
        print "purged queue of " + str(len(self.activity_queue)) + " items. ES responded with: " + str(res.status_code)
        self.activity_queue = []
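
The consumer above expects JSON messages shaped like the ones oplog.py (below) emits. A sketch of a throwaway producer for exercising the consumer without a live oplog; the document contents are made up, but MID and OID have to exist because index() derives the ES _id from them:

```python
# a throwaway producer for testing ElasticSearch.start() without a live mongo oplog.
# start run_es_consumer.py first, then run this instead of run_op_emitter.py.
# the document fields and the DeliveryEvent doctype are hypothetical.
from json import dumps, loads
import zmq

config = loads(open("config.json").read())

context = zmq.Context()
es_queue = context.socket(zmq.PUSH)
es_queue.bind("tcp://127.0.0.1:5555")   # same endpoint oplog.py binds

fake_document = {"MID": 1, "OID": "test-oid-0001", "TY": 0}
es_queue.send(dumps({
    "op": "index",
    "data": fake_document,
    "index": config["elasticsearch"]["index"],
    "doctype": "DeliveryEvent"
}))
```
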
--------------------------------------------------------------------------------
/oplog.py:
--------------------------------------------------------------------------------
import pymongo
from pymongo.errors import AutoReconnect
from json import loads, dumps
import sys
from hooks import es_data_mapping
import zmq
from time import sleep

class Oplog:

    def __init__(self):
        self.last_record = None
        try:
            self.config = loads(open("config.json").read())
        except:
            print "failed to read config.json"
            sys.exit()

    def start(self):
        # push every interesting oplog entry onto the zmq socket the ES consumer pulls from
        context = zmq.Context()
        es_queue = context.socket(zmq.PUSH)
        es_queue.bind("tcp://127.0.0.1:5555")
        print "loading oplog...."
        for op, data, namespace in self.watch_oplog():
            if op == "i":
                document = es_data_mapping.remap(data)
                es_queue.send(dumps({"op": "index", "data": document, "index": self.config["elasticsearch"]["index"], "doctype": namespace}))
            elif op == "d":
                # don't handle deletes for now
                pass
                #self.es_queue.send(dumps({"op":"delete","data":document,"index":self.config["elasticsearch"]["index"],"doctype":namespace}))
                #mongo_delete_event.delete(index=self.config["elasticsearch"]["index"],doctype=namespace,documentID=data)

    def get_oplog_database(self):
        # open a connection to the oplog collection
        try:
            connection = pymongo.Connection(self.config["oplog"]["host"], self.config["oplog"]["port"])
            db = connection[self.config["oplog"]["db"]]
            oplog_collection = db[self.config["oplog"]["collection"]]
            return oplog_collection
        except:
            print "failed to connect to oplog"
            sys.exit()

    def get_actual_database(self):
        # open a connection to the actual data db
        try:
            connection = pymongo.Connection(self.config["data"]["host"], self.config["data"]["port"])
            actual_data = connection[self.config["data"]["db"]]
            return actual_data
        except:
            print "failed to connect to actual database"
            sys.exit()

    def watch_oplog(self):
        oplog_collection = self.get_oplog_database()
        actual_data = self.get_actual_database()

        # run forevermore
        while True:
            try:
                # the oplog is in natural order, so we don't need to iterate to find the
                # latest insertion; we can just skip to near the end. backing up a little
                # bit is optional, but it is cheap insurance against missing recent entries.
                last_index = max(oplog_collection.count() - 100, 0)
                cursor = oplog_collection.find(skip=last_index, tailable=True)

                while cursor.alive:
                    for op in cursor:

                        # parse the namespace of the op to figure out which collection it's coming from
                        try:
                            op_collection_name = op['ns'].split('.', 1)[1]
                        except:
                            continue

                        # only emit ops for collections we want to index
                        if op_collection_name not in self.config["types_we_care_about"]:
                            continue

                        if op['op'] == 'i':
                            yield ('i', op['o'], op_collection_name)
                        elif op['op'] == 'u':
                            # fetch the latest version of this document. luckily, the op gives us the criteria.
                            criteria = op['o2']
                            data_cursor = actual_data[op_collection_name]
                            docs = data_cursor.find(criteria)

                            # it's basically impossible for there to be more than one document here,
                            # but iterate just in case
                            for doc in docs:
                                yield ('i', doc, op_collection_name)

                        elif op['op'] == 'd':
                            yield ('d', op['o']['_id'], op_collection_name)

            except AutoReconnect:
                # log this terrible fact; back off briefly and let the outer loop reconnect
                sleep(1)

--------------------------------------------------------------------------------
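
For reference, the oplog entry shapes that watch_oplog() keys off of. The namespace and field values here are made up, but the op / ns / o / o2 keys are what MongoDB writes to the oplog:

```python
# illustrative oplog entries; "mydb" and the document fields are hypothetical.
insert_op = {"op": "i", "ns": "mydb.DeliveryEvent",
             "o": {"_id": "507f1f77bcf86cd799439011", "MID": 1, "OID": "test-oid-0001", "TY": 0}}
update_op = {"op": "u", "ns": "mydb.DeliveryEvent",
             "o2": {"_id": "507f1f77bcf86cd799439011"},   # criteria used to re-fetch the document
             "o": {"$set": {"TY": 1}}}
delete_op = {"op": "d", "ns": "mydb.DeliveryEvent",
             "o": {"_id": "507f1f77bcf86cd799439011"}}
```
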