├── hooks
│   ├── __init__.py
│   └── es_data_mapping.py
├── .gitignore
├── run_op_emitter.py
├── run_es_consumer.py
├── config.json.default
├── README.md
├── scrutineer.py
├── elasticsearch_api.py
└── oplog.py

--------------------------------------------------------------------------------
/hooks/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
config.json
*.pyc

--------------------------------------------------------------------------------
/hooks/es_data_mapping.py:
--------------------------------------------------------------------------------
# This hook remaps a mongo document into whatever you want indexed in ElasticSearch.
# The default is a pass-through: you probably don't need to change anything here.
def remap(document):
    return document
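
The pass-through hook above is the extension point for reshaping documents before they reach the index. A minimal sketch of a customized hook; the field names (first_name, last_name, password) are hypothetical and not part of this repo:

```python
# a hedged sketch of a customized hooks/es_data_mapping.py; the field names
# (first_name, last_name, password) are hypothetical, not from this repo.
def remap(document):
    indexed = dict(document)
    # drop a field that should never be searchable
    indexed.pop("password", None)
    # derive a field that is convenient to query on
    if "first_name" in indexed and "last_name" in indexed:
        indexed["full_name"] = indexed["first_name"] + " " + indexed["last_name"]
    return indexed
```
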
--------------------------------------------------------------------------------
/run_op_emitter.py:
--------------------------------------------------------------------------------
from oplog import Oplog
from multiprocessing import Process

def run():
    op_emitter = Oplog()
    Process(target=op_emitter.start).start()

if __name__ == "__main__":
    run()

--------------------------------------------------------------------------------
/run_es_consumer.py:
--------------------------------------------------------------------------------
from elasticsearch_api import ElasticSearch
from multiprocessing import Process

def run():
    es = ElasticSearch()
    Process(target=es.start).start()

if __name__ == "__main__":
    run()

--------------------------------------------------------------------------------
/config.json.default:
--------------------------------------------------------------------------------
{
  "elasticsearch":{
    "connectionString":"",
    "index":""
  },
  "types_we_care_about":[
    "",
    ""
  ],
  "oplog":{
    "host":"",
    "port":,
    "db":"",
    "collection":""
  },
  "data":{
    "host":"",
    "port":,
    "db":""
  },
  "queue_purge_frequency":2
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
<pre>
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2012 Oren Mazor

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
</pre>
I wrote this tool so that I can consume an extremely high-bandwidth MongoDB cluster through an ElasticSearch (Lucene) index cluster. Everything needs to be real time (i.e. the ES index has to stay as close as possible to the state of the MongoDB cluster), and all of the work in this repository is aimed at that goal. The op emitter (run_op_emitter.py) tails the MongoDB oplog and pushes insert/update operations onto a local ZeroMQ socket; the ES consumer (run_es_consumer.py) pulls those operations off the socket and bulk-indexes them into ElasticSearch.
Current versions of guaranteed compatibility: ElasticSearch 0.19.2, MongoDB 2.0.4, Python 2.7.2
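
To run it: copy config.json.default to config.json, fill in the ElasticSearch and MongoDB connection details, then start the two halves: run_op_emitter.py tails the oplog and run_es_consumer.py feeds ElasticSearch. A rough sketch that starts both halves from one script for local testing (the repo itself ships them as separate entry points, and it assumes config.json has already been filled in):

<pre>
# a sketch only: the repo runs run_op_emitter.py and run_es_consumer.py separately;
# this just starts both halves from one place for local testing.
from multiprocessing import Process
from oplog import Oplog
from elasticsearch_api import ElasticSearch

if __name__ == "__main__":
    Process(target=ElasticSearch().start).start()
    Process(target=Oplog().start).start()
</pre>
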
--------------------------------------------------------------------------------
/scrutineer.py:
--------------------------------------------------------------------------------
# sometimes you gotta check that everything's correct: walk the ES index and
# reconcile it against what is actually in mongo.
from oplog import Oplog
from elasticsearch_api import ElasticSearch
from json import loads
import pdb

# drop into the debugger first so the reconciliation below can be stepped through by hand
pdb.set_trace()

config = loads(open("config.json").read())

# the oplog emitter uses the mongo collection name as the ES doctype,
# so the same name works on both sides here
doctype = "DeliveryEvent"
db = Oplog().get_actual_database()
collection = db[doctype]
es = ElasticSearch()

query = {"query": {"bool": {"must": [{"term": {"TY": 0}}], "must_not": [], "should": []}},
         "from": 0, "size": 50, "sort": [], "facets": {}}

# work from the ES side of things, to avoid blocking mongo. iterate over all of
# the records in ES and make sure they're up to date with what is in mongo.
for record in es.scroll_search(doctype, query):
    docs = collection.find({"_id": record['_source']['OID']})
    if docs.count() == 0:
        # the document exists in ES but not in mongo, so delete it from ES
        result = es.delete(record["_id"], index=config["elasticsearch"]["index"], doctype=doctype, bulk=True)
        print "deleted " + record["_id"] + " from ES because it's not in mongo"
    else:
        # this should never be above 1, but iterate just in case
        for doc in docs:
            # write your own sync code here. this is mine: reindex when TY has drifted.
            if doc['TY'] != record['_source']['TY']:
                result = es.index(doc, index=config["elasticsearch"]["index"], doctype=doctype, bulk=True)
                print "indexed a new version of " + record['_id'] + " because it's stale"

# flush anything still sitting in the bulk queue
if es.activity_queue:
    es.purge_queue()
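
Before paying for a full reconciliation pass, a cheaper spot check is sometimes enough. A rough sketch using count_matches from elasticsearch_api.py, assuming config.json is filled in and that the mongo collection shares the DeliveryEvent name used as the ES doctype above:

```python
# a rough spot check: compare total counts on both sides before running the full
# scrutineer pass. DeliveryEvent as the collection/doctype name is an assumption.
from oplog import Oplog
from elasticsearch_api import ElasticSearch

doctype = "DeliveryEvent"
db = Oplog().get_actual_database()
es = ElasticSearch()

es_total = es.count_matches(doctype, {"query": {"match_all": {}}, "size": 0})
mongo_total = db[doctype].count()
print "ES holds %d documents, mongo holds %d" % (es_total, mongo_total)
```
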
--------------------------------------------------------------------------------
/elasticsearch_api.py:
--------------------------------------------------------------------------------
from json import dumps, loads
from hashlib import sha224
from datetime import datetime
from requests import post, get
import zmq

class ElasticSearch:

    def __init__(self):
        # index and delete actions share one bulk queue; deletes are flushed
        # immediately while index actions are batched (see the thresholds below)
        self.activity_queue = []
        self.last_queue_purge = datetime.now()
        self.config = loads(open("config.json").read())

    def start(self):
        # pull operations off the zmq socket that the oplog emitter pushes to.
        # only index operations are emitted right now (see oplog.py), so the
        # "op" field of the message is not inspected.
        context = zmq.Context()
        op_queue = context.socket(zmq.PULL)
        op_queue.connect("tcp://127.0.0.1:5555")
        print "started listening"
        while True:
            message = op_queue.recv()
            operation = loads(message)
            self.index(index=operation["index"], document=operation["data"], doctype=operation["doctype"], bulk=True)

    # each document needs a totally unique id: if two documents ever collide,
    # one will silently overwrite the other in the index.
    def generate_document_id(self, items):
        return sha224("".join(map(str, items))).hexdigest()

    def index(self, document, index, doctype, bulk=True):
        if bulk:
            index_command = {"index": {"_index": index, "_type": doctype, "_id": self.generate_document_id([document["MID"], document["OID"]])}}
            message = dumps(index_command) + "\n" + dumps(document) + "\n"
            self.activity_queue.append(message)

            # is it time to purge the queue?
            if len(self.activity_queue) > 1000:
            #if (datetime.now() - self.last_queue_purge).seconds >= self.config["queue_purge_frequency"]:
                self.purge_queue()
                self.last_queue_purge = datetime.now()
        else:
            pass

    def delete(self, documentID, index, doctype, bulk=True, routing=None):
        if bulk:
            if routing == None:
                delete_command = {"delete": {"_index": index, "_type": doctype, "_id": documentID}}
            else:
                delete_command = {"delete": {"_index": index, "_type": doctype, "_id": documentID, "_routing": routing}}

            self.activity_queue.append(dumps(delete_command) + "\n")

            # deletes always flush the queue immediately (this condition is always true)
            if len(self.activity_queue) >= 0:
            #if (datetime.now() - self.last_queue_purge).seconds >= self.config["queue_purge_frequency"]:
                self.purge_queue()
                self.last_queue_purge = datetime.now()
        else:
            pass

    def count_matches(self, doctype, query):
        req = get(self.config["elasticsearch"]["connectionString"] + self.config["elasticsearch"]["index"] + "/" + doctype + "/_search", data=dumps(query))
        if req.status_code == 200:
            return loads(req.content)["hits"]["total"]

        return 0

    def scroll_search(self, doctype, query):
        # open a scan+scroll search and yield every hit, page by page
        req = get(self.config["elasticsearch"]["connectionString"] + self.config["elasticsearch"]["index"] + "/" + doctype + "/_search?search_type=scan&scroll=10m", data=dumps(query))

        if not req.status_code == 200:
            raise Exception(req.content)

        res = loads(req.content)
        total_hits = res['hits']['total']
        print "got back " + str(total_hits) + " results"
        scroll_id = res['_scroll_id']

        while True:
            new_req = get(self.config["elasticsearch"]["connectionString"] + '_search/scroll?scroll=10m', data=str(scroll_id))
            if not new_req.status_code == 200:
                raise Exception(new_req.content)

            new_res = loads(new_req.content)
            hits = new_res['hits']['hits']
            if not hits:
                # an empty page means the scroll is exhausted
                break

            # use the scroll id handed back with each page for the next request
            scroll_id = new_res['_scroll_id']
            for esdoc in hits:
                yield esdoc

    def purge_queue(self):
        if not self.activity_queue:
            return
        message = "".join(self.activity_queue)
        #print message

        # what if ES is down? we should wait on it.
        res = post(self.config["elasticsearch"]["connectionString"] + self.config["elasticsearch"]["index"] + "/_bulk", data=message)
        print res.content
        print "purged queue of " + str(len(self.activity_queue)) + " items. ES responded with: " + str(res.status_code)
        self.activity_queue = []
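
The consumer above expects JSON messages shaped like the ones oplog.py (below) emits. A sketch of a throwaway producer for exercising the consumer without a live oplog; the document contents are made up, but MID and OID have to exist because index() derives the ES _id from them:

```python
# a throwaway producer for testing ElasticSearch.start() without a live mongo oplog.
# start run_es_consumer.py first, then run this instead of run_op_emitter.py.
# the document fields and the DeliveryEvent doctype are hypothetical.
from json import dumps, loads
import zmq

config = loads(open("config.json").read())

context = zmq.Context()
es_queue = context.socket(zmq.PUSH)
es_queue.bind("tcp://127.0.0.1:5555")   # same endpoint oplog.py binds

fake_document = {"MID": 1, "OID": "test-oid-0001", "TY": 0}
es_queue.send(dumps({
    "op": "index",
    "data": fake_document,
    "index": config["elasticsearch"]["index"],
    "doctype": "DeliveryEvent"
}))
```
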
--------------------------------------------------------------------------------
/oplog.py:
--------------------------------------------------------------------------------
import pymongo
from pymongo.errors import AutoReconnect
from json import loads, dumps
import sys
from hooks import es_data_mapping
import zmq
from time import sleep

class Oplog:

    def __init__(self):
        self.last_record = None
        try:
            self.config = loads(open("config.json").read())
        except:
            print "failed to read config.json"
            sys.exit()

    def start(self):
        # push every interesting oplog entry onto the zmq socket the ES consumer pulls from
        context = zmq.Context()
        es_queue = context.socket(zmq.PUSH)
        es_queue.bind("tcp://127.0.0.1:5555")
        print "loading oplog...."
        for op, data, namespace in self.watch_oplog():
            if op == "i":
                document = es_data_mapping.remap(data)
                es_queue.send(dumps({"op": "index", "data": document, "index": self.config["elasticsearch"]["index"], "doctype": namespace}))
            elif op == "d":
                # don't handle deletes for now
                pass
                #self.es_queue.send(dumps({"op":"delete","data":document,"index":self.config["elasticsearch"]["index"],"doctype":namespace}))
                #mongo_delete_event.delete(index=self.config["elasticsearch"]["index"],doctype=namespace,documentID=data)

    def get_oplog_database(self):
        # open a connection to the oplog collection
        try:
            connection = pymongo.Connection(self.config["oplog"]["host"], self.config["oplog"]["port"])
            db = connection[self.config["oplog"]["db"]]
            oplog_collection = db[self.config["oplog"]["collection"]]
            return oplog_collection
        except:
            print "failed to connect to oplog"
            sys.exit()

    def get_actual_database(self):
        # open a connection to the actual data db
        try:
            connection = pymongo.Connection(self.config["data"]["host"], self.config["data"]["port"])
            actual_data = connection[self.config["data"]["db"]]
            return actual_data
        except:
            print "failed to connect to actual database"
            sys.exit()

    def watch_oplog(self):
        oplog_collection = self.get_oplog_database()
        actual_data = self.get_actual_database()

        # run forevermore
        while True:
            try:
                # the oplog is in natural order, so we don't need to iterate to find the
                # latest insertion; we can just skip to near the end. backing up a little
                # bit is optional, but it is cheap insurance against missing recent entries.
                last_index = max(oplog_collection.count() - 100, 0)
                cursor = oplog_collection.find(skip=last_index, tailable=True)

                while cursor.alive:
                    for op in cursor:

                        # parse the namespace of the op to figure out which collection it's coming from
                        try:
                            op_collection_name = op['ns'].split('.', 1)[1]
                        except:
                            continue

                        # only emit ops for collections we want to index
                        if op_collection_name not in self.config["types_we_care_about"]:
                            continue

                        if op['op'] == 'i':
                            yield ('i', op['o'], op_collection_name)
                        elif op['op'] == 'u':
                            # fetch the latest version of this document. luckily, the op gives us the criteria.
                            criteria = op['o2']
                            data_cursor = actual_data[op_collection_name]
                            docs = data_cursor.find(criteria)

                            # it's basically impossible for there to be more than one document here,
                            # but iterate just in case
                            for doc in docs:
                                yield ('i', doc, op_collection_name)

                        elif op['op'] == 'd':
                            yield ('d', op['o']['_id'], op_collection_name)

            except AutoReconnect:
                # log this terrible fact; back off briefly and let the outer loop reconnect
                sleep(1)

--------------------------------------------------------------------------------
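
For reference, the oplog entry shapes that watch_oplog() keys off of. The namespace and field values here are made up, but the op / ns / o / o2 keys are what MongoDB writes to the oplog:

```python
# illustrative oplog entries; "mydb" and the document fields are hypothetical.
insert_op = {"op": "i", "ns": "mydb.DeliveryEvent",
             "o": {"_id": "507f1f77bcf86cd799439011", "MID": 1, "OID": "test-oid-0001", "TY": 0}}
update_op = {"op": "u", "ns": "mydb.DeliveryEvent",
             "o2": {"_id": "507f1f77bcf86cd799439011"},   # criteria used to re-fetch the document
             "o": {"$set": {"TY": 1}}}
delete_op = {"op": "d", "ns": "mydb.DeliveryEvent",
             "o": {"_id": "507f1f77bcf86cd799439011"}}
```
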