├── .gitignore ├── README.md ├── avalanche.py ├── console.py ├── pipelines └── default.json ├── plugins ├── __init__.py ├── base.py ├── capacitor.py ├── dumper.py ├── emailer.py ├── geoip │ ├── GeoIPASNum.dat │ └── plugin.py ├── kafka-consumer.py ├── lru-cache.py ├── matcher.py ├── mongo.py ├── mysql.py ├── replayer.py ├── sampler.py ├── template.py ├── test │ ├── counter.py │ ├── random.py │ └── tagger.py └── unifier.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_Store 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Avalanche 2 | Realtime and Online Model Development Framework 3 | 4 | 5 | ## Quick Overview 6 | 7 | Avalanche is a framework that helps data analysts of the world implement their realtime models in a modular and distributed fashion. It is currently written in Python and leverages the mighty power of ZMQ streams. It technically helps you define a pipeline made of data processing nodes and stream connection edges. The edges connect the nodes and control how the data runs through the pipelines; the nodes process the data in your desired fashion. 8 | 9 | ## How to use 10 | 11 | Let's take a more practical look at the tool: 12 | 13 |
$ git clone https://github.com/ThibaultReuille/avalanche.git
14 | $ cd avalanche
15 | $ ./avalanche.py pipelines/default.json
16 | 
17 | 18 | Your are now running the default processing graph! It doesn't do much for now but this is about to change. We will now take a look at our default pipeline and understand how to manipulate it to make it do what we want. 19 | 20 | [Default Avalanche Pipeline](https://github.com/ThibaultReuille/avalanche/blob/master/pipelines/default.json "Default Avalanche Pipeline") 21 | 22 | This default pipeline configuration consists of 3 parts: 23 | 24 | - Attributes: This part defines the general environment of the stream graph. In particular, it can load the various plugins used in your pipeline. 25 | 26 | - Nodes: In this part, you can define all the processing nodes. In practice, each node will usually receive incoming messages, process them and send them through the pipeline. 27 | 28 | - Edges : This is where you connect the different nodes together and connect them to create your full data processing pipeline. 29 | 30 | Great! Now you know how to virtually define your full message processing pipeline with Avalanche! Let us dig deeper and explore how to create your own custom models. 31 | 32 | ## Write your own plugin 33 | 34 | For a more accurate description, let's refer to our plugin template: 35 | 36 | [Avalanche Plugin Template](https://github.com/ThibaultReuille/avalanche/blob/master/plugins/template.py "Avalanche Plugin Template") 37 | 38 | For the most part, the comments in this template file are self-explanatory. However we may simply add that each node plugin will loaded and bound to its node. The node information and members can be retrieved either in the constructor or through the node instance. The node processing code will run in its own thread and will receive/send info through the input/output node streams. The plugin definition interface is easy and simple enough for you to implement any kind of metrics, models, filters or any other realtime pipeline element. 39 | 40 | The rest is up to you! 41 | 42 | -------------------------------------------------------------------------------- /avalanche.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import json 5 | import pprint as pp 6 | import zmq 7 | import time 8 | import threading 9 | import md5 10 | import os.path 11 | import imp 12 | import traceback 13 | import string 14 | import random 15 | 16 | from abc import ABCMeta, abstractmethod 17 | 18 | import plugins.base 19 | 20 | context = dict() 21 | 22 | # ----- Nodes ----- 23 | 24 | class Node(object): 25 | def __init__(self, info): 26 | global context 27 | 28 | self.info = info 29 | 30 | # NOTE: Default connectors are sub/pub 31 | if "connectors" not in self.info: 32 | self.info['connectors'] = [ "sub", "pub" ] 33 | 34 | self.connectors = list() 35 | for connector in self. info['connectors']: 36 | if isinstance(connector, dict): 37 | self.connectors.append(connector) 38 | else: 39 | self.connectors.append({ 'type' : connector }) 40 | 41 | self.predecessors = list() 42 | self.successors = list() 43 | 44 | self.thread = None 45 | if 'type' in self.info and self.info['type'] == 'rack': 46 | self.plugin = plugins.base.PluginRack() 47 | print(" . plugins") 48 | for p in self.info['plugins']: 49 | rack_plugin_type = p['type'] 50 | print(" + {0}".format(rack_plugin_type)) 51 | rack_plugin = context['plugins'][rack_plugin_type](p) 52 | self.plugin.plugins.append(rack_plugin) 53 | elif 'type' in self.info and self.info['type'] != 'virtual': 54 | self.plugin = context['plugins'][self.info['type']](info) 55 | else: 56 | self.plugin = None 57 | 58 | class ZMQ_Node(Node): 59 | def __init__(self, info): 60 | global context 61 | super(ZMQ_Node, self).__init__(info) 62 | 63 | binders = [ 'pull', 'pub', 'router' ] 64 | 65 | for connector in self.connectors: 66 | if connector['type'] is None: 67 | continue 68 | if connector['type'] in binders: 69 | if 'url' in connector: 70 | print(" . {0}: {1}".format(connector['type'], connector['url'])) 71 | elif 'port' not in connector: 72 | if context['ports']['next'] > context['ports']['stop']: 73 | print("[WARNING] Defined port range is too small for pipeline! Collision may happen.") 74 | connector['port'] = context['ports']['next'] 75 | context['ports']['next'] += 1 76 | print(" . {0}: {1}".format(connector['type'], connector['port'])) 77 | else: 78 | print(" . {0}".format(connector['type'])) 79 | 80 | #print(" Connectors: {0}".format(self.connectors)) 81 | 82 | def initialize(self): 83 | 84 | if self.info['type'] == "virtual": 85 | return 86 | 87 | context = zmq.Context.instance() 88 | 89 | # ------ Input ----- 90 | 91 | input_connector = self.connectors[0] 92 | 93 | if input_connector['type'] is None: 94 | self.input = None 95 | 96 | elif input_connector['type'] == "sub": 97 | self.input = context.socket(zmq.SUB) 98 | for predecessor in self.predecessors: 99 | if 'url' in predecessor.connectors[1]: 100 | src_url = predecessor.connectors[1]['url'] 101 | else: 102 | src_url = "tcp://localhost:{0}".format(predecessor.connectors[1]['port']) 103 | 104 | print(" Connecting sub to {0} ...".format(src_url)) 105 | self.input.connect(src_url) 106 | self.input.setsockopt(zmq.SUBSCRIBE, '') 107 | 108 | elif input_connector['type'] == "pull": 109 | self.input = context.socket(zmq.PULL) 110 | url = "tcp://*:{0}".format(input_connector['port']) 111 | print(" Binding pull on {0} ...".format(url)) 112 | self.input.bind(url) 113 | 114 | elif input_connector['type'] == "dealer": 115 | self.input = context.socket(zmq.DEALER) 116 | self.identity = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) 117 | self.input.setsockopt(zmq.IDENTITY, self.identity) 118 | for predecessor in self.predecessors: 119 | if 'url' in predecessor.connectors[1]: 120 | src_url = predecessor.connectors[1]['url'] 121 | else: 122 | src_url = "tcp://localhost:{0}".format(predecessor.connectors[1]['port']) 123 | 124 | print(" Connecting dealer to {0} ...".format(src_url)) 125 | self.input.connect(src_url) 126 | 127 | else: 128 | print("[ERROR] '{0}': Unsupported input connector type!".format(self.connectors[0])) 129 | 130 | # ----- Output ----- 131 | 132 | output_connector = self.connectors[1] 133 | 134 | if output_connector['type'] is None: 135 | self.output = None 136 | 137 | elif output_connector['type'] == "pub": 138 | self.output = context.socket(zmq.PUB) 139 | url = "tcp://*:{0}".format(output_connector['port']) 140 | print(" Binding pub on {0} ...".format(url)) 141 | self.output.bind(url) 142 | 143 | elif output_connector['type'] == "push": 144 | self.output = context.socket(zmq.PUSH) 145 | for successor in self.successors: 146 | if 'url' in successor.connectors[0]: 147 | dst_url = successor.connectors[0]['url'] 148 | else: 149 | dst_url = "tcp://localhost:{0}".format(successor.connectors[0]['port']) 150 | 151 | print(" Connecting push to {0} ...".format(dst_url)) 152 | self.output.connect(dst_url) 153 | 154 | elif output_connector['type'] == "router": 155 | self.output = context.socket(zmq.ROUTER) 156 | url = "tcp://*:{0}".format(output_connector['port']) 157 | print(" Binding router on {0} ...".format(url)) 158 | self.output.bind(url) 159 | 160 | else: 161 | print("[ERROR] '{0}': Unsupported output connector type!".format(self.connectors[1])) 162 | 163 | def run(self): 164 | self.initialize() 165 | # TODO : We should have plugins wait here for all to be ready 166 | if self.plugin is not None: 167 | self.plugin.run(self) 168 | 169 | # ----- Edges ----- 170 | 171 | class Edge(object): 172 | def __init__(self, info): 173 | pass 174 | def run(self): 175 | pass 176 | 177 | class ZMQ_Edge(Edge): 178 | def __init__(self, info): 179 | super(ZMQ_Edge, self).__init__(info) 180 | 181 | global context 182 | self.info = info 183 | self.src = context['graph'].nodes[info['src']] 184 | self.dst = context['graph'].nodes[info['dst']] 185 | 186 | print("\t. Connecting {0} and {1} ...".format(self.src.info['id'], self.dst.info['id'])) 187 | self.dst.predecessors.append(self.src) 188 | self.src.successors.append(self.dst) 189 | 190 | def run(self): 191 | pass 192 | 193 | class Graph(object): 194 | def __init__(self): 195 | self.nodes = dict() 196 | self.edges = dict() 197 | self.threads = list() 198 | 199 | def create_node(self, info): 200 | if 'id' not in info or 'type' not in info: 201 | return None 202 | uid = info['id'] 203 | self.nodes[uid] = ZMQ_Node(info) 204 | 205 | def create_edge(self, info): 206 | if 'id' not in info: 207 | return None 208 | uid = info['id'] 209 | self.edges[uid] = ZMQ_Edge(info) 210 | 211 | def start(self): 212 | for k in self.nodes.keys(): 213 | if self.nodes[k].plugin is None: 214 | continue 215 | repr(self.nodes[k]) 216 | self.nodes[k].thread = threading.Thread(target=self.nodes[k].run) 217 | self.nodes[k].thread.start() 218 | self.threads.append(self.nodes[k].thread) 219 | 220 | def wait(self): 221 | for thread in self.threads: 222 | thread.join() 223 | 224 | # ----- Main ----- 225 | 226 | def load_module(code_path): 227 | try: 228 | try: 229 | code_dir = os.path.dirname(code_path) 230 | code_file = os.path.basename(code_path) 231 | fin = open(code_path, 'rb') 232 | return imp.load_source(md5.new(code_path).hexdigest(), code_path, fin) 233 | finally: 234 | try: fin.close() 235 | except: pass 236 | except ImportError, x: 237 | traceback.print_exc(file = sys.stderr) 238 | raise 239 | except: 240 | traceback.print_exc(file = sys.stderr) 241 | raise 242 | 243 | def main(conf): 244 | 245 | global context 246 | 247 | # ----- Load plugins ----- 248 | 249 | context['plugins'] = dict() 250 | 251 | if "attributes" in conf: 252 | if "plugins" in conf['attributes']: 253 | for plugin in conf['attributes']['plugins']: 254 | name = plugin['name'] 255 | filename = plugin['filename'] 256 | 257 | plugin_module = load_module(filename) 258 | plugin_class = getattr(plugin_module, 'Plugin') 259 | context['plugins'][name] = plugin_class 260 | 261 | print("[PLUGIN] {0}: {1}".format(name, filename)) 262 | 263 | # ----- Load graph ----- 264 | 265 | context['graph'] = Graph() 266 | 267 | for item in conf['nodes']: 268 | print("[NODE] {0}: {1}".format(item['id'], item['type'])) 269 | context['graph'].create_node(item) 270 | 271 | edges = dict() 272 | for item in conf['edges']: 273 | print("[EDGE] {0}: {1} -> {2}".format(item['id'], item['src'], item['dst'])) 274 | context['graph'].create_edge(item) 275 | 276 | print("[INFO] Launching node threads ...") 277 | context['graph'].start() 278 | print("[INFO] Running.") 279 | context['graph'].wait() 280 | 281 | while True: 282 | print("Main idle.") 283 | time.sleep(1) 284 | 285 | if __name__ == "__main__": 286 | 287 | if len(sys.argv) != 3: 288 | print("Usage: {0} ".format(sys.argv[0])) 289 | sys.exit(0) 290 | 291 | ports = [ int(p) for p in sys.argv[2].split('-') ] 292 | if len(ports) == 1: 293 | ports.append(65535) 294 | 295 | context['ports'] = { 296 | 'start' : ports[0], 297 | 'stop' : ports[1], 298 | 'next' : ports[0] 299 | } 300 | 301 | with open(sys.argv[1], "rU") as conf: 302 | jconf = json.load(conf) 303 | main(jconf) -------------------------------------------------------------------------------- /console.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import zmq 5 | import json 6 | import time 7 | import pprint as pp 8 | 9 | if len(sys.argv) != 3: 10 | print("Usage: {0} ".format(sys.argv[0])) 11 | sys.exit(0) 12 | 13 | ctx = zmq.Context() 14 | s = ctx.socket(zmq.SUB) 15 | 16 | url = "tcp://{0}:{1}".format(sys.argv[1], sys.argv[2]) 17 | print("Connecting to " + url + " ...") 18 | s.connect(url) 19 | s.setsockopt(zmq.SUBSCRIBE, '') 20 | 21 | metrics = dict() 22 | metrics['volume'] = 0 23 | metrics['start_time'] = time.time() 24 | 25 | time_delay = 10 26 | 27 | while True: 28 | line = s.recv() 29 | 30 | metrics['current_time'] = time.time() 31 | metrics['volume'] += 1 32 | 33 | if metrics['current_time'] - metrics['start_time'] > time_delay: 34 | 35 | metrics['msg/sec'] = metrics['volume'] / (metrics['current_time'] - metrics['start_time']) 36 | print(json.dumps(metrics)) 37 | 38 | metrics['start_time'] = metrics['current_time'] 39 | metrics['volume'] = 0 40 | 41 | print("{0}: {1}".format(time.time(), line)) 42 | 43 | -------------------------------------------------------------------------------- /pipelines/default.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "attributes" : { 4 | "plugins" : [ 5 | { "name" : "sampler", "filename" : "plugins/sampler.py" } 6 | ] 7 | }, 8 | 9 | "nodes" : [ 10 | { 11 | "id" : 0, 12 | "type" : "stream", 13 | "url" : "tcp://localhost:10000/" 14 | }, 15 | 16 | { 17 | "id" : 1, 18 | "type" : "sampler", 19 | "port" : 10001, 20 | "attributes" : 21 | { 22 | "probability" : 0.01 23 | } 24 | } 25 | ], 26 | 27 | "edges" : [ 28 | { 29 | "id" : 0, 30 | "src" : 0, 31 | "dst" : 1 32 | } 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibaultReuille/avalanche/846a0c83bc782049403d08ce1dbb75d5ec3be5d6/plugins/__init__.py -------------------------------------------------------------------------------- /plugins/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import datetime 4 | 5 | from collections import deque 6 | 7 | class Plugin(object): 8 | def __init__(self, configuration): 9 | self.configuration = configuration 10 | 11 | def log(self, message): 12 | print("[{}] {} - {}".format( 13 | self.configuration["type"], 14 | datetime.datetime.now().strftime("%Y%m%d %H:%M:%S"), 15 | message 16 | )) 17 | 18 | def run(self, node): 19 | while True: 20 | data = node.input.recv() 21 | message = json.loads(data) 22 | 23 | output = self.process_message(message) 24 | 25 | if output is None: 26 | continue 27 | 28 | if isinstance(output, list): 29 | for msg in output: 30 | node.output.send_json(msg) 31 | else: 32 | node.output.send_json(output) 33 | 34 | def process_message(self, message): 35 | return message 36 | 37 | class PluginRack(Plugin): 38 | def __init__(self): 39 | self.plugins = list() 40 | 41 | def run(self, node): 42 | while True: 43 | 44 | data = node.input.recv() 45 | message = json.loads(data) 46 | 47 | input_messages = deque() 48 | input_messages.append(message) 49 | 50 | output_messages = deque() 51 | 52 | for plugin in self.plugins: 53 | 54 | while len(input_messages) > 0: 55 | msg = input_messages.popleft() 56 | output = plugin.process_message(msg) 57 | 58 | if output is None: 59 | continue 60 | elif isinstance(output, list): 61 | output_messages.extend(output) 62 | else: 63 | output_messages.append(output) 64 | 65 | input_messages, output_messages = output_messages, input_messages 66 | 67 | while len(input_messages) > 0: 68 | node.output.send_json(input_messages.popleft()) -------------------------------------------------------------------------------- /plugins/capacitor.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | import json 4 | import time 5 | from datetime import datetime 6 | import random 7 | import string 8 | import os 9 | 10 | class Plugin(plugins.base.Plugin): 11 | def __init__(self, info): 12 | self.message_limit = None 13 | self.time_limit = None 14 | self.cache = None 15 | 16 | if 'message-limit' in info['attributes']: 17 | self.message_limit = info['attributes']['message-limit'] 18 | if 'time-limit' in info['attributes']: 19 | self.time_limit = info['attributes']['time-limit'] 20 | 21 | if self.message_limit is None and self.time_limit is None: 22 | raise('[ERROR] No capacitor limit defined!') 23 | 24 | if 'cache' in info['attributes']: 25 | self.cache = info['attributes']['cache'] 26 | self.create_cache_dir(self.cache) 27 | 28 | self.last_time = time.time() 29 | self.last_count = 0 30 | self.messages = list() 31 | self.flush = False 32 | 33 | def create_cache_dir(self, path): 34 | try: 35 | os.makedirs(path) 36 | except OSError: 37 | if not os.path.isdir(path): 38 | raise 39 | 40 | def create_random_word(self, length): 41 | return ''.join(random.choice(string.lowercase) for i in range(length)) 42 | 43 | def process_message(self, message): 44 | 45 | if self.flush: 46 | self.messages = list() 47 | self.flush = False 48 | 49 | self.messages.append(message) 50 | 51 | t = time.time() 52 | 53 | if (self.message_limit is not None and len(self.messages) > self.message_limit) or \ 54 | (self.time_limit is not None and t - self.last_time > self.time_limit): 55 | 56 | # NOTE : This doesn't work if we don't receive messages 57 | if self.cache is not None: 58 | filename = "{0}/{1}.{2}.json".format(self.cache, datetime.utcnow().strftime("%Y-%m-%d_%H:%M:%S"), self.create_random_word(4)) 59 | with open(filename, 'w') as outfile: 60 | json.dump(self.messages, outfile) 61 | 62 | self.last_time = t 63 | self.flush = True 64 | 65 | return self.messages 66 | else: 67 | return None 68 | 69 | if __name__ == "__main__": 70 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/dumper.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | import json 3 | import time 4 | import datetime 5 | import os 6 | 7 | class Plugin(plugins.base.Plugin): 8 | def __init__(self, info): 9 | self.folder = info['attributes']['folder'] 10 | 11 | if "filename" in info['attributes']: 12 | self.filename = info['attributes']['filename'] 13 | else: 14 | self.filename = "%Y-%m-%d_%H:%M:%S.json" 15 | 16 | self.create_cache_dir(self.folder) 17 | 18 | def create_cache_dir(self, path): 19 | try: 20 | os.makedirs(path) 21 | except OSError: 22 | if not os.path.isdir(path): 23 | raise 24 | 25 | def process_message(self, message): 26 | 27 | now = time.time() 28 | 29 | timestamp = datetime.datetime.fromtimestamp(now).strftime(self.filename) 30 | with open("{0}/{1}".format(self.folder, timestamp), "w") as result_file: 31 | json.dump(message, result_file, indent=4) 32 | 33 | return message 34 | 35 | if __name__ == "__main__": 36 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/emailer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | import time 4 | import datetime 5 | 6 | import smtplib 7 | import email.utils 8 | from email.mime.text import MIMEText 9 | 10 | class Plugin(object): 11 | def __init__(self, info): 12 | self.smtp_server = info['attributes']['smtp-server'] 13 | self.mail_from = info['attributes']['mail-from'] 14 | self.mail_to = info['attributes']['mail-to'] 15 | self.mail_name = info['attributes']['mail-name'] 16 | self.mail_subject = info['attributes']['mail-subject'] 17 | 18 | def make_content(self, uid, message): 19 | content = "" 20 | content += "{0}\n\n".format(uid) 21 | content += json.dumps(message, indent=4, sort_keys=True) 22 | content += "\n\n{0}\n".format(uid) 23 | return content 24 | 25 | def run(self, node): 26 | while True: 27 | data = node.input.recv() 28 | message = json.loads(data) 29 | 30 | uid = uuid.uuid4() 31 | timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S UTC') 32 | 33 | subject = "{0} - {1}".format(self.mail_subject, timestamp) 34 | text = self.make_content(uid, message) 35 | 36 | msg = MIMEText(text, 'plain') 37 | msg['From'] = email.utils.formataddr((self.mail_name, self.mail_from)) 38 | msg['To'] = email.utils.formataddr(('Recipient', self.mail_to)) 39 | msg['Subject'] = subject 40 | 41 | server = smtplib.SMTP(self.smtp_server) 42 | try: 43 | server.sendmail(self.mail_name, [self.mail_to], msg.as_string()) 44 | finally: 45 | server.quit() 46 | 47 | node.output.send_json({ 48 | 'From' : [self.mail_name, self.mail_from], 49 | 'To' : self.mail_to, 50 | 'Subject' : subject, 51 | 'uid' : "{0}".format(uid) 52 | }) 53 | 54 | if __name__ == "__main__": 55 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/geoip/GeoIPASNum.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibaultReuille/avalanche/846a0c83bc782049403d08ce1dbb75d5ec3be5d6/plugins/geoip/GeoIPASNum.dat -------------------------------------------------------------------------------- /plugins/geoip/plugin.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | import pygeoip 4 | import json 5 | import os 6 | 7 | class Plugin(plugins.base.Plugin): 8 | def __init__(self, info): 9 | current_dir = os.path.dirname(os.path.realpath(__file__)) 10 | self.gi_asn = pygeoip.GeoIP(current_dir + "/GeoIPASNum.dat") 11 | self.actions = info['attributes']['actions'] 12 | 13 | def process_message(self, message): 14 | geoip = dict() 15 | 16 | for action in self.actions: 17 | value = None 18 | try: 19 | get_key = action['get'] 20 | if get_key not in message: 21 | continue 22 | set_key = action['set'] 23 | action_key = action['action'] 24 | if action_key == 'asn_by_addr': 25 | value = self.gi_asn.asn_by_addr(message[get_key]) 26 | # TODO : Implement other GeoIP actions 27 | except: 28 | pass 29 | finally: 30 | geoip[set_key] = value 31 | 32 | message['geoip'] = geoip 33 | 34 | return message 35 | 36 | 37 | if __name__ == "__main__": 38 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/kafka-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from kafka.client import KafkaClient 4 | from kafka.consumer import SimpleConsumer 5 | 6 | class Plugin(object): 7 | def __init__(self, info): 8 | self.host = info['attributes']['host'] 9 | self.group = info['attributes']['group'] 10 | self.topic = info['attributes']['topic'] 11 | 12 | self.client = KafkaClient(self.host) 13 | self.consumer = SimpleConsumer(client, self.group, self.topic) 14 | 15 | def run(self, node): 16 | 17 | while True: 18 | 19 | for message in self.consumer: 20 | node.output.send_json(message) 21 | 22 | if __name__ == "__main__": 23 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/lru-cache.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | import pylru 4 | 5 | class Plugin(plugins.base.Plugin): 6 | def __init__(self, info): 7 | self.size = info['attributes']['size'] 8 | self.key = info['attributes']['key'] 9 | 10 | self.cache = pylru.lrucache(self.size) 11 | self.ready = False 12 | 13 | def process_message(self, message): 14 | key = message[self.key] 15 | 16 | if key not in self.cache: 17 | self.cache[key] = True 18 | if self.ready: 19 | return message 20 | else: 21 | if len(self.cache) >= self.size: 22 | self.ready = True 23 | print("[LRU-CACHE] Cache is now ready.") 24 | return None 25 | else: 26 | # NOTE: Forcing lookup to update cache even though we don't need the value 27 | value = self.cache[key] 28 | return None 29 | 30 | if __name__ == "__main__": 31 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/matcher.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | import json 4 | import fnmatch 5 | import re 6 | 7 | class Filter(object): 8 | def __init__(self, attributes): 9 | self.field = attributes['field'] 10 | self.condition = attributes['condition'] 11 | self.result = attributes['result'] 12 | 13 | if 'values' in attributes: 14 | self.values = attributes['values'] 15 | elif 'file' in attributes: 16 | with open(attributes['file'], "rU") as infile: 17 | self.values = [ line.strip() for line in infile.readlines() ] 18 | else: 19 | raise Exception("No 'values' or 'file' field defined!") 20 | 21 | def test(self, message): 22 | raise Exception("Not implemented!") 23 | 24 | class InFilter(Filter): 25 | def __init__(self, attributes): 26 | super(InFilter, self).__init__(attributes) 27 | self.elements = set(self.values) 28 | 29 | def test(self, message): 30 | return message[self.field] in self.elements 31 | 32 | class MatchFilter(Filter): 33 | def __init__(self, attributes): 34 | super(MatchFilter, self).__init__(attributes) 35 | 36 | def test(self, message): 37 | for element in self.values: 38 | if not fnmatch.fnmatch(message[self.field], element): 39 | return False 40 | return True 41 | 42 | class RegexFilter(Filter): 43 | def __init__(self, attributes): 44 | super(RegexFilter, self).__init__(attributes) 45 | 46 | self.expressions = list() 47 | for element in self.values: 48 | self.expressions.append(re.compile(element)) 49 | 50 | def test(self, message): 51 | for expression in self.expressions: 52 | if not expression.match(message[self.field]): 53 | return False 54 | return True 55 | 56 | class Plugin(plugins.base.Plugin): 57 | def __init__(self, info): 58 | 59 | processor_info = info['attributes']['processor'] 60 | 61 | self.processor = list() 62 | for i in range(len(processor_info)): 63 | try: 64 | p = None 65 | if processor_info[i]['condition'] == "in": 66 | p = InFilter(processor_info[i]) 67 | elif processor_info[i]['condition'] == "match": 68 | p = MatchFilter(processor_info[i]) 69 | elif processor_info[i]['condition'] == "regex": 70 | p = RegexFilter(processor_info[i]) 71 | else: 72 | raise Exception("Unknown condition: '{0}'!".format(processor_info['condition'])) 73 | self.processor.append(p) 74 | except Exception, e: 75 | print("[ERROR] Couldn't parse matcher processor: {0}".format(str(e))) 76 | 77 | def process_message(self, message): 78 | result = True 79 | for item in self.processor: 80 | result = result and (item.test(message) == item.result) 81 | if not result: 82 | break 83 | 84 | return message if result else None 85 | 86 | if __name__ == "__main__": 87 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/mongo.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | import json 4 | import pymongo 5 | 6 | class Plugin(plugins.base.Plugin): 7 | def __init__(self, info): 8 | self.host = info['attributes']["host"] 9 | self.port = int(info['attributes']["port"]) 10 | self.database = info['attributes']["database"] 11 | self.collection = info['attributes']["collection"] 12 | self.indices = info['attributes']['indices'] 13 | 14 | print("[Mongo] Connecting to {}:{}/{}/{}".format(self.host, self.port, self.database, self.collection)) 15 | self.client = pymongo.MongoClient(self.host, self.port) 16 | self.target = self.client[self.database][self.collection] 17 | 18 | for index in self.indices: 19 | print("[Mongo] Creating index on key '{}' ...".format(index)) 20 | self.target.create_index([ (index, pymongo.ASCENDING) ]) 21 | 22 | def process_message(self, message): 23 | # NOTE: This ensures we have the right JSON format for BSON encoding (UTF-8) 24 | json_string = json.dumps(message) 25 | json_obj = json.loads(json_string) 26 | 27 | result = self.target.insert_one(json_obj) 28 | return message 29 | -------------------------------------------------------------------------------- /plugins/mysql.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import MySQLdb 4 | 5 | class Plugin(object): 6 | 7 | def __init__(self, info): 8 | self.host = info['attributes']['host'] 9 | self.user = info['attributes']['user'] 10 | self.passwd = info['attributes']['passwd'] 11 | self.database = info['attributes']['database'] 12 | self.query = info['attributes']['query'] 13 | self.period = info['attributes']['period'] 14 | 15 | def run(self, node): 16 | 17 | while True: 18 | 19 | connection = MySQLdb.connect(host=self.host, user=self.user, passwd=self.passwd, db=self.database) 20 | 21 | cursor = connection.cursor() 22 | cursor.execute(self.query) 23 | 24 | for row in cursor.fetchall() : 25 | print row 26 | node.output.send_json(row) 27 | 28 | cursor.close() 29 | connection.close() 30 | 31 | time.sleep(self.period) 32 | 33 | if __name__ == "__main__": 34 | print("Please import this file!") 35 | 36 | 37 | -------------------------------------------------------------------------------- /plugins/replayer.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import mmap 4 | import glob 5 | 6 | class Plugin(object): 7 | def __init__(self, info): 8 | self.path = info['attributes']['path'] 9 | self.schema = info['attributes']['schema'] 10 | self.delimiter = info['attributes']['delimiter'] 11 | self.delay = 1.0 12 | 13 | self.log_files = glob.glob(self.path) 14 | if len(self.log_files) == 0: 15 | print("[ERROR] No file found in path for replay!") 16 | 17 | def run(self, node): 18 | while True: 19 | for filename in self.log_files: 20 | with open(filename, "rU") as logfile: 21 | 22 | for line in logfile: 23 | split = line.strip().split(self.delimiter) 24 | if len(split) != len(self.schema): 25 | continue 26 | 27 | message = dict() 28 | for i in range(len(split)): 29 | message[self.schema[i]] = split[i] 30 | 31 | node.output.send_json(message) 32 | 33 | time.sleep(self.delay) 34 | #break 35 | 36 | if __name__ == "__main__": 37 | print("Please import this file!") 38 | -------------------------------------------------------------------------------- /plugins/sampler.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | import random 3 | 4 | class Plugin(plugins.base.Plugin): 5 | def __init__(self, info): 6 | self.probability = info['attributes']['probability'] 7 | 8 | def process_message(self, message): 9 | return message if random.uniform(0, 1) <= self.probability else None 10 | 11 | if __name__ == "__main__": 12 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/template.py: -------------------------------------------------------------------------------- 1 | import json 2 | import plugins.base 3 | 4 | class Plugin1(plugins.base.Plugin): 5 | def __init__(self, info): 6 | super(Plugin, self).__init__(info) 7 | # NOTE: The info argument contains the full node definition 8 | # written in the pipeline configuration file. 9 | pass 10 | 11 | def process_message(self, message): 12 | # NOTE : Here we can process the message, add field, remove, etc. 13 | # Retuning None drops the message from the pipeline. 14 | return message 15 | 16 | class Plugin2(plugins.base.Plugin): 17 | def __init__(self, info): 18 | super(Plugin, self).__init__(info) 19 | # NOTE: The info argument contains the full node definition 20 | # written in the pipeline configuration file. 21 | pass 22 | 23 | def run(self, node): 24 | # NOTE: Each node runs on its own thread/process, 25 | # Here we enter our infinite loop. 26 | while True: 27 | 28 | # NOTE: Read incoming data sent to our node 29 | data = node.input.recv() 30 | 31 | # NOTE: Parse it as a JSON message 32 | message = json.loads(data) 33 | 34 | # NOTE: This template plugin doesn't do anything except being a passthru filter. 35 | # This is where the processing would actually happen in a real processor. 36 | # You can send whatever data you like in the output stream. That can be a modified 37 | # version of the incoming messages or any other message of your creation. 38 | 39 | # NOTE: Send it back through the pipeline 40 | node.output.send_json(message) 41 | 42 | if __name__ == "__main__": 43 | print("Please import this file!") 44 | -------------------------------------------------------------------------------- /plugins/test/counter.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | import random 4 | 5 | class Plugin(plugins.base.Plugin): 6 | def __init__(self, info): 7 | self.count = 0 8 | 9 | def process_message(self, message): 10 | self.count += 1 11 | message['counter'] = self.count 12 | return message 13 | 14 | if __name__ == "__main__": 15 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/test/random.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | 4 | class Plugin(object): 5 | def __init__(self, info): 6 | pass 7 | 8 | def run(self, node): 9 | while True: 10 | #node.output.send_json({ 'number' : random.uniform(-1, 1) }) 11 | node.output.send_json({ 'number' : random.randint(0, 100) }) 12 | 13 | if __name__ == "__main__": 14 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/test/tagger.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | class Plugin(plugins.base.Plugin): 4 | def __init__(self, info): 5 | self.field = info['attributes']['field'] 6 | self.value = info['attributes']['value'] 7 | 8 | def process_message(self, message): 9 | message[self.field] = self.value 10 | return message 11 | 12 | if __name__ == "__main__": 13 | print("Please import this file!") -------------------------------------------------------------------------------- /plugins/unifier.py: -------------------------------------------------------------------------------- 1 | import plugins.base 2 | 3 | class Plugin(plugins.base.Plugin): 4 | def __init__(self, info): 5 | self.history = info['attributes']['history'] 6 | self.fields = info['attributes']['fields'] 7 | 8 | self.cache = list() 9 | self.index = -1 10 | 11 | def process_message(self, message): 12 | 13 | vector = list() 14 | for k in self.fields: 15 | if k not in message: 16 | return message 17 | vector.append(message[k]) 18 | 19 | found = False 20 | for e in self.cache: 21 | if e == vector: 22 | found = True 23 | break 24 | 25 | #print(self.cache, vector, found) 26 | 27 | if found: 28 | return None 29 | else: 30 | if len(self.cache) < self.history: 31 | self.cache.append(vector) 32 | else: 33 | self.cache[self.index] = vector 34 | self.index = (self.index + 1) % self.history 35 | 36 | return message 37 | 38 | if __name__ == "__main__": 39 | print("Please import this file!") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyzmq==16.0.2 2 | --------------------------------------------------------------------------------