├── .gitignore
├── README.md
├── avalanche.py
├── console.py
├── pipelines
│   └── default.json
├── plugins
│   ├── __init__.py
│   ├── base.py
│   ├── capacitor.py
│   ├── dumper.py
│   ├── emailer.py
│   ├── geoip
│   │   ├── GeoIPASNum.dat
│   │   └── plugin.py
│   ├── kafka-consumer.py
│   ├── lru-cache.py
│   ├── matcher.py
│   ├── mongo.py
│   ├── mysql.py
│   ├── replayer.py
│   ├── sampler.py
│   ├── template.py
│   ├── test
│   │   ├── counter.py
│   │   ├── random.py
│   │   └── tagger.py
│   └── unifier.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.DS_Store
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Avalanche
2 | Realtime and Online Model Development Framework
3 |
4 |
5 | ## Quick Overview
6 |
7 | Avalanche is a framework that helps data analysts implement their realtime models in a modular and distributed fashion. It is written in Python and leverages ZMQ streams. It lets you define a pipeline made of data-processing nodes and stream-connection edges: the edges connect the nodes and control how the data flows through the pipeline, and the nodes process the data in your desired fashion.
8 |
9 | ## How to use
10 |
11 | Let's take a more practical look at the tool:
12 |
13 |     $ git clone https://github.com/ThibaultReuille/avalanche.git
14 |     $ cd avalanche
15 |     $ ./avalanche.py pipelines/default.json 10000-10100
16 |
17 |
18 | You are now running the default processing graph! (The second argument gives Avalanche a TCP port range to assign to connectors that don't define their own.) It doesn't do much for now, but this is about to change. We will now take a look at the default pipeline and learn how to manipulate it to make it do what we want.
19 |
20 | [Default Avalanche Pipeline](https://github.com/ThibaultReuille/avalanche/blob/master/pipelines/default.json "Default Avalanche Pipeline")
21 |
22 | This default pipeline configuration consists of 3 parts:
23 |
24 | - Attributes: This part defines the general environment of the stream graph. In particular, it declares the plugins used by your pipeline.
25 |
26 | - Nodes: In this part, you define the processing nodes. In practice, each node usually receives incoming messages, processes them, and sends the results down the pipeline.
27 |
28 | - Edges: This is where you connect the nodes together to form your full data processing pipeline (see the sketch below).
29 |
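To make this concrete, here is the shape of such a configuration; it mirrors the repository's default pipeline, where a sampler plugin is declared, node 0 is an incoming stream, node 1 samples it with probability 0.01, and edge 0 connects the two:

    {
        "attributes" : {
            "plugins" : [
                { "name" : "sampler", "filename" : "plugins/sampler.py" }
            ]
        },
        "nodes" : [
            { "id" : 0, "type" : "stream", "url" : "tcp://localhost:10000/" },
            { "id" : 1, "type" : "sampler", "port" : 10001,
              "attributes" : { "probability" : 0.01 } }
        ],
        "edges" : [
            { "id" : 0, "src" : 0, "dst" : 1 }
        ]
    }
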
30 | Great! Now you know how to define a full message processing pipeline with Avalanche! Let us dig deeper and explore how to create your own custom models.
31 |
32 | ## Write your own plugin
33 |
34 | For a more accurate description, let's refer to our plugin template:
35 |
36 | [Avalanche Plugin Template](https://github.com/ThibaultReuille/avalanche/blob/master/plugins/template.py "Avalanche Plugin Template")
37 |
38 | For the most part, the comments in this template file are self-explanatory. We may simply add that each node plugin is loaded and bound to its node: the node information and members can be retrieved either in the constructor or through the node instance. The node's processing code runs in its own thread and receives/sends messages through the node's input/output streams. The plugin interface is simple enough to implement any kind of metric, model, filter or other realtime pipeline element.
39 |
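For instance, here is a minimal sketch of a custom plugin in the template's style; the `tag` attribute is a made-up example, not part of the framework:

    import plugins.base

    class Plugin(plugins.base.Plugin):
        def __init__(self, info):
            super(Plugin, self).__init__(info)
            # 'info' is the full node definition from the pipeline JSON.
            self.tag = info['attributes'].get('tag', 'example')

        def process_message(self, message):
            # Annotate and forward; returning None would drop the message.
            message['tag'] = self.tag
            return message
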
40 | The rest is up to you!
41 |
42 |
--------------------------------------------------------------------------------
/avalanche.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import json
5 | import pprint as pp
6 | import zmq
7 | import time
8 | import threading
9 | import hashlib
10 | import os.path
11 | import imp
12 | import traceback
13 | import string
14 | import random
15 |
16 | from abc import ABCMeta, abstractmethod
17 |
18 | import plugins.base
19 |
20 | context = dict()
21 |
22 | # ----- Nodes -----
23 |
24 | class Node(object):
25 | def __init__(self, info):
26 | global context
27 |
28 | self.info = info
29 |
30 | # NOTE: Default connectors are sub/pub
31 | if "connectors" not in self.info:
32 | self.info['connectors'] = [ "sub", "pub" ]
33 |
34 | self.connectors = list()
35 | for connector in self.info['connectors']:
36 | if isinstance(connector, dict):
37 | self.connectors.append(connector)
38 | else:
39 | self.connectors.append({ 'type' : connector })
40 |
41 | self.predecessors = list()
42 | self.successors = list()
43 |
44 | self.thread = None
45 | if 'type' in self.info and self.info['type'] == 'rack':
46 | self.plugin = plugins.base.PluginRack()
47 | print(" . plugins")
48 | for p in self.info['plugins']:
49 | rack_plugin_type = p['type']
50 | print(" + {0}".format(rack_plugin_type))
51 | rack_plugin = context['plugins'][rack_plugin_type](p)
52 | self.plugin.plugins.append(rack_plugin)
53 | elif 'type' in self.info and self.info['type'] != 'virtual':
54 | self.plugin = context['plugins'][self.info['type']](info)
55 | else:
56 | self.plugin = None
57 |
58 | class ZMQ_Node(Node):
59 | def __init__(self, info):
60 | global context
61 | super(ZMQ_Node, self).__init__(info)
62 |
63 | binders = [ 'pull', 'pub', 'router' ]
64 |
65 | for connector in self.connectors:
66 | if connector['type'] is None:
67 | continue
68 | if connector['type'] in binders:
69 | if 'url' in connector:
70 | print(" . {0}: {1}".format(connector['type'], connector['url']))
71 | elif 'port' not in connector:
72 | if context['ports']['next'] > context['ports']['stop']:
73 | print("[WARNING] Defined port range is too small for pipeline! Collision may happen.")
74 | connector['port'] = context['ports']['next']
75 | context['ports']['next'] += 1
76 | print(" . {0}: {1}".format(connector['type'], connector['port']))
77 | else:
78 | print(" . {0}".format(connector['type']))
79 |
80 | #print(" Connectors: {0}".format(self.connectors))
81 |
82 | def initialize(self):
83 |
84 | if self.info['type'] == "virtual":
85 | return
86 |
87 | context = zmq.Context.instance()  # NOTE: local ZMQ context, distinct from the global pipeline context
88 |
89 | # ------ Input -----
90 |
91 | input_connector = self.connectors[0]
92 |
93 | if input_connector['type'] is None:
94 | self.input = None
95 |
96 | elif input_connector['type'] == "sub":
97 | self.input = context.socket(zmq.SUB)
98 | for predecessor in self.predecessors:
99 | if 'url' in predecessor.connectors[1]:
100 | src_url = predecessor.connectors[1]['url']
101 | else:
102 | src_url = "tcp://localhost:{0}".format(predecessor.connectors[1]['port'])
103 |
104 | print(" Connecting sub to {0} ...".format(src_url))
105 | self.input.connect(src_url)
106 | self.input.setsockopt(zmq.SUBSCRIBE, '')
107 |
108 | elif input_connector['type'] == "pull":
109 | self.input = context.socket(zmq.PULL)
110 | url = "tcp://*:{0}".format(input_connector['port'])
111 | print(" Binding pull on {0} ...".format(url))
112 | self.input.bind(url)
113 |
114 | elif input_connector['type'] == "dealer":
115 | self.input = context.socket(zmq.DEALER)
116 | self.identity = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
117 | self.input.setsockopt(zmq.IDENTITY, self.identity)
118 | for predecessor in self.predecessors:
119 | if 'url' in predecessor.connectors[1]:
120 | src_url = predecessor.connectors[1]['url']
121 | else:
122 | src_url = "tcp://localhost:{0}".format(predecessor.connectors[1]['port'])
123 |
124 | print(" Connecting dealer to {0} ...".format(src_url))
125 | self.input.connect(src_url)
126 |
127 | else:
128 | print("[ERROR] '{0}': Unsupported input connector type!".format(self.connectors[0]))
129 |
130 | # ----- Output -----
131 |
132 | output_connector = self.connectors[1]
133 |
134 | if output_connector['type'] is None:
135 | self.output = None
136 |
137 | elif output_connector['type'] == "pub":
138 | self.output = context.socket(zmq.PUB)
139 | url = "tcp://*:{0}".format(output_connector['port'])
140 | print(" Binding pub on {0} ...".format(url))
141 | self.output.bind(url)
142 |
143 | elif output_connector['type'] == "push":
144 | self.output = context.socket(zmq.PUSH)
145 | for successor in self.successors:
146 | if 'url' in successor.connectors[0]:
147 | dst_url = successor.connectors[0]['url']
148 | else:
149 | dst_url = "tcp://localhost:{0}".format(successor.connectors[0]['port'])
150 |
151 | print(" Connecting push to {0} ...".format(dst_url))
152 | self.output.connect(dst_url)
153 |
154 | elif output_connector['type'] == "router":
155 | self.output = context.socket(zmq.ROUTER)
156 | url = "tcp://*:{0}".format(output_connector['port'])
157 | print(" Binding router on {0} ...".format(url))
158 | self.output.bind(url)
159 |
160 | else:
161 | print("[ERROR] '{0}': Unsupported output connector type!".format(self.connectors[1]))
162 |
163 | def run(self):
164 | self.initialize()
165 | # TODO : We should have plugins wait here for all to be ready
166 | if self.plugin is not None:
167 | self.plugin.run(self)
168 |
169 | # ----- Edges -----
170 |
171 | class Edge(object):
172 | def __init__(self, info):
173 | pass
174 | def run(self):
175 | pass
176 |
177 | class ZMQ_Edge(Edge):
178 | def __init__(self, info):
179 | super(ZMQ_Edge, self).__init__(info)
180 |
181 | global context
182 | self.info = info
183 | self.src = context['graph'].nodes[info['src']]
184 | self.dst = context['graph'].nodes[info['dst']]
185 |
186 | print("\t. Connecting {0} and {1} ...".format(self.src.info['id'], self.dst.info['id']))
187 | self.dst.predecessors.append(self.src)
188 | self.src.successors.append(self.dst)
189 |
190 | def run(self):
191 | pass
192 |
193 | class Graph(object):
194 | def __init__(self):
195 | self.nodes = dict()
196 | self.edges = dict()
197 | self.threads = list()
198 |
199 | def create_node(self, info):
200 | if 'id' not in info or 'type' not in info:
201 | return None
202 | uid = info['id']
203 | self.nodes[uid] = ZMQ_Node(info)
204 |
205 | def create_edge(self, info):
206 | if 'id' not in info:
207 | return None
208 | uid = info['id']
209 | self.edges[uid] = ZMQ_Edge(info)
210 |
211 | def start(self):
212 | for k in self.nodes.keys():
213 | if self.nodes[k].plugin is None:
214 | continue
215 | repr(self.nodes[k])
216 | self.nodes[k].thread = threading.Thread(target=self.nodes[k].run)
217 | self.nodes[k].thread.start()
218 | self.threads.append(self.nodes[k].thread)
219 |
220 | def wait(self):
221 | for thread in self.threads:
222 | thread.join()
223 |
224 | # ----- Main -----
225 |
226 | def load_module(code_path):
227 | try:
228 | try:
229 | code_dir = os.path.dirname(code_path)
230 | code_file = os.path.basename(code_path)
231 | fin = open(code_path, 'rb')
232 | return imp.load_source(hashlib.md5(code_path).hexdigest(), code_path, fin)
233 | finally:
234 | try: fin.close()
235 | except: pass
236 | except ImportError, x:
237 | traceback.print_exc(file = sys.stderr)
238 | raise
239 | except:
240 | traceback.print_exc(file = sys.stderr)
241 | raise
242 |
243 | def main(conf):
244 |
245 | global context
246 |
247 | # ----- Load plugins -----
248 |
249 | context['plugins'] = dict()
250 |
251 | if "attributes" in conf:
252 | if "plugins" in conf['attributes']:
253 | for plugin in conf['attributes']['plugins']:
254 | name = plugin['name']
255 | filename = plugin['filename']
256 |
257 | plugin_module = load_module(filename)
258 | plugin_class = getattr(plugin_module, 'Plugin')
259 | context['plugins'][name] = plugin_class
260 |
261 | print("[PLUGIN] {0}: {1}".format(name, filename))
262 |
263 | # ----- Load graph -----
264 |
265 | context['graph'] = Graph()
266 |
267 | for item in conf['nodes']:
268 | print("[NODE] {0}: {1}".format(item['id'], item['type']))
269 | context['graph'].create_node(item)
270 |
271 |
272 | for item in conf['edges']:
273 | print("[EDGE] {0}: {1} -> {2}".format(item['id'], item['src'], item['dst']))
274 | context['graph'].create_edge(item)
275 |
276 | print("[INFO] Launching node threads ...")
277 | context['graph'].start()
278 | print("[INFO] Running.")
279 | context['graph'].wait()
280 |
281 | while True:
282 | print("Main idle.")
283 | time.sleep(1)
284 |
285 | if __name__ == "__main__":
286 |
287 | if len(sys.argv) != 3:
288 | print("Usage: {0} ".format(sys.argv[0]))
289 | sys.exit(0)
290 |
291 | ports = [ int(p) for p in sys.argv[2].split('-') ]
292 | if len(ports) == 1:
293 | ports.append(65535)
294 |
295 | context['ports'] = {
296 | 'start' : ports[0],
297 | 'stop' : ports[1],
298 | 'next' : ports[0]
299 | }
300 |
301 | with open(sys.argv[1], "rU") as conf:
302 | jconf = json.load(conf)
303 | main(jconf)
--------------------------------------------------------------------------------
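Note on invocation: avalanche.py takes a pipeline file plus a port range, and any binder connector (pull, pub or router) that defines neither a url nor a port is assigned the next port from that range. For example (the range itself is an arbitrary choice):

    $ ./avalanche.py pipelines/default.json 10000-10100
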
/console.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import zmq
5 | import json
6 | import time
7 | import pprint as pp
8 |
9 | if len(sys.argv) != 3:
10 | print("Usage: {0} ".format(sys.argv[0]))
11 | sys.exit(0)
12 |
13 | ctx = zmq.Context()
14 | s = ctx.socket(zmq.SUB)
15 |
16 | url = "tcp://{0}:{1}".format(sys.argv[1], sys.argv[2])
17 | print("Connecting to " + url + " ...")
18 | s.connect(url)
19 | s.setsockopt(zmq.SUBSCRIBE, '')
20 |
21 | metrics = dict()
22 | metrics['volume'] = 0
23 | metrics['start_time'] = time.time()
24 |
25 | time_delay = 10
26 |
27 | while True:
28 | line = s.recv()
29 |
30 | metrics['current_time'] = time.time()
31 | metrics['volume'] += 1
32 |
33 | if metrics['current_time'] - metrics['start_time'] > time_delay:
34 |
35 | metrics['msg/sec'] = metrics['volume'] / (metrics['current_time'] - metrics['start_time'])
36 | print(json.dumps(metrics))
37 |
38 | metrics['start_time'] = metrics['current_time']
39 | metrics['volume'] = 0
40 |
41 | print("{0}: {1}".format(time.time(), line))
42 |
43 |
--------------------------------------------------------------------------------
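console.py can tap any pub connector in a running pipeline; it prints each message and reports a msg/sec figure every 10 seconds. Against the default pipeline shown next, the sampler node publishes on port 10001, so a hypothetical session would be:

    $ ./console.py localhost 10001
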
/pipelines/default.json:
--------------------------------------------------------------------------------
1 |
2 | {
3 | "attributes" : {
4 | "plugins" : [
5 | { "name" : "sampler", "filename" : "plugins/sampler.py" }
6 | ]
7 | },
8 |
9 | "nodes" : [
10 | {
11 | "id" : 0,
12 | "type" : "stream",
13 | "url" : "tcp://localhost:10000/"
14 | },
15 |
16 | {
17 | "id" : 1,
18 | "type" : "sampler",
19 | "port" : 10001,
20 | "attributes" :
21 | {
22 | "probability" : 0.01
23 | }
24 | }
25 | ],
26 |
27 | "edges" : [
28 | {
29 | "id" : 0,
30 | "src" : 0,
31 | "dst" : 1
32 | }
33 | ]
34 | }
35 |
--------------------------------------------------------------------------------
/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThibaultReuille/avalanche/846a0c83bc782049403d08ce1dbb75d5ec3be5d6/plugins/__init__.py
--------------------------------------------------------------------------------
/plugins/base.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import datetime
4 |
5 | from collections import deque
6 |
7 | class Plugin(object):
8 | def __init__(self, configuration):
9 | self.configuration = configuration
10 |
11 | def log(self, message):
12 | print("[{}] {} - {}".format(
13 | self.configuration["type"],
14 | datetime.datetime.now().strftime("%Y%m%d %H:%M:%S"),
15 | message
16 | ))
17 |
18 | def run(self, node):
19 | while True:
20 | data = node.input.recv()
21 | message = json.loads(data)
22 |
23 | output = self.process_message(message)
24 |
25 | if output is None:
26 | continue
27 |
28 | if isinstance(output, list):
29 | for msg in output:
30 | node.output.send_json(msg)
31 | else:
32 | node.output.send_json(output)
33 |
34 | def process_message(self, message):
35 | return message
36 |
37 | class PluginRack(Plugin):
38 | def __init__(self):
39 | self.plugins = list()
40 |
41 | def run(self, node):
42 | while True:
43 |
44 | data = node.input.recv()
45 | message = json.loads(data)
46 |
47 | input_messages = deque()
48 | input_messages.append(message)
49 |
50 | output_messages = deque()
51 |
52 | for plugin in self.plugins:
53 |
54 | while len(input_messages) > 0:
55 | msg = input_messages.popleft()
56 | output = plugin.process_message(msg)
57 |
58 | if output is None:
59 | continue
60 | elif isinstance(output, list):
61 | output_messages.extend(output)
62 | else:
63 | output_messages.append(output)
64 |
65 | input_messages, output_messages = output_messages, input_messages  # NOTE: This plugin's output becomes the next plugin's input
66 |
67 | while len(input_messages) > 0:
68 | node.output.send_json(input_messages.popleft())
--------------------------------------------------------------------------------
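To illustrate the rack's message flow outside a running pipeline, here is a hedged standalone sketch (the toy plugins and values are made up; run it from the repository root so plugins.base is importable). It mirrors the deque-swapping loop above, where each plugin's output deque becomes the next plugin's input deque:

    from collections import deque

    import plugins.base

    class Doubler(plugins.base.Plugin):
        def __init__(self):
            pass  # toy plugin, no configuration needed

        def process_message(self, message):
            # Returning a list fans one message out into several.
            return [dict(message), dict(message)]

    class Tagger(plugins.base.Plugin):
        def __init__(self):
            pass

        def process_message(self, message):
            message['tagged'] = True
            return message

    chain = [Doubler(), Tagger()]
    inputs, outputs = deque([{'n': 1}]), deque()

    for plugin in chain:
        while inputs:
            result = plugin.process_message(inputs.popleft())
            if result is None:
                continue
            elif isinstance(result, list):
                outputs.extend(result)
            else:
                outputs.append(result)
        inputs, outputs = outputs, inputs  # this plugin's output feeds the next

    print(list(inputs))  # -> two tagged copies of the original message
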
/plugins/capacitor.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | import json
4 | import time
5 | from datetime import datetime
6 | import random
7 | import string
8 | import os
9 |
10 | class Plugin(plugins.base.Plugin):
11 | def __init__(self, info):
12 | self.message_limit = None
13 | self.time_limit = None
14 | self.cache = None
15 |
16 | if 'message-limit' in info['attributes']:
17 | self.message_limit = info['attributes']['message-limit']
18 | if 'time-limit' in info['attributes']:
19 | self.time_limit = info['attributes']['time-limit']
20 |
21 | if self.message_limit is None and self.time_limit is None:
22 | raise Exception('[ERROR] No capacitor limit defined!')
23 |
24 | if 'cache' in info['attributes']:
25 | self.cache = info['attributes']['cache']
26 | self.create_cache_dir(self.cache)
27 |
28 | self.last_time = time.time()
29 | self.last_count = 0
30 | self.messages = list()
31 | self.flush = False
32 |
33 | def create_cache_dir(self, path):
34 | try:
35 | os.makedirs(path)
36 | except OSError:
37 | if not os.path.isdir(path):
38 | raise
39 |
40 | def create_random_word(self, length):
41 | return ''.join(random.choice(string.ascii_lowercase) for i in range(length))
42 |
43 | def process_message(self, message):
44 |
45 | if self.flush:
46 | self.messages = list()
47 | self.flush = False
48 |
49 | self.messages.append(message)
50 |
51 | t = time.time()
52 |
53 | if (self.message_limit is not None and len(self.messages) >= self.message_limit) or \
54 | (self.time_limit is not None and t - self.last_time > self.time_limit):
55 |
56 | # NOTE : This doesn't work if we don't receive messages
57 | if self.cache is not None:
58 | filename = "{0}/{1}.{2}.json".format(self.cache, datetime.utcnow().strftime("%Y-%m-%d_%H:%M:%S"), self.create_random_word(4))
59 | with open(filename, 'w') as outfile:
60 | json.dump(self.messages, outfile)
61 |
62 | self.last_time = t
63 | self.flush = True
64 |
65 | return self.messages
66 | else:
67 | return None
68 |
69 | if __name__ == "__main__":
70 | print("Please import this file!")
--------------------------------------------------------------------------------
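For reference, a hedged sketch of a node definition for this plugin, assuming it is registered under the name capacitor in the pipeline's attributes; the id, limits and cache path are placeholders, and at least one of the two limits must be present:

    {
        "id" : 2,
        "type" : "capacitor",
        "attributes" : {
            "message-limit" : 1000,
            "time-limit" : 60,
            "cache" : "cache/capacitor"
        }
    }
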
/plugins/dumper.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 | import json
3 | import time
4 | import datetime
5 | import os
6 |
7 | class Plugin(plugins.base.Plugin):
8 | def __init__(self, info):
9 | self.folder = info['attributes']['folder']
10 |
11 | if "filename" in info['attributes']:
12 | self.filename = info['attributes']['filename']
13 | else:
14 | self.filename = "%Y-%m-%d_%H:%M:%S.json"
15 |
16 | self.create_cache_dir(self.folder)
17 |
18 | def create_cache_dir(self, path):
19 | try:
20 | os.makedirs(path)
21 | except OSError:
22 | if not os.path.isdir(path):
23 | raise
24 |
25 | def process_message(self, message):
26 |
27 | now = time.time()
28 |
29 | timestamp = datetime.datetime.fromtimestamp(now).strftime(self.filename)
30 | with open("{0}/{1}".format(self.folder, timestamp), "w") as result_file:
31 | json.dump(message, result_file, indent=4)
32 |
33 | return message
34 |
35 | if __name__ == "__main__":
36 | print("Please import this file!")
--------------------------------------------------------------------------------
/plugins/emailer.py:
--------------------------------------------------------------------------------
1 | import json
2 | import uuid
3 | import time
4 | import datetime
5 |
6 | import smtplib
7 | import email.utils
8 | from email.mime.text import MIMEText
9 |
10 | class Plugin(object):
11 | def __init__(self, info):
12 | self.smtp_server = info['attributes']['smtp-server']
13 | self.mail_from = info['attributes']['mail-from']
14 | self.mail_to = info['attributes']['mail-to']
15 | self.mail_name = info['attributes']['mail-name']
16 | self.mail_subject = info['attributes']['mail-subject']
17 |
18 | def make_content(self, uid, message):
19 | content = ""
20 | content += "{0}\n\n".format(uid)
21 | content += json.dumps(message, indent=4, sort_keys=True)
22 | content += "\n\n{0}\n".format(uid)
23 | return content
24 |
25 | def run(self, node):
26 | while True:
27 | data = node.input.recv()
28 | message = json.loads(data)
29 |
30 | uid = uuid.uuid4()
31 | timestamp = datetime.datetime.utcfromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S UTC')
32 |
33 | subject = "{0} - {1}".format(self.mail_subject, timestamp)
34 | text = self.make_content(uid, message)
35 |
36 | msg = MIMEText(text, 'plain')
37 | msg['From'] = email.utils.formataddr((self.mail_name, self.mail_from))
38 | msg['To'] = email.utils.formataddr(('Recipient', self.mail_to))
39 | msg['Subject'] = subject
40 |
41 | server = smtplib.SMTP(self.smtp_server)
42 | try:
43 | server.sendmail(self.mail_from, [self.mail_to], msg.as_string())
44 | finally:
45 | server.quit()
46 |
47 | node.output.send_json({
48 | 'From' : [self.mail_name, self.mail_from],
49 | 'To' : self.mail_to,
50 | 'Subject' : subject,
51 | 'uid' : "{0}".format(uid)
52 | })
53 |
54 | if __name__ == "__main__":
55 | print("Please import this file!")
--------------------------------------------------------------------------------
/plugins/geoip/GeoIPASNum.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThibaultReuille/avalanche/846a0c83bc782049403d08ce1dbb75d5ec3be5d6/plugins/geoip/GeoIPASNum.dat
--------------------------------------------------------------------------------
/plugins/geoip/plugin.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | import pygeoip
4 | import json
5 | import os
6 |
7 | class Plugin(plugins.base.Plugin):
8 | def __init__(self, info):
9 | current_dir = os.path.dirname(os.path.realpath(__file__))
10 | self.gi_asn = pygeoip.GeoIP(current_dir + "/GeoIPASNum.dat")
11 | self.actions = info['attributes']['actions']
12 |
13 | def process_message(self, message):
14 | geoip = dict()
15 |
16 | for action in self.actions:
17 | value = None
18 | # NOTE: Resolve both keys up front so set_key is always bound below.
19 | get_key = action.get('get')
20 | set_key = action.get('set')
21 | if get_key is None or set_key is None or get_key not in message:
22 | continue
23 | try:
24 | if action['action'] == 'asn_by_addr':
25 | value = self.gi_asn.asn_by_addr(message[get_key])
26 | # TODO : Implement other GeoIP actions
27 | except Exception:
28 | pass
29 | # Record the lookup result (None on failure).
30 | geoip[set_key] = value
31 |
32 | message['geoip'] = geoip
33 |
34 | return message
35 |
36 |
37 | if __name__ == "__main__":
38 | print("Please import this file!")
--------------------------------------------------------------------------------
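A hedged sketch of a node definition for this plugin, assuming it is registered under the name geoip; the field names are placeholders, and asn_by_addr is currently the only implemented action:

    {
        "id" : 3,
        "type" : "geoip",
        "attributes" : {
            "actions" : [
                { "get" : "client_ip", "set" : "asn", "action" : "asn_by_addr" }
            ]
        }
    }
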
/plugins/kafka-consumer.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from kafka.client import KafkaClient
4 | from kafka.consumer import SimpleConsumer
5 |
6 | class Plugin(object):
7 | def __init__(self, info):
8 | self.host = info['attributes']['host']
9 | self.group = info['attributes']['group']
10 | self.topic = info['attributes']['topic']
11 |
12 | self.client = KafkaClient(self.host)
13 | self.consumer = SimpleConsumer(self.client, self.group, self.topic)
14 |
15 | def run(self, node):
16 |
17 | while True:
18 |
19 | for message in self.consumer:
20 | node.output.send_json(message)
21 |
22 | if __name__ == "__main__":
23 | print("Please import this file!")
--------------------------------------------------------------------------------
/plugins/lru-cache.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | import pylru
4 |
5 | class Plugin(plugins.base.Plugin):
6 | def __init__(self, info):
7 | self.size = info['attributes']['size']
8 | self.key = info['attributes']['key']
9 |
10 | self.cache = pylru.lrucache(self.size)
11 | self.ready = False
12 |
13 | def process_message(self, message):
14 | key = message[self.key]
15 |
16 | if key not in self.cache:
17 | self.cache[key] = True
18 | if self.ready:
19 | return message
20 | else:
21 | if len(self.cache) >= self.size:
22 | self.ready = True
23 | print("[LRU-CACHE] Cache is now ready.")
24 | return None
25 | else:
26 | # NOTE: Forcing lookup to update cache even though we don't need the value
27 | value = self.cache[key]
28 | return None
29 |
30 | if __name__ == "__main__":
31 | print("Please import this file!")
--------------------------------------------------------------------------------
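A hedged sketch of a node definition for this plugin, assuming it is registered under the name lru-cache; size and key are placeholders. Note that the plugin emits nothing until the cache has filled once, so downstream nodes only ever see keys that are new relative to a warm cache:

    {
        "id" : 4,
        "type" : "lru-cache",
        "attributes" : { "size" : 10000, "key" : "domain" }
    }
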
/plugins/matcher.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | import json
4 | import fnmatch
5 | import re
6 |
7 | class Filter(object):
8 | def __init__(self, attributes):
9 | self.field = attributes['field']
10 | self.condition = attributes['condition']
11 | self.result = attributes['result']
12 |
13 | if 'values' in attributes:
14 | self.values = attributes['values']
15 | elif 'file' in attributes:
16 | with open(attributes['file'], "rU") as infile:
17 | self.values = [ line.strip() for line in infile.readlines() ]
18 | else:
19 | raise Exception("No 'values' or 'file' field defined!")
20 |
21 | def test(self, message):
22 | raise Exception("Not implemented!")
23 |
24 | class InFilter(Filter):
25 | def __init__(self, attributes):
26 | super(InFilter, self).__init__(attributes)
27 | self.elements = set(self.values)
28 |
29 | def test(self, message):
30 | return message[self.field] in self.elements
31 |
32 | class MatchFilter(Filter):
33 | def __init__(self, attributes):
34 | super(MatchFilter, self).__init__(attributes)
35 |
36 | def test(self, message):
37 | for element in self.values:
38 | if not fnmatch.fnmatch(message[self.field], element):
39 | return False
40 | return True
41 |
42 | class RegexFilter(Filter):
43 | def __init__(self, attributes):
44 | super(RegexFilter, self).__init__(attributes)
45 |
46 | self.expressions = list()
47 | for element in self.values:
48 | self.expressions.append(re.compile(element))
49 |
50 | def test(self, message):
51 | for expression in self.expressions:
52 | if not expression.match(message[self.field]):
53 | return False
54 | return True
55 |
56 | class Plugin(plugins.base.Plugin):
57 | def __init__(self, info):
58 |
59 | processor_info = info['attributes']['processor']
60 |
61 | self.processor = list()
62 | for i in range(len(processor_info)):
63 | try:
64 | p = None
65 | if processor_info[i]['condition'] == "in":
66 | p = InFilter(processor_info[i])
67 | elif processor_info[i]['condition'] == "match":
68 | p = MatchFilter(processor_info[i])
69 | elif processor_info[i]['condition'] == "regex":
70 | p = RegexFilter(processor_info[i])
71 | else:
72 | raise Exception("Unknown condition: '{0}'!".format(processor_info['condition']))
73 | self.processor.append(p)
74 | except Exception, e:
75 | print("[ERROR] Couldn't parse matcher processor: {0}".format(str(e)))
76 |
77 | def process_message(self, message):
78 | result = True
79 | for item in self.processor:
80 | result = result and (item.test(message) == item.result)
81 | if not result:
82 | break
83 |
84 | return message if result else None
85 |
86 | if __name__ == "__main__":
87 | print("Please import this file!")
--------------------------------------------------------------------------------
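A hedged sketch of a node definition for this plugin, assuming it is registered under the name matcher; the fields, patterns and file path are placeholders. Each processor entry tests one field, result states whether that test must pass or fail, and a message is forwarded only if every entry agrees:

    {
        "id" : 5,
        "type" : "matcher",
        "attributes" : {
            "processor" : [
                { "field" : "domain", "condition" : "match", "values" : [ "*.example.com" ], "result" : true },
                { "field" : "client_ip", "condition" : "in", "file" : "data/allowlist.txt", "result" : true }
            ]
        }
    }
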
/plugins/mongo.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | import json
4 | import pymongo
5 |
6 | class Plugin(plugins.base.Plugin):
7 | def __init__(self, info):
8 | self.host = info['attributes']["host"]
9 | self.port = int(info['attributes']["port"])
10 | self.database = info['attributes']["database"]
11 | self.collection = info['attributes']["collection"]
12 | self.indices = info['attributes']['indices']
13 |
14 | print("[Mongo] Connecting to {}:{}/{}/{}".format(self.host, self.port, self.database, self.collection))
15 | self.client = pymongo.MongoClient(self.host, self.port)
16 | self.target = self.client[self.database][self.collection]
17 |
18 | for index in self.indices:
19 | print("[Mongo] Creating index on key '{}' ...".format(index))
20 | self.target.create_index([ (index, pymongo.ASCENDING) ])
21 |
22 | def process_message(self, message):
23 | # NOTE: This ensures we have the right JSON format for BSON encoding (UTF-8)
24 | json_string = json.dumps(message)
25 | json_obj = json.loads(json_string)
26 |
27 | result = self.target.insert_one(json_obj)
28 | return message
29 |
--------------------------------------------------------------------------------
/plugins/mysql.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import MySQLdb
4 |
5 | class Plugin(object):
6 |
7 | def __init__(self, info):
8 | self.host = info['attributes']['host']
9 | self.user = info['attributes']['user']
10 | self.passwd = info['attributes']['passwd']
11 | self.database = info['attributes']['database']
12 | self.query = info['attributes']['query']
13 | self.period = info['attributes']['period']
14 |
15 | def run(self, node):
16 |
17 | while True:
18 |
19 | connection = MySQLdb.connect(host=self.host, user=self.user, passwd=self.passwd, db=self.database)
20 |
21 | cursor = connection.cursor()
22 | cursor.execute(self.query)
23 |
24 | for row in cursor.fetchall() :
25 | print(row)
26 | node.output.send_json(row)
27 |
28 | cursor.close()
29 | connection.close()
30 |
31 | time.sleep(self.period)
32 |
33 | if __name__ == "__main__":
34 | print("Please import this file!")
35 |
36 |
37 |
--------------------------------------------------------------------------------
/plugins/replayer.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import mmap
4 | import glob
5 |
6 | class Plugin(object):
7 | def __init__(self, info):
8 | self.path = info['attributes']['path']
9 | self.schema = info['attributes']['schema']
10 | self.delimiter = info['attributes']['delimiter']
11 | self.delay = 1.0
12 |
13 | self.log_files = glob.glob(self.path)
14 | if len(self.log_files) == 0:
15 | print("[ERROR] No file found in path for replay!")
16 |
17 | def run(self, node):
18 | while True:
19 | for filename in self.log_files:
20 | with open(filename, "rU") as logfile:
21 |
22 | for line in logfile:
23 | split = line.strip().split(self.delimiter)
24 | if len(split) != len(self.schema):
25 | continue
26 |
27 | message = dict()
28 | for i in range(len(split)):
29 | message[self.schema[i]] = split[i]
30 |
31 | node.output.send_json(message)
32 |
33 | time.sleep(self.delay)
34 | #break
35 |
36 | if __name__ == "__main__":
37 | print("Please import this file!")
38 |
--------------------------------------------------------------------------------
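A hedged sketch of a node definition for this plugin, assuming it is registered under the name replayer; path, schema and delimiter are placeholders. Each log line is split on the delimiter and zipped against the schema to build a JSON message:

    {
        "id" : 0,
        "type" : "replayer",
        "attributes" : {
            "path" : "logs/*.log",
            "schema" : [ "timestamp", "client_ip", "domain" ],
            "delimiter" : "\t"
        }
    }
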
/plugins/sampler.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 | import random
3 |
4 | class Plugin(plugins.base.Plugin):
5 | def __init__(self, info):
6 | self.probability = info['attributes']['probability']
7 |
8 | def process_message(self, message):
9 | return message if random.uniform(0, 1) <= self.probability else None
10 |
11 | if __name__ == "__main__":
12 | print("Please import this file!")
--------------------------------------------------------------------------------
/plugins/template.py:
--------------------------------------------------------------------------------
1 | import json
2 | import plugins.base
3 |
4 | class Plugin1(plugins.base.Plugin):
5 | def __init__(self, info):
6 | super(Plugin1, self).__init__(info)
7 | # NOTE: The info argument contains the full node definition
8 | # written in the pipeline configuration file.
9 | pass
10 |
11 | def process_message(self, message):
12 | # NOTE : Here we can process the message: add fields, remove fields, etc.
13 | # Returning None drops the message from the pipeline.
14 | return message
15 |
16 | class Plugin2(plugins.base.Plugin):
17 | def __init__(self, info):
18 | super(Plugin2, self).__init__(info)
19 | # NOTE: The info argument contains the full node definition
20 | # written in the pipeline configuration file.
21 | pass
22 |
23 | def run(self, node):
24 | # NOTE: Each node runs on its own thread/process,
25 | # Here we enter our infinite loop.
26 | while True:
27 |
28 | # NOTE: Read incoming data sent to our node
29 | data = node.input.recv()
30 |
31 | # NOTE: Parse it as a JSON message
32 | message = json.loads(data)
33 |
34 | # NOTE: This template plugin doesn't do anything except act as a passthrough filter.
35 | # This is where the processing would actually happen in a real processor.
36 | # You can send whatever data you like in the output stream. That can be a modified
37 | # version of the incoming messages or any other message of your creation.
38 |
39 | # NOTE: Send it back through the pipeline
40 | node.output.send_json(message)
41 |
42 | if __name__ == "__main__":
43 | print("Please import this file!")
44 |
--------------------------------------------------------------------------------
/plugins/test/counter.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | import random
4 |
5 | class Plugin(plugins.base.Plugin):
6 | def __init__(self, info):
7 | self.count = 0
8 |
9 | def process_message(self, message):
10 | self.count += 1
11 | message['counter'] = self.count
12 | return message
13 |
14 | if __name__ == "__main__":
15 | print("Please import this file!")
--------------------------------------------------------------------------------
/plugins/test/random.py:
--------------------------------------------------------------------------------
1 | import random
2 | import json
3 |
4 | class Plugin(object):
5 | def __init__(self, info):
6 | pass
7 |
8 | def run(self, node):
9 | while True:
10 | #node.output.send_json({ 'number' : random.uniform(-1, 1) })
11 | node.output.send_json({ 'number' : random.randint(0, 100) })
12 |
13 | if __name__ == "__main__":
14 | print("Please import this file!")
--------------------------------------------------------------------------------
/plugins/test/tagger.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | class Plugin(plugins.base.Plugin):
4 | def __init__(self, info):
5 | self.field = info['attributes']['field']
6 | self.value = info['attributes']['value']
7 |
8 | def process_message(self, message):
9 | message[self.field] = self.value
10 | return message
11 |
12 | if __name__ == "__main__":
13 | print("Please import this file!")
--------------------------------------------------------------------------------
/plugins/unifier.py:
--------------------------------------------------------------------------------
1 | import plugins.base
2 |
3 | class Plugin(plugins.base.Plugin):
4 | def __init__(self, info):
5 | self.history = info['attributes']['history']
6 | self.fields = info['attributes']['fields']
7 |
8 | self.cache = list()
9 | self.index = -1
10 |
11 | def process_message(self, message):
12 |
13 | vector = list()
14 | for k in self.fields:
15 | if k not in message:
16 | return message
17 | vector.append(message[k])
18 |
19 | found = False
20 | for e in self.cache:
21 | if e == vector:
22 | found = True
23 | break
24 |
25 | #print(self.cache, vector, found)
26 |
27 | if found:
28 | return None
29 | else:
30 | if len(self.cache) < self.history:
31 | self.cache.append(vector)
32 | else:
33 | self.cache[self.index] = vector
34 | self.index = (self.index + 1) % self.history
35 |
36 | return message
37 |
38 | if __name__ == "__main__":
39 | print("Please import this file!")
--------------------------------------------------------------------------------
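A hedged sketch of a node definition for this plugin, assuming it is registered under the name unifier; history and fields are placeholders. A message is dropped when its fields vector matches one of the last history distinct vectors seen:

    {
        "id" : 6,
        "type" : "unifier",
        "attributes" : {
            "history" : 1000,
            "fields" : [ "client_ip", "domain" ]
        }
    }
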
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyzmq==16.0.2
2 |
--------------------------------------------------------------------------------